xref: /openbmc/linux/drivers/block/rbd.c (revision 131fd9f6)
1e2a58ee5SAlex Elder 
/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

                 Documentation/ABI/testing/sysfs-bus-rbd

 */
30602adf40SYehuda Sadeh 
31602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
32602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
34602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3559c2be1eSYehuda Sadeh #include <linux/parser.h>
3630d1cff8SAlex Elder #include <linux/bsearch.h>
37602adf40SYehuda Sadeh 
38602adf40SYehuda Sadeh #include <linux/kernel.h>
39602adf40SYehuda Sadeh #include <linux/device.h>
40602adf40SYehuda Sadeh #include <linux/module.h>
41602adf40SYehuda Sadeh #include <linux/fs.h>
42602adf40SYehuda Sadeh #include <linux/blkdev.h>
431c2a9dfeSAlex Elder #include <linux/slab.h>
44f8a22fc2SIlya Dryomov #include <linux/idr.h>
45602adf40SYehuda Sadeh 
46602adf40SYehuda Sadeh #include "rbd_types.h"
47602adf40SYehuda Sadeh 
/* Defining RBD_DEBUG turns the rbd_assert() checks in this file on */
#define RBD_DEBUG

/*
 * Block I/O is expressed in sectors.  Linux interprets a sector in
 * several contexts (blk, bio, genhd), but everywhere the default is
 * 512 bytes.  Named constants read a little better than the bare
 * numbers.
 */
#define SECTOR_SHIFT	9
#define SECTOR_SIZE	(1ULL << SECTOR_SHIFT)
58593a9e7bSAlex Elder 
59a2acd00eSAlex Elder /*
60a2acd00eSAlex Elder  * Increment the given counter and return its updated value.
61a2acd00eSAlex Elder  * If the counter is already 0 it will not be incremented.
62a2acd00eSAlex Elder  * If the counter is already at its maximum value returns
63a2acd00eSAlex Elder  * -EINVAL without updating it.
64a2acd00eSAlex Elder  */
65a2acd00eSAlex Elder static int atomic_inc_return_safe(atomic_t *v)
66a2acd00eSAlex Elder {
67a2acd00eSAlex Elder 	unsigned int counter;
68a2acd00eSAlex Elder 
69a2acd00eSAlex Elder 	counter = (unsigned int)__atomic_add_unless(v, 1, 0);
70a2acd00eSAlex Elder 	if (counter <= (unsigned int)INT_MAX)
71a2acd00eSAlex Elder 		return (int)counter;
72a2acd00eSAlex Elder 
73a2acd00eSAlex Elder 	atomic_dec(v);
74a2acd00eSAlex Elder 
75a2acd00eSAlex Elder 	return -EINVAL;
76a2acd00eSAlex Elder }
77a2acd00eSAlex Elder 
78a2acd00eSAlex Elder /* Decrement the counter.  Return the resulting value, or -EINVAL */
79a2acd00eSAlex Elder static int atomic_dec_return_safe(atomic_t *v)
80a2acd00eSAlex Elder {
81a2acd00eSAlex Elder 	int counter;
82a2acd00eSAlex Elder 
83a2acd00eSAlex Elder 	counter = atomic_dec_return(v);
84a2acd00eSAlex Elder 	if (counter >= 0)
85a2acd00eSAlex Elder 		return counter;
86a2acd00eSAlex Elder 
87a2acd00eSAlex Elder 	atomic_inc(v);
88a2acd00eSAlex Elder 
89a2acd00eSAlex Elder 	return -EINVAL;
90a2acd00eSAlex Elder }
91a2acd00eSAlex Elder 
#define RBD_DRV_NAME "rbd"

/* Minor-number layout; the shift is applied by rbd_dev_id_to_minor() */
#define RBD_MINORS_PER_MAJOR		256
#define RBD_SINGLE_MAJOR_PART_SHIFT	4

/* A snapshot name must still fit in NAME_MAX once the prefix is added */
#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

/* With this limit the largest snapshot context fits in 4KB */
#define RBD_MAX_SNAP_COUNT	510

#define RBD_SNAP_HEAD_NAME	"-"

/* Marks an invalid index into the snapshot array */
#define	BAD_SNAP_INDEX	U32_MAX

/* Sized so that an image name sent by an OSD fits in a single page */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

/* Image feature bits */

#define RBD_FEATURE_LAYERING	(1<<0)
#define RBD_FEATURE_STRIPINGV2	(1<<1)
#define RBD_FEATURES_ALL \
	    (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)

/* The subset of features this client implementation supports */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)

/*
 * Device names have the form "rbd#": RBD_DRV_NAME followed by a
 * unique integer identifier.  MAX_INT_FORMAT_WIDTH is used to make
 * sure DEV_NAME_LEN is large enough for every possible device name.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)
132602adf40SYehuda Sadeh 
133602adf40SYehuda Sadeh /*
134602adf40SYehuda Sadeh  * block device image metadata (in-memory version)
135602adf40SYehuda Sadeh  */
136602adf40SYehuda Sadeh struct rbd_image_header {
137f35a4deeSAlex Elder 	/* These six fields never change for a given rbd image */
138849b4260SAlex Elder 	char *object_prefix;
139602adf40SYehuda Sadeh 	__u8 obj_order;
140602adf40SYehuda Sadeh 	__u8 crypt_type;
141602adf40SYehuda Sadeh 	__u8 comp_type;
142f35a4deeSAlex Elder 	u64 stripe_unit;
143f35a4deeSAlex Elder 	u64 stripe_count;
144f35a4deeSAlex Elder 	u64 features;		/* Might be changeable someday? */
145602adf40SYehuda Sadeh 
146f84344f3SAlex Elder 	/* The remaining fields need to be updated occasionally */
147f84344f3SAlex Elder 	u64 image_size;
148f84344f3SAlex Elder 	struct ceph_snap_context *snapc;
149f35a4deeSAlex Elder 	char *snap_names;	/* format 1 only */
150f35a4deeSAlex Elder 	u64 *snap_sizes;	/* format 1 only */
15159c2be1eSYehuda Sadeh };
15259c2be1eSYehuda Sadeh 
1530d7dbfceSAlex Elder /*
1540d7dbfceSAlex Elder  * An rbd image specification.
1550d7dbfceSAlex Elder  *
1560d7dbfceSAlex Elder  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
157c66c6e0cSAlex Elder  * identify an image.  Each rbd_dev structure includes a pointer to
158c66c6e0cSAlex Elder  * an rbd_spec structure that encapsulates this identity.
159c66c6e0cSAlex Elder  *
160c66c6e0cSAlex Elder  * Each of the id's in an rbd_spec has an associated name.  For a
161c66c6e0cSAlex Elder  * user-mapped image, the names are supplied and the id's associated
162c66c6e0cSAlex Elder  * with them are looked up.  For a layered image, a parent image is
163c66c6e0cSAlex Elder  * defined by the tuple, and the names are looked up.
164c66c6e0cSAlex Elder  *
165c66c6e0cSAlex Elder  * An rbd_dev structure contains a parent_spec pointer which is
166c66c6e0cSAlex Elder  * non-null if the image it represents is a child in a layered
167c66c6e0cSAlex Elder  * image.  This pointer will refer to the rbd_spec structure used
168c66c6e0cSAlex Elder  * by the parent rbd_dev for its own identity (i.e., the structure
169c66c6e0cSAlex Elder  * is shared between the parent and child).
170c66c6e0cSAlex Elder  *
171c66c6e0cSAlex Elder  * Since these structures are populated once, during the discovery
172c66c6e0cSAlex Elder  * phase of image construction, they are effectively immutable so
173c66c6e0cSAlex Elder  * we make no effort to synchronize access to them.
174c66c6e0cSAlex Elder  *
175c66c6e0cSAlex Elder  * Note that code herein does not assume the image name is known (it
176c66c6e0cSAlex Elder  * could be a null pointer).
1770d7dbfceSAlex Elder  */
1780d7dbfceSAlex Elder struct rbd_spec {
1790d7dbfceSAlex Elder 	u64		pool_id;
180ecb4dc22SAlex Elder 	const char	*pool_name;
1810d7dbfceSAlex Elder 
182ecb4dc22SAlex Elder 	const char	*image_id;
183ecb4dc22SAlex Elder 	const char	*image_name;
1840d7dbfceSAlex Elder 
1850d7dbfceSAlex Elder 	u64		snap_id;
186ecb4dc22SAlex Elder 	const char	*snap_name;
1870d7dbfceSAlex Elder 
1880d7dbfceSAlex Elder 	struct kref	kref;
1890d7dbfceSAlex Elder };
1900d7dbfceSAlex Elder 
191602adf40SYehuda Sadeh /*
192f0f8cef5SAlex Elder  * an instance of the client.  multiple devices may share an rbd client.
193602adf40SYehuda Sadeh  */
194602adf40SYehuda Sadeh struct rbd_client {
195602adf40SYehuda Sadeh 	struct ceph_client	*client;
196602adf40SYehuda Sadeh 	struct kref		kref;
197602adf40SYehuda Sadeh 	struct list_head	node;
198602adf40SYehuda Sadeh };
199602adf40SYehuda Sadeh 
/* Callback type invoked with an image request */
struct rbd_img_request;
typedef void (*rbd_img_callback_t)(struct rbd_img_request *);

/* Value of "which" for an object request with no position in a list */
#define	BAD_WHICH	U32_MAX

/* Callback type invoked with an object request */
struct rbd_obj_request;
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);
207bf0d5f50SAlex Elder 
/* Kind of data, if any, attached to an object request */
enum obj_request_type {
	OBJ_REQUEST_NODATA,
	OBJ_REQUEST_BIO,
	OBJ_REQUEST_PAGES
};
211bf0d5f50SAlex Elder 
/* Flags kept per object request; each entry names one flag */
enum obj_req_flags {
	OBJ_REQ_DONE,		/* completion: 0 = not done, 1 = done */
	OBJ_REQ_IMG_DATA,	/* usage: 0 = standalone, 1 = image data */
	OBJ_REQ_KNOWN,		/* is the EXISTS flag valid: 0 = no, 1 = yes */
	OBJ_REQ_EXISTS,		/* does the target exist: 0 = no, 1 = yes */
};
218926f9b3fSAlex Elder 
219bf0d5f50SAlex Elder struct rbd_obj_request {
220bf0d5f50SAlex Elder 	const char		*object_name;
221bf0d5f50SAlex Elder 	u64			offset;		/* object start byte */
222bf0d5f50SAlex Elder 	u64			length;		/* bytes from offset */
223926f9b3fSAlex Elder 	unsigned long		flags;
224bf0d5f50SAlex Elder 
225c5b5ef6cSAlex Elder 	/*
226c5b5ef6cSAlex Elder 	 * An object request associated with an image will have its
227c5b5ef6cSAlex Elder 	 * img_data flag set; a standalone object request will not.
228c5b5ef6cSAlex Elder 	 *
229c5b5ef6cSAlex Elder 	 * A standalone object request will have which == BAD_WHICH
230c5b5ef6cSAlex Elder 	 * and a null obj_request pointer.
231c5b5ef6cSAlex Elder 	 *
232c5b5ef6cSAlex Elder 	 * An object request initiated in support of a layered image
233c5b5ef6cSAlex Elder 	 * object (to check for its existence before a write) will
234c5b5ef6cSAlex Elder 	 * have which == BAD_WHICH and a non-null obj_request pointer.
235c5b5ef6cSAlex Elder 	 *
236c5b5ef6cSAlex Elder 	 * Finally, an object request for rbd image data will have
237c5b5ef6cSAlex Elder 	 * which != BAD_WHICH, and will have a non-null img_request
238c5b5ef6cSAlex Elder 	 * pointer.  The value of which will be in the range
239c5b5ef6cSAlex Elder 	 * 0..(img_request->obj_request_count-1).
240c5b5ef6cSAlex Elder 	 */
241c5b5ef6cSAlex Elder 	union {
242c5b5ef6cSAlex Elder 		struct rbd_obj_request	*obj_request;	/* STAT op */
243c5b5ef6cSAlex Elder 		struct {
244bf0d5f50SAlex Elder 			struct rbd_img_request	*img_request;
245c5b5ef6cSAlex Elder 			u64			img_offset;
246c5b5ef6cSAlex Elder 			/* links for img_request->obj_requests list */
247c5b5ef6cSAlex Elder 			struct list_head	links;
248c5b5ef6cSAlex Elder 		};
249c5b5ef6cSAlex Elder 	};
250bf0d5f50SAlex Elder 	u32			which;		/* posn image request list */
251bf0d5f50SAlex Elder 
252bf0d5f50SAlex Elder 	enum obj_request_type	type;
253788e2df3SAlex Elder 	union {
254bf0d5f50SAlex Elder 		struct bio	*bio_list;
255788e2df3SAlex Elder 		struct {
256788e2df3SAlex Elder 			struct page	**pages;
257788e2df3SAlex Elder 			u32		page_count;
258788e2df3SAlex Elder 		};
259788e2df3SAlex Elder 	};
2600eefd470SAlex Elder 	struct page		**copyup_pages;
261ebda6408SAlex Elder 	u32			copyup_page_count;
262bf0d5f50SAlex Elder 
263bf0d5f50SAlex Elder 	struct ceph_osd_request	*osd_req;
264bf0d5f50SAlex Elder 
265bf0d5f50SAlex Elder 	u64			xferred;	/* bytes transferred */
2661b83bef2SSage Weil 	int			result;
267bf0d5f50SAlex Elder 
268bf0d5f50SAlex Elder 	rbd_obj_callback_t	callback;
269788e2df3SAlex Elder 	struct completion	completion;
270bf0d5f50SAlex Elder 
271bf0d5f50SAlex Elder 	struct kref		kref;
272bf0d5f50SAlex Elder };
273bf0d5f50SAlex Elder 
/* Flags kept per image request; each entry names one flag */
enum img_req_flags {
	IMG_REQ_WRITE,		/* direction: 0 = read, 1 = write */
	IMG_REQ_CHILD,		/* initiator: 0 = block, 1 = child image */
	IMG_REQ_LAYERED,	/* ENOENT handling: 0 = normal, 1 = layered */
};
2790c425248SAlex Elder 
280bf0d5f50SAlex Elder struct rbd_img_request {
281bf0d5f50SAlex Elder 	struct rbd_device	*rbd_dev;
282bf0d5f50SAlex Elder 	u64			offset;	/* starting image byte offset */
283bf0d5f50SAlex Elder 	u64			length;	/* byte count from offset */
2840c425248SAlex Elder 	unsigned long		flags;
285bf0d5f50SAlex Elder 	union {
286bf0d5f50SAlex Elder 		u64			snap_id;	/* for reads */
2879849e986SAlex Elder 		struct ceph_snap_context *snapc;	/* for writes */
2889849e986SAlex Elder 	};
2899849e986SAlex Elder 	union {
2909849e986SAlex Elder 		struct request		*rq;		/* block request */
2919849e986SAlex Elder 		struct rbd_obj_request	*obj_request;	/* obj req initiator */
292bf0d5f50SAlex Elder 	};
2933d7efd18SAlex Elder 	struct page		**copyup_pages;
294ebda6408SAlex Elder 	u32			copyup_page_count;
295bf0d5f50SAlex Elder 	spinlock_t		completion_lock;/* protects next_completion */
296bf0d5f50SAlex Elder 	u32			next_completion;
297bf0d5f50SAlex Elder 	rbd_img_callback_t	callback;
29855f27e09SAlex Elder 	u64			xferred;/* aggregate bytes transferred */
299a5a337d4SAlex Elder 	int			result;	/* first nonzero obj_request result */
300bf0d5f50SAlex Elder 
301bf0d5f50SAlex Elder 	u32			obj_request_count;
302bf0d5f50SAlex Elder 	struct list_head	obj_requests;	/* rbd_obj_request structs */
303bf0d5f50SAlex Elder 
304bf0d5f50SAlex Elder 	struct kref		kref;
305bf0d5f50SAlex Elder };
306bf0d5f50SAlex Elder 
/*
 * Iterators over the object requests linked on an image request's
 * obj_requests list.  Note that the "safe" variant walks the list in
 * reverse order.
 */
#define for_each_obj_request(img_req, obj_req) \
	list_for_each_entry(obj_req, &(img_req)->obj_requests, links)
#define for_each_obj_request_from(img_req, obj_req) \
	list_for_each_entry_from(obj_req, &(img_req)->obj_requests, links)
#define for_each_obj_request_safe(img_req, obj_req, tmp) \
	list_for_each_entry_safe_reverse(obj_req, tmp, &(img_req)->obj_requests, links)
313bf0d5f50SAlex Elder 
314f84344f3SAlex Elder struct rbd_mapping {
31599c1f08fSAlex Elder 	u64                     size;
31634b13184SAlex Elder 	u64                     features;
317f84344f3SAlex Elder 	bool			read_only;
318f84344f3SAlex Elder };
319f84344f3SAlex Elder 
320602adf40SYehuda Sadeh /*
321602adf40SYehuda Sadeh  * a single device
322602adf40SYehuda Sadeh  */
323602adf40SYehuda Sadeh struct rbd_device {
324de71a297SAlex Elder 	int			dev_id;		/* blkdev unique id */
325602adf40SYehuda Sadeh 
326602adf40SYehuda Sadeh 	int			major;		/* blkdev assigned major */
327dd82fff1SIlya Dryomov 	int			minor;
328602adf40SYehuda Sadeh 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
329602adf40SYehuda Sadeh 
330a30b71b9SAlex Elder 	u32			image_format;	/* Either 1 or 2 */
331602adf40SYehuda Sadeh 	struct rbd_client	*rbd_client;
332602adf40SYehuda Sadeh 
333602adf40SYehuda Sadeh 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
334602adf40SYehuda Sadeh 
335b82d167bSAlex Elder 	spinlock_t		lock;		/* queue, flags, open_count */
336602adf40SYehuda Sadeh 
337602adf40SYehuda Sadeh 	struct rbd_image_header	header;
338b82d167bSAlex Elder 	unsigned long		flags;		/* possibly lock protected */
3390d7dbfceSAlex Elder 	struct rbd_spec		*spec;
340602adf40SYehuda Sadeh 
3410d7dbfceSAlex Elder 	char			*header_name;
342971f839aSAlex Elder 
3430903e875SAlex Elder 	struct ceph_file_layout	layout;
3440903e875SAlex Elder 
34559c2be1eSYehuda Sadeh 	struct ceph_osd_event   *watch_event;
346975241afSAlex Elder 	struct rbd_obj_request	*watch_request;
34759c2be1eSYehuda Sadeh 
34886b00e0dSAlex Elder 	struct rbd_spec		*parent_spec;
34986b00e0dSAlex Elder 	u64			parent_overlap;
350a2acd00eSAlex Elder 	atomic_t		parent_ref;
3512f82ee54SAlex Elder 	struct rbd_device	*parent;
35286b00e0dSAlex Elder 
353c666601aSJosh Durgin 	/* protects updating the header */
354c666601aSJosh Durgin 	struct rw_semaphore     header_rwsem;
355f84344f3SAlex Elder 
356f84344f3SAlex Elder 	struct rbd_mapping	mapping;
357602adf40SYehuda Sadeh 
358602adf40SYehuda Sadeh 	struct list_head	node;
359dfc5606dSYehuda Sadeh 
360dfc5606dSYehuda Sadeh 	/* sysfs related */
361dfc5606dSYehuda Sadeh 	struct device		dev;
362b82d167bSAlex Elder 	unsigned long		open_count;	/* protected by lock */
363dfc5606dSYehuda Sadeh };
364dfc5606dSYehuda Sadeh 
/*
 * Bits used in rbd_dev->flags.  Where atomicity matters, access is
 * protected by rbd_dev->lock.
 *
 * At present only the "removing" flag needs atomic access; it is
 * coupled with the "open_count" field.
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* the mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* the mapping is in the middle of removal */
};
3766d292906SAlex Elder 
377cfbf6377SAlex Elder static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */
378e124a82fSAlex Elder 
379602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list);    /* devices */
380e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock);
381e124a82fSAlex Elder 
382602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list);		/* clients */
383432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock);
384602adf40SYehuda Sadeh 
38578c2a44aSAlex Elder /* Slab caches for frequently-allocated structures */
38678c2a44aSAlex Elder 
3871c2a9dfeSAlex Elder static struct kmem_cache	*rbd_img_request_cache;
388868311b1SAlex Elder static struct kmem_cache	*rbd_obj_request_cache;
38978c2a44aSAlex Elder static struct kmem_cache	*rbd_segment_name_cache;
3901c2a9dfeSAlex Elder 
3919b60e70bSIlya Dryomov static int rbd_major;
392f8a22fc2SIlya Dryomov static DEFINE_IDA(rbd_dev_id_ida);
393f8a22fc2SIlya Dryomov 
3949b60e70bSIlya Dryomov /*
3959b60e70bSIlya Dryomov  * Default to false for now, as single-major requires >= 0.75 version of
3969b60e70bSIlya Dryomov  * userspace rbd utility.
3979b60e70bSIlya Dryomov  */
3989b60e70bSIlya Dryomov static bool single_major = false;
3999b60e70bSIlya Dryomov module_param(single_major, bool, S_IRUGO);
4009b60e70bSIlya Dryomov MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)");
4019b60e70bSIlya Dryomov 
4023d7efd18SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request);
4033d7efd18SAlex Elder 
404200a6a8bSAlex Elder static void rbd_dev_device_release(struct device *dev);
405dfc5606dSYehuda Sadeh 
406f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf,
407f0f8cef5SAlex Elder 		       size_t count);
408f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
409f0f8cef5SAlex Elder 			  size_t count);
4109b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
4119b60e70bSIlya Dryomov 				    size_t count);
4129b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
4139b60e70bSIlya Dryomov 				       size_t count);
4141f3ef788SAlex Elder static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping);
415a2acd00eSAlex Elder static void rbd_spec_put(struct rbd_spec *spec);
416f0f8cef5SAlex Elder 
4179b60e70bSIlya Dryomov static int rbd_dev_id_to_minor(int dev_id)
4189b60e70bSIlya Dryomov {
4197e513d43SIlya Dryomov 	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
4209b60e70bSIlya Dryomov }
4219b60e70bSIlya Dryomov 
4229b60e70bSIlya Dryomov static int minor_to_rbd_dev_id(int minor)
4239b60e70bSIlya Dryomov {
4247e513d43SIlya Dryomov 	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
4259b60e70bSIlya Dryomov }
4269b60e70bSIlya Dryomov 
427b15a21ddSGreg Kroah-Hartman static BUS_ATTR(add, S_IWUSR, NULL, rbd_add);
428b15a21ddSGreg Kroah-Hartman static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove);
4299b60e70bSIlya Dryomov static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major);
4309b60e70bSIlya Dryomov static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major);
431b15a21ddSGreg Kroah-Hartman 
432b15a21ddSGreg Kroah-Hartman static struct attribute *rbd_bus_attrs[] = {
433b15a21ddSGreg Kroah-Hartman 	&bus_attr_add.attr,
434b15a21ddSGreg Kroah-Hartman 	&bus_attr_remove.attr,
4359b60e70bSIlya Dryomov 	&bus_attr_add_single_major.attr,
4369b60e70bSIlya Dryomov 	&bus_attr_remove_single_major.attr,
437b15a21ddSGreg Kroah-Hartman 	NULL,
438f0f8cef5SAlex Elder };
43992c76dc0SIlya Dryomov 
44092c76dc0SIlya Dryomov static umode_t rbd_bus_is_visible(struct kobject *kobj,
44192c76dc0SIlya Dryomov 				  struct attribute *attr, int index)
44292c76dc0SIlya Dryomov {
4439b60e70bSIlya Dryomov 	if (!single_major &&
4449b60e70bSIlya Dryomov 	    (attr == &bus_attr_add_single_major.attr ||
4459b60e70bSIlya Dryomov 	     attr == &bus_attr_remove_single_major.attr))
4469b60e70bSIlya Dryomov 		return 0;
4479b60e70bSIlya Dryomov 
44892c76dc0SIlya Dryomov 	return attr->mode;
44992c76dc0SIlya Dryomov }
45092c76dc0SIlya Dryomov 
45192c76dc0SIlya Dryomov static const struct attribute_group rbd_bus_group = {
45292c76dc0SIlya Dryomov 	.attrs = rbd_bus_attrs,
45392c76dc0SIlya Dryomov 	.is_visible = rbd_bus_is_visible,
45492c76dc0SIlya Dryomov };
45592c76dc0SIlya Dryomov __ATTRIBUTE_GROUPS(rbd_bus);
456f0f8cef5SAlex Elder 
457f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = {
458f0f8cef5SAlex Elder 	.name		= "rbd",
459b15a21ddSGreg Kroah-Hartman 	.bus_groups	= rbd_bus_groups,
460f0f8cef5SAlex Elder };
461f0f8cef5SAlex Elder 
462f0f8cef5SAlex Elder static void rbd_root_dev_release(struct device *dev)
463f0f8cef5SAlex Elder {
464f0f8cef5SAlex Elder }
465f0f8cef5SAlex Elder 
466f0f8cef5SAlex Elder static struct device rbd_root_dev = {
467f0f8cef5SAlex Elder 	.init_name =    "rbd",
468f0f8cef5SAlex Elder 	.release =      rbd_root_dev_release,
469f0f8cef5SAlex Elder };
470f0f8cef5SAlex Elder 
47106ecc6cbSAlex Elder static __printf(2, 3)
47206ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
47306ecc6cbSAlex Elder {
47406ecc6cbSAlex Elder 	struct va_format vaf;
47506ecc6cbSAlex Elder 	va_list args;
47606ecc6cbSAlex Elder 
47706ecc6cbSAlex Elder 	va_start(args, fmt);
47806ecc6cbSAlex Elder 	vaf.fmt = fmt;
47906ecc6cbSAlex Elder 	vaf.va = &args;
48006ecc6cbSAlex Elder 
48106ecc6cbSAlex Elder 	if (!rbd_dev)
48206ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
48306ecc6cbSAlex Elder 	else if (rbd_dev->disk)
48406ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %s: %pV\n",
48506ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
48606ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_name)
48706ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: image %s: %pV\n",
48806ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
48906ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_id)
49006ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: id %s: %pV\n",
49106ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
49206ecc6cbSAlex Elder 	else	/* punt */
49306ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
49406ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev, &vaf);
49506ecc6cbSAlex Elder 	va_end(args);
49606ecc6cbSAlex Elder }
49706ecc6cbSAlex Elder 
498aafb230eSAlex Elder #ifdef RBD_DEBUG
/*
 * rbd_assert(expr) - check an invariant when RBD_DEBUG is defined.
 *
 * With RBD_DEBUG defined, a false expr logs the function name, line
 * number and the text of the expression at KERN_ERR and then calls
 * BUG().  Without RBD_DEBUG the macro expands to a no-op and expr is
 * not evaluated.
 *
 * The debug form is wrapped in do { } while (0) so that it is a
 * single statement: "if (x) rbd_assert(y); else ..." then parses the
 * same way in both configurations instead of the "else" tripping
 * over the assertion's own "if".
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
511b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
51205a46afdSAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
51305a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
5148b3e1a56SAlex Elder 
515cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev);
5162df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
5172df3fac7SAlex Elder static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev);
51854cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
51954cac61fSAlex Elder 					u64 snap_id);
5202ad3d716SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
5212ad3d716SAlex Elder 				u8 *order, u64 *snap_size);
5222ad3d716SAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
5232ad3d716SAlex Elder 		u64 *snap_features);
5242ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name);
52559c2be1eSYehuda Sadeh 
526602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
527602adf40SYehuda Sadeh {
528f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
529b82d167bSAlex Elder 	bool removing = false;
530602adf40SYehuda Sadeh 
531f84344f3SAlex Elder 	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
532602adf40SYehuda Sadeh 		return -EROFS;
533602adf40SYehuda Sadeh 
534a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
535b82d167bSAlex Elder 	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
536b82d167bSAlex Elder 		removing = true;
537b82d167bSAlex Elder 	else
538b82d167bSAlex Elder 		rbd_dev->open_count++;
539a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
540b82d167bSAlex Elder 	if (removing)
541b82d167bSAlex Elder 		return -ENOENT;
542b82d167bSAlex Elder 
543c3e946ceSAlex Elder 	(void) get_device(&rbd_dev->dev);
544f84344f3SAlex Elder 	set_device_ro(bdev, rbd_dev->mapping.read_only);
545340c7a2bSAlex Elder 
546602adf40SYehuda Sadeh 	return 0;
547602adf40SYehuda Sadeh }
548602adf40SYehuda Sadeh 
549db2a144bSAl Viro static void rbd_release(struct gendisk *disk, fmode_t mode)
550dfc5606dSYehuda Sadeh {
551dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
552b82d167bSAlex Elder 	unsigned long open_count_before;
553b82d167bSAlex Elder 
554a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
555b82d167bSAlex Elder 	open_count_before = rbd_dev->open_count--;
556a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
557b82d167bSAlex Elder 	rbd_assert(open_count_before > 0);
558dfc5606dSYehuda Sadeh 
559c3e946ceSAlex Elder 	put_device(&rbd_dev->dev);
560dfc5606dSYehuda Sadeh }
561dfc5606dSYehuda Sadeh 
562*131fd9f6SGuangliang Zhao static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
563*131fd9f6SGuangliang Zhao {
564*131fd9f6SGuangliang Zhao 	int val;
565*131fd9f6SGuangliang Zhao 	bool ro;
566*131fd9f6SGuangliang Zhao 
567*131fd9f6SGuangliang Zhao 	if (get_user(val, (int __user *)(arg)))
568*131fd9f6SGuangliang Zhao 		return -EFAULT;
569*131fd9f6SGuangliang Zhao 
570*131fd9f6SGuangliang Zhao 	ro = val ? true : false;
571*131fd9f6SGuangliang Zhao 	/* Snapshot doesn't allow to write*/
572*131fd9f6SGuangliang Zhao 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro)
573*131fd9f6SGuangliang Zhao 		return -EROFS;
574*131fd9f6SGuangliang Zhao 
575*131fd9f6SGuangliang Zhao 	if (rbd_dev->mapping.read_only != ro) {
576*131fd9f6SGuangliang Zhao 		rbd_dev->mapping.read_only = ro;
577*131fd9f6SGuangliang Zhao 		set_disk_ro(rbd_dev->disk, ro ? 1 : 0);
578*131fd9f6SGuangliang Zhao 	}
579*131fd9f6SGuangliang Zhao 
580*131fd9f6SGuangliang Zhao 	return 0;
581*131fd9f6SGuangliang Zhao }
582*131fd9f6SGuangliang Zhao 
583*131fd9f6SGuangliang Zhao static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
584*131fd9f6SGuangliang Zhao 			unsigned int cmd, unsigned long arg)
585*131fd9f6SGuangliang Zhao {
586*131fd9f6SGuangliang Zhao 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
587*131fd9f6SGuangliang Zhao 	int ret = 0;
588*131fd9f6SGuangliang Zhao 
589*131fd9f6SGuangliang Zhao 	spin_lock_irq(&rbd_dev->lock);
590*131fd9f6SGuangliang Zhao 	/* prevent others open this device */
591*131fd9f6SGuangliang Zhao 	if (rbd_dev->open_count > 1) {
592*131fd9f6SGuangliang Zhao 		ret = -EBUSY;
593*131fd9f6SGuangliang Zhao 		goto out;
594*131fd9f6SGuangliang Zhao 	}
595*131fd9f6SGuangliang Zhao 
596*131fd9f6SGuangliang Zhao 	switch (cmd) {
597*131fd9f6SGuangliang Zhao 	case BLKROSET:
598*131fd9f6SGuangliang Zhao 		ret = rbd_ioctl_set_ro(rbd_dev, arg);
599*131fd9f6SGuangliang Zhao 		break;
600*131fd9f6SGuangliang Zhao 	default:
601*131fd9f6SGuangliang Zhao 		ret = -ENOTTY;
602*131fd9f6SGuangliang Zhao 	}
603*131fd9f6SGuangliang Zhao 
604*131fd9f6SGuangliang Zhao out:
605*131fd9f6SGuangliang Zhao 	spin_unlock_irq(&rbd_dev->lock);
606*131fd9f6SGuangliang Zhao 	return ret;
607*131fd9f6SGuangliang Zhao }
608*131fd9f6SGuangliang Zhao 
609*131fd9f6SGuangliang Zhao #ifdef CONFIG_COMPAT
#ifdef CONFIG_COMPAT
/*
 * Compat ioctl entry point: hands every argument to rbd_ioctl()
 * unchanged and returns its result.
 */
static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
				unsigned int cmd, unsigned long arg)
{
	int ret = rbd_ioctl(bdev, mode, cmd, arg);

	return ret;
}
#endif /* CONFIG_COMPAT */
617602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = {
618602adf40SYehuda Sadeh 	.owner			= THIS_MODULE,
619602adf40SYehuda Sadeh 	.open			= rbd_open,
620dfc5606dSYehuda Sadeh 	.release		= rbd_release,
621*131fd9f6SGuangliang Zhao 	.ioctl			= rbd_ioctl,
622*131fd9f6SGuangliang Zhao #ifdef CONFIG_COMPAT
623*131fd9f6SGuangliang Zhao 	.compat_ioctl		= rbd_compat_ioctl,
624*131fd9f6SGuangliang Zhao #endif
625602adf40SYehuda Sadeh };
626602adf40SYehuda Sadeh 
627602adf40SYehuda Sadeh /*
6287262cfcaSAlex Elder  * Initialize an rbd client instance.  Success or not, this function
629cfbf6377SAlex Elder  * consumes ceph_opts.  Caller holds client_mutex.
630602adf40SYehuda Sadeh  */
631f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
632602adf40SYehuda Sadeh {
633602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
634602adf40SYehuda Sadeh 	int ret = -ENOMEM;
635602adf40SYehuda Sadeh 
63637206ee5SAlex Elder 	dout("%s:\n", __func__);
637602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
638602adf40SYehuda Sadeh 	if (!rbdc)
639602adf40SYehuda Sadeh 		goto out_opt;
640602adf40SYehuda Sadeh 
641602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
642602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
643602adf40SYehuda Sadeh 
64443ae4701SAlex Elder 	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
645602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
64608f75463SAlex Elder 		goto out_rbdc;
64743ae4701SAlex Elder 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
648602adf40SYehuda Sadeh 
649602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
650602adf40SYehuda Sadeh 	if (ret < 0)
65108f75463SAlex Elder 		goto out_client;
652602adf40SYehuda Sadeh 
653432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
654602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
655432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
656602adf40SYehuda Sadeh 
65737206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
658bc534d86SAlex Elder 
659602adf40SYehuda Sadeh 	return rbdc;
66008f75463SAlex Elder out_client:
661602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
66208f75463SAlex Elder out_rbdc:
663602adf40SYehuda Sadeh 	kfree(rbdc);
664602adf40SYehuda Sadeh out_opt:
66543ae4701SAlex Elder 	if (ceph_opts)
66643ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
66737206ee5SAlex Elder 	dout("%s: error %d\n", __func__, ret);
66837206ee5SAlex Elder 
66928f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
670602adf40SYehuda Sadeh }
671602adf40SYehuda Sadeh 
6722f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
6732f82ee54SAlex Elder {
6742f82ee54SAlex Elder 	kref_get(&rbdc->kref);
6752f82ee54SAlex Elder 
6762f82ee54SAlex Elder 	return rbdc;
6772f82ee54SAlex Elder }
6782f82ee54SAlex Elder 
679602adf40SYehuda Sadeh /*
6801f7ba331SAlex Elder  * Find a ceph client with specific addr and configuration.  If
6811f7ba331SAlex Elder  * found, bump its reference count.
682602adf40SYehuda Sadeh  */
6831f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
684602adf40SYehuda Sadeh {
685602adf40SYehuda Sadeh 	struct rbd_client *client_node;
6861f7ba331SAlex Elder 	bool found = false;
687602adf40SYehuda Sadeh 
68843ae4701SAlex Elder 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
689602adf40SYehuda Sadeh 		return NULL;
690602adf40SYehuda Sadeh 
6911f7ba331SAlex Elder 	spin_lock(&rbd_client_list_lock);
6921f7ba331SAlex Elder 	list_for_each_entry(client_node, &rbd_client_list, node) {
6931f7ba331SAlex Elder 		if (!ceph_compare_options(ceph_opts, client_node->client)) {
6942f82ee54SAlex Elder 			__rbd_get_client(client_node);
6952f82ee54SAlex Elder 
6961f7ba331SAlex Elder 			found = true;
6971f7ba331SAlex Elder 			break;
6981f7ba331SAlex Elder 		}
6991f7ba331SAlex Elder 	}
7001f7ba331SAlex Elder 	spin_unlock(&rbd_client_list_lock);
7011f7ba331SAlex Elder 
7021f7ba331SAlex Elder 	return found ? client_node : NULL;
703602adf40SYehuda Sadeh }
704602adf40SYehuda Sadeh 
/*
 * Mount option tokens.
 *
 * The Opt_last_* entries are range markers, not options: tokens are
 * grouped by argument kind (int, string, Boolean) and
 * parse_rbd_opts_token() classifies a token by comparing it against
 * these markers.  No int or string options are defined at present.
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};
71859c2be1eSYehuda Sadeh 
71943ae4701SAlex Elder static match_table_t rbd_opts_tokens = {
72059c2be1eSYehuda Sadeh 	/* int args above */
72159c2be1eSYehuda Sadeh 	/* string args above */
722be466c1cSAlex Elder 	{Opt_read_only, "read_only"},
723cc0538b6SAlex Elder 	{Opt_read_only, "ro"},		/* Alternate spelling */
724cc0538b6SAlex Elder 	{Opt_read_write, "read_write"},
725cc0538b6SAlex Elder 	{Opt_read_write, "rw"},		/* Alternate spelling */
726cc0538b6SAlex Elder 	/* Boolean args above */
72759c2be1eSYehuda Sadeh 	{-1, NULL}
72859c2be1eSYehuda Sadeh };
72959c2be1eSYehuda Sadeh 
73098571b5aSAlex Elder struct rbd_options {
73198571b5aSAlex Elder 	bool	read_only;
73298571b5aSAlex Elder };
73398571b5aSAlex Elder 
73498571b5aSAlex Elder #define RBD_READ_ONLY_DEFAULT	false
73598571b5aSAlex Elder 
73659c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
73759c2be1eSYehuda Sadeh {
73843ae4701SAlex Elder 	struct rbd_options *rbd_opts = private;
73959c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
74059c2be1eSYehuda Sadeh 	int token, intval, ret;
74159c2be1eSYehuda Sadeh 
74243ae4701SAlex Elder 	token = match_token(c, rbd_opts_tokens, argstr);
74359c2be1eSYehuda Sadeh 	if (token < 0)
74459c2be1eSYehuda Sadeh 		return -EINVAL;
74559c2be1eSYehuda Sadeh 
74659c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
74759c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
74859c2be1eSYehuda Sadeh 		if (ret < 0) {
74959c2be1eSYehuda Sadeh 			pr_err("bad mount option arg (not int) "
75059c2be1eSYehuda Sadeh 			       "at '%s'\n", c);
75159c2be1eSYehuda Sadeh 			return ret;
75259c2be1eSYehuda Sadeh 		}
75359c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
75459c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
75559c2be1eSYehuda Sadeh 		dout("got string token %d val %s\n", token,
75659c2be1eSYehuda Sadeh 		     argstr[0].from);
757cc0538b6SAlex Elder 	} else if (token > Opt_last_string && token < Opt_last_bool) {
758cc0538b6SAlex Elder 		dout("got Boolean token %d\n", token);
75959c2be1eSYehuda Sadeh 	} else {
76059c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
76159c2be1eSYehuda Sadeh 	}
76259c2be1eSYehuda Sadeh 
76359c2be1eSYehuda Sadeh 	switch (token) {
764cc0538b6SAlex Elder 	case Opt_read_only:
765cc0538b6SAlex Elder 		rbd_opts->read_only = true;
766cc0538b6SAlex Elder 		break;
767cc0538b6SAlex Elder 	case Opt_read_write:
768cc0538b6SAlex Elder 		rbd_opts->read_only = false;
769cc0538b6SAlex Elder 		break;
77059c2be1eSYehuda Sadeh 	default:
771aafb230eSAlex Elder 		rbd_assert(false);
772aafb230eSAlex Elder 		break;
77359c2be1eSYehuda Sadeh 	}
77459c2be1eSYehuda Sadeh 	return 0;
77559c2be1eSYehuda Sadeh }
77659c2be1eSYehuda Sadeh 
77759c2be1eSYehuda Sadeh /*
778602adf40SYehuda Sadeh  * Get a ceph client with specific addr and configuration, if one does
7797262cfcaSAlex Elder  * not exist create it.  Either way, ceph_opts is consumed by this
7807262cfcaSAlex Elder  * function.
781602adf40SYehuda Sadeh  */
7829d3997fdSAlex Elder static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
783602adf40SYehuda Sadeh {
784f8c38929SAlex Elder 	struct rbd_client *rbdc;
78559c2be1eSYehuda Sadeh 
786cfbf6377SAlex Elder 	mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
7871f7ba331SAlex Elder 	rbdc = rbd_client_find(ceph_opts);
7889d3997fdSAlex Elder 	if (rbdc)	/* using an existing client */
78943ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
7909d3997fdSAlex Elder 	else
791f8c38929SAlex Elder 		rbdc = rbd_client_create(ceph_opts);
792cfbf6377SAlex Elder 	mutex_unlock(&client_mutex);
793d720bcb0SAlex Elder 
7949d3997fdSAlex Elder 	return rbdc;
795602adf40SYehuda Sadeh }
796602adf40SYehuda Sadeh 
797602adf40SYehuda Sadeh /*
798602adf40SYehuda Sadeh  * Destroy ceph client
799d23a4b3fSAlex Elder  *
800432b8587SAlex Elder  * Caller must hold rbd_client_list_lock.
801602adf40SYehuda Sadeh  */
802602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
803602adf40SYehuda Sadeh {
804602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
805602adf40SYehuda Sadeh 
80637206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
807cd9d9f5dSAlex Elder 	spin_lock(&rbd_client_list_lock);
808602adf40SYehuda Sadeh 	list_del(&rbdc->node);
809cd9d9f5dSAlex Elder 	spin_unlock(&rbd_client_list_lock);
810602adf40SYehuda Sadeh 
811602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
812602adf40SYehuda Sadeh 	kfree(rbdc);
813602adf40SYehuda Sadeh }
814602adf40SYehuda Sadeh 
815602adf40SYehuda Sadeh /*
816602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
817602adf40SYehuda Sadeh  * it.
818602adf40SYehuda Sadeh  */
8199d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc)
820602adf40SYehuda Sadeh {
821c53d5893SAlex Elder 	if (rbdc)
8229d3997fdSAlex Elder 		kref_put(&rbdc->kref, rbd_client_release);
823602adf40SYehuda Sadeh }
824602adf40SYehuda Sadeh 
825a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format)
826a30b71b9SAlex Elder {
827a30b71b9SAlex Elder 	return image_format == 1 || image_format == 2;
828a30b71b9SAlex Elder }
829a30b71b9SAlex Elder 
8308e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
8318e94af8eSAlex Elder {
832103a150fSAlex Elder 	size_t size;
833103a150fSAlex Elder 	u32 snap_count;
834103a150fSAlex Elder 
835103a150fSAlex Elder 	/* The header has to start with the magic rbd header text */
836103a150fSAlex Elder 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
837103a150fSAlex Elder 		return false;
838103a150fSAlex Elder 
839db2388b6SAlex Elder 	/* The bio layer requires at least sector-sized I/O */
840db2388b6SAlex Elder 
841db2388b6SAlex Elder 	if (ondisk->options.order < SECTOR_SHIFT)
842db2388b6SAlex Elder 		return false;
843db2388b6SAlex Elder 
844db2388b6SAlex Elder 	/* If we use u64 in a few spots we may be able to loosen this */
845db2388b6SAlex Elder 
846db2388b6SAlex Elder 	if (ondisk->options.order > 8 * sizeof (int) - 1)
847db2388b6SAlex Elder 		return false;
848db2388b6SAlex Elder 
849103a150fSAlex Elder 	/*
850103a150fSAlex Elder 	 * The size of a snapshot header has to fit in a size_t, and
851103a150fSAlex Elder 	 * that limits the number of snapshots.
852103a150fSAlex Elder 	 */
853103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
854103a150fSAlex Elder 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
855103a150fSAlex Elder 	if (snap_count > size / sizeof (__le64))
856103a150fSAlex Elder 		return false;
857103a150fSAlex Elder 
858103a150fSAlex Elder 	/*
859103a150fSAlex Elder 	 * Not only that, but the size of the entire the snapshot
860103a150fSAlex Elder 	 * header must also be representable in a size_t.
861103a150fSAlex Elder 	 */
862103a150fSAlex Elder 	size -= snap_count * sizeof (__le64);
863103a150fSAlex Elder 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
864103a150fSAlex Elder 		return false;
865103a150fSAlex Elder 
866103a150fSAlex Elder 	return true;
8678e94af8eSAlex Elder }
8688e94af8eSAlex Elder 
869602adf40SYehuda Sadeh /*
870bb23e37aSAlex Elder  * Fill an rbd image header with information from the given format 1
871bb23e37aSAlex Elder  * on-disk header.
872602adf40SYehuda Sadeh  */
873662518b1SAlex Elder static int rbd_header_from_disk(struct rbd_device *rbd_dev,
8744156d998SAlex Elder 				 struct rbd_image_header_ondisk *ondisk)
875602adf40SYehuda Sadeh {
876662518b1SAlex Elder 	struct rbd_image_header *header = &rbd_dev->header;
877bb23e37aSAlex Elder 	bool first_time = header->object_prefix == NULL;
878bb23e37aSAlex Elder 	struct ceph_snap_context *snapc;
879bb23e37aSAlex Elder 	char *object_prefix = NULL;
880bb23e37aSAlex Elder 	char *snap_names = NULL;
881bb23e37aSAlex Elder 	u64 *snap_sizes = NULL;
882ccece235SAlex Elder 	u32 snap_count;
883d2bb24e5SAlex Elder 	size_t size;
884bb23e37aSAlex Elder 	int ret = -ENOMEM;
885621901d6SAlex Elder 	u32 i;
886602adf40SYehuda Sadeh 
887bb23e37aSAlex Elder 	/* Allocate this now to avoid having to handle failure below */
888103a150fSAlex Elder 
889bb23e37aSAlex Elder 	if (first_time) {
890bb23e37aSAlex Elder 		size_t len;
891bb23e37aSAlex Elder 
892bb23e37aSAlex Elder 		len = strnlen(ondisk->object_prefix,
893bb23e37aSAlex Elder 				sizeof (ondisk->object_prefix));
894bb23e37aSAlex Elder 		object_prefix = kmalloc(len + 1, GFP_KERNEL);
895bb23e37aSAlex Elder 		if (!object_prefix)
896602adf40SYehuda Sadeh 			return -ENOMEM;
897bb23e37aSAlex Elder 		memcpy(object_prefix, ondisk->object_prefix, len);
898bb23e37aSAlex Elder 		object_prefix[len] = '\0';
899bb23e37aSAlex Elder 	}
90000f1f36fSAlex Elder 
901bb23e37aSAlex Elder 	/* Allocate the snapshot context and fill it in */
902d2bb24e5SAlex Elder 
903602adf40SYehuda Sadeh 	snap_count = le32_to_cpu(ondisk->snap_count);
904bb23e37aSAlex Elder 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
905bb23e37aSAlex Elder 	if (!snapc)
906bb23e37aSAlex Elder 		goto out_err;
907bb23e37aSAlex Elder 	snapc->seq = le64_to_cpu(ondisk->snap_seq);
908602adf40SYehuda Sadeh 	if (snap_count) {
909bb23e37aSAlex Elder 		struct rbd_image_snap_ondisk *snaps;
910f785cc1dSAlex Elder 		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
911f785cc1dSAlex Elder 
912bb23e37aSAlex Elder 		/* We'll keep a copy of the snapshot names... */
913621901d6SAlex Elder 
914f785cc1dSAlex Elder 		if (snap_names_len > (u64)SIZE_MAX)
915bb23e37aSAlex Elder 			goto out_2big;
916bb23e37aSAlex Elder 		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
917bb23e37aSAlex Elder 		if (!snap_names)
918602adf40SYehuda Sadeh 			goto out_err;
919bb23e37aSAlex Elder 
920bb23e37aSAlex Elder 		/* ...as well as the array of their sizes. */
921bb23e37aSAlex Elder 
922bb23e37aSAlex Elder 		size = snap_count * sizeof (*header->snap_sizes);
923bb23e37aSAlex Elder 		snap_sizes = kmalloc(size, GFP_KERNEL);
924bb23e37aSAlex Elder 		if (!snap_sizes)
925bb23e37aSAlex Elder 			goto out_err;
926bb23e37aSAlex Elder 
927f785cc1dSAlex Elder 		/*
928bb23e37aSAlex Elder 		 * Copy the names, and fill in each snapshot's id
929bb23e37aSAlex Elder 		 * and size.
930bb23e37aSAlex Elder 		 *
93199a41ebcSAlex Elder 		 * Note that rbd_dev_v1_header_info() guarantees the
932bb23e37aSAlex Elder 		 * ondisk buffer we're working with has
933f785cc1dSAlex Elder 		 * snap_names_len bytes beyond the end of the
934f785cc1dSAlex Elder 		 * snapshot id array, this memcpy() is safe.
935f785cc1dSAlex Elder 		 */
936bb23e37aSAlex Elder 		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
937bb23e37aSAlex Elder 		snaps = ondisk->snaps;
938bb23e37aSAlex Elder 		for (i = 0; i < snap_count; i++) {
939bb23e37aSAlex Elder 			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
940bb23e37aSAlex Elder 			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
941bb23e37aSAlex Elder 		}
942602adf40SYehuda Sadeh 	}
943849b4260SAlex Elder 
944bb23e37aSAlex Elder 	/* We won't fail any more, fill in the header */
945bb23e37aSAlex Elder 
946bb23e37aSAlex Elder 	if (first_time) {
947bb23e37aSAlex Elder 		header->object_prefix = object_prefix;
948602adf40SYehuda Sadeh 		header->obj_order = ondisk->options.order;
949602adf40SYehuda Sadeh 		header->crypt_type = ondisk->options.crypt_type;
950602adf40SYehuda Sadeh 		header->comp_type = ondisk->options.comp_type;
951bb23e37aSAlex Elder 		/* The rest aren't used for format 1 images */
952bb23e37aSAlex Elder 		header->stripe_unit = 0;
953bb23e37aSAlex Elder 		header->stripe_count = 0;
954bb23e37aSAlex Elder 		header->features = 0;
955662518b1SAlex Elder 	} else {
956662518b1SAlex Elder 		ceph_put_snap_context(header->snapc);
957662518b1SAlex Elder 		kfree(header->snap_names);
958662518b1SAlex Elder 		kfree(header->snap_sizes);
959bb23e37aSAlex Elder 	}
9606a52325fSAlex Elder 
961bb23e37aSAlex Elder 	/* The remaining fields always get updated (when we refresh) */
962621901d6SAlex Elder 
963f84344f3SAlex Elder 	header->image_size = le64_to_cpu(ondisk->image_size);
964bb23e37aSAlex Elder 	header->snapc = snapc;
965bb23e37aSAlex Elder 	header->snap_names = snap_names;
966bb23e37aSAlex Elder 	header->snap_sizes = snap_sizes;
967468521c1SAlex Elder 
968662518b1SAlex Elder 	/* Make sure mapping size is consistent with header info */
969662518b1SAlex Elder 
970662518b1SAlex Elder 	if (rbd_dev->spec->snap_id == CEPH_NOSNAP || first_time)
971662518b1SAlex Elder 		if (rbd_dev->mapping.size != header->image_size)
972662518b1SAlex Elder 			rbd_dev->mapping.size = header->image_size;
973662518b1SAlex Elder 
974602adf40SYehuda Sadeh 	return 0;
975bb23e37aSAlex Elder out_2big:
976bb23e37aSAlex Elder 	ret = -EIO;
9776a52325fSAlex Elder out_err:
978bb23e37aSAlex Elder 	kfree(snap_sizes);
979bb23e37aSAlex Elder 	kfree(snap_names);
980bb23e37aSAlex Elder 	ceph_put_snap_context(snapc);
981bb23e37aSAlex Elder 	kfree(object_prefix);
982ccece235SAlex Elder 
983bb23e37aSAlex Elder 	return ret;
984602adf40SYehuda Sadeh }
985602adf40SYehuda Sadeh 
9869682fc6dSAlex Elder static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
9879682fc6dSAlex Elder {
9889682fc6dSAlex Elder 	const char *snap_name;
9899682fc6dSAlex Elder 
9909682fc6dSAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
9919682fc6dSAlex Elder 
9929682fc6dSAlex Elder 	/* Skip over names until we find the one we are looking for */
9939682fc6dSAlex Elder 
9949682fc6dSAlex Elder 	snap_name = rbd_dev->header.snap_names;
9959682fc6dSAlex Elder 	while (which--)
9969682fc6dSAlex Elder 		snap_name += strlen(snap_name) + 1;
9979682fc6dSAlex Elder 
9989682fc6dSAlex Elder 	return kstrdup(snap_name, GFP_KERNEL);
9999682fc6dSAlex Elder }
10009682fc6dSAlex Elder 
100130d1cff8SAlex Elder /*
100230d1cff8SAlex Elder  * Snapshot id comparison function for use with qsort()/bsearch().
100330d1cff8SAlex Elder  * Note that result is for snapshots in *descending* order.
100430d1cff8SAlex Elder  */
100530d1cff8SAlex Elder static int snapid_compare_reverse(const void *s1, const void *s2)
100630d1cff8SAlex Elder {
100730d1cff8SAlex Elder 	u64 snap_id1 = *(u64 *)s1;
100830d1cff8SAlex Elder 	u64 snap_id2 = *(u64 *)s2;
100930d1cff8SAlex Elder 
101030d1cff8SAlex Elder 	if (snap_id1 < snap_id2)
101130d1cff8SAlex Elder 		return 1;
101230d1cff8SAlex Elder 	return snap_id1 == snap_id2 ? 0 : -1;
101330d1cff8SAlex Elder }
101430d1cff8SAlex Elder 
101530d1cff8SAlex Elder /*
101630d1cff8SAlex Elder  * Search a snapshot context to see if the given snapshot id is
101730d1cff8SAlex Elder  * present.
101830d1cff8SAlex Elder  *
101930d1cff8SAlex Elder  * Returns the position of the snapshot id in the array if it's found,
102030d1cff8SAlex Elder  * or BAD_SNAP_INDEX otherwise.
102130d1cff8SAlex Elder  *
102230d1cff8SAlex Elder  * Note: The snapshot array is in kept sorted (by the osd) in
102330d1cff8SAlex Elder  * reverse order, highest snapshot id first.
102430d1cff8SAlex Elder  */
10259682fc6dSAlex Elder static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
10269682fc6dSAlex Elder {
10279682fc6dSAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
102830d1cff8SAlex Elder 	u64 *found;
10299682fc6dSAlex Elder 
103030d1cff8SAlex Elder 	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
103130d1cff8SAlex Elder 				sizeof (snap_id), snapid_compare_reverse);
10329682fc6dSAlex Elder 
103330d1cff8SAlex Elder 	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
10349682fc6dSAlex Elder }
10359682fc6dSAlex Elder 
10362ad3d716SAlex Elder static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
10372ad3d716SAlex Elder 					u64 snap_id)
103854cac61fSAlex Elder {
103954cac61fSAlex Elder 	u32 which;
1040da6a6b63SJosh Durgin 	const char *snap_name;
104154cac61fSAlex Elder 
104254cac61fSAlex Elder 	which = rbd_dev_snap_index(rbd_dev, snap_id);
104354cac61fSAlex Elder 	if (which == BAD_SNAP_INDEX)
1044da6a6b63SJosh Durgin 		return ERR_PTR(-ENOENT);
104554cac61fSAlex Elder 
1046da6a6b63SJosh Durgin 	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
1047da6a6b63SJosh Durgin 	return snap_name ? snap_name : ERR_PTR(-ENOMEM);
104854cac61fSAlex Elder }
104954cac61fSAlex Elder 
10509e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
10519e15b77dSAlex Elder {
10529e15b77dSAlex Elder 	if (snap_id == CEPH_NOSNAP)
10539e15b77dSAlex Elder 		return RBD_SNAP_HEAD_NAME;
10549e15b77dSAlex Elder 
105554cac61fSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
105654cac61fSAlex Elder 	if (rbd_dev->image_format == 1)
105754cac61fSAlex Elder 		return rbd_dev_v1_snap_name(rbd_dev, snap_id);
10589e15b77dSAlex Elder 
105954cac61fSAlex Elder 	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
10609e15b77dSAlex Elder }
10619e15b77dSAlex Elder 
10622ad3d716SAlex Elder static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
10632ad3d716SAlex Elder 				u64 *snap_size)
1064602adf40SYehuda Sadeh {
10652ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
10662ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
10672ad3d716SAlex Elder 		*snap_size = rbd_dev->header.image_size;
10682ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
10692ad3d716SAlex Elder 		u32 which;
107000f1f36fSAlex Elder 
10712ad3d716SAlex Elder 		which = rbd_dev_snap_index(rbd_dev, snap_id);
10722ad3d716SAlex Elder 		if (which == BAD_SNAP_INDEX)
10732ad3d716SAlex Elder 			return -ENOENT;
107400f1f36fSAlex Elder 
10752ad3d716SAlex Elder 		*snap_size = rbd_dev->header.snap_sizes[which];
10762ad3d716SAlex Elder 	} else {
10772ad3d716SAlex Elder 		u64 size = 0;
10782ad3d716SAlex Elder 		int ret;
10792ad3d716SAlex Elder 
10802ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
10812ad3d716SAlex Elder 		if (ret)
10822ad3d716SAlex Elder 			return ret;
10832ad3d716SAlex Elder 
10842ad3d716SAlex Elder 		*snap_size = size;
10852ad3d716SAlex Elder 	}
10862ad3d716SAlex Elder 	return 0;
10872ad3d716SAlex Elder }
10882ad3d716SAlex Elder 
10892ad3d716SAlex Elder static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
10902ad3d716SAlex Elder 			u64 *snap_features)
10912ad3d716SAlex Elder {
10922ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
10932ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
10942ad3d716SAlex Elder 		*snap_features = rbd_dev->header.features;
10952ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
10962ad3d716SAlex Elder 		*snap_features = 0;	/* No features for format 1 */
10972ad3d716SAlex Elder 	} else {
10982ad3d716SAlex Elder 		u64 features = 0;
10992ad3d716SAlex Elder 		int ret;
11002ad3d716SAlex Elder 
11012ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
11022ad3d716SAlex Elder 		if (ret)
11032ad3d716SAlex Elder 			return ret;
11042ad3d716SAlex Elder 
11052ad3d716SAlex Elder 		*snap_features = features;
11062ad3d716SAlex Elder 	}
11072ad3d716SAlex Elder 	return 0;
110800f1f36fSAlex Elder }
1109602adf40SYehuda Sadeh 
1110d1cf5788SAlex Elder static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
1111602adf40SYehuda Sadeh {
11128f4b7d98SAlex Elder 	u64 snap_id = rbd_dev->spec->snap_id;
11132ad3d716SAlex Elder 	u64 size = 0;
11142ad3d716SAlex Elder 	u64 features = 0;
11152ad3d716SAlex Elder 	int ret;
11168b0241f8SAlex Elder 
11172ad3d716SAlex Elder 	ret = rbd_snap_size(rbd_dev, snap_id, &size);
11182ad3d716SAlex Elder 	if (ret)
11192ad3d716SAlex Elder 		return ret;
11202ad3d716SAlex Elder 	ret = rbd_snap_features(rbd_dev, snap_id, &features);
11212ad3d716SAlex Elder 	if (ret)
11222ad3d716SAlex Elder 		return ret;
11232ad3d716SAlex Elder 
11242ad3d716SAlex Elder 	rbd_dev->mapping.size = size;
11252ad3d716SAlex Elder 	rbd_dev->mapping.features = features;
11262ad3d716SAlex Elder 
11278b0241f8SAlex Elder 	return 0;
1128602adf40SYehuda Sadeh }
1129602adf40SYehuda Sadeh 
1130d1cf5788SAlex Elder static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
1131d1cf5788SAlex Elder {
1132d1cf5788SAlex Elder 	rbd_dev->mapping.size = 0;
1133d1cf5788SAlex Elder 	rbd_dev->mapping.features = 0;
1134200a6a8bSAlex Elder }
1135200a6a8bSAlex Elder 
113698571b5aSAlex Elder static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
1137602adf40SYehuda Sadeh {
113865ccfe21SAlex Elder 	char *name;
113965ccfe21SAlex Elder 	u64 segment;
114065ccfe21SAlex Elder 	int ret;
11413a96d5cdSJosh Durgin 	char *name_format;
1142602adf40SYehuda Sadeh 
114378c2a44aSAlex Elder 	name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO);
114465ccfe21SAlex Elder 	if (!name)
114565ccfe21SAlex Elder 		return NULL;
114665ccfe21SAlex Elder 	segment = offset >> rbd_dev->header.obj_order;
11473a96d5cdSJosh Durgin 	name_format = "%s.%012llx";
11483a96d5cdSJosh Durgin 	if (rbd_dev->image_format == 2)
11493a96d5cdSJosh Durgin 		name_format = "%s.%016llx";
11502d0ebc5dSIlya Dryomov 	ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format,
115165ccfe21SAlex Elder 			rbd_dev->header.object_prefix, segment);
11522d0ebc5dSIlya Dryomov 	if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) {
115365ccfe21SAlex Elder 		pr_err("error formatting segment name for #%llu (%d)\n",
115465ccfe21SAlex Elder 			segment, ret);
115565ccfe21SAlex Elder 		kfree(name);
115665ccfe21SAlex Elder 		name = NULL;
115765ccfe21SAlex Elder 	}
1158602adf40SYehuda Sadeh 
115965ccfe21SAlex Elder 	return name;
116065ccfe21SAlex Elder }
1161602adf40SYehuda Sadeh 
116278c2a44aSAlex Elder static void rbd_segment_name_free(const char *name)
116378c2a44aSAlex Elder {
116478c2a44aSAlex Elder 	/* The explicit cast here is needed to drop the const qualifier */
116578c2a44aSAlex Elder 
116678c2a44aSAlex Elder 	kmem_cache_free(rbd_segment_name_cache, (void *)name);
116778c2a44aSAlex Elder }
116878c2a44aSAlex Elder 
116965ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
117065ccfe21SAlex Elder {
117165ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
1172602adf40SYehuda Sadeh 
117365ccfe21SAlex Elder 	return offset & (segment_size - 1);
117465ccfe21SAlex Elder }
117565ccfe21SAlex Elder 
117665ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev,
117765ccfe21SAlex Elder 				u64 offset, u64 length)
117865ccfe21SAlex Elder {
117965ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
118065ccfe21SAlex Elder 
118165ccfe21SAlex Elder 	offset &= segment_size - 1;
118265ccfe21SAlex Elder 
1183aafb230eSAlex Elder 	rbd_assert(length <= U64_MAX - offset);
118465ccfe21SAlex Elder 	if (offset + length > segment_size)
118565ccfe21SAlex Elder 		length = segment_size - offset;
118665ccfe21SAlex Elder 
118765ccfe21SAlex Elder 	return length;
1188602adf40SYehuda Sadeh }
1189602adf40SYehuda Sadeh 
1190602adf40SYehuda Sadeh /*
1191029bcbd8SJosh Durgin  * returns the size of an object in the image
1192029bcbd8SJosh Durgin  */
1193029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header)
1194029bcbd8SJosh Durgin {
1195029bcbd8SJosh Durgin 	return 1 << header->obj_order;
1196029bcbd8SJosh Durgin }
1197029bcbd8SJosh Durgin 
1198029bcbd8SJosh Durgin /*
1199602adf40SYehuda Sadeh  * bio helpers
1200602adf40SYehuda Sadeh  */
1201602adf40SYehuda Sadeh 
1202602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain)
1203602adf40SYehuda Sadeh {
1204602adf40SYehuda Sadeh 	struct bio *tmp;
1205602adf40SYehuda Sadeh 
1206602adf40SYehuda Sadeh 	while (chain) {
1207602adf40SYehuda Sadeh 		tmp = chain;
1208602adf40SYehuda Sadeh 		chain = chain->bi_next;
1209602adf40SYehuda Sadeh 		bio_put(tmp);
1210602adf40SYehuda Sadeh 	}
1211602adf40SYehuda Sadeh }
1212602adf40SYehuda Sadeh 
1213602adf40SYehuda Sadeh /*
1214602adf40SYehuda Sadeh  * zeros a bio chain, starting at specific offset
1215602adf40SYehuda Sadeh  */
1216602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs)
1217602adf40SYehuda Sadeh {
12187988613bSKent Overstreet 	struct bio_vec bv;
12197988613bSKent Overstreet 	struct bvec_iter iter;
1220602adf40SYehuda Sadeh 	unsigned long flags;
1221602adf40SYehuda Sadeh 	void *buf;
1222602adf40SYehuda Sadeh 	int pos = 0;
1223602adf40SYehuda Sadeh 
1224602adf40SYehuda Sadeh 	while (chain) {
12257988613bSKent Overstreet 		bio_for_each_segment(bv, chain, iter) {
12267988613bSKent Overstreet 			if (pos + bv.bv_len > start_ofs) {
1227602adf40SYehuda Sadeh 				int remainder = max(start_ofs - pos, 0);
12287988613bSKent Overstreet 				buf = bvec_kmap_irq(&bv, &flags);
1229602adf40SYehuda Sadeh 				memset(buf + remainder, 0,
12307988613bSKent Overstreet 				       bv.bv_len - remainder);
12317988613bSKent Overstreet 				flush_dcache_page(bv.bv_page);
123285b5aaa6SDan Carpenter 				bvec_kunmap_irq(buf, &flags);
1233602adf40SYehuda Sadeh 			}
12347988613bSKent Overstreet 			pos += bv.bv_len;
1235602adf40SYehuda Sadeh 		}
1236602adf40SYehuda Sadeh 
1237602adf40SYehuda Sadeh 		chain = chain->bi_next;
1238602adf40SYehuda Sadeh 	}
1239602adf40SYehuda Sadeh }
1240602adf40SYehuda Sadeh 
1241602adf40SYehuda Sadeh /*
1242b9434c5bSAlex Elder  * similar to zero_bio_chain(), zeros data defined by a page array,
1243b9434c5bSAlex Elder  * starting at the given byte offset from the start of the array and
1244b9434c5bSAlex Elder  * continuing up to the given end offset.  The pages array is
1245b9434c5bSAlex Elder  * assumed to be big enough to hold all bytes up to the end.
1246b9434c5bSAlex Elder  */
1247b9434c5bSAlex Elder static void zero_pages(struct page **pages, u64 offset, u64 end)
1248b9434c5bSAlex Elder {
1249b9434c5bSAlex Elder 	struct page **page = &pages[offset >> PAGE_SHIFT];
1250b9434c5bSAlex Elder 
1251b9434c5bSAlex Elder 	rbd_assert(end > offset);
1252b9434c5bSAlex Elder 	rbd_assert(end - offset <= (u64)SIZE_MAX);
1253b9434c5bSAlex Elder 	while (offset < end) {
1254b9434c5bSAlex Elder 		size_t page_offset;
1255b9434c5bSAlex Elder 		size_t length;
1256b9434c5bSAlex Elder 		unsigned long flags;
1257b9434c5bSAlex Elder 		void *kaddr;
1258b9434c5bSAlex Elder 
1259491205a8SGeert Uytterhoeven 		page_offset = offset & ~PAGE_MASK;
1260491205a8SGeert Uytterhoeven 		length = min_t(size_t, PAGE_SIZE - page_offset, end - offset);
1261b9434c5bSAlex Elder 		local_irq_save(flags);
1262b9434c5bSAlex Elder 		kaddr = kmap_atomic(*page);
1263b9434c5bSAlex Elder 		memset(kaddr + page_offset, 0, length);
1264e2156054SAlex Elder 		flush_dcache_page(*page);
1265b9434c5bSAlex Elder 		kunmap_atomic(kaddr);
1266b9434c5bSAlex Elder 		local_irq_restore(flags);
1267b9434c5bSAlex Elder 
1268b9434c5bSAlex Elder 		offset += length;
1269b9434c5bSAlex Elder 		page++;
1270b9434c5bSAlex Elder 	}
1271b9434c5bSAlex Elder }
1272b9434c5bSAlex Elder 
1273b9434c5bSAlex Elder /*
1274f7760dadSAlex Elder  * Clone a portion of a bio, starting at the given byte offset
1275f7760dadSAlex Elder  * and continuing for the number of bytes indicated.
1276602adf40SYehuda Sadeh  */
1277f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src,
1278f7760dadSAlex Elder 					unsigned int offset,
1279f7760dadSAlex Elder 					unsigned int len,
1280f7760dadSAlex Elder 					gfp_t gfpmask)
1281602adf40SYehuda Sadeh {
1282f7760dadSAlex Elder 	struct bio *bio;
1283602adf40SYehuda Sadeh 
12845341a627SKent Overstreet 	bio = bio_clone(bio_src, gfpmask);
1285f7760dadSAlex Elder 	if (!bio)
1286f7760dadSAlex Elder 		return NULL;	/* ENOMEM */
1287f7760dadSAlex Elder 
12885341a627SKent Overstreet 	bio_advance(bio, offset);
12894f024f37SKent Overstreet 	bio->bi_iter.bi_size = len;
1290602adf40SYehuda Sadeh 
1291f7760dadSAlex Elder 	return bio;
1292602adf40SYehuda Sadeh }
1293602adf40SYehuda Sadeh 
1294f7760dadSAlex Elder /*
1295f7760dadSAlex Elder  * Clone a portion of a bio chain, starting at the given byte offset
1296f7760dadSAlex Elder  * into the first bio in the source chain and continuing for the
1297f7760dadSAlex Elder  * number of bytes indicated.  The result is another bio chain of
1298f7760dadSAlex Elder  * exactly the given length, or a null pointer on error.
1299f7760dadSAlex Elder  *
1300f7760dadSAlex Elder  * The bio_src and offset parameters are both in-out.  On entry they
1301f7760dadSAlex Elder  * refer to the first source bio and the offset into that bio where
1302f7760dadSAlex Elder  * the start of data to be cloned is located.
1303f7760dadSAlex Elder  *
1304f7760dadSAlex Elder  * On return, bio_src is updated to refer to the bio in the source
1305f7760dadSAlex Elder  * chain that contains first un-cloned byte, and *offset will
1306f7760dadSAlex Elder  * contain the offset of that byte within that bio.
1307f7760dadSAlex Elder  */
1308f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src,
1309f7760dadSAlex Elder 					unsigned int *offset,
1310f7760dadSAlex Elder 					unsigned int len,
1311f7760dadSAlex Elder 					gfp_t gfpmask)
1312f7760dadSAlex Elder {
1313f7760dadSAlex Elder 	struct bio *bi = *bio_src;
1314f7760dadSAlex Elder 	unsigned int off = *offset;
1315f7760dadSAlex Elder 	struct bio *chain = NULL;
1316f7760dadSAlex Elder 	struct bio **end;
1317602adf40SYehuda Sadeh 
1318f7760dadSAlex Elder 	/* Build up a chain of clone bios up to the limit */
1319602adf40SYehuda Sadeh 
13204f024f37SKent Overstreet 	if (!bi || off >= bi->bi_iter.bi_size || !len)
1321f7760dadSAlex Elder 		return NULL;		/* Nothing to clone */
1322602adf40SYehuda Sadeh 
1323f7760dadSAlex Elder 	end = &chain;
1324f7760dadSAlex Elder 	while (len) {
1325f7760dadSAlex Elder 		unsigned int bi_size;
1326f7760dadSAlex Elder 		struct bio *bio;
1327f7760dadSAlex Elder 
1328f5400b7aSAlex Elder 		if (!bi) {
1329f5400b7aSAlex Elder 			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
1330f7760dadSAlex Elder 			goto out_err;	/* EINVAL; ran out of bio's */
1331f5400b7aSAlex Elder 		}
13324f024f37SKent Overstreet 		bi_size = min_t(unsigned int, bi->bi_iter.bi_size - off, len);
1333f7760dadSAlex Elder 		bio = bio_clone_range(bi, off, bi_size, gfpmask);
1334f7760dadSAlex Elder 		if (!bio)
1335f7760dadSAlex Elder 			goto out_err;	/* ENOMEM */
1336f7760dadSAlex Elder 
1337f7760dadSAlex Elder 		*end = bio;
1338f7760dadSAlex Elder 		end = &bio->bi_next;
1339f7760dadSAlex Elder 
1340f7760dadSAlex Elder 		off += bi_size;
13414f024f37SKent Overstreet 		if (off == bi->bi_iter.bi_size) {
1342f7760dadSAlex Elder 			bi = bi->bi_next;
1343f7760dadSAlex Elder 			off = 0;
1344f7760dadSAlex Elder 		}
1345f7760dadSAlex Elder 		len -= bi_size;
1346f7760dadSAlex Elder 	}
1347f7760dadSAlex Elder 	*bio_src = bi;
1348f7760dadSAlex Elder 	*offset = off;
1349f7760dadSAlex Elder 
1350f7760dadSAlex Elder 	return chain;
1351f7760dadSAlex Elder out_err:
1352f7760dadSAlex Elder 	bio_chain_put(chain);
1353f7760dadSAlex Elder 
1354602adf40SYehuda Sadeh 	return NULL;
1355602adf40SYehuda Sadeh }
1356602adf40SYehuda Sadeh 
1357926f9b3fSAlex Elder /*
1358926f9b3fSAlex Elder  * The default/initial value for all object request flags is 0.  For
1359926f9b3fSAlex Elder  * each flag, once its value is set to 1 it is never reset to 0
1360926f9b3fSAlex Elder  * again.
1361926f9b3fSAlex Elder  */
13626365d33aSAlex Elder static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
13636365d33aSAlex Elder {
13646365d33aSAlex Elder 	if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
13656365d33aSAlex Elder 		struct rbd_device *rbd_dev;
13666365d33aSAlex Elder 
136757acbaa7SAlex Elder 		rbd_dev = obj_request->img_request->rbd_dev;
13686365d33aSAlex Elder 		rbd_warn(rbd_dev, "obj_request %p already marked img_data\n",
13696365d33aSAlex Elder 			obj_request);
13706365d33aSAlex Elder 	}
13716365d33aSAlex Elder }
13726365d33aSAlex Elder 
13736365d33aSAlex Elder static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
13746365d33aSAlex Elder {
13756365d33aSAlex Elder 	smp_mb();
13766365d33aSAlex Elder 	return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
13776365d33aSAlex Elder }
13786365d33aSAlex Elder 
137957acbaa7SAlex Elder static void obj_request_done_set(struct rbd_obj_request *obj_request)
138057acbaa7SAlex Elder {
138157acbaa7SAlex Elder 	if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
138257acbaa7SAlex Elder 		struct rbd_device *rbd_dev = NULL;
138357acbaa7SAlex Elder 
138457acbaa7SAlex Elder 		if (obj_request_img_data_test(obj_request))
138557acbaa7SAlex Elder 			rbd_dev = obj_request->img_request->rbd_dev;
138657acbaa7SAlex Elder 		rbd_warn(rbd_dev, "obj_request %p already marked done\n",
138757acbaa7SAlex Elder 			obj_request);
138857acbaa7SAlex Elder 	}
138957acbaa7SAlex Elder }
139057acbaa7SAlex Elder 
139157acbaa7SAlex Elder static bool obj_request_done_test(struct rbd_obj_request *obj_request)
139257acbaa7SAlex Elder {
139357acbaa7SAlex Elder 	smp_mb();
139457acbaa7SAlex Elder 	return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
139557acbaa7SAlex Elder }
139657acbaa7SAlex Elder 
13975679c59fSAlex Elder /*
13985679c59fSAlex Elder  * This sets the KNOWN flag after (possibly) setting the EXISTS
13995679c59fSAlex Elder  * flag.  The latter is set based on the "exists" value provided.
14005679c59fSAlex Elder  *
14015679c59fSAlex Elder  * Note that for our purposes once an object exists it never goes
14025679c59fSAlex Elder  * away again.  It's possible that the response from two existence
14035679c59fSAlex Elder  * checks are separated by the creation of the target object, and
14045679c59fSAlex Elder  * the first ("doesn't exist") response arrives *after* the second
14055679c59fSAlex Elder  * ("does exist").  In that case we ignore the second one.
14065679c59fSAlex Elder  */
14075679c59fSAlex Elder static void obj_request_existence_set(struct rbd_obj_request *obj_request,
14085679c59fSAlex Elder 				bool exists)
14095679c59fSAlex Elder {
14105679c59fSAlex Elder 	if (exists)
14115679c59fSAlex Elder 		set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
14125679c59fSAlex Elder 	set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
14135679c59fSAlex Elder 	smp_mb();
14145679c59fSAlex Elder }
14155679c59fSAlex Elder 
14165679c59fSAlex Elder static bool obj_request_known_test(struct rbd_obj_request *obj_request)
14175679c59fSAlex Elder {
14185679c59fSAlex Elder 	smp_mb();
14195679c59fSAlex Elder 	return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
14205679c59fSAlex Elder }
14215679c59fSAlex Elder 
14225679c59fSAlex Elder static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
14235679c59fSAlex Elder {
14245679c59fSAlex Elder 	smp_mb();
14255679c59fSAlex Elder 	return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
14265679c59fSAlex Elder }
14275679c59fSAlex Elder 
1428bf0d5f50SAlex Elder static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1429bf0d5f50SAlex Elder {
143037206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
143137206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1432bf0d5f50SAlex Elder 	kref_get(&obj_request->kref);
1433bf0d5f50SAlex Elder }
1434bf0d5f50SAlex Elder 
1435bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref);
1436bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1437bf0d5f50SAlex Elder {
1438bf0d5f50SAlex Elder 	rbd_assert(obj_request != NULL);
143937206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
144037206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1441bf0d5f50SAlex Elder 	kref_put(&obj_request->kref, rbd_obj_request_destroy);
1442bf0d5f50SAlex Elder }
1443bf0d5f50SAlex Elder 
14440f2d5be7SAlex Elder static void rbd_img_request_get(struct rbd_img_request *img_request)
14450f2d5be7SAlex Elder {
14460f2d5be7SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
14470f2d5be7SAlex Elder 	     atomic_read(&img_request->kref.refcount));
14480f2d5be7SAlex Elder 	kref_get(&img_request->kref);
14490f2d5be7SAlex Elder }
14500f2d5be7SAlex Elder 
1451e93f3152SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request);
1452e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref);
1453bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref);
1454bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request)
1455bf0d5f50SAlex Elder {
1456bf0d5f50SAlex Elder 	rbd_assert(img_request != NULL);
145737206ee5SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
145837206ee5SAlex Elder 		atomic_read(&img_request->kref.refcount));
1459e93f3152SAlex Elder 	if (img_request_child_test(img_request))
1460e93f3152SAlex Elder 		kref_put(&img_request->kref, rbd_parent_request_destroy);
1461e93f3152SAlex Elder 	else
1462bf0d5f50SAlex Elder 		kref_put(&img_request->kref, rbd_img_request_destroy);
1463bf0d5f50SAlex Elder }
1464bf0d5f50SAlex Elder 
1465bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1466bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1467bf0d5f50SAlex Elder {
146825dcf954SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
146925dcf954SAlex Elder 
1470b155e86cSAlex Elder 	/* Image request now owns object's original reference */
1471bf0d5f50SAlex Elder 	obj_request->img_request = img_request;
147225dcf954SAlex Elder 	obj_request->which = img_request->obj_request_count;
14736365d33aSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
14746365d33aSAlex Elder 	obj_request_img_data_set(obj_request);
1475bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
147625dcf954SAlex Elder 	img_request->obj_request_count++;
147725dcf954SAlex Elder 	list_add_tail(&obj_request->links, &img_request->obj_requests);
147837206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
147937206ee5SAlex Elder 		obj_request->which);
1480bf0d5f50SAlex Elder }
1481bf0d5f50SAlex Elder 
1482bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1483bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1484bf0d5f50SAlex Elder {
1485bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
148625dcf954SAlex Elder 
148737206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
148837206ee5SAlex Elder 		obj_request->which);
1489bf0d5f50SAlex Elder 	list_del(&obj_request->links);
149025dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
149125dcf954SAlex Elder 	img_request->obj_request_count--;
149225dcf954SAlex Elder 	rbd_assert(obj_request->which == img_request->obj_request_count);
149325dcf954SAlex Elder 	obj_request->which = BAD_WHICH;
14946365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
1495bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == img_request);
1496bf0d5f50SAlex Elder 	obj_request->img_request = NULL;
149725dcf954SAlex Elder 	obj_request->callback = NULL;
1498bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
1499bf0d5f50SAlex Elder }
1500bf0d5f50SAlex Elder 
1501bf0d5f50SAlex Elder static bool obj_request_type_valid(enum obj_request_type type)
1502bf0d5f50SAlex Elder {
1503bf0d5f50SAlex Elder 	switch (type) {
15049969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
1505bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1506788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1507bf0d5f50SAlex Elder 		return true;
1508bf0d5f50SAlex Elder 	default:
1509bf0d5f50SAlex Elder 		return false;
1510bf0d5f50SAlex Elder 	}
1511bf0d5f50SAlex Elder }
1512bf0d5f50SAlex Elder 
1513bf0d5f50SAlex Elder static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1514bf0d5f50SAlex Elder 				struct rbd_obj_request *obj_request)
1515bf0d5f50SAlex Elder {
151637206ee5SAlex Elder 	dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);
151737206ee5SAlex Elder 
1518bf0d5f50SAlex Elder 	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1519bf0d5f50SAlex Elder }
1520bf0d5f50SAlex Elder 
1521bf0d5f50SAlex Elder static void rbd_img_request_complete(struct rbd_img_request *img_request)
1522bf0d5f50SAlex Elder {
152355f27e09SAlex Elder 
152437206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
152555f27e09SAlex Elder 
152655f27e09SAlex Elder 	/*
152755f27e09SAlex Elder 	 * If no error occurred, compute the aggregate transfer
152855f27e09SAlex Elder 	 * count for the image request.  We could instead use
152955f27e09SAlex Elder 	 * atomic64_cmpxchg() to update it as each object request
153055f27e09SAlex Elder 	 * completes; not clear which way is better off hand.
153155f27e09SAlex Elder 	 */
153255f27e09SAlex Elder 	if (!img_request->result) {
153355f27e09SAlex Elder 		struct rbd_obj_request *obj_request;
153455f27e09SAlex Elder 		u64 xferred = 0;
153555f27e09SAlex Elder 
153655f27e09SAlex Elder 		for_each_obj_request(img_request, obj_request)
153755f27e09SAlex Elder 			xferred += obj_request->xferred;
153855f27e09SAlex Elder 		img_request->xferred = xferred;
153955f27e09SAlex Elder 	}
154055f27e09SAlex Elder 
1541bf0d5f50SAlex Elder 	if (img_request->callback)
1542bf0d5f50SAlex Elder 		img_request->callback(img_request);
1543bf0d5f50SAlex Elder 	else
1544bf0d5f50SAlex Elder 		rbd_img_request_put(img_request);
1545bf0d5f50SAlex Elder }
1546bf0d5f50SAlex Elder 
1547788e2df3SAlex Elder /* Caller is responsible for rbd_obj_request_destroy(obj_request) */
1548788e2df3SAlex Elder 
1549788e2df3SAlex Elder static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
1550788e2df3SAlex Elder {
155137206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
155237206ee5SAlex Elder 
1553788e2df3SAlex Elder 	return wait_for_completion_interruptible(&obj_request->completion);
1554788e2df3SAlex Elder }
1555788e2df3SAlex Elder 
15560c425248SAlex Elder /*
15570c425248SAlex Elder  * The default/initial value for all image request flags is 0.  Each
15580c425248SAlex Elder  * is conditionally set to 1 at image request initialization time
15590c425248SAlex Elder  * and currently never change thereafter.
15600c425248SAlex Elder  */
15610c425248SAlex Elder static void img_request_write_set(struct rbd_img_request *img_request)
15620c425248SAlex Elder {
15630c425248SAlex Elder 	set_bit(IMG_REQ_WRITE, &img_request->flags);
15640c425248SAlex Elder 	smp_mb();
15650c425248SAlex Elder }
15660c425248SAlex Elder 
15670c425248SAlex Elder static bool img_request_write_test(struct rbd_img_request *img_request)
15680c425248SAlex Elder {
15690c425248SAlex Elder 	smp_mb();
15700c425248SAlex Elder 	return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
15710c425248SAlex Elder }
15720c425248SAlex Elder 
15739849e986SAlex Elder static void img_request_child_set(struct rbd_img_request *img_request)
15749849e986SAlex Elder {
15759849e986SAlex Elder 	set_bit(IMG_REQ_CHILD, &img_request->flags);
15769849e986SAlex Elder 	smp_mb();
15779849e986SAlex Elder }
15789849e986SAlex Elder 
1579e93f3152SAlex Elder static void img_request_child_clear(struct rbd_img_request *img_request)
1580e93f3152SAlex Elder {
1581e93f3152SAlex Elder 	clear_bit(IMG_REQ_CHILD, &img_request->flags);
1582e93f3152SAlex Elder 	smp_mb();
1583e93f3152SAlex Elder }
1584e93f3152SAlex Elder 
15859849e986SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request)
15869849e986SAlex Elder {
15879849e986SAlex Elder 	smp_mb();
15889849e986SAlex Elder 	return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
15899849e986SAlex Elder }
15909849e986SAlex Elder 
1591d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request)
1592d0b2e944SAlex Elder {
1593d0b2e944SAlex Elder 	set_bit(IMG_REQ_LAYERED, &img_request->flags);
1594d0b2e944SAlex Elder 	smp_mb();
1595d0b2e944SAlex Elder }
1596d0b2e944SAlex Elder 
1597a2acd00eSAlex Elder static void img_request_layered_clear(struct rbd_img_request *img_request)
1598a2acd00eSAlex Elder {
1599a2acd00eSAlex Elder 	clear_bit(IMG_REQ_LAYERED, &img_request->flags);
1600a2acd00eSAlex Elder 	smp_mb();
1601a2acd00eSAlex Elder }
1602a2acd00eSAlex Elder 
1603d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request)
1604d0b2e944SAlex Elder {
1605d0b2e944SAlex Elder 	smp_mb();
1606d0b2e944SAlex Elder 	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1607d0b2e944SAlex Elder }
1608d0b2e944SAlex Elder 
16096e2a4505SAlex Elder static void
16106e2a4505SAlex Elder rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
16116e2a4505SAlex Elder {
1612b9434c5bSAlex Elder 	u64 xferred = obj_request->xferred;
1613b9434c5bSAlex Elder 	u64 length = obj_request->length;
1614b9434c5bSAlex Elder 
16156e2a4505SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
16166e2a4505SAlex Elder 		obj_request, obj_request->img_request, obj_request->result,
1617b9434c5bSAlex Elder 		xferred, length);
16186e2a4505SAlex Elder 	/*
161917c1cc1dSJosh Durgin 	 * ENOENT means a hole in the image.  We zero-fill the entire
162017c1cc1dSJosh Durgin 	 * length of the request.  A short read also implies zero-fill
162117c1cc1dSJosh Durgin 	 * to the end of the request.  An error requires the whole
162217c1cc1dSJosh Durgin 	 * length of the request to be reported finished with an error
162317c1cc1dSJosh Durgin 	 * to the block layer.  In each case we update the xferred
162417c1cc1dSJosh Durgin 	 * count to indicate the whole request was satisfied.
16256e2a4505SAlex Elder 	 */
1626b9434c5bSAlex Elder 	rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
16276e2a4505SAlex Elder 	if (obj_request->result == -ENOENT) {
1628b9434c5bSAlex Elder 		if (obj_request->type == OBJ_REQUEST_BIO)
16296e2a4505SAlex Elder 			zero_bio_chain(obj_request->bio_list, 0);
1630b9434c5bSAlex Elder 		else
1631b9434c5bSAlex Elder 			zero_pages(obj_request->pages, 0, length);
16326e2a4505SAlex Elder 		obj_request->result = 0;
1633b9434c5bSAlex Elder 	} else if (xferred < length && !obj_request->result) {
1634b9434c5bSAlex Elder 		if (obj_request->type == OBJ_REQUEST_BIO)
1635b9434c5bSAlex Elder 			zero_bio_chain(obj_request->bio_list, xferred);
1636b9434c5bSAlex Elder 		else
1637b9434c5bSAlex Elder 			zero_pages(obj_request->pages, xferred, length);
16386e2a4505SAlex Elder 	}
163917c1cc1dSJosh Durgin 	obj_request->xferred = length;
16406e2a4505SAlex Elder 	obj_request_done_set(obj_request);
16416e2a4505SAlex Elder }
16426e2a4505SAlex Elder 
1643bf0d5f50SAlex Elder static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1644bf0d5f50SAlex Elder {
164537206ee5SAlex Elder 	dout("%s: obj %p cb %p\n", __func__, obj_request,
164637206ee5SAlex Elder 		obj_request->callback);
1647bf0d5f50SAlex Elder 	if (obj_request->callback)
1648bf0d5f50SAlex Elder 		obj_request->callback(obj_request);
1649788e2df3SAlex Elder 	else
1650788e2df3SAlex Elder 		complete_all(&obj_request->completion);
1651bf0d5f50SAlex Elder }
1652bf0d5f50SAlex Elder 
/*
 * Completion handling for osd ops that need no extra processing:
 * just mark the object request done.
 */
static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
165839bf2c5dSAlex Elder 
1659c47f9371SAlex Elder static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
1660bf0d5f50SAlex Elder {
166157acbaa7SAlex Elder 	struct rbd_img_request *img_request = NULL;
1662a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev = NULL;
166357acbaa7SAlex Elder 	bool layered = false;
166457acbaa7SAlex Elder 
166557acbaa7SAlex Elder 	if (obj_request_img_data_test(obj_request)) {
166657acbaa7SAlex Elder 		img_request = obj_request->img_request;
166757acbaa7SAlex Elder 		layered = img_request && img_request_layered_test(img_request);
1668a9e8ba2cSAlex Elder 		rbd_dev = img_request->rbd_dev;
166957acbaa7SAlex Elder 	}
16708b3e1a56SAlex Elder 
16718b3e1a56SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
16728b3e1a56SAlex Elder 		obj_request, img_request, obj_request->result,
16738b3e1a56SAlex Elder 		obj_request->xferred, obj_request->length);
1674a9e8ba2cSAlex Elder 	if (layered && obj_request->result == -ENOENT &&
1675a9e8ba2cSAlex Elder 			obj_request->img_offset < rbd_dev->parent_overlap)
16768b3e1a56SAlex Elder 		rbd_img_parent_read(obj_request);
16778b3e1a56SAlex Elder 	else if (img_request)
16786e2a4505SAlex Elder 		rbd_img_obj_request_read_callback(obj_request);
16796e2a4505SAlex Elder 	else
168007741308SAlex Elder 		obj_request_done_set(obj_request);
1681bf0d5f50SAlex Elder }
1682bf0d5f50SAlex Elder 
1683c47f9371SAlex Elder static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
1684bf0d5f50SAlex Elder {
16851b83bef2SSage Weil 	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
16861b83bef2SSage Weil 		obj_request->result, obj_request->length);
16871b83bef2SSage Weil 	/*
16888b3e1a56SAlex Elder 	 * There is no such thing as a successful short write.  Set
16898b3e1a56SAlex Elder 	 * it to our originally-requested length.
16901b83bef2SSage Weil 	 */
16911b83bef2SSage Weil 	obj_request->xferred = obj_request->length;
169207741308SAlex Elder 	obj_request_done_set(obj_request);
1693bf0d5f50SAlex Elder }
1694bf0d5f50SAlex Elder 
/*
 * Completion handling for an osd stat.  A plain stat needs nothing
 * beyond marking the request done; more happens when the stat is
 * part of a write sequence for a layered image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
1704fbfab539SAlex Elder 
1705bf0d5f50SAlex Elder static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1706bf0d5f50SAlex Elder 				struct ceph_msg *msg)
1707bf0d5f50SAlex Elder {
1708bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = osd_req->r_priv;
1709bf0d5f50SAlex Elder 	u16 opcode;
1710bf0d5f50SAlex Elder 
171137206ee5SAlex Elder 	dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
1712bf0d5f50SAlex Elder 	rbd_assert(osd_req == obj_request->osd_req);
171357acbaa7SAlex Elder 	if (obj_request_img_data_test(obj_request)) {
171457acbaa7SAlex Elder 		rbd_assert(obj_request->img_request);
171557acbaa7SAlex Elder 		rbd_assert(obj_request->which != BAD_WHICH);
171657acbaa7SAlex Elder 	} else {
171757acbaa7SAlex Elder 		rbd_assert(obj_request->which == BAD_WHICH);
171857acbaa7SAlex Elder 	}
1719bf0d5f50SAlex Elder 
17201b83bef2SSage Weil 	if (osd_req->r_result < 0)
17211b83bef2SSage Weil 		obj_request->result = osd_req->r_result;
1722bf0d5f50SAlex Elder 
17237cc69d42SIlya Dryomov 	rbd_assert(osd_req->r_num_ops <= CEPH_OSD_MAX_OP);
1724bf0d5f50SAlex Elder 
1725c47f9371SAlex Elder 	/*
1726c47f9371SAlex Elder 	 * We support a 64-bit length, but ultimately it has to be
1727c47f9371SAlex Elder 	 * passed to blk_end_request(), which takes an unsigned int.
1728c47f9371SAlex Elder 	 */
17291b83bef2SSage Weil 	obj_request->xferred = osd_req->r_reply_op_len[0];
1730c47f9371SAlex Elder 	rbd_assert(obj_request->xferred < (u64)UINT_MAX);
17310ccd5926SIlya Dryomov 
173279528734SAlex Elder 	opcode = osd_req->r_ops[0].op;
1733bf0d5f50SAlex Elder 	switch (opcode) {
1734bf0d5f50SAlex Elder 	case CEPH_OSD_OP_READ:
1735c47f9371SAlex Elder 		rbd_osd_read_callback(obj_request);
1736bf0d5f50SAlex Elder 		break;
17370ccd5926SIlya Dryomov 	case CEPH_OSD_OP_SETALLOCHINT:
17380ccd5926SIlya Dryomov 		rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE);
17390ccd5926SIlya Dryomov 		/* fall through */
1740bf0d5f50SAlex Elder 	case CEPH_OSD_OP_WRITE:
1741c47f9371SAlex Elder 		rbd_osd_write_callback(obj_request);
1742bf0d5f50SAlex Elder 		break;
1743fbfab539SAlex Elder 	case CEPH_OSD_OP_STAT:
1744c47f9371SAlex Elder 		rbd_osd_stat_callback(obj_request);
1745fbfab539SAlex Elder 		break;
174636be9a76SAlex Elder 	case CEPH_OSD_OP_CALL:
1747b8d70035SAlex Elder 	case CEPH_OSD_OP_NOTIFY_ACK:
17489969ebc5SAlex Elder 	case CEPH_OSD_OP_WATCH:
1749c47f9371SAlex Elder 		rbd_osd_trivial_callback(obj_request);
17509969ebc5SAlex Elder 		break;
1751bf0d5f50SAlex Elder 	default:
1752bf0d5f50SAlex Elder 		rbd_warn(NULL, "%s: unsupported op %hu\n",
1753bf0d5f50SAlex Elder 			obj_request->object_name, (unsigned short) opcode);
1754bf0d5f50SAlex Elder 		break;
1755bf0d5f50SAlex Elder 	}
1756bf0d5f50SAlex Elder 
175707741308SAlex Elder 	if (obj_request_done_test(obj_request))
1758bf0d5f50SAlex Elder 		rbd_obj_request_complete(obj_request);
1759bf0d5f50SAlex Elder }
1760bf0d5f50SAlex Elder 
17619d4df01fSAlex Elder static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
1762430c28c3SAlex Elder {
1763430c28c3SAlex Elder 	struct rbd_img_request *img_request = obj_request->img_request;
17648c042b0dSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
17659d4df01fSAlex Elder 	u64 snap_id;
1766430c28c3SAlex Elder 
17678c042b0dSAlex Elder 	rbd_assert(osd_req != NULL);
1768430c28c3SAlex Elder 
17699d4df01fSAlex Elder 	snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
17708c042b0dSAlex Elder 	ceph_osdc_build_request(osd_req, obj_request->offset,
17719d4df01fSAlex Elder 			NULL, snap_id, NULL);
17729d4df01fSAlex Elder }
17739d4df01fSAlex Elder 
17749d4df01fSAlex Elder static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
17759d4df01fSAlex Elder {
17769d4df01fSAlex Elder 	struct rbd_img_request *img_request = obj_request->img_request;
17779d4df01fSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
17789d4df01fSAlex Elder 	struct ceph_snap_context *snapc;
17799d4df01fSAlex Elder 	struct timespec mtime = CURRENT_TIME;
17809d4df01fSAlex Elder 
17819d4df01fSAlex Elder 	rbd_assert(osd_req != NULL);
17829d4df01fSAlex Elder 
17839d4df01fSAlex Elder 	snapc = img_request ? img_request->snapc : NULL;
17849d4df01fSAlex Elder 	ceph_osdc_build_request(osd_req, obj_request->offset,
17859d4df01fSAlex Elder 			snapc, CEPH_NOSNAP, &mtime);
1786430c28c3SAlex Elder }
1787430c28c3SAlex Elder 
17880ccd5926SIlya Dryomov /*
17890ccd5926SIlya Dryomov  * Create an osd request.  A read request has one osd op (read).
17900ccd5926SIlya Dryomov  * A write request has either one (watch) or two (hint+write) osd ops.
17910ccd5926SIlya Dryomov  * (All rbd data writes are prefixed with an allocation hint op, but
17920ccd5926SIlya Dryomov  * technically osd watch is a write request, hence this distinction.)
17930ccd5926SIlya Dryomov  */
1794bf0d5f50SAlex Elder static struct ceph_osd_request *rbd_osd_req_create(
1795bf0d5f50SAlex Elder 					struct rbd_device *rbd_dev,
1796bf0d5f50SAlex Elder 					bool write_request,
1797deb236b3SIlya Dryomov 					unsigned int num_ops,
1798430c28c3SAlex Elder 					struct rbd_obj_request *obj_request)
1799bf0d5f50SAlex Elder {
1800bf0d5f50SAlex Elder 	struct ceph_snap_context *snapc = NULL;
1801bf0d5f50SAlex Elder 	struct ceph_osd_client *osdc;
1802bf0d5f50SAlex Elder 	struct ceph_osd_request *osd_req;
1803bf0d5f50SAlex Elder 
18046365d33aSAlex Elder 	if (obj_request_img_data_test(obj_request)) {
18056365d33aSAlex Elder 		struct rbd_img_request *img_request = obj_request->img_request;
18066365d33aSAlex Elder 
18070c425248SAlex Elder 		rbd_assert(write_request ==
18080c425248SAlex Elder 				img_request_write_test(img_request));
18090c425248SAlex Elder 		if (write_request)
1810bf0d5f50SAlex Elder 			snapc = img_request->snapc;
1811bf0d5f50SAlex Elder 	}
1812bf0d5f50SAlex Elder 
18130ccd5926SIlya Dryomov 	rbd_assert(num_ops == 1 || (write_request && num_ops == 2));
1814deb236b3SIlya Dryomov 
1815deb236b3SIlya Dryomov 	/* Allocate and initialize the request, for the num_ops ops */
1816bf0d5f50SAlex Elder 
1817bf0d5f50SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
1818deb236b3SIlya Dryomov 	osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false,
1819deb236b3SIlya Dryomov 					  GFP_ATOMIC);
1820bf0d5f50SAlex Elder 	if (!osd_req)
1821bf0d5f50SAlex Elder 		return NULL;	/* ENOMEM */
1822bf0d5f50SAlex Elder 
1823430c28c3SAlex Elder 	if (write_request)
1824bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1825430c28c3SAlex Elder 	else
1826bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_READ;
1827bf0d5f50SAlex Elder 
1828bf0d5f50SAlex Elder 	osd_req->r_callback = rbd_osd_req_callback;
1829bf0d5f50SAlex Elder 	osd_req->r_priv = obj_request;
1830bf0d5f50SAlex Elder 
18313c972c95SIlya Dryomov 	osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
18323c972c95SIlya Dryomov 	ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
1833bf0d5f50SAlex Elder 
1834bf0d5f50SAlex Elder 	return osd_req;
1835bf0d5f50SAlex Elder }
1836bf0d5f50SAlex Elder 
18370eefd470SAlex Elder /*
18380eefd470SAlex Elder  * Create a copyup osd request based on the information in the
18390ccd5926SIlya Dryomov  * object request supplied.  A copyup request has three osd ops,
18400ccd5926SIlya Dryomov  * a copyup method call, a hint op, and a write op.
18410eefd470SAlex Elder  */
18420eefd470SAlex Elder static struct ceph_osd_request *
18430eefd470SAlex Elder rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
18440eefd470SAlex Elder {
18450eefd470SAlex Elder 	struct rbd_img_request *img_request;
18460eefd470SAlex Elder 	struct ceph_snap_context *snapc;
18470eefd470SAlex Elder 	struct rbd_device *rbd_dev;
18480eefd470SAlex Elder 	struct ceph_osd_client *osdc;
18490eefd470SAlex Elder 	struct ceph_osd_request *osd_req;
18500eefd470SAlex Elder 
18510eefd470SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
18520eefd470SAlex Elder 	img_request = obj_request->img_request;
18530eefd470SAlex Elder 	rbd_assert(img_request);
18540eefd470SAlex Elder 	rbd_assert(img_request_write_test(img_request));
18550eefd470SAlex Elder 
18560ccd5926SIlya Dryomov 	/* Allocate and initialize the request, for the three ops */
18570eefd470SAlex Elder 
18580eefd470SAlex Elder 	snapc = img_request->snapc;
18590eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
18600eefd470SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
18610ccd5926SIlya Dryomov 	osd_req = ceph_osdc_alloc_request(osdc, snapc, 3, false, GFP_ATOMIC);
18620eefd470SAlex Elder 	if (!osd_req)
18630eefd470SAlex Elder 		return NULL;	/* ENOMEM */
18640eefd470SAlex Elder 
18650eefd470SAlex Elder 	osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
18660eefd470SAlex Elder 	osd_req->r_callback = rbd_osd_req_callback;
18670eefd470SAlex Elder 	osd_req->r_priv = obj_request;
18680eefd470SAlex Elder 
18693c972c95SIlya Dryomov 	osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
18703c972c95SIlya Dryomov 	ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
18710eefd470SAlex Elder 
18720eefd470SAlex Elder 	return osd_req;
18730eefd470SAlex Elder }
18740eefd470SAlex Elder 
18750eefd470SAlex Elder 
/* Drop a reference on an osd request via ceph_osdc_put_request() */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
1880bf0d5f50SAlex Elder 
1881bf0d5f50SAlex Elder /* object_name is assumed to be a non-null pointer and NUL-terminated */
1882bf0d5f50SAlex Elder 
1883bf0d5f50SAlex Elder static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1884bf0d5f50SAlex Elder 						u64 offset, u64 length,
1885bf0d5f50SAlex Elder 						enum obj_request_type type)
1886bf0d5f50SAlex Elder {
1887bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1888bf0d5f50SAlex Elder 	size_t size;
1889bf0d5f50SAlex Elder 	char *name;
1890bf0d5f50SAlex Elder 
1891bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(type));
1892bf0d5f50SAlex Elder 
1893bf0d5f50SAlex Elder 	size = strlen(object_name) + 1;
1894f907ad55SAlex Elder 	name = kmalloc(size, GFP_KERNEL);
1895f907ad55SAlex Elder 	if (!name)
1896bf0d5f50SAlex Elder 		return NULL;
1897bf0d5f50SAlex Elder 
1898868311b1SAlex Elder 	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_KERNEL);
1899f907ad55SAlex Elder 	if (!obj_request) {
1900f907ad55SAlex Elder 		kfree(name);
1901f907ad55SAlex Elder 		return NULL;
1902f907ad55SAlex Elder 	}
1903f907ad55SAlex Elder 
1904bf0d5f50SAlex Elder 	obj_request->object_name = memcpy(name, object_name, size);
1905bf0d5f50SAlex Elder 	obj_request->offset = offset;
1906bf0d5f50SAlex Elder 	obj_request->length = length;
1907926f9b3fSAlex Elder 	obj_request->flags = 0;
1908bf0d5f50SAlex Elder 	obj_request->which = BAD_WHICH;
1909bf0d5f50SAlex Elder 	obj_request->type = type;
1910bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&obj_request->links);
1911788e2df3SAlex Elder 	init_completion(&obj_request->completion);
1912bf0d5f50SAlex Elder 	kref_init(&obj_request->kref);
1913bf0d5f50SAlex Elder 
191437206ee5SAlex Elder 	dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
191537206ee5SAlex Elder 		offset, length, (int)type, obj_request);
191637206ee5SAlex Elder 
1917bf0d5f50SAlex Elder 	return obj_request;
1918bf0d5f50SAlex Elder }
1919bf0d5f50SAlex Elder 
1920bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref)
1921bf0d5f50SAlex Elder {
1922bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1923bf0d5f50SAlex Elder 
1924bf0d5f50SAlex Elder 	obj_request = container_of(kref, struct rbd_obj_request, kref);
1925bf0d5f50SAlex Elder 
192637206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
192737206ee5SAlex Elder 
1928bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
1929bf0d5f50SAlex Elder 	rbd_assert(obj_request->which == BAD_WHICH);
1930bf0d5f50SAlex Elder 
1931bf0d5f50SAlex Elder 	if (obj_request->osd_req)
1932bf0d5f50SAlex Elder 		rbd_osd_req_destroy(obj_request->osd_req);
1933bf0d5f50SAlex Elder 
1934bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
1935bf0d5f50SAlex Elder 	switch (obj_request->type) {
19369969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
19379969ebc5SAlex Elder 		break;		/* Nothing to do */
1938bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1939bf0d5f50SAlex Elder 		if (obj_request->bio_list)
1940bf0d5f50SAlex Elder 			bio_chain_put(obj_request->bio_list);
1941bf0d5f50SAlex Elder 		break;
1942788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1943788e2df3SAlex Elder 		if (obj_request->pages)
1944788e2df3SAlex Elder 			ceph_release_page_vector(obj_request->pages,
1945788e2df3SAlex Elder 						obj_request->page_count);
1946788e2df3SAlex Elder 		break;
1947bf0d5f50SAlex Elder 	}
1948bf0d5f50SAlex Elder 
1949f907ad55SAlex Elder 	kfree(obj_request->object_name);
1950868311b1SAlex Elder 	obj_request->object_name = NULL;
1951868311b1SAlex Elder 	kmem_cache_free(rbd_obj_request_cache, obj_request);
1952bf0d5f50SAlex Elder }
1953bf0d5f50SAlex Elder 
1954fb65d228SAlex Elder /* It's OK to call this for a device with no parent */
1955fb65d228SAlex Elder 
1956fb65d228SAlex Elder static void rbd_spec_put(struct rbd_spec *spec);
1957fb65d228SAlex Elder static void rbd_dev_unparent(struct rbd_device *rbd_dev)
1958fb65d228SAlex Elder {
1959fb65d228SAlex Elder 	rbd_dev_remove_parent(rbd_dev);
1960fb65d228SAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
1961fb65d228SAlex Elder 	rbd_dev->parent_spec = NULL;
1962fb65d228SAlex Elder 	rbd_dev->parent_overlap = 0;
1963fb65d228SAlex Elder }
1964fb65d228SAlex Elder 
1965bf0d5f50SAlex Elder /*
1966a2acd00eSAlex Elder  * Parent image reference counting is used to determine when an
1967a2acd00eSAlex Elder  * image's parent fields can be safely torn down--after there are no
1968a2acd00eSAlex Elder  * more in-flight requests to the parent image.  When the last
1969a2acd00eSAlex Elder  * reference is dropped, cleaning them up is safe.
1970a2acd00eSAlex Elder  */
1971a2acd00eSAlex Elder static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
1972a2acd00eSAlex Elder {
1973a2acd00eSAlex Elder 	int counter;
1974a2acd00eSAlex Elder 
1975a2acd00eSAlex Elder 	if (!rbd_dev->parent_spec)
1976a2acd00eSAlex Elder 		return;
1977a2acd00eSAlex Elder 
1978a2acd00eSAlex Elder 	counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
1979a2acd00eSAlex Elder 	if (counter > 0)
1980a2acd00eSAlex Elder 		return;
1981a2acd00eSAlex Elder 
1982a2acd00eSAlex Elder 	/* Last reference; clean up parent data structures */
1983a2acd00eSAlex Elder 
1984a2acd00eSAlex Elder 	if (!counter)
1985a2acd00eSAlex Elder 		rbd_dev_unparent(rbd_dev);
1986a2acd00eSAlex Elder 	else
1987a2acd00eSAlex Elder 		rbd_warn(rbd_dev, "parent reference underflow\n");
1988a2acd00eSAlex Elder }
1989a2acd00eSAlex Elder 
1990a2acd00eSAlex Elder /*
1991a2acd00eSAlex Elder  * If an image has a non-zero parent overlap, get a reference to its
1992a2acd00eSAlex Elder  * parent.
1993a2acd00eSAlex Elder  *
1994392a9dadSAlex Elder  * We must get the reference before checking for the overlap to
1995392a9dadSAlex Elder  * coordinate properly with zeroing the parent overlap in
1996392a9dadSAlex Elder  * rbd_dev_v2_parent_info() when an image gets flattened.  We
1997392a9dadSAlex Elder  * drop it again if there is no overlap.
1998392a9dadSAlex Elder  *
1999a2acd00eSAlex Elder  * Returns true if the rbd device has a parent with a non-zero
2000a2acd00eSAlex Elder  * overlap and a reference for it was successfully taken, or
2001a2acd00eSAlex Elder  * false otherwise.
2002a2acd00eSAlex Elder  */
2003a2acd00eSAlex Elder static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
2004a2acd00eSAlex Elder {
2005a2acd00eSAlex Elder 	int counter;
2006a2acd00eSAlex Elder 
2007a2acd00eSAlex Elder 	if (!rbd_dev->parent_spec)
2008a2acd00eSAlex Elder 		return false;
2009a2acd00eSAlex Elder 
2010a2acd00eSAlex Elder 	counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
2011a2acd00eSAlex Elder 	if (counter > 0 && rbd_dev->parent_overlap)
2012a2acd00eSAlex Elder 		return true;
2013a2acd00eSAlex Elder 
2014a2acd00eSAlex Elder 	/* Image was flattened, but parent is not yet torn down */
2015a2acd00eSAlex Elder 
2016a2acd00eSAlex Elder 	if (counter < 0)
2017a2acd00eSAlex Elder 		rbd_warn(rbd_dev, "parent reference overflow\n");
2018a2acd00eSAlex Elder 
2019a2acd00eSAlex Elder 	return false;
2020a2acd00eSAlex Elder }
2021a2acd00eSAlex Elder 
2022bf0d5f50SAlex Elder /*
2023bf0d5f50SAlex Elder  * Caller is responsible for filling in the list of object requests
2024bf0d5f50SAlex Elder  * that comprises the image request, and the Linux request pointer
2025bf0d5f50SAlex Elder  * (if there is one).
2026bf0d5f50SAlex Elder  */
2027cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create(
2028cc344fa1SAlex Elder 					struct rbd_device *rbd_dev,
2029bf0d5f50SAlex Elder 					u64 offset, u64 length,
2030e93f3152SAlex Elder 					bool write_request)
2031bf0d5f50SAlex Elder {
2032bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
2033bf0d5f50SAlex Elder 
20341c2a9dfeSAlex Elder 	img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_ATOMIC);
2035bf0d5f50SAlex Elder 	if (!img_request)
2036bf0d5f50SAlex Elder 		return NULL;
2037bf0d5f50SAlex Elder 
2038bf0d5f50SAlex Elder 	if (write_request) {
2039bf0d5f50SAlex Elder 		down_read(&rbd_dev->header_rwsem);
2040812164f8SAlex Elder 		ceph_get_snap_context(rbd_dev->header.snapc);
2041bf0d5f50SAlex Elder 		up_read(&rbd_dev->header_rwsem);
2042bf0d5f50SAlex Elder 	}
2043bf0d5f50SAlex Elder 
2044bf0d5f50SAlex Elder 	img_request->rq = NULL;
2045bf0d5f50SAlex Elder 	img_request->rbd_dev = rbd_dev;
2046bf0d5f50SAlex Elder 	img_request->offset = offset;
2047bf0d5f50SAlex Elder 	img_request->length = length;
20480c425248SAlex Elder 	img_request->flags = 0;
20490c425248SAlex Elder 	if (write_request) {
20500c425248SAlex Elder 		img_request_write_set(img_request);
2051468521c1SAlex Elder 		img_request->snapc = rbd_dev->header.snapc;
20520c425248SAlex Elder 	} else {
2053bf0d5f50SAlex Elder 		img_request->snap_id = rbd_dev->spec->snap_id;
20540c425248SAlex Elder 	}
2055a2acd00eSAlex Elder 	if (rbd_dev_parent_get(rbd_dev))
2056d0b2e944SAlex Elder 		img_request_layered_set(img_request);
2057bf0d5f50SAlex Elder 	spin_lock_init(&img_request->completion_lock);
2058bf0d5f50SAlex Elder 	img_request->next_completion = 0;
2059bf0d5f50SAlex Elder 	img_request->callback = NULL;
2060a5a337d4SAlex Elder 	img_request->result = 0;
2061bf0d5f50SAlex Elder 	img_request->obj_request_count = 0;
2062bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&img_request->obj_requests);
2063bf0d5f50SAlex Elder 	kref_init(&img_request->kref);
2064bf0d5f50SAlex Elder 
206537206ee5SAlex Elder 	dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
206637206ee5SAlex Elder 		write_request ? "write" : "read", offset, length,
206737206ee5SAlex Elder 		img_request);
206837206ee5SAlex Elder 
2069bf0d5f50SAlex Elder 	return img_request;
2070bf0d5f50SAlex Elder }
2071bf0d5f50SAlex Elder 
2072bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref)
2073bf0d5f50SAlex Elder {
2074bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
2075bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
2076bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
2077bf0d5f50SAlex Elder 
2078bf0d5f50SAlex Elder 	img_request = container_of(kref, struct rbd_img_request, kref);
2079bf0d5f50SAlex Elder 
208037206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
208137206ee5SAlex Elder 
2082bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2083bf0d5f50SAlex Elder 		rbd_img_obj_request_del(img_request, obj_request);
208425dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count == 0);
2085bf0d5f50SAlex Elder 
2086a2acd00eSAlex Elder 	if (img_request_layered_test(img_request)) {
2087a2acd00eSAlex Elder 		img_request_layered_clear(img_request);
2088a2acd00eSAlex Elder 		rbd_dev_parent_put(img_request->rbd_dev);
2089a2acd00eSAlex Elder 	}
2090a2acd00eSAlex Elder 
20910c425248SAlex Elder 	if (img_request_write_test(img_request))
2092812164f8SAlex Elder 		ceph_put_snap_context(img_request->snapc);
2093bf0d5f50SAlex Elder 
20941c2a9dfeSAlex Elder 	kmem_cache_free(rbd_img_request_cache, img_request);
2095bf0d5f50SAlex Elder }
2096bf0d5f50SAlex Elder 
2097e93f3152SAlex Elder static struct rbd_img_request *rbd_parent_request_create(
2098e93f3152SAlex Elder 					struct rbd_obj_request *obj_request,
2099e93f3152SAlex Elder 					u64 img_offset, u64 length)
2100e93f3152SAlex Elder {
2101e93f3152SAlex Elder 	struct rbd_img_request *parent_request;
2102e93f3152SAlex Elder 	struct rbd_device *rbd_dev;
2103e93f3152SAlex Elder 
2104e93f3152SAlex Elder 	rbd_assert(obj_request->img_request);
2105e93f3152SAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
2106e93f3152SAlex Elder 
2107e93f3152SAlex Elder 	parent_request = rbd_img_request_create(rbd_dev->parent,
2108e93f3152SAlex Elder 						img_offset, length, false);
2109e93f3152SAlex Elder 	if (!parent_request)
2110e93f3152SAlex Elder 		return NULL;
2111e93f3152SAlex Elder 
2112e93f3152SAlex Elder 	img_request_child_set(parent_request);
2113e93f3152SAlex Elder 	rbd_obj_request_get(obj_request);
2114e93f3152SAlex Elder 	parent_request->obj_request = obj_request;
2115e93f3152SAlex Elder 
2116e93f3152SAlex Elder 	return parent_request;
2117e93f3152SAlex Elder }
2118e93f3152SAlex Elder 
2119e93f3152SAlex Elder static void rbd_parent_request_destroy(struct kref *kref)
2120e93f3152SAlex Elder {
2121e93f3152SAlex Elder 	struct rbd_img_request *parent_request;
2122e93f3152SAlex Elder 	struct rbd_obj_request *orig_request;
2123e93f3152SAlex Elder 
2124e93f3152SAlex Elder 	parent_request = container_of(kref, struct rbd_img_request, kref);
2125e93f3152SAlex Elder 	orig_request = parent_request->obj_request;
2126e93f3152SAlex Elder 
2127e93f3152SAlex Elder 	parent_request->obj_request = NULL;
2128e93f3152SAlex Elder 	rbd_obj_request_put(orig_request);
2129e93f3152SAlex Elder 	img_request_child_clear(parent_request);
2130e93f3152SAlex Elder 
2131e93f3152SAlex Elder 	rbd_img_request_destroy(kref);
2132e93f3152SAlex Elder }
2133e93f3152SAlex Elder 
21341217857fSAlex Elder static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
21351217857fSAlex Elder {
21366365d33aSAlex Elder 	struct rbd_img_request *img_request;
21371217857fSAlex Elder 	unsigned int xferred;
21381217857fSAlex Elder 	int result;
21398b3e1a56SAlex Elder 	bool more;
21401217857fSAlex Elder 
21416365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
21426365d33aSAlex Elder 	img_request = obj_request->img_request;
21436365d33aSAlex Elder 
21441217857fSAlex Elder 	rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
21451217857fSAlex Elder 	xferred = (unsigned int)obj_request->xferred;
21461217857fSAlex Elder 	result = obj_request->result;
21471217857fSAlex Elder 	if (result) {
21481217857fSAlex Elder 		struct rbd_device *rbd_dev = img_request->rbd_dev;
21491217857fSAlex Elder 
21501217857fSAlex Elder 		rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n",
21511217857fSAlex Elder 			img_request_write_test(img_request) ? "write" : "read",
21521217857fSAlex Elder 			obj_request->length, obj_request->img_offset,
21531217857fSAlex Elder 			obj_request->offset);
21541217857fSAlex Elder 		rbd_warn(rbd_dev, "  result %d xferred %x\n",
21551217857fSAlex Elder 			result, xferred);
21561217857fSAlex Elder 		if (!img_request->result)
21571217857fSAlex Elder 			img_request->result = result;
21581217857fSAlex Elder 	}
21591217857fSAlex Elder 
2160f1a4739fSAlex Elder 	/* Image object requests don't own their page array */
2161f1a4739fSAlex Elder 
2162f1a4739fSAlex Elder 	if (obj_request->type == OBJ_REQUEST_PAGES) {
2163f1a4739fSAlex Elder 		obj_request->pages = NULL;
2164f1a4739fSAlex Elder 		obj_request->page_count = 0;
2165f1a4739fSAlex Elder 	}
2166f1a4739fSAlex Elder 
21678b3e1a56SAlex Elder 	if (img_request_child_test(img_request)) {
21688b3e1a56SAlex Elder 		rbd_assert(img_request->obj_request != NULL);
21698b3e1a56SAlex Elder 		more = obj_request->which < img_request->obj_request_count - 1;
21708b3e1a56SAlex Elder 	} else {
21718b3e1a56SAlex Elder 		rbd_assert(img_request->rq != NULL);
21728b3e1a56SAlex Elder 		more = blk_end_request(img_request->rq, result, xferred);
21738b3e1a56SAlex Elder 	}
21748b3e1a56SAlex Elder 
21758b3e1a56SAlex Elder 	return more;
21761217857fSAlex Elder }
21771217857fSAlex Elder 
21782169238dSAlex Elder static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
21792169238dSAlex Elder {
21802169238dSAlex Elder 	struct rbd_img_request *img_request;
21812169238dSAlex Elder 	u32 which = obj_request->which;
21822169238dSAlex Elder 	bool more = true;
21832169238dSAlex Elder 
21846365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
21852169238dSAlex Elder 	img_request = obj_request->img_request;
21862169238dSAlex Elder 
21872169238dSAlex Elder 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
21882169238dSAlex Elder 	rbd_assert(img_request != NULL);
21892169238dSAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
21902169238dSAlex Elder 	rbd_assert(which != BAD_WHICH);
21912169238dSAlex Elder 	rbd_assert(which < img_request->obj_request_count);
21922169238dSAlex Elder 
21932169238dSAlex Elder 	spin_lock_irq(&img_request->completion_lock);
21942169238dSAlex Elder 	if (which != img_request->next_completion)
21952169238dSAlex Elder 		goto out;
21962169238dSAlex Elder 
21972169238dSAlex Elder 	for_each_obj_request_from(img_request, obj_request) {
21982169238dSAlex Elder 		rbd_assert(more);
21992169238dSAlex Elder 		rbd_assert(which < img_request->obj_request_count);
22002169238dSAlex Elder 
22012169238dSAlex Elder 		if (!obj_request_done_test(obj_request))
22022169238dSAlex Elder 			break;
22031217857fSAlex Elder 		more = rbd_img_obj_end_request(obj_request);
22042169238dSAlex Elder 		which++;
22052169238dSAlex Elder 	}
22062169238dSAlex Elder 
22072169238dSAlex Elder 	rbd_assert(more ^ (which == img_request->obj_request_count));
22082169238dSAlex Elder 	img_request->next_completion = which;
22092169238dSAlex Elder out:
22102169238dSAlex Elder 	spin_unlock_irq(&img_request->completion_lock);
22110f2d5be7SAlex Elder 	rbd_img_request_put(img_request);
22122169238dSAlex Elder 
22132169238dSAlex Elder 	if (!more)
22142169238dSAlex Elder 		rbd_img_request_complete(img_request);
22152169238dSAlex Elder }
22162169238dSAlex Elder 
2217f1a4739fSAlex Elder /*
2218f1a4739fSAlex Elder  * Split up an image request into one or more object requests, each
2219f1a4739fSAlex Elder  * to a different object.  The "type" parameter indicates whether
2220f1a4739fSAlex Elder  * "data_desc" is the pointer to the head of a list of bio
2221f1a4739fSAlex Elder  * structures, or the base of a page array.  In either case this
2222f1a4739fSAlex Elder  * function assumes data_desc describes memory sufficient to hold
2223f1a4739fSAlex Elder  * all data described by the image request.
2224f1a4739fSAlex Elder  */
2225f1a4739fSAlex Elder static int rbd_img_request_fill(struct rbd_img_request *img_request,
2226f1a4739fSAlex Elder 					enum obj_request_type type,
2227f1a4739fSAlex Elder 					void *data_desc)
2228bf0d5f50SAlex Elder {
2229bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = img_request->rbd_dev;
2230bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = NULL;
2231bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
22320c425248SAlex Elder 	bool write_request = img_request_write_test(img_request);
2233a158073cSJingoo Han 	struct bio *bio_list = NULL;
2234f1a4739fSAlex Elder 	unsigned int bio_offset = 0;
2235a158073cSJingoo Han 	struct page **pages = NULL;
22367da22d29SAlex Elder 	u64 img_offset;
2237bf0d5f50SAlex Elder 	u64 resid;
2238bf0d5f50SAlex Elder 	u16 opcode;
2239bf0d5f50SAlex Elder 
2240f1a4739fSAlex Elder 	dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
2241f1a4739fSAlex Elder 		(int)type, data_desc);
224237206ee5SAlex Elder 
2243430c28c3SAlex Elder 	opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
22447da22d29SAlex Elder 	img_offset = img_request->offset;
2245bf0d5f50SAlex Elder 	resid = img_request->length;
22464dda41d3SAlex Elder 	rbd_assert(resid > 0);
2247f1a4739fSAlex Elder 
2248f1a4739fSAlex Elder 	if (type == OBJ_REQUEST_BIO) {
2249f1a4739fSAlex Elder 		bio_list = data_desc;
22504f024f37SKent Overstreet 		rbd_assert(img_offset ==
22514f024f37SKent Overstreet 			   bio_list->bi_iter.bi_sector << SECTOR_SHIFT);
2252f1a4739fSAlex Elder 	} else {
2253f1a4739fSAlex Elder 		rbd_assert(type == OBJ_REQUEST_PAGES);
2254f1a4739fSAlex Elder 		pages = data_desc;
2255f1a4739fSAlex Elder 	}
2256f1a4739fSAlex Elder 
2257bf0d5f50SAlex Elder 	while (resid) {
22582fa12320SAlex Elder 		struct ceph_osd_request *osd_req;
2259bf0d5f50SAlex Elder 		const char *object_name;
2260bf0d5f50SAlex Elder 		u64 offset;
2261bf0d5f50SAlex Elder 		u64 length;
22620ccd5926SIlya Dryomov 		unsigned int which = 0;
2263bf0d5f50SAlex Elder 
22647da22d29SAlex Elder 		object_name = rbd_segment_name(rbd_dev, img_offset);
2265bf0d5f50SAlex Elder 		if (!object_name)
2266bf0d5f50SAlex Elder 			goto out_unwind;
22677da22d29SAlex Elder 		offset = rbd_segment_offset(rbd_dev, img_offset);
22687da22d29SAlex Elder 		length = rbd_segment_length(rbd_dev, img_offset, resid);
2269bf0d5f50SAlex Elder 		obj_request = rbd_obj_request_create(object_name,
2270f1a4739fSAlex Elder 						offset, length, type);
227178c2a44aSAlex Elder 		/* object request has its own copy of the object name */
227278c2a44aSAlex Elder 		rbd_segment_name_free(object_name);
2273bf0d5f50SAlex Elder 		if (!obj_request)
2274bf0d5f50SAlex Elder 			goto out_unwind;
227562054da6SIlya Dryomov 
227603507db6SJosh Durgin 		/*
227703507db6SJosh Durgin 		 * set obj_request->img_request before creating the
227803507db6SJosh Durgin 		 * osd_request so that it gets the right snapc
227903507db6SJosh Durgin 		 */
228003507db6SJosh Durgin 		rbd_img_obj_request_add(img_request, obj_request);
2281bf0d5f50SAlex Elder 
2282f1a4739fSAlex Elder 		if (type == OBJ_REQUEST_BIO) {
2283f1a4739fSAlex Elder 			unsigned int clone_size;
2284f1a4739fSAlex Elder 
2285bf0d5f50SAlex Elder 			rbd_assert(length <= (u64)UINT_MAX);
2286bf0d5f50SAlex Elder 			clone_size = (unsigned int)length;
2287f1a4739fSAlex Elder 			obj_request->bio_list =
2288f1a4739fSAlex Elder 					bio_chain_clone_range(&bio_list,
2289f1a4739fSAlex Elder 								&bio_offset,
2290f1a4739fSAlex Elder 								clone_size,
2291bf0d5f50SAlex Elder 								GFP_ATOMIC);
2292bf0d5f50SAlex Elder 			if (!obj_request->bio_list)
229362054da6SIlya Dryomov 				goto out_unwind;
2294f1a4739fSAlex Elder 		} else {
2295f1a4739fSAlex Elder 			unsigned int page_count;
2296f1a4739fSAlex Elder 
2297f1a4739fSAlex Elder 			obj_request->pages = pages;
2298f1a4739fSAlex Elder 			page_count = (u32)calc_pages_for(offset, length);
2299f1a4739fSAlex Elder 			obj_request->page_count = page_count;
2300f1a4739fSAlex Elder 			if ((offset + length) & ~PAGE_MASK)
2301f1a4739fSAlex Elder 				page_count--;	/* more on last page */
2302f1a4739fSAlex Elder 			pages += page_count;
2303f1a4739fSAlex Elder 		}
2304bf0d5f50SAlex Elder 
23050ccd5926SIlya Dryomov 		osd_req = rbd_osd_req_create(rbd_dev, write_request,
23060ccd5926SIlya Dryomov 					     (write_request ? 2 : 1),
23072fa12320SAlex Elder 					     obj_request);
23082fa12320SAlex Elder 		if (!osd_req)
230962054da6SIlya Dryomov 			goto out_unwind;
23102fa12320SAlex Elder 		obj_request->osd_req = osd_req;
23112169238dSAlex Elder 		obj_request->callback = rbd_img_obj_callback;
23120f2d5be7SAlex Elder 		rbd_img_request_get(img_request);
2313430c28c3SAlex Elder 
23140ccd5926SIlya Dryomov 		if (write_request) {
23150ccd5926SIlya Dryomov 			osd_req_op_alloc_hint_init(osd_req, which,
23160ccd5926SIlya Dryomov 					     rbd_obj_bytes(&rbd_dev->header),
23170ccd5926SIlya Dryomov 					     rbd_obj_bytes(&rbd_dev->header));
23180ccd5926SIlya Dryomov 			which++;
23190ccd5926SIlya Dryomov 		}
23200ccd5926SIlya Dryomov 
23210ccd5926SIlya Dryomov 		osd_req_op_extent_init(osd_req, which, opcode, offset, length,
23222fa12320SAlex Elder 				       0, 0);
2323f1a4739fSAlex Elder 		if (type == OBJ_REQUEST_BIO)
23240ccd5926SIlya Dryomov 			osd_req_op_extent_osd_data_bio(osd_req, which,
2325f1a4739fSAlex Elder 					obj_request->bio_list, length);
2326f1a4739fSAlex Elder 		else
23270ccd5926SIlya Dryomov 			osd_req_op_extent_osd_data_pages(osd_req, which,
2328f1a4739fSAlex Elder 					obj_request->pages, length,
2329f1a4739fSAlex Elder 					offset & ~PAGE_MASK, false, false);
23309d4df01fSAlex Elder 
23319d4df01fSAlex Elder 		if (write_request)
23329d4df01fSAlex Elder 			rbd_osd_req_format_write(obj_request);
23339d4df01fSAlex Elder 		else
23349d4df01fSAlex Elder 			rbd_osd_req_format_read(obj_request);
2335430c28c3SAlex Elder 
23367da22d29SAlex Elder 		obj_request->img_offset = img_offset;
2337bf0d5f50SAlex Elder 
23387da22d29SAlex Elder 		img_offset += length;
2339bf0d5f50SAlex Elder 		resid -= length;
2340bf0d5f50SAlex Elder 	}
2341bf0d5f50SAlex Elder 
2342bf0d5f50SAlex Elder 	return 0;
2343bf0d5f50SAlex Elder 
2344bf0d5f50SAlex Elder out_unwind:
2345bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
234642dd037cSIlya Dryomov 		rbd_img_obj_request_del(img_request, obj_request);
2347bf0d5f50SAlex Elder 
2348bf0d5f50SAlex Elder 	return -ENOMEM;
2349bf0d5f50SAlex Elder }
2350bf0d5f50SAlex Elder 
23513d7efd18SAlex Elder static void
23520eefd470SAlex Elder rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request)
23530eefd470SAlex Elder {
23540eefd470SAlex Elder 	struct rbd_img_request *img_request;
23550eefd470SAlex Elder 	struct rbd_device *rbd_dev;
2356ebda6408SAlex Elder 	struct page **pages;
23570eefd470SAlex Elder 	u32 page_count;
23580eefd470SAlex Elder 
23590eefd470SAlex Elder 	rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
23600eefd470SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
23610eefd470SAlex Elder 	img_request = obj_request->img_request;
23620eefd470SAlex Elder 	rbd_assert(img_request);
23630eefd470SAlex Elder 
23640eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
23650eefd470SAlex Elder 	rbd_assert(rbd_dev);
23660eefd470SAlex Elder 
2367ebda6408SAlex Elder 	pages = obj_request->copyup_pages;
2368ebda6408SAlex Elder 	rbd_assert(pages != NULL);
23690eefd470SAlex Elder 	obj_request->copyup_pages = NULL;
2370ebda6408SAlex Elder 	page_count = obj_request->copyup_page_count;
2371ebda6408SAlex Elder 	rbd_assert(page_count);
2372ebda6408SAlex Elder 	obj_request->copyup_page_count = 0;
2373ebda6408SAlex Elder 	ceph_release_page_vector(pages, page_count);
23740eefd470SAlex Elder 
23750eefd470SAlex Elder 	/*
23760eefd470SAlex Elder 	 * We want the transfer count to reflect the size of the
23770eefd470SAlex Elder 	 * original write request.  There is no such thing as a
23780eefd470SAlex Elder 	 * successful short write, so if the request was successful
23790eefd470SAlex Elder 	 * we can just set it to the originally-requested length.
23800eefd470SAlex Elder 	 */
23810eefd470SAlex Elder 	if (!obj_request->result)
23820eefd470SAlex Elder 		obj_request->xferred = obj_request->length;
23830eefd470SAlex Elder 
23840eefd470SAlex Elder 	/* Finish up with the normal image object callback */
23850eefd470SAlex Elder 
23860eefd470SAlex Elder 	rbd_img_obj_callback(obj_request);
23870eefd470SAlex Elder }
23880eefd470SAlex Elder 
23890eefd470SAlex Elder static void
23903d7efd18SAlex Elder rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
23913d7efd18SAlex Elder {
23923d7efd18SAlex Elder 	struct rbd_obj_request *orig_request;
23930eefd470SAlex Elder 	struct ceph_osd_request *osd_req;
23940eefd470SAlex Elder 	struct ceph_osd_client *osdc;
23950eefd470SAlex Elder 	struct rbd_device *rbd_dev;
23963d7efd18SAlex Elder 	struct page **pages;
2397ebda6408SAlex Elder 	u32 page_count;
2398bbea1c1aSAlex Elder 	int img_result;
2399ebda6408SAlex Elder 	u64 parent_length;
2400b91f09f1SAlex Elder 	u64 offset;
2401b91f09f1SAlex Elder 	u64 length;
24023d7efd18SAlex Elder 
24033d7efd18SAlex Elder 	rbd_assert(img_request_child_test(img_request));
24043d7efd18SAlex Elder 
24053d7efd18SAlex Elder 	/* First get what we need from the image request */
24063d7efd18SAlex Elder 
24073d7efd18SAlex Elder 	pages = img_request->copyup_pages;
24083d7efd18SAlex Elder 	rbd_assert(pages != NULL);
24093d7efd18SAlex Elder 	img_request->copyup_pages = NULL;
2410ebda6408SAlex Elder 	page_count = img_request->copyup_page_count;
2411ebda6408SAlex Elder 	rbd_assert(page_count);
2412ebda6408SAlex Elder 	img_request->copyup_page_count = 0;
24133d7efd18SAlex Elder 
24143d7efd18SAlex Elder 	orig_request = img_request->obj_request;
24153d7efd18SAlex Elder 	rbd_assert(orig_request != NULL);
2416b91f09f1SAlex Elder 	rbd_assert(obj_request_type_valid(orig_request->type));
2417bbea1c1aSAlex Elder 	img_result = img_request->result;
2418ebda6408SAlex Elder 	parent_length = img_request->length;
2419ebda6408SAlex Elder 	rbd_assert(parent_length == img_request->xferred);
24203d7efd18SAlex Elder 	rbd_img_request_put(img_request);
24213d7efd18SAlex Elder 
242291c6febbSAlex Elder 	rbd_assert(orig_request->img_request);
242391c6febbSAlex Elder 	rbd_dev = orig_request->img_request->rbd_dev;
24243d7efd18SAlex Elder 	rbd_assert(rbd_dev);
24253d7efd18SAlex Elder 
2426bbea1c1aSAlex Elder 	/*
2427bbea1c1aSAlex Elder 	 * If the overlap has become 0 (most likely because the
2428bbea1c1aSAlex Elder 	 * image has been flattened) we need to free the pages
2429bbea1c1aSAlex Elder 	 * and re-submit the original write request.
2430bbea1c1aSAlex Elder 	 */
2431bbea1c1aSAlex Elder 	if (!rbd_dev->parent_overlap) {
2432bbea1c1aSAlex Elder 		struct ceph_osd_client *osdc;
2433bbea1c1aSAlex Elder 
2434bbea1c1aSAlex Elder 		ceph_release_page_vector(pages, page_count);
2435bbea1c1aSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
2436bbea1c1aSAlex Elder 		img_result = rbd_obj_request_submit(osdc, orig_request);
2437bbea1c1aSAlex Elder 		if (!img_result)
2438bbea1c1aSAlex Elder 			return;
2439bbea1c1aSAlex Elder 	}
2440bbea1c1aSAlex Elder 
2441bbea1c1aSAlex Elder 	if (img_result)
24420eefd470SAlex Elder 		goto out_err;
24433d7efd18SAlex Elder 
24448785b1d4SAlex Elder 	/*
24458785b1d4SAlex Elder 	 * The original osd request is of no use to use any more.
24460ccd5926SIlya Dryomov 	 * We need a new one that can hold the three ops in a copyup
24478785b1d4SAlex Elder 	 * request.  Allocate the new copyup osd request for the
24488785b1d4SAlex Elder 	 * original request, and release the old one.
24498785b1d4SAlex Elder 	 */
2450bbea1c1aSAlex Elder 	img_result = -ENOMEM;
24510eefd470SAlex Elder 	osd_req = rbd_osd_req_create_copyup(orig_request);
24520eefd470SAlex Elder 	if (!osd_req)
24530eefd470SAlex Elder 		goto out_err;
24548785b1d4SAlex Elder 	rbd_osd_req_destroy(orig_request->osd_req);
24550eefd470SAlex Elder 	orig_request->osd_req = osd_req;
24560eefd470SAlex Elder 	orig_request->copyup_pages = pages;
2457ebda6408SAlex Elder 	orig_request->copyup_page_count = page_count;
24583d7efd18SAlex Elder 
24590eefd470SAlex Elder 	/* Initialize the copyup op */
24600eefd470SAlex Elder 
24610eefd470SAlex Elder 	osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
2462ebda6408SAlex Elder 	osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0,
24630eefd470SAlex Elder 						false, false);
24640eefd470SAlex Elder 
24650ccd5926SIlya Dryomov 	/* Then the hint op */
24660ccd5926SIlya Dryomov 
24670ccd5926SIlya Dryomov 	osd_req_op_alloc_hint_init(osd_req, 1, rbd_obj_bytes(&rbd_dev->header),
24680ccd5926SIlya Dryomov 				   rbd_obj_bytes(&rbd_dev->header));
24690ccd5926SIlya Dryomov 
24700ccd5926SIlya Dryomov 	/* And the original write request op */
24710eefd470SAlex Elder 
2472b91f09f1SAlex Elder 	offset = orig_request->offset;
2473b91f09f1SAlex Elder 	length = orig_request->length;
24740ccd5926SIlya Dryomov 	osd_req_op_extent_init(osd_req, 2, CEPH_OSD_OP_WRITE,
2475b91f09f1SAlex Elder 					offset, length, 0, 0);
2476b91f09f1SAlex Elder 	if (orig_request->type == OBJ_REQUEST_BIO)
24770ccd5926SIlya Dryomov 		osd_req_op_extent_osd_data_bio(osd_req, 2,
2478b91f09f1SAlex Elder 					orig_request->bio_list, length);
2479b91f09f1SAlex Elder 	else
24800ccd5926SIlya Dryomov 		osd_req_op_extent_osd_data_pages(osd_req, 2,
2481b91f09f1SAlex Elder 					orig_request->pages, length,
2482b91f09f1SAlex Elder 					offset & ~PAGE_MASK, false, false);
24830eefd470SAlex Elder 
24840eefd470SAlex Elder 	rbd_osd_req_format_write(orig_request);
24850eefd470SAlex Elder 
24860eefd470SAlex Elder 	/* All set, send it off. */
24870eefd470SAlex Elder 
24880eefd470SAlex Elder 	orig_request->callback = rbd_img_obj_copyup_callback;
24890eefd470SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
2490bbea1c1aSAlex Elder 	img_result = rbd_obj_request_submit(osdc, orig_request);
2491bbea1c1aSAlex Elder 	if (!img_result)
24920eefd470SAlex Elder 		return;
24930eefd470SAlex Elder out_err:
24940eefd470SAlex Elder 	/* Record the error code and complete the request */
24950eefd470SAlex Elder 
2496bbea1c1aSAlex Elder 	orig_request->result = img_result;
24970eefd470SAlex Elder 	orig_request->xferred = 0;
24983d7efd18SAlex Elder 	obj_request_done_set(orig_request);
24993d7efd18SAlex Elder 	rbd_obj_request_complete(orig_request);
25003d7efd18SAlex Elder }
25013d7efd18SAlex Elder 
25023d7efd18SAlex Elder /*
25033d7efd18SAlex Elder  * Read from the parent image the range of data that covers the
25043d7efd18SAlex Elder  * entire target of the given object request.  This is used for
25053d7efd18SAlex Elder  * satisfying a layered image write request when the target of an
25063d7efd18SAlex Elder  * object request from the image request does not exist.
25073d7efd18SAlex Elder  *
25083d7efd18SAlex Elder  * A page array big enough to hold the returned data is allocated
25093d7efd18SAlex Elder  * and supplied to rbd_img_request_fill() as the "data descriptor."
25103d7efd18SAlex Elder  * When the read completes, this page array will be transferred to
25113d7efd18SAlex Elder  * the original object request for the copyup operation.
25123d7efd18SAlex Elder  *
25133d7efd18SAlex Elder  * If an error occurs, record it as the result of the original
25143d7efd18SAlex Elder  * object request and mark it done so it gets completed.
25153d7efd18SAlex Elder  */
25163d7efd18SAlex Elder static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
25173d7efd18SAlex Elder {
25183d7efd18SAlex Elder 	struct rbd_img_request *img_request = NULL;
25193d7efd18SAlex Elder 	struct rbd_img_request *parent_request = NULL;
25203d7efd18SAlex Elder 	struct rbd_device *rbd_dev;
25213d7efd18SAlex Elder 	u64 img_offset;
25223d7efd18SAlex Elder 	u64 length;
25233d7efd18SAlex Elder 	struct page **pages = NULL;
25243d7efd18SAlex Elder 	u32 page_count;
25253d7efd18SAlex Elder 	int result;
25263d7efd18SAlex Elder 
25273d7efd18SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
2528b91f09f1SAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
25293d7efd18SAlex Elder 
25303d7efd18SAlex Elder 	img_request = obj_request->img_request;
25313d7efd18SAlex Elder 	rbd_assert(img_request != NULL);
25323d7efd18SAlex Elder 	rbd_dev = img_request->rbd_dev;
25333d7efd18SAlex Elder 	rbd_assert(rbd_dev->parent != NULL);
25343d7efd18SAlex Elder 
25353d7efd18SAlex Elder 	/*
25363d7efd18SAlex Elder 	 * Determine the byte range covered by the object in the
25373d7efd18SAlex Elder 	 * child image to which the original request was to be sent.
25383d7efd18SAlex Elder 	 */
25393d7efd18SAlex Elder 	img_offset = obj_request->img_offset - obj_request->offset;
25403d7efd18SAlex Elder 	length = (u64)1 << rbd_dev->header.obj_order;
25413d7efd18SAlex Elder 
25423d7efd18SAlex Elder 	/*
2543a9e8ba2cSAlex Elder 	 * There is no defined parent data beyond the parent
2544a9e8ba2cSAlex Elder 	 * overlap, so limit what we read at that boundary if
2545a9e8ba2cSAlex Elder 	 * necessary.
2546a9e8ba2cSAlex Elder 	 */
2547a9e8ba2cSAlex Elder 	if (img_offset + length > rbd_dev->parent_overlap) {
2548a9e8ba2cSAlex Elder 		rbd_assert(img_offset < rbd_dev->parent_overlap);
2549a9e8ba2cSAlex Elder 		length = rbd_dev->parent_overlap - img_offset;
2550a9e8ba2cSAlex Elder 	}
2551a9e8ba2cSAlex Elder 
2552a9e8ba2cSAlex Elder 	/*
25533d7efd18SAlex Elder 	 * Allocate a page array big enough to receive the data read
25543d7efd18SAlex Elder 	 * from the parent.
25553d7efd18SAlex Elder 	 */
25563d7efd18SAlex Elder 	page_count = (u32)calc_pages_for(0, length);
25573d7efd18SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
25583d7efd18SAlex Elder 	if (IS_ERR(pages)) {
25593d7efd18SAlex Elder 		result = PTR_ERR(pages);
25603d7efd18SAlex Elder 		pages = NULL;
25613d7efd18SAlex Elder 		goto out_err;
25623d7efd18SAlex Elder 	}
25633d7efd18SAlex Elder 
25643d7efd18SAlex Elder 	result = -ENOMEM;
2565e93f3152SAlex Elder 	parent_request = rbd_parent_request_create(obj_request,
2566e93f3152SAlex Elder 						img_offset, length);
25673d7efd18SAlex Elder 	if (!parent_request)
25683d7efd18SAlex Elder 		goto out_err;
25693d7efd18SAlex Elder 
25703d7efd18SAlex Elder 	result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
25713d7efd18SAlex Elder 	if (result)
25723d7efd18SAlex Elder 		goto out_err;
25733d7efd18SAlex Elder 	parent_request->copyup_pages = pages;
2574ebda6408SAlex Elder 	parent_request->copyup_page_count = page_count;
25753d7efd18SAlex Elder 
25763d7efd18SAlex Elder 	parent_request->callback = rbd_img_obj_parent_read_full_callback;
25773d7efd18SAlex Elder 	result = rbd_img_request_submit(parent_request);
25783d7efd18SAlex Elder 	if (!result)
25793d7efd18SAlex Elder 		return 0;
25803d7efd18SAlex Elder 
25813d7efd18SAlex Elder 	parent_request->copyup_pages = NULL;
2582ebda6408SAlex Elder 	parent_request->copyup_page_count = 0;
25833d7efd18SAlex Elder 	parent_request->obj_request = NULL;
25843d7efd18SAlex Elder 	rbd_obj_request_put(obj_request);
25853d7efd18SAlex Elder out_err:
25863d7efd18SAlex Elder 	if (pages)
25873d7efd18SAlex Elder 		ceph_release_page_vector(pages, page_count);
25883d7efd18SAlex Elder 	if (parent_request)
25893d7efd18SAlex Elder 		rbd_img_request_put(parent_request);
25903d7efd18SAlex Elder 	obj_request->result = result;
25913d7efd18SAlex Elder 	obj_request->xferred = 0;
25923d7efd18SAlex Elder 	obj_request_done_set(obj_request);
25933d7efd18SAlex Elder 
25943d7efd18SAlex Elder 	return result;
25953d7efd18SAlex Elder }
25963d7efd18SAlex Elder 
2597c5b5ef6cSAlex Elder static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
2598c5b5ef6cSAlex Elder {
2599c5b5ef6cSAlex Elder 	struct rbd_obj_request *orig_request;
2600638f5abeSAlex Elder 	struct rbd_device *rbd_dev;
2601c5b5ef6cSAlex Elder 	int result;
2602c5b5ef6cSAlex Elder 
2603c5b5ef6cSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
2604c5b5ef6cSAlex Elder 
2605c5b5ef6cSAlex Elder 	/*
2606c5b5ef6cSAlex Elder 	 * All we need from the object request is the original
2607c5b5ef6cSAlex Elder 	 * request and the result of the STAT op.  Grab those, then
2608c5b5ef6cSAlex Elder 	 * we're done with the request.
2609c5b5ef6cSAlex Elder 	 */
2610c5b5ef6cSAlex Elder 	orig_request = obj_request->obj_request;
2611c5b5ef6cSAlex Elder 	obj_request->obj_request = NULL;
2612912c317dSAlex Elder 	rbd_obj_request_put(orig_request);
2613c5b5ef6cSAlex Elder 	rbd_assert(orig_request);
2614c5b5ef6cSAlex Elder 	rbd_assert(orig_request->img_request);
2615c5b5ef6cSAlex Elder 
2616c5b5ef6cSAlex Elder 	result = obj_request->result;
2617c5b5ef6cSAlex Elder 	obj_request->result = 0;
2618c5b5ef6cSAlex Elder 
2619c5b5ef6cSAlex Elder 	dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
2620c5b5ef6cSAlex Elder 		obj_request, orig_request, result,
2621c5b5ef6cSAlex Elder 		obj_request->xferred, obj_request->length);
2622c5b5ef6cSAlex Elder 	rbd_obj_request_put(obj_request);
2623c5b5ef6cSAlex Elder 
2624638f5abeSAlex Elder 	/*
2625638f5abeSAlex Elder 	 * If the overlap has become 0 (most likely because the
2626638f5abeSAlex Elder 	 * image has been flattened) we need to free the pages
2627638f5abeSAlex Elder 	 * and re-submit the original write request.
2628638f5abeSAlex Elder 	 */
2629638f5abeSAlex Elder 	rbd_dev = orig_request->img_request->rbd_dev;
2630638f5abeSAlex Elder 	if (!rbd_dev->parent_overlap) {
2631638f5abeSAlex Elder 		struct ceph_osd_client *osdc;
2632638f5abeSAlex Elder 
2633638f5abeSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
2634638f5abeSAlex Elder 		result = rbd_obj_request_submit(osdc, orig_request);
2635638f5abeSAlex Elder 		if (!result)
2636638f5abeSAlex Elder 			return;
2637638f5abeSAlex Elder 	}
2638c5b5ef6cSAlex Elder 
2639c5b5ef6cSAlex Elder 	/*
2640c5b5ef6cSAlex Elder 	 * Our only purpose here is to determine whether the object
2641c5b5ef6cSAlex Elder 	 * exists, and we don't want to treat the non-existence as
2642c5b5ef6cSAlex Elder 	 * an error.  If something else comes back, transfer the
2643c5b5ef6cSAlex Elder 	 * error to the original request and complete it now.
2644c5b5ef6cSAlex Elder 	 */
2645c5b5ef6cSAlex Elder 	if (!result) {
2646c5b5ef6cSAlex Elder 		obj_request_existence_set(orig_request, true);
2647c5b5ef6cSAlex Elder 	} else if (result == -ENOENT) {
2648c5b5ef6cSAlex Elder 		obj_request_existence_set(orig_request, false);
2649c5b5ef6cSAlex Elder 	} else if (result) {
2650c5b5ef6cSAlex Elder 		orig_request->result = result;
26513d7efd18SAlex Elder 		goto out;
2652c5b5ef6cSAlex Elder 	}
2653c5b5ef6cSAlex Elder 
2654c5b5ef6cSAlex Elder 	/*
2655c5b5ef6cSAlex Elder 	 * Resubmit the original request now that we have recorded
2656c5b5ef6cSAlex Elder 	 * whether the target object exists.
2657c5b5ef6cSAlex Elder 	 */
2658b454e36dSAlex Elder 	orig_request->result = rbd_img_obj_request_submit(orig_request);
26593d7efd18SAlex Elder out:
2660c5b5ef6cSAlex Elder 	if (orig_request->result)
2661c5b5ef6cSAlex Elder 		rbd_obj_request_complete(orig_request);
2662c5b5ef6cSAlex Elder }
2663c5b5ef6cSAlex Elder 
2664c5b5ef6cSAlex Elder static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
2665c5b5ef6cSAlex Elder {
2666c5b5ef6cSAlex Elder 	struct rbd_obj_request *stat_request;
2667c5b5ef6cSAlex Elder 	struct rbd_device *rbd_dev;
2668c5b5ef6cSAlex Elder 	struct ceph_osd_client *osdc;
2669c5b5ef6cSAlex Elder 	struct page **pages = NULL;
2670c5b5ef6cSAlex Elder 	u32 page_count;
2671c5b5ef6cSAlex Elder 	size_t size;
2672c5b5ef6cSAlex Elder 	int ret;
2673c5b5ef6cSAlex Elder 
2674c5b5ef6cSAlex Elder 	/*
2675c5b5ef6cSAlex Elder 	 * The response data for a STAT call consists of:
2676c5b5ef6cSAlex Elder 	 *     le64 length;
2677c5b5ef6cSAlex Elder 	 *     struct {
2678c5b5ef6cSAlex Elder 	 *         le32 tv_sec;
2679c5b5ef6cSAlex Elder 	 *         le32 tv_nsec;
2680c5b5ef6cSAlex Elder 	 *     } mtime;
2681c5b5ef6cSAlex Elder 	 */
2682c5b5ef6cSAlex Elder 	size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
2683c5b5ef6cSAlex Elder 	page_count = (u32)calc_pages_for(0, size);
2684c5b5ef6cSAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2685c5b5ef6cSAlex Elder 	if (IS_ERR(pages))
2686c5b5ef6cSAlex Elder 		return PTR_ERR(pages);
2687c5b5ef6cSAlex Elder 
2688c5b5ef6cSAlex Elder 	ret = -ENOMEM;
2689c5b5ef6cSAlex Elder 	stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
2690c5b5ef6cSAlex Elder 							OBJ_REQUEST_PAGES);
2691c5b5ef6cSAlex Elder 	if (!stat_request)
2692c5b5ef6cSAlex Elder 		goto out;
2693c5b5ef6cSAlex Elder 
2694c5b5ef6cSAlex Elder 	rbd_obj_request_get(obj_request);
2695c5b5ef6cSAlex Elder 	stat_request->obj_request = obj_request;
2696c5b5ef6cSAlex Elder 	stat_request->pages = pages;
2697c5b5ef6cSAlex Elder 	stat_request->page_count = page_count;
2698c5b5ef6cSAlex Elder 
2699c5b5ef6cSAlex Elder 	rbd_assert(obj_request->img_request);
2700c5b5ef6cSAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
2701deb236b3SIlya Dryomov 	stat_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1,
2702c5b5ef6cSAlex Elder 						   stat_request);
2703c5b5ef6cSAlex Elder 	if (!stat_request->osd_req)
2704c5b5ef6cSAlex Elder 		goto out;
2705c5b5ef6cSAlex Elder 	stat_request->callback = rbd_img_obj_exists_callback;
2706c5b5ef6cSAlex Elder 
2707c5b5ef6cSAlex Elder 	osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT);
2708c5b5ef6cSAlex Elder 	osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
2709c5b5ef6cSAlex Elder 					false, false);
27109d4df01fSAlex Elder 	rbd_osd_req_format_read(stat_request);
2711c5b5ef6cSAlex Elder 
2712c5b5ef6cSAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
2713c5b5ef6cSAlex Elder 	ret = rbd_obj_request_submit(osdc, stat_request);
2714c5b5ef6cSAlex Elder out:
2715c5b5ef6cSAlex Elder 	if (ret)
2716c5b5ef6cSAlex Elder 		rbd_obj_request_put(obj_request);
2717c5b5ef6cSAlex Elder 
2718c5b5ef6cSAlex Elder 	return ret;
2719c5b5ef6cSAlex Elder }
2720c5b5ef6cSAlex Elder 
2721b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
2722b454e36dSAlex Elder {
2723b454e36dSAlex Elder 	struct rbd_img_request *img_request;
2724a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev;
27253d7efd18SAlex Elder 	bool known;
2726b454e36dSAlex Elder 
2727b454e36dSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
2728b454e36dSAlex Elder 
2729b454e36dSAlex Elder 	img_request = obj_request->img_request;
2730b454e36dSAlex Elder 	rbd_assert(img_request);
2731a9e8ba2cSAlex Elder 	rbd_dev = img_request->rbd_dev;
2732b454e36dSAlex Elder 
2733b454e36dSAlex Elder 	/*
2734a9e8ba2cSAlex Elder 	 * Only writes to layered images need special handling.
2735a9e8ba2cSAlex Elder 	 * Reads and non-layered writes are simple object requests.
2736a9e8ba2cSAlex Elder 	 * Layered writes that start beyond the end of the overlap
2737a9e8ba2cSAlex Elder 	 * with the parent have no parent data, so they too are
2738a9e8ba2cSAlex Elder 	 * simple object requests.  Finally, if the target object is
2739a9e8ba2cSAlex Elder 	 * known to already exist, its parent data has already been
2740a9e8ba2cSAlex Elder 	 * copied, so a write to the object can also be handled as a
2741a9e8ba2cSAlex Elder 	 * simple object request.
2742b454e36dSAlex Elder 	 */
2743b454e36dSAlex Elder 	if (!img_request_write_test(img_request) ||
2744b454e36dSAlex Elder 		!img_request_layered_test(img_request) ||
2745a9e8ba2cSAlex Elder 		rbd_dev->parent_overlap <= obj_request->img_offset ||
27463d7efd18SAlex Elder 		((known = obj_request_known_test(obj_request)) &&
27473d7efd18SAlex Elder 			obj_request_exists_test(obj_request))) {
2748b454e36dSAlex Elder 
2749b454e36dSAlex Elder 		struct rbd_device *rbd_dev;
2750b454e36dSAlex Elder 		struct ceph_osd_client *osdc;
2751b454e36dSAlex Elder 
2752b454e36dSAlex Elder 		rbd_dev = obj_request->img_request->rbd_dev;
2753b454e36dSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
2754b454e36dSAlex Elder 
2755b454e36dSAlex Elder 		return rbd_obj_request_submit(osdc, obj_request);
2756b454e36dSAlex Elder 	}
2757b454e36dSAlex Elder 
2758b454e36dSAlex Elder 	/*
27593d7efd18SAlex Elder 	 * It's a layered write.  The target object might exist but
27603d7efd18SAlex Elder 	 * we may not know that yet.  If we know it doesn't exist,
27613d7efd18SAlex Elder 	 * start by reading the data for the full target object from
27623d7efd18SAlex Elder 	 * the parent so we can use it for a copyup to the target.
2763b454e36dSAlex Elder 	 */
27643d7efd18SAlex Elder 	if (known)
27653d7efd18SAlex Elder 		return rbd_img_obj_parent_read_full(obj_request);
27663d7efd18SAlex Elder 
27673d7efd18SAlex Elder 	/* We don't know whether the target exists.  Go find out. */
2768b454e36dSAlex Elder 
2769b454e36dSAlex Elder 	return rbd_img_obj_exists_submit(obj_request);
2770b454e36dSAlex Elder }
2771b454e36dSAlex Elder 
2772bf0d5f50SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request)
2773bf0d5f50SAlex Elder {
2774bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
277546faeed4SAlex Elder 	struct rbd_obj_request *next_obj_request;
2776bf0d5f50SAlex Elder 
277737206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
277846faeed4SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
2779bf0d5f50SAlex Elder 		int ret;
2780bf0d5f50SAlex Elder 
2781b454e36dSAlex Elder 		ret = rbd_img_obj_request_submit(obj_request);
2782bf0d5f50SAlex Elder 		if (ret)
2783bf0d5f50SAlex Elder 			return ret;
2784bf0d5f50SAlex Elder 	}
2785bf0d5f50SAlex Elder 
2786bf0d5f50SAlex Elder 	return 0;
2787bf0d5f50SAlex Elder }
2788bf0d5f50SAlex Elder 
27898b3e1a56SAlex Elder static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
27908b3e1a56SAlex Elder {
27918b3e1a56SAlex Elder 	struct rbd_obj_request *obj_request;
2792a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev;
2793a9e8ba2cSAlex Elder 	u64 obj_end;
279402c74fbaSAlex Elder 	u64 img_xferred;
279502c74fbaSAlex Elder 	int img_result;
27968b3e1a56SAlex Elder 
27978b3e1a56SAlex Elder 	rbd_assert(img_request_child_test(img_request));
27988b3e1a56SAlex Elder 
279902c74fbaSAlex Elder 	/* First get what we need from the image request and release it */
280002c74fbaSAlex Elder 
28018b3e1a56SAlex Elder 	obj_request = img_request->obj_request;
280202c74fbaSAlex Elder 	img_xferred = img_request->xferred;
280302c74fbaSAlex Elder 	img_result = img_request->result;
280402c74fbaSAlex Elder 	rbd_img_request_put(img_request);
280502c74fbaSAlex Elder 
280602c74fbaSAlex Elder 	/*
280702c74fbaSAlex Elder 	 * If the overlap has become 0 (most likely because the
280802c74fbaSAlex Elder 	 * image has been flattened) we need to re-submit the
280902c74fbaSAlex Elder 	 * original request.
281002c74fbaSAlex Elder 	 */
2811a9e8ba2cSAlex Elder 	rbd_assert(obj_request);
2812a9e8ba2cSAlex Elder 	rbd_assert(obj_request->img_request);
281302c74fbaSAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
281402c74fbaSAlex Elder 	if (!rbd_dev->parent_overlap) {
281502c74fbaSAlex Elder 		struct ceph_osd_client *osdc;
28168b3e1a56SAlex Elder 
281702c74fbaSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
281802c74fbaSAlex Elder 		img_result = rbd_obj_request_submit(osdc, obj_request);
281902c74fbaSAlex Elder 		if (!img_result)
282002c74fbaSAlex Elder 			return;
282102c74fbaSAlex Elder 	}
282202c74fbaSAlex Elder 
282302c74fbaSAlex Elder 	obj_request->result = img_result;
2824a9e8ba2cSAlex Elder 	if (obj_request->result)
2825a9e8ba2cSAlex Elder 		goto out;
2826a9e8ba2cSAlex Elder 
2827a9e8ba2cSAlex Elder 	/*
2828a9e8ba2cSAlex Elder 	 * We need to zero anything beyond the parent overlap
2829a9e8ba2cSAlex Elder 	 * boundary.  Since rbd_img_obj_request_read_callback()
2830a9e8ba2cSAlex Elder 	 * will zero anything beyond the end of a short read, an
2831a9e8ba2cSAlex Elder 	 * easy way to do this is to pretend the data from the
2832a9e8ba2cSAlex Elder 	 * parent came up short--ending at the overlap boundary.
2833a9e8ba2cSAlex Elder 	 */
2834a9e8ba2cSAlex Elder 	rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
2835a9e8ba2cSAlex Elder 	obj_end = obj_request->img_offset + obj_request->length;
2836a9e8ba2cSAlex Elder 	if (obj_end > rbd_dev->parent_overlap) {
2837a9e8ba2cSAlex Elder 		u64 xferred = 0;
2838a9e8ba2cSAlex Elder 
2839a9e8ba2cSAlex Elder 		if (obj_request->img_offset < rbd_dev->parent_overlap)
2840a9e8ba2cSAlex Elder 			xferred = rbd_dev->parent_overlap -
2841a9e8ba2cSAlex Elder 					obj_request->img_offset;
2842a9e8ba2cSAlex Elder 
284302c74fbaSAlex Elder 		obj_request->xferred = min(img_xferred, xferred);
2844a9e8ba2cSAlex Elder 	} else {
284502c74fbaSAlex Elder 		obj_request->xferred = img_xferred;
2846a9e8ba2cSAlex Elder 	}
2847a9e8ba2cSAlex Elder out:
28488b3e1a56SAlex Elder 	rbd_img_obj_request_read_callback(obj_request);
28498b3e1a56SAlex Elder 	rbd_obj_request_complete(obj_request);
28508b3e1a56SAlex Elder }
28518b3e1a56SAlex Elder 
28528b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
28538b3e1a56SAlex Elder {
28548b3e1a56SAlex Elder 	struct rbd_img_request *img_request;
28558b3e1a56SAlex Elder 	int result;
28568b3e1a56SAlex Elder 
28578b3e1a56SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
28588b3e1a56SAlex Elder 	rbd_assert(obj_request->img_request != NULL);
28598b3e1a56SAlex Elder 	rbd_assert(obj_request->result == (s32) -ENOENT);
28605b2ab72dSAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
28618b3e1a56SAlex Elder 
28628b3e1a56SAlex Elder 	/* rbd_read_finish(obj_request, obj_request->length); */
2863e93f3152SAlex Elder 	img_request = rbd_parent_request_create(obj_request,
28648b3e1a56SAlex Elder 						obj_request->img_offset,
2865e93f3152SAlex Elder 						obj_request->length);
28668b3e1a56SAlex Elder 	result = -ENOMEM;
28678b3e1a56SAlex Elder 	if (!img_request)
28688b3e1a56SAlex Elder 		goto out_err;
28698b3e1a56SAlex Elder 
28705b2ab72dSAlex Elder 	if (obj_request->type == OBJ_REQUEST_BIO)
2871f1a4739fSAlex Elder 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
2872f1a4739fSAlex Elder 						obj_request->bio_list);
28735b2ab72dSAlex Elder 	else
28745b2ab72dSAlex Elder 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES,
28755b2ab72dSAlex Elder 						obj_request->pages);
28768b3e1a56SAlex Elder 	if (result)
28778b3e1a56SAlex Elder 		goto out_err;
28788b3e1a56SAlex Elder 
28798b3e1a56SAlex Elder 	img_request->callback = rbd_img_parent_read_callback;
28808b3e1a56SAlex Elder 	result = rbd_img_request_submit(img_request);
28818b3e1a56SAlex Elder 	if (result)
28828b3e1a56SAlex Elder 		goto out_err;
28838b3e1a56SAlex Elder 
28848b3e1a56SAlex Elder 	return;
28858b3e1a56SAlex Elder out_err:
28868b3e1a56SAlex Elder 	if (img_request)
28878b3e1a56SAlex Elder 		rbd_img_request_put(img_request);
28888b3e1a56SAlex Elder 	obj_request->result = result;
28898b3e1a56SAlex Elder 	obj_request->xferred = 0;
28908b3e1a56SAlex Elder 	obj_request_done_set(obj_request);
28918b3e1a56SAlex Elder }
28928b3e1a56SAlex Elder 
289320e0af67SJosh Durgin static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id)
2894b8d70035SAlex Elder {
2895b8d70035SAlex Elder 	struct rbd_obj_request *obj_request;
28962169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2897b8d70035SAlex Elder 	int ret;
2898b8d70035SAlex Elder 
2899b8d70035SAlex Elder 	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
2900b8d70035SAlex Elder 							OBJ_REQUEST_NODATA);
2901b8d70035SAlex Elder 	if (!obj_request)
2902b8d70035SAlex Elder 		return -ENOMEM;
2903b8d70035SAlex Elder 
2904b8d70035SAlex Elder 	ret = -ENOMEM;
2905deb236b3SIlya Dryomov 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1,
2906deb236b3SIlya Dryomov 						  obj_request);
2907b8d70035SAlex Elder 	if (!obj_request->osd_req)
2908b8d70035SAlex Elder 		goto out;
2909b8d70035SAlex Elder 
2910c99d2d4aSAlex Elder 	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
2911cc4a38bdSAlex Elder 					notify_id, 0, 0);
29129d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
2913430c28c3SAlex Elder 
2914b8d70035SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
2915cf81b60eSAlex Elder 	if (ret)
291620e0af67SJosh Durgin 		goto out;
291720e0af67SJosh Durgin 	ret = rbd_obj_request_wait(obj_request);
291820e0af67SJosh Durgin out:
2919b8d70035SAlex Elder 	rbd_obj_request_put(obj_request);
2920b8d70035SAlex Elder 
2921b8d70035SAlex Elder 	return ret;
2922b8d70035SAlex Elder }
2923b8d70035SAlex Elder 
2924b8d70035SAlex Elder static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
2925b8d70035SAlex Elder {
2926b8d70035SAlex Elder 	struct rbd_device *rbd_dev = (struct rbd_device *)data;
2927e627db08SAlex Elder 	int ret;
2928b8d70035SAlex Elder 
2929b8d70035SAlex Elder 	if (!rbd_dev)
2930b8d70035SAlex Elder 		return;
2931b8d70035SAlex Elder 
293237206ee5SAlex Elder 	dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
2933b8d70035SAlex Elder 		rbd_dev->header_name, (unsigned long long)notify_id,
2934b8d70035SAlex Elder 		(unsigned int)opcode);
2935e627db08SAlex Elder 	ret = rbd_dev_refresh(rbd_dev);
2936e627db08SAlex Elder 	if (ret)
29373b5cf2a2SAlex Elder 		rbd_warn(rbd_dev, "header refresh error (%d)\n", ret);
2938b8d70035SAlex Elder 
293920e0af67SJosh Durgin 	rbd_obj_notify_ack_sync(rbd_dev, notify_id);
2940b8d70035SAlex Elder }
2941b8d70035SAlex Elder 
29429969ebc5SAlex Elder /*
2943b30a01f2SIlya Dryomov  * Initiate a watch request, synchronously.
29449969ebc5SAlex Elder  */
2945b30a01f2SIlya Dryomov static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev)
29469969ebc5SAlex Elder {
29479969ebc5SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
29489969ebc5SAlex Elder 	struct rbd_obj_request *obj_request;
29499969ebc5SAlex Elder 	int ret;
29509969ebc5SAlex Elder 
2951b30a01f2SIlya Dryomov 	rbd_assert(!rbd_dev->watch_event);
2952b30a01f2SIlya Dryomov 	rbd_assert(!rbd_dev->watch_request);
29539969ebc5SAlex Elder 
29543c663bbdSAlex Elder 	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
29559969ebc5SAlex Elder 				     &rbd_dev->watch_event);
29569969ebc5SAlex Elder 	if (ret < 0)
29579969ebc5SAlex Elder 		return ret;
29589969ebc5SAlex Elder 
2959b30a01f2SIlya Dryomov 	rbd_assert(rbd_dev->watch_event);
2960b30a01f2SIlya Dryomov 
29619969ebc5SAlex Elder 	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
29629969ebc5SAlex Elder 					     OBJ_REQUEST_NODATA);
2963b30a01f2SIlya Dryomov 	if (!obj_request) {
2964b30a01f2SIlya Dryomov 		ret = -ENOMEM;
29659969ebc5SAlex Elder 		goto out_cancel;
2966b30a01f2SIlya Dryomov 	}
29679969ebc5SAlex Elder 
2968deb236b3SIlya Dryomov 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, 1,
2969deb236b3SIlya Dryomov 						  obj_request);
2970b30a01f2SIlya Dryomov 	if (!obj_request->osd_req) {
2971b30a01f2SIlya Dryomov 		ret = -ENOMEM;
2972b30a01f2SIlya Dryomov 		goto out_put;
2973b30a01f2SIlya Dryomov 	}
2974430c28c3SAlex Elder 
2975975241afSAlex Elder 	ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
29762169238dSAlex Elder 
29772169238dSAlex Elder 	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
2978b30a01f2SIlya Dryomov 			      rbd_dev->watch_event->cookie, 0, 1);
29799d4df01fSAlex Elder 	rbd_osd_req_format_write(obj_request);
29802169238dSAlex Elder 
29819969ebc5SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
29829969ebc5SAlex Elder 	if (ret)
2983b30a01f2SIlya Dryomov 		goto out_linger;
2984b30a01f2SIlya Dryomov 
29859969ebc5SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
29869969ebc5SAlex Elder 	if (ret)
2987b30a01f2SIlya Dryomov 		goto out_linger;
2988b30a01f2SIlya Dryomov 
29899969ebc5SAlex Elder 	ret = obj_request->result;
29909969ebc5SAlex Elder 	if (ret)
2991b30a01f2SIlya Dryomov 		goto out_linger;
29929969ebc5SAlex Elder 
29938eb87565SAlex Elder 	/*
29948eb87565SAlex Elder 	 * A watch request is set to linger, so the underlying osd
29958eb87565SAlex Elder 	 * request won't go away until we unregister it.  We retain
29968eb87565SAlex Elder 	 * a pointer to the object request during that time (in
29978eb87565SAlex Elder 	 * rbd_dev->watch_request), so we'll keep a reference to
29988eb87565SAlex Elder 	 * it.  We'll drop that reference (below) after we've
29998eb87565SAlex Elder 	 * unregistered it.
30008eb87565SAlex Elder 	 */
30018eb87565SAlex Elder 	rbd_dev->watch_request = obj_request;
30028eb87565SAlex Elder 
30038eb87565SAlex Elder 	return 0;
30048eb87565SAlex Elder 
3005b30a01f2SIlya Dryomov out_linger:
3006b30a01f2SIlya Dryomov 	ceph_osdc_unregister_linger_request(osdc, obj_request->osd_req);
3007b30a01f2SIlya Dryomov out_put:
3008b30a01f2SIlya Dryomov 	rbd_obj_request_put(obj_request);
30099969ebc5SAlex Elder out_cancel:
30109969ebc5SAlex Elder 	ceph_osdc_cancel_event(rbd_dev->watch_event);
30119969ebc5SAlex Elder 	rbd_dev->watch_event = NULL;
30129969ebc5SAlex Elder 
30139969ebc5SAlex Elder 	return ret;
30149969ebc5SAlex Elder }
30159969ebc5SAlex Elder 
3016b30a01f2SIlya Dryomov /*
3017b30a01f2SIlya Dryomov  * Tear down a watch request, synchronously.
3018b30a01f2SIlya Dryomov  */
3019b30a01f2SIlya Dryomov static int __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
3020fca27065SIlya Dryomov {
3021b30a01f2SIlya Dryomov 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3022b30a01f2SIlya Dryomov 	struct rbd_obj_request *obj_request;
3023b30a01f2SIlya Dryomov 	int ret;
3024b30a01f2SIlya Dryomov 
3025b30a01f2SIlya Dryomov 	rbd_assert(rbd_dev->watch_event);
3026b30a01f2SIlya Dryomov 	rbd_assert(rbd_dev->watch_request);
3027b30a01f2SIlya Dryomov 
3028b30a01f2SIlya Dryomov 	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
3029b30a01f2SIlya Dryomov 					     OBJ_REQUEST_NODATA);
3030b30a01f2SIlya Dryomov 	if (!obj_request) {
3031b30a01f2SIlya Dryomov 		ret = -ENOMEM;
3032b30a01f2SIlya Dryomov 		goto out_cancel;
3033b30a01f2SIlya Dryomov 	}
3034b30a01f2SIlya Dryomov 
3035b30a01f2SIlya Dryomov 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, 1,
3036b30a01f2SIlya Dryomov 						  obj_request);
3037b30a01f2SIlya Dryomov 	if (!obj_request->osd_req) {
3038b30a01f2SIlya Dryomov 		ret = -ENOMEM;
3039b30a01f2SIlya Dryomov 		goto out_put;
3040b30a01f2SIlya Dryomov 	}
3041b30a01f2SIlya Dryomov 
3042b30a01f2SIlya Dryomov 	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
3043b30a01f2SIlya Dryomov 			      rbd_dev->watch_event->cookie, 0, 0);
3044b30a01f2SIlya Dryomov 	rbd_osd_req_format_write(obj_request);
3045b30a01f2SIlya Dryomov 
3046b30a01f2SIlya Dryomov 	ret = rbd_obj_request_submit(osdc, obj_request);
3047b30a01f2SIlya Dryomov 	if (ret)
3048b30a01f2SIlya Dryomov 		goto out_put;
3049b30a01f2SIlya Dryomov 
3050b30a01f2SIlya Dryomov 	ret = rbd_obj_request_wait(obj_request);
3051b30a01f2SIlya Dryomov 	if (ret)
3052b30a01f2SIlya Dryomov 		goto out_put;
3053b30a01f2SIlya Dryomov 
3054b30a01f2SIlya Dryomov 	ret = obj_request->result;
3055b30a01f2SIlya Dryomov 	if (ret)
3056b30a01f2SIlya Dryomov 		goto out_put;
3057b30a01f2SIlya Dryomov 
3058b30a01f2SIlya Dryomov 	/* We have successfully torn down the watch request */
3059b30a01f2SIlya Dryomov 
3060b30a01f2SIlya Dryomov 	ceph_osdc_unregister_linger_request(osdc,
3061b30a01f2SIlya Dryomov 					    rbd_dev->watch_request->osd_req);
3062b30a01f2SIlya Dryomov 	rbd_obj_request_put(rbd_dev->watch_request);
3063b30a01f2SIlya Dryomov 	rbd_dev->watch_request = NULL;
3064b30a01f2SIlya Dryomov 
3065b30a01f2SIlya Dryomov out_put:
3066b30a01f2SIlya Dryomov 	rbd_obj_request_put(obj_request);
3067b30a01f2SIlya Dryomov out_cancel:
3068b30a01f2SIlya Dryomov 	ceph_osdc_cancel_event(rbd_dev->watch_event);
3069b30a01f2SIlya Dryomov 	rbd_dev->watch_event = NULL;
3070b30a01f2SIlya Dryomov 
3071b30a01f2SIlya Dryomov 	return ret;
3072fca27065SIlya Dryomov }
3073fca27065SIlya Dryomov 
/*
 * Tear down the header watch.  A failure is only logged; nothing
 * is reported back to the caller.
 */
static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
{
	int ret = __rbd_dev_header_unwatch_sync(rbd_dev);

	if (!ret)
		return;

	rbd_warn(rbd_dev, "unable to tear down watch request: %d\n", ret);
}
3084fca27065SIlya Dryomov 
308536be9a76SAlex Elder /*
3086f40eb349SAlex Elder  * Synchronous osd object method call.  Returns the number of bytes
3087f40eb349SAlex Elder  * returned in the outbound buffer, or a negative error code.
308836be9a76SAlex Elder  */
308936be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
309036be9a76SAlex Elder 			     const char *object_name,
309136be9a76SAlex Elder 			     const char *class_name,
309236be9a76SAlex Elder 			     const char *method_name,
30934157976bSAlex Elder 			     const void *outbound,
309436be9a76SAlex Elder 			     size_t outbound_size,
30954157976bSAlex Elder 			     void *inbound,
3096e2a58ee5SAlex Elder 			     size_t inbound_size)
309736be9a76SAlex Elder {
30982169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
309936be9a76SAlex Elder 	struct rbd_obj_request *obj_request;
310036be9a76SAlex Elder 	struct page **pages;
310136be9a76SAlex Elder 	u32 page_count;
310236be9a76SAlex Elder 	int ret;
310336be9a76SAlex Elder 
310436be9a76SAlex Elder 	/*
31056010a451SAlex Elder 	 * Method calls are ultimately read operations.  The result
31066010a451SAlex Elder 	 * should placed into the inbound buffer provided.  They
31076010a451SAlex Elder 	 * also supply outbound data--parameters for the object
31086010a451SAlex Elder 	 * method.  Currently if this is present it will be a
31096010a451SAlex Elder 	 * snapshot id.
311036be9a76SAlex Elder 	 */
311136be9a76SAlex Elder 	page_count = (u32)calc_pages_for(0, inbound_size);
311236be9a76SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
311336be9a76SAlex Elder 	if (IS_ERR(pages))
311436be9a76SAlex Elder 		return PTR_ERR(pages);
311536be9a76SAlex Elder 
311636be9a76SAlex Elder 	ret = -ENOMEM;
31176010a451SAlex Elder 	obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
311836be9a76SAlex Elder 							OBJ_REQUEST_PAGES);
311936be9a76SAlex Elder 	if (!obj_request)
312036be9a76SAlex Elder 		goto out;
312136be9a76SAlex Elder 
312236be9a76SAlex Elder 	obj_request->pages = pages;
312336be9a76SAlex Elder 	obj_request->page_count = page_count;
312436be9a76SAlex Elder 
3125deb236b3SIlya Dryomov 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1,
3126deb236b3SIlya Dryomov 						  obj_request);
312736be9a76SAlex Elder 	if (!obj_request->osd_req)
312836be9a76SAlex Elder 		goto out;
312936be9a76SAlex Elder 
3130c99d2d4aSAlex Elder 	osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
313104017e29SAlex Elder 					class_name, method_name);
313204017e29SAlex Elder 	if (outbound_size) {
313304017e29SAlex Elder 		struct ceph_pagelist *pagelist;
313404017e29SAlex Elder 
313504017e29SAlex Elder 		pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
313604017e29SAlex Elder 		if (!pagelist)
313704017e29SAlex Elder 			goto out;
313804017e29SAlex Elder 
313904017e29SAlex Elder 		ceph_pagelist_init(pagelist);
314004017e29SAlex Elder 		ceph_pagelist_append(pagelist, outbound, outbound_size);
314104017e29SAlex Elder 		osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
314204017e29SAlex Elder 						pagelist);
314304017e29SAlex Elder 	}
3144a4ce40a9SAlex Elder 	osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
3145a4ce40a9SAlex Elder 					obj_request->pages, inbound_size,
314644cd188dSAlex Elder 					0, false, false);
31479d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
3148430c28c3SAlex Elder 
314936be9a76SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
315036be9a76SAlex Elder 	if (ret)
315136be9a76SAlex Elder 		goto out;
315236be9a76SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
315336be9a76SAlex Elder 	if (ret)
315436be9a76SAlex Elder 		goto out;
315536be9a76SAlex Elder 
315636be9a76SAlex Elder 	ret = obj_request->result;
315736be9a76SAlex Elder 	if (ret < 0)
315836be9a76SAlex Elder 		goto out;
315957385b51SAlex Elder 
316057385b51SAlex Elder 	rbd_assert(obj_request->xferred < (u64)INT_MAX);
316157385b51SAlex Elder 	ret = (int)obj_request->xferred;
3162903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
316336be9a76SAlex Elder out:
316436be9a76SAlex Elder 	if (obj_request)
316536be9a76SAlex Elder 		rbd_obj_request_put(obj_request);
316636be9a76SAlex Elder 	else
316736be9a76SAlex Elder 		ceph_release_page_vector(pages, page_count);
316836be9a76SAlex Elder 
316936be9a76SAlex Elder 	return ret;
317036be9a76SAlex Elder }
317136be9a76SAlex Elder 
3172bf0d5f50SAlex Elder static void rbd_request_fn(struct request_queue *q)
3173cc344fa1SAlex Elder 		__releases(q->queue_lock) __acquires(q->queue_lock)
3174bf0d5f50SAlex Elder {
3175bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = q->queuedata;
3176bf0d5f50SAlex Elder 	struct request *rq;
3177bf0d5f50SAlex Elder 	int result;
3178bf0d5f50SAlex Elder 
3179bf0d5f50SAlex Elder 	while ((rq = blk_fetch_request(q))) {
3180bf0d5f50SAlex Elder 		bool write_request = rq_data_dir(rq) == WRITE;
3181bf0d5f50SAlex Elder 		struct rbd_img_request *img_request;
3182bf0d5f50SAlex Elder 		u64 offset;
3183bf0d5f50SAlex Elder 		u64 length;
3184bf0d5f50SAlex Elder 
3185bf0d5f50SAlex Elder 		/* Ignore any non-FS requests that filter through. */
3186bf0d5f50SAlex Elder 
3187bf0d5f50SAlex Elder 		if (rq->cmd_type != REQ_TYPE_FS) {
31884dda41d3SAlex Elder 			dout("%s: non-fs request type %d\n", __func__,
31894dda41d3SAlex Elder 				(int) rq->cmd_type);
31904dda41d3SAlex Elder 			__blk_end_request_all(rq, 0);
31914dda41d3SAlex Elder 			continue;
31924dda41d3SAlex Elder 		}
31934dda41d3SAlex Elder 
31944dda41d3SAlex Elder 		/* Ignore/skip any zero-length requests */
31954dda41d3SAlex Elder 
31964dda41d3SAlex Elder 		offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
31974dda41d3SAlex Elder 		length = (u64) blk_rq_bytes(rq);
31984dda41d3SAlex Elder 
31994dda41d3SAlex Elder 		if (!length) {
32004dda41d3SAlex Elder 			dout("%s: zero-length request\n", __func__);
3201bf0d5f50SAlex Elder 			__blk_end_request_all(rq, 0);
3202bf0d5f50SAlex Elder 			continue;
3203bf0d5f50SAlex Elder 		}
3204bf0d5f50SAlex Elder 
3205bf0d5f50SAlex Elder 		spin_unlock_irq(q->queue_lock);
3206bf0d5f50SAlex Elder 
3207bf0d5f50SAlex Elder 		/* Disallow writes to a read-only device */
3208bf0d5f50SAlex Elder 
3209bf0d5f50SAlex Elder 		if (write_request) {
3210bf0d5f50SAlex Elder 			result = -EROFS;
3211*131fd9f6SGuangliang Zhao 			if (rbd_dev->mapping.read_only)
3212bf0d5f50SAlex Elder 				goto end_request;
3213bf0d5f50SAlex Elder 			rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
3214bf0d5f50SAlex Elder 		}
3215bf0d5f50SAlex Elder 
32166d292906SAlex Elder 		/*
32176d292906SAlex Elder 		 * Quit early if the mapped snapshot no longer
32186d292906SAlex Elder 		 * exists.  It's still possible the snapshot will
32196d292906SAlex Elder 		 * have disappeared by the time our request arrives
32206d292906SAlex Elder 		 * at the osd, but there's no sense in sending it if
32216d292906SAlex Elder 		 * we already know.
32226d292906SAlex Elder 		 */
32236d292906SAlex Elder 		if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
3224bf0d5f50SAlex Elder 			dout("request for non-existent snapshot");
3225bf0d5f50SAlex Elder 			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
3226bf0d5f50SAlex Elder 			result = -ENXIO;
3227bf0d5f50SAlex Elder 			goto end_request;
3228bf0d5f50SAlex Elder 		}
3229bf0d5f50SAlex Elder 
3230bf0d5f50SAlex Elder 		result = -EINVAL;
3231c0cd10dbSAlex Elder 		if (offset && length > U64_MAX - offset + 1) {
3232c0cd10dbSAlex Elder 			rbd_warn(rbd_dev, "bad request range (%llu~%llu)\n",
3233c0cd10dbSAlex Elder 				offset, length);
3234bf0d5f50SAlex Elder 			goto end_request;	/* Shouldn't happen */
3235c0cd10dbSAlex Elder 		}
3236bf0d5f50SAlex Elder 
323700a653e2SAlex Elder 		result = -EIO;
323800a653e2SAlex Elder 		if (offset + length > rbd_dev->mapping.size) {
323900a653e2SAlex Elder 			rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)\n",
324000a653e2SAlex Elder 				offset, length, rbd_dev->mapping.size);
324100a653e2SAlex Elder 			goto end_request;
324200a653e2SAlex Elder 		}
324300a653e2SAlex Elder 
3244bf0d5f50SAlex Elder 		result = -ENOMEM;
3245bf0d5f50SAlex Elder 		img_request = rbd_img_request_create(rbd_dev, offset, length,
3246e93f3152SAlex Elder 							write_request);
3247bf0d5f50SAlex Elder 		if (!img_request)
3248bf0d5f50SAlex Elder 			goto end_request;
3249bf0d5f50SAlex Elder 
3250bf0d5f50SAlex Elder 		img_request->rq = rq;
3251bf0d5f50SAlex Elder 
3252f1a4739fSAlex Elder 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
3253f1a4739fSAlex Elder 						rq->bio);
3254bf0d5f50SAlex Elder 		if (!result)
3255bf0d5f50SAlex Elder 			result = rbd_img_request_submit(img_request);
3256bf0d5f50SAlex Elder 		if (result)
3257bf0d5f50SAlex Elder 			rbd_img_request_put(img_request);
3258bf0d5f50SAlex Elder end_request:
3259bf0d5f50SAlex Elder 		spin_lock_irq(q->queue_lock);
3260bf0d5f50SAlex Elder 		if (result < 0) {
32617da22d29SAlex Elder 			rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
32627da22d29SAlex Elder 				write_request ? "write" : "read",
32637da22d29SAlex Elder 				length, offset, result);
32647da22d29SAlex Elder 
3265bf0d5f50SAlex Elder 			__blk_end_request_all(rq, result);
3266bf0d5f50SAlex Elder 		}
3267bf0d5f50SAlex Elder 	}
3268bf0d5f50SAlex Elder }
3269bf0d5f50SAlex Elder 
3270602adf40SYehuda Sadeh /*
3271602adf40SYehuda Sadeh  * a queue callback. Makes sure that we don't create a bio that spans across
3272602adf40SYehuda Sadeh  * multiple osd objects. One exception would be with a single page bios,
3273f7760dadSAlex Elder  * which we handle later at bio_chain_clone_range()
3274602adf40SYehuda Sadeh  */
3275602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
3276602adf40SYehuda Sadeh 			  struct bio_vec *bvec)
3277602adf40SYehuda Sadeh {
3278602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
3279e5cfeed2SAlex Elder 	sector_t sector_offset;
3280e5cfeed2SAlex Elder 	sector_t sectors_per_obj;
3281e5cfeed2SAlex Elder 	sector_t obj_sector_offset;
3282e5cfeed2SAlex Elder 	int ret;
3283602adf40SYehuda Sadeh 
3284e5cfeed2SAlex Elder 	/*
3285e5cfeed2SAlex Elder 	 * Find how far into its rbd object the partition-relative
3286e5cfeed2SAlex Elder 	 * bio start sector is to offset relative to the enclosing
3287e5cfeed2SAlex Elder 	 * device.
3288e5cfeed2SAlex Elder 	 */
3289e5cfeed2SAlex Elder 	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
3290e5cfeed2SAlex Elder 	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
3291e5cfeed2SAlex Elder 	obj_sector_offset = sector_offset & (sectors_per_obj - 1);
3292593a9e7bSAlex Elder 
3293e5cfeed2SAlex Elder 	/*
3294e5cfeed2SAlex Elder 	 * Compute the number of bytes from that offset to the end
3295e5cfeed2SAlex Elder 	 * of the object.  Account for what's already used by the bio.
3296e5cfeed2SAlex Elder 	 */
3297e5cfeed2SAlex Elder 	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
3298e5cfeed2SAlex Elder 	if (ret > bmd->bi_size)
3299e5cfeed2SAlex Elder 		ret -= bmd->bi_size;
3300e5cfeed2SAlex Elder 	else
3301e5cfeed2SAlex Elder 		ret = 0;
3302e5cfeed2SAlex Elder 
3303e5cfeed2SAlex Elder 	/*
3304e5cfeed2SAlex Elder 	 * Don't send back more than was asked for.  And if the bio
3305e5cfeed2SAlex Elder 	 * was empty, let the whole thing through because:  "Note
3306e5cfeed2SAlex Elder 	 * that a block device *must* allow a single page to be
3307e5cfeed2SAlex Elder 	 * added to an empty bio."
3308e5cfeed2SAlex Elder 	 */
3309e5cfeed2SAlex Elder 	rbd_assert(bvec->bv_len <= PAGE_SIZE);
3310e5cfeed2SAlex Elder 	if (ret > (int) bvec->bv_len || !bmd->bi_size)
3311e5cfeed2SAlex Elder 		ret = (int) bvec->bv_len;
3312e5cfeed2SAlex Elder 
3313e5cfeed2SAlex Elder 	return ret;
3314602adf40SYehuda Sadeh }
3315602adf40SYehuda Sadeh 
3316602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
3317602adf40SYehuda Sadeh {
3318602adf40SYehuda Sadeh 	struct gendisk *disk = rbd_dev->disk;
3319602adf40SYehuda Sadeh 
3320602adf40SYehuda Sadeh 	if (!disk)
3321602adf40SYehuda Sadeh 		return;
3322602adf40SYehuda Sadeh 
3323a0cab924SAlex Elder 	rbd_dev->disk = NULL;
3324a0cab924SAlex Elder 	if (disk->flags & GENHD_FL_UP) {
3325602adf40SYehuda Sadeh 		del_gendisk(disk);
3326602adf40SYehuda Sadeh 		if (disk->queue)
3327602adf40SYehuda Sadeh 			blk_cleanup_queue(disk->queue);
3328a0cab924SAlex Elder 	}
3329602adf40SYehuda Sadeh 	put_disk(disk);
3330602adf40SYehuda Sadeh }
3331602adf40SYehuda Sadeh 
3332788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
3333788e2df3SAlex Elder 				const char *object_name,
33347097f8dfSAlex Elder 				u64 offset, u64 length, void *buf)
3335788e2df3SAlex Elder 
3336788e2df3SAlex Elder {
33372169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3338788e2df3SAlex Elder 	struct rbd_obj_request *obj_request;
3339788e2df3SAlex Elder 	struct page **pages = NULL;
3340788e2df3SAlex Elder 	u32 page_count;
33411ceae7efSAlex Elder 	size_t size;
3342788e2df3SAlex Elder 	int ret;
3343788e2df3SAlex Elder 
3344788e2df3SAlex Elder 	page_count = (u32) calc_pages_for(offset, length);
3345788e2df3SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
3346788e2df3SAlex Elder 	if (IS_ERR(pages))
3347788e2df3SAlex Elder 		ret = PTR_ERR(pages);
3348788e2df3SAlex Elder 
3349788e2df3SAlex Elder 	ret = -ENOMEM;
3350788e2df3SAlex Elder 	obj_request = rbd_obj_request_create(object_name, offset, length,
3351788e2df3SAlex Elder 							OBJ_REQUEST_PAGES);
3352788e2df3SAlex Elder 	if (!obj_request)
3353788e2df3SAlex Elder 		goto out;
3354788e2df3SAlex Elder 
3355788e2df3SAlex Elder 	obj_request->pages = pages;
3356788e2df3SAlex Elder 	obj_request->page_count = page_count;
3357788e2df3SAlex Elder 
3358deb236b3SIlya Dryomov 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1,
3359deb236b3SIlya Dryomov 						  obj_request);
3360788e2df3SAlex Elder 	if (!obj_request->osd_req)
3361788e2df3SAlex Elder 		goto out;
3362788e2df3SAlex Elder 
3363c99d2d4aSAlex Elder 	osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
3364c99d2d4aSAlex Elder 					offset, length, 0, 0);
3365406e2c9fSAlex Elder 	osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
3366a4ce40a9SAlex Elder 					obj_request->pages,
336744cd188dSAlex Elder 					obj_request->length,
336844cd188dSAlex Elder 					obj_request->offset & ~PAGE_MASK,
336944cd188dSAlex Elder 					false, false);
33709d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
3371430c28c3SAlex Elder 
3372788e2df3SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
3373788e2df3SAlex Elder 	if (ret)
3374788e2df3SAlex Elder 		goto out;
3375788e2df3SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
3376788e2df3SAlex Elder 	if (ret)
3377788e2df3SAlex Elder 		goto out;
3378788e2df3SAlex Elder 
3379788e2df3SAlex Elder 	ret = obj_request->result;
3380788e2df3SAlex Elder 	if (ret < 0)
3381788e2df3SAlex Elder 		goto out;
33821ceae7efSAlex Elder 
33831ceae7efSAlex Elder 	rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
33841ceae7efSAlex Elder 	size = (size_t) obj_request->xferred;
3385903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, buf, 0, size);
338623ed6e13SAlex Elder 	rbd_assert(size <= (size_t)INT_MAX);
338723ed6e13SAlex Elder 	ret = (int)size;
3388788e2df3SAlex Elder out:
3389788e2df3SAlex Elder 	if (obj_request)
3390788e2df3SAlex Elder 		rbd_obj_request_put(obj_request);
3391788e2df3SAlex Elder 	else
3392788e2df3SAlex Elder 		ceph_release_page_vector(pages, page_count);
3393788e2df3SAlex Elder 
3394788e2df3SAlex Elder 	return ret;
3395788e2df3SAlex Elder }
3396788e2df3SAlex Elder 
3397602adf40SYehuda Sadeh /*
3398662518b1SAlex Elder  * Read the complete header for the given rbd device.  On successful
3399662518b1SAlex Elder  * return, the rbd_dev->header field will contain up-to-date
3400662518b1SAlex Elder  * information about the image.
34014156d998SAlex Elder  */
340299a41ebcSAlex Elder static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
34034156d998SAlex Elder {
34044156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk = NULL;
34054156d998SAlex Elder 	u32 snap_count = 0;
34064156d998SAlex Elder 	u64 names_size = 0;
34074156d998SAlex Elder 	u32 want_count;
34084156d998SAlex Elder 	int ret;
34094156d998SAlex Elder 
34104156d998SAlex Elder 	/*
34114156d998SAlex Elder 	 * The complete header will include an array of its 64-bit
34124156d998SAlex Elder 	 * snapshot ids, followed by the names of those snapshots as
34134156d998SAlex Elder 	 * a contiguous block of NUL-terminated strings.  Note that
34144156d998SAlex Elder 	 * the number of snapshots could change by the time we read
34154156d998SAlex Elder 	 * it in, in which case we re-read it.
34164156d998SAlex Elder 	 */
34174156d998SAlex Elder 	do {
34184156d998SAlex Elder 		size_t size;
34194156d998SAlex Elder 
34204156d998SAlex Elder 		kfree(ondisk);
34214156d998SAlex Elder 
34224156d998SAlex Elder 		size = sizeof (*ondisk);
34234156d998SAlex Elder 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
34244156d998SAlex Elder 		size += names_size;
34254156d998SAlex Elder 		ondisk = kmalloc(size, GFP_KERNEL);
34264156d998SAlex Elder 		if (!ondisk)
3427662518b1SAlex Elder 			return -ENOMEM;
34284156d998SAlex Elder 
3429788e2df3SAlex Elder 		ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
34307097f8dfSAlex Elder 				       0, size, ondisk);
34314156d998SAlex Elder 		if (ret < 0)
3432662518b1SAlex Elder 			goto out;
3433c0cd10dbSAlex Elder 		if ((size_t)ret < size) {
34344156d998SAlex Elder 			ret = -ENXIO;
343506ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
343606ecc6cbSAlex Elder 				size, ret);
3437662518b1SAlex Elder 			goto out;
34384156d998SAlex Elder 		}
34394156d998SAlex Elder 		if (!rbd_dev_ondisk_valid(ondisk)) {
34404156d998SAlex Elder 			ret = -ENXIO;
344106ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "invalid header");
3442662518b1SAlex Elder 			goto out;
34434156d998SAlex Elder 		}
34444156d998SAlex Elder 
34454156d998SAlex Elder 		names_size = le64_to_cpu(ondisk->snap_names_len);
34464156d998SAlex Elder 		want_count = snap_count;
34474156d998SAlex Elder 		snap_count = le32_to_cpu(ondisk->snap_count);
34484156d998SAlex Elder 	} while (snap_count != want_count);
34494156d998SAlex Elder 
3450662518b1SAlex Elder 	ret = rbd_header_from_disk(rbd_dev, ondisk);
3451662518b1SAlex Elder out:
34524156d998SAlex Elder 	kfree(ondisk);
34534156d998SAlex Elder 
3454dfc5606dSYehuda Sadeh 	return ret;
3455602adf40SYehuda Sadeh }
3456602adf40SYehuda Sadeh 
345715228edeSAlex Elder /*
345815228edeSAlex Elder  * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
345915228edeSAlex Elder  * has disappeared from the (just updated) snapshot context.
346015228edeSAlex Elder  */
346115228edeSAlex Elder static void rbd_exists_validate(struct rbd_device *rbd_dev)
346215228edeSAlex Elder {
346315228edeSAlex Elder 	u64 snap_id;
346415228edeSAlex Elder 
346515228edeSAlex Elder 	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
346615228edeSAlex Elder 		return;
346715228edeSAlex Elder 
346815228edeSAlex Elder 	snap_id = rbd_dev->spec->snap_id;
346915228edeSAlex Elder 	if (snap_id == CEPH_NOSNAP)
347015228edeSAlex Elder 		return;
347115228edeSAlex Elder 
347215228edeSAlex Elder 	if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
347315228edeSAlex Elder 		clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
347415228edeSAlex Elder }
347515228edeSAlex Elder 
34769875201eSJosh Durgin static void rbd_dev_update_size(struct rbd_device *rbd_dev)
34779875201eSJosh Durgin {
34789875201eSJosh Durgin 	sector_t size;
34799875201eSJosh Durgin 	bool removing;
34809875201eSJosh Durgin 
34819875201eSJosh Durgin 	/*
34829875201eSJosh Durgin 	 * Don't hold the lock while doing disk operations,
34839875201eSJosh Durgin 	 * or lock ordering will conflict with the bdev mutex via:
34849875201eSJosh Durgin 	 * rbd_add() -> blkdev_get() -> rbd_open()
34859875201eSJosh Durgin 	 */
34869875201eSJosh Durgin 	spin_lock_irq(&rbd_dev->lock);
34879875201eSJosh Durgin 	removing = test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
34889875201eSJosh Durgin 	spin_unlock_irq(&rbd_dev->lock);
34899875201eSJosh Durgin 	/*
34909875201eSJosh Durgin 	 * If the device is being removed, rbd_dev->disk has
34919875201eSJosh Durgin 	 * been destroyed, so don't try to update its size
34929875201eSJosh Durgin 	 */
34939875201eSJosh Durgin 	if (!removing) {
34949875201eSJosh Durgin 		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
34959875201eSJosh Durgin 		dout("setting size to %llu sectors", (unsigned long long)size);
34969875201eSJosh Durgin 		set_capacity(rbd_dev->disk, size);
34979875201eSJosh Durgin 		revalidate_disk(rbd_dev->disk);
34989875201eSJosh Durgin 	}
34999875201eSJosh Durgin }
35009875201eSJosh Durgin 
3501cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev)
35021fe5e993SAlex Elder {
3503e627db08SAlex Elder 	u64 mapping_size;
35041fe5e993SAlex Elder 	int ret;
35051fe5e993SAlex Elder 
3506117973fbSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
3507cfbf6377SAlex Elder 	down_write(&rbd_dev->header_rwsem);
35083b5cf2a2SAlex Elder 	mapping_size = rbd_dev->mapping.size;
3509117973fbSAlex Elder 	if (rbd_dev->image_format == 1)
351099a41ebcSAlex Elder 		ret = rbd_dev_v1_header_info(rbd_dev);
3511117973fbSAlex Elder 	else
35122df3fac7SAlex Elder 		ret = rbd_dev_v2_header_info(rbd_dev);
351315228edeSAlex Elder 
351415228edeSAlex Elder 	/* If it's a mapped snapshot, validate its EXISTS flag */
351515228edeSAlex Elder 
351615228edeSAlex Elder 	rbd_exists_validate(rbd_dev);
3517cfbf6377SAlex Elder 	up_write(&rbd_dev->header_rwsem);
3518cfbf6377SAlex Elder 
351900a653e2SAlex Elder 	if (mapping_size != rbd_dev->mapping.size) {
35209875201eSJosh Durgin 		rbd_dev_update_size(rbd_dev);
352100a653e2SAlex Elder 	}
35221fe5e993SAlex Elder 
35231fe5e993SAlex Elder 	return ret;
35241fe5e993SAlex Elder }
35251fe5e993SAlex Elder 
3526602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
3527602adf40SYehuda Sadeh {
3528602adf40SYehuda Sadeh 	struct gendisk *disk;
3529602adf40SYehuda Sadeh 	struct request_queue *q;
3530593a9e7bSAlex Elder 	u64 segment_size;
3531602adf40SYehuda Sadeh 
3532602adf40SYehuda Sadeh 	/* create gendisk info */
35337e513d43SIlya Dryomov 	disk = alloc_disk(single_major ?
35347e513d43SIlya Dryomov 			  (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
35357e513d43SIlya Dryomov 			  RBD_MINORS_PER_MAJOR);
3536602adf40SYehuda Sadeh 	if (!disk)
35371fcdb8aaSAlex Elder 		return -ENOMEM;
3538602adf40SYehuda Sadeh 
3539f0f8cef5SAlex Elder 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
3540de71a297SAlex Elder 		 rbd_dev->dev_id);
3541602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
3542dd82fff1SIlya Dryomov 	disk->first_minor = rbd_dev->minor;
35437e513d43SIlya Dryomov 	if (single_major)
35447e513d43SIlya Dryomov 		disk->flags |= GENHD_FL_EXT_DEVT;
3545602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
3546602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
3547602adf40SYehuda Sadeh 
3548bf0d5f50SAlex Elder 	q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
3549602adf40SYehuda Sadeh 	if (!q)
3550602adf40SYehuda Sadeh 		goto out_disk;
3551029bcbd8SJosh Durgin 
3552593a9e7bSAlex Elder 	/* We use the default size, but let's be explicit about it. */
3553593a9e7bSAlex Elder 	blk_queue_physical_block_size(q, SECTOR_SIZE);
3554593a9e7bSAlex Elder 
3555029bcbd8SJosh Durgin 	/* set io sizes to object size */
3556593a9e7bSAlex Elder 	segment_size = rbd_obj_bytes(&rbd_dev->header);
3557593a9e7bSAlex Elder 	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
3558593a9e7bSAlex Elder 	blk_queue_max_segment_size(q, segment_size);
3559593a9e7bSAlex Elder 	blk_queue_io_min(q, segment_size);
3560593a9e7bSAlex Elder 	blk_queue_io_opt(q, segment_size);
3561029bcbd8SJosh Durgin 
3562602adf40SYehuda Sadeh 	blk_queue_merge_bvec(q, rbd_merge_bvec);
3563602adf40SYehuda Sadeh 	disk->queue = q;
3564602adf40SYehuda Sadeh 
3565602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
3566602adf40SYehuda Sadeh 
3567602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
3568602adf40SYehuda Sadeh 
3569602adf40SYehuda Sadeh 	return 0;
3570602adf40SYehuda Sadeh out_disk:
3571602adf40SYehuda Sadeh 	put_disk(disk);
35721fcdb8aaSAlex Elder 
35731fcdb8aaSAlex Elder 	return -ENOMEM;
3574602adf40SYehuda Sadeh }
3575602adf40SYehuda Sadeh 
3576dfc5606dSYehuda Sadeh /*
3577dfc5606dSYehuda Sadeh   sysfs
3578dfc5606dSYehuda Sadeh */
3579602adf40SYehuda Sadeh 
3580593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
3581593a9e7bSAlex Elder {
3582593a9e7bSAlex Elder 	return container_of(dev, struct rbd_device, dev);
3583593a9e7bSAlex Elder }
3584593a9e7bSAlex Elder 
3585dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
3586dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3587602adf40SYehuda Sadeh {
3588593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3589dfc5606dSYehuda Sadeh 
3590fc71d833SAlex Elder 	return sprintf(buf, "%llu\n",
3591fc71d833SAlex Elder 		(unsigned long long)rbd_dev->mapping.size);
3592602adf40SYehuda Sadeh }
3593602adf40SYehuda Sadeh 
359434b13184SAlex Elder /*
359534b13184SAlex Elder  * Note this shows the features for whatever's mapped, which is not
359634b13184SAlex Elder  * necessarily the base image.
359734b13184SAlex Elder  */
359834b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev,
359934b13184SAlex Elder 			     struct device_attribute *attr, char *buf)
360034b13184SAlex Elder {
360134b13184SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
360234b13184SAlex Elder 
360334b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
360434b13184SAlex Elder 			(unsigned long long)rbd_dev->mapping.features);
360534b13184SAlex Elder }
360634b13184SAlex Elder 
3607dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
3608dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
3609602adf40SYehuda Sadeh {
3610593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3611dfc5606dSYehuda Sadeh 
3612fc71d833SAlex Elder 	if (rbd_dev->major)
3613dfc5606dSYehuda Sadeh 		return sprintf(buf, "%d\n", rbd_dev->major);
3614fc71d833SAlex Elder 
3615fc71d833SAlex Elder 	return sprintf(buf, "(none)\n");
3616dd82fff1SIlya Dryomov }
3617fc71d833SAlex Elder 
3618dd82fff1SIlya Dryomov static ssize_t rbd_minor_show(struct device *dev,
3619dd82fff1SIlya Dryomov 			      struct device_attribute *attr, char *buf)
3620dd82fff1SIlya Dryomov {
3621dd82fff1SIlya Dryomov 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3622dd82fff1SIlya Dryomov 
3623dd82fff1SIlya Dryomov 	return sprintf(buf, "%d\n", rbd_dev->minor);
3624dfc5606dSYehuda Sadeh }
3625dfc5606dSYehuda Sadeh 
3626dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
3627dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
3628dfc5606dSYehuda Sadeh {
3629593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3630dfc5606dSYehuda Sadeh 
36311dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
36321dbb4399SAlex Elder 			ceph_client_id(rbd_dev->rbd_client->client));
3633dfc5606dSYehuda Sadeh }
3634dfc5606dSYehuda Sadeh 
3635dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
3636dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3637dfc5606dSYehuda Sadeh {
3638593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3639dfc5606dSYehuda Sadeh 
36400d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
3641dfc5606dSYehuda Sadeh }
3642dfc5606dSYehuda Sadeh 
36439bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
36449bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
36459bb2f334SAlex Elder {
36469bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
36479bb2f334SAlex Elder 
36480d7dbfceSAlex Elder 	return sprintf(buf, "%llu\n",
36490d7dbfceSAlex Elder 			(unsigned long long) rbd_dev->spec->pool_id);
36509bb2f334SAlex Elder }
36519bb2f334SAlex Elder 
3652dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
3653dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3654dfc5606dSYehuda Sadeh {
3655593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3656dfc5606dSYehuda Sadeh 
3657a92ffdf8SAlex Elder 	if (rbd_dev->spec->image_name)
36580d7dbfceSAlex Elder 		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
3659a92ffdf8SAlex Elder 
3660a92ffdf8SAlex Elder 	return sprintf(buf, "(unknown)\n");
3661dfc5606dSYehuda Sadeh }
3662dfc5606dSYehuda Sadeh 
3663589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev,
3664589d30e0SAlex Elder 			     struct device_attribute *attr, char *buf)
3665589d30e0SAlex Elder {
3666589d30e0SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3667589d30e0SAlex Elder 
36680d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
3669589d30e0SAlex Elder }
3670589d30e0SAlex Elder 
367134b13184SAlex Elder /*
367234b13184SAlex Elder  * Shows the name of the currently-mapped snapshot (or
367334b13184SAlex Elder  * RBD_SNAP_HEAD_NAME for the base image).
367434b13184SAlex Elder  */
3675dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
3676dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
3677dfc5606dSYehuda Sadeh 			     char *buf)
3678dfc5606dSYehuda Sadeh {
3679593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3680dfc5606dSYehuda Sadeh 
36810d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
3682dfc5606dSYehuda Sadeh }
3683dfc5606dSYehuda Sadeh 
368486b00e0dSAlex Elder /*
368586b00e0dSAlex Elder  * For an rbd v2 image, shows the pool id, image id, and snapshot id
368686b00e0dSAlex Elder  * for the parent image.  If there is no parent, simply shows
368786b00e0dSAlex Elder  * "(no parent image)".
368886b00e0dSAlex Elder  */
368986b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev,
369086b00e0dSAlex Elder 			     struct device_attribute *attr,
369186b00e0dSAlex Elder 			     char *buf)
369286b00e0dSAlex Elder {
369386b00e0dSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
369486b00e0dSAlex Elder 	struct rbd_spec *spec = rbd_dev->parent_spec;
369586b00e0dSAlex Elder 	int count;
369686b00e0dSAlex Elder 	char *bufp = buf;
369786b00e0dSAlex Elder 
369886b00e0dSAlex Elder 	if (!spec)
369986b00e0dSAlex Elder 		return sprintf(buf, "(no parent image)\n");
370086b00e0dSAlex Elder 
370186b00e0dSAlex Elder 	count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
370286b00e0dSAlex Elder 			(unsigned long long) spec->pool_id, spec->pool_name);
370386b00e0dSAlex Elder 	if (count < 0)
370486b00e0dSAlex Elder 		return count;
370586b00e0dSAlex Elder 	bufp += count;
370686b00e0dSAlex Elder 
370786b00e0dSAlex Elder 	count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
370886b00e0dSAlex Elder 			spec->image_name ? spec->image_name : "(unknown)");
370986b00e0dSAlex Elder 	if (count < 0)
371086b00e0dSAlex Elder 		return count;
371186b00e0dSAlex Elder 	bufp += count;
371286b00e0dSAlex Elder 
371386b00e0dSAlex Elder 	count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
371486b00e0dSAlex Elder 			(unsigned long long) spec->snap_id, spec->snap_name);
371586b00e0dSAlex Elder 	if (count < 0)
371686b00e0dSAlex Elder 		return count;
371786b00e0dSAlex Elder 	bufp += count;
371886b00e0dSAlex Elder 
371986b00e0dSAlex Elder 	count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
372086b00e0dSAlex Elder 	if (count < 0)
372186b00e0dSAlex Elder 		return count;
372286b00e0dSAlex Elder 	bufp += count;
372386b00e0dSAlex Elder 
372486b00e0dSAlex Elder 	return (ssize_t) (bufp - buf);
372586b00e0dSAlex Elder }
372686b00e0dSAlex Elder 
3727dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
3728dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
3729dfc5606dSYehuda Sadeh 				 const char *buf,
3730dfc5606dSYehuda Sadeh 				 size_t size)
3731dfc5606dSYehuda Sadeh {
3732593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3733b813623aSAlex Elder 	int ret;
3734602adf40SYehuda Sadeh 
3735cc4a38bdSAlex Elder 	ret = rbd_dev_refresh(rbd_dev);
3736e627db08SAlex Elder 	if (ret)
3737e627db08SAlex Elder 		rbd_warn(rbd_dev, ": manual header refresh error (%d)\n", ret);
3738b813623aSAlex Elder 
3739b813623aSAlex Elder 	return ret < 0 ? ret : size;
3740dfc5606dSYehuda Sadeh }
3741602adf40SYehuda Sadeh 
/*
 * Per-device sysfs attributes.  All of them are read-only for
 * everyone except "refresh", which is write-only for the owner.
 */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
3754dfc5606dSYehuda Sadeh 
3755dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = {
3756dfc5606dSYehuda Sadeh 	&dev_attr_size.attr,
375734b13184SAlex Elder 	&dev_attr_features.attr,
3758dfc5606dSYehuda Sadeh 	&dev_attr_major.attr,
3759dd82fff1SIlya Dryomov 	&dev_attr_minor.attr,
3760dfc5606dSYehuda Sadeh 	&dev_attr_client_id.attr,
3761dfc5606dSYehuda Sadeh 	&dev_attr_pool.attr,
37629bb2f334SAlex Elder 	&dev_attr_pool_id.attr,
3763dfc5606dSYehuda Sadeh 	&dev_attr_name.attr,
3764589d30e0SAlex Elder 	&dev_attr_image_id.attr,
3765dfc5606dSYehuda Sadeh 	&dev_attr_current_snap.attr,
376686b00e0dSAlex Elder 	&dev_attr_parent.attr,
3767dfc5606dSYehuda Sadeh 	&dev_attr_refresh.attr,
3768dfc5606dSYehuda Sadeh 	NULL
3769dfc5606dSYehuda Sadeh };
3770dfc5606dSYehuda Sadeh 
3771dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = {
3772dfc5606dSYehuda Sadeh 	.attrs = rbd_attrs,
3773dfc5606dSYehuda Sadeh };
3774dfc5606dSYehuda Sadeh 
3775dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = {
3776dfc5606dSYehuda Sadeh 	&rbd_attr_group,
3777dfc5606dSYehuda Sadeh 	NULL
3778dfc5606dSYehuda Sadeh };
3779dfc5606dSYehuda Sadeh 
/*
 * Release callback installed in rbd_device_type.  It deliberately
 * does nothing: no resources are freed from here.
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
	/* intentionally empty */
}
3783dfc5606dSYehuda Sadeh 
3784dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = {
3785dfc5606dSYehuda Sadeh 	.name		= "rbd",
3786dfc5606dSYehuda Sadeh 	.groups		= rbd_attr_groups,
3787dfc5606dSYehuda Sadeh 	.release	= rbd_sysfs_dev_release,
3788dfc5606dSYehuda Sadeh };
3789dfc5606dSYehuda Sadeh 
37908b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
37918b8fb99cSAlex Elder {
37928b8fb99cSAlex Elder 	kref_get(&spec->kref);
37938b8fb99cSAlex Elder 
37948b8fb99cSAlex Elder 	return spec;
37958b8fb99cSAlex Elder }
37968b8fb99cSAlex Elder 
37978b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref);
37988b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec)
37998b8fb99cSAlex Elder {
38008b8fb99cSAlex Elder 	if (spec)
38018b8fb99cSAlex Elder 		kref_put(&spec->kref, rbd_spec_free);
38028b8fb99cSAlex Elder }
38038b8fb99cSAlex Elder 
38048b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void)
38058b8fb99cSAlex Elder {
38068b8fb99cSAlex Elder 	struct rbd_spec *spec;
38078b8fb99cSAlex Elder 
38088b8fb99cSAlex Elder 	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
38098b8fb99cSAlex Elder 	if (!spec)
38108b8fb99cSAlex Elder 		return NULL;
38118b8fb99cSAlex Elder 	kref_init(&spec->kref);
38128b8fb99cSAlex Elder 
38138b8fb99cSAlex Elder 	return spec;
38148b8fb99cSAlex Elder }
38158b8fb99cSAlex Elder 
38168b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref)
38178b8fb99cSAlex Elder {
38188b8fb99cSAlex Elder 	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
38198b8fb99cSAlex Elder 
38208b8fb99cSAlex Elder 	kfree(spec->pool_name);
38218b8fb99cSAlex Elder 	kfree(spec->image_id);
38228b8fb99cSAlex Elder 	kfree(spec->image_name);
38238b8fb99cSAlex Elder 	kfree(spec->snap_name);
38248b8fb99cSAlex Elder 	kfree(spec);
38258b8fb99cSAlex Elder }
38268b8fb99cSAlex Elder 
3827cc344fa1SAlex Elder static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
3828c53d5893SAlex Elder 				struct rbd_spec *spec)
3829c53d5893SAlex Elder {
3830c53d5893SAlex Elder 	struct rbd_device *rbd_dev;
3831c53d5893SAlex Elder 
3832c53d5893SAlex Elder 	rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
3833c53d5893SAlex Elder 	if (!rbd_dev)
3834c53d5893SAlex Elder 		return NULL;
3835c53d5893SAlex Elder 
3836c53d5893SAlex Elder 	spin_lock_init(&rbd_dev->lock);
38376d292906SAlex Elder 	rbd_dev->flags = 0;
3838a2acd00eSAlex Elder 	atomic_set(&rbd_dev->parent_ref, 0);
3839c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->node);
3840c53d5893SAlex Elder 	init_rwsem(&rbd_dev->header_rwsem);
3841c53d5893SAlex Elder 
3842c53d5893SAlex Elder 	rbd_dev->spec = spec;
3843c53d5893SAlex Elder 	rbd_dev->rbd_client = rbdc;
3844c53d5893SAlex Elder 
38450903e875SAlex Elder 	/* Initialize the layout used for all rbd requests */
38460903e875SAlex Elder 
38470903e875SAlex Elder 	rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
38480903e875SAlex Elder 	rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
38490903e875SAlex Elder 	rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
38500903e875SAlex Elder 	rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
38510903e875SAlex Elder 
3852c53d5893SAlex Elder 	return rbd_dev;
3853c53d5893SAlex Elder }
3854c53d5893SAlex Elder 
3855c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev)
3856c53d5893SAlex Elder {
3857c53d5893SAlex Elder 	rbd_put_client(rbd_dev->rbd_client);
3858c53d5893SAlex Elder 	rbd_spec_put(rbd_dev->spec);
3859c53d5893SAlex Elder 	kfree(rbd_dev);
3860c53d5893SAlex Elder }
3861c53d5893SAlex Elder 
3862dfc5606dSYehuda Sadeh /*
38639d475de5SAlex Elder  * Get the size and object order for an image snapshot, or if
38649d475de5SAlex Elder  * snap_id is CEPH_NOSNAP, gets this information for the base
38659d475de5SAlex Elder  * image.
38669d475de5SAlex Elder  */
38679d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
38689d475de5SAlex Elder 				u8 *order, u64 *snap_size)
38699d475de5SAlex Elder {
38709d475de5SAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
38719d475de5SAlex Elder 	int ret;
38729d475de5SAlex Elder 	struct {
38739d475de5SAlex Elder 		u8 order;
38749d475de5SAlex Elder 		__le64 size;
38759d475de5SAlex Elder 	} __attribute__ ((packed)) size_buf = { 0 };
38769d475de5SAlex Elder 
387736be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
38789d475de5SAlex Elder 				"rbd", "get_size",
38794157976bSAlex Elder 				&snapid, sizeof (snapid),
3880e2a58ee5SAlex Elder 				&size_buf, sizeof (size_buf));
388136be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
38829d475de5SAlex Elder 	if (ret < 0)
38839d475de5SAlex Elder 		return ret;
388457385b51SAlex Elder 	if (ret < sizeof (size_buf))
388557385b51SAlex Elder 		return -ERANGE;
38869d475de5SAlex Elder 
3887c3545579SJosh Durgin 	if (order) {
38889d475de5SAlex Elder 		*order = size_buf.order;
3889c3545579SJosh Durgin 		dout("  order %u", (unsigned int)*order);
3890c3545579SJosh Durgin 	}
38919d475de5SAlex Elder 	*snap_size = le64_to_cpu(size_buf.size);
38929d475de5SAlex Elder 
3893c3545579SJosh Durgin 	dout("  snap_id 0x%016llx snap_size = %llu\n",
3894c3545579SJosh Durgin 		(unsigned long long)snap_id,
38959d475de5SAlex Elder 		(unsigned long long)*snap_size);
38969d475de5SAlex Elder 
38979d475de5SAlex Elder 	return 0;
38989d475de5SAlex Elder }
38999d475de5SAlex Elder 
39009d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
39019d475de5SAlex Elder {
39029d475de5SAlex Elder 	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
39039d475de5SAlex Elder 					&rbd_dev->header.obj_order,
39049d475de5SAlex Elder 					&rbd_dev->header.image_size);
39059d475de5SAlex Elder }
39069d475de5SAlex Elder 
39071e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
39081e130199SAlex Elder {
39091e130199SAlex Elder 	void *reply_buf;
39101e130199SAlex Elder 	int ret;
39111e130199SAlex Elder 	void *p;
39121e130199SAlex Elder 
39131e130199SAlex Elder 	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
39141e130199SAlex Elder 	if (!reply_buf)
39151e130199SAlex Elder 		return -ENOMEM;
39161e130199SAlex Elder 
391736be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
39184157976bSAlex Elder 				"rbd", "get_object_prefix", NULL, 0,
3919e2a58ee5SAlex Elder 				reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
392036be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
39211e130199SAlex Elder 	if (ret < 0)
39221e130199SAlex Elder 		goto out;
39231e130199SAlex Elder 
39241e130199SAlex Elder 	p = reply_buf;
39251e130199SAlex Elder 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
392657385b51SAlex Elder 						p + ret, NULL, GFP_NOIO);
392757385b51SAlex Elder 	ret = 0;
39281e130199SAlex Elder 
39291e130199SAlex Elder 	if (IS_ERR(rbd_dev->header.object_prefix)) {
39301e130199SAlex Elder 		ret = PTR_ERR(rbd_dev->header.object_prefix);
39311e130199SAlex Elder 		rbd_dev->header.object_prefix = NULL;
39321e130199SAlex Elder 	} else {
39331e130199SAlex Elder 		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
39341e130199SAlex Elder 	}
39351e130199SAlex Elder out:
39361e130199SAlex Elder 	kfree(reply_buf);
39371e130199SAlex Elder 
39381e130199SAlex Elder 	return ret;
39391e130199SAlex Elder }
39401e130199SAlex Elder 
3941b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
3942b1b5402aSAlex Elder 		u64 *snap_features)
3943b1b5402aSAlex Elder {
3944b1b5402aSAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
3945b1b5402aSAlex Elder 	struct {
3946b1b5402aSAlex Elder 		__le64 features;
3947b1b5402aSAlex Elder 		__le64 incompat;
39484157976bSAlex Elder 	} __attribute__ ((packed)) features_buf = { 0 };
3949d889140cSAlex Elder 	u64 incompat;
3950b1b5402aSAlex Elder 	int ret;
3951b1b5402aSAlex Elder 
395236be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3953b1b5402aSAlex Elder 				"rbd", "get_features",
39544157976bSAlex Elder 				&snapid, sizeof (snapid),
3955e2a58ee5SAlex Elder 				&features_buf, sizeof (features_buf));
395636be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3957b1b5402aSAlex Elder 	if (ret < 0)
3958b1b5402aSAlex Elder 		return ret;
395957385b51SAlex Elder 	if (ret < sizeof (features_buf))
396057385b51SAlex Elder 		return -ERANGE;
3961d889140cSAlex Elder 
3962d889140cSAlex Elder 	incompat = le64_to_cpu(features_buf.incompat);
39635cbf6f12SAlex Elder 	if (incompat & ~RBD_FEATURES_SUPPORTED)
3964b8f5c6edSAlex Elder 		return -ENXIO;
3965d889140cSAlex Elder 
3966b1b5402aSAlex Elder 	*snap_features = le64_to_cpu(features_buf.features);
3967b1b5402aSAlex Elder 
3968b1b5402aSAlex Elder 	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
3969b1b5402aSAlex Elder 		(unsigned long long)snap_id,
3970b1b5402aSAlex Elder 		(unsigned long long)*snap_features,
3971b1b5402aSAlex Elder 		(unsigned long long)le64_to_cpu(features_buf.incompat));
3972b1b5402aSAlex Elder 
3973b1b5402aSAlex Elder 	return 0;
3974b1b5402aSAlex Elder }
3975b1b5402aSAlex Elder 
3976b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
3977b1b5402aSAlex Elder {
3978b1b5402aSAlex Elder 	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
3979b1b5402aSAlex Elder 						&rbd_dev->header.features);
3980b1b5402aSAlex Elder }
3981b1b5402aSAlex Elder 
398286b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
398386b00e0dSAlex Elder {
398486b00e0dSAlex Elder 	struct rbd_spec *parent_spec;
398586b00e0dSAlex Elder 	size_t size;
398686b00e0dSAlex Elder 	void *reply_buf = NULL;
398786b00e0dSAlex Elder 	__le64 snapid;
398886b00e0dSAlex Elder 	void *p;
398986b00e0dSAlex Elder 	void *end;
3990642a2537SAlex Elder 	u64 pool_id;
399186b00e0dSAlex Elder 	char *image_id;
39923b5cf2a2SAlex Elder 	u64 snap_id;
399386b00e0dSAlex Elder 	u64 overlap;
399486b00e0dSAlex Elder 	int ret;
399586b00e0dSAlex Elder 
399686b00e0dSAlex Elder 	parent_spec = rbd_spec_alloc();
399786b00e0dSAlex Elder 	if (!parent_spec)
399886b00e0dSAlex Elder 		return -ENOMEM;
399986b00e0dSAlex Elder 
400086b00e0dSAlex Elder 	size = sizeof (__le64) +				/* pool_id */
400186b00e0dSAlex Elder 		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
400286b00e0dSAlex Elder 		sizeof (__le64) +				/* snap_id */
400386b00e0dSAlex Elder 		sizeof (__le64);				/* overlap */
400486b00e0dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
400586b00e0dSAlex Elder 	if (!reply_buf) {
400686b00e0dSAlex Elder 		ret = -ENOMEM;
400786b00e0dSAlex Elder 		goto out_err;
400886b00e0dSAlex Elder 	}
400986b00e0dSAlex Elder 
401086b00e0dSAlex Elder 	snapid = cpu_to_le64(CEPH_NOSNAP);
401136be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
401286b00e0dSAlex Elder 				"rbd", "get_parent",
40134157976bSAlex Elder 				&snapid, sizeof (snapid),
4014e2a58ee5SAlex Elder 				reply_buf, size);
401536be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
401686b00e0dSAlex Elder 	if (ret < 0)
401786b00e0dSAlex Elder 		goto out_err;
401886b00e0dSAlex Elder 
401986b00e0dSAlex Elder 	p = reply_buf;
402057385b51SAlex Elder 	end = reply_buf + ret;
402157385b51SAlex Elder 	ret = -ERANGE;
4022642a2537SAlex Elder 	ceph_decode_64_safe(&p, end, pool_id, out_err);
4023392a9dadSAlex Elder 	if (pool_id == CEPH_NOPOOL) {
4024392a9dadSAlex Elder 		/*
4025392a9dadSAlex Elder 		 * Either the parent never existed, or we have
4026392a9dadSAlex Elder 		 * record of it but the image got flattened so it no
4027392a9dadSAlex Elder 		 * longer has a parent.  When the parent of a
4028392a9dadSAlex Elder 		 * layered image disappears we immediately set the
4029392a9dadSAlex Elder 		 * overlap to 0.  The effect of this is that all new
4030392a9dadSAlex Elder 		 * requests will be treated as if the image had no
4031392a9dadSAlex Elder 		 * parent.
4032392a9dadSAlex Elder 		 */
4033392a9dadSAlex Elder 		if (rbd_dev->parent_overlap) {
4034392a9dadSAlex Elder 			rbd_dev->parent_overlap = 0;
4035392a9dadSAlex Elder 			smp_mb();
4036392a9dadSAlex Elder 			rbd_dev_parent_put(rbd_dev);
4037392a9dadSAlex Elder 			pr_info("%s: clone image has been flattened\n",
4038392a9dadSAlex Elder 				rbd_dev->disk->disk_name);
4039392a9dadSAlex Elder 		}
4040392a9dadSAlex Elder 
404186b00e0dSAlex Elder 		goto out;	/* No parent?  No problem. */
4042392a9dadSAlex Elder 	}
404386b00e0dSAlex Elder 
40440903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
40450903e875SAlex Elder 
40460903e875SAlex Elder 	ret = -EIO;
4047642a2537SAlex Elder 	if (pool_id > (u64)U32_MAX) {
4048c0cd10dbSAlex Elder 		rbd_warn(NULL, "parent pool id too large (%llu > %u)\n",
4049642a2537SAlex Elder 			(unsigned long long)pool_id, U32_MAX);
405057385b51SAlex Elder 		goto out_err;
4051c0cd10dbSAlex Elder 	}
40520903e875SAlex Elder 
4053979ed480SAlex Elder 	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
405486b00e0dSAlex Elder 	if (IS_ERR(image_id)) {
405586b00e0dSAlex Elder 		ret = PTR_ERR(image_id);
405686b00e0dSAlex Elder 		goto out_err;
405786b00e0dSAlex Elder 	}
40583b5cf2a2SAlex Elder 	ceph_decode_64_safe(&p, end, snap_id, out_err);
405986b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, overlap, out_err);
406086b00e0dSAlex Elder 
40613b5cf2a2SAlex Elder 	/*
40623b5cf2a2SAlex Elder 	 * The parent won't change (except when the clone is
40633b5cf2a2SAlex Elder 	 * flattened, already handled that).  So we only need to
40643b5cf2a2SAlex Elder 	 * record the parent spec we have not already done so.
40653b5cf2a2SAlex Elder 	 */
40663b5cf2a2SAlex Elder 	if (!rbd_dev->parent_spec) {
40673b5cf2a2SAlex Elder 		parent_spec->pool_id = pool_id;
40683b5cf2a2SAlex Elder 		parent_spec->image_id = image_id;
40693b5cf2a2SAlex Elder 		parent_spec->snap_id = snap_id;
407086b00e0dSAlex Elder 		rbd_dev->parent_spec = parent_spec;
407186b00e0dSAlex Elder 		parent_spec = NULL;	/* rbd_dev now owns this */
40723b5cf2a2SAlex Elder 	}
40733b5cf2a2SAlex Elder 
40743b5cf2a2SAlex Elder 	/*
40753b5cf2a2SAlex Elder 	 * We always update the parent overlap.  If it's zero we
40763b5cf2a2SAlex Elder 	 * treat it specially.
40773b5cf2a2SAlex Elder 	 */
407870cf49cfSAlex Elder 	rbd_dev->parent_overlap = overlap;
40793b5cf2a2SAlex Elder 	smp_mb();
40803b5cf2a2SAlex Elder 	if (!overlap) {
40813b5cf2a2SAlex Elder 
40823b5cf2a2SAlex Elder 		/* A null parent_spec indicates it's the initial probe */
40833b5cf2a2SAlex Elder 
40843b5cf2a2SAlex Elder 		if (parent_spec) {
40853b5cf2a2SAlex Elder 			/*
40863b5cf2a2SAlex Elder 			 * The overlap has become zero, so the clone
40873b5cf2a2SAlex Elder 			 * must have been resized down to 0 at some
40883b5cf2a2SAlex Elder 			 * point.  Treat this the same as a flatten.
40893b5cf2a2SAlex Elder 			 */
40903b5cf2a2SAlex Elder 			rbd_dev_parent_put(rbd_dev);
40913b5cf2a2SAlex Elder 			pr_info("%s: clone image now standalone\n",
40923b5cf2a2SAlex Elder 				rbd_dev->disk->disk_name);
409370cf49cfSAlex Elder 		} else {
40943b5cf2a2SAlex Elder 			/*
40953b5cf2a2SAlex Elder 			 * For the initial probe, if we find the
40963b5cf2a2SAlex Elder 			 * overlap is zero we just pretend there was
40973b5cf2a2SAlex Elder 			 * no parent image.
40983b5cf2a2SAlex Elder 			 */
40993b5cf2a2SAlex Elder 			rbd_warn(rbd_dev, "ignoring parent of "
41003b5cf2a2SAlex Elder 						"clone with overlap 0\n");
41013b5cf2a2SAlex Elder 		}
410270cf49cfSAlex Elder 	}
410386b00e0dSAlex Elder out:
410486b00e0dSAlex Elder 	ret = 0;
410586b00e0dSAlex Elder out_err:
410686b00e0dSAlex Elder 	kfree(reply_buf);
410786b00e0dSAlex Elder 	rbd_spec_put(parent_spec);
410886b00e0dSAlex Elder 
410986b00e0dSAlex Elder 	return ret;
411086b00e0dSAlex Elder }
411186b00e0dSAlex Elder 
4112cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
4113cc070d59SAlex Elder {
4114cc070d59SAlex Elder 	struct {
4115cc070d59SAlex Elder 		__le64 stripe_unit;
4116cc070d59SAlex Elder 		__le64 stripe_count;
4117cc070d59SAlex Elder 	} __attribute__ ((packed)) striping_info_buf = { 0 };
4118cc070d59SAlex Elder 	size_t size = sizeof (striping_info_buf);
4119cc070d59SAlex Elder 	void *p;
4120cc070d59SAlex Elder 	u64 obj_size;
4121cc070d59SAlex Elder 	u64 stripe_unit;
4122cc070d59SAlex Elder 	u64 stripe_count;
4123cc070d59SAlex Elder 	int ret;
4124cc070d59SAlex Elder 
4125cc070d59SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
4126cc070d59SAlex Elder 				"rbd", "get_stripe_unit_count", NULL, 0,
4127e2a58ee5SAlex Elder 				(char *)&striping_info_buf, size);
4128cc070d59SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4129cc070d59SAlex Elder 	if (ret < 0)
4130cc070d59SAlex Elder 		return ret;
4131cc070d59SAlex Elder 	if (ret < size)
4132cc070d59SAlex Elder 		return -ERANGE;
4133cc070d59SAlex Elder 
4134cc070d59SAlex Elder 	/*
4135cc070d59SAlex Elder 	 * We don't actually support the "fancy striping" feature
4136cc070d59SAlex Elder 	 * (STRIPINGV2) yet, but if the striping sizes are the
4137cc070d59SAlex Elder 	 * defaults the behavior is the same as before.  So find
4138cc070d59SAlex Elder 	 * out, and only fail if the image has non-default values.
4139cc070d59SAlex Elder 	 */
4140cc070d59SAlex Elder 	ret = -EINVAL;
4141cc070d59SAlex Elder 	obj_size = (u64)1 << rbd_dev->header.obj_order;
4142cc070d59SAlex Elder 	p = &striping_info_buf;
4143cc070d59SAlex Elder 	stripe_unit = ceph_decode_64(&p);
4144cc070d59SAlex Elder 	if (stripe_unit != obj_size) {
4145cc070d59SAlex Elder 		rbd_warn(rbd_dev, "unsupported stripe unit "
4146cc070d59SAlex Elder 				"(got %llu want %llu)",
4147cc070d59SAlex Elder 				stripe_unit, obj_size);
4148cc070d59SAlex Elder 		return -EINVAL;
4149cc070d59SAlex Elder 	}
4150cc070d59SAlex Elder 	stripe_count = ceph_decode_64(&p);
4151cc070d59SAlex Elder 	if (stripe_count != 1) {
4152cc070d59SAlex Elder 		rbd_warn(rbd_dev, "unsupported stripe count "
4153cc070d59SAlex Elder 				"(got %llu want 1)", stripe_count);
4154cc070d59SAlex Elder 		return -EINVAL;
4155cc070d59SAlex Elder 	}
4156500d0c0fSAlex Elder 	rbd_dev->header.stripe_unit = stripe_unit;
4157500d0c0fSAlex Elder 	rbd_dev->header.stripe_count = stripe_count;
4158cc070d59SAlex Elder 
4159cc070d59SAlex Elder 	return 0;
4160cc070d59SAlex Elder }
4161cc070d59SAlex Elder 
41629e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
41639e15b77dSAlex Elder {
41649e15b77dSAlex Elder 	size_t image_id_size;
41659e15b77dSAlex Elder 	char *image_id;
41669e15b77dSAlex Elder 	void *p;
41679e15b77dSAlex Elder 	void *end;
41689e15b77dSAlex Elder 	size_t size;
41699e15b77dSAlex Elder 	void *reply_buf = NULL;
41709e15b77dSAlex Elder 	size_t len = 0;
41719e15b77dSAlex Elder 	char *image_name = NULL;
41729e15b77dSAlex Elder 	int ret;
41739e15b77dSAlex Elder 
41749e15b77dSAlex Elder 	rbd_assert(!rbd_dev->spec->image_name);
41759e15b77dSAlex Elder 
417669e7a02fSAlex Elder 	len = strlen(rbd_dev->spec->image_id);
417769e7a02fSAlex Elder 	image_id_size = sizeof (__le32) + len;
41789e15b77dSAlex Elder 	image_id = kmalloc(image_id_size, GFP_KERNEL);
41799e15b77dSAlex Elder 	if (!image_id)
41809e15b77dSAlex Elder 		return NULL;
41819e15b77dSAlex Elder 
41829e15b77dSAlex Elder 	p = image_id;
41834157976bSAlex Elder 	end = image_id + image_id_size;
418469e7a02fSAlex Elder 	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
41859e15b77dSAlex Elder 
41869e15b77dSAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
41879e15b77dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
41889e15b77dSAlex Elder 	if (!reply_buf)
41899e15b77dSAlex Elder 		goto out;
41909e15b77dSAlex Elder 
419136be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
41929e15b77dSAlex Elder 				"rbd", "dir_get_name",
41939e15b77dSAlex Elder 				image_id, image_id_size,
4194e2a58ee5SAlex Elder 				reply_buf, size);
41959e15b77dSAlex Elder 	if (ret < 0)
41969e15b77dSAlex Elder 		goto out;
41979e15b77dSAlex Elder 	p = reply_buf;
4198f40eb349SAlex Elder 	end = reply_buf + ret;
4199f40eb349SAlex Elder 
42009e15b77dSAlex Elder 	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
42019e15b77dSAlex Elder 	if (IS_ERR(image_name))
42029e15b77dSAlex Elder 		image_name = NULL;
42039e15b77dSAlex Elder 	else
42049e15b77dSAlex Elder 		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
42059e15b77dSAlex Elder out:
42069e15b77dSAlex Elder 	kfree(reply_buf);
42079e15b77dSAlex Elder 	kfree(image_id);
42089e15b77dSAlex Elder 
42099e15b77dSAlex Elder 	return image_name;
42109e15b77dSAlex Elder }
42119e15b77dSAlex Elder 
42122ad3d716SAlex Elder static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
42132ad3d716SAlex Elder {
42142ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
42152ad3d716SAlex Elder 	const char *snap_name;
42162ad3d716SAlex Elder 	u32 which = 0;
42172ad3d716SAlex Elder 
42182ad3d716SAlex Elder 	/* Skip over names until we find the one we are looking for */
42192ad3d716SAlex Elder 
42202ad3d716SAlex Elder 	snap_name = rbd_dev->header.snap_names;
42212ad3d716SAlex Elder 	while (which < snapc->num_snaps) {
42222ad3d716SAlex Elder 		if (!strcmp(name, snap_name))
42232ad3d716SAlex Elder 			return snapc->snaps[which];
42242ad3d716SAlex Elder 		snap_name += strlen(snap_name) + 1;
42252ad3d716SAlex Elder 		which++;
42262ad3d716SAlex Elder 	}
42272ad3d716SAlex Elder 	return CEPH_NOSNAP;
42282ad3d716SAlex Elder }
42292ad3d716SAlex Elder 
42302ad3d716SAlex Elder static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
42312ad3d716SAlex Elder {
42322ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
42332ad3d716SAlex Elder 	u32 which;
42342ad3d716SAlex Elder 	bool found = false;
42352ad3d716SAlex Elder 	u64 snap_id;
42362ad3d716SAlex Elder 
42372ad3d716SAlex Elder 	for (which = 0; !found && which < snapc->num_snaps; which++) {
42382ad3d716SAlex Elder 		const char *snap_name;
42392ad3d716SAlex Elder 
42402ad3d716SAlex Elder 		snap_id = snapc->snaps[which];
42412ad3d716SAlex Elder 		snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
4242efadc98aSJosh Durgin 		if (IS_ERR(snap_name)) {
4243efadc98aSJosh Durgin 			/* ignore no-longer existing snapshots */
4244efadc98aSJosh Durgin 			if (PTR_ERR(snap_name) == -ENOENT)
4245efadc98aSJosh Durgin 				continue;
4246efadc98aSJosh Durgin 			else
42472ad3d716SAlex Elder 				break;
4248efadc98aSJosh Durgin 		}
42492ad3d716SAlex Elder 		found = !strcmp(name, snap_name);
42502ad3d716SAlex Elder 		kfree(snap_name);
42512ad3d716SAlex Elder 	}
42522ad3d716SAlex Elder 	return found ? snap_id : CEPH_NOSNAP;
42532ad3d716SAlex Elder }
42542ad3d716SAlex Elder 
42552ad3d716SAlex Elder /*
42562ad3d716SAlex Elder  * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
42572ad3d716SAlex Elder  * no snapshot by that name is found, or if an error occurs.
42582ad3d716SAlex Elder  */
42592ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
42602ad3d716SAlex Elder {
42612ad3d716SAlex Elder 	if (rbd_dev->image_format == 1)
42622ad3d716SAlex Elder 		return rbd_v1_snap_id_by_name(rbd_dev, name);
42632ad3d716SAlex Elder 
42642ad3d716SAlex Elder 	return rbd_v2_snap_id_by_name(rbd_dev, name);
42652ad3d716SAlex Elder }
42662ad3d716SAlex Elder 
42679e15b77dSAlex Elder /*
42682e9f7f1cSAlex Elder  * When an rbd image has a parent image, it is identified by the
42692e9f7f1cSAlex Elder  * pool, image, and snapshot ids (not names).  This function fills
42702e9f7f1cSAlex Elder  * in the names for those ids.  (It's OK if we can't figure out the
42712e9f7f1cSAlex Elder  * name for an image id, but the pool and snapshot ids should always
42722e9f7f1cSAlex Elder  * exist and have names.)  All names in an rbd spec are dynamically
42732e9f7f1cSAlex Elder  * allocated.
4274e1d4213fSAlex Elder  *
4275e1d4213fSAlex Elder  * When an image being mapped (not a parent) is probed, we have the
4276e1d4213fSAlex Elder  * pool name and pool id, image name and image id, and the snapshot
4277e1d4213fSAlex Elder  * name.  The only thing we're missing is the snapshot id.
42789e15b77dSAlex Elder  */
42792e9f7f1cSAlex Elder static int rbd_dev_spec_update(struct rbd_device *rbd_dev)
42809e15b77dSAlex Elder {
42812e9f7f1cSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
42822e9f7f1cSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
42832e9f7f1cSAlex Elder 	const char *pool_name;
42842e9f7f1cSAlex Elder 	const char *image_name;
42852e9f7f1cSAlex Elder 	const char *snap_name;
42869e15b77dSAlex Elder 	int ret;
42879e15b77dSAlex Elder 
4288e1d4213fSAlex Elder 	/*
4289e1d4213fSAlex Elder 	 * An image being mapped will have the pool name (etc.), but
4290e1d4213fSAlex Elder 	 * we need to look up the snapshot id.
4291e1d4213fSAlex Elder 	 */
42922e9f7f1cSAlex Elder 	if (spec->pool_name) {
42932e9f7f1cSAlex Elder 		if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
42942ad3d716SAlex Elder 			u64 snap_id;
4295e1d4213fSAlex Elder 
42962ad3d716SAlex Elder 			snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
42972ad3d716SAlex Elder 			if (snap_id == CEPH_NOSNAP)
4298e1d4213fSAlex Elder 				return -ENOENT;
42992ad3d716SAlex Elder 			spec->snap_id = snap_id;
4300e1d4213fSAlex Elder 		} else {
43012e9f7f1cSAlex Elder 			spec->snap_id = CEPH_NOSNAP;
4302e1d4213fSAlex Elder 		}
4303e1d4213fSAlex Elder 
4304e1d4213fSAlex Elder 		return 0;
4305e1d4213fSAlex Elder 	}
43069e15b77dSAlex Elder 
43072e9f7f1cSAlex Elder 	/* Get the pool name; we have to make our own copy of this */
43089e15b77dSAlex Elder 
43092e9f7f1cSAlex Elder 	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
43102e9f7f1cSAlex Elder 	if (!pool_name) {
43112e9f7f1cSAlex Elder 		rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
4312935dc89fSAlex Elder 		return -EIO;
4313935dc89fSAlex Elder 	}
43142e9f7f1cSAlex Elder 	pool_name = kstrdup(pool_name, GFP_KERNEL);
43152e9f7f1cSAlex Elder 	if (!pool_name)
43169e15b77dSAlex Elder 		return -ENOMEM;
43179e15b77dSAlex Elder 
43189e15b77dSAlex Elder 	/* Fetch the image name; tolerate failure here */
43199e15b77dSAlex Elder 
43202e9f7f1cSAlex Elder 	image_name = rbd_dev_image_name(rbd_dev);
43212e9f7f1cSAlex Elder 	if (!image_name)
432206ecc6cbSAlex Elder 		rbd_warn(rbd_dev, "unable to get image name");
43239e15b77dSAlex Elder 
43242e9f7f1cSAlex Elder 	/* Look up the snapshot name, and make a copy */
43259e15b77dSAlex Elder 
43262e9f7f1cSAlex Elder 	snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
4327da6a6b63SJosh Durgin 	if (IS_ERR(snap_name)) {
4328da6a6b63SJosh Durgin 		ret = PTR_ERR(snap_name);
43299e15b77dSAlex Elder 		goto out_err;
43302e9f7f1cSAlex Elder 	}
43312e9f7f1cSAlex Elder 
43322e9f7f1cSAlex Elder 	spec->pool_name = pool_name;
43332e9f7f1cSAlex Elder 	spec->image_name = image_name;
43342e9f7f1cSAlex Elder 	spec->snap_name = snap_name;
43359e15b77dSAlex Elder 
43369e15b77dSAlex Elder 	return 0;
43379e15b77dSAlex Elder out_err:
43382e9f7f1cSAlex Elder 	kfree(image_name);
43392e9f7f1cSAlex Elder 	kfree(pool_name);
43409e15b77dSAlex Elder 
43419e15b77dSAlex Elder 	return ret;
43429e15b77dSAlex Elder }
43439e15b77dSAlex Elder 
4344cc4a38bdSAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
434535d489f9SAlex Elder {
434635d489f9SAlex Elder 	size_t size;
434735d489f9SAlex Elder 	int ret;
434835d489f9SAlex Elder 	void *reply_buf;
434935d489f9SAlex Elder 	void *p;
435035d489f9SAlex Elder 	void *end;
435135d489f9SAlex Elder 	u64 seq;
435235d489f9SAlex Elder 	u32 snap_count;
435335d489f9SAlex Elder 	struct ceph_snap_context *snapc;
435435d489f9SAlex Elder 	u32 i;
435535d489f9SAlex Elder 
435635d489f9SAlex Elder 	/*
435735d489f9SAlex Elder 	 * We'll need room for the seq value (maximum snapshot id),
435835d489f9SAlex Elder 	 * snapshot count, and array of that many snapshot ids.
435935d489f9SAlex Elder 	 * For now we have a fixed upper limit on the number we're
436035d489f9SAlex Elder 	 * prepared to receive.
436135d489f9SAlex Elder 	 */
436235d489f9SAlex Elder 	size = sizeof (__le64) + sizeof (__le32) +
436335d489f9SAlex Elder 			RBD_MAX_SNAP_COUNT * sizeof (__le64);
436435d489f9SAlex Elder 	reply_buf = kzalloc(size, GFP_KERNEL);
436535d489f9SAlex Elder 	if (!reply_buf)
436635d489f9SAlex Elder 		return -ENOMEM;
436735d489f9SAlex Elder 
436836be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
43694157976bSAlex Elder 				"rbd", "get_snapcontext", NULL, 0,
4370e2a58ee5SAlex Elder 				reply_buf, size);
437136be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
437235d489f9SAlex Elder 	if (ret < 0)
437335d489f9SAlex Elder 		goto out;
437435d489f9SAlex Elder 
437535d489f9SAlex Elder 	p = reply_buf;
437657385b51SAlex Elder 	end = reply_buf + ret;
437757385b51SAlex Elder 	ret = -ERANGE;
437835d489f9SAlex Elder 	ceph_decode_64_safe(&p, end, seq, out);
437935d489f9SAlex Elder 	ceph_decode_32_safe(&p, end, snap_count, out);
438035d489f9SAlex Elder 
438135d489f9SAlex Elder 	/*
438235d489f9SAlex Elder 	 * Make sure the reported number of snapshot ids wouldn't go
438335d489f9SAlex Elder 	 * beyond the end of our buffer.  But before checking that,
438435d489f9SAlex Elder 	 * make sure the computed size of the snapshot context we
438535d489f9SAlex Elder 	 * allocate is representable in a size_t.
438635d489f9SAlex Elder 	 */
438735d489f9SAlex Elder 	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
438835d489f9SAlex Elder 				 / sizeof (u64)) {
438935d489f9SAlex Elder 		ret = -EINVAL;
439035d489f9SAlex Elder 		goto out;
439135d489f9SAlex Elder 	}
439235d489f9SAlex Elder 	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
439335d489f9SAlex Elder 		goto out;
4394468521c1SAlex Elder 	ret = 0;
439535d489f9SAlex Elder 
4396812164f8SAlex Elder 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
439735d489f9SAlex Elder 	if (!snapc) {
439835d489f9SAlex Elder 		ret = -ENOMEM;
439935d489f9SAlex Elder 		goto out;
440035d489f9SAlex Elder 	}
440135d489f9SAlex Elder 	snapc->seq = seq;
440235d489f9SAlex Elder 	for (i = 0; i < snap_count; i++)
440335d489f9SAlex Elder 		snapc->snaps[i] = ceph_decode_64(&p);
440435d489f9SAlex Elder 
440549ece554SAlex Elder 	ceph_put_snap_context(rbd_dev->header.snapc);
440635d489f9SAlex Elder 	rbd_dev->header.snapc = snapc;
440735d489f9SAlex Elder 
440835d489f9SAlex Elder 	dout("  snap context seq = %llu, snap_count = %u\n",
440935d489f9SAlex Elder 		(unsigned long long)seq, (unsigned int)snap_count);
441035d489f9SAlex Elder out:
441135d489f9SAlex Elder 	kfree(reply_buf);
441235d489f9SAlex Elder 
441357385b51SAlex Elder 	return ret;
441435d489f9SAlex Elder }
441535d489f9SAlex Elder 
441654cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
441754cac61fSAlex Elder 					u64 snap_id)
4418b8b1e2dbSAlex Elder {
4419b8b1e2dbSAlex Elder 	size_t size;
4420b8b1e2dbSAlex Elder 	void *reply_buf;
442154cac61fSAlex Elder 	__le64 snapid;
4422b8b1e2dbSAlex Elder 	int ret;
4423b8b1e2dbSAlex Elder 	void *p;
4424b8b1e2dbSAlex Elder 	void *end;
4425b8b1e2dbSAlex Elder 	char *snap_name;
4426b8b1e2dbSAlex Elder 
4427b8b1e2dbSAlex Elder 	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
4428b8b1e2dbSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
4429b8b1e2dbSAlex Elder 	if (!reply_buf)
4430b8b1e2dbSAlex Elder 		return ERR_PTR(-ENOMEM);
4431b8b1e2dbSAlex Elder 
443254cac61fSAlex Elder 	snapid = cpu_to_le64(snap_id);
443336be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
4434b8b1e2dbSAlex Elder 				"rbd", "get_snapshot_name",
443554cac61fSAlex Elder 				&snapid, sizeof (snapid),
4436e2a58ee5SAlex Elder 				reply_buf, size);
443736be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4438f40eb349SAlex Elder 	if (ret < 0) {
4439f40eb349SAlex Elder 		snap_name = ERR_PTR(ret);
4440b8b1e2dbSAlex Elder 		goto out;
4441f40eb349SAlex Elder 	}
4442b8b1e2dbSAlex Elder 
4443b8b1e2dbSAlex Elder 	p = reply_buf;
4444f40eb349SAlex Elder 	end = reply_buf + ret;
4445e5c35534SAlex Elder 	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
4446f40eb349SAlex Elder 	if (IS_ERR(snap_name))
4447b8b1e2dbSAlex Elder 		goto out;
4448f40eb349SAlex Elder 
4449b8b1e2dbSAlex Elder 	dout("  snap_id 0x%016llx snap_name = %s\n",
445054cac61fSAlex Elder 		(unsigned long long)snap_id, snap_name);
4451b8b1e2dbSAlex Elder out:
4452b8b1e2dbSAlex Elder 	kfree(reply_buf);
4453b8b1e2dbSAlex Elder 
4454f40eb349SAlex Elder 	return snap_name;
4455b8b1e2dbSAlex Elder }
4456b8b1e2dbSAlex Elder 
44572df3fac7SAlex Elder static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
4458117973fbSAlex Elder {
44592df3fac7SAlex Elder 	bool first_time = rbd_dev->header.object_prefix == NULL;
4460117973fbSAlex Elder 	int ret;
4461117973fbSAlex Elder 
44621617e40cSJosh Durgin 	ret = rbd_dev_v2_image_size(rbd_dev);
44631617e40cSJosh Durgin 	if (ret)
4464cfbf6377SAlex Elder 		return ret;
44651617e40cSJosh Durgin 
44662df3fac7SAlex Elder 	if (first_time) {
44672df3fac7SAlex Elder 		ret = rbd_dev_v2_header_onetime(rbd_dev);
44682df3fac7SAlex Elder 		if (ret)
4469cfbf6377SAlex Elder 			return ret;
44702df3fac7SAlex Elder 	}
44712df3fac7SAlex Elder 
4472642a2537SAlex Elder 	/*
4473642a2537SAlex Elder 	 * If the image supports layering, get the parent info.  We
4474642a2537SAlex Elder 	 * need to probe the first time regardless.  Thereafter we
4475642a2537SAlex Elder 	 * only need to if there's a parent, to see if it has
4476642a2537SAlex Elder 	 * disappeared due to the mapped image getting flattened.
4477642a2537SAlex Elder 	 */
4478642a2537SAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_LAYERING &&
4479642a2537SAlex Elder 			(first_time || rbd_dev->parent_spec)) {
4480642a2537SAlex Elder 		bool warn;
4481642a2537SAlex Elder 
4482642a2537SAlex Elder 		ret = rbd_dev_v2_parent_info(rbd_dev);
4483642a2537SAlex Elder 		if (ret)
4484cfbf6377SAlex Elder 			return ret;
4485642a2537SAlex Elder 
4486642a2537SAlex Elder 		/*
4487642a2537SAlex Elder 		 * Print a warning if this is the initial probe and
4488642a2537SAlex Elder 		 * the image has a parent.  Don't print it if the
4489642a2537SAlex Elder 		 * image now being probed is itself a parent.  We
4490642a2537SAlex Elder 		 * can tell at this point because we won't know its
4491642a2537SAlex Elder 		 * pool name yet (just its pool id).
4492642a2537SAlex Elder 		 */
4493642a2537SAlex Elder 		warn = rbd_dev->parent_spec && rbd_dev->spec->pool_name;
4494642a2537SAlex Elder 		if (first_time && warn)
4495642a2537SAlex Elder 			rbd_warn(rbd_dev, "WARNING: kernel layering "
4496642a2537SAlex Elder 					"is EXPERIMENTAL!");
4497642a2537SAlex Elder 	}
4498642a2537SAlex Elder 
449929334ba4SAlex Elder 	if (rbd_dev->spec->snap_id == CEPH_NOSNAP)
450029334ba4SAlex Elder 		if (rbd_dev->mapping.size != rbd_dev->header.image_size)
450129334ba4SAlex Elder 			rbd_dev->mapping.size = rbd_dev->header.image_size;
4502117973fbSAlex Elder 
4503cc4a38bdSAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev);
4504117973fbSAlex Elder 	dout("rbd_dev_v2_snap_context returned %d\n", ret);
4505117973fbSAlex Elder 
4506117973fbSAlex Elder 	return ret;
4507117973fbSAlex Elder }
4508117973fbSAlex Elder 
4509dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
4510dfc5606dSYehuda Sadeh {
4511dfc5606dSYehuda Sadeh 	struct device *dev;
4512cd789ab9SAlex Elder 	int ret;
4513dfc5606dSYehuda Sadeh 
4514cd789ab9SAlex Elder 	dev = &rbd_dev->dev;
4515dfc5606dSYehuda Sadeh 	dev->bus = &rbd_bus_type;
4516dfc5606dSYehuda Sadeh 	dev->type = &rbd_device_type;
4517dfc5606dSYehuda Sadeh 	dev->parent = &rbd_root_dev;
4518200a6a8bSAlex Elder 	dev->release = rbd_dev_device_release;
4519de71a297SAlex Elder 	dev_set_name(dev, "%d", rbd_dev->dev_id);
4520dfc5606dSYehuda Sadeh 	ret = device_register(dev);
4521dfc5606dSYehuda Sadeh 
4522dfc5606dSYehuda Sadeh 	return ret;
4523602adf40SYehuda Sadeh }
4524602adf40SYehuda Sadeh 
4525dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
4526dfc5606dSYehuda Sadeh {
4527dfc5606dSYehuda Sadeh 	device_unregister(&rbd_dev->dev);
4528dfc5606dSYehuda Sadeh }
4529dfc5606dSYehuda Sadeh 
45301ddbe94eSAlex Elder /*
4531499afd5bSAlex Elder  * Get a unique rbd identifier for the given new rbd_dev, and add
4532f8a22fc2SIlya Dryomov  * the rbd_dev to the global list.
45331ddbe94eSAlex Elder  */
4534f8a22fc2SIlya Dryomov static int rbd_dev_id_get(struct rbd_device *rbd_dev)
4535b7f23c36SAlex Elder {
4536f8a22fc2SIlya Dryomov 	int new_dev_id;
4537f8a22fc2SIlya Dryomov 
45389b60e70bSIlya Dryomov 	new_dev_id = ida_simple_get(&rbd_dev_id_ida,
45399b60e70bSIlya Dryomov 				    0, minor_to_rbd_dev_id(1 << MINORBITS),
45409b60e70bSIlya Dryomov 				    GFP_KERNEL);
4541f8a22fc2SIlya Dryomov 	if (new_dev_id < 0)
4542f8a22fc2SIlya Dryomov 		return new_dev_id;
4543f8a22fc2SIlya Dryomov 
4544f8a22fc2SIlya Dryomov 	rbd_dev->dev_id = new_dev_id;
4545499afd5bSAlex Elder 
4546499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
4547499afd5bSAlex Elder 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
4548499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
4549f8a22fc2SIlya Dryomov 
455070eebd20SIlya Dryomov 	dout("rbd_dev %p given dev id %d\n", rbd_dev, rbd_dev->dev_id);
4551f8a22fc2SIlya Dryomov 
4552f8a22fc2SIlya Dryomov 	return 0;
4553b7f23c36SAlex Elder }
4554b7f23c36SAlex Elder 
45551ddbe94eSAlex Elder /*
4556499afd5bSAlex Elder  * Remove an rbd_dev from the global list, and record that its
4557499afd5bSAlex Elder  * identifier is no longer in use.
45581ddbe94eSAlex Elder  */
4559e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev)
45601ddbe94eSAlex Elder {
4561499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
4562499afd5bSAlex Elder 	list_del_init(&rbd_dev->node);
4563499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
45641ddbe94eSAlex Elder 
4565f8a22fc2SIlya Dryomov 	ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
4566f8a22fc2SIlya Dryomov 
4567f8a22fc2SIlya Dryomov 	dout("rbd_dev %p released dev id %d\n", rbd_dev, rbd_dev->dev_id);
4568b7f23c36SAlex Elder }
4569b7f23c36SAlex Elder 
/*
 * Advance *buf past any leading white space so that it points at the
 * first non-space character (or at the terminating '\0' if there is
 * none), and return the length of the token -- the run of non-white
 * space characters -- that starts there.  The string at *buf must be
 * terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the "C"
	 * and "POSIX" locales.
	 */
	static const char whitespace[] = " \f\n\r\t\v";
	const char *start = *buf;

	start += strspn(start, whitespace);	/* skip to the token */
	*buf = start;

	return strcspn(start, whitespace);	/* measure the token */
}
4588e28fff26SAlex Elder 
/*
 * Locate the next token in *buf and, provided it fits, copy it into
 * the caller's token buffer as a '\0'-terminated string.  The string
 * at *buf must be terminated with '\0' on entry.
 *
 * Returns the length of the token (excluding the '\0').  A return of
 * 0 means no token was found; a return >= token_size means the token
 * did not fit and the token buffer was left untouched.
 *
 * In every case *buf is advanced to just past the token found, even
 * when the token was too long to be copied.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t token_len = next_token(buf);
	const char *start = *buf;

	*buf = start + token_len;
	if (token_len >= token_size)
		return token_len;	/* no room for token and '\0' */

	memcpy(token, start, token_len);
	token[token_len] = '\0';

	return token_len;
}
4618e28fff26SAlex Elder 
4619e28fff26SAlex Elder /*
4620ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
4621ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
4622ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
4623ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
4624ea3352f4SAlex Elder  *
4625ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
4626ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
4627ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
4628ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
4629ea3352f4SAlex Elder  *
4630ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
4631ea3352f4SAlex Elder  * the end of the found token.
4632ea3352f4SAlex Elder  *
4633ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
4634ea3352f4SAlex Elder  */
4635ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
4636ea3352f4SAlex Elder {
4637ea3352f4SAlex Elder 	char *dup;
4638ea3352f4SAlex Elder 	size_t len;
4639ea3352f4SAlex Elder 
4640ea3352f4SAlex Elder 	len = next_token(buf);
46414caf35f9SAlex Elder 	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
4642ea3352f4SAlex Elder 	if (!dup)
4643ea3352f4SAlex Elder 		return NULL;
4644ea3352f4SAlex Elder 	*(dup + len) = '\0';
4645ea3352f4SAlex Elder 	*buf += len;
4646ea3352f4SAlex Elder 
4647ea3352f4SAlex Elder 	if (lenp)
4648ea3352f4SAlex Elder 		*lenp = len;
4649ea3352f4SAlex Elder 
4650ea3352f4SAlex Elder 	return dup;
4651ea3352f4SAlex Elder }
4652ea3352f4SAlex Elder 
4653ea3352f4SAlex Elder /*
4654859c31dfSAlex Elder  * Parse the options provided for an "rbd add" (i.e., rbd image
4655859c31dfSAlex Elder  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
4656859c31dfSAlex Elder  * and the data written is passed here via a NUL-terminated buffer.
4657859c31dfSAlex Elder  * Returns 0 if successful or an error code otherwise.
4658d22f76e7SAlex Elder  *
4659859c31dfSAlex Elder  * The information extracted from these options is recorded in
4660859c31dfSAlex Elder  * the other parameters which return dynamically-allocated
4661859c31dfSAlex Elder  * structures:
4662859c31dfSAlex Elder  *  ceph_opts
4663859c31dfSAlex Elder  *      The address of a pointer that will refer to a ceph options
4664859c31dfSAlex Elder  *      structure.  Caller must release the returned pointer using
4665859c31dfSAlex Elder  *      ceph_destroy_options() when it is no longer needed.
4666859c31dfSAlex Elder  *  rbd_opts
4667859c31dfSAlex Elder  *	Address of an rbd options pointer.  Fully initialized by
4668859c31dfSAlex Elder  *	this function; caller must release with kfree().
4669859c31dfSAlex Elder  *  spec
4670859c31dfSAlex Elder  *	Address of an rbd image specification pointer.  Fully
4671859c31dfSAlex Elder  *	initialized by this function based on parsed options.
4672859c31dfSAlex Elder  *	Caller must release with rbd_spec_put().
4673859c31dfSAlex Elder  *
4674859c31dfSAlex Elder  * The options passed take this form:
4675859c31dfSAlex Elder  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
4676859c31dfSAlex Elder  * where:
4677859c31dfSAlex Elder  *  <mon_addrs>
4678859c31dfSAlex Elder  *      A comma-separated list of one or more monitor addresses.
4679859c31dfSAlex Elder  *      A monitor address is an ip address, optionally followed
4680859c31dfSAlex Elder  *      by a port number (separated by a colon).
4681859c31dfSAlex Elder  *        I.e.:  ip1[:port1][,ip2[:port2]...]
4682859c31dfSAlex Elder  *  <options>
4683859c31dfSAlex Elder  *      A comma-separated list of ceph and/or rbd options.
4684859c31dfSAlex Elder  *  <pool_name>
4685859c31dfSAlex Elder  *      The name of the rados pool containing the rbd image.
4686859c31dfSAlex Elder  *  <image_name>
4687859c31dfSAlex Elder  *      The name of the image in that pool to map.
4688859c31dfSAlex Elder  *  <snap_id>
4689859c31dfSAlex Elder  *      An optional snapshot id.  If provided, the mapping will
4690859c31dfSAlex Elder  *      present data from the image at the time that snapshot was
4691859c31dfSAlex Elder  *      created.  The image head is used if no snapshot id is
4692859c31dfSAlex Elder  *      provided.  Snapshot mappings are always read-only.
4693a725f65eSAlex Elder  */
4694859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf,
4695dc79b113SAlex Elder 				struct ceph_options **ceph_opts,
4696859c31dfSAlex Elder 				struct rbd_options **opts,
4697859c31dfSAlex Elder 				struct rbd_spec **rbd_spec)
4698a725f65eSAlex Elder {
4699e28fff26SAlex Elder 	size_t len;
4700859c31dfSAlex Elder 	char *options;
47010ddebc0cSAlex Elder 	const char *mon_addrs;
4702ecb4dc22SAlex Elder 	char *snap_name;
47030ddebc0cSAlex Elder 	size_t mon_addrs_size;
4704859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
47054e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
4706859c31dfSAlex Elder 	struct ceph_options *copts;
4707dc79b113SAlex Elder 	int ret;
4708e28fff26SAlex Elder 
4709e28fff26SAlex Elder 	/* The first four tokens are required */
4710e28fff26SAlex Elder 
47117ef3214aSAlex Elder 	len = next_token(&buf);
47124fb5d671SAlex Elder 	if (!len) {
47134fb5d671SAlex Elder 		rbd_warn(NULL, "no monitor address(es) provided");
47144fb5d671SAlex Elder 		return -EINVAL;
47154fb5d671SAlex Elder 	}
47160ddebc0cSAlex Elder 	mon_addrs = buf;
4717f28e565aSAlex Elder 	mon_addrs_size = len + 1;
47187ef3214aSAlex Elder 	buf += len;
4719a725f65eSAlex Elder 
4720dc79b113SAlex Elder 	ret = -EINVAL;
4721f28e565aSAlex Elder 	options = dup_token(&buf, NULL);
4722f28e565aSAlex Elder 	if (!options)
4723dc79b113SAlex Elder 		return -ENOMEM;
47244fb5d671SAlex Elder 	if (!*options) {
47254fb5d671SAlex Elder 		rbd_warn(NULL, "no options provided");
47264fb5d671SAlex Elder 		goto out_err;
47274fb5d671SAlex Elder 	}
4728a725f65eSAlex Elder 
4729859c31dfSAlex Elder 	spec = rbd_spec_alloc();
4730859c31dfSAlex Elder 	if (!spec)
4731f28e565aSAlex Elder 		goto out_mem;
4732859c31dfSAlex Elder 
4733859c31dfSAlex Elder 	spec->pool_name = dup_token(&buf, NULL);
4734859c31dfSAlex Elder 	if (!spec->pool_name)
4735859c31dfSAlex Elder 		goto out_mem;
47364fb5d671SAlex Elder 	if (!*spec->pool_name) {
47374fb5d671SAlex Elder 		rbd_warn(NULL, "no pool name provided");
47384fb5d671SAlex Elder 		goto out_err;
47394fb5d671SAlex Elder 	}
4740e28fff26SAlex Elder 
474169e7a02fSAlex Elder 	spec->image_name = dup_token(&buf, NULL);
4742859c31dfSAlex Elder 	if (!spec->image_name)
4743f28e565aSAlex Elder 		goto out_mem;
47444fb5d671SAlex Elder 	if (!*spec->image_name) {
47454fb5d671SAlex Elder 		rbd_warn(NULL, "no image name provided");
47464fb5d671SAlex Elder 		goto out_err;
47474fb5d671SAlex Elder 	}
4748e28fff26SAlex Elder 
4749f28e565aSAlex Elder 	/*
4750f28e565aSAlex Elder 	 * Snapshot name is optional; default is to use "-"
4751f28e565aSAlex Elder 	 * (indicating the head/no snapshot).
4752f28e565aSAlex Elder 	 */
47533feeb894SAlex Elder 	len = next_token(&buf);
4754820a5f3eSAlex Elder 	if (!len) {
47553feeb894SAlex Elder 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
47563feeb894SAlex Elder 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
4757f28e565aSAlex Elder 	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
4758dc79b113SAlex Elder 		ret = -ENAMETOOLONG;
4759f28e565aSAlex Elder 		goto out_err;
4760849b4260SAlex Elder 	}
4761ecb4dc22SAlex Elder 	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
4762ecb4dc22SAlex Elder 	if (!snap_name)
4763f28e565aSAlex Elder 		goto out_mem;
4764ecb4dc22SAlex Elder 	*(snap_name + len) = '\0';
4765ecb4dc22SAlex Elder 	spec->snap_name = snap_name;
4766e5c35534SAlex Elder 
47670ddebc0cSAlex Elder 	/* Initialize all rbd options to the defaults */
4768e28fff26SAlex Elder 
47694e9afebaSAlex Elder 	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
47704e9afebaSAlex Elder 	if (!rbd_opts)
47714e9afebaSAlex Elder 		goto out_mem;
47724e9afebaSAlex Elder 
47734e9afebaSAlex Elder 	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
4774d22f76e7SAlex Elder 
4775859c31dfSAlex Elder 	copts = ceph_parse_options(options, mon_addrs,
47760ddebc0cSAlex Elder 					mon_addrs + mon_addrs_size - 1,
47774e9afebaSAlex Elder 					parse_rbd_opts_token, rbd_opts);
4778859c31dfSAlex Elder 	if (IS_ERR(copts)) {
4779859c31dfSAlex Elder 		ret = PTR_ERR(copts);
4780dc79b113SAlex Elder 		goto out_err;
4781dc79b113SAlex Elder 	}
4782859c31dfSAlex Elder 	kfree(options);
4783859c31dfSAlex Elder 
4784859c31dfSAlex Elder 	*ceph_opts = copts;
47854e9afebaSAlex Elder 	*opts = rbd_opts;
4786859c31dfSAlex Elder 	*rbd_spec = spec;
47870ddebc0cSAlex Elder 
4788dc79b113SAlex Elder 	return 0;
4789f28e565aSAlex Elder out_mem:
4790dc79b113SAlex Elder 	ret = -ENOMEM;
4791d22f76e7SAlex Elder out_err:
4792859c31dfSAlex Elder 	kfree(rbd_opts);
4793859c31dfSAlex Elder 	rbd_spec_put(spec);
4794f28e565aSAlex Elder 	kfree(options);
4795d22f76e7SAlex Elder 
4796dc79b113SAlex Elder 	return ret;
4797a725f65eSAlex Elder }
4798a725f65eSAlex Elder 
4799589d30e0SAlex Elder /*
480030ba1f02SIlya Dryomov  * Return pool id (>= 0) or a negative error code.
480130ba1f02SIlya Dryomov  */
480230ba1f02SIlya Dryomov static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
480330ba1f02SIlya Dryomov {
480430ba1f02SIlya Dryomov 	u64 newest_epoch;
480530ba1f02SIlya Dryomov 	unsigned long timeout = rbdc->client->options->mount_timeout * HZ;
480630ba1f02SIlya Dryomov 	int tries = 0;
480730ba1f02SIlya Dryomov 	int ret;
480830ba1f02SIlya Dryomov 
480930ba1f02SIlya Dryomov again:
481030ba1f02SIlya Dryomov 	ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name);
481130ba1f02SIlya Dryomov 	if (ret == -ENOENT && tries++ < 1) {
481230ba1f02SIlya Dryomov 		ret = ceph_monc_do_get_version(&rbdc->client->monc, "osdmap",
481330ba1f02SIlya Dryomov 					       &newest_epoch);
481430ba1f02SIlya Dryomov 		if (ret < 0)
481530ba1f02SIlya Dryomov 			return ret;
481630ba1f02SIlya Dryomov 
481730ba1f02SIlya Dryomov 		if (rbdc->client->osdc.osdmap->epoch < newest_epoch) {
481830ba1f02SIlya Dryomov 			ceph_monc_request_next_osdmap(&rbdc->client->monc);
481930ba1f02SIlya Dryomov 			(void) ceph_monc_wait_osdmap(&rbdc->client->monc,
482030ba1f02SIlya Dryomov 						     newest_epoch, timeout);
482130ba1f02SIlya Dryomov 			goto again;
482230ba1f02SIlya Dryomov 		} else {
482330ba1f02SIlya Dryomov 			/* the osdmap we have is new enough */
482430ba1f02SIlya Dryomov 			return -ENOENT;
482530ba1f02SIlya Dryomov 		}
482630ba1f02SIlya Dryomov 	}
482730ba1f02SIlya Dryomov 
482830ba1f02SIlya Dryomov 	return ret;
482930ba1f02SIlya Dryomov }
483030ba1f02SIlya Dryomov 
483130ba1f02SIlya Dryomov /*
4832589d30e0SAlex Elder  * An rbd format 2 image has a unique identifier, distinct from the
4833589d30e0SAlex Elder  * name given to it by the user.  Internally, that identifier is
4834589d30e0SAlex Elder  * what's used to specify the names of objects related to the image.
4835589d30e0SAlex Elder  *
4836589d30e0SAlex Elder  * A special "rbd id" object is used to map an rbd image name to its
4837589d30e0SAlex Elder  * id.  If that object doesn't exist, then there is no v2 rbd image
4838589d30e0SAlex Elder  * with the supplied name.
4839589d30e0SAlex Elder  *
4840589d30e0SAlex Elder  * This function will record the given rbd_dev's image_id field if
4841589d30e0SAlex Elder  * it can be determined, and in that case will return 0.  If any
4842589d30e0SAlex Elder  * errors occur a negative errno will be returned and the rbd_dev's
4843589d30e0SAlex Elder  * image_id field will be unchanged (and should be NULL).
4844589d30e0SAlex Elder  */
4845589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev)
4846589d30e0SAlex Elder {
4847589d30e0SAlex Elder 	int ret;
4848589d30e0SAlex Elder 	size_t size;
4849589d30e0SAlex Elder 	char *object_name;
4850589d30e0SAlex Elder 	void *response;
4851c0fba368SAlex Elder 	char *image_id;
48522f82ee54SAlex Elder 
4853589d30e0SAlex Elder 	/*
48542c0d0a10SAlex Elder 	 * When probing a parent image, the image id is already
48552c0d0a10SAlex Elder 	 * known (and the image name likely is not).  There's no
4856c0fba368SAlex Elder 	 * need to fetch the image id again in this case.  We
4857c0fba368SAlex Elder 	 * do still need to set the image format though.
48582c0d0a10SAlex Elder 	 */
4859c0fba368SAlex Elder 	if (rbd_dev->spec->image_id) {
4860c0fba368SAlex Elder 		rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
4861c0fba368SAlex Elder 
48622c0d0a10SAlex Elder 		return 0;
4863c0fba368SAlex Elder 	}
48642c0d0a10SAlex Elder 
48652c0d0a10SAlex Elder 	/*
4866589d30e0SAlex Elder 	 * First, see if the format 2 image id file exists, and if
4867589d30e0SAlex Elder 	 * so, get the image's persistent id from it.
4868589d30e0SAlex Elder 	 */
486969e7a02fSAlex Elder 	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
4870589d30e0SAlex Elder 	object_name = kmalloc(size, GFP_NOIO);
4871589d30e0SAlex Elder 	if (!object_name)
4872589d30e0SAlex Elder 		return -ENOMEM;
48730d7dbfceSAlex Elder 	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
4874589d30e0SAlex Elder 	dout("rbd id object name is %s\n", object_name);
4875589d30e0SAlex Elder 
4876589d30e0SAlex Elder 	/* Response will be an encoded string, which includes a length */
4877589d30e0SAlex Elder 
4878589d30e0SAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
4879589d30e0SAlex Elder 	response = kzalloc(size, GFP_NOIO);
4880589d30e0SAlex Elder 	if (!response) {
4881589d30e0SAlex Elder 		ret = -ENOMEM;
4882589d30e0SAlex Elder 		goto out;
4883589d30e0SAlex Elder 	}
4884589d30e0SAlex Elder 
4885c0fba368SAlex Elder 	/* If it doesn't exist we'll assume it's a format 1 image */
4886c0fba368SAlex Elder 
488736be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, object_name,
48884157976bSAlex Elder 				"rbd", "get_id", NULL, 0,
4889e2a58ee5SAlex Elder 				response, RBD_IMAGE_ID_LEN_MAX);
489036be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4891c0fba368SAlex Elder 	if (ret == -ENOENT) {
4892c0fba368SAlex Elder 		image_id = kstrdup("", GFP_KERNEL);
4893c0fba368SAlex Elder 		ret = image_id ? 0 : -ENOMEM;
4894c0fba368SAlex Elder 		if (!ret)
4895c0fba368SAlex Elder 			rbd_dev->image_format = 1;
4896c0fba368SAlex Elder 	} else if (ret > sizeof (__le32)) {
4897c0fba368SAlex Elder 		void *p = response;
4898589d30e0SAlex Elder 
4899c0fba368SAlex Elder 		image_id = ceph_extract_encoded_string(&p, p + ret,
4900979ed480SAlex Elder 						NULL, GFP_NOIO);
4901461f758aSDuan Jiong 		ret = PTR_ERR_OR_ZERO(image_id);
4902c0fba368SAlex Elder 		if (!ret)
4903c0fba368SAlex Elder 			rbd_dev->image_format = 2;
4904589d30e0SAlex Elder 	} else {
4905c0fba368SAlex Elder 		ret = -EINVAL;
4906c0fba368SAlex Elder 	}
4907c0fba368SAlex Elder 
4908c0fba368SAlex Elder 	if (!ret) {
4909c0fba368SAlex Elder 		rbd_dev->spec->image_id = image_id;
4910c0fba368SAlex Elder 		dout("image_id is %s\n", image_id);
4911589d30e0SAlex Elder 	}
4912589d30e0SAlex Elder out:
4913589d30e0SAlex Elder 	kfree(response);
4914589d30e0SAlex Elder 	kfree(object_name);
4915589d30e0SAlex Elder 
4916589d30e0SAlex Elder 	return ret;
4917589d30e0SAlex Elder }
4918589d30e0SAlex Elder 
49193abef3b3SAlex Elder /*
49203abef3b3SAlex Elder  * Undo whatever state changes are made by v1 or v2 header info
49213abef3b3SAlex Elder  * call.
49223abef3b3SAlex Elder  */
49236fd48b3bSAlex Elder static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
49246fd48b3bSAlex Elder {
49256fd48b3bSAlex Elder 	struct rbd_image_header	*header;
49266fd48b3bSAlex Elder 
4927392a9dadSAlex Elder 	/* Drop parent reference unless it's already been done (or none) */
4928392a9dadSAlex Elder 
4929392a9dadSAlex Elder 	if (rbd_dev->parent_overlap)
4930a2acd00eSAlex Elder 		rbd_dev_parent_put(rbd_dev);
49316fd48b3bSAlex Elder 
49326fd48b3bSAlex Elder 	/* Free dynamic fields from the header, then zero it out */
49336fd48b3bSAlex Elder 
49346fd48b3bSAlex Elder 	header = &rbd_dev->header;
4935812164f8SAlex Elder 	ceph_put_snap_context(header->snapc);
49366fd48b3bSAlex Elder 	kfree(header->snap_sizes);
49376fd48b3bSAlex Elder 	kfree(header->snap_names);
49386fd48b3bSAlex Elder 	kfree(header->object_prefix);
49396fd48b3bSAlex Elder 	memset(header, 0, sizeof (*header));
49406fd48b3bSAlex Elder }
49416fd48b3bSAlex Elder 
49422df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
4943a30b71b9SAlex Elder {
4944a30b71b9SAlex Elder 	int ret;
4945a30b71b9SAlex Elder 
49461e130199SAlex Elder 	ret = rbd_dev_v2_object_prefix(rbd_dev);
494757385b51SAlex Elder 	if (ret)
49481e130199SAlex Elder 		goto out_err;
4949b1b5402aSAlex Elder 
49502df3fac7SAlex Elder 	/*
49512df3fac7SAlex Elder 	 * Get the and check features for the image.  Currently the
49522df3fac7SAlex Elder 	 * features are assumed to never change.
49532df3fac7SAlex Elder 	 */
4954b1b5402aSAlex Elder 	ret = rbd_dev_v2_features(rbd_dev);
495557385b51SAlex Elder 	if (ret)
4956b1b5402aSAlex Elder 		goto out_err;
495735d489f9SAlex Elder 
4958cc070d59SAlex Elder 	/* If the image supports fancy striping, get its parameters */
4959cc070d59SAlex Elder 
4960cc070d59SAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
4961cc070d59SAlex Elder 		ret = rbd_dev_v2_striping_info(rbd_dev);
4962cc070d59SAlex Elder 		if (ret < 0)
4963cc070d59SAlex Elder 			goto out_err;
4964cc070d59SAlex Elder 	}
49652df3fac7SAlex Elder 	/* No support for crypto and compression type format 2 images */
4966a30b71b9SAlex Elder 
496735152979SAlex Elder 	return 0;
49689d475de5SAlex Elder out_err:
4969642a2537SAlex Elder 	rbd_dev->header.features = 0;
49701e130199SAlex Elder 	kfree(rbd_dev->header.object_prefix);
49711e130199SAlex Elder 	rbd_dev->header.object_prefix = NULL;
49729d475de5SAlex Elder 
49739d475de5SAlex Elder 	return ret;
4974a30b71b9SAlex Elder }
4975a30b71b9SAlex Elder 
4976124afba2SAlex Elder static int rbd_dev_probe_parent(struct rbd_device *rbd_dev)
497783a06263SAlex Elder {
49782f82ee54SAlex Elder 	struct rbd_device *parent = NULL;
4979124afba2SAlex Elder 	struct rbd_spec *parent_spec;
4980124afba2SAlex Elder 	struct rbd_client *rbdc;
4981124afba2SAlex Elder 	int ret;
4982124afba2SAlex Elder 
4983124afba2SAlex Elder 	if (!rbd_dev->parent_spec)
4984124afba2SAlex Elder 		return 0;
4985124afba2SAlex Elder 	/*
4986124afba2SAlex Elder 	 * We need to pass a reference to the client and the parent
4987124afba2SAlex Elder 	 * spec when creating the parent rbd_dev.  Images related by
4988124afba2SAlex Elder 	 * parent/child relationships always share both.
4989124afba2SAlex Elder 	 */
4990124afba2SAlex Elder 	parent_spec = rbd_spec_get(rbd_dev->parent_spec);
4991124afba2SAlex Elder 	rbdc = __rbd_get_client(rbd_dev->rbd_client);
4992124afba2SAlex Elder 
4993124afba2SAlex Elder 	ret = -ENOMEM;
4994124afba2SAlex Elder 	parent = rbd_dev_create(rbdc, parent_spec);
4995124afba2SAlex Elder 	if (!parent)
4996124afba2SAlex Elder 		goto out_err;
4997124afba2SAlex Elder 
49981f3ef788SAlex Elder 	ret = rbd_dev_image_probe(parent, false);
4999124afba2SAlex Elder 	if (ret < 0)
5000124afba2SAlex Elder 		goto out_err;
5001124afba2SAlex Elder 	rbd_dev->parent = parent;
5002a2acd00eSAlex Elder 	atomic_set(&rbd_dev->parent_ref, 1);
5003124afba2SAlex Elder 
5004124afba2SAlex Elder 	return 0;
5005124afba2SAlex Elder out_err:
5006124afba2SAlex Elder 	if (parent) {
5007fb65d228SAlex Elder 		rbd_dev_unparent(rbd_dev);
5008124afba2SAlex Elder 		kfree(rbd_dev->header_name);
5009124afba2SAlex Elder 		rbd_dev_destroy(parent);
5010124afba2SAlex Elder 	} else {
5011124afba2SAlex Elder 		rbd_put_client(rbdc);
5012124afba2SAlex Elder 		rbd_spec_put(parent_spec);
5013124afba2SAlex Elder 	}
5014124afba2SAlex Elder 
5015124afba2SAlex Elder 	return ret;
5016124afba2SAlex Elder }
5017124afba2SAlex Elder 
5018200a6a8bSAlex Elder static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
5019124afba2SAlex Elder {
502083a06263SAlex Elder 	int ret;
502183a06263SAlex Elder 
5022f8a22fc2SIlya Dryomov 	/* Get an id and fill in device name. */
502383a06263SAlex Elder 
5024f8a22fc2SIlya Dryomov 	ret = rbd_dev_id_get(rbd_dev);
5025f8a22fc2SIlya Dryomov 	if (ret)
5026f8a22fc2SIlya Dryomov 		return ret;
5027f8a22fc2SIlya Dryomov 
502883a06263SAlex Elder 	BUILD_BUG_ON(DEV_NAME_LEN
502983a06263SAlex Elder 			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
503083a06263SAlex Elder 	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
503183a06263SAlex Elder 
50329b60e70bSIlya Dryomov 	/* Record our major and minor device numbers. */
503383a06263SAlex Elder 
50349b60e70bSIlya Dryomov 	if (!single_major) {
503583a06263SAlex Elder 		ret = register_blkdev(0, rbd_dev->name);
503683a06263SAlex Elder 		if (ret < 0)
503783a06263SAlex Elder 			goto err_out_id;
50389b60e70bSIlya Dryomov 
503983a06263SAlex Elder 		rbd_dev->major = ret;
5040dd82fff1SIlya Dryomov 		rbd_dev->minor = 0;
50419b60e70bSIlya Dryomov 	} else {
50429b60e70bSIlya Dryomov 		rbd_dev->major = rbd_major;
50439b60e70bSIlya Dryomov 		rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
50449b60e70bSIlya Dryomov 	}
504583a06263SAlex Elder 
504683a06263SAlex Elder 	/* Set up the blkdev mapping. */
504783a06263SAlex Elder 
504883a06263SAlex Elder 	ret = rbd_init_disk(rbd_dev);
504983a06263SAlex Elder 	if (ret)
505083a06263SAlex Elder 		goto err_out_blkdev;
505183a06263SAlex Elder 
5052f35a4deeSAlex Elder 	ret = rbd_dev_mapping_set(rbd_dev);
505383a06263SAlex Elder 	if (ret)
505483a06263SAlex Elder 		goto err_out_disk;
5055f35a4deeSAlex Elder 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
5056f35a4deeSAlex Elder 
5057f35a4deeSAlex Elder 	ret = rbd_bus_add_dev(rbd_dev);
5058f35a4deeSAlex Elder 	if (ret)
5059f35a4deeSAlex Elder 		goto err_out_mapping;
506083a06263SAlex Elder 
506183a06263SAlex Elder 	/* Everything's ready.  Announce the disk to the world. */
506283a06263SAlex Elder 
5063129b79d4SAlex Elder 	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
506483a06263SAlex Elder 	add_disk(rbd_dev->disk);
506583a06263SAlex Elder 
506683a06263SAlex Elder 	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
506783a06263SAlex Elder 		(unsigned long long) rbd_dev->mapping.size);
506883a06263SAlex Elder 
506983a06263SAlex Elder 	return ret;
50702f82ee54SAlex Elder 
5071f35a4deeSAlex Elder err_out_mapping:
5072f35a4deeSAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
507383a06263SAlex Elder err_out_disk:
507483a06263SAlex Elder 	rbd_free_disk(rbd_dev);
507583a06263SAlex Elder err_out_blkdev:
50769b60e70bSIlya Dryomov 	if (!single_major)
507783a06263SAlex Elder 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
507883a06263SAlex Elder err_out_id:
507983a06263SAlex Elder 	rbd_dev_id_put(rbd_dev);
5080d1cf5788SAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
508183a06263SAlex Elder 
508283a06263SAlex Elder 	return ret;
508383a06263SAlex Elder }
508483a06263SAlex Elder 
5085332bb12dSAlex Elder static int rbd_dev_header_name(struct rbd_device *rbd_dev)
5086332bb12dSAlex Elder {
5087332bb12dSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
5088332bb12dSAlex Elder 	size_t size;
5089332bb12dSAlex Elder 
5090332bb12dSAlex Elder 	/* Record the header object name for this rbd image. */
5091332bb12dSAlex Elder 
5092332bb12dSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
5093332bb12dSAlex Elder 
5094332bb12dSAlex Elder 	if (rbd_dev->image_format == 1)
5095332bb12dSAlex Elder 		size = strlen(spec->image_name) + sizeof (RBD_SUFFIX);
5096332bb12dSAlex Elder 	else
5097332bb12dSAlex Elder 		size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id);
5098332bb12dSAlex Elder 
5099332bb12dSAlex Elder 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
5100332bb12dSAlex Elder 	if (!rbd_dev->header_name)
5101332bb12dSAlex Elder 		return -ENOMEM;
5102332bb12dSAlex Elder 
5103332bb12dSAlex Elder 	if (rbd_dev->image_format == 1)
5104332bb12dSAlex Elder 		sprintf(rbd_dev->header_name, "%s%s",
5105332bb12dSAlex Elder 			spec->image_name, RBD_SUFFIX);
5106332bb12dSAlex Elder 	else
5107332bb12dSAlex Elder 		sprintf(rbd_dev->header_name, "%s%s",
5108332bb12dSAlex Elder 			RBD_HEADER_PREFIX, spec->image_id);
5109332bb12dSAlex Elder 	return 0;
5110332bb12dSAlex Elder }
5111332bb12dSAlex Elder 
5112200a6a8bSAlex Elder static void rbd_dev_image_release(struct rbd_device *rbd_dev)
5113200a6a8bSAlex Elder {
51146fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
5115200a6a8bSAlex Elder 	kfree(rbd_dev->header_name);
51166fd48b3bSAlex Elder 	rbd_dev->header_name = NULL;
51176fd48b3bSAlex Elder 	rbd_dev->image_format = 0;
51186fd48b3bSAlex Elder 	kfree(rbd_dev->spec->image_id);
51196fd48b3bSAlex Elder 	rbd_dev->spec->image_id = NULL;
51206fd48b3bSAlex Elder 
5121200a6a8bSAlex Elder 	rbd_dev_destroy(rbd_dev);
5122200a6a8bSAlex Elder }
5123200a6a8bSAlex Elder 
5124a30b71b9SAlex Elder /*
5125a30b71b9SAlex Elder  * Probe for the existence of the header object for the given rbd
51261f3ef788SAlex Elder  * device.  If this image is the one being mapped (i.e., not a
51271f3ef788SAlex Elder  * parent), initiate a watch on its header object before using that
51281f3ef788SAlex Elder  * object to get detailed information about the rbd image.
5129a30b71b9SAlex Elder  */
51301f3ef788SAlex Elder static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping)
5131a30b71b9SAlex Elder {
5132a30b71b9SAlex Elder 	int ret;
5133a30b71b9SAlex Elder 
5134a30b71b9SAlex Elder 	/*
51353abef3b3SAlex Elder 	 * Get the id from the image id object.  Unless there's an
51363abef3b3SAlex Elder 	 * error, rbd_dev->spec->image_id will be filled in with
51373abef3b3SAlex Elder 	 * a dynamically-allocated string, and rbd_dev->image_format
51383abef3b3SAlex Elder 	 * will be set to either 1 or 2.
5139a30b71b9SAlex Elder 	 */
5140a30b71b9SAlex Elder 	ret = rbd_dev_image_id(rbd_dev);
5141a30b71b9SAlex Elder 	if (ret)
5142c0fba368SAlex Elder 		return ret;
5143c0fba368SAlex Elder 	rbd_assert(rbd_dev->spec->image_id);
5144c0fba368SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
5145c0fba368SAlex Elder 
5146332bb12dSAlex Elder 	ret = rbd_dev_header_name(rbd_dev);
5147332bb12dSAlex Elder 	if (ret)
5148332bb12dSAlex Elder 		goto err_out_format;
5149332bb12dSAlex Elder 
51501f3ef788SAlex Elder 	if (mapping) {
5151fca27065SIlya Dryomov 		ret = rbd_dev_header_watch_sync(rbd_dev);
5152b644de2bSAlex Elder 		if (ret)
5153b644de2bSAlex Elder 			goto out_header_name;
51541f3ef788SAlex Elder 	}
5155b644de2bSAlex Elder 
5156c0fba368SAlex Elder 	if (rbd_dev->image_format == 1)
515799a41ebcSAlex Elder 		ret = rbd_dev_v1_header_info(rbd_dev);
5158a30b71b9SAlex Elder 	else
51592df3fac7SAlex Elder 		ret = rbd_dev_v2_header_info(rbd_dev);
51605655c4d9SAlex Elder 	if (ret)
5161b644de2bSAlex Elder 		goto err_out_watch;
5162a30b71b9SAlex Elder 
51639bb81c9bSAlex Elder 	ret = rbd_dev_spec_update(rbd_dev);
51649bb81c9bSAlex Elder 	if (ret)
516533dca39fSAlex Elder 		goto err_out_probe;
51669bb81c9bSAlex Elder 
51679bb81c9bSAlex Elder 	ret = rbd_dev_probe_parent(rbd_dev);
516830d60ba2SAlex Elder 	if (ret)
516930d60ba2SAlex Elder 		goto err_out_probe;
517083a06263SAlex Elder 
517130d60ba2SAlex Elder 	dout("discovered format %u image, header name is %s\n",
517230d60ba2SAlex Elder 		rbd_dev->image_format, rbd_dev->header_name);
517330d60ba2SAlex Elder 
517430d60ba2SAlex Elder 	return 0;
51756fd48b3bSAlex Elder err_out_probe:
51766fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
5177b644de2bSAlex Elder err_out_watch:
5178fca27065SIlya Dryomov 	if (mapping)
5179fca27065SIlya Dryomov 		rbd_dev_header_unwatch_sync(rbd_dev);
5180332bb12dSAlex Elder out_header_name:
5181332bb12dSAlex Elder 	kfree(rbd_dev->header_name);
5182332bb12dSAlex Elder 	rbd_dev->header_name = NULL;
5183332bb12dSAlex Elder err_out_format:
5184332bb12dSAlex Elder 	rbd_dev->image_format = 0;
51855655c4d9SAlex Elder 	kfree(rbd_dev->spec->image_id);
51865655c4d9SAlex Elder 	rbd_dev->spec->image_id = NULL;
51875655c4d9SAlex Elder 
51885655c4d9SAlex Elder 	dout("probe failed, returning %d\n", ret);
51895655c4d9SAlex Elder 
51905655c4d9SAlex Elder 	return ret;
519183a06263SAlex Elder }
519283a06263SAlex Elder 
51939b60e70bSIlya Dryomov static ssize_t do_rbd_add(struct bus_type *bus,
519459c2be1eSYehuda Sadeh 			  const char *buf,
519559c2be1eSYehuda Sadeh 			  size_t count)
5196602adf40SYehuda Sadeh {
5197cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
5198dc79b113SAlex Elder 	struct ceph_options *ceph_opts = NULL;
51994e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
5200859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
52019d3997fdSAlex Elder 	struct rbd_client *rbdc;
520251344a38SAlex Elder 	bool read_only;
520327cc2594SAlex Elder 	int rc = -ENOMEM;
5204602adf40SYehuda Sadeh 
5205602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
5206602adf40SYehuda Sadeh 		return -ENODEV;
5207602adf40SYehuda Sadeh 
5208a725f65eSAlex Elder 	/* parse add command */
5209859c31dfSAlex Elder 	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
5210dc79b113SAlex Elder 	if (rc < 0)
5211bd4ba655SAlex Elder 		goto err_out_module;
521251344a38SAlex Elder 	read_only = rbd_opts->read_only;
521351344a38SAlex Elder 	kfree(rbd_opts);
521451344a38SAlex Elder 	rbd_opts = NULL;	/* done with this */
5215a725f65eSAlex Elder 
52169d3997fdSAlex Elder 	rbdc = rbd_get_client(ceph_opts);
52179d3997fdSAlex Elder 	if (IS_ERR(rbdc)) {
52189d3997fdSAlex Elder 		rc = PTR_ERR(rbdc);
52190ddebc0cSAlex Elder 		goto err_out_args;
52209d3997fdSAlex Elder 	}
5221602adf40SYehuda Sadeh 
5222602adf40SYehuda Sadeh 	/* pick the pool */
522330ba1f02SIlya Dryomov 	rc = rbd_add_get_pool_id(rbdc, spec->pool_name);
5224602adf40SYehuda Sadeh 	if (rc < 0)
5225602adf40SYehuda Sadeh 		goto err_out_client;
5226859c31dfSAlex Elder 	spec->pool_id = (u64)rc;
5227859c31dfSAlex Elder 
52280903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
52290903e875SAlex Elder 
5230c0cd10dbSAlex Elder 	if (spec->pool_id > (u64)U32_MAX) {
5231c0cd10dbSAlex Elder 		rbd_warn(NULL, "pool id too large (%llu > %u)\n",
5232c0cd10dbSAlex Elder 				(unsigned long long)spec->pool_id, U32_MAX);
52330903e875SAlex Elder 		rc = -EIO;
52340903e875SAlex Elder 		goto err_out_client;
52350903e875SAlex Elder 	}
52360903e875SAlex Elder 
5237c53d5893SAlex Elder 	rbd_dev = rbd_dev_create(rbdc, spec);
5238bd4ba655SAlex Elder 	if (!rbd_dev)
5239bd4ba655SAlex Elder 		goto err_out_client;
5240c53d5893SAlex Elder 	rbdc = NULL;		/* rbd_dev now owns this */
5241c53d5893SAlex Elder 	spec = NULL;		/* rbd_dev now owns this */
5242602adf40SYehuda Sadeh 
52431f3ef788SAlex Elder 	rc = rbd_dev_image_probe(rbd_dev, true);
5244a30b71b9SAlex Elder 	if (rc < 0)
5245c53d5893SAlex Elder 		goto err_out_rbd_dev;
524605fd6f6fSAlex Elder 
52477ce4eef7SAlex Elder 	/* If we are mapping a snapshot it must be marked read-only */
52487ce4eef7SAlex Elder 
52497ce4eef7SAlex Elder 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
52507ce4eef7SAlex Elder 		read_only = true;
52517ce4eef7SAlex Elder 	rbd_dev->mapping.read_only = read_only;
52527ce4eef7SAlex Elder 
5253b536f69aSAlex Elder 	rc = rbd_dev_device_setup(rbd_dev);
52543abef3b3SAlex Elder 	if (rc) {
5255e37180c0SIlya Dryomov 		/*
5256e37180c0SIlya Dryomov 		 * rbd_dev_header_unwatch_sync() can't be moved into
5257e37180c0SIlya Dryomov 		 * rbd_dev_image_release() without refactoring, see
5258e37180c0SIlya Dryomov 		 * commit 1f3ef78861ac.
5259e37180c0SIlya Dryomov 		 */
5260e37180c0SIlya Dryomov 		rbd_dev_header_unwatch_sync(rbd_dev);
52613abef3b3SAlex Elder 		rbd_dev_image_release(rbd_dev);
52623abef3b3SAlex Elder 		goto err_out_module;
52633abef3b3SAlex Elder 	}
52643abef3b3SAlex Elder 
5265602adf40SYehuda Sadeh 	return count;
5266b536f69aSAlex Elder 
5267c53d5893SAlex Elder err_out_rbd_dev:
5268c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
5269bd4ba655SAlex Elder err_out_client:
52709d3997fdSAlex Elder 	rbd_put_client(rbdc);
52710ddebc0cSAlex Elder err_out_args:
5272859c31dfSAlex Elder 	rbd_spec_put(spec);
5273bd4ba655SAlex Elder err_out_module:
5274bd4ba655SAlex Elder 	module_put(THIS_MODULE);
527527cc2594SAlex Elder 
5276602adf40SYehuda Sadeh 	dout("Error adding device %s\n", buf);
527727cc2594SAlex Elder 
527827cc2594SAlex Elder 	return (ssize_t)rc;
5279602adf40SYehuda Sadeh }
5280602adf40SYehuda Sadeh 
52819b60e70bSIlya Dryomov static ssize_t rbd_add(struct bus_type *bus,
52829b60e70bSIlya Dryomov 		       const char *buf,
52839b60e70bSIlya Dryomov 		       size_t count)
52849b60e70bSIlya Dryomov {
52859b60e70bSIlya Dryomov 	if (single_major)
52869b60e70bSIlya Dryomov 		return -EINVAL;
52879b60e70bSIlya Dryomov 
52889b60e70bSIlya Dryomov 	return do_rbd_add(bus, buf, count);
52899b60e70bSIlya Dryomov }
52909b60e70bSIlya Dryomov 
52919b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus,
52929b60e70bSIlya Dryomov 				    const char *buf,
52939b60e70bSIlya Dryomov 				    size_t count)
52949b60e70bSIlya Dryomov {
52959b60e70bSIlya Dryomov 	return do_rbd_add(bus, buf, count);
52969b60e70bSIlya Dryomov }
52979b60e70bSIlya Dryomov 
5298200a6a8bSAlex Elder static void rbd_dev_device_release(struct device *dev)
5299602adf40SYehuda Sadeh {
5300593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5301602adf40SYehuda Sadeh 
5302602adf40SYehuda Sadeh 	rbd_free_disk(rbd_dev);
5303200a6a8bSAlex Elder 	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
53046d80b130SAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
53059b60e70bSIlya Dryomov 	if (!single_major)
5306602adf40SYehuda Sadeh 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
5307e2839308SAlex Elder 	rbd_dev_id_put(rbd_dev);
5308d1cf5788SAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
5309602adf40SYehuda Sadeh }
5310602adf40SYehuda Sadeh 
531105a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
531205a46afdSAlex Elder {
5313ad945fc1SAlex Elder 	while (rbd_dev->parent) {
531405a46afdSAlex Elder 		struct rbd_device *first = rbd_dev;
531505a46afdSAlex Elder 		struct rbd_device *second = first->parent;
531605a46afdSAlex Elder 		struct rbd_device *third;
531705a46afdSAlex Elder 
531805a46afdSAlex Elder 		/*
531905a46afdSAlex Elder 		 * Follow to the parent with no grandparent and
532005a46afdSAlex Elder 		 * remove it.
532105a46afdSAlex Elder 		 */
532205a46afdSAlex Elder 		while (second && (third = second->parent)) {
532305a46afdSAlex Elder 			first = second;
532405a46afdSAlex Elder 			second = third;
532505a46afdSAlex Elder 		}
5326ad945fc1SAlex Elder 		rbd_assert(second);
53278ad42cd0SAlex Elder 		rbd_dev_image_release(second);
5328ad945fc1SAlex Elder 		first->parent = NULL;
5329ad945fc1SAlex Elder 		first->parent_overlap = 0;
5330ad945fc1SAlex Elder 
5331ad945fc1SAlex Elder 		rbd_assert(first->parent_spec);
533205a46afdSAlex Elder 		rbd_spec_put(first->parent_spec);
533305a46afdSAlex Elder 		first->parent_spec = NULL;
533405a46afdSAlex Elder 	}
533505a46afdSAlex Elder }
533605a46afdSAlex Elder 
53379b60e70bSIlya Dryomov static ssize_t do_rbd_remove(struct bus_type *bus,
5338602adf40SYehuda Sadeh 			     const char *buf,
5339602adf40SYehuda Sadeh 			     size_t count)
5340602adf40SYehuda Sadeh {
5341602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
5342751cc0e3SAlex Elder 	struct list_head *tmp;
5343751cc0e3SAlex Elder 	int dev_id;
5344602adf40SYehuda Sadeh 	unsigned long ul;
534582a442d2SAlex Elder 	bool already = false;
53460d8189e1SAlex Elder 	int ret;
5347602adf40SYehuda Sadeh 
5348bb8e0e84SJingoo Han 	ret = kstrtoul(buf, 10, &ul);
53490d8189e1SAlex Elder 	if (ret)
53500d8189e1SAlex Elder 		return ret;
5351602adf40SYehuda Sadeh 
5352602adf40SYehuda Sadeh 	/* convert to int; abort if we lost anything in the conversion */
5353751cc0e3SAlex Elder 	dev_id = (int)ul;
5354751cc0e3SAlex Elder 	if (dev_id != ul)
5355602adf40SYehuda Sadeh 		return -EINVAL;
5356602adf40SYehuda Sadeh 
5357602adf40SYehuda Sadeh 	ret = -ENOENT;
5358751cc0e3SAlex Elder 	spin_lock(&rbd_dev_list_lock);
5359751cc0e3SAlex Elder 	list_for_each(tmp, &rbd_dev_list) {
5360751cc0e3SAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
5361751cc0e3SAlex Elder 		if (rbd_dev->dev_id == dev_id) {
5362751cc0e3SAlex Elder 			ret = 0;
5363751cc0e3SAlex Elder 			break;
5364602adf40SYehuda Sadeh 		}
5365751cc0e3SAlex Elder 	}
5366751cc0e3SAlex Elder 	if (!ret) {
5367a14ea269SAlex Elder 		spin_lock_irq(&rbd_dev->lock);
5368b82d167bSAlex Elder 		if (rbd_dev->open_count)
536942382b70SAlex Elder 			ret = -EBUSY;
5370b82d167bSAlex Elder 		else
537182a442d2SAlex Elder 			already = test_and_set_bit(RBD_DEV_FLAG_REMOVING,
537282a442d2SAlex Elder 							&rbd_dev->flags);
5373a14ea269SAlex Elder 		spin_unlock_irq(&rbd_dev->lock);
5374751cc0e3SAlex Elder 	}
5375751cc0e3SAlex Elder 	spin_unlock(&rbd_dev_list_lock);
537682a442d2SAlex Elder 	if (ret < 0 || already)
53771ba0f1e7SAlex Elder 		return ret;
5378751cc0e3SAlex Elder 
5379fca27065SIlya Dryomov 	rbd_dev_header_unwatch_sync(rbd_dev);
53809abc5990SJosh Durgin 	/*
53819abc5990SJosh Durgin 	 * flush remaining watch callbacks - these must be complete
53829abc5990SJosh Durgin 	 * before the osd_client is shutdown
53839abc5990SJosh Durgin 	 */
53849abc5990SJosh Durgin 	dout("%s: flushing notifies", __func__);
53859abc5990SJosh Durgin 	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
5386fca27065SIlya Dryomov 
53879875201eSJosh Durgin 	/*
53889875201eSJosh Durgin 	 * Don't free anything from rbd_dev->disk until after all
53899875201eSJosh Durgin 	 * notifies are completely processed. Otherwise
53909875201eSJosh Durgin 	 * rbd_bus_del_dev() will race with rbd_watch_cb(), resulting
53919875201eSJosh Durgin 	 * in a potential use after free of rbd_dev->disk or rbd_dev.
53929875201eSJosh Durgin 	 */
53939875201eSJosh Durgin 	rbd_bus_del_dev(rbd_dev);
53948ad42cd0SAlex Elder 	rbd_dev_image_release(rbd_dev);
539579ab7558SAlex Elder 	module_put(THIS_MODULE);
5396aafb230eSAlex Elder 
53971ba0f1e7SAlex Elder 	return count;
5398602adf40SYehuda Sadeh }
5399602adf40SYehuda Sadeh 
54009b60e70bSIlya Dryomov static ssize_t rbd_remove(struct bus_type *bus,
54019b60e70bSIlya Dryomov 			  const char *buf,
54029b60e70bSIlya Dryomov 			  size_t count)
54039b60e70bSIlya Dryomov {
54049b60e70bSIlya Dryomov 	if (single_major)
54059b60e70bSIlya Dryomov 		return -EINVAL;
54069b60e70bSIlya Dryomov 
54079b60e70bSIlya Dryomov 	return do_rbd_remove(bus, buf, count);
54089b60e70bSIlya Dryomov }
54099b60e70bSIlya Dryomov 
54109b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus,
54119b60e70bSIlya Dryomov 				       const char *buf,
54129b60e70bSIlya Dryomov 				       size_t count)
54139b60e70bSIlya Dryomov {
54149b60e70bSIlya Dryomov 	return do_rbd_remove(bus, buf, count);
54159b60e70bSIlya Dryomov }
54169b60e70bSIlya Dryomov 
5417602adf40SYehuda Sadeh /*
5418602adf40SYehuda Sadeh  * create control files in sysfs
5419dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
5420602adf40SYehuda Sadeh  */
5421602adf40SYehuda Sadeh static int rbd_sysfs_init(void)
5422602adf40SYehuda Sadeh {
5423dfc5606dSYehuda Sadeh 	int ret;
5424602adf40SYehuda Sadeh 
5425fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
5426dfc5606dSYehuda Sadeh 	if (ret < 0)
5427dfc5606dSYehuda Sadeh 		return ret;
5428602adf40SYehuda Sadeh 
5429fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
5430fed4c143SAlex Elder 	if (ret < 0)
5431fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
5432602adf40SYehuda Sadeh 
5433602adf40SYehuda Sadeh 	return ret;
5434602adf40SYehuda Sadeh }
5435602adf40SYehuda Sadeh 
5436602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void)
5437602adf40SYehuda Sadeh {
5438dfc5606dSYehuda Sadeh 	bus_unregister(&rbd_bus_type);
5439fed4c143SAlex Elder 	device_unregister(&rbd_root_dev);
5440602adf40SYehuda Sadeh }
5441602adf40SYehuda Sadeh 
54421c2a9dfeSAlex Elder static int rbd_slab_init(void)
54431c2a9dfeSAlex Elder {
54441c2a9dfeSAlex Elder 	rbd_assert(!rbd_img_request_cache);
54451c2a9dfeSAlex Elder 	rbd_img_request_cache = kmem_cache_create("rbd_img_request",
54461c2a9dfeSAlex Elder 					sizeof (struct rbd_img_request),
54471c2a9dfeSAlex Elder 					__alignof__(struct rbd_img_request),
54481c2a9dfeSAlex Elder 					0, NULL);
5449868311b1SAlex Elder 	if (!rbd_img_request_cache)
5450868311b1SAlex Elder 		return -ENOMEM;
5451868311b1SAlex Elder 
5452868311b1SAlex Elder 	rbd_assert(!rbd_obj_request_cache);
5453868311b1SAlex Elder 	rbd_obj_request_cache = kmem_cache_create("rbd_obj_request",
5454868311b1SAlex Elder 					sizeof (struct rbd_obj_request),
5455868311b1SAlex Elder 					__alignof__(struct rbd_obj_request),
5456868311b1SAlex Elder 					0, NULL);
545778c2a44aSAlex Elder 	if (!rbd_obj_request_cache)
545878c2a44aSAlex Elder 		goto out_err;
545978c2a44aSAlex Elder 
546078c2a44aSAlex Elder 	rbd_assert(!rbd_segment_name_cache);
546178c2a44aSAlex Elder 	rbd_segment_name_cache = kmem_cache_create("rbd_segment_name",
54622d0ebc5dSIlya Dryomov 					CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL);
546378c2a44aSAlex Elder 	if (rbd_segment_name_cache)
54641c2a9dfeSAlex Elder 		return 0;
546578c2a44aSAlex Elder out_err:
546678c2a44aSAlex Elder 	if (rbd_obj_request_cache) {
546778c2a44aSAlex Elder 		kmem_cache_destroy(rbd_obj_request_cache);
546878c2a44aSAlex Elder 		rbd_obj_request_cache = NULL;
546978c2a44aSAlex Elder 	}
54701c2a9dfeSAlex Elder 
5471868311b1SAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
5472868311b1SAlex Elder 	rbd_img_request_cache = NULL;
5473868311b1SAlex Elder 
54741c2a9dfeSAlex Elder 	return -ENOMEM;
54751c2a9dfeSAlex Elder }
54761c2a9dfeSAlex Elder 
54771c2a9dfeSAlex Elder static void rbd_slab_exit(void)
54781c2a9dfeSAlex Elder {
547978c2a44aSAlex Elder 	rbd_assert(rbd_segment_name_cache);
548078c2a44aSAlex Elder 	kmem_cache_destroy(rbd_segment_name_cache);
548178c2a44aSAlex Elder 	rbd_segment_name_cache = NULL;
548278c2a44aSAlex Elder 
5483868311b1SAlex Elder 	rbd_assert(rbd_obj_request_cache);
5484868311b1SAlex Elder 	kmem_cache_destroy(rbd_obj_request_cache);
5485868311b1SAlex Elder 	rbd_obj_request_cache = NULL;
5486868311b1SAlex Elder 
54871c2a9dfeSAlex Elder 	rbd_assert(rbd_img_request_cache);
54881c2a9dfeSAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
54891c2a9dfeSAlex Elder 	rbd_img_request_cache = NULL;
54901c2a9dfeSAlex Elder }
54911c2a9dfeSAlex Elder 
5492cc344fa1SAlex Elder static int __init rbd_init(void)
5493602adf40SYehuda Sadeh {
5494602adf40SYehuda Sadeh 	int rc;
5495602adf40SYehuda Sadeh 
54961e32d34cSAlex Elder 	if (!libceph_compatible(NULL)) {
54971e32d34cSAlex Elder 		rbd_warn(NULL, "libceph incompatibility (quitting)");
54981e32d34cSAlex Elder 		return -EINVAL;
54991e32d34cSAlex Elder 	}
5500e1b4d96dSIlya Dryomov 
55011c2a9dfeSAlex Elder 	rc = rbd_slab_init();
5502602adf40SYehuda Sadeh 	if (rc)
5503602adf40SYehuda Sadeh 		return rc;
5504e1b4d96dSIlya Dryomov 
55059b60e70bSIlya Dryomov 	if (single_major) {
55069b60e70bSIlya Dryomov 		rbd_major = register_blkdev(0, RBD_DRV_NAME);
55079b60e70bSIlya Dryomov 		if (rbd_major < 0) {
55089b60e70bSIlya Dryomov 			rc = rbd_major;
55099b60e70bSIlya Dryomov 			goto err_out_slab;
55109b60e70bSIlya Dryomov 		}
55119b60e70bSIlya Dryomov 	}
55129b60e70bSIlya Dryomov 
55131c2a9dfeSAlex Elder 	rc = rbd_sysfs_init();
55141c2a9dfeSAlex Elder 	if (rc)
55159b60e70bSIlya Dryomov 		goto err_out_blkdev;
55161c2a9dfeSAlex Elder 
55179b60e70bSIlya Dryomov 	if (single_major)
55189b60e70bSIlya Dryomov 		pr_info("loaded (major %d)\n", rbd_major);
55199b60e70bSIlya Dryomov 	else
5520e1b4d96dSIlya Dryomov 		pr_info("loaded\n");
55219b60e70bSIlya Dryomov 
5522e1b4d96dSIlya Dryomov 	return 0;
5523e1b4d96dSIlya Dryomov 
55249b60e70bSIlya Dryomov err_out_blkdev:
55259b60e70bSIlya Dryomov 	if (single_major)
55269b60e70bSIlya Dryomov 		unregister_blkdev(rbd_major, RBD_DRV_NAME);
5527e1b4d96dSIlya Dryomov err_out_slab:
5528e1b4d96dSIlya Dryomov 	rbd_slab_exit();
55291c2a9dfeSAlex Elder 	return rc;
5530602adf40SYehuda Sadeh }
5531602adf40SYehuda Sadeh 
5532cc344fa1SAlex Elder static void __exit rbd_exit(void)
5533602adf40SYehuda Sadeh {
5534ffe312cfSIlya Dryomov 	ida_destroy(&rbd_dev_id_ida);
5535602adf40SYehuda Sadeh 	rbd_sysfs_cleanup();
55369b60e70bSIlya Dryomov 	if (single_major)
55379b60e70bSIlya Dryomov 		unregister_blkdev(rbd_major, RBD_DRV_NAME);
55381c2a9dfeSAlex Elder 	rbd_slab_exit();
5539602adf40SYehuda Sadeh }
5540602adf40SYehuda Sadeh 
5541602adf40SYehuda Sadeh module_init(rbd_init);
5542602adf40SYehuda Sadeh module_exit(rbd_exit);
5543602adf40SYehuda Sadeh 
5544d552c619SAlex Elder MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
5545602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
5546602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
5547602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */
5548602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
5549602adf40SYehuda Sadeh 
555090da258bSIlya Dryomov MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
5551602adf40SYehuda Sadeh MODULE_LICENSE("GPL");
5552