xref: /openbmc/linux/drivers/block/rbd.c (revision f35a4dee)
1e2a58ee5SAlex Elder 
2602adf40SYehuda Sadeh /*
3602adf40SYehuda Sadeh    rbd.c -- Export ceph rados objects as a Linux block device
4602adf40SYehuda Sadeh 
5602adf40SYehuda Sadeh 
6602adf40SYehuda Sadeh    based on drivers/block/osdblk.c:
7602adf40SYehuda Sadeh 
8602adf40SYehuda Sadeh    Copyright 2009 Red Hat, Inc.
9602adf40SYehuda Sadeh 
10602adf40SYehuda Sadeh    This program is free software; you can redistribute it and/or modify
11602adf40SYehuda Sadeh    it under the terms of the GNU General Public License as published by
12602adf40SYehuda Sadeh    the Free Software Foundation.
13602adf40SYehuda Sadeh 
14602adf40SYehuda Sadeh    This program is distributed in the hope that it will be useful,
15602adf40SYehuda Sadeh    but WITHOUT ANY WARRANTY; without even the implied warranty of
16602adf40SYehuda Sadeh    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17602adf40SYehuda Sadeh    GNU General Public License for more details.
18602adf40SYehuda Sadeh 
19602adf40SYehuda Sadeh    You should have received a copy of the GNU General Public License
20602adf40SYehuda Sadeh    along with this program; see the file COPYING.  If not, write to
21602adf40SYehuda Sadeh    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
22602adf40SYehuda Sadeh 
23602adf40SYehuda Sadeh 
24602adf40SYehuda Sadeh 
25dfc5606dSYehuda Sadeh    For usage instructions, please refer to:
26602adf40SYehuda Sadeh 
27dfc5606dSYehuda Sadeh                  Documentation/ABI/testing/sysfs-bus-rbd
28602adf40SYehuda Sadeh 
29602adf40SYehuda Sadeh  */
30602adf40SYehuda Sadeh 
31602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
32602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
34602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3559c2be1eSYehuda Sadeh #include <linux/parser.h>
3630d1cff8SAlex Elder #include <linux/bsearch.h>
37602adf40SYehuda Sadeh 
38602adf40SYehuda Sadeh #include <linux/kernel.h>
39602adf40SYehuda Sadeh #include <linux/device.h>
40602adf40SYehuda Sadeh #include <linux/module.h>
41602adf40SYehuda Sadeh #include <linux/fs.h>
42602adf40SYehuda Sadeh #include <linux/blkdev.h>
431c2a9dfeSAlex Elder #include <linux/slab.h>
44602adf40SYehuda Sadeh 
45602adf40SYehuda Sadeh #include "rbd_types.h"
46602adf40SYehuda Sadeh 
#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * Block I/O is measured in sectors.  Linux uses the notion in several
 * places (blk, bio, genhd), and everywhere the default sector is 512
 * bytes.  Named constants read a little better than bare numbers.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

/* Snapshot name limit: NAME_MAX less the length of the prefix string */
#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

#define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */

/* Sized so that an image name sent by an OSD fits in a single page */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

/* Feature bits */

#define RBD_FEATURE_LAYERING	(1<<0)
#define RBD_FEATURE_STRIPINGV2	(1<<1)
#define RBD_FEATURES_ALL \
	    (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)

/* Features this (client software) implementation supports. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)

/*
 * RBD device names have the form "rbd#": RBD_DRV_NAME followed by a
 * unique integer identifier.  MAX_INT_FORMAT_WIDTH is used to make
 * sure DEV_NAME_LEN is large enough for every possible device name.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)

99602adf40SYehuda Sadeh /*
100602adf40SYehuda Sadeh  * block device image metadata (in-memory version)
101602adf40SYehuda Sadeh  */
102602adf40SYehuda Sadeh struct rbd_image_header {
103f35a4deeSAlex Elder 	/* These six fields never change for a given rbd image */
104849b4260SAlex Elder 	char *object_prefix;
105602adf40SYehuda Sadeh 	__u8 obj_order;
106602adf40SYehuda Sadeh 	__u8 crypt_type;
107602adf40SYehuda Sadeh 	__u8 comp_type;
108f35a4deeSAlex Elder 	u64 stripe_unit;
109f35a4deeSAlex Elder 	u64 stripe_count;
110f35a4deeSAlex Elder 	u64 features;		/* Might be changeable someday? */
111602adf40SYehuda Sadeh 
112f84344f3SAlex Elder 	/* The remaining fields need to be updated occasionally */
113f84344f3SAlex Elder 	u64 image_size;
114f84344f3SAlex Elder 	struct ceph_snap_context *snapc;
115f35a4deeSAlex Elder 	char *snap_names;	/* format 1 only */
116f35a4deeSAlex Elder 	u64 *snap_sizes;	/* format 1 only */
11759c2be1eSYehuda Sadeh };
11859c2be1eSYehuda Sadeh 
1190d7dbfceSAlex Elder /*
1200d7dbfceSAlex Elder  * An rbd image specification.
1210d7dbfceSAlex Elder  *
1220d7dbfceSAlex Elder  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
123c66c6e0cSAlex Elder  * identify an image.  Each rbd_dev structure includes a pointer to
124c66c6e0cSAlex Elder  * an rbd_spec structure that encapsulates this identity.
125c66c6e0cSAlex Elder  *
126c66c6e0cSAlex Elder  * Each of the id's in an rbd_spec has an associated name.  For a
127c66c6e0cSAlex Elder  * user-mapped image, the names are supplied and the id's associated
128c66c6e0cSAlex Elder  * with them are looked up.  For a layered image, a parent image is
129c66c6e0cSAlex Elder  * defined by the tuple, and the names are looked up.
130c66c6e0cSAlex Elder  *
131c66c6e0cSAlex Elder  * An rbd_dev structure contains a parent_spec pointer which is
132c66c6e0cSAlex Elder  * non-null if the image it represents is a child in a layered
133c66c6e0cSAlex Elder  * image.  This pointer will refer to the rbd_spec structure used
134c66c6e0cSAlex Elder  * by the parent rbd_dev for its own identity (i.e., the structure
135c66c6e0cSAlex Elder  * is shared between the parent and child).
136c66c6e0cSAlex Elder  *
137c66c6e0cSAlex Elder  * Since these structures are populated once, during the discovery
138c66c6e0cSAlex Elder  * phase of image construction, they are effectively immutable so
139c66c6e0cSAlex Elder  * we make no effort to synchronize access to them.
140c66c6e0cSAlex Elder  *
141c66c6e0cSAlex Elder  * Note that code herein does not assume the image name is known (it
142c66c6e0cSAlex Elder  * could be a null pointer).
1430d7dbfceSAlex Elder  */
1440d7dbfceSAlex Elder struct rbd_spec {
1450d7dbfceSAlex Elder 	u64		pool_id;
146ecb4dc22SAlex Elder 	const char	*pool_name;
1470d7dbfceSAlex Elder 
148ecb4dc22SAlex Elder 	const char	*image_id;
149ecb4dc22SAlex Elder 	const char	*image_name;
1500d7dbfceSAlex Elder 
1510d7dbfceSAlex Elder 	u64		snap_id;
152ecb4dc22SAlex Elder 	const char	*snap_name;
1530d7dbfceSAlex Elder 
1540d7dbfceSAlex Elder 	struct kref	kref;
1550d7dbfceSAlex Elder };
1560d7dbfceSAlex Elder 
157602adf40SYehuda Sadeh /*
158f0f8cef5SAlex Elder  * an instance of the client.  multiple devices may share an rbd client.
159602adf40SYehuda Sadeh  */
160602adf40SYehuda Sadeh struct rbd_client {
161602adf40SYehuda Sadeh 	struct ceph_client	*client;
162602adf40SYehuda Sadeh 	struct kref		kref;
163602adf40SYehuda Sadeh 	struct list_head	node;
164602adf40SYehuda Sadeh };
165602adf40SYehuda Sadeh 
/* Callback invoked with an image request */
struct rbd_img_request;
typedef void (*rbd_img_callback_t)(struct rbd_img_request *);

#define	BAD_WHICH	U32_MAX		/* Good which or bad which, which? */

/* Callback invoked with an object request */
struct rbd_obj_request;
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);

/* Kind of data attached to an object request */
enum obj_request_type {
	OBJ_REQUEST_NODATA,
	OBJ_REQUEST_BIO,
	OBJ_REQUEST_PAGES
};
177bf0d5f50SAlex Elder 
/* Flag values for the flags field of struct rbd_obj_request */
enum obj_req_flags {
	/* completion flag: not done = 0, done = 1 */
	OBJ_REQ_DONE,
	/* object usage: standalone = 0, image = 1 */
	OBJ_REQ_IMG_DATA,
	/* EXISTS flag valid: no = 0, yes = 1 */
	OBJ_REQ_KNOWN,
	/* target exists: no = 0, yes = 1 */
	OBJ_REQ_EXISTS,
};
184926f9b3fSAlex Elder 
185bf0d5f50SAlex Elder struct rbd_obj_request {
186bf0d5f50SAlex Elder 	const char		*object_name;
187bf0d5f50SAlex Elder 	u64			offset;		/* object start byte */
188bf0d5f50SAlex Elder 	u64			length;		/* bytes from offset */
189926f9b3fSAlex Elder 	unsigned long		flags;
190bf0d5f50SAlex Elder 
191c5b5ef6cSAlex Elder 	/*
192c5b5ef6cSAlex Elder 	 * An object request associated with an image will have its
193c5b5ef6cSAlex Elder 	 * img_data flag set; a standalone object request will not.
194c5b5ef6cSAlex Elder 	 *
195c5b5ef6cSAlex Elder 	 * A standalone object request will have which == BAD_WHICH
196c5b5ef6cSAlex Elder 	 * and a null obj_request pointer.
197c5b5ef6cSAlex Elder 	 *
198c5b5ef6cSAlex Elder 	 * An object request initiated in support of a layered image
199c5b5ef6cSAlex Elder 	 * object (to check for its existence before a write) will
200c5b5ef6cSAlex Elder 	 * have which == BAD_WHICH and a non-null obj_request pointer.
201c5b5ef6cSAlex Elder 	 *
202c5b5ef6cSAlex Elder 	 * Finally, an object request for rbd image data will have
203c5b5ef6cSAlex Elder 	 * which != BAD_WHICH, and will have a non-null img_request
204c5b5ef6cSAlex Elder 	 * pointer.  The value of which will be in the range
205c5b5ef6cSAlex Elder 	 * 0..(img_request->obj_request_count-1).
206c5b5ef6cSAlex Elder 	 */
207c5b5ef6cSAlex Elder 	union {
208c5b5ef6cSAlex Elder 		struct rbd_obj_request	*obj_request;	/* STAT op */
209c5b5ef6cSAlex Elder 		struct {
210bf0d5f50SAlex Elder 			struct rbd_img_request	*img_request;
211c5b5ef6cSAlex Elder 			u64			img_offset;
212c5b5ef6cSAlex Elder 			/* links for img_request->obj_requests list */
213c5b5ef6cSAlex Elder 			struct list_head	links;
214c5b5ef6cSAlex Elder 		};
215c5b5ef6cSAlex Elder 	};
216bf0d5f50SAlex Elder 	u32			which;		/* posn image request list */
217bf0d5f50SAlex Elder 
218bf0d5f50SAlex Elder 	enum obj_request_type	type;
219788e2df3SAlex Elder 	union {
220bf0d5f50SAlex Elder 		struct bio	*bio_list;
221788e2df3SAlex Elder 		struct {
222788e2df3SAlex Elder 			struct page	**pages;
223788e2df3SAlex Elder 			u32		page_count;
224788e2df3SAlex Elder 		};
225788e2df3SAlex Elder 	};
2260eefd470SAlex Elder 	struct page		**copyup_pages;
227bf0d5f50SAlex Elder 
228bf0d5f50SAlex Elder 	struct ceph_osd_request	*osd_req;
229bf0d5f50SAlex Elder 
230bf0d5f50SAlex Elder 	u64			xferred;	/* bytes transferred */
2311b83bef2SSage Weil 	int			result;
232bf0d5f50SAlex Elder 
233bf0d5f50SAlex Elder 	rbd_obj_callback_t	callback;
234788e2df3SAlex Elder 	struct completion	completion;
235bf0d5f50SAlex Elder 
236bf0d5f50SAlex Elder 	struct kref		kref;
237bf0d5f50SAlex Elder };
238bf0d5f50SAlex Elder 
/* Flag values for the flags field of struct rbd_img_request */
enum img_req_flags {
	/* I/O direction: read = 0, write = 1 */
	IMG_REQ_WRITE,
	/* initiator: block = 0, child image = 1 */
	IMG_REQ_CHILD,
	/* ENOENT handling: normal = 0, layered = 1 */
	IMG_REQ_LAYERED,
};
2440c425248SAlex Elder 
245bf0d5f50SAlex Elder struct rbd_img_request {
246bf0d5f50SAlex Elder 	struct rbd_device	*rbd_dev;
247bf0d5f50SAlex Elder 	u64			offset;	/* starting image byte offset */
248bf0d5f50SAlex Elder 	u64			length;	/* byte count from offset */
2490c425248SAlex Elder 	unsigned long		flags;
250bf0d5f50SAlex Elder 	union {
251bf0d5f50SAlex Elder 		u64			snap_id;	/* for reads */
2529849e986SAlex Elder 		struct ceph_snap_context *snapc;	/* for writes */
2539849e986SAlex Elder 	};
2549849e986SAlex Elder 	union {
2559849e986SAlex Elder 		struct request		*rq;		/* block request */
2569849e986SAlex Elder 		struct rbd_obj_request	*obj_request;	/* obj req initiator */
257bf0d5f50SAlex Elder 	};
2583d7efd18SAlex Elder 	struct page		**copyup_pages;
259bf0d5f50SAlex Elder 	spinlock_t		completion_lock;/* protects next_completion */
260bf0d5f50SAlex Elder 	u32			next_completion;
261bf0d5f50SAlex Elder 	rbd_img_callback_t	callback;
26255f27e09SAlex Elder 	u64			xferred;/* aggregate bytes transferred */
263a5a337d4SAlex Elder 	int			result;	/* first nonzero obj_request result */
264bf0d5f50SAlex Elder 
265bf0d5f50SAlex Elder 	u32			obj_request_count;
266bf0d5f50SAlex Elder 	struct list_head	obj_requests;	/* rbd_obj_request structs */
267bf0d5f50SAlex Elder 
268bf0d5f50SAlex Elder 	struct kref		kref;
269bf0d5f50SAlex Elder };
270bf0d5f50SAlex Elder 
/*
 * Iterators over the object requests linked on an image request's
 * obj_requests list.  Note that the _safe variant walks the list in
 * reverse (it is built on list_for_each_entry_safe_reverse()).
 */
#define for_each_obj_request(img_req, obj_req) \
	list_for_each_entry(obj_req, &(img_req)->obj_requests, links)
#define for_each_obj_request_from(img_req, obj_req) \
	list_for_each_entry_from(obj_req, &(img_req)->obj_requests, links)
#define for_each_obj_request_safe(img_req, obj_req, next) \
	list_for_each_entry_safe_reverse(obj_req, next, \
					 &(img_req)->obj_requests, links)
277bf0d5f50SAlex Elder 
278f84344f3SAlex Elder struct rbd_mapping {
27999c1f08fSAlex Elder 	u64                     size;
28034b13184SAlex Elder 	u64                     features;
281f84344f3SAlex Elder 	bool			read_only;
282f84344f3SAlex Elder };
283f84344f3SAlex Elder 
284602adf40SYehuda Sadeh /*
285602adf40SYehuda Sadeh  * a single device
286602adf40SYehuda Sadeh  */
287602adf40SYehuda Sadeh struct rbd_device {
288de71a297SAlex Elder 	int			dev_id;		/* blkdev unique id */
289602adf40SYehuda Sadeh 
290602adf40SYehuda Sadeh 	int			major;		/* blkdev assigned major */
291602adf40SYehuda Sadeh 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
292602adf40SYehuda Sadeh 
293a30b71b9SAlex Elder 	u32			image_format;	/* Either 1 or 2 */
294602adf40SYehuda Sadeh 	struct rbd_client	*rbd_client;
295602adf40SYehuda Sadeh 
296602adf40SYehuda Sadeh 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
297602adf40SYehuda Sadeh 
298b82d167bSAlex Elder 	spinlock_t		lock;		/* queue, flags, open_count */
299602adf40SYehuda Sadeh 
300602adf40SYehuda Sadeh 	struct rbd_image_header	header;
301b82d167bSAlex Elder 	unsigned long		flags;		/* possibly lock protected */
3020d7dbfceSAlex Elder 	struct rbd_spec		*spec;
303602adf40SYehuda Sadeh 
3040d7dbfceSAlex Elder 	char			*header_name;
305971f839aSAlex Elder 
3060903e875SAlex Elder 	struct ceph_file_layout	layout;
3070903e875SAlex Elder 
30859c2be1eSYehuda Sadeh 	struct ceph_osd_event   *watch_event;
309975241afSAlex Elder 	struct rbd_obj_request	*watch_request;
31059c2be1eSYehuda Sadeh 
31186b00e0dSAlex Elder 	struct rbd_spec		*parent_spec;
31286b00e0dSAlex Elder 	u64			parent_overlap;
3132f82ee54SAlex Elder 	struct rbd_device	*parent;
31486b00e0dSAlex Elder 
315c666601aSJosh Durgin 	/* protects updating the header */
316c666601aSJosh Durgin 	struct rw_semaphore     header_rwsem;
317f84344f3SAlex Elder 
318f84344f3SAlex Elder 	struct rbd_mapping	mapping;
319602adf40SYehuda Sadeh 
320602adf40SYehuda Sadeh 	struct list_head	node;
321dfc5606dSYehuda Sadeh 
322dfc5606dSYehuda Sadeh 	/* sysfs related */
323dfc5606dSYehuda Sadeh 	struct device		dev;
324b82d167bSAlex Elder 	unsigned long		open_count;	/* protected by lock */
325dfc5606dSYehuda Sadeh };
326dfc5606dSYehuda Sadeh 
/*
 * Bit numbers used in rbd_dev->flags.  Where atomic access is needed,
 * rbd_dev->lock provides the protection.
 *
 * At present only the "removing" flag needs atomic access; it is
 * coupled with the "open_count" field.
 */
enum rbd_dev_flags {
	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_EXISTS,
	/* this mapping is being removed */
	RBD_DEV_FLAG_REMOVING,
};
3386d292906SAlex Elder 
/* Serialize open/close/setup/teardown */
static DEFINE_MUTEX(ctl_mutex);

/* List of devices; rbd_dev_list_lock presumably guards it (not shown here) */
static LIST_HEAD(rbd_dev_list);
static DEFINE_SPINLOCK(rbd_dev_list_lock);

/* List of clients, manipulated under rbd_client_list_lock */
static LIST_HEAD(rbd_client_list);
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Slab caches for frequently-allocated structures */

static struct kmem_cache	*rbd_img_request_cache;
static struct kmem_cache	*rbd_obj_request_cache;
static struct kmem_cache	*rbd_segment_name_cache;
3521c2a9dfeSAlex Elder 
3533d7efd18SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request);
3543d7efd18SAlex Elder 
355200a6a8bSAlex Elder static void rbd_dev_device_release(struct device *dev);
356dfc5606dSYehuda Sadeh 
357f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf,
358f0f8cef5SAlex Elder 		       size_t count);
359f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
360f0f8cef5SAlex Elder 			  size_t count);
36151344a38SAlex Elder static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool read_only);
362f0f8cef5SAlex Elder 
363f0f8cef5SAlex Elder static struct bus_attribute rbd_bus_attrs[] = {
364f0f8cef5SAlex Elder 	__ATTR(add, S_IWUSR, NULL, rbd_add),
365f0f8cef5SAlex Elder 	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
366f0f8cef5SAlex Elder 	__ATTR_NULL
367f0f8cef5SAlex Elder };
368f0f8cef5SAlex Elder 
369f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = {
370f0f8cef5SAlex Elder 	.name		= "rbd",
371f0f8cef5SAlex Elder 	.bus_attrs	= rbd_bus_attrs,
372f0f8cef5SAlex Elder };
373f0f8cef5SAlex Elder 
374f0f8cef5SAlex Elder static void rbd_root_dev_release(struct device *dev)
375f0f8cef5SAlex Elder {
376f0f8cef5SAlex Elder }
377f0f8cef5SAlex Elder 
378f0f8cef5SAlex Elder static struct device rbd_root_dev = {
379f0f8cef5SAlex Elder 	.init_name =    "rbd",
380f0f8cef5SAlex Elder 	.release =      rbd_root_dev_release,
381f0f8cef5SAlex Elder };
382f0f8cef5SAlex Elder 
38306ecc6cbSAlex Elder static __printf(2, 3)
38406ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
38506ecc6cbSAlex Elder {
38606ecc6cbSAlex Elder 	struct va_format vaf;
38706ecc6cbSAlex Elder 	va_list args;
38806ecc6cbSAlex Elder 
38906ecc6cbSAlex Elder 	va_start(args, fmt);
39006ecc6cbSAlex Elder 	vaf.fmt = fmt;
39106ecc6cbSAlex Elder 	vaf.va = &args;
39206ecc6cbSAlex Elder 
39306ecc6cbSAlex Elder 	if (!rbd_dev)
39406ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
39506ecc6cbSAlex Elder 	else if (rbd_dev->disk)
39606ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %s: %pV\n",
39706ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
39806ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_name)
39906ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: image %s: %pV\n",
40006ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
40106ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_id)
40206ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: id %s: %pV\n",
40306ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
40406ecc6cbSAlex Elder 	else	/* punt */
40506ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
40606ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev, &vaf);
40706ecc6cbSAlex Elder 	va_end(args);
40806ecc6cbSAlex Elder }
40906ecc6cbSAlex Elder 
/*
 * rbd_assert(expr): when RBD_DEBUG is defined, log the failed
 * expression together with the function name and line number, then
 * BUG().  Without RBD_DEBUG the macro expands to a no-op and expr is
 * not evaluated.
 *
 * The debug variant is wrapped in do { } while (0) so that it behaves
 * as one statement; "if (x) rbd_assert(y); else ..." then parses as
 * written instead of breaking on (or capturing) the else.
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */

422dfc5606dSYehuda Sadeh 
423b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
42405a46afdSAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
42505a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
4268b3e1a56SAlex Elder 
427cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev);
428cc4a38bdSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev);
42954cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
43054cac61fSAlex Elder 					u64 snap_id);
4312ad3d716SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
4322ad3d716SAlex Elder 				u8 *order, u64 *snap_size);
4332ad3d716SAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
4342ad3d716SAlex Elder 		u64 *snap_features);
4352ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name);
43659c2be1eSYehuda Sadeh 
437602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
438602adf40SYehuda Sadeh {
439f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
440b82d167bSAlex Elder 	bool removing = false;
441602adf40SYehuda Sadeh 
442f84344f3SAlex Elder 	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
443602adf40SYehuda Sadeh 		return -EROFS;
444602adf40SYehuda Sadeh 
445a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
446b82d167bSAlex Elder 	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
447b82d167bSAlex Elder 		removing = true;
448b82d167bSAlex Elder 	else
449b82d167bSAlex Elder 		rbd_dev->open_count++;
450a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
451b82d167bSAlex Elder 	if (removing)
452b82d167bSAlex Elder 		return -ENOENT;
453b82d167bSAlex Elder 
45442382b70SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
455c3e946ceSAlex Elder 	(void) get_device(&rbd_dev->dev);
456f84344f3SAlex Elder 	set_device_ro(bdev, rbd_dev->mapping.read_only);
45742382b70SAlex Elder 	mutex_unlock(&ctl_mutex);
458340c7a2bSAlex Elder 
459602adf40SYehuda Sadeh 	return 0;
460602adf40SYehuda Sadeh }
461602adf40SYehuda Sadeh 
462dfc5606dSYehuda Sadeh static int rbd_release(struct gendisk *disk, fmode_t mode)
463dfc5606dSYehuda Sadeh {
464dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
465b82d167bSAlex Elder 	unsigned long open_count_before;
466b82d167bSAlex Elder 
467a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
468b82d167bSAlex Elder 	open_count_before = rbd_dev->open_count--;
469a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
470b82d167bSAlex Elder 	rbd_assert(open_count_before > 0);
471dfc5606dSYehuda Sadeh 
47242382b70SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
473c3e946ceSAlex Elder 	put_device(&rbd_dev->dev);
47442382b70SAlex Elder 	mutex_unlock(&ctl_mutex);
475dfc5606dSYehuda Sadeh 
476dfc5606dSYehuda Sadeh 	return 0;
477dfc5606dSYehuda Sadeh }
478dfc5606dSYehuda Sadeh 
479602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = {
480602adf40SYehuda Sadeh 	.owner			= THIS_MODULE,
481602adf40SYehuda Sadeh 	.open			= rbd_open,
482dfc5606dSYehuda Sadeh 	.release		= rbd_release,
483602adf40SYehuda Sadeh };
484602adf40SYehuda Sadeh 
485602adf40SYehuda Sadeh /*
486602adf40SYehuda Sadeh  * Initialize an rbd client instance.
48743ae4701SAlex Elder  * We own *ceph_opts.
488602adf40SYehuda Sadeh  */
489f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
490602adf40SYehuda Sadeh {
491602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
492602adf40SYehuda Sadeh 	int ret = -ENOMEM;
493602adf40SYehuda Sadeh 
49437206ee5SAlex Elder 	dout("%s:\n", __func__);
495602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
496602adf40SYehuda Sadeh 	if (!rbdc)
497602adf40SYehuda Sadeh 		goto out_opt;
498602adf40SYehuda Sadeh 
499602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
500602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
501602adf40SYehuda Sadeh 
502bc534d86SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
503bc534d86SAlex Elder 
50443ae4701SAlex Elder 	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
505602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
506bc534d86SAlex Elder 		goto out_mutex;
50743ae4701SAlex Elder 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
508602adf40SYehuda Sadeh 
509602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
510602adf40SYehuda Sadeh 	if (ret < 0)
511602adf40SYehuda Sadeh 		goto out_err;
512602adf40SYehuda Sadeh 
513432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
514602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
515432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
516602adf40SYehuda Sadeh 
517bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
51837206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
519bc534d86SAlex Elder 
520602adf40SYehuda Sadeh 	return rbdc;
521602adf40SYehuda Sadeh 
522602adf40SYehuda Sadeh out_err:
523602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
524bc534d86SAlex Elder out_mutex:
525bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
526602adf40SYehuda Sadeh 	kfree(rbdc);
527602adf40SYehuda Sadeh out_opt:
52843ae4701SAlex Elder 	if (ceph_opts)
52943ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
53037206ee5SAlex Elder 	dout("%s: error %d\n", __func__, ret);
53137206ee5SAlex Elder 
53228f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
533602adf40SYehuda Sadeh }
534602adf40SYehuda Sadeh 
5352f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
5362f82ee54SAlex Elder {
5372f82ee54SAlex Elder 	kref_get(&rbdc->kref);
5382f82ee54SAlex Elder 
5392f82ee54SAlex Elder 	return rbdc;
5402f82ee54SAlex Elder }
5412f82ee54SAlex Elder 
542602adf40SYehuda Sadeh /*
5431f7ba331SAlex Elder  * Find a ceph client with specific addr and configuration.  If
5441f7ba331SAlex Elder  * found, bump its reference count.
545602adf40SYehuda Sadeh  */
5461f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
547602adf40SYehuda Sadeh {
548602adf40SYehuda Sadeh 	struct rbd_client *client_node;
5491f7ba331SAlex Elder 	bool found = false;
550602adf40SYehuda Sadeh 
55143ae4701SAlex Elder 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
552602adf40SYehuda Sadeh 		return NULL;
553602adf40SYehuda Sadeh 
5541f7ba331SAlex Elder 	spin_lock(&rbd_client_list_lock);
5551f7ba331SAlex Elder 	list_for_each_entry(client_node, &rbd_client_list, node) {
5561f7ba331SAlex Elder 		if (!ceph_compare_options(ceph_opts, client_node->client)) {
5572f82ee54SAlex Elder 			__rbd_get_client(client_node);
5582f82ee54SAlex Elder 
5591f7ba331SAlex Elder 			found = true;
5601f7ba331SAlex Elder 			break;
5611f7ba331SAlex Elder 		}
5621f7ba331SAlex Elder 	}
5631f7ba331SAlex Elder 	spin_unlock(&rbd_client_list_lock);
5641f7ba331SAlex Elder 
5651f7ba331SAlex Elder 	return found ? client_node : NULL;
566602adf40SYehuda Sadeh }
567602adf40SYehuda Sadeh 
568602adf40SYehuda Sadeh /*
56959c2be1eSYehuda Sadeh  * mount options
57059c2be1eSYehuda Sadeh  */
57159c2be1eSYehuda Sadeh enum {
57259c2be1eSYehuda Sadeh 	Opt_last_int,
57359c2be1eSYehuda Sadeh 	/* int args above */
57459c2be1eSYehuda Sadeh 	Opt_last_string,
57559c2be1eSYehuda Sadeh 	/* string args above */
576cc0538b6SAlex Elder 	Opt_read_only,
577cc0538b6SAlex Elder 	Opt_read_write,
578cc0538b6SAlex Elder 	/* Boolean args above */
579cc0538b6SAlex Elder 	Opt_last_bool,
58059c2be1eSYehuda Sadeh };
58159c2be1eSYehuda Sadeh 
58243ae4701SAlex Elder static match_table_t rbd_opts_tokens = {
58359c2be1eSYehuda Sadeh 	/* int args above */
58459c2be1eSYehuda Sadeh 	/* string args above */
585be466c1cSAlex Elder 	{Opt_read_only, "read_only"},
586cc0538b6SAlex Elder 	{Opt_read_only, "ro"},		/* Alternate spelling */
587cc0538b6SAlex Elder 	{Opt_read_write, "read_write"},
588cc0538b6SAlex Elder 	{Opt_read_write, "rw"},		/* Alternate spelling */
589cc0538b6SAlex Elder 	/* Boolean args above */
59059c2be1eSYehuda Sadeh 	{-1, NULL}
59159c2be1eSYehuda Sadeh };
59259c2be1eSYehuda Sadeh 
59398571b5aSAlex Elder struct rbd_options {
59498571b5aSAlex Elder 	bool	read_only;
59598571b5aSAlex Elder };
59698571b5aSAlex Elder 
59798571b5aSAlex Elder #define RBD_READ_ONLY_DEFAULT	false
59898571b5aSAlex Elder 
59959c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
60059c2be1eSYehuda Sadeh {
60143ae4701SAlex Elder 	struct rbd_options *rbd_opts = private;
60259c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
60359c2be1eSYehuda Sadeh 	int token, intval, ret;
60459c2be1eSYehuda Sadeh 
60543ae4701SAlex Elder 	token = match_token(c, rbd_opts_tokens, argstr);
60659c2be1eSYehuda Sadeh 	if (token < 0)
60759c2be1eSYehuda Sadeh 		return -EINVAL;
60859c2be1eSYehuda Sadeh 
60959c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
61059c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
61159c2be1eSYehuda Sadeh 		if (ret < 0) {
61259c2be1eSYehuda Sadeh 			pr_err("bad mount option arg (not int) "
61359c2be1eSYehuda Sadeh 			       "at '%s'\n", c);
61459c2be1eSYehuda Sadeh 			return ret;
61559c2be1eSYehuda Sadeh 		}
61659c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
61759c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
61859c2be1eSYehuda Sadeh 		dout("got string token %d val %s\n", token,
61959c2be1eSYehuda Sadeh 		     argstr[0].from);
620cc0538b6SAlex Elder 	} else if (token > Opt_last_string && token < Opt_last_bool) {
621cc0538b6SAlex Elder 		dout("got Boolean token %d\n", token);
62259c2be1eSYehuda Sadeh 	} else {
62359c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
62459c2be1eSYehuda Sadeh 	}
62559c2be1eSYehuda Sadeh 
62659c2be1eSYehuda Sadeh 	switch (token) {
627cc0538b6SAlex Elder 	case Opt_read_only:
628cc0538b6SAlex Elder 		rbd_opts->read_only = true;
629cc0538b6SAlex Elder 		break;
630cc0538b6SAlex Elder 	case Opt_read_write:
631cc0538b6SAlex Elder 		rbd_opts->read_only = false;
632cc0538b6SAlex Elder 		break;
63359c2be1eSYehuda Sadeh 	default:
634aafb230eSAlex Elder 		rbd_assert(false);
635aafb230eSAlex Elder 		break;
63659c2be1eSYehuda Sadeh 	}
63759c2be1eSYehuda Sadeh 	return 0;
63859c2be1eSYehuda Sadeh }
63959c2be1eSYehuda Sadeh 
/*
 * Return an rbd client for the given ceph options.
 *
 * If rbd_client_find() locates a client for these options it is
 * returned and ceph_opts is destroyed here.  Otherwise the result
 * of rbd_client_create() for ceph_opts is returned.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *existing;

	existing = rbd_client_find(ceph_opts);
	if (!existing)
		return rbd_client_create(ceph_opts);

	/* Using an existing client; these options are not needed */
	ceph_destroy_options(ceph_opts);

	return existing;
}
656602adf40SYehuda Sadeh 
657602adf40SYehuda Sadeh /*
658602adf40SYehuda Sadeh  * Destroy ceph client
659d23a4b3fSAlex Elder  *
660432b8587SAlex Elder  * Caller must hold rbd_client_list_lock.
661602adf40SYehuda Sadeh  */
662602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
663602adf40SYehuda Sadeh {
664602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
665602adf40SYehuda Sadeh 
66637206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
667cd9d9f5dSAlex Elder 	spin_lock(&rbd_client_list_lock);
668602adf40SYehuda Sadeh 	list_del(&rbdc->node);
669cd9d9f5dSAlex Elder 	spin_unlock(&rbd_client_list_lock);
670602adf40SYehuda Sadeh 
671602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
672602adf40SYehuda Sadeh 	kfree(rbdc);
673602adf40SYehuda Sadeh }
674602adf40SYehuda Sadeh 
675602adf40SYehuda Sadeh /*
676602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
677602adf40SYehuda Sadeh  * it.
678602adf40SYehuda Sadeh  */
6799d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc)
680602adf40SYehuda Sadeh {
681c53d5893SAlex Elder 	if (rbdc)
6829d3997fdSAlex Elder 		kref_put(&rbdc->kref, rbd_client_release);
683602adf40SYehuda Sadeh }
684602adf40SYehuda Sadeh 
685a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format)
686a30b71b9SAlex Elder {
687a30b71b9SAlex Elder 	return image_format == 1 || image_format == 2;
688a30b71b9SAlex Elder }
689a30b71b9SAlex Elder 
6908e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
6918e94af8eSAlex Elder {
692103a150fSAlex Elder 	size_t size;
693103a150fSAlex Elder 	u32 snap_count;
694103a150fSAlex Elder 
695103a150fSAlex Elder 	/* The header has to start with the magic rbd header text */
696103a150fSAlex Elder 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
697103a150fSAlex Elder 		return false;
698103a150fSAlex Elder 
699db2388b6SAlex Elder 	/* The bio layer requires at least sector-sized I/O */
700db2388b6SAlex Elder 
701db2388b6SAlex Elder 	if (ondisk->options.order < SECTOR_SHIFT)
702db2388b6SAlex Elder 		return false;
703db2388b6SAlex Elder 
704db2388b6SAlex Elder 	/* If we use u64 in a few spots we may be able to loosen this */
705db2388b6SAlex Elder 
706db2388b6SAlex Elder 	if (ondisk->options.order > 8 * sizeof (int) - 1)
707db2388b6SAlex Elder 		return false;
708db2388b6SAlex Elder 
709103a150fSAlex Elder 	/*
710103a150fSAlex Elder 	 * The size of a snapshot header has to fit in a size_t, and
711103a150fSAlex Elder 	 * that limits the number of snapshots.
712103a150fSAlex Elder 	 */
713103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
714103a150fSAlex Elder 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
715103a150fSAlex Elder 	if (snap_count > size / sizeof (__le64))
716103a150fSAlex Elder 		return false;
717103a150fSAlex Elder 
718103a150fSAlex Elder 	/*
719103a150fSAlex Elder 	 * Not only that, but the size of the entire the snapshot
720103a150fSAlex Elder 	 * header must also be representable in a size_t.
721103a150fSAlex Elder 	 */
722103a150fSAlex Elder 	size -= snap_count * sizeof (__le64);
723103a150fSAlex Elder 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
724103a150fSAlex Elder 		return false;
725103a150fSAlex Elder 
726103a150fSAlex Elder 	return true;
7278e94af8eSAlex Elder }
7288e94af8eSAlex Elder 
729602adf40SYehuda Sadeh /*
730602adf40SYehuda Sadeh  * Create a new header structure, translate header format from the on-disk
731602adf40SYehuda Sadeh  * header.
732602adf40SYehuda Sadeh  */
733602adf40SYehuda Sadeh static int rbd_header_from_disk(struct rbd_image_header *header,
7344156d998SAlex Elder 				 struct rbd_image_header_ondisk *ondisk)
735602adf40SYehuda Sadeh {
736ccece235SAlex Elder 	u32 snap_count;
73758c17b0eSAlex Elder 	size_t len;
738d2bb24e5SAlex Elder 	size_t size;
739621901d6SAlex Elder 	u32 i;
740602adf40SYehuda Sadeh 
7416a52325fSAlex Elder 	memset(header, 0, sizeof (*header));
7426a52325fSAlex Elder 
743103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
744103a150fSAlex Elder 
74558c17b0eSAlex Elder 	len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
74658c17b0eSAlex Elder 	header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
7476a52325fSAlex Elder 	if (!header->object_prefix)
748602adf40SYehuda Sadeh 		return -ENOMEM;
74958c17b0eSAlex Elder 	memcpy(header->object_prefix, ondisk->object_prefix, len);
75058c17b0eSAlex Elder 	header->object_prefix[len] = '\0';
75100f1f36fSAlex Elder 
752602adf40SYehuda Sadeh 	if (snap_count) {
753f785cc1dSAlex Elder 		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
754f785cc1dSAlex Elder 
755621901d6SAlex Elder 		/* Save a copy of the snapshot names */
756621901d6SAlex Elder 
757f785cc1dSAlex Elder 		if (snap_names_len > (u64) SIZE_MAX)
758f785cc1dSAlex Elder 			return -EIO;
759f785cc1dSAlex Elder 		header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
760602adf40SYehuda Sadeh 		if (!header->snap_names)
7616a52325fSAlex Elder 			goto out_err;
762f785cc1dSAlex Elder 		/*
763f785cc1dSAlex Elder 		 * Note that rbd_dev_v1_header_read() guarantees
764f785cc1dSAlex Elder 		 * the ondisk buffer we're working with has
765f785cc1dSAlex Elder 		 * snap_names_len bytes beyond the end of the
766f785cc1dSAlex Elder 		 * snapshot id array, this memcpy() is safe.
767f785cc1dSAlex Elder 		 */
768f785cc1dSAlex Elder 		memcpy(header->snap_names, &ondisk->snaps[snap_count],
769f785cc1dSAlex Elder 			snap_names_len);
7706a52325fSAlex Elder 
771621901d6SAlex Elder 		/* Record each snapshot's size */
772621901d6SAlex Elder 
773d2bb24e5SAlex Elder 		size = snap_count * sizeof (*header->snap_sizes);
774d2bb24e5SAlex Elder 		header->snap_sizes = kmalloc(size, GFP_KERNEL);
775602adf40SYehuda Sadeh 		if (!header->snap_sizes)
7766a52325fSAlex Elder 			goto out_err;
777621901d6SAlex Elder 		for (i = 0; i < snap_count; i++)
778621901d6SAlex Elder 			header->snap_sizes[i] =
779621901d6SAlex Elder 				le64_to_cpu(ondisk->snaps[i].image_size);
780602adf40SYehuda Sadeh 	} else {
781602adf40SYehuda Sadeh 		header->snap_names = NULL;
782602adf40SYehuda Sadeh 		header->snap_sizes = NULL;
783602adf40SYehuda Sadeh 	}
784849b4260SAlex Elder 
78534b13184SAlex Elder 	header->features = 0;	/* No features support in v1 images */
786602adf40SYehuda Sadeh 	header->obj_order = ondisk->options.order;
787602adf40SYehuda Sadeh 	header->crypt_type = ondisk->options.crypt_type;
788602adf40SYehuda Sadeh 	header->comp_type = ondisk->options.comp_type;
7896a52325fSAlex Elder 
790621901d6SAlex Elder 	/* Allocate and fill in the snapshot context */
791621901d6SAlex Elder 
792f84344f3SAlex Elder 	header->image_size = le64_to_cpu(ondisk->image_size);
793468521c1SAlex Elder 
794812164f8SAlex Elder 	header->snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
7956a52325fSAlex Elder 	if (!header->snapc)
7966a52325fSAlex Elder 		goto out_err;
797505cbb9bSAlex Elder 	header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
798621901d6SAlex Elder 	for (i = 0; i < snap_count; i++)
799468521c1SAlex Elder 		header->snapc->snaps[i] = le64_to_cpu(ondisk->snaps[i].id);
800602adf40SYehuda Sadeh 
801602adf40SYehuda Sadeh 	return 0;
802602adf40SYehuda Sadeh 
8036a52325fSAlex Elder out_err:
804849b4260SAlex Elder 	kfree(header->snap_sizes);
805ccece235SAlex Elder 	header->snap_sizes = NULL;
806602adf40SYehuda Sadeh 	kfree(header->snap_names);
807ccece235SAlex Elder 	header->snap_names = NULL;
8086a52325fSAlex Elder 	kfree(header->object_prefix);
8096a52325fSAlex Elder 	header->object_prefix = NULL;
810ccece235SAlex Elder 
81100f1f36fSAlex Elder 	return -ENOMEM;
812602adf40SYehuda Sadeh }
813602adf40SYehuda Sadeh 
8149682fc6dSAlex Elder static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
8159682fc6dSAlex Elder {
8169682fc6dSAlex Elder 	const char *snap_name;
8179682fc6dSAlex Elder 
8189682fc6dSAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
8199682fc6dSAlex Elder 
8209682fc6dSAlex Elder 	/* Skip over names until we find the one we are looking for */
8219682fc6dSAlex Elder 
8229682fc6dSAlex Elder 	snap_name = rbd_dev->header.snap_names;
8239682fc6dSAlex Elder 	while (which--)
8249682fc6dSAlex Elder 		snap_name += strlen(snap_name) + 1;
8259682fc6dSAlex Elder 
8269682fc6dSAlex Elder 	return kstrdup(snap_name, GFP_KERNEL);
8279682fc6dSAlex Elder }
8289682fc6dSAlex Elder 
82930d1cff8SAlex Elder /*
83030d1cff8SAlex Elder  * Snapshot id comparison function for use with qsort()/bsearch().
83130d1cff8SAlex Elder  * Note that result is for snapshots in *descending* order.
83230d1cff8SAlex Elder  */
83330d1cff8SAlex Elder static int snapid_compare_reverse(const void *s1, const void *s2)
83430d1cff8SAlex Elder {
83530d1cff8SAlex Elder 	u64 snap_id1 = *(u64 *)s1;
83630d1cff8SAlex Elder 	u64 snap_id2 = *(u64 *)s2;
83730d1cff8SAlex Elder 
83830d1cff8SAlex Elder 	if (snap_id1 < snap_id2)
83930d1cff8SAlex Elder 		return 1;
84030d1cff8SAlex Elder 	return snap_id1 == snap_id2 ? 0 : -1;
84130d1cff8SAlex Elder }
84230d1cff8SAlex Elder 
84330d1cff8SAlex Elder /*
84430d1cff8SAlex Elder  * Search a snapshot context to see if the given snapshot id is
84530d1cff8SAlex Elder  * present.
84630d1cff8SAlex Elder  *
84730d1cff8SAlex Elder  * Returns the position of the snapshot id in the array if it's found,
84830d1cff8SAlex Elder  * or BAD_SNAP_INDEX otherwise.
84930d1cff8SAlex Elder  *
85030d1cff8SAlex Elder  * Note: The snapshot array is in kept sorted (by the osd) in
85130d1cff8SAlex Elder  * reverse order, highest snapshot id first.
85230d1cff8SAlex Elder  */
8539682fc6dSAlex Elder static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
8549682fc6dSAlex Elder {
8559682fc6dSAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
85630d1cff8SAlex Elder 	u64 *found;
8579682fc6dSAlex Elder 
85830d1cff8SAlex Elder 	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
85930d1cff8SAlex Elder 				sizeof (snap_id), snapid_compare_reverse);
8609682fc6dSAlex Elder 
86130d1cff8SAlex Elder 	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
8629682fc6dSAlex Elder }
8639682fc6dSAlex Elder 
8642ad3d716SAlex Elder static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
8652ad3d716SAlex Elder 					u64 snap_id)
86654cac61fSAlex Elder {
86754cac61fSAlex Elder 	u32 which;
86854cac61fSAlex Elder 
86954cac61fSAlex Elder 	which = rbd_dev_snap_index(rbd_dev, snap_id);
87054cac61fSAlex Elder 	if (which == BAD_SNAP_INDEX)
87154cac61fSAlex Elder 		return NULL;
87254cac61fSAlex Elder 
87354cac61fSAlex Elder 	return _rbd_dev_v1_snap_name(rbd_dev, which);
87454cac61fSAlex Elder }
87554cac61fSAlex Elder 
8769e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
8779e15b77dSAlex Elder {
8789e15b77dSAlex Elder 	if (snap_id == CEPH_NOSNAP)
8799e15b77dSAlex Elder 		return RBD_SNAP_HEAD_NAME;
8809e15b77dSAlex Elder 
88154cac61fSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
88254cac61fSAlex Elder 	if (rbd_dev->image_format == 1)
88354cac61fSAlex Elder 		return rbd_dev_v1_snap_name(rbd_dev, snap_id);
8849e15b77dSAlex Elder 
88554cac61fSAlex Elder 	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
8869e15b77dSAlex Elder }
8879e15b77dSAlex Elder 
8882ad3d716SAlex Elder static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
8892ad3d716SAlex Elder 				u64 *snap_size)
890602adf40SYehuda Sadeh {
8912ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
8922ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
8932ad3d716SAlex Elder 		*snap_size = rbd_dev->header.image_size;
8942ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
8952ad3d716SAlex Elder 		u32 which;
89600f1f36fSAlex Elder 
8972ad3d716SAlex Elder 		which = rbd_dev_snap_index(rbd_dev, snap_id);
8982ad3d716SAlex Elder 		if (which == BAD_SNAP_INDEX)
8992ad3d716SAlex Elder 			return -ENOENT;
90000f1f36fSAlex Elder 
9012ad3d716SAlex Elder 		*snap_size = rbd_dev->header.snap_sizes[which];
9022ad3d716SAlex Elder 	} else {
9032ad3d716SAlex Elder 		u64 size = 0;
9042ad3d716SAlex Elder 		int ret;
9052ad3d716SAlex Elder 
9062ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
9072ad3d716SAlex Elder 		if (ret)
9082ad3d716SAlex Elder 			return ret;
9092ad3d716SAlex Elder 
9102ad3d716SAlex Elder 		*snap_size = size;
9112ad3d716SAlex Elder 	}
9122ad3d716SAlex Elder 	return 0;
9132ad3d716SAlex Elder }
9142ad3d716SAlex Elder 
9152ad3d716SAlex Elder static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
9162ad3d716SAlex Elder 			u64 *snap_features)
9172ad3d716SAlex Elder {
9182ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
9192ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
9202ad3d716SAlex Elder 		*snap_features = rbd_dev->header.features;
9212ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
9222ad3d716SAlex Elder 		*snap_features = 0;	/* No features for format 1 */
9232ad3d716SAlex Elder 	} else {
9242ad3d716SAlex Elder 		u64 features = 0;
9252ad3d716SAlex Elder 		int ret;
9262ad3d716SAlex Elder 
9272ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
9282ad3d716SAlex Elder 		if (ret)
9292ad3d716SAlex Elder 			return ret;
9302ad3d716SAlex Elder 
9312ad3d716SAlex Elder 		*snap_features = features;
9322ad3d716SAlex Elder 	}
9332ad3d716SAlex Elder 	return 0;
93400f1f36fSAlex Elder }
935602adf40SYehuda Sadeh 
936d1cf5788SAlex Elder static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
937602adf40SYehuda Sadeh {
9388f4b7d98SAlex Elder 	u64 snap_id = rbd_dev->spec->snap_id;
9392ad3d716SAlex Elder 	u64 size = 0;
9402ad3d716SAlex Elder 	u64 features = 0;
9412ad3d716SAlex Elder 	int ret;
9428b0241f8SAlex Elder 
9432ad3d716SAlex Elder 	ret = rbd_snap_size(rbd_dev, snap_id, &size);
9442ad3d716SAlex Elder 	if (ret)
9452ad3d716SAlex Elder 		return ret;
9462ad3d716SAlex Elder 	ret = rbd_snap_features(rbd_dev, snap_id, &features);
9472ad3d716SAlex Elder 	if (ret)
9482ad3d716SAlex Elder 		return ret;
9492ad3d716SAlex Elder 
9502ad3d716SAlex Elder 	rbd_dev->mapping.size = size;
9512ad3d716SAlex Elder 	rbd_dev->mapping.features = features;
9522ad3d716SAlex Elder 
9538b0241f8SAlex Elder 	return 0;
954602adf40SYehuda Sadeh }
955602adf40SYehuda Sadeh 
956d1cf5788SAlex Elder static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
957d1cf5788SAlex Elder {
958d1cf5788SAlex Elder 	rbd_dev->mapping.size = 0;
959d1cf5788SAlex Elder 	rbd_dev->mapping.features = 0;
960d1cf5788SAlex Elder }
961d1cf5788SAlex Elder 
96298571b5aSAlex Elder static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
963602adf40SYehuda Sadeh {
96465ccfe21SAlex Elder 	char *name;
96565ccfe21SAlex Elder 	u64 segment;
96665ccfe21SAlex Elder 	int ret;
967602adf40SYehuda Sadeh 
96878c2a44aSAlex Elder 	name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO);
96965ccfe21SAlex Elder 	if (!name)
97065ccfe21SAlex Elder 		return NULL;
97165ccfe21SAlex Elder 	segment = offset >> rbd_dev->header.obj_order;
9722fd82b9eSAlex Elder 	ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
97365ccfe21SAlex Elder 			rbd_dev->header.object_prefix, segment);
9742fd82b9eSAlex Elder 	if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
97565ccfe21SAlex Elder 		pr_err("error formatting segment name for #%llu (%d)\n",
97665ccfe21SAlex Elder 			segment, ret);
97765ccfe21SAlex Elder 		kfree(name);
97865ccfe21SAlex Elder 		name = NULL;
97965ccfe21SAlex Elder 	}
980602adf40SYehuda Sadeh 
98165ccfe21SAlex Elder 	return name;
98265ccfe21SAlex Elder }
983602adf40SYehuda Sadeh 
98478c2a44aSAlex Elder static void rbd_segment_name_free(const char *name)
98578c2a44aSAlex Elder {
98678c2a44aSAlex Elder 	/* The explicit cast here is needed to drop the const qualifier */
98778c2a44aSAlex Elder 
98878c2a44aSAlex Elder 	kmem_cache_free(rbd_segment_name_cache, (void *)name);
98978c2a44aSAlex Elder }
99078c2a44aSAlex Elder 
99165ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
99265ccfe21SAlex Elder {
99365ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
994602adf40SYehuda Sadeh 
99565ccfe21SAlex Elder 	return offset & (segment_size - 1);
99665ccfe21SAlex Elder }
99765ccfe21SAlex Elder 
99865ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev,
99965ccfe21SAlex Elder 				u64 offset, u64 length)
100065ccfe21SAlex Elder {
100165ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
100265ccfe21SAlex Elder 
100365ccfe21SAlex Elder 	offset &= segment_size - 1;
100465ccfe21SAlex Elder 
1005aafb230eSAlex Elder 	rbd_assert(length <= U64_MAX - offset);
100665ccfe21SAlex Elder 	if (offset + length > segment_size)
100765ccfe21SAlex Elder 		length = segment_size - offset;
100865ccfe21SAlex Elder 
100965ccfe21SAlex Elder 	return length;
1010602adf40SYehuda Sadeh }
1011602adf40SYehuda Sadeh 
1012602adf40SYehuda Sadeh /*
1013029bcbd8SJosh Durgin  * returns the size of an object in the image
1014029bcbd8SJosh Durgin  */
1015029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header)
1016029bcbd8SJosh Durgin {
1017029bcbd8SJosh Durgin 	return 1 << header->obj_order;
1018029bcbd8SJosh Durgin }
1019029bcbd8SJosh Durgin 
1020029bcbd8SJosh Durgin /*
1021602adf40SYehuda Sadeh  * bio helpers
1022602adf40SYehuda Sadeh  */
1023602adf40SYehuda Sadeh 
1024602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain)
1025602adf40SYehuda Sadeh {
1026602adf40SYehuda Sadeh 	struct bio *tmp;
1027602adf40SYehuda Sadeh 
1028602adf40SYehuda Sadeh 	while (chain) {
1029602adf40SYehuda Sadeh 		tmp = chain;
1030602adf40SYehuda Sadeh 		chain = chain->bi_next;
1031602adf40SYehuda Sadeh 		bio_put(tmp);
1032602adf40SYehuda Sadeh 	}
1033602adf40SYehuda Sadeh }
1034602adf40SYehuda Sadeh 
1035602adf40SYehuda Sadeh /*
1036602adf40SYehuda Sadeh  * zeros a bio chain, starting at specific offset
1037602adf40SYehuda Sadeh  */
1038602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs)
1039602adf40SYehuda Sadeh {
1040602adf40SYehuda Sadeh 	struct bio_vec *bv;
1041602adf40SYehuda Sadeh 	unsigned long flags;
1042602adf40SYehuda Sadeh 	void *buf;
1043602adf40SYehuda Sadeh 	int i;
1044602adf40SYehuda Sadeh 	int pos = 0;
1045602adf40SYehuda Sadeh 
1046602adf40SYehuda Sadeh 	while (chain) {
1047602adf40SYehuda Sadeh 		bio_for_each_segment(bv, chain, i) {
1048602adf40SYehuda Sadeh 			if (pos + bv->bv_len > start_ofs) {
1049602adf40SYehuda Sadeh 				int remainder = max(start_ofs - pos, 0);
1050602adf40SYehuda Sadeh 				buf = bvec_kmap_irq(bv, &flags);
1051602adf40SYehuda Sadeh 				memset(buf + remainder, 0,
1052602adf40SYehuda Sadeh 				       bv->bv_len - remainder);
105385b5aaa6SDan Carpenter 				bvec_kunmap_irq(buf, &flags);
1054602adf40SYehuda Sadeh 			}
1055602adf40SYehuda Sadeh 			pos += bv->bv_len;
1056602adf40SYehuda Sadeh 		}
1057602adf40SYehuda Sadeh 
1058602adf40SYehuda Sadeh 		chain = chain->bi_next;
1059602adf40SYehuda Sadeh 	}
1060602adf40SYehuda Sadeh }
1061602adf40SYehuda Sadeh 
1062602adf40SYehuda Sadeh /*
1063b9434c5bSAlex Elder  * similar to zero_bio_chain(), zeros data defined by a page array,
1064b9434c5bSAlex Elder  * starting at the given byte offset from the start of the array and
1065b9434c5bSAlex Elder  * continuing up to the given end offset.  The pages array is
1066b9434c5bSAlex Elder  * assumed to be big enough to hold all bytes up to the end.
1067b9434c5bSAlex Elder  */
1068b9434c5bSAlex Elder static void zero_pages(struct page **pages, u64 offset, u64 end)
1069b9434c5bSAlex Elder {
1070b9434c5bSAlex Elder 	struct page **page = &pages[offset >> PAGE_SHIFT];
1071b9434c5bSAlex Elder 
1072b9434c5bSAlex Elder 	rbd_assert(end > offset);
1073b9434c5bSAlex Elder 	rbd_assert(end - offset <= (u64)SIZE_MAX);
1074b9434c5bSAlex Elder 	while (offset < end) {
1075b9434c5bSAlex Elder 		size_t page_offset;
1076b9434c5bSAlex Elder 		size_t length;
1077b9434c5bSAlex Elder 		unsigned long flags;
1078b9434c5bSAlex Elder 		void *kaddr;
1079b9434c5bSAlex Elder 
1080b9434c5bSAlex Elder 		page_offset = (size_t)(offset & ~PAGE_MASK);
1081b9434c5bSAlex Elder 		length = min(PAGE_SIZE - page_offset, (size_t)(end - offset));
1082b9434c5bSAlex Elder 		local_irq_save(flags);
1083b9434c5bSAlex Elder 		kaddr = kmap_atomic(*page);
1084b9434c5bSAlex Elder 		memset(kaddr + page_offset, 0, length);
1085b9434c5bSAlex Elder 		kunmap_atomic(kaddr);
1086b9434c5bSAlex Elder 		local_irq_restore(flags);
1087b9434c5bSAlex Elder 
1088b9434c5bSAlex Elder 		offset += length;
1089b9434c5bSAlex Elder 		page++;
1090b9434c5bSAlex Elder 	}
1091b9434c5bSAlex Elder }
1092b9434c5bSAlex Elder 
1093b9434c5bSAlex Elder /*
1094f7760dadSAlex Elder  * Clone a portion of a bio, starting at the given byte offset
1095f7760dadSAlex Elder  * and continuing for the number of bytes indicated.
1096602adf40SYehuda Sadeh  */
1097f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src,
1098f7760dadSAlex Elder 					unsigned int offset,
1099f7760dadSAlex Elder 					unsigned int len,
1100f7760dadSAlex Elder 					gfp_t gfpmask)
1101602adf40SYehuda Sadeh {
1102f7760dadSAlex Elder 	struct bio_vec *bv;
1103f7760dadSAlex Elder 	unsigned int resid;
1104f7760dadSAlex Elder 	unsigned short idx;
1105f7760dadSAlex Elder 	unsigned int voff;
1106f7760dadSAlex Elder 	unsigned short end_idx;
1107f7760dadSAlex Elder 	unsigned short vcnt;
1108f7760dadSAlex Elder 	struct bio *bio;
1109602adf40SYehuda Sadeh 
1110f7760dadSAlex Elder 	/* Handle the easy case for the caller */
1111f7760dadSAlex Elder 
1112f7760dadSAlex Elder 	if (!offset && len == bio_src->bi_size)
1113f7760dadSAlex Elder 		return bio_clone(bio_src, gfpmask);
1114f7760dadSAlex Elder 
1115f7760dadSAlex Elder 	if (WARN_ON_ONCE(!len))
1116f7760dadSAlex Elder 		return NULL;
1117f7760dadSAlex Elder 	if (WARN_ON_ONCE(len > bio_src->bi_size))
1118f7760dadSAlex Elder 		return NULL;
1119f7760dadSAlex Elder 	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
1120f7760dadSAlex Elder 		return NULL;
1121f7760dadSAlex Elder 
1122f7760dadSAlex Elder 	/* Find first affected segment... */
1123f7760dadSAlex Elder 
1124f7760dadSAlex Elder 	resid = offset;
1125f7760dadSAlex Elder 	__bio_for_each_segment(bv, bio_src, idx, 0) {
1126f7760dadSAlex Elder 		if (resid < bv->bv_len)
1127f7760dadSAlex Elder 			break;
1128f7760dadSAlex Elder 		resid -= bv->bv_len;
1129602adf40SYehuda Sadeh 	}
1130f7760dadSAlex Elder 	voff = resid;
1131602adf40SYehuda Sadeh 
1132f7760dadSAlex Elder 	/* ...and the last affected segment */
1133542582fcSAlex Elder 
1134f7760dadSAlex Elder 	resid += len;
1135f7760dadSAlex Elder 	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
1136f7760dadSAlex Elder 		if (resid <= bv->bv_len)
1137f7760dadSAlex Elder 			break;
1138f7760dadSAlex Elder 		resid -= bv->bv_len;
1139f7760dadSAlex Elder 	}
1140f7760dadSAlex Elder 	vcnt = end_idx - idx + 1;
1141602adf40SYehuda Sadeh 
1142f7760dadSAlex Elder 	/* Build the clone */
1143f7760dadSAlex Elder 
1144f7760dadSAlex Elder 	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
1145f7760dadSAlex Elder 	if (!bio)
1146f7760dadSAlex Elder 		return NULL;	/* ENOMEM */
1147f7760dadSAlex Elder 
1148f7760dadSAlex Elder 	bio->bi_bdev = bio_src->bi_bdev;
1149f7760dadSAlex Elder 	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
1150f7760dadSAlex Elder 	bio->bi_rw = bio_src->bi_rw;
1151f7760dadSAlex Elder 	bio->bi_flags |= 1 << BIO_CLONED;
1152602adf40SYehuda Sadeh 
1153602adf40SYehuda Sadeh 	/*
1154f7760dadSAlex Elder 	 * Copy over our part of the bio_vec, then update the first
1155f7760dadSAlex Elder 	 * and last (or only) entries.
1156602adf40SYehuda Sadeh 	 */
1157f7760dadSAlex Elder 	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
1158f7760dadSAlex Elder 			vcnt * sizeof (struct bio_vec));
1159f7760dadSAlex Elder 	bio->bi_io_vec[0].bv_offset += voff;
1160f7760dadSAlex Elder 	if (vcnt > 1) {
1161f7760dadSAlex Elder 		bio->bi_io_vec[0].bv_len -= voff;
1162f7760dadSAlex Elder 		bio->bi_io_vec[vcnt - 1].bv_len = resid;
1163602adf40SYehuda Sadeh 	} else {
1164f7760dadSAlex Elder 		bio->bi_io_vec[0].bv_len = len;
1165602adf40SYehuda Sadeh 	}
1166602adf40SYehuda Sadeh 
1167f7760dadSAlex Elder 	bio->bi_vcnt = vcnt;
1168f7760dadSAlex Elder 	bio->bi_size = len;
1169f7760dadSAlex Elder 	bio->bi_idx = 0;
1170602adf40SYehuda Sadeh 
1171f7760dadSAlex Elder 	return bio;
1172602adf40SYehuda Sadeh }
1173602adf40SYehuda Sadeh 
1174f7760dadSAlex Elder /*
1175f7760dadSAlex Elder  * Clone a portion of a bio chain, starting at the given byte offset
1176f7760dadSAlex Elder  * into the first bio in the source chain and continuing for the
1177f7760dadSAlex Elder  * number of bytes indicated.  The result is another bio chain of
1178f7760dadSAlex Elder  * exactly the given length, or a null pointer on error.
1179f7760dadSAlex Elder  *
1180f7760dadSAlex Elder  * The bio_src and offset parameters are both in-out.  On entry they
1181f7760dadSAlex Elder  * refer to the first source bio and the offset into that bio where
1182f7760dadSAlex Elder  * the start of data to be cloned is located.
1183f7760dadSAlex Elder  *
1184f7760dadSAlex Elder  * On return, bio_src is updated to refer to the bio in the source
1185f7760dadSAlex Elder  * chain that contains first un-cloned byte, and *offset will
1186f7760dadSAlex Elder  * contain the offset of that byte within that bio.
1187f7760dadSAlex Elder  */
1188f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src,
1189f7760dadSAlex Elder 					unsigned int *offset,
1190f7760dadSAlex Elder 					unsigned int len,
1191f7760dadSAlex Elder 					gfp_t gfpmask)
1192f7760dadSAlex Elder {
1193f7760dadSAlex Elder 	struct bio *bi = *bio_src;
1194f7760dadSAlex Elder 	unsigned int off = *offset;
1195f7760dadSAlex Elder 	struct bio *chain = NULL;
1196f7760dadSAlex Elder 	struct bio **end;
1197602adf40SYehuda Sadeh 
1198f7760dadSAlex Elder 	/* Build up a chain of clone bios up to the limit */
1199602adf40SYehuda Sadeh 
1200f7760dadSAlex Elder 	if (!bi || off >= bi->bi_size || !len)
1201f7760dadSAlex Elder 		return NULL;		/* Nothing to clone */
1202602adf40SYehuda Sadeh 
1203f7760dadSAlex Elder 	end = &chain;
1204f7760dadSAlex Elder 	while (len) {
1205f7760dadSAlex Elder 		unsigned int bi_size;
1206f7760dadSAlex Elder 		struct bio *bio;
1207f7760dadSAlex Elder 
1208f5400b7aSAlex Elder 		if (!bi) {
1209f5400b7aSAlex Elder 			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
1210f7760dadSAlex Elder 			goto out_err;	/* EINVAL; ran out of bio's */
1211f5400b7aSAlex Elder 		}
1212f7760dadSAlex Elder 		bi_size = min_t(unsigned int, bi->bi_size - off, len);
1213f7760dadSAlex Elder 		bio = bio_clone_range(bi, off, bi_size, gfpmask);
1214f7760dadSAlex Elder 		if (!bio)
1215f7760dadSAlex Elder 			goto out_err;	/* ENOMEM */
1216f7760dadSAlex Elder 
1217f7760dadSAlex Elder 		*end = bio;
1218f7760dadSAlex Elder 		end = &bio->bi_next;
1219f7760dadSAlex Elder 
1220f7760dadSAlex Elder 		off += bi_size;
1221f7760dadSAlex Elder 		if (off == bi->bi_size) {
1222f7760dadSAlex Elder 			bi = bi->bi_next;
1223f7760dadSAlex Elder 			off = 0;
1224f7760dadSAlex Elder 		}
1225f7760dadSAlex Elder 		len -= bi_size;
1226f7760dadSAlex Elder 	}
1227f7760dadSAlex Elder 	*bio_src = bi;
1228f7760dadSAlex Elder 	*offset = off;
1229f7760dadSAlex Elder 
1230f7760dadSAlex Elder 	return chain;
1231f7760dadSAlex Elder out_err:
1232f7760dadSAlex Elder 	bio_chain_put(chain);
1233f7760dadSAlex Elder 
1234602adf40SYehuda Sadeh 	return NULL;
1235602adf40SYehuda Sadeh }
1236602adf40SYehuda Sadeh 
1237926f9b3fSAlex Elder /*
1238926f9b3fSAlex Elder  * The default/initial value for all object request flags is 0.  For
1239926f9b3fSAlex Elder  * each flag, once its value is set to 1 it is never reset to 0
1240926f9b3fSAlex Elder  * again.
1241926f9b3fSAlex Elder  */
12426365d33aSAlex Elder static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
12436365d33aSAlex Elder {
12446365d33aSAlex Elder 	if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
12456365d33aSAlex Elder 		struct rbd_device *rbd_dev;
12466365d33aSAlex Elder 
124757acbaa7SAlex Elder 		rbd_dev = obj_request->img_request->rbd_dev;
12486365d33aSAlex Elder 		rbd_warn(rbd_dev, "obj_request %p already marked img_data\n",
12496365d33aSAlex Elder 			obj_request);
12506365d33aSAlex Elder 	}
12516365d33aSAlex Elder }
12526365d33aSAlex Elder 
12536365d33aSAlex Elder static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
12546365d33aSAlex Elder {
12556365d33aSAlex Elder 	smp_mb();
12566365d33aSAlex Elder 	return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
12576365d33aSAlex Elder }
12586365d33aSAlex Elder 
125957acbaa7SAlex Elder static void obj_request_done_set(struct rbd_obj_request *obj_request)
126057acbaa7SAlex Elder {
126157acbaa7SAlex Elder 	if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
126257acbaa7SAlex Elder 		struct rbd_device *rbd_dev = NULL;
126357acbaa7SAlex Elder 
126457acbaa7SAlex Elder 		if (obj_request_img_data_test(obj_request))
126557acbaa7SAlex Elder 			rbd_dev = obj_request->img_request->rbd_dev;
126657acbaa7SAlex Elder 		rbd_warn(rbd_dev, "obj_request %p already marked done\n",
126757acbaa7SAlex Elder 			obj_request);
126857acbaa7SAlex Elder 	}
126957acbaa7SAlex Elder }
127057acbaa7SAlex Elder 
127157acbaa7SAlex Elder static bool obj_request_done_test(struct rbd_obj_request *obj_request)
127257acbaa7SAlex Elder {
127357acbaa7SAlex Elder 	smp_mb();
127457acbaa7SAlex Elder 	return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
127557acbaa7SAlex Elder }
127657acbaa7SAlex Elder 
12775679c59fSAlex Elder /*
12785679c59fSAlex Elder  * This sets the KNOWN flag after (possibly) setting the EXISTS
12795679c59fSAlex Elder  * flag.  The latter is set based on the "exists" value provided.
12805679c59fSAlex Elder  *
12815679c59fSAlex Elder  * Note that for our purposes once an object exists it never goes
12825679c59fSAlex Elder  * away again.  It's possible that the response from two existence
12835679c59fSAlex Elder  * checks are separated by the creation of the target object, and
12845679c59fSAlex Elder  * the first ("doesn't exist") response arrives *after* the second
12855679c59fSAlex Elder  * ("does exist").  In that case we ignore the second one.
12865679c59fSAlex Elder  */
12875679c59fSAlex Elder static void obj_request_existence_set(struct rbd_obj_request *obj_request,
12885679c59fSAlex Elder 				bool exists)
12895679c59fSAlex Elder {
12905679c59fSAlex Elder 	if (exists)
12915679c59fSAlex Elder 		set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
12925679c59fSAlex Elder 	set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
12935679c59fSAlex Elder 	smp_mb();
12945679c59fSAlex Elder }
12955679c59fSAlex Elder 
12965679c59fSAlex Elder static bool obj_request_known_test(struct rbd_obj_request *obj_request)
12975679c59fSAlex Elder {
12985679c59fSAlex Elder 	smp_mb();
12995679c59fSAlex Elder 	return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
13005679c59fSAlex Elder }
13015679c59fSAlex Elder 
13025679c59fSAlex Elder static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
13035679c59fSAlex Elder {
13045679c59fSAlex Elder 	smp_mb();
13055679c59fSAlex Elder 	return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
13065679c59fSAlex Elder }
13075679c59fSAlex Elder 
1308bf0d5f50SAlex Elder static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1309bf0d5f50SAlex Elder {
131037206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
131137206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1312bf0d5f50SAlex Elder 	kref_get(&obj_request->kref);
1313bf0d5f50SAlex Elder }
1314bf0d5f50SAlex Elder 
1315bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref);
1316bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1317bf0d5f50SAlex Elder {
1318bf0d5f50SAlex Elder 	rbd_assert(obj_request != NULL);
131937206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
132037206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1321bf0d5f50SAlex Elder 	kref_put(&obj_request->kref, rbd_obj_request_destroy);
1322bf0d5f50SAlex Elder }
1323bf0d5f50SAlex Elder 
1324bf0d5f50SAlex Elder static void rbd_img_request_get(struct rbd_img_request *img_request)
1325bf0d5f50SAlex Elder {
132637206ee5SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
132737206ee5SAlex Elder 		atomic_read(&img_request->kref.refcount));
1328bf0d5f50SAlex Elder 	kref_get(&img_request->kref);
1329bf0d5f50SAlex Elder }
1330bf0d5f50SAlex Elder 
1331bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref);
1332bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request)
1333bf0d5f50SAlex Elder {
1334bf0d5f50SAlex Elder 	rbd_assert(img_request != NULL);
133537206ee5SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
133637206ee5SAlex Elder 		atomic_read(&img_request->kref.refcount));
1337bf0d5f50SAlex Elder 	kref_put(&img_request->kref, rbd_img_request_destroy);
1338bf0d5f50SAlex Elder }
1339bf0d5f50SAlex Elder 
1340bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1341bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1342bf0d5f50SAlex Elder {
134325dcf954SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
134425dcf954SAlex Elder 
1345b155e86cSAlex Elder 	/* Image request now owns object's original reference */
1346bf0d5f50SAlex Elder 	obj_request->img_request = img_request;
134725dcf954SAlex Elder 	obj_request->which = img_request->obj_request_count;
13486365d33aSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
13496365d33aSAlex Elder 	obj_request_img_data_set(obj_request);
1350bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
135125dcf954SAlex Elder 	img_request->obj_request_count++;
135225dcf954SAlex Elder 	list_add_tail(&obj_request->links, &img_request->obj_requests);
135337206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
135437206ee5SAlex Elder 		obj_request->which);
1355bf0d5f50SAlex Elder }
1356bf0d5f50SAlex Elder 
1357bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1358bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1359bf0d5f50SAlex Elder {
1360bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
136125dcf954SAlex Elder 
136237206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
136337206ee5SAlex Elder 		obj_request->which);
1364bf0d5f50SAlex Elder 	list_del(&obj_request->links);
136525dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
136625dcf954SAlex Elder 	img_request->obj_request_count--;
136725dcf954SAlex Elder 	rbd_assert(obj_request->which == img_request->obj_request_count);
136825dcf954SAlex Elder 	obj_request->which = BAD_WHICH;
13696365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
1370bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == img_request);
1371bf0d5f50SAlex Elder 	obj_request->img_request = NULL;
137225dcf954SAlex Elder 	obj_request->callback = NULL;
1373bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
1374bf0d5f50SAlex Elder }
1375bf0d5f50SAlex Elder 
1376bf0d5f50SAlex Elder static bool obj_request_type_valid(enum obj_request_type type)
1377bf0d5f50SAlex Elder {
1378bf0d5f50SAlex Elder 	switch (type) {
13799969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
1380bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1381788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1382bf0d5f50SAlex Elder 		return true;
1383bf0d5f50SAlex Elder 	default:
1384bf0d5f50SAlex Elder 		return false;
1385bf0d5f50SAlex Elder 	}
1386bf0d5f50SAlex Elder }
1387bf0d5f50SAlex Elder 
1388bf0d5f50SAlex Elder static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1389bf0d5f50SAlex Elder 				struct rbd_obj_request *obj_request)
1390bf0d5f50SAlex Elder {
139137206ee5SAlex Elder 	dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);
139237206ee5SAlex Elder 
1393bf0d5f50SAlex Elder 	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1394bf0d5f50SAlex Elder }
1395bf0d5f50SAlex Elder 
1396bf0d5f50SAlex Elder static void rbd_img_request_complete(struct rbd_img_request *img_request)
1397bf0d5f50SAlex Elder {
139855f27e09SAlex Elder 
139937206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
140055f27e09SAlex Elder 
140155f27e09SAlex Elder 	/*
140255f27e09SAlex Elder 	 * If no error occurred, compute the aggregate transfer
140355f27e09SAlex Elder 	 * count for the image request.  We could instead use
140455f27e09SAlex Elder 	 * atomic64_cmpxchg() to update it as each object request
140555f27e09SAlex Elder 	 * completes; not clear which way is better off hand.
140655f27e09SAlex Elder 	 */
140755f27e09SAlex Elder 	if (!img_request->result) {
140855f27e09SAlex Elder 		struct rbd_obj_request *obj_request;
140955f27e09SAlex Elder 		u64 xferred = 0;
141055f27e09SAlex Elder 
141155f27e09SAlex Elder 		for_each_obj_request(img_request, obj_request)
141255f27e09SAlex Elder 			xferred += obj_request->xferred;
141355f27e09SAlex Elder 		img_request->xferred = xferred;
141455f27e09SAlex Elder 	}
141555f27e09SAlex Elder 
1416bf0d5f50SAlex Elder 	if (img_request->callback)
1417bf0d5f50SAlex Elder 		img_request->callback(img_request);
1418bf0d5f50SAlex Elder 	else
1419bf0d5f50SAlex Elder 		rbd_img_request_put(img_request);
1420bf0d5f50SAlex Elder }
1421bf0d5f50SAlex Elder 
1422788e2df3SAlex Elder /* Caller is responsible for rbd_obj_request_destroy(obj_request) */
1423788e2df3SAlex Elder 
1424788e2df3SAlex Elder static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
1425788e2df3SAlex Elder {
142637206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
142737206ee5SAlex Elder 
1428788e2df3SAlex Elder 	return wait_for_completion_interruptible(&obj_request->completion);
1429788e2df3SAlex Elder }
1430788e2df3SAlex Elder 
14310c425248SAlex Elder /*
14320c425248SAlex Elder  * The default/initial value for all image request flags is 0.  Each
14330c425248SAlex Elder  * is conditionally set to 1 at image request initialization time
14340c425248SAlex Elder  * and currently never change thereafter.
14350c425248SAlex Elder  */
14360c425248SAlex Elder static void img_request_write_set(struct rbd_img_request *img_request)
14370c425248SAlex Elder {
14380c425248SAlex Elder 	set_bit(IMG_REQ_WRITE, &img_request->flags);
14390c425248SAlex Elder 	smp_mb();
14400c425248SAlex Elder }
14410c425248SAlex Elder 
14420c425248SAlex Elder static bool img_request_write_test(struct rbd_img_request *img_request)
14430c425248SAlex Elder {
14440c425248SAlex Elder 	smp_mb();
14450c425248SAlex Elder 	return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
14460c425248SAlex Elder }
14470c425248SAlex Elder 
14489849e986SAlex Elder static void img_request_child_set(struct rbd_img_request *img_request)
14499849e986SAlex Elder {
14509849e986SAlex Elder 	set_bit(IMG_REQ_CHILD, &img_request->flags);
14519849e986SAlex Elder 	smp_mb();
14529849e986SAlex Elder }
14539849e986SAlex Elder 
14549849e986SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request)
14559849e986SAlex Elder {
14569849e986SAlex Elder 	smp_mb();
14579849e986SAlex Elder 	return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
14589849e986SAlex Elder }
14599849e986SAlex Elder 
1460d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request)
1461d0b2e944SAlex Elder {
1462d0b2e944SAlex Elder 	set_bit(IMG_REQ_LAYERED, &img_request->flags);
1463d0b2e944SAlex Elder 	smp_mb();
1464d0b2e944SAlex Elder }
1465d0b2e944SAlex Elder 
1466d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request)
1467d0b2e944SAlex Elder {
1468d0b2e944SAlex Elder 	smp_mb();
1469d0b2e944SAlex Elder 	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1470d0b2e944SAlex Elder }
1471d0b2e944SAlex Elder 
14726e2a4505SAlex Elder static void
14736e2a4505SAlex Elder rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
14746e2a4505SAlex Elder {
1475b9434c5bSAlex Elder 	u64 xferred = obj_request->xferred;
1476b9434c5bSAlex Elder 	u64 length = obj_request->length;
1477b9434c5bSAlex Elder 
14786e2a4505SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
14796e2a4505SAlex Elder 		obj_request, obj_request->img_request, obj_request->result,
1480b9434c5bSAlex Elder 		xferred, length);
14816e2a4505SAlex Elder 	/*
14826e2a4505SAlex Elder 	 * ENOENT means a hole in the image.  We zero-fill the
14836e2a4505SAlex Elder 	 * entire length of the request.  A short read also implies
14846e2a4505SAlex Elder 	 * zero-fill to the end of the request.  Either way we
14856e2a4505SAlex Elder 	 * update the xferred count to indicate the whole request
14866e2a4505SAlex Elder 	 * was satisfied.
14876e2a4505SAlex Elder 	 */
1488b9434c5bSAlex Elder 	rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
14896e2a4505SAlex Elder 	if (obj_request->result == -ENOENT) {
1490b9434c5bSAlex Elder 		if (obj_request->type == OBJ_REQUEST_BIO)
14916e2a4505SAlex Elder 			zero_bio_chain(obj_request->bio_list, 0);
1492b9434c5bSAlex Elder 		else
1493b9434c5bSAlex Elder 			zero_pages(obj_request->pages, 0, length);
14946e2a4505SAlex Elder 		obj_request->result = 0;
1495b9434c5bSAlex Elder 		obj_request->xferred = length;
1496b9434c5bSAlex Elder 	} else if (xferred < length && !obj_request->result) {
1497b9434c5bSAlex Elder 		if (obj_request->type == OBJ_REQUEST_BIO)
1498b9434c5bSAlex Elder 			zero_bio_chain(obj_request->bio_list, xferred);
1499b9434c5bSAlex Elder 		else
1500b9434c5bSAlex Elder 			zero_pages(obj_request->pages, xferred, length);
1501b9434c5bSAlex Elder 		obj_request->xferred = length;
15026e2a4505SAlex Elder 	}
15036e2a4505SAlex Elder 	obj_request_done_set(obj_request);
15046e2a4505SAlex Elder }
15056e2a4505SAlex Elder 
1506bf0d5f50SAlex Elder static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1507bf0d5f50SAlex Elder {
150837206ee5SAlex Elder 	dout("%s: obj %p cb %p\n", __func__, obj_request,
150937206ee5SAlex Elder 		obj_request->callback);
1510bf0d5f50SAlex Elder 	if (obj_request->callback)
1511bf0d5f50SAlex Elder 		obj_request->callback(obj_request);
1512788e2df3SAlex Elder 	else
1513788e2df3SAlex Elder 		complete_all(&obj_request->completion);
1514bf0d5f50SAlex Elder }
1515bf0d5f50SAlex Elder 
/*
 * Completion handling for osd ops that need no post-processing:
 * the object request is simply marked done.
 */
static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
152139bf2c5dSAlex Elder 
1522c47f9371SAlex Elder static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
1523bf0d5f50SAlex Elder {
152457acbaa7SAlex Elder 	struct rbd_img_request *img_request = NULL;
1525a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev = NULL;
152657acbaa7SAlex Elder 	bool layered = false;
152757acbaa7SAlex Elder 
152857acbaa7SAlex Elder 	if (obj_request_img_data_test(obj_request)) {
152957acbaa7SAlex Elder 		img_request = obj_request->img_request;
153057acbaa7SAlex Elder 		layered = img_request && img_request_layered_test(img_request);
1531a9e8ba2cSAlex Elder 		rbd_dev = img_request->rbd_dev;
153257acbaa7SAlex Elder 	}
15338b3e1a56SAlex Elder 
15348b3e1a56SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
15358b3e1a56SAlex Elder 		obj_request, img_request, obj_request->result,
15368b3e1a56SAlex Elder 		obj_request->xferred, obj_request->length);
1537a9e8ba2cSAlex Elder 	if (layered && obj_request->result == -ENOENT &&
1538a9e8ba2cSAlex Elder 			obj_request->img_offset < rbd_dev->parent_overlap)
15398b3e1a56SAlex Elder 		rbd_img_parent_read(obj_request);
15408b3e1a56SAlex Elder 	else if (img_request)
15416e2a4505SAlex Elder 		rbd_img_obj_request_read_callback(obj_request);
15426e2a4505SAlex Elder 	else
154307741308SAlex Elder 		obj_request_done_set(obj_request);
1544bf0d5f50SAlex Elder }
1545bf0d5f50SAlex Elder 
1546c47f9371SAlex Elder static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
1547bf0d5f50SAlex Elder {
15481b83bef2SSage Weil 	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
15491b83bef2SSage Weil 		obj_request->result, obj_request->length);
15501b83bef2SSage Weil 	/*
15518b3e1a56SAlex Elder 	 * There is no such thing as a successful short write.  Set
15528b3e1a56SAlex Elder 	 * it to our originally-requested length.
15531b83bef2SSage Weil 	 */
15541b83bef2SSage Weil 	obj_request->xferred = obj_request->length;
155507741308SAlex Elder 	obj_request_done_set(obj_request);
1556bf0d5f50SAlex Elder }
1557bf0d5f50SAlex Elder 
/*
 * Completion handling for an osd stat.  A plain stat needs nothing
 * beyond marking the request done; more is done elsewhere when the
 * stat is part of a write sequence for a layered image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
1567fbfab539SAlex Elder 
1568bf0d5f50SAlex Elder static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1569bf0d5f50SAlex Elder 				struct ceph_msg *msg)
1570bf0d5f50SAlex Elder {
1571bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = osd_req->r_priv;
1572bf0d5f50SAlex Elder 	u16 opcode;
1573bf0d5f50SAlex Elder 
157437206ee5SAlex Elder 	dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
1575bf0d5f50SAlex Elder 	rbd_assert(osd_req == obj_request->osd_req);
157657acbaa7SAlex Elder 	if (obj_request_img_data_test(obj_request)) {
157757acbaa7SAlex Elder 		rbd_assert(obj_request->img_request);
157857acbaa7SAlex Elder 		rbd_assert(obj_request->which != BAD_WHICH);
157957acbaa7SAlex Elder 	} else {
158057acbaa7SAlex Elder 		rbd_assert(obj_request->which == BAD_WHICH);
158157acbaa7SAlex Elder 	}
1582bf0d5f50SAlex Elder 
15831b83bef2SSage Weil 	if (osd_req->r_result < 0)
15841b83bef2SSage Weil 		obj_request->result = osd_req->r_result;
1585bf0d5f50SAlex Elder 
15860eefd470SAlex Elder 	BUG_ON(osd_req->r_num_ops > 2);
1587bf0d5f50SAlex Elder 
1588c47f9371SAlex Elder 	/*
1589c47f9371SAlex Elder 	 * We support a 64-bit length, but ultimately it has to be
1590c47f9371SAlex Elder 	 * passed to blk_end_request(), which takes an unsigned int.
1591c47f9371SAlex Elder 	 */
15921b83bef2SSage Weil 	obj_request->xferred = osd_req->r_reply_op_len[0];
1593c47f9371SAlex Elder 	rbd_assert(obj_request->xferred < (u64)UINT_MAX);
159479528734SAlex Elder 	opcode = osd_req->r_ops[0].op;
1595bf0d5f50SAlex Elder 	switch (opcode) {
1596bf0d5f50SAlex Elder 	case CEPH_OSD_OP_READ:
1597c47f9371SAlex Elder 		rbd_osd_read_callback(obj_request);
1598bf0d5f50SAlex Elder 		break;
1599bf0d5f50SAlex Elder 	case CEPH_OSD_OP_WRITE:
1600c47f9371SAlex Elder 		rbd_osd_write_callback(obj_request);
1601bf0d5f50SAlex Elder 		break;
1602fbfab539SAlex Elder 	case CEPH_OSD_OP_STAT:
1603c47f9371SAlex Elder 		rbd_osd_stat_callback(obj_request);
1604fbfab539SAlex Elder 		break;
160536be9a76SAlex Elder 	case CEPH_OSD_OP_CALL:
1606b8d70035SAlex Elder 	case CEPH_OSD_OP_NOTIFY_ACK:
16079969ebc5SAlex Elder 	case CEPH_OSD_OP_WATCH:
1608c47f9371SAlex Elder 		rbd_osd_trivial_callback(obj_request);
16099969ebc5SAlex Elder 		break;
1610bf0d5f50SAlex Elder 	default:
1611bf0d5f50SAlex Elder 		rbd_warn(NULL, "%s: unsupported op %hu\n",
1612bf0d5f50SAlex Elder 			obj_request->object_name, (unsigned short) opcode);
1613bf0d5f50SAlex Elder 		break;
1614bf0d5f50SAlex Elder 	}
1615bf0d5f50SAlex Elder 
161607741308SAlex Elder 	if (obj_request_done_test(obj_request))
1617bf0d5f50SAlex Elder 		rbd_obj_request_complete(obj_request);
1618bf0d5f50SAlex Elder }
1619bf0d5f50SAlex Elder 
16209d4df01fSAlex Elder static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
1621430c28c3SAlex Elder {
1622430c28c3SAlex Elder 	struct rbd_img_request *img_request = obj_request->img_request;
16238c042b0dSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
16249d4df01fSAlex Elder 	u64 snap_id;
1625430c28c3SAlex Elder 
16268c042b0dSAlex Elder 	rbd_assert(osd_req != NULL);
1627430c28c3SAlex Elder 
16289d4df01fSAlex Elder 	snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
16298c042b0dSAlex Elder 	ceph_osdc_build_request(osd_req, obj_request->offset,
16309d4df01fSAlex Elder 			NULL, snap_id, NULL);
16319d4df01fSAlex Elder }
16329d4df01fSAlex Elder 
16339d4df01fSAlex Elder static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
16349d4df01fSAlex Elder {
16359d4df01fSAlex Elder 	struct rbd_img_request *img_request = obj_request->img_request;
16369d4df01fSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
16379d4df01fSAlex Elder 	struct ceph_snap_context *snapc;
16389d4df01fSAlex Elder 	struct timespec mtime = CURRENT_TIME;
16399d4df01fSAlex Elder 
16409d4df01fSAlex Elder 	rbd_assert(osd_req != NULL);
16419d4df01fSAlex Elder 
16429d4df01fSAlex Elder 	snapc = img_request ? img_request->snapc : NULL;
16439d4df01fSAlex Elder 	ceph_osdc_build_request(osd_req, obj_request->offset,
16449d4df01fSAlex Elder 			snapc, CEPH_NOSNAP, &mtime);
1645430c28c3SAlex Elder }
1646430c28c3SAlex Elder 
1647bf0d5f50SAlex Elder static struct ceph_osd_request *rbd_osd_req_create(
1648bf0d5f50SAlex Elder 					struct rbd_device *rbd_dev,
1649bf0d5f50SAlex Elder 					bool write_request,
1650430c28c3SAlex Elder 					struct rbd_obj_request *obj_request)
1651bf0d5f50SAlex Elder {
1652bf0d5f50SAlex Elder 	struct ceph_snap_context *snapc = NULL;
1653bf0d5f50SAlex Elder 	struct ceph_osd_client *osdc;
1654bf0d5f50SAlex Elder 	struct ceph_osd_request *osd_req;
1655bf0d5f50SAlex Elder 
16566365d33aSAlex Elder 	if (obj_request_img_data_test(obj_request)) {
16576365d33aSAlex Elder 		struct rbd_img_request *img_request = obj_request->img_request;
16586365d33aSAlex Elder 
16590c425248SAlex Elder 		rbd_assert(write_request ==
16600c425248SAlex Elder 				img_request_write_test(img_request));
16610c425248SAlex Elder 		if (write_request)
1662bf0d5f50SAlex Elder 			snapc = img_request->snapc;
1663bf0d5f50SAlex Elder 	}
1664bf0d5f50SAlex Elder 
1665bf0d5f50SAlex Elder 	/* Allocate and initialize the request, for the single op */
1666bf0d5f50SAlex Elder 
1667bf0d5f50SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
1668bf0d5f50SAlex Elder 	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
1669bf0d5f50SAlex Elder 	if (!osd_req)
1670bf0d5f50SAlex Elder 		return NULL;	/* ENOMEM */
1671bf0d5f50SAlex Elder 
1672430c28c3SAlex Elder 	if (write_request)
1673bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1674430c28c3SAlex Elder 	else
1675bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_READ;
1676bf0d5f50SAlex Elder 
1677bf0d5f50SAlex Elder 	osd_req->r_callback = rbd_osd_req_callback;
1678bf0d5f50SAlex Elder 	osd_req->r_priv = obj_request;
1679bf0d5f50SAlex Elder 
1680bf0d5f50SAlex Elder 	osd_req->r_oid_len = strlen(obj_request->object_name);
1681bf0d5f50SAlex Elder 	rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1682bf0d5f50SAlex Elder 	memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1683bf0d5f50SAlex Elder 
1684bf0d5f50SAlex Elder 	osd_req->r_file_layout = rbd_dev->layout;	/* struct */
1685bf0d5f50SAlex Elder 
1686bf0d5f50SAlex Elder 	return osd_req;
1687bf0d5f50SAlex Elder }
1688bf0d5f50SAlex Elder 
16890eefd470SAlex Elder /*
16900eefd470SAlex Elder  * Create a copyup osd request based on the information in the
16910eefd470SAlex Elder  * object request supplied.  A copyup request has two osd ops,
16920eefd470SAlex Elder  * a copyup method call, and a "normal" write request.
16930eefd470SAlex Elder  */
16940eefd470SAlex Elder static struct ceph_osd_request *
16950eefd470SAlex Elder rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
16960eefd470SAlex Elder {
16970eefd470SAlex Elder 	struct rbd_img_request *img_request;
16980eefd470SAlex Elder 	struct ceph_snap_context *snapc;
16990eefd470SAlex Elder 	struct rbd_device *rbd_dev;
17000eefd470SAlex Elder 	struct ceph_osd_client *osdc;
17010eefd470SAlex Elder 	struct ceph_osd_request *osd_req;
17020eefd470SAlex Elder 
17030eefd470SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
17040eefd470SAlex Elder 	img_request = obj_request->img_request;
17050eefd470SAlex Elder 	rbd_assert(img_request);
17060eefd470SAlex Elder 	rbd_assert(img_request_write_test(img_request));
17070eefd470SAlex Elder 
17080eefd470SAlex Elder 	/* Allocate and initialize the request, for the two ops */
17090eefd470SAlex Elder 
17100eefd470SAlex Elder 	snapc = img_request->snapc;
17110eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
17120eefd470SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
17130eefd470SAlex Elder 	osd_req = ceph_osdc_alloc_request(osdc, snapc, 2, false, GFP_ATOMIC);
17140eefd470SAlex Elder 	if (!osd_req)
17150eefd470SAlex Elder 		return NULL;	/* ENOMEM */
17160eefd470SAlex Elder 
17170eefd470SAlex Elder 	osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
17180eefd470SAlex Elder 	osd_req->r_callback = rbd_osd_req_callback;
17190eefd470SAlex Elder 	osd_req->r_priv = obj_request;
17200eefd470SAlex Elder 
17210eefd470SAlex Elder 	osd_req->r_oid_len = strlen(obj_request->object_name);
17220eefd470SAlex Elder 	rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
17230eefd470SAlex Elder 	memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
17240eefd470SAlex Elder 
17250eefd470SAlex Elder 	osd_req->r_file_layout = rbd_dev->layout;	/* struct */
17260eefd470SAlex Elder 
17270eefd470SAlex Elder 	return osd_req;
17280eefd470SAlex Elder }
17290eefd470SAlex Elder 
17300eefd470SAlex Elder 
/* Drop a reference on an osd request via ceph_osdc_put_request(). */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
1735bf0d5f50SAlex Elder 
1736bf0d5f50SAlex Elder /* object_name is assumed to be a non-null pointer and NUL-terminated */
1737bf0d5f50SAlex Elder 
1738bf0d5f50SAlex Elder static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1739bf0d5f50SAlex Elder 						u64 offset, u64 length,
1740bf0d5f50SAlex Elder 						enum obj_request_type type)
1741bf0d5f50SAlex Elder {
1742bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1743bf0d5f50SAlex Elder 	size_t size;
1744bf0d5f50SAlex Elder 	char *name;
1745bf0d5f50SAlex Elder 
1746bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(type));
1747bf0d5f50SAlex Elder 
1748bf0d5f50SAlex Elder 	size = strlen(object_name) + 1;
1749f907ad55SAlex Elder 	name = kmalloc(size, GFP_KERNEL);
1750f907ad55SAlex Elder 	if (!name)
1751bf0d5f50SAlex Elder 		return NULL;
1752bf0d5f50SAlex Elder 
1753868311b1SAlex Elder 	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_KERNEL);
1754f907ad55SAlex Elder 	if (!obj_request) {
1755f907ad55SAlex Elder 		kfree(name);
1756f907ad55SAlex Elder 		return NULL;
1757f907ad55SAlex Elder 	}
1758f907ad55SAlex Elder 
1759bf0d5f50SAlex Elder 	obj_request->object_name = memcpy(name, object_name, size);
1760bf0d5f50SAlex Elder 	obj_request->offset = offset;
1761bf0d5f50SAlex Elder 	obj_request->length = length;
1762926f9b3fSAlex Elder 	obj_request->flags = 0;
1763bf0d5f50SAlex Elder 	obj_request->which = BAD_WHICH;
1764bf0d5f50SAlex Elder 	obj_request->type = type;
1765bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&obj_request->links);
1766788e2df3SAlex Elder 	init_completion(&obj_request->completion);
1767bf0d5f50SAlex Elder 	kref_init(&obj_request->kref);
1768bf0d5f50SAlex Elder 
176937206ee5SAlex Elder 	dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
177037206ee5SAlex Elder 		offset, length, (int)type, obj_request);
177137206ee5SAlex Elder 
1772bf0d5f50SAlex Elder 	return obj_request;
1773bf0d5f50SAlex Elder }
1774bf0d5f50SAlex Elder 
1775bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref)
1776bf0d5f50SAlex Elder {
1777bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1778bf0d5f50SAlex Elder 
1779bf0d5f50SAlex Elder 	obj_request = container_of(kref, struct rbd_obj_request, kref);
1780bf0d5f50SAlex Elder 
178137206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
178237206ee5SAlex Elder 
1783bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
1784bf0d5f50SAlex Elder 	rbd_assert(obj_request->which == BAD_WHICH);
1785bf0d5f50SAlex Elder 
1786bf0d5f50SAlex Elder 	if (obj_request->osd_req)
1787bf0d5f50SAlex Elder 		rbd_osd_req_destroy(obj_request->osd_req);
1788bf0d5f50SAlex Elder 
1789bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
1790bf0d5f50SAlex Elder 	switch (obj_request->type) {
17919969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
17929969ebc5SAlex Elder 		break;		/* Nothing to do */
1793bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1794bf0d5f50SAlex Elder 		if (obj_request->bio_list)
1795bf0d5f50SAlex Elder 			bio_chain_put(obj_request->bio_list);
1796bf0d5f50SAlex Elder 		break;
1797788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1798788e2df3SAlex Elder 		if (obj_request->pages)
1799788e2df3SAlex Elder 			ceph_release_page_vector(obj_request->pages,
1800788e2df3SAlex Elder 						obj_request->page_count);
1801788e2df3SAlex Elder 		break;
1802bf0d5f50SAlex Elder 	}
1803bf0d5f50SAlex Elder 
1804f907ad55SAlex Elder 	kfree(obj_request->object_name);
1805868311b1SAlex Elder 	obj_request->object_name = NULL;
1806868311b1SAlex Elder 	kmem_cache_free(rbd_obj_request_cache, obj_request);
1807bf0d5f50SAlex Elder }
1808bf0d5f50SAlex Elder 
1809bf0d5f50SAlex Elder /*
1810bf0d5f50SAlex Elder  * Caller is responsible for filling in the list of object requests
1811bf0d5f50SAlex Elder  * that comprises the image request, and the Linux request pointer
1812bf0d5f50SAlex Elder  * (if there is one).
1813bf0d5f50SAlex Elder  */
1814cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create(
1815cc344fa1SAlex Elder 					struct rbd_device *rbd_dev,
1816bf0d5f50SAlex Elder 					u64 offset, u64 length,
18179849e986SAlex Elder 					bool write_request,
18189849e986SAlex Elder 					bool child_request)
1819bf0d5f50SAlex Elder {
1820bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
1821bf0d5f50SAlex Elder 
18221c2a9dfeSAlex Elder 	img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_ATOMIC);
1823bf0d5f50SAlex Elder 	if (!img_request)
1824bf0d5f50SAlex Elder 		return NULL;
1825bf0d5f50SAlex Elder 
1826bf0d5f50SAlex Elder 	if (write_request) {
1827bf0d5f50SAlex Elder 		down_read(&rbd_dev->header_rwsem);
1828812164f8SAlex Elder 		ceph_get_snap_context(rbd_dev->header.snapc);
1829bf0d5f50SAlex Elder 		up_read(&rbd_dev->header_rwsem);
1830bf0d5f50SAlex Elder 	}
1831bf0d5f50SAlex Elder 
1832bf0d5f50SAlex Elder 	img_request->rq = NULL;
1833bf0d5f50SAlex Elder 	img_request->rbd_dev = rbd_dev;
1834bf0d5f50SAlex Elder 	img_request->offset = offset;
1835bf0d5f50SAlex Elder 	img_request->length = length;
18360c425248SAlex Elder 	img_request->flags = 0;
18370c425248SAlex Elder 	if (write_request) {
18380c425248SAlex Elder 		img_request_write_set(img_request);
1839468521c1SAlex Elder 		img_request->snapc = rbd_dev->header.snapc;
18400c425248SAlex Elder 	} else {
1841bf0d5f50SAlex Elder 		img_request->snap_id = rbd_dev->spec->snap_id;
18420c425248SAlex Elder 	}
18439849e986SAlex Elder 	if (child_request)
18449849e986SAlex Elder 		img_request_child_set(img_request);
1845d0b2e944SAlex Elder 	if (rbd_dev->parent_spec)
1846d0b2e944SAlex Elder 		img_request_layered_set(img_request);
1847bf0d5f50SAlex Elder 	spin_lock_init(&img_request->completion_lock);
1848bf0d5f50SAlex Elder 	img_request->next_completion = 0;
1849bf0d5f50SAlex Elder 	img_request->callback = NULL;
1850a5a337d4SAlex Elder 	img_request->result = 0;
1851bf0d5f50SAlex Elder 	img_request->obj_request_count = 0;
1852bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&img_request->obj_requests);
1853bf0d5f50SAlex Elder 	kref_init(&img_request->kref);
1854bf0d5f50SAlex Elder 
1855bf0d5f50SAlex Elder 	rbd_img_request_get(img_request);	/* Avoid a warning */
1856bf0d5f50SAlex Elder 	rbd_img_request_put(img_request);	/* TEMPORARY */
1857bf0d5f50SAlex Elder 
185837206ee5SAlex Elder 	dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
185937206ee5SAlex Elder 		write_request ? "write" : "read", offset, length,
186037206ee5SAlex Elder 		img_request);
186137206ee5SAlex Elder 
1862bf0d5f50SAlex Elder 	return img_request;
1863bf0d5f50SAlex Elder }
1864bf0d5f50SAlex Elder 
1865bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref)
1866bf0d5f50SAlex Elder {
1867bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
1868bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1869bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
1870bf0d5f50SAlex Elder 
1871bf0d5f50SAlex Elder 	img_request = container_of(kref, struct rbd_img_request, kref);
1872bf0d5f50SAlex Elder 
187337206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
187437206ee5SAlex Elder 
1875bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1876bf0d5f50SAlex Elder 		rbd_img_obj_request_del(img_request, obj_request);
187725dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count == 0);
1878bf0d5f50SAlex Elder 
18790c425248SAlex Elder 	if (img_request_write_test(img_request))
1880812164f8SAlex Elder 		ceph_put_snap_context(img_request->snapc);
1881bf0d5f50SAlex Elder 
18828b3e1a56SAlex Elder 	if (img_request_child_test(img_request))
18838b3e1a56SAlex Elder 		rbd_obj_request_put(img_request->obj_request);
18848b3e1a56SAlex Elder 
18851c2a9dfeSAlex Elder 	kmem_cache_free(rbd_img_request_cache, img_request);
1886bf0d5f50SAlex Elder }
1887bf0d5f50SAlex Elder 
18881217857fSAlex Elder static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
18891217857fSAlex Elder {
18906365d33aSAlex Elder 	struct rbd_img_request *img_request;
18911217857fSAlex Elder 	unsigned int xferred;
18921217857fSAlex Elder 	int result;
18938b3e1a56SAlex Elder 	bool more;
18941217857fSAlex Elder 
18956365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
18966365d33aSAlex Elder 	img_request = obj_request->img_request;
18976365d33aSAlex Elder 
18981217857fSAlex Elder 	rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
18991217857fSAlex Elder 	xferred = (unsigned int)obj_request->xferred;
19001217857fSAlex Elder 	result = obj_request->result;
19011217857fSAlex Elder 	if (result) {
19021217857fSAlex Elder 		struct rbd_device *rbd_dev = img_request->rbd_dev;
19031217857fSAlex Elder 
19041217857fSAlex Elder 		rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n",
19051217857fSAlex Elder 			img_request_write_test(img_request) ? "write" : "read",
19061217857fSAlex Elder 			obj_request->length, obj_request->img_offset,
19071217857fSAlex Elder 			obj_request->offset);
19081217857fSAlex Elder 		rbd_warn(rbd_dev, "  result %d xferred %x\n",
19091217857fSAlex Elder 			result, xferred);
19101217857fSAlex Elder 		if (!img_request->result)
19111217857fSAlex Elder 			img_request->result = result;
19121217857fSAlex Elder 	}
19131217857fSAlex Elder 
1914f1a4739fSAlex Elder 	/* Image object requests don't own their page array */
1915f1a4739fSAlex Elder 
1916f1a4739fSAlex Elder 	if (obj_request->type == OBJ_REQUEST_PAGES) {
1917f1a4739fSAlex Elder 		obj_request->pages = NULL;
1918f1a4739fSAlex Elder 		obj_request->page_count = 0;
1919f1a4739fSAlex Elder 	}
1920f1a4739fSAlex Elder 
19218b3e1a56SAlex Elder 	if (img_request_child_test(img_request)) {
19228b3e1a56SAlex Elder 		rbd_assert(img_request->obj_request != NULL);
19238b3e1a56SAlex Elder 		more = obj_request->which < img_request->obj_request_count - 1;
19248b3e1a56SAlex Elder 	} else {
19258b3e1a56SAlex Elder 		rbd_assert(img_request->rq != NULL);
19268b3e1a56SAlex Elder 		more = blk_end_request(img_request->rq, result, xferred);
19278b3e1a56SAlex Elder 	}
19288b3e1a56SAlex Elder 
19298b3e1a56SAlex Elder 	return more;
19301217857fSAlex Elder }
19311217857fSAlex Elder 
19322169238dSAlex Elder static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
19332169238dSAlex Elder {
19342169238dSAlex Elder 	struct rbd_img_request *img_request;
19352169238dSAlex Elder 	u32 which = obj_request->which;
19362169238dSAlex Elder 	bool more = true;
19372169238dSAlex Elder 
19386365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
19392169238dSAlex Elder 	img_request = obj_request->img_request;
19402169238dSAlex Elder 
19412169238dSAlex Elder 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
19422169238dSAlex Elder 	rbd_assert(img_request != NULL);
19432169238dSAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
19442169238dSAlex Elder 	rbd_assert(which != BAD_WHICH);
19452169238dSAlex Elder 	rbd_assert(which < img_request->obj_request_count);
19462169238dSAlex Elder 	rbd_assert(which >= img_request->next_completion);
19472169238dSAlex Elder 
19482169238dSAlex Elder 	spin_lock_irq(&img_request->completion_lock);
19492169238dSAlex Elder 	if (which != img_request->next_completion)
19502169238dSAlex Elder 		goto out;
19512169238dSAlex Elder 
19522169238dSAlex Elder 	for_each_obj_request_from(img_request, obj_request) {
19532169238dSAlex Elder 		rbd_assert(more);
19542169238dSAlex Elder 		rbd_assert(which < img_request->obj_request_count);
19552169238dSAlex Elder 
19562169238dSAlex Elder 		if (!obj_request_done_test(obj_request))
19572169238dSAlex Elder 			break;
19581217857fSAlex Elder 		more = rbd_img_obj_end_request(obj_request);
19592169238dSAlex Elder 		which++;
19602169238dSAlex Elder 	}
19612169238dSAlex Elder 
19622169238dSAlex Elder 	rbd_assert(more ^ (which == img_request->obj_request_count));
19632169238dSAlex Elder 	img_request->next_completion = which;
19642169238dSAlex Elder out:
19652169238dSAlex Elder 	spin_unlock_irq(&img_request->completion_lock);
19662169238dSAlex Elder 
19672169238dSAlex Elder 	if (!more)
19682169238dSAlex Elder 		rbd_img_request_complete(img_request);
19692169238dSAlex Elder }
19702169238dSAlex Elder 
1971f1a4739fSAlex Elder /*
1972f1a4739fSAlex Elder  * Split up an image request into one or more object requests, each
1973f1a4739fSAlex Elder  * to a different object.  The "type" parameter indicates whether
1974f1a4739fSAlex Elder  * "data_desc" is the pointer to the head of a list of bio
1975f1a4739fSAlex Elder  * structures, or the base of a page array.  In either case this
1976f1a4739fSAlex Elder  * function assumes data_desc describes memory sufficient to hold
1977f1a4739fSAlex Elder  * all data described by the image request.
1978f1a4739fSAlex Elder  */
1979f1a4739fSAlex Elder static int rbd_img_request_fill(struct rbd_img_request *img_request,
1980f1a4739fSAlex Elder 					enum obj_request_type type,
1981f1a4739fSAlex Elder 					void *data_desc)
1982bf0d5f50SAlex Elder {
1983bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = img_request->rbd_dev;
1984bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = NULL;
1985bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
19860c425248SAlex Elder 	bool write_request = img_request_write_test(img_request);
1987f1a4739fSAlex Elder 	struct bio *bio_list;
1988f1a4739fSAlex Elder 	unsigned int bio_offset = 0;
1989f1a4739fSAlex Elder 	struct page **pages;
19907da22d29SAlex Elder 	u64 img_offset;
1991bf0d5f50SAlex Elder 	u64 resid;
1992bf0d5f50SAlex Elder 	u16 opcode;
1993bf0d5f50SAlex Elder 
1994f1a4739fSAlex Elder 	dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
1995f1a4739fSAlex Elder 		(int)type, data_desc);
199637206ee5SAlex Elder 
1997430c28c3SAlex Elder 	opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
19987da22d29SAlex Elder 	img_offset = img_request->offset;
1999bf0d5f50SAlex Elder 	resid = img_request->length;
20004dda41d3SAlex Elder 	rbd_assert(resid > 0);
2001f1a4739fSAlex Elder 
2002f1a4739fSAlex Elder 	if (type == OBJ_REQUEST_BIO) {
2003f1a4739fSAlex Elder 		bio_list = data_desc;
2004f1a4739fSAlex Elder 		rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT);
2005f1a4739fSAlex Elder 	} else {
2006f1a4739fSAlex Elder 		rbd_assert(type == OBJ_REQUEST_PAGES);
2007f1a4739fSAlex Elder 		pages = data_desc;
2008f1a4739fSAlex Elder 	}
2009f1a4739fSAlex Elder 
2010bf0d5f50SAlex Elder 	while (resid) {
20112fa12320SAlex Elder 		struct ceph_osd_request *osd_req;
2012bf0d5f50SAlex Elder 		const char *object_name;
2013bf0d5f50SAlex Elder 		u64 offset;
2014bf0d5f50SAlex Elder 		u64 length;
2015bf0d5f50SAlex Elder 
20167da22d29SAlex Elder 		object_name = rbd_segment_name(rbd_dev, img_offset);
2017bf0d5f50SAlex Elder 		if (!object_name)
2018bf0d5f50SAlex Elder 			goto out_unwind;
20197da22d29SAlex Elder 		offset = rbd_segment_offset(rbd_dev, img_offset);
20207da22d29SAlex Elder 		length = rbd_segment_length(rbd_dev, img_offset, resid);
2021bf0d5f50SAlex Elder 		obj_request = rbd_obj_request_create(object_name,
2022f1a4739fSAlex Elder 						offset, length, type);
202378c2a44aSAlex Elder 		/* object request has its own copy of the object name */
202478c2a44aSAlex Elder 		rbd_segment_name_free(object_name);
2025bf0d5f50SAlex Elder 		if (!obj_request)
2026bf0d5f50SAlex Elder 			goto out_unwind;
2027bf0d5f50SAlex Elder 
2028f1a4739fSAlex Elder 		if (type == OBJ_REQUEST_BIO) {
2029f1a4739fSAlex Elder 			unsigned int clone_size;
2030f1a4739fSAlex Elder 
2031bf0d5f50SAlex Elder 			rbd_assert(length <= (u64)UINT_MAX);
2032bf0d5f50SAlex Elder 			clone_size = (unsigned int)length;
2033f1a4739fSAlex Elder 			obj_request->bio_list =
2034f1a4739fSAlex Elder 					bio_chain_clone_range(&bio_list,
2035f1a4739fSAlex Elder 								&bio_offset,
2036f1a4739fSAlex Elder 								clone_size,
2037bf0d5f50SAlex Elder 								GFP_ATOMIC);
2038bf0d5f50SAlex Elder 			if (!obj_request->bio_list)
2039bf0d5f50SAlex Elder 				goto out_partial;
2040f1a4739fSAlex Elder 		} else {
2041f1a4739fSAlex Elder 			unsigned int page_count;
2042f1a4739fSAlex Elder 
2043f1a4739fSAlex Elder 			obj_request->pages = pages;
2044f1a4739fSAlex Elder 			page_count = (u32)calc_pages_for(offset, length);
2045f1a4739fSAlex Elder 			obj_request->page_count = page_count;
2046f1a4739fSAlex Elder 			if ((offset + length) & ~PAGE_MASK)
2047f1a4739fSAlex Elder 				page_count--;	/* more on last page */
2048f1a4739fSAlex Elder 			pages += page_count;
2049f1a4739fSAlex Elder 		}
2050bf0d5f50SAlex Elder 
20512fa12320SAlex Elder 		osd_req = rbd_osd_req_create(rbd_dev, write_request,
20522fa12320SAlex Elder 						obj_request);
20532fa12320SAlex Elder 		if (!osd_req)
2054bf0d5f50SAlex Elder 			goto out_partial;
20552fa12320SAlex Elder 		obj_request->osd_req = osd_req;
20562169238dSAlex Elder 		obj_request->callback = rbd_img_obj_callback;
2057430c28c3SAlex Elder 
20582fa12320SAlex Elder 		osd_req_op_extent_init(osd_req, 0, opcode, offset, length,
20592fa12320SAlex Elder 						0, 0);
2060f1a4739fSAlex Elder 		if (type == OBJ_REQUEST_BIO)
2061406e2c9fSAlex Elder 			osd_req_op_extent_osd_data_bio(osd_req, 0,
2062f1a4739fSAlex Elder 					obj_request->bio_list, length);
2063f1a4739fSAlex Elder 		else
2064f1a4739fSAlex Elder 			osd_req_op_extent_osd_data_pages(osd_req, 0,
2065f1a4739fSAlex Elder 					obj_request->pages, length,
2066f1a4739fSAlex Elder 					offset & ~PAGE_MASK, false, false);
20679d4df01fSAlex Elder 
20689d4df01fSAlex Elder 		if (write_request)
20699d4df01fSAlex Elder 			rbd_osd_req_format_write(obj_request);
20709d4df01fSAlex Elder 		else
20719d4df01fSAlex Elder 			rbd_osd_req_format_read(obj_request);
2072430c28c3SAlex Elder 
20737da22d29SAlex Elder 		obj_request->img_offset = img_offset;
2074bf0d5f50SAlex Elder 		rbd_img_obj_request_add(img_request, obj_request);
2075bf0d5f50SAlex Elder 
20767da22d29SAlex Elder 		img_offset += length;
2077bf0d5f50SAlex Elder 		resid -= length;
2078bf0d5f50SAlex Elder 	}
2079bf0d5f50SAlex Elder 
2080bf0d5f50SAlex Elder 	return 0;
2081bf0d5f50SAlex Elder 
2082bf0d5f50SAlex Elder out_partial:
2083bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
2084bf0d5f50SAlex Elder out_unwind:
2085bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2086bf0d5f50SAlex Elder 		rbd_obj_request_put(obj_request);
2087bf0d5f50SAlex Elder 
2088bf0d5f50SAlex Elder 	return -ENOMEM;
2089bf0d5f50SAlex Elder }
2090bf0d5f50SAlex Elder 
20913d7efd18SAlex Elder static void
20920eefd470SAlex Elder rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request)
20930eefd470SAlex Elder {
20940eefd470SAlex Elder 	struct rbd_img_request *img_request;
20950eefd470SAlex Elder 	struct rbd_device *rbd_dev;
20960eefd470SAlex Elder 	u64 length;
20970eefd470SAlex Elder 	u32 page_count;
20980eefd470SAlex Elder 
20990eefd470SAlex Elder 	rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
21000eefd470SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
21010eefd470SAlex Elder 	img_request = obj_request->img_request;
21020eefd470SAlex Elder 	rbd_assert(img_request);
21030eefd470SAlex Elder 
21040eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
21050eefd470SAlex Elder 	rbd_assert(rbd_dev);
21060eefd470SAlex Elder 	length = (u64)1 << rbd_dev->header.obj_order;
21070eefd470SAlex Elder 	page_count = (u32)calc_pages_for(0, length);
21080eefd470SAlex Elder 
21090eefd470SAlex Elder 	rbd_assert(obj_request->copyup_pages);
21100eefd470SAlex Elder 	ceph_release_page_vector(obj_request->copyup_pages, page_count);
21110eefd470SAlex Elder 	obj_request->copyup_pages = NULL;
21120eefd470SAlex Elder 
21130eefd470SAlex Elder 	/*
21140eefd470SAlex Elder 	 * We want the transfer count to reflect the size of the
21150eefd470SAlex Elder 	 * original write request.  There is no such thing as a
21160eefd470SAlex Elder 	 * successful short write, so if the request was successful
21170eefd470SAlex Elder 	 * we can just set it to the originally-requested length.
21180eefd470SAlex Elder 	 */
21190eefd470SAlex Elder 	if (!obj_request->result)
21200eefd470SAlex Elder 		obj_request->xferred = obj_request->length;
21210eefd470SAlex Elder 
21220eefd470SAlex Elder 	/* Finish up with the normal image object callback */
21230eefd470SAlex Elder 
21240eefd470SAlex Elder 	rbd_img_obj_callback(obj_request);
21250eefd470SAlex Elder }
21260eefd470SAlex Elder 
21270eefd470SAlex Elder static void
21283d7efd18SAlex Elder rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
21293d7efd18SAlex Elder {
21303d7efd18SAlex Elder 	struct rbd_obj_request *orig_request;
21310eefd470SAlex Elder 	struct ceph_osd_request *osd_req;
21320eefd470SAlex Elder 	struct ceph_osd_client *osdc;
21330eefd470SAlex Elder 	struct rbd_device *rbd_dev;
21343d7efd18SAlex Elder 	struct page **pages;
21353d7efd18SAlex Elder 	int result;
21363d7efd18SAlex Elder 	u64 obj_size;
21373d7efd18SAlex Elder 	u64 xferred;
21383d7efd18SAlex Elder 
21393d7efd18SAlex Elder 	rbd_assert(img_request_child_test(img_request));
21403d7efd18SAlex Elder 
21413d7efd18SAlex Elder 	/* First get what we need from the image request */
21423d7efd18SAlex Elder 
21433d7efd18SAlex Elder 	pages = img_request->copyup_pages;
21443d7efd18SAlex Elder 	rbd_assert(pages != NULL);
21453d7efd18SAlex Elder 	img_request->copyup_pages = NULL;
21463d7efd18SAlex Elder 
21473d7efd18SAlex Elder 	orig_request = img_request->obj_request;
21483d7efd18SAlex Elder 	rbd_assert(orig_request != NULL);
21490eefd470SAlex Elder 	rbd_assert(orig_request->type == OBJ_REQUEST_BIO);
21503d7efd18SAlex Elder 	result = img_request->result;
21513d7efd18SAlex Elder 	obj_size = img_request->length;
21523d7efd18SAlex Elder 	xferred = img_request->xferred;
21533d7efd18SAlex Elder 
21540eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
21550eefd470SAlex Elder 	rbd_assert(rbd_dev);
21560eefd470SAlex Elder 	rbd_assert(obj_size == (u64)1 << rbd_dev->header.obj_order);
21570eefd470SAlex Elder 
21583d7efd18SAlex Elder 	rbd_img_request_put(img_request);
21593d7efd18SAlex Elder 
21600eefd470SAlex Elder 	if (result)
21610eefd470SAlex Elder 		goto out_err;
21623d7efd18SAlex Elder 
21630eefd470SAlex Elder 	/* Allocate the new copyup osd request for the original request */
21643d7efd18SAlex Elder 
21650eefd470SAlex Elder 	result = -ENOMEM;
21660eefd470SAlex Elder 	rbd_assert(!orig_request->osd_req);
21670eefd470SAlex Elder 	osd_req = rbd_osd_req_create_copyup(orig_request);
21680eefd470SAlex Elder 	if (!osd_req)
21690eefd470SAlex Elder 		goto out_err;
21700eefd470SAlex Elder 	orig_request->osd_req = osd_req;
21710eefd470SAlex Elder 	orig_request->copyup_pages = pages;
21723d7efd18SAlex Elder 
21730eefd470SAlex Elder 	/* Initialize the copyup op */
21740eefd470SAlex Elder 
21750eefd470SAlex Elder 	osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
21760eefd470SAlex Elder 	osd_req_op_cls_request_data_pages(osd_req, 0, pages, obj_size, 0,
21770eefd470SAlex Elder 						false, false);
21780eefd470SAlex Elder 
21790eefd470SAlex Elder 	/* Then the original write request op */
21800eefd470SAlex Elder 
21810eefd470SAlex Elder 	osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE,
21820eefd470SAlex Elder 					orig_request->offset,
21830eefd470SAlex Elder 					orig_request->length, 0, 0);
21840eefd470SAlex Elder 	osd_req_op_extent_osd_data_bio(osd_req, 1, orig_request->bio_list,
21850eefd470SAlex Elder 					orig_request->length);
21860eefd470SAlex Elder 
21870eefd470SAlex Elder 	rbd_osd_req_format_write(orig_request);
21880eefd470SAlex Elder 
21890eefd470SAlex Elder 	/* All set, send it off. */
21900eefd470SAlex Elder 
21910eefd470SAlex Elder 	orig_request->callback = rbd_img_obj_copyup_callback;
21920eefd470SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
21930eefd470SAlex Elder 	result = rbd_obj_request_submit(osdc, orig_request);
21940eefd470SAlex Elder 	if (!result)
21950eefd470SAlex Elder 		return;
21960eefd470SAlex Elder out_err:
21970eefd470SAlex Elder 	/* Record the error code and complete the request */
21980eefd470SAlex Elder 
21990eefd470SAlex Elder 	orig_request->result = result;
22000eefd470SAlex Elder 	orig_request->xferred = 0;
22013d7efd18SAlex Elder 	obj_request_done_set(orig_request);
22023d7efd18SAlex Elder 	rbd_obj_request_complete(orig_request);
22033d7efd18SAlex Elder }
22043d7efd18SAlex Elder 
22053d7efd18SAlex Elder /*
22063d7efd18SAlex Elder  * Read from the parent image the range of data that covers the
22073d7efd18SAlex Elder  * entire target of the given object request.  This is used for
22083d7efd18SAlex Elder  * satisfying a layered image write request when the target of an
22093d7efd18SAlex Elder  * object request from the image request does not exist.
22103d7efd18SAlex Elder  *
22113d7efd18SAlex Elder  * A page array big enough to hold the returned data is allocated
22123d7efd18SAlex Elder  * and supplied to rbd_img_request_fill() as the "data descriptor."
22133d7efd18SAlex Elder  * When the read completes, this page array will be transferred to
22143d7efd18SAlex Elder  * the original object request for the copyup operation.
22153d7efd18SAlex Elder  *
22163d7efd18SAlex Elder  * If an error occurs, record it as the result of the original
22173d7efd18SAlex Elder  * object request and mark it done so it gets completed.
22183d7efd18SAlex Elder  */
22193d7efd18SAlex Elder static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
22203d7efd18SAlex Elder {
22213d7efd18SAlex Elder 	struct rbd_img_request *img_request = NULL;
22223d7efd18SAlex Elder 	struct rbd_img_request *parent_request = NULL;
22233d7efd18SAlex Elder 	struct rbd_device *rbd_dev;
22243d7efd18SAlex Elder 	u64 img_offset;
22253d7efd18SAlex Elder 	u64 length;
22263d7efd18SAlex Elder 	struct page **pages = NULL;
22273d7efd18SAlex Elder 	u32 page_count;
22283d7efd18SAlex Elder 	int result;
22293d7efd18SAlex Elder 
22303d7efd18SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
22313d7efd18SAlex Elder 	rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
22323d7efd18SAlex Elder 
22333d7efd18SAlex Elder 	img_request = obj_request->img_request;
22343d7efd18SAlex Elder 	rbd_assert(img_request != NULL);
22353d7efd18SAlex Elder 	rbd_dev = img_request->rbd_dev;
22363d7efd18SAlex Elder 	rbd_assert(rbd_dev->parent != NULL);
22373d7efd18SAlex Elder 
22383d7efd18SAlex Elder 	/*
22390eefd470SAlex Elder 	 * First things first.  The original osd request is of no
22400eefd470SAlex Elder 	 * use to use any more, we'll need a new one that can hold
22410eefd470SAlex Elder 	 * the two ops in a copyup request.  We'll get that later,
22420eefd470SAlex Elder 	 * but for now we can release the old one.
22430eefd470SAlex Elder 	 */
22440eefd470SAlex Elder 	rbd_osd_req_destroy(obj_request->osd_req);
22450eefd470SAlex Elder 	obj_request->osd_req = NULL;
22460eefd470SAlex Elder 
22470eefd470SAlex Elder 	/*
22483d7efd18SAlex Elder 	 * Determine the byte range covered by the object in the
22493d7efd18SAlex Elder 	 * child image to which the original request was to be sent.
22503d7efd18SAlex Elder 	 */
22513d7efd18SAlex Elder 	img_offset = obj_request->img_offset - obj_request->offset;
22523d7efd18SAlex Elder 	length = (u64)1 << rbd_dev->header.obj_order;
22533d7efd18SAlex Elder 
22543d7efd18SAlex Elder 	/*
2255a9e8ba2cSAlex Elder 	 * There is no defined parent data beyond the parent
2256a9e8ba2cSAlex Elder 	 * overlap, so limit what we read at that boundary if
2257a9e8ba2cSAlex Elder 	 * necessary.
2258a9e8ba2cSAlex Elder 	 */
2259a9e8ba2cSAlex Elder 	if (img_offset + length > rbd_dev->parent_overlap) {
2260a9e8ba2cSAlex Elder 		rbd_assert(img_offset < rbd_dev->parent_overlap);
2261a9e8ba2cSAlex Elder 		length = rbd_dev->parent_overlap - img_offset;
2262a9e8ba2cSAlex Elder 	}
2263a9e8ba2cSAlex Elder 
2264a9e8ba2cSAlex Elder 	/*
22653d7efd18SAlex Elder 	 * Allocate a page array big enough to receive the data read
22663d7efd18SAlex Elder 	 * from the parent.
22673d7efd18SAlex Elder 	 */
22683d7efd18SAlex Elder 	page_count = (u32)calc_pages_for(0, length);
22693d7efd18SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
22703d7efd18SAlex Elder 	if (IS_ERR(pages)) {
22713d7efd18SAlex Elder 		result = PTR_ERR(pages);
22723d7efd18SAlex Elder 		pages = NULL;
22733d7efd18SAlex Elder 		goto out_err;
22743d7efd18SAlex Elder 	}
22753d7efd18SAlex Elder 
22763d7efd18SAlex Elder 	result = -ENOMEM;
22773d7efd18SAlex Elder 	parent_request = rbd_img_request_create(rbd_dev->parent,
22783d7efd18SAlex Elder 						img_offset, length,
22793d7efd18SAlex Elder 						false, true);
22803d7efd18SAlex Elder 	if (!parent_request)
22813d7efd18SAlex Elder 		goto out_err;
22823d7efd18SAlex Elder 	rbd_obj_request_get(obj_request);
22833d7efd18SAlex Elder 	parent_request->obj_request = obj_request;
22843d7efd18SAlex Elder 
22853d7efd18SAlex Elder 	result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
22863d7efd18SAlex Elder 	if (result)
22873d7efd18SAlex Elder 		goto out_err;
22883d7efd18SAlex Elder 	parent_request->copyup_pages = pages;
22893d7efd18SAlex Elder 
22903d7efd18SAlex Elder 	parent_request->callback = rbd_img_obj_parent_read_full_callback;
22913d7efd18SAlex Elder 	result = rbd_img_request_submit(parent_request);
22923d7efd18SAlex Elder 	if (!result)
22933d7efd18SAlex Elder 		return 0;
22943d7efd18SAlex Elder 
22953d7efd18SAlex Elder 	parent_request->copyup_pages = NULL;
22963d7efd18SAlex Elder 	parent_request->obj_request = NULL;
22973d7efd18SAlex Elder 	rbd_obj_request_put(obj_request);
22983d7efd18SAlex Elder out_err:
22993d7efd18SAlex Elder 	if (pages)
23003d7efd18SAlex Elder 		ceph_release_page_vector(pages, page_count);
23013d7efd18SAlex Elder 	if (parent_request)
23023d7efd18SAlex Elder 		rbd_img_request_put(parent_request);
23033d7efd18SAlex Elder 	obj_request->result = result;
23043d7efd18SAlex Elder 	obj_request->xferred = 0;
23053d7efd18SAlex Elder 	obj_request_done_set(obj_request);
23063d7efd18SAlex Elder 
23073d7efd18SAlex Elder 	return result;
23083d7efd18SAlex Elder }
23093d7efd18SAlex Elder 
2310c5b5ef6cSAlex Elder static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
2311c5b5ef6cSAlex Elder {
2312c5b5ef6cSAlex Elder 	struct rbd_obj_request *orig_request;
2313c5b5ef6cSAlex Elder 	int result;
2314c5b5ef6cSAlex Elder 
2315c5b5ef6cSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
2316c5b5ef6cSAlex Elder 
2317c5b5ef6cSAlex Elder 	/*
2318c5b5ef6cSAlex Elder 	 * All we need from the object request is the original
2319c5b5ef6cSAlex Elder 	 * request and the result of the STAT op.  Grab those, then
2320c5b5ef6cSAlex Elder 	 * we're done with the request.
2321c5b5ef6cSAlex Elder 	 */
2322c5b5ef6cSAlex Elder 	orig_request = obj_request->obj_request;
2323c5b5ef6cSAlex Elder 	obj_request->obj_request = NULL;
2324c5b5ef6cSAlex Elder 	rbd_assert(orig_request);
2325c5b5ef6cSAlex Elder 	rbd_assert(orig_request->img_request);
2326c5b5ef6cSAlex Elder 
2327c5b5ef6cSAlex Elder 	result = obj_request->result;
2328c5b5ef6cSAlex Elder 	obj_request->result = 0;
2329c5b5ef6cSAlex Elder 
2330c5b5ef6cSAlex Elder 	dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
2331c5b5ef6cSAlex Elder 		obj_request, orig_request, result,
2332c5b5ef6cSAlex Elder 		obj_request->xferred, obj_request->length);
2333c5b5ef6cSAlex Elder 	rbd_obj_request_put(obj_request);
2334c5b5ef6cSAlex Elder 
2335c5b5ef6cSAlex Elder 	rbd_assert(orig_request);
2336c5b5ef6cSAlex Elder 	rbd_assert(orig_request->img_request);
2337c5b5ef6cSAlex Elder 
2338c5b5ef6cSAlex Elder 	/*
2339c5b5ef6cSAlex Elder 	 * Our only purpose here is to determine whether the object
2340c5b5ef6cSAlex Elder 	 * exists, and we don't want to treat the non-existence as
2341c5b5ef6cSAlex Elder 	 * an error.  If something else comes back, transfer the
2342c5b5ef6cSAlex Elder 	 * error to the original request and complete it now.
2343c5b5ef6cSAlex Elder 	 */
2344c5b5ef6cSAlex Elder 	if (!result) {
2345c5b5ef6cSAlex Elder 		obj_request_existence_set(orig_request, true);
2346c5b5ef6cSAlex Elder 	} else if (result == -ENOENT) {
2347c5b5ef6cSAlex Elder 		obj_request_existence_set(orig_request, false);
2348c5b5ef6cSAlex Elder 	} else if (result) {
2349c5b5ef6cSAlex Elder 		orig_request->result = result;
23503d7efd18SAlex Elder 		goto out;
2351c5b5ef6cSAlex Elder 	}
2352c5b5ef6cSAlex Elder 
2353c5b5ef6cSAlex Elder 	/*
2354c5b5ef6cSAlex Elder 	 * Resubmit the original request now that we have recorded
2355c5b5ef6cSAlex Elder 	 * whether the target object exists.
2356c5b5ef6cSAlex Elder 	 */
2357b454e36dSAlex Elder 	orig_request->result = rbd_img_obj_request_submit(orig_request);
23583d7efd18SAlex Elder out:
2359c5b5ef6cSAlex Elder 	if (orig_request->result)
2360c5b5ef6cSAlex Elder 		rbd_obj_request_complete(orig_request);
2361c5b5ef6cSAlex Elder 	rbd_obj_request_put(orig_request);
2362c5b5ef6cSAlex Elder }
2363c5b5ef6cSAlex Elder 
2364c5b5ef6cSAlex Elder static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
2365c5b5ef6cSAlex Elder {
2366c5b5ef6cSAlex Elder 	struct rbd_obj_request *stat_request;
2367c5b5ef6cSAlex Elder 	struct rbd_device *rbd_dev;
2368c5b5ef6cSAlex Elder 	struct ceph_osd_client *osdc;
2369c5b5ef6cSAlex Elder 	struct page **pages = NULL;
2370c5b5ef6cSAlex Elder 	u32 page_count;
2371c5b5ef6cSAlex Elder 	size_t size;
2372c5b5ef6cSAlex Elder 	int ret;
2373c5b5ef6cSAlex Elder 
2374c5b5ef6cSAlex Elder 	/*
2375c5b5ef6cSAlex Elder 	 * The response data for a STAT call consists of:
2376c5b5ef6cSAlex Elder 	 *     le64 length;
2377c5b5ef6cSAlex Elder 	 *     struct {
2378c5b5ef6cSAlex Elder 	 *         le32 tv_sec;
2379c5b5ef6cSAlex Elder 	 *         le32 tv_nsec;
2380c5b5ef6cSAlex Elder 	 *     } mtime;
2381c5b5ef6cSAlex Elder 	 */
2382c5b5ef6cSAlex Elder 	size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
2383c5b5ef6cSAlex Elder 	page_count = (u32)calc_pages_for(0, size);
2384c5b5ef6cSAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2385c5b5ef6cSAlex Elder 	if (IS_ERR(pages))
2386c5b5ef6cSAlex Elder 		return PTR_ERR(pages);
2387c5b5ef6cSAlex Elder 
2388c5b5ef6cSAlex Elder 	ret = -ENOMEM;
2389c5b5ef6cSAlex Elder 	stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
2390c5b5ef6cSAlex Elder 							OBJ_REQUEST_PAGES);
2391c5b5ef6cSAlex Elder 	if (!stat_request)
2392c5b5ef6cSAlex Elder 		goto out;
2393c5b5ef6cSAlex Elder 
2394c5b5ef6cSAlex Elder 	rbd_obj_request_get(obj_request);
2395c5b5ef6cSAlex Elder 	stat_request->obj_request = obj_request;
2396c5b5ef6cSAlex Elder 	stat_request->pages = pages;
2397c5b5ef6cSAlex Elder 	stat_request->page_count = page_count;
2398c5b5ef6cSAlex Elder 
2399c5b5ef6cSAlex Elder 	rbd_assert(obj_request->img_request);
2400c5b5ef6cSAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
2401c5b5ef6cSAlex Elder 	stat_request->osd_req = rbd_osd_req_create(rbd_dev, false,
2402c5b5ef6cSAlex Elder 						stat_request);
2403c5b5ef6cSAlex Elder 	if (!stat_request->osd_req)
2404c5b5ef6cSAlex Elder 		goto out;
2405c5b5ef6cSAlex Elder 	stat_request->callback = rbd_img_obj_exists_callback;
2406c5b5ef6cSAlex Elder 
2407c5b5ef6cSAlex Elder 	osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT);
2408c5b5ef6cSAlex Elder 	osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
2409c5b5ef6cSAlex Elder 					false, false);
24109d4df01fSAlex Elder 	rbd_osd_req_format_read(stat_request);
2411c5b5ef6cSAlex Elder 
2412c5b5ef6cSAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
2413c5b5ef6cSAlex Elder 	ret = rbd_obj_request_submit(osdc, stat_request);
2414c5b5ef6cSAlex Elder out:
2415c5b5ef6cSAlex Elder 	if (ret)
2416c5b5ef6cSAlex Elder 		rbd_obj_request_put(obj_request);
2417c5b5ef6cSAlex Elder 
2418c5b5ef6cSAlex Elder 	return ret;
2419c5b5ef6cSAlex Elder }
2420c5b5ef6cSAlex Elder 
2421b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
2422b454e36dSAlex Elder {
2423b454e36dSAlex Elder 	struct rbd_img_request *img_request;
2424a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev;
24253d7efd18SAlex Elder 	bool known;
2426b454e36dSAlex Elder 
2427b454e36dSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
2428b454e36dSAlex Elder 
2429b454e36dSAlex Elder 	img_request = obj_request->img_request;
2430b454e36dSAlex Elder 	rbd_assert(img_request);
2431a9e8ba2cSAlex Elder 	rbd_dev = img_request->rbd_dev;
2432b454e36dSAlex Elder 
2433b454e36dSAlex Elder 	/*
2434a9e8ba2cSAlex Elder 	 * Only writes to layered images need special handling.
2435a9e8ba2cSAlex Elder 	 * Reads and non-layered writes are simple object requests.
2436a9e8ba2cSAlex Elder 	 * Layered writes that start beyond the end of the overlap
2437a9e8ba2cSAlex Elder 	 * with the parent have no parent data, so they too are
2438a9e8ba2cSAlex Elder 	 * simple object requests.  Finally, if the target object is
2439a9e8ba2cSAlex Elder 	 * known to already exist, its parent data has already been
2440a9e8ba2cSAlex Elder 	 * copied, so a write to the object can also be handled as a
2441a9e8ba2cSAlex Elder 	 * simple object request.
2442b454e36dSAlex Elder 	 */
2443b454e36dSAlex Elder 	if (!img_request_write_test(img_request) ||
2444b454e36dSAlex Elder 		!img_request_layered_test(img_request) ||
2445a9e8ba2cSAlex Elder 		rbd_dev->parent_overlap <= obj_request->img_offset ||
24463d7efd18SAlex Elder 		((known = obj_request_known_test(obj_request)) &&
24473d7efd18SAlex Elder 			obj_request_exists_test(obj_request))) {
2448b454e36dSAlex Elder 
2449b454e36dSAlex Elder 		struct rbd_device *rbd_dev;
2450b454e36dSAlex Elder 		struct ceph_osd_client *osdc;
2451b454e36dSAlex Elder 
2452b454e36dSAlex Elder 		rbd_dev = obj_request->img_request->rbd_dev;
2453b454e36dSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
2454b454e36dSAlex Elder 
2455b454e36dSAlex Elder 		return rbd_obj_request_submit(osdc, obj_request);
2456b454e36dSAlex Elder 	}
2457b454e36dSAlex Elder 
2458b454e36dSAlex Elder 	/*
24593d7efd18SAlex Elder 	 * It's a layered write.  The target object might exist but
24603d7efd18SAlex Elder 	 * we may not know that yet.  If we know it doesn't exist,
24613d7efd18SAlex Elder 	 * start by reading the data for the full target object from
24623d7efd18SAlex Elder 	 * the parent so we can use it for a copyup to the target.
2463b454e36dSAlex Elder 	 */
24643d7efd18SAlex Elder 	if (known)
24653d7efd18SAlex Elder 		return rbd_img_obj_parent_read_full(obj_request);
24663d7efd18SAlex Elder 
24673d7efd18SAlex Elder 	/* We don't know whether the target exists.  Go find out. */
2468b454e36dSAlex Elder 
2469b454e36dSAlex Elder 	return rbd_img_obj_exists_submit(obj_request);
2470b454e36dSAlex Elder }
2471b454e36dSAlex Elder 
2472bf0d5f50SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request)
2473bf0d5f50SAlex Elder {
2474bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
247546faeed4SAlex Elder 	struct rbd_obj_request *next_obj_request;
2476bf0d5f50SAlex Elder 
247737206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
247846faeed4SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
2479bf0d5f50SAlex Elder 		int ret;
2480bf0d5f50SAlex Elder 
2481b454e36dSAlex Elder 		ret = rbd_img_obj_request_submit(obj_request);
2482bf0d5f50SAlex Elder 		if (ret)
2483bf0d5f50SAlex Elder 			return ret;
2484bf0d5f50SAlex Elder 	}
2485bf0d5f50SAlex Elder 
2486bf0d5f50SAlex Elder 	return 0;
2487bf0d5f50SAlex Elder }
2488bf0d5f50SAlex Elder 
24898b3e1a56SAlex Elder static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
24908b3e1a56SAlex Elder {
24918b3e1a56SAlex Elder 	struct rbd_obj_request *obj_request;
2492a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev;
2493a9e8ba2cSAlex Elder 	u64 obj_end;
24948b3e1a56SAlex Elder 
24958b3e1a56SAlex Elder 	rbd_assert(img_request_child_test(img_request));
24968b3e1a56SAlex Elder 
24978b3e1a56SAlex Elder 	obj_request = img_request->obj_request;
2498a9e8ba2cSAlex Elder 	rbd_assert(obj_request);
2499a9e8ba2cSAlex Elder 	rbd_assert(obj_request->img_request);
25008b3e1a56SAlex Elder 
2501a9e8ba2cSAlex Elder 	obj_request->result = img_request->result;
2502a9e8ba2cSAlex Elder 	if (obj_request->result)
2503a9e8ba2cSAlex Elder 		goto out;
2504a9e8ba2cSAlex Elder 
2505a9e8ba2cSAlex Elder 	/*
2506a9e8ba2cSAlex Elder 	 * We need to zero anything beyond the parent overlap
2507a9e8ba2cSAlex Elder 	 * boundary.  Since rbd_img_obj_request_read_callback()
2508a9e8ba2cSAlex Elder 	 * will zero anything beyond the end of a short read, an
2509a9e8ba2cSAlex Elder 	 * easy way to do this is to pretend the data from the
2510a9e8ba2cSAlex Elder 	 * parent came up short--ending at the overlap boundary.
2511a9e8ba2cSAlex Elder 	 */
2512a9e8ba2cSAlex Elder 	rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
2513a9e8ba2cSAlex Elder 	obj_end = obj_request->img_offset + obj_request->length;
2514a9e8ba2cSAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
2515a9e8ba2cSAlex Elder 	if (obj_end > rbd_dev->parent_overlap) {
2516a9e8ba2cSAlex Elder 		u64 xferred = 0;
2517a9e8ba2cSAlex Elder 
2518a9e8ba2cSAlex Elder 		if (obj_request->img_offset < rbd_dev->parent_overlap)
2519a9e8ba2cSAlex Elder 			xferred = rbd_dev->parent_overlap -
2520a9e8ba2cSAlex Elder 					obj_request->img_offset;
2521a9e8ba2cSAlex Elder 
2522a9e8ba2cSAlex Elder 		obj_request->xferred = min(img_request->xferred, xferred);
2523a9e8ba2cSAlex Elder 	} else {
2524a9e8ba2cSAlex Elder 		obj_request->xferred = img_request->xferred;
2525a9e8ba2cSAlex Elder 	}
2526a9e8ba2cSAlex Elder out:
2527b5b09be3SAlex Elder 	rbd_img_request_put(img_request);
25288b3e1a56SAlex Elder 	rbd_img_obj_request_read_callback(obj_request);
25298b3e1a56SAlex Elder 	rbd_obj_request_complete(obj_request);
25308b3e1a56SAlex Elder }
25318b3e1a56SAlex Elder 
25328b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
25338b3e1a56SAlex Elder {
25348b3e1a56SAlex Elder 	struct rbd_device *rbd_dev;
25358b3e1a56SAlex Elder 	struct rbd_img_request *img_request;
25368b3e1a56SAlex Elder 	int result;
25378b3e1a56SAlex Elder 
25388b3e1a56SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
25398b3e1a56SAlex Elder 	rbd_assert(obj_request->img_request != NULL);
25408b3e1a56SAlex Elder 	rbd_assert(obj_request->result == (s32) -ENOENT);
25418b3e1a56SAlex Elder 	rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
25428b3e1a56SAlex Elder 
25438b3e1a56SAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
25448b3e1a56SAlex Elder 	rbd_assert(rbd_dev->parent != NULL);
25458b3e1a56SAlex Elder 	/* rbd_read_finish(obj_request, obj_request->length); */
25468b3e1a56SAlex Elder 	img_request = rbd_img_request_create(rbd_dev->parent,
25478b3e1a56SAlex Elder 						obj_request->img_offset,
25488b3e1a56SAlex Elder 						obj_request->length,
25498b3e1a56SAlex Elder 						false, true);
25508b3e1a56SAlex Elder 	result = -ENOMEM;
25518b3e1a56SAlex Elder 	if (!img_request)
25528b3e1a56SAlex Elder 		goto out_err;
25538b3e1a56SAlex Elder 
25548b3e1a56SAlex Elder 	rbd_obj_request_get(obj_request);
25558b3e1a56SAlex Elder 	img_request->obj_request = obj_request;
25568b3e1a56SAlex Elder 
2557f1a4739fSAlex Elder 	result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
2558f1a4739fSAlex Elder 					obj_request->bio_list);
25598b3e1a56SAlex Elder 	if (result)
25608b3e1a56SAlex Elder 		goto out_err;
25618b3e1a56SAlex Elder 
25628b3e1a56SAlex Elder 	img_request->callback = rbd_img_parent_read_callback;
25638b3e1a56SAlex Elder 	result = rbd_img_request_submit(img_request);
25648b3e1a56SAlex Elder 	if (result)
25658b3e1a56SAlex Elder 		goto out_err;
25668b3e1a56SAlex Elder 
25678b3e1a56SAlex Elder 	return;
25688b3e1a56SAlex Elder out_err:
25698b3e1a56SAlex Elder 	if (img_request)
25708b3e1a56SAlex Elder 		rbd_img_request_put(img_request);
25718b3e1a56SAlex Elder 	obj_request->result = result;
25728b3e1a56SAlex Elder 	obj_request->xferred = 0;
25738b3e1a56SAlex Elder 	obj_request_done_set(obj_request);
25748b3e1a56SAlex Elder }
25758b3e1a56SAlex Elder 
2576cc4a38bdSAlex Elder static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, u64 notify_id)
2577b8d70035SAlex Elder {
2578b8d70035SAlex Elder 	struct rbd_obj_request *obj_request;
25792169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2580b8d70035SAlex Elder 	int ret;
2581b8d70035SAlex Elder 
2582b8d70035SAlex Elder 	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
2583b8d70035SAlex Elder 							OBJ_REQUEST_NODATA);
2584b8d70035SAlex Elder 	if (!obj_request)
2585b8d70035SAlex Elder 		return -ENOMEM;
2586b8d70035SAlex Elder 
2587b8d70035SAlex Elder 	ret = -ENOMEM;
2588430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
2589b8d70035SAlex Elder 	if (!obj_request->osd_req)
2590b8d70035SAlex Elder 		goto out;
25912169238dSAlex Elder 	obj_request->callback = rbd_obj_request_put;
2592b8d70035SAlex Elder 
2593c99d2d4aSAlex Elder 	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
2594cc4a38bdSAlex Elder 					notify_id, 0, 0);
25959d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
2596430c28c3SAlex Elder 
2597b8d70035SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
2598b8d70035SAlex Elder out:
2599cf81b60eSAlex Elder 	if (ret)
2600b8d70035SAlex Elder 		rbd_obj_request_put(obj_request);
2601b8d70035SAlex Elder 
2602b8d70035SAlex Elder 	return ret;
2603b8d70035SAlex Elder }
2604b8d70035SAlex Elder 
2605b8d70035SAlex Elder static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
2606b8d70035SAlex Elder {
2607b8d70035SAlex Elder 	struct rbd_device *rbd_dev = (struct rbd_device *)data;
2608e627db08SAlex Elder 	int ret;
2609b8d70035SAlex Elder 
2610b8d70035SAlex Elder 	if (!rbd_dev)
2611b8d70035SAlex Elder 		return;
2612b8d70035SAlex Elder 
261337206ee5SAlex Elder 	dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
2614b8d70035SAlex Elder 		rbd_dev->header_name, (unsigned long long)notify_id,
2615b8d70035SAlex Elder 		(unsigned int)opcode);
2616e627db08SAlex Elder 	ret = rbd_dev_refresh(rbd_dev);
2617e627db08SAlex Elder 	if (ret)
2618e627db08SAlex Elder 		rbd_warn(rbd_dev, ": header refresh error (%d)\n", ret);
2619b8d70035SAlex Elder 
2620cc4a38bdSAlex Elder 	rbd_obj_notify_ack(rbd_dev, notify_id);
2621b8d70035SAlex Elder }
2622b8d70035SAlex Elder 
26239969ebc5SAlex Elder /*
26249969ebc5SAlex Elder  * Request sync osd watch/unwatch.  The value of "start" determines
26259969ebc5SAlex Elder  * whether a watch request is being initiated or torn down.
26269969ebc5SAlex Elder  */
26279969ebc5SAlex Elder static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
26289969ebc5SAlex Elder {
26299969ebc5SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
26309969ebc5SAlex Elder 	struct rbd_obj_request *obj_request;
26319969ebc5SAlex Elder 	int ret;
26329969ebc5SAlex Elder 
26339969ebc5SAlex Elder 	rbd_assert(start ^ !!rbd_dev->watch_event);
26349969ebc5SAlex Elder 	rbd_assert(start ^ !!rbd_dev->watch_request);
26359969ebc5SAlex Elder 
26369969ebc5SAlex Elder 	if (start) {
26373c663bbdSAlex Elder 		ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
26389969ebc5SAlex Elder 						&rbd_dev->watch_event);
26399969ebc5SAlex Elder 		if (ret < 0)
26409969ebc5SAlex Elder 			return ret;
26418eb87565SAlex Elder 		rbd_assert(rbd_dev->watch_event != NULL);
26429969ebc5SAlex Elder 	}
26439969ebc5SAlex Elder 
26449969ebc5SAlex Elder 	ret = -ENOMEM;
26459969ebc5SAlex Elder 	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
26469969ebc5SAlex Elder 							OBJ_REQUEST_NODATA);
26479969ebc5SAlex Elder 	if (!obj_request)
26489969ebc5SAlex Elder 		goto out_cancel;
26499969ebc5SAlex Elder 
2650430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request);
2651430c28c3SAlex Elder 	if (!obj_request->osd_req)
2652430c28c3SAlex Elder 		goto out_cancel;
2653430c28c3SAlex Elder 
26548eb87565SAlex Elder 	if (start)
2655975241afSAlex Elder 		ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
26568eb87565SAlex Elder 	else
26576977c3f9SAlex Elder 		ceph_osdc_unregister_linger_request(osdc,
2658975241afSAlex Elder 					rbd_dev->watch_request->osd_req);
26592169238dSAlex Elder 
26602169238dSAlex Elder 	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
2661b21ebdddSAlex Elder 				rbd_dev->watch_event->cookie, 0, start);
26629d4df01fSAlex Elder 	rbd_osd_req_format_write(obj_request);
26632169238dSAlex Elder 
26649969ebc5SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
26659969ebc5SAlex Elder 	if (ret)
26669969ebc5SAlex Elder 		goto out_cancel;
26679969ebc5SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
26689969ebc5SAlex Elder 	if (ret)
26699969ebc5SAlex Elder 		goto out_cancel;
26709969ebc5SAlex Elder 	ret = obj_request->result;
26719969ebc5SAlex Elder 	if (ret)
26729969ebc5SAlex Elder 		goto out_cancel;
26739969ebc5SAlex Elder 
26748eb87565SAlex Elder 	/*
26758eb87565SAlex Elder 	 * A watch request is set to linger, so the underlying osd
26768eb87565SAlex Elder 	 * request won't go away until we unregister it.  We retain
26778eb87565SAlex Elder 	 * a pointer to the object request during that time (in
26788eb87565SAlex Elder 	 * rbd_dev->watch_request), so we'll keep a reference to
26798eb87565SAlex Elder 	 * it.  We'll drop that reference (below) after we've
26808eb87565SAlex Elder 	 * unregistered it.
26818eb87565SAlex Elder 	 */
26828eb87565SAlex Elder 	if (start) {
26838eb87565SAlex Elder 		rbd_dev->watch_request = obj_request;
26848eb87565SAlex Elder 
26858eb87565SAlex Elder 		return 0;
26868eb87565SAlex Elder 	}
26878eb87565SAlex Elder 
26888eb87565SAlex Elder 	/* We have successfully torn down the watch request */
26898eb87565SAlex Elder 
26908eb87565SAlex Elder 	rbd_obj_request_put(rbd_dev->watch_request);
26918eb87565SAlex Elder 	rbd_dev->watch_request = NULL;
26929969ebc5SAlex Elder out_cancel:
26939969ebc5SAlex Elder 	/* Cancel the event if we're tearing down, or on error */
26949969ebc5SAlex Elder 	ceph_osdc_cancel_event(rbd_dev->watch_event);
26959969ebc5SAlex Elder 	rbd_dev->watch_event = NULL;
26969969ebc5SAlex Elder 	if (obj_request)
26979969ebc5SAlex Elder 		rbd_obj_request_put(obj_request);
26989969ebc5SAlex Elder 
26999969ebc5SAlex Elder 	return ret;
27009969ebc5SAlex Elder }
27019969ebc5SAlex Elder 
270236be9a76SAlex Elder /*
2703f40eb349SAlex Elder  * Synchronous osd object method call.  Returns the number of bytes
2704f40eb349SAlex Elder  * returned in the outbound buffer, or a negative error code.
270536be9a76SAlex Elder  */
270636be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
270736be9a76SAlex Elder 			     const char *object_name,
270836be9a76SAlex Elder 			     const char *class_name,
270936be9a76SAlex Elder 			     const char *method_name,
27104157976bSAlex Elder 			     const void *outbound,
271136be9a76SAlex Elder 			     size_t outbound_size,
27124157976bSAlex Elder 			     void *inbound,
2713e2a58ee5SAlex Elder 			     size_t inbound_size)
271436be9a76SAlex Elder {
27152169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
271636be9a76SAlex Elder 	struct rbd_obj_request *obj_request;
271736be9a76SAlex Elder 	struct page **pages;
271836be9a76SAlex Elder 	u32 page_count;
271936be9a76SAlex Elder 	int ret;
272036be9a76SAlex Elder 
272136be9a76SAlex Elder 	/*
27226010a451SAlex Elder 	 * Method calls are ultimately read operations.  The result
27236010a451SAlex Elder 	 * should placed into the inbound buffer provided.  They
27246010a451SAlex Elder 	 * also supply outbound data--parameters for the object
27256010a451SAlex Elder 	 * method.  Currently if this is present it will be a
27266010a451SAlex Elder 	 * snapshot id.
272736be9a76SAlex Elder 	 */
272836be9a76SAlex Elder 	page_count = (u32)calc_pages_for(0, inbound_size);
272936be9a76SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
273036be9a76SAlex Elder 	if (IS_ERR(pages))
273136be9a76SAlex Elder 		return PTR_ERR(pages);
273236be9a76SAlex Elder 
273336be9a76SAlex Elder 	ret = -ENOMEM;
27346010a451SAlex Elder 	obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
273536be9a76SAlex Elder 							OBJ_REQUEST_PAGES);
273636be9a76SAlex Elder 	if (!obj_request)
273736be9a76SAlex Elder 		goto out;
273836be9a76SAlex Elder 
273936be9a76SAlex Elder 	obj_request->pages = pages;
274036be9a76SAlex Elder 	obj_request->page_count = page_count;
274136be9a76SAlex Elder 
2742430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
274336be9a76SAlex Elder 	if (!obj_request->osd_req)
274436be9a76SAlex Elder 		goto out;
274536be9a76SAlex Elder 
2746c99d2d4aSAlex Elder 	osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
274704017e29SAlex Elder 					class_name, method_name);
274804017e29SAlex Elder 	if (outbound_size) {
274904017e29SAlex Elder 		struct ceph_pagelist *pagelist;
275004017e29SAlex Elder 
275104017e29SAlex Elder 		pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
275204017e29SAlex Elder 		if (!pagelist)
275304017e29SAlex Elder 			goto out;
275404017e29SAlex Elder 
275504017e29SAlex Elder 		ceph_pagelist_init(pagelist);
275604017e29SAlex Elder 		ceph_pagelist_append(pagelist, outbound, outbound_size);
275704017e29SAlex Elder 		osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
275804017e29SAlex Elder 						pagelist);
275904017e29SAlex Elder 	}
2760a4ce40a9SAlex Elder 	osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
2761a4ce40a9SAlex Elder 					obj_request->pages, inbound_size,
276244cd188dSAlex Elder 					0, false, false);
27639d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
2764430c28c3SAlex Elder 
276536be9a76SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
276636be9a76SAlex Elder 	if (ret)
276736be9a76SAlex Elder 		goto out;
276836be9a76SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
276936be9a76SAlex Elder 	if (ret)
277036be9a76SAlex Elder 		goto out;
277136be9a76SAlex Elder 
277236be9a76SAlex Elder 	ret = obj_request->result;
277336be9a76SAlex Elder 	if (ret < 0)
277436be9a76SAlex Elder 		goto out;
277557385b51SAlex Elder 
277657385b51SAlex Elder 	rbd_assert(obj_request->xferred < (u64)INT_MAX);
277757385b51SAlex Elder 	ret = (int)obj_request->xferred;
2778903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
277936be9a76SAlex Elder out:
278036be9a76SAlex Elder 	if (obj_request)
278136be9a76SAlex Elder 		rbd_obj_request_put(obj_request);
278236be9a76SAlex Elder 	else
278336be9a76SAlex Elder 		ceph_release_page_vector(pages, page_count);
278436be9a76SAlex Elder 
278536be9a76SAlex Elder 	return ret;
278636be9a76SAlex Elder }
278736be9a76SAlex Elder 
2788bf0d5f50SAlex Elder static void rbd_request_fn(struct request_queue *q)
2789cc344fa1SAlex Elder 		__releases(q->queue_lock) __acquires(q->queue_lock)
2790bf0d5f50SAlex Elder {
2791bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = q->queuedata;
2792bf0d5f50SAlex Elder 	bool read_only = rbd_dev->mapping.read_only;
2793bf0d5f50SAlex Elder 	struct request *rq;
2794bf0d5f50SAlex Elder 	int result;
2795bf0d5f50SAlex Elder 
2796bf0d5f50SAlex Elder 	while ((rq = blk_fetch_request(q))) {
2797bf0d5f50SAlex Elder 		bool write_request = rq_data_dir(rq) == WRITE;
2798bf0d5f50SAlex Elder 		struct rbd_img_request *img_request;
2799bf0d5f50SAlex Elder 		u64 offset;
2800bf0d5f50SAlex Elder 		u64 length;
2801bf0d5f50SAlex Elder 
2802bf0d5f50SAlex Elder 		/* Ignore any non-FS requests that filter through. */
2803bf0d5f50SAlex Elder 
2804bf0d5f50SAlex Elder 		if (rq->cmd_type != REQ_TYPE_FS) {
28054dda41d3SAlex Elder 			dout("%s: non-fs request type %d\n", __func__,
28064dda41d3SAlex Elder 				(int) rq->cmd_type);
28074dda41d3SAlex Elder 			__blk_end_request_all(rq, 0);
28084dda41d3SAlex Elder 			continue;
28094dda41d3SAlex Elder 		}
28104dda41d3SAlex Elder 
28114dda41d3SAlex Elder 		/* Ignore/skip any zero-length requests */
28124dda41d3SAlex Elder 
28134dda41d3SAlex Elder 		offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
28144dda41d3SAlex Elder 		length = (u64) blk_rq_bytes(rq);
28154dda41d3SAlex Elder 
28164dda41d3SAlex Elder 		if (!length) {
28174dda41d3SAlex Elder 			dout("%s: zero-length request\n", __func__);
2818bf0d5f50SAlex Elder 			__blk_end_request_all(rq, 0);
2819bf0d5f50SAlex Elder 			continue;
2820bf0d5f50SAlex Elder 		}
2821bf0d5f50SAlex Elder 
2822bf0d5f50SAlex Elder 		spin_unlock_irq(q->queue_lock);
2823bf0d5f50SAlex Elder 
2824bf0d5f50SAlex Elder 		/* Disallow writes to a read-only device */
2825bf0d5f50SAlex Elder 
2826bf0d5f50SAlex Elder 		if (write_request) {
2827bf0d5f50SAlex Elder 			result = -EROFS;
2828bf0d5f50SAlex Elder 			if (read_only)
2829bf0d5f50SAlex Elder 				goto end_request;
2830bf0d5f50SAlex Elder 			rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
2831bf0d5f50SAlex Elder 		}
2832bf0d5f50SAlex Elder 
28336d292906SAlex Elder 		/*
28346d292906SAlex Elder 		 * Quit early if the mapped snapshot no longer
28356d292906SAlex Elder 		 * exists.  It's still possible the snapshot will
28366d292906SAlex Elder 		 * have disappeared by the time our request arrives
28376d292906SAlex Elder 		 * at the osd, but there's no sense in sending it if
28386d292906SAlex Elder 		 * we already know.
28396d292906SAlex Elder 		 */
28406d292906SAlex Elder 		if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
2841bf0d5f50SAlex Elder 			dout("request for non-existent snapshot");
2842bf0d5f50SAlex Elder 			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
2843bf0d5f50SAlex Elder 			result = -ENXIO;
2844bf0d5f50SAlex Elder 			goto end_request;
2845bf0d5f50SAlex Elder 		}
2846bf0d5f50SAlex Elder 
2847bf0d5f50SAlex Elder 		result = -EINVAL;
2848c0cd10dbSAlex Elder 		if (offset && length > U64_MAX - offset + 1) {
2849c0cd10dbSAlex Elder 			rbd_warn(rbd_dev, "bad request range (%llu~%llu)\n",
2850c0cd10dbSAlex Elder 				offset, length);
2851bf0d5f50SAlex Elder 			goto end_request;	/* Shouldn't happen */
2852c0cd10dbSAlex Elder 		}
2853bf0d5f50SAlex Elder 
285400a653e2SAlex Elder 		result = -EIO;
285500a653e2SAlex Elder 		if (offset + length > rbd_dev->mapping.size) {
285600a653e2SAlex Elder 			rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)\n",
285700a653e2SAlex Elder 				offset, length, rbd_dev->mapping.size);
285800a653e2SAlex Elder 			goto end_request;
285900a653e2SAlex Elder 		}
286000a653e2SAlex Elder 
2861bf0d5f50SAlex Elder 		result = -ENOMEM;
2862bf0d5f50SAlex Elder 		img_request = rbd_img_request_create(rbd_dev, offset, length,
28639849e986SAlex Elder 							write_request, false);
2864bf0d5f50SAlex Elder 		if (!img_request)
2865bf0d5f50SAlex Elder 			goto end_request;
2866bf0d5f50SAlex Elder 
2867bf0d5f50SAlex Elder 		img_request->rq = rq;
2868bf0d5f50SAlex Elder 
2869f1a4739fSAlex Elder 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
2870f1a4739fSAlex Elder 						rq->bio);
2871bf0d5f50SAlex Elder 		if (!result)
2872bf0d5f50SAlex Elder 			result = rbd_img_request_submit(img_request);
2873bf0d5f50SAlex Elder 		if (result)
2874bf0d5f50SAlex Elder 			rbd_img_request_put(img_request);
2875bf0d5f50SAlex Elder end_request:
2876bf0d5f50SAlex Elder 		spin_lock_irq(q->queue_lock);
2877bf0d5f50SAlex Elder 		if (result < 0) {
28787da22d29SAlex Elder 			rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
28797da22d29SAlex Elder 				write_request ? "write" : "read",
28807da22d29SAlex Elder 				length, offset, result);
28817da22d29SAlex Elder 
2882bf0d5f50SAlex Elder 			__blk_end_request_all(rq, result);
2883bf0d5f50SAlex Elder 		}
2884bf0d5f50SAlex Elder 	}
2885bf0d5f50SAlex Elder }
2886bf0d5f50SAlex Elder 
2887602adf40SYehuda Sadeh /*
2888602adf40SYehuda Sadeh  * a queue callback. Makes sure that we don't create a bio that spans across
2889602adf40SYehuda Sadeh  * multiple osd objects. One exception would be with a single page bios,
2890f7760dadSAlex Elder  * which we handle later at bio_chain_clone_range()
2891602adf40SYehuda Sadeh  */
2892602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
2893602adf40SYehuda Sadeh 			  struct bio_vec *bvec)
2894602adf40SYehuda Sadeh {
2895602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
2896e5cfeed2SAlex Elder 	sector_t sector_offset;
2897e5cfeed2SAlex Elder 	sector_t sectors_per_obj;
2898e5cfeed2SAlex Elder 	sector_t obj_sector_offset;
2899e5cfeed2SAlex Elder 	int ret;
2900602adf40SYehuda Sadeh 
2901e5cfeed2SAlex Elder 	/*
2902e5cfeed2SAlex Elder 	 * Find how far into its rbd object the partition-relative
2903e5cfeed2SAlex Elder 	 * bio start sector is to offset relative to the enclosing
2904e5cfeed2SAlex Elder 	 * device.
2905e5cfeed2SAlex Elder 	 */
2906e5cfeed2SAlex Elder 	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
2907e5cfeed2SAlex Elder 	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
2908e5cfeed2SAlex Elder 	obj_sector_offset = sector_offset & (sectors_per_obj - 1);
2909593a9e7bSAlex Elder 
2910e5cfeed2SAlex Elder 	/*
2911e5cfeed2SAlex Elder 	 * Compute the number of bytes from that offset to the end
2912e5cfeed2SAlex Elder 	 * of the object.  Account for what's already used by the bio.
2913e5cfeed2SAlex Elder 	 */
2914e5cfeed2SAlex Elder 	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
2915e5cfeed2SAlex Elder 	if (ret > bmd->bi_size)
2916e5cfeed2SAlex Elder 		ret -= bmd->bi_size;
2917e5cfeed2SAlex Elder 	else
2918e5cfeed2SAlex Elder 		ret = 0;
2919e5cfeed2SAlex Elder 
2920e5cfeed2SAlex Elder 	/*
2921e5cfeed2SAlex Elder 	 * Don't send back more than was asked for.  And if the bio
2922e5cfeed2SAlex Elder 	 * was empty, let the whole thing through because:  "Note
2923e5cfeed2SAlex Elder 	 * that a block device *must* allow a single page to be
2924e5cfeed2SAlex Elder 	 * added to an empty bio."
2925e5cfeed2SAlex Elder 	 */
2926e5cfeed2SAlex Elder 	rbd_assert(bvec->bv_len <= PAGE_SIZE);
2927e5cfeed2SAlex Elder 	if (ret > (int) bvec->bv_len || !bmd->bi_size)
2928e5cfeed2SAlex Elder 		ret = (int) bvec->bv_len;
2929e5cfeed2SAlex Elder 
2930e5cfeed2SAlex Elder 	return ret;
2931602adf40SYehuda Sadeh }
2932602adf40SYehuda Sadeh 
2933602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
2934602adf40SYehuda Sadeh {
2935602adf40SYehuda Sadeh 	struct gendisk *disk = rbd_dev->disk;
2936602adf40SYehuda Sadeh 
2937602adf40SYehuda Sadeh 	if (!disk)
2938602adf40SYehuda Sadeh 		return;
2939602adf40SYehuda Sadeh 
2940a0cab924SAlex Elder 	rbd_dev->disk = NULL;
2941a0cab924SAlex Elder 	if (disk->flags & GENHD_FL_UP) {
2942602adf40SYehuda Sadeh 		del_gendisk(disk);
2943602adf40SYehuda Sadeh 		if (disk->queue)
2944602adf40SYehuda Sadeh 			blk_cleanup_queue(disk->queue);
2945a0cab924SAlex Elder 	}
2946602adf40SYehuda Sadeh 	put_disk(disk);
2947602adf40SYehuda Sadeh }
2948602adf40SYehuda Sadeh 
2949788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
2950788e2df3SAlex Elder 				const char *object_name,
29517097f8dfSAlex Elder 				u64 offset, u64 length, void *buf)
2952788e2df3SAlex Elder 
2953788e2df3SAlex Elder {
29542169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2955788e2df3SAlex Elder 	struct rbd_obj_request *obj_request;
2956788e2df3SAlex Elder 	struct page **pages = NULL;
2957788e2df3SAlex Elder 	u32 page_count;
29581ceae7efSAlex Elder 	size_t size;
2959788e2df3SAlex Elder 	int ret;
2960788e2df3SAlex Elder 
2961788e2df3SAlex Elder 	page_count = (u32) calc_pages_for(offset, length);
2962788e2df3SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2963788e2df3SAlex Elder 	if (IS_ERR(pages))
2964788e2df3SAlex Elder 		ret = PTR_ERR(pages);
2965788e2df3SAlex Elder 
2966788e2df3SAlex Elder 	ret = -ENOMEM;
2967788e2df3SAlex Elder 	obj_request = rbd_obj_request_create(object_name, offset, length,
2968788e2df3SAlex Elder 							OBJ_REQUEST_PAGES);
2969788e2df3SAlex Elder 	if (!obj_request)
2970788e2df3SAlex Elder 		goto out;
2971788e2df3SAlex Elder 
2972788e2df3SAlex Elder 	obj_request->pages = pages;
2973788e2df3SAlex Elder 	obj_request->page_count = page_count;
2974788e2df3SAlex Elder 
2975430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
2976788e2df3SAlex Elder 	if (!obj_request->osd_req)
2977788e2df3SAlex Elder 		goto out;
2978788e2df3SAlex Elder 
2979c99d2d4aSAlex Elder 	osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
2980c99d2d4aSAlex Elder 					offset, length, 0, 0);
2981406e2c9fSAlex Elder 	osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
2982a4ce40a9SAlex Elder 					obj_request->pages,
298344cd188dSAlex Elder 					obj_request->length,
298444cd188dSAlex Elder 					obj_request->offset & ~PAGE_MASK,
298544cd188dSAlex Elder 					false, false);
29869d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
2987430c28c3SAlex Elder 
2988788e2df3SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
2989788e2df3SAlex Elder 	if (ret)
2990788e2df3SAlex Elder 		goto out;
2991788e2df3SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
2992788e2df3SAlex Elder 	if (ret)
2993788e2df3SAlex Elder 		goto out;
2994788e2df3SAlex Elder 
2995788e2df3SAlex Elder 	ret = obj_request->result;
2996788e2df3SAlex Elder 	if (ret < 0)
2997788e2df3SAlex Elder 		goto out;
29981ceae7efSAlex Elder 
29991ceae7efSAlex Elder 	rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
30001ceae7efSAlex Elder 	size = (size_t) obj_request->xferred;
3001903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, buf, 0, size);
300223ed6e13SAlex Elder 	rbd_assert(size <= (size_t)INT_MAX);
300323ed6e13SAlex Elder 	ret = (int)size;
3004788e2df3SAlex Elder out:
3005788e2df3SAlex Elder 	if (obj_request)
3006788e2df3SAlex Elder 		rbd_obj_request_put(obj_request);
3007788e2df3SAlex Elder 	else
3008788e2df3SAlex Elder 		ceph_release_page_vector(pages, page_count);
3009788e2df3SAlex Elder 
3010788e2df3SAlex Elder 	return ret;
3011788e2df3SAlex Elder }
3012788e2df3SAlex Elder 
3013602adf40SYehuda Sadeh /*
30144156d998SAlex Elder  * Read the complete header for the given rbd device.
30154156d998SAlex Elder  *
30164156d998SAlex Elder  * Returns a pointer to a dynamically-allocated buffer containing
30174156d998SAlex Elder  * the complete and validated header.  Caller can pass the address
30184156d998SAlex Elder  * of a variable that will be filled in with the version of the
30194156d998SAlex Elder  * header object at the time it was read.
30204156d998SAlex Elder  *
30214156d998SAlex Elder  * Returns a pointer-coded errno if a failure occurs.
30224156d998SAlex Elder  */
30234156d998SAlex Elder static struct rbd_image_header_ondisk *
30247097f8dfSAlex Elder rbd_dev_v1_header_read(struct rbd_device *rbd_dev)
30254156d998SAlex Elder {
30264156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk = NULL;
30274156d998SAlex Elder 	u32 snap_count = 0;
30284156d998SAlex Elder 	u64 names_size = 0;
30294156d998SAlex Elder 	u32 want_count;
30304156d998SAlex Elder 	int ret;
30314156d998SAlex Elder 
30324156d998SAlex Elder 	/*
30334156d998SAlex Elder 	 * The complete header will include an array of its 64-bit
30344156d998SAlex Elder 	 * snapshot ids, followed by the names of those snapshots as
30354156d998SAlex Elder 	 * a contiguous block of NUL-terminated strings.  Note that
30364156d998SAlex Elder 	 * the number of snapshots could change by the time we read
30374156d998SAlex Elder 	 * it in, in which case we re-read it.
30384156d998SAlex Elder 	 */
30394156d998SAlex Elder 	do {
30404156d998SAlex Elder 		size_t size;
30414156d998SAlex Elder 
30424156d998SAlex Elder 		kfree(ondisk);
30434156d998SAlex Elder 
30444156d998SAlex Elder 		size = sizeof (*ondisk);
30454156d998SAlex Elder 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
30464156d998SAlex Elder 		size += names_size;
30474156d998SAlex Elder 		ondisk = kmalloc(size, GFP_KERNEL);
30484156d998SAlex Elder 		if (!ondisk)
30494156d998SAlex Elder 			return ERR_PTR(-ENOMEM);
30504156d998SAlex Elder 
3051788e2df3SAlex Elder 		ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
30527097f8dfSAlex Elder 				       0, size, ondisk);
30534156d998SAlex Elder 		if (ret < 0)
30544156d998SAlex Elder 			goto out_err;
3055c0cd10dbSAlex Elder 		if ((size_t)ret < size) {
30564156d998SAlex Elder 			ret = -ENXIO;
305706ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
305806ecc6cbSAlex Elder 				size, ret);
30594156d998SAlex Elder 			goto out_err;
30604156d998SAlex Elder 		}
30614156d998SAlex Elder 		if (!rbd_dev_ondisk_valid(ondisk)) {
30624156d998SAlex Elder 			ret = -ENXIO;
306306ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "invalid header");
30644156d998SAlex Elder 			goto out_err;
30654156d998SAlex Elder 		}
30664156d998SAlex Elder 
30674156d998SAlex Elder 		names_size = le64_to_cpu(ondisk->snap_names_len);
30684156d998SAlex Elder 		want_count = snap_count;
30694156d998SAlex Elder 		snap_count = le32_to_cpu(ondisk->snap_count);
30704156d998SAlex Elder 	} while (snap_count != want_count);
30714156d998SAlex Elder 
30724156d998SAlex Elder 	return ondisk;
30734156d998SAlex Elder 
30744156d998SAlex Elder out_err:
30754156d998SAlex Elder 	kfree(ondisk);
30764156d998SAlex Elder 
30774156d998SAlex Elder 	return ERR_PTR(ret);
30784156d998SAlex Elder }
30794156d998SAlex Elder 
/*
 * Reload the on-disk header and convert it into the in-memory
 * representation pointed to by "header".
 *
 * Returns the result of rbd_header_from_disk(), or a negative errno
 * if the on-disk header could not be read.
 */
static int rbd_read_header(struct rbd_device *rbd_dev,
			   struct rbd_image_header *header)
{
	struct rbd_image_header_ondisk *raw_header;
	int ret;

	raw_header = rbd_dev_v1_header_read(rbd_dev);
	if (IS_ERR(raw_header))
		return PTR_ERR(raw_header);

	/* The on-disk copy is only needed for the conversion */
	ret = rbd_header_from_disk(header, raw_header);
	kfree(raw_header);

	return ret;
}
3097602adf40SYehuda Sadeh 
3098602adf40SYehuda Sadeh /*
3099602adf40SYehuda Sadeh  * only read the first part of the ondisk header, without the snaps info
3100602adf40SYehuda Sadeh  */
3101cc4a38bdSAlex Elder static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev)
3102602adf40SYehuda Sadeh {
3103602adf40SYehuda Sadeh 	int ret;
3104602adf40SYehuda Sadeh 	struct rbd_image_header h;
3105602adf40SYehuda Sadeh 
3106602adf40SYehuda Sadeh 	ret = rbd_read_header(rbd_dev, &h);
3107602adf40SYehuda Sadeh 	if (ret < 0)
3108602adf40SYehuda Sadeh 		return ret;
3109602adf40SYehuda Sadeh 
3110a51aa0c0SJosh Durgin 	down_write(&rbd_dev->header_rwsem);
3111a51aa0c0SJosh Durgin 
31129478554aSAlex Elder 	/* Update image size, and check for resize of mapped image */
31139478554aSAlex Elder 	rbd_dev->header.image_size = h.image_size;
311429334ba4SAlex Elder 	if (rbd_dev->spec->snap_id == CEPH_NOSNAP)
311529334ba4SAlex Elder 		if (rbd_dev->mapping.size != rbd_dev->header.image_size)
311629334ba4SAlex Elder 			rbd_dev->mapping.size = rbd_dev->header.image_size;
31179db4b3e3SSage Weil 
3118849b4260SAlex Elder 	/* rbd_dev->header.object_prefix shouldn't change */
3119602adf40SYehuda Sadeh 	kfree(rbd_dev->header.snap_sizes);
3120849b4260SAlex Elder 	kfree(rbd_dev->header.snap_names);
3121d1d25646SJosh Durgin 	/* osd requests may still refer to snapc */
3122812164f8SAlex Elder 	ceph_put_snap_context(rbd_dev->header.snapc);
3123602adf40SYehuda Sadeh 
312493a24e08SJosh Durgin 	rbd_dev->header.image_size = h.image_size;
3125602adf40SYehuda Sadeh 	rbd_dev->header.snapc = h.snapc;
3126602adf40SYehuda Sadeh 	rbd_dev->header.snap_names = h.snap_names;
3127602adf40SYehuda Sadeh 	rbd_dev->header.snap_sizes = h.snap_sizes;
3128849b4260SAlex Elder 	/* Free the extra copy of the object prefix */
3129c0cd10dbSAlex Elder 	if (strcmp(rbd_dev->header.object_prefix, h.object_prefix))
3130c0cd10dbSAlex Elder 		rbd_warn(rbd_dev, "object prefix changed (ignoring)");
3131849b4260SAlex Elder 	kfree(h.object_prefix);
3132849b4260SAlex Elder 
3133c666601aSJosh Durgin 	up_write(&rbd_dev->header_rwsem);
3134602adf40SYehuda Sadeh 
3135dfc5606dSYehuda Sadeh 	return ret;
3136602adf40SYehuda Sadeh }
3137602adf40SYehuda Sadeh 
313815228edeSAlex Elder /*
313915228edeSAlex Elder  * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
314015228edeSAlex Elder  * has disappeared from the (just updated) snapshot context.
314115228edeSAlex Elder  */
314215228edeSAlex Elder static void rbd_exists_validate(struct rbd_device *rbd_dev)
314315228edeSAlex Elder {
314415228edeSAlex Elder 	u64 snap_id;
314515228edeSAlex Elder 
314615228edeSAlex Elder 	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
314715228edeSAlex Elder 		return;
314815228edeSAlex Elder 
314915228edeSAlex Elder 	snap_id = rbd_dev->spec->snap_id;
315015228edeSAlex Elder 	if (snap_id == CEPH_NOSNAP)
315115228edeSAlex Elder 		return;
315215228edeSAlex Elder 
315315228edeSAlex Elder 	if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
315415228edeSAlex Elder 		clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
315515228edeSAlex Elder }
315615228edeSAlex Elder 
3157cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev)
31581fe5e993SAlex Elder {
3159e627db08SAlex Elder 	u64 mapping_size;
31601fe5e993SAlex Elder 	int ret;
31611fe5e993SAlex Elder 
3162117973fbSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
3163e627db08SAlex Elder 	mapping_size = rbd_dev->mapping.size;
31641fe5e993SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3165117973fbSAlex Elder 	if (rbd_dev->image_format == 1)
3166cc4a38bdSAlex Elder 		ret = rbd_dev_v1_refresh(rbd_dev);
3167117973fbSAlex Elder 	else
3168cc4a38bdSAlex Elder 		ret = rbd_dev_v2_refresh(rbd_dev);
316915228edeSAlex Elder 
317015228edeSAlex Elder 	/* If it's a mapped snapshot, validate its EXISTS flag */
317115228edeSAlex Elder 
317215228edeSAlex Elder 	rbd_exists_validate(rbd_dev);
31731fe5e993SAlex Elder 	mutex_unlock(&ctl_mutex);
317400a653e2SAlex Elder 	if (mapping_size != rbd_dev->mapping.size) {
317500a653e2SAlex Elder 		sector_t size;
317600a653e2SAlex Elder 
317700a653e2SAlex Elder 		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
317800a653e2SAlex Elder 		dout("setting size to %llu sectors", (unsigned long long)size);
317900a653e2SAlex Elder 		set_capacity(rbd_dev->disk, size);
3180a3fbe5d4SAlex Elder 		revalidate_disk(rbd_dev->disk);
318100a653e2SAlex Elder 	}
31821fe5e993SAlex Elder 
31831fe5e993SAlex Elder 	return ret;
31841fe5e993SAlex Elder }
31851fe5e993SAlex Elder 
3186602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
3187602adf40SYehuda Sadeh {
3188602adf40SYehuda Sadeh 	struct gendisk *disk;
3189602adf40SYehuda Sadeh 	struct request_queue *q;
3190593a9e7bSAlex Elder 	u64 segment_size;
3191602adf40SYehuda Sadeh 
3192602adf40SYehuda Sadeh 	/* create gendisk info */
3193602adf40SYehuda Sadeh 	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
3194602adf40SYehuda Sadeh 	if (!disk)
31951fcdb8aaSAlex Elder 		return -ENOMEM;
3196602adf40SYehuda Sadeh 
3197f0f8cef5SAlex Elder 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
3198de71a297SAlex Elder 		 rbd_dev->dev_id);
3199602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
3200602adf40SYehuda Sadeh 	disk->first_minor = 0;
3201602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
3202602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
3203602adf40SYehuda Sadeh 
3204bf0d5f50SAlex Elder 	q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
3205602adf40SYehuda Sadeh 	if (!q)
3206602adf40SYehuda Sadeh 		goto out_disk;
3207029bcbd8SJosh Durgin 
3208593a9e7bSAlex Elder 	/* We use the default size, but let's be explicit about it. */
3209593a9e7bSAlex Elder 	blk_queue_physical_block_size(q, SECTOR_SIZE);
3210593a9e7bSAlex Elder 
3211029bcbd8SJosh Durgin 	/* set io sizes to object size */
3212593a9e7bSAlex Elder 	segment_size = rbd_obj_bytes(&rbd_dev->header);
3213593a9e7bSAlex Elder 	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
3214593a9e7bSAlex Elder 	blk_queue_max_segment_size(q, segment_size);
3215593a9e7bSAlex Elder 	blk_queue_io_min(q, segment_size);
3216593a9e7bSAlex Elder 	blk_queue_io_opt(q, segment_size);
3217029bcbd8SJosh Durgin 
3218602adf40SYehuda Sadeh 	blk_queue_merge_bvec(q, rbd_merge_bvec);
3219602adf40SYehuda Sadeh 	disk->queue = q;
3220602adf40SYehuda Sadeh 
3221602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
3222602adf40SYehuda Sadeh 
3223602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
3224602adf40SYehuda Sadeh 
3225602adf40SYehuda Sadeh 	return 0;
3226602adf40SYehuda Sadeh out_disk:
3227602adf40SYehuda Sadeh 	put_disk(disk);
32281fcdb8aaSAlex Elder 
32291fcdb8aaSAlex Elder 	return -ENOMEM;
3230602adf40SYehuda Sadeh }
3231602adf40SYehuda Sadeh 
3232dfc5606dSYehuda Sadeh /*
3233dfc5606dSYehuda Sadeh   sysfs
3234dfc5606dSYehuda Sadeh */
3235602adf40SYehuda Sadeh 
3236593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
3237593a9e7bSAlex Elder {
3238593a9e7bSAlex Elder 	return container_of(dev, struct rbd_device, dev);
3239593a9e7bSAlex Elder }
3240593a9e7bSAlex Elder 
3241dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
3242dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3243602adf40SYehuda Sadeh {
3244593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3245dfc5606dSYehuda Sadeh 
3246fc71d833SAlex Elder 	return sprintf(buf, "%llu\n",
3247fc71d833SAlex Elder 		(unsigned long long)rbd_dev->mapping.size);
3248602adf40SYehuda Sadeh }
3249602adf40SYehuda Sadeh 
325034b13184SAlex Elder /*
325134b13184SAlex Elder  * Note this shows the features for whatever's mapped, which is not
325234b13184SAlex Elder  * necessarily the base image.
325334b13184SAlex Elder  */
325434b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev,
325534b13184SAlex Elder 			     struct device_attribute *attr, char *buf)
325634b13184SAlex Elder {
325734b13184SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
325834b13184SAlex Elder 
325934b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
326034b13184SAlex Elder 			(unsigned long long)rbd_dev->mapping.features);
326134b13184SAlex Elder }
326234b13184SAlex Elder 
3263dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
3264dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
3265602adf40SYehuda Sadeh {
3266593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3267dfc5606dSYehuda Sadeh 
3268fc71d833SAlex Elder 	if (rbd_dev->major)
3269dfc5606dSYehuda Sadeh 		return sprintf(buf, "%d\n", rbd_dev->major);
3270fc71d833SAlex Elder 
3271fc71d833SAlex Elder 	return sprintf(buf, "(none)\n");
3272fc71d833SAlex Elder 
3273dfc5606dSYehuda Sadeh }
3274dfc5606dSYehuda Sadeh 
3275dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
3276dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
3277dfc5606dSYehuda Sadeh {
3278593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3279dfc5606dSYehuda Sadeh 
32801dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
32811dbb4399SAlex Elder 			ceph_client_id(rbd_dev->rbd_client->client));
3282dfc5606dSYehuda Sadeh }
3283dfc5606dSYehuda Sadeh 
3284dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
3285dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3286dfc5606dSYehuda Sadeh {
3287593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3288dfc5606dSYehuda Sadeh 
32890d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
3290dfc5606dSYehuda Sadeh }
3291dfc5606dSYehuda Sadeh 
32929bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
32939bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
32949bb2f334SAlex Elder {
32959bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
32969bb2f334SAlex Elder 
32970d7dbfceSAlex Elder 	return sprintf(buf, "%llu\n",
32980d7dbfceSAlex Elder 			(unsigned long long) rbd_dev->spec->pool_id);
32999bb2f334SAlex Elder }
33009bb2f334SAlex Elder 
3301dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
3302dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3303dfc5606dSYehuda Sadeh {
3304593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3305dfc5606dSYehuda Sadeh 
3306a92ffdf8SAlex Elder 	if (rbd_dev->spec->image_name)
33070d7dbfceSAlex Elder 		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
3308a92ffdf8SAlex Elder 
3309a92ffdf8SAlex Elder 	return sprintf(buf, "(unknown)\n");
3310dfc5606dSYehuda Sadeh }
3311dfc5606dSYehuda Sadeh 
3312589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev,
3313589d30e0SAlex Elder 			     struct device_attribute *attr, char *buf)
3314589d30e0SAlex Elder {
3315589d30e0SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3316589d30e0SAlex Elder 
33170d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
3318589d30e0SAlex Elder }
3319589d30e0SAlex Elder 
332034b13184SAlex Elder /*
332134b13184SAlex Elder  * Shows the name of the currently-mapped snapshot (or
332234b13184SAlex Elder  * RBD_SNAP_HEAD_NAME for the base image).
332334b13184SAlex Elder  */
3324dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
3325dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
3326dfc5606dSYehuda Sadeh 			     char *buf)
3327dfc5606dSYehuda Sadeh {
3328593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3329dfc5606dSYehuda Sadeh 
33300d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
3331dfc5606dSYehuda Sadeh }
3332dfc5606dSYehuda Sadeh 
333386b00e0dSAlex Elder /*
333486b00e0dSAlex Elder  * For an rbd v2 image, shows the pool id, image id, and snapshot id
333586b00e0dSAlex Elder  * for the parent image.  If there is no parent, simply shows
333686b00e0dSAlex Elder  * "(no parent image)".
333786b00e0dSAlex Elder  */
333886b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev,
333986b00e0dSAlex Elder 			     struct device_attribute *attr,
334086b00e0dSAlex Elder 			     char *buf)
334186b00e0dSAlex Elder {
334286b00e0dSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
334386b00e0dSAlex Elder 	struct rbd_spec *spec = rbd_dev->parent_spec;
334486b00e0dSAlex Elder 	int count;
334586b00e0dSAlex Elder 	char *bufp = buf;
334686b00e0dSAlex Elder 
334786b00e0dSAlex Elder 	if (!spec)
334886b00e0dSAlex Elder 		return sprintf(buf, "(no parent image)\n");
334986b00e0dSAlex Elder 
335086b00e0dSAlex Elder 	count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
335186b00e0dSAlex Elder 			(unsigned long long) spec->pool_id, spec->pool_name);
335286b00e0dSAlex Elder 	if (count < 0)
335386b00e0dSAlex Elder 		return count;
335486b00e0dSAlex Elder 	bufp += count;
335586b00e0dSAlex Elder 
335686b00e0dSAlex Elder 	count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
335786b00e0dSAlex Elder 			spec->image_name ? spec->image_name : "(unknown)");
335886b00e0dSAlex Elder 	if (count < 0)
335986b00e0dSAlex Elder 		return count;
336086b00e0dSAlex Elder 	bufp += count;
336186b00e0dSAlex Elder 
336286b00e0dSAlex Elder 	count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
336386b00e0dSAlex Elder 			(unsigned long long) spec->snap_id, spec->snap_name);
336486b00e0dSAlex Elder 	if (count < 0)
336586b00e0dSAlex Elder 		return count;
336686b00e0dSAlex Elder 	bufp += count;
336786b00e0dSAlex Elder 
336886b00e0dSAlex Elder 	count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
336986b00e0dSAlex Elder 	if (count < 0)
337086b00e0dSAlex Elder 		return count;
337186b00e0dSAlex Elder 	bufp += count;
337286b00e0dSAlex Elder 
337386b00e0dSAlex Elder 	return (ssize_t) (bufp - buf);
337486b00e0dSAlex Elder }
337586b00e0dSAlex Elder 
3376dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
3377dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
3378dfc5606dSYehuda Sadeh 				 const char *buf,
3379dfc5606dSYehuda Sadeh 				 size_t size)
3380dfc5606dSYehuda Sadeh {
3381593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3382b813623aSAlex Elder 	int ret;
3383602adf40SYehuda Sadeh 
3384cc4a38bdSAlex Elder 	ret = rbd_dev_refresh(rbd_dev);
3385e627db08SAlex Elder 	if (ret)
3386e627db08SAlex Elder 		rbd_warn(rbd_dev, ": manual header refresh error (%d)\n", ret);
3387b813623aSAlex Elder 
3388b813623aSAlex Elder 	return ret < 0 ? ret : size;
3389dfc5606dSYehuda Sadeh }
3390602adf40SYehuda Sadeh 
/* Read-only attributes, each backed by the matching rbd_*_show() routine */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);

/* Write-only (owner) attribute: a write triggers a header refresh */
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
3402dfc5606dSYehuda Sadeh 
3403dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = {
3404dfc5606dSYehuda Sadeh 	&dev_attr_size.attr,
340534b13184SAlex Elder 	&dev_attr_features.attr,
3406dfc5606dSYehuda Sadeh 	&dev_attr_major.attr,
3407dfc5606dSYehuda Sadeh 	&dev_attr_client_id.attr,
3408dfc5606dSYehuda Sadeh 	&dev_attr_pool.attr,
34099bb2f334SAlex Elder 	&dev_attr_pool_id.attr,
3410dfc5606dSYehuda Sadeh 	&dev_attr_name.attr,
3411589d30e0SAlex Elder 	&dev_attr_image_id.attr,
3412dfc5606dSYehuda Sadeh 	&dev_attr_current_snap.attr,
341386b00e0dSAlex Elder 	&dev_attr_parent.attr,
3414dfc5606dSYehuda Sadeh 	&dev_attr_refresh.attr,
3415dfc5606dSYehuda Sadeh 	NULL
3416dfc5606dSYehuda Sadeh };
3417dfc5606dSYehuda Sadeh 
3418dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = {
3419dfc5606dSYehuda Sadeh 	.attrs = rbd_attrs,
3420dfc5606dSYehuda Sadeh };
3421dfc5606dSYehuda Sadeh 
3422dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = {
3423dfc5606dSYehuda Sadeh 	&rbd_attr_group,
3424dfc5606dSYehuda Sadeh 	NULL
3425dfc5606dSYehuda Sadeh };
3426dfc5606dSYehuda Sadeh 
/*
 * Release callback installed in rbd_device_type.  It does nothing.
 * NOTE(review): presumably the rbd_device memory is released through
 * rbd_dev_destroy() instead -- confirm against the teardown path.
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
	/* nothing to do */
}
3430dfc5606dSYehuda Sadeh 
3431dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = {
3432dfc5606dSYehuda Sadeh 	.name		= "rbd",
3433dfc5606dSYehuda Sadeh 	.groups		= rbd_attr_groups,
3434dfc5606dSYehuda Sadeh 	.release	= rbd_sysfs_dev_release,
3435dfc5606dSYehuda Sadeh };
3436dfc5606dSYehuda Sadeh 
34378b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
34388b8fb99cSAlex Elder {
34398b8fb99cSAlex Elder 	kref_get(&spec->kref);
34408b8fb99cSAlex Elder 
34418b8fb99cSAlex Elder 	return spec;
34428b8fb99cSAlex Elder }
34438b8fb99cSAlex Elder 
34448b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref);
34458b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec)
34468b8fb99cSAlex Elder {
34478b8fb99cSAlex Elder 	if (spec)
34488b8fb99cSAlex Elder 		kref_put(&spec->kref, rbd_spec_free);
34498b8fb99cSAlex Elder }
34508b8fb99cSAlex Elder 
34518b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void)
34528b8fb99cSAlex Elder {
34538b8fb99cSAlex Elder 	struct rbd_spec *spec;
34548b8fb99cSAlex Elder 
34558b8fb99cSAlex Elder 	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
34568b8fb99cSAlex Elder 	if (!spec)
34578b8fb99cSAlex Elder 		return NULL;
34588b8fb99cSAlex Elder 	kref_init(&spec->kref);
34598b8fb99cSAlex Elder 
34608b8fb99cSAlex Elder 	return spec;
34618b8fb99cSAlex Elder }
34628b8fb99cSAlex Elder 
34638b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref)
34648b8fb99cSAlex Elder {
34658b8fb99cSAlex Elder 	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
34668b8fb99cSAlex Elder 
34678b8fb99cSAlex Elder 	kfree(spec->pool_name);
34688b8fb99cSAlex Elder 	kfree(spec->image_id);
34698b8fb99cSAlex Elder 	kfree(spec->image_name);
34708b8fb99cSAlex Elder 	kfree(spec->snap_name);
34718b8fb99cSAlex Elder 	kfree(spec);
34728b8fb99cSAlex Elder }
34738b8fb99cSAlex Elder 
3474cc344fa1SAlex Elder static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
3475c53d5893SAlex Elder 				struct rbd_spec *spec)
3476c53d5893SAlex Elder {
3477c53d5893SAlex Elder 	struct rbd_device *rbd_dev;
3478c53d5893SAlex Elder 
3479c53d5893SAlex Elder 	rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
3480c53d5893SAlex Elder 	if (!rbd_dev)
3481c53d5893SAlex Elder 		return NULL;
3482c53d5893SAlex Elder 
3483c53d5893SAlex Elder 	spin_lock_init(&rbd_dev->lock);
34846d292906SAlex Elder 	rbd_dev->flags = 0;
3485c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->node);
3486c53d5893SAlex Elder 	init_rwsem(&rbd_dev->header_rwsem);
3487c53d5893SAlex Elder 
3488c53d5893SAlex Elder 	rbd_dev->spec = spec;
3489c53d5893SAlex Elder 	rbd_dev->rbd_client = rbdc;
3490c53d5893SAlex Elder 
34910903e875SAlex Elder 	/* Initialize the layout used for all rbd requests */
34920903e875SAlex Elder 
34930903e875SAlex Elder 	rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
34940903e875SAlex Elder 	rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
34950903e875SAlex Elder 	rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
34960903e875SAlex Elder 	rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
34970903e875SAlex Elder 
3498c53d5893SAlex Elder 	return rbd_dev;
3499c53d5893SAlex Elder }
3500c53d5893SAlex Elder 
3501c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev)
3502c53d5893SAlex Elder {
3503c53d5893SAlex Elder 	rbd_put_client(rbd_dev->rbd_client);
3504c53d5893SAlex Elder 	rbd_spec_put(rbd_dev->spec);
3505c53d5893SAlex Elder 	kfree(rbd_dev);
3506c53d5893SAlex Elder }
3507c53d5893SAlex Elder 
3508dfc5606dSYehuda Sadeh /*
35099d475de5SAlex Elder  * Get the size and object order for an image snapshot, or if
35109d475de5SAlex Elder  * snap_id is CEPH_NOSNAP, gets this information for the base
35119d475de5SAlex Elder  * image.
35129d475de5SAlex Elder  */
35139d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
35149d475de5SAlex Elder 				u8 *order, u64 *snap_size)
35159d475de5SAlex Elder {
35169d475de5SAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
35179d475de5SAlex Elder 	int ret;
35189d475de5SAlex Elder 	struct {
35199d475de5SAlex Elder 		u8 order;
35209d475de5SAlex Elder 		__le64 size;
35219d475de5SAlex Elder 	} __attribute__ ((packed)) size_buf = { 0 };
35229d475de5SAlex Elder 
352336be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
35249d475de5SAlex Elder 				"rbd", "get_size",
35254157976bSAlex Elder 				&snapid, sizeof (snapid),
3526e2a58ee5SAlex Elder 				&size_buf, sizeof (size_buf));
352736be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
35289d475de5SAlex Elder 	if (ret < 0)
35299d475de5SAlex Elder 		return ret;
353057385b51SAlex Elder 	if (ret < sizeof (size_buf))
353157385b51SAlex Elder 		return -ERANGE;
35329d475de5SAlex Elder 
3533c86f86e9SAlex Elder 	if (order)
35349d475de5SAlex Elder 		*order = size_buf.order;
35359d475de5SAlex Elder 	*snap_size = le64_to_cpu(size_buf.size);
35369d475de5SAlex Elder 
35379d475de5SAlex Elder 	dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
35389d475de5SAlex Elder 		(unsigned long long)snap_id, (unsigned int)*order,
35399d475de5SAlex Elder 		(unsigned long long)*snap_size);
35409d475de5SAlex Elder 
35419d475de5SAlex Elder 	return 0;
35429d475de5SAlex Elder }
35439d475de5SAlex Elder 
35449d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
35459d475de5SAlex Elder {
35469d475de5SAlex Elder 	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
35479d475de5SAlex Elder 					&rbd_dev->header.obj_order,
35489d475de5SAlex Elder 					&rbd_dev->header.image_size);
35499d475de5SAlex Elder }
35509d475de5SAlex Elder 
35511e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
35521e130199SAlex Elder {
35531e130199SAlex Elder 	void *reply_buf;
35541e130199SAlex Elder 	int ret;
35551e130199SAlex Elder 	void *p;
35561e130199SAlex Elder 
35571e130199SAlex Elder 	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
35581e130199SAlex Elder 	if (!reply_buf)
35591e130199SAlex Elder 		return -ENOMEM;
35601e130199SAlex Elder 
356136be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
35624157976bSAlex Elder 				"rbd", "get_object_prefix", NULL, 0,
3563e2a58ee5SAlex Elder 				reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
356436be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
35651e130199SAlex Elder 	if (ret < 0)
35661e130199SAlex Elder 		goto out;
35671e130199SAlex Elder 
35681e130199SAlex Elder 	p = reply_buf;
35691e130199SAlex Elder 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
357057385b51SAlex Elder 						p + ret, NULL, GFP_NOIO);
357157385b51SAlex Elder 	ret = 0;
35721e130199SAlex Elder 
35731e130199SAlex Elder 	if (IS_ERR(rbd_dev->header.object_prefix)) {
35741e130199SAlex Elder 		ret = PTR_ERR(rbd_dev->header.object_prefix);
35751e130199SAlex Elder 		rbd_dev->header.object_prefix = NULL;
35761e130199SAlex Elder 	} else {
35771e130199SAlex Elder 		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
35781e130199SAlex Elder 	}
35791e130199SAlex Elder out:
35801e130199SAlex Elder 	kfree(reply_buf);
35811e130199SAlex Elder 
35821e130199SAlex Elder 	return ret;
35831e130199SAlex Elder }
35841e130199SAlex Elder 
3585b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
3586b1b5402aSAlex Elder 		u64 *snap_features)
3587b1b5402aSAlex Elder {
3588b1b5402aSAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
3589b1b5402aSAlex Elder 	struct {
3590b1b5402aSAlex Elder 		__le64 features;
3591b1b5402aSAlex Elder 		__le64 incompat;
35924157976bSAlex Elder 	} __attribute__ ((packed)) features_buf = { 0 };
3593d889140cSAlex Elder 	u64 incompat;
3594b1b5402aSAlex Elder 	int ret;
3595b1b5402aSAlex Elder 
359636be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3597b1b5402aSAlex Elder 				"rbd", "get_features",
35984157976bSAlex Elder 				&snapid, sizeof (snapid),
3599e2a58ee5SAlex Elder 				&features_buf, sizeof (features_buf));
360036be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3601b1b5402aSAlex Elder 	if (ret < 0)
3602b1b5402aSAlex Elder 		return ret;
360357385b51SAlex Elder 	if (ret < sizeof (features_buf))
360457385b51SAlex Elder 		return -ERANGE;
3605d889140cSAlex Elder 
3606d889140cSAlex Elder 	incompat = le64_to_cpu(features_buf.incompat);
36075cbf6f12SAlex Elder 	if (incompat & ~RBD_FEATURES_SUPPORTED)
3608b8f5c6edSAlex Elder 		return -ENXIO;
3609d889140cSAlex Elder 
3610b1b5402aSAlex Elder 	*snap_features = le64_to_cpu(features_buf.features);
3611b1b5402aSAlex Elder 
3612b1b5402aSAlex Elder 	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
3613b1b5402aSAlex Elder 		(unsigned long long)snap_id,
3614b1b5402aSAlex Elder 		(unsigned long long)*snap_features,
3615b1b5402aSAlex Elder 		(unsigned long long)le64_to_cpu(features_buf.incompat));
3616b1b5402aSAlex Elder 
3617b1b5402aSAlex Elder 	return 0;
3618b1b5402aSAlex Elder }
3619b1b5402aSAlex Elder 
3620b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
3621b1b5402aSAlex Elder {
3622b1b5402aSAlex Elder 	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
3623b1b5402aSAlex Elder 						&rbd_dev->header.features);
3624b1b5402aSAlex Elder }
3625b1b5402aSAlex Elder 
362686b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
362786b00e0dSAlex Elder {
362886b00e0dSAlex Elder 	struct rbd_spec *parent_spec;
362986b00e0dSAlex Elder 	size_t size;
363086b00e0dSAlex Elder 	void *reply_buf = NULL;
363186b00e0dSAlex Elder 	__le64 snapid;
363286b00e0dSAlex Elder 	void *p;
363386b00e0dSAlex Elder 	void *end;
363486b00e0dSAlex Elder 	char *image_id;
363586b00e0dSAlex Elder 	u64 overlap;
363686b00e0dSAlex Elder 	int ret;
363786b00e0dSAlex Elder 
363886b00e0dSAlex Elder 	parent_spec = rbd_spec_alloc();
363986b00e0dSAlex Elder 	if (!parent_spec)
364086b00e0dSAlex Elder 		return -ENOMEM;
364186b00e0dSAlex Elder 
364286b00e0dSAlex Elder 	size = sizeof (__le64) +				/* pool_id */
364386b00e0dSAlex Elder 		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
364486b00e0dSAlex Elder 		sizeof (__le64) +				/* snap_id */
364586b00e0dSAlex Elder 		sizeof (__le64);				/* overlap */
364686b00e0dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
364786b00e0dSAlex Elder 	if (!reply_buf) {
364886b00e0dSAlex Elder 		ret = -ENOMEM;
364986b00e0dSAlex Elder 		goto out_err;
365086b00e0dSAlex Elder 	}
365186b00e0dSAlex Elder 
365286b00e0dSAlex Elder 	snapid = cpu_to_le64(CEPH_NOSNAP);
365336be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
365486b00e0dSAlex Elder 				"rbd", "get_parent",
36554157976bSAlex Elder 				&snapid, sizeof (snapid),
3656e2a58ee5SAlex Elder 				reply_buf, size);
365736be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
365886b00e0dSAlex Elder 	if (ret < 0)
365986b00e0dSAlex Elder 		goto out_err;
366086b00e0dSAlex Elder 
366186b00e0dSAlex Elder 	p = reply_buf;
366257385b51SAlex Elder 	end = reply_buf + ret;
366357385b51SAlex Elder 	ret = -ERANGE;
366486b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
366586b00e0dSAlex Elder 	if (parent_spec->pool_id == CEPH_NOPOOL)
366686b00e0dSAlex Elder 		goto out;	/* No parent?  No problem. */
366786b00e0dSAlex Elder 
36680903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
36690903e875SAlex Elder 
36700903e875SAlex Elder 	ret = -EIO;
3671c0cd10dbSAlex Elder 	if (parent_spec->pool_id > (u64)U32_MAX) {
3672c0cd10dbSAlex Elder 		rbd_warn(NULL, "parent pool id too large (%llu > %u)\n",
3673c0cd10dbSAlex Elder 			(unsigned long long)parent_spec->pool_id, U32_MAX);
367457385b51SAlex Elder 		goto out_err;
3675c0cd10dbSAlex Elder 	}
36760903e875SAlex Elder 
3677979ed480SAlex Elder 	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
367886b00e0dSAlex Elder 	if (IS_ERR(image_id)) {
367986b00e0dSAlex Elder 		ret = PTR_ERR(image_id);
368086b00e0dSAlex Elder 		goto out_err;
368186b00e0dSAlex Elder 	}
368286b00e0dSAlex Elder 	parent_spec->image_id = image_id;
368386b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
368486b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, overlap, out_err);
368586b00e0dSAlex Elder 
368686b00e0dSAlex Elder 	rbd_dev->parent_overlap = overlap;
368786b00e0dSAlex Elder 	rbd_dev->parent_spec = parent_spec;
368886b00e0dSAlex Elder 	parent_spec = NULL;	/* rbd_dev now owns this */
368986b00e0dSAlex Elder out:
369086b00e0dSAlex Elder 	ret = 0;
369186b00e0dSAlex Elder out_err:
369286b00e0dSAlex Elder 	kfree(reply_buf);
369386b00e0dSAlex Elder 	rbd_spec_put(parent_spec);
369486b00e0dSAlex Elder 
369586b00e0dSAlex Elder 	return ret;
369686b00e0dSAlex Elder }
369786b00e0dSAlex Elder 
3698cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
3699cc070d59SAlex Elder {
3700cc070d59SAlex Elder 	struct {
3701cc070d59SAlex Elder 		__le64 stripe_unit;
3702cc070d59SAlex Elder 		__le64 stripe_count;
3703cc070d59SAlex Elder 	} __attribute__ ((packed)) striping_info_buf = { 0 };
3704cc070d59SAlex Elder 	size_t size = sizeof (striping_info_buf);
3705cc070d59SAlex Elder 	void *p;
3706cc070d59SAlex Elder 	u64 obj_size;
3707cc070d59SAlex Elder 	u64 stripe_unit;
3708cc070d59SAlex Elder 	u64 stripe_count;
3709cc070d59SAlex Elder 	int ret;
3710cc070d59SAlex Elder 
3711cc070d59SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3712cc070d59SAlex Elder 				"rbd", "get_stripe_unit_count", NULL, 0,
3713e2a58ee5SAlex Elder 				(char *)&striping_info_buf, size);
3714cc070d59SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3715cc070d59SAlex Elder 	if (ret < 0)
3716cc070d59SAlex Elder 		return ret;
3717cc070d59SAlex Elder 	if (ret < size)
3718cc070d59SAlex Elder 		return -ERANGE;
3719cc070d59SAlex Elder 
3720cc070d59SAlex Elder 	/*
3721cc070d59SAlex Elder 	 * We don't actually support the "fancy striping" feature
3722cc070d59SAlex Elder 	 * (STRIPINGV2) yet, but if the striping sizes are the
3723cc070d59SAlex Elder 	 * defaults the behavior is the same as before.  So find
3724cc070d59SAlex Elder 	 * out, and only fail if the image has non-default values.
3725cc070d59SAlex Elder 	 */
3726cc070d59SAlex Elder 	ret = -EINVAL;
3727cc070d59SAlex Elder 	obj_size = (u64)1 << rbd_dev->header.obj_order;
3728cc070d59SAlex Elder 	p = &striping_info_buf;
3729cc070d59SAlex Elder 	stripe_unit = ceph_decode_64(&p);
3730cc070d59SAlex Elder 	if (stripe_unit != obj_size) {
3731cc070d59SAlex Elder 		rbd_warn(rbd_dev, "unsupported stripe unit "
3732cc070d59SAlex Elder 				"(got %llu want %llu)",
3733cc070d59SAlex Elder 				stripe_unit, obj_size);
3734cc070d59SAlex Elder 		return -EINVAL;
3735cc070d59SAlex Elder 	}
3736cc070d59SAlex Elder 	stripe_count = ceph_decode_64(&p);
3737cc070d59SAlex Elder 	if (stripe_count != 1) {
3738cc070d59SAlex Elder 		rbd_warn(rbd_dev, "unsupported stripe count "
3739cc070d59SAlex Elder 				"(got %llu want 1)", stripe_count);
3740cc070d59SAlex Elder 		return -EINVAL;
3741cc070d59SAlex Elder 	}
3742500d0c0fSAlex Elder 	rbd_dev->header.stripe_unit = stripe_unit;
3743500d0c0fSAlex Elder 	rbd_dev->header.stripe_count = stripe_count;
3744cc070d59SAlex Elder 
3745cc070d59SAlex Elder 	return 0;
3746cc070d59SAlex Elder }
3747cc070d59SAlex Elder 
37489e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
37499e15b77dSAlex Elder {
37509e15b77dSAlex Elder 	size_t image_id_size;
37519e15b77dSAlex Elder 	char *image_id;
37529e15b77dSAlex Elder 	void *p;
37539e15b77dSAlex Elder 	void *end;
37549e15b77dSAlex Elder 	size_t size;
37559e15b77dSAlex Elder 	void *reply_buf = NULL;
37569e15b77dSAlex Elder 	size_t len = 0;
37579e15b77dSAlex Elder 	char *image_name = NULL;
37589e15b77dSAlex Elder 	int ret;
37599e15b77dSAlex Elder 
37609e15b77dSAlex Elder 	rbd_assert(!rbd_dev->spec->image_name);
37619e15b77dSAlex Elder 
376269e7a02fSAlex Elder 	len = strlen(rbd_dev->spec->image_id);
376369e7a02fSAlex Elder 	image_id_size = sizeof (__le32) + len;
37649e15b77dSAlex Elder 	image_id = kmalloc(image_id_size, GFP_KERNEL);
37659e15b77dSAlex Elder 	if (!image_id)
37669e15b77dSAlex Elder 		return NULL;
37679e15b77dSAlex Elder 
37689e15b77dSAlex Elder 	p = image_id;
37694157976bSAlex Elder 	end = image_id + image_id_size;
377069e7a02fSAlex Elder 	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
37719e15b77dSAlex Elder 
37729e15b77dSAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
37739e15b77dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
37749e15b77dSAlex Elder 	if (!reply_buf)
37759e15b77dSAlex Elder 		goto out;
37769e15b77dSAlex Elder 
377736be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
37789e15b77dSAlex Elder 				"rbd", "dir_get_name",
37799e15b77dSAlex Elder 				image_id, image_id_size,
3780e2a58ee5SAlex Elder 				reply_buf, size);
37819e15b77dSAlex Elder 	if (ret < 0)
37829e15b77dSAlex Elder 		goto out;
37839e15b77dSAlex Elder 	p = reply_buf;
3784f40eb349SAlex Elder 	end = reply_buf + ret;
3785f40eb349SAlex Elder 
37869e15b77dSAlex Elder 	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
37879e15b77dSAlex Elder 	if (IS_ERR(image_name))
37889e15b77dSAlex Elder 		image_name = NULL;
37899e15b77dSAlex Elder 	else
37909e15b77dSAlex Elder 		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
37919e15b77dSAlex Elder out:
37929e15b77dSAlex Elder 	kfree(reply_buf);
37939e15b77dSAlex Elder 	kfree(image_id);
37949e15b77dSAlex Elder 
37959e15b77dSAlex Elder 	return image_name;
37969e15b77dSAlex Elder }
37979e15b77dSAlex Elder 
37982ad3d716SAlex Elder static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
37992ad3d716SAlex Elder {
38002ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
38012ad3d716SAlex Elder 	const char *snap_name;
38022ad3d716SAlex Elder 	u32 which = 0;
38032ad3d716SAlex Elder 
38042ad3d716SAlex Elder 	/* Skip over names until we find the one we are looking for */
38052ad3d716SAlex Elder 
38062ad3d716SAlex Elder 	snap_name = rbd_dev->header.snap_names;
38072ad3d716SAlex Elder 	while (which < snapc->num_snaps) {
38082ad3d716SAlex Elder 		if (!strcmp(name, snap_name))
38092ad3d716SAlex Elder 			return snapc->snaps[which];
38102ad3d716SAlex Elder 		snap_name += strlen(snap_name) + 1;
38112ad3d716SAlex Elder 		which++;
38122ad3d716SAlex Elder 	}
38132ad3d716SAlex Elder 	return CEPH_NOSNAP;
38142ad3d716SAlex Elder }
38152ad3d716SAlex Elder 
38162ad3d716SAlex Elder static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
38172ad3d716SAlex Elder {
38182ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
38192ad3d716SAlex Elder 	u32 which;
38202ad3d716SAlex Elder 	bool found = false;
38212ad3d716SAlex Elder 	u64 snap_id;
38222ad3d716SAlex Elder 
38232ad3d716SAlex Elder 	for (which = 0; !found && which < snapc->num_snaps; which++) {
38242ad3d716SAlex Elder 		const char *snap_name;
38252ad3d716SAlex Elder 
38262ad3d716SAlex Elder 		snap_id = snapc->snaps[which];
38272ad3d716SAlex Elder 		snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
38282ad3d716SAlex Elder 		if (IS_ERR(snap_name))
38292ad3d716SAlex Elder 			break;
38302ad3d716SAlex Elder 		found = !strcmp(name, snap_name);
38312ad3d716SAlex Elder 		kfree(snap_name);
38322ad3d716SAlex Elder 	}
38332ad3d716SAlex Elder 	return found ? snap_id : CEPH_NOSNAP;
38342ad3d716SAlex Elder }
38352ad3d716SAlex Elder 
38362ad3d716SAlex Elder /*
38372ad3d716SAlex Elder  * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
38382ad3d716SAlex Elder  * no snapshot by that name is found, or if an error occurs.
38392ad3d716SAlex Elder  */
38402ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
38412ad3d716SAlex Elder {
38422ad3d716SAlex Elder 	if (rbd_dev->image_format == 1)
38432ad3d716SAlex Elder 		return rbd_v1_snap_id_by_name(rbd_dev, name);
38442ad3d716SAlex Elder 
38452ad3d716SAlex Elder 	return rbd_v2_snap_id_by_name(rbd_dev, name);
38462ad3d716SAlex Elder }
38472ad3d716SAlex Elder 
38489e15b77dSAlex Elder /*
38492e9f7f1cSAlex Elder  * When an rbd image has a parent image, it is identified by the
38502e9f7f1cSAlex Elder  * pool, image, and snapshot ids (not names).  This function fills
38512e9f7f1cSAlex Elder  * in the names for those ids.  (It's OK if we can't figure out the
38522e9f7f1cSAlex Elder  * name for an image id, but the pool and snapshot ids should always
38532e9f7f1cSAlex Elder  * exist and have names.)  All names in an rbd spec are dynamically
38542e9f7f1cSAlex Elder  * allocated.
3855e1d4213fSAlex Elder  *
3856e1d4213fSAlex Elder  * When an image being mapped (not a parent) is probed, we have the
3857e1d4213fSAlex Elder  * pool name and pool id, image name and image id, and the snapshot
3858e1d4213fSAlex Elder  * name.  The only thing we're missing is the snapshot id.
38599e15b77dSAlex Elder  */
38602e9f7f1cSAlex Elder static int rbd_dev_spec_update(struct rbd_device *rbd_dev)
38619e15b77dSAlex Elder {
38622e9f7f1cSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
38632e9f7f1cSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
38642e9f7f1cSAlex Elder 	const char *pool_name;
38652e9f7f1cSAlex Elder 	const char *image_name;
38662e9f7f1cSAlex Elder 	const char *snap_name;
38679e15b77dSAlex Elder 	int ret;
38689e15b77dSAlex Elder 
3869e1d4213fSAlex Elder 	/*
3870e1d4213fSAlex Elder 	 * An image being mapped will have the pool name (etc.), but
3871e1d4213fSAlex Elder 	 * we need to look up the snapshot id.
3872e1d4213fSAlex Elder 	 */
38732e9f7f1cSAlex Elder 	if (spec->pool_name) {
38742e9f7f1cSAlex Elder 		if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
38752ad3d716SAlex Elder 			u64 snap_id;
3876e1d4213fSAlex Elder 
38772ad3d716SAlex Elder 			snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
38782ad3d716SAlex Elder 			if (snap_id == CEPH_NOSNAP)
3879e1d4213fSAlex Elder 				return -ENOENT;
38802ad3d716SAlex Elder 			spec->snap_id = snap_id;
3881e1d4213fSAlex Elder 		} else {
38822e9f7f1cSAlex Elder 			spec->snap_id = CEPH_NOSNAP;
3883e1d4213fSAlex Elder 		}
3884e1d4213fSAlex Elder 
3885e1d4213fSAlex Elder 		return 0;
3886e1d4213fSAlex Elder 	}
38879e15b77dSAlex Elder 
38882e9f7f1cSAlex Elder 	/* Get the pool name; we have to make our own copy of this */
38899e15b77dSAlex Elder 
38902e9f7f1cSAlex Elder 	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
38912e9f7f1cSAlex Elder 	if (!pool_name) {
38922e9f7f1cSAlex Elder 		rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
3893935dc89fSAlex Elder 		return -EIO;
3894935dc89fSAlex Elder 	}
38952e9f7f1cSAlex Elder 	pool_name = kstrdup(pool_name, GFP_KERNEL);
38962e9f7f1cSAlex Elder 	if (!pool_name)
38979e15b77dSAlex Elder 		return -ENOMEM;
38989e15b77dSAlex Elder 
38999e15b77dSAlex Elder 	/* Fetch the image name; tolerate failure here */
39009e15b77dSAlex Elder 
39012e9f7f1cSAlex Elder 	image_name = rbd_dev_image_name(rbd_dev);
39022e9f7f1cSAlex Elder 	if (!image_name)
390306ecc6cbSAlex Elder 		rbd_warn(rbd_dev, "unable to get image name");
39049e15b77dSAlex Elder 
39052e9f7f1cSAlex Elder 	/* Look up the snapshot name, and make a copy */
39069e15b77dSAlex Elder 
39072e9f7f1cSAlex Elder 	snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
39082e9f7f1cSAlex Elder 	if (!snap_name) {
39092e9f7f1cSAlex Elder 		ret = -ENOMEM;
39109e15b77dSAlex Elder 		goto out_err;
39112e9f7f1cSAlex Elder 	}
39122e9f7f1cSAlex Elder 
39132e9f7f1cSAlex Elder 	spec->pool_name = pool_name;
39142e9f7f1cSAlex Elder 	spec->image_name = image_name;
39152e9f7f1cSAlex Elder 	spec->snap_name = snap_name;
39169e15b77dSAlex Elder 
39179e15b77dSAlex Elder 	return 0;
39189e15b77dSAlex Elder out_err:
39192e9f7f1cSAlex Elder 	kfree(image_name);
39202e9f7f1cSAlex Elder 	kfree(pool_name);
39219e15b77dSAlex Elder 
39229e15b77dSAlex Elder 	return ret;
39239e15b77dSAlex Elder }
39249e15b77dSAlex Elder 
3925cc4a38bdSAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
392635d489f9SAlex Elder {
392735d489f9SAlex Elder 	size_t size;
392835d489f9SAlex Elder 	int ret;
392935d489f9SAlex Elder 	void *reply_buf;
393035d489f9SAlex Elder 	void *p;
393135d489f9SAlex Elder 	void *end;
393235d489f9SAlex Elder 	u64 seq;
393335d489f9SAlex Elder 	u32 snap_count;
393435d489f9SAlex Elder 	struct ceph_snap_context *snapc;
393535d489f9SAlex Elder 	u32 i;
393635d489f9SAlex Elder 
393735d489f9SAlex Elder 	/*
393835d489f9SAlex Elder 	 * We'll need room for the seq value (maximum snapshot id),
393935d489f9SAlex Elder 	 * snapshot count, and array of that many snapshot ids.
394035d489f9SAlex Elder 	 * For now we have a fixed upper limit on the number we're
394135d489f9SAlex Elder 	 * prepared to receive.
394235d489f9SAlex Elder 	 */
394335d489f9SAlex Elder 	size = sizeof (__le64) + sizeof (__le32) +
394435d489f9SAlex Elder 			RBD_MAX_SNAP_COUNT * sizeof (__le64);
394535d489f9SAlex Elder 	reply_buf = kzalloc(size, GFP_KERNEL);
394635d489f9SAlex Elder 	if (!reply_buf)
394735d489f9SAlex Elder 		return -ENOMEM;
394835d489f9SAlex Elder 
394936be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
39504157976bSAlex Elder 				"rbd", "get_snapcontext", NULL, 0,
3951e2a58ee5SAlex Elder 				reply_buf, size);
395236be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
395335d489f9SAlex Elder 	if (ret < 0)
395435d489f9SAlex Elder 		goto out;
395535d489f9SAlex Elder 
395635d489f9SAlex Elder 	p = reply_buf;
395757385b51SAlex Elder 	end = reply_buf + ret;
395857385b51SAlex Elder 	ret = -ERANGE;
395935d489f9SAlex Elder 	ceph_decode_64_safe(&p, end, seq, out);
396035d489f9SAlex Elder 	ceph_decode_32_safe(&p, end, snap_count, out);
396135d489f9SAlex Elder 
396235d489f9SAlex Elder 	/*
396335d489f9SAlex Elder 	 * Make sure the reported number of snapshot ids wouldn't go
396435d489f9SAlex Elder 	 * beyond the end of our buffer.  But before checking that,
396535d489f9SAlex Elder 	 * make sure the computed size of the snapshot context we
396635d489f9SAlex Elder 	 * allocate is representable in a size_t.
396735d489f9SAlex Elder 	 */
396835d489f9SAlex Elder 	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
396935d489f9SAlex Elder 				 / sizeof (u64)) {
397035d489f9SAlex Elder 		ret = -EINVAL;
397135d489f9SAlex Elder 		goto out;
397235d489f9SAlex Elder 	}
397335d489f9SAlex Elder 	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
397435d489f9SAlex Elder 		goto out;
3975468521c1SAlex Elder 	ret = 0;
397635d489f9SAlex Elder 
3977812164f8SAlex Elder 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
397835d489f9SAlex Elder 	if (!snapc) {
397935d489f9SAlex Elder 		ret = -ENOMEM;
398035d489f9SAlex Elder 		goto out;
398135d489f9SAlex Elder 	}
398235d489f9SAlex Elder 	snapc->seq = seq;
398335d489f9SAlex Elder 	for (i = 0; i < snap_count; i++)
398435d489f9SAlex Elder 		snapc->snaps[i] = ceph_decode_64(&p);
398535d489f9SAlex Elder 
398649ece554SAlex Elder 	ceph_put_snap_context(rbd_dev->header.snapc);
398735d489f9SAlex Elder 	rbd_dev->header.snapc = snapc;
398835d489f9SAlex Elder 
398935d489f9SAlex Elder 	dout("  snap context seq = %llu, snap_count = %u\n",
399035d489f9SAlex Elder 		(unsigned long long)seq, (unsigned int)snap_count);
399135d489f9SAlex Elder out:
399235d489f9SAlex Elder 	kfree(reply_buf);
399335d489f9SAlex Elder 
399457385b51SAlex Elder 	return ret;
399535d489f9SAlex Elder }
399635d489f9SAlex Elder 
399754cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
399854cac61fSAlex Elder 					u64 snap_id)
3999b8b1e2dbSAlex Elder {
4000b8b1e2dbSAlex Elder 	size_t size;
4001b8b1e2dbSAlex Elder 	void *reply_buf;
400254cac61fSAlex Elder 	__le64 snapid;
4003b8b1e2dbSAlex Elder 	int ret;
4004b8b1e2dbSAlex Elder 	void *p;
4005b8b1e2dbSAlex Elder 	void *end;
4006b8b1e2dbSAlex Elder 	char *snap_name;
4007b8b1e2dbSAlex Elder 
4008b8b1e2dbSAlex Elder 	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
4009b8b1e2dbSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
4010b8b1e2dbSAlex Elder 	if (!reply_buf)
4011b8b1e2dbSAlex Elder 		return ERR_PTR(-ENOMEM);
4012b8b1e2dbSAlex Elder 
401354cac61fSAlex Elder 	snapid = cpu_to_le64(snap_id);
401436be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
4015b8b1e2dbSAlex Elder 				"rbd", "get_snapshot_name",
401654cac61fSAlex Elder 				&snapid, sizeof (snapid),
4017e2a58ee5SAlex Elder 				reply_buf, size);
401836be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4019f40eb349SAlex Elder 	if (ret < 0) {
4020f40eb349SAlex Elder 		snap_name = ERR_PTR(ret);
4021b8b1e2dbSAlex Elder 		goto out;
4022f40eb349SAlex Elder 	}
4023b8b1e2dbSAlex Elder 
4024b8b1e2dbSAlex Elder 	p = reply_buf;
4025f40eb349SAlex Elder 	end = reply_buf + ret;
4026e5c35534SAlex Elder 	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
4027f40eb349SAlex Elder 	if (IS_ERR(snap_name))
4028b8b1e2dbSAlex Elder 		goto out;
4029f40eb349SAlex Elder 
4030b8b1e2dbSAlex Elder 	dout("  snap_id 0x%016llx snap_name = %s\n",
403154cac61fSAlex Elder 		(unsigned long long)snap_id, snap_name);
4032b8b1e2dbSAlex Elder out:
4033b8b1e2dbSAlex Elder 	kfree(reply_buf);
4034b8b1e2dbSAlex Elder 
4035f40eb349SAlex Elder 	return snap_name;
4036b8b1e2dbSAlex Elder }
4037b8b1e2dbSAlex Elder 
4038cc4a38bdSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev)
4039117973fbSAlex Elder {
4040117973fbSAlex Elder 	int ret;
4041117973fbSAlex Elder 
4042117973fbSAlex Elder 	down_write(&rbd_dev->header_rwsem);
4043117973fbSAlex Elder 
4044117973fbSAlex Elder 	ret = rbd_dev_v2_image_size(rbd_dev);
4045117973fbSAlex Elder 	if (ret)
4046117973fbSAlex Elder 		goto out;
404729334ba4SAlex Elder 	if (rbd_dev->spec->snap_id == CEPH_NOSNAP)
404829334ba4SAlex Elder 		if (rbd_dev->mapping.size != rbd_dev->header.image_size)
404929334ba4SAlex Elder 			rbd_dev->mapping.size = rbd_dev->header.image_size;
4050117973fbSAlex Elder 
4051cc4a38bdSAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev);
4052117973fbSAlex Elder 	dout("rbd_dev_v2_snap_context returned %d\n", ret);
4053117973fbSAlex Elder 	if (ret)
4054117973fbSAlex Elder 		goto out;
4055117973fbSAlex Elder out:
4056117973fbSAlex Elder 	up_write(&rbd_dev->header_rwsem);
4057117973fbSAlex Elder 
4058117973fbSAlex Elder 	return ret;
4059117973fbSAlex Elder }
4060117973fbSAlex Elder 
4061dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
4062dfc5606dSYehuda Sadeh {
4063dfc5606dSYehuda Sadeh 	struct device *dev;
4064cd789ab9SAlex Elder 	int ret;
4065dfc5606dSYehuda Sadeh 
4066dfc5606dSYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
4067dfc5606dSYehuda Sadeh 
4068cd789ab9SAlex Elder 	dev = &rbd_dev->dev;
4069dfc5606dSYehuda Sadeh 	dev->bus = &rbd_bus_type;
4070dfc5606dSYehuda Sadeh 	dev->type = &rbd_device_type;
4071dfc5606dSYehuda Sadeh 	dev->parent = &rbd_root_dev;
4072200a6a8bSAlex Elder 	dev->release = rbd_dev_device_release;
4073de71a297SAlex Elder 	dev_set_name(dev, "%d", rbd_dev->dev_id);
4074dfc5606dSYehuda Sadeh 	ret = device_register(dev);
4075dfc5606dSYehuda Sadeh 
4076dfc5606dSYehuda Sadeh 	mutex_unlock(&ctl_mutex);
4077cd789ab9SAlex Elder 
4078dfc5606dSYehuda Sadeh 	return ret;
4079602adf40SYehuda Sadeh }
4080602adf40SYehuda Sadeh 
4081dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
4082dfc5606dSYehuda Sadeh {
4083dfc5606dSYehuda Sadeh 	device_unregister(&rbd_dev->dev);
4084dfc5606dSYehuda Sadeh }
4085dfc5606dSYehuda Sadeh 
4086e2839308SAlex Elder static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
40871ddbe94eSAlex Elder 
40881ddbe94eSAlex Elder /*
4089499afd5bSAlex Elder  * Get a unique rbd identifier for the given new rbd_dev, and add
4090499afd5bSAlex Elder  * the rbd_dev to the global list.  The minimum rbd id is 1.
40911ddbe94eSAlex Elder  */
4092e2839308SAlex Elder static void rbd_dev_id_get(struct rbd_device *rbd_dev)
4093b7f23c36SAlex Elder {
4094e2839308SAlex Elder 	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
4095499afd5bSAlex Elder 
4096499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
4097499afd5bSAlex Elder 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
4098499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
4099e2839308SAlex Elder 	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
4100e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
4101b7f23c36SAlex Elder }
4102b7f23c36SAlex Elder 
41031ddbe94eSAlex Elder /*
4104499afd5bSAlex Elder  * Remove an rbd_dev from the global list, and record that its
4105499afd5bSAlex Elder  * identifier is no longer in use.
41061ddbe94eSAlex Elder  */
4107e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev)
41081ddbe94eSAlex Elder {
4109d184f6bfSAlex Elder 	struct list_head *tmp;
4110de71a297SAlex Elder 	int rbd_id = rbd_dev->dev_id;
4111d184f6bfSAlex Elder 	int max_id;
4112d184f6bfSAlex Elder 
4113aafb230eSAlex Elder 	rbd_assert(rbd_id > 0);
4114499afd5bSAlex Elder 
4115e2839308SAlex Elder 	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
4116e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
4117499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
4118499afd5bSAlex Elder 	list_del_init(&rbd_dev->node);
4119d184f6bfSAlex Elder 
4120d184f6bfSAlex Elder 	/*
4121d184f6bfSAlex Elder 	 * If the id being "put" is not the current maximum, there
4122d184f6bfSAlex Elder 	 * is nothing special we need to do.
4123d184f6bfSAlex Elder 	 */
4124e2839308SAlex Elder 	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
4125d184f6bfSAlex Elder 		spin_unlock(&rbd_dev_list_lock);
4126d184f6bfSAlex Elder 		return;
4127d184f6bfSAlex Elder 	}
4128d184f6bfSAlex Elder 
4129d184f6bfSAlex Elder 	/*
4130d184f6bfSAlex Elder 	 * We need to update the current maximum id.  Search the
4131d184f6bfSAlex Elder 	 * list to find out what it is.  We're more likely to find
4132d184f6bfSAlex Elder 	 * the maximum at the end, so search the list backward.
4133d184f6bfSAlex Elder 	 */
4134d184f6bfSAlex Elder 	max_id = 0;
4135d184f6bfSAlex Elder 	list_for_each_prev(tmp, &rbd_dev_list) {
4136d184f6bfSAlex Elder 		struct rbd_device *rbd_dev;
4137d184f6bfSAlex Elder 
4138d184f6bfSAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
4139b213e0b1SAlex Elder 		if (rbd_dev->dev_id > max_id)
4140b213e0b1SAlex Elder 			max_id = rbd_dev->dev_id;
4141d184f6bfSAlex Elder 	}
4142499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
41431ddbe94eSAlex Elder 
41441ddbe94eSAlex Elder 	/*
4145e2839308SAlex Elder 	 * The max id could have been updated by rbd_dev_id_get(), in
4146d184f6bfSAlex Elder 	 * which case it now accurately reflects the new maximum.
4147d184f6bfSAlex Elder 	 * Be careful not to overwrite the maximum value in that
4148d184f6bfSAlex Elder 	 * case.
41491ddbe94eSAlex Elder 	 */
4150e2839308SAlex Elder 	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
4151e2839308SAlex Elder 	dout("  max dev id has been reset\n");
4152b7f23c36SAlex Elder }
4153b7f23c36SAlex Elder 
/*
 * Advance *buf past any leading white space so that it points at
 * the first non-space character (or at the terminating '\0' if there
 * is none), and return the length of the token found there, i.e.
 * the run of non-white-space characters.  The string at *buf must be
 * terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the "C"
	 * and "POSIX" locales.
	 */
	static const char whitespace[] = " \f\n\r\t\v";
	const char *start = *buf + strspn(*buf, whitespace);

	*buf = start;		/* Start of token */

	return strcspn(start, whitespace);	/* Token length */
}
4172e28fff26SAlex Elder 
/*
 * Locate the next token in *buf and, provided it fits, copy it into
 * the supplied token buffer with a terminating '\0'.  The string at
 * *buf must be terminated with '\0' on entry.
 *
 * Returns the length of the token (not counting the '\0').  The
 * result is 0 if there is no token, and is >= token_size if the
 * token did not fit, in which case the token buffer is untouched.
 *
 * *buf is left pointing just past the token whether or not the
 * token was copied.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t tok_len = next_token(buf);
	const char *start = *buf;

	*buf = start + tok_len;
	if (tok_len >= token_size)
		return tok_len;		/* No room for token plus '\0' */

	memcpy(token, start, tok_len);
	token[tok_len] = '\0';

	return tok_len;
}
4202e28fff26SAlex Elder 
4203e28fff26SAlex Elder /*
4204ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
4205ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
4206ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
4207ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
4208ea3352f4SAlex Elder  *
4209ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
4210ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
4211ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
4212ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
4213ea3352f4SAlex Elder  *
4214ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
4215ea3352f4SAlex Elder  * the end of the found token.
4216ea3352f4SAlex Elder  *
4217ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
4218ea3352f4SAlex Elder  */
4219ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
4220ea3352f4SAlex Elder {
4221ea3352f4SAlex Elder 	char *dup;
4222ea3352f4SAlex Elder 	size_t len;
4223ea3352f4SAlex Elder 
4224ea3352f4SAlex Elder 	len = next_token(buf);
42254caf35f9SAlex Elder 	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
4226ea3352f4SAlex Elder 	if (!dup)
4227ea3352f4SAlex Elder 		return NULL;
4228ea3352f4SAlex Elder 	*(dup + len) = '\0';
4229ea3352f4SAlex Elder 	*buf += len;
4230ea3352f4SAlex Elder 
4231ea3352f4SAlex Elder 	if (lenp)
4232ea3352f4SAlex Elder 		*lenp = len;
4233ea3352f4SAlex Elder 
4234ea3352f4SAlex Elder 	return dup;
4235ea3352f4SAlex Elder }
4236ea3352f4SAlex Elder 
4237ea3352f4SAlex Elder /*
4238859c31dfSAlex Elder  * Parse the options provided for an "rbd add" (i.e., rbd image
4239859c31dfSAlex Elder  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
4240859c31dfSAlex Elder  * and the data written is passed here via a NUL-terminated buffer.
4241859c31dfSAlex Elder  * Returns 0 if successful or an error code otherwise.
4242d22f76e7SAlex Elder  *
4243859c31dfSAlex Elder  * The information extracted from these options is recorded in
4244859c31dfSAlex Elder  * the other parameters which return dynamically-allocated
4245859c31dfSAlex Elder  * structures:
4246859c31dfSAlex Elder  *  ceph_opts
4247859c31dfSAlex Elder  *      The address of a pointer that will refer to a ceph options
4248859c31dfSAlex Elder  *      structure.  Caller must release the returned pointer using
4249859c31dfSAlex Elder  *      ceph_destroy_options() when it is no longer needed.
4250859c31dfSAlex Elder  *  rbd_opts
4251859c31dfSAlex Elder  *	Address of an rbd options pointer.  Fully initialized by
4252859c31dfSAlex Elder  *	this function; caller must release with kfree().
4253859c31dfSAlex Elder  *  spec
4254859c31dfSAlex Elder  *	Address of an rbd image specification pointer.  Fully
4255859c31dfSAlex Elder  *	initialized by this function based on parsed options.
4256859c31dfSAlex Elder  *	Caller must release with rbd_spec_put().
4257859c31dfSAlex Elder  *
4258859c31dfSAlex Elder  * The options passed take this form:
4259859c31dfSAlex Elder  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
4260859c31dfSAlex Elder  * where:
4261859c31dfSAlex Elder  *  <mon_addrs>
4262859c31dfSAlex Elder  *      A comma-separated list of one or more monitor addresses.
4263859c31dfSAlex Elder  *      A monitor address is an ip address, optionally followed
4264859c31dfSAlex Elder  *      by a port number (separated by a colon).
4265859c31dfSAlex Elder  *        I.e.:  ip1[:port1][,ip2[:port2]...]
4266859c31dfSAlex Elder  *  <options>
4267859c31dfSAlex Elder  *      A comma-separated list of ceph and/or rbd options.
4268859c31dfSAlex Elder  *  <pool_name>
4269859c31dfSAlex Elder  *      The name of the rados pool containing the rbd image.
4270859c31dfSAlex Elder  *  <image_name>
4271859c31dfSAlex Elder  *      The name of the image in that pool to map.
4272859c31dfSAlex Elder  *  <snap_id>
4273859c31dfSAlex Elder  *      An optional snapshot id.  If provided, the mapping will
4274859c31dfSAlex Elder  *      present data from the image at the time that snapshot was
4275859c31dfSAlex Elder  *      created.  The image head is used if no snapshot id is
4276859c31dfSAlex Elder  *      provided.  Snapshot mappings are always read-only.
4277a725f65eSAlex Elder  */
4278859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf,
4279dc79b113SAlex Elder 				struct ceph_options **ceph_opts,
4280859c31dfSAlex Elder 				struct rbd_options **opts,
4281859c31dfSAlex Elder 				struct rbd_spec **rbd_spec)
4282a725f65eSAlex Elder {
4283e28fff26SAlex Elder 	size_t len;
4284859c31dfSAlex Elder 	char *options;
42850ddebc0cSAlex Elder 	const char *mon_addrs;
4286ecb4dc22SAlex Elder 	char *snap_name;
42870ddebc0cSAlex Elder 	size_t mon_addrs_size;
4288859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
42894e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
4290859c31dfSAlex Elder 	struct ceph_options *copts;
4291dc79b113SAlex Elder 	int ret;
4292e28fff26SAlex Elder 
4293e28fff26SAlex Elder 	/* The first four tokens are required */
4294e28fff26SAlex Elder 
42957ef3214aSAlex Elder 	len = next_token(&buf);
42964fb5d671SAlex Elder 	if (!len) {
42974fb5d671SAlex Elder 		rbd_warn(NULL, "no monitor address(es) provided");
42984fb5d671SAlex Elder 		return -EINVAL;
42994fb5d671SAlex Elder 	}
43000ddebc0cSAlex Elder 	mon_addrs = buf;
4301f28e565aSAlex Elder 	mon_addrs_size = len + 1;
43027ef3214aSAlex Elder 	buf += len;
4303a725f65eSAlex Elder 
4304dc79b113SAlex Elder 	ret = -EINVAL;
4305f28e565aSAlex Elder 	options = dup_token(&buf, NULL);
4306f28e565aSAlex Elder 	if (!options)
4307dc79b113SAlex Elder 		return -ENOMEM;
43084fb5d671SAlex Elder 	if (!*options) {
43094fb5d671SAlex Elder 		rbd_warn(NULL, "no options provided");
43104fb5d671SAlex Elder 		goto out_err;
43114fb5d671SAlex Elder 	}
4312a725f65eSAlex Elder 
4313859c31dfSAlex Elder 	spec = rbd_spec_alloc();
4314859c31dfSAlex Elder 	if (!spec)
4315f28e565aSAlex Elder 		goto out_mem;
4316859c31dfSAlex Elder 
4317859c31dfSAlex Elder 	spec->pool_name = dup_token(&buf, NULL);
4318859c31dfSAlex Elder 	if (!spec->pool_name)
4319859c31dfSAlex Elder 		goto out_mem;
43204fb5d671SAlex Elder 	if (!*spec->pool_name) {
43214fb5d671SAlex Elder 		rbd_warn(NULL, "no pool name provided");
43224fb5d671SAlex Elder 		goto out_err;
43234fb5d671SAlex Elder 	}
4324e28fff26SAlex Elder 
432569e7a02fSAlex Elder 	spec->image_name = dup_token(&buf, NULL);
4326859c31dfSAlex Elder 	if (!spec->image_name)
4327f28e565aSAlex Elder 		goto out_mem;
43284fb5d671SAlex Elder 	if (!*spec->image_name) {
43294fb5d671SAlex Elder 		rbd_warn(NULL, "no image name provided");
43304fb5d671SAlex Elder 		goto out_err;
43314fb5d671SAlex Elder 	}
4332e28fff26SAlex Elder 
4333f28e565aSAlex Elder 	/*
4334f28e565aSAlex Elder 	 * Snapshot name is optional; default is to use "-"
4335f28e565aSAlex Elder 	 * (indicating the head/no snapshot).
4336f28e565aSAlex Elder 	 */
43373feeb894SAlex Elder 	len = next_token(&buf);
4338820a5f3eSAlex Elder 	if (!len) {
43393feeb894SAlex Elder 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
43403feeb894SAlex Elder 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
4341f28e565aSAlex Elder 	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
4342dc79b113SAlex Elder 		ret = -ENAMETOOLONG;
4343f28e565aSAlex Elder 		goto out_err;
4344849b4260SAlex Elder 	}
4345ecb4dc22SAlex Elder 	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
4346ecb4dc22SAlex Elder 	if (!snap_name)
4347f28e565aSAlex Elder 		goto out_mem;
4348ecb4dc22SAlex Elder 	*(snap_name + len) = '\0';
4349ecb4dc22SAlex Elder 	spec->snap_name = snap_name;
4350e5c35534SAlex Elder 
43510ddebc0cSAlex Elder 	/* Initialize all rbd options to the defaults */
4352e28fff26SAlex Elder 
43534e9afebaSAlex Elder 	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
43544e9afebaSAlex Elder 	if (!rbd_opts)
43554e9afebaSAlex Elder 		goto out_mem;
43564e9afebaSAlex Elder 
43574e9afebaSAlex Elder 	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
4358d22f76e7SAlex Elder 
4359859c31dfSAlex Elder 	copts = ceph_parse_options(options, mon_addrs,
43600ddebc0cSAlex Elder 					mon_addrs + mon_addrs_size - 1,
43614e9afebaSAlex Elder 					parse_rbd_opts_token, rbd_opts);
4362859c31dfSAlex Elder 	if (IS_ERR(copts)) {
4363859c31dfSAlex Elder 		ret = PTR_ERR(copts);
4364dc79b113SAlex Elder 		goto out_err;
4365dc79b113SAlex Elder 	}
4366859c31dfSAlex Elder 	kfree(options);
4367859c31dfSAlex Elder 
4368859c31dfSAlex Elder 	*ceph_opts = copts;
43694e9afebaSAlex Elder 	*opts = rbd_opts;
4370859c31dfSAlex Elder 	*rbd_spec = spec;
43710ddebc0cSAlex Elder 
4372dc79b113SAlex Elder 	return 0;
4373f28e565aSAlex Elder out_mem:
4374dc79b113SAlex Elder 	ret = -ENOMEM;
4375d22f76e7SAlex Elder out_err:
4376859c31dfSAlex Elder 	kfree(rbd_opts);
4377859c31dfSAlex Elder 	rbd_spec_put(spec);
4378f28e565aSAlex Elder 	kfree(options);
4379d22f76e7SAlex Elder 
4380dc79b113SAlex Elder 	return ret;
4381a725f65eSAlex Elder }
4382a725f65eSAlex Elder 
4383589d30e0SAlex Elder /*
4384589d30e0SAlex Elder  * An rbd format 2 image has a unique identifier, distinct from the
4385589d30e0SAlex Elder  * name given to it by the user.  Internally, that identifier is
4386589d30e0SAlex Elder  * what's used to specify the names of objects related to the image.
4387589d30e0SAlex Elder  *
4388589d30e0SAlex Elder  * A special "rbd id" object is used to map an rbd image name to its
4389589d30e0SAlex Elder  * id.  If that object doesn't exist, then there is no v2 rbd image
4390589d30e0SAlex Elder  * with the supplied name.
4391589d30e0SAlex Elder  *
4392589d30e0SAlex Elder  * This function will record the given rbd_dev's image_id field if
4393589d30e0SAlex Elder  * it can be determined, and in that case will return 0.  If any
4394589d30e0SAlex Elder  * errors occur a negative errno will be returned and the rbd_dev's
4395589d30e0SAlex Elder  * image_id field will be unchanged (and should be NULL).
4396589d30e0SAlex Elder  */
4397589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev)
4398589d30e0SAlex Elder {
4399589d30e0SAlex Elder 	int ret;
4400589d30e0SAlex Elder 	size_t size;
4401589d30e0SAlex Elder 	char *object_name;
4402589d30e0SAlex Elder 	void *response;
4403c0fba368SAlex Elder 	char *image_id;
44042f82ee54SAlex Elder 
4405589d30e0SAlex Elder 	/*
44062c0d0a10SAlex Elder 	 * When probing a parent image, the image id is already
44072c0d0a10SAlex Elder 	 * known (and the image name likely is not).  There's no
4408c0fba368SAlex Elder 	 * need to fetch the image id again in this case.  We
4409c0fba368SAlex Elder 	 * do still need to set the image format though.
44102c0d0a10SAlex Elder 	 */
4411c0fba368SAlex Elder 	if (rbd_dev->spec->image_id) {
4412c0fba368SAlex Elder 		rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
4413c0fba368SAlex Elder 
44142c0d0a10SAlex Elder 		return 0;
4415c0fba368SAlex Elder 	}
44162c0d0a10SAlex Elder 
44172c0d0a10SAlex Elder 	/*
4418589d30e0SAlex Elder 	 * First, see if the format 2 image id file exists, and if
4419589d30e0SAlex Elder 	 * so, get the image's persistent id from it.
4420589d30e0SAlex Elder 	 */
442169e7a02fSAlex Elder 	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
4422589d30e0SAlex Elder 	object_name = kmalloc(size, GFP_NOIO);
4423589d30e0SAlex Elder 	if (!object_name)
4424589d30e0SAlex Elder 		return -ENOMEM;
44250d7dbfceSAlex Elder 	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
4426589d30e0SAlex Elder 	dout("rbd id object name is %s\n", object_name);
4427589d30e0SAlex Elder 
4428589d30e0SAlex Elder 	/* Response will be an encoded string, which includes a length */
4429589d30e0SAlex Elder 
4430589d30e0SAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
4431589d30e0SAlex Elder 	response = kzalloc(size, GFP_NOIO);
4432589d30e0SAlex Elder 	if (!response) {
4433589d30e0SAlex Elder 		ret = -ENOMEM;
4434589d30e0SAlex Elder 		goto out;
4435589d30e0SAlex Elder 	}
4436589d30e0SAlex Elder 
4437c0fba368SAlex Elder 	/* If it doesn't exist we'll assume it's a format 1 image */
4438c0fba368SAlex Elder 
443936be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, object_name,
44404157976bSAlex Elder 				"rbd", "get_id", NULL, 0,
4441e2a58ee5SAlex Elder 				response, RBD_IMAGE_ID_LEN_MAX);
444236be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4443c0fba368SAlex Elder 	if (ret == -ENOENT) {
4444c0fba368SAlex Elder 		image_id = kstrdup("", GFP_KERNEL);
4445c0fba368SAlex Elder 		ret = image_id ? 0 : -ENOMEM;
4446c0fba368SAlex Elder 		if (!ret)
4447c0fba368SAlex Elder 			rbd_dev->image_format = 1;
4448c0fba368SAlex Elder 	} else if (ret > sizeof (__le32)) {
4449c0fba368SAlex Elder 		void *p = response;
4450589d30e0SAlex Elder 
4451c0fba368SAlex Elder 		image_id = ceph_extract_encoded_string(&p, p + ret,
4452979ed480SAlex Elder 						NULL, GFP_NOIO);
4453c0fba368SAlex Elder 		ret = IS_ERR(image_id) ? PTR_ERR(image_id) : 0;
4454c0fba368SAlex Elder 		if (!ret)
4455c0fba368SAlex Elder 			rbd_dev->image_format = 2;
4456589d30e0SAlex Elder 	} else {
4457c0fba368SAlex Elder 		ret = -EINVAL;
4458c0fba368SAlex Elder 	}
4459c0fba368SAlex Elder 
4460c0fba368SAlex Elder 	if (!ret) {
4461c0fba368SAlex Elder 		rbd_dev->spec->image_id = image_id;
4462c0fba368SAlex Elder 		dout("image_id is %s\n", image_id);
4463589d30e0SAlex Elder 	}
4464589d30e0SAlex Elder out:
4465589d30e0SAlex Elder 	kfree(response);
4466589d30e0SAlex Elder 	kfree(object_name);
4467589d30e0SAlex Elder 
4468589d30e0SAlex Elder 	return ret;
4469589d30e0SAlex Elder }
4470589d30e0SAlex Elder 
44716fd48b3bSAlex Elder /* Undo whatever state changes are made by v1 or v2 image probe */
44726fd48b3bSAlex Elder 
44736fd48b3bSAlex Elder static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
44746fd48b3bSAlex Elder {
44756fd48b3bSAlex Elder 	struct rbd_image_header	*header;
44766fd48b3bSAlex Elder 
44776fd48b3bSAlex Elder 	rbd_dev_remove_parent(rbd_dev);
44786fd48b3bSAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
44796fd48b3bSAlex Elder 	rbd_dev->parent_spec = NULL;
44806fd48b3bSAlex Elder 	rbd_dev->parent_overlap = 0;
44816fd48b3bSAlex Elder 
44826fd48b3bSAlex Elder 	/* Free dynamic fields from the header, then zero it out */
44836fd48b3bSAlex Elder 
44846fd48b3bSAlex Elder 	header = &rbd_dev->header;
4485812164f8SAlex Elder 	ceph_put_snap_context(header->snapc);
44866fd48b3bSAlex Elder 	kfree(header->snap_sizes);
44876fd48b3bSAlex Elder 	kfree(header->snap_names);
44886fd48b3bSAlex Elder 	kfree(header->object_prefix);
44896fd48b3bSAlex Elder 	memset(header, 0, sizeof (*header));
44906fd48b3bSAlex Elder }
44916fd48b3bSAlex Elder 
4492a30b71b9SAlex Elder static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
4493a30b71b9SAlex Elder {
4494a30b71b9SAlex Elder 	int ret;
4495a30b71b9SAlex Elder 
4496a30b71b9SAlex Elder 	/* Populate rbd image metadata */
4497a30b71b9SAlex Elder 
4498a30b71b9SAlex Elder 	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
4499a30b71b9SAlex Elder 	if (ret < 0)
4500a30b71b9SAlex Elder 		goto out_err;
450186b00e0dSAlex Elder 
450286b00e0dSAlex Elder 	/* Version 1 images have no parent (no layering) */
450386b00e0dSAlex Elder 
450486b00e0dSAlex Elder 	rbd_dev->parent_spec = NULL;
450586b00e0dSAlex Elder 	rbd_dev->parent_overlap = 0;
450686b00e0dSAlex Elder 
4507a30b71b9SAlex Elder 	dout("discovered version 1 image, header name is %s\n",
4508a30b71b9SAlex Elder 		rbd_dev->header_name);
4509a30b71b9SAlex Elder 
4510a30b71b9SAlex Elder 	return 0;
4511a30b71b9SAlex Elder 
4512a30b71b9SAlex Elder out_err:
4513a30b71b9SAlex Elder 	kfree(rbd_dev->header_name);
4514a30b71b9SAlex Elder 	rbd_dev->header_name = NULL;
45150d7dbfceSAlex Elder 	kfree(rbd_dev->spec->image_id);
45160d7dbfceSAlex Elder 	rbd_dev->spec->image_id = NULL;
4517a30b71b9SAlex Elder 
4518a30b71b9SAlex Elder 	return ret;
4519a30b71b9SAlex Elder }
4520a30b71b9SAlex Elder 
4521a30b71b9SAlex Elder static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
4522a30b71b9SAlex Elder {
45239d475de5SAlex Elder 	int ret;
4524a30b71b9SAlex Elder 
45259d475de5SAlex Elder 	ret = rbd_dev_v2_image_size(rbd_dev);
452657385b51SAlex Elder 	if (ret)
45279d475de5SAlex Elder 		goto out_err;
45281e130199SAlex Elder 
45291e130199SAlex Elder 	/* Get the object prefix (a.k.a. block_name) for the image */
45301e130199SAlex Elder 
45311e130199SAlex Elder 	ret = rbd_dev_v2_object_prefix(rbd_dev);
453257385b51SAlex Elder 	if (ret)
45331e130199SAlex Elder 		goto out_err;
4534b1b5402aSAlex Elder 
4535d889140cSAlex Elder 	/* Get the and check features for the image */
4536b1b5402aSAlex Elder 
4537b1b5402aSAlex Elder 	ret = rbd_dev_v2_features(rbd_dev);
453857385b51SAlex Elder 	if (ret)
4539b1b5402aSAlex Elder 		goto out_err;
454035d489f9SAlex Elder 
454186b00e0dSAlex Elder 	/* If the image supports layering, get the parent info */
454286b00e0dSAlex Elder 
454386b00e0dSAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
454486b00e0dSAlex Elder 		ret = rbd_dev_v2_parent_info(rbd_dev);
454557385b51SAlex Elder 		if (ret)
454686b00e0dSAlex Elder 			goto out_err;
454796882f55SAlex Elder 		/*
4548c734b796SAlex Elder 		 * Print a warning if this image has a parent.
4549c734b796SAlex Elder 		 * Don't print it if the image now being probed
4550c734b796SAlex Elder 		 * is itself a parent.  We can tell at this point
4551c734b796SAlex Elder 		 * because we won't know its pool name yet (just its
4552c734b796SAlex Elder 		 * pool id).
455396882f55SAlex Elder 		 */
4554c734b796SAlex Elder 		if (rbd_dev->parent_spec && rbd_dev->spec->pool_name)
455596882f55SAlex Elder 			rbd_warn(rbd_dev, "WARNING: kernel layering "
455696882f55SAlex Elder 					"is EXPERIMENTAL!");
455786b00e0dSAlex Elder 	}
455886b00e0dSAlex Elder 
4559cc070d59SAlex Elder 	/* If the image supports fancy striping, get its parameters */
4560cc070d59SAlex Elder 
4561cc070d59SAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
4562cc070d59SAlex Elder 		ret = rbd_dev_v2_striping_info(rbd_dev);
4563cc070d59SAlex Elder 		if (ret < 0)
4564cc070d59SAlex Elder 			goto out_err;
4565cc070d59SAlex Elder 	}
4566cc070d59SAlex Elder 
45676e14b1a6SAlex Elder 	/* crypto and compression type aren't (yet) supported for v2 images */
456835d489f9SAlex Elder 
45696e14b1a6SAlex Elder 	rbd_dev->header.crypt_type = 0;
45706e14b1a6SAlex Elder 	rbd_dev->header.comp_type = 0;
45716e14b1a6SAlex Elder 
45726e14b1a6SAlex Elder 	/* Get the snapshot context, plus the header version */
45736e14b1a6SAlex Elder 
4574cc4a38bdSAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev);
457535d489f9SAlex Elder 	if (ret)
457635d489f9SAlex Elder 		goto out_err;
45776e14b1a6SAlex Elder 
4578a30b71b9SAlex Elder 	dout("discovered version 2 image, header name is %s\n",
4579a30b71b9SAlex Elder 		rbd_dev->header_name);
4580a30b71b9SAlex Elder 
458135152979SAlex Elder 	return 0;
45829d475de5SAlex Elder out_err:
458386b00e0dSAlex Elder 	rbd_dev->parent_overlap = 0;
458486b00e0dSAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
458586b00e0dSAlex Elder 	rbd_dev->parent_spec = NULL;
45869d475de5SAlex Elder 	kfree(rbd_dev->header_name);
45879d475de5SAlex Elder 	rbd_dev->header_name = NULL;
45881e130199SAlex Elder 	kfree(rbd_dev->header.object_prefix);
45891e130199SAlex Elder 	rbd_dev->header.object_prefix = NULL;
45909d475de5SAlex Elder 
45919d475de5SAlex Elder 	return ret;
4592a30b71b9SAlex Elder }
4593a30b71b9SAlex Elder 
4594124afba2SAlex Elder static int rbd_dev_probe_parent(struct rbd_device *rbd_dev)
459583a06263SAlex Elder {
45962f82ee54SAlex Elder 	struct rbd_device *parent = NULL;
4597124afba2SAlex Elder 	struct rbd_spec *parent_spec;
4598124afba2SAlex Elder 	struct rbd_client *rbdc;
4599124afba2SAlex Elder 	int ret;
4600124afba2SAlex Elder 
4601124afba2SAlex Elder 	if (!rbd_dev->parent_spec)
4602124afba2SAlex Elder 		return 0;
4603124afba2SAlex Elder 	/*
4604124afba2SAlex Elder 	 * We need to pass a reference to the client and the parent
4605124afba2SAlex Elder 	 * spec when creating the parent rbd_dev.  Images related by
4606124afba2SAlex Elder 	 * parent/child relationships always share both.
4607124afba2SAlex Elder 	 */
4608124afba2SAlex Elder 	parent_spec = rbd_spec_get(rbd_dev->parent_spec);
4609124afba2SAlex Elder 	rbdc = __rbd_get_client(rbd_dev->rbd_client);
4610124afba2SAlex Elder 
4611124afba2SAlex Elder 	ret = -ENOMEM;
4612124afba2SAlex Elder 	parent = rbd_dev_create(rbdc, parent_spec);
4613124afba2SAlex Elder 	if (!parent)
4614124afba2SAlex Elder 		goto out_err;
4615124afba2SAlex Elder 
461651344a38SAlex Elder 	ret = rbd_dev_image_probe(parent, true);
4617124afba2SAlex Elder 	if (ret < 0)
4618124afba2SAlex Elder 		goto out_err;
4619124afba2SAlex Elder 	rbd_dev->parent = parent;
4620124afba2SAlex Elder 
4621124afba2SAlex Elder 	return 0;
4622124afba2SAlex Elder out_err:
4623124afba2SAlex Elder 	if (parent) {
4624124afba2SAlex Elder 		rbd_spec_put(rbd_dev->parent_spec);
4625124afba2SAlex Elder 		kfree(rbd_dev->header_name);
4626124afba2SAlex Elder 		rbd_dev_destroy(parent);
4627124afba2SAlex Elder 	} else {
4628124afba2SAlex Elder 		rbd_put_client(rbdc);
4629124afba2SAlex Elder 		rbd_spec_put(parent_spec);
4630124afba2SAlex Elder 	}
4631124afba2SAlex Elder 
4632124afba2SAlex Elder 	return ret;
4633124afba2SAlex Elder }
4634124afba2SAlex Elder 
4635200a6a8bSAlex Elder static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
4636124afba2SAlex Elder {
463783a06263SAlex Elder 	int ret;
463883a06263SAlex Elder 
463983a06263SAlex Elder 	/* generate unique id: find highest unique id, add one */
464083a06263SAlex Elder 	rbd_dev_id_get(rbd_dev);
464183a06263SAlex Elder 
464283a06263SAlex Elder 	/* Fill in the device name, now that we have its id. */
464383a06263SAlex Elder 	BUILD_BUG_ON(DEV_NAME_LEN
464483a06263SAlex Elder 			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
464583a06263SAlex Elder 	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
464683a06263SAlex Elder 
464783a06263SAlex Elder 	/* Get our block major device number. */
464883a06263SAlex Elder 
464983a06263SAlex Elder 	ret = register_blkdev(0, rbd_dev->name);
465083a06263SAlex Elder 	if (ret < 0)
465183a06263SAlex Elder 		goto err_out_id;
465283a06263SAlex Elder 	rbd_dev->major = ret;
465383a06263SAlex Elder 
465483a06263SAlex Elder 	/* Set up the blkdev mapping. */
465583a06263SAlex Elder 
465683a06263SAlex Elder 	ret = rbd_init_disk(rbd_dev);
465783a06263SAlex Elder 	if (ret)
465883a06263SAlex Elder 		goto err_out_blkdev;
465983a06263SAlex Elder 
4660f35a4deeSAlex Elder 	ret = rbd_dev_mapping_set(rbd_dev);
466183a06263SAlex Elder 	if (ret)
466283a06263SAlex Elder 		goto err_out_disk;
4663f35a4deeSAlex Elder 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
4664f35a4deeSAlex Elder 
4665f35a4deeSAlex Elder 	ret = rbd_bus_add_dev(rbd_dev);
4666f35a4deeSAlex Elder 	if (ret)
4667f35a4deeSAlex Elder 		goto err_out_mapping;
466883a06263SAlex Elder 
466983a06263SAlex Elder 	/* Everything's ready.  Announce the disk to the world. */
467083a06263SAlex Elder 
4671129b79d4SAlex Elder 	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
467283a06263SAlex Elder 	add_disk(rbd_dev->disk);
467383a06263SAlex Elder 
467483a06263SAlex Elder 	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
467583a06263SAlex Elder 		(unsigned long long) rbd_dev->mapping.size);
467683a06263SAlex Elder 
467783a06263SAlex Elder 	return ret;
46782f82ee54SAlex Elder 
4679f35a4deeSAlex Elder err_out_mapping:
4680f35a4deeSAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
468183a06263SAlex Elder err_out_disk:
468283a06263SAlex Elder 	rbd_free_disk(rbd_dev);
468383a06263SAlex Elder err_out_blkdev:
468483a06263SAlex Elder 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
468583a06263SAlex Elder err_out_id:
468683a06263SAlex Elder 	rbd_dev_id_put(rbd_dev);
4687d1cf5788SAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
468883a06263SAlex Elder 
468983a06263SAlex Elder 	return ret;
469083a06263SAlex Elder }
469183a06263SAlex Elder 
4692332bb12dSAlex Elder static int rbd_dev_header_name(struct rbd_device *rbd_dev)
4693332bb12dSAlex Elder {
4694332bb12dSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
4695332bb12dSAlex Elder 	size_t size;
4696332bb12dSAlex Elder 
4697332bb12dSAlex Elder 	/* Record the header object name for this rbd image. */
4698332bb12dSAlex Elder 
4699332bb12dSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
4700332bb12dSAlex Elder 
4701332bb12dSAlex Elder 	if (rbd_dev->image_format == 1)
4702332bb12dSAlex Elder 		size = strlen(spec->image_name) + sizeof (RBD_SUFFIX);
4703332bb12dSAlex Elder 	else
4704332bb12dSAlex Elder 		size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id);
4705332bb12dSAlex Elder 
4706332bb12dSAlex Elder 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
4707332bb12dSAlex Elder 	if (!rbd_dev->header_name)
4708332bb12dSAlex Elder 		return -ENOMEM;
4709332bb12dSAlex Elder 
4710332bb12dSAlex Elder 	if (rbd_dev->image_format == 1)
4711332bb12dSAlex Elder 		sprintf(rbd_dev->header_name, "%s%s",
4712332bb12dSAlex Elder 			spec->image_name, RBD_SUFFIX);
4713332bb12dSAlex Elder 	else
4714332bb12dSAlex Elder 		sprintf(rbd_dev->header_name, "%s%s",
4715332bb12dSAlex Elder 			RBD_HEADER_PREFIX, spec->image_id);
4716332bb12dSAlex Elder 	return 0;
4717332bb12dSAlex Elder }
4718332bb12dSAlex Elder 
4719200a6a8bSAlex Elder static void rbd_dev_image_release(struct rbd_device *rbd_dev)
4720200a6a8bSAlex Elder {
47216fd48b3bSAlex Elder 	int ret;
47226fd48b3bSAlex Elder 
47236fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
47246fd48b3bSAlex Elder 	ret = rbd_dev_header_watch_sync(rbd_dev, 0);
47256fd48b3bSAlex Elder 	if (ret)
47266fd48b3bSAlex Elder 		rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret);
4727200a6a8bSAlex Elder 	kfree(rbd_dev->header_name);
47286fd48b3bSAlex Elder 	rbd_dev->header_name = NULL;
47296fd48b3bSAlex Elder 	rbd_dev->image_format = 0;
47306fd48b3bSAlex Elder 	kfree(rbd_dev->spec->image_id);
47316fd48b3bSAlex Elder 	rbd_dev->spec->image_id = NULL;
47326fd48b3bSAlex Elder 
4733200a6a8bSAlex Elder 	rbd_dev_destroy(rbd_dev);
4734200a6a8bSAlex Elder }
4735200a6a8bSAlex Elder 
4736a30b71b9SAlex Elder /*
4737a30b71b9SAlex Elder  * Probe for the existence of the header object for the given rbd
4738a30b71b9SAlex Elder  * device.  For format 2 images this includes determining the image
4739a30b71b9SAlex Elder  * id.
4740a30b71b9SAlex Elder  */
474151344a38SAlex Elder static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool read_only)
4742a30b71b9SAlex Elder {
4743a30b71b9SAlex Elder 	int ret;
4744b644de2bSAlex Elder 	int tmp;
4745a30b71b9SAlex Elder 
4746a30b71b9SAlex Elder 	/*
4747a30b71b9SAlex Elder 	 * Get the id from the image id object.  If it's not a
4748a30b71b9SAlex Elder 	 * format 2 image, we'll get ENOENT back, and we'll assume
4749a30b71b9SAlex Elder 	 * it's a format 1 image.
4750a30b71b9SAlex Elder 	 */
4751a30b71b9SAlex Elder 	ret = rbd_dev_image_id(rbd_dev);
4752a30b71b9SAlex Elder 	if (ret)
4753c0fba368SAlex Elder 		return ret;
4754c0fba368SAlex Elder 	rbd_assert(rbd_dev->spec->image_id);
4755c0fba368SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
4756c0fba368SAlex Elder 
4757332bb12dSAlex Elder 	ret = rbd_dev_header_name(rbd_dev);
4758332bb12dSAlex Elder 	if (ret)
4759332bb12dSAlex Elder 		goto err_out_format;
4760332bb12dSAlex Elder 
4761b644de2bSAlex Elder 	ret = rbd_dev_header_watch_sync(rbd_dev, 1);
4762b644de2bSAlex Elder 	if (ret)
4763b644de2bSAlex Elder 		goto out_header_name;
4764b644de2bSAlex Elder 
4765c0fba368SAlex Elder 	if (rbd_dev->image_format == 1)
4766a30b71b9SAlex Elder 		ret = rbd_dev_v1_probe(rbd_dev);
4767a30b71b9SAlex Elder 	else
4768a30b71b9SAlex Elder 		ret = rbd_dev_v2_probe(rbd_dev);
47695655c4d9SAlex Elder 	if (ret)
4770b644de2bSAlex Elder 		goto err_out_watch;
4771a30b71b9SAlex Elder 
47729bb81c9bSAlex Elder 	ret = rbd_dev_spec_update(rbd_dev);
47739bb81c9bSAlex Elder 	if (ret)
477433dca39fSAlex Elder 		goto err_out_probe;
47759bb81c9bSAlex Elder 
477651344a38SAlex Elder 	/* If we are mapping a snapshot it must be marked read-only */
477751344a38SAlex Elder 
477851344a38SAlex Elder 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
477951344a38SAlex Elder 		read_only = true;
478051344a38SAlex Elder 	rbd_dev->mapping.read_only = read_only;
478151344a38SAlex Elder 
47829bb81c9bSAlex Elder 	ret = rbd_dev_probe_parent(rbd_dev);
47836fd48b3bSAlex Elder 	if (!ret)
47846fd48b3bSAlex Elder 		return 0;
478583a06263SAlex Elder 
47866fd48b3bSAlex Elder err_out_probe:
47876fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
4788b644de2bSAlex Elder err_out_watch:
4789b644de2bSAlex Elder 	tmp = rbd_dev_header_watch_sync(rbd_dev, 0);
4790b644de2bSAlex Elder 	if (tmp)
4791b644de2bSAlex Elder 		rbd_warn(rbd_dev, "unable to tear down watch request\n");
4792332bb12dSAlex Elder out_header_name:
4793332bb12dSAlex Elder 	kfree(rbd_dev->header_name);
4794332bb12dSAlex Elder 	rbd_dev->header_name = NULL;
4795332bb12dSAlex Elder err_out_format:
4796332bb12dSAlex Elder 	rbd_dev->image_format = 0;
47975655c4d9SAlex Elder 	kfree(rbd_dev->spec->image_id);
47985655c4d9SAlex Elder 	rbd_dev->spec->image_id = NULL;
47995655c4d9SAlex Elder 
48005655c4d9SAlex Elder 	dout("probe failed, returning %d\n", ret);
48015655c4d9SAlex Elder 
48025655c4d9SAlex Elder 	return ret;
480383a06263SAlex Elder }
480483a06263SAlex Elder 
480559c2be1eSYehuda Sadeh static ssize_t rbd_add(struct bus_type *bus,
480659c2be1eSYehuda Sadeh 		       const char *buf,
480759c2be1eSYehuda Sadeh 		       size_t count)
4808602adf40SYehuda Sadeh {
4809cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
4810dc79b113SAlex Elder 	struct ceph_options *ceph_opts = NULL;
48114e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
4812859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
48139d3997fdSAlex Elder 	struct rbd_client *rbdc;
481427cc2594SAlex Elder 	struct ceph_osd_client *osdc;
481551344a38SAlex Elder 	bool read_only;
481627cc2594SAlex Elder 	int rc = -ENOMEM;
4817602adf40SYehuda Sadeh 
4818602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
4819602adf40SYehuda Sadeh 		return -ENODEV;
4820602adf40SYehuda Sadeh 
4821a725f65eSAlex Elder 	/* parse add command */
4822859c31dfSAlex Elder 	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
4823dc79b113SAlex Elder 	if (rc < 0)
4824bd4ba655SAlex Elder 		goto err_out_module;
482551344a38SAlex Elder 	read_only = rbd_opts->read_only;
482651344a38SAlex Elder 	kfree(rbd_opts);
482751344a38SAlex Elder 	rbd_opts = NULL;	/* done with this */
4828a725f65eSAlex Elder 
48299d3997fdSAlex Elder 	rbdc = rbd_get_client(ceph_opts);
48309d3997fdSAlex Elder 	if (IS_ERR(rbdc)) {
48319d3997fdSAlex Elder 		rc = PTR_ERR(rbdc);
48320ddebc0cSAlex Elder 		goto err_out_args;
48339d3997fdSAlex Elder 	}
4834c53d5893SAlex Elder 	ceph_opts = NULL;	/* rbd_dev client now owns this */
4835602adf40SYehuda Sadeh 
4836602adf40SYehuda Sadeh 	/* pick the pool */
48379d3997fdSAlex Elder 	osdc = &rbdc->client->osdc;
4838859c31dfSAlex Elder 	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
4839602adf40SYehuda Sadeh 	if (rc < 0)
4840602adf40SYehuda Sadeh 		goto err_out_client;
4841859c31dfSAlex Elder 	spec->pool_id = (u64)rc;
4842859c31dfSAlex Elder 
48430903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
48440903e875SAlex Elder 
4845c0cd10dbSAlex Elder 	if (spec->pool_id > (u64)U32_MAX) {
4846c0cd10dbSAlex Elder 		rbd_warn(NULL, "pool id too large (%llu > %u)\n",
4847c0cd10dbSAlex Elder 				(unsigned long long)spec->pool_id, U32_MAX);
48480903e875SAlex Elder 		rc = -EIO;
48490903e875SAlex Elder 		goto err_out_client;
48500903e875SAlex Elder 	}
48510903e875SAlex Elder 
4852c53d5893SAlex Elder 	rbd_dev = rbd_dev_create(rbdc, spec);
4853bd4ba655SAlex Elder 	if (!rbd_dev)
4854bd4ba655SAlex Elder 		goto err_out_client;
4855c53d5893SAlex Elder 	rbdc = NULL;		/* rbd_dev now owns this */
4856c53d5893SAlex Elder 	spec = NULL;		/* rbd_dev now owns this */
4857602adf40SYehuda Sadeh 
485851344a38SAlex Elder 	rc = rbd_dev_image_probe(rbd_dev, read_only);
4859a30b71b9SAlex Elder 	if (rc < 0)
4860c53d5893SAlex Elder 		goto err_out_rbd_dev;
486105fd6f6fSAlex Elder 
4862b536f69aSAlex Elder 	rc = rbd_dev_device_setup(rbd_dev);
4863b536f69aSAlex Elder 	if (!rc)
4864602adf40SYehuda Sadeh 		return count;
4865b536f69aSAlex Elder 
4866b536f69aSAlex Elder 	rbd_dev_image_release(rbd_dev);
4867c53d5893SAlex Elder err_out_rbd_dev:
4868c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
4869bd4ba655SAlex Elder err_out_client:
48709d3997fdSAlex Elder 	rbd_put_client(rbdc);
48710ddebc0cSAlex Elder err_out_args:
487278cea76eSAlex Elder 	if (ceph_opts)
487378cea76eSAlex Elder 		ceph_destroy_options(ceph_opts);
48744e9afebaSAlex Elder 	kfree(rbd_opts);
4875859c31dfSAlex Elder 	rbd_spec_put(spec);
4876bd4ba655SAlex Elder err_out_module:
4877bd4ba655SAlex Elder 	module_put(THIS_MODULE);
487827cc2594SAlex Elder 
4879602adf40SYehuda Sadeh 	dout("Error adding device %s\n", buf);
488027cc2594SAlex Elder 
488127cc2594SAlex Elder 	return (ssize_t)rc;
4882602adf40SYehuda Sadeh }
4883602adf40SYehuda Sadeh 
4884de71a297SAlex Elder static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
4885602adf40SYehuda Sadeh {
4886602adf40SYehuda Sadeh 	struct list_head *tmp;
4887602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev;
4888602adf40SYehuda Sadeh 
4889e124a82fSAlex Elder 	spin_lock(&rbd_dev_list_lock);
4890602adf40SYehuda Sadeh 	list_for_each(tmp, &rbd_dev_list) {
4891602adf40SYehuda Sadeh 		rbd_dev = list_entry(tmp, struct rbd_device, node);
4892de71a297SAlex Elder 		if (rbd_dev->dev_id == dev_id) {
4893e124a82fSAlex Elder 			spin_unlock(&rbd_dev_list_lock);
4894602adf40SYehuda Sadeh 			return rbd_dev;
4895602adf40SYehuda Sadeh 		}
4896e124a82fSAlex Elder 	}
4897e124a82fSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
4898602adf40SYehuda Sadeh 	return NULL;
4899602adf40SYehuda Sadeh }
4900602adf40SYehuda Sadeh 
4901200a6a8bSAlex Elder static void rbd_dev_device_release(struct device *dev)
4902602adf40SYehuda Sadeh {
4903593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4904602adf40SYehuda Sadeh 
4905602adf40SYehuda Sadeh 	rbd_free_disk(rbd_dev);
4906200a6a8bSAlex Elder 	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
49076d80b130SAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
4908602adf40SYehuda Sadeh 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
4909200a6a8bSAlex Elder 	rbd_dev->major = 0;
4910e2839308SAlex Elder 	rbd_dev_id_put(rbd_dev);
4911d1cf5788SAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
4912602adf40SYehuda Sadeh }
4913602adf40SYehuda Sadeh 
491405a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
491505a46afdSAlex Elder {
4916ad945fc1SAlex Elder 	while (rbd_dev->parent) {
491705a46afdSAlex Elder 		struct rbd_device *first = rbd_dev;
491805a46afdSAlex Elder 		struct rbd_device *second = first->parent;
491905a46afdSAlex Elder 		struct rbd_device *third;
492005a46afdSAlex Elder 
492105a46afdSAlex Elder 		/*
492205a46afdSAlex Elder 		 * Follow to the parent with no grandparent and
492305a46afdSAlex Elder 		 * remove it.
492405a46afdSAlex Elder 		 */
492505a46afdSAlex Elder 		while (second && (third = second->parent)) {
492605a46afdSAlex Elder 			first = second;
492705a46afdSAlex Elder 			second = third;
492805a46afdSAlex Elder 		}
4929ad945fc1SAlex Elder 		rbd_assert(second);
49308ad42cd0SAlex Elder 		rbd_dev_image_release(second);
4931ad945fc1SAlex Elder 		first->parent = NULL;
4932ad945fc1SAlex Elder 		first->parent_overlap = 0;
4933ad945fc1SAlex Elder 
4934ad945fc1SAlex Elder 		rbd_assert(first->parent_spec);
493505a46afdSAlex Elder 		rbd_spec_put(first->parent_spec);
493605a46afdSAlex Elder 		first->parent_spec = NULL;
493705a46afdSAlex Elder 	}
493805a46afdSAlex Elder }
493905a46afdSAlex Elder 
4940dfc5606dSYehuda Sadeh static ssize_t rbd_remove(struct bus_type *bus,
4941602adf40SYehuda Sadeh 			  const char *buf,
4942602adf40SYehuda Sadeh 			  size_t count)
4943602adf40SYehuda Sadeh {
4944602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
49450d8189e1SAlex Elder 	int target_id;
4946602adf40SYehuda Sadeh 	unsigned long ul;
49470d8189e1SAlex Elder 	int ret;
4948602adf40SYehuda Sadeh 
49490d8189e1SAlex Elder 	ret = strict_strtoul(buf, 10, &ul);
49500d8189e1SAlex Elder 	if (ret)
49510d8189e1SAlex Elder 		return ret;
4952602adf40SYehuda Sadeh 
4953602adf40SYehuda Sadeh 	/* convert to int; abort if we lost anything in the conversion */
4954602adf40SYehuda Sadeh 	target_id = (int) ul;
4955602adf40SYehuda Sadeh 	if (target_id != ul)
4956602adf40SYehuda Sadeh 		return -EINVAL;
4957602adf40SYehuda Sadeh 
4958602adf40SYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
4959602adf40SYehuda Sadeh 
4960602adf40SYehuda Sadeh 	rbd_dev = __rbd_get_dev(target_id);
4961602adf40SYehuda Sadeh 	if (!rbd_dev) {
4962602adf40SYehuda Sadeh 		ret = -ENOENT;
4963602adf40SYehuda Sadeh 		goto done;
4964602adf40SYehuda Sadeh 	}
4965602adf40SYehuda Sadeh 
4966a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
4967b82d167bSAlex Elder 	if (rbd_dev->open_count)
496842382b70SAlex Elder 		ret = -EBUSY;
4969b82d167bSAlex Elder 	else
4970b82d167bSAlex Elder 		set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
4971a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
4972b82d167bSAlex Elder 	if (ret < 0)
497342382b70SAlex Elder 		goto done;
49740d8189e1SAlex Elder 	ret = count;
4975b480815aSAlex Elder 	rbd_bus_del_dev(rbd_dev);
49768ad42cd0SAlex Elder 	rbd_dev_image_release(rbd_dev);
497779ab7558SAlex Elder 	module_put(THIS_MODULE);
4978602adf40SYehuda Sadeh done:
4979602adf40SYehuda Sadeh 	mutex_unlock(&ctl_mutex);
4980aafb230eSAlex Elder 
4981602adf40SYehuda Sadeh 	return ret;
4982602adf40SYehuda Sadeh }
4983602adf40SYehuda Sadeh 
4984602adf40SYehuda Sadeh /*
4985602adf40SYehuda Sadeh  * create control files in sysfs
4986dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
4987602adf40SYehuda Sadeh  */
4988602adf40SYehuda Sadeh static int rbd_sysfs_init(void)
4989602adf40SYehuda Sadeh {
4990dfc5606dSYehuda Sadeh 	int ret;
4991602adf40SYehuda Sadeh 
4992fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
4993dfc5606dSYehuda Sadeh 	if (ret < 0)
4994dfc5606dSYehuda Sadeh 		return ret;
4995602adf40SYehuda Sadeh 
4996fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
4997fed4c143SAlex Elder 	if (ret < 0)
4998fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
4999602adf40SYehuda Sadeh 
5000602adf40SYehuda Sadeh 	return ret;
5001602adf40SYehuda Sadeh }
5002602adf40SYehuda Sadeh 
5003602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void)
5004602adf40SYehuda Sadeh {
5005dfc5606dSYehuda Sadeh 	bus_unregister(&rbd_bus_type);
5006fed4c143SAlex Elder 	device_unregister(&rbd_root_dev);
5007602adf40SYehuda Sadeh }
5008602adf40SYehuda Sadeh 
50091c2a9dfeSAlex Elder static int rbd_slab_init(void)
50101c2a9dfeSAlex Elder {
50111c2a9dfeSAlex Elder 	rbd_assert(!rbd_img_request_cache);
50121c2a9dfeSAlex Elder 	rbd_img_request_cache = kmem_cache_create("rbd_img_request",
50131c2a9dfeSAlex Elder 					sizeof (struct rbd_img_request),
50141c2a9dfeSAlex Elder 					__alignof__(struct rbd_img_request),
50151c2a9dfeSAlex Elder 					0, NULL);
5016868311b1SAlex Elder 	if (!rbd_img_request_cache)
5017868311b1SAlex Elder 		return -ENOMEM;
5018868311b1SAlex Elder 
5019868311b1SAlex Elder 	rbd_assert(!rbd_obj_request_cache);
5020868311b1SAlex Elder 	rbd_obj_request_cache = kmem_cache_create("rbd_obj_request",
5021868311b1SAlex Elder 					sizeof (struct rbd_obj_request),
5022868311b1SAlex Elder 					__alignof__(struct rbd_obj_request),
5023868311b1SAlex Elder 					0, NULL);
502478c2a44aSAlex Elder 	if (!rbd_obj_request_cache)
502578c2a44aSAlex Elder 		goto out_err;
502678c2a44aSAlex Elder 
502778c2a44aSAlex Elder 	rbd_assert(!rbd_segment_name_cache);
502878c2a44aSAlex Elder 	rbd_segment_name_cache = kmem_cache_create("rbd_segment_name",
502978c2a44aSAlex Elder 					MAX_OBJ_NAME_SIZE + 1, 1, 0, NULL);
503078c2a44aSAlex Elder 	if (rbd_segment_name_cache)
50311c2a9dfeSAlex Elder 		return 0;
503278c2a44aSAlex Elder out_err:
503378c2a44aSAlex Elder 	if (rbd_obj_request_cache) {
503478c2a44aSAlex Elder 		kmem_cache_destroy(rbd_obj_request_cache);
503578c2a44aSAlex Elder 		rbd_obj_request_cache = NULL;
503678c2a44aSAlex Elder 	}
50371c2a9dfeSAlex Elder 
5038868311b1SAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
5039868311b1SAlex Elder 	rbd_img_request_cache = NULL;
5040868311b1SAlex Elder 
50411c2a9dfeSAlex Elder 	return -ENOMEM;
50421c2a9dfeSAlex Elder }
50431c2a9dfeSAlex Elder 
50441c2a9dfeSAlex Elder static void rbd_slab_exit(void)
50451c2a9dfeSAlex Elder {
504678c2a44aSAlex Elder 	rbd_assert(rbd_segment_name_cache);
504778c2a44aSAlex Elder 	kmem_cache_destroy(rbd_segment_name_cache);
504878c2a44aSAlex Elder 	rbd_segment_name_cache = NULL;
504978c2a44aSAlex Elder 
5050868311b1SAlex Elder 	rbd_assert(rbd_obj_request_cache);
5051868311b1SAlex Elder 	kmem_cache_destroy(rbd_obj_request_cache);
5052868311b1SAlex Elder 	rbd_obj_request_cache = NULL;
5053868311b1SAlex Elder 
50541c2a9dfeSAlex Elder 	rbd_assert(rbd_img_request_cache);
50551c2a9dfeSAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
50561c2a9dfeSAlex Elder 	rbd_img_request_cache = NULL;
50571c2a9dfeSAlex Elder }
50581c2a9dfeSAlex Elder 
5059cc344fa1SAlex Elder static int __init rbd_init(void)
5060602adf40SYehuda Sadeh {
5061602adf40SYehuda Sadeh 	int rc;
5062602adf40SYehuda Sadeh 
50631e32d34cSAlex Elder 	if (!libceph_compatible(NULL)) {
50641e32d34cSAlex Elder 		rbd_warn(NULL, "libceph incompatibility (quitting)");
50651e32d34cSAlex Elder 
50661e32d34cSAlex Elder 		return -EINVAL;
50671e32d34cSAlex Elder 	}
50681c2a9dfeSAlex Elder 	rc = rbd_slab_init();
5069602adf40SYehuda Sadeh 	if (rc)
5070602adf40SYehuda Sadeh 		return rc;
50711c2a9dfeSAlex Elder 	rc = rbd_sysfs_init();
50721c2a9dfeSAlex Elder 	if (rc)
50731c2a9dfeSAlex Elder 		rbd_slab_exit();
50741c2a9dfeSAlex Elder 	else
5075f0f8cef5SAlex Elder 		pr_info("loaded " RBD_DRV_NAME_LONG "\n");
50761c2a9dfeSAlex Elder 
50771c2a9dfeSAlex Elder 	return rc;
5078602adf40SYehuda Sadeh }
5079602adf40SYehuda Sadeh 
5080cc344fa1SAlex Elder static void __exit rbd_exit(void)
5081602adf40SYehuda Sadeh {
5082602adf40SYehuda Sadeh 	rbd_sysfs_cleanup();
50831c2a9dfeSAlex Elder 	rbd_slab_exit();
5084602adf40SYehuda Sadeh }
5085602adf40SYehuda Sadeh 
5086602adf40SYehuda Sadeh module_init(rbd_init);
5087602adf40SYehuda Sadeh module_exit(rbd_exit);
5088602adf40SYehuda Sadeh 
5089602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
5090602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
5091602adf40SYehuda Sadeh MODULE_DESCRIPTION("rados block device");
5092602adf40SYehuda Sadeh 
5093602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */
5094602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
5095602adf40SYehuda Sadeh 
5096602adf40SYehuda Sadeh MODULE_LICENSE("GPL");
5097