xref: /openbmc/linux/drivers/block/rbd.c (revision b5b09be3)
1e2a58ee5SAlex Elder 
2602adf40SYehuda Sadeh /*
3602adf40SYehuda Sadeh    rbd.c -- Export ceph rados objects as a Linux block device
4602adf40SYehuda Sadeh 
5602adf40SYehuda Sadeh 
6602adf40SYehuda Sadeh    based on drivers/block/osdblk.c:
7602adf40SYehuda Sadeh 
8602adf40SYehuda Sadeh    Copyright 2009 Red Hat, Inc.
9602adf40SYehuda Sadeh 
10602adf40SYehuda Sadeh    This program is free software; you can redistribute it and/or modify
11602adf40SYehuda Sadeh    it under the terms of the GNU General Public License as published by
12602adf40SYehuda Sadeh    the Free Software Foundation.
13602adf40SYehuda Sadeh 
14602adf40SYehuda Sadeh    This program is distributed in the hope that it will be useful,
15602adf40SYehuda Sadeh    but WITHOUT ANY WARRANTY; without even the implied warranty of
16602adf40SYehuda Sadeh    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17602adf40SYehuda Sadeh    GNU General Public License for more details.
18602adf40SYehuda Sadeh 
19602adf40SYehuda Sadeh    You should have received a copy of the GNU General Public License
20602adf40SYehuda Sadeh    along with this program; see the file COPYING.  If not, write to
21602adf40SYehuda Sadeh    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
22602adf40SYehuda Sadeh 
23602adf40SYehuda Sadeh 
24602adf40SYehuda Sadeh 
25dfc5606dSYehuda Sadeh    For usage instructions, please refer to:
26602adf40SYehuda Sadeh 
27dfc5606dSYehuda Sadeh                  Documentation/ABI/testing/sysfs-bus-rbd
28602adf40SYehuda Sadeh 
29602adf40SYehuda Sadeh  */
30602adf40SYehuda Sadeh 
31602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
32602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
34602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3559c2be1eSYehuda Sadeh #include <linux/parser.h>
3630d1cff8SAlex Elder #include <linux/bsearch.h>
37602adf40SYehuda Sadeh 
38602adf40SYehuda Sadeh #include <linux/kernel.h>
39602adf40SYehuda Sadeh #include <linux/device.h>
40602adf40SYehuda Sadeh #include <linux/module.h>
41602adf40SYehuda Sadeh #include <linux/fs.h>
42602adf40SYehuda Sadeh #include <linux/blkdev.h>
431c2a9dfeSAlex Elder #include <linux/slab.h>
44602adf40SYehuda Sadeh 
45602adf40SYehuda Sadeh #include "rbd_types.h"
46602adf40SYehuda Sadeh 
/* Defining RBD_DEBUG turns on the rbd_assert() checks */
#define RBD_DEBUG

/*
 * Block I/O is measured in sectors.  Linux interprets a sector in
 * several contexts (blk, bio, genhd), but the default size is always
 * 512 bytes.  The names below are only slightly more descriptive
 * than the raw numbers they stand for.
 */
#define SECTOR_SHIFT	9
#define SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

#define RBD_DRV_NAME		"rbd"
#define RBD_DRV_NAME_LONG	"rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256	/* max minors per blkdev */

#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN \
		(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

#define BAD_SNAP_INDEX		U32_MAX	/* invalid index into snap array */

/* Sized so that one page can hold an image name sent by an OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

/* Feature bits */

#define RBD_FEATURE_LAYERING	(1<<0)
#define RBD_FEATURE_STRIPINGV2	(1<<1)
#define RBD_FEATURES_ALL \
		(RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)

/*
 * The name of an RBD device is "rbd#": "rbd" is RBD_DRV_NAME (above)
 * and # is a unique integer identifier.  MAX_INT_FORMAT_WIDTH is used
 * in ensuring DEV_NAME_LEN is big enough to hold every possible
 * device name.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)
98602adf40SYehuda Sadeh 
99602adf40SYehuda Sadeh /*
100602adf40SYehuda Sadeh  * block device image metadata (in-memory version)
101602adf40SYehuda Sadeh  */
102602adf40SYehuda Sadeh struct rbd_image_header {
103f84344f3SAlex Elder 	/* These four fields never change for a given rbd image */
104849b4260SAlex Elder 	char *object_prefix;
10534b13184SAlex Elder 	u64 features;
106602adf40SYehuda Sadeh 	__u8 obj_order;
107602adf40SYehuda Sadeh 	__u8 crypt_type;
108602adf40SYehuda Sadeh 	__u8 comp_type;
109602adf40SYehuda Sadeh 
110f84344f3SAlex Elder 	/* The remaining fields need to be updated occasionally */
111f84344f3SAlex Elder 	u64 image_size;
112f84344f3SAlex Elder 	struct ceph_snap_context *snapc;
113602adf40SYehuda Sadeh 	char *snap_names;
114602adf40SYehuda Sadeh 	u64 *snap_sizes;
11559c2be1eSYehuda Sadeh 
116500d0c0fSAlex Elder 	u64 stripe_unit;
117500d0c0fSAlex Elder 	u64 stripe_count;
11859c2be1eSYehuda Sadeh };
11959c2be1eSYehuda Sadeh 
1200d7dbfceSAlex Elder /*
1210d7dbfceSAlex Elder  * An rbd image specification.
1220d7dbfceSAlex Elder  *
1230d7dbfceSAlex Elder  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
124c66c6e0cSAlex Elder  * identify an image.  Each rbd_dev structure includes a pointer to
125c66c6e0cSAlex Elder  * an rbd_spec structure that encapsulates this identity.
126c66c6e0cSAlex Elder  *
127c66c6e0cSAlex Elder  * Each of the id's in an rbd_spec has an associated name.  For a
128c66c6e0cSAlex Elder  * user-mapped image, the names are supplied and the id's associated
129c66c6e0cSAlex Elder  * with them are looked up.  For a layered image, a parent image is
130c66c6e0cSAlex Elder  * defined by the tuple, and the names are looked up.
131c66c6e0cSAlex Elder  *
132c66c6e0cSAlex Elder  * An rbd_dev structure contains a parent_spec pointer which is
133c66c6e0cSAlex Elder  * non-null if the image it represents is a child in a layered
134c66c6e0cSAlex Elder  * image.  This pointer will refer to the rbd_spec structure used
135c66c6e0cSAlex Elder  * by the parent rbd_dev for its own identity (i.e., the structure
136c66c6e0cSAlex Elder  * is shared between the parent and child).
137c66c6e0cSAlex Elder  *
138c66c6e0cSAlex Elder  * Since these structures are populated once, during the discovery
139c66c6e0cSAlex Elder  * phase of image construction, they are effectively immutable so
140c66c6e0cSAlex Elder  * we make no effort to synchronize access to them.
141c66c6e0cSAlex Elder  *
142c66c6e0cSAlex Elder  * Note that code herein does not assume the image name is known (it
143c66c6e0cSAlex Elder  * could be a null pointer).
1440d7dbfceSAlex Elder  */
1450d7dbfceSAlex Elder struct rbd_spec {
1460d7dbfceSAlex Elder 	u64		pool_id;
147ecb4dc22SAlex Elder 	const char	*pool_name;
1480d7dbfceSAlex Elder 
149ecb4dc22SAlex Elder 	const char	*image_id;
150ecb4dc22SAlex Elder 	const char	*image_name;
1510d7dbfceSAlex Elder 
1520d7dbfceSAlex Elder 	u64		snap_id;
153ecb4dc22SAlex Elder 	const char	*snap_name;
1540d7dbfceSAlex Elder 
1550d7dbfceSAlex Elder 	struct kref	kref;
1560d7dbfceSAlex Elder };
1570d7dbfceSAlex Elder 
158602adf40SYehuda Sadeh /*
159f0f8cef5SAlex Elder  * an instance of the client.  multiple devices may share an rbd client.
160602adf40SYehuda Sadeh  */
161602adf40SYehuda Sadeh struct rbd_client {
162602adf40SYehuda Sadeh 	struct ceph_client	*client;
163602adf40SYehuda Sadeh 	struct kref		kref;
164602adf40SYehuda Sadeh 	struct list_head	node;
165602adf40SYehuda Sadeh };
166602adf40SYehuda Sadeh 
/* Callback type taking an image request */
struct rbd_img_request;
typedef void (*rbd_img_callback_t)(struct rbd_img_request *);

/* Value of "which" for an object request with no image request position */
#define	BAD_WHICH	U32_MAX

/* Callback type taking an object request */
struct rbd_obj_request;
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);
174bf0d5f50SAlex Elder 
/* Selects which data member of struct rbd_obj_request is in use */
enum obj_request_type {
	OBJ_REQUEST_NODATA,
	OBJ_REQUEST_BIO,
	OBJ_REQUEST_PAGES
};
178bf0d5f50SAlex Elder 
/* Bit numbers used in rbd_obj_request->flags */
enum obj_req_flags {
	/* completion flag: not done = 0, done = 1 */
	OBJ_REQ_DONE,
	/* object usage: standalone = 0, image = 1 */
	OBJ_REQ_IMG_DATA,
	/* EXISTS flag valid: no = 0, yes = 1 */
	OBJ_REQ_KNOWN,
	/* target exists: no = 0, yes = 1 */
	OBJ_REQ_EXISTS,
};
185926f9b3fSAlex Elder 
186bf0d5f50SAlex Elder struct rbd_obj_request {
187bf0d5f50SAlex Elder 	const char		*object_name;
188bf0d5f50SAlex Elder 	u64			offset;		/* object start byte */
189bf0d5f50SAlex Elder 	u64			length;		/* bytes from offset */
190926f9b3fSAlex Elder 	unsigned long		flags;
191bf0d5f50SAlex Elder 
192c5b5ef6cSAlex Elder 	/*
193c5b5ef6cSAlex Elder 	 * An object request associated with an image will have its
194c5b5ef6cSAlex Elder 	 * img_data flag set; a standalone object request will not.
195c5b5ef6cSAlex Elder 	 *
196c5b5ef6cSAlex Elder 	 * A standalone object request will have which == BAD_WHICH
197c5b5ef6cSAlex Elder 	 * and a null obj_request pointer.
198c5b5ef6cSAlex Elder 	 *
199c5b5ef6cSAlex Elder 	 * An object request initiated in support of a layered image
200c5b5ef6cSAlex Elder 	 * object (to check for its existence before a write) will
201c5b5ef6cSAlex Elder 	 * have which == BAD_WHICH and a non-null obj_request pointer.
202c5b5ef6cSAlex Elder 	 *
203c5b5ef6cSAlex Elder 	 * Finally, an object request for rbd image data will have
204c5b5ef6cSAlex Elder 	 * which != BAD_WHICH, and will have a non-null img_request
205c5b5ef6cSAlex Elder 	 * pointer.  The value of which will be in the range
206c5b5ef6cSAlex Elder 	 * 0..(img_request->obj_request_count-1).
207c5b5ef6cSAlex Elder 	 */
208c5b5ef6cSAlex Elder 	union {
209c5b5ef6cSAlex Elder 		struct rbd_obj_request	*obj_request;	/* STAT op */
210c5b5ef6cSAlex Elder 		struct {
211bf0d5f50SAlex Elder 			struct rbd_img_request	*img_request;
212c5b5ef6cSAlex Elder 			u64			img_offset;
213c5b5ef6cSAlex Elder 			/* links for img_request->obj_requests list */
214c5b5ef6cSAlex Elder 			struct list_head	links;
215c5b5ef6cSAlex Elder 		};
216c5b5ef6cSAlex Elder 	};
217bf0d5f50SAlex Elder 	u32			which;		/* posn image request list */
218bf0d5f50SAlex Elder 
219bf0d5f50SAlex Elder 	enum obj_request_type	type;
220788e2df3SAlex Elder 	union {
221bf0d5f50SAlex Elder 		struct bio	*bio_list;
222788e2df3SAlex Elder 		struct {
223788e2df3SAlex Elder 			struct page	**pages;
224788e2df3SAlex Elder 			u32		page_count;
225788e2df3SAlex Elder 		};
226788e2df3SAlex Elder 	};
2270eefd470SAlex Elder 	struct page		**copyup_pages;
228bf0d5f50SAlex Elder 
229bf0d5f50SAlex Elder 	struct ceph_osd_request	*osd_req;
230bf0d5f50SAlex Elder 
231bf0d5f50SAlex Elder 	u64			xferred;	/* bytes transferred */
2321b83bef2SSage Weil 	int			result;
233bf0d5f50SAlex Elder 
234bf0d5f50SAlex Elder 	rbd_obj_callback_t	callback;
235788e2df3SAlex Elder 	struct completion	completion;
236bf0d5f50SAlex Elder 
237bf0d5f50SAlex Elder 	struct kref		kref;
238bf0d5f50SAlex Elder };
239bf0d5f50SAlex Elder 
/* Bit numbers used in rbd_img_request->flags */
enum img_req_flags {
	/* I/O direction: read = 0, write = 1 */
	IMG_REQ_WRITE,
	/* initiator: block = 0, child image = 1 */
	IMG_REQ_CHILD,
	/* ENOENT handling: normal = 0, layered = 1 */
	IMG_REQ_LAYERED,
};
2450c425248SAlex Elder 
246bf0d5f50SAlex Elder struct rbd_img_request {
247bf0d5f50SAlex Elder 	struct rbd_device	*rbd_dev;
248bf0d5f50SAlex Elder 	u64			offset;	/* starting image byte offset */
249bf0d5f50SAlex Elder 	u64			length;	/* byte count from offset */
2500c425248SAlex Elder 	unsigned long		flags;
251bf0d5f50SAlex Elder 	union {
252bf0d5f50SAlex Elder 		u64			snap_id;	/* for reads */
2539849e986SAlex Elder 		struct ceph_snap_context *snapc;	/* for writes */
2549849e986SAlex Elder 	};
2559849e986SAlex Elder 	union {
2569849e986SAlex Elder 		struct request		*rq;		/* block request */
2579849e986SAlex Elder 		struct rbd_obj_request	*obj_request;	/* obj req initiator */
258bf0d5f50SAlex Elder 	};
2593d7efd18SAlex Elder 	struct page		**copyup_pages;
260bf0d5f50SAlex Elder 	spinlock_t		completion_lock;/* protects next_completion */
261bf0d5f50SAlex Elder 	u32			next_completion;
262bf0d5f50SAlex Elder 	rbd_img_callback_t	callback;
26355f27e09SAlex Elder 	u64			xferred;/* aggregate bytes transferred */
264a5a337d4SAlex Elder 	int			result;	/* first nonzero obj_request result */
265bf0d5f50SAlex Elder 
266bf0d5f50SAlex Elder 	u32			obj_request_count;
267bf0d5f50SAlex Elder 	struct list_head	obj_requests;	/* rbd_obj_request structs */
268bf0d5f50SAlex Elder 
269bf0d5f50SAlex Elder 	struct kref		kref;
270bf0d5f50SAlex Elder };
271bf0d5f50SAlex Elder 
/*
 * Iterators over the object requests linked on an image request's
 * obj_requests list.  Note that the "safe" variant walks the list in
 * reverse.
 */
#define for_each_obj_request(ireq, oreq)				\
	list_for_each_entry(oreq, &(ireq)->obj_requests, links)

#define for_each_obj_request_from(ireq, oreq)				\
	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)

#define for_each_obj_request_safe(ireq, oreq, n)			\
	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
278bf0d5f50SAlex Elder 
279f84344f3SAlex Elder struct rbd_mapping {
28099c1f08fSAlex Elder 	u64                     size;
28134b13184SAlex Elder 	u64                     features;
282f84344f3SAlex Elder 	bool			read_only;
283f84344f3SAlex Elder };
284f84344f3SAlex Elder 
285602adf40SYehuda Sadeh /*
286602adf40SYehuda Sadeh  * a single device
287602adf40SYehuda Sadeh  */
288602adf40SYehuda Sadeh struct rbd_device {
289de71a297SAlex Elder 	int			dev_id;		/* blkdev unique id */
290602adf40SYehuda Sadeh 
291602adf40SYehuda Sadeh 	int			major;		/* blkdev assigned major */
292602adf40SYehuda Sadeh 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
293602adf40SYehuda Sadeh 
294a30b71b9SAlex Elder 	u32			image_format;	/* Either 1 or 2 */
295602adf40SYehuda Sadeh 	struct rbd_client	*rbd_client;
296602adf40SYehuda Sadeh 
297602adf40SYehuda Sadeh 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
298602adf40SYehuda Sadeh 
299b82d167bSAlex Elder 	spinlock_t		lock;		/* queue, flags, open_count */
300602adf40SYehuda Sadeh 
301602adf40SYehuda Sadeh 	struct rbd_image_header	header;
302b82d167bSAlex Elder 	unsigned long		flags;		/* possibly lock protected */
3030d7dbfceSAlex Elder 	struct rbd_spec		*spec;
304602adf40SYehuda Sadeh 
3050d7dbfceSAlex Elder 	char			*header_name;
306971f839aSAlex Elder 
3070903e875SAlex Elder 	struct ceph_file_layout	layout;
3080903e875SAlex Elder 
30959c2be1eSYehuda Sadeh 	struct ceph_osd_event   *watch_event;
310975241afSAlex Elder 	struct rbd_obj_request	*watch_request;
31159c2be1eSYehuda Sadeh 
31286b00e0dSAlex Elder 	struct rbd_spec		*parent_spec;
31386b00e0dSAlex Elder 	u64			parent_overlap;
3142f82ee54SAlex Elder 	struct rbd_device	*parent;
31586b00e0dSAlex Elder 
316c666601aSJosh Durgin 	/* protects updating the header */
317c666601aSJosh Durgin 	struct rw_semaphore     header_rwsem;
318f84344f3SAlex Elder 
319f84344f3SAlex Elder 	struct rbd_mapping	mapping;
320602adf40SYehuda Sadeh 
321602adf40SYehuda Sadeh 	struct list_head	node;
322dfc5606dSYehuda Sadeh 
323dfc5606dSYehuda Sadeh 	/* sysfs related */
324dfc5606dSYehuda Sadeh 	struct device		dev;
325b82d167bSAlex Elder 	unsigned long		open_count;	/* protected by lock */
326dfc5606dSYehuda Sadeh };
327dfc5606dSYehuda Sadeh 
/*
 * Bit numbers for rbd_dev->flags.  Where atomic access is needed,
 * rbd_dev->lock provides the protection.
 *
 * At present only the "removing" flag (which is coupled with the
 * "open_count" field) needs atomic access.
 */
enum rbd_dev_flags {
	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_EXISTS,
	/* this mapping is being removed */
	RBD_DEV_FLAG_REMOVING,
};
3396d292906SAlex Elder 
340602adf40SYehuda Sadeh static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */
341e124a82fSAlex Elder 
342602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list);    /* devices */
343e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock);
344e124a82fSAlex Elder 
345602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list);		/* clients */
346432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock);
347602adf40SYehuda Sadeh 
34878c2a44aSAlex Elder /* Slab caches for frequently-allocated structures */
34978c2a44aSAlex Elder 
3501c2a9dfeSAlex Elder static struct kmem_cache	*rbd_img_request_cache;
351868311b1SAlex Elder static struct kmem_cache	*rbd_obj_request_cache;
35278c2a44aSAlex Elder static struct kmem_cache	*rbd_segment_name_cache;
3531c2a9dfeSAlex Elder 
3543d7efd18SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request);
3553d7efd18SAlex Elder 
356200a6a8bSAlex Elder static void rbd_dev_device_release(struct device *dev);
357dfc5606dSYehuda Sadeh 
358f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf,
359f0f8cef5SAlex Elder 		       size_t count);
360f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
361f0f8cef5SAlex Elder 			  size_t count);
36271f293e2SAlex Elder static int rbd_dev_image_probe(struct rbd_device *rbd_dev);
363f0f8cef5SAlex Elder 
364f0f8cef5SAlex Elder static struct bus_attribute rbd_bus_attrs[] = {
365f0f8cef5SAlex Elder 	__ATTR(add, S_IWUSR, NULL, rbd_add),
366f0f8cef5SAlex Elder 	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
367f0f8cef5SAlex Elder 	__ATTR_NULL
368f0f8cef5SAlex Elder };
369f0f8cef5SAlex Elder 
370f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = {
371f0f8cef5SAlex Elder 	.name		= "rbd",
372f0f8cef5SAlex Elder 	.bus_attrs	= rbd_bus_attrs,
373f0f8cef5SAlex Elder };
374f0f8cef5SAlex Elder 
375f0f8cef5SAlex Elder static void rbd_root_dev_release(struct device *dev)
376f0f8cef5SAlex Elder {
377f0f8cef5SAlex Elder }
378f0f8cef5SAlex Elder 
379f0f8cef5SAlex Elder static struct device rbd_root_dev = {
380f0f8cef5SAlex Elder 	.init_name =    "rbd",
381f0f8cef5SAlex Elder 	.release =      rbd_root_dev_release,
382f0f8cef5SAlex Elder };
383f0f8cef5SAlex Elder 
38406ecc6cbSAlex Elder static __printf(2, 3)
38506ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
38606ecc6cbSAlex Elder {
38706ecc6cbSAlex Elder 	struct va_format vaf;
38806ecc6cbSAlex Elder 	va_list args;
38906ecc6cbSAlex Elder 
39006ecc6cbSAlex Elder 	va_start(args, fmt);
39106ecc6cbSAlex Elder 	vaf.fmt = fmt;
39206ecc6cbSAlex Elder 	vaf.va = &args;
39306ecc6cbSAlex Elder 
39406ecc6cbSAlex Elder 	if (!rbd_dev)
39506ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
39606ecc6cbSAlex Elder 	else if (rbd_dev->disk)
39706ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %s: %pV\n",
39806ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
39906ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_name)
40006ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: image %s: %pV\n",
40106ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
40206ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_id)
40306ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: id %s: %pV\n",
40406ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
40506ecc6cbSAlex Elder 	else	/* punt */
40606ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
40706ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev, &vaf);
40806ecc6cbSAlex Elder 	va_end(args);
40906ecc6cbSAlex Elder }
41006ecc6cbSAlex Elder 
/*
 * rbd_assert(expr) - driver-local assertion.
 *
 * When RBD_DEBUG is defined, a false @expr logs the function name,
 * the line number and the text of the expression at KERN_ERR level
 * and then calls BUG().  Without RBD_DEBUG the macro expands to a
 * no-op and @expr is not evaluated.
 *
 * The checking variant is wrapped in do { } while (0) so that it is
 * a single statement: "if (x) rbd_assert(y); else ..." then parses
 * as written instead of breaking on the trailing semicolon.
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
423dfc5606dSYehuda Sadeh 
424b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
42505a46afdSAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
42605a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
4278b3e1a56SAlex Elder 
428cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev);
429cc4a38bdSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev);
43054cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
43154cac61fSAlex Elder 					u64 snap_id);
4322ad3d716SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
4332ad3d716SAlex Elder 				u8 *order, u64 *snap_size);
4342ad3d716SAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
4352ad3d716SAlex Elder 		u64 *snap_features);
4362ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name);
43759c2be1eSYehuda Sadeh 
438602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
439602adf40SYehuda Sadeh {
440f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
441b82d167bSAlex Elder 	bool removing = false;
442602adf40SYehuda Sadeh 
443f84344f3SAlex Elder 	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
444602adf40SYehuda Sadeh 		return -EROFS;
445602adf40SYehuda Sadeh 
446a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
447b82d167bSAlex Elder 	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
448b82d167bSAlex Elder 		removing = true;
449b82d167bSAlex Elder 	else
450b82d167bSAlex Elder 		rbd_dev->open_count++;
451a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
452b82d167bSAlex Elder 	if (removing)
453b82d167bSAlex Elder 		return -ENOENT;
454b82d167bSAlex Elder 
45542382b70SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
456c3e946ceSAlex Elder 	(void) get_device(&rbd_dev->dev);
457f84344f3SAlex Elder 	set_device_ro(bdev, rbd_dev->mapping.read_only);
45842382b70SAlex Elder 	mutex_unlock(&ctl_mutex);
459340c7a2bSAlex Elder 
460602adf40SYehuda Sadeh 	return 0;
461602adf40SYehuda Sadeh }
462602adf40SYehuda Sadeh 
463dfc5606dSYehuda Sadeh static int rbd_release(struct gendisk *disk, fmode_t mode)
464dfc5606dSYehuda Sadeh {
465dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
466b82d167bSAlex Elder 	unsigned long open_count_before;
467b82d167bSAlex Elder 
468a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
469b82d167bSAlex Elder 	open_count_before = rbd_dev->open_count--;
470a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
471b82d167bSAlex Elder 	rbd_assert(open_count_before > 0);
472dfc5606dSYehuda Sadeh 
47342382b70SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
474c3e946ceSAlex Elder 	put_device(&rbd_dev->dev);
47542382b70SAlex Elder 	mutex_unlock(&ctl_mutex);
476dfc5606dSYehuda Sadeh 
477dfc5606dSYehuda Sadeh 	return 0;
478dfc5606dSYehuda Sadeh }
479dfc5606dSYehuda Sadeh 
480602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = {
481602adf40SYehuda Sadeh 	.owner			= THIS_MODULE,
482602adf40SYehuda Sadeh 	.open			= rbd_open,
483dfc5606dSYehuda Sadeh 	.release		= rbd_release,
484602adf40SYehuda Sadeh };
485602adf40SYehuda Sadeh 
486602adf40SYehuda Sadeh /*
487602adf40SYehuda Sadeh  * Initialize an rbd client instance.
48843ae4701SAlex Elder  * We own *ceph_opts.
489602adf40SYehuda Sadeh  */
490f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
491602adf40SYehuda Sadeh {
492602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
493602adf40SYehuda Sadeh 	int ret = -ENOMEM;
494602adf40SYehuda Sadeh 
49537206ee5SAlex Elder 	dout("%s:\n", __func__);
496602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
497602adf40SYehuda Sadeh 	if (!rbdc)
498602adf40SYehuda Sadeh 		goto out_opt;
499602adf40SYehuda Sadeh 
500602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
501602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
502602adf40SYehuda Sadeh 
503bc534d86SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
504bc534d86SAlex Elder 
50543ae4701SAlex Elder 	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
506602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
507bc534d86SAlex Elder 		goto out_mutex;
50843ae4701SAlex Elder 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
509602adf40SYehuda Sadeh 
510602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
511602adf40SYehuda Sadeh 	if (ret < 0)
512602adf40SYehuda Sadeh 		goto out_err;
513602adf40SYehuda Sadeh 
514432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
515602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
516432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
517602adf40SYehuda Sadeh 
518bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
51937206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
520bc534d86SAlex Elder 
521602adf40SYehuda Sadeh 	return rbdc;
522602adf40SYehuda Sadeh 
523602adf40SYehuda Sadeh out_err:
524602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
525bc534d86SAlex Elder out_mutex:
526bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
527602adf40SYehuda Sadeh 	kfree(rbdc);
528602adf40SYehuda Sadeh out_opt:
52943ae4701SAlex Elder 	if (ceph_opts)
53043ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
53137206ee5SAlex Elder 	dout("%s: error %d\n", __func__, ret);
53237206ee5SAlex Elder 
53328f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
534602adf40SYehuda Sadeh }
535602adf40SYehuda Sadeh 
5362f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
5372f82ee54SAlex Elder {
5382f82ee54SAlex Elder 	kref_get(&rbdc->kref);
5392f82ee54SAlex Elder 
5402f82ee54SAlex Elder 	return rbdc;
5412f82ee54SAlex Elder }
5422f82ee54SAlex Elder 
543602adf40SYehuda Sadeh /*
5441f7ba331SAlex Elder  * Find a ceph client with specific addr and configuration.  If
5451f7ba331SAlex Elder  * found, bump its reference count.
546602adf40SYehuda Sadeh  */
5471f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
548602adf40SYehuda Sadeh {
549602adf40SYehuda Sadeh 	struct rbd_client *client_node;
5501f7ba331SAlex Elder 	bool found = false;
551602adf40SYehuda Sadeh 
55243ae4701SAlex Elder 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
553602adf40SYehuda Sadeh 		return NULL;
554602adf40SYehuda Sadeh 
5551f7ba331SAlex Elder 	spin_lock(&rbd_client_list_lock);
5561f7ba331SAlex Elder 	list_for_each_entry(client_node, &rbd_client_list, node) {
5571f7ba331SAlex Elder 		if (!ceph_compare_options(ceph_opts, client_node->client)) {
5582f82ee54SAlex Elder 			__rbd_get_client(client_node);
5592f82ee54SAlex Elder 
5601f7ba331SAlex Elder 			found = true;
5611f7ba331SAlex Elder 			break;
5621f7ba331SAlex Elder 		}
5631f7ba331SAlex Elder 	}
5641f7ba331SAlex Elder 	spin_unlock(&rbd_client_list_lock);
5651f7ba331SAlex Elder 
5661f7ba331SAlex Elder 	return found ? client_node : NULL;
567602adf40SYehuda Sadeh }
568602adf40SYehuda Sadeh 
/*
 * Mount option tokens.  The Opt_last_* entries are markers that
 * separate the int, string and Boolean groups of tokens.
 */
enum {
	/* int args go above this marker */
	Opt_last_int,
	/* string args go above this marker */
	Opt_last_string,
	Opt_read_only,
	Opt_read_write,
	/* Boolean args go above this marker */
	Opt_last_bool,
};
58259c2be1eSYehuda Sadeh 
58343ae4701SAlex Elder static match_table_t rbd_opts_tokens = {
58459c2be1eSYehuda Sadeh 	/* int args above */
58559c2be1eSYehuda Sadeh 	/* string args above */
586be466c1cSAlex Elder 	{Opt_read_only, "read_only"},
587cc0538b6SAlex Elder 	{Opt_read_only, "ro"},		/* Alternate spelling */
588cc0538b6SAlex Elder 	{Opt_read_write, "read_write"},
589cc0538b6SAlex Elder 	{Opt_read_write, "rw"},		/* Alternate spelling */
590cc0538b6SAlex Elder 	/* Boolean args above */
59159c2be1eSYehuda Sadeh 	{-1, NULL}
59259c2be1eSYehuda Sadeh };
59359c2be1eSYehuda Sadeh 
/* Options filled in by parse_rbd_opts_token() */
struct rbd_options {
	bool	read_only;	/* set by "read_only"/"ro", cleared by "read_write"/"rw" */
};

/* Default value for rbd_options.read_only */
#define RBD_READ_ONLY_DEFAULT	false
59998571b5aSAlex Elder 
60059c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
60159c2be1eSYehuda Sadeh {
60243ae4701SAlex Elder 	struct rbd_options *rbd_opts = private;
60359c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
60459c2be1eSYehuda Sadeh 	int token, intval, ret;
60559c2be1eSYehuda Sadeh 
60643ae4701SAlex Elder 	token = match_token(c, rbd_opts_tokens, argstr);
60759c2be1eSYehuda Sadeh 	if (token < 0)
60859c2be1eSYehuda Sadeh 		return -EINVAL;
60959c2be1eSYehuda Sadeh 
61059c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
61159c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
61259c2be1eSYehuda Sadeh 		if (ret < 0) {
61359c2be1eSYehuda Sadeh 			pr_err("bad mount option arg (not int) "
61459c2be1eSYehuda Sadeh 			       "at '%s'\n", c);
61559c2be1eSYehuda Sadeh 			return ret;
61659c2be1eSYehuda Sadeh 		}
61759c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
61859c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
61959c2be1eSYehuda Sadeh 		dout("got string token %d val %s\n", token,
62059c2be1eSYehuda Sadeh 		     argstr[0].from);
621cc0538b6SAlex Elder 	} else if (token > Opt_last_string && token < Opt_last_bool) {
622cc0538b6SAlex Elder 		dout("got Boolean token %d\n", token);
62359c2be1eSYehuda Sadeh 	} else {
62459c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
62559c2be1eSYehuda Sadeh 	}
62659c2be1eSYehuda Sadeh 
62759c2be1eSYehuda Sadeh 	switch (token) {
628cc0538b6SAlex Elder 	case Opt_read_only:
629cc0538b6SAlex Elder 		rbd_opts->read_only = true;
630cc0538b6SAlex Elder 		break;
631cc0538b6SAlex Elder 	case Opt_read_write:
632cc0538b6SAlex Elder 		rbd_opts->read_only = false;
633cc0538b6SAlex Elder 		break;
63459c2be1eSYehuda Sadeh 	default:
635aafb230eSAlex Elder 		rbd_assert(false);
636aafb230eSAlex Elder 		break;
63759c2be1eSYehuda Sadeh 	}
63859c2be1eSYehuda Sadeh 	return 0;
63959c2be1eSYehuda Sadeh }
64059c2be1eSYehuda Sadeh 
/*
 * Get a ceph client matching the given options, creating one if no
 * existing client is found.
 *
 * When an existing client is reused, ceph_opts is destroyed here;
 * otherwise it is passed on to rbd_client_create().  Returns whatever
 * rbd_client_find() or rbd_client_create() returned.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc = rbd_client_find(ceph_opts);

	if (!rbdc)
		return rbd_client_create(ceph_opts);

	/* Using an existing client, so these options are not needed */
	ceph_destroy_options(ceph_opts);

	return rbdc;
}
657602adf40SYehuda Sadeh 
658602adf40SYehuda Sadeh /*
659602adf40SYehuda Sadeh  * Destroy ceph client
660d23a4b3fSAlex Elder  *
661432b8587SAlex Elder  * Caller must hold rbd_client_list_lock.
662602adf40SYehuda Sadeh  */
663602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
664602adf40SYehuda Sadeh {
665602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
666602adf40SYehuda Sadeh 
66737206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
668cd9d9f5dSAlex Elder 	spin_lock(&rbd_client_list_lock);
669602adf40SYehuda Sadeh 	list_del(&rbdc->node);
670cd9d9f5dSAlex Elder 	spin_unlock(&rbd_client_list_lock);
671602adf40SYehuda Sadeh 
672602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
673602adf40SYehuda Sadeh 	kfree(rbdc);
674602adf40SYehuda Sadeh }
675602adf40SYehuda Sadeh 
676602adf40SYehuda Sadeh /*
677602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
678602adf40SYehuda Sadeh  * it.
679602adf40SYehuda Sadeh  */
6809d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc)
681602adf40SYehuda Sadeh {
682c53d5893SAlex Elder 	if (rbdc)
6839d3997fdSAlex Elder 		kref_put(&rbdc->kref, rbd_client_release);
684602adf40SYehuda Sadeh }
685602adf40SYehuda Sadeh 
686a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format)
687a30b71b9SAlex Elder {
688a30b71b9SAlex Elder 	return image_format == 1 || image_format == 2;
689a30b71b9SAlex Elder }
690a30b71b9SAlex Elder 
6918e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
6928e94af8eSAlex Elder {
693103a150fSAlex Elder 	size_t size;
694103a150fSAlex Elder 	u32 snap_count;
695103a150fSAlex Elder 
696103a150fSAlex Elder 	/* The header has to start with the magic rbd header text */
697103a150fSAlex Elder 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
698103a150fSAlex Elder 		return false;
699103a150fSAlex Elder 
700db2388b6SAlex Elder 	/* The bio layer requires at least sector-sized I/O */
701db2388b6SAlex Elder 
702db2388b6SAlex Elder 	if (ondisk->options.order < SECTOR_SHIFT)
703db2388b6SAlex Elder 		return false;
704db2388b6SAlex Elder 
705db2388b6SAlex Elder 	/* If we use u64 in a few spots we may be able to loosen this */
706db2388b6SAlex Elder 
707db2388b6SAlex Elder 	if (ondisk->options.order > 8 * sizeof (int) - 1)
708db2388b6SAlex Elder 		return false;
709db2388b6SAlex Elder 
710103a150fSAlex Elder 	/*
711103a150fSAlex Elder 	 * The size of a snapshot header has to fit in a size_t, and
712103a150fSAlex Elder 	 * that limits the number of snapshots.
713103a150fSAlex Elder 	 */
714103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
715103a150fSAlex Elder 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
716103a150fSAlex Elder 	if (snap_count > size / sizeof (__le64))
717103a150fSAlex Elder 		return false;
718103a150fSAlex Elder 
719103a150fSAlex Elder 	/*
720103a150fSAlex Elder 	 * Not only that, but the size of the entire the snapshot
721103a150fSAlex Elder 	 * header must also be representable in a size_t.
722103a150fSAlex Elder 	 */
723103a150fSAlex Elder 	size -= snap_count * sizeof (__le64);
724103a150fSAlex Elder 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
725103a150fSAlex Elder 		return false;
726103a150fSAlex Elder 
727103a150fSAlex Elder 	return true;
7288e94af8eSAlex Elder }
7298e94af8eSAlex Elder 
730602adf40SYehuda Sadeh /*
731602adf40SYehuda Sadeh  * Create a new header structure, translate header format from the on-disk
732602adf40SYehuda Sadeh  * header.
733602adf40SYehuda Sadeh  */
734602adf40SYehuda Sadeh static int rbd_header_from_disk(struct rbd_image_header *header,
7354156d998SAlex Elder 				 struct rbd_image_header_ondisk *ondisk)
736602adf40SYehuda Sadeh {
737ccece235SAlex Elder 	u32 snap_count;
73858c17b0eSAlex Elder 	size_t len;
739d2bb24e5SAlex Elder 	size_t size;
740621901d6SAlex Elder 	u32 i;
741602adf40SYehuda Sadeh 
7426a52325fSAlex Elder 	memset(header, 0, sizeof (*header));
7436a52325fSAlex Elder 
744103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
745103a150fSAlex Elder 
74658c17b0eSAlex Elder 	len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
74758c17b0eSAlex Elder 	header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
7486a52325fSAlex Elder 	if (!header->object_prefix)
749602adf40SYehuda Sadeh 		return -ENOMEM;
75058c17b0eSAlex Elder 	memcpy(header->object_prefix, ondisk->object_prefix, len);
75158c17b0eSAlex Elder 	header->object_prefix[len] = '\0';
75200f1f36fSAlex Elder 
753602adf40SYehuda Sadeh 	if (snap_count) {
754f785cc1dSAlex Elder 		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
755f785cc1dSAlex Elder 
756621901d6SAlex Elder 		/* Save a copy of the snapshot names */
757621901d6SAlex Elder 
758f785cc1dSAlex Elder 		if (snap_names_len > (u64) SIZE_MAX)
759f785cc1dSAlex Elder 			return -EIO;
760f785cc1dSAlex Elder 		header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
761602adf40SYehuda Sadeh 		if (!header->snap_names)
7626a52325fSAlex Elder 			goto out_err;
763f785cc1dSAlex Elder 		/*
764f785cc1dSAlex Elder 		 * Note that rbd_dev_v1_header_read() guarantees
765f785cc1dSAlex Elder 		 * the ondisk buffer we're working with has
766f785cc1dSAlex Elder 		 * snap_names_len bytes beyond the end of the
767f785cc1dSAlex Elder 		 * snapshot id array, this memcpy() is safe.
768f785cc1dSAlex Elder 		 */
769f785cc1dSAlex Elder 		memcpy(header->snap_names, &ondisk->snaps[snap_count],
770f785cc1dSAlex Elder 			snap_names_len);
7716a52325fSAlex Elder 
772621901d6SAlex Elder 		/* Record each snapshot's size */
773621901d6SAlex Elder 
774d2bb24e5SAlex Elder 		size = snap_count * sizeof (*header->snap_sizes);
775d2bb24e5SAlex Elder 		header->snap_sizes = kmalloc(size, GFP_KERNEL);
776602adf40SYehuda Sadeh 		if (!header->snap_sizes)
7776a52325fSAlex Elder 			goto out_err;
778621901d6SAlex Elder 		for (i = 0; i < snap_count; i++)
779621901d6SAlex Elder 			header->snap_sizes[i] =
780621901d6SAlex Elder 				le64_to_cpu(ondisk->snaps[i].image_size);
781602adf40SYehuda Sadeh 	} else {
782602adf40SYehuda Sadeh 		header->snap_names = NULL;
783602adf40SYehuda Sadeh 		header->snap_sizes = NULL;
784602adf40SYehuda Sadeh 	}
785849b4260SAlex Elder 
78634b13184SAlex Elder 	header->features = 0;	/* No features support in v1 images */
787602adf40SYehuda Sadeh 	header->obj_order = ondisk->options.order;
788602adf40SYehuda Sadeh 	header->crypt_type = ondisk->options.crypt_type;
789602adf40SYehuda Sadeh 	header->comp_type = ondisk->options.comp_type;
7906a52325fSAlex Elder 
791621901d6SAlex Elder 	/* Allocate and fill in the snapshot context */
792621901d6SAlex Elder 
793f84344f3SAlex Elder 	header->image_size = le64_to_cpu(ondisk->image_size);
794468521c1SAlex Elder 
795812164f8SAlex Elder 	header->snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
7966a52325fSAlex Elder 	if (!header->snapc)
7976a52325fSAlex Elder 		goto out_err;
798505cbb9bSAlex Elder 	header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
799621901d6SAlex Elder 	for (i = 0; i < snap_count; i++)
800468521c1SAlex Elder 		header->snapc->snaps[i] = le64_to_cpu(ondisk->snaps[i].id);
801602adf40SYehuda Sadeh 
802602adf40SYehuda Sadeh 	return 0;
803602adf40SYehuda Sadeh 
8046a52325fSAlex Elder out_err:
805849b4260SAlex Elder 	kfree(header->snap_sizes);
806ccece235SAlex Elder 	header->snap_sizes = NULL;
807602adf40SYehuda Sadeh 	kfree(header->snap_names);
808ccece235SAlex Elder 	header->snap_names = NULL;
8096a52325fSAlex Elder 	kfree(header->object_prefix);
8106a52325fSAlex Elder 	header->object_prefix = NULL;
811ccece235SAlex Elder 
81200f1f36fSAlex Elder 	return -ENOMEM;
813602adf40SYehuda Sadeh }
814602adf40SYehuda Sadeh 
8159682fc6dSAlex Elder static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
8169682fc6dSAlex Elder {
8179682fc6dSAlex Elder 	const char *snap_name;
8189682fc6dSAlex Elder 
8199682fc6dSAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
8209682fc6dSAlex Elder 
8219682fc6dSAlex Elder 	/* Skip over names until we find the one we are looking for */
8229682fc6dSAlex Elder 
8239682fc6dSAlex Elder 	snap_name = rbd_dev->header.snap_names;
8249682fc6dSAlex Elder 	while (which--)
8259682fc6dSAlex Elder 		snap_name += strlen(snap_name) + 1;
8269682fc6dSAlex Elder 
8279682fc6dSAlex Elder 	return kstrdup(snap_name, GFP_KERNEL);
8289682fc6dSAlex Elder }
8299682fc6dSAlex Elder 
83030d1cff8SAlex Elder /*
83130d1cff8SAlex Elder  * Snapshot id comparison function for use with qsort()/bsearch().
83230d1cff8SAlex Elder  * Note that result is for snapshots in *descending* order.
83330d1cff8SAlex Elder  */
83430d1cff8SAlex Elder static int snapid_compare_reverse(const void *s1, const void *s2)
83530d1cff8SAlex Elder {
83630d1cff8SAlex Elder 	u64 snap_id1 = *(u64 *)s1;
83730d1cff8SAlex Elder 	u64 snap_id2 = *(u64 *)s2;
83830d1cff8SAlex Elder 
83930d1cff8SAlex Elder 	if (snap_id1 < snap_id2)
84030d1cff8SAlex Elder 		return 1;
84130d1cff8SAlex Elder 	return snap_id1 == snap_id2 ? 0 : -1;
84230d1cff8SAlex Elder }
84330d1cff8SAlex Elder 
84430d1cff8SAlex Elder /*
84530d1cff8SAlex Elder  * Search a snapshot context to see if the given snapshot id is
84630d1cff8SAlex Elder  * present.
84730d1cff8SAlex Elder  *
84830d1cff8SAlex Elder  * Returns the position of the snapshot id in the array if it's found,
84930d1cff8SAlex Elder  * or BAD_SNAP_INDEX otherwise.
85030d1cff8SAlex Elder  *
85130d1cff8SAlex Elder  * Note: The snapshot array is in kept sorted (by the osd) in
85230d1cff8SAlex Elder  * reverse order, highest snapshot id first.
85330d1cff8SAlex Elder  */
8549682fc6dSAlex Elder static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
8559682fc6dSAlex Elder {
8569682fc6dSAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
85730d1cff8SAlex Elder 	u64 *found;
8589682fc6dSAlex Elder 
85930d1cff8SAlex Elder 	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
86030d1cff8SAlex Elder 				sizeof (snap_id), snapid_compare_reverse);
8619682fc6dSAlex Elder 
86230d1cff8SAlex Elder 	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
8639682fc6dSAlex Elder }
8649682fc6dSAlex Elder 
8652ad3d716SAlex Elder static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
8662ad3d716SAlex Elder 					u64 snap_id)
86754cac61fSAlex Elder {
86854cac61fSAlex Elder 	u32 which;
86954cac61fSAlex Elder 
87054cac61fSAlex Elder 	which = rbd_dev_snap_index(rbd_dev, snap_id);
87154cac61fSAlex Elder 	if (which == BAD_SNAP_INDEX)
87254cac61fSAlex Elder 		return NULL;
87354cac61fSAlex Elder 
87454cac61fSAlex Elder 	return _rbd_dev_v1_snap_name(rbd_dev, which);
87554cac61fSAlex Elder }
87654cac61fSAlex Elder 
8779e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
8789e15b77dSAlex Elder {
8799e15b77dSAlex Elder 	if (snap_id == CEPH_NOSNAP)
8809e15b77dSAlex Elder 		return RBD_SNAP_HEAD_NAME;
8819e15b77dSAlex Elder 
88254cac61fSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
88354cac61fSAlex Elder 	if (rbd_dev->image_format == 1)
88454cac61fSAlex Elder 		return rbd_dev_v1_snap_name(rbd_dev, snap_id);
8859e15b77dSAlex Elder 
88654cac61fSAlex Elder 	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
8879e15b77dSAlex Elder }
8889e15b77dSAlex Elder 
8892ad3d716SAlex Elder static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
8902ad3d716SAlex Elder 				u64 *snap_size)
891602adf40SYehuda Sadeh {
8922ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
8932ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
8942ad3d716SAlex Elder 		*snap_size = rbd_dev->header.image_size;
8952ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
8962ad3d716SAlex Elder 		u32 which;
89700f1f36fSAlex Elder 
8982ad3d716SAlex Elder 		which = rbd_dev_snap_index(rbd_dev, snap_id);
8992ad3d716SAlex Elder 		if (which == BAD_SNAP_INDEX)
9002ad3d716SAlex Elder 			return -ENOENT;
90100f1f36fSAlex Elder 
9022ad3d716SAlex Elder 		*snap_size = rbd_dev->header.snap_sizes[which];
9032ad3d716SAlex Elder 	} else {
9042ad3d716SAlex Elder 		u64 size = 0;
9052ad3d716SAlex Elder 		int ret;
9062ad3d716SAlex Elder 
9072ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
9082ad3d716SAlex Elder 		if (ret)
9092ad3d716SAlex Elder 			return ret;
9102ad3d716SAlex Elder 
9112ad3d716SAlex Elder 		*snap_size = size;
9122ad3d716SAlex Elder 	}
9132ad3d716SAlex Elder 	return 0;
9142ad3d716SAlex Elder }
9152ad3d716SAlex Elder 
9162ad3d716SAlex Elder static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
9172ad3d716SAlex Elder 			u64 *snap_features)
9182ad3d716SAlex Elder {
9192ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
9202ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
9212ad3d716SAlex Elder 		*snap_features = rbd_dev->header.features;
9222ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
9232ad3d716SAlex Elder 		*snap_features = 0;	/* No features for format 1 */
9242ad3d716SAlex Elder 	} else {
9252ad3d716SAlex Elder 		u64 features = 0;
9262ad3d716SAlex Elder 		int ret;
9272ad3d716SAlex Elder 
9282ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
9292ad3d716SAlex Elder 		if (ret)
9302ad3d716SAlex Elder 			return ret;
9312ad3d716SAlex Elder 
9322ad3d716SAlex Elder 		*snap_features = features;
9332ad3d716SAlex Elder 	}
9342ad3d716SAlex Elder 	return 0;
93500f1f36fSAlex Elder }
936602adf40SYehuda Sadeh 
937d1cf5788SAlex Elder static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
938602adf40SYehuda Sadeh {
9392ad3d716SAlex Elder 	const char *snap_name = rbd_dev->spec->snap_name;
9402ad3d716SAlex Elder 	u64 snap_id;
9412ad3d716SAlex Elder 	u64 size = 0;
9422ad3d716SAlex Elder 	u64 features = 0;
9432ad3d716SAlex Elder 	int ret;
9448b0241f8SAlex Elder 
9452ad3d716SAlex Elder 	if (strcmp(snap_name, RBD_SNAP_HEAD_NAME)) {
9462ad3d716SAlex Elder 		snap_id = rbd_snap_id_by_name(rbd_dev, snap_name);
9472ad3d716SAlex Elder 		if (snap_id == CEPH_NOSNAP)
9488b0241f8SAlex Elder 			return -ENOENT;
9492ad3d716SAlex Elder 	} else {
9502ad3d716SAlex Elder 		snap_id = CEPH_NOSNAP;
951602adf40SYehuda Sadeh 	}
9526d292906SAlex Elder 
9532ad3d716SAlex Elder 	ret = rbd_snap_size(rbd_dev, snap_id, &size);
9542ad3d716SAlex Elder 	if (ret)
9552ad3d716SAlex Elder 		return ret;
9562ad3d716SAlex Elder 	ret = rbd_snap_features(rbd_dev, snap_id, &features);
9572ad3d716SAlex Elder 	if (ret)
9582ad3d716SAlex Elder 		return ret;
9592ad3d716SAlex Elder 
9602ad3d716SAlex Elder 	rbd_dev->mapping.size = size;
9612ad3d716SAlex Elder 	rbd_dev->mapping.features = features;
9622ad3d716SAlex Elder 
9632ad3d716SAlex Elder 	/* If we are mapping a snapshot it must be marked read-only */
9642ad3d716SAlex Elder 
9652ad3d716SAlex Elder 	if (snap_id != CEPH_NOSNAP)
9662ad3d716SAlex Elder 		rbd_dev->mapping.read_only = true;
9672ad3d716SAlex Elder 
9688b0241f8SAlex Elder 	return 0;
969602adf40SYehuda Sadeh }
970602adf40SYehuda Sadeh 
971d1cf5788SAlex Elder static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
972d1cf5788SAlex Elder {
973d1cf5788SAlex Elder 	rbd_dev->mapping.size = 0;
974d1cf5788SAlex Elder 	rbd_dev->mapping.features = 0;
975d1cf5788SAlex Elder 	rbd_dev->mapping.read_only = true;
976d1cf5788SAlex Elder }
977d1cf5788SAlex Elder 
/*
 * Reset the device's mapping (size 0, no features, read-only).
 *
 * This has exactly the same effect as rbd_dev_mapping_clear(), so it
 * simply calls it rather than duplicating the assignments; the name
 * is kept so existing callers are unaffected.
 */
static void rbd_dev_clear_mapping(struct rbd_device *rbd_dev)
{
	rbd_dev_mapping_clear(rbd_dev);
}
984200a6a8bSAlex Elder 
98598571b5aSAlex Elder static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
986602adf40SYehuda Sadeh {
98765ccfe21SAlex Elder 	char *name;
98865ccfe21SAlex Elder 	u64 segment;
98965ccfe21SAlex Elder 	int ret;
990602adf40SYehuda Sadeh 
99178c2a44aSAlex Elder 	name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO);
99265ccfe21SAlex Elder 	if (!name)
99365ccfe21SAlex Elder 		return NULL;
99465ccfe21SAlex Elder 	segment = offset >> rbd_dev->header.obj_order;
9952fd82b9eSAlex Elder 	ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
99665ccfe21SAlex Elder 			rbd_dev->header.object_prefix, segment);
9972fd82b9eSAlex Elder 	if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
99865ccfe21SAlex Elder 		pr_err("error formatting segment name for #%llu (%d)\n",
99965ccfe21SAlex Elder 			segment, ret);
100065ccfe21SAlex Elder 		kfree(name);
100165ccfe21SAlex Elder 		name = NULL;
100265ccfe21SAlex Elder 	}
1003602adf40SYehuda Sadeh 
100465ccfe21SAlex Elder 	return name;
100565ccfe21SAlex Elder }
1006602adf40SYehuda Sadeh 
100778c2a44aSAlex Elder static void rbd_segment_name_free(const char *name)
100878c2a44aSAlex Elder {
100978c2a44aSAlex Elder 	/* The explicit cast here is needed to drop the const qualifier */
101078c2a44aSAlex Elder 
101178c2a44aSAlex Elder 	kmem_cache_free(rbd_segment_name_cache, (void *)name);
101278c2a44aSAlex Elder }
101378c2a44aSAlex Elder 
101465ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
101565ccfe21SAlex Elder {
101665ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
1017602adf40SYehuda Sadeh 
101865ccfe21SAlex Elder 	return offset & (segment_size - 1);
101965ccfe21SAlex Elder }
102065ccfe21SAlex Elder 
102165ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev,
102265ccfe21SAlex Elder 				u64 offset, u64 length)
102365ccfe21SAlex Elder {
102465ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
102565ccfe21SAlex Elder 
102665ccfe21SAlex Elder 	offset &= segment_size - 1;
102765ccfe21SAlex Elder 
1028aafb230eSAlex Elder 	rbd_assert(length <= U64_MAX - offset);
102965ccfe21SAlex Elder 	if (offset + length > segment_size)
103065ccfe21SAlex Elder 		length = segment_size - offset;
103165ccfe21SAlex Elder 
103265ccfe21SAlex Elder 	return length;
1033602adf40SYehuda Sadeh }
1034602adf40SYehuda Sadeh 
1035602adf40SYehuda Sadeh /*
1036029bcbd8SJosh Durgin  * returns the size of an object in the image
1037029bcbd8SJosh Durgin  */
1038029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header)
1039029bcbd8SJosh Durgin {
1040029bcbd8SJosh Durgin 	return 1 << header->obj_order;
1041029bcbd8SJosh Durgin }
1042029bcbd8SJosh Durgin 
1043029bcbd8SJosh Durgin /*
1044602adf40SYehuda Sadeh  * bio helpers
1045602adf40SYehuda Sadeh  */
1046602adf40SYehuda Sadeh 
1047602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain)
1048602adf40SYehuda Sadeh {
1049602adf40SYehuda Sadeh 	struct bio *tmp;
1050602adf40SYehuda Sadeh 
1051602adf40SYehuda Sadeh 	while (chain) {
1052602adf40SYehuda Sadeh 		tmp = chain;
1053602adf40SYehuda Sadeh 		chain = chain->bi_next;
1054602adf40SYehuda Sadeh 		bio_put(tmp);
1055602adf40SYehuda Sadeh 	}
1056602adf40SYehuda Sadeh }
1057602adf40SYehuda Sadeh 
1058602adf40SYehuda Sadeh /*
1059602adf40SYehuda Sadeh  * zeros a bio chain, starting at specific offset
1060602adf40SYehuda Sadeh  */
1061602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs)
1062602adf40SYehuda Sadeh {
1063602adf40SYehuda Sadeh 	struct bio_vec *bv;
1064602adf40SYehuda Sadeh 	unsigned long flags;
1065602adf40SYehuda Sadeh 	void *buf;
1066602adf40SYehuda Sadeh 	int i;
1067602adf40SYehuda Sadeh 	int pos = 0;
1068602adf40SYehuda Sadeh 
1069602adf40SYehuda Sadeh 	while (chain) {
1070602adf40SYehuda Sadeh 		bio_for_each_segment(bv, chain, i) {
1071602adf40SYehuda Sadeh 			if (pos + bv->bv_len > start_ofs) {
1072602adf40SYehuda Sadeh 				int remainder = max(start_ofs - pos, 0);
1073602adf40SYehuda Sadeh 				buf = bvec_kmap_irq(bv, &flags);
1074602adf40SYehuda Sadeh 				memset(buf + remainder, 0,
1075602adf40SYehuda Sadeh 				       bv->bv_len - remainder);
107685b5aaa6SDan Carpenter 				bvec_kunmap_irq(buf, &flags);
1077602adf40SYehuda Sadeh 			}
1078602adf40SYehuda Sadeh 			pos += bv->bv_len;
1079602adf40SYehuda Sadeh 		}
1080602adf40SYehuda Sadeh 
1081602adf40SYehuda Sadeh 		chain = chain->bi_next;
1082602adf40SYehuda Sadeh 	}
1083602adf40SYehuda Sadeh }
1084602adf40SYehuda Sadeh 
1085602adf40SYehuda Sadeh /*
1086b9434c5bSAlex Elder  * similar to zero_bio_chain(), zeros data defined by a page array,
1087b9434c5bSAlex Elder  * starting at the given byte offset from the start of the array and
1088b9434c5bSAlex Elder  * continuing up to the given end offset.  The pages array is
1089b9434c5bSAlex Elder  * assumed to be big enough to hold all bytes up to the end.
1090b9434c5bSAlex Elder  */
1091b9434c5bSAlex Elder static void zero_pages(struct page **pages, u64 offset, u64 end)
1092b9434c5bSAlex Elder {
1093b9434c5bSAlex Elder 	struct page **page = &pages[offset >> PAGE_SHIFT];
1094b9434c5bSAlex Elder 
1095b9434c5bSAlex Elder 	rbd_assert(end > offset);
1096b9434c5bSAlex Elder 	rbd_assert(end - offset <= (u64)SIZE_MAX);
1097b9434c5bSAlex Elder 	while (offset < end) {
1098b9434c5bSAlex Elder 		size_t page_offset;
1099b9434c5bSAlex Elder 		size_t length;
1100b9434c5bSAlex Elder 		unsigned long flags;
1101b9434c5bSAlex Elder 		void *kaddr;
1102b9434c5bSAlex Elder 
1103b9434c5bSAlex Elder 		page_offset = (size_t)(offset & ~PAGE_MASK);
1104b9434c5bSAlex Elder 		length = min(PAGE_SIZE - page_offset, (size_t)(end - offset));
1105b9434c5bSAlex Elder 		local_irq_save(flags);
1106b9434c5bSAlex Elder 		kaddr = kmap_atomic(*page);
1107b9434c5bSAlex Elder 		memset(kaddr + page_offset, 0, length);
1108b9434c5bSAlex Elder 		kunmap_atomic(kaddr);
1109b9434c5bSAlex Elder 		local_irq_restore(flags);
1110b9434c5bSAlex Elder 
1111b9434c5bSAlex Elder 		offset += length;
1112b9434c5bSAlex Elder 		page++;
1113b9434c5bSAlex Elder 	}
1114b9434c5bSAlex Elder }
1115b9434c5bSAlex Elder 
1116b9434c5bSAlex Elder /*
1117f7760dadSAlex Elder  * Clone a portion of a bio, starting at the given byte offset
1118f7760dadSAlex Elder  * and continuing for the number of bytes indicated.
1119602adf40SYehuda Sadeh  */
1120f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src,
1121f7760dadSAlex Elder 					unsigned int offset,
1122f7760dadSAlex Elder 					unsigned int len,
1123f7760dadSAlex Elder 					gfp_t gfpmask)
1124602adf40SYehuda Sadeh {
1125f7760dadSAlex Elder 	struct bio_vec *bv;
1126f7760dadSAlex Elder 	unsigned int resid;
1127f7760dadSAlex Elder 	unsigned short idx;
1128f7760dadSAlex Elder 	unsigned int voff;
1129f7760dadSAlex Elder 	unsigned short end_idx;
1130f7760dadSAlex Elder 	unsigned short vcnt;
1131f7760dadSAlex Elder 	struct bio *bio;
1132602adf40SYehuda Sadeh 
1133f7760dadSAlex Elder 	/* Handle the easy case for the caller */
1134f7760dadSAlex Elder 
1135f7760dadSAlex Elder 	if (!offset && len == bio_src->bi_size)
1136f7760dadSAlex Elder 		return bio_clone(bio_src, gfpmask);
1137f7760dadSAlex Elder 
1138f7760dadSAlex Elder 	if (WARN_ON_ONCE(!len))
1139f7760dadSAlex Elder 		return NULL;
1140f7760dadSAlex Elder 	if (WARN_ON_ONCE(len > bio_src->bi_size))
1141f7760dadSAlex Elder 		return NULL;
1142f7760dadSAlex Elder 	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
1143f7760dadSAlex Elder 		return NULL;
1144f7760dadSAlex Elder 
1145f7760dadSAlex Elder 	/* Find first affected segment... */
1146f7760dadSAlex Elder 
1147f7760dadSAlex Elder 	resid = offset;
1148f7760dadSAlex Elder 	__bio_for_each_segment(bv, bio_src, idx, 0) {
1149f7760dadSAlex Elder 		if (resid < bv->bv_len)
1150f7760dadSAlex Elder 			break;
1151f7760dadSAlex Elder 		resid -= bv->bv_len;
1152602adf40SYehuda Sadeh 	}
1153f7760dadSAlex Elder 	voff = resid;
1154602adf40SYehuda Sadeh 
1155f7760dadSAlex Elder 	/* ...and the last affected segment */
1156542582fcSAlex Elder 
1157f7760dadSAlex Elder 	resid += len;
1158f7760dadSAlex Elder 	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
1159f7760dadSAlex Elder 		if (resid <= bv->bv_len)
1160f7760dadSAlex Elder 			break;
1161f7760dadSAlex Elder 		resid -= bv->bv_len;
1162f7760dadSAlex Elder 	}
1163f7760dadSAlex Elder 	vcnt = end_idx - idx + 1;
1164602adf40SYehuda Sadeh 
1165f7760dadSAlex Elder 	/* Build the clone */
1166f7760dadSAlex Elder 
1167f7760dadSAlex Elder 	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
1168f7760dadSAlex Elder 	if (!bio)
1169f7760dadSAlex Elder 		return NULL;	/* ENOMEM */
1170f7760dadSAlex Elder 
1171f7760dadSAlex Elder 	bio->bi_bdev = bio_src->bi_bdev;
1172f7760dadSAlex Elder 	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
1173f7760dadSAlex Elder 	bio->bi_rw = bio_src->bi_rw;
1174f7760dadSAlex Elder 	bio->bi_flags |= 1 << BIO_CLONED;
1175602adf40SYehuda Sadeh 
1176602adf40SYehuda Sadeh 	/*
1177f7760dadSAlex Elder 	 * Copy over our part of the bio_vec, then update the first
1178f7760dadSAlex Elder 	 * and last (or only) entries.
1179602adf40SYehuda Sadeh 	 */
1180f7760dadSAlex Elder 	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
1181f7760dadSAlex Elder 			vcnt * sizeof (struct bio_vec));
1182f7760dadSAlex Elder 	bio->bi_io_vec[0].bv_offset += voff;
1183f7760dadSAlex Elder 	if (vcnt > 1) {
1184f7760dadSAlex Elder 		bio->bi_io_vec[0].bv_len -= voff;
1185f7760dadSAlex Elder 		bio->bi_io_vec[vcnt - 1].bv_len = resid;
1186602adf40SYehuda Sadeh 	} else {
1187f7760dadSAlex Elder 		bio->bi_io_vec[0].bv_len = len;
1188602adf40SYehuda Sadeh 	}
1189602adf40SYehuda Sadeh 
1190f7760dadSAlex Elder 	bio->bi_vcnt = vcnt;
1191f7760dadSAlex Elder 	bio->bi_size = len;
1192f7760dadSAlex Elder 	bio->bi_idx = 0;
1193602adf40SYehuda Sadeh 
1194f7760dadSAlex Elder 	return bio;
1195602adf40SYehuda Sadeh }
1196602adf40SYehuda Sadeh 
1197f7760dadSAlex Elder /*
1198f7760dadSAlex Elder  * Clone a portion of a bio chain, starting at the given byte offset
1199f7760dadSAlex Elder  * into the first bio in the source chain and continuing for the
1200f7760dadSAlex Elder  * number of bytes indicated.  The result is another bio chain of
1201f7760dadSAlex Elder  * exactly the given length, or a null pointer on error.
1202f7760dadSAlex Elder  *
1203f7760dadSAlex Elder  * The bio_src and offset parameters are both in-out.  On entry they
1204f7760dadSAlex Elder  * refer to the first source bio and the offset into that bio where
1205f7760dadSAlex Elder  * the start of data to be cloned is located.
1206f7760dadSAlex Elder  *
1207f7760dadSAlex Elder  * On return, bio_src is updated to refer to the bio in the source
1208f7760dadSAlex Elder  * chain that contains first un-cloned byte, and *offset will
1209f7760dadSAlex Elder  * contain the offset of that byte within that bio.
1210f7760dadSAlex Elder  */
1211f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src,
1212f7760dadSAlex Elder 					unsigned int *offset,
1213f7760dadSAlex Elder 					unsigned int len,
1214f7760dadSAlex Elder 					gfp_t gfpmask)
1215f7760dadSAlex Elder {
1216f7760dadSAlex Elder 	struct bio *bi = *bio_src;
1217f7760dadSAlex Elder 	unsigned int off = *offset;
1218f7760dadSAlex Elder 	struct bio *chain = NULL;
1219f7760dadSAlex Elder 	struct bio **end;
1220602adf40SYehuda Sadeh 
1221f7760dadSAlex Elder 	/* Build up a chain of clone bios up to the limit */
1222602adf40SYehuda Sadeh 
1223f7760dadSAlex Elder 	if (!bi || off >= bi->bi_size || !len)
1224f7760dadSAlex Elder 		return NULL;		/* Nothing to clone */
1225602adf40SYehuda Sadeh 
1226f7760dadSAlex Elder 	end = &chain;
1227f7760dadSAlex Elder 	while (len) {
1228f7760dadSAlex Elder 		unsigned int bi_size;
1229f7760dadSAlex Elder 		struct bio *bio;
1230f7760dadSAlex Elder 
1231f5400b7aSAlex Elder 		if (!bi) {
1232f5400b7aSAlex Elder 			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
1233f7760dadSAlex Elder 			goto out_err;	/* EINVAL; ran out of bio's */
1234f5400b7aSAlex Elder 		}
1235f7760dadSAlex Elder 		bi_size = min_t(unsigned int, bi->bi_size - off, len);
1236f7760dadSAlex Elder 		bio = bio_clone_range(bi, off, bi_size, gfpmask);
1237f7760dadSAlex Elder 		if (!bio)
1238f7760dadSAlex Elder 			goto out_err;	/* ENOMEM */
1239f7760dadSAlex Elder 
1240f7760dadSAlex Elder 		*end = bio;
1241f7760dadSAlex Elder 		end = &bio->bi_next;
1242f7760dadSAlex Elder 
1243f7760dadSAlex Elder 		off += bi_size;
1244f7760dadSAlex Elder 		if (off == bi->bi_size) {
1245f7760dadSAlex Elder 			bi = bi->bi_next;
1246f7760dadSAlex Elder 			off = 0;
1247f7760dadSAlex Elder 		}
1248f7760dadSAlex Elder 		len -= bi_size;
1249f7760dadSAlex Elder 	}
1250f7760dadSAlex Elder 	*bio_src = bi;
1251f7760dadSAlex Elder 	*offset = off;
1252f7760dadSAlex Elder 
1253f7760dadSAlex Elder 	return chain;
1254f7760dadSAlex Elder out_err:
1255f7760dadSAlex Elder 	bio_chain_put(chain);
1256f7760dadSAlex Elder 
1257602adf40SYehuda Sadeh 	return NULL;
1258602adf40SYehuda Sadeh }
1259602adf40SYehuda Sadeh 
1260926f9b3fSAlex Elder /*
1261926f9b3fSAlex Elder  * The default/initial value for all object request flags is 0.  For
1262926f9b3fSAlex Elder  * each flag, once its value is set to 1 it is never reset to 0
1263926f9b3fSAlex Elder  * again.
1264926f9b3fSAlex Elder  */
12656365d33aSAlex Elder static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
12666365d33aSAlex Elder {
12676365d33aSAlex Elder 	if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
12686365d33aSAlex Elder 		struct rbd_device *rbd_dev;
12696365d33aSAlex Elder 
127057acbaa7SAlex Elder 		rbd_dev = obj_request->img_request->rbd_dev;
12716365d33aSAlex Elder 		rbd_warn(rbd_dev, "obj_request %p already marked img_data\n",
12726365d33aSAlex Elder 			obj_request);
12736365d33aSAlex Elder 	}
12746365d33aSAlex Elder }
12756365d33aSAlex Elder 
12766365d33aSAlex Elder static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
12776365d33aSAlex Elder {
12786365d33aSAlex Elder 	smp_mb();
12796365d33aSAlex Elder 	return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
12806365d33aSAlex Elder }
12816365d33aSAlex Elder 
128257acbaa7SAlex Elder static void obj_request_done_set(struct rbd_obj_request *obj_request)
128357acbaa7SAlex Elder {
128457acbaa7SAlex Elder 	if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
128557acbaa7SAlex Elder 		struct rbd_device *rbd_dev = NULL;
128657acbaa7SAlex Elder 
128757acbaa7SAlex Elder 		if (obj_request_img_data_test(obj_request))
128857acbaa7SAlex Elder 			rbd_dev = obj_request->img_request->rbd_dev;
128957acbaa7SAlex Elder 		rbd_warn(rbd_dev, "obj_request %p already marked done\n",
129057acbaa7SAlex Elder 			obj_request);
129157acbaa7SAlex Elder 	}
129257acbaa7SAlex Elder }
129357acbaa7SAlex Elder 
129457acbaa7SAlex Elder static bool obj_request_done_test(struct rbd_obj_request *obj_request)
129557acbaa7SAlex Elder {
129657acbaa7SAlex Elder 	smp_mb();
129757acbaa7SAlex Elder 	return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
129857acbaa7SAlex Elder }
129957acbaa7SAlex Elder 
13005679c59fSAlex Elder /*
13015679c59fSAlex Elder  * This sets the KNOWN flag after (possibly) setting the EXISTS
13025679c59fSAlex Elder  * flag.  The latter is set based on the "exists" value provided.
13035679c59fSAlex Elder  *
13045679c59fSAlex Elder  * Note that for our purposes once an object exists it never goes
13055679c59fSAlex Elder  * away again.  It's possible that the response from two existence
13065679c59fSAlex Elder  * checks are separated by the creation of the target object, and
13075679c59fSAlex Elder  * the first ("doesn't exist") response arrives *after* the second
13085679c59fSAlex Elder  * ("does exist").  In that case we ignore the second one.
13095679c59fSAlex Elder  */
13105679c59fSAlex Elder static void obj_request_existence_set(struct rbd_obj_request *obj_request,
13115679c59fSAlex Elder 				bool exists)
13125679c59fSAlex Elder {
13135679c59fSAlex Elder 	if (exists)
13145679c59fSAlex Elder 		set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
13155679c59fSAlex Elder 	set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
13165679c59fSAlex Elder 	smp_mb();
13175679c59fSAlex Elder }
13185679c59fSAlex Elder 
13195679c59fSAlex Elder static bool obj_request_known_test(struct rbd_obj_request *obj_request)
13205679c59fSAlex Elder {
13215679c59fSAlex Elder 	smp_mb();
13225679c59fSAlex Elder 	return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
13235679c59fSAlex Elder }
13245679c59fSAlex Elder 
13255679c59fSAlex Elder static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
13265679c59fSAlex Elder {
13275679c59fSAlex Elder 	smp_mb();
13285679c59fSAlex Elder 	return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
13295679c59fSAlex Elder }
13305679c59fSAlex Elder 
1331bf0d5f50SAlex Elder static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1332bf0d5f50SAlex Elder {
133337206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
133437206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1335bf0d5f50SAlex Elder 	kref_get(&obj_request->kref);
1336bf0d5f50SAlex Elder }
1337bf0d5f50SAlex Elder 
1338bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref);
1339bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1340bf0d5f50SAlex Elder {
1341bf0d5f50SAlex Elder 	rbd_assert(obj_request != NULL);
134237206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
134337206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1344bf0d5f50SAlex Elder 	kref_put(&obj_request->kref, rbd_obj_request_destroy);
1345bf0d5f50SAlex Elder }
1346bf0d5f50SAlex Elder 
1347bf0d5f50SAlex Elder static void rbd_img_request_get(struct rbd_img_request *img_request)
1348bf0d5f50SAlex Elder {
134937206ee5SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
135037206ee5SAlex Elder 		atomic_read(&img_request->kref.refcount));
1351bf0d5f50SAlex Elder 	kref_get(&img_request->kref);
1352bf0d5f50SAlex Elder }
1353bf0d5f50SAlex Elder 
1354bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref);
1355bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request)
1356bf0d5f50SAlex Elder {
1357bf0d5f50SAlex Elder 	rbd_assert(img_request != NULL);
135837206ee5SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
135937206ee5SAlex Elder 		atomic_read(&img_request->kref.refcount));
1360bf0d5f50SAlex Elder 	kref_put(&img_request->kref, rbd_img_request_destroy);
1361bf0d5f50SAlex Elder }
1362bf0d5f50SAlex Elder 
1363bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1364bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1365bf0d5f50SAlex Elder {
136625dcf954SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
136725dcf954SAlex Elder 
1368b155e86cSAlex Elder 	/* Image request now owns object's original reference */
1369bf0d5f50SAlex Elder 	obj_request->img_request = img_request;
137025dcf954SAlex Elder 	obj_request->which = img_request->obj_request_count;
13716365d33aSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
13726365d33aSAlex Elder 	obj_request_img_data_set(obj_request);
1373bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
137425dcf954SAlex Elder 	img_request->obj_request_count++;
137525dcf954SAlex Elder 	list_add_tail(&obj_request->links, &img_request->obj_requests);
137637206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
137737206ee5SAlex Elder 		obj_request->which);
1378bf0d5f50SAlex Elder }
1379bf0d5f50SAlex Elder 
1380bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1381bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1382bf0d5f50SAlex Elder {
1383bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
138425dcf954SAlex Elder 
138537206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
138637206ee5SAlex Elder 		obj_request->which);
1387bf0d5f50SAlex Elder 	list_del(&obj_request->links);
138825dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
138925dcf954SAlex Elder 	img_request->obj_request_count--;
139025dcf954SAlex Elder 	rbd_assert(obj_request->which == img_request->obj_request_count);
139125dcf954SAlex Elder 	obj_request->which = BAD_WHICH;
13926365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
1393bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == img_request);
1394bf0d5f50SAlex Elder 	obj_request->img_request = NULL;
139525dcf954SAlex Elder 	obj_request->callback = NULL;
1396bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
1397bf0d5f50SAlex Elder }
1398bf0d5f50SAlex Elder 
1399bf0d5f50SAlex Elder static bool obj_request_type_valid(enum obj_request_type type)
1400bf0d5f50SAlex Elder {
1401bf0d5f50SAlex Elder 	switch (type) {
14029969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
1403bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1404788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1405bf0d5f50SAlex Elder 		return true;
1406bf0d5f50SAlex Elder 	default:
1407bf0d5f50SAlex Elder 		return false;
1408bf0d5f50SAlex Elder 	}
1409bf0d5f50SAlex Elder }
1410bf0d5f50SAlex Elder 
1411bf0d5f50SAlex Elder static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1412bf0d5f50SAlex Elder 				struct rbd_obj_request *obj_request)
1413bf0d5f50SAlex Elder {
141437206ee5SAlex Elder 	dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);
141537206ee5SAlex Elder 
1416bf0d5f50SAlex Elder 	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1417bf0d5f50SAlex Elder }
1418bf0d5f50SAlex Elder 
1419bf0d5f50SAlex Elder static void rbd_img_request_complete(struct rbd_img_request *img_request)
1420bf0d5f50SAlex Elder {
142155f27e09SAlex Elder 
142237206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
142355f27e09SAlex Elder 
142455f27e09SAlex Elder 	/*
142555f27e09SAlex Elder 	 * If no error occurred, compute the aggregate transfer
142655f27e09SAlex Elder 	 * count for the image request.  We could instead use
142755f27e09SAlex Elder 	 * atomic64_cmpxchg() to update it as each object request
142855f27e09SAlex Elder 	 * completes; not clear which way is better off hand.
142955f27e09SAlex Elder 	 */
143055f27e09SAlex Elder 	if (!img_request->result) {
143155f27e09SAlex Elder 		struct rbd_obj_request *obj_request;
143255f27e09SAlex Elder 		u64 xferred = 0;
143355f27e09SAlex Elder 
143455f27e09SAlex Elder 		for_each_obj_request(img_request, obj_request)
143555f27e09SAlex Elder 			xferred += obj_request->xferred;
143655f27e09SAlex Elder 		img_request->xferred = xferred;
143755f27e09SAlex Elder 	}
143855f27e09SAlex Elder 
1439bf0d5f50SAlex Elder 	if (img_request->callback)
1440bf0d5f50SAlex Elder 		img_request->callback(img_request);
1441bf0d5f50SAlex Elder 	else
1442bf0d5f50SAlex Elder 		rbd_img_request_put(img_request);
1443bf0d5f50SAlex Elder }
1444bf0d5f50SAlex Elder 
1445788e2df3SAlex Elder /* Caller is responsible for rbd_obj_request_destroy(obj_request) */
1446788e2df3SAlex Elder 
1447788e2df3SAlex Elder static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
1448788e2df3SAlex Elder {
144937206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
145037206ee5SAlex Elder 
1451788e2df3SAlex Elder 	return wait_for_completion_interruptible(&obj_request->completion);
1452788e2df3SAlex Elder }
1453788e2df3SAlex Elder 
14540c425248SAlex Elder /*
14550c425248SAlex Elder  * The default/initial value for all image request flags is 0.  Each
14560c425248SAlex Elder  * is conditionally set to 1 at image request initialization time
14570c425248SAlex Elder  * and currently never change thereafter.
14580c425248SAlex Elder  */
14590c425248SAlex Elder static void img_request_write_set(struct rbd_img_request *img_request)
14600c425248SAlex Elder {
14610c425248SAlex Elder 	set_bit(IMG_REQ_WRITE, &img_request->flags);
14620c425248SAlex Elder 	smp_mb();
14630c425248SAlex Elder }
14640c425248SAlex Elder 
14650c425248SAlex Elder static bool img_request_write_test(struct rbd_img_request *img_request)
14660c425248SAlex Elder {
14670c425248SAlex Elder 	smp_mb();
14680c425248SAlex Elder 	return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
14690c425248SAlex Elder }
14700c425248SAlex Elder 
14719849e986SAlex Elder static void img_request_child_set(struct rbd_img_request *img_request)
14729849e986SAlex Elder {
14739849e986SAlex Elder 	set_bit(IMG_REQ_CHILD, &img_request->flags);
14749849e986SAlex Elder 	smp_mb();
14759849e986SAlex Elder }
14769849e986SAlex Elder 
14779849e986SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request)
14789849e986SAlex Elder {
14799849e986SAlex Elder 	smp_mb();
14809849e986SAlex Elder 	return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
14819849e986SAlex Elder }
14829849e986SAlex Elder 
1483d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request)
1484d0b2e944SAlex Elder {
1485d0b2e944SAlex Elder 	set_bit(IMG_REQ_LAYERED, &img_request->flags);
1486d0b2e944SAlex Elder 	smp_mb();
1487d0b2e944SAlex Elder }
1488d0b2e944SAlex Elder 
1489d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request)
1490d0b2e944SAlex Elder {
1491d0b2e944SAlex Elder 	smp_mb();
1492d0b2e944SAlex Elder 	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1493d0b2e944SAlex Elder }
1494d0b2e944SAlex Elder 
14956e2a4505SAlex Elder static void
14966e2a4505SAlex Elder rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
14976e2a4505SAlex Elder {
1498b9434c5bSAlex Elder 	u64 xferred = obj_request->xferred;
1499b9434c5bSAlex Elder 	u64 length = obj_request->length;
1500b9434c5bSAlex Elder 
15016e2a4505SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
15026e2a4505SAlex Elder 		obj_request, obj_request->img_request, obj_request->result,
1503b9434c5bSAlex Elder 		xferred, length);
15046e2a4505SAlex Elder 	/*
15056e2a4505SAlex Elder 	 * ENOENT means a hole in the image.  We zero-fill the
15066e2a4505SAlex Elder 	 * entire length of the request.  A short read also implies
15076e2a4505SAlex Elder 	 * zero-fill to the end of the request.  Either way we
15086e2a4505SAlex Elder 	 * update the xferred count to indicate the whole request
15096e2a4505SAlex Elder 	 * was satisfied.
15106e2a4505SAlex Elder 	 */
1511b9434c5bSAlex Elder 	rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
15126e2a4505SAlex Elder 	if (obj_request->result == -ENOENT) {
1513b9434c5bSAlex Elder 		if (obj_request->type == OBJ_REQUEST_BIO)
15146e2a4505SAlex Elder 			zero_bio_chain(obj_request->bio_list, 0);
1515b9434c5bSAlex Elder 		else
1516b9434c5bSAlex Elder 			zero_pages(obj_request->pages, 0, length);
15176e2a4505SAlex Elder 		obj_request->result = 0;
1518b9434c5bSAlex Elder 		obj_request->xferred = length;
1519b9434c5bSAlex Elder 	} else if (xferred < length && !obj_request->result) {
1520b9434c5bSAlex Elder 		if (obj_request->type == OBJ_REQUEST_BIO)
1521b9434c5bSAlex Elder 			zero_bio_chain(obj_request->bio_list, xferred);
1522b9434c5bSAlex Elder 		else
1523b9434c5bSAlex Elder 			zero_pages(obj_request->pages, xferred, length);
1524b9434c5bSAlex Elder 		obj_request->xferred = length;
15256e2a4505SAlex Elder 	}
15266e2a4505SAlex Elder 	obj_request_done_set(obj_request);
15276e2a4505SAlex Elder }
15286e2a4505SAlex Elder 
1529bf0d5f50SAlex Elder static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1530bf0d5f50SAlex Elder {
153137206ee5SAlex Elder 	dout("%s: obj %p cb %p\n", __func__, obj_request,
153237206ee5SAlex Elder 		obj_request->callback);
1533bf0d5f50SAlex Elder 	if (obj_request->callback)
1534bf0d5f50SAlex Elder 		obj_request->callback(obj_request);
1535788e2df3SAlex Elder 	else
1536788e2df3SAlex Elder 		complete_all(&obj_request->completion);
1537bf0d5f50SAlex Elder }
1538bf0d5f50SAlex Elder 
/*
 * Callback for osd ops that need no post-processing: trace the call
 * and mark the object request done.
 */
static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
154439bf2c5dSAlex Elder 
1545c47f9371SAlex Elder static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
1546bf0d5f50SAlex Elder {
154757acbaa7SAlex Elder 	struct rbd_img_request *img_request = NULL;
1548a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev = NULL;
154957acbaa7SAlex Elder 	bool layered = false;
155057acbaa7SAlex Elder 
155157acbaa7SAlex Elder 	if (obj_request_img_data_test(obj_request)) {
155257acbaa7SAlex Elder 		img_request = obj_request->img_request;
155357acbaa7SAlex Elder 		layered = img_request && img_request_layered_test(img_request);
1554a9e8ba2cSAlex Elder 		rbd_dev = img_request->rbd_dev;
155557acbaa7SAlex Elder 	}
15568b3e1a56SAlex Elder 
15578b3e1a56SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
15588b3e1a56SAlex Elder 		obj_request, img_request, obj_request->result,
15598b3e1a56SAlex Elder 		obj_request->xferred, obj_request->length);
1560a9e8ba2cSAlex Elder 	if (layered && obj_request->result == -ENOENT &&
1561a9e8ba2cSAlex Elder 			obj_request->img_offset < rbd_dev->parent_overlap)
15628b3e1a56SAlex Elder 		rbd_img_parent_read(obj_request);
15638b3e1a56SAlex Elder 	else if (img_request)
15646e2a4505SAlex Elder 		rbd_img_obj_request_read_callback(obj_request);
15656e2a4505SAlex Elder 	else
156607741308SAlex Elder 		obj_request_done_set(obj_request);
1567bf0d5f50SAlex Elder }
1568bf0d5f50SAlex Elder 
1569c47f9371SAlex Elder static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
1570bf0d5f50SAlex Elder {
15711b83bef2SSage Weil 	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
15721b83bef2SSage Weil 		obj_request->result, obj_request->length);
15731b83bef2SSage Weil 	/*
15748b3e1a56SAlex Elder 	 * There is no such thing as a successful short write.  Set
15758b3e1a56SAlex Elder 	 * it to our originally-requested length.
15761b83bef2SSage Weil 	 */
15771b83bef2SSage Weil 	obj_request->xferred = obj_request->length;
157807741308SAlex Elder 	obj_request_done_set(obj_request);
1579bf0d5f50SAlex Elder }
1580bf0d5f50SAlex Elder 
/*
 * Handle completion of an osd stat op.  For a simple stat call
 * there is nothing to do beyond marking the request done.  More
 * will be done if this is part of a write sequence for a layered
 * image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
1590fbfab539SAlex Elder 
1591bf0d5f50SAlex Elder static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1592bf0d5f50SAlex Elder 				struct ceph_msg *msg)
1593bf0d5f50SAlex Elder {
1594bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = osd_req->r_priv;
1595bf0d5f50SAlex Elder 	u16 opcode;
1596bf0d5f50SAlex Elder 
159737206ee5SAlex Elder 	dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
1598bf0d5f50SAlex Elder 	rbd_assert(osd_req == obj_request->osd_req);
159957acbaa7SAlex Elder 	if (obj_request_img_data_test(obj_request)) {
160057acbaa7SAlex Elder 		rbd_assert(obj_request->img_request);
160157acbaa7SAlex Elder 		rbd_assert(obj_request->which != BAD_WHICH);
160257acbaa7SAlex Elder 	} else {
160357acbaa7SAlex Elder 		rbd_assert(obj_request->which == BAD_WHICH);
160457acbaa7SAlex Elder 	}
1605bf0d5f50SAlex Elder 
16061b83bef2SSage Weil 	if (osd_req->r_result < 0)
16071b83bef2SSage Weil 		obj_request->result = osd_req->r_result;
1608bf0d5f50SAlex Elder 
16090eefd470SAlex Elder 	BUG_ON(osd_req->r_num_ops > 2);
1610bf0d5f50SAlex Elder 
1611c47f9371SAlex Elder 	/*
1612c47f9371SAlex Elder 	 * We support a 64-bit length, but ultimately it has to be
1613c47f9371SAlex Elder 	 * passed to blk_end_request(), which takes an unsigned int.
1614c47f9371SAlex Elder 	 */
16151b83bef2SSage Weil 	obj_request->xferred = osd_req->r_reply_op_len[0];
1616c47f9371SAlex Elder 	rbd_assert(obj_request->xferred < (u64)UINT_MAX);
161779528734SAlex Elder 	opcode = osd_req->r_ops[0].op;
1618bf0d5f50SAlex Elder 	switch (opcode) {
1619bf0d5f50SAlex Elder 	case CEPH_OSD_OP_READ:
1620c47f9371SAlex Elder 		rbd_osd_read_callback(obj_request);
1621bf0d5f50SAlex Elder 		break;
1622bf0d5f50SAlex Elder 	case CEPH_OSD_OP_WRITE:
1623c47f9371SAlex Elder 		rbd_osd_write_callback(obj_request);
1624bf0d5f50SAlex Elder 		break;
1625fbfab539SAlex Elder 	case CEPH_OSD_OP_STAT:
1626c47f9371SAlex Elder 		rbd_osd_stat_callback(obj_request);
1627fbfab539SAlex Elder 		break;
162836be9a76SAlex Elder 	case CEPH_OSD_OP_CALL:
1629b8d70035SAlex Elder 	case CEPH_OSD_OP_NOTIFY_ACK:
16309969ebc5SAlex Elder 	case CEPH_OSD_OP_WATCH:
1631c47f9371SAlex Elder 		rbd_osd_trivial_callback(obj_request);
16329969ebc5SAlex Elder 		break;
1633bf0d5f50SAlex Elder 	default:
1634bf0d5f50SAlex Elder 		rbd_warn(NULL, "%s: unsupported op %hu\n",
1635bf0d5f50SAlex Elder 			obj_request->object_name, (unsigned short) opcode);
1636bf0d5f50SAlex Elder 		break;
1637bf0d5f50SAlex Elder 	}
1638bf0d5f50SAlex Elder 
163907741308SAlex Elder 	if (obj_request_done_test(obj_request))
1640bf0d5f50SAlex Elder 		rbd_obj_request_complete(obj_request);
1641bf0d5f50SAlex Elder }
1642bf0d5f50SAlex Elder 
16439d4df01fSAlex Elder static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
1644430c28c3SAlex Elder {
1645430c28c3SAlex Elder 	struct rbd_img_request *img_request = obj_request->img_request;
16468c042b0dSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
16479d4df01fSAlex Elder 	u64 snap_id;
1648430c28c3SAlex Elder 
16498c042b0dSAlex Elder 	rbd_assert(osd_req != NULL);
1650430c28c3SAlex Elder 
16519d4df01fSAlex Elder 	snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
16528c042b0dSAlex Elder 	ceph_osdc_build_request(osd_req, obj_request->offset,
16539d4df01fSAlex Elder 			NULL, snap_id, NULL);
16549d4df01fSAlex Elder }
16559d4df01fSAlex Elder 
16569d4df01fSAlex Elder static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
16579d4df01fSAlex Elder {
16589d4df01fSAlex Elder 	struct rbd_img_request *img_request = obj_request->img_request;
16599d4df01fSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
16609d4df01fSAlex Elder 	struct ceph_snap_context *snapc;
16619d4df01fSAlex Elder 	struct timespec mtime = CURRENT_TIME;
16629d4df01fSAlex Elder 
16639d4df01fSAlex Elder 	rbd_assert(osd_req != NULL);
16649d4df01fSAlex Elder 
16659d4df01fSAlex Elder 	snapc = img_request ? img_request->snapc : NULL;
16669d4df01fSAlex Elder 	ceph_osdc_build_request(osd_req, obj_request->offset,
16679d4df01fSAlex Elder 			snapc, CEPH_NOSNAP, &mtime);
1668430c28c3SAlex Elder }
1669430c28c3SAlex Elder 
1670bf0d5f50SAlex Elder static struct ceph_osd_request *rbd_osd_req_create(
1671bf0d5f50SAlex Elder 					struct rbd_device *rbd_dev,
1672bf0d5f50SAlex Elder 					bool write_request,
1673430c28c3SAlex Elder 					struct rbd_obj_request *obj_request)
1674bf0d5f50SAlex Elder {
1675bf0d5f50SAlex Elder 	struct ceph_snap_context *snapc = NULL;
1676bf0d5f50SAlex Elder 	struct ceph_osd_client *osdc;
1677bf0d5f50SAlex Elder 	struct ceph_osd_request *osd_req;
1678bf0d5f50SAlex Elder 
16796365d33aSAlex Elder 	if (obj_request_img_data_test(obj_request)) {
16806365d33aSAlex Elder 		struct rbd_img_request *img_request = obj_request->img_request;
16816365d33aSAlex Elder 
16820c425248SAlex Elder 		rbd_assert(write_request ==
16830c425248SAlex Elder 				img_request_write_test(img_request));
16840c425248SAlex Elder 		if (write_request)
1685bf0d5f50SAlex Elder 			snapc = img_request->snapc;
1686bf0d5f50SAlex Elder 	}
1687bf0d5f50SAlex Elder 
1688bf0d5f50SAlex Elder 	/* Allocate and initialize the request, for the single op */
1689bf0d5f50SAlex Elder 
1690bf0d5f50SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
1691bf0d5f50SAlex Elder 	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
1692bf0d5f50SAlex Elder 	if (!osd_req)
1693bf0d5f50SAlex Elder 		return NULL;	/* ENOMEM */
1694bf0d5f50SAlex Elder 
1695430c28c3SAlex Elder 	if (write_request)
1696bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1697430c28c3SAlex Elder 	else
1698bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_READ;
1699bf0d5f50SAlex Elder 
1700bf0d5f50SAlex Elder 	osd_req->r_callback = rbd_osd_req_callback;
1701bf0d5f50SAlex Elder 	osd_req->r_priv = obj_request;
1702bf0d5f50SAlex Elder 
1703bf0d5f50SAlex Elder 	osd_req->r_oid_len = strlen(obj_request->object_name);
1704bf0d5f50SAlex Elder 	rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1705bf0d5f50SAlex Elder 	memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1706bf0d5f50SAlex Elder 
1707bf0d5f50SAlex Elder 	osd_req->r_file_layout = rbd_dev->layout;	/* struct */
1708bf0d5f50SAlex Elder 
1709bf0d5f50SAlex Elder 	return osd_req;
1710bf0d5f50SAlex Elder }
1711bf0d5f50SAlex Elder 
17120eefd470SAlex Elder /*
17130eefd470SAlex Elder  * Create a copyup osd request based on the information in the
17140eefd470SAlex Elder  * object request supplied.  A copyup request has two osd ops,
17150eefd470SAlex Elder  * a copyup method call, and a "normal" write request.
17160eefd470SAlex Elder  */
17170eefd470SAlex Elder static struct ceph_osd_request *
17180eefd470SAlex Elder rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
17190eefd470SAlex Elder {
17200eefd470SAlex Elder 	struct rbd_img_request *img_request;
17210eefd470SAlex Elder 	struct ceph_snap_context *snapc;
17220eefd470SAlex Elder 	struct rbd_device *rbd_dev;
17230eefd470SAlex Elder 	struct ceph_osd_client *osdc;
17240eefd470SAlex Elder 	struct ceph_osd_request *osd_req;
17250eefd470SAlex Elder 
17260eefd470SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
17270eefd470SAlex Elder 	img_request = obj_request->img_request;
17280eefd470SAlex Elder 	rbd_assert(img_request);
17290eefd470SAlex Elder 	rbd_assert(img_request_write_test(img_request));
17300eefd470SAlex Elder 
17310eefd470SAlex Elder 	/* Allocate and initialize the request, for the two ops */
17320eefd470SAlex Elder 
17330eefd470SAlex Elder 	snapc = img_request->snapc;
17340eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
17350eefd470SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
17360eefd470SAlex Elder 	osd_req = ceph_osdc_alloc_request(osdc, snapc, 2, false, GFP_ATOMIC);
17370eefd470SAlex Elder 	if (!osd_req)
17380eefd470SAlex Elder 		return NULL;	/* ENOMEM */
17390eefd470SAlex Elder 
17400eefd470SAlex Elder 	osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
17410eefd470SAlex Elder 	osd_req->r_callback = rbd_osd_req_callback;
17420eefd470SAlex Elder 	osd_req->r_priv = obj_request;
17430eefd470SAlex Elder 
17440eefd470SAlex Elder 	osd_req->r_oid_len = strlen(obj_request->object_name);
17450eefd470SAlex Elder 	rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
17460eefd470SAlex Elder 	memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
17470eefd470SAlex Elder 
17480eefd470SAlex Elder 	osd_req->r_file_layout = rbd_dev->layout;	/* struct */
17490eefd470SAlex Elder 
17500eefd470SAlex Elder 	return osd_req;
17510eefd470SAlex Elder }
17520eefd470SAlex Elder 
17530eefd470SAlex Elder 
/* Release an osd request by dropping a reference on it. */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
1758bf0d5f50SAlex Elder 
1759bf0d5f50SAlex Elder /* object_name is assumed to be a non-null pointer and NUL-terminated */
1760bf0d5f50SAlex Elder 
1761bf0d5f50SAlex Elder static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1762bf0d5f50SAlex Elder 						u64 offset, u64 length,
1763bf0d5f50SAlex Elder 						enum obj_request_type type)
1764bf0d5f50SAlex Elder {
1765bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1766bf0d5f50SAlex Elder 	size_t size;
1767bf0d5f50SAlex Elder 	char *name;
1768bf0d5f50SAlex Elder 
1769bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(type));
1770bf0d5f50SAlex Elder 
1771bf0d5f50SAlex Elder 	size = strlen(object_name) + 1;
1772f907ad55SAlex Elder 	name = kmalloc(size, GFP_KERNEL);
1773f907ad55SAlex Elder 	if (!name)
1774bf0d5f50SAlex Elder 		return NULL;
1775bf0d5f50SAlex Elder 
1776868311b1SAlex Elder 	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_KERNEL);
1777f907ad55SAlex Elder 	if (!obj_request) {
1778f907ad55SAlex Elder 		kfree(name);
1779f907ad55SAlex Elder 		return NULL;
1780f907ad55SAlex Elder 	}
1781f907ad55SAlex Elder 
1782bf0d5f50SAlex Elder 	obj_request->object_name = memcpy(name, object_name, size);
1783bf0d5f50SAlex Elder 	obj_request->offset = offset;
1784bf0d5f50SAlex Elder 	obj_request->length = length;
1785926f9b3fSAlex Elder 	obj_request->flags = 0;
1786bf0d5f50SAlex Elder 	obj_request->which = BAD_WHICH;
1787bf0d5f50SAlex Elder 	obj_request->type = type;
1788bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&obj_request->links);
1789788e2df3SAlex Elder 	init_completion(&obj_request->completion);
1790bf0d5f50SAlex Elder 	kref_init(&obj_request->kref);
1791bf0d5f50SAlex Elder 
179237206ee5SAlex Elder 	dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
179337206ee5SAlex Elder 		offset, length, (int)type, obj_request);
179437206ee5SAlex Elder 
1795bf0d5f50SAlex Elder 	return obj_request;
1796bf0d5f50SAlex Elder }
1797bf0d5f50SAlex Elder 
1798bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref)
1799bf0d5f50SAlex Elder {
1800bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1801bf0d5f50SAlex Elder 
1802bf0d5f50SAlex Elder 	obj_request = container_of(kref, struct rbd_obj_request, kref);
1803bf0d5f50SAlex Elder 
180437206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
180537206ee5SAlex Elder 
1806bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
1807bf0d5f50SAlex Elder 	rbd_assert(obj_request->which == BAD_WHICH);
1808bf0d5f50SAlex Elder 
1809bf0d5f50SAlex Elder 	if (obj_request->osd_req)
1810bf0d5f50SAlex Elder 		rbd_osd_req_destroy(obj_request->osd_req);
1811bf0d5f50SAlex Elder 
1812bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
1813bf0d5f50SAlex Elder 	switch (obj_request->type) {
18149969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
18159969ebc5SAlex Elder 		break;		/* Nothing to do */
1816bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1817bf0d5f50SAlex Elder 		if (obj_request->bio_list)
1818bf0d5f50SAlex Elder 			bio_chain_put(obj_request->bio_list);
1819bf0d5f50SAlex Elder 		break;
1820788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1821788e2df3SAlex Elder 		if (obj_request->pages)
1822788e2df3SAlex Elder 			ceph_release_page_vector(obj_request->pages,
1823788e2df3SAlex Elder 						obj_request->page_count);
1824788e2df3SAlex Elder 		break;
1825bf0d5f50SAlex Elder 	}
1826bf0d5f50SAlex Elder 
1827f907ad55SAlex Elder 	kfree(obj_request->object_name);
1828868311b1SAlex Elder 	obj_request->object_name = NULL;
1829868311b1SAlex Elder 	kmem_cache_free(rbd_obj_request_cache, obj_request);
1830bf0d5f50SAlex Elder }
1831bf0d5f50SAlex Elder 
1832bf0d5f50SAlex Elder /*
1833bf0d5f50SAlex Elder  * Caller is responsible for filling in the list of object requests
1834bf0d5f50SAlex Elder  * that comprises the image request, and the Linux request pointer
1835bf0d5f50SAlex Elder  * (if there is one).
1836bf0d5f50SAlex Elder  */
1837cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create(
1838cc344fa1SAlex Elder 					struct rbd_device *rbd_dev,
1839bf0d5f50SAlex Elder 					u64 offset, u64 length,
18409849e986SAlex Elder 					bool write_request,
18419849e986SAlex Elder 					bool child_request)
1842bf0d5f50SAlex Elder {
1843bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
1844bf0d5f50SAlex Elder 
18451c2a9dfeSAlex Elder 	img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_ATOMIC);
1846bf0d5f50SAlex Elder 	if (!img_request)
1847bf0d5f50SAlex Elder 		return NULL;
1848bf0d5f50SAlex Elder 
1849bf0d5f50SAlex Elder 	if (write_request) {
1850bf0d5f50SAlex Elder 		down_read(&rbd_dev->header_rwsem);
1851812164f8SAlex Elder 		ceph_get_snap_context(rbd_dev->header.snapc);
1852bf0d5f50SAlex Elder 		up_read(&rbd_dev->header_rwsem);
1853bf0d5f50SAlex Elder 	}
1854bf0d5f50SAlex Elder 
1855bf0d5f50SAlex Elder 	img_request->rq = NULL;
1856bf0d5f50SAlex Elder 	img_request->rbd_dev = rbd_dev;
1857bf0d5f50SAlex Elder 	img_request->offset = offset;
1858bf0d5f50SAlex Elder 	img_request->length = length;
18590c425248SAlex Elder 	img_request->flags = 0;
18600c425248SAlex Elder 	if (write_request) {
18610c425248SAlex Elder 		img_request_write_set(img_request);
1862468521c1SAlex Elder 		img_request->snapc = rbd_dev->header.snapc;
18630c425248SAlex Elder 	} else {
1864bf0d5f50SAlex Elder 		img_request->snap_id = rbd_dev->spec->snap_id;
18650c425248SAlex Elder 	}
18669849e986SAlex Elder 	if (child_request)
18679849e986SAlex Elder 		img_request_child_set(img_request);
1868d0b2e944SAlex Elder 	if (rbd_dev->parent_spec)
1869d0b2e944SAlex Elder 		img_request_layered_set(img_request);
1870bf0d5f50SAlex Elder 	spin_lock_init(&img_request->completion_lock);
1871bf0d5f50SAlex Elder 	img_request->next_completion = 0;
1872bf0d5f50SAlex Elder 	img_request->callback = NULL;
1873a5a337d4SAlex Elder 	img_request->result = 0;
1874bf0d5f50SAlex Elder 	img_request->obj_request_count = 0;
1875bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&img_request->obj_requests);
1876bf0d5f50SAlex Elder 	kref_init(&img_request->kref);
1877bf0d5f50SAlex Elder 
1878bf0d5f50SAlex Elder 	rbd_img_request_get(img_request);	/* Avoid a warning */
1879bf0d5f50SAlex Elder 	rbd_img_request_put(img_request);	/* TEMPORARY */
1880bf0d5f50SAlex Elder 
188137206ee5SAlex Elder 	dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
188237206ee5SAlex Elder 		write_request ? "write" : "read", offset, length,
188337206ee5SAlex Elder 		img_request);
188437206ee5SAlex Elder 
1885bf0d5f50SAlex Elder 	return img_request;
1886bf0d5f50SAlex Elder }
1887bf0d5f50SAlex Elder 
1888bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref)
1889bf0d5f50SAlex Elder {
1890bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
1891bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1892bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
1893bf0d5f50SAlex Elder 
1894bf0d5f50SAlex Elder 	img_request = container_of(kref, struct rbd_img_request, kref);
1895bf0d5f50SAlex Elder 
189637206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
189737206ee5SAlex Elder 
1898bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1899bf0d5f50SAlex Elder 		rbd_img_obj_request_del(img_request, obj_request);
190025dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count == 0);
1901bf0d5f50SAlex Elder 
19020c425248SAlex Elder 	if (img_request_write_test(img_request))
1903812164f8SAlex Elder 		ceph_put_snap_context(img_request->snapc);
1904bf0d5f50SAlex Elder 
19058b3e1a56SAlex Elder 	if (img_request_child_test(img_request))
19068b3e1a56SAlex Elder 		rbd_obj_request_put(img_request->obj_request);
19078b3e1a56SAlex Elder 
19081c2a9dfeSAlex Elder 	kmem_cache_free(rbd_img_request_cache, img_request);
1909bf0d5f50SAlex Elder }
1910bf0d5f50SAlex Elder 
19111217857fSAlex Elder static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
19121217857fSAlex Elder {
19136365d33aSAlex Elder 	struct rbd_img_request *img_request;
19141217857fSAlex Elder 	unsigned int xferred;
19151217857fSAlex Elder 	int result;
19168b3e1a56SAlex Elder 	bool more;
19171217857fSAlex Elder 
19186365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
19196365d33aSAlex Elder 	img_request = obj_request->img_request;
19206365d33aSAlex Elder 
19211217857fSAlex Elder 	rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
19221217857fSAlex Elder 	xferred = (unsigned int)obj_request->xferred;
19231217857fSAlex Elder 	result = obj_request->result;
19241217857fSAlex Elder 	if (result) {
19251217857fSAlex Elder 		struct rbd_device *rbd_dev = img_request->rbd_dev;
19261217857fSAlex Elder 
19271217857fSAlex Elder 		rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n",
19281217857fSAlex Elder 			img_request_write_test(img_request) ? "write" : "read",
19291217857fSAlex Elder 			obj_request->length, obj_request->img_offset,
19301217857fSAlex Elder 			obj_request->offset);
19311217857fSAlex Elder 		rbd_warn(rbd_dev, "  result %d xferred %x\n",
19321217857fSAlex Elder 			result, xferred);
19331217857fSAlex Elder 		if (!img_request->result)
19341217857fSAlex Elder 			img_request->result = result;
19351217857fSAlex Elder 	}
19361217857fSAlex Elder 
1937f1a4739fSAlex Elder 	/* Image object requests don't own their page array */
1938f1a4739fSAlex Elder 
1939f1a4739fSAlex Elder 	if (obj_request->type == OBJ_REQUEST_PAGES) {
1940f1a4739fSAlex Elder 		obj_request->pages = NULL;
1941f1a4739fSAlex Elder 		obj_request->page_count = 0;
1942f1a4739fSAlex Elder 	}
1943f1a4739fSAlex Elder 
19448b3e1a56SAlex Elder 	if (img_request_child_test(img_request)) {
19458b3e1a56SAlex Elder 		rbd_assert(img_request->obj_request != NULL);
19468b3e1a56SAlex Elder 		more = obj_request->which < img_request->obj_request_count - 1;
19478b3e1a56SAlex Elder 	} else {
19488b3e1a56SAlex Elder 		rbd_assert(img_request->rq != NULL);
19498b3e1a56SAlex Elder 		more = blk_end_request(img_request->rq, result, xferred);
19508b3e1a56SAlex Elder 	}
19518b3e1a56SAlex Elder 
19528b3e1a56SAlex Elder 	return more;
19531217857fSAlex Elder }
19541217857fSAlex Elder 
19552169238dSAlex Elder static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
19562169238dSAlex Elder {
19572169238dSAlex Elder 	struct rbd_img_request *img_request;
19582169238dSAlex Elder 	u32 which = obj_request->which;
19592169238dSAlex Elder 	bool more = true;
19602169238dSAlex Elder 
19616365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
19622169238dSAlex Elder 	img_request = obj_request->img_request;
19632169238dSAlex Elder 
19642169238dSAlex Elder 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
19652169238dSAlex Elder 	rbd_assert(img_request != NULL);
19662169238dSAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
19672169238dSAlex Elder 	rbd_assert(which != BAD_WHICH);
19682169238dSAlex Elder 	rbd_assert(which < img_request->obj_request_count);
19692169238dSAlex Elder 	rbd_assert(which >= img_request->next_completion);
19702169238dSAlex Elder 
19712169238dSAlex Elder 	spin_lock_irq(&img_request->completion_lock);
19722169238dSAlex Elder 	if (which != img_request->next_completion)
19732169238dSAlex Elder 		goto out;
19742169238dSAlex Elder 
19752169238dSAlex Elder 	for_each_obj_request_from(img_request, obj_request) {
19762169238dSAlex Elder 		rbd_assert(more);
19772169238dSAlex Elder 		rbd_assert(which < img_request->obj_request_count);
19782169238dSAlex Elder 
19792169238dSAlex Elder 		if (!obj_request_done_test(obj_request))
19802169238dSAlex Elder 			break;
19811217857fSAlex Elder 		more = rbd_img_obj_end_request(obj_request);
19822169238dSAlex Elder 		which++;
19832169238dSAlex Elder 	}
19842169238dSAlex Elder 
19852169238dSAlex Elder 	rbd_assert(more ^ (which == img_request->obj_request_count));
19862169238dSAlex Elder 	img_request->next_completion = which;
19872169238dSAlex Elder out:
19882169238dSAlex Elder 	spin_unlock_irq(&img_request->completion_lock);
19892169238dSAlex Elder 
19902169238dSAlex Elder 	if (!more)
19912169238dSAlex Elder 		rbd_img_request_complete(img_request);
19922169238dSAlex Elder }
19932169238dSAlex Elder 
1994f1a4739fSAlex Elder /*
1995f1a4739fSAlex Elder  * Split up an image request into one or more object requests, each
1996f1a4739fSAlex Elder  * to a different object.  The "type" parameter indicates whether
1997f1a4739fSAlex Elder  * "data_desc" is the pointer to the head of a list of bio
1998f1a4739fSAlex Elder  * structures, or the base of a page array.  In either case this
1999f1a4739fSAlex Elder  * function assumes data_desc describes memory sufficient to hold
2000f1a4739fSAlex Elder  * all data described by the image request.
2001f1a4739fSAlex Elder  */
2002f1a4739fSAlex Elder static int rbd_img_request_fill(struct rbd_img_request *img_request,
2003f1a4739fSAlex Elder 					enum obj_request_type type,
2004f1a4739fSAlex Elder 					void *data_desc)
2005bf0d5f50SAlex Elder {
2006bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = img_request->rbd_dev;
2007bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = NULL;
2008bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
20090c425248SAlex Elder 	bool write_request = img_request_write_test(img_request);
2010f1a4739fSAlex Elder 	struct bio *bio_list;
2011f1a4739fSAlex Elder 	unsigned int bio_offset = 0;
2012f1a4739fSAlex Elder 	struct page **pages;
20137da22d29SAlex Elder 	u64 img_offset;
2014bf0d5f50SAlex Elder 	u64 resid;
2015bf0d5f50SAlex Elder 	u16 opcode;
2016bf0d5f50SAlex Elder 
2017f1a4739fSAlex Elder 	dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
2018f1a4739fSAlex Elder 		(int)type, data_desc);
201937206ee5SAlex Elder 
2020430c28c3SAlex Elder 	opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
20217da22d29SAlex Elder 	img_offset = img_request->offset;
2022bf0d5f50SAlex Elder 	resid = img_request->length;
20234dda41d3SAlex Elder 	rbd_assert(resid > 0);
2024f1a4739fSAlex Elder 
2025f1a4739fSAlex Elder 	if (type == OBJ_REQUEST_BIO) {
2026f1a4739fSAlex Elder 		bio_list = data_desc;
2027f1a4739fSAlex Elder 		rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT);
2028f1a4739fSAlex Elder 	} else {
2029f1a4739fSAlex Elder 		rbd_assert(type == OBJ_REQUEST_PAGES);
2030f1a4739fSAlex Elder 		pages = data_desc;
2031f1a4739fSAlex Elder 	}
2032f1a4739fSAlex Elder 
2033bf0d5f50SAlex Elder 	while (resid) {
20342fa12320SAlex Elder 		struct ceph_osd_request *osd_req;
2035bf0d5f50SAlex Elder 		const char *object_name;
2036bf0d5f50SAlex Elder 		u64 offset;
2037bf0d5f50SAlex Elder 		u64 length;
2038bf0d5f50SAlex Elder 
20397da22d29SAlex Elder 		object_name = rbd_segment_name(rbd_dev, img_offset);
2040bf0d5f50SAlex Elder 		if (!object_name)
2041bf0d5f50SAlex Elder 			goto out_unwind;
20427da22d29SAlex Elder 		offset = rbd_segment_offset(rbd_dev, img_offset);
20437da22d29SAlex Elder 		length = rbd_segment_length(rbd_dev, img_offset, resid);
2044bf0d5f50SAlex Elder 		obj_request = rbd_obj_request_create(object_name,
2045f1a4739fSAlex Elder 						offset, length, type);
204678c2a44aSAlex Elder 		/* object request has its own copy of the object name */
204778c2a44aSAlex Elder 		rbd_segment_name_free(object_name);
2048bf0d5f50SAlex Elder 		if (!obj_request)
2049bf0d5f50SAlex Elder 			goto out_unwind;
2050bf0d5f50SAlex Elder 
2051f1a4739fSAlex Elder 		if (type == OBJ_REQUEST_BIO) {
2052f1a4739fSAlex Elder 			unsigned int clone_size;
2053f1a4739fSAlex Elder 
2054bf0d5f50SAlex Elder 			rbd_assert(length <= (u64)UINT_MAX);
2055bf0d5f50SAlex Elder 			clone_size = (unsigned int)length;
2056f1a4739fSAlex Elder 			obj_request->bio_list =
2057f1a4739fSAlex Elder 					bio_chain_clone_range(&bio_list,
2058f1a4739fSAlex Elder 								&bio_offset,
2059f1a4739fSAlex Elder 								clone_size,
2060bf0d5f50SAlex Elder 								GFP_ATOMIC);
2061bf0d5f50SAlex Elder 			if (!obj_request->bio_list)
2062bf0d5f50SAlex Elder 				goto out_partial;
2063f1a4739fSAlex Elder 		} else {
2064f1a4739fSAlex Elder 			unsigned int page_count;
2065f1a4739fSAlex Elder 
2066f1a4739fSAlex Elder 			obj_request->pages = pages;
2067f1a4739fSAlex Elder 			page_count = (u32)calc_pages_for(offset, length);
2068f1a4739fSAlex Elder 			obj_request->page_count = page_count;
2069f1a4739fSAlex Elder 			if ((offset + length) & ~PAGE_MASK)
2070f1a4739fSAlex Elder 				page_count--;	/* more on last page */
2071f1a4739fSAlex Elder 			pages += page_count;
2072f1a4739fSAlex Elder 		}
2073bf0d5f50SAlex Elder 
20742fa12320SAlex Elder 		osd_req = rbd_osd_req_create(rbd_dev, write_request,
20752fa12320SAlex Elder 						obj_request);
20762fa12320SAlex Elder 		if (!osd_req)
2077bf0d5f50SAlex Elder 			goto out_partial;
20782fa12320SAlex Elder 		obj_request->osd_req = osd_req;
20792169238dSAlex Elder 		obj_request->callback = rbd_img_obj_callback;
2080430c28c3SAlex Elder 
20812fa12320SAlex Elder 		osd_req_op_extent_init(osd_req, 0, opcode, offset, length,
20822fa12320SAlex Elder 						0, 0);
2083f1a4739fSAlex Elder 		if (type == OBJ_REQUEST_BIO)
2084406e2c9fSAlex Elder 			osd_req_op_extent_osd_data_bio(osd_req, 0,
2085f1a4739fSAlex Elder 					obj_request->bio_list, length);
2086f1a4739fSAlex Elder 		else
2087f1a4739fSAlex Elder 			osd_req_op_extent_osd_data_pages(osd_req, 0,
2088f1a4739fSAlex Elder 					obj_request->pages, length,
2089f1a4739fSAlex Elder 					offset & ~PAGE_MASK, false, false);
20909d4df01fSAlex Elder 
20919d4df01fSAlex Elder 		if (write_request)
20929d4df01fSAlex Elder 			rbd_osd_req_format_write(obj_request);
20939d4df01fSAlex Elder 		else
20949d4df01fSAlex Elder 			rbd_osd_req_format_read(obj_request);
2095430c28c3SAlex Elder 
20967da22d29SAlex Elder 		obj_request->img_offset = img_offset;
2097bf0d5f50SAlex Elder 		rbd_img_obj_request_add(img_request, obj_request);
2098bf0d5f50SAlex Elder 
20997da22d29SAlex Elder 		img_offset += length;
2100bf0d5f50SAlex Elder 		resid -= length;
2101bf0d5f50SAlex Elder 	}
2102bf0d5f50SAlex Elder 
2103bf0d5f50SAlex Elder 	return 0;
2104bf0d5f50SAlex Elder 
2105bf0d5f50SAlex Elder out_partial:
2106bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
2107bf0d5f50SAlex Elder out_unwind:
2108bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2109bf0d5f50SAlex Elder 		rbd_obj_request_put(obj_request);
2110bf0d5f50SAlex Elder 
2111bf0d5f50SAlex Elder 	return -ENOMEM;
2112bf0d5f50SAlex Elder }
2113bf0d5f50SAlex Elder 
21143d7efd18SAlex Elder static void
21150eefd470SAlex Elder rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request)
21160eefd470SAlex Elder {
21170eefd470SAlex Elder 	struct rbd_img_request *img_request;
21180eefd470SAlex Elder 	struct rbd_device *rbd_dev;
21190eefd470SAlex Elder 	u64 length;
21200eefd470SAlex Elder 	u32 page_count;
21210eefd470SAlex Elder 
21220eefd470SAlex Elder 	rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
21230eefd470SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
21240eefd470SAlex Elder 	img_request = obj_request->img_request;
21250eefd470SAlex Elder 	rbd_assert(img_request);
21260eefd470SAlex Elder 
21270eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
21280eefd470SAlex Elder 	rbd_assert(rbd_dev);
21290eefd470SAlex Elder 	length = (u64)1 << rbd_dev->header.obj_order;
21300eefd470SAlex Elder 	page_count = (u32)calc_pages_for(0, length);
21310eefd470SAlex Elder 
21320eefd470SAlex Elder 	rbd_assert(obj_request->copyup_pages);
21330eefd470SAlex Elder 	ceph_release_page_vector(obj_request->copyup_pages, page_count);
21340eefd470SAlex Elder 	obj_request->copyup_pages = NULL;
21350eefd470SAlex Elder 
21360eefd470SAlex Elder 	/*
21370eefd470SAlex Elder 	 * We want the transfer count to reflect the size of the
21380eefd470SAlex Elder 	 * original write request.  There is no such thing as a
21390eefd470SAlex Elder 	 * successful short write, so if the request was successful
21400eefd470SAlex Elder 	 * we can just set it to the originally-requested length.
21410eefd470SAlex Elder 	 */
21420eefd470SAlex Elder 	if (!obj_request->result)
21430eefd470SAlex Elder 		obj_request->xferred = obj_request->length;
21440eefd470SAlex Elder 
21450eefd470SAlex Elder 	/* Finish up with the normal image object callback */
21460eefd470SAlex Elder 
21470eefd470SAlex Elder 	rbd_img_obj_callback(obj_request);
21480eefd470SAlex Elder }
21490eefd470SAlex Elder 
21500eefd470SAlex Elder static void
21513d7efd18SAlex Elder rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
21523d7efd18SAlex Elder {
21533d7efd18SAlex Elder 	struct rbd_obj_request *orig_request;
21540eefd470SAlex Elder 	struct ceph_osd_request *osd_req;
21550eefd470SAlex Elder 	struct ceph_osd_client *osdc;
21560eefd470SAlex Elder 	struct rbd_device *rbd_dev;
21573d7efd18SAlex Elder 	struct page **pages;
21583d7efd18SAlex Elder 	int result;
21593d7efd18SAlex Elder 	u64 obj_size;
21603d7efd18SAlex Elder 	u64 xferred;
21613d7efd18SAlex Elder 
21623d7efd18SAlex Elder 	rbd_assert(img_request_child_test(img_request));
21633d7efd18SAlex Elder 
21643d7efd18SAlex Elder 	/* First get what we need from the image request */
21653d7efd18SAlex Elder 
21663d7efd18SAlex Elder 	pages = img_request->copyup_pages;
21673d7efd18SAlex Elder 	rbd_assert(pages != NULL);
21683d7efd18SAlex Elder 	img_request->copyup_pages = NULL;
21693d7efd18SAlex Elder 
21703d7efd18SAlex Elder 	orig_request = img_request->obj_request;
21713d7efd18SAlex Elder 	rbd_assert(orig_request != NULL);
21720eefd470SAlex Elder 	rbd_assert(orig_request->type == OBJ_REQUEST_BIO);
21733d7efd18SAlex Elder 	result = img_request->result;
21743d7efd18SAlex Elder 	obj_size = img_request->length;
21753d7efd18SAlex Elder 	xferred = img_request->xferred;
21763d7efd18SAlex Elder 
21770eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
21780eefd470SAlex Elder 	rbd_assert(rbd_dev);
21790eefd470SAlex Elder 	rbd_assert(obj_size == (u64)1 << rbd_dev->header.obj_order);
21800eefd470SAlex Elder 
21813d7efd18SAlex Elder 	rbd_img_request_put(img_request);
21823d7efd18SAlex Elder 
21830eefd470SAlex Elder 	if (result)
21840eefd470SAlex Elder 		goto out_err;
21853d7efd18SAlex Elder 
21860eefd470SAlex Elder 	/* Allocate the new copyup osd request for the original request */
21873d7efd18SAlex Elder 
21880eefd470SAlex Elder 	result = -ENOMEM;
21890eefd470SAlex Elder 	rbd_assert(!orig_request->osd_req);
21900eefd470SAlex Elder 	osd_req = rbd_osd_req_create_copyup(orig_request);
21910eefd470SAlex Elder 	if (!osd_req)
21920eefd470SAlex Elder 		goto out_err;
21930eefd470SAlex Elder 	orig_request->osd_req = osd_req;
21940eefd470SAlex Elder 	orig_request->copyup_pages = pages;
21953d7efd18SAlex Elder 
21960eefd470SAlex Elder 	/* Initialize the copyup op */
21970eefd470SAlex Elder 
21980eefd470SAlex Elder 	osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
21990eefd470SAlex Elder 	osd_req_op_cls_request_data_pages(osd_req, 0, pages, obj_size, 0,
22000eefd470SAlex Elder 						false, false);
22010eefd470SAlex Elder 
22020eefd470SAlex Elder 	/* Then the original write request op */
22030eefd470SAlex Elder 
22040eefd470SAlex Elder 	osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE,
22050eefd470SAlex Elder 					orig_request->offset,
22060eefd470SAlex Elder 					orig_request->length, 0, 0);
22070eefd470SAlex Elder 	osd_req_op_extent_osd_data_bio(osd_req, 1, orig_request->bio_list,
22080eefd470SAlex Elder 					orig_request->length);
22090eefd470SAlex Elder 
22100eefd470SAlex Elder 	rbd_osd_req_format_write(orig_request);
22110eefd470SAlex Elder 
22120eefd470SAlex Elder 	/* All set, send it off. */
22130eefd470SAlex Elder 
22140eefd470SAlex Elder 	orig_request->callback = rbd_img_obj_copyup_callback;
22150eefd470SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
22160eefd470SAlex Elder 	result = rbd_obj_request_submit(osdc, orig_request);
22170eefd470SAlex Elder 	if (!result)
22180eefd470SAlex Elder 		return;
22190eefd470SAlex Elder out_err:
22200eefd470SAlex Elder 	/* Record the error code and complete the request */
22210eefd470SAlex Elder 
22220eefd470SAlex Elder 	orig_request->result = result;
22230eefd470SAlex Elder 	orig_request->xferred = 0;
22243d7efd18SAlex Elder 	obj_request_done_set(orig_request);
22253d7efd18SAlex Elder 	rbd_obj_request_complete(orig_request);
22263d7efd18SAlex Elder }
22273d7efd18SAlex Elder 
22283d7efd18SAlex Elder /*
22293d7efd18SAlex Elder  * Read from the parent image the range of data that covers the
22303d7efd18SAlex Elder  * entire target of the given object request.  This is used for
22313d7efd18SAlex Elder  * satisfying a layered image write request when the target of an
22323d7efd18SAlex Elder  * object request from the image request does not exist.
22333d7efd18SAlex Elder  *
22343d7efd18SAlex Elder  * A page array big enough to hold the returned data is allocated
22353d7efd18SAlex Elder  * and supplied to rbd_img_request_fill() as the "data descriptor."
22363d7efd18SAlex Elder  * When the read completes, this page array will be transferred to
22373d7efd18SAlex Elder  * the original object request for the copyup operation.
22383d7efd18SAlex Elder  *
22393d7efd18SAlex Elder  * If an error occurs, record it as the result of the original
22403d7efd18SAlex Elder  * object request and mark it done so it gets completed.
22413d7efd18SAlex Elder  */
22423d7efd18SAlex Elder static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
22433d7efd18SAlex Elder {
22443d7efd18SAlex Elder 	struct rbd_img_request *img_request = NULL;
22453d7efd18SAlex Elder 	struct rbd_img_request *parent_request = NULL;
22463d7efd18SAlex Elder 	struct rbd_device *rbd_dev;
22473d7efd18SAlex Elder 	u64 img_offset;
22483d7efd18SAlex Elder 	u64 length;
22493d7efd18SAlex Elder 	struct page **pages = NULL;
22503d7efd18SAlex Elder 	u32 page_count;
22513d7efd18SAlex Elder 	int result;
22523d7efd18SAlex Elder 
22533d7efd18SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
22543d7efd18SAlex Elder 	rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
22553d7efd18SAlex Elder 
22563d7efd18SAlex Elder 	img_request = obj_request->img_request;
22573d7efd18SAlex Elder 	rbd_assert(img_request != NULL);
22583d7efd18SAlex Elder 	rbd_dev = img_request->rbd_dev;
22593d7efd18SAlex Elder 	rbd_assert(rbd_dev->parent != NULL);
22603d7efd18SAlex Elder 
22613d7efd18SAlex Elder 	/*
22620eefd470SAlex Elder 	 * First things first.  The original osd request is of no
22630eefd470SAlex Elder 	 * use to use any more, we'll need a new one that can hold
22640eefd470SAlex Elder 	 * the two ops in a copyup request.  We'll get that later,
22650eefd470SAlex Elder 	 * but for now we can release the old one.
22660eefd470SAlex Elder 	 */
22670eefd470SAlex Elder 	rbd_osd_req_destroy(obj_request->osd_req);
22680eefd470SAlex Elder 	obj_request->osd_req = NULL;
22690eefd470SAlex Elder 
22700eefd470SAlex Elder 	/*
22713d7efd18SAlex Elder 	 * Determine the byte range covered by the object in the
22723d7efd18SAlex Elder 	 * child image to which the original request was to be sent.
22733d7efd18SAlex Elder 	 */
22743d7efd18SAlex Elder 	img_offset = obj_request->img_offset - obj_request->offset;
22753d7efd18SAlex Elder 	length = (u64)1 << rbd_dev->header.obj_order;
22763d7efd18SAlex Elder 
22773d7efd18SAlex Elder 	/*
2278a9e8ba2cSAlex Elder 	 * There is no defined parent data beyond the parent
2279a9e8ba2cSAlex Elder 	 * overlap, so limit what we read at that boundary if
2280a9e8ba2cSAlex Elder 	 * necessary.
2281a9e8ba2cSAlex Elder 	 */
2282a9e8ba2cSAlex Elder 	if (img_offset + length > rbd_dev->parent_overlap) {
2283a9e8ba2cSAlex Elder 		rbd_assert(img_offset < rbd_dev->parent_overlap);
2284a9e8ba2cSAlex Elder 		length = rbd_dev->parent_overlap - img_offset;
2285a9e8ba2cSAlex Elder 	}
2286a9e8ba2cSAlex Elder 
2287a9e8ba2cSAlex Elder 	/*
22883d7efd18SAlex Elder 	 * Allocate a page array big enough to receive the data read
22893d7efd18SAlex Elder 	 * from the parent.
22903d7efd18SAlex Elder 	 */
22913d7efd18SAlex Elder 	page_count = (u32)calc_pages_for(0, length);
22923d7efd18SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
22933d7efd18SAlex Elder 	if (IS_ERR(pages)) {
22943d7efd18SAlex Elder 		result = PTR_ERR(pages);
22953d7efd18SAlex Elder 		pages = NULL;
22963d7efd18SAlex Elder 		goto out_err;
22973d7efd18SAlex Elder 	}
22983d7efd18SAlex Elder 
22993d7efd18SAlex Elder 	result = -ENOMEM;
23003d7efd18SAlex Elder 	parent_request = rbd_img_request_create(rbd_dev->parent,
23013d7efd18SAlex Elder 						img_offset, length,
23023d7efd18SAlex Elder 						false, true);
23033d7efd18SAlex Elder 	if (!parent_request)
23043d7efd18SAlex Elder 		goto out_err;
23053d7efd18SAlex Elder 	rbd_obj_request_get(obj_request);
23063d7efd18SAlex Elder 	parent_request->obj_request = obj_request;
23073d7efd18SAlex Elder 
23083d7efd18SAlex Elder 	result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
23093d7efd18SAlex Elder 	if (result)
23103d7efd18SAlex Elder 		goto out_err;
23113d7efd18SAlex Elder 	parent_request->copyup_pages = pages;
23123d7efd18SAlex Elder 
23133d7efd18SAlex Elder 	parent_request->callback = rbd_img_obj_parent_read_full_callback;
23143d7efd18SAlex Elder 	result = rbd_img_request_submit(parent_request);
23153d7efd18SAlex Elder 	if (!result)
23163d7efd18SAlex Elder 		return 0;
23173d7efd18SAlex Elder 
23183d7efd18SAlex Elder 	parent_request->copyup_pages = NULL;
23193d7efd18SAlex Elder 	parent_request->obj_request = NULL;
23203d7efd18SAlex Elder 	rbd_obj_request_put(obj_request);
23213d7efd18SAlex Elder out_err:
23223d7efd18SAlex Elder 	if (pages)
23233d7efd18SAlex Elder 		ceph_release_page_vector(pages, page_count);
23243d7efd18SAlex Elder 	if (parent_request)
23253d7efd18SAlex Elder 		rbd_img_request_put(parent_request);
23263d7efd18SAlex Elder 	obj_request->result = result;
23273d7efd18SAlex Elder 	obj_request->xferred = 0;
23283d7efd18SAlex Elder 	obj_request_done_set(obj_request);
23293d7efd18SAlex Elder 
23303d7efd18SAlex Elder 	return result;
23313d7efd18SAlex Elder }
23323d7efd18SAlex Elder 
2333c5b5ef6cSAlex Elder static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
2334c5b5ef6cSAlex Elder {
2335c5b5ef6cSAlex Elder 	struct rbd_obj_request *orig_request;
2336c5b5ef6cSAlex Elder 	int result;
2337c5b5ef6cSAlex Elder 
2338c5b5ef6cSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
2339c5b5ef6cSAlex Elder 
2340c5b5ef6cSAlex Elder 	/*
2341c5b5ef6cSAlex Elder 	 * All we need from the object request is the original
2342c5b5ef6cSAlex Elder 	 * request and the result of the STAT op.  Grab those, then
2343c5b5ef6cSAlex Elder 	 * we're done with the request.
2344c5b5ef6cSAlex Elder 	 */
2345c5b5ef6cSAlex Elder 	orig_request = obj_request->obj_request;
2346c5b5ef6cSAlex Elder 	obj_request->obj_request = NULL;
2347c5b5ef6cSAlex Elder 	rbd_assert(orig_request);
2348c5b5ef6cSAlex Elder 	rbd_assert(orig_request->img_request);
2349c5b5ef6cSAlex Elder 
2350c5b5ef6cSAlex Elder 	result = obj_request->result;
2351c5b5ef6cSAlex Elder 	obj_request->result = 0;
2352c5b5ef6cSAlex Elder 
2353c5b5ef6cSAlex Elder 	dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
2354c5b5ef6cSAlex Elder 		obj_request, orig_request, result,
2355c5b5ef6cSAlex Elder 		obj_request->xferred, obj_request->length);
2356c5b5ef6cSAlex Elder 	rbd_obj_request_put(obj_request);
2357c5b5ef6cSAlex Elder 
2358c5b5ef6cSAlex Elder 	rbd_assert(orig_request);
2359c5b5ef6cSAlex Elder 	rbd_assert(orig_request->img_request);
2360c5b5ef6cSAlex Elder 
2361c5b5ef6cSAlex Elder 	/*
2362c5b5ef6cSAlex Elder 	 * Our only purpose here is to determine whether the object
2363c5b5ef6cSAlex Elder 	 * exists, and we don't want to treat the non-existence as
2364c5b5ef6cSAlex Elder 	 * an error.  If something else comes back, transfer the
2365c5b5ef6cSAlex Elder 	 * error to the original request and complete it now.
2366c5b5ef6cSAlex Elder 	 */
2367c5b5ef6cSAlex Elder 	if (!result) {
2368c5b5ef6cSAlex Elder 		obj_request_existence_set(orig_request, true);
2369c5b5ef6cSAlex Elder 	} else if (result == -ENOENT) {
2370c5b5ef6cSAlex Elder 		obj_request_existence_set(orig_request, false);
2371c5b5ef6cSAlex Elder 	} else if (result) {
2372c5b5ef6cSAlex Elder 		orig_request->result = result;
23733d7efd18SAlex Elder 		goto out;
2374c5b5ef6cSAlex Elder 	}
2375c5b5ef6cSAlex Elder 
2376c5b5ef6cSAlex Elder 	/*
2377c5b5ef6cSAlex Elder 	 * Resubmit the original request now that we have recorded
2378c5b5ef6cSAlex Elder 	 * whether the target object exists.
2379c5b5ef6cSAlex Elder 	 */
2380b454e36dSAlex Elder 	orig_request->result = rbd_img_obj_request_submit(orig_request);
23813d7efd18SAlex Elder out:
2382c5b5ef6cSAlex Elder 	if (orig_request->result)
2383c5b5ef6cSAlex Elder 		rbd_obj_request_complete(orig_request);
2384c5b5ef6cSAlex Elder 	rbd_obj_request_put(orig_request);
2385c5b5ef6cSAlex Elder }
2386c5b5ef6cSAlex Elder 
2387c5b5ef6cSAlex Elder static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
2388c5b5ef6cSAlex Elder {
2389c5b5ef6cSAlex Elder 	struct rbd_obj_request *stat_request;
2390c5b5ef6cSAlex Elder 	struct rbd_device *rbd_dev;
2391c5b5ef6cSAlex Elder 	struct ceph_osd_client *osdc;
2392c5b5ef6cSAlex Elder 	struct page **pages = NULL;
2393c5b5ef6cSAlex Elder 	u32 page_count;
2394c5b5ef6cSAlex Elder 	size_t size;
2395c5b5ef6cSAlex Elder 	int ret;
2396c5b5ef6cSAlex Elder 
2397c5b5ef6cSAlex Elder 	/*
2398c5b5ef6cSAlex Elder 	 * The response data for a STAT call consists of:
2399c5b5ef6cSAlex Elder 	 *     le64 length;
2400c5b5ef6cSAlex Elder 	 *     struct {
2401c5b5ef6cSAlex Elder 	 *         le32 tv_sec;
2402c5b5ef6cSAlex Elder 	 *         le32 tv_nsec;
2403c5b5ef6cSAlex Elder 	 *     } mtime;
2404c5b5ef6cSAlex Elder 	 */
2405c5b5ef6cSAlex Elder 	size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
2406c5b5ef6cSAlex Elder 	page_count = (u32)calc_pages_for(0, size);
2407c5b5ef6cSAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2408c5b5ef6cSAlex Elder 	if (IS_ERR(pages))
2409c5b5ef6cSAlex Elder 		return PTR_ERR(pages);
2410c5b5ef6cSAlex Elder 
2411c5b5ef6cSAlex Elder 	ret = -ENOMEM;
2412c5b5ef6cSAlex Elder 	stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
2413c5b5ef6cSAlex Elder 							OBJ_REQUEST_PAGES);
2414c5b5ef6cSAlex Elder 	if (!stat_request)
2415c5b5ef6cSAlex Elder 		goto out;
2416c5b5ef6cSAlex Elder 
2417c5b5ef6cSAlex Elder 	rbd_obj_request_get(obj_request);
2418c5b5ef6cSAlex Elder 	stat_request->obj_request = obj_request;
2419c5b5ef6cSAlex Elder 	stat_request->pages = pages;
2420c5b5ef6cSAlex Elder 	stat_request->page_count = page_count;
2421c5b5ef6cSAlex Elder 
2422c5b5ef6cSAlex Elder 	rbd_assert(obj_request->img_request);
2423c5b5ef6cSAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
2424c5b5ef6cSAlex Elder 	stat_request->osd_req = rbd_osd_req_create(rbd_dev, false,
2425c5b5ef6cSAlex Elder 						stat_request);
2426c5b5ef6cSAlex Elder 	if (!stat_request->osd_req)
2427c5b5ef6cSAlex Elder 		goto out;
2428c5b5ef6cSAlex Elder 	stat_request->callback = rbd_img_obj_exists_callback;
2429c5b5ef6cSAlex Elder 
2430c5b5ef6cSAlex Elder 	osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT);
2431c5b5ef6cSAlex Elder 	osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
2432c5b5ef6cSAlex Elder 					false, false);
24339d4df01fSAlex Elder 	rbd_osd_req_format_read(stat_request);
2434c5b5ef6cSAlex Elder 
2435c5b5ef6cSAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
2436c5b5ef6cSAlex Elder 	ret = rbd_obj_request_submit(osdc, stat_request);
2437c5b5ef6cSAlex Elder out:
2438c5b5ef6cSAlex Elder 	if (ret)
2439c5b5ef6cSAlex Elder 		rbd_obj_request_put(obj_request);
2440c5b5ef6cSAlex Elder 
2441c5b5ef6cSAlex Elder 	return ret;
2442c5b5ef6cSAlex Elder }
2443c5b5ef6cSAlex Elder 
2444b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
2445b454e36dSAlex Elder {
2446b454e36dSAlex Elder 	struct rbd_img_request *img_request;
2447a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev;
24483d7efd18SAlex Elder 	bool known;
2449b454e36dSAlex Elder 
2450b454e36dSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
2451b454e36dSAlex Elder 
2452b454e36dSAlex Elder 	img_request = obj_request->img_request;
2453b454e36dSAlex Elder 	rbd_assert(img_request);
2454a9e8ba2cSAlex Elder 	rbd_dev = img_request->rbd_dev;
2455b454e36dSAlex Elder 
2456b454e36dSAlex Elder 	/*
2457a9e8ba2cSAlex Elder 	 * Only writes to layered images need special handling.
2458a9e8ba2cSAlex Elder 	 * Reads and non-layered writes are simple object requests.
2459a9e8ba2cSAlex Elder 	 * Layered writes that start beyond the end of the overlap
2460a9e8ba2cSAlex Elder 	 * with the parent have no parent data, so they too are
2461a9e8ba2cSAlex Elder 	 * simple object requests.  Finally, if the target object is
2462a9e8ba2cSAlex Elder 	 * known to already exist, its parent data has already been
2463a9e8ba2cSAlex Elder 	 * copied, so a write to the object can also be handled as a
2464a9e8ba2cSAlex Elder 	 * simple object request.
2465b454e36dSAlex Elder 	 */
2466b454e36dSAlex Elder 	if (!img_request_write_test(img_request) ||
2467b454e36dSAlex Elder 		!img_request_layered_test(img_request) ||
2468a9e8ba2cSAlex Elder 		rbd_dev->parent_overlap <= obj_request->img_offset ||
24693d7efd18SAlex Elder 		((known = obj_request_known_test(obj_request)) &&
24703d7efd18SAlex Elder 			obj_request_exists_test(obj_request))) {
2471b454e36dSAlex Elder 
2472b454e36dSAlex Elder 		struct rbd_device *rbd_dev;
2473b454e36dSAlex Elder 		struct ceph_osd_client *osdc;
2474b454e36dSAlex Elder 
2475b454e36dSAlex Elder 		rbd_dev = obj_request->img_request->rbd_dev;
2476b454e36dSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
2477b454e36dSAlex Elder 
2478b454e36dSAlex Elder 		return rbd_obj_request_submit(osdc, obj_request);
2479b454e36dSAlex Elder 	}
2480b454e36dSAlex Elder 
2481b454e36dSAlex Elder 	/*
24823d7efd18SAlex Elder 	 * It's a layered write.  The target object might exist but
24833d7efd18SAlex Elder 	 * we may not know that yet.  If we know it doesn't exist,
24843d7efd18SAlex Elder 	 * start by reading the data for the full target object from
24853d7efd18SAlex Elder 	 * the parent so we can use it for a copyup to the target.
2486b454e36dSAlex Elder 	 */
24873d7efd18SAlex Elder 	if (known)
24883d7efd18SAlex Elder 		return rbd_img_obj_parent_read_full(obj_request);
24893d7efd18SAlex Elder 
24903d7efd18SAlex Elder 	/* We don't know whether the target exists.  Go find out. */
2491b454e36dSAlex Elder 
2492b454e36dSAlex Elder 	return rbd_img_obj_exists_submit(obj_request);
2493b454e36dSAlex Elder }
2494b454e36dSAlex Elder 
2495bf0d5f50SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request)
2496bf0d5f50SAlex Elder {
2497bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
249846faeed4SAlex Elder 	struct rbd_obj_request *next_obj_request;
2499bf0d5f50SAlex Elder 
250037206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
250146faeed4SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
2502bf0d5f50SAlex Elder 		int ret;
2503bf0d5f50SAlex Elder 
2504b454e36dSAlex Elder 		ret = rbd_img_obj_request_submit(obj_request);
2505bf0d5f50SAlex Elder 		if (ret)
2506bf0d5f50SAlex Elder 			return ret;
2507bf0d5f50SAlex Elder 	}
2508bf0d5f50SAlex Elder 
2509bf0d5f50SAlex Elder 	return 0;
2510bf0d5f50SAlex Elder }
2511bf0d5f50SAlex Elder 
25128b3e1a56SAlex Elder static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
25138b3e1a56SAlex Elder {
25148b3e1a56SAlex Elder 	struct rbd_obj_request *obj_request;
2515a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev;
2516a9e8ba2cSAlex Elder 	u64 obj_end;
25178b3e1a56SAlex Elder 
25188b3e1a56SAlex Elder 	rbd_assert(img_request_child_test(img_request));
25198b3e1a56SAlex Elder 
25208b3e1a56SAlex Elder 	obj_request = img_request->obj_request;
2521a9e8ba2cSAlex Elder 	rbd_assert(obj_request);
2522a9e8ba2cSAlex Elder 	rbd_assert(obj_request->img_request);
25238b3e1a56SAlex Elder 
2524a9e8ba2cSAlex Elder 	obj_request->result = img_request->result;
2525a9e8ba2cSAlex Elder 	if (obj_request->result)
2526a9e8ba2cSAlex Elder 		goto out;
2527a9e8ba2cSAlex Elder 
2528a9e8ba2cSAlex Elder 	/*
2529a9e8ba2cSAlex Elder 	 * We need to zero anything beyond the parent overlap
2530a9e8ba2cSAlex Elder 	 * boundary.  Since rbd_img_obj_request_read_callback()
2531a9e8ba2cSAlex Elder 	 * will zero anything beyond the end of a short read, an
2532a9e8ba2cSAlex Elder 	 * easy way to do this is to pretend the data from the
2533a9e8ba2cSAlex Elder 	 * parent came up short--ending at the overlap boundary.
2534a9e8ba2cSAlex Elder 	 */
2535a9e8ba2cSAlex Elder 	rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
2536a9e8ba2cSAlex Elder 	obj_end = obj_request->img_offset + obj_request->length;
2537a9e8ba2cSAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
2538a9e8ba2cSAlex Elder 	if (obj_end > rbd_dev->parent_overlap) {
2539a9e8ba2cSAlex Elder 		u64 xferred = 0;
2540a9e8ba2cSAlex Elder 
2541a9e8ba2cSAlex Elder 		if (obj_request->img_offset < rbd_dev->parent_overlap)
2542a9e8ba2cSAlex Elder 			xferred = rbd_dev->parent_overlap -
2543a9e8ba2cSAlex Elder 					obj_request->img_offset;
2544a9e8ba2cSAlex Elder 
2545a9e8ba2cSAlex Elder 		obj_request->xferred = min(img_request->xferred, xferred);
2546a9e8ba2cSAlex Elder 	} else {
2547a9e8ba2cSAlex Elder 		obj_request->xferred = img_request->xferred;
2548a9e8ba2cSAlex Elder 	}
2549a9e8ba2cSAlex Elder out:
2550b5b09be3SAlex Elder 	rbd_img_request_put(img_request);
25518b3e1a56SAlex Elder 	rbd_img_obj_request_read_callback(obj_request);
25528b3e1a56SAlex Elder 	rbd_obj_request_complete(obj_request);
25538b3e1a56SAlex Elder }
25548b3e1a56SAlex Elder 
25558b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
25568b3e1a56SAlex Elder {
25578b3e1a56SAlex Elder 	struct rbd_device *rbd_dev;
25588b3e1a56SAlex Elder 	struct rbd_img_request *img_request;
25598b3e1a56SAlex Elder 	int result;
25608b3e1a56SAlex Elder 
25618b3e1a56SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
25628b3e1a56SAlex Elder 	rbd_assert(obj_request->img_request != NULL);
25638b3e1a56SAlex Elder 	rbd_assert(obj_request->result == (s32) -ENOENT);
25648b3e1a56SAlex Elder 	rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
25658b3e1a56SAlex Elder 
25668b3e1a56SAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
25678b3e1a56SAlex Elder 	rbd_assert(rbd_dev->parent != NULL);
25688b3e1a56SAlex Elder 	/* rbd_read_finish(obj_request, obj_request->length); */
25698b3e1a56SAlex Elder 	img_request = rbd_img_request_create(rbd_dev->parent,
25708b3e1a56SAlex Elder 						obj_request->img_offset,
25718b3e1a56SAlex Elder 						obj_request->length,
25728b3e1a56SAlex Elder 						false, true);
25738b3e1a56SAlex Elder 	result = -ENOMEM;
25748b3e1a56SAlex Elder 	if (!img_request)
25758b3e1a56SAlex Elder 		goto out_err;
25768b3e1a56SAlex Elder 
25778b3e1a56SAlex Elder 	rbd_obj_request_get(obj_request);
25788b3e1a56SAlex Elder 	img_request->obj_request = obj_request;
25798b3e1a56SAlex Elder 
2580f1a4739fSAlex Elder 	result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
2581f1a4739fSAlex Elder 					obj_request->bio_list);
25828b3e1a56SAlex Elder 	if (result)
25838b3e1a56SAlex Elder 		goto out_err;
25848b3e1a56SAlex Elder 
25858b3e1a56SAlex Elder 	img_request->callback = rbd_img_parent_read_callback;
25868b3e1a56SAlex Elder 	result = rbd_img_request_submit(img_request);
25878b3e1a56SAlex Elder 	if (result)
25888b3e1a56SAlex Elder 		goto out_err;
25898b3e1a56SAlex Elder 
25908b3e1a56SAlex Elder 	return;
25918b3e1a56SAlex Elder out_err:
25928b3e1a56SAlex Elder 	if (img_request)
25938b3e1a56SAlex Elder 		rbd_img_request_put(img_request);
25948b3e1a56SAlex Elder 	obj_request->result = result;
25958b3e1a56SAlex Elder 	obj_request->xferred = 0;
25968b3e1a56SAlex Elder 	obj_request_done_set(obj_request);
25978b3e1a56SAlex Elder }
25988b3e1a56SAlex Elder 
2599cc4a38bdSAlex Elder static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, u64 notify_id)
2600b8d70035SAlex Elder {
2601b8d70035SAlex Elder 	struct rbd_obj_request *obj_request;
26022169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2603b8d70035SAlex Elder 	int ret;
2604b8d70035SAlex Elder 
2605b8d70035SAlex Elder 	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
2606b8d70035SAlex Elder 							OBJ_REQUEST_NODATA);
2607b8d70035SAlex Elder 	if (!obj_request)
2608b8d70035SAlex Elder 		return -ENOMEM;
2609b8d70035SAlex Elder 
2610b8d70035SAlex Elder 	ret = -ENOMEM;
2611430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
2612b8d70035SAlex Elder 	if (!obj_request->osd_req)
2613b8d70035SAlex Elder 		goto out;
26142169238dSAlex Elder 	obj_request->callback = rbd_obj_request_put;
2615b8d70035SAlex Elder 
2616c99d2d4aSAlex Elder 	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
2617cc4a38bdSAlex Elder 					notify_id, 0, 0);
26189d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
2619430c28c3SAlex Elder 
2620b8d70035SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
2621b8d70035SAlex Elder out:
2622cf81b60eSAlex Elder 	if (ret)
2623b8d70035SAlex Elder 		rbd_obj_request_put(obj_request);
2624b8d70035SAlex Elder 
2625b8d70035SAlex Elder 	return ret;
2626b8d70035SAlex Elder }
2627b8d70035SAlex Elder 
2628b8d70035SAlex Elder static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
2629b8d70035SAlex Elder {
2630b8d70035SAlex Elder 	struct rbd_device *rbd_dev = (struct rbd_device *)data;
2631b8d70035SAlex Elder 
2632b8d70035SAlex Elder 	if (!rbd_dev)
2633b8d70035SAlex Elder 		return;
2634b8d70035SAlex Elder 
263537206ee5SAlex Elder 	dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
2636b8d70035SAlex Elder 		rbd_dev->header_name, (unsigned long long)notify_id,
2637b8d70035SAlex Elder 		(unsigned int)opcode);
2638cc4a38bdSAlex Elder 	(void)rbd_dev_refresh(rbd_dev);
2639b8d70035SAlex Elder 
2640cc4a38bdSAlex Elder 	rbd_obj_notify_ack(rbd_dev, notify_id);
2641b8d70035SAlex Elder }
2642b8d70035SAlex Elder 
26439969ebc5SAlex Elder /*
26449969ebc5SAlex Elder  * Request sync osd watch/unwatch.  The value of "start" determines
26459969ebc5SAlex Elder  * whether a watch request is being initiated or torn down.
26469969ebc5SAlex Elder  */
26479969ebc5SAlex Elder static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
26489969ebc5SAlex Elder {
26499969ebc5SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
26509969ebc5SAlex Elder 	struct rbd_obj_request *obj_request;
26519969ebc5SAlex Elder 	int ret;
26529969ebc5SAlex Elder 
26539969ebc5SAlex Elder 	rbd_assert(start ^ !!rbd_dev->watch_event);
26549969ebc5SAlex Elder 	rbd_assert(start ^ !!rbd_dev->watch_request);
26559969ebc5SAlex Elder 
26569969ebc5SAlex Elder 	if (start) {
26573c663bbdSAlex Elder 		ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
26589969ebc5SAlex Elder 						&rbd_dev->watch_event);
26599969ebc5SAlex Elder 		if (ret < 0)
26609969ebc5SAlex Elder 			return ret;
26618eb87565SAlex Elder 		rbd_assert(rbd_dev->watch_event != NULL);
26629969ebc5SAlex Elder 	}
26639969ebc5SAlex Elder 
26649969ebc5SAlex Elder 	ret = -ENOMEM;
26659969ebc5SAlex Elder 	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
26669969ebc5SAlex Elder 							OBJ_REQUEST_NODATA);
26679969ebc5SAlex Elder 	if (!obj_request)
26689969ebc5SAlex Elder 		goto out_cancel;
26699969ebc5SAlex Elder 
2670430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request);
2671430c28c3SAlex Elder 	if (!obj_request->osd_req)
2672430c28c3SAlex Elder 		goto out_cancel;
2673430c28c3SAlex Elder 
26748eb87565SAlex Elder 	if (start)
2675975241afSAlex Elder 		ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
26768eb87565SAlex Elder 	else
26776977c3f9SAlex Elder 		ceph_osdc_unregister_linger_request(osdc,
2678975241afSAlex Elder 					rbd_dev->watch_request->osd_req);
26792169238dSAlex Elder 
26802169238dSAlex Elder 	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
2681b21ebdddSAlex Elder 				rbd_dev->watch_event->cookie, 0, start);
26829d4df01fSAlex Elder 	rbd_osd_req_format_write(obj_request);
26832169238dSAlex Elder 
26849969ebc5SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
26859969ebc5SAlex Elder 	if (ret)
26869969ebc5SAlex Elder 		goto out_cancel;
26879969ebc5SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
26889969ebc5SAlex Elder 	if (ret)
26899969ebc5SAlex Elder 		goto out_cancel;
26909969ebc5SAlex Elder 	ret = obj_request->result;
26919969ebc5SAlex Elder 	if (ret)
26929969ebc5SAlex Elder 		goto out_cancel;
26939969ebc5SAlex Elder 
26948eb87565SAlex Elder 	/*
26958eb87565SAlex Elder 	 * A watch request is set to linger, so the underlying osd
26968eb87565SAlex Elder 	 * request won't go away until we unregister it.  We retain
26978eb87565SAlex Elder 	 * a pointer to the object request during that time (in
26988eb87565SAlex Elder 	 * rbd_dev->watch_request), so we'll keep a reference to
26998eb87565SAlex Elder 	 * it.  We'll drop that reference (below) after we've
27008eb87565SAlex Elder 	 * unregistered it.
27018eb87565SAlex Elder 	 */
27028eb87565SAlex Elder 	if (start) {
27038eb87565SAlex Elder 		rbd_dev->watch_request = obj_request;
27048eb87565SAlex Elder 
27058eb87565SAlex Elder 		return 0;
27068eb87565SAlex Elder 	}
27078eb87565SAlex Elder 
27088eb87565SAlex Elder 	/* We have successfully torn down the watch request */
27098eb87565SAlex Elder 
27108eb87565SAlex Elder 	rbd_obj_request_put(rbd_dev->watch_request);
27118eb87565SAlex Elder 	rbd_dev->watch_request = NULL;
27129969ebc5SAlex Elder out_cancel:
27139969ebc5SAlex Elder 	/* Cancel the event if we're tearing down, or on error */
27149969ebc5SAlex Elder 	ceph_osdc_cancel_event(rbd_dev->watch_event);
27159969ebc5SAlex Elder 	rbd_dev->watch_event = NULL;
27169969ebc5SAlex Elder 	if (obj_request)
27179969ebc5SAlex Elder 		rbd_obj_request_put(obj_request);
27189969ebc5SAlex Elder 
27199969ebc5SAlex Elder 	return ret;
27209969ebc5SAlex Elder }
27219969ebc5SAlex Elder 
272236be9a76SAlex Elder /*
2723f40eb349SAlex Elder  * Synchronous osd object method call.  Returns the number of bytes
2724f40eb349SAlex Elder  * returned in the outbound buffer, or a negative error code.
272536be9a76SAlex Elder  */
272636be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
272736be9a76SAlex Elder 			     const char *object_name,
272836be9a76SAlex Elder 			     const char *class_name,
272936be9a76SAlex Elder 			     const char *method_name,
27304157976bSAlex Elder 			     const void *outbound,
273136be9a76SAlex Elder 			     size_t outbound_size,
27324157976bSAlex Elder 			     void *inbound,
2733e2a58ee5SAlex Elder 			     size_t inbound_size)
273436be9a76SAlex Elder {
27352169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
273636be9a76SAlex Elder 	struct rbd_obj_request *obj_request;
273736be9a76SAlex Elder 	struct page **pages;
273836be9a76SAlex Elder 	u32 page_count;
273936be9a76SAlex Elder 	int ret;
274036be9a76SAlex Elder 
274136be9a76SAlex Elder 	/*
27426010a451SAlex Elder 	 * Method calls are ultimately read operations.  The result
27436010a451SAlex Elder 	 * should placed into the inbound buffer provided.  They
27446010a451SAlex Elder 	 * also supply outbound data--parameters for the object
27456010a451SAlex Elder 	 * method.  Currently if this is present it will be a
27466010a451SAlex Elder 	 * snapshot id.
274736be9a76SAlex Elder 	 */
274836be9a76SAlex Elder 	page_count = (u32)calc_pages_for(0, inbound_size);
274936be9a76SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
275036be9a76SAlex Elder 	if (IS_ERR(pages))
275136be9a76SAlex Elder 		return PTR_ERR(pages);
275236be9a76SAlex Elder 
275336be9a76SAlex Elder 	ret = -ENOMEM;
27546010a451SAlex Elder 	obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
275536be9a76SAlex Elder 							OBJ_REQUEST_PAGES);
275636be9a76SAlex Elder 	if (!obj_request)
275736be9a76SAlex Elder 		goto out;
275836be9a76SAlex Elder 
275936be9a76SAlex Elder 	obj_request->pages = pages;
276036be9a76SAlex Elder 	obj_request->page_count = page_count;
276136be9a76SAlex Elder 
2762430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
276336be9a76SAlex Elder 	if (!obj_request->osd_req)
276436be9a76SAlex Elder 		goto out;
276536be9a76SAlex Elder 
2766c99d2d4aSAlex Elder 	osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
276704017e29SAlex Elder 					class_name, method_name);
276804017e29SAlex Elder 	if (outbound_size) {
276904017e29SAlex Elder 		struct ceph_pagelist *pagelist;
277004017e29SAlex Elder 
277104017e29SAlex Elder 		pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
277204017e29SAlex Elder 		if (!pagelist)
277304017e29SAlex Elder 			goto out;
277404017e29SAlex Elder 
277504017e29SAlex Elder 		ceph_pagelist_init(pagelist);
277604017e29SAlex Elder 		ceph_pagelist_append(pagelist, outbound, outbound_size);
277704017e29SAlex Elder 		osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
277804017e29SAlex Elder 						pagelist);
277904017e29SAlex Elder 	}
2780a4ce40a9SAlex Elder 	osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
2781a4ce40a9SAlex Elder 					obj_request->pages, inbound_size,
278244cd188dSAlex Elder 					0, false, false);
27839d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
2784430c28c3SAlex Elder 
278536be9a76SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
278636be9a76SAlex Elder 	if (ret)
278736be9a76SAlex Elder 		goto out;
278836be9a76SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
278936be9a76SAlex Elder 	if (ret)
279036be9a76SAlex Elder 		goto out;
279136be9a76SAlex Elder 
279236be9a76SAlex Elder 	ret = obj_request->result;
279336be9a76SAlex Elder 	if (ret < 0)
279436be9a76SAlex Elder 		goto out;
279557385b51SAlex Elder 
279657385b51SAlex Elder 	rbd_assert(obj_request->xferred < (u64)INT_MAX);
279757385b51SAlex Elder 	ret = (int)obj_request->xferred;
2798903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
279936be9a76SAlex Elder out:
280036be9a76SAlex Elder 	if (obj_request)
280136be9a76SAlex Elder 		rbd_obj_request_put(obj_request);
280236be9a76SAlex Elder 	else
280336be9a76SAlex Elder 		ceph_release_page_vector(pages, page_count);
280436be9a76SAlex Elder 
280536be9a76SAlex Elder 	return ret;
280636be9a76SAlex Elder }
280736be9a76SAlex Elder 
2808bf0d5f50SAlex Elder static void rbd_request_fn(struct request_queue *q)
2809cc344fa1SAlex Elder 		__releases(q->queue_lock) __acquires(q->queue_lock)
2810bf0d5f50SAlex Elder {
2811bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = q->queuedata;
2812bf0d5f50SAlex Elder 	bool read_only = rbd_dev->mapping.read_only;
2813bf0d5f50SAlex Elder 	struct request *rq;
2814bf0d5f50SAlex Elder 	int result;
2815bf0d5f50SAlex Elder 
2816bf0d5f50SAlex Elder 	while ((rq = blk_fetch_request(q))) {
2817bf0d5f50SAlex Elder 		bool write_request = rq_data_dir(rq) == WRITE;
2818bf0d5f50SAlex Elder 		struct rbd_img_request *img_request;
2819bf0d5f50SAlex Elder 		u64 offset;
2820bf0d5f50SAlex Elder 		u64 length;
2821bf0d5f50SAlex Elder 
2822bf0d5f50SAlex Elder 		/* Ignore any non-FS requests that filter through. */
2823bf0d5f50SAlex Elder 
2824bf0d5f50SAlex Elder 		if (rq->cmd_type != REQ_TYPE_FS) {
28254dda41d3SAlex Elder 			dout("%s: non-fs request type %d\n", __func__,
28264dda41d3SAlex Elder 				(int) rq->cmd_type);
28274dda41d3SAlex Elder 			__blk_end_request_all(rq, 0);
28284dda41d3SAlex Elder 			continue;
28294dda41d3SAlex Elder 		}
28304dda41d3SAlex Elder 
28314dda41d3SAlex Elder 		/* Ignore/skip any zero-length requests */
28324dda41d3SAlex Elder 
28334dda41d3SAlex Elder 		offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
28344dda41d3SAlex Elder 		length = (u64) blk_rq_bytes(rq);
28354dda41d3SAlex Elder 
28364dda41d3SAlex Elder 		if (!length) {
28374dda41d3SAlex Elder 			dout("%s: zero-length request\n", __func__);
2838bf0d5f50SAlex Elder 			__blk_end_request_all(rq, 0);
2839bf0d5f50SAlex Elder 			continue;
2840bf0d5f50SAlex Elder 		}
2841bf0d5f50SAlex Elder 
2842bf0d5f50SAlex Elder 		spin_unlock_irq(q->queue_lock);
2843bf0d5f50SAlex Elder 
2844bf0d5f50SAlex Elder 		/* Disallow writes to a read-only device */
2845bf0d5f50SAlex Elder 
2846bf0d5f50SAlex Elder 		if (write_request) {
2847bf0d5f50SAlex Elder 			result = -EROFS;
2848bf0d5f50SAlex Elder 			if (read_only)
2849bf0d5f50SAlex Elder 				goto end_request;
2850bf0d5f50SAlex Elder 			rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
2851bf0d5f50SAlex Elder 		}
2852bf0d5f50SAlex Elder 
28536d292906SAlex Elder 		/*
28546d292906SAlex Elder 		 * Quit early if the mapped snapshot no longer
28556d292906SAlex Elder 		 * exists.  It's still possible the snapshot will
28566d292906SAlex Elder 		 * have disappeared by the time our request arrives
28576d292906SAlex Elder 		 * at the osd, but there's no sense in sending it if
28586d292906SAlex Elder 		 * we already know.
28596d292906SAlex Elder 		 */
28606d292906SAlex Elder 		if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
2861bf0d5f50SAlex Elder 			dout("request for non-existent snapshot");
2862bf0d5f50SAlex Elder 			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
2863bf0d5f50SAlex Elder 			result = -ENXIO;
2864bf0d5f50SAlex Elder 			goto end_request;
2865bf0d5f50SAlex Elder 		}
2866bf0d5f50SAlex Elder 
2867bf0d5f50SAlex Elder 		result = -EINVAL;
2868c0cd10dbSAlex Elder 		if (offset && length > U64_MAX - offset + 1) {
2869c0cd10dbSAlex Elder 			rbd_warn(rbd_dev, "bad request range (%llu~%llu)\n",
2870c0cd10dbSAlex Elder 				offset, length);
2871bf0d5f50SAlex Elder 			goto end_request;	/* Shouldn't happen */
2872c0cd10dbSAlex Elder 		}
2873bf0d5f50SAlex Elder 
2874bf0d5f50SAlex Elder 		result = -ENOMEM;
2875bf0d5f50SAlex Elder 		img_request = rbd_img_request_create(rbd_dev, offset, length,
28769849e986SAlex Elder 							write_request, false);
2877bf0d5f50SAlex Elder 		if (!img_request)
2878bf0d5f50SAlex Elder 			goto end_request;
2879bf0d5f50SAlex Elder 
2880bf0d5f50SAlex Elder 		img_request->rq = rq;
2881bf0d5f50SAlex Elder 
2882f1a4739fSAlex Elder 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
2883f1a4739fSAlex Elder 						rq->bio);
2884bf0d5f50SAlex Elder 		if (!result)
2885bf0d5f50SAlex Elder 			result = rbd_img_request_submit(img_request);
2886bf0d5f50SAlex Elder 		if (result)
2887bf0d5f50SAlex Elder 			rbd_img_request_put(img_request);
2888bf0d5f50SAlex Elder end_request:
2889bf0d5f50SAlex Elder 		spin_lock_irq(q->queue_lock);
2890bf0d5f50SAlex Elder 		if (result < 0) {
28917da22d29SAlex Elder 			rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
28927da22d29SAlex Elder 				write_request ? "write" : "read",
28937da22d29SAlex Elder 				length, offset, result);
28947da22d29SAlex Elder 
2895bf0d5f50SAlex Elder 			__blk_end_request_all(rq, result);
2896bf0d5f50SAlex Elder 		}
2897bf0d5f50SAlex Elder 	}
2898bf0d5f50SAlex Elder }
2899bf0d5f50SAlex Elder 
2900602adf40SYehuda Sadeh /*
2901602adf40SYehuda Sadeh  * a queue callback. Makes sure that we don't create a bio that spans across
2902602adf40SYehuda Sadeh  * multiple osd objects. One exception would be with a single page bios,
2903f7760dadSAlex Elder  * which we handle later at bio_chain_clone_range()
2904602adf40SYehuda Sadeh  */
2905602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
2906602adf40SYehuda Sadeh 			  struct bio_vec *bvec)
2907602adf40SYehuda Sadeh {
2908602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
2909e5cfeed2SAlex Elder 	sector_t sector_offset;
2910e5cfeed2SAlex Elder 	sector_t sectors_per_obj;
2911e5cfeed2SAlex Elder 	sector_t obj_sector_offset;
2912e5cfeed2SAlex Elder 	int ret;
2913602adf40SYehuda Sadeh 
2914e5cfeed2SAlex Elder 	/*
2915e5cfeed2SAlex Elder 	 * Find how far into its rbd object the partition-relative
2916e5cfeed2SAlex Elder 	 * bio start sector is to offset relative to the enclosing
2917e5cfeed2SAlex Elder 	 * device.
2918e5cfeed2SAlex Elder 	 */
2919e5cfeed2SAlex Elder 	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
2920e5cfeed2SAlex Elder 	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
2921e5cfeed2SAlex Elder 	obj_sector_offset = sector_offset & (sectors_per_obj - 1);
2922593a9e7bSAlex Elder 
2923e5cfeed2SAlex Elder 	/*
2924e5cfeed2SAlex Elder 	 * Compute the number of bytes from that offset to the end
2925e5cfeed2SAlex Elder 	 * of the object.  Account for what's already used by the bio.
2926e5cfeed2SAlex Elder 	 */
2927e5cfeed2SAlex Elder 	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
2928e5cfeed2SAlex Elder 	if (ret > bmd->bi_size)
2929e5cfeed2SAlex Elder 		ret -= bmd->bi_size;
2930e5cfeed2SAlex Elder 	else
2931e5cfeed2SAlex Elder 		ret = 0;
2932e5cfeed2SAlex Elder 
2933e5cfeed2SAlex Elder 	/*
2934e5cfeed2SAlex Elder 	 * Don't send back more than was asked for.  And if the bio
2935e5cfeed2SAlex Elder 	 * was empty, let the whole thing through because:  "Note
2936e5cfeed2SAlex Elder 	 * that a block device *must* allow a single page to be
2937e5cfeed2SAlex Elder 	 * added to an empty bio."
2938e5cfeed2SAlex Elder 	 */
2939e5cfeed2SAlex Elder 	rbd_assert(bvec->bv_len <= PAGE_SIZE);
2940e5cfeed2SAlex Elder 	if (ret > (int) bvec->bv_len || !bmd->bi_size)
2941e5cfeed2SAlex Elder 		ret = (int) bvec->bv_len;
2942e5cfeed2SAlex Elder 
2943e5cfeed2SAlex Elder 	return ret;
2944602adf40SYehuda Sadeh }
2945602adf40SYehuda Sadeh 
2946602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
2947602adf40SYehuda Sadeh {
2948602adf40SYehuda Sadeh 	struct gendisk *disk = rbd_dev->disk;
2949602adf40SYehuda Sadeh 
2950602adf40SYehuda Sadeh 	if (!disk)
2951602adf40SYehuda Sadeh 		return;
2952602adf40SYehuda Sadeh 
2953a0cab924SAlex Elder 	rbd_dev->disk = NULL;
2954a0cab924SAlex Elder 	if (disk->flags & GENHD_FL_UP) {
2955602adf40SYehuda Sadeh 		del_gendisk(disk);
2956602adf40SYehuda Sadeh 		if (disk->queue)
2957602adf40SYehuda Sadeh 			blk_cleanup_queue(disk->queue);
2958a0cab924SAlex Elder 	}
2959602adf40SYehuda Sadeh 	put_disk(disk);
2960602adf40SYehuda Sadeh }
2961602adf40SYehuda Sadeh 
2962788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
2963788e2df3SAlex Elder 				const char *object_name,
29647097f8dfSAlex Elder 				u64 offset, u64 length, void *buf)
2965788e2df3SAlex Elder 
2966788e2df3SAlex Elder {
29672169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2968788e2df3SAlex Elder 	struct rbd_obj_request *obj_request;
2969788e2df3SAlex Elder 	struct page **pages = NULL;
2970788e2df3SAlex Elder 	u32 page_count;
29711ceae7efSAlex Elder 	size_t size;
2972788e2df3SAlex Elder 	int ret;
2973788e2df3SAlex Elder 
2974788e2df3SAlex Elder 	page_count = (u32) calc_pages_for(offset, length);
2975788e2df3SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2976788e2df3SAlex Elder 	if (IS_ERR(pages))
2977788e2df3SAlex Elder 		ret = PTR_ERR(pages);
2978788e2df3SAlex Elder 
2979788e2df3SAlex Elder 	ret = -ENOMEM;
2980788e2df3SAlex Elder 	obj_request = rbd_obj_request_create(object_name, offset, length,
2981788e2df3SAlex Elder 							OBJ_REQUEST_PAGES);
2982788e2df3SAlex Elder 	if (!obj_request)
2983788e2df3SAlex Elder 		goto out;
2984788e2df3SAlex Elder 
2985788e2df3SAlex Elder 	obj_request->pages = pages;
2986788e2df3SAlex Elder 	obj_request->page_count = page_count;
2987788e2df3SAlex Elder 
2988430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
2989788e2df3SAlex Elder 	if (!obj_request->osd_req)
2990788e2df3SAlex Elder 		goto out;
2991788e2df3SAlex Elder 
2992c99d2d4aSAlex Elder 	osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
2993c99d2d4aSAlex Elder 					offset, length, 0, 0);
2994406e2c9fSAlex Elder 	osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
2995a4ce40a9SAlex Elder 					obj_request->pages,
299644cd188dSAlex Elder 					obj_request->length,
299744cd188dSAlex Elder 					obj_request->offset & ~PAGE_MASK,
299844cd188dSAlex Elder 					false, false);
29999d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
3000430c28c3SAlex Elder 
3001788e2df3SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
3002788e2df3SAlex Elder 	if (ret)
3003788e2df3SAlex Elder 		goto out;
3004788e2df3SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
3005788e2df3SAlex Elder 	if (ret)
3006788e2df3SAlex Elder 		goto out;
3007788e2df3SAlex Elder 
3008788e2df3SAlex Elder 	ret = obj_request->result;
3009788e2df3SAlex Elder 	if (ret < 0)
3010788e2df3SAlex Elder 		goto out;
30111ceae7efSAlex Elder 
30121ceae7efSAlex Elder 	rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
30131ceae7efSAlex Elder 	size = (size_t) obj_request->xferred;
3014903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, buf, 0, size);
301523ed6e13SAlex Elder 	rbd_assert(size <= (size_t)INT_MAX);
301623ed6e13SAlex Elder 	ret = (int)size;
3017788e2df3SAlex Elder out:
3018788e2df3SAlex Elder 	if (obj_request)
3019788e2df3SAlex Elder 		rbd_obj_request_put(obj_request);
3020788e2df3SAlex Elder 	else
3021788e2df3SAlex Elder 		ceph_release_page_vector(pages, page_count);
3022788e2df3SAlex Elder 
3023788e2df3SAlex Elder 	return ret;
3024788e2df3SAlex Elder }
3025788e2df3SAlex Elder 
3026602adf40SYehuda Sadeh /*
30274156d998SAlex Elder  * Read the complete header for the given rbd device.
30284156d998SAlex Elder  *
30294156d998SAlex Elder  * Returns a pointer to a dynamically-allocated buffer containing
30304156d998SAlex Elder  * the complete and validated header.  Caller can pass the address
30314156d998SAlex Elder  * of a variable that will be filled in with the version of the
30324156d998SAlex Elder  * header object at the time it was read.
30334156d998SAlex Elder  *
30344156d998SAlex Elder  * Returns a pointer-coded errno if a failure occurs.
30354156d998SAlex Elder  */
30364156d998SAlex Elder static struct rbd_image_header_ondisk *
30377097f8dfSAlex Elder rbd_dev_v1_header_read(struct rbd_device *rbd_dev)
30384156d998SAlex Elder {
30394156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk = NULL;
30404156d998SAlex Elder 	u32 snap_count = 0;
30414156d998SAlex Elder 	u64 names_size = 0;
30424156d998SAlex Elder 	u32 want_count;
30434156d998SAlex Elder 	int ret;
30444156d998SAlex Elder 
30454156d998SAlex Elder 	/*
30464156d998SAlex Elder 	 * The complete header will include an array of its 64-bit
30474156d998SAlex Elder 	 * snapshot ids, followed by the names of those snapshots as
30484156d998SAlex Elder 	 * a contiguous block of NUL-terminated strings.  Note that
30494156d998SAlex Elder 	 * the number of snapshots could change by the time we read
30504156d998SAlex Elder 	 * it in, in which case we re-read it.
30514156d998SAlex Elder 	 */
30524156d998SAlex Elder 	do {
30534156d998SAlex Elder 		size_t size;
30544156d998SAlex Elder 
30554156d998SAlex Elder 		kfree(ondisk);
30564156d998SAlex Elder 
30574156d998SAlex Elder 		size = sizeof (*ondisk);
30584156d998SAlex Elder 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
30594156d998SAlex Elder 		size += names_size;
30604156d998SAlex Elder 		ondisk = kmalloc(size, GFP_KERNEL);
30614156d998SAlex Elder 		if (!ondisk)
30624156d998SAlex Elder 			return ERR_PTR(-ENOMEM);
30634156d998SAlex Elder 
3064788e2df3SAlex Elder 		ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
30657097f8dfSAlex Elder 				       0, size, ondisk);
30664156d998SAlex Elder 		if (ret < 0)
30674156d998SAlex Elder 			goto out_err;
3068c0cd10dbSAlex Elder 		if ((size_t)ret < size) {
30694156d998SAlex Elder 			ret = -ENXIO;
307006ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
307106ecc6cbSAlex Elder 				size, ret);
30724156d998SAlex Elder 			goto out_err;
30734156d998SAlex Elder 		}
30744156d998SAlex Elder 		if (!rbd_dev_ondisk_valid(ondisk)) {
30754156d998SAlex Elder 			ret = -ENXIO;
307606ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "invalid header");
30774156d998SAlex Elder 			goto out_err;
30784156d998SAlex Elder 		}
30794156d998SAlex Elder 
30804156d998SAlex Elder 		names_size = le64_to_cpu(ondisk->snap_names_len);
30814156d998SAlex Elder 		want_count = snap_count;
30824156d998SAlex Elder 		snap_count = le32_to_cpu(ondisk->snap_count);
30834156d998SAlex Elder 	} while (snap_count != want_count);
30844156d998SAlex Elder 
30854156d998SAlex Elder 	return ondisk;
30864156d998SAlex Elder 
30874156d998SAlex Elder out_err:
30884156d998SAlex Elder 	kfree(ondisk);
30894156d998SAlex Elder 
30904156d998SAlex Elder 	return ERR_PTR(ret);
30914156d998SAlex Elder }
30924156d998SAlex Elder 
/*
 * Read the on-disk (format 1) header and convert it into the
 * in-memory form in "header".  Returns the error from the read if
 * it fails, otherwise whatever rbd_header_from_disk() returns.
 */
static int rbd_read_header(struct rbd_device *rbd_dev,
			   struct rbd_image_header *header)
{
	struct rbd_image_header_ondisk *raw = rbd_dev_v1_header_read(rbd_dev);
	int err;

	if (IS_ERR(raw))
		return PTR_ERR(raw);

	err = rbd_header_from_disk(header, raw);
	kfree(raw);	/* the on-disk copy is no longer needed */

	return err;
}
3110602adf40SYehuda Sadeh 
31119478554aSAlex Elder static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
31129478554aSAlex Elder {
31130d7dbfceSAlex Elder 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
31149478554aSAlex Elder 		return;
31159478554aSAlex Elder 
3116e28626a0SAlex Elder 	if (rbd_dev->mapping.size != rbd_dev->header.image_size) {
3117e28626a0SAlex Elder 		sector_t size;
3118e28626a0SAlex Elder 
3119e28626a0SAlex Elder 		rbd_dev->mapping.size = rbd_dev->header.image_size;
3120e28626a0SAlex Elder 		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
31219478554aSAlex Elder 		dout("setting size to %llu sectors", (unsigned long long)size);
31229478554aSAlex Elder 		set_capacity(rbd_dev->disk, size);
31239478554aSAlex Elder 	}
3124e28626a0SAlex Elder }
31259478554aSAlex Elder 
3126602adf40SYehuda Sadeh /*
3127602adf40SYehuda Sadeh  * only read the first part of the ondisk header, without the snaps info
3128602adf40SYehuda Sadeh  */
3129cc4a38bdSAlex Elder static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev)
3130602adf40SYehuda Sadeh {
3131602adf40SYehuda Sadeh 	int ret;
3132602adf40SYehuda Sadeh 	struct rbd_image_header h;
3133602adf40SYehuda Sadeh 
3134602adf40SYehuda Sadeh 	ret = rbd_read_header(rbd_dev, &h);
3135602adf40SYehuda Sadeh 	if (ret < 0)
3136602adf40SYehuda Sadeh 		return ret;
3137602adf40SYehuda Sadeh 
3138a51aa0c0SJosh Durgin 	down_write(&rbd_dev->header_rwsem);
3139a51aa0c0SJosh Durgin 
31409478554aSAlex Elder 	/* Update image size, and check for resize of mapped image */
31419478554aSAlex Elder 	rbd_dev->header.image_size = h.image_size;
31429478554aSAlex Elder 	rbd_update_mapping_size(rbd_dev);
31439db4b3e3SSage Weil 
3144849b4260SAlex Elder 	/* rbd_dev->header.object_prefix shouldn't change */
3145602adf40SYehuda Sadeh 	kfree(rbd_dev->header.snap_sizes);
3146849b4260SAlex Elder 	kfree(rbd_dev->header.snap_names);
3147d1d25646SJosh Durgin 	/* osd requests may still refer to snapc */
3148812164f8SAlex Elder 	ceph_put_snap_context(rbd_dev->header.snapc);
3149602adf40SYehuda Sadeh 
315093a24e08SJosh Durgin 	rbd_dev->header.image_size = h.image_size;
3151602adf40SYehuda Sadeh 	rbd_dev->header.snapc = h.snapc;
3152602adf40SYehuda Sadeh 	rbd_dev->header.snap_names = h.snap_names;
3153602adf40SYehuda Sadeh 	rbd_dev->header.snap_sizes = h.snap_sizes;
3154849b4260SAlex Elder 	/* Free the extra copy of the object prefix */
3155c0cd10dbSAlex Elder 	if (strcmp(rbd_dev->header.object_prefix, h.object_prefix))
3156c0cd10dbSAlex Elder 		rbd_warn(rbd_dev, "object prefix changed (ignoring)");
3157849b4260SAlex Elder 	kfree(h.object_prefix);
3158849b4260SAlex Elder 
3159c666601aSJosh Durgin 	up_write(&rbd_dev->header_rwsem);
3160602adf40SYehuda Sadeh 
3161dfc5606dSYehuda Sadeh 	return ret;
3162602adf40SYehuda Sadeh }
3163602adf40SYehuda Sadeh 
316415228edeSAlex Elder /*
316515228edeSAlex Elder  * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
316615228edeSAlex Elder  * has disappeared from the (just updated) snapshot context.
316715228edeSAlex Elder  */
316815228edeSAlex Elder static void rbd_exists_validate(struct rbd_device *rbd_dev)
316915228edeSAlex Elder {
317015228edeSAlex Elder 	u64 snap_id;
317115228edeSAlex Elder 
317215228edeSAlex Elder 	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
317315228edeSAlex Elder 		return;
317415228edeSAlex Elder 
317515228edeSAlex Elder 	snap_id = rbd_dev->spec->snap_id;
317615228edeSAlex Elder 	if (snap_id == CEPH_NOSNAP)
317715228edeSAlex Elder 		return;
317815228edeSAlex Elder 
317915228edeSAlex Elder 	if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
318015228edeSAlex Elder 		clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
318115228edeSAlex Elder }
318215228edeSAlex Elder 
3183cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev)
31841fe5e993SAlex Elder {
3185a3fbe5d4SAlex Elder 	u64 image_size;
31861fe5e993SAlex Elder 	int ret;
31871fe5e993SAlex Elder 
3188117973fbSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
3189a3fbe5d4SAlex Elder 	image_size = rbd_dev->header.image_size;
31901fe5e993SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3191117973fbSAlex Elder 	if (rbd_dev->image_format == 1)
3192cc4a38bdSAlex Elder 		ret = rbd_dev_v1_refresh(rbd_dev);
3193117973fbSAlex Elder 	else
3194cc4a38bdSAlex Elder 		ret = rbd_dev_v2_refresh(rbd_dev);
319515228edeSAlex Elder 
319615228edeSAlex Elder 	/* If it's a mapped snapshot, validate its EXISTS flag */
319715228edeSAlex Elder 
319815228edeSAlex Elder 	rbd_exists_validate(rbd_dev);
31991fe5e993SAlex Elder 	mutex_unlock(&ctl_mutex);
3200522a0cc0SAlex Elder 	if (ret)
3201522a0cc0SAlex Elder 		rbd_warn(rbd_dev, "got notification but failed to "
3202522a0cc0SAlex Elder 			   " update snaps: %d\n", ret);
3203a3fbe5d4SAlex Elder 	if (image_size != rbd_dev->header.image_size)
3204a3fbe5d4SAlex Elder 		revalidate_disk(rbd_dev->disk);
32051fe5e993SAlex Elder 
32061fe5e993SAlex Elder 	return ret;
32071fe5e993SAlex Elder }
32081fe5e993SAlex Elder 
3209602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
3210602adf40SYehuda Sadeh {
3211602adf40SYehuda Sadeh 	struct gendisk *disk;
3212602adf40SYehuda Sadeh 	struct request_queue *q;
3213593a9e7bSAlex Elder 	u64 segment_size;
3214602adf40SYehuda Sadeh 
3215602adf40SYehuda Sadeh 	/* create gendisk info */
3216602adf40SYehuda Sadeh 	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
3217602adf40SYehuda Sadeh 	if (!disk)
32181fcdb8aaSAlex Elder 		return -ENOMEM;
3219602adf40SYehuda Sadeh 
3220f0f8cef5SAlex Elder 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
3221de71a297SAlex Elder 		 rbd_dev->dev_id);
3222602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
3223602adf40SYehuda Sadeh 	disk->first_minor = 0;
3224602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
3225602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
3226602adf40SYehuda Sadeh 
3227bf0d5f50SAlex Elder 	q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
3228602adf40SYehuda Sadeh 	if (!q)
3229602adf40SYehuda Sadeh 		goto out_disk;
3230029bcbd8SJosh Durgin 
3231593a9e7bSAlex Elder 	/* We use the default size, but let's be explicit about it. */
3232593a9e7bSAlex Elder 	blk_queue_physical_block_size(q, SECTOR_SIZE);
3233593a9e7bSAlex Elder 
3234029bcbd8SJosh Durgin 	/* set io sizes to object size */
3235593a9e7bSAlex Elder 	segment_size = rbd_obj_bytes(&rbd_dev->header);
3236593a9e7bSAlex Elder 	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
3237593a9e7bSAlex Elder 	blk_queue_max_segment_size(q, segment_size);
3238593a9e7bSAlex Elder 	blk_queue_io_min(q, segment_size);
3239593a9e7bSAlex Elder 	blk_queue_io_opt(q, segment_size);
3240029bcbd8SJosh Durgin 
3241602adf40SYehuda Sadeh 	blk_queue_merge_bvec(q, rbd_merge_bvec);
3242602adf40SYehuda Sadeh 	disk->queue = q;
3243602adf40SYehuda Sadeh 
3244602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
3245602adf40SYehuda Sadeh 
3246602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
3247602adf40SYehuda Sadeh 
3248602adf40SYehuda Sadeh 	return 0;
3249602adf40SYehuda Sadeh out_disk:
3250602adf40SYehuda Sadeh 	put_disk(disk);
32511fcdb8aaSAlex Elder 
32521fcdb8aaSAlex Elder 	return -ENOMEM;
3253602adf40SYehuda Sadeh }
3254602adf40SYehuda Sadeh 
3255dfc5606dSYehuda Sadeh /*
3256dfc5606dSYehuda Sadeh   sysfs
3257dfc5606dSYehuda Sadeh */
3258602adf40SYehuda Sadeh 
3259593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
3260593a9e7bSAlex Elder {
3261593a9e7bSAlex Elder 	return container_of(dev, struct rbd_device, dev);
3262593a9e7bSAlex Elder }
3263593a9e7bSAlex Elder 
3264dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
3265dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3266602adf40SYehuda Sadeh {
3267593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3268dfc5606dSYehuda Sadeh 
3269fc71d833SAlex Elder 	return sprintf(buf, "%llu\n",
3270fc71d833SAlex Elder 		(unsigned long long)rbd_dev->mapping.size);
3271602adf40SYehuda Sadeh }
3272602adf40SYehuda Sadeh 
327334b13184SAlex Elder /*
327434b13184SAlex Elder  * Note this shows the features for whatever's mapped, which is not
327534b13184SAlex Elder  * necessarily the base image.
327634b13184SAlex Elder  */
327734b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev,
327834b13184SAlex Elder 			     struct device_attribute *attr, char *buf)
327934b13184SAlex Elder {
328034b13184SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
328134b13184SAlex Elder 
328234b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
328334b13184SAlex Elder 			(unsigned long long)rbd_dev->mapping.features);
328434b13184SAlex Elder }
328534b13184SAlex Elder 
3286dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
3287dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
3288602adf40SYehuda Sadeh {
3289593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3290dfc5606dSYehuda Sadeh 
3291fc71d833SAlex Elder 	if (rbd_dev->major)
3292dfc5606dSYehuda Sadeh 		return sprintf(buf, "%d\n", rbd_dev->major);
3293fc71d833SAlex Elder 
3294fc71d833SAlex Elder 	return sprintf(buf, "(none)\n");
3295fc71d833SAlex Elder 
3296dfc5606dSYehuda Sadeh }
3297dfc5606dSYehuda Sadeh 
3298dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
3299dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
3300dfc5606dSYehuda Sadeh {
3301593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3302dfc5606dSYehuda Sadeh 
33031dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
33041dbb4399SAlex Elder 			ceph_client_id(rbd_dev->rbd_client->client));
3305dfc5606dSYehuda Sadeh }
3306dfc5606dSYehuda Sadeh 
3307dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
3308dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3309dfc5606dSYehuda Sadeh {
3310593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3311dfc5606dSYehuda Sadeh 
33120d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
3313dfc5606dSYehuda Sadeh }
3314dfc5606dSYehuda Sadeh 
33159bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
33169bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
33179bb2f334SAlex Elder {
33189bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
33199bb2f334SAlex Elder 
33200d7dbfceSAlex Elder 	return sprintf(buf, "%llu\n",
33210d7dbfceSAlex Elder 			(unsigned long long) rbd_dev->spec->pool_id);
33229bb2f334SAlex Elder }
33239bb2f334SAlex Elder 
3324dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
3325dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3326dfc5606dSYehuda Sadeh {
3327593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3328dfc5606dSYehuda Sadeh 
3329a92ffdf8SAlex Elder 	if (rbd_dev->spec->image_name)
33300d7dbfceSAlex Elder 		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
3331a92ffdf8SAlex Elder 
3332a92ffdf8SAlex Elder 	return sprintf(buf, "(unknown)\n");
3333dfc5606dSYehuda Sadeh }
3334dfc5606dSYehuda Sadeh 
3335589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev,
3336589d30e0SAlex Elder 			     struct device_attribute *attr, char *buf)
3337589d30e0SAlex Elder {
3338589d30e0SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3339589d30e0SAlex Elder 
33400d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
3341589d30e0SAlex Elder }
3342589d30e0SAlex Elder 
334334b13184SAlex Elder /*
334434b13184SAlex Elder  * Shows the name of the currently-mapped snapshot (or
334534b13184SAlex Elder  * RBD_SNAP_HEAD_NAME for the base image).
334634b13184SAlex Elder  */
3347dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
3348dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
3349dfc5606dSYehuda Sadeh 			     char *buf)
3350dfc5606dSYehuda Sadeh {
3351593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3352dfc5606dSYehuda Sadeh 
33530d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
3354dfc5606dSYehuda Sadeh }
3355dfc5606dSYehuda Sadeh 
335686b00e0dSAlex Elder /*
335786b00e0dSAlex Elder  * For an rbd v2 image, shows the pool id, image id, and snapshot id
335886b00e0dSAlex Elder  * for the parent image.  If there is no parent, simply shows
335986b00e0dSAlex Elder  * "(no parent image)".
336086b00e0dSAlex Elder  */
336186b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev,
336286b00e0dSAlex Elder 			     struct device_attribute *attr,
336386b00e0dSAlex Elder 			     char *buf)
336486b00e0dSAlex Elder {
336586b00e0dSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
336686b00e0dSAlex Elder 	struct rbd_spec *spec = rbd_dev->parent_spec;
336786b00e0dSAlex Elder 	int count;
336886b00e0dSAlex Elder 	char *bufp = buf;
336986b00e0dSAlex Elder 
337086b00e0dSAlex Elder 	if (!spec)
337186b00e0dSAlex Elder 		return sprintf(buf, "(no parent image)\n");
337286b00e0dSAlex Elder 
337386b00e0dSAlex Elder 	count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
337486b00e0dSAlex Elder 			(unsigned long long) spec->pool_id, spec->pool_name);
337586b00e0dSAlex Elder 	if (count < 0)
337686b00e0dSAlex Elder 		return count;
337786b00e0dSAlex Elder 	bufp += count;
337886b00e0dSAlex Elder 
337986b00e0dSAlex Elder 	count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
338086b00e0dSAlex Elder 			spec->image_name ? spec->image_name : "(unknown)");
338186b00e0dSAlex Elder 	if (count < 0)
338286b00e0dSAlex Elder 		return count;
338386b00e0dSAlex Elder 	bufp += count;
338486b00e0dSAlex Elder 
338586b00e0dSAlex Elder 	count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
338686b00e0dSAlex Elder 			(unsigned long long) spec->snap_id, spec->snap_name);
338786b00e0dSAlex Elder 	if (count < 0)
338886b00e0dSAlex Elder 		return count;
338986b00e0dSAlex Elder 	bufp += count;
339086b00e0dSAlex Elder 
339186b00e0dSAlex Elder 	count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
339286b00e0dSAlex Elder 	if (count < 0)
339386b00e0dSAlex Elder 		return count;
339486b00e0dSAlex Elder 	bufp += count;
339586b00e0dSAlex Elder 
339686b00e0dSAlex Elder 	return (ssize_t) (bufp - buf);
339786b00e0dSAlex Elder }
339886b00e0dSAlex Elder 
3399dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
3400dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
3401dfc5606dSYehuda Sadeh 				 const char *buf,
3402dfc5606dSYehuda Sadeh 				 size_t size)
3403dfc5606dSYehuda Sadeh {
3404593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3405b813623aSAlex Elder 	int ret;
3406602adf40SYehuda Sadeh 
3407cc4a38bdSAlex Elder 	ret = rbd_dev_refresh(rbd_dev);
3408b813623aSAlex Elder 
3409b813623aSAlex Elder 	return ret < 0 ? ret : size;
3410dfc5606dSYehuda Sadeh }
3411602adf40SYehuda Sadeh 
3412dfc5606dSYehuda Sadeh static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
341334b13184SAlex Elder static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
3414dfc5606dSYehuda Sadeh static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
3415dfc5606dSYehuda Sadeh static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
3416dfc5606dSYehuda Sadeh static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
34179bb2f334SAlex Elder static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
3418dfc5606dSYehuda Sadeh static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
3419589d30e0SAlex Elder static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
3420dfc5606dSYehuda Sadeh static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
3421dfc5606dSYehuda Sadeh static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
342286b00e0dSAlex Elder static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
3423dfc5606dSYehuda Sadeh 
3424dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = {
3425dfc5606dSYehuda Sadeh 	&dev_attr_size.attr,
342634b13184SAlex Elder 	&dev_attr_features.attr,
3427dfc5606dSYehuda Sadeh 	&dev_attr_major.attr,
3428dfc5606dSYehuda Sadeh 	&dev_attr_client_id.attr,
3429dfc5606dSYehuda Sadeh 	&dev_attr_pool.attr,
34309bb2f334SAlex Elder 	&dev_attr_pool_id.attr,
3431dfc5606dSYehuda Sadeh 	&dev_attr_name.attr,
3432589d30e0SAlex Elder 	&dev_attr_image_id.attr,
3433dfc5606dSYehuda Sadeh 	&dev_attr_current_snap.attr,
343486b00e0dSAlex Elder 	&dev_attr_parent.attr,
3435dfc5606dSYehuda Sadeh 	&dev_attr_refresh.attr,
3436dfc5606dSYehuda Sadeh 	NULL
3437dfc5606dSYehuda Sadeh };
3438dfc5606dSYehuda Sadeh 
3439dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = {
3440dfc5606dSYehuda Sadeh 	.attrs = rbd_attrs,
3441dfc5606dSYehuda Sadeh };
3442dfc5606dSYehuda Sadeh 
3443dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = {
3444dfc5606dSYehuda Sadeh 	&rbd_attr_group,
3445dfc5606dSYehuda Sadeh 	NULL
3446dfc5606dSYehuda Sadeh };
3447dfc5606dSYehuda Sadeh 
/*
 * Release callback installed in rbd_device_type.  It intentionally
 * does nothing.
 * NOTE(review): presumably the rbd_device embedding this struct device
 * is freed elsewhere (see rbd_dev_destroy()) -- confirm the lifetime.
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
	/* nothing to release here */
}
3451dfc5606dSYehuda Sadeh 
3452dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = {
3453dfc5606dSYehuda Sadeh 	.name		= "rbd",
3454dfc5606dSYehuda Sadeh 	.groups		= rbd_attr_groups,
3455dfc5606dSYehuda Sadeh 	.release	= rbd_sysfs_dev_release,
3456dfc5606dSYehuda Sadeh };
3457dfc5606dSYehuda Sadeh 
34588b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
34598b8fb99cSAlex Elder {
34608b8fb99cSAlex Elder 	kref_get(&spec->kref);
34618b8fb99cSAlex Elder 
34628b8fb99cSAlex Elder 	return spec;
34638b8fb99cSAlex Elder }
34648b8fb99cSAlex Elder 
34658b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref);
34668b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec)
34678b8fb99cSAlex Elder {
34688b8fb99cSAlex Elder 	if (spec)
34698b8fb99cSAlex Elder 		kref_put(&spec->kref, rbd_spec_free);
34708b8fb99cSAlex Elder }
34718b8fb99cSAlex Elder 
34728b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void)
34738b8fb99cSAlex Elder {
34748b8fb99cSAlex Elder 	struct rbd_spec *spec;
34758b8fb99cSAlex Elder 
34768b8fb99cSAlex Elder 	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
34778b8fb99cSAlex Elder 	if (!spec)
34788b8fb99cSAlex Elder 		return NULL;
34798b8fb99cSAlex Elder 	kref_init(&spec->kref);
34808b8fb99cSAlex Elder 
34818b8fb99cSAlex Elder 	return spec;
34828b8fb99cSAlex Elder }
34838b8fb99cSAlex Elder 
34848b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref)
34858b8fb99cSAlex Elder {
34868b8fb99cSAlex Elder 	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
34878b8fb99cSAlex Elder 
34888b8fb99cSAlex Elder 	kfree(spec->pool_name);
34898b8fb99cSAlex Elder 	kfree(spec->image_id);
34908b8fb99cSAlex Elder 	kfree(spec->image_name);
34918b8fb99cSAlex Elder 	kfree(spec->snap_name);
34928b8fb99cSAlex Elder 	kfree(spec);
34938b8fb99cSAlex Elder }
34948b8fb99cSAlex Elder 
3495cc344fa1SAlex Elder static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
3496c53d5893SAlex Elder 				struct rbd_spec *spec)
3497c53d5893SAlex Elder {
3498c53d5893SAlex Elder 	struct rbd_device *rbd_dev;
3499c53d5893SAlex Elder 
3500c53d5893SAlex Elder 	rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
3501c53d5893SAlex Elder 	if (!rbd_dev)
3502c53d5893SAlex Elder 		return NULL;
3503c53d5893SAlex Elder 
3504c53d5893SAlex Elder 	spin_lock_init(&rbd_dev->lock);
35056d292906SAlex Elder 	rbd_dev->flags = 0;
3506c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->node);
3507c53d5893SAlex Elder 	init_rwsem(&rbd_dev->header_rwsem);
3508c53d5893SAlex Elder 
3509c53d5893SAlex Elder 	rbd_dev->spec = spec;
3510c53d5893SAlex Elder 	rbd_dev->rbd_client = rbdc;
3511c53d5893SAlex Elder 
35120903e875SAlex Elder 	/* Initialize the layout used for all rbd requests */
35130903e875SAlex Elder 
35140903e875SAlex Elder 	rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
35150903e875SAlex Elder 	rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
35160903e875SAlex Elder 	rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
35170903e875SAlex Elder 	rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
35180903e875SAlex Elder 
3519c53d5893SAlex Elder 	return rbd_dev;
3520c53d5893SAlex Elder }
3521c53d5893SAlex Elder 
3522c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev)
3523c53d5893SAlex Elder {
3524c53d5893SAlex Elder 	rbd_put_client(rbd_dev->rbd_client);
3525c53d5893SAlex Elder 	rbd_spec_put(rbd_dev->spec);
3526c53d5893SAlex Elder 	kfree(rbd_dev);
3527c53d5893SAlex Elder }
3528c53d5893SAlex Elder 
3529dfc5606dSYehuda Sadeh /*
35309d475de5SAlex Elder  * Get the size and object order for an image snapshot, or if
35319d475de5SAlex Elder  * snap_id is CEPH_NOSNAP, gets this information for the base
35329d475de5SAlex Elder  * image.
35339d475de5SAlex Elder  */
35349d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
35359d475de5SAlex Elder 				u8 *order, u64 *snap_size)
35369d475de5SAlex Elder {
35379d475de5SAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
35389d475de5SAlex Elder 	int ret;
35399d475de5SAlex Elder 	struct {
35409d475de5SAlex Elder 		u8 order;
35419d475de5SAlex Elder 		__le64 size;
35429d475de5SAlex Elder 	} __attribute__ ((packed)) size_buf = { 0 };
35439d475de5SAlex Elder 
354436be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
35459d475de5SAlex Elder 				"rbd", "get_size",
35464157976bSAlex Elder 				&snapid, sizeof (snapid),
3547e2a58ee5SAlex Elder 				&size_buf, sizeof (size_buf));
354836be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
35499d475de5SAlex Elder 	if (ret < 0)
35509d475de5SAlex Elder 		return ret;
355157385b51SAlex Elder 	if (ret < sizeof (size_buf))
355257385b51SAlex Elder 		return -ERANGE;
35539d475de5SAlex Elder 
3554c86f86e9SAlex Elder 	if (order)
35559d475de5SAlex Elder 		*order = size_buf.order;
35569d475de5SAlex Elder 	*snap_size = le64_to_cpu(size_buf.size);
35579d475de5SAlex Elder 
35589d475de5SAlex Elder 	dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
35599d475de5SAlex Elder 		(unsigned long long)snap_id, (unsigned int)*order,
35609d475de5SAlex Elder 		(unsigned long long)*snap_size);
35619d475de5SAlex Elder 
35629d475de5SAlex Elder 	return 0;
35639d475de5SAlex Elder }
35649d475de5SAlex Elder 
35659d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
35669d475de5SAlex Elder {
35679d475de5SAlex Elder 	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
35689d475de5SAlex Elder 					&rbd_dev->header.obj_order,
35699d475de5SAlex Elder 					&rbd_dev->header.image_size);
35709d475de5SAlex Elder }
35719d475de5SAlex Elder 
35721e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
35731e130199SAlex Elder {
35741e130199SAlex Elder 	void *reply_buf;
35751e130199SAlex Elder 	int ret;
35761e130199SAlex Elder 	void *p;
35771e130199SAlex Elder 
35781e130199SAlex Elder 	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
35791e130199SAlex Elder 	if (!reply_buf)
35801e130199SAlex Elder 		return -ENOMEM;
35811e130199SAlex Elder 
358236be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
35834157976bSAlex Elder 				"rbd", "get_object_prefix", NULL, 0,
3584e2a58ee5SAlex Elder 				reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
358536be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
35861e130199SAlex Elder 	if (ret < 0)
35871e130199SAlex Elder 		goto out;
35881e130199SAlex Elder 
35891e130199SAlex Elder 	p = reply_buf;
35901e130199SAlex Elder 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
359157385b51SAlex Elder 						p + ret, NULL, GFP_NOIO);
359257385b51SAlex Elder 	ret = 0;
35931e130199SAlex Elder 
35941e130199SAlex Elder 	if (IS_ERR(rbd_dev->header.object_prefix)) {
35951e130199SAlex Elder 		ret = PTR_ERR(rbd_dev->header.object_prefix);
35961e130199SAlex Elder 		rbd_dev->header.object_prefix = NULL;
35971e130199SAlex Elder 	} else {
35981e130199SAlex Elder 		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
35991e130199SAlex Elder 	}
36001e130199SAlex Elder out:
36011e130199SAlex Elder 	kfree(reply_buf);
36021e130199SAlex Elder 
36031e130199SAlex Elder 	return ret;
36041e130199SAlex Elder }
36051e130199SAlex Elder 
3606b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
3607b1b5402aSAlex Elder 		u64 *snap_features)
3608b1b5402aSAlex Elder {
3609b1b5402aSAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
3610b1b5402aSAlex Elder 	struct {
3611b1b5402aSAlex Elder 		__le64 features;
3612b1b5402aSAlex Elder 		__le64 incompat;
36134157976bSAlex Elder 	} __attribute__ ((packed)) features_buf = { 0 };
3614d889140cSAlex Elder 	u64 incompat;
3615b1b5402aSAlex Elder 	int ret;
3616b1b5402aSAlex Elder 
361736be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3618b1b5402aSAlex Elder 				"rbd", "get_features",
36194157976bSAlex Elder 				&snapid, sizeof (snapid),
3620e2a58ee5SAlex Elder 				&features_buf, sizeof (features_buf));
362136be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3622b1b5402aSAlex Elder 	if (ret < 0)
3623b1b5402aSAlex Elder 		return ret;
362457385b51SAlex Elder 	if (ret < sizeof (features_buf))
362557385b51SAlex Elder 		return -ERANGE;
3626d889140cSAlex Elder 
3627d889140cSAlex Elder 	incompat = le64_to_cpu(features_buf.incompat);
36285cbf6f12SAlex Elder 	if (incompat & ~RBD_FEATURES_SUPPORTED)
3629b8f5c6edSAlex Elder 		return -ENXIO;
3630d889140cSAlex Elder 
3631b1b5402aSAlex Elder 	*snap_features = le64_to_cpu(features_buf.features);
3632b1b5402aSAlex Elder 
3633b1b5402aSAlex Elder 	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
3634b1b5402aSAlex Elder 		(unsigned long long)snap_id,
3635b1b5402aSAlex Elder 		(unsigned long long)*snap_features,
3636b1b5402aSAlex Elder 		(unsigned long long)le64_to_cpu(features_buf.incompat));
3637b1b5402aSAlex Elder 
3638b1b5402aSAlex Elder 	return 0;
3639b1b5402aSAlex Elder }
3640b1b5402aSAlex Elder 
3641b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
3642b1b5402aSAlex Elder {
3643b1b5402aSAlex Elder 	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
3644b1b5402aSAlex Elder 						&rbd_dev->header.features);
3645b1b5402aSAlex Elder }
3646b1b5402aSAlex Elder 
364786b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
364886b00e0dSAlex Elder {
364986b00e0dSAlex Elder 	struct rbd_spec *parent_spec;
365086b00e0dSAlex Elder 	size_t size;
365186b00e0dSAlex Elder 	void *reply_buf = NULL;
365286b00e0dSAlex Elder 	__le64 snapid;
365386b00e0dSAlex Elder 	void *p;
365486b00e0dSAlex Elder 	void *end;
365586b00e0dSAlex Elder 	char *image_id;
365686b00e0dSAlex Elder 	u64 overlap;
365786b00e0dSAlex Elder 	int ret;
365886b00e0dSAlex Elder 
365986b00e0dSAlex Elder 	parent_spec = rbd_spec_alloc();
366086b00e0dSAlex Elder 	if (!parent_spec)
366186b00e0dSAlex Elder 		return -ENOMEM;
366286b00e0dSAlex Elder 
366386b00e0dSAlex Elder 	size = sizeof (__le64) +				/* pool_id */
366486b00e0dSAlex Elder 		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
366586b00e0dSAlex Elder 		sizeof (__le64) +				/* snap_id */
366686b00e0dSAlex Elder 		sizeof (__le64);				/* overlap */
366786b00e0dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
366886b00e0dSAlex Elder 	if (!reply_buf) {
366986b00e0dSAlex Elder 		ret = -ENOMEM;
367086b00e0dSAlex Elder 		goto out_err;
367186b00e0dSAlex Elder 	}
367286b00e0dSAlex Elder 
367386b00e0dSAlex Elder 	snapid = cpu_to_le64(CEPH_NOSNAP);
367436be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
367586b00e0dSAlex Elder 				"rbd", "get_parent",
36764157976bSAlex Elder 				&snapid, sizeof (snapid),
3677e2a58ee5SAlex Elder 				reply_buf, size);
367836be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
367986b00e0dSAlex Elder 	if (ret < 0)
368086b00e0dSAlex Elder 		goto out_err;
368186b00e0dSAlex Elder 
368286b00e0dSAlex Elder 	p = reply_buf;
368357385b51SAlex Elder 	end = reply_buf + ret;
368457385b51SAlex Elder 	ret = -ERANGE;
368586b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
368686b00e0dSAlex Elder 	if (parent_spec->pool_id == CEPH_NOPOOL)
368786b00e0dSAlex Elder 		goto out;	/* No parent?  No problem. */
368886b00e0dSAlex Elder 
36890903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
36900903e875SAlex Elder 
36910903e875SAlex Elder 	ret = -EIO;
3692c0cd10dbSAlex Elder 	if (parent_spec->pool_id > (u64)U32_MAX) {
3693c0cd10dbSAlex Elder 		rbd_warn(NULL, "parent pool id too large (%llu > %u)\n",
3694c0cd10dbSAlex Elder 			(unsigned long long)parent_spec->pool_id, U32_MAX);
369557385b51SAlex Elder 		goto out_err;
3696c0cd10dbSAlex Elder 	}
36970903e875SAlex Elder 
3698979ed480SAlex Elder 	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
369986b00e0dSAlex Elder 	if (IS_ERR(image_id)) {
370086b00e0dSAlex Elder 		ret = PTR_ERR(image_id);
370186b00e0dSAlex Elder 		goto out_err;
370286b00e0dSAlex Elder 	}
370386b00e0dSAlex Elder 	parent_spec->image_id = image_id;
370486b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
370586b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, overlap, out_err);
370686b00e0dSAlex Elder 
370786b00e0dSAlex Elder 	rbd_dev->parent_overlap = overlap;
370886b00e0dSAlex Elder 	rbd_dev->parent_spec = parent_spec;
370986b00e0dSAlex Elder 	parent_spec = NULL;	/* rbd_dev now owns this */
371086b00e0dSAlex Elder out:
371186b00e0dSAlex Elder 	ret = 0;
371286b00e0dSAlex Elder out_err:
371386b00e0dSAlex Elder 	kfree(reply_buf);
371486b00e0dSAlex Elder 	rbd_spec_put(parent_spec);
371586b00e0dSAlex Elder 
371686b00e0dSAlex Elder 	return ret;
371786b00e0dSAlex Elder }
371886b00e0dSAlex Elder 
3719cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
3720cc070d59SAlex Elder {
3721cc070d59SAlex Elder 	struct {
3722cc070d59SAlex Elder 		__le64 stripe_unit;
3723cc070d59SAlex Elder 		__le64 stripe_count;
3724cc070d59SAlex Elder 	} __attribute__ ((packed)) striping_info_buf = { 0 };
3725cc070d59SAlex Elder 	size_t size = sizeof (striping_info_buf);
3726cc070d59SAlex Elder 	void *p;
3727cc070d59SAlex Elder 	u64 obj_size;
3728cc070d59SAlex Elder 	u64 stripe_unit;
3729cc070d59SAlex Elder 	u64 stripe_count;
3730cc070d59SAlex Elder 	int ret;
3731cc070d59SAlex Elder 
3732cc070d59SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3733cc070d59SAlex Elder 				"rbd", "get_stripe_unit_count", NULL, 0,
3734e2a58ee5SAlex Elder 				(char *)&striping_info_buf, size);
3735cc070d59SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3736cc070d59SAlex Elder 	if (ret < 0)
3737cc070d59SAlex Elder 		return ret;
3738cc070d59SAlex Elder 	if (ret < size)
3739cc070d59SAlex Elder 		return -ERANGE;
3740cc070d59SAlex Elder 
3741cc070d59SAlex Elder 	/*
3742cc070d59SAlex Elder 	 * We don't actually support the "fancy striping" feature
3743cc070d59SAlex Elder 	 * (STRIPINGV2) yet, but if the striping sizes are the
3744cc070d59SAlex Elder 	 * defaults the behavior is the same as before.  So find
3745cc070d59SAlex Elder 	 * out, and only fail if the image has non-default values.
3746cc070d59SAlex Elder 	 */
3747cc070d59SAlex Elder 	ret = -EINVAL;
3748cc070d59SAlex Elder 	obj_size = (u64)1 << rbd_dev->header.obj_order;
3749cc070d59SAlex Elder 	p = &striping_info_buf;
3750cc070d59SAlex Elder 	stripe_unit = ceph_decode_64(&p);
3751cc070d59SAlex Elder 	if (stripe_unit != obj_size) {
3752cc070d59SAlex Elder 		rbd_warn(rbd_dev, "unsupported stripe unit "
3753cc070d59SAlex Elder 				"(got %llu want %llu)",
3754cc070d59SAlex Elder 				stripe_unit, obj_size);
3755cc070d59SAlex Elder 		return -EINVAL;
3756cc070d59SAlex Elder 	}
3757cc070d59SAlex Elder 	stripe_count = ceph_decode_64(&p);
3758cc070d59SAlex Elder 	if (stripe_count != 1) {
3759cc070d59SAlex Elder 		rbd_warn(rbd_dev, "unsupported stripe count "
3760cc070d59SAlex Elder 				"(got %llu want 1)", stripe_count);
3761cc070d59SAlex Elder 		return -EINVAL;
3762cc070d59SAlex Elder 	}
3763500d0c0fSAlex Elder 	rbd_dev->header.stripe_unit = stripe_unit;
3764500d0c0fSAlex Elder 	rbd_dev->header.stripe_count = stripe_count;
3765cc070d59SAlex Elder 
3766cc070d59SAlex Elder 	return 0;
3767cc070d59SAlex Elder }
3768cc070d59SAlex Elder 
37699e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
37709e15b77dSAlex Elder {
37719e15b77dSAlex Elder 	size_t image_id_size;
37729e15b77dSAlex Elder 	char *image_id;
37739e15b77dSAlex Elder 	void *p;
37749e15b77dSAlex Elder 	void *end;
37759e15b77dSAlex Elder 	size_t size;
37769e15b77dSAlex Elder 	void *reply_buf = NULL;
37779e15b77dSAlex Elder 	size_t len = 0;
37789e15b77dSAlex Elder 	char *image_name = NULL;
37799e15b77dSAlex Elder 	int ret;
37809e15b77dSAlex Elder 
37819e15b77dSAlex Elder 	rbd_assert(!rbd_dev->spec->image_name);
37829e15b77dSAlex Elder 
378369e7a02fSAlex Elder 	len = strlen(rbd_dev->spec->image_id);
378469e7a02fSAlex Elder 	image_id_size = sizeof (__le32) + len;
37859e15b77dSAlex Elder 	image_id = kmalloc(image_id_size, GFP_KERNEL);
37869e15b77dSAlex Elder 	if (!image_id)
37879e15b77dSAlex Elder 		return NULL;
37889e15b77dSAlex Elder 
37899e15b77dSAlex Elder 	p = image_id;
37904157976bSAlex Elder 	end = image_id + image_id_size;
379169e7a02fSAlex Elder 	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
37929e15b77dSAlex Elder 
37939e15b77dSAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
37949e15b77dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
37959e15b77dSAlex Elder 	if (!reply_buf)
37969e15b77dSAlex Elder 		goto out;
37979e15b77dSAlex Elder 
379836be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
37999e15b77dSAlex Elder 				"rbd", "dir_get_name",
38009e15b77dSAlex Elder 				image_id, image_id_size,
3801e2a58ee5SAlex Elder 				reply_buf, size);
38029e15b77dSAlex Elder 	if (ret < 0)
38039e15b77dSAlex Elder 		goto out;
38049e15b77dSAlex Elder 	p = reply_buf;
3805f40eb349SAlex Elder 	end = reply_buf + ret;
3806f40eb349SAlex Elder 
38079e15b77dSAlex Elder 	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
38089e15b77dSAlex Elder 	if (IS_ERR(image_name))
38099e15b77dSAlex Elder 		image_name = NULL;
38109e15b77dSAlex Elder 	else
38119e15b77dSAlex Elder 		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
38129e15b77dSAlex Elder out:
38139e15b77dSAlex Elder 	kfree(reply_buf);
38149e15b77dSAlex Elder 	kfree(image_id);
38159e15b77dSAlex Elder 
38169e15b77dSAlex Elder 	return image_name;
38179e15b77dSAlex Elder }
38189e15b77dSAlex Elder 
38192ad3d716SAlex Elder static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
38202ad3d716SAlex Elder {
38212ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
38222ad3d716SAlex Elder 	const char *snap_name;
38232ad3d716SAlex Elder 	u32 which = 0;
38242ad3d716SAlex Elder 
38252ad3d716SAlex Elder 	/* Skip over names until we find the one we are looking for */
38262ad3d716SAlex Elder 
38272ad3d716SAlex Elder 	snap_name = rbd_dev->header.snap_names;
38282ad3d716SAlex Elder 	while (which < snapc->num_snaps) {
38292ad3d716SAlex Elder 		if (!strcmp(name, snap_name))
38302ad3d716SAlex Elder 			return snapc->snaps[which];
38312ad3d716SAlex Elder 		snap_name += strlen(snap_name) + 1;
38322ad3d716SAlex Elder 		which++;
38332ad3d716SAlex Elder 	}
38342ad3d716SAlex Elder 	return CEPH_NOSNAP;
38352ad3d716SAlex Elder }
38362ad3d716SAlex Elder 
38372ad3d716SAlex Elder static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
38382ad3d716SAlex Elder {
38392ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
38402ad3d716SAlex Elder 	u32 which;
38412ad3d716SAlex Elder 	bool found = false;
38422ad3d716SAlex Elder 	u64 snap_id;
38432ad3d716SAlex Elder 
38442ad3d716SAlex Elder 	for (which = 0; !found && which < snapc->num_snaps; which++) {
38452ad3d716SAlex Elder 		const char *snap_name;
38462ad3d716SAlex Elder 
38472ad3d716SAlex Elder 		snap_id = snapc->snaps[which];
38482ad3d716SAlex Elder 		snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
38492ad3d716SAlex Elder 		if (IS_ERR(snap_name))
38502ad3d716SAlex Elder 			break;
38512ad3d716SAlex Elder 		found = !strcmp(name, snap_name);
38522ad3d716SAlex Elder 		kfree(snap_name);
38532ad3d716SAlex Elder 	}
38542ad3d716SAlex Elder 	return found ? snap_id : CEPH_NOSNAP;
38552ad3d716SAlex Elder }
38562ad3d716SAlex Elder 
38572ad3d716SAlex Elder /*
38582ad3d716SAlex Elder  * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
38592ad3d716SAlex Elder  * no snapshot by that name is found, or if an error occurs.
38602ad3d716SAlex Elder  */
38612ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
38622ad3d716SAlex Elder {
38632ad3d716SAlex Elder 	if (rbd_dev->image_format == 1)
38642ad3d716SAlex Elder 		return rbd_v1_snap_id_by_name(rbd_dev, name);
38652ad3d716SAlex Elder 
38662ad3d716SAlex Elder 	return rbd_v2_snap_id_by_name(rbd_dev, name);
38672ad3d716SAlex Elder }
38682ad3d716SAlex Elder 
38699e15b77dSAlex Elder /*
38702e9f7f1cSAlex Elder  * When an rbd image has a parent image, it is identified by the
38712e9f7f1cSAlex Elder  * pool, image, and snapshot ids (not names).  This function fills
38722e9f7f1cSAlex Elder  * in the names for those ids.  (It's OK if we can't figure out the
38732e9f7f1cSAlex Elder  * name for an image id, but the pool and snapshot ids should always
38742e9f7f1cSAlex Elder  * exist and have names.)  All names in an rbd spec are dynamically
38752e9f7f1cSAlex Elder  * allocated.
3876e1d4213fSAlex Elder  *
3877e1d4213fSAlex Elder  * When an image being mapped (not a parent) is probed, we have the
3878e1d4213fSAlex Elder  * pool name and pool id, image name and image id, and the snapshot
3879e1d4213fSAlex Elder  * name.  The only thing we're missing is the snapshot id.
38809e15b77dSAlex Elder  */
38812e9f7f1cSAlex Elder static int rbd_dev_spec_update(struct rbd_device *rbd_dev)
38829e15b77dSAlex Elder {
38832e9f7f1cSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
38842e9f7f1cSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
38852e9f7f1cSAlex Elder 	const char *pool_name;
38862e9f7f1cSAlex Elder 	const char *image_name;
38872e9f7f1cSAlex Elder 	const char *snap_name;
38889e15b77dSAlex Elder 	int ret;
38899e15b77dSAlex Elder 
3890e1d4213fSAlex Elder 	/*
3891e1d4213fSAlex Elder 	 * An image being mapped will have the pool name (etc.), but
3892e1d4213fSAlex Elder 	 * we need to look up the snapshot id.
3893e1d4213fSAlex Elder 	 */
38942e9f7f1cSAlex Elder 	if (spec->pool_name) {
38952e9f7f1cSAlex Elder 		if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
38962ad3d716SAlex Elder 			u64 snap_id;
3897e1d4213fSAlex Elder 
38982ad3d716SAlex Elder 			snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
38992ad3d716SAlex Elder 			if (snap_id == CEPH_NOSNAP)
3900e1d4213fSAlex Elder 				return -ENOENT;
39012ad3d716SAlex Elder 			spec->snap_id = snap_id;
3902e1d4213fSAlex Elder 		} else {
39032e9f7f1cSAlex Elder 			spec->snap_id = CEPH_NOSNAP;
3904e1d4213fSAlex Elder 		}
3905e1d4213fSAlex Elder 
3906e1d4213fSAlex Elder 		return 0;
3907e1d4213fSAlex Elder 	}
39089e15b77dSAlex Elder 
39092e9f7f1cSAlex Elder 	/* Get the pool name; we have to make our own copy of this */
39109e15b77dSAlex Elder 
39112e9f7f1cSAlex Elder 	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
39122e9f7f1cSAlex Elder 	if (!pool_name) {
39132e9f7f1cSAlex Elder 		rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
3914935dc89fSAlex Elder 		return -EIO;
3915935dc89fSAlex Elder 	}
39162e9f7f1cSAlex Elder 	pool_name = kstrdup(pool_name, GFP_KERNEL);
39172e9f7f1cSAlex Elder 	if (!pool_name)
39189e15b77dSAlex Elder 		return -ENOMEM;
39199e15b77dSAlex Elder 
39209e15b77dSAlex Elder 	/* Fetch the image name; tolerate failure here */
39219e15b77dSAlex Elder 
39222e9f7f1cSAlex Elder 	image_name = rbd_dev_image_name(rbd_dev);
39232e9f7f1cSAlex Elder 	if (!image_name)
392406ecc6cbSAlex Elder 		rbd_warn(rbd_dev, "unable to get image name");
39259e15b77dSAlex Elder 
39262e9f7f1cSAlex Elder 	/* Look up the snapshot name, and make a copy */
39279e15b77dSAlex Elder 
39282e9f7f1cSAlex Elder 	snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
39292e9f7f1cSAlex Elder 	if (!snap_name) {
39302e9f7f1cSAlex Elder 		ret = -ENOMEM;
39319e15b77dSAlex Elder 		goto out_err;
39322e9f7f1cSAlex Elder 	}
39332e9f7f1cSAlex Elder 
39342e9f7f1cSAlex Elder 	spec->pool_name = pool_name;
39352e9f7f1cSAlex Elder 	spec->image_name = image_name;
39362e9f7f1cSAlex Elder 	spec->snap_name = snap_name;
39379e15b77dSAlex Elder 
39389e15b77dSAlex Elder 	return 0;
39399e15b77dSAlex Elder out_err:
39402e9f7f1cSAlex Elder 	kfree(image_name);
39412e9f7f1cSAlex Elder 	kfree(pool_name);
39429e15b77dSAlex Elder 
39439e15b77dSAlex Elder 	return ret;
39449e15b77dSAlex Elder }
39459e15b77dSAlex Elder 
3946cc4a38bdSAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
394735d489f9SAlex Elder {
394835d489f9SAlex Elder 	size_t size;
394935d489f9SAlex Elder 	int ret;
395035d489f9SAlex Elder 	void *reply_buf;
395135d489f9SAlex Elder 	void *p;
395235d489f9SAlex Elder 	void *end;
395335d489f9SAlex Elder 	u64 seq;
395435d489f9SAlex Elder 	u32 snap_count;
395535d489f9SAlex Elder 	struct ceph_snap_context *snapc;
395635d489f9SAlex Elder 	u32 i;
395735d489f9SAlex Elder 
395835d489f9SAlex Elder 	/*
395935d489f9SAlex Elder 	 * We'll need room for the seq value (maximum snapshot id),
396035d489f9SAlex Elder 	 * snapshot count, and array of that many snapshot ids.
396135d489f9SAlex Elder 	 * For now we have a fixed upper limit on the number we're
396235d489f9SAlex Elder 	 * prepared to receive.
396335d489f9SAlex Elder 	 */
396435d489f9SAlex Elder 	size = sizeof (__le64) + sizeof (__le32) +
396535d489f9SAlex Elder 			RBD_MAX_SNAP_COUNT * sizeof (__le64);
396635d489f9SAlex Elder 	reply_buf = kzalloc(size, GFP_KERNEL);
396735d489f9SAlex Elder 	if (!reply_buf)
396835d489f9SAlex Elder 		return -ENOMEM;
396935d489f9SAlex Elder 
397036be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
39714157976bSAlex Elder 				"rbd", "get_snapcontext", NULL, 0,
3972e2a58ee5SAlex Elder 				reply_buf, size);
397336be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
397435d489f9SAlex Elder 	if (ret < 0)
397535d489f9SAlex Elder 		goto out;
397635d489f9SAlex Elder 
397735d489f9SAlex Elder 	p = reply_buf;
397857385b51SAlex Elder 	end = reply_buf + ret;
397957385b51SAlex Elder 	ret = -ERANGE;
398035d489f9SAlex Elder 	ceph_decode_64_safe(&p, end, seq, out);
398135d489f9SAlex Elder 	ceph_decode_32_safe(&p, end, snap_count, out);
398235d489f9SAlex Elder 
398335d489f9SAlex Elder 	/*
398435d489f9SAlex Elder 	 * Make sure the reported number of snapshot ids wouldn't go
398535d489f9SAlex Elder 	 * beyond the end of our buffer.  But before checking that,
398635d489f9SAlex Elder 	 * make sure the computed size of the snapshot context we
398735d489f9SAlex Elder 	 * allocate is representable in a size_t.
398835d489f9SAlex Elder 	 */
398935d489f9SAlex Elder 	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
399035d489f9SAlex Elder 				 / sizeof (u64)) {
399135d489f9SAlex Elder 		ret = -EINVAL;
399235d489f9SAlex Elder 		goto out;
399335d489f9SAlex Elder 	}
399435d489f9SAlex Elder 	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
399535d489f9SAlex Elder 		goto out;
3996468521c1SAlex Elder 	ret = 0;
399735d489f9SAlex Elder 
3998812164f8SAlex Elder 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
399935d489f9SAlex Elder 	if (!snapc) {
400035d489f9SAlex Elder 		ret = -ENOMEM;
400135d489f9SAlex Elder 		goto out;
400235d489f9SAlex Elder 	}
400335d489f9SAlex Elder 	snapc->seq = seq;
400435d489f9SAlex Elder 	for (i = 0; i < snap_count; i++)
400535d489f9SAlex Elder 		snapc->snaps[i] = ceph_decode_64(&p);
400635d489f9SAlex Elder 
400735d489f9SAlex Elder 	rbd_dev->header.snapc = snapc;
400835d489f9SAlex Elder 
400935d489f9SAlex Elder 	dout("  snap context seq = %llu, snap_count = %u\n",
401035d489f9SAlex Elder 		(unsigned long long)seq, (unsigned int)snap_count);
401135d489f9SAlex Elder out:
401235d489f9SAlex Elder 	kfree(reply_buf);
401335d489f9SAlex Elder 
401457385b51SAlex Elder 	return ret;
401535d489f9SAlex Elder }
401635d489f9SAlex Elder 
401754cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
401854cac61fSAlex Elder 					u64 snap_id)
4019b8b1e2dbSAlex Elder {
4020b8b1e2dbSAlex Elder 	size_t size;
4021b8b1e2dbSAlex Elder 	void *reply_buf;
402254cac61fSAlex Elder 	__le64 snapid;
4023b8b1e2dbSAlex Elder 	int ret;
4024b8b1e2dbSAlex Elder 	void *p;
4025b8b1e2dbSAlex Elder 	void *end;
4026b8b1e2dbSAlex Elder 	char *snap_name;
4027b8b1e2dbSAlex Elder 
4028b8b1e2dbSAlex Elder 	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
4029b8b1e2dbSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
4030b8b1e2dbSAlex Elder 	if (!reply_buf)
4031b8b1e2dbSAlex Elder 		return ERR_PTR(-ENOMEM);
4032b8b1e2dbSAlex Elder 
403354cac61fSAlex Elder 	snapid = cpu_to_le64(snap_id);
403436be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
4035b8b1e2dbSAlex Elder 				"rbd", "get_snapshot_name",
403654cac61fSAlex Elder 				&snapid, sizeof (snapid),
4037e2a58ee5SAlex Elder 				reply_buf, size);
403836be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4039f40eb349SAlex Elder 	if (ret < 0) {
4040f40eb349SAlex Elder 		snap_name = ERR_PTR(ret);
4041b8b1e2dbSAlex Elder 		goto out;
4042f40eb349SAlex Elder 	}
4043b8b1e2dbSAlex Elder 
4044b8b1e2dbSAlex Elder 	p = reply_buf;
4045f40eb349SAlex Elder 	end = reply_buf + ret;
4046e5c35534SAlex Elder 	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
4047f40eb349SAlex Elder 	if (IS_ERR(snap_name))
4048b8b1e2dbSAlex Elder 		goto out;
4049f40eb349SAlex Elder 
4050b8b1e2dbSAlex Elder 	dout("  snap_id 0x%016llx snap_name = %s\n",
405154cac61fSAlex Elder 		(unsigned long long)snap_id, snap_name);
4052b8b1e2dbSAlex Elder out:
4053b8b1e2dbSAlex Elder 	kfree(reply_buf);
4054b8b1e2dbSAlex Elder 
4055f40eb349SAlex Elder 	return snap_name;
4056b8b1e2dbSAlex Elder }
4057b8b1e2dbSAlex Elder 
4058cc4a38bdSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev)
4059117973fbSAlex Elder {
4060117973fbSAlex Elder 	int ret;
4061117973fbSAlex Elder 
4062117973fbSAlex Elder 	down_write(&rbd_dev->header_rwsem);
4063117973fbSAlex Elder 
4064117973fbSAlex Elder 	ret = rbd_dev_v2_image_size(rbd_dev);
4065117973fbSAlex Elder 	if (ret)
4066117973fbSAlex Elder 		goto out;
4067117973fbSAlex Elder 	rbd_update_mapping_size(rbd_dev);
4068117973fbSAlex Elder 
4069cc4a38bdSAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev);
4070117973fbSAlex Elder 	dout("rbd_dev_v2_snap_context returned %d\n", ret);
4071117973fbSAlex Elder 	if (ret)
4072117973fbSAlex Elder 		goto out;
4073117973fbSAlex Elder out:
4074117973fbSAlex Elder 	up_write(&rbd_dev->header_rwsem);
4075117973fbSAlex Elder 
4076117973fbSAlex Elder 	return ret;
4077117973fbSAlex Elder }
4078117973fbSAlex Elder 
4079dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
4080dfc5606dSYehuda Sadeh {
4081dfc5606dSYehuda Sadeh 	struct device *dev;
4082cd789ab9SAlex Elder 	int ret;
4083dfc5606dSYehuda Sadeh 
4084dfc5606dSYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
4085dfc5606dSYehuda Sadeh 
4086cd789ab9SAlex Elder 	dev = &rbd_dev->dev;
4087dfc5606dSYehuda Sadeh 	dev->bus = &rbd_bus_type;
4088dfc5606dSYehuda Sadeh 	dev->type = &rbd_device_type;
4089dfc5606dSYehuda Sadeh 	dev->parent = &rbd_root_dev;
4090200a6a8bSAlex Elder 	dev->release = rbd_dev_device_release;
4091de71a297SAlex Elder 	dev_set_name(dev, "%d", rbd_dev->dev_id);
4092dfc5606dSYehuda Sadeh 	ret = device_register(dev);
4093dfc5606dSYehuda Sadeh 
4094dfc5606dSYehuda Sadeh 	mutex_unlock(&ctl_mutex);
4095cd789ab9SAlex Elder 
4096dfc5606dSYehuda Sadeh 	return ret;
4097602adf40SYehuda Sadeh }
4098602adf40SYehuda Sadeh 
4099dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
4100dfc5606dSYehuda Sadeh {
4101dfc5606dSYehuda Sadeh 	device_unregister(&rbd_dev->dev);
4102dfc5606dSYehuda Sadeh }
4103dfc5606dSYehuda Sadeh 
4104e2839308SAlex Elder static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
41051ddbe94eSAlex Elder 
41061ddbe94eSAlex Elder /*
4107499afd5bSAlex Elder  * Get a unique rbd identifier for the given new rbd_dev, and add
4108499afd5bSAlex Elder  * the rbd_dev to the global list.  The minimum rbd id is 1.
41091ddbe94eSAlex Elder  */
4110e2839308SAlex Elder static void rbd_dev_id_get(struct rbd_device *rbd_dev)
4111b7f23c36SAlex Elder {
4112e2839308SAlex Elder 	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
4113499afd5bSAlex Elder 
4114499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
4115499afd5bSAlex Elder 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
4116499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
4117e2839308SAlex Elder 	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
4118e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
4119b7f23c36SAlex Elder }
4120b7f23c36SAlex Elder 
41211ddbe94eSAlex Elder /*
4122499afd5bSAlex Elder  * Remove an rbd_dev from the global list, and record that its
4123499afd5bSAlex Elder  * identifier is no longer in use.
41241ddbe94eSAlex Elder  */
4125e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev)
41261ddbe94eSAlex Elder {
4127d184f6bfSAlex Elder 	struct list_head *tmp;
4128de71a297SAlex Elder 	int rbd_id = rbd_dev->dev_id;
4129d184f6bfSAlex Elder 	int max_id;
4130d184f6bfSAlex Elder 
4131aafb230eSAlex Elder 	rbd_assert(rbd_id > 0);
4132499afd5bSAlex Elder 
4133e2839308SAlex Elder 	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
4134e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
4135499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
4136499afd5bSAlex Elder 	list_del_init(&rbd_dev->node);
4137d184f6bfSAlex Elder 
4138d184f6bfSAlex Elder 	/*
4139d184f6bfSAlex Elder 	 * If the id being "put" is not the current maximum, there
4140d184f6bfSAlex Elder 	 * is nothing special we need to do.
4141d184f6bfSAlex Elder 	 */
4142e2839308SAlex Elder 	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
4143d184f6bfSAlex Elder 		spin_unlock(&rbd_dev_list_lock);
4144d184f6bfSAlex Elder 		return;
4145d184f6bfSAlex Elder 	}
4146d184f6bfSAlex Elder 
4147d184f6bfSAlex Elder 	/*
4148d184f6bfSAlex Elder 	 * We need to update the current maximum id.  Search the
4149d184f6bfSAlex Elder 	 * list to find out what it is.  We're more likely to find
4150d184f6bfSAlex Elder 	 * the maximum at the end, so search the list backward.
4151d184f6bfSAlex Elder 	 */
4152d184f6bfSAlex Elder 	max_id = 0;
4153d184f6bfSAlex Elder 	list_for_each_prev(tmp, &rbd_dev_list) {
4154d184f6bfSAlex Elder 		struct rbd_device *rbd_dev;
4155d184f6bfSAlex Elder 
4156d184f6bfSAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
4157b213e0b1SAlex Elder 		if (rbd_dev->dev_id > max_id)
4158b213e0b1SAlex Elder 			max_id = rbd_dev->dev_id;
4159d184f6bfSAlex Elder 	}
4160499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
41611ddbe94eSAlex Elder 
41621ddbe94eSAlex Elder 	/*
4163e2839308SAlex Elder 	 * The max id could have been updated by rbd_dev_id_get(), in
4164d184f6bfSAlex Elder 	 * which case it now accurately reflects the new maximum.
4165d184f6bfSAlex Elder 	 * Be careful not to overwrite the maximum value in that
4166d184f6bfSAlex Elder 	 * case.
41671ddbe94eSAlex Elder 	 */
4168e2839308SAlex Elder 	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
4169e2839308SAlex Elder 	dout("  max dev id has been reset\n");
4170b7f23c36SAlex Elder }
4171b7f23c36SAlex Elder 
/*
 * Advance *buf past any leading white space so that it points at
 * the first character of the next token (or at the terminating '\0'
 * if there is none), and return the length of that token, i.e. the
 * number of consecutive non-white space characters found there.
 * The string at *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the "C"
	 * and "POSIX" locales.
	 */
	static const char spaces[] = " \f\n\r\t\v";
	const char *start = *buf + strspn(*buf, spaces);

	*buf = start;			/* Start of the token */

	return strcspn(start, spaces);	/* Length of the token */
}
4190e28fff26SAlex Elder 
/*
 * Locate the next token in *buf and, provided it fits, copy it into
 * the caller's token buffer followed by a terminating '\0'.  The
 * token buffer is left untouched if the token (plus its terminator)
 * would not fit in token_size bytes.  The string at *buf must be
 * terminated with '\0' on entry.
 *
 * Returns the length of the token found, not counting the '\0'.
 * The result is 0 if there is no token, and is >= token_size if the
 * token was too big to be copied.
 *
 * In every case, including when the token did not fit, *buf is
 * advanced to point just beyond the end of the token.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
4220e28fff26SAlex Elder 
4221e28fff26SAlex Elder /*
4222ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
4223ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
4224ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
4225ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
4226ea3352f4SAlex Elder  *
4227ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
4228ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
4229ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
4230ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
4231ea3352f4SAlex Elder  *
4232ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
4233ea3352f4SAlex Elder  * the end of the found token.
4234ea3352f4SAlex Elder  *
4235ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
4236ea3352f4SAlex Elder  */
4237ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
4238ea3352f4SAlex Elder {
4239ea3352f4SAlex Elder 	char *dup;
4240ea3352f4SAlex Elder 	size_t len;
4241ea3352f4SAlex Elder 
4242ea3352f4SAlex Elder 	len = next_token(buf);
42434caf35f9SAlex Elder 	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
4244ea3352f4SAlex Elder 	if (!dup)
4245ea3352f4SAlex Elder 		return NULL;
4246ea3352f4SAlex Elder 	*(dup + len) = '\0';
4247ea3352f4SAlex Elder 	*buf += len;
4248ea3352f4SAlex Elder 
4249ea3352f4SAlex Elder 	if (lenp)
4250ea3352f4SAlex Elder 		*lenp = len;
4251ea3352f4SAlex Elder 
4252ea3352f4SAlex Elder 	return dup;
4253ea3352f4SAlex Elder }
4254ea3352f4SAlex Elder 
4255ea3352f4SAlex Elder /*
4256859c31dfSAlex Elder  * Parse the options provided for an "rbd add" (i.e., rbd image
4257859c31dfSAlex Elder  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
4258859c31dfSAlex Elder  * and the data written is passed here via a NUL-terminated buffer.
4259859c31dfSAlex Elder  * Returns 0 if successful or an error code otherwise.
4260d22f76e7SAlex Elder  *
4261859c31dfSAlex Elder  * The information extracted from these options is recorded in
4262859c31dfSAlex Elder  * the other parameters which return dynamically-allocated
4263859c31dfSAlex Elder  * structures:
4264859c31dfSAlex Elder  *  ceph_opts
4265859c31dfSAlex Elder  *      The address of a pointer that will refer to a ceph options
4266859c31dfSAlex Elder  *      structure.  Caller must release the returned pointer using
4267859c31dfSAlex Elder  *      ceph_destroy_options() when it is no longer needed.
4268859c31dfSAlex Elder  *  rbd_opts
4269859c31dfSAlex Elder  *	Address of an rbd options pointer.  Fully initialized by
4270859c31dfSAlex Elder  *	this function; caller must release with kfree().
4271859c31dfSAlex Elder  *  spec
4272859c31dfSAlex Elder  *	Address of an rbd image specification pointer.  Fully
4273859c31dfSAlex Elder  *	initialized by this function based on parsed options.
4274859c31dfSAlex Elder  *	Caller must release with rbd_spec_put().
4275859c31dfSAlex Elder  *
4276859c31dfSAlex Elder  * The options passed take this form:
4277859c31dfSAlex Elder  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
4278859c31dfSAlex Elder  * where:
4279859c31dfSAlex Elder  *  <mon_addrs>
4280859c31dfSAlex Elder  *      A comma-separated list of one or more monitor addresses.
4281859c31dfSAlex Elder  *      A monitor address is an ip address, optionally followed
4282859c31dfSAlex Elder  *      by a port number (separated by a colon).
4283859c31dfSAlex Elder  *        I.e.:  ip1[:port1][,ip2[:port2]...]
4284859c31dfSAlex Elder  *  <options>
4285859c31dfSAlex Elder  *      A comma-separated list of ceph and/or rbd options.
4286859c31dfSAlex Elder  *  <pool_name>
4287859c31dfSAlex Elder  *      The name of the rados pool containing the rbd image.
4288859c31dfSAlex Elder  *  <image_name>
4289859c31dfSAlex Elder  *      The name of the image in that pool to map.
4290859c31dfSAlex Elder  *  <snap_id>
4291859c31dfSAlex Elder  *      An optional snapshot id.  If provided, the mapping will
4292859c31dfSAlex Elder  *      present data from the image at the time that snapshot was
4293859c31dfSAlex Elder  *      created.  The image head is used if no snapshot id is
4294859c31dfSAlex Elder  *      provided.  Snapshot mappings are always read-only.
4295a725f65eSAlex Elder  */
4296859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf,
4297dc79b113SAlex Elder 				struct ceph_options **ceph_opts,
4298859c31dfSAlex Elder 				struct rbd_options **opts,
4299859c31dfSAlex Elder 				struct rbd_spec **rbd_spec)
4300a725f65eSAlex Elder {
4301e28fff26SAlex Elder 	size_t len;
4302859c31dfSAlex Elder 	char *options;
43030ddebc0cSAlex Elder 	const char *mon_addrs;
4304ecb4dc22SAlex Elder 	char *snap_name;
43050ddebc0cSAlex Elder 	size_t mon_addrs_size;
4306859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
43074e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
4308859c31dfSAlex Elder 	struct ceph_options *copts;
4309dc79b113SAlex Elder 	int ret;
4310e28fff26SAlex Elder 
4311e28fff26SAlex Elder 	/* The first four tokens are required */
4312e28fff26SAlex Elder 
43137ef3214aSAlex Elder 	len = next_token(&buf);
43144fb5d671SAlex Elder 	if (!len) {
43154fb5d671SAlex Elder 		rbd_warn(NULL, "no monitor address(es) provided");
43164fb5d671SAlex Elder 		return -EINVAL;
43174fb5d671SAlex Elder 	}
43180ddebc0cSAlex Elder 	mon_addrs = buf;
4319f28e565aSAlex Elder 	mon_addrs_size = len + 1;
43207ef3214aSAlex Elder 	buf += len;
4321a725f65eSAlex Elder 
4322dc79b113SAlex Elder 	ret = -EINVAL;
4323f28e565aSAlex Elder 	options = dup_token(&buf, NULL);
4324f28e565aSAlex Elder 	if (!options)
4325dc79b113SAlex Elder 		return -ENOMEM;
43264fb5d671SAlex Elder 	if (!*options) {
43274fb5d671SAlex Elder 		rbd_warn(NULL, "no options provided");
43284fb5d671SAlex Elder 		goto out_err;
43294fb5d671SAlex Elder 	}
4330a725f65eSAlex Elder 
4331859c31dfSAlex Elder 	spec = rbd_spec_alloc();
4332859c31dfSAlex Elder 	if (!spec)
4333f28e565aSAlex Elder 		goto out_mem;
4334859c31dfSAlex Elder 
4335859c31dfSAlex Elder 	spec->pool_name = dup_token(&buf, NULL);
4336859c31dfSAlex Elder 	if (!spec->pool_name)
4337859c31dfSAlex Elder 		goto out_mem;
43384fb5d671SAlex Elder 	if (!*spec->pool_name) {
43394fb5d671SAlex Elder 		rbd_warn(NULL, "no pool name provided");
43404fb5d671SAlex Elder 		goto out_err;
43414fb5d671SAlex Elder 	}
4342e28fff26SAlex Elder 
434369e7a02fSAlex Elder 	spec->image_name = dup_token(&buf, NULL);
4344859c31dfSAlex Elder 	if (!spec->image_name)
4345f28e565aSAlex Elder 		goto out_mem;
43464fb5d671SAlex Elder 	if (!*spec->image_name) {
43474fb5d671SAlex Elder 		rbd_warn(NULL, "no image name provided");
43484fb5d671SAlex Elder 		goto out_err;
43494fb5d671SAlex Elder 	}
4350e28fff26SAlex Elder 
4351f28e565aSAlex Elder 	/*
4352f28e565aSAlex Elder 	 * Snapshot name is optional; default is to use "-"
4353f28e565aSAlex Elder 	 * (indicating the head/no snapshot).
4354f28e565aSAlex Elder 	 */
43553feeb894SAlex Elder 	len = next_token(&buf);
4356820a5f3eSAlex Elder 	if (!len) {
43573feeb894SAlex Elder 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
43583feeb894SAlex Elder 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
4359f28e565aSAlex Elder 	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
4360dc79b113SAlex Elder 		ret = -ENAMETOOLONG;
4361f28e565aSAlex Elder 		goto out_err;
4362849b4260SAlex Elder 	}
4363ecb4dc22SAlex Elder 	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
4364ecb4dc22SAlex Elder 	if (!snap_name)
4365f28e565aSAlex Elder 		goto out_mem;
4366ecb4dc22SAlex Elder 	*(snap_name + len) = '\0';
4367ecb4dc22SAlex Elder 	spec->snap_name = snap_name;
4368e5c35534SAlex Elder 
43690ddebc0cSAlex Elder 	/* Initialize all rbd options to the defaults */
4370e28fff26SAlex Elder 
43714e9afebaSAlex Elder 	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
43724e9afebaSAlex Elder 	if (!rbd_opts)
43734e9afebaSAlex Elder 		goto out_mem;
43744e9afebaSAlex Elder 
43754e9afebaSAlex Elder 	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
4376d22f76e7SAlex Elder 
4377859c31dfSAlex Elder 	copts = ceph_parse_options(options, mon_addrs,
43780ddebc0cSAlex Elder 					mon_addrs + mon_addrs_size - 1,
43794e9afebaSAlex Elder 					parse_rbd_opts_token, rbd_opts);
4380859c31dfSAlex Elder 	if (IS_ERR(copts)) {
4381859c31dfSAlex Elder 		ret = PTR_ERR(copts);
4382dc79b113SAlex Elder 		goto out_err;
4383dc79b113SAlex Elder 	}
4384859c31dfSAlex Elder 	kfree(options);
4385859c31dfSAlex Elder 
4386859c31dfSAlex Elder 	*ceph_opts = copts;
43874e9afebaSAlex Elder 	*opts = rbd_opts;
4388859c31dfSAlex Elder 	*rbd_spec = spec;
43890ddebc0cSAlex Elder 
4390dc79b113SAlex Elder 	return 0;
4391f28e565aSAlex Elder out_mem:
4392dc79b113SAlex Elder 	ret = -ENOMEM;
4393d22f76e7SAlex Elder out_err:
4394859c31dfSAlex Elder 	kfree(rbd_opts);
4395859c31dfSAlex Elder 	rbd_spec_put(spec);
4396f28e565aSAlex Elder 	kfree(options);
4397d22f76e7SAlex Elder 
4398dc79b113SAlex Elder 	return ret;
4399a725f65eSAlex Elder }
4400a725f65eSAlex Elder 
4401589d30e0SAlex Elder /*
4402589d30e0SAlex Elder  * An rbd format 2 image has a unique identifier, distinct from the
4403589d30e0SAlex Elder  * name given to it by the user.  Internally, that identifier is
4404589d30e0SAlex Elder  * what's used to specify the names of objects related to the image.
4405589d30e0SAlex Elder  *
4406589d30e0SAlex Elder  * A special "rbd id" object is used to map an rbd image name to its
4407589d30e0SAlex Elder  * id.  If that object doesn't exist, then there is no v2 rbd image
4408589d30e0SAlex Elder  * with the supplied name.
4409589d30e0SAlex Elder  *
4410589d30e0SAlex Elder  * This function will record the given rbd_dev's image_id field if
4411589d30e0SAlex Elder  * it can be determined, and in that case will return 0.  If any
4412589d30e0SAlex Elder  * errors occur a negative errno will be returned and the rbd_dev's
4413589d30e0SAlex Elder  * image_id field will be unchanged (and should be NULL).
4414589d30e0SAlex Elder  */
4415589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev)
4416589d30e0SAlex Elder {
4417589d30e0SAlex Elder 	int ret;
4418589d30e0SAlex Elder 	size_t size;
4419589d30e0SAlex Elder 	char *object_name;
4420589d30e0SAlex Elder 	void *response;
4421c0fba368SAlex Elder 	char *image_id;
44222f82ee54SAlex Elder 
4423589d30e0SAlex Elder 	/*
44242c0d0a10SAlex Elder 	 * When probing a parent image, the image id is already
44252c0d0a10SAlex Elder 	 * known (and the image name likely is not).  There's no
4426c0fba368SAlex Elder 	 * need to fetch the image id again in this case.  We
4427c0fba368SAlex Elder 	 * do still need to set the image format though.
44282c0d0a10SAlex Elder 	 */
4429c0fba368SAlex Elder 	if (rbd_dev->spec->image_id) {
4430c0fba368SAlex Elder 		rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
4431c0fba368SAlex Elder 
44322c0d0a10SAlex Elder 		return 0;
4433c0fba368SAlex Elder 	}
44342c0d0a10SAlex Elder 
44352c0d0a10SAlex Elder 	/*
4436589d30e0SAlex Elder 	 * First, see if the format 2 image id file exists, and if
4437589d30e0SAlex Elder 	 * so, get the image's persistent id from it.
4438589d30e0SAlex Elder 	 */
443969e7a02fSAlex Elder 	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
4440589d30e0SAlex Elder 	object_name = kmalloc(size, GFP_NOIO);
4441589d30e0SAlex Elder 	if (!object_name)
4442589d30e0SAlex Elder 		return -ENOMEM;
44430d7dbfceSAlex Elder 	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
4444589d30e0SAlex Elder 	dout("rbd id object name is %s\n", object_name);
4445589d30e0SAlex Elder 
4446589d30e0SAlex Elder 	/* Response will be an encoded string, which includes a length */
4447589d30e0SAlex Elder 
4448589d30e0SAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
4449589d30e0SAlex Elder 	response = kzalloc(size, GFP_NOIO);
4450589d30e0SAlex Elder 	if (!response) {
4451589d30e0SAlex Elder 		ret = -ENOMEM;
4452589d30e0SAlex Elder 		goto out;
4453589d30e0SAlex Elder 	}
4454589d30e0SAlex Elder 
4455c0fba368SAlex Elder 	/* If it doesn't exist we'll assume it's a format 1 image */
4456c0fba368SAlex Elder 
445736be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, object_name,
44584157976bSAlex Elder 				"rbd", "get_id", NULL, 0,
4459e2a58ee5SAlex Elder 				response, RBD_IMAGE_ID_LEN_MAX);
446036be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4461c0fba368SAlex Elder 	if (ret == -ENOENT) {
4462c0fba368SAlex Elder 		image_id = kstrdup("", GFP_KERNEL);
4463c0fba368SAlex Elder 		ret = image_id ? 0 : -ENOMEM;
4464c0fba368SAlex Elder 		if (!ret)
4465c0fba368SAlex Elder 			rbd_dev->image_format = 1;
4466c0fba368SAlex Elder 	} else if (ret > sizeof (__le32)) {
4467c0fba368SAlex Elder 		void *p = response;
4468589d30e0SAlex Elder 
4469c0fba368SAlex Elder 		image_id = ceph_extract_encoded_string(&p, p + ret,
4470979ed480SAlex Elder 						NULL, GFP_NOIO);
4471c0fba368SAlex Elder 		ret = IS_ERR(image_id) ? PTR_ERR(image_id) : 0;
4472c0fba368SAlex Elder 		if (!ret)
4473c0fba368SAlex Elder 			rbd_dev->image_format = 2;
4474589d30e0SAlex Elder 	} else {
4475c0fba368SAlex Elder 		ret = -EINVAL;
4476c0fba368SAlex Elder 	}
4477c0fba368SAlex Elder 
4478c0fba368SAlex Elder 	if (!ret) {
4479c0fba368SAlex Elder 		rbd_dev->spec->image_id = image_id;
4480c0fba368SAlex Elder 		dout("image_id is %s\n", image_id);
4481589d30e0SAlex Elder 	}
4482589d30e0SAlex Elder out:
4483589d30e0SAlex Elder 	kfree(response);
4484589d30e0SAlex Elder 	kfree(object_name);
4485589d30e0SAlex Elder 
4486589d30e0SAlex Elder 	return ret;
4487589d30e0SAlex Elder }
4488589d30e0SAlex Elder 
44896fd48b3bSAlex Elder /* Undo whatever state changes are made by v1 or v2 image probe */
44906fd48b3bSAlex Elder 
44916fd48b3bSAlex Elder static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
44926fd48b3bSAlex Elder {
44936fd48b3bSAlex Elder 	struct rbd_image_header	*header;
44946fd48b3bSAlex Elder 
44956fd48b3bSAlex Elder 	rbd_dev_remove_parent(rbd_dev);
44966fd48b3bSAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
44976fd48b3bSAlex Elder 	rbd_dev->parent_spec = NULL;
44986fd48b3bSAlex Elder 	rbd_dev->parent_overlap = 0;
44996fd48b3bSAlex Elder 
45006fd48b3bSAlex Elder 	/* Free dynamic fields from the header, then zero it out */
45016fd48b3bSAlex Elder 
45026fd48b3bSAlex Elder 	header = &rbd_dev->header;
4503812164f8SAlex Elder 	ceph_put_snap_context(header->snapc);
45046fd48b3bSAlex Elder 	kfree(header->snap_sizes);
45056fd48b3bSAlex Elder 	kfree(header->snap_names);
45066fd48b3bSAlex Elder 	kfree(header->object_prefix);
45076fd48b3bSAlex Elder 	memset(header, 0, sizeof (*header));
45086fd48b3bSAlex Elder }
45096fd48b3bSAlex Elder 
4510a30b71b9SAlex Elder static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
4511a30b71b9SAlex Elder {
4512a30b71b9SAlex Elder 	int ret;
4513a30b71b9SAlex Elder 
4514a30b71b9SAlex Elder 	/* Populate rbd image metadata */
4515a30b71b9SAlex Elder 
4516a30b71b9SAlex Elder 	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
4517a30b71b9SAlex Elder 	if (ret < 0)
4518a30b71b9SAlex Elder 		goto out_err;
451986b00e0dSAlex Elder 
452086b00e0dSAlex Elder 	/* Version 1 images have no parent (no layering) */
452186b00e0dSAlex Elder 
452286b00e0dSAlex Elder 	rbd_dev->parent_spec = NULL;
452386b00e0dSAlex Elder 	rbd_dev->parent_overlap = 0;
452486b00e0dSAlex Elder 
4525a30b71b9SAlex Elder 	dout("discovered version 1 image, header name is %s\n",
4526a30b71b9SAlex Elder 		rbd_dev->header_name);
4527a30b71b9SAlex Elder 
4528a30b71b9SAlex Elder 	return 0;
4529a30b71b9SAlex Elder 
4530a30b71b9SAlex Elder out_err:
4531a30b71b9SAlex Elder 	kfree(rbd_dev->header_name);
4532a30b71b9SAlex Elder 	rbd_dev->header_name = NULL;
45330d7dbfceSAlex Elder 	kfree(rbd_dev->spec->image_id);
45340d7dbfceSAlex Elder 	rbd_dev->spec->image_id = NULL;
4535a30b71b9SAlex Elder 
4536a30b71b9SAlex Elder 	return ret;
4537a30b71b9SAlex Elder }
4538a30b71b9SAlex Elder 
4539a30b71b9SAlex Elder static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
4540a30b71b9SAlex Elder {
45419d475de5SAlex Elder 	int ret;
4542a30b71b9SAlex Elder 
45439d475de5SAlex Elder 	ret = rbd_dev_v2_image_size(rbd_dev);
454457385b51SAlex Elder 	if (ret)
45459d475de5SAlex Elder 		goto out_err;
45461e130199SAlex Elder 
45471e130199SAlex Elder 	/* Get the object prefix (a.k.a. block_name) for the image */
45481e130199SAlex Elder 
45491e130199SAlex Elder 	ret = rbd_dev_v2_object_prefix(rbd_dev);
455057385b51SAlex Elder 	if (ret)
45511e130199SAlex Elder 		goto out_err;
4552b1b5402aSAlex Elder 
4553d889140cSAlex Elder 	/* Get the and check features for the image */
4554b1b5402aSAlex Elder 
4555b1b5402aSAlex Elder 	ret = rbd_dev_v2_features(rbd_dev);
455657385b51SAlex Elder 	if (ret)
4557b1b5402aSAlex Elder 		goto out_err;
455835d489f9SAlex Elder 
455986b00e0dSAlex Elder 	/* If the image supports layering, get the parent info */
456086b00e0dSAlex Elder 
456186b00e0dSAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
456286b00e0dSAlex Elder 		ret = rbd_dev_v2_parent_info(rbd_dev);
456357385b51SAlex Elder 		if (ret)
456486b00e0dSAlex Elder 			goto out_err;
456596882f55SAlex Elder 
456696882f55SAlex Elder 		/*
456796882f55SAlex Elder 		 * Don't print a warning for parent images.  We can
456896882f55SAlex Elder 		 * tell this point because we won't know its pool
456996882f55SAlex Elder 		 * name yet (just its pool id).
457096882f55SAlex Elder 		 */
457196882f55SAlex Elder 		if (rbd_dev->spec->pool_name)
457296882f55SAlex Elder 			rbd_warn(rbd_dev, "WARNING: kernel layering "
457396882f55SAlex Elder 					"is EXPERIMENTAL!");
457486b00e0dSAlex Elder 	}
457586b00e0dSAlex Elder 
4576cc070d59SAlex Elder 	/* If the image supports fancy striping, get its parameters */
4577cc070d59SAlex Elder 
4578cc070d59SAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
4579cc070d59SAlex Elder 		ret = rbd_dev_v2_striping_info(rbd_dev);
4580cc070d59SAlex Elder 		if (ret < 0)
4581cc070d59SAlex Elder 			goto out_err;
4582cc070d59SAlex Elder 	}
4583cc070d59SAlex Elder 
45846e14b1a6SAlex Elder 	/* crypto and compression type aren't (yet) supported for v2 images */
458535d489f9SAlex Elder 
45866e14b1a6SAlex Elder 	rbd_dev->header.crypt_type = 0;
45876e14b1a6SAlex Elder 	rbd_dev->header.comp_type = 0;
45886e14b1a6SAlex Elder 
45896e14b1a6SAlex Elder 	/* Get the snapshot context, plus the header version */
45906e14b1a6SAlex Elder 
4591cc4a38bdSAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev);
459235d489f9SAlex Elder 	if (ret)
459335d489f9SAlex Elder 		goto out_err;
45946e14b1a6SAlex Elder 
4595a30b71b9SAlex Elder 	dout("discovered version 2 image, header name is %s\n",
4596a30b71b9SAlex Elder 		rbd_dev->header_name);
4597a30b71b9SAlex Elder 
459835152979SAlex Elder 	return 0;
45999d475de5SAlex Elder out_err:
460086b00e0dSAlex Elder 	rbd_dev->parent_overlap = 0;
460186b00e0dSAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
460286b00e0dSAlex Elder 	rbd_dev->parent_spec = NULL;
46039d475de5SAlex Elder 	kfree(rbd_dev->header_name);
46049d475de5SAlex Elder 	rbd_dev->header_name = NULL;
46051e130199SAlex Elder 	kfree(rbd_dev->header.object_prefix);
46061e130199SAlex Elder 	rbd_dev->header.object_prefix = NULL;
46079d475de5SAlex Elder 
46089d475de5SAlex Elder 	return ret;
4609a30b71b9SAlex Elder }
4610a30b71b9SAlex Elder 
4611124afba2SAlex Elder static int rbd_dev_probe_parent(struct rbd_device *rbd_dev)
461283a06263SAlex Elder {
46132f82ee54SAlex Elder 	struct rbd_device *parent = NULL;
4614124afba2SAlex Elder 	struct rbd_spec *parent_spec;
4615124afba2SAlex Elder 	struct rbd_client *rbdc;
4616124afba2SAlex Elder 	int ret;
4617124afba2SAlex Elder 
4618124afba2SAlex Elder 	if (!rbd_dev->parent_spec)
4619124afba2SAlex Elder 		return 0;
4620124afba2SAlex Elder 	/*
4621124afba2SAlex Elder 	 * We need to pass a reference to the client and the parent
4622124afba2SAlex Elder 	 * spec when creating the parent rbd_dev.  Images related by
4623124afba2SAlex Elder 	 * parent/child relationships always share both.
4624124afba2SAlex Elder 	 */
4625124afba2SAlex Elder 	parent_spec = rbd_spec_get(rbd_dev->parent_spec);
4626124afba2SAlex Elder 	rbdc = __rbd_get_client(rbd_dev->rbd_client);
4627124afba2SAlex Elder 
4628124afba2SAlex Elder 	ret = -ENOMEM;
4629124afba2SAlex Elder 	parent = rbd_dev_create(rbdc, parent_spec);
4630124afba2SAlex Elder 	if (!parent)
4631124afba2SAlex Elder 		goto out_err;
4632124afba2SAlex Elder 
4633124afba2SAlex Elder 	ret = rbd_dev_image_probe(parent);
4634124afba2SAlex Elder 	if (ret < 0)
4635124afba2SAlex Elder 		goto out_err;
4636124afba2SAlex Elder 	rbd_dev->parent = parent;
4637124afba2SAlex Elder 
4638124afba2SAlex Elder 	return 0;
4639124afba2SAlex Elder out_err:
4640124afba2SAlex Elder 	if (parent) {
4641124afba2SAlex Elder 		rbd_spec_put(rbd_dev->parent_spec);
4642124afba2SAlex Elder 		kfree(rbd_dev->header_name);
4643124afba2SAlex Elder 		rbd_dev_destroy(parent);
4644124afba2SAlex Elder 	} else {
4645124afba2SAlex Elder 		rbd_put_client(rbdc);
4646124afba2SAlex Elder 		rbd_spec_put(parent_spec);
4647124afba2SAlex Elder 	}
4648124afba2SAlex Elder 
4649124afba2SAlex Elder 	return ret;
4650124afba2SAlex Elder }
4651124afba2SAlex Elder 
4652200a6a8bSAlex Elder static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
4653124afba2SAlex Elder {
465483a06263SAlex Elder 	int ret;
465583a06263SAlex Elder 
4656d1cf5788SAlex Elder 	ret = rbd_dev_mapping_set(rbd_dev);
465783a06263SAlex Elder 	if (ret)
46589bb81c9bSAlex Elder 		return ret;
46595de10f3bSAlex Elder 
466083a06263SAlex Elder 	/* generate unique id: find highest unique id, add one */
466183a06263SAlex Elder 	rbd_dev_id_get(rbd_dev);
466283a06263SAlex Elder 
466383a06263SAlex Elder 	/* Fill in the device name, now that we have its id. */
466483a06263SAlex Elder 	BUILD_BUG_ON(DEV_NAME_LEN
466583a06263SAlex Elder 			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
466683a06263SAlex Elder 	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
466783a06263SAlex Elder 
466883a06263SAlex Elder 	/* Get our block major device number. */
466983a06263SAlex Elder 
467083a06263SAlex Elder 	ret = register_blkdev(0, rbd_dev->name);
467183a06263SAlex Elder 	if (ret < 0)
467283a06263SAlex Elder 		goto err_out_id;
467383a06263SAlex Elder 	rbd_dev->major = ret;
467483a06263SAlex Elder 
467583a06263SAlex Elder 	/* Set up the blkdev mapping. */
467683a06263SAlex Elder 
467783a06263SAlex Elder 	ret = rbd_init_disk(rbd_dev);
467883a06263SAlex Elder 	if (ret)
467983a06263SAlex Elder 		goto err_out_blkdev;
468083a06263SAlex Elder 
468183a06263SAlex Elder 	ret = rbd_bus_add_dev(rbd_dev);
468283a06263SAlex Elder 	if (ret)
468383a06263SAlex Elder 		goto err_out_disk;
468483a06263SAlex Elder 
468583a06263SAlex Elder 	/* Everything's ready.  Announce the disk to the world. */
468683a06263SAlex Elder 
4687b5156e76SAlex Elder 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
4688129b79d4SAlex Elder 	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
468983a06263SAlex Elder 	add_disk(rbd_dev->disk);
469083a06263SAlex Elder 
469183a06263SAlex Elder 	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
469283a06263SAlex Elder 		(unsigned long long) rbd_dev->mapping.size);
469383a06263SAlex Elder 
469483a06263SAlex Elder 	return ret;
46952f82ee54SAlex Elder 
469683a06263SAlex Elder err_out_disk:
469783a06263SAlex Elder 	rbd_free_disk(rbd_dev);
469883a06263SAlex Elder err_out_blkdev:
469983a06263SAlex Elder 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
470083a06263SAlex Elder err_out_id:
470183a06263SAlex Elder 	rbd_dev_id_put(rbd_dev);
4702d1cf5788SAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
470383a06263SAlex Elder 
470483a06263SAlex Elder 	return ret;
470583a06263SAlex Elder }
470683a06263SAlex Elder 
4707332bb12dSAlex Elder static int rbd_dev_header_name(struct rbd_device *rbd_dev)
4708332bb12dSAlex Elder {
4709332bb12dSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
4710332bb12dSAlex Elder 	size_t size;
4711332bb12dSAlex Elder 
4712332bb12dSAlex Elder 	/* Record the header object name for this rbd image. */
4713332bb12dSAlex Elder 
4714332bb12dSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
4715332bb12dSAlex Elder 
4716332bb12dSAlex Elder 	if (rbd_dev->image_format == 1)
4717332bb12dSAlex Elder 		size = strlen(spec->image_name) + sizeof (RBD_SUFFIX);
4718332bb12dSAlex Elder 	else
4719332bb12dSAlex Elder 		size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id);
4720332bb12dSAlex Elder 
4721332bb12dSAlex Elder 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
4722332bb12dSAlex Elder 	if (!rbd_dev->header_name)
4723332bb12dSAlex Elder 		return -ENOMEM;
4724332bb12dSAlex Elder 
4725332bb12dSAlex Elder 	if (rbd_dev->image_format == 1)
4726332bb12dSAlex Elder 		sprintf(rbd_dev->header_name, "%s%s",
4727332bb12dSAlex Elder 			spec->image_name, RBD_SUFFIX);
4728332bb12dSAlex Elder 	else
4729332bb12dSAlex Elder 		sprintf(rbd_dev->header_name, "%s%s",
4730332bb12dSAlex Elder 			RBD_HEADER_PREFIX, spec->image_id);
4731332bb12dSAlex Elder 	return 0;
4732332bb12dSAlex Elder }
4733332bb12dSAlex Elder 
4734200a6a8bSAlex Elder static void rbd_dev_image_release(struct rbd_device *rbd_dev)
4735200a6a8bSAlex Elder {
47366fd48b3bSAlex Elder 	int ret;
47376fd48b3bSAlex Elder 
47386fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
47396fd48b3bSAlex Elder 	ret = rbd_dev_header_watch_sync(rbd_dev, 0);
47406fd48b3bSAlex Elder 	if (ret)
47416fd48b3bSAlex Elder 		rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret);
4742200a6a8bSAlex Elder 	kfree(rbd_dev->header_name);
47436fd48b3bSAlex Elder 	rbd_dev->header_name = NULL;
47446fd48b3bSAlex Elder 	rbd_dev->image_format = 0;
47456fd48b3bSAlex Elder 	kfree(rbd_dev->spec->image_id);
47466fd48b3bSAlex Elder 	rbd_dev->spec->image_id = NULL;
47476fd48b3bSAlex Elder 
4748200a6a8bSAlex Elder 	rbd_dev_destroy(rbd_dev);
4749200a6a8bSAlex Elder }
4750200a6a8bSAlex Elder 
4751a30b71b9SAlex Elder /*
4752a30b71b9SAlex Elder  * Probe for the existence of the header object for the given rbd
4753a30b71b9SAlex Elder  * device.  For format 2 images this includes determining the image
4754a30b71b9SAlex Elder  * id.
4755a30b71b9SAlex Elder  */
475671f293e2SAlex Elder static int rbd_dev_image_probe(struct rbd_device *rbd_dev)
4757a30b71b9SAlex Elder {
4758a30b71b9SAlex Elder 	int ret;
4759b644de2bSAlex Elder 	int tmp;
4760a30b71b9SAlex Elder 
4761a30b71b9SAlex Elder 	/*
4762a30b71b9SAlex Elder 	 * Get the id from the image id object.  If it's not a
4763a30b71b9SAlex Elder 	 * format 2 image, we'll get ENOENT back, and we'll assume
4764a30b71b9SAlex Elder 	 * it's a format 1 image.
4765a30b71b9SAlex Elder 	 */
4766a30b71b9SAlex Elder 	ret = rbd_dev_image_id(rbd_dev);
4767a30b71b9SAlex Elder 	if (ret)
4768c0fba368SAlex Elder 		return ret;
4769c0fba368SAlex Elder 	rbd_assert(rbd_dev->spec->image_id);
4770c0fba368SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
4771c0fba368SAlex Elder 
4772332bb12dSAlex Elder 	ret = rbd_dev_header_name(rbd_dev);
4773332bb12dSAlex Elder 	if (ret)
4774332bb12dSAlex Elder 		goto err_out_format;
4775332bb12dSAlex Elder 
4776b644de2bSAlex Elder 	ret = rbd_dev_header_watch_sync(rbd_dev, 1);
4777b644de2bSAlex Elder 	if (ret)
4778b644de2bSAlex Elder 		goto out_header_name;
4779b644de2bSAlex Elder 
4780c0fba368SAlex Elder 	if (rbd_dev->image_format == 1)
4781a30b71b9SAlex Elder 		ret = rbd_dev_v1_probe(rbd_dev);
4782a30b71b9SAlex Elder 	else
4783a30b71b9SAlex Elder 		ret = rbd_dev_v2_probe(rbd_dev);
47845655c4d9SAlex Elder 	if (ret)
4785b644de2bSAlex Elder 		goto err_out_watch;
4786a30b71b9SAlex Elder 
47879bb81c9bSAlex Elder 	ret = rbd_dev_spec_update(rbd_dev);
47889bb81c9bSAlex Elder 	if (ret)
478933dca39fSAlex Elder 		goto err_out_probe;
47909bb81c9bSAlex Elder 
47919bb81c9bSAlex Elder 	ret = rbd_dev_probe_parent(rbd_dev);
47926fd48b3bSAlex Elder 	if (!ret)
47936fd48b3bSAlex Elder 		return 0;
479483a06263SAlex Elder 
47956fd48b3bSAlex Elder err_out_probe:
47966fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
4797b644de2bSAlex Elder err_out_watch:
4798b644de2bSAlex Elder 	tmp = rbd_dev_header_watch_sync(rbd_dev, 0);
4799b644de2bSAlex Elder 	if (tmp)
4800b644de2bSAlex Elder 		rbd_warn(rbd_dev, "unable to tear down watch request\n");
4801332bb12dSAlex Elder out_header_name:
4802332bb12dSAlex Elder 	kfree(rbd_dev->header_name);
4803332bb12dSAlex Elder 	rbd_dev->header_name = NULL;
4804332bb12dSAlex Elder err_out_format:
4805332bb12dSAlex Elder 	rbd_dev->image_format = 0;
48065655c4d9SAlex Elder 	kfree(rbd_dev->spec->image_id);
48075655c4d9SAlex Elder 	rbd_dev->spec->image_id = NULL;
48085655c4d9SAlex Elder 
48095655c4d9SAlex Elder 	dout("probe failed, returning %d\n", ret);
48105655c4d9SAlex Elder 
48115655c4d9SAlex Elder 	return ret;
481283a06263SAlex Elder }
481383a06263SAlex Elder 
481459c2be1eSYehuda Sadeh static ssize_t rbd_add(struct bus_type *bus,
481559c2be1eSYehuda Sadeh 		       const char *buf,
481659c2be1eSYehuda Sadeh 		       size_t count)
4817602adf40SYehuda Sadeh {
4818cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
4819dc79b113SAlex Elder 	struct ceph_options *ceph_opts = NULL;
48204e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
4821859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
48229d3997fdSAlex Elder 	struct rbd_client *rbdc;
482327cc2594SAlex Elder 	struct ceph_osd_client *osdc;
482427cc2594SAlex Elder 	int rc = -ENOMEM;
4825602adf40SYehuda Sadeh 
4826602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
4827602adf40SYehuda Sadeh 		return -ENODEV;
4828602adf40SYehuda Sadeh 
4829a725f65eSAlex Elder 	/* parse add command */
4830859c31dfSAlex Elder 	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
4831dc79b113SAlex Elder 	if (rc < 0)
4832bd4ba655SAlex Elder 		goto err_out_module;
4833a725f65eSAlex Elder 
48349d3997fdSAlex Elder 	rbdc = rbd_get_client(ceph_opts);
48359d3997fdSAlex Elder 	if (IS_ERR(rbdc)) {
48369d3997fdSAlex Elder 		rc = PTR_ERR(rbdc);
48370ddebc0cSAlex Elder 		goto err_out_args;
48389d3997fdSAlex Elder 	}
4839c53d5893SAlex Elder 	ceph_opts = NULL;	/* rbd_dev client now owns this */
4840602adf40SYehuda Sadeh 
4841602adf40SYehuda Sadeh 	/* pick the pool */
48429d3997fdSAlex Elder 	osdc = &rbdc->client->osdc;
4843859c31dfSAlex Elder 	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
4844602adf40SYehuda Sadeh 	if (rc < 0)
4845602adf40SYehuda Sadeh 		goto err_out_client;
4846859c31dfSAlex Elder 	spec->pool_id = (u64)rc;
4847859c31dfSAlex Elder 
48480903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
48490903e875SAlex Elder 
4850c0cd10dbSAlex Elder 	if (spec->pool_id > (u64)U32_MAX) {
4851c0cd10dbSAlex Elder 		rbd_warn(NULL, "pool id too large (%llu > %u)\n",
4852c0cd10dbSAlex Elder 				(unsigned long long)spec->pool_id, U32_MAX);
48530903e875SAlex Elder 		rc = -EIO;
48540903e875SAlex Elder 		goto err_out_client;
48550903e875SAlex Elder 	}
48560903e875SAlex Elder 
4857c53d5893SAlex Elder 	rbd_dev = rbd_dev_create(rbdc, spec);
4858bd4ba655SAlex Elder 	if (!rbd_dev)
4859bd4ba655SAlex Elder 		goto err_out_client;
4860c53d5893SAlex Elder 	rbdc = NULL;		/* rbd_dev now owns this */
4861c53d5893SAlex Elder 	spec = NULL;		/* rbd_dev now owns this */
4862602adf40SYehuda Sadeh 
4863bd4ba655SAlex Elder 	rbd_dev->mapping.read_only = rbd_opts->read_only;
4864c53d5893SAlex Elder 	kfree(rbd_opts);
4865c53d5893SAlex Elder 	rbd_opts = NULL;	/* done with this */
4866bd4ba655SAlex Elder 
486771f293e2SAlex Elder 	rc = rbd_dev_image_probe(rbd_dev);
4868a30b71b9SAlex Elder 	if (rc < 0)
4869c53d5893SAlex Elder 		goto err_out_rbd_dev;
487005fd6f6fSAlex Elder 
4871b536f69aSAlex Elder 	rc = rbd_dev_device_setup(rbd_dev);
4872b536f69aSAlex Elder 	if (!rc)
4873602adf40SYehuda Sadeh 		return count;
4874b536f69aSAlex Elder 
4875b536f69aSAlex Elder 	rbd_dev_image_release(rbd_dev);
4876c53d5893SAlex Elder err_out_rbd_dev:
4877c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
4878bd4ba655SAlex Elder err_out_client:
48799d3997fdSAlex Elder 	rbd_put_client(rbdc);
48800ddebc0cSAlex Elder err_out_args:
488178cea76eSAlex Elder 	if (ceph_opts)
488278cea76eSAlex Elder 		ceph_destroy_options(ceph_opts);
48834e9afebaSAlex Elder 	kfree(rbd_opts);
4884859c31dfSAlex Elder 	rbd_spec_put(spec);
4885bd4ba655SAlex Elder err_out_module:
4886bd4ba655SAlex Elder 	module_put(THIS_MODULE);
488727cc2594SAlex Elder 
4888602adf40SYehuda Sadeh 	dout("Error adding device %s\n", buf);
488927cc2594SAlex Elder 
489027cc2594SAlex Elder 	return (ssize_t)rc;
4891602adf40SYehuda Sadeh }
4892602adf40SYehuda Sadeh 
4893de71a297SAlex Elder static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
4894602adf40SYehuda Sadeh {
4895602adf40SYehuda Sadeh 	struct list_head *tmp;
4896602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev;
4897602adf40SYehuda Sadeh 
4898e124a82fSAlex Elder 	spin_lock(&rbd_dev_list_lock);
4899602adf40SYehuda Sadeh 	list_for_each(tmp, &rbd_dev_list) {
4900602adf40SYehuda Sadeh 		rbd_dev = list_entry(tmp, struct rbd_device, node);
4901de71a297SAlex Elder 		if (rbd_dev->dev_id == dev_id) {
4902e124a82fSAlex Elder 			spin_unlock(&rbd_dev_list_lock);
4903602adf40SYehuda Sadeh 			return rbd_dev;
4904602adf40SYehuda Sadeh 		}
4905e124a82fSAlex Elder 	}
4906e124a82fSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
4907602adf40SYehuda Sadeh 	return NULL;
4908602adf40SYehuda Sadeh }
4909602adf40SYehuda Sadeh 
4910200a6a8bSAlex Elder static void rbd_dev_device_release(struct device *dev)
4911602adf40SYehuda Sadeh {
4912593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4913602adf40SYehuda Sadeh 
4914602adf40SYehuda Sadeh 	rbd_free_disk(rbd_dev);
4915200a6a8bSAlex Elder 	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
4916200a6a8bSAlex Elder 	rbd_dev_clear_mapping(rbd_dev);
4917602adf40SYehuda Sadeh 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
4918200a6a8bSAlex Elder 	rbd_dev->major = 0;
4919e2839308SAlex Elder 	rbd_dev_id_put(rbd_dev);
4920d1cf5788SAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
4921602adf40SYehuda Sadeh }
4922602adf40SYehuda Sadeh 
492305a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
492405a46afdSAlex Elder {
4925ad945fc1SAlex Elder 	while (rbd_dev->parent) {
492605a46afdSAlex Elder 		struct rbd_device *first = rbd_dev;
492705a46afdSAlex Elder 		struct rbd_device *second = first->parent;
492805a46afdSAlex Elder 		struct rbd_device *third;
492905a46afdSAlex Elder 
493005a46afdSAlex Elder 		/*
493105a46afdSAlex Elder 		 * Follow to the parent with no grandparent and
493205a46afdSAlex Elder 		 * remove it.
493305a46afdSAlex Elder 		 */
493405a46afdSAlex Elder 		while (second && (third = second->parent)) {
493505a46afdSAlex Elder 			first = second;
493605a46afdSAlex Elder 			second = third;
493705a46afdSAlex Elder 		}
4938ad945fc1SAlex Elder 		rbd_assert(second);
49398ad42cd0SAlex Elder 		rbd_dev_image_release(second);
4940ad945fc1SAlex Elder 		first->parent = NULL;
4941ad945fc1SAlex Elder 		first->parent_overlap = 0;
4942ad945fc1SAlex Elder 
4943ad945fc1SAlex Elder 		rbd_assert(first->parent_spec);
494405a46afdSAlex Elder 		rbd_spec_put(first->parent_spec);
494505a46afdSAlex Elder 		first->parent_spec = NULL;
494605a46afdSAlex Elder 	}
494705a46afdSAlex Elder }
494805a46afdSAlex Elder 
4949dfc5606dSYehuda Sadeh static ssize_t rbd_remove(struct bus_type *bus,
4950602adf40SYehuda Sadeh 			  const char *buf,
4951602adf40SYehuda Sadeh 			  size_t count)
4952602adf40SYehuda Sadeh {
4953602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
49540d8189e1SAlex Elder 	int target_id;
4955602adf40SYehuda Sadeh 	unsigned long ul;
49560d8189e1SAlex Elder 	int ret;
4957602adf40SYehuda Sadeh 
49580d8189e1SAlex Elder 	ret = strict_strtoul(buf, 10, &ul);
49590d8189e1SAlex Elder 	if (ret)
49600d8189e1SAlex Elder 		return ret;
4961602adf40SYehuda Sadeh 
4962602adf40SYehuda Sadeh 	/* convert to int; abort if we lost anything in the conversion */
4963602adf40SYehuda Sadeh 	target_id = (int) ul;
4964602adf40SYehuda Sadeh 	if (target_id != ul)
4965602adf40SYehuda Sadeh 		return -EINVAL;
4966602adf40SYehuda Sadeh 
4967602adf40SYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
4968602adf40SYehuda Sadeh 
4969602adf40SYehuda Sadeh 	rbd_dev = __rbd_get_dev(target_id);
4970602adf40SYehuda Sadeh 	if (!rbd_dev) {
4971602adf40SYehuda Sadeh 		ret = -ENOENT;
4972602adf40SYehuda Sadeh 		goto done;
4973602adf40SYehuda Sadeh 	}
4974602adf40SYehuda Sadeh 
4975a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
4976b82d167bSAlex Elder 	if (rbd_dev->open_count)
497742382b70SAlex Elder 		ret = -EBUSY;
4978b82d167bSAlex Elder 	else
4979b82d167bSAlex Elder 		set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
4980a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
4981b82d167bSAlex Elder 	if (ret < 0)
498242382b70SAlex Elder 		goto done;
49830d8189e1SAlex Elder 	ret = count;
4984b480815aSAlex Elder 	rbd_bus_del_dev(rbd_dev);
49858ad42cd0SAlex Elder 	rbd_dev_image_release(rbd_dev);
498679ab7558SAlex Elder 	module_put(THIS_MODULE);
4987602adf40SYehuda Sadeh done:
4988602adf40SYehuda Sadeh 	mutex_unlock(&ctl_mutex);
4989aafb230eSAlex Elder 
4990602adf40SYehuda Sadeh 	return ret;
4991602adf40SYehuda Sadeh }
4992602adf40SYehuda Sadeh 
4993602adf40SYehuda Sadeh /*
4994602adf40SYehuda Sadeh  * create control files in sysfs
4995dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
4996602adf40SYehuda Sadeh  */
4997602adf40SYehuda Sadeh static int rbd_sysfs_init(void)
4998602adf40SYehuda Sadeh {
4999dfc5606dSYehuda Sadeh 	int ret;
5000602adf40SYehuda Sadeh 
5001fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
5002dfc5606dSYehuda Sadeh 	if (ret < 0)
5003dfc5606dSYehuda Sadeh 		return ret;
5004602adf40SYehuda Sadeh 
5005fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
5006fed4c143SAlex Elder 	if (ret < 0)
5007fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
5008602adf40SYehuda Sadeh 
5009602adf40SYehuda Sadeh 	return ret;
5010602adf40SYehuda Sadeh }
5011602adf40SYehuda Sadeh 
5012602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void)
5013602adf40SYehuda Sadeh {
5014dfc5606dSYehuda Sadeh 	bus_unregister(&rbd_bus_type);
5015fed4c143SAlex Elder 	device_unregister(&rbd_root_dev);
5016602adf40SYehuda Sadeh }
5017602adf40SYehuda Sadeh 
50181c2a9dfeSAlex Elder static int rbd_slab_init(void)
50191c2a9dfeSAlex Elder {
50201c2a9dfeSAlex Elder 	rbd_assert(!rbd_img_request_cache);
50211c2a9dfeSAlex Elder 	rbd_img_request_cache = kmem_cache_create("rbd_img_request",
50221c2a9dfeSAlex Elder 					sizeof (struct rbd_img_request),
50231c2a9dfeSAlex Elder 					__alignof__(struct rbd_img_request),
50241c2a9dfeSAlex Elder 					0, NULL);
5025868311b1SAlex Elder 	if (!rbd_img_request_cache)
5026868311b1SAlex Elder 		return -ENOMEM;
5027868311b1SAlex Elder 
5028868311b1SAlex Elder 	rbd_assert(!rbd_obj_request_cache);
5029868311b1SAlex Elder 	rbd_obj_request_cache = kmem_cache_create("rbd_obj_request",
5030868311b1SAlex Elder 					sizeof (struct rbd_obj_request),
5031868311b1SAlex Elder 					__alignof__(struct rbd_obj_request),
5032868311b1SAlex Elder 					0, NULL);
503378c2a44aSAlex Elder 	if (!rbd_obj_request_cache)
503478c2a44aSAlex Elder 		goto out_err;
503578c2a44aSAlex Elder 
503678c2a44aSAlex Elder 	rbd_assert(!rbd_segment_name_cache);
503778c2a44aSAlex Elder 	rbd_segment_name_cache = kmem_cache_create("rbd_segment_name",
503878c2a44aSAlex Elder 					MAX_OBJ_NAME_SIZE + 1, 1, 0, NULL);
503978c2a44aSAlex Elder 	if (rbd_segment_name_cache)
50401c2a9dfeSAlex Elder 		return 0;
504178c2a44aSAlex Elder out_err:
504278c2a44aSAlex Elder 	if (rbd_obj_request_cache) {
504378c2a44aSAlex Elder 		kmem_cache_destroy(rbd_obj_request_cache);
504478c2a44aSAlex Elder 		rbd_obj_request_cache = NULL;
504578c2a44aSAlex Elder 	}
50461c2a9dfeSAlex Elder 
5047868311b1SAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
5048868311b1SAlex Elder 	rbd_img_request_cache = NULL;
5049868311b1SAlex Elder 
50501c2a9dfeSAlex Elder 	return -ENOMEM;
50511c2a9dfeSAlex Elder }
50521c2a9dfeSAlex Elder 
50531c2a9dfeSAlex Elder static void rbd_slab_exit(void)
50541c2a9dfeSAlex Elder {
505578c2a44aSAlex Elder 	rbd_assert(rbd_segment_name_cache);
505678c2a44aSAlex Elder 	kmem_cache_destroy(rbd_segment_name_cache);
505778c2a44aSAlex Elder 	rbd_segment_name_cache = NULL;
505878c2a44aSAlex Elder 
5059868311b1SAlex Elder 	rbd_assert(rbd_obj_request_cache);
5060868311b1SAlex Elder 	kmem_cache_destroy(rbd_obj_request_cache);
5061868311b1SAlex Elder 	rbd_obj_request_cache = NULL;
5062868311b1SAlex Elder 
50631c2a9dfeSAlex Elder 	rbd_assert(rbd_img_request_cache);
50641c2a9dfeSAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
50651c2a9dfeSAlex Elder 	rbd_img_request_cache = NULL;
50661c2a9dfeSAlex Elder }
50671c2a9dfeSAlex Elder 
5068cc344fa1SAlex Elder static int __init rbd_init(void)
5069602adf40SYehuda Sadeh {
5070602adf40SYehuda Sadeh 	int rc;
5071602adf40SYehuda Sadeh 
50721e32d34cSAlex Elder 	if (!libceph_compatible(NULL)) {
50731e32d34cSAlex Elder 		rbd_warn(NULL, "libceph incompatibility (quitting)");
50741e32d34cSAlex Elder 
50751e32d34cSAlex Elder 		return -EINVAL;
50761e32d34cSAlex Elder 	}
50771c2a9dfeSAlex Elder 	rc = rbd_slab_init();
5078602adf40SYehuda Sadeh 	if (rc)
5079602adf40SYehuda Sadeh 		return rc;
50801c2a9dfeSAlex Elder 	rc = rbd_sysfs_init();
50811c2a9dfeSAlex Elder 	if (rc)
50821c2a9dfeSAlex Elder 		rbd_slab_exit();
50831c2a9dfeSAlex Elder 	else
5084f0f8cef5SAlex Elder 		pr_info("loaded " RBD_DRV_NAME_LONG "\n");
50851c2a9dfeSAlex Elder 
50861c2a9dfeSAlex Elder 	return rc;
5087602adf40SYehuda Sadeh }
5088602adf40SYehuda Sadeh 
5089cc344fa1SAlex Elder static void __exit rbd_exit(void)
5090602adf40SYehuda Sadeh {
5091602adf40SYehuda Sadeh 	rbd_sysfs_cleanup();
50921c2a9dfeSAlex Elder 	rbd_slab_exit();
5093602adf40SYehuda Sadeh }
5094602adf40SYehuda Sadeh 
5095602adf40SYehuda Sadeh module_init(rbd_init);
5096602adf40SYehuda Sadeh module_exit(rbd_exit);
5097602adf40SYehuda Sadeh 
5098602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
5099602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
5100602adf40SYehuda Sadeh MODULE_DESCRIPTION("rados block device");
5101602adf40SYehuda Sadeh 
5102602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */
5103602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
5104602adf40SYehuda Sadeh 
5105602adf40SYehuda Sadeh MODULE_LICENSE("GPL");
5106