xref: /openbmc/linux/drivers/block/rbd.c (revision 30d60ba2)
1e2a58ee5SAlex Elder 
2602adf40SYehuda Sadeh /*
3602adf40SYehuda Sadeh    rbd.c -- Export ceph rados objects as a Linux block device
4602adf40SYehuda Sadeh 
5602adf40SYehuda Sadeh 
6602adf40SYehuda Sadeh    based on drivers/block/osdblk.c:
7602adf40SYehuda Sadeh 
8602adf40SYehuda Sadeh    Copyright 2009 Red Hat, Inc.
9602adf40SYehuda Sadeh 
10602adf40SYehuda Sadeh    This program is free software; you can redistribute it and/or modify
11602adf40SYehuda Sadeh    it under the terms of the GNU General Public License as published by
12602adf40SYehuda Sadeh    the Free Software Foundation.
13602adf40SYehuda Sadeh 
14602adf40SYehuda Sadeh    This program is distributed in the hope that it will be useful,
15602adf40SYehuda Sadeh    but WITHOUT ANY WARRANTY; without even the implied warranty of
16602adf40SYehuda Sadeh    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17602adf40SYehuda Sadeh    GNU General Public License for more details.
18602adf40SYehuda Sadeh 
19602adf40SYehuda Sadeh    You should have received a copy of the GNU General Public License
20602adf40SYehuda Sadeh    along with this program; see the file COPYING.  If not, write to
21602adf40SYehuda Sadeh    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
22602adf40SYehuda Sadeh 
23602adf40SYehuda Sadeh 
24602adf40SYehuda Sadeh 
25dfc5606dSYehuda Sadeh    For usage instructions, please refer to:
26602adf40SYehuda Sadeh 
27dfc5606dSYehuda Sadeh                  Documentation/ABI/testing/sysfs-bus-rbd
28602adf40SYehuda Sadeh 
29602adf40SYehuda Sadeh  */
30602adf40SYehuda Sadeh 
31602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
32602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
34602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3559c2be1eSYehuda Sadeh #include <linux/parser.h>
3630d1cff8SAlex Elder #include <linux/bsearch.h>
37602adf40SYehuda Sadeh 
38602adf40SYehuda Sadeh #include <linux/kernel.h>
39602adf40SYehuda Sadeh #include <linux/device.h>
40602adf40SYehuda Sadeh #include <linux/module.h>
41602adf40SYehuda Sadeh #include <linux/fs.h>
42602adf40SYehuda Sadeh #include <linux/blkdev.h>
431c2a9dfeSAlex Elder #include <linux/slab.h>
44602adf40SYehuda Sadeh 
45602adf40SYehuda Sadeh #include "rbd_types.h"
46602adf40SYehuda Sadeh 
/* Defining RBD_DEBUG activates the rbd_assert() calls */
#define RBD_DEBUG

/*
 * Block I/O is expressed in sectors.  Linux interprets a sector in
 * several contexts (blk, bio, genhd), but the default is universally
 * 512 bytes.  The names below are only slightly more meaningful than
 * the bare numbers they stand for.
 */
#define SECTOR_SHIFT	9
#define SECTOR_SIZE	(1ULL << SECTOR_SHIFT)
57593a9e7bSAlex Elder 
/* Short and long forms of the driver name */
#define RBD_DRV_NAME		"rbd"
#define RBD_DRV_NAME_LONG	"rbd (rados block device)"

/* max minors per blkdev */
#define RBD_MINORS_PER_MAJOR	256

/*
 * The longest snapshot name is whatever remains of NAME_MAX once
 * the "snap_" prefix (without its terminating NUL) is accounted for.
 */
#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN \
	(NAME_MAX - (sizeof(RBD_SNAP_DEV_NAME_PREFIX) - 1))

/* allows max snapc to fit in 4KB */
#define RBD_MAX_SNAP_COUNT	510

#define RBD_SNAP_HEAD_NAME	"-"

/* invalid index into snap array */
#define BAD_SNAP_INDEX		U32_MAX

/* Sized so that a single page can hold an image name sent by an OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof(__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64
78589d30e0SAlex Elder 
/* Image feature bits */

#define RBD_FEATURE_LAYERING	(1 << 0)
#define RBD_FEATURE_STRIPINGV2	(1 << 1)
#define RBD_FEATURES_ALL \
	(RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)

/* The subset of features this (client software) implementation supports */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)
89d889140cSAlex Elder 
/*
 * The name of an RBD device is "rbd#": the "rbd" part is
 * RBD_DRV_NAME (defined above) and # is a unique integer
 * identifier.  MAX_INT_FORMAT_WIDTH is used in making sure that
 * DEV_NAME_LEN is large enough for every possible device name.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof(int)) / 2 + 1)
98602adf40SYehuda Sadeh 
99602adf40SYehuda Sadeh /*
100602adf40SYehuda Sadeh  * block device image metadata (in-memory version)
101602adf40SYehuda Sadeh  */
102602adf40SYehuda Sadeh struct rbd_image_header {
103f35a4deeSAlex Elder 	/* These six fields never change for a given rbd image */
104849b4260SAlex Elder 	char *object_prefix;
105602adf40SYehuda Sadeh 	__u8 obj_order;
106602adf40SYehuda Sadeh 	__u8 crypt_type;
107602adf40SYehuda Sadeh 	__u8 comp_type;
108f35a4deeSAlex Elder 	u64 stripe_unit;
109f35a4deeSAlex Elder 	u64 stripe_count;
110f35a4deeSAlex Elder 	u64 features;		/* Might be changeable someday? */
111602adf40SYehuda Sadeh 
112f84344f3SAlex Elder 	/* The remaining fields need to be updated occasionally */
113f84344f3SAlex Elder 	u64 image_size;
114f84344f3SAlex Elder 	struct ceph_snap_context *snapc;
115f35a4deeSAlex Elder 	char *snap_names;	/* format 1 only */
116f35a4deeSAlex Elder 	u64 *snap_sizes;	/* format 1 only */
11759c2be1eSYehuda Sadeh };
11859c2be1eSYehuda Sadeh 
1190d7dbfceSAlex Elder /*
1200d7dbfceSAlex Elder  * An rbd image specification.
1210d7dbfceSAlex Elder  *
1220d7dbfceSAlex Elder  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
123c66c6e0cSAlex Elder  * identify an image.  Each rbd_dev structure includes a pointer to
124c66c6e0cSAlex Elder  * an rbd_spec structure that encapsulates this identity.
125c66c6e0cSAlex Elder  *
126c66c6e0cSAlex Elder  * Each of the id's in an rbd_spec has an associated name.  For a
127c66c6e0cSAlex Elder  * user-mapped image, the names are supplied and the id's associated
128c66c6e0cSAlex Elder  * with them are looked up.  For a layered image, a parent image is
129c66c6e0cSAlex Elder  * defined by the tuple, and the names are looked up.
130c66c6e0cSAlex Elder  *
131c66c6e0cSAlex Elder  * An rbd_dev structure contains a parent_spec pointer which is
132c66c6e0cSAlex Elder  * non-null if the image it represents is a child in a layered
133c66c6e0cSAlex Elder  * image.  This pointer will refer to the rbd_spec structure used
134c66c6e0cSAlex Elder  * by the parent rbd_dev for its own identity (i.e., the structure
135c66c6e0cSAlex Elder  * is shared between the parent and child).
136c66c6e0cSAlex Elder  *
137c66c6e0cSAlex Elder  * Since these structures are populated once, during the discovery
138c66c6e0cSAlex Elder  * phase of image construction, they are effectively immutable so
139c66c6e0cSAlex Elder  * we make no effort to synchronize access to them.
140c66c6e0cSAlex Elder  *
141c66c6e0cSAlex Elder  * Note that code herein does not assume the image name is known (it
142c66c6e0cSAlex Elder  * could be a null pointer).
1430d7dbfceSAlex Elder  */
1440d7dbfceSAlex Elder struct rbd_spec {
1450d7dbfceSAlex Elder 	u64		pool_id;
146ecb4dc22SAlex Elder 	const char	*pool_name;
1470d7dbfceSAlex Elder 
148ecb4dc22SAlex Elder 	const char	*image_id;
149ecb4dc22SAlex Elder 	const char	*image_name;
1500d7dbfceSAlex Elder 
1510d7dbfceSAlex Elder 	u64		snap_id;
152ecb4dc22SAlex Elder 	const char	*snap_name;
1530d7dbfceSAlex Elder 
1540d7dbfceSAlex Elder 	struct kref	kref;
1550d7dbfceSAlex Elder };
1560d7dbfceSAlex Elder 
157602adf40SYehuda Sadeh /*
158f0f8cef5SAlex Elder  * an instance of the client.  multiple devices may share an rbd client.
159602adf40SYehuda Sadeh  */
160602adf40SYehuda Sadeh struct rbd_client {
161602adf40SYehuda Sadeh 	struct ceph_client	*client;
162602adf40SYehuda Sadeh 	struct kref		kref;
163602adf40SYehuda Sadeh 	struct list_head	node;
164602adf40SYehuda Sadeh };
165602adf40SYehuda Sadeh 
/* Callback invoked with the image request it was registered on */
struct rbd_img_request;
typedef void (*rbd_img_callback_t)(struct rbd_img_request *);

/* Good which or bad which, which? */
#define BAD_WHICH	U32_MAX

/* Callback invoked with the object request it was registered on */
struct rbd_obj_request;
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);
173bf0d5f50SAlex Elder 
/* Kinds of data an object request can carry (see struct rbd_obj_request) */
enum obj_request_type {
	OBJ_REQUEST_NODATA,
	OBJ_REQUEST_BIO,
	OBJ_REQUEST_PAGES
};
177bf0d5f50SAlex Elder 
/* Flag bit numbers used with rbd_obj_request->flags */
enum obj_req_flags {
	/* completion flag: not done = 0, done = 1 */
	OBJ_REQ_DONE,
	/* object usage: standalone = 0, image = 1 */
	OBJ_REQ_IMG_DATA,
	/* EXISTS flag valid: no = 0, yes = 1 */
	OBJ_REQ_KNOWN,
	/* target exists: no = 0, yes = 1 */
	OBJ_REQ_EXISTS,
};
184926f9b3fSAlex Elder 
185bf0d5f50SAlex Elder struct rbd_obj_request {
186bf0d5f50SAlex Elder 	const char		*object_name;
187bf0d5f50SAlex Elder 	u64			offset;		/* object start byte */
188bf0d5f50SAlex Elder 	u64			length;		/* bytes from offset */
189926f9b3fSAlex Elder 	unsigned long		flags;
190bf0d5f50SAlex Elder 
191c5b5ef6cSAlex Elder 	/*
192c5b5ef6cSAlex Elder 	 * An object request associated with an image will have its
193c5b5ef6cSAlex Elder 	 * img_data flag set; a standalone object request will not.
194c5b5ef6cSAlex Elder 	 *
195c5b5ef6cSAlex Elder 	 * A standalone object request will have which == BAD_WHICH
196c5b5ef6cSAlex Elder 	 * and a null obj_request pointer.
197c5b5ef6cSAlex Elder 	 *
198c5b5ef6cSAlex Elder 	 * An object request initiated in support of a layered image
199c5b5ef6cSAlex Elder 	 * object (to check for its existence before a write) will
200c5b5ef6cSAlex Elder 	 * have which == BAD_WHICH and a non-null obj_request pointer.
201c5b5ef6cSAlex Elder 	 *
202c5b5ef6cSAlex Elder 	 * Finally, an object request for rbd image data will have
203c5b5ef6cSAlex Elder 	 * which != BAD_WHICH, and will have a non-null img_request
204c5b5ef6cSAlex Elder 	 * pointer.  The value of which will be in the range
205c5b5ef6cSAlex Elder 	 * 0..(img_request->obj_request_count-1).
206c5b5ef6cSAlex Elder 	 */
207c5b5ef6cSAlex Elder 	union {
208c5b5ef6cSAlex Elder 		struct rbd_obj_request	*obj_request;	/* STAT op */
209c5b5ef6cSAlex Elder 		struct {
210bf0d5f50SAlex Elder 			struct rbd_img_request	*img_request;
211c5b5ef6cSAlex Elder 			u64			img_offset;
212c5b5ef6cSAlex Elder 			/* links for img_request->obj_requests list */
213c5b5ef6cSAlex Elder 			struct list_head	links;
214c5b5ef6cSAlex Elder 		};
215c5b5ef6cSAlex Elder 	};
216bf0d5f50SAlex Elder 	u32			which;		/* posn image request list */
217bf0d5f50SAlex Elder 
218bf0d5f50SAlex Elder 	enum obj_request_type	type;
219788e2df3SAlex Elder 	union {
220bf0d5f50SAlex Elder 		struct bio	*bio_list;
221788e2df3SAlex Elder 		struct {
222788e2df3SAlex Elder 			struct page	**pages;
223788e2df3SAlex Elder 			u32		page_count;
224788e2df3SAlex Elder 		};
225788e2df3SAlex Elder 	};
2260eefd470SAlex Elder 	struct page		**copyup_pages;
227bf0d5f50SAlex Elder 
228bf0d5f50SAlex Elder 	struct ceph_osd_request	*osd_req;
229bf0d5f50SAlex Elder 
230bf0d5f50SAlex Elder 	u64			xferred;	/* bytes transferred */
2311b83bef2SSage Weil 	int			result;
232bf0d5f50SAlex Elder 
233bf0d5f50SAlex Elder 	rbd_obj_callback_t	callback;
234788e2df3SAlex Elder 	struct completion	completion;
235bf0d5f50SAlex Elder 
236bf0d5f50SAlex Elder 	struct kref		kref;
237bf0d5f50SAlex Elder };
238bf0d5f50SAlex Elder 
/* Flag bit numbers used with rbd_img_request->flags */
enum img_req_flags {
	/* I/O direction: read = 0, write = 1 */
	IMG_REQ_WRITE,
	/* initiator: block = 0, child image = 1 */
	IMG_REQ_CHILD,
	/* ENOENT handling: normal = 0, layered = 1 */
	IMG_REQ_LAYERED,
};
2440c425248SAlex Elder 
245bf0d5f50SAlex Elder struct rbd_img_request {
246bf0d5f50SAlex Elder 	struct rbd_device	*rbd_dev;
247bf0d5f50SAlex Elder 	u64			offset;	/* starting image byte offset */
248bf0d5f50SAlex Elder 	u64			length;	/* byte count from offset */
2490c425248SAlex Elder 	unsigned long		flags;
250bf0d5f50SAlex Elder 	union {
251bf0d5f50SAlex Elder 		u64			snap_id;	/* for reads */
2529849e986SAlex Elder 		struct ceph_snap_context *snapc;	/* for writes */
2539849e986SAlex Elder 	};
2549849e986SAlex Elder 	union {
2559849e986SAlex Elder 		struct request		*rq;		/* block request */
2569849e986SAlex Elder 		struct rbd_obj_request	*obj_request;	/* obj req initiator */
257bf0d5f50SAlex Elder 	};
2583d7efd18SAlex Elder 	struct page		**copyup_pages;
259bf0d5f50SAlex Elder 	spinlock_t		completion_lock;/* protects next_completion */
260bf0d5f50SAlex Elder 	u32			next_completion;
261bf0d5f50SAlex Elder 	rbd_img_callback_t	callback;
26255f27e09SAlex Elder 	u64			xferred;/* aggregate bytes transferred */
263a5a337d4SAlex Elder 	int			result;	/* first nonzero obj_request result */
264bf0d5f50SAlex Elder 
265bf0d5f50SAlex Elder 	u32			obj_request_count;
266bf0d5f50SAlex Elder 	struct list_head	obj_requests;	/* rbd_obj_request structs */
267bf0d5f50SAlex Elder 
268bf0d5f50SAlex Elder 	struct kref		kref;
269bf0d5f50SAlex Elder };
270bf0d5f50SAlex Elder 
/*
 * Iterators over the object requests of an image request.  They walk
 * ireq->obj_requests, whose entries are chained through the "links"
 * member of each object request.
 */

/* Visit every object request on the list */
#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->obj_requests, links)

/* Continue the walk starting at the current value of oreq */
#define for_each_obj_request_from(ireq, oreq) \
	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)

/* Removal-safe walk in reverse order; n is the scratch cursor */
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
277bf0d5f50SAlex Elder 
278f84344f3SAlex Elder struct rbd_mapping {
27999c1f08fSAlex Elder 	u64                     size;
28034b13184SAlex Elder 	u64                     features;
281f84344f3SAlex Elder 	bool			read_only;
282f84344f3SAlex Elder };
283f84344f3SAlex Elder 
284602adf40SYehuda Sadeh /*
285602adf40SYehuda Sadeh  * a single device
286602adf40SYehuda Sadeh  */
287602adf40SYehuda Sadeh struct rbd_device {
288de71a297SAlex Elder 	int			dev_id;		/* blkdev unique id */
289602adf40SYehuda Sadeh 
290602adf40SYehuda Sadeh 	int			major;		/* blkdev assigned major */
291602adf40SYehuda Sadeh 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
292602adf40SYehuda Sadeh 
293a30b71b9SAlex Elder 	u32			image_format;	/* Either 1 or 2 */
294602adf40SYehuda Sadeh 	struct rbd_client	*rbd_client;
295602adf40SYehuda Sadeh 
296602adf40SYehuda Sadeh 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
297602adf40SYehuda Sadeh 
298b82d167bSAlex Elder 	spinlock_t		lock;		/* queue, flags, open_count */
299602adf40SYehuda Sadeh 
300602adf40SYehuda Sadeh 	struct rbd_image_header	header;
301b82d167bSAlex Elder 	unsigned long		flags;		/* possibly lock protected */
3020d7dbfceSAlex Elder 	struct rbd_spec		*spec;
303602adf40SYehuda Sadeh 
3040d7dbfceSAlex Elder 	char			*header_name;
305971f839aSAlex Elder 
3060903e875SAlex Elder 	struct ceph_file_layout	layout;
3070903e875SAlex Elder 
30859c2be1eSYehuda Sadeh 	struct ceph_osd_event   *watch_event;
309975241afSAlex Elder 	struct rbd_obj_request	*watch_request;
31059c2be1eSYehuda Sadeh 
31186b00e0dSAlex Elder 	struct rbd_spec		*parent_spec;
31286b00e0dSAlex Elder 	u64			parent_overlap;
3132f82ee54SAlex Elder 	struct rbd_device	*parent;
31486b00e0dSAlex Elder 
315c666601aSJosh Durgin 	/* protects updating the header */
316c666601aSJosh Durgin 	struct rw_semaphore     header_rwsem;
317f84344f3SAlex Elder 
318f84344f3SAlex Elder 	struct rbd_mapping	mapping;
319602adf40SYehuda Sadeh 
320602adf40SYehuda Sadeh 	struct list_head	node;
321dfc5606dSYehuda Sadeh 
322dfc5606dSYehuda Sadeh 	/* sysfs related */
323dfc5606dSYehuda Sadeh 	struct device		dev;
324b82d167bSAlex Elder 	unsigned long		open_count;	/* protected by lock */
325dfc5606dSYehuda Sadeh };
326dfc5606dSYehuda Sadeh 
/*
 * Bit numbers for rbd_dev->flags.  Where atomic access is needed,
 * rbd_dev->lock provides the protection.
 *
 * At present only the "removing" flag, which is coupled with the
 * "open_count" field, requires atomic access.
 */
enum rbd_dev_flags {
	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_EXISTS,
	/* this mapping is being removed */
	RBD_DEV_FLAG_REMOVING,
};
3386d292906SAlex Elder 
/* Serialize open/close/setup/teardown */
static DEFINE_MUTEX(ctl_mutex);

/* devices */
static LIST_HEAD(rbd_dev_list);
static DEFINE_SPINLOCK(rbd_dev_list_lock);

/* clients; the list is walked and modified under rbd_client_list_lock */
static LIST_HEAD(rbd_client_list);
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Slab caches for frequently-allocated structures */

static struct kmem_cache *rbd_img_request_cache;
static struct kmem_cache *rbd_obj_request_cache;
static struct kmem_cache *rbd_segment_name_cache;
3521c2a9dfeSAlex Elder 
3533d7efd18SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request);
3543d7efd18SAlex Elder 
355200a6a8bSAlex Elder static void rbd_dev_device_release(struct device *dev);
356dfc5606dSYehuda Sadeh 
357f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf,
358f0f8cef5SAlex Elder 		       size_t count);
359f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
360f0f8cef5SAlex Elder 			  size_t count);
36151344a38SAlex Elder static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool read_only);
362f0f8cef5SAlex Elder 
363f0f8cef5SAlex Elder static struct bus_attribute rbd_bus_attrs[] = {
364f0f8cef5SAlex Elder 	__ATTR(add, S_IWUSR, NULL, rbd_add),
365f0f8cef5SAlex Elder 	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
366f0f8cef5SAlex Elder 	__ATTR_NULL
367f0f8cef5SAlex Elder };
368f0f8cef5SAlex Elder 
369f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = {
370f0f8cef5SAlex Elder 	.name		= "rbd",
371f0f8cef5SAlex Elder 	.bus_attrs	= rbd_bus_attrs,
372f0f8cef5SAlex Elder };
373f0f8cef5SAlex Elder 
/*
 * Release callback for rbd_root_dev.  Intentionally does nothing.
 */
static void rbd_root_dev_release(struct device *dev)
{
	/* nothing to release */
}
377f0f8cef5SAlex Elder 
378f0f8cef5SAlex Elder static struct device rbd_root_dev = {
379f0f8cef5SAlex Elder 	.init_name =    "rbd",
380f0f8cef5SAlex Elder 	.release =      rbd_root_dev_release,
381f0f8cef5SAlex Elder };
382f0f8cef5SAlex Elder 
38306ecc6cbSAlex Elder static __printf(2, 3)
38406ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
38506ecc6cbSAlex Elder {
38606ecc6cbSAlex Elder 	struct va_format vaf;
38706ecc6cbSAlex Elder 	va_list args;
38806ecc6cbSAlex Elder 
38906ecc6cbSAlex Elder 	va_start(args, fmt);
39006ecc6cbSAlex Elder 	vaf.fmt = fmt;
39106ecc6cbSAlex Elder 	vaf.va = &args;
39206ecc6cbSAlex Elder 
39306ecc6cbSAlex Elder 	if (!rbd_dev)
39406ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
39506ecc6cbSAlex Elder 	else if (rbd_dev->disk)
39606ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %s: %pV\n",
39706ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
39806ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_name)
39906ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: image %s: %pV\n",
40006ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
40106ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_id)
40206ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: id %s: %pV\n",
40306ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
40406ecc6cbSAlex Elder 	else	/* punt */
40506ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
40606ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev, &vaf);
40706ecc6cbSAlex Elder 	va_end(args);
40806ecc6cbSAlex Elder }
40906ecc6cbSAlex Elder 
/*
 * rbd_assert(expr) - sanity check that is compiled in only when
 * RBD_DEBUG is defined.
 *
 * With RBD_DEBUG defined, a false expr logs the function name, line
 * number and the text of the expression at KERN_ERR and then calls
 * BUG().  Without RBD_DEBUG the macro expands to a no-op and expr is
 * not evaluated.
 *
 * The checking form is wrapped in do { } while (0) so that it acts
 * as a single statement: "if (x) rbd_assert(y); else ..." compiles
 * and the else binds to the caller's if, not to the macro's.
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
422dfc5606dSYehuda Sadeh 
423b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
42405a46afdSAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
42505a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
4268b3e1a56SAlex Elder 
427cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev);
428cc4a38bdSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev);
42954cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
43054cac61fSAlex Elder 					u64 snap_id);
4312ad3d716SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
4322ad3d716SAlex Elder 				u8 *order, u64 *snap_size);
4332ad3d716SAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
4342ad3d716SAlex Elder 		u64 *snap_features);
4352ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name);
43659c2be1eSYehuda Sadeh 
437602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
438602adf40SYehuda Sadeh {
439f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
440b82d167bSAlex Elder 	bool removing = false;
441602adf40SYehuda Sadeh 
442f84344f3SAlex Elder 	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
443602adf40SYehuda Sadeh 		return -EROFS;
444602adf40SYehuda Sadeh 
445a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
446b82d167bSAlex Elder 	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
447b82d167bSAlex Elder 		removing = true;
448b82d167bSAlex Elder 	else
449b82d167bSAlex Elder 		rbd_dev->open_count++;
450a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
451b82d167bSAlex Elder 	if (removing)
452b82d167bSAlex Elder 		return -ENOENT;
453b82d167bSAlex Elder 
45442382b70SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
455c3e946ceSAlex Elder 	(void) get_device(&rbd_dev->dev);
456f84344f3SAlex Elder 	set_device_ro(bdev, rbd_dev->mapping.read_only);
45742382b70SAlex Elder 	mutex_unlock(&ctl_mutex);
458340c7a2bSAlex Elder 
459602adf40SYehuda Sadeh 	return 0;
460602adf40SYehuda Sadeh }
461602adf40SYehuda Sadeh 
462dfc5606dSYehuda Sadeh static int rbd_release(struct gendisk *disk, fmode_t mode)
463dfc5606dSYehuda Sadeh {
464dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
465b82d167bSAlex Elder 	unsigned long open_count_before;
466b82d167bSAlex Elder 
467a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
468b82d167bSAlex Elder 	open_count_before = rbd_dev->open_count--;
469a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
470b82d167bSAlex Elder 	rbd_assert(open_count_before > 0);
471dfc5606dSYehuda Sadeh 
47242382b70SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
473c3e946ceSAlex Elder 	put_device(&rbd_dev->dev);
47442382b70SAlex Elder 	mutex_unlock(&ctl_mutex);
475dfc5606dSYehuda Sadeh 
476dfc5606dSYehuda Sadeh 	return 0;
477dfc5606dSYehuda Sadeh }
478dfc5606dSYehuda Sadeh 
479602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = {
480602adf40SYehuda Sadeh 	.owner			= THIS_MODULE,
481602adf40SYehuda Sadeh 	.open			= rbd_open,
482dfc5606dSYehuda Sadeh 	.release		= rbd_release,
483602adf40SYehuda Sadeh };
484602adf40SYehuda Sadeh 
485602adf40SYehuda Sadeh /*
486602adf40SYehuda Sadeh  * Initialize an rbd client instance.
48743ae4701SAlex Elder  * We own *ceph_opts.
488602adf40SYehuda Sadeh  */
489f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
490602adf40SYehuda Sadeh {
491602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
492602adf40SYehuda Sadeh 	int ret = -ENOMEM;
493602adf40SYehuda Sadeh 
49437206ee5SAlex Elder 	dout("%s:\n", __func__);
495602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
496602adf40SYehuda Sadeh 	if (!rbdc)
497602adf40SYehuda Sadeh 		goto out_opt;
498602adf40SYehuda Sadeh 
499602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
500602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
501602adf40SYehuda Sadeh 
502bc534d86SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
503bc534d86SAlex Elder 
50443ae4701SAlex Elder 	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
505602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
506bc534d86SAlex Elder 		goto out_mutex;
50743ae4701SAlex Elder 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
508602adf40SYehuda Sadeh 
509602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
510602adf40SYehuda Sadeh 	if (ret < 0)
511602adf40SYehuda Sadeh 		goto out_err;
512602adf40SYehuda Sadeh 
513432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
514602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
515432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
516602adf40SYehuda Sadeh 
517bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
51837206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
519bc534d86SAlex Elder 
520602adf40SYehuda Sadeh 	return rbdc;
521602adf40SYehuda Sadeh 
522602adf40SYehuda Sadeh out_err:
523602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
524bc534d86SAlex Elder out_mutex:
525bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
526602adf40SYehuda Sadeh 	kfree(rbdc);
527602adf40SYehuda Sadeh out_opt:
52843ae4701SAlex Elder 	if (ceph_opts)
52943ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
53037206ee5SAlex Elder 	dout("%s: error %d\n", __func__, ret);
53137206ee5SAlex Elder 
53228f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
533602adf40SYehuda Sadeh }
534602adf40SYehuda Sadeh 
5352f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
5362f82ee54SAlex Elder {
5372f82ee54SAlex Elder 	kref_get(&rbdc->kref);
5382f82ee54SAlex Elder 
5392f82ee54SAlex Elder 	return rbdc;
5402f82ee54SAlex Elder }
5412f82ee54SAlex Elder 
542602adf40SYehuda Sadeh /*
5431f7ba331SAlex Elder  * Find a ceph client with specific addr and configuration.  If
5441f7ba331SAlex Elder  * found, bump its reference count.
545602adf40SYehuda Sadeh  */
5461f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
547602adf40SYehuda Sadeh {
548602adf40SYehuda Sadeh 	struct rbd_client *client_node;
5491f7ba331SAlex Elder 	bool found = false;
550602adf40SYehuda Sadeh 
55143ae4701SAlex Elder 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
552602adf40SYehuda Sadeh 		return NULL;
553602adf40SYehuda Sadeh 
5541f7ba331SAlex Elder 	spin_lock(&rbd_client_list_lock);
5551f7ba331SAlex Elder 	list_for_each_entry(client_node, &rbd_client_list, node) {
5561f7ba331SAlex Elder 		if (!ceph_compare_options(ceph_opts, client_node->client)) {
5572f82ee54SAlex Elder 			__rbd_get_client(client_node);
5582f82ee54SAlex Elder 
5591f7ba331SAlex Elder 			found = true;
5601f7ba331SAlex Elder 			break;
5611f7ba331SAlex Elder 		}
5621f7ba331SAlex Elder 	}
5631f7ba331SAlex Elder 	spin_unlock(&rbd_client_list_lock);
5641f7ba331SAlex Elder 
5651f7ba331SAlex Elder 	return found ? client_node : NULL;
566602adf40SYehuda Sadeh }
567602adf40SYehuda Sadeh 
/*
 * mount options
 *
 * Tokens are grouped by argument type.  The Opt_last_* entries are
 * markers that parse_rbd_opts_token() compares against to tell the
 * groups apart.
 */
enum {
	/* int args go above this marker */
	Opt_last_int,
	/* string args go above this marker */
	Opt_last_string,
	Opt_read_only,
	Opt_read_write,
	/* Boolean args go above this marker */
	Opt_last_bool,
};
58159c2be1eSYehuda Sadeh 
58243ae4701SAlex Elder static match_table_t rbd_opts_tokens = {
58359c2be1eSYehuda Sadeh 	/* int args above */
58459c2be1eSYehuda Sadeh 	/* string args above */
585be466c1cSAlex Elder 	{Opt_read_only, "read_only"},
586cc0538b6SAlex Elder 	{Opt_read_only, "ro"},		/* Alternate spelling */
587cc0538b6SAlex Elder 	{Opt_read_write, "read_write"},
588cc0538b6SAlex Elder 	{Opt_read_write, "rw"},		/* Alternate spelling */
589cc0538b6SAlex Elder 	/* Boolean args above */
59059c2be1eSYehuda Sadeh 	{-1, NULL}
59159c2be1eSYehuda Sadeh };
59259c2be1eSYehuda Sadeh 
/* Settings filled in by parse_rbd_opts_token() */
struct rbd_options {
	bool read_only;
};

/* Named default for the read_only option */
#define RBD_READ_ONLY_DEFAULT	false
59898571b5aSAlex Elder 
59959c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
60059c2be1eSYehuda Sadeh {
60143ae4701SAlex Elder 	struct rbd_options *rbd_opts = private;
60259c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
60359c2be1eSYehuda Sadeh 	int token, intval, ret;
60459c2be1eSYehuda Sadeh 
60543ae4701SAlex Elder 	token = match_token(c, rbd_opts_tokens, argstr);
60659c2be1eSYehuda Sadeh 	if (token < 0)
60759c2be1eSYehuda Sadeh 		return -EINVAL;
60859c2be1eSYehuda Sadeh 
60959c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
61059c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
61159c2be1eSYehuda Sadeh 		if (ret < 0) {
61259c2be1eSYehuda Sadeh 			pr_err("bad mount option arg (not int) "
61359c2be1eSYehuda Sadeh 			       "at '%s'\n", c);
61459c2be1eSYehuda Sadeh 			return ret;
61559c2be1eSYehuda Sadeh 		}
61659c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
61759c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
61859c2be1eSYehuda Sadeh 		dout("got string token %d val %s\n", token,
61959c2be1eSYehuda Sadeh 		     argstr[0].from);
620cc0538b6SAlex Elder 	} else if (token > Opt_last_string && token < Opt_last_bool) {
621cc0538b6SAlex Elder 		dout("got Boolean token %d\n", token);
62259c2be1eSYehuda Sadeh 	} else {
62359c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
62459c2be1eSYehuda Sadeh 	}
62559c2be1eSYehuda Sadeh 
62659c2be1eSYehuda Sadeh 	switch (token) {
627cc0538b6SAlex Elder 	case Opt_read_only:
628cc0538b6SAlex Elder 		rbd_opts->read_only = true;
629cc0538b6SAlex Elder 		break;
630cc0538b6SAlex Elder 	case Opt_read_write:
631cc0538b6SAlex Elder 		rbd_opts->read_only = false;
632cc0538b6SAlex Elder 		break;
63359c2be1eSYehuda Sadeh 	default:
634aafb230eSAlex Elder 		rbd_assert(false);
635aafb230eSAlex Elder 		break;
63659c2be1eSYehuda Sadeh 	}
63759c2be1eSYehuda Sadeh 	return 0;
63859c2be1eSYehuda Sadeh }
63959c2be1eSYehuda Sadeh 
64059c2be1eSYehuda Sadeh /*
641602adf40SYehuda Sadeh  * Get a ceph client with specific addr and configuration, if one does
642602adf40SYehuda Sadeh  * not exist create it.
643602adf40SYehuda Sadeh  */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *existing = rbd_client_find(ceph_opts);

	/* Nothing matched these options, so build a new client from them */
	if (!existing)
		return rbd_client_create(ceph_opts);

	/* Using an existing client; the options passed in are destroyed */
	ceph_destroy_options(ceph_opts);

	return existing;
}
656602adf40SYehuda Sadeh 
/*
 * Destroy ceph client
 *
 * Takes rbd_client_list_lock itself, so the caller must not hold it.
 */
662602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
663602adf40SYehuda Sadeh {
664602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
665602adf40SYehuda Sadeh 
66637206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
667cd9d9f5dSAlex Elder 	spin_lock(&rbd_client_list_lock);
668602adf40SYehuda Sadeh 	list_del(&rbdc->node);
669cd9d9f5dSAlex Elder 	spin_unlock(&rbd_client_list_lock);
670602adf40SYehuda Sadeh 
671602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
672602adf40SYehuda Sadeh 	kfree(rbdc);
673602adf40SYehuda Sadeh }
674602adf40SYehuda Sadeh 
675602adf40SYehuda Sadeh /*
676602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
677602adf40SYehuda Sadeh  * it.
678602adf40SYehuda Sadeh  */
6799d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc)
680602adf40SYehuda Sadeh {
681c53d5893SAlex Elder 	if (rbdc)
6829d3997fdSAlex Elder 		kref_put(&rbdc->kref, rbd_client_release);
683602adf40SYehuda Sadeh }
684602adf40SYehuda Sadeh 
685a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format)
686a30b71b9SAlex Elder {
687a30b71b9SAlex Elder 	return image_format == 1 || image_format == 2;
688a30b71b9SAlex Elder }
689a30b71b9SAlex Elder 
6908e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
6918e94af8eSAlex Elder {
692103a150fSAlex Elder 	size_t size;
693103a150fSAlex Elder 	u32 snap_count;
694103a150fSAlex Elder 
695103a150fSAlex Elder 	/* The header has to start with the magic rbd header text */
696103a150fSAlex Elder 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
697103a150fSAlex Elder 		return false;
698103a150fSAlex Elder 
699db2388b6SAlex Elder 	/* The bio layer requires at least sector-sized I/O */
700db2388b6SAlex Elder 
701db2388b6SAlex Elder 	if (ondisk->options.order < SECTOR_SHIFT)
702db2388b6SAlex Elder 		return false;
703db2388b6SAlex Elder 
704db2388b6SAlex Elder 	/* If we use u64 in a few spots we may be able to loosen this */
705db2388b6SAlex Elder 
706db2388b6SAlex Elder 	if (ondisk->options.order > 8 * sizeof (int) - 1)
707db2388b6SAlex Elder 		return false;
708db2388b6SAlex Elder 
709103a150fSAlex Elder 	/*
710103a150fSAlex Elder 	 * The size of a snapshot header has to fit in a size_t, and
711103a150fSAlex Elder 	 * that limits the number of snapshots.
712103a150fSAlex Elder 	 */
713103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
714103a150fSAlex Elder 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
715103a150fSAlex Elder 	if (snap_count > size / sizeof (__le64))
716103a150fSAlex Elder 		return false;
717103a150fSAlex Elder 
718103a150fSAlex Elder 	/*
719103a150fSAlex Elder 	 * Not only that, but the size of the entire the snapshot
720103a150fSAlex Elder 	 * header must also be representable in a size_t.
721103a150fSAlex Elder 	 */
722103a150fSAlex Elder 	size -= snap_count * sizeof (__le64);
723103a150fSAlex Elder 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
724103a150fSAlex Elder 		return false;
725103a150fSAlex Elder 
726103a150fSAlex Elder 	return true;
7278e94af8eSAlex Elder }
7288e94af8eSAlex Elder 
729602adf40SYehuda Sadeh /*
730bb23e37aSAlex Elder  * Fill an rbd image header with information from the given format 1
731bb23e37aSAlex Elder  * on-disk header.
732602adf40SYehuda Sadeh  */
733662518b1SAlex Elder static int rbd_header_from_disk(struct rbd_device *rbd_dev,
7344156d998SAlex Elder 				 struct rbd_image_header_ondisk *ondisk)
735602adf40SYehuda Sadeh {
736662518b1SAlex Elder 	struct rbd_image_header *header = &rbd_dev->header;
737bb23e37aSAlex Elder 	bool first_time = header->object_prefix == NULL;
738bb23e37aSAlex Elder 	struct ceph_snap_context *snapc;
739bb23e37aSAlex Elder 	char *object_prefix = NULL;
740bb23e37aSAlex Elder 	char *snap_names = NULL;
741bb23e37aSAlex Elder 	u64 *snap_sizes = NULL;
742ccece235SAlex Elder 	u32 snap_count;
743d2bb24e5SAlex Elder 	size_t size;
744bb23e37aSAlex Elder 	int ret = -ENOMEM;
745621901d6SAlex Elder 	u32 i;
746602adf40SYehuda Sadeh 
747bb23e37aSAlex Elder 	/* Allocate this now to avoid having to handle failure below */
748103a150fSAlex Elder 
749bb23e37aSAlex Elder 	if (first_time) {
750bb23e37aSAlex Elder 		size_t len;
751bb23e37aSAlex Elder 
752bb23e37aSAlex Elder 		len = strnlen(ondisk->object_prefix,
753bb23e37aSAlex Elder 				sizeof (ondisk->object_prefix));
754bb23e37aSAlex Elder 		object_prefix = kmalloc(len + 1, GFP_KERNEL);
755bb23e37aSAlex Elder 		if (!object_prefix)
756602adf40SYehuda Sadeh 			return -ENOMEM;
757bb23e37aSAlex Elder 		memcpy(object_prefix, ondisk->object_prefix, len);
758bb23e37aSAlex Elder 		object_prefix[len] = '\0';
759bb23e37aSAlex Elder 	}
76000f1f36fSAlex Elder 
761bb23e37aSAlex Elder 	/* Allocate the snapshot context and fill it in */
762bb23e37aSAlex Elder 
763bb23e37aSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
764bb23e37aSAlex Elder 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
765bb23e37aSAlex Elder 	if (!snapc)
766bb23e37aSAlex Elder 		goto out_err;
767bb23e37aSAlex Elder 	snapc->seq = le64_to_cpu(ondisk->snap_seq);
768602adf40SYehuda Sadeh 	if (snap_count) {
769bb23e37aSAlex Elder 		struct rbd_image_snap_ondisk *snaps;
770f785cc1dSAlex Elder 		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
771f785cc1dSAlex Elder 
772bb23e37aSAlex Elder 		/* We'll keep a copy of the snapshot names... */
773621901d6SAlex Elder 
774f785cc1dSAlex Elder 		if (snap_names_len > (u64)SIZE_MAX)
775bb23e37aSAlex Elder 			goto out_2big;
776bb23e37aSAlex Elder 		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
777bb23e37aSAlex Elder 		if (!snap_names)
7786a52325fSAlex Elder 			goto out_err;
779bb23e37aSAlex Elder 
780bb23e37aSAlex Elder 		/* ...as well as the array of their sizes. */
781bb23e37aSAlex Elder 
782bb23e37aSAlex Elder 		size = snap_count * sizeof (*header->snap_sizes);
783bb23e37aSAlex Elder 		snap_sizes = kmalloc(size, GFP_KERNEL);
784bb23e37aSAlex Elder 		if (!snap_sizes)
785bb23e37aSAlex Elder 			goto out_err;
786bb23e37aSAlex Elder 
787f785cc1dSAlex Elder 		/*
788bb23e37aSAlex Elder 		 * Copy the names, and fill in each snapshot's id
789bb23e37aSAlex Elder 		 * and size.
790bb23e37aSAlex Elder 		 *
791bb23e37aSAlex Elder 		 * Note that rbd_dev_v1_header_read() guarantees the
792bb23e37aSAlex Elder 		 * ondisk buffer we're working with has
793f785cc1dSAlex Elder 		 * snap_names_len bytes beyond the end of the
794f785cc1dSAlex Elder 		 * snapshot id array, this memcpy() is safe.
795f785cc1dSAlex Elder 		 */
796bb23e37aSAlex Elder 		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
797bb23e37aSAlex Elder 		snaps = ondisk->snaps;
798bb23e37aSAlex Elder 		for (i = 0; i < snap_count; i++) {
799bb23e37aSAlex Elder 			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
800bb23e37aSAlex Elder 			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
801bb23e37aSAlex Elder 		}
802602adf40SYehuda Sadeh 	}
803849b4260SAlex Elder 
804bb23e37aSAlex Elder 	/* We won't fail any more, fill in the header */
805bb23e37aSAlex Elder 
806662518b1SAlex Elder 	down_write(&rbd_dev->header_rwsem);
807bb23e37aSAlex Elder 	if (first_time) {
808bb23e37aSAlex Elder 		header->object_prefix = object_prefix;
809602adf40SYehuda Sadeh 		header->obj_order = ondisk->options.order;
810602adf40SYehuda Sadeh 		header->crypt_type = ondisk->options.crypt_type;
811602adf40SYehuda Sadeh 		header->comp_type = ondisk->options.comp_type;
812bb23e37aSAlex Elder 		/* The rest aren't used for format 1 images */
813bb23e37aSAlex Elder 		header->stripe_unit = 0;
814bb23e37aSAlex Elder 		header->stripe_count = 0;
815bb23e37aSAlex Elder 		header->features = 0;
816662518b1SAlex Elder 	} else {
817662518b1SAlex Elder 		ceph_put_snap_context(header->snapc);
818662518b1SAlex Elder 		kfree(header->snap_names);
819662518b1SAlex Elder 		kfree(header->snap_sizes);
820bb23e37aSAlex Elder 	}
8216a52325fSAlex Elder 
822bb23e37aSAlex Elder 	/* The remaining fields always get updated (when we refresh) */
823621901d6SAlex Elder 
824f84344f3SAlex Elder 	header->image_size = le64_to_cpu(ondisk->image_size);
825bb23e37aSAlex Elder 	header->snapc = snapc;
826bb23e37aSAlex Elder 	header->snap_names = snap_names;
827bb23e37aSAlex Elder 	header->snap_sizes = snap_sizes;
828602adf40SYehuda Sadeh 
829662518b1SAlex Elder 	/* Make sure mapping size is consistent with header info */
830662518b1SAlex Elder 
831662518b1SAlex Elder 	if (rbd_dev->spec->snap_id == CEPH_NOSNAP || first_time)
832662518b1SAlex Elder 		if (rbd_dev->mapping.size != header->image_size)
833662518b1SAlex Elder 			rbd_dev->mapping.size = header->image_size;
834662518b1SAlex Elder 
835662518b1SAlex Elder 	up_write(&rbd_dev->header_rwsem);
836662518b1SAlex Elder 
837602adf40SYehuda Sadeh 	return 0;
838bb23e37aSAlex Elder out_2big:
839bb23e37aSAlex Elder 	ret = -EIO;
8406a52325fSAlex Elder out_err:
841bb23e37aSAlex Elder 	kfree(snap_sizes);
842bb23e37aSAlex Elder 	kfree(snap_names);
843bb23e37aSAlex Elder 	ceph_put_snap_context(snapc);
844bb23e37aSAlex Elder 	kfree(object_prefix);
845ccece235SAlex Elder 
846bb23e37aSAlex Elder 	return ret;
847602adf40SYehuda Sadeh }
848602adf40SYehuda Sadeh 
8499682fc6dSAlex Elder static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
8509682fc6dSAlex Elder {
8519682fc6dSAlex Elder 	const char *snap_name;
8529682fc6dSAlex Elder 
8539682fc6dSAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
8549682fc6dSAlex Elder 
8559682fc6dSAlex Elder 	/* Skip over names until we find the one we are looking for */
8569682fc6dSAlex Elder 
8579682fc6dSAlex Elder 	snap_name = rbd_dev->header.snap_names;
8589682fc6dSAlex Elder 	while (which--)
8599682fc6dSAlex Elder 		snap_name += strlen(snap_name) + 1;
8609682fc6dSAlex Elder 
8619682fc6dSAlex Elder 	return kstrdup(snap_name, GFP_KERNEL);
8629682fc6dSAlex Elder }
8639682fc6dSAlex Elder 
86430d1cff8SAlex Elder /*
86530d1cff8SAlex Elder  * Snapshot id comparison function for use with qsort()/bsearch().
86630d1cff8SAlex Elder  * Note that result is for snapshots in *descending* order.
86730d1cff8SAlex Elder  */
86830d1cff8SAlex Elder static int snapid_compare_reverse(const void *s1, const void *s2)
86930d1cff8SAlex Elder {
87030d1cff8SAlex Elder 	u64 snap_id1 = *(u64 *)s1;
87130d1cff8SAlex Elder 	u64 snap_id2 = *(u64 *)s2;
87230d1cff8SAlex Elder 
87330d1cff8SAlex Elder 	if (snap_id1 < snap_id2)
87430d1cff8SAlex Elder 		return 1;
87530d1cff8SAlex Elder 	return snap_id1 == snap_id2 ? 0 : -1;
87630d1cff8SAlex Elder }
87730d1cff8SAlex Elder 
/*
 * Search a snapshot context to see if the given snapshot id is
 * present.
 *
 * Returns the position of the snapshot id in the array if it's found,
 * or BAD_SNAP_INDEX otherwise.
 *
 * Note: The snapshot array is kept sorted (by the osd) in
 * reverse order, highest snapshot id first.
 */
8889682fc6dSAlex Elder static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
8899682fc6dSAlex Elder {
8909682fc6dSAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
89130d1cff8SAlex Elder 	u64 *found;
8929682fc6dSAlex Elder 
89330d1cff8SAlex Elder 	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
89430d1cff8SAlex Elder 				sizeof (snap_id), snapid_compare_reverse);
8959682fc6dSAlex Elder 
89630d1cff8SAlex Elder 	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
8979682fc6dSAlex Elder }
8989682fc6dSAlex Elder 
8992ad3d716SAlex Elder static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
9002ad3d716SAlex Elder 					u64 snap_id)
90154cac61fSAlex Elder {
90254cac61fSAlex Elder 	u32 which;
90354cac61fSAlex Elder 
90454cac61fSAlex Elder 	which = rbd_dev_snap_index(rbd_dev, snap_id);
90554cac61fSAlex Elder 	if (which == BAD_SNAP_INDEX)
90654cac61fSAlex Elder 		return NULL;
90754cac61fSAlex Elder 
90854cac61fSAlex Elder 	return _rbd_dev_v1_snap_name(rbd_dev, which);
90954cac61fSAlex Elder }
91054cac61fSAlex Elder 
9119e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
9129e15b77dSAlex Elder {
9139e15b77dSAlex Elder 	if (snap_id == CEPH_NOSNAP)
9149e15b77dSAlex Elder 		return RBD_SNAP_HEAD_NAME;
9159e15b77dSAlex Elder 
91654cac61fSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
91754cac61fSAlex Elder 	if (rbd_dev->image_format == 1)
91854cac61fSAlex Elder 		return rbd_dev_v1_snap_name(rbd_dev, snap_id);
9199e15b77dSAlex Elder 
92054cac61fSAlex Elder 	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
9219e15b77dSAlex Elder }
9229e15b77dSAlex Elder 
9232ad3d716SAlex Elder static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
9242ad3d716SAlex Elder 				u64 *snap_size)
925602adf40SYehuda Sadeh {
9262ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
9272ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
9282ad3d716SAlex Elder 		*snap_size = rbd_dev->header.image_size;
9292ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
9302ad3d716SAlex Elder 		u32 which;
93100f1f36fSAlex Elder 
9322ad3d716SAlex Elder 		which = rbd_dev_snap_index(rbd_dev, snap_id);
9332ad3d716SAlex Elder 		if (which == BAD_SNAP_INDEX)
9342ad3d716SAlex Elder 			return -ENOENT;
93500f1f36fSAlex Elder 
9362ad3d716SAlex Elder 		*snap_size = rbd_dev->header.snap_sizes[which];
9372ad3d716SAlex Elder 	} else {
9382ad3d716SAlex Elder 		u64 size = 0;
9392ad3d716SAlex Elder 		int ret;
9402ad3d716SAlex Elder 
9412ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
9422ad3d716SAlex Elder 		if (ret)
9432ad3d716SAlex Elder 			return ret;
9442ad3d716SAlex Elder 
9452ad3d716SAlex Elder 		*snap_size = size;
9462ad3d716SAlex Elder 	}
9472ad3d716SAlex Elder 	return 0;
9482ad3d716SAlex Elder }
9492ad3d716SAlex Elder 
9502ad3d716SAlex Elder static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
9512ad3d716SAlex Elder 			u64 *snap_features)
9522ad3d716SAlex Elder {
9532ad3d716SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
9542ad3d716SAlex Elder 	if (snap_id == CEPH_NOSNAP) {
9552ad3d716SAlex Elder 		*snap_features = rbd_dev->header.features;
9562ad3d716SAlex Elder 	} else if (rbd_dev->image_format == 1) {
9572ad3d716SAlex Elder 		*snap_features = 0;	/* No features for format 1 */
9582ad3d716SAlex Elder 	} else {
9592ad3d716SAlex Elder 		u64 features = 0;
9602ad3d716SAlex Elder 		int ret;
9612ad3d716SAlex Elder 
9622ad3d716SAlex Elder 		ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
9632ad3d716SAlex Elder 		if (ret)
9642ad3d716SAlex Elder 			return ret;
9652ad3d716SAlex Elder 
9662ad3d716SAlex Elder 		*snap_features = features;
9672ad3d716SAlex Elder 	}
9682ad3d716SAlex Elder 	return 0;
96900f1f36fSAlex Elder }
970602adf40SYehuda Sadeh 
971d1cf5788SAlex Elder static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
972602adf40SYehuda Sadeh {
9738f4b7d98SAlex Elder 	u64 snap_id = rbd_dev->spec->snap_id;
9742ad3d716SAlex Elder 	u64 size = 0;
9752ad3d716SAlex Elder 	u64 features = 0;
9762ad3d716SAlex Elder 	int ret;
9778b0241f8SAlex Elder 
9782ad3d716SAlex Elder 	ret = rbd_snap_size(rbd_dev, snap_id, &size);
9792ad3d716SAlex Elder 	if (ret)
9802ad3d716SAlex Elder 		return ret;
9812ad3d716SAlex Elder 	ret = rbd_snap_features(rbd_dev, snap_id, &features);
9822ad3d716SAlex Elder 	if (ret)
9832ad3d716SAlex Elder 		return ret;
9842ad3d716SAlex Elder 
9852ad3d716SAlex Elder 	rbd_dev->mapping.size = size;
9862ad3d716SAlex Elder 	rbd_dev->mapping.features = features;
9872ad3d716SAlex Elder 
9888b0241f8SAlex Elder 	return 0;
989602adf40SYehuda Sadeh }
990602adf40SYehuda Sadeh 
991d1cf5788SAlex Elder static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
992d1cf5788SAlex Elder {
993d1cf5788SAlex Elder 	rbd_dev->mapping.size = 0;
994d1cf5788SAlex Elder 	rbd_dev->mapping.features = 0;
995d1cf5788SAlex Elder }
996d1cf5788SAlex Elder 
99798571b5aSAlex Elder static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
998602adf40SYehuda Sadeh {
99965ccfe21SAlex Elder 	char *name;
100065ccfe21SAlex Elder 	u64 segment;
100165ccfe21SAlex Elder 	int ret;
1002602adf40SYehuda Sadeh 
100378c2a44aSAlex Elder 	name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO);
100465ccfe21SAlex Elder 	if (!name)
100565ccfe21SAlex Elder 		return NULL;
100665ccfe21SAlex Elder 	segment = offset >> rbd_dev->header.obj_order;
10072fd82b9eSAlex Elder 	ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
100865ccfe21SAlex Elder 			rbd_dev->header.object_prefix, segment);
10092fd82b9eSAlex Elder 	if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
101065ccfe21SAlex Elder 		pr_err("error formatting segment name for #%llu (%d)\n",
101165ccfe21SAlex Elder 			segment, ret);
101265ccfe21SAlex Elder 		kfree(name);
101365ccfe21SAlex Elder 		name = NULL;
101465ccfe21SAlex Elder 	}
1015602adf40SYehuda Sadeh 
101665ccfe21SAlex Elder 	return name;
101765ccfe21SAlex Elder }
1018602adf40SYehuda Sadeh 
101978c2a44aSAlex Elder static void rbd_segment_name_free(const char *name)
102078c2a44aSAlex Elder {
102178c2a44aSAlex Elder 	/* The explicit cast here is needed to drop the const qualifier */
102278c2a44aSAlex Elder 
102378c2a44aSAlex Elder 	kmem_cache_free(rbd_segment_name_cache, (void *)name);
102478c2a44aSAlex Elder }
102578c2a44aSAlex Elder 
102665ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
102765ccfe21SAlex Elder {
102865ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
1029602adf40SYehuda Sadeh 
103065ccfe21SAlex Elder 	return offset & (segment_size - 1);
103165ccfe21SAlex Elder }
103265ccfe21SAlex Elder 
103365ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev,
103465ccfe21SAlex Elder 				u64 offset, u64 length)
103565ccfe21SAlex Elder {
103665ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
103765ccfe21SAlex Elder 
103865ccfe21SAlex Elder 	offset &= segment_size - 1;
103965ccfe21SAlex Elder 
1040aafb230eSAlex Elder 	rbd_assert(length <= U64_MAX - offset);
104165ccfe21SAlex Elder 	if (offset + length > segment_size)
104265ccfe21SAlex Elder 		length = segment_size - offset;
104365ccfe21SAlex Elder 
104465ccfe21SAlex Elder 	return length;
1045602adf40SYehuda Sadeh }
1046602adf40SYehuda Sadeh 
1047602adf40SYehuda Sadeh /*
1048029bcbd8SJosh Durgin  * returns the size of an object in the image
1049029bcbd8SJosh Durgin  */
1050029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header)
1051029bcbd8SJosh Durgin {
1052029bcbd8SJosh Durgin 	return 1 << header->obj_order;
1053029bcbd8SJosh Durgin }
1054029bcbd8SJosh Durgin 
1055029bcbd8SJosh Durgin /*
1056602adf40SYehuda Sadeh  * bio helpers
1057602adf40SYehuda Sadeh  */
1058602adf40SYehuda Sadeh 
1059602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain)
1060602adf40SYehuda Sadeh {
1061602adf40SYehuda Sadeh 	struct bio *tmp;
1062602adf40SYehuda Sadeh 
1063602adf40SYehuda Sadeh 	while (chain) {
1064602adf40SYehuda Sadeh 		tmp = chain;
1065602adf40SYehuda Sadeh 		chain = chain->bi_next;
1066602adf40SYehuda Sadeh 		bio_put(tmp);
1067602adf40SYehuda Sadeh 	}
1068602adf40SYehuda Sadeh }
1069602adf40SYehuda Sadeh 
1070602adf40SYehuda Sadeh /*
1071602adf40SYehuda Sadeh  * zeros a bio chain, starting at specific offset
1072602adf40SYehuda Sadeh  */
1073602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs)
1074602adf40SYehuda Sadeh {
1075602adf40SYehuda Sadeh 	struct bio_vec *bv;
1076602adf40SYehuda Sadeh 	unsigned long flags;
1077602adf40SYehuda Sadeh 	void *buf;
1078602adf40SYehuda Sadeh 	int i;
1079602adf40SYehuda Sadeh 	int pos = 0;
1080602adf40SYehuda Sadeh 
1081602adf40SYehuda Sadeh 	while (chain) {
1082602adf40SYehuda Sadeh 		bio_for_each_segment(bv, chain, i) {
1083602adf40SYehuda Sadeh 			if (pos + bv->bv_len > start_ofs) {
1084602adf40SYehuda Sadeh 				int remainder = max(start_ofs - pos, 0);
1085602adf40SYehuda Sadeh 				buf = bvec_kmap_irq(bv, &flags);
1086602adf40SYehuda Sadeh 				memset(buf + remainder, 0,
1087602adf40SYehuda Sadeh 				       bv->bv_len - remainder);
108885b5aaa6SDan Carpenter 				bvec_kunmap_irq(buf, &flags);
1089602adf40SYehuda Sadeh 			}
1090602adf40SYehuda Sadeh 			pos += bv->bv_len;
1091602adf40SYehuda Sadeh 		}
1092602adf40SYehuda Sadeh 
1093602adf40SYehuda Sadeh 		chain = chain->bi_next;
1094602adf40SYehuda Sadeh 	}
1095602adf40SYehuda Sadeh }
1096602adf40SYehuda Sadeh 
1097602adf40SYehuda Sadeh /*
1098b9434c5bSAlex Elder  * similar to zero_bio_chain(), zeros data defined by a page array,
1099b9434c5bSAlex Elder  * starting at the given byte offset from the start of the array and
1100b9434c5bSAlex Elder  * continuing up to the given end offset.  The pages array is
1101b9434c5bSAlex Elder  * assumed to be big enough to hold all bytes up to the end.
1102b9434c5bSAlex Elder  */
1103b9434c5bSAlex Elder static void zero_pages(struct page **pages, u64 offset, u64 end)
1104b9434c5bSAlex Elder {
1105b9434c5bSAlex Elder 	struct page **page = &pages[offset >> PAGE_SHIFT];
1106b9434c5bSAlex Elder 
1107b9434c5bSAlex Elder 	rbd_assert(end > offset);
1108b9434c5bSAlex Elder 	rbd_assert(end - offset <= (u64)SIZE_MAX);
1109b9434c5bSAlex Elder 	while (offset < end) {
1110b9434c5bSAlex Elder 		size_t page_offset;
1111b9434c5bSAlex Elder 		size_t length;
1112b9434c5bSAlex Elder 		unsigned long flags;
1113b9434c5bSAlex Elder 		void *kaddr;
1114b9434c5bSAlex Elder 
1115b9434c5bSAlex Elder 		page_offset = (size_t)(offset & ~PAGE_MASK);
1116b9434c5bSAlex Elder 		length = min(PAGE_SIZE - page_offset, (size_t)(end - offset));
1117b9434c5bSAlex Elder 		local_irq_save(flags);
1118b9434c5bSAlex Elder 		kaddr = kmap_atomic(*page);
1119b9434c5bSAlex Elder 		memset(kaddr + page_offset, 0, length);
1120b9434c5bSAlex Elder 		kunmap_atomic(kaddr);
1121b9434c5bSAlex Elder 		local_irq_restore(flags);
1122b9434c5bSAlex Elder 
1123b9434c5bSAlex Elder 		offset += length;
1124b9434c5bSAlex Elder 		page++;
1125b9434c5bSAlex Elder 	}
1126b9434c5bSAlex Elder }
1127b9434c5bSAlex Elder 
1128b9434c5bSAlex Elder /*
1129f7760dadSAlex Elder  * Clone a portion of a bio, starting at the given byte offset
1130f7760dadSAlex Elder  * and continuing for the number of bytes indicated.
1131602adf40SYehuda Sadeh  */
1132f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src,
1133f7760dadSAlex Elder 					unsigned int offset,
1134f7760dadSAlex Elder 					unsigned int len,
1135f7760dadSAlex Elder 					gfp_t gfpmask)
1136602adf40SYehuda Sadeh {
1137f7760dadSAlex Elder 	struct bio_vec *bv;
1138f7760dadSAlex Elder 	unsigned int resid;
1139f7760dadSAlex Elder 	unsigned short idx;
1140f7760dadSAlex Elder 	unsigned int voff;
1141f7760dadSAlex Elder 	unsigned short end_idx;
1142f7760dadSAlex Elder 	unsigned short vcnt;
1143f7760dadSAlex Elder 	struct bio *bio;
1144602adf40SYehuda Sadeh 
1145f7760dadSAlex Elder 	/* Handle the easy case for the caller */
1146f7760dadSAlex Elder 
1147f7760dadSAlex Elder 	if (!offset && len == bio_src->bi_size)
1148f7760dadSAlex Elder 		return bio_clone(bio_src, gfpmask);
1149f7760dadSAlex Elder 
1150f7760dadSAlex Elder 	if (WARN_ON_ONCE(!len))
1151f7760dadSAlex Elder 		return NULL;
1152f7760dadSAlex Elder 	if (WARN_ON_ONCE(len > bio_src->bi_size))
1153f7760dadSAlex Elder 		return NULL;
1154f7760dadSAlex Elder 	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
1155f7760dadSAlex Elder 		return NULL;
1156f7760dadSAlex Elder 
1157f7760dadSAlex Elder 	/* Find first affected segment... */
1158f7760dadSAlex Elder 
1159f7760dadSAlex Elder 	resid = offset;
1160f7760dadSAlex Elder 	__bio_for_each_segment(bv, bio_src, idx, 0) {
1161f7760dadSAlex Elder 		if (resid < bv->bv_len)
1162f7760dadSAlex Elder 			break;
1163f7760dadSAlex Elder 		resid -= bv->bv_len;
1164602adf40SYehuda Sadeh 	}
1165f7760dadSAlex Elder 	voff = resid;
1166602adf40SYehuda Sadeh 
1167f7760dadSAlex Elder 	/* ...and the last affected segment */
1168542582fcSAlex Elder 
1169f7760dadSAlex Elder 	resid += len;
1170f7760dadSAlex Elder 	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
1171f7760dadSAlex Elder 		if (resid <= bv->bv_len)
1172f7760dadSAlex Elder 			break;
1173f7760dadSAlex Elder 		resid -= bv->bv_len;
1174f7760dadSAlex Elder 	}
1175f7760dadSAlex Elder 	vcnt = end_idx - idx + 1;
1176602adf40SYehuda Sadeh 
1177f7760dadSAlex Elder 	/* Build the clone */
1178f7760dadSAlex Elder 
1179f7760dadSAlex Elder 	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
1180f7760dadSAlex Elder 	if (!bio)
1181f7760dadSAlex Elder 		return NULL;	/* ENOMEM */
1182f7760dadSAlex Elder 
1183f7760dadSAlex Elder 	bio->bi_bdev = bio_src->bi_bdev;
1184f7760dadSAlex Elder 	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
1185f7760dadSAlex Elder 	bio->bi_rw = bio_src->bi_rw;
1186f7760dadSAlex Elder 	bio->bi_flags |= 1 << BIO_CLONED;
1187602adf40SYehuda Sadeh 
1188602adf40SYehuda Sadeh 	/*
1189f7760dadSAlex Elder 	 * Copy over our part of the bio_vec, then update the first
1190f7760dadSAlex Elder 	 * and last (or only) entries.
1191602adf40SYehuda Sadeh 	 */
1192f7760dadSAlex Elder 	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
1193f7760dadSAlex Elder 			vcnt * sizeof (struct bio_vec));
1194f7760dadSAlex Elder 	bio->bi_io_vec[0].bv_offset += voff;
1195f7760dadSAlex Elder 	if (vcnt > 1) {
1196f7760dadSAlex Elder 		bio->bi_io_vec[0].bv_len -= voff;
1197f7760dadSAlex Elder 		bio->bi_io_vec[vcnt - 1].bv_len = resid;
1198602adf40SYehuda Sadeh 	} else {
1199f7760dadSAlex Elder 		bio->bi_io_vec[0].bv_len = len;
1200602adf40SYehuda Sadeh 	}
1201602adf40SYehuda Sadeh 
1202f7760dadSAlex Elder 	bio->bi_vcnt = vcnt;
1203f7760dadSAlex Elder 	bio->bi_size = len;
1204f7760dadSAlex Elder 	bio->bi_idx = 0;
1205602adf40SYehuda Sadeh 
1206f7760dadSAlex Elder 	return bio;
1207602adf40SYehuda Sadeh }
1208602adf40SYehuda Sadeh 
/*
 * Clone a portion of a bio chain, starting at the given byte offset
 * into the first bio in the source chain and continuing for the
 * number of bytes indicated.  The result is another bio chain of
 * exactly the given length, or a null pointer on error.
 *
 * The bio_src and offset parameters are both in-out.  On entry they
 * refer to the first source bio and the offset into that bio where
 * the start of data to be cloned is located.
 *
 * On return, *bio_src is updated to refer to the bio in the source
 * chain that contains the first un-cloned byte, and *offset will
 * contain the offset of that byte within that bio.
 */
1223f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src,
1224f7760dadSAlex Elder 					unsigned int *offset,
1225f7760dadSAlex Elder 					unsigned int len,
1226f7760dadSAlex Elder 					gfp_t gfpmask)
1227f7760dadSAlex Elder {
1228f7760dadSAlex Elder 	struct bio *bi = *bio_src;
1229f7760dadSAlex Elder 	unsigned int off = *offset;
1230f7760dadSAlex Elder 	struct bio *chain = NULL;
1231f7760dadSAlex Elder 	struct bio **end;
1232602adf40SYehuda Sadeh 
1233f7760dadSAlex Elder 	/* Build up a chain of clone bios up to the limit */
1234602adf40SYehuda Sadeh 
1235f7760dadSAlex Elder 	if (!bi || off >= bi->bi_size || !len)
1236f7760dadSAlex Elder 		return NULL;		/* Nothing to clone */
1237602adf40SYehuda Sadeh 
1238f7760dadSAlex Elder 	end = &chain;
1239f7760dadSAlex Elder 	while (len) {
1240f7760dadSAlex Elder 		unsigned int bi_size;
1241f7760dadSAlex Elder 		struct bio *bio;
1242f7760dadSAlex Elder 
1243f5400b7aSAlex Elder 		if (!bi) {
1244f5400b7aSAlex Elder 			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
1245f7760dadSAlex Elder 			goto out_err;	/* EINVAL; ran out of bio's */
1246f5400b7aSAlex Elder 		}
1247f7760dadSAlex Elder 		bi_size = min_t(unsigned int, bi->bi_size - off, len);
1248f7760dadSAlex Elder 		bio = bio_clone_range(bi, off, bi_size, gfpmask);
1249f7760dadSAlex Elder 		if (!bio)
1250f7760dadSAlex Elder 			goto out_err;	/* ENOMEM */
1251f7760dadSAlex Elder 
1252f7760dadSAlex Elder 		*end = bio;
1253f7760dadSAlex Elder 		end = &bio->bi_next;
1254f7760dadSAlex Elder 
1255f7760dadSAlex Elder 		off += bi_size;
1256f7760dadSAlex Elder 		if (off == bi->bi_size) {
1257f7760dadSAlex Elder 			bi = bi->bi_next;
1258f7760dadSAlex Elder 			off = 0;
1259f7760dadSAlex Elder 		}
1260f7760dadSAlex Elder 		len -= bi_size;
1261f7760dadSAlex Elder 	}
1262f7760dadSAlex Elder 	*bio_src = bi;
1263f7760dadSAlex Elder 	*offset = off;
1264f7760dadSAlex Elder 
1265f7760dadSAlex Elder 	return chain;
1266f7760dadSAlex Elder out_err:
1267f7760dadSAlex Elder 	bio_chain_put(chain);
1268f7760dadSAlex Elder 
1269602adf40SYehuda Sadeh 	return NULL;
1270602adf40SYehuda Sadeh }
1271602adf40SYehuda Sadeh 
1272926f9b3fSAlex Elder /*
1273926f9b3fSAlex Elder  * The default/initial value for all object request flags is 0.  For
1274926f9b3fSAlex Elder  * each flag, once its value is set to 1 it is never reset to 0
1275926f9b3fSAlex Elder  * again.
1276926f9b3fSAlex Elder  */
12776365d33aSAlex Elder static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
12786365d33aSAlex Elder {
12796365d33aSAlex Elder 	if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
12806365d33aSAlex Elder 		struct rbd_device *rbd_dev;
12816365d33aSAlex Elder 
128257acbaa7SAlex Elder 		rbd_dev = obj_request->img_request->rbd_dev;
12836365d33aSAlex Elder 		rbd_warn(rbd_dev, "obj_request %p already marked img_data\n",
12846365d33aSAlex Elder 			obj_request);
12856365d33aSAlex Elder 	}
12866365d33aSAlex Elder }
12876365d33aSAlex Elder 
12886365d33aSAlex Elder static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
12896365d33aSAlex Elder {
12906365d33aSAlex Elder 	smp_mb();
12916365d33aSAlex Elder 	return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
12926365d33aSAlex Elder }
12936365d33aSAlex Elder 
129457acbaa7SAlex Elder static void obj_request_done_set(struct rbd_obj_request *obj_request)
129557acbaa7SAlex Elder {
129657acbaa7SAlex Elder 	if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
129757acbaa7SAlex Elder 		struct rbd_device *rbd_dev = NULL;
129857acbaa7SAlex Elder 
129957acbaa7SAlex Elder 		if (obj_request_img_data_test(obj_request))
130057acbaa7SAlex Elder 			rbd_dev = obj_request->img_request->rbd_dev;
130157acbaa7SAlex Elder 		rbd_warn(rbd_dev, "obj_request %p already marked done\n",
130257acbaa7SAlex Elder 			obj_request);
130357acbaa7SAlex Elder 	}
130457acbaa7SAlex Elder }
130557acbaa7SAlex Elder 
130657acbaa7SAlex Elder static bool obj_request_done_test(struct rbd_obj_request *obj_request)
130757acbaa7SAlex Elder {
130857acbaa7SAlex Elder 	smp_mb();
130957acbaa7SAlex Elder 	return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
131057acbaa7SAlex Elder }
131157acbaa7SAlex Elder 
13125679c59fSAlex Elder /*
13135679c59fSAlex Elder  * This sets the KNOWN flag after (possibly) setting the EXISTS
13145679c59fSAlex Elder  * flag.  The latter is set based on the "exists" value provided.
13155679c59fSAlex Elder  *
13165679c59fSAlex Elder  * Note that for our purposes once an object exists it never goes
13175679c59fSAlex Elder  * away again.  It's possible that the response from two existence
13185679c59fSAlex Elder  * checks are separated by the creation of the target object, and
13195679c59fSAlex Elder  * the first ("doesn't exist") response arrives *after* the second
13205679c59fSAlex Elder  * ("does exist").  In that case we ignore the second one.
13215679c59fSAlex Elder  */
13225679c59fSAlex Elder static void obj_request_existence_set(struct rbd_obj_request *obj_request,
13235679c59fSAlex Elder 				bool exists)
13245679c59fSAlex Elder {
13255679c59fSAlex Elder 	if (exists)
13265679c59fSAlex Elder 		set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
13275679c59fSAlex Elder 	set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
13285679c59fSAlex Elder 	smp_mb();
13295679c59fSAlex Elder }
13305679c59fSAlex Elder 
13315679c59fSAlex Elder static bool obj_request_known_test(struct rbd_obj_request *obj_request)
13325679c59fSAlex Elder {
13335679c59fSAlex Elder 	smp_mb();
13345679c59fSAlex Elder 	return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
13355679c59fSAlex Elder }
13365679c59fSAlex Elder 
13375679c59fSAlex Elder static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
13385679c59fSAlex Elder {
13395679c59fSAlex Elder 	smp_mb();
13405679c59fSAlex Elder 	return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
13415679c59fSAlex Elder }
13425679c59fSAlex Elder 
1343bf0d5f50SAlex Elder static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1344bf0d5f50SAlex Elder {
134537206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
134637206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1347bf0d5f50SAlex Elder 	kref_get(&obj_request->kref);
1348bf0d5f50SAlex Elder }
1349bf0d5f50SAlex Elder 
1350bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref);
1351bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1352bf0d5f50SAlex Elder {
1353bf0d5f50SAlex Elder 	rbd_assert(obj_request != NULL);
135437206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
135537206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1356bf0d5f50SAlex Elder 	kref_put(&obj_request->kref, rbd_obj_request_destroy);
1357bf0d5f50SAlex Elder }
1358bf0d5f50SAlex Elder 
1359bf0d5f50SAlex Elder static void rbd_img_request_get(struct rbd_img_request *img_request)
1360bf0d5f50SAlex Elder {
136137206ee5SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
136237206ee5SAlex Elder 		atomic_read(&img_request->kref.refcount));
1363bf0d5f50SAlex Elder 	kref_get(&img_request->kref);
1364bf0d5f50SAlex Elder }
1365bf0d5f50SAlex Elder 
1366bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref);
1367bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request)
1368bf0d5f50SAlex Elder {
1369bf0d5f50SAlex Elder 	rbd_assert(img_request != NULL);
137037206ee5SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
137137206ee5SAlex Elder 		atomic_read(&img_request->kref.refcount));
1372bf0d5f50SAlex Elder 	kref_put(&img_request->kref, rbd_img_request_destroy);
1373bf0d5f50SAlex Elder }
1374bf0d5f50SAlex Elder 
1375bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1376bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1377bf0d5f50SAlex Elder {
137825dcf954SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
137925dcf954SAlex Elder 
1380b155e86cSAlex Elder 	/* Image request now owns object's original reference */
1381bf0d5f50SAlex Elder 	obj_request->img_request = img_request;
138225dcf954SAlex Elder 	obj_request->which = img_request->obj_request_count;
13836365d33aSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
13846365d33aSAlex Elder 	obj_request_img_data_set(obj_request);
1385bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
138625dcf954SAlex Elder 	img_request->obj_request_count++;
138725dcf954SAlex Elder 	list_add_tail(&obj_request->links, &img_request->obj_requests);
138837206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
138937206ee5SAlex Elder 		obj_request->which);
1390bf0d5f50SAlex Elder }
1391bf0d5f50SAlex Elder 
1392bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1393bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1394bf0d5f50SAlex Elder {
1395bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
139625dcf954SAlex Elder 
139737206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
139837206ee5SAlex Elder 		obj_request->which);
1399bf0d5f50SAlex Elder 	list_del(&obj_request->links);
140025dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
140125dcf954SAlex Elder 	img_request->obj_request_count--;
140225dcf954SAlex Elder 	rbd_assert(obj_request->which == img_request->obj_request_count);
140325dcf954SAlex Elder 	obj_request->which = BAD_WHICH;
14046365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
1405bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == img_request);
1406bf0d5f50SAlex Elder 	obj_request->img_request = NULL;
140725dcf954SAlex Elder 	obj_request->callback = NULL;
1408bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
1409bf0d5f50SAlex Elder }
1410bf0d5f50SAlex Elder 
1411bf0d5f50SAlex Elder static bool obj_request_type_valid(enum obj_request_type type)
1412bf0d5f50SAlex Elder {
1413bf0d5f50SAlex Elder 	switch (type) {
14149969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
1415bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1416788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1417bf0d5f50SAlex Elder 		return true;
1418bf0d5f50SAlex Elder 	default:
1419bf0d5f50SAlex Elder 		return false;
1420bf0d5f50SAlex Elder 	}
1421bf0d5f50SAlex Elder }
1422bf0d5f50SAlex Elder 
1423bf0d5f50SAlex Elder static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1424bf0d5f50SAlex Elder 				struct rbd_obj_request *obj_request)
1425bf0d5f50SAlex Elder {
142637206ee5SAlex Elder 	dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);
142737206ee5SAlex Elder 
1428bf0d5f50SAlex Elder 	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1429bf0d5f50SAlex Elder }
1430bf0d5f50SAlex Elder 
1431bf0d5f50SAlex Elder static void rbd_img_request_complete(struct rbd_img_request *img_request)
1432bf0d5f50SAlex Elder {
143355f27e09SAlex Elder 
143437206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
143555f27e09SAlex Elder 
143655f27e09SAlex Elder 	/*
143755f27e09SAlex Elder 	 * If no error occurred, compute the aggregate transfer
143855f27e09SAlex Elder 	 * count for the image request.  We could instead use
143955f27e09SAlex Elder 	 * atomic64_cmpxchg() to update it as each object request
144055f27e09SAlex Elder 	 * completes; not clear which way is better off hand.
144155f27e09SAlex Elder 	 */
144255f27e09SAlex Elder 	if (!img_request->result) {
144355f27e09SAlex Elder 		struct rbd_obj_request *obj_request;
144455f27e09SAlex Elder 		u64 xferred = 0;
144555f27e09SAlex Elder 
144655f27e09SAlex Elder 		for_each_obj_request(img_request, obj_request)
144755f27e09SAlex Elder 			xferred += obj_request->xferred;
144855f27e09SAlex Elder 		img_request->xferred = xferred;
144955f27e09SAlex Elder 	}
145055f27e09SAlex Elder 
1451bf0d5f50SAlex Elder 	if (img_request->callback)
1452bf0d5f50SAlex Elder 		img_request->callback(img_request);
1453bf0d5f50SAlex Elder 	else
1454bf0d5f50SAlex Elder 		rbd_img_request_put(img_request);
1455bf0d5f50SAlex Elder }
1456bf0d5f50SAlex Elder 
1457788e2df3SAlex Elder /* Caller is responsible for rbd_obj_request_destroy(obj_request) */
1458788e2df3SAlex Elder 
1459788e2df3SAlex Elder static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
1460788e2df3SAlex Elder {
146137206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
146237206ee5SAlex Elder 
1463788e2df3SAlex Elder 	return wait_for_completion_interruptible(&obj_request->completion);
1464788e2df3SAlex Elder }
1465788e2df3SAlex Elder 
14660c425248SAlex Elder /*
14670c425248SAlex Elder  * The default/initial value for all image request flags is 0.  Each
14680c425248SAlex Elder  * is conditionally set to 1 at image request initialization time
14690c425248SAlex Elder  * and currently never changes thereafter.
14700c425248SAlex Elder  */
14710c425248SAlex Elder static void img_request_write_set(struct rbd_img_request *img_request)
14720c425248SAlex Elder {
14730c425248SAlex Elder 	set_bit(IMG_REQ_WRITE, &img_request->flags);
14740c425248SAlex Elder 	smp_mb();
14750c425248SAlex Elder }
14760c425248SAlex Elder 
14770c425248SAlex Elder static bool img_request_write_test(struct rbd_img_request *img_request)
14780c425248SAlex Elder {
14790c425248SAlex Elder 	smp_mb();
14800c425248SAlex Elder 	return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
14810c425248SAlex Elder }
14820c425248SAlex Elder 
14839849e986SAlex Elder static void img_request_child_set(struct rbd_img_request *img_request)
14849849e986SAlex Elder {
14859849e986SAlex Elder 	set_bit(IMG_REQ_CHILD, &img_request->flags);
14869849e986SAlex Elder 	smp_mb();
14879849e986SAlex Elder }
14889849e986SAlex Elder 
14899849e986SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request)
14909849e986SAlex Elder {
14919849e986SAlex Elder 	smp_mb();
14929849e986SAlex Elder 	return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
14939849e986SAlex Elder }
14949849e986SAlex Elder 
1495d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request)
1496d0b2e944SAlex Elder {
1497d0b2e944SAlex Elder 	set_bit(IMG_REQ_LAYERED, &img_request->flags);
1498d0b2e944SAlex Elder 	smp_mb();
1499d0b2e944SAlex Elder }
1500d0b2e944SAlex Elder 
1501d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request)
1502d0b2e944SAlex Elder {
1503d0b2e944SAlex Elder 	smp_mb();
1504d0b2e944SAlex Elder 	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1505d0b2e944SAlex Elder }
1506d0b2e944SAlex Elder 
15076e2a4505SAlex Elder static void
15086e2a4505SAlex Elder rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
15096e2a4505SAlex Elder {
1510b9434c5bSAlex Elder 	u64 xferred = obj_request->xferred;
1511b9434c5bSAlex Elder 	u64 length = obj_request->length;
1512b9434c5bSAlex Elder 
15136e2a4505SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
15146e2a4505SAlex Elder 		obj_request, obj_request->img_request, obj_request->result,
1515b9434c5bSAlex Elder 		xferred, length);
15166e2a4505SAlex Elder 	/*
15176e2a4505SAlex Elder 	 * ENOENT means a hole in the image.  We zero-fill the
15186e2a4505SAlex Elder 	 * entire length of the request.  A short read also implies
15196e2a4505SAlex Elder 	 * zero-fill to the end of the request.  Either way we
15206e2a4505SAlex Elder 	 * update the xferred count to indicate the whole request
15216e2a4505SAlex Elder 	 * was satisfied.
15226e2a4505SAlex Elder 	 */
1523b9434c5bSAlex Elder 	rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
15246e2a4505SAlex Elder 	if (obj_request->result == -ENOENT) {
1525b9434c5bSAlex Elder 		if (obj_request->type == OBJ_REQUEST_BIO)
15266e2a4505SAlex Elder 			zero_bio_chain(obj_request->bio_list, 0);
1527b9434c5bSAlex Elder 		else
1528b9434c5bSAlex Elder 			zero_pages(obj_request->pages, 0, length);
15296e2a4505SAlex Elder 		obj_request->result = 0;
1530b9434c5bSAlex Elder 		obj_request->xferred = length;
1531b9434c5bSAlex Elder 	} else if (xferred < length && !obj_request->result) {
1532b9434c5bSAlex Elder 		if (obj_request->type == OBJ_REQUEST_BIO)
1533b9434c5bSAlex Elder 			zero_bio_chain(obj_request->bio_list, xferred);
1534b9434c5bSAlex Elder 		else
1535b9434c5bSAlex Elder 			zero_pages(obj_request->pages, xferred, length);
1536b9434c5bSAlex Elder 		obj_request->xferred = length;
15376e2a4505SAlex Elder 	}
15386e2a4505SAlex Elder 	obj_request_done_set(obj_request);
15396e2a4505SAlex Elder }
15406e2a4505SAlex Elder 
1541bf0d5f50SAlex Elder static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1542bf0d5f50SAlex Elder {
154337206ee5SAlex Elder 	dout("%s: obj %p cb %p\n", __func__, obj_request,
154437206ee5SAlex Elder 		obj_request->callback);
1545bf0d5f50SAlex Elder 	if (obj_request->callback)
1546bf0d5f50SAlex Elder 		obj_request->callback(obj_request);
1547788e2df3SAlex Elder 	else
1548788e2df3SAlex Elder 		complete_all(&obj_request->completion);
1549bf0d5f50SAlex Elder }
1550bf0d5f50SAlex Elder 
/*
 * Handler for osd ops that need no post-processing: the object
 * request is simply marked done.
 */
static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
155639bf2c5dSAlex Elder 
1557c47f9371SAlex Elder static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
1558bf0d5f50SAlex Elder {
155957acbaa7SAlex Elder 	struct rbd_img_request *img_request = NULL;
1560a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev = NULL;
156157acbaa7SAlex Elder 	bool layered = false;
156257acbaa7SAlex Elder 
156357acbaa7SAlex Elder 	if (obj_request_img_data_test(obj_request)) {
156457acbaa7SAlex Elder 		img_request = obj_request->img_request;
156557acbaa7SAlex Elder 		layered = img_request && img_request_layered_test(img_request);
1566a9e8ba2cSAlex Elder 		rbd_dev = img_request->rbd_dev;
156757acbaa7SAlex Elder 	}
15688b3e1a56SAlex Elder 
15698b3e1a56SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
15708b3e1a56SAlex Elder 		obj_request, img_request, obj_request->result,
15718b3e1a56SAlex Elder 		obj_request->xferred, obj_request->length);
1572a9e8ba2cSAlex Elder 	if (layered && obj_request->result == -ENOENT &&
1573a9e8ba2cSAlex Elder 			obj_request->img_offset < rbd_dev->parent_overlap)
15748b3e1a56SAlex Elder 		rbd_img_parent_read(obj_request);
15758b3e1a56SAlex Elder 	else if (img_request)
15766e2a4505SAlex Elder 		rbd_img_obj_request_read_callback(obj_request);
15776e2a4505SAlex Elder 	else
157807741308SAlex Elder 		obj_request_done_set(obj_request);
1579bf0d5f50SAlex Elder }
1580bf0d5f50SAlex Elder 
1581c47f9371SAlex Elder static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
1582bf0d5f50SAlex Elder {
15831b83bef2SSage Weil 	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
15841b83bef2SSage Weil 		obj_request->result, obj_request->length);
15851b83bef2SSage Weil 	/*
15868b3e1a56SAlex Elder 	 * There is no such thing as a successful short write.  Set
15878b3e1a56SAlex Elder 	 * it to our originally-requested length.
15881b83bef2SSage Weil 	 */
15891b83bef2SSage Weil 	obj_request->xferred = obj_request->length;
159007741308SAlex Elder 	obj_request_done_set(obj_request);
1591bf0d5f50SAlex Elder }
1592bf0d5f50SAlex Elder 
/*
 * Handle completion of an osd stat op.  A simple stat call needs no
 * post-processing, so the object request is just marked done.  More
 * will be done when the stat is part of a write sequence for a
 * layered image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
1602fbfab539SAlex Elder 
1603bf0d5f50SAlex Elder static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1604bf0d5f50SAlex Elder 				struct ceph_msg *msg)
1605bf0d5f50SAlex Elder {
1606bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = osd_req->r_priv;
1607bf0d5f50SAlex Elder 	u16 opcode;
1608bf0d5f50SAlex Elder 
160937206ee5SAlex Elder 	dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
1610bf0d5f50SAlex Elder 	rbd_assert(osd_req == obj_request->osd_req);
161157acbaa7SAlex Elder 	if (obj_request_img_data_test(obj_request)) {
161257acbaa7SAlex Elder 		rbd_assert(obj_request->img_request);
161357acbaa7SAlex Elder 		rbd_assert(obj_request->which != BAD_WHICH);
161457acbaa7SAlex Elder 	} else {
161557acbaa7SAlex Elder 		rbd_assert(obj_request->which == BAD_WHICH);
161657acbaa7SAlex Elder 	}
1617bf0d5f50SAlex Elder 
16181b83bef2SSage Weil 	if (osd_req->r_result < 0)
16191b83bef2SSage Weil 		obj_request->result = osd_req->r_result;
1620bf0d5f50SAlex Elder 
16210eefd470SAlex Elder 	BUG_ON(osd_req->r_num_ops > 2);
1622bf0d5f50SAlex Elder 
1623c47f9371SAlex Elder 	/*
1624c47f9371SAlex Elder 	 * We support a 64-bit length, but ultimately it has to be
1625c47f9371SAlex Elder 	 * passed to blk_end_request(), which takes an unsigned int.
1626c47f9371SAlex Elder 	 */
16271b83bef2SSage Weil 	obj_request->xferred = osd_req->r_reply_op_len[0];
1628c47f9371SAlex Elder 	rbd_assert(obj_request->xferred < (u64)UINT_MAX);
162979528734SAlex Elder 	opcode = osd_req->r_ops[0].op;
1630bf0d5f50SAlex Elder 	switch (opcode) {
1631bf0d5f50SAlex Elder 	case CEPH_OSD_OP_READ:
1632c47f9371SAlex Elder 		rbd_osd_read_callback(obj_request);
1633bf0d5f50SAlex Elder 		break;
1634bf0d5f50SAlex Elder 	case CEPH_OSD_OP_WRITE:
1635c47f9371SAlex Elder 		rbd_osd_write_callback(obj_request);
1636bf0d5f50SAlex Elder 		break;
1637fbfab539SAlex Elder 	case CEPH_OSD_OP_STAT:
1638c47f9371SAlex Elder 		rbd_osd_stat_callback(obj_request);
1639fbfab539SAlex Elder 		break;
164036be9a76SAlex Elder 	case CEPH_OSD_OP_CALL:
1641b8d70035SAlex Elder 	case CEPH_OSD_OP_NOTIFY_ACK:
16429969ebc5SAlex Elder 	case CEPH_OSD_OP_WATCH:
1643c47f9371SAlex Elder 		rbd_osd_trivial_callback(obj_request);
16449969ebc5SAlex Elder 		break;
1645bf0d5f50SAlex Elder 	default:
1646bf0d5f50SAlex Elder 		rbd_warn(NULL, "%s: unsupported op %hu\n",
1647bf0d5f50SAlex Elder 			obj_request->object_name, (unsigned short) opcode);
1648bf0d5f50SAlex Elder 		break;
1649bf0d5f50SAlex Elder 	}
1650bf0d5f50SAlex Elder 
165107741308SAlex Elder 	if (obj_request_done_test(obj_request))
1652bf0d5f50SAlex Elder 		rbd_obj_request_complete(obj_request);
1653bf0d5f50SAlex Elder }
1654bf0d5f50SAlex Elder 
16559d4df01fSAlex Elder static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
1656430c28c3SAlex Elder {
1657430c28c3SAlex Elder 	struct rbd_img_request *img_request = obj_request->img_request;
16588c042b0dSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
16599d4df01fSAlex Elder 	u64 snap_id;
1660430c28c3SAlex Elder 
16618c042b0dSAlex Elder 	rbd_assert(osd_req != NULL);
1662430c28c3SAlex Elder 
16639d4df01fSAlex Elder 	snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
16648c042b0dSAlex Elder 	ceph_osdc_build_request(osd_req, obj_request->offset,
16659d4df01fSAlex Elder 			NULL, snap_id, NULL);
16669d4df01fSAlex Elder }
16679d4df01fSAlex Elder 
16689d4df01fSAlex Elder static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
16699d4df01fSAlex Elder {
16709d4df01fSAlex Elder 	struct rbd_img_request *img_request = obj_request->img_request;
16719d4df01fSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
16729d4df01fSAlex Elder 	struct ceph_snap_context *snapc;
16739d4df01fSAlex Elder 	struct timespec mtime = CURRENT_TIME;
16749d4df01fSAlex Elder 
16759d4df01fSAlex Elder 	rbd_assert(osd_req != NULL);
16769d4df01fSAlex Elder 
16779d4df01fSAlex Elder 	snapc = img_request ? img_request->snapc : NULL;
16789d4df01fSAlex Elder 	ceph_osdc_build_request(osd_req, obj_request->offset,
16799d4df01fSAlex Elder 			snapc, CEPH_NOSNAP, &mtime);
1680430c28c3SAlex Elder }
1681430c28c3SAlex Elder 
1682bf0d5f50SAlex Elder static struct ceph_osd_request *rbd_osd_req_create(
1683bf0d5f50SAlex Elder 					struct rbd_device *rbd_dev,
1684bf0d5f50SAlex Elder 					bool write_request,
1685430c28c3SAlex Elder 					struct rbd_obj_request *obj_request)
1686bf0d5f50SAlex Elder {
1687bf0d5f50SAlex Elder 	struct ceph_snap_context *snapc = NULL;
1688bf0d5f50SAlex Elder 	struct ceph_osd_client *osdc;
1689bf0d5f50SAlex Elder 	struct ceph_osd_request *osd_req;
1690bf0d5f50SAlex Elder 
16916365d33aSAlex Elder 	if (obj_request_img_data_test(obj_request)) {
16926365d33aSAlex Elder 		struct rbd_img_request *img_request = obj_request->img_request;
16936365d33aSAlex Elder 
16940c425248SAlex Elder 		rbd_assert(write_request ==
16950c425248SAlex Elder 				img_request_write_test(img_request));
16960c425248SAlex Elder 		if (write_request)
1697bf0d5f50SAlex Elder 			snapc = img_request->snapc;
1698bf0d5f50SAlex Elder 	}
1699bf0d5f50SAlex Elder 
1700bf0d5f50SAlex Elder 	/* Allocate and initialize the request, for the single op */
1701bf0d5f50SAlex Elder 
1702bf0d5f50SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
1703bf0d5f50SAlex Elder 	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
1704bf0d5f50SAlex Elder 	if (!osd_req)
1705bf0d5f50SAlex Elder 		return NULL;	/* ENOMEM */
1706bf0d5f50SAlex Elder 
1707430c28c3SAlex Elder 	if (write_request)
1708bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1709430c28c3SAlex Elder 	else
1710bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_READ;
1711bf0d5f50SAlex Elder 
1712bf0d5f50SAlex Elder 	osd_req->r_callback = rbd_osd_req_callback;
1713bf0d5f50SAlex Elder 	osd_req->r_priv = obj_request;
1714bf0d5f50SAlex Elder 
1715bf0d5f50SAlex Elder 	osd_req->r_oid_len = strlen(obj_request->object_name);
1716bf0d5f50SAlex Elder 	rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1717bf0d5f50SAlex Elder 	memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1718bf0d5f50SAlex Elder 
1719bf0d5f50SAlex Elder 	osd_req->r_file_layout = rbd_dev->layout;	/* struct */
1720bf0d5f50SAlex Elder 
1721bf0d5f50SAlex Elder 	return osd_req;
1722bf0d5f50SAlex Elder }
1723bf0d5f50SAlex Elder 
17240eefd470SAlex Elder /*
17250eefd470SAlex Elder  * Create a copyup osd request based on the information in the
17260eefd470SAlex Elder  * object request supplied.  A copyup request has two osd ops,
17270eefd470SAlex Elder  * a copyup method call, and a "normal" write request.
17280eefd470SAlex Elder  */
17290eefd470SAlex Elder static struct ceph_osd_request *
17300eefd470SAlex Elder rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
17310eefd470SAlex Elder {
17320eefd470SAlex Elder 	struct rbd_img_request *img_request;
17330eefd470SAlex Elder 	struct ceph_snap_context *snapc;
17340eefd470SAlex Elder 	struct rbd_device *rbd_dev;
17350eefd470SAlex Elder 	struct ceph_osd_client *osdc;
17360eefd470SAlex Elder 	struct ceph_osd_request *osd_req;
17370eefd470SAlex Elder 
17380eefd470SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
17390eefd470SAlex Elder 	img_request = obj_request->img_request;
17400eefd470SAlex Elder 	rbd_assert(img_request);
17410eefd470SAlex Elder 	rbd_assert(img_request_write_test(img_request));
17420eefd470SAlex Elder 
17430eefd470SAlex Elder 	/* Allocate and initialize the request, for the two ops */
17440eefd470SAlex Elder 
17450eefd470SAlex Elder 	snapc = img_request->snapc;
17460eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
17470eefd470SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
17480eefd470SAlex Elder 	osd_req = ceph_osdc_alloc_request(osdc, snapc, 2, false, GFP_ATOMIC);
17490eefd470SAlex Elder 	if (!osd_req)
17500eefd470SAlex Elder 		return NULL;	/* ENOMEM */
17510eefd470SAlex Elder 
17520eefd470SAlex Elder 	osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
17530eefd470SAlex Elder 	osd_req->r_callback = rbd_osd_req_callback;
17540eefd470SAlex Elder 	osd_req->r_priv = obj_request;
17550eefd470SAlex Elder 
17560eefd470SAlex Elder 	osd_req->r_oid_len = strlen(obj_request->object_name);
17570eefd470SAlex Elder 	rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
17580eefd470SAlex Elder 	memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
17590eefd470SAlex Elder 
17600eefd470SAlex Elder 	osd_req->r_file_layout = rbd_dev->layout;	/* struct */
17610eefd470SAlex Elder 
17620eefd470SAlex Elder 	return osd_req;
17630eefd470SAlex Elder }
17640eefd470SAlex Elder 
17650eefd470SAlex Elder 
/*
 * Release an osd request by dropping a reference on it via
 * ceph_osdc_put_request().
 */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
1770bf0d5f50SAlex Elder 
1771bf0d5f50SAlex Elder /* object_name is assumed to be a non-null pointer and NUL-terminated */
1772bf0d5f50SAlex Elder 
1773bf0d5f50SAlex Elder static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1774bf0d5f50SAlex Elder 						u64 offset, u64 length,
1775bf0d5f50SAlex Elder 						enum obj_request_type type)
1776bf0d5f50SAlex Elder {
1777bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1778bf0d5f50SAlex Elder 	size_t size;
1779bf0d5f50SAlex Elder 	char *name;
1780bf0d5f50SAlex Elder 
1781bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(type));
1782bf0d5f50SAlex Elder 
1783bf0d5f50SAlex Elder 	size = strlen(object_name) + 1;
1784f907ad55SAlex Elder 	name = kmalloc(size, GFP_KERNEL);
1785f907ad55SAlex Elder 	if (!name)
1786bf0d5f50SAlex Elder 		return NULL;
1787bf0d5f50SAlex Elder 
1788868311b1SAlex Elder 	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_KERNEL);
1789f907ad55SAlex Elder 	if (!obj_request) {
1790f907ad55SAlex Elder 		kfree(name);
1791f907ad55SAlex Elder 		return NULL;
1792f907ad55SAlex Elder 	}
1793f907ad55SAlex Elder 
1794bf0d5f50SAlex Elder 	obj_request->object_name = memcpy(name, object_name, size);
1795bf0d5f50SAlex Elder 	obj_request->offset = offset;
1796bf0d5f50SAlex Elder 	obj_request->length = length;
1797926f9b3fSAlex Elder 	obj_request->flags = 0;
1798bf0d5f50SAlex Elder 	obj_request->which = BAD_WHICH;
1799bf0d5f50SAlex Elder 	obj_request->type = type;
1800bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&obj_request->links);
1801788e2df3SAlex Elder 	init_completion(&obj_request->completion);
1802bf0d5f50SAlex Elder 	kref_init(&obj_request->kref);
1803bf0d5f50SAlex Elder 
180437206ee5SAlex Elder 	dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
180537206ee5SAlex Elder 		offset, length, (int)type, obj_request);
180637206ee5SAlex Elder 
1807bf0d5f50SAlex Elder 	return obj_request;
1808bf0d5f50SAlex Elder }
1809bf0d5f50SAlex Elder 
1810bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref)
1811bf0d5f50SAlex Elder {
1812bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1813bf0d5f50SAlex Elder 
1814bf0d5f50SAlex Elder 	obj_request = container_of(kref, struct rbd_obj_request, kref);
1815bf0d5f50SAlex Elder 
181637206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
181737206ee5SAlex Elder 
1818bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
1819bf0d5f50SAlex Elder 	rbd_assert(obj_request->which == BAD_WHICH);
1820bf0d5f50SAlex Elder 
1821bf0d5f50SAlex Elder 	if (obj_request->osd_req)
1822bf0d5f50SAlex Elder 		rbd_osd_req_destroy(obj_request->osd_req);
1823bf0d5f50SAlex Elder 
1824bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
1825bf0d5f50SAlex Elder 	switch (obj_request->type) {
18269969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
18279969ebc5SAlex Elder 		break;		/* Nothing to do */
1828bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1829bf0d5f50SAlex Elder 		if (obj_request->bio_list)
1830bf0d5f50SAlex Elder 			bio_chain_put(obj_request->bio_list);
1831bf0d5f50SAlex Elder 		break;
1832788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1833788e2df3SAlex Elder 		if (obj_request->pages)
1834788e2df3SAlex Elder 			ceph_release_page_vector(obj_request->pages,
1835788e2df3SAlex Elder 						obj_request->page_count);
1836788e2df3SAlex Elder 		break;
1837bf0d5f50SAlex Elder 	}
1838bf0d5f50SAlex Elder 
1839f907ad55SAlex Elder 	kfree(obj_request->object_name);
1840868311b1SAlex Elder 	obj_request->object_name = NULL;
1841868311b1SAlex Elder 	kmem_cache_free(rbd_obj_request_cache, obj_request);
1842bf0d5f50SAlex Elder }
1843bf0d5f50SAlex Elder 
1844bf0d5f50SAlex Elder /*
1845bf0d5f50SAlex Elder  * Caller is responsible for filling in the list of object requests
1846bf0d5f50SAlex Elder  * that comprises the image request, and the Linux request pointer
1847bf0d5f50SAlex Elder  * (if there is one).
1848bf0d5f50SAlex Elder  */
1849cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create(
1850cc344fa1SAlex Elder 					struct rbd_device *rbd_dev,
1851bf0d5f50SAlex Elder 					u64 offset, u64 length,
18529849e986SAlex Elder 					bool write_request,
18539849e986SAlex Elder 					bool child_request)
1854bf0d5f50SAlex Elder {
1855bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
1856bf0d5f50SAlex Elder 
18571c2a9dfeSAlex Elder 	img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_ATOMIC);
1858bf0d5f50SAlex Elder 	if (!img_request)
1859bf0d5f50SAlex Elder 		return NULL;
1860bf0d5f50SAlex Elder 
1861bf0d5f50SAlex Elder 	if (write_request) {
1862bf0d5f50SAlex Elder 		down_read(&rbd_dev->header_rwsem);
1863812164f8SAlex Elder 		ceph_get_snap_context(rbd_dev->header.snapc);
1864bf0d5f50SAlex Elder 		up_read(&rbd_dev->header_rwsem);
1865bf0d5f50SAlex Elder 	}
1866bf0d5f50SAlex Elder 
1867bf0d5f50SAlex Elder 	img_request->rq = NULL;
1868bf0d5f50SAlex Elder 	img_request->rbd_dev = rbd_dev;
1869bf0d5f50SAlex Elder 	img_request->offset = offset;
1870bf0d5f50SAlex Elder 	img_request->length = length;
18710c425248SAlex Elder 	img_request->flags = 0;
18720c425248SAlex Elder 	if (write_request) {
18730c425248SAlex Elder 		img_request_write_set(img_request);
1874468521c1SAlex Elder 		img_request->snapc = rbd_dev->header.snapc;
18750c425248SAlex Elder 	} else {
1876bf0d5f50SAlex Elder 		img_request->snap_id = rbd_dev->spec->snap_id;
18770c425248SAlex Elder 	}
18789849e986SAlex Elder 	if (child_request)
18799849e986SAlex Elder 		img_request_child_set(img_request);
1880d0b2e944SAlex Elder 	if (rbd_dev->parent_spec)
1881d0b2e944SAlex Elder 		img_request_layered_set(img_request);
1882bf0d5f50SAlex Elder 	spin_lock_init(&img_request->completion_lock);
1883bf0d5f50SAlex Elder 	img_request->next_completion = 0;
1884bf0d5f50SAlex Elder 	img_request->callback = NULL;
1885a5a337d4SAlex Elder 	img_request->result = 0;
1886bf0d5f50SAlex Elder 	img_request->obj_request_count = 0;
1887bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&img_request->obj_requests);
1888bf0d5f50SAlex Elder 	kref_init(&img_request->kref);
1889bf0d5f50SAlex Elder 
1890bf0d5f50SAlex Elder 	rbd_img_request_get(img_request);	/* Avoid a warning */
1891bf0d5f50SAlex Elder 	rbd_img_request_put(img_request);	/* TEMPORARY */
1892bf0d5f50SAlex Elder 
189337206ee5SAlex Elder 	dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
189437206ee5SAlex Elder 		write_request ? "write" : "read", offset, length,
189537206ee5SAlex Elder 		img_request);
189637206ee5SAlex Elder 
1897bf0d5f50SAlex Elder 	return img_request;
1898bf0d5f50SAlex Elder }
1899bf0d5f50SAlex Elder 
1900bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref)
1901bf0d5f50SAlex Elder {
1902bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
1903bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1904bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
1905bf0d5f50SAlex Elder 
1906bf0d5f50SAlex Elder 	img_request = container_of(kref, struct rbd_img_request, kref);
1907bf0d5f50SAlex Elder 
190837206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
190937206ee5SAlex Elder 
1910bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1911bf0d5f50SAlex Elder 		rbd_img_obj_request_del(img_request, obj_request);
191225dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count == 0);
1913bf0d5f50SAlex Elder 
19140c425248SAlex Elder 	if (img_request_write_test(img_request))
1915812164f8SAlex Elder 		ceph_put_snap_context(img_request->snapc);
1916bf0d5f50SAlex Elder 
19178b3e1a56SAlex Elder 	if (img_request_child_test(img_request))
19188b3e1a56SAlex Elder 		rbd_obj_request_put(img_request->obj_request);
19198b3e1a56SAlex Elder 
19201c2a9dfeSAlex Elder 	kmem_cache_free(rbd_img_request_cache, img_request);
1921bf0d5f50SAlex Elder }
1922bf0d5f50SAlex Elder 
19231217857fSAlex Elder static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
19241217857fSAlex Elder {
19256365d33aSAlex Elder 	struct rbd_img_request *img_request;
19261217857fSAlex Elder 	unsigned int xferred;
19271217857fSAlex Elder 	int result;
19288b3e1a56SAlex Elder 	bool more;
19291217857fSAlex Elder 
19306365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
19316365d33aSAlex Elder 	img_request = obj_request->img_request;
19326365d33aSAlex Elder 
19331217857fSAlex Elder 	rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
19341217857fSAlex Elder 	xferred = (unsigned int)obj_request->xferred;
19351217857fSAlex Elder 	result = obj_request->result;
19361217857fSAlex Elder 	if (result) {
19371217857fSAlex Elder 		struct rbd_device *rbd_dev = img_request->rbd_dev;
19381217857fSAlex Elder 
19391217857fSAlex Elder 		rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n",
19401217857fSAlex Elder 			img_request_write_test(img_request) ? "write" : "read",
19411217857fSAlex Elder 			obj_request->length, obj_request->img_offset,
19421217857fSAlex Elder 			obj_request->offset);
19431217857fSAlex Elder 		rbd_warn(rbd_dev, "  result %d xferred %x\n",
19441217857fSAlex Elder 			result, xferred);
19451217857fSAlex Elder 		if (!img_request->result)
19461217857fSAlex Elder 			img_request->result = result;
19471217857fSAlex Elder 	}
19481217857fSAlex Elder 
1949f1a4739fSAlex Elder 	/* Image object requests don't own their page array */
1950f1a4739fSAlex Elder 
1951f1a4739fSAlex Elder 	if (obj_request->type == OBJ_REQUEST_PAGES) {
1952f1a4739fSAlex Elder 		obj_request->pages = NULL;
1953f1a4739fSAlex Elder 		obj_request->page_count = 0;
1954f1a4739fSAlex Elder 	}
1955f1a4739fSAlex Elder 
19568b3e1a56SAlex Elder 	if (img_request_child_test(img_request)) {
19578b3e1a56SAlex Elder 		rbd_assert(img_request->obj_request != NULL);
19588b3e1a56SAlex Elder 		more = obj_request->which < img_request->obj_request_count - 1;
19598b3e1a56SAlex Elder 	} else {
19608b3e1a56SAlex Elder 		rbd_assert(img_request->rq != NULL);
19618b3e1a56SAlex Elder 		more = blk_end_request(img_request->rq, result, xferred);
19628b3e1a56SAlex Elder 	}
19638b3e1a56SAlex Elder 
19648b3e1a56SAlex Elder 	return more;
19651217857fSAlex Elder }
19661217857fSAlex Elder 
19672169238dSAlex Elder static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
19682169238dSAlex Elder {
19692169238dSAlex Elder 	struct rbd_img_request *img_request;
19702169238dSAlex Elder 	u32 which = obj_request->which;
19712169238dSAlex Elder 	bool more = true;
19722169238dSAlex Elder 
19736365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
19742169238dSAlex Elder 	img_request = obj_request->img_request;
19752169238dSAlex Elder 
19762169238dSAlex Elder 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
19772169238dSAlex Elder 	rbd_assert(img_request != NULL);
19782169238dSAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
19792169238dSAlex Elder 	rbd_assert(which != BAD_WHICH);
19802169238dSAlex Elder 	rbd_assert(which < img_request->obj_request_count);
19812169238dSAlex Elder 	rbd_assert(which >= img_request->next_completion);
19822169238dSAlex Elder 
19832169238dSAlex Elder 	spin_lock_irq(&img_request->completion_lock);
19842169238dSAlex Elder 	if (which != img_request->next_completion)
19852169238dSAlex Elder 		goto out;
19862169238dSAlex Elder 
19872169238dSAlex Elder 	for_each_obj_request_from(img_request, obj_request) {
19882169238dSAlex Elder 		rbd_assert(more);
19892169238dSAlex Elder 		rbd_assert(which < img_request->obj_request_count);
19902169238dSAlex Elder 
19912169238dSAlex Elder 		if (!obj_request_done_test(obj_request))
19922169238dSAlex Elder 			break;
19931217857fSAlex Elder 		more = rbd_img_obj_end_request(obj_request);
19942169238dSAlex Elder 		which++;
19952169238dSAlex Elder 	}
19962169238dSAlex Elder 
19972169238dSAlex Elder 	rbd_assert(more ^ (which == img_request->obj_request_count));
19982169238dSAlex Elder 	img_request->next_completion = which;
19992169238dSAlex Elder out:
20002169238dSAlex Elder 	spin_unlock_irq(&img_request->completion_lock);
20012169238dSAlex Elder 
20022169238dSAlex Elder 	if (!more)
20032169238dSAlex Elder 		rbd_img_request_complete(img_request);
20042169238dSAlex Elder }
20052169238dSAlex Elder 
2006f1a4739fSAlex Elder /*
2007f1a4739fSAlex Elder  * Split up an image request into one or more object requests, each
2008f1a4739fSAlex Elder  * to a different object.  The "type" parameter indicates whether
2009f1a4739fSAlex Elder  * "data_desc" is the pointer to the head of a list of bio
2010f1a4739fSAlex Elder  * structures, or the base of a page array.  In either case this
2011f1a4739fSAlex Elder  * function assumes data_desc describes memory sufficient to hold
2012f1a4739fSAlex Elder  * all data described by the image request.
2013f1a4739fSAlex Elder  */
2014f1a4739fSAlex Elder static int rbd_img_request_fill(struct rbd_img_request *img_request,
2015f1a4739fSAlex Elder 					enum obj_request_type type,
2016f1a4739fSAlex Elder 					void *data_desc)
2017bf0d5f50SAlex Elder {
2018bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = img_request->rbd_dev;
2019bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = NULL;
2020bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
20210c425248SAlex Elder 	bool write_request = img_request_write_test(img_request);
2022f1a4739fSAlex Elder 	struct bio *bio_list;
2023f1a4739fSAlex Elder 	unsigned int bio_offset = 0;
2024f1a4739fSAlex Elder 	struct page **pages;
20257da22d29SAlex Elder 	u64 img_offset;
2026bf0d5f50SAlex Elder 	u64 resid;
2027bf0d5f50SAlex Elder 	u16 opcode;
2028bf0d5f50SAlex Elder 
2029f1a4739fSAlex Elder 	dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
2030f1a4739fSAlex Elder 		(int)type, data_desc);
203137206ee5SAlex Elder 
2032430c28c3SAlex Elder 	opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
20337da22d29SAlex Elder 	img_offset = img_request->offset;
2034bf0d5f50SAlex Elder 	resid = img_request->length;
20354dda41d3SAlex Elder 	rbd_assert(resid > 0);
2036f1a4739fSAlex Elder 
2037f1a4739fSAlex Elder 	if (type == OBJ_REQUEST_BIO) {
2038f1a4739fSAlex Elder 		bio_list = data_desc;
2039f1a4739fSAlex Elder 		rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT);
2040f1a4739fSAlex Elder 	} else {
2041f1a4739fSAlex Elder 		rbd_assert(type == OBJ_REQUEST_PAGES);
2042f1a4739fSAlex Elder 		pages = data_desc;
2043f1a4739fSAlex Elder 	}
2044f1a4739fSAlex Elder 
2045bf0d5f50SAlex Elder 	while (resid) {
20462fa12320SAlex Elder 		struct ceph_osd_request *osd_req;
2047bf0d5f50SAlex Elder 		const char *object_name;
2048bf0d5f50SAlex Elder 		u64 offset;
2049bf0d5f50SAlex Elder 		u64 length;
2050bf0d5f50SAlex Elder 
20517da22d29SAlex Elder 		object_name = rbd_segment_name(rbd_dev, img_offset);
2052bf0d5f50SAlex Elder 		if (!object_name)
2053bf0d5f50SAlex Elder 			goto out_unwind;
20547da22d29SAlex Elder 		offset = rbd_segment_offset(rbd_dev, img_offset);
20557da22d29SAlex Elder 		length = rbd_segment_length(rbd_dev, img_offset, resid);
2056bf0d5f50SAlex Elder 		obj_request = rbd_obj_request_create(object_name,
2057f1a4739fSAlex Elder 						offset, length, type);
205878c2a44aSAlex Elder 		/* object request has its own copy of the object name */
205978c2a44aSAlex Elder 		rbd_segment_name_free(object_name);
2060bf0d5f50SAlex Elder 		if (!obj_request)
2061bf0d5f50SAlex Elder 			goto out_unwind;
2062bf0d5f50SAlex Elder 
2063f1a4739fSAlex Elder 		if (type == OBJ_REQUEST_BIO) {
2064f1a4739fSAlex Elder 			unsigned int clone_size;
2065f1a4739fSAlex Elder 
2066bf0d5f50SAlex Elder 			rbd_assert(length <= (u64)UINT_MAX);
2067bf0d5f50SAlex Elder 			clone_size = (unsigned int)length;
2068f1a4739fSAlex Elder 			obj_request->bio_list =
2069f1a4739fSAlex Elder 					bio_chain_clone_range(&bio_list,
2070f1a4739fSAlex Elder 								&bio_offset,
2071f1a4739fSAlex Elder 								clone_size,
2072bf0d5f50SAlex Elder 								GFP_ATOMIC);
2073bf0d5f50SAlex Elder 			if (!obj_request->bio_list)
2074bf0d5f50SAlex Elder 				goto out_partial;
2075f1a4739fSAlex Elder 		} else {
2076f1a4739fSAlex Elder 			unsigned int page_count;
2077f1a4739fSAlex Elder 
2078f1a4739fSAlex Elder 			obj_request->pages = pages;
2079f1a4739fSAlex Elder 			page_count = (u32)calc_pages_for(offset, length);
2080f1a4739fSAlex Elder 			obj_request->page_count = page_count;
2081f1a4739fSAlex Elder 			if ((offset + length) & ~PAGE_MASK)
2082f1a4739fSAlex Elder 				page_count--;	/* more on last page */
2083f1a4739fSAlex Elder 			pages += page_count;
2084f1a4739fSAlex Elder 		}
2085bf0d5f50SAlex Elder 
20862fa12320SAlex Elder 		osd_req = rbd_osd_req_create(rbd_dev, write_request,
20872fa12320SAlex Elder 						obj_request);
20882fa12320SAlex Elder 		if (!osd_req)
2089bf0d5f50SAlex Elder 			goto out_partial;
20902fa12320SAlex Elder 		obj_request->osd_req = osd_req;
20912169238dSAlex Elder 		obj_request->callback = rbd_img_obj_callback;
2092430c28c3SAlex Elder 
20932fa12320SAlex Elder 		osd_req_op_extent_init(osd_req, 0, opcode, offset, length,
20942fa12320SAlex Elder 						0, 0);
2095f1a4739fSAlex Elder 		if (type == OBJ_REQUEST_BIO)
2096406e2c9fSAlex Elder 			osd_req_op_extent_osd_data_bio(osd_req, 0,
2097f1a4739fSAlex Elder 					obj_request->bio_list, length);
2098f1a4739fSAlex Elder 		else
2099f1a4739fSAlex Elder 			osd_req_op_extent_osd_data_pages(osd_req, 0,
2100f1a4739fSAlex Elder 					obj_request->pages, length,
2101f1a4739fSAlex Elder 					offset & ~PAGE_MASK, false, false);
21029d4df01fSAlex Elder 
21039d4df01fSAlex Elder 		if (write_request)
21049d4df01fSAlex Elder 			rbd_osd_req_format_write(obj_request);
21059d4df01fSAlex Elder 		else
21069d4df01fSAlex Elder 			rbd_osd_req_format_read(obj_request);
2107430c28c3SAlex Elder 
21087da22d29SAlex Elder 		obj_request->img_offset = img_offset;
2109bf0d5f50SAlex Elder 		rbd_img_obj_request_add(img_request, obj_request);
2110bf0d5f50SAlex Elder 
21117da22d29SAlex Elder 		img_offset += length;
2112bf0d5f50SAlex Elder 		resid -= length;
2113bf0d5f50SAlex Elder 	}
2114bf0d5f50SAlex Elder 
2115bf0d5f50SAlex Elder 	return 0;
2116bf0d5f50SAlex Elder 
2117bf0d5f50SAlex Elder out_partial:
2118bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
2119bf0d5f50SAlex Elder out_unwind:
2120bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2121bf0d5f50SAlex Elder 		rbd_obj_request_put(obj_request);
2122bf0d5f50SAlex Elder 
2123bf0d5f50SAlex Elder 	return -ENOMEM;
2124bf0d5f50SAlex Elder }
2125bf0d5f50SAlex Elder 
21263d7efd18SAlex Elder static void
21270eefd470SAlex Elder rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request)
21280eefd470SAlex Elder {
21290eefd470SAlex Elder 	struct rbd_img_request *img_request;
21300eefd470SAlex Elder 	struct rbd_device *rbd_dev;
21310eefd470SAlex Elder 	u64 length;
21320eefd470SAlex Elder 	u32 page_count;
21330eefd470SAlex Elder 
21340eefd470SAlex Elder 	rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
21350eefd470SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
21360eefd470SAlex Elder 	img_request = obj_request->img_request;
21370eefd470SAlex Elder 	rbd_assert(img_request);
21380eefd470SAlex Elder 
21390eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
21400eefd470SAlex Elder 	rbd_assert(rbd_dev);
21410eefd470SAlex Elder 	length = (u64)1 << rbd_dev->header.obj_order;
21420eefd470SAlex Elder 	page_count = (u32)calc_pages_for(0, length);
21430eefd470SAlex Elder 
21440eefd470SAlex Elder 	rbd_assert(obj_request->copyup_pages);
21450eefd470SAlex Elder 	ceph_release_page_vector(obj_request->copyup_pages, page_count);
21460eefd470SAlex Elder 	obj_request->copyup_pages = NULL;
21470eefd470SAlex Elder 
21480eefd470SAlex Elder 	/*
21490eefd470SAlex Elder 	 * We want the transfer count to reflect the size of the
21500eefd470SAlex Elder 	 * original write request.  There is no such thing as a
21510eefd470SAlex Elder 	 * successful short write, so if the request was successful
21520eefd470SAlex Elder 	 * we can just set it to the originally-requested length.
21530eefd470SAlex Elder 	 */
21540eefd470SAlex Elder 	if (!obj_request->result)
21550eefd470SAlex Elder 		obj_request->xferred = obj_request->length;
21560eefd470SAlex Elder 
21570eefd470SAlex Elder 	/* Finish up with the normal image object callback */
21580eefd470SAlex Elder 
21590eefd470SAlex Elder 	rbd_img_obj_callback(obj_request);
21600eefd470SAlex Elder }
21610eefd470SAlex Elder 
21620eefd470SAlex Elder static void
21633d7efd18SAlex Elder rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
21643d7efd18SAlex Elder {
21653d7efd18SAlex Elder 	struct rbd_obj_request *orig_request;
21660eefd470SAlex Elder 	struct ceph_osd_request *osd_req;
21670eefd470SAlex Elder 	struct ceph_osd_client *osdc;
21680eefd470SAlex Elder 	struct rbd_device *rbd_dev;
21693d7efd18SAlex Elder 	struct page **pages;
21703d7efd18SAlex Elder 	int result;
21713d7efd18SAlex Elder 	u64 obj_size;
21723d7efd18SAlex Elder 	u64 xferred;
21733d7efd18SAlex Elder 
21743d7efd18SAlex Elder 	rbd_assert(img_request_child_test(img_request));
21753d7efd18SAlex Elder 
21763d7efd18SAlex Elder 	/* First get what we need from the image request */
21773d7efd18SAlex Elder 
21783d7efd18SAlex Elder 	pages = img_request->copyup_pages;
21793d7efd18SAlex Elder 	rbd_assert(pages != NULL);
21803d7efd18SAlex Elder 	img_request->copyup_pages = NULL;
21813d7efd18SAlex Elder 
21823d7efd18SAlex Elder 	orig_request = img_request->obj_request;
21833d7efd18SAlex Elder 	rbd_assert(orig_request != NULL);
21840eefd470SAlex Elder 	rbd_assert(orig_request->type == OBJ_REQUEST_BIO);
21853d7efd18SAlex Elder 	result = img_request->result;
21863d7efd18SAlex Elder 	obj_size = img_request->length;
21873d7efd18SAlex Elder 	xferred = img_request->xferred;
21883d7efd18SAlex Elder 
21890eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
21900eefd470SAlex Elder 	rbd_assert(rbd_dev);
21910eefd470SAlex Elder 	rbd_assert(obj_size == (u64)1 << rbd_dev->header.obj_order);
21920eefd470SAlex Elder 
21933d7efd18SAlex Elder 	rbd_img_request_put(img_request);
21943d7efd18SAlex Elder 
21950eefd470SAlex Elder 	if (result)
21960eefd470SAlex Elder 		goto out_err;
21973d7efd18SAlex Elder 
21980eefd470SAlex Elder 	/* Allocate the new copyup osd request for the original request */
21993d7efd18SAlex Elder 
22000eefd470SAlex Elder 	result = -ENOMEM;
22010eefd470SAlex Elder 	rbd_assert(!orig_request->osd_req);
22020eefd470SAlex Elder 	osd_req = rbd_osd_req_create_copyup(orig_request);
22030eefd470SAlex Elder 	if (!osd_req)
22040eefd470SAlex Elder 		goto out_err;
22050eefd470SAlex Elder 	orig_request->osd_req = osd_req;
22060eefd470SAlex Elder 	orig_request->copyup_pages = pages;
22073d7efd18SAlex Elder 
22080eefd470SAlex Elder 	/* Initialize the copyup op */
22090eefd470SAlex Elder 
22100eefd470SAlex Elder 	osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
22110eefd470SAlex Elder 	osd_req_op_cls_request_data_pages(osd_req, 0, pages, obj_size, 0,
22120eefd470SAlex Elder 						false, false);
22130eefd470SAlex Elder 
22140eefd470SAlex Elder 	/* Then the original write request op */
22150eefd470SAlex Elder 
22160eefd470SAlex Elder 	osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE,
22170eefd470SAlex Elder 					orig_request->offset,
22180eefd470SAlex Elder 					orig_request->length, 0, 0);
22190eefd470SAlex Elder 	osd_req_op_extent_osd_data_bio(osd_req, 1, orig_request->bio_list,
22200eefd470SAlex Elder 					orig_request->length);
22210eefd470SAlex Elder 
22220eefd470SAlex Elder 	rbd_osd_req_format_write(orig_request);
22230eefd470SAlex Elder 
22240eefd470SAlex Elder 	/* All set, send it off. */
22250eefd470SAlex Elder 
22260eefd470SAlex Elder 	orig_request->callback = rbd_img_obj_copyup_callback;
22270eefd470SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
22280eefd470SAlex Elder 	result = rbd_obj_request_submit(osdc, orig_request);
22290eefd470SAlex Elder 	if (!result)
22300eefd470SAlex Elder 		return;
22310eefd470SAlex Elder out_err:
22320eefd470SAlex Elder 	/* Record the error code and complete the request */
22330eefd470SAlex Elder 
22340eefd470SAlex Elder 	orig_request->result = result;
22350eefd470SAlex Elder 	orig_request->xferred = 0;
22363d7efd18SAlex Elder 	obj_request_done_set(orig_request);
22373d7efd18SAlex Elder 	rbd_obj_request_complete(orig_request);
22383d7efd18SAlex Elder }
22393d7efd18SAlex Elder 
22403d7efd18SAlex Elder /*
22413d7efd18SAlex Elder  * Read from the parent image the range of data that covers the
22423d7efd18SAlex Elder  * entire target of the given object request.  This is used for
22433d7efd18SAlex Elder  * satisfying a layered image write request when the target of an
22443d7efd18SAlex Elder  * object request from the image request does not exist.
22453d7efd18SAlex Elder  *
22463d7efd18SAlex Elder  * A page array big enough to hold the returned data is allocated
22473d7efd18SAlex Elder  * and supplied to rbd_img_request_fill() as the "data descriptor."
22483d7efd18SAlex Elder  * When the read completes, this page array will be transferred to
22493d7efd18SAlex Elder  * the original object request for the copyup operation.
22503d7efd18SAlex Elder  *
22513d7efd18SAlex Elder  * If an error occurs, record it as the result of the original
22523d7efd18SAlex Elder  * object request and mark it done so it gets completed.
22533d7efd18SAlex Elder  */
22543d7efd18SAlex Elder static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
22553d7efd18SAlex Elder {
22563d7efd18SAlex Elder 	struct rbd_img_request *img_request = NULL;
22573d7efd18SAlex Elder 	struct rbd_img_request *parent_request = NULL;
22583d7efd18SAlex Elder 	struct rbd_device *rbd_dev;
22593d7efd18SAlex Elder 	u64 img_offset;
22603d7efd18SAlex Elder 	u64 length;
22613d7efd18SAlex Elder 	struct page **pages = NULL;
22623d7efd18SAlex Elder 	u32 page_count;
22633d7efd18SAlex Elder 	int result;
22643d7efd18SAlex Elder 
22653d7efd18SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
22663d7efd18SAlex Elder 	rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
22673d7efd18SAlex Elder 
22683d7efd18SAlex Elder 	img_request = obj_request->img_request;
22693d7efd18SAlex Elder 	rbd_assert(img_request != NULL);
22703d7efd18SAlex Elder 	rbd_dev = img_request->rbd_dev;
22713d7efd18SAlex Elder 	rbd_assert(rbd_dev->parent != NULL);
22723d7efd18SAlex Elder 
22733d7efd18SAlex Elder 	/*
22740eefd470SAlex Elder 	 * First things first.  The original osd request is of no
22750eefd470SAlex Elder 	 * use to use any more, we'll need a new one that can hold
22760eefd470SAlex Elder 	 * the two ops in a copyup request.  We'll get that later,
22770eefd470SAlex Elder 	 * but for now we can release the old one.
22780eefd470SAlex Elder 	 */
22790eefd470SAlex Elder 	rbd_osd_req_destroy(obj_request->osd_req);
22800eefd470SAlex Elder 	obj_request->osd_req = NULL;
22810eefd470SAlex Elder 
22820eefd470SAlex Elder 	/*
22833d7efd18SAlex Elder 	 * Determine the byte range covered by the object in the
22843d7efd18SAlex Elder 	 * child image to which the original request was to be sent.
22853d7efd18SAlex Elder 	 */
22863d7efd18SAlex Elder 	img_offset = obj_request->img_offset - obj_request->offset;
22873d7efd18SAlex Elder 	length = (u64)1 << rbd_dev->header.obj_order;
22883d7efd18SAlex Elder 
22893d7efd18SAlex Elder 	/*
2290a9e8ba2cSAlex Elder 	 * There is no defined parent data beyond the parent
2291a9e8ba2cSAlex Elder 	 * overlap, so limit what we read at that boundary if
2292a9e8ba2cSAlex Elder 	 * necessary.
2293a9e8ba2cSAlex Elder 	 */
2294a9e8ba2cSAlex Elder 	if (img_offset + length > rbd_dev->parent_overlap) {
2295a9e8ba2cSAlex Elder 		rbd_assert(img_offset < rbd_dev->parent_overlap);
2296a9e8ba2cSAlex Elder 		length = rbd_dev->parent_overlap - img_offset;
2297a9e8ba2cSAlex Elder 	}
2298a9e8ba2cSAlex Elder 
2299a9e8ba2cSAlex Elder 	/*
23003d7efd18SAlex Elder 	 * Allocate a page array big enough to receive the data read
23013d7efd18SAlex Elder 	 * from the parent.
23023d7efd18SAlex Elder 	 */
23033d7efd18SAlex Elder 	page_count = (u32)calc_pages_for(0, length);
23043d7efd18SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
23053d7efd18SAlex Elder 	if (IS_ERR(pages)) {
23063d7efd18SAlex Elder 		result = PTR_ERR(pages);
23073d7efd18SAlex Elder 		pages = NULL;
23083d7efd18SAlex Elder 		goto out_err;
23093d7efd18SAlex Elder 	}
23103d7efd18SAlex Elder 
23113d7efd18SAlex Elder 	result = -ENOMEM;
23123d7efd18SAlex Elder 	parent_request = rbd_img_request_create(rbd_dev->parent,
23133d7efd18SAlex Elder 						img_offset, length,
23143d7efd18SAlex Elder 						false, true);
23153d7efd18SAlex Elder 	if (!parent_request)
23163d7efd18SAlex Elder 		goto out_err;
23173d7efd18SAlex Elder 	rbd_obj_request_get(obj_request);
23183d7efd18SAlex Elder 	parent_request->obj_request = obj_request;
23193d7efd18SAlex Elder 
23203d7efd18SAlex Elder 	result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
23213d7efd18SAlex Elder 	if (result)
23223d7efd18SAlex Elder 		goto out_err;
23233d7efd18SAlex Elder 	parent_request->copyup_pages = pages;
23243d7efd18SAlex Elder 
23253d7efd18SAlex Elder 	parent_request->callback = rbd_img_obj_parent_read_full_callback;
23263d7efd18SAlex Elder 	result = rbd_img_request_submit(parent_request);
23273d7efd18SAlex Elder 	if (!result)
23283d7efd18SAlex Elder 		return 0;
23293d7efd18SAlex Elder 
23303d7efd18SAlex Elder 	parent_request->copyup_pages = NULL;
23313d7efd18SAlex Elder 	parent_request->obj_request = NULL;
23323d7efd18SAlex Elder 	rbd_obj_request_put(obj_request);
23333d7efd18SAlex Elder out_err:
23343d7efd18SAlex Elder 	if (pages)
23353d7efd18SAlex Elder 		ceph_release_page_vector(pages, page_count);
23363d7efd18SAlex Elder 	if (parent_request)
23373d7efd18SAlex Elder 		rbd_img_request_put(parent_request);
23383d7efd18SAlex Elder 	obj_request->result = result;
23393d7efd18SAlex Elder 	obj_request->xferred = 0;
23403d7efd18SAlex Elder 	obj_request_done_set(obj_request);
23413d7efd18SAlex Elder 
23423d7efd18SAlex Elder 	return result;
23433d7efd18SAlex Elder }
23443d7efd18SAlex Elder 
2345c5b5ef6cSAlex Elder static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
2346c5b5ef6cSAlex Elder {
2347c5b5ef6cSAlex Elder 	struct rbd_obj_request *orig_request;
2348c5b5ef6cSAlex Elder 	int result;
2349c5b5ef6cSAlex Elder 
2350c5b5ef6cSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
2351c5b5ef6cSAlex Elder 
2352c5b5ef6cSAlex Elder 	/*
2353c5b5ef6cSAlex Elder 	 * All we need from the object request is the original
2354c5b5ef6cSAlex Elder 	 * request and the result of the STAT op.  Grab those, then
2355c5b5ef6cSAlex Elder 	 * we're done with the request.
2356c5b5ef6cSAlex Elder 	 */
2357c5b5ef6cSAlex Elder 	orig_request = obj_request->obj_request;
2358c5b5ef6cSAlex Elder 	obj_request->obj_request = NULL;
2359c5b5ef6cSAlex Elder 	rbd_assert(orig_request);
2360c5b5ef6cSAlex Elder 	rbd_assert(orig_request->img_request);
2361c5b5ef6cSAlex Elder 
2362c5b5ef6cSAlex Elder 	result = obj_request->result;
2363c5b5ef6cSAlex Elder 	obj_request->result = 0;
2364c5b5ef6cSAlex Elder 
2365c5b5ef6cSAlex Elder 	dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
2366c5b5ef6cSAlex Elder 		obj_request, orig_request, result,
2367c5b5ef6cSAlex Elder 		obj_request->xferred, obj_request->length);
2368c5b5ef6cSAlex Elder 	rbd_obj_request_put(obj_request);
2369c5b5ef6cSAlex Elder 
2370c5b5ef6cSAlex Elder 	rbd_assert(orig_request);
2371c5b5ef6cSAlex Elder 	rbd_assert(orig_request->img_request);
2372c5b5ef6cSAlex Elder 
2373c5b5ef6cSAlex Elder 	/*
2374c5b5ef6cSAlex Elder 	 * Our only purpose here is to determine whether the object
2375c5b5ef6cSAlex Elder 	 * exists, and we don't want to treat the non-existence as
2376c5b5ef6cSAlex Elder 	 * an error.  If something else comes back, transfer the
2377c5b5ef6cSAlex Elder 	 * error to the original request and complete it now.
2378c5b5ef6cSAlex Elder 	 */
2379c5b5ef6cSAlex Elder 	if (!result) {
2380c5b5ef6cSAlex Elder 		obj_request_existence_set(orig_request, true);
2381c5b5ef6cSAlex Elder 	} else if (result == -ENOENT) {
2382c5b5ef6cSAlex Elder 		obj_request_existence_set(orig_request, false);
2383c5b5ef6cSAlex Elder 	} else if (result) {
2384c5b5ef6cSAlex Elder 		orig_request->result = result;
23853d7efd18SAlex Elder 		goto out;
2386c5b5ef6cSAlex Elder 	}
2387c5b5ef6cSAlex Elder 
2388c5b5ef6cSAlex Elder 	/*
2389c5b5ef6cSAlex Elder 	 * Resubmit the original request now that we have recorded
2390c5b5ef6cSAlex Elder 	 * whether the target object exists.
2391c5b5ef6cSAlex Elder 	 */
2392b454e36dSAlex Elder 	orig_request->result = rbd_img_obj_request_submit(orig_request);
23933d7efd18SAlex Elder out:
2394c5b5ef6cSAlex Elder 	if (orig_request->result)
2395c5b5ef6cSAlex Elder 		rbd_obj_request_complete(orig_request);
2396c5b5ef6cSAlex Elder 	rbd_obj_request_put(orig_request);
2397c5b5ef6cSAlex Elder }
2398c5b5ef6cSAlex Elder 
2399c5b5ef6cSAlex Elder static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
2400c5b5ef6cSAlex Elder {
2401c5b5ef6cSAlex Elder 	struct rbd_obj_request *stat_request;
2402c5b5ef6cSAlex Elder 	struct rbd_device *rbd_dev;
2403c5b5ef6cSAlex Elder 	struct ceph_osd_client *osdc;
2404c5b5ef6cSAlex Elder 	struct page **pages = NULL;
2405c5b5ef6cSAlex Elder 	u32 page_count;
2406c5b5ef6cSAlex Elder 	size_t size;
2407c5b5ef6cSAlex Elder 	int ret;
2408c5b5ef6cSAlex Elder 
2409c5b5ef6cSAlex Elder 	/*
2410c5b5ef6cSAlex Elder 	 * The response data for a STAT call consists of:
2411c5b5ef6cSAlex Elder 	 *     le64 length;
2412c5b5ef6cSAlex Elder 	 *     struct {
2413c5b5ef6cSAlex Elder 	 *         le32 tv_sec;
2414c5b5ef6cSAlex Elder 	 *         le32 tv_nsec;
2415c5b5ef6cSAlex Elder 	 *     } mtime;
2416c5b5ef6cSAlex Elder 	 */
2417c5b5ef6cSAlex Elder 	size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
2418c5b5ef6cSAlex Elder 	page_count = (u32)calc_pages_for(0, size);
2419c5b5ef6cSAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2420c5b5ef6cSAlex Elder 	if (IS_ERR(pages))
2421c5b5ef6cSAlex Elder 		return PTR_ERR(pages);
2422c5b5ef6cSAlex Elder 
2423c5b5ef6cSAlex Elder 	ret = -ENOMEM;
2424c5b5ef6cSAlex Elder 	stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
2425c5b5ef6cSAlex Elder 							OBJ_REQUEST_PAGES);
2426c5b5ef6cSAlex Elder 	if (!stat_request)
2427c5b5ef6cSAlex Elder 		goto out;
2428c5b5ef6cSAlex Elder 
2429c5b5ef6cSAlex Elder 	rbd_obj_request_get(obj_request);
2430c5b5ef6cSAlex Elder 	stat_request->obj_request = obj_request;
2431c5b5ef6cSAlex Elder 	stat_request->pages = pages;
2432c5b5ef6cSAlex Elder 	stat_request->page_count = page_count;
2433c5b5ef6cSAlex Elder 
2434c5b5ef6cSAlex Elder 	rbd_assert(obj_request->img_request);
2435c5b5ef6cSAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
2436c5b5ef6cSAlex Elder 	stat_request->osd_req = rbd_osd_req_create(rbd_dev, false,
2437c5b5ef6cSAlex Elder 						stat_request);
2438c5b5ef6cSAlex Elder 	if (!stat_request->osd_req)
2439c5b5ef6cSAlex Elder 		goto out;
2440c5b5ef6cSAlex Elder 	stat_request->callback = rbd_img_obj_exists_callback;
2441c5b5ef6cSAlex Elder 
2442c5b5ef6cSAlex Elder 	osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT);
2443c5b5ef6cSAlex Elder 	osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
2444c5b5ef6cSAlex Elder 					false, false);
24459d4df01fSAlex Elder 	rbd_osd_req_format_read(stat_request);
2446c5b5ef6cSAlex Elder 
2447c5b5ef6cSAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
2448c5b5ef6cSAlex Elder 	ret = rbd_obj_request_submit(osdc, stat_request);
2449c5b5ef6cSAlex Elder out:
2450c5b5ef6cSAlex Elder 	if (ret)
2451c5b5ef6cSAlex Elder 		rbd_obj_request_put(obj_request);
2452c5b5ef6cSAlex Elder 
2453c5b5ef6cSAlex Elder 	return ret;
2454c5b5ef6cSAlex Elder }
2455c5b5ef6cSAlex Elder 
2456b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
2457b454e36dSAlex Elder {
2458b454e36dSAlex Elder 	struct rbd_img_request *img_request;
2459a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev;
24603d7efd18SAlex Elder 	bool known;
2461b454e36dSAlex Elder 
2462b454e36dSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
2463b454e36dSAlex Elder 
2464b454e36dSAlex Elder 	img_request = obj_request->img_request;
2465b454e36dSAlex Elder 	rbd_assert(img_request);
2466a9e8ba2cSAlex Elder 	rbd_dev = img_request->rbd_dev;
2467b454e36dSAlex Elder 
2468b454e36dSAlex Elder 	/*
2469a9e8ba2cSAlex Elder 	 * Only writes to layered images need special handling.
2470a9e8ba2cSAlex Elder 	 * Reads and non-layered writes are simple object requests.
2471a9e8ba2cSAlex Elder 	 * Layered writes that start beyond the end of the overlap
2472a9e8ba2cSAlex Elder 	 * with the parent have no parent data, so they too are
2473a9e8ba2cSAlex Elder 	 * simple object requests.  Finally, if the target object is
2474a9e8ba2cSAlex Elder 	 * known to already exist, its parent data has already been
2475a9e8ba2cSAlex Elder 	 * copied, so a write to the object can also be handled as a
2476a9e8ba2cSAlex Elder 	 * simple object request.
2477b454e36dSAlex Elder 	 */
2478b454e36dSAlex Elder 	if (!img_request_write_test(img_request) ||
2479b454e36dSAlex Elder 		!img_request_layered_test(img_request) ||
2480a9e8ba2cSAlex Elder 		rbd_dev->parent_overlap <= obj_request->img_offset ||
24813d7efd18SAlex Elder 		((known = obj_request_known_test(obj_request)) &&
24823d7efd18SAlex Elder 			obj_request_exists_test(obj_request))) {
2483b454e36dSAlex Elder 
2484b454e36dSAlex Elder 		struct rbd_device *rbd_dev;
2485b454e36dSAlex Elder 		struct ceph_osd_client *osdc;
2486b454e36dSAlex Elder 
2487b454e36dSAlex Elder 		rbd_dev = obj_request->img_request->rbd_dev;
2488b454e36dSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
2489b454e36dSAlex Elder 
2490b454e36dSAlex Elder 		return rbd_obj_request_submit(osdc, obj_request);
2491b454e36dSAlex Elder 	}
2492b454e36dSAlex Elder 
2493b454e36dSAlex Elder 	/*
24943d7efd18SAlex Elder 	 * It's a layered write.  The target object might exist but
24953d7efd18SAlex Elder 	 * we may not know that yet.  If we know it doesn't exist,
24963d7efd18SAlex Elder 	 * start by reading the data for the full target object from
24973d7efd18SAlex Elder 	 * the parent so we can use it for a copyup to the target.
2498b454e36dSAlex Elder 	 */
24993d7efd18SAlex Elder 	if (known)
25003d7efd18SAlex Elder 		return rbd_img_obj_parent_read_full(obj_request);
25013d7efd18SAlex Elder 
25023d7efd18SAlex Elder 	/* We don't know whether the target exists.  Go find out. */
2503b454e36dSAlex Elder 
2504b454e36dSAlex Elder 	return rbd_img_obj_exists_submit(obj_request);
2505b454e36dSAlex Elder }
2506b454e36dSAlex Elder 
2507bf0d5f50SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request)
2508bf0d5f50SAlex Elder {
2509bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
251046faeed4SAlex Elder 	struct rbd_obj_request *next_obj_request;
2511bf0d5f50SAlex Elder 
251237206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
251346faeed4SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
2514bf0d5f50SAlex Elder 		int ret;
2515bf0d5f50SAlex Elder 
2516b454e36dSAlex Elder 		ret = rbd_img_obj_request_submit(obj_request);
2517bf0d5f50SAlex Elder 		if (ret)
2518bf0d5f50SAlex Elder 			return ret;
2519bf0d5f50SAlex Elder 	}
2520bf0d5f50SAlex Elder 
2521bf0d5f50SAlex Elder 	return 0;
2522bf0d5f50SAlex Elder }
2523bf0d5f50SAlex Elder 
25248b3e1a56SAlex Elder static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
25258b3e1a56SAlex Elder {
25268b3e1a56SAlex Elder 	struct rbd_obj_request *obj_request;
2527a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev;
2528a9e8ba2cSAlex Elder 	u64 obj_end;
25298b3e1a56SAlex Elder 
25308b3e1a56SAlex Elder 	rbd_assert(img_request_child_test(img_request));
25318b3e1a56SAlex Elder 
25328b3e1a56SAlex Elder 	obj_request = img_request->obj_request;
2533a9e8ba2cSAlex Elder 	rbd_assert(obj_request);
2534a9e8ba2cSAlex Elder 	rbd_assert(obj_request->img_request);
25358b3e1a56SAlex Elder 
2536a9e8ba2cSAlex Elder 	obj_request->result = img_request->result;
2537a9e8ba2cSAlex Elder 	if (obj_request->result)
2538a9e8ba2cSAlex Elder 		goto out;
2539a9e8ba2cSAlex Elder 
2540a9e8ba2cSAlex Elder 	/*
2541a9e8ba2cSAlex Elder 	 * We need to zero anything beyond the parent overlap
2542a9e8ba2cSAlex Elder 	 * boundary.  Since rbd_img_obj_request_read_callback()
2543a9e8ba2cSAlex Elder 	 * will zero anything beyond the end of a short read, an
2544a9e8ba2cSAlex Elder 	 * easy way to do this is to pretend the data from the
2545a9e8ba2cSAlex Elder 	 * parent came up short--ending at the overlap boundary.
2546a9e8ba2cSAlex Elder 	 */
2547a9e8ba2cSAlex Elder 	rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
2548a9e8ba2cSAlex Elder 	obj_end = obj_request->img_offset + obj_request->length;
2549a9e8ba2cSAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
2550a9e8ba2cSAlex Elder 	if (obj_end > rbd_dev->parent_overlap) {
2551a9e8ba2cSAlex Elder 		u64 xferred = 0;
2552a9e8ba2cSAlex Elder 
2553a9e8ba2cSAlex Elder 		if (obj_request->img_offset < rbd_dev->parent_overlap)
2554a9e8ba2cSAlex Elder 			xferred = rbd_dev->parent_overlap -
2555a9e8ba2cSAlex Elder 					obj_request->img_offset;
2556a9e8ba2cSAlex Elder 
2557a9e8ba2cSAlex Elder 		obj_request->xferred = min(img_request->xferred, xferred);
2558a9e8ba2cSAlex Elder 	} else {
2559a9e8ba2cSAlex Elder 		obj_request->xferred = img_request->xferred;
2560a9e8ba2cSAlex Elder 	}
2561a9e8ba2cSAlex Elder out:
2562b5b09be3SAlex Elder 	rbd_img_request_put(img_request);
25638b3e1a56SAlex Elder 	rbd_img_obj_request_read_callback(obj_request);
25648b3e1a56SAlex Elder 	rbd_obj_request_complete(obj_request);
25658b3e1a56SAlex Elder }
25668b3e1a56SAlex Elder 
25678b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
25688b3e1a56SAlex Elder {
25698b3e1a56SAlex Elder 	struct rbd_device *rbd_dev;
25708b3e1a56SAlex Elder 	struct rbd_img_request *img_request;
25718b3e1a56SAlex Elder 	int result;
25728b3e1a56SAlex Elder 
25738b3e1a56SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
25748b3e1a56SAlex Elder 	rbd_assert(obj_request->img_request != NULL);
25758b3e1a56SAlex Elder 	rbd_assert(obj_request->result == (s32) -ENOENT);
25768b3e1a56SAlex Elder 	rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
25778b3e1a56SAlex Elder 
25788b3e1a56SAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
25798b3e1a56SAlex Elder 	rbd_assert(rbd_dev->parent != NULL);
25808b3e1a56SAlex Elder 	/* rbd_read_finish(obj_request, obj_request->length); */
25818b3e1a56SAlex Elder 	img_request = rbd_img_request_create(rbd_dev->parent,
25828b3e1a56SAlex Elder 						obj_request->img_offset,
25838b3e1a56SAlex Elder 						obj_request->length,
25848b3e1a56SAlex Elder 						false, true);
25858b3e1a56SAlex Elder 	result = -ENOMEM;
25868b3e1a56SAlex Elder 	if (!img_request)
25878b3e1a56SAlex Elder 		goto out_err;
25888b3e1a56SAlex Elder 
25898b3e1a56SAlex Elder 	rbd_obj_request_get(obj_request);
25908b3e1a56SAlex Elder 	img_request->obj_request = obj_request;
25918b3e1a56SAlex Elder 
2592f1a4739fSAlex Elder 	result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
2593f1a4739fSAlex Elder 					obj_request->bio_list);
25948b3e1a56SAlex Elder 	if (result)
25958b3e1a56SAlex Elder 		goto out_err;
25968b3e1a56SAlex Elder 
25978b3e1a56SAlex Elder 	img_request->callback = rbd_img_parent_read_callback;
25988b3e1a56SAlex Elder 	result = rbd_img_request_submit(img_request);
25998b3e1a56SAlex Elder 	if (result)
26008b3e1a56SAlex Elder 		goto out_err;
26018b3e1a56SAlex Elder 
26028b3e1a56SAlex Elder 	return;
26038b3e1a56SAlex Elder out_err:
26048b3e1a56SAlex Elder 	if (img_request)
26058b3e1a56SAlex Elder 		rbd_img_request_put(img_request);
26068b3e1a56SAlex Elder 	obj_request->result = result;
26078b3e1a56SAlex Elder 	obj_request->xferred = 0;
26088b3e1a56SAlex Elder 	obj_request_done_set(obj_request);
26098b3e1a56SAlex Elder }
26108b3e1a56SAlex Elder 
2611cc4a38bdSAlex Elder static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, u64 notify_id)
2612b8d70035SAlex Elder {
2613b8d70035SAlex Elder 	struct rbd_obj_request *obj_request;
26142169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2615b8d70035SAlex Elder 	int ret;
2616b8d70035SAlex Elder 
2617b8d70035SAlex Elder 	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
2618b8d70035SAlex Elder 							OBJ_REQUEST_NODATA);
2619b8d70035SAlex Elder 	if (!obj_request)
2620b8d70035SAlex Elder 		return -ENOMEM;
2621b8d70035SAlex Elder 
2622b8d70035SAlex Elder 	ret = -ENOMEM;
2623430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
2624b8d70035SAlex Elder 	if (!obj_request->osd_req)
2625b8d70035SAlex Elder 		goto out;
26262169238dSAlex Elder 	obj_request->callback = rbd_obj_request_put;
2627b8d70035SAlex Elder 
2628c99d2d4aSAlex Elder 	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
2629cc4a38bdSAlex Elder 					notify_id, 0, 0);
26309d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
2631430c28c3SAlex Elder 
2632b8d70035SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
2633b8d70035SAlex Elder out:
2634cf81b60eSAlex Elder 	if (ret)
2635b8d70035SAlex Elder 		rbd_obj_request_put(obj_request);
2636b8d70035SAlex Elder 
2637b8d70035SAlex Elder 	return ret;
2638b8d70035SAlex Elder }
2639b8d70035SAlex Elder 
2640b8d70035SAlex Elder static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
2641b8d70035SAlex Elder {
2642b8d70035SAlex Elder 	struct rbd_device *rbd_dev = (struct rbd_device *)data;
2643e627db08SAlex Elder 	int ret;
2644b8d70035SAlex Elder 
2645b8d70035SAlex Elder 	if (!rbd_dev)
2646b8d70035SAlex Elder 		return;
2647b8d70035SAlex Elder 
264837206ee5SAlex Elder 	dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
2649b8d70035SAlex Elder 		rbd_dev->header_name, (unsigned long long)notify_id,
2650b8d70035SAlex Elder 		(unsigned int)opcode);
2651e627db08SAlex Elder 	ret = rbd_dev_refresh(rbd_dev);
2652e627db08SAlex Elder 	if (ret)
2653e627db08SAlex Elder 		rbd_warn(rbd_dev, ": header refresh error (%d)\n", ret);
2654b8d70035SAlex Elder 
2655cc4a38bdSAlex Elder 	rbd_obj_notify_ack(rbd_dev, notify_id);
2656b8d70035SAlex Elder }
2657b8d70035SAlex Elder 
26589969ebc5SAlex Elder /*
26599969ebc5SAlex Elder  * Request sync osd watch/unwatch.  The value of "start" determines
26609969ebc5SAlex Elder  * whether a watch request is being initiated or torn down.
26619969ebc5SAlex Elder  */
26629969ebc5SAlex Elder static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
26639969ebc5SAlex Elder {
26649969ebc5SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
26659969ebc5SAlex Elder 	struct rbd_obj_request *obj_request;
26669969ebc5SAlex Elder 	int ret;
26679969ebc5SAlex Elder 
26689969ebc5SAlex Elder 	rbd_assert(start ^ !!rbd_dev->watch_event);
26699969ebc5SAlex Elder 	rbd_assert(start ^ !!rbd_dev->watch_request);
26709969ebc5SAlex Elder 
26719969ebc5SAlex Elder 	if (start) {
26723c663bbdSAlex Elder 		ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
26739969ebc5SAlex Elder 						&rbd_dev->watch_event);
26749969ebc5SAlex Elder 		if (ret < 0)
26759969ebc5SAlex Elder 			return ret;
26768eb87565SAlex Elder 		rbd_assert(rbd_dev->watch_event != NULL);
26779969ebc5SAlex Elder 	}
26789969ebc5SAlex Elder 
26799969ebc5SAlex Elder 	ret = -ENOMEM;
26809969ebc5SAlex Elder 	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
26819969ebc5SAlex Elder 							OBJ_REQUEST_NODATA);
26829969ebc5SAlex Elder 	if (!obj_request)
26839969ebc5SAlex Elder 		goto out_cancel;
26849969ebc5SAlex Elder 
2685430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request);
2686430c28c3SAlex Elder 	if (!obj_request->osd_req)
2687430c28c3SAlex Elder 		goto out_cancel;
2688430c28c3SAlex Elder 
26898eb87565SAlex Elder 	if (start)
2690975241afSAlex Elder 		ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
26918eb87565SAlex Elder 	else
26926977c3f9SAlex Elder 		ceph_osdc_unregister_linger_request(osdc,
2693975241afSAlex Elder 					rbd_dev->watch_request->osd_req);
26942169238dSAlex Elder 
26952169238dSAlex Elder 	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
2696b21ebdddSAlex Elder 				rbd_dev->watch_event->cookie, 0, start);
26979d4df01fSAlex Elder 	rbd_osd_req_format_write(obj_request);
26982169238dSAlex Elder 
26999969ebc5SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
27009969ebc5SAlex Elder 	if (ret)
27019969ebc5SAlex Elder 		goto out_cancel;
27029969ebc5SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
27039969ebc5SAlex Elder 	if (ret)
27049969ebc5SAlex Elder 		goto out_cancel;
27059969ebc5SAlex Elder 	ret = obj_request->result;
27069969ebc5SAlex Elder 	if (ret)
27079969ebc5SAlex Elder 		goto out_cancel;
27089969ebc5SAlex Elder 
27098eb87565SAlex Elder 	/*
27108eb87565SAlex Elder 	 * A watch request is set to linger, so the underlying osd
27118eb87565SAlex Elder 	 * request won't go away until we unregister it.  We retain
27128eb87565SAlex Elder 	 * a pointer to the object request during that time (in
27138eb87565SAlex Elder 	 * rbd_dev->watch_request), so we'll keep a reference to
27148eb87565SAlex Elder 	 * it.  We'll drop that reference (below) after we've
27158eb87565SAlex Elder 	 * unregistered it.
27168eb87565SAlex Elder 	 */
27178eb87565SAlex Elder 	if (start) {
27188eb87565SAlex Elder 		rbd_dev->watch_request = obj_request;
27198eb87565SAlex Elder 
27208eb87565SAlex Elder 		return 0;
27218eb87565SAlex Elder 	}
27228eb87565SAlex Elder 
27238eb87565SAlex Elder 	/* We have successfully torn down the watch request */
27248eb87565SAlex Elder 
27258eb87565SAlex Elder 	rbd_obj_request_put(rbd_dev->watch_request);
27268eb87565SAlex Elder 	rbd_dev->watch_request = NULL;
27279969ebc5SAlex Elder out_cancel:
27289969ebc5SAlex Elder 	/* Cancel the event if we're tearing down, or on error */
27299969ebc5SAlex Elder 	ceph_osdc_cancel_event(rbd_dev->watch_event);
27309969ebc5SAlex Elder 	rbd_dev->watch_event = NULL;
27319969ebc5SAlex Elder 	if (obj_request)
27329969ebc5SAlex Elder 		rbd_obj_request_put(obj_request);
27339969ebc5SAlex Elder 
27349969ebc5SAlex Elder 	return ret;
27359969ebc5SAlex Elder }
27369969ebc5SAlex Elder 
273736be9a76SAlex Elder /*
2738f40eb349SAlex Elder  * Synchronous osd object method call.  Returns the number of bytes
2739f40eb349SAlex Elder  * returned in the outbound buffer, or a negative error code.
274036be9a76SAlex Elder  */
274136be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
274236be9a76SAlex Elder 			     const char *object_name,
274336be9a76SAlex Elder 			     const char *class_name,
274436be9a76SAlex Elder 			     const char *method_name,
27454157976bSAlex Elder 			     const void *outbound,
274636be9a76SAlex Elder 			     size_t outbound_size,
27474157976bSAlex Elder 			     void *inbound,
2748e2a58ee5SAlex Elder 			     size_t inbound_size)
274936be9a76SAlex Elder {
27502169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
275136be9a76SAlex Elder 	struct rbd_obj_request *obj_request;
275236be9a76SAlex Elder 	struct page **pages;
275336be9a76SAlex Elder 	u32 page_count;
275436be9a76SAlex Elder 	int ret;
275536be9a76SAlex Elder 
275636be9a76SAlex Elder 	/*
27576010a451SAlex Elder 	 * Method calls are ultimately read operations.  The result
27586010a451SAlex Elder 	 * should placed into the inbound buffer provided.  They
27596010a451SAlex Elder 	 * also supply outbound data--parameters for the object
27606010a451SAlex Elder 	 * method.  Currently if this is present it will be a
27616010a451SAlex Elder 	 * snapshot id.
276236be9a76SAlex Elder 	 */
276336be9a76SAlex Elder 	page_count = (u32)calc_pages_for(0, inbound_size);
276436be9a76SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
276536be9a76SAlex Elder 	if (IS_ERR(pages))
276636be9a76SAlex Elder 		return PTR_ERR(pages);
276736be9a76SAlex Elder 
276836be9a76SAlex Elder 	ret = -ENOMEM;
27696010a451SAlex Elder 	obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
277036be9a76SAlex Elder 							OBJ_REQUEST_PAGES);
277136be9a76SAlex Elder 	if (!obj_request)
277236be9a76SAlex Elder 		goto out;
277336be9a76SAlex Elder 
277436be9a76SAlex Elder 	obj_request->pages = pages;
277536be9a76SAlex Elder 	obj_request->page_count = page_count;
277636be9a76SAlex Elder 
2777430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
277836be9a76SAlex Elder 	if (!obj_request->osd_req)
277936be9a76SAlex Elder 		goto out;
278036be9a76SAlex Elder 
2781c99d2d4aSAlex Elder 	osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
278204017e29SAlex Elder 					class_name, method_name);
278304017e29SAlex Elder 	if (outbound_size) {
278404017e29SAlex Elder 		struct ceph_pagelist *pagelist;
278504017e29SAlex Elder 
278604017e29SAlex Elder 		pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
278704017e29SAlex Elder 		if (!pagelist)
278804017e29SAlex Elder 			goto out;
278904017e29SAlex Elder 
279004017e29SAlex Elder 		ceph_pagelist_init(pagelist);
279104017e29SAlex Elder 		ceph_pagelist_append(pagelist, outbound, outbound_size);
279204017e29SAlex Elder 		osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
279304017e29SAlex Elder 						pagelist);
279404017e29SAlex Elder 	}
2795a4ce40a9SAlex Elder 	osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
2796a4ce40a9SAlex Elder 					obj_request->pages, inbound_size,
279744cd188dSAlex Elder 					0, false, false);
27989d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
2799430c28c3SAlex Elder 
280036be9a76SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
280136be9a76SAlex Elder 	if (ret)
280236be9a76SAlex Elder 		goto out;
280336be9a76SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
280436be9a76SAlex Elder 	if (ret)
280536be9a76SAlex Elder 		goto out;
280636be9a76SAlex Elder 
280736be9a76SAlex Elder 	ret = obj_request->result;
280836be9a76SAlex Elder 	if (ret < 0)
280936be9a76SAlex Elder 		goto out;
281057385b51SAlex Elder 
281157385b51SAlex Elder 	rbd_assert(obj_request->xferred < (u64)INT_MAX);
281257385b51SAlex Elder 	ret = (int)obj_request->xferred;
2813903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
281436be9a76SAlex Elder out:
281536be9a76SAlex Elder 	if (obj_request)
281636be9a76SAlex Elder 		rbd_obj_request_put(obj_request);
281736be9a76SAlex Elder 	else
281836be9a76SAlex Elder 		ceph_release_page_vector(pages, page_count);
281936be9a76SAlex Elder 
282036be9a76SAlex Elder 	return ret;
282136be9a76SAlex Elder }
282236be9a76SAlex Elder 
2823bf0d5f50SAlex Elder static void rbd_request_fn(struct request_queue *q)
2824cc344fa1SAlex Elder 		__releases(q->queue_lock) __acquires(q->queue_lock)
2825bf0d5f50SAlex Elder {
2826bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = q->queuedata;
2827bf0d5f50SAlex Elder 	bool read_only = rbd_dev->mapping.read_only;
2828bf0d5f50SAlex Elder 	struct request *rq;
2829bf0d5f50SAlex Elder 	int result;
2830bf0d5f50SAlex Elder 
2831bf0d5f50SAlex Elder 	while ((rq = blk_fetch_request(q))) {
2832bf0d5f50SAlex Elder 		bool write_request = rq_data_dir(rq) == WRITE;
2833bf0d5f50SAlex Elder 		struct rbd_img_request *img_request;
2834bf0d5f50SAlex Elder 		u64 offset;
2835bf0d5f50SAlex Elder 		u64 length;
2836bf0d5f50SAlex Elder 
2837bf0d5f50SAlex Elder 		/* Ignore any non-FS requests that filter through. */
2838bf0d5f50SAlex Elder 
2839bf0d5f50SAlex Elder 		if (rq->cmd_type != REQ_TYPE_FS) {
28404dda41d3SAlex Elder 			dout("%s: non-fs request type %d\n", __func__,
28414dda41d3SAlex Elder 				(int) rq->cmd_type);
28424dda41d3SAlex Elder 			__blk_end_request_all(rq, 0);
28434dda41d3SAlex Elder 			continue;
28444dda41d3SAlex Elder 		}
28454dda41d3SAlex Elder 
28464dda41d3SAlex Elder 		/* Ignore/skip any zero-length requests */
28474dda41d3SAlex Elder 
28484dda41d3SAlex Elder 		offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
28494dda41d3SAlex Elder 		length = (u64) blk_rq_bytes(rq);
28504dda41d3SAlex Elder 
28514dda41d3SAlex Elder 		if (!length) {
28524dda41d3SAlex Elder 			dout("%s: zero-length request\n", __func__);
2853bf0d5f50SAlex Elder 			__blk_end_request_all(rq, 0);
2854bf0d5f50SAlex Elder 			continue;
2855bf0d5f50SAlex Elder 		}
2856bf0d5f50SAlex Elder 
2857bf0d5f50SAlex Elder 		spin_unlock_irq(q->queue_lock);
2858bf0d5f50SAlex Elder 
2859bf0d5f50SAlex Elder 		/* Disallow writes to a read-only device */
2860bf0d5f50SAlex Elder 
2861bf0d5f50SAlex Elder 		if (write_request) {
2862bf0d5f50SAlex Elder 			result = -EROFS;
2863bf0d5f50SAlex Elder 			if (read_only)
2864bf0d5f50SAlex Elder 				goto end_request;
2865bf0d5f50SAlex Elder 			rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
2866bf0d5f50SAlex Elder 		}
2867bf0d5f50SAlex Elder 
28686d292906SAlex Elder 		/*
28696d292906SAlex Elder 		 * Quit early if the mapped snapshot no longer
28706d292906SAlex Elder 		 * exists.  It's still possible the snapshot will
28716d292906SAlex Elder 		 * have disappeared by the time our request arrives
28726d292906SAlex Elder 		 * at the osd, but there's no sense in sending it if
28736d292906SAlex Elder 		 * we already know.
28746d292906SAlex Elder 		 */
28756d292906SAlex Elder 		if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
2876bf0d5f50SAlex Elder 			dout("request for non-existent snapshot");
2877bf0d5f50SAlex Elder 			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
2878bf0d5f50SAlex Elder 			result = -ENXIO;
2879bf0d5f50SAlex Elder 			goto end_request;
2880bf0d5f50SAlex Elder 		}
2881bf0d5f50SAlex Elder 
2882bf0d5f50SAlex Elder 		result = -EINVAL;
2883c0cd10dbSAlex Elder 		if (offset && length > U64_MAX - offset + 1) {
2884c0cd10dbSAlex Elder 			rbd_warn(rbd_dev, "bad request range (%llu~%llu)\n",
2885c0cd10dbSAlex Elder 				offset, length);
2886bf0d5f50SAlex Elder 			goto end_request;	/* Shouldn't happen */
2887c0cd10dbSAlex Elder 		}
2888bf0d5f50SAlex Elder 
288900a653e2SAlex Elder 		result = -EIO;
289000a653e2SAlex Elder 		if (offset + length > rbd_dev->mapping.size) {
289100a653e2SAlex Elder 			rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)\n",
289200a653e2SAlex Elder 				offset, length, rbd_dev->mapping.size);
289300a653e2SAlex Elder 			goto end_request;
289400a653e2SAlex Elder 		}
289500a653e2SAlex Elder 
2896bf0d5f50SAlex Elder 		result = -ENOMEM;
2897bf0d5f50SAlex Elder 		img_request = rbd_img_request_create(rbd_dev, offset, length,
28989849e986SAlex Elder 							write_request, false);
2899bf0d5f50SAlex Elder 		if (!img_request)
2900bf0d5f50SAlex Elder 			goto end_request;
2901bf0d5f50SAlex Elder 
2902bf0d5f50SAlex Elder 		img_request->rq = rq;
2903bf0d5f50SAlex Elder 
2904f1a4739fSAlex Elder 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
2905f1a4739fSAlex Elder 						rq->bio);
2906bf0d5f50SAlex Elder 		if (!result)
2907bf0d5f50SAlex Elder 			result = rbd_img_request_submit(img_request);
2908bf0d5f50SAlex Elder 		if (result)
2909bf0d5f50SAlex Elder 			rbd_img_request_put(img_request);
2910bf0d5f50SAlex Elder end_request:
2911bf0d5f50SAlex Elder 		spin_lock_irq(q->queue_lock);
2912bf0d5f50SAlex Elder 		if (result < 0) {
29137da22d29SAlex Elder 			rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
29147da22d29SAlex Elder 				write_request ? "write" : "read",
29157da22d29SAlex Elder 				length, offset, result);
29167da22d29SAlex Elder 
2917bf0d5f50SAlex Elder 			__blk_end_request_all(rq, result);
2918bf0d5f50SAlex Elder 		}
2919bf0d5f50SAlex Elder 	}
2920bf0d5f50SAlex Elder }
2921bf0d5f50SAlex Elder 
2922602adf40SYehuda Sadeh /*
2923602adf40SYehuda Sadeh  * a queue callback. Makes sure that we don't create a bio that spans across
2924602adf40SYehuda Sadeh  * multiple osd objects. One exception would be with a single page bios,
2925f7760dadSAlex Elder  * which we handle later at bio_chain_clone_range()
2926602adf40SYehuda Sadeh  */
2927602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
2928602adf40SYehuda Sadeh 			  struct bio_vec *bvec)
2929602adf40SYehuda Sadeh {
2930602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
2931e5cfeed2SAlex Elder 	sector_t sector_offset;
2932e5cfeed2SAlex Elder 	sector_t sectors_per_obj;
2933e5cfeed2SAlex Elder 	sector_t obj_sector_offset;
2934e5cfeed2SAlex Elder 	int ret;
2935602adf40SYehuda Sadeh 
2936e5cfeed2SAlex Elder 	/*
2937e5cfeed2SAlex Elder 	 * Find how far into its rbd object the partition-relative
2938e5cfeed2SAlex Elder 	 * bio start sector is to offset relative to the enclosing
2939e5cfeed2SAlex Elder 	 * device.
2940e5cfeed2SAlex Elder 	 */
2941e5cfeed2SAlex Elder 	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
2942e5cfeed2SAlex Elder 	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
2943e5cfeed2SAlex Elder 	obj_sector_offset = sector_offset & (sectors_per_obj - 1);
2944593a9e7bSAlex Elder 
2945e5cfeed2SAlex Elder 	/*
2946e5cfeed2SAlex Elder 	 * Compute the number of bytes from that offset to the end
2947e5cfeed2SAlex Elder 	 * of the object.  Account for what's already used by the bio.
2948e5cfeed2SAlex Elder 	 */
2949e5cfeed2SAlex Elder 	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
2950e5cfeed2SAlex Elder 	if (ret > bmd->bi_size)
2951e5cfeed2SAlex Elder 		ret -= bmd->bi_size;
2952e5cfeed2SAlex Elder 	else
2953e5cfeed2SAlex Elder 		ret = 0;
2954e5cfeed2SAlex Elder 
2955e5cfeed2SAlex Elder 	/*
2956e5cfeed2SAlex Elder 	 * Don't send back more than was asked for.  And if the bio
2957e5cfeed2SAlex Elder 	 * was empty, let the whole thing through because:  "Note
2958e5cfeed2SAlex Elder 	 * that a block device *must* allow a single page to be
2959e5cfeed2SAlex Elder 	 * added to an empty bio."
2960e5cfeed2SAlex Elder 	 */
2961e5cfeed2SAlex Elder 	rbd_assert(bvec->bv_len <= PAGE_SIZE);
2962e5cfeed2SAlex Elder 	if (ret > (int) bvec->bv_len || !bmd->bi_size)
2963e5cfeed2SAlex Elder 		ret = (int) bvec->bv_len;
2964e5cfeed2SAlex Elder 
2965e5cfeed2SAlex Elder 	return ret;
2966602adf40SYehuda Sadeh }
2967602adf40SYehuda Sadeh 
2968602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
2969602adf40SYehuda Sadeh {
2970602adf40SYehuda Sadeh 	struct gendisk *disk = rbd_dev->disk;
2971602adf40SYehuda Sadeh 
2972602adf40SYehuda Sadeh 	if (!disk)
2973602adf40SYehuda Sadeh 		return;
2974602adf40SYehuda Sadeh 
2975a0cab924SAlex Elder 	rbd_dev->disk = NULL;
2976a0cab924SAlex Elder 	if (disk->flags & GENHD_FL_UP) {
2977602adf40SYehuda Sadeh 		del_gendisk(disk);
2978602adf40SYehuda Sadeh 		if (disk->queue)
2979602adf40SYehuda Sadeh 			blk_cleanup_queue(disk->queue);
2980a0cab924SAlex Elder 	}
2981602adf40SYehuda Sadeh 	put_disk(disk);
2982602adf40SYehuda Sadeh }
2983602adf40SYehuda Sadeh 
2984788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
2985788e2df3SAlex Elder 				const char *object_name,
29867097f8dfSAlex Elder 				u64 offset, u64 length, void *buf)
2987788e2df3SAlex Elder 
2988788e2df3SAlex Elder {
29892169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2990788e2df3SAlex Elder 	struct rbd_obj_request *obj_request;
2991788e2df3SAlex Elder 	struct page **pages = NULL;
2992788e2df3SAlex Elder 	u32 page_count;
29931ceae7efSAlex Elder 	size_t size;
2994788e2df3SAlex Elder 	int ret;
2995788e2df3SAlex Elder 
2996788e2df3SAlex Elder 	page_count = (u32) calc_pages_for(offset, length);
2997788e2df3SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2998788e2df3SAlex Elder 	if (IS_ERR(pages))
2999788e2df3SAlex Elder 		ret = PTR_ERR(pages);
3000788e2df3SAlex Elder 
3001788e2df3SAlex Elder 	ret = -ENOMEM;
3002788e2df3SAlex Elder 	obj_request = rbd_obj_request_create(object_name, offset, length,
3003788e2df3SAlex Elder 							OBJ_REQUEST_PAGES);
3004788e2df3SAlex Elder 	if (!obj_request)
3005788e2df3SAlex Elder 		goto out;
3006788e2df3SAlex Elder 
3007788e2df3SAlex Elder 	obj_request->pages = pages;
3008788e2df3SAlex Elder 	obj_request->page_count = page_count;
3009788e2df3SAlex Elder 
3010430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
3011788e2df3SAlex Elder 	if (!obj_request->osd_req)
3012788e2df3SAlex Elder 		goto out;
3013788e2df3SAlex Elder 
3014c99d2d4aSAlex Elder 	osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
3015c99d2d4aSAlex Elder 					offset, length, 0, 0);
3016406e2c9fSAlex Elder 	osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
3017a4ce40a9SAlex Elder 					obj_request->pages,
301844cd188dSAlex Elder 					obj_request->length,
301944cd188dSAlex Elder 					obj_request->offset & ~PAGE_MASK,
302044cd188dSAlex Elder 					false, false);
30219d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
3022430c28c3SAlex Elder 
3023788e2df3SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
3024788e2df3SAlex Elder 	if (ret)
3025788e2df3SAlex Elder 		goto out;
3026788e2df3SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
3027788e2df3SAlex Elder 	if (ret)
3028788e2df3SAlex Elder 		goto out;
3029788e2df3SAlex Elder 
3030788e2df3SAlex Elder 	ret = obj_request->result;
3031788e2df3SAlex Elder 	if (ret < 0)
3032788e2df3SAlex Elder 		goto out;
30331ceae7efSAlex Elder 
30341ceae7efSAlex Elder 	rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
30351ceae7efSAlex Elder 	size = (size_t) obj_request->xferred;
3036903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, buf, 0, size);
303723ed6e13SAlex Elder 	rbd_assert(size <= (size_t)INT_MAX);
303823ed6e13SAlex Elder 	ret = (int)size;
3039788e2df3SAlex Elder out:
3040788e2df3SAlex Elder 	if (obj_request)
3041788e2df3SAlex Elder 		rbd_obj_request_put(obj_request);
3042788e2df3SAlex Elder 	else
3043788e2df3SAlex Elder 		ceph_release_page_vector(pages, page_count);
3044788e2df3SAlex Elder 
3045788e2df3SAlex Elder 	return ret;
3046788e2df3SAlex Elder }
3047788e2df3SAlex Elder 
3048602adf40SYehuda Sadeh /*
3049662518b1SAlex Elder  * Read the complete header for the given rbd device.  On successful
3050662518b1SAlex Elder  * return, the rbd_dev->header field will contain up-to-date
3051662518b1SAlex Elder  * information about the image.
30524156d998SAlex Elder  */
3053662518b1SAlex Elder static int rbd_dev_v1_header_read(struct rbd_device *rbd_dev)
30544156d998SAlex Elder {
30554156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk = NULL;
30564156d998SAlex Elder 	u32 snap_count = 0;
30574156d998SAlex Elder 	u64 names_size = 0;
30584156d998SAlex Elder 	u32 want_count;
30594156d998SAlex Elder 	int ret;
30604156d998SAlex Elder 
30614156d998SAlex Elder 	/*
30624156d998SAlex Elder 	 * The complete header will include an array of its 64-bit
30634156d998SAlex Elder 	 * snapshot ids, followed by the names of those snapshots as
30644156d998SAlex Elder 	 * a contiguous block of NUL-terminated strings.  Note that
30654156d998SAlex Elder 	 * the number of snapshots could change by the time we read
30664156d998SAlex Elder 	 * it in, in which case we re-read it.
30674156d998SAlex Elder 	 */
30684156d998SAlex Elder 	do {
30694156d998SAlex Elder 		size_t size;
30704156d998SAlex Elder 
30714156d998SAlex Elder 		kfree(ondisk);
30724156d998SAlex Elder 
30734156d998SAlex Elder 		size = sizeof (*ondisk);
30744156d998SAlex Elder 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
30754156d998SAlex Elder 		size += names_size;
30764156d998SAlex Elder 		ondisk = kmalloc(size, GFP_KERNEL);
30774156d998SAlex Elder 		if (!ondisk)
3078662518b1SAlex Elder 			return -ENOMEM;
30794156d998SAlex Elder 
3080788e2df3SAlex Elder 		ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
30817097f8dfSAlex Elder 				       0, size, ondisk);
30824156d998SAlex Elder 		if (ret < 0)
3083662518b1SAlex Elder 			goto out;
3084c0cd10dbSAlex Elder 		if ((size_t)ret < size) {
30854156d998SAlex Elder 			ret = -ENXIO;
308606ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
308706ecc6cbSAlex Elder 				size, ret);
3088662518b1SAlex Elder 			goto out;
30894156d998SAlex Elder 		}
30904156d998SAlex Elder 		if (!rbd_dev_ondisk_valid(ondisk)) {
30914156d998SAlex Elder 			ret = -ENXIO;
309206ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "invalid header");
3093662518b1SAlex Elder 			goto out;
30944156d998SAlex Elder 		}
30954156d998SAlex Elder 
30964156d998SAlex Elder 		names_size = le64_to_cpu(ondisk->snap_names_len);
30974156d998SAlex Elder 		want_count = snap_count;
30984156d998SAlex Elder 		snap_count = le32_to_cpu(ondisk->snap_count);
30994156d998SAlex Elder 	} while (snap_count != want_count);
31004156d998SAlex Elder 
3101662518b1SAlex Elder 	ret = rbd_header_from_disk(rbd_dev, ondisk);
3102662518b1SAlex Elder out:
31034156d998SAlex Elder 	kfree(ondisk);
3104602adf40SYehuda Sadeh 
31054156d998SAlex Elder 	return ret;
3106602adf40SYehuda Sadeh }
3107602adf40SYehuda Sadeh 
3108602adf40SYehuda Sadeh /*
3109602adf40SYehuda Sadeh  * Refresh a format 1 image: calls rbd_dev_v1_header_read() and returns its result
3110602adf40SYehuda Sadeh  */
3111cc4a38bdSAlex Elder static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev)
3112602adf40SYehuda Sadeh {
3113662518b1SAlex Elder 	return rbd_dev_v1_header_read(rbd_dev);
3114602adf40SYehuda Sadeh }
3115602adf40SYehuda Sadeh 
311615228edeSAlex Elder /*
311715228edeSAlex Elder  * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
311815228edeSAlex Elder  * has disappeared from the (just updated) snapshot context.
311915228edeSAlex Elder  */
312015228edeSAlex Elder static void rbd_exists_validate(struct rbd_device *rbd_dev)
312115228edeSAlex Elder {
312215228edeSAlex Elder 	u64 snap_id;
312315228edeSAlex Elder 
312415228edeSAlex Elder 	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
312515228edeSAlex Elder 		return;
312615228edeSAlex Elder 
312715228edeSAlex Elder 	snap_id = rbd_dev->spec->snap_id;
312815228edeSAlex Elder 	if (snap_id == CEPH_NOSNAP)
312915228edeSAlex Elder 		return;
313015228edeSAlex Elder 
313115228edeSAlex Elder 	if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
313215228edeSAlex Elder 		clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
313315228edeSAlex Elder }
313415228edeSAlex Elder 
3135cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev)
31361fe5e993SAlex Elder {
3137e627db08SAlex Elder 	u64 mapping_size;
31381fe5e993SAlex Elder 	int ret;
31391fe5e993SAlex Elder 
3140117973fbSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
3141e627db08SAlex Elder 	mapping_size = rbd_dev->mapping.size;
31421fe5e993SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3143117973fbSAlex Elder 	if (rbd_dev->image_format == 1)
3144cc4a38bdSAlex Elder 		ret = rbd_dev_v1_refresh(rbd_dev);
3145117973fbSAlex Elder 	else
3146cc4a38bdSAlex Elder 		ret = rbd_dev_v2_refresh(rbd_dev);
314715228edeSAlex Elder 
314815228edeSAlex Elder 	/* If it's a mapped snapshot, validate its EXISTS flag */
314915228edeSAlex Elder 
315015228edeSAlex Elder 	rbd_exists_validate(rbd_dev);
31511fe5e993SAlex Elder 	mutex_unlock(&ctl_mutex);
315200a653e2SAlex Elder 	if (mapping_size != rbd_dev->mapping.size) {
315300a653e2SAlex Elder 		sector_t size;
315400a653e2SAlex Elder 
315500a653e2SAlex Elder 		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
315600a653e2SAlex Elder 		dout("setting size to %llu sectors", (unsigned long long)size);
315700a653e2SAlex Elder 		set_capacity(rbd_dev->disk, size);
3158a3fbe5d4SAlex Elder 		revalidate_disk(rbd_dev->disk);
315900a653e2SAlex Elder 	}
31601fe5e993SAlex Elder 
31611fe5e993SAlex Elder 	return ret;
31621fe5e993SAlex Elder }
31631fe5e993SAlex Elder 
3164602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
3165602adf40SYehuda Sadeh {
3166602adf40SYehuda Sadeh 	struct gendisk *disk;
3167602adf40SYehuda Sadeh 	struct request_queue *q;
3168593a9e7bSAlex Elder 	u64 segment_size;
3169602adf40SYehuda Sadeh 
3170602adf40SYehuda Sadeh 	/* create gendisk info */
3171602adf40SYehuda Sadeh 	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
3172602adf40SYehuda Sadeh 	if (!disk)
31731fcdb8aaSAlex Elder 		return -ENOMEM;
3174602adf40SYehuda Sadeh 
3175f0f8cef5SAlex Elder 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
3176de71a297SAlex Elder 		 rbd_dev->dev_id);
3177602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
3178602adf40SYehuda Sadeh 	disk->first_minor = 0;
3179602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
3180602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
3181602adf40SYehuda Sadeh 
3182bf0d5f50SAlex Elder 	q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
3183602adf40SYehuda Sadeh 	if (!q)
3184602adf40SYehuda Sadeh 		goto out_disk;
3185029bcbd8SJosh Durgin 
3186593a9e7bSAlex Elder 	/* We use the default size, but let's be explicit about it. */
3187593a9e7bSAlex Elder 	blk_queue_physical_block_size(q, SECTOR_SIZE);
3188593a9e7bSAlex Elder 
3189029bcbd8SJosh Durgin 	/* set io sizes to object size */
3190593a9e7bSAlex Elder 	segment_size = rbd_obj_bytes(&rbd_dev->header);
3191593a9e7bSAlex Elder 	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
3192593a9e7bSAlex Elder 	blk_queue_max_segment_size(q, segment_size);
3193593a9e7bSAlex Elder 	blk_queue_io_min(q, segment_size);
3194593a9e7bSAlex Elder 	blk_queue_io_opt(q, segment_size);
3195029bcbd8SJosh Durgin 
3196602adf40SYehuda Sadeh 	blk_queue_merge_bvec(q, rbd_merge_bvec);
3197602adf40SYehuda Sadeh 	disk->queue = q;
3198602adf40SYehuda Sadeh 
3199602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
3200602adf40SYehuda Sadeh 
3201602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
3202602adf40SYehuda Sadeh 
3203602adf40SYehuda Sadeh 	return 0;
3204602adf40SYehuda Sadeh out_disk:
3205602adf40SYehuda Sadeh 	put_disk(disk);
32061fcdb8aaSAlex Elder 
32071fcdb8aaSAlex Elder 	return -ENOMEM;
3208602adf40SYehuda Sadeh }
3209602adf40SYehuda Sadeh 
3210dfc5606dSYehuda Sadeh /*
3211dfc5606dSYehuda Sadeh   sysfs
3212dfc5606dSYehuda Sadeh */
3213602adf40SYehuda Sadeh 
3214593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
3215593a9e7bSAlex Elder {
3216593a9e7bSAlex Elder 	return container_of(dev, struct rbd_device, dev);
3217593a9e7bSAlex Elder }
3218593a9e7bSAlex Elder 
3219dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
3220dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3221602adf40SYehuda Sadeh {
3222593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3223dfc5606dSYehuda Sadeh 
3224fc71d833SAlex Elder 	return sprintf(buf, "%llu\n",
3225fc71d833SAlex Elder 		(unsigned long long)rbd_dev->mapping.size);
3226602adf40SYehuda Sadeh }
3227602adf40SYehuda Sadeh 
322834b13184SAlex Elder /*
322934b13184SAlex Elder  * Note this shows the features for whatever's mapped, which is not
323034b13184SAlex Elder  * necessarily the base image.
323134b13184SAlex Elder  */
323234b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev,
323334b13184SAlex Elder 			     struct device_attribute *attr, char *buf)
323434b13184SAlex Elder {
323534b13184SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
323634b13184SAlex Elder 
323734b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
323834b13184SAlex Elder 			(unsigned long long)rbd_dev->mapping.features);
323934b13184SAlex Elder }
324034b13184SAlex Elder 
3241dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
3242dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
3243602adf40SYehuda Sadeh {
3244593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3245dfc5606dSYehuda Sadeh 
3246fc71d833SAlex Elder 	if (rbd_dev->major)
3247dfc5606dSYehuda Sadeh 		return sprintf(buf, "%d\n", rbd_dev->major);
3248fc71d833SAlex Elder 
3249fc71d833SAlex Elder 	return sprintf(buf, "(none)\n");
3250fc71d833SAlex Elder 
3251dfc5606dSYehuda Sadeh }
3252dfc5606dSYehuda Sadeh 
3253dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
3254dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
3255dfc5606dSYehuda Sadeh {
3256593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3257dfc5606dSYehuda Sadeh 
32581dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
32591dbb4399SAlex Elder 			ceph_client_id(rbd_dev->rbd_client->client));
3260dfc5606dSYehuda Sadeh }
3261dfc5606dSYehuda Sadeh 
3262dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
3263dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3264dfc5606dSYehuda Sadeh {
3265593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3266dfc5606dSYehuda Sadeh 
32670d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
3268dfc5606dSYehuda Sadeh }
3269dfc5606dSYehuda Sadeh 
32709bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
32719bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
32729bb2f334SAlex Elder {
32739bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
32749bb2f334SAlex Elder 
32750d7dbfceSAlex Elder 	return sprintf(buf, "%llu\n",
32760d7dbfceSAlex Elder 			(unsigned long long) rbd_dev->spec->pool_id);
32779bb2f334SAlex Elder }
32789bb2f334SAlex Elder 
3279dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
3280dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3281dfc5606dSYehuda Sadeh {
3282593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3283dfc5606dSYehuda Sadeh 
3284a92ffdf8SAlex Elder 	if (rbd_dev->spec->image_name)
32850d7dbfceSAlex Elder 		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
3286a92ffdf8SAlex Elder 
3287a92ffdf8SAlex Elder 	return sprintf(buf, "(unknown)\n");
3288dfc5606dSYehuda Sadeh }
3289dfc5606dSYehuda Sadeh 
3290589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev,
3291589d30e0SAlex Elder 			     struct device_attribute *attr, char *buf)
3292589d30e0SAlex Elder {
3293589d30e0SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3294589d30e0SAlex Elder 
32950d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
3296589d30e0SAlex Elder }
3297589d30e0SAlex Elder 
329834b13184SAlex Elder /*
329934b13184SAlex Elder  * Shows the name of the currently-mapped snapshot (or
330034b13184SAlex Elder  * RBD_SNAP_HEAD_NAME for the base image).
330134b13184SAlex Elder  */
3302dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
3303dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
3304dfc5606dSYehuda Sadeh 			     char *buf)
3305dfc5606dSYehuda Sadeh {
3306593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3307dfc5606dSYehuda Sadeh 
33080d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
3309dfc5606dSYehuda Sadeh }
3310dfc5606dSYehuda Sadeh 
331186b00e0dSAlex Elder /*
331286b00e0dSAlex Elder  * For an rbd v2 image, shows the pool id, image id, and snapshot id
331386b00e0dSAlex Elder  * for the parent image.  If there is no parent, simply shows
331486b00e0dSAlex Elder  * "(no parent image)".
331586b00e0dSAlex Elder  */
331686b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev,
331786b00e0dSAlex Elder 			     struct device_attribute *attr,
331886b00e0dSAlex Elder 			     char *buf)
331986b00e0dSAlex Elder {
332086b00e0dSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
332186b00e0dSAlex Elder 	struct rbd_spec *spec = rbd_dev->parent_spec;
332286b00e0dSAlex Elder 	int count;
332386b00e0dSAlex Elder 	char *bufp = buf;
332486b00e0dSAlex Elder 
332586b00e0dSAlex Elder 	if (!spec)
332686b00e0dSAlex Elder 		return sprintf(buf, "(no parent image)\n");
332786b00e0dSAlex Elder 
332886b00e0dSAlex Elder 	count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
332986b00e0dSAlex Elder 			(unsigned long long) spec->pool_id, spec->pool_name);
333086b00e0dSAlex Elder 	if (count < 0)
333186b00e0dSAlex Elder 		return count;
333286b00e0dSAlex Elder 	bufp += count;
333386b00e0dSAlex Elder 
333486b00e0dSAlex Elder 	count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
333586b00e0dSAlex Elder 			spec->image_name ? spec->image_name : "(unknown)");
333686b00e0dSAlex Elder 	if (count < 0)
333786b00e0dSAlex Elder 		return count;
333886b00e0dSAlex Elder 	bufp += count;
333986b00e0dSAlex Elder 
334086b00e0dSAlex Elder 	count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
334186b00e0dSAlex Elder 			(unsigned long long) spec->snap_id, spec->snap_name);
334286b00e0dSAlex Elder 	if (count < 0)
334386b00e0dSAlex Elder 		return count;
334486b00e0dSAlex Elder 	bufp += count;
334586b00e0dSAlex Elder 
334686b00e0dSAlex Elder 	count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
334786b00e0dSAlex Elder 	if (count < 0)
334886b00e0dSAlex Elder 		return count;
334986b00e0dSAlex Elder 	bufp += count;
335086b00e0dSAlex Elder 
335186b00e0dSAlex Elder 	return (ssize_t) (bufp - buf);
335286b00e0dSAlex Elder }
335386b00e0dSAlex Elder 
3354dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
3355dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
3356dfc5606dSYehuda Sadeh 				 const char *buf,
3357dfc5606dSYehuda Sadeh 				 size_t size)
3358dfc5606dSYehuda Sadeh {
3359593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3360b813623aSAlex Elder 	int ret;
3361602adf40SYehuda Sadeh 
3362cc4a38bdSAlex Elder 	ret = rbd_dev_refresh(rbd_dev);
3363e627db08SAlex Elder 	if (ret)
3364e627db08SAlex Elder 		rbd_warn(rbd_dev, ": manual header refresh error (%d)\n", ret);
3365b813623aSAlex Elder 
3366b813623aSAlex Elder 	return ret < 0 ? ret : size;
3367dfc5606dSYehuda Sadeh }
3368602adf40SYehuda Sadeh 
/*
 * Per-device sysfs attributes.  All are read-only for everyone
 * (S_IRUGO) and have only a show routine, except "refresh", which
 * is writable by the owner only (S_IWUSR) and has only a store
 * routine.
 */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);

3380dfc5606dSYehuda Sadeh 
3381dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = {
3382dfc5606dSYehuda Sadeh 	&dev_attr_size.attr,
338334b13184SAlex Elder 	&dev_attr_features.attr,
3384dfc5606dSYehuda Sadeh 	&dev_attr_major.attr,
3385dfc5606dSYehuda Sadeh 	&dev_attr_client_id.attr,
3386dfc5606dSYehuda Sadeh 	&dev_attr_pool.attr,
33879bb2f334SAlex Elder 	&dev_attr_pool_id.attr,
3388dfc5606dSYehuda Sadeh 	&dev_attr_name.attr,
3389589d30e0SAlex Elder 	&dev_attr_image_id.attr,
3390dfc5606dSYehuda Sadeh 	&dev_attr_current_snap.attr,
339186b00e0dSAlex Elder 	&dev_attr_parent.attr,
3392dfc5606dSYehuda Sadeh 	&dev_attr_refresh.attr,
3393dfc5606dSYehuda Sadeh 	NULL
3394dfc5606dSYehuda Sadeh };
3395dfc5606dSYehuda Sadeh 
3396dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = {
3397dfc5606dSYehuda Sadeh 	.attrs = rbd_attrs,
3398dfc5606dSYehuda Sadeh };
3399dfc5606dSYehuda Sadeh 
3400dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = {
3401dfc5606dSYehuda Sadeh 	&rbd_attr_group,
3402dfc5606dSYehuda Sadeh 	NULL
3403dfc5606dSYehuda Sadeh };
3404dfc5606dSYehuda Sadeh 
/* Release callback installed in rbd_device_type; it does nothing. */
static void rbd_sysfs_dev_release(struct device *dev)
{
	/* Intentionally empty */
}
3408dfc5606dSYehuda Sadeh 
3409dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = {
3410dfc5606dSYehuda Sadeh 	.name		= "rbd",
3411dfc5606dSYehuda Sadeh 	.groups		= rbd_attr_groups,
3412dfc5606dSYehuda Sadeh 	.release	= rbd_sysfs_dev_release,
3413dfc5606dSYehuda Sadeh };
3414dfc5606dSYehuda Sadeh 
34158b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
34168b8fb99cSAlex Elder {
34178b8fb99cSAlex Elder 	kref_get(&spec->kref);
34188b8fb99cSAlex Elder 
34198b8fb99cSAlex Elder 	return spec;
34208b8fb99cSAlex Elder }
34218b8fb99cSAlex Elder 
34228b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref);
34238b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec)
34248b8fb99cSAlex Elder {
34258b8fb99cSAlex Elder 	if (spec)
34268b8fb99cSAlex Elder 		kref_put(&spec->kref, rbd_spec_free);
34278b8fb99cSAlex Elder }
34288b8fb99cSAlex Elder 
34298b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void)
34308b8fb99cSAlex Elder {
34318b8fb99cSAlex Elder 	struct rbd_spec *spec;
34328b8fb99cSAlex Elder 
34338b8fb99cSAlex Elder 	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
34348b8fb99cSAlex Elder 	if (!spec)
34358b8fb99cSAlex Elder 		return NULL;
34368b8fb99cSAlex Elder 	kref_init(&spec->kref);
34378b8fb99cSAlex Elder 
34388b8fb99cSAlex Elder 	return spec;
34398b8fb99cSAlex Elder }
34408b8fb99cSAlex Elder 
34418b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref)
34428b8fb99cSAlex Elder {
34438b8fb99cSAlex Elder 	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
34448b8fb99cSAlex Elder 
34458b8fb99cSAlex Elder 	kfree(spec->pool_name);
34468b8fb99cSAlex Elder 	kfree(spec->image_id);
34478b8fb99cSAlex Elder 	kfree(spec->image_name);
34488b8fb99cSAlex Elder 	kfree(spec->snap_name);
34498b8fb99cSAlex Elder 	kfree(spec);
34508b8fb99cSAlex Elder }
34518b8fb99cSAlex Elder 
3452cc344fa1SAlex Elder static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
3453c53d5893SAlex Elder 				struct rbd_spec *spec)
3454c53d5893SAlex Elder {
3455c53d5893SAlex Elder 	struct rbd_device *rbd_dev;
3456c53d5893SAlex Elder 
3457c53d5893SAlex Elder 	rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
3458c53d5893SAlex Elder 	if (!rbd_dev)
3459c53d5893SAlex Elder 		return NULL;
3460c53d5893SAlex Elder 
3461c53d5893SAlex Elder 	spin_lock_init(&rbd_dev->lock);
34626d292906SAlex Elder 	rbd_dev->flags = 0;
3463c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->node);
3464c53d5893SAlex Elder 	init_rwsem(&rbd_dev->header_rwsem);
3465c53d5893SAlex Elder 
3466c53d5893SAlex Elder 	rbd_dev->spec = spec;
3467c53d5893SAlex Elder 	rbd_dev->rbd_client = rbdc;
3468c53d5893SAlex Elder 
34690903e875SAlex Elder 	/* Initialize the layout used for all rbd requests */
34700903e875SAlex Elder 
34710903e875SAlex Elder 	rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
34720903e875SAlex Elder 	rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
34730903e875SAlex Elder 	rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
34740903e875SAlex Elder 	rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
34750903e875SAlex Elder 
3476c53d5893SAlex Elder 	return rbd_dev;
3477c53d5893SAlex Elder }
3478c53d5893SAlex Elder 
3479c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev)
3480c53d5893SAlex Elder {
3481c53d5893SAlex Elder 	rbd_put_client(rbd_dev->rbd_client);
3482c53d5893SAlex Elder 	rbd_spec_put(rbd_dev->spec);
3483c53d5893SAlex Elder 	kfree(rbd_dev);
3484c53d5893SAlex Elder }
3485c53d5893SAlex Elder 
3486dfc5606dSYehuda Sadeh /*
34879d475de5SAlex Elder  * Get the size and object order for an image snapshot, or if
34889d475de5SAlex Elder  * snap_id is CEPH_NOSNAP, gets this information for the base
34899d475de5SAlex Elder  * image.
34909d475de5SAlex Elder  */
34919d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
34929d475de5SAlex Elder 				u8 *order, u64 *snap_size)
34939d475de5SAlex Elder {
34949d475de5SAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
34959d475de5SAlex Elder 	int ret;
34969d475de5SAlex Elder 	struct {
34979d475de5SAlex Elder 		u8 order;
34989d475de5SAlex Elder 		__le64 size;
34999d475de5SAlex Elder 	} __attribute__ ((packed)) size_buf = { 0 };
35009d475de5SAlex Elder 
350136be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
35029d475de5SAlex Elder 				"rbd", "get_size",
35034157976bSAlex Elder 				&snapid, sizeof (snapid),
3504e2a58ee5SAlex Elder 				&size_buf, sizeof (size_buf));
350536be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
35069d475de5SAlex Elder 	if (ret < 0)
35079d475de5SAlex Elder 		return ret;
350857385b51SAlex Elder 	if (ret < sizeof (size_buf))
350957385b51SAlex Elder 		return -ERANGE;
35109d475de5SAlex Elder 
3511c86f86e9SAlex Elder 	if (order)
35129d475de5SAlex Elder 		*order = size_buf.order;
35139d475de5SAlex Elder 	*snap_size = le64_to_cpu(size_buf.size);
35149d475de5SAlex Elder 
35159d475de5SAlex Elder 	dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
35169d475de5SAlex Elder 		(unsigned long long)snap_id, (unsigned int)*order,
35179d475de5SAlex Elder 		(unsigned long long)*snap_size);
35189d475de5SAlex Elder 
35199d475de5SAlex Elder 	return 0;
35209d475de5SAlex Elder }
35219d475de5SAlex Elder 
35229d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
35239d475de5SAlex Elder {
35249d475de5SAlex Elder 	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
35259d475de5SAlex Elder 					&rbd_dev->header.obj_order,
35269d475de5SAlex Elder 					&rbd_dev->header.image_size);
35279d475de5SAlex Elder }
35289d475de5SAlex Elder 
35291e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
35301e130199SAlex Elder {
35311e130199SAlex Elder 	void *reply_buf;
35321e130199SAlex Elder 	int ret;
35331e130199SAlex Elder 	void *p;
35341e130199SAlex Elder 
35351e130199SAlex Elder 	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
35361e130199SAlex Elder 	if (!reply_buf)
35371e130199SAlex Elder 		return -ENOMEM;
35381e130199SAlex Elder 
353936be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
35404157976bSAlex Elder 				"rbd", "get_object_prefix", NULL, 0,
3541e2a58ee5SAlex Elder 				reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
354236be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
35431e130199SAlex Elder 	if (ret < 0)
35441e130199SAlex Elder 		goto out;
35451e130199SAlex Elder 
35461e130199SAlex Elder 	p = reply_buf;
35471e130199SAlex Elder 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
354857385b51SAlex Elder 						p + ret, NULL, GFP_NOIO);
354957385b51SAlex Elder 	ret = 0;
35501e130199SAlex Elder 
35511e130199SAlex Elder 	if (IS_ERR(rbd_dev->header.object_prefix)) {
35521e130199SAlex Elder 		ret = PTR_ERR(rbd_dev->header.object_prefix);
35531e130199SAlex Elder 		rbd_dev->header.object_prefix = NULL;
35541e130199SAlex Elder 	} else {
35551e130199SAlex Elder 		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
35561e130199SAlex Elder 	}
35571e130199SAlex Elder out:
35581e130199SAlex Elder 	kfree(reply_buf);
35591e130199SAlex Elder 
35601e130199SAlex Elder 	return ret;
35611e130199SAlex Elder }
35621e130199SAlex Elder 
3563b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
3564b1b5402aSAlex Elder 		u64 *snap_features)
3565b1b5402aSAlex Elder {
3566b1b5402aSAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
3567b1b5402aSAlex Elder 	struct {
3568b1b5402aSAlex Elder 		__le64 features;
3569b1b5402aSAlex Elder 		__le64 incompat;
35704157976bSAlex Elder 	} __attribute__ ((packed)) features_buf = { 0 };
3571d889140cSAlex Elder 	u64 incompat;
3572b1b5402aSAlex Elder 	int ret;
3573b1b5402aSAlex Elder 
357436be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3575b1b5402aSAlex Elder 				"rbd", "get_features",
35764157976bSAlex Elder 				&snapid, sizeof (snapid),
3577e2a58ee5SAlex Elder 				&features_buf, sizeof (features_buf));
357836be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3579b1b5402aSAlex Elder 	if (ret < 0)
3580b1b5402aSAlex Elder 		return ret;
358157385b51SAlex Elder 	if (ret < sizeof (features_buf))
358257385b51SAlex Elder 		return -ERANGE;
3583d889140cSAlex Elder 
3584d889140cSAlex Elder 	incompat = le64_to_cpu(features_buf.incompat);
35855cbf6f12SAlex Elder 	if (incompat & ~RBD_FEATURES_SUPPORTED)
3586b8f5c6edSAlex Elder 		return -ENXIO;
3587d889140cSAlex Elder 
3588b1b5402aSAlex Elder 	*snap_features = le64_to_cpu(features_buf.features);
3589b1b5402aSAlex Elder 
3590b1b5402aSAlex Elder 	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
3591b1b5402aSAlex Elder 		(unsigned long long)snap_id,
3592b1b5402aSAlex Elder 		(unsigned long long)*snap_features,
3593b1b5402aSAlex Elder 		(unsigned long long)le64_to_cpu(features_buf.incompat));
3594b1b5402aSAlex Elder 
3595b1b5402aSAlex Elder 	return 0;
3596b1b5402aSAlex Elder }
3597b1b5402aSAlex Elder 
3598b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
3599b1b5402aSAlex Elder {
3600b1b5402aSAlex Elder 	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
3601b1b5402aSAlex Elder 						&rbd_dev->header.features);
3602b1b5402aSAlex Elder }
3603b1b5402aSAlex Elder 
360486b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
360586b00e0dSAlex Elder {
360686b00e0dSAlex Elder 	struct rbd_spec *parent_spec;
360786b00e0dSAlex Elder 	size_t size;
360886b00e0dSAlex Elder 	void *reply_buf = NULL;
360986b00e0dSAlex Elder 	__le64 snapid;
361086b00e0dSAlex Elder 	void *p;
361186b00e0dSAlex Elder 	void *end;
361286b00e0dSAlex Elder 	char *image_id;
361386b00e0dSAlex Elder 	u64 overlap;
361486b00e0dSAlex Elder 	int ret;
361586b00e0dSAlex Elder 
361686b00e0dSAlex Elder 	parent_spec = rbd_spec_alloc();
361786b00e0dSAlex Elder 	if (!parent_spec)
361886b00e0dSAlex Elder 		return -ENOMEM;
361986b00e0dSAlex Elder 
362086b00e0dSAlex Elder 	size = sizeof (__le64) +				/* pool_id */
362186b00e0dSAlex Elder 		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
362286b00e0dSAlex Elder 		sizeof (__le64) +				/* snap_id */
362386b00e0dSAlex Elder 		sizeof (__le64);				/* overlap */
362486b00e0dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
362586b00e0dSAlex Elder 	if (!reply_buf) {
362686b00e0dSAlex Elder 		ret = -ENOMEM;
362786b00e0dSAlex Elder 		goto out_err;
362886b00e0dSAlex Elder 	}
362986b00e0dSAlex Elder 
363086b00e0dSAlex Elder 	snapid = cpu_to_le64(CEPH_NOSNAP);
363136be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
363286b00e0dSAlex Elder 				"rbd", "get_parent",
36334157976bSAlex Elder 				&snapid, sizeof (snapid),
3634e2a58ee5SAlex Elder 				reply_buf, size);
363536be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
363686b00e0dSAlex Elder 	if (ret < 0)
363786b00e0dSAlex Elder 		goto out_err;
363886b00e0dSAlex Elder 
363986b00e0dSAlex Elder 	p = reply_buf;
364057385b51SAlex Elder 	end = reply_buf + ret;
364157385b51SAlex Elder 	ret = -ERANGE;
364286b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
364386b00e0dSAlex Elder 	if (parent_spec->pool_id == CEPH_NOPOOL)
364486b00e0dSAlex Elder 		goto out;	/* No parent?  No problem. */
364586b00e0dSAlex Elder 
36460903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
36470903e875SAlex Elder 
36480903e875SAlex Elder 	ret = -EIO;
3649c0cd10dbSAlex Elder 	if (parent_spec->pool_id > (u64)U32_MAX) {
3650c0cd10dbSAlex Elder 		rbd_warn(NULL, "parent pool id too large (%llu > %u)\n",
3651c0cd10dbSAlex Elder 			(unsigned long long)parent_spec->pool_id, U32_MAX);
365257385b51SAlex Elder 		goto out_err;
3653c0cd10dbSAlex Elder 	}
36540903e875SAlex Elder 
3655979ed480SAlex Elder 	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
365686b00e0dSAlex Elder 	if (IS_ERR(image_id)) {
365786b00e0dSAlex Elder 		ret = PTR_ERR(image_id);
365886b00e0dSAlex Elder 		goto out_err;
365986b00e0dSAlex Elder 	}
366086b00e0dSAlex Elder 	parent_spec->image_id = image_id;
366186b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
366286b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, overlap, out_err);
366386b00e0dSAlex Elder 
366486b00e0dSAlex Elder 	rbd_dev->parent_overlap = overlap;
366586b00e0dSAlex Elder 	rbd_dev->parent_spec = parent_spec;
366686b00e0dSAlex Elder 	parent_spec = NULL;	/* rbd_dev now owns this */
366786b00e0dSAlex Elder out:
366886b00e0dSAlex Elder 	ret = 0;
366986b00e0dSAlex Elder out_err:
367086b00e0dSAlex Elder 	kfree(reply_buf);
367186b00e0dSAlex Elder 	rbd_spec_put(parent_spec);
367286b00e0dSAlex Elder 
367386b00e0dSAlex Elder 	return ret;
367486b00e0dSAlex Elder }
367586b00e0dSAlex Elder 
3676cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
3677cc070d59SAlex Elder {
3678cc070d59SAlex Elder 	struct {
3679cc070d59SAlex Elder 		__le64 stripe_unit;
3680cc070d59SAlex Elder 		__le64 stripe_count;
3681cc070d59SAlex Elder 	} __attribute__ ((packed)) striping_info_buf = { 0 };
3682cc070d59SAlex Elder 	size_t size = sizeof (striping_info_buf);
3683cc070d59SAlex Elder 	void *p;
3684cc070d59SAlex Elder 	u64 obj_size;
3685cc070d59SAlex Elder 	u64 stripe_unit;
3686cc070d59SAlex Elder 	u64 stripe_count;
3687cc070d59SAlex Elder 	int ret;
3688cc070d59SAlex Elder 
3689cc070d59SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3690cc070d59SAlex Elder 				"rbd", "get_stripe_unit_count", NULL, 0,
3691e2a58ee5SAlex Elder 				(char *)&striping_info_buf, size);
3692cc070d59SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3693cc070d59SAlex Elder 	if (ret < 0)
3694cc070d59SAlex Elder 		return ret;
3695cc070d59SAlex Elder 	if (ret < size)
3696cc070d59SAlex Elder 		return -ERANGE;
3697cc070d59SAlex Elder 
3698cc070d59SAlex Elder 	/*
3699cc070d59SAlex Elder 	 * We don't actually support the "fancy striping" feature
3700cc070d59SAlex Elder 	 * (STRIPINGV2) yet, but if the striping sizes are the
3701cc070d59SAlex Elder 	 * defaults the behavior is the same as before.  So find
3702cc070d59SAlex Elder 	 * out, and only fail if the image has non-default values.
3703cc070d59SAlex Elder 	 */
3704cc070d59SAlex Elder 	ret = -EINVAL;
3705cc070d59SAlex Elder 	obj_size = (u64)1 << rbd_dev->header.obj_order;
3706cc070d59SAlex Elder 	p = &striping_info_buf;
3707cc070d59SAlex Elder 	stripe_unit = ceph_decode_64(&p);
3708cc070d59SAlex Elder 	if (stripe_unit != obj_size) {
3709cc070d59SAlex Elder 		rbd_warn(rbd_dev, "unsupported stripe unit "
3710cc070d59SAlex Elder 				"(got %llu want %llu)",
3711cc070d59SAlex Elder 				stripe_unit, obj_size);
3712cc070d59SAlex Elder 		return -EINVAL;
3713cc070d59SAlex Elder 	}
3714cc070d59SAlex Elder 	stripe_count = ceph_decode_64(&p);
3715cc070d59SAlex Elder 	if (stripe_count != 1) {
3716cc070d59SAlex Elder 		rbd_warn(rbd_dev, "unsupported stripe count "
3717cc070d59SAlex Elder 				"(got %llu want 1)", stripe_count);
3718cc070d59SAlex Elder 		return -EINVAL;
3719cc070d59SAlex Elder 	}
3720500d0c0fSAlex Elder 	rbd_dev->header.stripe_unit = stripe_unit;
3721500d0c0fSAlex Elder 	rbd_dev->header.stripe_count = stripe_count;
3722cc070d59SAlex Elder 
3723cc070d59SAlex Elder 	return 0;
3724cc070d59SAlex Elder }
3725cc070d59SAlex Elder 
37269e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
37279e15b77dSAlex Elder {
37289e15b77dSAlex Elder 	size_t image_id_size;
37299e15b77dSAlex Elder 	char *image_id;
37309e15b77dSAlex Elder 	void *p;
37319e15b77dSAlex Elder 	void *end;
37329e15b77dSAlex Elder 	size_t size;
37339e15b77dSAlex Elder 	void *reply_buf = NULL;
37349e15b77dSAlex Elder 	size_t len = 0;
37359e15b77dSAlex Elder 	char *image_name = NULL;
37369e15b77dSAlex Elder 	int ret;
37379e15b77dSAlex Elder 
37389e15b77dSAlex Elder 	rbd_assert(!rbd_dev->spec->image_name);
37399e15b77dSAlex Elder 
374069e7a02fSAlex Elder 	len = strlen(rbd_dev->spec->image_id);
374169e7a02fSAlex Elder 	image_id_size = sizeof (__le32) + len;
37429e15b77dSAlex Elder 	image_id = kmalloc(image_id_size, GFP_KERNEL);
37439e15b77dSAlex Elder 	if (!image_id)
37449e15b77dSAlex Elder 		return NULL;
37459e15b77dSAlex Elder 
37469e15b77dSAlex Elder 	p = image_id;
37474157976bSAlex Elder 	end = image_id + image_id_size;
374869e7a02fSAlex Elder 	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
37499e15b77dSAlex Elder 
37509e15b77dSAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
37519e15b77dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
37529e15b77dSAlex Elder 	if (!reply_buf)
37539e15b77dSAlex Elder 		goto out;
37549e15b77dSAlex Elder 
375536be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
37569e15b77dSAlex Elder 				"rbd", "dir_get_name",
37579e15b77dSAlex Elder 				image_id, image_id_size,
3758e2a58ee5SAlex Elder 				reply_buf, size);
37599e15b77dSAlex Elder 	if (ret < 0)
37609e15b77dSAlex Elder 		goto out;
37619e15b77dSAlex Elder 	p = reply_buf;
3762f40eb349SAlex Elder 	end = reply_buf + ret;
3763f40eb349SAlex Elder 
37649e15b77dSAlex Elder 	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
37659e15b77dSAlex Elder 	if (IS_ERR(image_name))
37669e15b77dSAlex Elder 		image_name = NULL;
37679e15b77dSAlex Elder 	else
37689e15b77dSAlex Elder 		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
37699e15b77dSAlex Elder out:
37709e15b77dSAlex Elder 	kfree(reply_buf);
37719e15b77dSAlex Elder 	kfree(image_id);
37729e15b77dSAlex Elder 
37739e15b77dSAlex Elder 	return image_name;
37749e15b77dSAlex Elder }
37759e15b77dSAlex Elder 
37762ad3d716SAlex Elder static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
37772ad3d716SAlex Elder {
37782ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
37792ad3d716SAlex Elder 	const char *snap_name;
37802ad3d716SAlex Elder 	u32 which = 0;
37812ad3d716SAlex Elder 
37822ad3d716SAlex Elder 	/* Skip over names until we find the one we are looking for */
37832ad3d716SAlex Elder 
37842ad3d716SAlex Elder 	snap_name = rbd_dev->header.snap_names;
37852ad3d716SAlex Elder 	while (which < snapc->num_snaps) {
37862ad3d716SAlex Elder 		if (!strcmp(name, snap_name))
37872ad3d716SAlex Elder 			return snapc->snaps[which];
37882ad3d716SAlex Elder 		snap_name += strlen(snap_name) + 1;
37892ad3d716SAlex Elder 		which++;
37902ad3d716SAlex Elder 	}
37912ad3d716SAlex Elder 	return CEPH_NOSNAP;
37922ad3d716SAlex Elder }
37932ad3d716SAlex Elder 
37942ad3d716SAlex Elder static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
37952ad3d716SAlex Elder {
37962ad3d716SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
37972ad3d716SAlex Elder 	u32 which;
37982ad3d716SAlex Elder 	bool found = false;
37992ad3d716SAlex Elder 	u64 snap_id;
38002ad3d716SAlex Elder 
38012ad3d716SAlex Elder 	for (which = 0; !found && which < snapc->num_snaps; which++) {
38022ad3d716SAlex Elder 		const char *snap_name;
38032ad3d716SAlex Elder 
38042ad3d716SAlex Elder 		snap_id = snapc->snaps[which];
38052ad3d716SAlex Elder 		snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
38062ad3d716SAlex Elder 		if (IS_ERR(snap_name))
38072ad3d716SAlex Elder 			break;
38082ad3d716SAlex Elder 		found = !strcmp(name, snap_name);
38092ad3d716SAlex Elder 		kfree(snap_name);
38102ad3d716SAlex Elder 	}
38112ad3d716SAlex Elder 	return found ? snap_id : CEPH_NOSNAP;
38122ad3d716SAlex Elder }
38132ad3d716SAlex Elder 
38142ad3d716SAlex Elder /*
38152ad3d716SAlex Elder  * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
38162ad3d716SAlex Elder  * no snapshot by that name is found, or if an error occurs.
38172ad3d716SAlex Elder  */
38182ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
38192ad3d716SAlex Elder {
38202ad3d716SAlex Elder 	if (rbd_dev->image_format == 1)
38212ad3d716SAlex Elder 		return rbd_v1_snap_id_by_name(rbd_dev, name);
38222ad3d716SAlex Elder 
38232ad3d716SAlex Elder 	return rbd_v2_snap_id_by_name(rbd_dev, name);
38242ad3d716SAlex Elder }
38252ad3d716SAlex Elder 
38269e15b77dSAlex Elder /*
38272e9f7f1cSAlex Elder  * When an rbd image has a parent image, it is identified by the
38282e9f7f1cSAlex Elder  * pool, image, and snapshot ids (not names).  This function fills
38292e9f7f1cSAlex Elder  * in the names for those ids.  (It's OK if we can't figure out the
38302e9f7f1cSAlex Elder  * name for an image id, but the pool and snapshot ids should always
38312e9f7f1cSAlex Elder  * exist and have names.)  All names in an rbd spec are dynamically
38322e9f7f1cSAlex Elder  * allocated.
3833e1d4213fSAlex Elder  *
3834e1d4213fSAlex Elder  * When an image being mapped (not a parent) is probed, we have the
3835e1d4213fSAlex Elder  * pool name and pool id, image name and image id, and the snapshot
3836e1d4213fSAlex Elder  * name.  The only thing we're missing is the snapshot id.
38379e15b77dSAlex Elder  */
38382e9f7f1cSAlex Elder static int rbd_dev_spec_update(struct rbd_device *rbd_dev)
38399e15b77dSAlex Elder {
38402e9f7f1cSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
38412e9f7f1cSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
38422e9f7f1cSAlex Elder 	const char *pool_name;
38432e9f7f1cSAlex Elder 	const char *image_name;
38442e9f7f1cSAlex Elder 	const char *snap_name;
38459e15b77dSAlex Elder 	int ret;
38469e15b77dSAlex Elder 
3847e1d4213fSAlex Elder 	/*
3848e1d4213fSAlex Elder 	 * An image being mapped will have the pool name (etc.), but
3849e1d4213fSAlex Elder 	 * we need to look up the snapshot id.
3850e1d4213fSAlex Elder 	 */
38512e9f7f1cSAlex Elder 	if (spec->pool_name) {
38522e9f7f1cSAlex Elder 		if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
38532ad3d716SAlex Elder 			u64 snap_id;
3854e1d4213fSAlex Elder 
38552ad3d716SAlex Elder 			snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
38562ad3d716SAlex Elder 			if (snap_id == CEPH_NOSNAP)
3857e1d4213fSAlex Elder 				return -ENOENT;
38582ad3d716SAlex Elder 			spec->snap_id = snap_id;
3859e1d4213fSAlex Elder 		} else {
38602e9f7f1cSAlex Elder 			spec->snap_id = CEPH_NOSNAP;
3861e1d4213fSAlex Elder 		}
3862e1d4213fSAlex Elder 
3863e1d4213fSAlex Elder 		return 0;
3864e1d4213fSAlex Elder 	}
38659e15b77dSAlex Elder 
38662e9f7f1cSAlex Elder 	/* Get the pool name; we have to make our own copy of this */
38679e15b77dSAlex Elder 
38682e9f7f1cSAlex Elder 	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
38692e9f7f1cSAlex Elder 	if (!pool_name) {
38702e9f7f1cSAlex Elder 		rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
3871935dc89fSAlex Elder 		return -EIO;
3872935dc89fSAlex Elder 	}
38732e9f7f1cSAlex Elder 	pool_name = kstrdup(pool_name, GFP_KERNEL);
38742e9f7f1cSAlex Elder 	if (!pool_name)
38759e15b77dSAlex Elder 		return -ENOMEM;
38769e15b77dSAlex Elder 
38779e15b77dSAlex Elder 	/* Fetch the image name; tolerate failure here */
38789e15b77dSAlex Elder 
38792e9f7f1cSAlex Elder 	image_name = rbd_dev_image_name(rbd_dev);
38802e9f7f1cSAlex Elder 	if (!image_name)
388106ecc6cbSAlex Elder 		rbd_warn(rbd_dev, "unable to get image name");
38829e15b77dSAlex Elder 
38832e9f7f1cSAlex Elder 	/* Look up the snapshot name, and make a copy */
38849e15b77dSAlex Elder 
38852e9f7f1cSAlex Elder 	snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
38862e9f7f1cSAlex Elder 	if (!snap_name) {
38872e9f7f1cSAlex Elder 		ret = -ENOMEM;
38889e15b77dSAlex Elder 		goto out_err;
38892e9f7f1cSAlex Elder 	}
38902e9f7f1cSAlex Elder 
38912e9f7f1cSAlex Elder 	spec->pool_name = pool_name;
38922e9f7f1cSAlex Elder 	spec->image_name = image_name;
38932e9f7f1cSAlex Elder 	spec->snap_name = snap_name;
38949e15b77dSAlex Elder 
38959e15b77dSAlex Elder 	return 0;
38969e15b77dSAlex Elder out_err:
38972e9f7f1cSAlex Elder 	kfree(image_name);
38982e9f7f1cSAlex Elder 	kfree(pool_name);
38999e15b77dSAlex Elder 
39009e15b77dSAlex Elder 	return ret;
39019e15b77dSAlex Elder }
39029e15b77dSAlex Elder 
3903cc4a38bdSAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
390435d489f9SAlex Elder {
390535d489f9SAlex Elder 	size_t size;
390635d489f9SAlex Elder 	int ret;
390735d489f9SAlex Elder 	void *reply_buf;
390835d489f9SAlex Elder 	void *p;
390935d489f9SAlex Elder 	void *end;
391035d489f9SAlex Elder 	u64 seq;
391135d489f9SAlex Elder 	u32 snap_count;
391235d489f9SAlex Elder 	struct ceph_snap_context *snapc;
391335d489f9SAlex Elder 	u32 i;
391435d489f9SAlex Elder 
391535d489f9SAlex Elder 	/*
391635d489f9SAlex Elder 	 * We'll need room for the seq value (maximum snapshot id),
391735d489f9SAlex Elder 	 * snapshot count, and array of that many snapshot ids.
391835d489f9SAlex Elder 	 * For now we have a fixed upper limit on the number we're
391935d489f9SAlex Elder 	 * prepared to receive.
392035d489f9SAlex Elder 	 */
392135d489f9SAlex Elder 	size = sizeof (__le64) + sizeof (__le32) +
392235d489f9SAlex Elder 			RBD_MAX_SNAP_COUNT * sizeof (__le64);
392335d489f9SAlex Elder 	reply_buf = kzalloc(size, GFP_KERNEL);
392435d489f9SAlex Elder 	if (!reply_buf)
392535d489f9SAlex Elder 		return -ENOMEM;
392635d489f9SAlex Elder 
392736be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
39284157976bSAlex Elder 				"rbd", "get_snapcontext", NULL, 0,
3929e2a58ee5SAlex Elder 				reply_buf, size);
393036be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
393135d489f9SAlex Elder 	if (ret < 0)
393235d489f9SAlex Elder 		goto out;
393335d489f9SAlex Elder 
393435d489f9SAlex Elder 	p = reply_buf;
393557385b51SAlex Elder 	end = reply_buf + ret;
393657385b51SAlex Elder 	ret = -ERANGE;
393735d489f9SAlex Elder 	ceph_decode_64_safe(&p, end, seq, out);
393835d489f9SAlex Elder 	ceph_decode_32_safe(&p, end, snap_count, out);
393935d489f9SAlex Elder 
394035d489f9SAlex Elder 	/*
394135d489f9SAlex Elder 	 * Make sure the reported number of snapshot ids wouldn't go
394235d489f9SAlex Elder 	 * beyond the end of our buffer.  But before checking that,
394335d489f9SAlex Elder 	 * make sure the computed size of the snapshot context we
394435d489f9SAlex Elder 	 * allocate is representable in a size_t.
394535d489f9SAlex Elder 	 */
394635d489f9SAlex Elder 	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
394735d489f9SAlex Elder 				 / sizeof (u64)) {
394835d489f9SAlex Elder 		ret = -EINVAL;
394935d489f9SAlex Elder 		goto out;
395035d489f9SAlex Elder 	}
395135d489f9SAlex Elder 	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
395235d489f9SAlex Elder 		goto out;
3953468521c1SAlex Elder 	ret = 0;
395435d489f9SAlex Elder 
3955812164f8SAlex Elder 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
395635d489f9SAlex Elder 	if (!snapc) {
395735d489f9SAlex Elder 		ret = -ENOMEM;
395835d489f9SAlex Elder 		goto out;
395935d489f9SAlex Elder 	}
396035d489f9SAlex Elder 	snapc->seq = seq;
396135d489f9SAlex Elder 	for (i = 0; i < snap_count; i++)
396235d489f9SAlex Elder 		snapc->snaps[i] = ceph_decode_64(&p);
396335d489f9SAlex Elder 
396449ece554SAlex Elder 	ceph_put_snap_context(rbd_dev->header.snapc);
396535d489f9SAlex Elder 	rbd_dev->header.snapc = snapc;
396635d489f9SAlex Elder 
396735d489f9SAlex Elder 	dout("  snap context seq = %llu, snap_count = %u\n",
396835d489f9SAlex Elder 		(unsigned long long)seq, (unsigned int)snap_count);
396935d489f9SAlex Elder out:
397035d489f9SAlex Elder 	kfree(reply_buf);
397135d489f9SAlex Elder 
397257385b51SAlex Elder 	return ret;
397335d489f9SAlex Elder }
397435d489f9SAlex Elder 
397554cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
397654cac61fSAlex Elder 					u64 snap_id)
3977b8b1e2dbSAlex Elder {
3978b8b1e2dbSAlex Elder 	size_t size;
3979b8b1e2dbSAlex Elder 	void *reply_buf;
398054cac61fSAlex Elder 	__le64 snapid;
3981b8b1e2dbSAlex Elder 	int ret;
3982b8b1e2dbSAlex Elder 	void *p;
3983b8b1e2dbSAlex Elder 	void *end;
3984b8b1e2dbSAlex Elder 	char *snap_name;
3985b8b1e2dbSAlex Elder 
3986b8b1e2dbSAlex Elder 	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
3987b8b1e2dbSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
3988b8b1e2dbSAlex Elder 	if (!reply_buf)
3989b8b1e2dbSAlex Elder 		return ERR_PTR(-ENOMEM);
3990b8b1e2dbSAlex Elder 
399154cac61fSAlex Elder 	snapid = cpu_to_le64(snap_id);
399236be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3993b8b1e2dbSAlex Elder 				"rbd", "get_snapshot_name",
399454cac61fSAlex Elder 				&snapid, sizeof (snapid),
3995e2a58ee5SAlex Elder 				reply_buf, size);
399636be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3997f40eb349SAlex Elder 	if (ret < 0) {
3998f40eb349SAlex Elder 		snap_name = ERR_PTR(ret);
3999b8b1e2dbSAlex Elder 		goto out;
4000f40eb349SAlex Elder 	}
4001b8b1e2dbSAlex Elder 
4002b8b1e2dbSAlex Elder 	p = reply_buf;
4003f40eb349SAlex Elder 	end = reply_buf + ret;
4004e5c35534SAlex Elder 	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
4005f40eb349SAlex Elder 	if (IS_ERR(snap_name))
4006b8b1e2dbSAlex Elder 		goto out;
4007f40eb349SAlex Elder 
4008b8b1e2dbSAlex Elder 	dout("  snap_id 0x%016llx snap_name = %s\n",
400954cac61fSAlex Elder 		(unsigned long long)snap_id, snap_name);
4010b8b1e2dbSAlex Elder out:
4011b8b1e2dbSAlex Elder 	kfree(reply_buf);
4012b8b1e2dbSAlex Elder 
4013f40eb349SAlex Elder 	return snap_name;
4014b8b1e2dbSAlex Elder }
4015b8b1e2dbSAlex Elder 
4016cc4a38bdSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev)
4017117973fbSAlex Elder {
4018117973fbSAlex Elder 	int ret;
4019117973fbSAlex Elder 
4020117973fbSAlex Elder 	down_write(&rbd_dev->header_rwsem);
4021117973fbSAlex Elder 
4022117973fbSAlex Elder 	ret = rbd_dev_v2_image_size(rbd_dev);
4023117973fbSAlex Elder 	if (ret)
4024117973fbSAlex Elder 		goto out;
402529334ba4SAlex Elder 	if (rbd_dev->spec->snap_id == CEPH_NOSNAP)
402629334ba4SAlex Elder 		if (rbd_dev->mapping.size != rbd_dev->header.image_size)
402729334ba4SAlex Elder 			rbd_dev->mapping.size = rbd_dev->header.image_size;
4028117973fbSAlex Elder 
4029cc4a38bdSAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev);
4030117973fbSAlex Elder 	dout("rbd_dev_v2_snap_context returned %d\n", ret);
4031117973fbSAlex Elder 	if (ret)
4032117973fbSAlex Elder 		goto out;
4033117973fbSAlex Elder out:
4034117973fbSAlex Elder 	up_write(&rbd_dev->header_rwsem);
4035117973fbSAlex Elder 
4036117973fbSAlex Elder 	return ret;
4037117973fbSAlex Elder }
4038117973fbSAlex Elder 
4039dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
4040dfc5606dSYehuda Sadeh {
4041dfc5606dSYehuda Sadeh 	struct device *dev;
4042cd789ab9SAlex Elder 	int ret;
4043dfc5606dSYehuda Sadeh 
4044dfc5606dSYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
4045dfc5606dSYehuda Sadeh 
4046cd789ab9SAlex Elder 	dev = &rbd_dev->dev;
4047dfc5606dSYehuda Sadeh 	dev->bus = &rbd_bus_type;
4048dfc5606dSYehuda Sadeh 	dev->type = &rbd_device_type;
4049dfc5606dSYehuda Sadeh 	dev->parent = &rbd_root_dev;
4050200a6a8bSAlex Elder 	dev->release = rbd_dev_device_release;
4051de71a297SAlex Elder 	dev_set_name(dev, "%d", rbd_dev->dev_id);
4052dfc5606dSYehuda Sadeh 	ret = device_register(dev);
4053dfc5606dSYehuda Sadeh 
4054dfc5606dSYehuda Sadeh 	mutex_unlock(&ctl_mutex);
4055cd789ab9SAlex Elder 
4056dfc5606dSYehuda Sadeh 	return ret;
4057602adf40SYehuda Sadeh }
4058602adf40SYehuda Sadeh 
4059dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
4060dfc5606dSYehuda Sadeh {
4061dfc5606dSYehuda Sadeh 	device_unregister(&rbd_dev->dev);
4062dfc5606dSYehuda Sadeh }
4063dfc5606dSYehuda Sadeh 
4064e2839308SAlex Elder static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
40651ddbe94eSAlex Elder 
40661ddbe94eSAlex Elder /*
4067499afd5bSAlex Elder  * Get a unique rbd identifier for the given new rbd_dev, and add
4068499afd5bSAlex Elder  * the rbd_dev to the global list.  The minimum rbd id is 1.
40691ddbe94eSAlex Elder  */
4070e2839308SAlex Elder static void rbd_dev_id_get(struct rbd_device *rbd_dev)
4071b7f23c36SAlex Elder {
4072e2839308SAlex Elder 	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
4073499afd5bSAlex Elder 
4074499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
4075499afd5bSAlex Elder 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
4076499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
4077e2839308SAlex Elder 	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
4078e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
4079b7f23c36SAlex Elder }
4080b7f23c36SAlex Elder 
40811ddbe94eSAlex Elder /*
4082499afd5bSAlex Elder  * Remove an rbd_dev from the global list, and record that its
4083499afd5bSAlex Elder  * identifier is no longer in use.
40841ddbe94eSAlex Elder  */
4085e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev)
40861ddbe94eSAlex Elder {
4087d184f6bfSAlex Elder 	struct list_head *tmp;
4088de71a297SAlex Elder 	int rbd_id = rbd_dev->dev_id;
4089d184f6bfSAlex Elder 	int max_id;
4090d184f6bfSAlex Elder 
4091aafb230eSAlex Elder 	rbd_assert(rbd_id > 0);
4092499afd5bSAlex Elder 
4093e2839308SAlex Elder 	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
4094e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
4095499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
4096499afd5bSAlex Elder 	list_del_init(&rbd_dev->node);
4097d184f6bfSAlex Elder 
4098d184f6bfSAlex Elder 	/*
4099d184f6bfSAlex Elder 	 * If the id being "put" is not the current maximum, there
4100d184f6bfSAlex Elder 	 * is nothing special we need to do.
4101d184f6bfSAlex Elder 	 */
4102e2839308SAlex Elder 	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
4103d184f6bfSAlex Elder 		spin_unlock(&rbd_dev_list_lock);
4104d184f6bfSAlex Elder 		return;
4105d184f6bfSAlex Elder 	}
4106d184f6bfSAlex Elder 
4107d184f6bfSAlex Elder 	/*
4108d184f6bfSAlex Elder 	 * We need to update the current maximum id.  Search the
4109d184f6bfSAlex Elder 	 * list to find out what it is.  We're more likely to find
4110d184f6bfSAlex Elder 	 * the maximum at the end, so search the list backward.
4111d184f6bfSAlex Elder 	 */
4112d184f6bfSAlex Elder 	max_id = 0;
4113d184f6bfSAlex Elder 	list_for_each_prev(tmp, &rbd_dev_list) {
4114d184f6bfSAlex Elder 		struct rbd_device *rbd_dev;
4115d184f6bfSAlex Elder 
4116d184f6bfSAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
4117b213e0b1SAlex Elder 		if (rbd_dev->dev_id > max_id)
4118b213e0b1SAlex Elder 			max_id = rbd_dev->dev_id;
4119d184f6bfSAlex Elder 	}
4120499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
41211ddbe94eSAlex Elder 
41221ddbe94eSAlex Elder 	/*
4123e2839308SAlex Elder 	 * The max id could have been updated by rbd_dev_id_get(), in
4124d184f6bfSAlex Elder 	 * which case it now accurately reflects the new maximum.
4125d184f6bfSAlex Elder 	 * Be careful not to overwrite the maximum value in that
4126d184f6bfSAlex Elder 	 * case.
41271ddbe94eSAlex Elder 	 */
4128e2839308SAlex Elder 	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
4129e2839308SAlex Elder 	dout("  max dev id has been reset\n");
4130b7f23c36SAlex Elder }
4131b7f23c36SAlex Elder 
/*
 * Advance *buf past any leading white space so that it points at
 * the first non-space character (or at the terminating '\0' if
 * there is none), and return the length of the token that starts
 * there.  A token is a run of non-white-space characters; the
 * returned length is 0 when no token remains.  *buf must be
 * terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the "C"
	 * and "POSIX" locales.
	 */
	const char *const whitespace = " \f\n\r\t\v";
	const char *start = *buf;

	start += strspn(start, whitespace);	/* Skip to start of token */
	*buf = start;

	return strcspn(start, whitespace);	/* Token length */
}
4150e28fff26SAlex Elder 
/*
 * Locate the next token in *buf and, provided it fits, copy it
 * into the caller's token buffer with a terminating '\0'.  *buf
 * must be terminated with '\0' on entry.
 *
 * The return value is the length of the token found, not counting
 * the '\0'.  It is 0 when there is no token, and it is >=
 * token_size when the token (plus terminator) does not fit, in
 * which case the token buffer is left untouched.
 *
 * In every case *buf is moved to just past the end of the token
 * that was found, even if it was too big to copy.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t token_len = next_token(buf);
	const char *start = *buf;

	*buf = start + token_len;
	if (token_len >= token_size)
		return token_len;	/* No room for token and '\0' */

	memcpy(token, start, token_len);
	token[token_len] = '\0';

	return token_len;
}
4180e28fff26SAlex Elder 
4181e28fff26SAlex Elder /*
4182ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
4183ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
4184ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
4185ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
4186ea3352f4SAlex Elder  *
4187ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
4188ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
4189ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
4190ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
4191ea3352f4SAlex Elder  *
4192ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
4193ea3352f4SAlex Elder  * the end of the found token.
4194ea3352f4SAlex Elder  *
4195ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
4196ea3352f4SAlex Elder  */
4197ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
4198ea3352f4SAlex Elder {
4199ea3352f4SAlex Elder 	char *dup;
4200ea3352f4SAlex Elder 	size_t len;
4201ea3352f4SAlex Elder 
4202ea3352f4SAlex Elder 	len = next_token(buf);
42034caf35f9SAlex Elder 	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
4204ea3352f4SAlex Elder 	if (!dup)
4205ea3352f4SAlex Elder 		return NULL;
4206ea3352f4SAlex Elder 	*(dup + len) = '\0';
4207ea3352f4SAlex Elder 	*buf += len;
4208ea3352f4SAlex Elder 
4209ea3352f4SAlex Elder 	if (lenp)
4210ea3352f4SAlex Elder 		*lenp = len;
4211ea3352f4SAlex Elder 
4212ea3352f4SAlex Elder 	return dup;
4213ea3352f4SAlex Elder }
4214ea3352f4SAlex Elder 
4215ea3352f4SAlex Elder /*
4216859c31dfSAlex Elder  * Parse the options provided for an "rbd add" (i.e., rbd image
4217859c31dfSAlex Elder  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
4218859c31dfSAlex Elder  * and the data written is passed here via a NUL-terminated buffer.
4219859c31dfSAlex Elder  * Returns 0 if successful or an error code otherwise.
4220d22f76e7SAlex Elder  *
4221859c31dfSAlex Elder  * The information extracted from these options is recorded in
4222859c31dfSAlex Elder  * the other parameters which return dynamically-allocated
4223859c31dfSAlex Elder  * structures:
4224859c31dfSAlex Elder  *  ceph_opts
4225859c31dfSAlex Elder  *      The address of a pointer that will refer to a ceph options
4226859c31dfSAlex Elder  *      structure.  Caller must release the returned pointer using
4227859c31dfSAlex Elder  *      ceph_destroy_options() when it is no longer needed.
4228859c31dfSAlex Elder  *  rbd_opts
4229859c31dfSAlex Elder  *	Address of an rbd options pointer.  Fully initialized by
4230859c31dfSAlex Elder  *	this function; caller must release with kfree().
4231859c31dfSAlex Elder  *  spec
4232859c31dfSAlex Elder  *	Address of an rbd image specification pointer.  Fully
4233859c31dfSAlex Elder  *	initialized by this function based on parsed options.
4234859c31dfSAlex Elder  *	Caller must release with rbd_spec_put().
4235859c31dfSAlex Elder  *
4236859c31dfSAlex Elder  * The options passed take this form:
4237859c31dfSAlex Elder  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
4238859c31dfSAlex Elder  * where:
4239859c31dfSAlex Elder  *  <mon_addrs>
4240859c31dfSAlex Elder  *      A comma-separated list of one or more monitor addresses.
4241859c31dfSAlex Elder  *      A monitor address is an ip address, optionally followed
4242859c31dfSAlex Elder  *      by a port number (separated by a colon).
4243859c31dfSAlex Elder  *        I.e.:  ip1[:port1][,ip2[:port2]...]
4244859c31dfSAlex Elder  *  <options>
4245859c31dfSAlex Elder  *      A comma-separated list of ceph and/or rbd options.
4246859c31dfSAlex Elder  *  <pool_name>
4247859c31dfSAlex Elder  *      The name of the rados pool containing the rbd image.
4248859c31dfSAlex Elder  *  <image_name>
4249859c31dfSAlex Elder  *      The name of the image in that pool to map.
4250859c31dfSAlex Elder  *  <snap_id>
4251859c31dfSAlex Elder  *      An optional snapshot id.  If provided, the mapping will
4252859c31dfSAlex Elder  *      present data from the image at the time that snapshot was
4253859c31dfSAlex Elder  *      created.  The image head is used if no snapshot id is
4254859c31dfSAlex Elder  *      provided.  Snapshot mappings are always read-only.
4255a725f65eSAlex Elder  */
4256859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf,
4257dc79b113SAlex Elder 				struct ceph_options **ceph_opts,
4258859c31dfSAlex Elder 				struct rbd_options **opts,
4259859c31dfSAlex Elder 				struct rbd_spec **rbd_spec)
4260a725f65eSAlex Elder {
4261e28fff26SAlex Elder 	size_t len;
4262859c31dfSAlex Elder 	char *options;
42630ddebc0cSAlex Elder 	const char *mon_addrs;
4264ecb4dc22SAlex Elder 	char *snap_name;
42650ddebc0cSAlex Elder 	size_t mon_addrs_size;
4266859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
42674e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
4268859c31dfSAlex Elder 	struct ceph_options *copts;
4269dc79b113SAlex Elder 	int ret;
4270e28fff26SAlex Elder 
4271e28fff26SAlex Elder 	/* The first four tokens are required */
4272e28fff26SAlex Elder 
42737ef3214aSAlex Elder 	len = next_token(&buf);
42744fb5d671SAlex Elder 	if (!len) {
42754fb5d671SAlex Elder 		rbd_warn(NULL, "no monitor address(es) provided");
42764fb5d671SAlex Elder 		return -EINVAL;
42774fb5d671SAlex Elder 	}
42780ddebc0cSAlex Elder 	mon_addrs = buf;
4279f28e565aSAlex Elder 	mon_addrs_size = len + 1;
42807ef3214aSAlex Elder 	buf += len;
4281a725f65eSAlex Elder 
4282dc79b113SAlex Elder 	ret = -EINVAL;
4283f28e565aSAlex Elder 	options = dup_token(&buf, NULL);
4284f28e565aSAlex Elder 	if (!options)
4285dc79b113SAlex Elder 		return -ENOMEM;
42864fb5d671SAlex Elder 	if (!*options) {
42874fb5d671SAlex Elder 		rbd_warn(NULL, "no options provided");
42884fb5d671SAlex Elder 		goto out_err;
42894fb5d671SAlex Elder 	}
4290a725f65eSAlex Elder 
4291859c31dfSAlex Elder 	spec = rbd_spec_alloc();
4292859c31dfSAlex Elder 	if (!spec)
4293f28e565aSAlex Elder 		goto out_mem;
4294859c31dfSAlex Elder 
4295859c31dfSAlex Elder 	spec->pool_name = dup_token(&buf, NULL);
4296859c31dfSAlex Elder 	if (!spec->pool_name)
4297859c31dfSAlex Elder 		goto out_mem;
42984fb5d671SAlex Elder 	if (!*spec->pool_name) {
42994fb5d671SAlex Elder 		rbd_warn(NULL, "no pool name provided");
43004fb5d671SAlex Elder 		goto out_err;
43014fb5d671SAlex Elder 	}
4302e28fff26SAlex Elder 
430369e7a02fSAlex Elder 	spec->image_name = dup_token(&buf, NULL);
4304859c31dfSAlex Elder 	if (!spec->image_name)
4305f28e565aSAlex Elder 		goto out_mem;
43064fb5d671SAlex Elder 	if (!*spec->image_name) {
43074fb5d671SAlex Elder 		rbd_warn(NULL, "no image name provided");
43084fb5d671SAlex Elder 		goto out_err;
43094fb5d671SAlex Elder 	}
4310e28fff26SAlex Elder 
4311f28e565aSAlex Elder 	/*
4312f28e565aSAlex Elder 	 * Snapshot name is optional; default is to use "-"
4313f28e565aSAlex Elder 	 * (indicating the head/no snapshot).
4314f28e565aSAlex Elder 	 */
43153feeb894SAlex Elder 	len = next_token(&buf);
4316820a5f3eSAlex Elder 	if (!len) {
43173feeb894SAlex Elder 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
43183feeb894SAlex Elder 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
4319f28e565aSAlex Elder 	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
4320dc79b113SAlex Elder 		ret = -ENAMETOOLONG;
4321f28e565aSAlex Elder 		goto out_err;
4322849b4260SAlex Elder 	}
4323ecb4dc22SAlex Elder 	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
4324ecb4dc22SAlex Elder 	if (!snap_name)
4325f28e565aSAlex Elder 		goto out_mem;
4326ecb4dc22SAlex Elder 	*(snap_name + len) = '\0';
4327ecb4dc22SAlex Elder 	spec->snap_name = snap_name;
4328e5c35534SAlex Elder 
43290ddebc0cSAlex Elder 	/* Initialize all rbd options to the defaults */
4330e28fff26SAlex Elder 
43314e9afebaSAlex Elder 	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
43324e9afebaSAlex Elder 	if (!rbd_opts)
43334e9afebaSAlex Elder 		goto out_mem;
43344e9afebaSAlex Elder 
43354e9afebaSAlex Elder 	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
4336d22f76e7SAlex Elder 
4337859c31dfSAlex Elder 	copts = ceph_parse_options(options, mon_addrs,
43380ddebc0cSAlex Elder 					mon_addrs + mon_addrs_size - 1,
43394e9afebaSAlex Elder 					parse_rbd_opts_token, rbd_opts);
4340859c31dfSAlex Elder 	if (IS_ERR(copts)) {
4341859c31dfSAlex Elder 		ret = PTR_ERR(copts);
4342dc79b113SAlex Elder 		goto out_err;
4343dc79b113SAlex Elder 	}
4344859c31dfSAlex Elder 	kfree(options);
4345859c31dfSAlex Elder 
4346859c31dfSAlex Elder 	*ceph_opts = copts;
43474e9afebaSAlex Elder 	*opts = rbd_opts;
4348859c31dfSAlex Elder 	*rbd_spec = spec;
43490ddebc0cSAlex Elder 
4350dc79b113SAlex Elder 	return 0;
4351f28e565aSAlex Elder out_mem:
4352dc79b113SAlex Elder 	ret = -ENOMEM;
4353d22f76e7SAlex Elder out_err:
4354859c31dfSAlex Elder 	kfree(rbd_opts);
4355859c31dfSAlex Elder 	rbd_spec_put(spec);
4356f28e565aSAlex Elder 	kfree(options);
4357d22f76e7SAlex Elder 
4358dc79b113SAlex Elder 	return ret;
4359a725f65eSAlex Elder }
4360a725f65eSAlex Elder 
4361589d30e0SAlex Elder /*
4362589d30e0SAlex Elder  * An rbd format 2 image has a unique identifier, distinct from the
4363589d30e0SAlex Elder  * name given to it by the user.  Internally, that identifier is
4364589d30e0SAlex Elder  * what's used to specify the names of objects related to the image.
4365589d30e0SAlex Elder  *
4366589d30e0SAlex Elder  * A special "rbd id" object is used to map an rbd image name to its
4367589d30e0SAlex Elder  * id.  If that object doesn't exist, then there is no v2 rbd image
4368589d30e0SAlex Elder  * with the supplied name.
4369589d30e0SAlex Elder  *
4370589d30e0SAlex Elder  * This function will record the given rbd_dev's image_id field if
4371589d30e0SAlex Elder  * it can be determined, and in that case will return 0.  If any
4372589d30e0SAlex Elder  * errors occur a negative errno will be returned and the rbd_dev's
4373589d30e0SAlex Elder  * image_id field will be unchanged (and should be NULL).
4374589d30e0SAlex Elder  */
4375589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev)
4376589d30e0SAlex Elder {
4377589d30e0SAlex Elder 	int ret;
4378589d30e0SAlex Elder 	size_t size;
4379589d30e0SAlex Elder 	char *object_name;
4380589d30e0SAlex Elder 	void *response;
4381c0fba368SAlex Elder 	char *image_id;
43822f82ee54SAlex Elder 
4383589d30e0SAlex Elder 	/*
43842c0d0a10SAlex Elder 	 * When probing a parent image, the image id is already
43852c0d0a10SAlex Elder 	 * known (and the image name likely is not).  There's no
4386c0fba368SAlex Elder 	 * need to fetch the image id again in this case.  We
4387c0fba368SAlex Elder 	 * do still need to set the image format though.
43882c0d0a10SAlex Elder 	 */
4389c0fba368SAlex Elder 	if (rbd_dev->spec->image_id) {
4390c0fba368SAlex Elder 		rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
4391c0fba368SAlex Elder 
43922c0d0a10SAlex Elder 		return 0;
4393c0fba368SAlex Elder 	}
43942c0d0a10SAlex Elder 
43952c0d0a10SAlex Elder 	/*
4396589d30e0SAlex Elder 	 * First, see if the format 2 image id file exists, and if
4397589d30e0SAlex Elder 	 * so, get the image's persistent id from it.
4398589d30e0SAlex Elder 	 */
439969e7a02fSAlex Elder 	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
4400589d30e0SAlex Elder 	object_name = kmalloc(size, GFP_NOIO);
4401589d30e0SAlex Elder 	if (!object_name)
4402589d30e0SAlex Elder 		return -ENOMEM;
44030d7dbfceSAlex Elder 	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
4404589d30e0SAlex Elder 	dout("rbd id object name is %s\n", object_name);
4405589d30e0SAlex Elder 
4406589d30e0SAlex Elder 	/* Response will be an encoded string, which includes a length */
4407589d30e0SAlex Elder 
4408589d30e0SAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
4409589d30e0SAlex Elder 	response = kzalloc(size, GFP_NOIO);
4410589d30e0SAlex Elder 	if (!response) {
4411589d30e0SAlex Elder 		ret = -ENOMEM;
4412589d30e0SAlex Elder 		goto out;
4413589d30e0SAlex Elder 	}
4414589d30e0SAlex Elder 
4415c0fba368SAlex Elder 	/* If it doesn't exist we'll assume it's a format 1 image */
4416c0fba368SAlex Elder 
441736be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, object_name,
44184157976bSAlex Elder 				"rbd", "get_id", NULL, 0,
4419e2a58ee5SAlex Elder 				response, RBD_IMAGE_ID_LEN_MAX);
442036be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4421c0fba368SAlex Elder 	if (ret == -ENOENT) {
4422c0fba368SAlex Elder 		image_id = kstrdup("", GFP_KERNEL);
4423c0fba368SAlex Elder 		ret = image_id ? 0 : -ENOMEM;
4424c0fba368SAlex Elder 		if (!ret)
4425c0fba368SAlex Elder 			rbd_dev->image_format = 1;
4426c0fba368SAlex Elder 	} else if (ret > sizeof (__le32)) {
4427c0fba368SAlex Elder 		void *p = response;
4428589d30e0SAlex Elder 
4429c0fba368SAlex Elder 		image_id = ceph_extract_encoded_string(&p, p + ret,
4430979ed480SAlex Elder 						NULL, GFP_NOIO);
4431c0fba368SAlex Elder 		ret = IS_ERR(image_id) ? PTR_ERR(image_id) : 0;
4432c0fba368SAlex Elder 		if (!ret)
4433c0fba368SAlex Elder 			rbd_dev->image_format = 2;
4434589d30e0SAlex Elder 	} else {
4435c0fba368SAlex Elder 		ret = -EINVAL;
4436c0fba368SAlex Elder 	}
4437c0fba368SAlex Elder 
4438c0fba368SAlex Elder 	if (!ret) {
4439c0fba368SAlex Elder 		rbd_dev->spec->image_id = image_id;
4440c0fba368SAlex Elder 		dout("image_id is %s\n", image_id);
4441589d30e0SAlex Elder 	}
4442589d30e0SAlex Elder out:
4443589d30e0SAlex Elder 	kfree(response);
4444589d30e0SAlex Elder 	kfree(object_name);
4445589d30e0SAlex Elder 
4446589d30e0SAlex Elder 	return ret;
4447589d30e0SAlex Elder }
4448589d30e0SAlex Elder 
44496fd48b3bSAlex Elder /* Undo whatever state changes are made by v1 or v2 image probe */
44506fd48b3bSAlex Elder 
44516fd48b3bSAlex Elder static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
44526fd48b3bSAlex Elder {
44536fd48b3bSAlex Elder 	struct rbd_image_header	*header;
44546fd48b3bSAlex Elder 
44556fd48b3bSAlex Elder 	rbd_dev_remove_parent(rbd_dev);
44566fd48b3bSAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
44576fd48b3bSAlex Elder 	rbd_dev->parent_spec = NULL;
44586fd48b3bSAlex Elder 	rbd_dev->parent_overlap = 0;
44596fd48b3bSAlex Elder 
44606fd48b3bSAlex Elder 	/* Free dynamic fields from the header, then zero it out */
44616fd48b3bSAlex Elder 
44626fd48b3bSAlex Elder 	header = &rbd_dev->header;
4463812164f8SAlex Elder 	ceph_put_snap_context(header->snapc);
44646fd48b3bSAlex Elder 	kfree(header->snap_sizes);
44656fd48b3bSAlex Elder 	kfree(header->snap_names);
44666fd48b3bSAlex Elder 	kfree(header->object_prefix);
44676fd48b3bSAlex Elder 	memset(header, 0, sizeof (*header));
44686fd48b3bSAlex Elder }
44696fd48b3bSAlex Elder 
/* Probe a format 1 image, which amounts to reading its header. */
static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
{
	int ret = rbd_dev_v1_header_read(rbd_dev);

	return ret;
}
4474a30b71b9SAlex Elder 
4475a30b71b9SAlex Elder static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
4476a30b71b9SAlex Elder {
44779d475de5SAlex Elder 	int ret;
4478a30b71b9SAlex Elder 
44799d475de5SAlex Elder 	ret = rbd_dev_v2_image_size(rbd_dev);
448057385b51SAlex Elder 	if (ret)
44819d475de5SAlex Elder 		goto out_err;
44821e130199SAlex Elder 
44831e130199SAlex Elder 	/* Get the object prefix (a.k.a. block_name) for the image */
44841e130199SAlex Elder 
44851e130199SAlex Elder 	ret = rbd_dev_v2_object_prefix(rbd_dev);
448657385b51SAlex Elder 	if (ret)
44871e130199SAlex Elder 		goto out_err;
4488b1b5402aSAlex Elder 
4489d889140cSAlex Elder 	/* Get the and check features for the image */
4490b1b5402aSAlex Elder 
4491b1b5402aSAlex Elder 	ret = rbd_dev_v2_features(rbd_dev);
449257385b51SAlex Elder 	if (ret)
4493b1b5402aSAlex Elder 		goto out_err;
449435d489f9SAlex Elder 
449586b00e0dSAlex Elder 	/* If the image supports layering, get the parent info */
449686b00e0dSAlex Elder 
449786b00e0dSAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
449886b00e0dSAlex Elder 		ret = rbd_dev_v2_parent_info(rbd_dev);
449957385b51SAlex Elder 		if (ret)
450086b00e0dSAlex Elder 			goto out_err;
450196882f55SAlex Elder 		/*
4502c734b796SAlex Elder 		 * Print a warning if this image has a parent.
4503c734b796SAlex Elder 		 * Don't print it if the image now being probed
4504c734b796SAlex Elder 		 * is itself a parent.  We can tell at this point
4505c734b796SAlex Elder 		 * because we won't know its pool name yet (just its
4506c734b796SAlex Elder 		 * pool id).
450796882f55SAlex Elder 		 */
4508c734b796SAlex Elder 		if (rbd_dev->parent_spec && rbd_dev->spec->pool_name)
450996882f55SAlex Elder 			rbd_warn(rbd_dev, "WARNING: kernel layering "
451096882f55SAlex Elder 					"is EXPERIMENTAL!");
451186b00e0dSAlex Elder 	}
451286b00e0dSAlex Elder 
4513cc070d59SAlex Elder 	/* If the image supports fancy striping, get its parameters */
4514cc070d59SAlex Elder 
4515cc070d59SAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
4516cc070d59SAlex Elder 		ret = rbd_dev_v2_striping_info(rbd_dev);
4517cc070d59SAlex Elder 		if (ret < 0)
4518cc070d59SAlex Elder 			goto out_err;
4519cc070d59SAlex Elder 	}
4520cc070d59SAlex Elder 
45216e14b1a6SAlex Elder 	/* crypto and compression type aren't (yet) supported for v2 images */
452235d489f9SAlex Elder 
45236e14b1a6SAlex Elder 	rbd_dev->header.crypt_type = 0;
45246e14b1a6SAlex Elder 	rbd_dev->header.comp_type = 0;
45256e14b1a6SAlex Elder 
45266e14b1a6SAlex Elder 	/* Get the snapshot context, plus the header version */
45276e14b1a6SAlex Elder 
4528cc4a38bdSAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev);
452935d489f9SAlex Elder 	if (ret)
453035d489f9SAlex Elder 		goto out_err;
45316e14b1a6SAlex Elder 
453235152979SAlex Elder 	return 0;
45339d475de5SAlex Elder out_err:
453486b00e0dSAlex Elder 	rbd_dev->parent_overlap = 0;
453586b00e0dSAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
453686b00e0dSAlex Elder 	rbd_dev->parent_spec = NULL;
45379d475de5SAlex Elder 	kfree(rbd_dev->header_name);
45389d475de5SAlex Elder 	rbd_dev->header_name = NULL;
45391e130199SAlex Elder 	kfree(rbd_dev->header.object_prefix);
45401e130199SAlex Elder 	rbd_dev->header.object_prefix = NULL;
45419d475de5SAlex Elder 
45429d475de5SAlex Elder 	return ret;
4543a30b71b9SAlex Elder }
4544a30b71b9SAlex Elder 
4545124afba2SAlex Elder static int rbd_dev_probe_parent(struct rbd_device *rbd_dev)
454683a06263SAlex Elder {
45472f82ee54SAlex Elder 	struct rbd_device *parent = NULL;
4548124afba2SAlex Elder 	struct rbd_spec *parent_spec;
4549124afba2SAlex Elder 	struct rbd_client *rbdc;
4550124afba2SAlex Elder 	int ret;
4551124afba2SAlex Elder 
4552124afba2SAlex Elder 	if (!rbd_dev->parent_spec)
4553124afba2SAlex Elder 		return 0;
4554124afba2SAlex Elder 	/*
4555124afba2SAlex Elder 	 * We need to pass a reference to the client and the parent
4556124afba2SAlex Elder 	 * spec when creating the parent rbd_dev.  Images related by
4557124afba2SAlex Elder 	 * parent/child relationships always share both.
4558124afba2SAlex Elder 	 */
4559124afba2SAlex Elder 	parent_spec = rbd_spec_get(rbd_dev->parent_spec);
4560124afba2SAlex Elder 	rbdc = __rbd_get_client(rbd_dev->rbd_client);
4561124afba2SAlex Elder 
4562124afba2SAlex Elder 	ret = -ENOMEM;
4563124afba2SAlex Elder 	parent = rbd_dev_create(rbdc, parent_spec);
4564124afba2SAlex Elder 	if (!parent)
4565124afba2SAlex Elder 		goto out_err;
4566124afba2SAlex Elder 
456751344a38SAlex Elder 	ret = rbd_dev_image_probe(parent, true);
4568124afba2SAlex Elder 	if (ret < 0)
4569124afba2SAlex Elder 		goto out_err;
4570124afba2SAlex Elder 	rbd_dev->parent = parent;
4571124afba2SAlex Elder 
4572124afba2SAlex Elder 	return 0;
4573124afba2SAlex Elder out_err:
4574124afba2SAlex Elder 	if (parent) {
4575124afba2SAlex Elder 		rbd_spec_put(rbd_dev->parent_spec);
4576124afba2SAlex Elder 		kfree(rbd_dev->header_name);
4577124afba2SAlex Elder 		rbd_dev_destroy(parent);
4578124afba2SAlex Elder 	} else {
4579124afba2SAlex Elder 		rbd_put_client(rbdc);
4580124afba2SAlex Elder 		rbd_spec_put(parent_spec);
4581124afba2SAlex Elder 	}
4582124afba2SAlex Elder 
4583124afba2SAlex Elder 	return ret;
4584124afba2SAlex Elder }
4585124afba2SAlex Elder 
4586200a6a8bSAlex Elder static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
4587124afba2SAlex Elder {
458883a06263SAlex Elder 	int ret;
458983a06263SAlex Elder 
459083a06263SAlex Elder 	/* generate unique id: find highest unique id, add one */
459183a06263SAlex Elder 	rbd_dev_id_get(rbd_dev);
459283a06263SAlex Elder 
459383a06263SAlex Elder 	/* Fill in the device name, now that we have its id. */
459483a06263SAlex Elder 	BUILD_BUG_ON(DEV_NAME_LEN
459583a06263SAlex Elder 			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
459683a06263SAlex Elder 	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
459783a06263SAlex Elder 
459883a06263SAlex Elder 	/* Get our block major device number. */
459983a06263SAlex Elder 
460083a06263SAlex Elder 	ret = register_blkdev(0, rbd_dev->name);
460183a06263SAlex Elder 	if (ret < 0)
460283a06263SAlex Elder 		goto err_out_id;
460383a06263SAlex Elder 	rbd_dev->major = ret;
460483a06263SAlex Elder 
460583a06263SAlex Elder 	/* Set up the blkdev mapping. */
460683a06263SAlex Elder 
460783a06263SAlex Elder 	ret = rbd_init_disk(rbd_dev);
460883a06263SAlex Elder 	if (ret)
460983a06263SAlex Elder 		goto err_out_blkdev;
461083a06263SAlex Elder 
4611f35a4deeSAlex Elder 	ret = rbd_dev_mapping_set(rbd_dev);
461283a06263SAlex Elder 	if (ret)
461383a06263SAlex Elder 		goto err_out_disk;
4614f35a4deeSAlex Elder 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
4615f35a4deeSAlex Elder 
4616f35a4deeSAlex Elder 	ret = rbd_bus_add_dev(rbd_dev);
4617f35a4deeSAlex Elder 	if (ret)
4618f35a4deeSAlex Elder 		goto err_out_mapping;
461983a06263SAlex Elder 
462083a06263SAlex Elder 	/* Everything's ready.  Announce the disk to the world. */
462183a06263SAlex Elder 
4622129b79d4SAlex Elder 	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
462383a06263SAlex Elder 	add_disk(rbd_dev->disk);
462483a06263SAlex Elder 
462583a06263SAlex Elder 	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
462683a06263SAlex Elder 		(unsigned long long) rbd_dev->mapping.size);
462783a06263SAlex Elder 
462883a06263SAlex Elder 	return ret;
46292f82ee54SAlex Elder 
4630f35a4deeSAlex Elder err_out_mapping:
4631f35a4deeSAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
463283a06263SAlex Elder err_out_disk:
463383a06263SAlex Elder 	rbd_free_disk(rbd_dev);
463483a06263SAlex Elder err_out_blkdev:
463583a06263SAlex Elder 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
463683a06263SAlex Elder err_out_id:
463783a06263SAlex Elder 	rbd_dev_id_put(rbd_dev);
4638d1cf5788SAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
463983a06263SAlex Elder 
464083a06263SAlex Elder 	return ret;
464183a06263SAlex Elder }
464283a06263SAlex Elder 
4643332bb12dSAlex Elder static int rbd_dev_header_name(struct rbd_device *rbd_dev)
4644332bb12dSAlex Elder {
4645332bb12dSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
4646332bb12dSAlex Elder 	size_t size;
4647332bb12dSAlex Elder 
4648332bb12dSAlex Elder 	/* Record the header object name for this rbd image. */
4649332bb12dSAlex Elder 
4650332bb12dSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
4651332bb12dSAlex Elder 
4652332bb12dSAlex Elder 	if (rbd_dev->image_format == 1)
4653332bb12dSAlex Elder 		size = strlen(spec->image_name) + sizeof (RBD_SUFFIX);
4654332bb12dSAlex Elder 	else
4655332bb12dSAlex Elder 		size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id);
4656332bb12dSAlex Elder 
4657332bb12dSAlex Elder 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
4658332bb12dSAlex Elder 	if (!rbd_dev->header_name)
4659332bb12dSAlex Elder 		return -ENOMEM;
4660332bb12dSAlex Elder 
4661332bb12dSAlex Elder 	if (rbd_dev->image_format == 1)
4662332bb12dSAlex Elder 		sprintf(rbd_dev->header_name, "%s%s",
4663332bb12dSAlex Elder 			spec->image_name, RBD_SUFFIX);
4664332bb12dSAlex Elder 	else
4665332bb12dSAlex Elder 		sprintf(rbd_dev->header_name, "%s%s",
4666332bb12dSAlex Elder 			RBD_HEADER_PREFIX, spec->image_id);
4667332bb12dSAlex Elder 	return 0;
4668332bb12dSAlex Elder }
4669332bb12dSAlex Elder 
4670200a6a8bSAlex Elder static void rbd_dev_image_release(struct rbd_device *rbd_dev)
4671200a6a8bSAlex Elder {
46726fd48b3bSAlex Elder 	int ret;
46736fd48b3bSAlex Elder 
46746fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
46756fd48b3bSAlex Elder 	ret = rbd_dev_header_watch_sync(rbd_dev, 0);
46766fd48b3bSAlex Elder 	if (ret)
46776fd48b3bSAlex Elder 		rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret);
4678200a6a8bSAlex Elder 	kfree(rbd_dev->header_name);
46796fd48b3bSAlex Elder 	rbd_dev->header_name = NULL;
46806fd48b3bSAlex Elder 	rbd_dev->image_format = 0;
46816fd48b3bSAlex Elder 	kfree(rbd_dev->spec->image_id);
46826fd48b3bSAlex Elder 	rbd_dev->spec->image_id = NULL;
46836fd48b3bSAlex Elder 
4684200a6a8bSAlex Elder 	rbd_dev_destroy(rbd_dev);
4685200a6a8bSAlex Elder }
4686200a6a8bSAlex Elder 
4687a30b71b9SAlex Elder /*
4688a30b71b9SAlex Elder  * Probe for the existence of the header object for the given rbd
4689a30b71b9SAlex Elder  * device.  For format 2 images this includes determining the image
4690a30b71b9SAlex Elder  * id.
4691a30b71b9SAlex Elder  */
469251344a38SAlex Elder static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool read_only)
4693a30b71b9SAlex Elder {
4694a30b71b9SAlex Elder 	int ret;
4695b644de2bSAlex Elder 	int tmp;
4696a30b71b9SAlex Elder 
4697a30b71b9SAlex Elder 	/*
4698a30b71b9SAlex Elder 	 * Get the id from the image id object.  If it's not a
4699a30b71b9SAlex Elder 	 * format 2 image, we'll get ENOENT back, and we'll assume
4700a30b71b9SAlex Elder 	 * it's a format 1 image.
4701a30b71b9SAlex Elder 	 */
4702a30b71b9SAlex Elder 	ret = rbd_dev_image_id(rbd_dev);
4703a30b71b9SAlex Elder 	if (ret)
4704c0fba368SAlex Elder 		return ret;
4705c0fba368SAlex Elder 	rbd_assert(rbd_dev->spec->image_id);
4706c0fba368SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
4707c0fba368SAlex Elder 
4708332bb12dSAlex Elder 	ret = rbd_dev_header_name(rbd_dev);
4709332bb12dSAlex Elder 	if (ret)
4710332bb12dSAlex Elder 		goto err_out_format;
4711332bb12dSAlex Elder 
4712b644de2bSAlex Elder 	ret = rbd_dev_header_watch_sync(rbd_dev, 1);
4713b644de2bSAlex Elder 	if (ret)
4714b644de2bSAlex Elder 		goto out_header_name;
4715b644de2bSAlex Elder 
4716c0fba368SAlex Elder 	if (rbd_dev->image_format == 1)
4717a30b71b9SAlex Elder 		ret = rbd_dev_v1_probe(rbd_dev);
4718a30b71b9SAlex Elder 	else
4719a30b71b9SAlex Elder 		ret = rbd_dev_v2_probe(rbd_dev);
47205655c4d9SAlex Elder 	if (ret)
4721b644de2bSAlex Elder 		goto err_out_watch;
4722a30b71b9SAlex Elder 
47239bb81c9bSAlex Elder 	ret = rbd_dev_spec_update(rbd_dev);
47249bb81c9bSAlex Elder 	if (ret)
472533dca39fSAlex Elder 		goto err_out_probe;
47269bb81c9bSAlex Elder 
472751344a38SAlex Elder 	/* If we are mapping a snapshot it must be marked read-only */
472851344a38SAlex Elder 
472951344a38SAlex Elder 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
473051344a38SAlex Elder 		read_only = true;
473151344a38SAlex Elder 	rbd_dev->mapping.read_only = read_only;
473251344a38SAlex Elder 
47339bb81c9bSAlex Elder 	ret = rbd_dev_probe_parent(rbd_dev);
473430d60ba2SAlex Elder 	if (ret)
473530d60ba2SAlex Elder 		goto err_out_probe;
473683a06263SAlex Elder 
473730d60ba2SAlex Elder 	dout("discovered format %u image, header name is %s\n",
473830d60ba2SAlex Elder 		rbd_dev->image_format, rbd_dev->header_name);
473930d60ba2SAlex Elder 
474030d60ba2SAlex Elder 	return 0;
47416fd48b3bSAlex Elder err_out_probe:
47426fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
4743b644de2bSAlex Elder err_out_watch:
4744b644de2bSAlex Elder 	tmp = rbd_dev_header_watch_sync(rbd_dev, 0);
4745b644de2bSAlex Elder 	if (tmp)
4746b644de2bSAlex Elder 		rbd_warn(rbd_dev, "unable to tear down watch request\n");
4747332bb12dSAlex Elder out_header_name:
4748332bb12dSAlex Elder 	kfree(rbd_dev->header_name);
4749332bb12dSAlex Elder 	rbd_dev->header_name = NULL;
4750332bb12dSAlex Elder err_out_format:
4751332bb12dSAlex Elder 	rbd_dev->image_format = 0;
47525655c4d9SAlex Elder 	kfree(rbd_dev->spec->image_id);
47535655c4d9SAlex Elder 	rbd_dev->spec->image_id = NULL;
47545655c4d9SAlex Elder 
47555655c4d9SAlex Elder 	dout("probe failed, returning %d\n", ret);
47565655c4d9SAlex Elder 
47575655c4d9SAlex Elder 	return ret;
475883a06263SAlex Elder }
475983a06263SAlex Elder 
476059c2be1eSYehuda Sadeh static ssize_t rbd_add(struct bus_type *bus,
476159c2be1eSYehuda Sadeh 		       const char *buf,
476259c2be1eSYehuda Sadeh 		       size_t count)
4763602adf40SYehuda Sadeh {
4764cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
4765dc79b113SAlex Elder 	struct ceph_options *ceph_opts = NULL;
47664e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
4767859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
47689d3997fdSAlex Elder 	struct rbd_client *rbdc;
476927cc2594SAlex Elder 	struct ceph_osd_client *osdc;
477051344a38SAlex Elder 	bool read_only;
477127cc2594SAlex Elder 	int rc = -ENOMEM;
4772602adf40SYehuda Sadeh 
4773602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
4774602adf40SYehuda Sadeh 		return -ENODEV;
4775602adf40SYehuda Sadeh 
4776a725f65eSAlex Elder 	/* parse add command */
4777859c31dfSAlex Elder 	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
4778dc79b113SAlex Elder 	if (rc < 0)
4779bd4ba655SAlex Elder 		goto err_out_module;
478051344a38SAlex Elder 	read_only = rbd_opts->read_only;
478151344a38SAlex Elder 	kfree(rbd_opts);
478251344a38SAlex Elder 	rbd_opts = NULL;	/* done with this */
4783a725f65eSAlex Elder 
47849d3997fdSAlex Elder 	rbdc = rbd_get_client(ceph_opts);
47859d3997fdSAlex Elder 	if (IS_ERR(rbdc)) {
47869d3997fdSAlex Elder 		rc = PTR_ERR(rbdc);
47870ddebc0cSAlex Elder 		goto err_out_args;
47889d3997fdSAlex Elder 	}
4789c53d5893SAlex Elder 	ceph_opts = NULL;	/* rbd_dev client now owns this */
4790602adf40SYehuda Sadeh 
4791602adf40SYehuda Sadeh 	/* pick the pool */
47929d3997fdSAlex Elder 	osdc = &rbdc->client->osdc;
4793859c31dfSAlex Elder 	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
4794602adf40SYehuda Sadeh 	if (rc < 0)
4795602adf40SYehuda Sadeh 		goto err_out_client;
4796859c31dfSAlex Elder 	spec->pool_id = (u64)rc;
4797859c31dfSAlex Elder 
47980903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
47990903e875SAlex Elder 
4800c0cd10dbSAlex Elder 	if (spec->pool_id > (u64)U32_MAX) {
4801c0cd10dbSAlex Elder 		rbd_warn(NULL, "pool id too large (%llu > %u)\n",
4802c0cd10dbSAlex Elder 				(unsigned long long)spec->pool_id, U32_MAX);
48030903e875SAlex Elder 		rc = -EIO;
48040903e875SAlex Elder 		goto err_out_client;
48050903e875SAlex Elder 	}
48060903e875SAlex Elder 
4807c53d5893SAlex Elder 	rbd_dev = rbd_dev_create(rbdc, spec);
4808bd4ba655SAlex Elder 	if (!rbd_dev)
4809bd4ba655SAlex Elder 		goto err_out_client;
4810c53d5893SAlex Elder 	rbdc = NULL;		/* rbd_dev now owns this */
4811c53d5893SAlex Elder 	spec = NULL;		/* rbd_dev now owns this */
4812602adf40SYehuda Sadeh 
481351344a38SAlex Elder 	rc = rbd_dev_image_probe(rbd_dev, read_only);
4814a30b71b9SAlex Elder 	if (rc < 0)
4815c53d5893SAlex Elder 		goto err_out_rbd_dev;
481605fd6f6fSAlex Elder 
4817b536f69aSAlex Elder 	rc = rbd_dev_device_setup(rbd_dev);
4818b536f69aSAlex Elder 	if (!rc)
4819602adf40SYehuda Sadeh 		return count;
4820b536f69aSAlex Elder 
4821b536f69aSAlex Elder 	rbd_dev_image_release(rbd_dev);
4822c53d5893SAlex Elder err_out_rbd_dev:
4823c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
4824bd4ba655SAlex Elder err_out_client:
48259d3997fdSAlex Elder 	rbd_put_client(rbdc);
48260ddebc0cSAlex Elder err_out_args:
482778cea76eSAlex Elder 	if (ceph_opts)
482878cea76eSAlex Elder 		ceph_destroy_options(ceph_opts);
48294e9afebaSAlex Elder 	kfree(rbd_opts);
4830859c31dfSAlex Elder 	rbd_spec_put(spec);
4831bd4ba655SAlex Elder err_out_module:
4832bd4ba655SAlex Elder 	module_put(THIS_MODULE);
483327cc2594SAlex Elder 
4834602adf40SYehuda Sadeh 	dout("Error adding device %s\n", buf);
483527cc2594SAlex Elder 
483627cc2594SAlex Elder 	return (ssize_t)rc;
4837602adf40SYehuda Sadeh }
4838602adf40SYehuda Sadeh 
4839de71a297SAlex Elder static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
4840602adf40SYehuda Sadeh {
4841602adf40SYehuda Sadeh 	struct list_head *tmp;
4842602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev;
4843602adf40SYehuda Sadeh 
4844e124a82fSAlex Elder 	spin_lock(&rbd_dev_list_lock);
4845602adf40SYehuda Sadeh 	list_for_each(tmp, &rbd_dev_list) {
4846602adf40SYehuda Sadeh 		rbd_dev = list_entry(tmp, struct rbd_device, node);
4847de71a297SAlex Elder 		if (rbd_dev->dev_id == dev_id) {
4848e124a82fSAlex Elder 			spin_unlock(&rbd_dev_list_lock);
4849602adf40SYehuda Sadeh 			return rbd_dev;
4850602adf40SYehuda Sadeh 		}
4851e124a82fSAlex Elder 	}
4852e124a82fSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
4853602adf40SYehuda Sadeh 	return NULL;
4854602adf40SYehuda Sadeh }
4855602adf40SYehuda Sadeh 
4856200a6a8bSAlex Elder static void rbd_dev_device_release(struct device *dev)
4857602adf40SYehuda Sadeh {
4858593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4859602adf40SYehuda Sadeh 
4860602adf40SYehuda Sadeh 	rbd_free_disk(rbd_dev);
4861200a6a8bSAlex Elder 	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
48626d80b130SAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
4863602adf40SYehuda Sadeh 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
4864200a6a8bSAlex Elder 	rbd_dev->major = 0;
4865e2839308SAlex Elder 	rbd_dev_id_put(rbd_dev);
4866d1cf5788SAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
4867602adf40SYehuda Sadeh }
4868602adf40SYehuda Sadeh 
486905a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
487005a46afdSAlex Elder {
4871ad945fc1SAlex Elder 	while (rbd_dev->parent) {
487205a46afdSAlex Elder 		struct rbd_device *first = rbd_dev;
487305a46afdSAlex Elder 		struct rbd_device *second = first->parent;
487405a46afdSAlex Elder 		struct rbd_device *third;
487505a46afdSAlex Elder 
487605a46afdSAlex Elder 		/*
487705a46afdSAlex Elder 		 * Follow to the parent with no grandparent and
487805a46afdSAlex Elder 		 * remove it.
487905a46afdSAlex Elder 		 */
488005a46afdSAlex Elder 		while (second && (third = second->parent)) {
488105a46afdSAlex Elder 			first = second;
488205a46afdSAlex Elder 			second = third;
488305a46afdSAlex Elder 		}
4884ad945fc1SAlex Elder 		rbd_assert(second);
48858ad42cd0SAlex Elder 		rbd_dev_image_release(second);
4886ad945fc1SAlex Elder 		first->parent = NULL;
4887ad945fc1SAlex Elder 		first->parent_overlap = 0;
4888ad945fc1SAlex Elder 
4889ad945fc1SAlex Elder 		rbd_assert(first->parent_spec);
489005a46afdSAlex Elder 		rbd_spec_put(first->parent_spec);
489105a46afdSAlex Elder 		first->parent_spec = NULL;
489205a46afdSAlex Elder 	}
489305a46afdSAlex Elder }
489405a46afdSAlex Elder 
4895dfc5606dSYehuda Sadeh static ssize_t rbd_remove(struct bus_type *bus,
4896602adf40SYehuda Sadeh 			  const char *buf,
4897602adf40SYehuda Sadeh 			  size_t count)
4898602adf40SYehuda Sadeh {
4899602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
49000d8189e1SAlex Elder 	int target_id;
4901602adf40SYehuda Sadeh 	unsigned long ul;
49020d8189e1SAlex Elder 	int ret;
4903602adf40SYehuda Sadeh 
49040d8189e1SAlex Elder 	ret = strict_strtoul(buf, 10, &ul);
49050d8189e1SAlex Elder 	if (ret)
49060d8189e1SAlex Elder 		return ret;
4907602adf40SYehuda Sadeh 
4908602adf40SYehuda Sadeh 	/* convert to int; abort if we lost anything in the conversion */
4909602adf40SYehuda Sadeh 	target_id = (int) ul;
4910602adf40SYehuda Sadeh 	if (target_id != ul)
4911602adf40SYehuda Sadeh 		return -EINVAL;
4912602adf40SYehuda Sadeh 
4913602adf40SYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
4914602adf40SYehuda Sadeh 
4915602adf40SYehuda Sadeh 	rbd_dev = __rbd_get_dev(target_id);
4916602adf40SYehuda Sadeh 	if (!rbd_dev) {
4917602adf40SYehuda Sadeh 		ret = -ENOENT;
4918602adf40SYehuda Sadeh 		goto done;
4919602adf40SYehuda Sadeh 	}
4920602adf40SYehuda Sadeh 
4921a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
4922b82d167bSAlex Elder 	if (rbd_dev->open_count)
492342382b70SAlex Elder 		ret = -EBUSY;
4924b82d167bSAlex Elder 	else
4925b82d167bSAlex Elder 		set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
4926a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
4927b82d167bSAlex Elder 	if (ret < 0)
492842382b70SAlex Elder 		goto done;
49290d8189e1SAlex Elder 	ret = count;
4930b480815aSAlex Elder 	rbd_bus_del_dev(rbd_dev);
49318ad42cd0SAlex Elder 	rbd_dev_image_release(rbd_dev);
493279ab7558SAlex Elder 	module_put(THIS_MODULE);
4933602adf40SYehuda Sadeh done:
4934602adf40SYehuda Sadeh 	mutex_unlock(&ctl_mutex);
4935aafb230eSAlex Elder 
4936602adf40SYehuda Sadeh 	return ret;
4937602adf40SYehuda Sadeh }
4938602adf40SYehuda Sadeh 
4939602adf40SYehuda Sadeh /*
4940602adf40SYehuda Sadeh  * create control files in sysfs
4941dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
4942602adf40SYehuda Sadeh  */
4943602adf40SYehuda Sadeh static int rbd_sysfs_init(void)
4944602adf40SYehuda Sadeh {
4945dfc5606dSYehuda Sadeh 	int ret;
4946602adf40SYehuda Sadeh 
4947fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
4948dfc5606dSYehuda Sadeh 	if (ret < 0)
4949dfc5606dSYehuda Sadeh 		return ret;
4950602adf40SYehuda Sadeh 
4951fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
4952fed4c143SAlex Elder 	if (ret < 0)
4953fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
4954602adf40SYehuda Sadeh 
4955602adf40SYehuda Sadeh 	return ret;
4956602adf40SYehuda Sadeh }
4957602adf40SYehuda Sadeh 
4958602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void)
4959602adf40SYehuda Sadeh {
4960dfc5606dSYehuda Sadeh 	bus_unregister(&rbd_bus_type);
4961fed4c143SAlex Elder 	device_unregister(&rbd_root_dev);
4962602adf40SYehuda Sadeh }
4963602adf40SYehuda Sadeh 
49641c2a9dfeSAlex Elder static int rbd_slab_init(void)
49651c2a9dfeSAlex Elder {
49661c2a9dfeSAlex Elder 	rbd_assert(!rbd_img_request_cache);
49671c2a9dfeSAlex Elder 	rbd_img_request_cache = kmem_cache_create("rbd_img_request",
49681c2a9dfeSAlex Elder 					sizeof (struct rbd_img_request),
49691c2a9dfeSAlex Elder 					__alignof__(struct rbd_img_request),
49701c2a9dfeSAlex Elder 					0, NULL);
4971868311b1SAlex Elder 	if (!rbd_img_request_cache)
4972868311b1SAlex Elder 		return -ENOMEM;
4973868311b1SAlex Elder 
4974868311b1SAlex Elder 	rbd_assert(!rbd_obj_request_cache);
4975868311b1SAlex Elder 	rbd_obj_request_cache = kmem_cache_create("rbd_obj_request",
4976868311b1SAlex Elder 					sizeof (struct rbd_obj_request),
4977868311b1SAlex Elder 					__alignof__(struct rbd_obj_request),
4978868311b1SAlex Elder 					0, NULL);
497978c2a44aSAlex Elder 	if (!rbd_obj_request_cache)
498078c2a44aSAlex Elder 		goto out_err;
498178c2a44aSAlex Elder 
498278c2a44aSAlex Elder 	rbd_assert(!rbd_segment_name_cache);
498378c2a44aSAlex Elder 	rbd_segment_name_cache = kmem_cache_create("rbd_segment_name",
498478c2a44aSAlex Elder 					MAX_OBJ_NAME_SIZE + 1, 1, 0, NULL);
498578c2a44aSAlex Elder 	if (rbd_segment_name_cache)
49861c2a9dfeSAlex Elder 		return 0;
498778c2a44aSAlex Elder out_err:
498878c2a44aSAlex Elder 	if (rbd_obj_request_cache) {
498978c2a44aSAlex Elder 		kmem_cache_destroy(rbd_obj_request_cache);
499078c2a44aSAlex Elder 		rbd_obj_request_cache = NULL;
499178c2a44aSAlex Elder 	}
49921c2a9dfeSAlex Elder 
4993868311b1SAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
4994868311b1SAlex Elder 	rbd_img_request_cache = NULL;
4995868311b1SAlex Elder 
49961c2a9dfeSAlex Elder 	return -ENOMEM;
49971c2a9dfeSAlex Elder }
49981c2a9dfeSAlex Elder 
49991c2a9dfeSAlex Elder static void rbd_slab_exit(void)
50001c2a9dfeSAlex Elder {
500178c2a44aSAlex Elder 	rbd_assert(rbd_segment_name_cache);
500278c2a44aSAlex Elder 	kmem_cache_destroy(rbd_segment_name_cache);
500378c2a44aSAlex Elder 	rbd_segment_name_cache = NULL;
500478c2a44aSAlex Elder 
5005868311b1SAlex Elder 	rbd_assert(rbd_obj_request_cache);
5006868311b1SAlex Elder 	kmem_cache_destroy(rbd_obj_request_cache);
5007868311b1SAlex Elder 	rbd_obj_request_cache = NULL;
5008868311b1SAlex Elder 
50091c2a9dfeSAlex Elder 	rbd_assert(rbd_img_request_cache);
50101c2a9dfeSAlex Elder 	kmem_cache_destroy(rbd_img_request_cache);
50111c2a9dfeSAlex Elder 	rbd_img_request_cache = NULL;
50121c2a9dfeSAlex Elder }
50131c2a9dfeSAlex Elder 
5014cc344fa1SAlex Elder static int __init rbd_init(void)
5015602adf40SYehuda Sadeh {
5016602adf40SYehuda Sadeh 	int rc;
5017602adf40SYehuda Sadeh 
50181e32d34cSAlex Elder 	if (!libceph_compatible(NULL)) {
50191e32d34cSAlex Elder 		rbd_warn(NULL, "libceph incompatibility (quitting)");
50201e32d34cSAlex Elder 
50211e32d34cSAlex Elder 		return -EINVAL;
50221e32d34cSAlex Elder 	}
50231c2a9dfeSAlex Elder 	rc = rbd_slab_init();
5024602adf40SYehuda Sadeh 	if (rc)
5025602adf40SYehuda Sadeh 		return rc;
50261c2a9dfeSAlex Elder 	rc = rbd_sysfs_init();
50271c2a9dfeSAlex Elder 	if (rc)
50281c2a9dfeSAlex Elder 		rbd_slab_exit();
50291c2a9dfeSAlex Elder 	else
5030f0f8cef5SAlex Elder 		pr_info("loaded " RBD_DRV_NAME_LONG "\n");
50311c2a9dfeSAlex Elder 
50321c2a9dfeSAlex Elder 	return rc;
5033602adf40SYehuda Sadeh }
5034602adf40SYehuda Sadeh 
5035cc344fa1SAlex Elder static void __exit rbd_exit(void)
5036602adf40SYehuda Sadeh {
5037602adf40SYehuda Sadeh 	rbd_sysfs_cleanup();
50381c2a9dfeSAlex Elder 	rbd_slab_exit();
5039602adf40SYehuda Sadeh }
5040602adf40SYehuda Sadeh 
5041602adf40SYehuda Sadeh module_init(rbd_init);
5042602adf40SYehuda Sadeh module_exit(rbd_exit);
5043602adf40SYehuda Sadeh 
5044602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
5045602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
5046602adf40SYehuda Sadeh MODULE_DESCRIPTION("rados block device");
5047602adf40SYehuda Sadeh 
5048602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */
5049602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
5050602adf40SYehuda Sadeh 
5051602adf40SYehuda Sadeh MODULE_LICENSE("GPL");
5052