xref: /openbmc/linux/drivers/block/rbd.c (revision a3fbe5d4)
1602adf40SYehuda Sadeh /*
2602adf40SYehuda Sadeh    rbd.c -- Export ceph rados objects as a Linux block device
3602adf40SYehuda Sadeh 
4602adf40SYehuda Sadeh 
5602adf40SYehuda Sadeh    based on drivers/block/osdblk.c:
6602adf40SYehuda Sadeh 
7602adf40SYehuda Sadeh    Copyright 2009 Red Hat, Inc.
8602adf40SYehuda Sadeh 
9602adf40SYehuda Sadeh    This program is free software; you can redistribute it and/or modify
10602adf40SYehuda Sadeh    it under the terms of the GNU General Public License as published by
11602adf40SYehuda Sadeh    the Free Software Foundation.
12602adf40SYehuda Sadeh 
13602adf40SYehuda Sadeh    This program is distributed in the hope that it will be useful,
14602adf40SYehuda Sadeh    but WITHOUT ANY WARRANTY; without even the implied warranty of
15602adf40SYehuda Sadeh    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16602adf40SYehuda Sadeh    GNU General Public License for more details.
17602adf40SYehuda Sadeh 
18602adf40SYehuda Sadeh    You should have received a copy of the GNU General Public License
19602adf40SYehuda Sadeh    along with this program; see the file COPYING.  If not, write to
20602adf40SYehuda Sadeh    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21602adf40SYehuda Sadeh 
22602adf40SYehuda Sadeh 
23602adf40SYehuda Sadeh 
24dfc5606dSYehuda Sadeh    For usage instructions, please refer to:
25602adf40SYehuda Sadeh 
26dfc5606dSYehuda Sadeh                  Documentation/ABI/testing/sysfs-bus-rbd
27602adf40SYehuda Sadeh 
28602adf40SYehuda Sadeh  */
29602adf40SYehuda Sadeh 
30602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
31602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
32602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3459c2be1eSYehuda Sadeh #include <linux/parser.h>
35602adf40SYehuda Sadeh 
36602adf40SYehuda Sadeh #include <linux/kernel.h>
37602adf40SYehuda Sadeh #include <linux/device.h>
38602adf40SYehuda Sadeh #include <linux/module.h>
39602adf40SYehuda Sadeh #include <linux/fs.h>
40602adf40SYehuda Sadeh #include <linux/blkdev.h>
41602adf40SYehuda Sadeh 
42602adf40SYehuda Sadeh #include "rbd_types.h"
43602adf40SYehuda Sadeh 
#define RBD_DEBUG		/* Turn on the rbd_assert() checks */

/*
 * Block I/O is counted in sectors.  Several Linux layers (blk, bio,
 * genhd) use that unit, and the default size is always 512 bytes.
 * Naming the shift and the size reads a little better than the bare
 * numbers would.
 */
#define SECTOR_SHIFT		9
#define SECTOR_SIZE		(1ULL << SECTOR_SHIFT)
54593a9e7bSAlex Elder 
/* Short and long forms of the driver name */
#define RBD_DRV_NAME			"rbd"
#define RBD_DRV_NAME_LONG		"rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR		256	/* max minors per blkdev */

/* A snapshot name plus the device-name prefix must fit within NAME_MAX */
#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN \
	(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT		510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME		"-"

/* Sized so an image name sent by an OSD fits in a single page */
#define RBD_IMAGE_NAME_LEN_MAX		(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX		64

#define RBD_OBJ_PREFIX_LEN_MAX		64
73589d30e0SAlex Elder 
/* Image feature bits */

#define RBD_FEATURE_LAYERING		(1 << 0)
#define RBD_FEATURE_STRIPINGV2		(1 << 1)
#define RBD_FEATURES_ALL \
	(RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)

/* The subset of features this client implementation supports. */

#define RBD_FEATURES_SUPPORTED		(RBD_FEATURES_ALL)
84d889140cSAlex Elder 
/*
 * RBD devices are named "rbd#": the "rbd" part is RBD_DRV_NAME (see
 * above) and # is a unique integer identifier.  MAX_INT_FORMAT_WIDTH
 * is used in ensuring DEV_NAME_LEN is large enough for every possible
 * device name.
 */
#define DEV_NAME_LEN			32
#define MAX_INT_FORMAT_WIDTH		((5 * sizeof (int)) / 2 + 1)
93602adf40SYehuda Sadeh 
94602adf40SYehuda Sadeh /*
95602adf40SYehuda Sadeh  * block device image metadata (in-memory version)
96602adf40SYehuda Sadeh  */
97602adf40SYehuda Sadeh struct rbd_image_header {
98f84344f3SAlex Elder 	/* These four fields never change for a given rbd image */
99849b4260SAlex Elder 	char *object_prefix;
10034b13184SAlex Elder 	u64 features;
101602adf40SYehuda Sadeh 	__u8 obj_order;
102602adf40SYehuda Sadeh 	__u8 crypt_type;
103602adf40SYehuda Sadeh 	__u8 comp_type;
104602adf40SYehuda Sadeh 
105f84344f3SAlex Elder 	/* The remaining fields need to be updated occasionally */
106f84344f3SAlex Elder 	u64 image_size;
107f84344f3SAlex Elder 	struct ceph_snap_context *snapc;
108602adf40SYehuda Sadeh 	char *snap_names;
109602adf40SYehuda Sadeh 	u64 *snap_sizes;
11059c2be1eSYehuda Sadeh 
111500d0c0fSAlex Elder 	u64 stripe_unit;
112500d0c0fSAlex Elder 	u64 stripe_count;
113500d0c0fSAlex Elder 
11459c2be1eSYehuda Sadeh 	u64 obj_version;
11559c2be1eSYehuda Sadeh };
11659c2be1eSYehuda Sadeh 
1170d7dbfceSAlex Elder /*
1180d7dbfceSAlex Elder  * An rbd image specification.
1190d7dbfceSAlex Elder  *
1200d7dbfceSAlex Elder  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
121c66c6e0cSAlex Elder  * identify an image.  Each rbd_dev structure includes a pointer to
122c66c6e0cSAlex Elder  * an rbd_spec structure that encapsulates this identity.
123c66c6e0cSAlex Elder  *
124c66c6e0cSAlex Elder  * Each of the id's in an rbd_spec has an associated name.  For a
125c66c6e0cSAlex Elder  * user-mapped image, the names are supplied and the id's associated
126c66c6e0cSAlex Elder  * with them are looked up.  For a layered image, a parent image is
127c66c6e0cSAlex Elder  * defined by the tuple, and the names are looked up.
128c66c6e0cSAlex Elder  *
129c66c6e0cSAlex Elder  * An rbd_dev structure contains a parent_spec pointer which is
130c66c6e0cSAlex Elder  * non-null if the image it represents is a child in a layered
131c66c6e0cSAlex Elder  * image.  This pointer will refer to the rbd_spec structure used
132c66c6e0cSAlex Elder  * by the parent rbd_dev for its own identity (i.e., the structure
133c66c6e0cSAlex Elder  * is shared between the parent and child).
134c66c6e0cSAlex Elder  *
135c66c6e0cSAlex Elder  * Since these structures are populated once, during the discovery
136c66c6e0cSAlex Elder  * phase of image construction, they are effectively immutable so
137c66c6e0cSAlex Elder  * we make no effort to synchronize access to them.
138c66c6e0cSAlex Elder  *
139c66c6e0cSAlex Elder  * Note that code herein does not assume the image name is known (it
140c66c6e0cSAlex Elder  * could be a null pointer).
1410d7dbfceSAlex Elder  */
1420d7dbfceSAlex Elder struct rbd_spec {
1430d7dbfceSAlex Elder 	u64		pool_id;
144ecb4dc22SAlex Elder 	const char	*pool_name;
1450d7dbfceSAlex Elder 
146ecb4dc22SAlex Elder 	const char	*image_id;
147ecb4dc22SAlex Elder 	const char	*image_name;
1480d7dbfceSAlex Elder 
1490d7dbfceSAlex Elder 	u64		snap_id;
150ecb4dc22SAlex Elder 	const char	*snap_name;
1510d7dbfceSAlex Elder 
1520d7dbfceSAlex Elder 	struct kref	kref;
1530d7dbfceSAlex Elder };
1540d7dbfceSAlex Elder 
155602adf40SYehuda Sadeh /*
156f0f8cef5SAlex Elder  * an instance of the client.  multiple devices may share an rbd client.
157602adf40SYehuda Sadeh  */
158602adf40SYehuda Sadeh struct rbd_client {
159602adf40SYehuda Sadeh 	struct ceph_client	*client;
160602adf40SYehuda Sadeh 	struct kref		kref;
161602adf40SYehuda Sadeh 	struct list_head	node;
162602adf40SYehuda Sadeh };
163602adf40SYehuda Sadeh 
/* Completion callback types for image and object requests */
struct rbd_img_request;
struct rbd_obj_request;

typedef void (*rbd_img_callback_t)(struct rbd_img_request *);
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);

#define	BAD_WHICH	U32_MAX		/* Good which or bad which, which? */
171bf0d5f50SAlex Elder 
/* What kind of data buffer (if any) an object request carries */
enum obj_request_type {
	OBJ_REQUEST_NODATA,
	OBJ_REQUEST_BIO,
	OBJ_REQUEST_PAGES
};
175bf0d5f50SAlex Elder 
/* Flag values for an object request */
enum obj_req_flags {
	OBJ_REQ_DONE,		/* completion flag: 0 = not done, 1 = done */
	OBJ_REQ_IMG_DATA,	/* object usage: 0 = standalone, 1 = image */
	OBJ_REQ_KNOWN,		/* EXISTS flag valid: 0 = no, 1 = yes */
	OBJ_REQ_EXISTS,		/* target exists: 0 = no, 1 = yes */
};
182926f9b3fSAlex Elder 
183bf0d5f50SAlex Elder struct rbd_obj_request {
184bf0d5f50SAlex Elder 	const char		*object_name;
185bf0d5f50SAlex Elder 	u64			offset;		/* object start byte */
186bf0d5f50SAlex Elder 	u64			length;		/* bytes from offset */
187926f9b3fSAlex Elder 	unsigned long		flags;
188bf0d5f50SAlex Elder 
189c5b5ef6cSAlex Elder 	/*
190c5b5ef6cSAlex Elder 	 * An object request associated with an image will have its
191c5b5ef6cSAlex Elder 	 * img_data flag set; a standalone object request will not.
192c5b5ef6cSAlex Elder 	 *
193c5b5ef6cSAlex Elder 	 * A standalone object request will have which == BAD_WHICH
194c5b5ef6cSAlex Elder 	 * and a null obj_request pointer.
195c5b5ef6cSAlex Elder 	 *
196c5b5ef6cSAlex Elder 	 * An object request initiated in support of a layered image
197c5b5ef6cSAlex Elder 	 * object (to check for its existence before a write) will
198c5b5ef6cSAlex Elder 	 * have which == BAD_WHICH and a non-null obj_request pointer.
199c5b5ef6cSAlex Elder 	 *
200c5b5ef6cSAlex Elder 	 * Finally, an object request for rbd image data will have
201c5b5ef6cSAlex Elder 	 * which != BAD_WHICH, and will have a non-null img_request
202c5b5ef6cSAlex Elder 	 * pointer.  The value of which will be in the range
203c5b5ef6cSAlex Elder 	 * 0..(img_request->obj_request_count-1).
204c5b5ef6cSAlex Elder 	 */
205c5b5ef6cSAlex Elder 	union {
206c5b5ef6cSAlex Elder 		struct rbd_obj_request	*obj_request;	/* STAT op */
207c5b5ef6cSAlex Elder 		struct {
208bf0d5f50SAlex Elder 			struct rbd_img_request	*img_request;
209c5b5ef6cSAlex Elder 			u64			img_offset;
210c5b5ef6cSAlex Elder 			/* links for img_request->obj_requests list */
211c5b5ef6cSAlex Elder 			struct list_head	links;
212c5b5ef6cSAlex Elder 		};
213c5b5ef6cSAlex Elder 	};
214bf0d5f50SAlex Elder 	u32			which;		/* posn image request list */
215bf0d5f50SAlex Elder 
216bf0d5f50SAlex Elder 	enum obj_request_type	type;
217788e2df3SAlex Elder 	union {
218bf0d5f50SAlex Elder 		struct bio	*bio_list;
219788e2df3SAlex Elder 		struct {
220788e2df3SAlex Elder 			struct page	**pages;
221788e2df3SAlex Elder 			u32		page_count;
222788e2df3SAlex Elder 		};
223788e2df3SAlex Elder 	};
2240eefd470SAlex Elder 	struct page		**copyup_pages;
225bf0d5f50SAlex Elder 
226bf0d5f50SAlex Elder 	struct ceph_osd_request	*osd_req;
227bf0d5f50SAlex Elder 
228bf0d5f50SAlex Elder 	u64			xferred;	/* bytes transferred */
229bf0d5f50SAlex Elder 	u64			version;
2301b83bef2SSage Weil 	int			result;
231bf0d5f50SAlex Elder 
232bf0d5f50SAlex Elder 	rbd_obj_callback_t	callback;
233788e2df3SAlex Elder 	struct completion	completion;
234bf0d5f50SAlex Elder 
235bf0d5f50SAlex Elder 	struct kref		kref;
236bf0d5f50SAlex Elder };
237bf0d5f50SAlex Elder 
/* Flag values for an image request */
enum img_req_flags {
	IMG_REQ_WRITE,		/* I/O direction: 0 = read, 1 = write */
	IMG_REQ_CHILD,		/* initiator: 0 = block, 1 = child image */
	IMG_REQ_LAYERED,	/* ENOENT handling: 0 = normal, 1 = layered */
};
2430c425248SAlex Elder 
244bf0d5f50SAlex Elder struct rbd_img_request {
245bf0d5f50SAlex Elder 	struct rbd_device	*rbd_dev;
246bf0d5f50SAlex Elder 	u64			offset;	/* starting image byte offset */
247bf0d5f50SAlex Elder 	u64			length;	/* byte count from offset */
2480c425248SAlex Elder 	unsigned long		flags;
249bf0d5f50SAlex Elder 	union {
250bf0d5f50SAlex Elder 		u64			snap_id;	/* for reads */
2519849e986SAlex Elder 		struct ceph_snap_context *snapc;	/* for writes */
2529849e986SAlex Elder 	};
2539849e986SAlex Elder 	union {
2549849e986SAlex Elder 		struct request		*rq;		/* block request */
2559849e986SAlex Elder 		struct rbd_obj_request	*obj_request;	/* obj req initiator */
256bf0d5f50SAlex Elder 	};
2573d7efd18SAlex Elder 	struct page		**copyup_pages;
258bf0d5f50SAlex Elder 	spinlock_t		completion_lock;/* protects next_completion */
259bf0d5f50SAlex Elder 	u32			next_completion;
260bf0d5f50SAlex Elder 	rbd_img_callback_t	callback;
26155f27e09SAlex Elder 	u64			xferred;/* aggregate bytes transferred */
262a5a337d4SAlex Elder 	int			result;	/* first nonzero obj_request result */
263bf0d5f50SAlex Elder 
264bf0d5f50SAlex Elder 	u32			obj_request_count;
265bf0d5f50SAlex Elder 	struct list_head	obj_requests;	/* rbd_obj_request structs */
266bf0d5f50SAlex Elder 
267bf0d5f50SAlex Elder 	struct kref		kref;
268bf0d5f50SAlex Elder };
269bf0d5f50SAlex Elder 
/* Visit every object request on an image request's obj_requests list */
#define for_each_obj_request(ireq, oreq)				\
	list_for_each_entry(oreq, &(ireq)->obj_requests, links)
/* As above, but continue from the current value of oreq */
#define for_each_obj_request_from(ireq, oreq)				\
	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
/* Removal-safe variant; note that it walks the list in reverse */
#define for_each_obj_request_safe(ireq, oreq, n)			\
	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
276bf0d5f50SAlex Elder 
277dfc5606dSYehuda Sadeh struct rbd_snap {
278dfc5606dSYehuda Sadeh 	const char		*name;
2793591538fSJosh Durgin 	u64			size;
280dfc5606dSYehuda Sadeh 	struct list_head	node;
281dfc5606dSYehuda Sadeh 	u64			id;
28234b13184SAlex Elder 	u64			features;
283dfc5606dSYehuda Sadeh };
284dfc5606dSYehuda Sadeh 
285f84344f3SAlex Elder struct rbd_mapping {
28699c1f08fSAlex Elder 	u64                     size;
28734b13184SAlex Elder 	u64                     features;
288f84344f3SAlex Elder 	bool			read_only;
289f84344f3SAlex Elder };
290f84344f3SAlex Elder 
291602adf40SYehuda Sadeh /*
292602adf40SYehuda Sadeh  * a single device
293602adf40SYehuda Sadeh  */
294602adf40SYehuda Sadeh struct rbd_device {
295de71a297SAlex Elder 	int			dev_id;		/* blkdev unique id */
296602adf40SYehuda Sadeh 
297602adf40SYehuda Sadeh 	int			major;		/* blkdev assigned major */
298602adf40SYehuda Sadeh 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
299602adf40SYehuda Sadeh 
300a30b71b9SAlex Elder 	u32			image_format;	/* Either 1 or 2 */
301602adf40SYehuda Sadeh 	struct rbd_client	*rbd_client;
302602adf40SYehuda Sadeh 
303602adf40SYehuda Sadeh 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
304602adf40SYehuda Sadeh 
305b82d167bSAlex Elder 	spinlock_t		lock;		/* queue, flags, open_count */
306602adf40SYehuda Sadeh 
307602adf40SYehuda Sadeh 	struct rbd_image_header	header;
308b82d167bSAlex Elder 	unsigned long		flags;		/* possibly lock protected */
3090d7dbfceSAlex Elder 	struct rbd_spec		*spec;
310602adf40SYehuda Sadeh 
3110d7dbfceSAlex Elder 	char			*header_name;
312971f839aSAlex Elder 
3130903e875SAlex Elder 	struct ceph_file_layout	layout;
3140903e875SAlex Elder 
31559c2be1eSYehuda Sadeh 	struct ceph_osd_event   *watch_event;
316975241afSAlex Elder 	struct rbd_obj_request	*watch_request;
31759c2be1eSYehuda Sadeh 
31886b00e0dSAlex Elder 	struct rbd_spec		*parent_spec;
31986b00e0dSAlex Elder 	u64			parent_overlap;
3202f82ee54SAlex Elder 	struct rbd_device	*parent;
32186b00e0dSAlex Elder 
322c666601aSJosh Durgin 	/* protects updating the header */
323c666601aSJosh Durgin 	struct rw_semaphore     header_rwsem;
324f84344f3SAlex Elder 
325f84344f3SAlex Elder 	struct rbd_mapping	mapping;
326602adf40SYehuda Sadeh 
327602adf40SYehuda Sadeh 	struct list_head	node;
328dfc5606dSYehuda Sadeh 
329dfc5606dSYehuda Sadeh 	/* list of snapshots */
330dfc5606dSYehuda Sadeh 	struct list_head	snaps;
331dfc5606dSYehuda Sadeh 
332dfc5606dSYehuda Sadeh 	/* sysfs related */
333dfc5606dSYehuda Sadeh 	struct device		dev;
334b82d167bSAlex Elder 	unsigned long		open_count;	/* protected by lock */
335dfc5606dSYehuda Sadeh };
336dfc5606dSYehuda Sadeh 
/*
 * Bit numbers used in rbd_dev->flags.  Where atomicity is needed,
 * access is protected by rbd_dev->lock.
 *
 * At present only the "removing" flag, which is coupled with the
 * "open_count" field, requires atomic access.
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
};
3486d292906SAlex Elder 
349602adf40SYehuda Sadeh static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */
350e124a82fSAlex Elder 
351602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list);    /* devices */
352e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock);
353e124a82fSAlex Elder 
354602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list);		/* clients */
355432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock);
356602adf40SYehuda Sadeh 
3573d7efd18SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request);
3583d7efd18SAlex Elder 
359304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
360304f6808SAlex Elder 
361200a6a8bSAlex Elder static void rbd_dev_device_release(struct device *dev);
3626087b51bSAlex Elder static void rbd_snap_destroy(struct rbd_snap *snap);
363dfc5606dSYehuda Sadeh 
364f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf,
365f0f8cef5SAlex Elder 		       size_t count);
366f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
367f0f8cef5SAlex Elder 			  size_t count);
36871f293e2SAlex Elder static int rbd_dev_image_probe(struct rbd_device *rbd_dev);
369f0f8cef5SAlex Elder 
370f0f8cef5SAlex Elder static struct bus_attribute rbd_bus_attrs[] = {
371f0f8cef5SAlex Elder 	__ATTR(add, S_IWUSR, NULL, rbd_add),
372f0f8cef5SAlex Elder 	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
373f0f8cef5SAlex Elder 	__ATTR_NULL
374f0f8cef5SAlex Elder };
375f0f8cef5SAlex Elder 
376f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = {
377f0f8cef5SAlex Elder 	.name		= "rbd",
378f0f8cef5SAlex Elder 	.bus_attrs	= rbd_bus_attrs,
379f0f8cef5SAlex Elder };
380f0f8cef5SAlex Elder 
381f0f8cef5SAlex Elder static void rbd_root_dev_release(struct device *dev)
382f0f8cef5SAlex Elder {
383f0f8cef5SAlex Elder }
384f0f8cef5SAlex Elder 
385f0f8cef5SAlex Elder static struct device rbd_root_dev = {
386f0f8cef5SAlex Elder 	.init_name =    "rbd",
387f0f8cef5SAlex Elder 	.release =      rbd_root_dev_release,
388f0f8cef5SAlex Elder };
389f0f8cef5SAlex Elder 
39006ecc6cbSAlex Elder static __printf(2, 3)
39106ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
39206ecc6cbSAlex Elder {
39306ecc6cbSAlex Elder 	struct va_format vaf;
39406ecc6cbSAlex Elder 	va_list args;
39506ecc6cbSAlex Elder 
39606ecc6cbSAlex Elder 	va_start(args, fmt);
39706ecc6cbSAlex Elder 	vaf.fmt = fmt;
39806ecc6cbSAlex Elder 	vaf.va = &args;
39906ecc6cbSAlex Elder 
40006ecc6cbSAlex Elder 	if (!rbd_dev)
40106ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
40206ecc6cbSAlex Elder 	else if (rbd_dev->disk)
40306ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %s: %pV\n",
40406ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
40506ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_name)
40606ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: image %s: %pV\n",
40706ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
40806ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_id)
40906ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: id %s: %pV\n",
41006ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
41106ecc6cbSAlex Elder 	else	/* punt */
41206ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
41306ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev, &vaf);
41406ecc6cbSAlex Elder 	va_end(args);
41506ecc6cbSAlex Elder }
41606ecc6cbSAlex Elder 
/*
 * rbd_assert(expr) - report and BUG() if expr is false.
 *
 * Active only when RBD_DEBUG is defined; otherwise it expands to a
 * no-op and expr is not evaluated.
 *
 * The active form is wrapped in do { } while (0) so that it behaves
 * as a single statement.  A bare "if (...) { }" expansion breaks when
 * the macro is used as the body of an unbraced if that has an else:
 * the trailing ';' ends the statement early and the else is orphaned.
 * Every use must therefore be followed by a semicolon.
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
429dfc5606dSYehuda Sadeh 
430b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
43105a46afdSAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
43205a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
4338b3e1a56SAlex Elder 
434117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
435117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
43659c2be1eSYehuda Sadeh 
437602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
438602adf40SYehuda Sadeh {
439f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
440b82d167bSAlex Elder 	bool removing = false;
441602adf40SYehuda Sadeh 
442f84344f3SAlex Elder 	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
443602adf40SYehuda Sadeh 		return -EROFS;
444602adf40SYehuda Sadeh 
445a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
446b82d167bSAlex Elder 	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
447b82d167bSAlex Elder 		removing = true;
448b82d167bSAlex Elder 	else
449b82d167bSAlex Elder 		rbd_dev->open_count++;
450a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
451b82d167bSAlex Elder 	if (removing)
452b82d167bSAlex Elder 		return -ENOENT;
453b82d167bSAlex Elder 
45442382b70SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
455c3e946ceSAlex Elder 	(void) get_device(&rbd_dev->dev);
456f84344f3SAlex Elder 	set_device_ro(bdev, rbd_dev->mapping.read_only);
45742382b70SAlex Elder 	mutex_unlock(&ctl_mutex);
458340c7a2bSAlex Elder 
459602adf40SYehuda Sadeh 	return 0;
460602adf40SYehuda Sadeh }
461602adf40SYehuda Sadeh 
462dfc5606dSYehuda Sadeh static int rbd_release(struct gendisk *disk, fmode_t mode)
463dfc5606dSYehuda Sadeh {
464dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
465b82d167bSAlex Elder 	unsigned long open_count_before;
466b82d167bSAlex Elder 
467a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
468b82d167bSAlex Elder 	open_count_before = rbd_dev->open_count--;
469a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
470b82d167bSAlex Elder 	rbd_assert(open_count_before > 0);
471dfc5606dSYehuda Sadeh 
47242382b70SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
473c3e946ceSAlex Elder 	put_device(&rbd_dev->dev);
47442382b70SAlex Elder 	mutex_unlock(&ctl_mutex);
475dfc5606dSYehuda Sadeh 
476dfc5606dSYehuda Sadeh 	return 0;
477dfc5606dSYehuda Sadeh }
478dfc5606dSYehuda Sadeh 
479602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = {
480602adf40SYehuda Sadeh 	.owner			= THIS_MODULE,
481602adf40SYehuda Sadeh 	.open			= rbd_open,
482dfc5606dSYehuda Sadeh 	.release		= rbd_release,
483602adf40SYehuda Sadeh };
484602adf40SYehuda Sadeh 
485602adf40SYehuda Sadeh /*
486602adf40SYehuda Sadeh  * Initialize an rbd client instance.
48743ae4701SAlex Elder  * We own *ceph_opts.
488602adf40SYehuda Sadeh  */
489f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
490602adf40SYehuda Sadeh {
491602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
492602adf40SYehuda Sadeh 	int ret = -ENOMEM;
493602adf40SYehuda Sadeh 
49437206ee5SAlex Elder 	dout("%s:\n", __func__);
495602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
496602adf40SYehuda Sadeh 	if (!rbdc)
497602adf40SYehuda Sadeh 		goto out_opt;
498602adf40SYehuda Sadeh 
499602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
500602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
501602adf40SYehuda Sadeh 
502bc534d86SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
503bc534d86SAlex Elder 
50443ae4701SAlex Elder 	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
505602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
506bc534d86SAlex Elder 		goto out_mutex;
50743ae4701SAlex Elder 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
508602adf40SYehuda Sadeh 
509602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
510602adf40SYehuda Sadeh 	if (ret < 0)
511602adf40SYehuda Sadeh 		goto out_err;
512602adf40SYehuda Sadeh 
513432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
514602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
515432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
516602adf40SYehuda Sadeh 
517bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
51837206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
519bc534d86SAlex Elder 
520602adf40SYehuda Sadeh 	return rbdc;
521602adf40SYehuda Sadeh 
522602adf40SYehuda Sadeh out_err:
523602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
524bc534d86SAlex Elder out_mutex:
525bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
526602adf40SYehuda Sadeh 	kfree(rbdc);
527602adf40SYehuda Sadeh out_opt:
52843ae4701SAlex Elder 	if (ceph_opts)
52943ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
53037206ee5SAlex Elder 	dout("%s: error %d\n", __func__, ret);
53137206ee5SAlex Elder 
53228f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
533602adf40SYehuda Sadeh }
534602adf40SYehuda Sadeh 
5352f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
5362f82ee54SAlex Elder {
5372f82ee54SAlex Elder 	kref_get(&rbdc->kref);
5382f82ee54SAlex Elder 
5392f82ee54SAlex Elder 	return rbdc;
5402f82ee54SAlex Elder }
5412f82ee54SAlex Elder 
542602adf40SYehuda Sadeh /*
5431f7ba331SAlex Elder  * Find a ceph client with specific addr and configuration.  If
5441f7ba331SAlex Elder  * found, bump its reference count.
545602adf40SYehuda Sadeh  */
5461f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
547602adf40SYehuda Sadeh {
548602adf40SYehuda Sadeh 	struct rbd_client *client_node;
5491f7ba331SAlex Elder 	bool found = false;
550602adf40SYehuda Sadeh 
55143ae4701SAlex Elder 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
552602adf40SYehuda Sadeh 		return NULL;
553602adf40SYehuda Sadeh 
5541f7ba331SAlex Elder 	spin_lock(&rbd_client_list_lock);
5551f7ba331SAlex Elder 	list_for_each_entry(client_node, &rbd_client_list, node) {
5561f7ba331SAlex Elder 		if (!ceph_compare_options(ceph_opts, client_node->client)) {
5572f82ee54SAlex Elder 			__rbd_get_client(client_node);
5582f82ee54SAlex Elder 
5591f7ba331SAlex Elder 			found = true;
5601f7ba331SAlex Elder 			break;
5611f7ba331SAlex Elder 		}
5621f7ba331SAlex Elder 	}
5631f7ba331SAlex Elder 	spin_unlock(&rbd_client_list_lock);
5641f7ba331SAlex Elder 
5651f7ba331SAlex Elder 	return found ? client_node : NULL;
566602adf40SYehuda Sadeh }
567602adf40SYehuda Sadeh 
/*
 * Mount option tokens.  The Opt_last_* markers separate the tokens
 * by argument type: int, then string, then Boolean.
 */
enum {
	/* int args go above this marker */
	Opt_last_int,
	/* string args go above this marker */
	Opt_last_string,
	Opt_read_only,
	Opt_read_write,
	/* Boolean args go above this marker */
	Opt_last_bool,
};
58159c2be1eSYehuda Sadeh 
58243ae4701SAlex Elder static match_table_t rbd_opts_tokens = {
58359c2be1eSYehuda Sadeh 	/* int args above */
58459c2be1eSYehuda Sadeh 	/* string args above */
585be466c1cSAlex Elder 	{Opt_read_only, "read_only"},
586cc0538b6SAlex Elder 	{Opt_read_only, "ro"},		/* Alternate spelling */
587cc0538b6SAlex Elder 	{Opt_read_write, "read_write"},
588cc0538b6SAlex Elder 	{Opt_read_write, "rw"},		/* Alternate spelling */
589cc0538b6SAlex Elder 	/* Boolean args above */
59059c2be1eSYehuda Sadeh 	{-1, NULL}
59159c2be1eSYehuda Sadeh };
59259c2be1eSYehuda Sadeh 
/* rbd-specific options filled in by parse_rbd_opts_token() */
struct rbd_options {
	bool	read_only;
};

#define RBD_READ_ONLY_DEFAULT	false
59898571b5aSAlex Elder 
59959c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
60059c2be1eSYehuda Sadeh {
60143ae4701SAlex Elder 	struct rbd_options *rbd_opts = private;
60259c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
60359c2be1eSYehuda Sadeh 	int token, intval, ret;
60459c2be1eSYehuda Sadeh 
60543ae4701SAlex Elder 	token = match_token(c, rbd_opts_tokens, argstr);
60659c2be1eSYehuda Sadeh 	if (token < 0)
60759c2be1eSYehuda Sadeh 		return -EINVAL;
60859c2be1eSYehuda Sadeh 
60959c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
61059c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
61159c2be1eSYehuda Sadeh 		if (ret < 0) {
61259c2be1eSYehuda Sadeh 			pr_err("bad mount option arg (not int) "
61359c2be1eSYehuda Sadeh 			       "at '%s'\n", c);
61459c2be1eSYehuda Sadeh 			return ret;
61559c2be1eSYehuda Sadeh 		}
61659c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
61759c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
61859c2be1eSYehuda Sadeh 		dout("got string token %d val %s\n", token,
61959c2be1eSYehuda Sadeh 		     argstr[0].from);
620cc0538b6SAlex Elder 	} else if (token > Opt_last_string && token < Opt_last_bool) {
621cc0538b6SAlex Elder 		dout("got Boolean token %d\n", token);
62259c2be1eSYehuda Sadeh 	} else {
62359c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
62459c2be1eSYehuda Sadeh 	}
62559c2be1eSYehuda Sadeh 
62659c2be1eSYehuda Sadeh 	switch (token) {
627cc0538b6SAlex Elder 	case Opt_read_only:
628cc0538b6SAlex Elder 		rbd_opts->read_only = true;
629cc0538b6SAlex Elder 		break;
630cc0538b6SAlex Elder 	case Opt_read_write:
631cc0538b6SAlex Elder 		rbd_opts->read_only = false;
632cc0538b6SAlex Elder 		break;
63359c2be1eSYehuda Sadeh 	default:
634aafb230eSAlex Elder 		rbd_assert(false);
635aafb230eSAlex Elder 		break;
63659c2be1eSYehuda Sadeh 	}
63759c2be1eSYehuda Sadeh 	return 0;
63859c2be1eSYehuda Sadeh }
63959c2be1eSYehuda Sadeh 
/*
 * Return a ceph client matching the given address and configuration,
 * creating one if none exists yet.  In both cases the caller gives up
 * ceph_opts: they are destroyed here when an existing client is
 * reused, and handed to rbd_client_create() otherwise.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc = rbd_client_find(ceph_opts);

	if (!rbdc)
		return rbd_client_create(ceph_opts);

	/* Reusing an existing client, so these options are not needed */
	ceph_destroy_options(ceph_opts);

	return rbdc;
}
656602adf40SYehuda Sadeh 
657602adf40SYehuda Sadeh /*
658602adf40SYehuda Sadeh  * Destroy ceph client
659d23a4b3fSAlex Elder  *
660432b8587SAlex Elder  * Caller must hold rbd_client_list_lock.
661602adf40SYehuda Sadeh  */
662602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
663602adf40SYehuda Sadeh {
664602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
665602adf40SYehuda Sadeh 
66637206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
667cd9d9f5dSAlex Elder 	spin_lock(&rbd_client_list_lock);
668602adf40SYehuda Sadeh 	list_del(&rbdc->node);
669cd9d9f5dSAlex Elder 	spin_unlock(&rbd_client_list_lock);
670602adf40SYehuda Sadeh 
671602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
672602adf40SYehuda Sadeh 	kfree(rbdc);
673602adf40SYehuda Sadeh }
674602adf40SYehuda Sadeh 
675602adf40SYehuda Sadeh /*
676602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
677602adf40SYehuda Sadeh  * it.
678602adf40SYehuda Sadeh  */
6799d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc)
680602adf40SYehuda Sadeh {
681c53d5893SAlex Elder 	if (rbdc)
6829d3997fdSAlex Elder 		kref_put(&rbdc->kref, rbd_client_release);
683602adf40SYehuda Sadeh }
684602adf40SYehuda Sadeh 
685a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format)
686a30b71b9SAlex Elder {
687a30b71b9SAlex Elder 	return image_format == 1 || image_format == 2;
688a30b71b9SAlex Elder }
689a30b71b9SAlex Elder 
6908e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
6918e94af8eSAlex Elder {
692103a150fSAlex Elder 	size_t size;
693103a150fSAlex Elder 	u32 snap_count;
694103a150fSAlex Elder 
695103a150fSAlex Elder 	/* The header has to start with the magic rbd header text */
696103a150fSAlex Elder 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
697103a150fSAlex Elder 		return false;
698103a150fSAlex Elder 
699db2388b6SAlex Elder 	/* The bio layer requires at least sector-sized I/O */
700db2388b6SAlex Elder 
701db2388b6SAlex Elder 	if (ondisk->options.order < SECTOR_SHIFT)
702db2388b6SAlex Elder 		return false;
703db2388b6SAlex Elder 
704db2388b6SAlex Elder 	/* If we use u64 in a few spots we may be able to loosen this */
705db2388b6SAlex Elder 
706db2388b6SAlex Elder 	if (ondisk->options.order > 8 * sizeof (int) - 1)
707db2388b6SAlex Elder 		return false;
708db2388b6SAlex Elder 
709103a150fSAlex Elder 	/*
710103a150fSAlex Elder 	 * The size of a snapshot header has to fit in a size_t, and
711103a150fSAlex Elder 	 * that limits the number of snapshots.
712103a150fSAlex Elder 	 */
713103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
714103a150fSAlex Elder 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
715103a150fSAlex Elder 	if (snap_count > size / sizeof (__le64))
716103a150fSAlex Elder 		return false;
717103a150fSAlex Elder 
718103a150fSAlex Elder 	/*
719103a150fSAlex Elder 	 * Not only that, but the size of the entire the snapshot
720103a150fSAlex Elder 	 * header must also be representable in a size_t.
721103a150fSAlex Elder 	 */
722103a150fSAlex Elder 	size -= snap_count * sizeof (__le64);
723103a150fSAlex Elder 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
724103a150fSAlex Elder 		return false;
725103a150fSAlex Elder 
726103a150fSAlex Elder 	return true;
7278e94af8eSAlex Elder }
7288e94af8eSAlex Elder 
729602adf40SYehuda Sadeh /*
730602adf40SYehuda Sadeh  * Create a new header structure, translate header format from the on-disk
731602adf40SYehuda Sadeh  * header.
732602adf40SYehuda Sadeh  */
733602adf40SYehuda Sadeh static int rbd_header_from_disk(struct rbd_image_header *header,
7344156d998SAlex Elder 				 struct rbd_image_header_ondisk *ondisk)
735602adf40SYehuda Sadeh {
736ccece235SAlex Elder 	u32 snap_count;
73758c17b0eSAlex Elder 	size_t len;
738d2bb24e5SAlex Elder 	size_t size;
739621901d6SAlex Elder 	u32 i;
740602adf40SYehuda Sadeh 
7416a52325fSAlex Elder 	memset(header, 0, sizeof (*header));
7426a52325fSAlex Elder 
743103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
744103a150fSAlex Elder 
74558c17b0eSAlex Elder 	len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
74658c17b0eSAlex Elder 	header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
7476a52325fSAlex Elder 	if (!header->object_prefix)
748602adf40SYehuda Sadeh 		return -ENOMEM;
74958c17b0eSAlex Elder 	memcpy(header->object_prefix, ondisk->object_prefix, len);
75058c17b0eSAlex Elder 	header->object_prefix[len] = '\0';
75100f1f36fSAlex Elder 
752602adf40SYehuda Sadeh 	if (snap_count) {
753f785cc1dSAlex Elder 		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
754f785cc1dSAlex Elder 
755621901d6SAlex Elder 		/* Save a copy of the snapshot names */
756621901d6SAlex Elder 
757f785cc1dSAlex Elder 		if (snap_names_len > (u64) SIZE_MAX)
758f785cc1dSAlex Elder 			return -EIO;
759f785cc1dSAlex Elder 		header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
760602adf40SYehuda Sadeh 		if (!header->snap_names)
7616a52325fSAlex Elder 			goto out_err;
762f785cc1dSAlex Elder 		/*
763f785cc1dSAlex Elder 		 * Note that rbd_dev_v1_header_read() guarantees
764f785cc1dSAlex Elder 		 * the ondisk buffer we're working with has
765f785cc1dSAlex Elder 		 * snap_names_len bytes beyond the end of the
766f785cc1dSAlex Elder 		 * snapshot id array, this memcpy() is safe.
767f785cc1dSAlex Elder 		 */
768f785cc1dSAlex Elder 		memcpy(header->snap_names, &ondisk->snaps[snap_count],
769f785cc1dSAlex Elder 			snap_names_len);
7706a52325fSAlex Elder 
771621901d6SAlex Elder 		/* Record each snapshot's size */
772621901d6SAlex Elder 
773d2bb24e5SAlex Elder 		size = snap_count * sizeof (*header->snap_sizes);
774d2bb24e5SAlex Elder 		header->snap_sizes = kmalloc(size, GFP_KERNEL);
775602adf40SYehuda Sadeh 		if (!header->snap_sizes)
7766a52325fSAlex Elder 			goto out_err;
777621901d6SAlex Elder 		for (i = 0; i < snap_count; i++)
778621901d6SAlex Elder 			header->snap_sizes[i] =
779621901d6SAlex Elder 				le64_to_cpu(ondisk->snaps[i].image_size);
780602adf40SYehuda Sadeh 	} else {
781602adf40SYehuda Sadeh 		header->snap_names = NULL;
782602adf40SYehuda Sadeh 		header->snap_sizes = NULL;
783602adf40SYehuda Sadeh 	}
784849b4260SAlex Elder 
78534b13184SAlex Elder 	header->features = 0;	/* No features support in v1 images */
786602adf40SYehuda Sadeh 	header->obj_order = ondisk->options.order;
787602adf40SYehuda Sadeh 	header->crypt_type = ondisk->options.crypt_type;
788602adf40SYehuda Sadeh 	header->comp_type = ondisk->options.comp_type;
7896a52325fSAlex Elder 
790621901d6SAlex Elder 	/* Allocate and fill in the snapshot context */
791621901d6SAlex Elder 
792f84344f3SAlex Elder 	header->image_size = le64_to_cpu(ondisk->image_size);
793468521c1SAlex Elder 
794812164f8SAlex Elder 	header->snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
7956a52325fSAlex Elder 	if (!header->snapc)
7966a52325fSAlex Elder 		goto out_err;
797505cbb9bSAlex Elder 	header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
798621901d6SAlex Elder 	for (i = 0; i < snap_count; i++)
799468521c1SAlex Elder 		header->snapc->snaps[i] = le64_to_cpu(ondisk->snaps[i].id);
800602adf40SYehuda Sadeh 
801602adf40SYehuda Sadeh 	return 0;
802602adf40SYehuda Sadeh 
8036a52325fSAlex Elder out_err:
804849b4260SAlex Elder 	kfree(header->snap_sizes);
805ccece235SAlex Elder 	header->snap_sizes = NULL;
806602adf40SYehuda Sadeh 	kfree(header->snap_names);
807ccece235SAlex Elder 	header->snap_names = NULL;
8086a52325fSAlex Elder 	kfree(header->object_prefix);
8096a52325fSAlex Elder 	header->object_prefix = NULL;
810ccece235SAlex Elder 
81100f1f36fSAlex Elder 	return -ENOMEM;
812602adf40SYehuda Sadeh }
813602adf40SYehuda Sadeh 
8149e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
8159e15b77dSAlex Elder {
8169e15b77dSAlex Elder 	struct rbd_snap *snap;
8179e15b77dSAlex Elder 
8189e15b77dSAlex Elder 	if (snap_id == CEPH_NOSNAP)
8199e15b77dSAlex Elder 		return RBD_SNAP_HEAD_NAME;
8209e15b77dSAlex Elder 
8219e15b77dSAlex Elder 	list_for_each_entry(snap, &rbd_dev->snaps, node)
8229e15b77dSAlex Elder 		if (snap_id == snap->id)
8239e15b77dSAlex Elder 			return snap->name;
8249e15b77dSAlex Elder 
8259e15b77dSAlex Elder 	return NULL;
8269e15b77dSAlex Elder }
8279e15b77dSAlex Elder 
8288b0241f8SAlex Elder static struct rbd_snap *snap_by_name(struct rbd_device *rbd_dev,
8298b0241f8SAlex Elder 					const char *snap_name)
830602adf40SYehuda Sadeh {
831e86924a8SAlex Elder 	struct rbd_snap *snap;
83200f1f36fSAlex Elder 
8338b0241f8SAlex Elder 	list_for_each_entry(snap, &rbd_dev->snaps, node)
8348b0241f8SAlex Elder 		if (!strcmp(snap_name, snap->name))
8358b0241f8SAlex Elder 			return snap;
83600f1f36fSAlex Elder 
8378b0241f8SAlex Elder 	return NULL;
83800f1f36fSAlex Elder }
839602adf40SYehuda Sadeh 
840d1cf5788SAlex Elder static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
841602adf40SYehuda Sadeh {
8420d7dbfceSAlex Elder 	if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
843cc9d734cSJosh Durgin 		    sizeof (RBD_SNAP_HEAD_NAME))) {
84499c1f08fSAlex Elder 		rbd_dev->mapping.size = rbd_dev->header.image_size;
84534b13184SAlex Elder 		rbd_dev->mapping.features = rbd_dev->header.features;
846602adf40SYehuda Sadeh 	} else {
8478b0241f8SAlex Elder 		struct rbd_snap *snap;
8488b0241f8SAlex Elder 
8498b0241f8SAlex Elder 		snap = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
8508b0241f8SAlex Elder 		if (!snap)
8518b0241f8SAlex Elder 			return -ENOENT;
8528b0241f8SAlex Elder 		rbd_dev->mapping.size = snap->size;
8538b0241f8SAlex Elder 		rbd_dev->mapping.features = snap->features;
854f84344f3SAlex Elder 		rbd_dev->mapping.read_only = true;
855602adf40SYehuda Sadeh 	}
8566d292906SAlex Elder 
8578b0241f8SAlex Elder 	return 0;
858602adf40SYehuda Sadeh }
859602adf40SYehuda Sadeh 
860d1cf5788SAlex Elder static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
861d1cf5788SAlex Elder {
862d1cf5788SAlex Elder 	rbd_dev->mapping.size = 0;
863d1cf5788SAlex Elder 	rbd_dev->mapping.features = 0;
864d1cf5788SAlex Elder 	rbd_dev->mapping.read_only = true;
865d1cf5788SAlex Elder }
866d1cf5788SAlex Elder 
867200a6a8bSAlex Elder static void rbd_dev_clear_mapping(struct rbd_device *rbd_dev)
868200a6a8bSAlex Elder {
869200a6a8bSAlex Elder 	rbd_dev->mapping.size = 0;
870200a6a8bSAlex Elder 	rbd_dev->mapping.features = 0;
871200a6a8bSAlex Elder 	rbd_dev->mapping.read_only = true;
872200a6a8bSAlex Elder }
873200a6a8bSAlex Elder 
87498571b5aSAlex Elder static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
875602adf40SYehuda Sadeh {
87665ccfe21SAlex Elder 	char *name;
87765ccfe21SAlex Elder 	u64 segment;
87865ccfe21SAlex Elder 	int ret;
879602adf40SYehuda Sadeh 
8802fd82b9eSAlex Elder 	name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
88165ccfe21SAlex Elder 	if (!name)
88265ccfe21SAlex Elder 		return NULL;
88365ccfe21SAlex Elder 	segment = offset >> rbd_dev->header.obj_order;
8842fd82b9eSAlex Elder 	ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
88565ccfe21SAlex Elder 			rbd_dev->header.object_prefix, segment);
8862fd82b9eSAlex Elder 	if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
88765ccfe21SAlex Elder 		pr_err("error formatting segment name for #%llu (%d)\n",
88865ccfe21SAlex Elder 			segment, ret);
88965ccfe21SAlex Elder 		kfree(name);
89065ccfe21SAlex Elder 		name = NULL;
89165ccfe21SAlex Elder 	}
892602adf40SYehuda Sadeh 
89365ccfe21SAlex Elder 	return name;
89465ccfe21SAlex Elder }
895602adf40SYehuda Sadeh 
89665ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
89765ccfe21SAlex Elder {
89865ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
899602adf40SYehuda Sadeh 
90065ccfe21SAlex Elder 	return offset & (segment_size - 1);
90165ccfe21SAlex Elder }
90265ccfe21SAlex Elder 
90365ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev,
90465ccfe21SAlex Elder 				u64 offset, u64 length)
90565ccfe21SAlex Elder {
90665ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
90765ccfe21SAlex Elder 
90865ccfe21SAlex Elder 	offset &= segment_size - 1;
90965ccfe21SAlex Elder 
910aafb230eSAlex Elder 	rbd_assert(length <= U64_MAX - offset);
91165ccfe21SAlex Elder 	if (offset + length > segment_size)
91265ccfe21SAlex Elder 		length = segment_size - offset;
91365ccfe21SAlex Elder 
91465ccfe21SAlex Elder 	return length;
915602adf40SYehuda Sadeh }
916602adf40SYehuda Sadeh 
917602adf40SYehuda Sadeh /*
918029bcbd8SJosh Durgin  * returns the size of an object in the image
919029bcbd8SJosh Durgin  */
920029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header)
921029bcbd8SJosh Durgin {
922029bcbd8SJosh Durgin 	return 1 << header->obj_order;
923029bcbd8SJosh Durgin }
924029bcbd8SJosh Durgin 
925029bcbd8SJosh Durgin /*
926602adf40SYehuda Sadeh  * bio helpers
927602adf40SYehuda Sadeh  */
928602adf40SYehuda Sadeh 
929602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain)
930602adf40SYehuda Sadeh {
931602adf40SYehuda Sadeh 	struct bio *tmp;
932602adf40SYehuda Sadeh 
933602adf40SYehuda Sadeh 	while (chain) {
934602adf40SYehuda Sadeh 		tmp = chain;
935602adf40SYehuda Sadeh 		chain = chain->bi_next;
936602adf40SYehuda Sadeh 		bio_put(tmp);
937602adf40SYehuda Sadeh 	}
938602adf40SYehuda Sadeh }
939602adf40SYehuda Sadeh 
940602adf40SYehuda Sadeh /*
941602adf40SYehuda Sadeh  * zeros a bio chain, starting at specific offset
942602adf40SYehuda Sadeh  */
943602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs)
944602adf40SYehuda Sadeh {
945602adf40SYehuda Sadeh 	struct bio_vec *bv;
946602adf40SYehuda Sadeh 	unsigned long flags;
947602adf40SYehuda Sadeh 	void *buf;
948602adf40SYehuda Sadeh 	int i;
949602adf40SYehuda Sadeh 	int pos = 0;
950602adf40SYehuda Sadeh 
951602adf40SYehuda Sadeh 	while (chain) {
952602adf40SYehuda Sadeh 		bio_for_each_segment(bv, chain, i) {
953602adf40SYehuda Sadeh 			if (pos + bv->bv_len > start_ofs) {
954602adf40SYehuda Sadeh 				int remainder = max(start_ofs - pos, 0);
955602adf40SYehuda Sadeh 				buf = bvec_kmap_irq(bv, &flags);
956602adf40SYehuda Sadeh 				memset(buf + remainder, 0,
957602adf40SYehuda Sadeh 				       bv->bv_len - remainder);
95885b5aaa6SDan Carpenter 				bvec_kunmap_irq(buf, &flags);
959602adf40SYehuda Sadeh 			}
960602adf40SYehuda Sadeh 			pos += bv->bv_len;
961602adf40SYehuda Sadeh 		}
962602adf40SYehuda Sadeh 
963602adf40SYehuda Sadeh 		chain = chain->bi_next;
964602adf40SYehuda Sadeh 	}
965602adf40SYehuda Sadeh }
966602adf40SYehuda Sadeh 
967602adf40SYehuda Sadeh /*
968b9434c5bSAlex Elder  * similar to zero_bio_chain(), zeros data defined by a page array,
969b9434c5bSAlex Elder  * starting at the given byte offset from the start of the array and
970b9434c5bSAlex Elder  * continuing up to the given end offset.  The pages array is
971b9434c5bSAlex Elder  * assumed to be big enough to hold all bytes up to the end.
972b9434c5bSAlex Elder  */
973b9434c5bSAlex Elder static void zero_pages(struct page **pages, u64 offset, u64 end)
974b9434c5bSAlex Elder {
975b9434c5bSAlex Elder 	struct page **page = &pages[offset >> PAGE_SHIFT];
976b9434c5bSAlex Elder 
977b9434c5bSAlex Elder 	rbd_assert(end > offset);
978b9434c5bSAlex Elder 	rbd_assert(end - offset <= (u64)SIZE_MAX);
979b9434c5bSAlex Elder 	while (offset < end) {
980b9434c5bSAlex Elder 		size_t page_offset;
981b9434c5bSAlex Elder 		size_t length;
982b9434c5bSAlex Elder 		unsigned long flags;
983b9434c5bSAlex Elder 		void *kaddr;
984b9434c5bSAlex Elder 
985b9434c5bSAlex Elder 		page_offset = (size_t)(offset & ~PAGE_MASK);
986b9434c5bSAlex Elder 		length = min(PAGE_SIZE - page_offset, (size_t)(end - offset));
987b9434c5bSAlex Elder 		local_irq_save(flags);
988b9434c5bSAlex Elder 		kaddr = kmap_atomic(*page);
989b9434c5bSAlex Elder 		memset(kaddr + page_offset, 0, length);
990b9434c5bSAlex Elder 		kunmap_atomic(kaddr);
991b9434c5bSAlex Elder 		local_irq_restore(flags);
992b9434c5bSAlex Elder 
993b9434c5bSAlex Elder 		offset += length;
994b9434c5bSAlex Elder 		page++;
995b9434c5bSAlex Elder 	}
996b9434c5bSAlex Elder }
997b9434c5bSAlex Elder 
998b9434c5bSAlex Elder /*
999f7760dadSAlex Elder  * Clone a portion of a bio, starting at the given byte offset
1000f7760dadSAlex Elder  * and continuing for the number of bytes indicated.
1001602adf40SYehuda Sadeh  */
1002f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src,
1003f7760dadSAlex Elder 					unsigned int offset,
1004f7760dadSAlex Elder 					unsigned int len,
1005f7760dadSAlex Elder 					gfp_t gfpmask)
1006602adf40SYehuda Sadeh {
1007f7760dadSAlex Elder 	struct bio_vec *bv;
1008f7760dadSAlex Elder 	unsigned int resid;
1009f7760dadSAlex Elder 	unsigned short idx;
1010f7760dadSAlex Elder 	unsigned int voff;
1011f7760dadSAlex Elder 	unsigned short end_idx;
1012f7760dadSAlex Elder 	unsigned short vcnt;
1013f7760dadSAlex Elder 	struct bio *bio;
1014602adf40SYehuda Sadeh 
1015f7760dadSAlex Elder 	/* Handle the easy case for the caller */
1016f7760dadSAlex Elder 
1017f7760dadSAlex Elder 	if (!offset && len == bio_src->bi_size)
1018f7760dadSAlex Elder 		return bio_clone(bio_src, gfpmask);
1019f7760dadSAlex Elder 
1020f7760dadSAlex Elder 	if (WARN_ON_ONCE(!len))
1021f7760dadSAlex Elder 		return NULL;
1022f7760dadSAlex Elder 	if (WARN_ON_ONCE(len > bio_src->bi_size))
1023f7760dadSAlex Elder 		return NULL;
1024f7760dadSAlex Elder 	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
1025f7760dadSAlex Elder 		return NULL;
1026f7760dadSAlex Elder 
1027f7760dadSAlex Elder 	/* Find first affected segment... */
1028f7760dadSAlex Elder 
1029f7760dadSAlex Elder 	resid = offset;
1030f7760dadSAlex Elder 	__bio_for_each_segment(bv, bio_src, idx, 0) {
1031f7760dadSAlex Elder 		if (resid < bv->bv_len)
1032f7760dadSAlex Elder 			break;
1033f7760dadSAlex Elder 		resid -= bv->bv_len;
1034602adf40SYehuda Sadeh 	}
1035f7760dadSAlex Elder 	voff = resid;
1036602adf40SYehuda Sadeh 
1037f7760dadSAlex Elder 	/* ...and the last affected segment */
1038542582fcSAlex Elder 
1039f7760dadSAlex Elder 	resid += len;
1040f7760dadSAlex Elder 	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
1041f7760dadSAlex Elder 		if (resid <= bv->bv_len)
1042f7760dadSAlex Elder 			break;
1043f7760dadSAlex Elder 		resid -= bv->bv_len;
1044f7760dadSAlex Elder 	}
1045f7760dadSAlex Elder 	vcnt = end_idx - idx + 1;
1046602adf40SYehuda Sadeh 
1047f7760dadSAlex Elder 	/* Build the clone */
1048f7760dadSAlex Elder 
1049f7760dadSAlex Elder 	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
1050f7760dadSAlex Elder 	if (!bio)
1051f7760dadSAlex Elder 		return NULL;	/* ENOMEM */
1052f7760dadSAlex Elder 
1053f7760dadSAlex Elder 	bio->bi_bdev = bio_src->bi_bdev;
1054f7760dadSAlex Elder 	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
1055f7760dadSAlex Elder 	bio->bi_rw = bio_src->bi_rw;
1056f7760dadSAlex Elder 	bio->bi_flags |= 1 << BIO_CLONED;
1057602adf40SYehuda Sadeh 
1058602adf40SYehuda Sadeh 	/*
1059f7760dadSAlex Elder 	 * Copy over our part of the bio_vec, then update the first
1060f7760dadSAlex Elder 	 * and last (or only) entries.
1061602adf40SYehuda Sadeh 	 */
1062f7760dadSAlex Elder 	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
1063f7760dadSAlex Elder 			vcnt * sizeof (struct bio_vec));
1064f7760dadSAlex Elder 	bio->bi_io_vec[0].bv_offset += voff;
1065f7760dadSAlex Elder 	if (vcnt > 1) {
1066f7760dadSAlex Elder 		bio->bi_io_vec[0].bv_len -= voff;
1067f7760dadSAlex Elder 		bio->bi_io_vec[vcnt - 1].bv_len = resid;
1068602adf40SYehuda Sadeh 	} else {
1069f7760dadSAlex Elder 		bio->bi_io_vec[0].bv_len = len;
1070602adf40SYehuda Sadeh 	}
1071602adf40SYehuda Sadeh 
1072f7760dadSAlex Elder 	bio->bi_vcnt = vcnt;
1073f7760dadSAlex Elder 	bio->bi_size = len;
1074f7760dadSAlex Elder 	bio->bi_idx = 0;
1075602adf40SYehuda Sadeh 
1076f7760dadSAlex Elder 	return bio;
1077602adf40SYehuda Sadeh }
1078602adf40SYehuda Sadeh 
1079f7760dadSAlex Elder /*
1080f7760dadSAlex Elder  * Clone a portion of a bio chain, starting at the given byte offset
1081f7760dadSAlex Elder  * into the first bio in the source chain and continuing for the
1082f7760dadSAlex Elder  * number of bytes indicated.  The result is another bio chain of
1083f7760dadSAlex Elder  * exactly the given length, or a null pointer on error.
1084f7760dadSAlex Elder  *
1085f7760dadSAlex Elder  * The bio_src and offset parameters are both in-out.  On entry they
1086f7760dadSAlex Elder  * refer to the first source bio and the offset into that bio where
1087f7760dadSAlex Elder  * the start of data to be cloned is located.
1088f7760dadSAlex Elder  *
1089f7760dadSAlex Elder  * On return, bio_src is updated to refer to the bio in the source
1090f7760dadSAlex Elder  * chain that contains first un-cloned byte, and *offset will
1091f7760dadSAlex Elder  * contain the offset of that byte within that bio.
1092f7760dadSAlex Elder  */
1093f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src,
1094f7760dadSAlex Elder 					unsigned int *offset,
1095f7760dadSAlex Elder 					unsigned int len,
1096f7760dadSAlex Elder 					gfp_t gfpmask)
1097f7760dadSAlex Elder {
1098f7760dadSAlex Elder 	struct bio *bi = *bio_src;
1099f7760dadSAlex Elder 	unsigned int off = *offset;
1100f7760dadSAlex Elder 	struct bio *chain = NULL;
1101f7760dadSAlex Elder 	struct bio **end;
1102602adf40SYehuda Sadeh 
1103f7760dadSAlex Elder 	/* Build up a chain of clone bios up to the limit */
1104602adf40SYehuda Sadeh 
1105f7760dadSAlex Elder 	if (!bi || off >= bi->bi_size || !len)
1106f7760dadSAlex Elder 		return NULL;		/* Nothing to clone */
1107602adf40SYehuda Sadeh 
1108f7760dadSAlex Elder 	end = &chain;
1109f7760dadSAlex Elder 	while (len) {
1110f7760dadSAlex Elder 		unsigned int bi_size;
1111f7760dadSAlex Elder 		struct bio *bio;
1112f7760dadSAlex Elder 
1113f5400b7aSAlex Elder 		if (!bi) {
1114f5400b7aSAlex Elder 			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
1115f7760dadSAlex Elder 			goto out_err;	/* EINVAL; ran out of bio's */
1116f5400b7aSAlex Elder 		}
1117f7760dadSAlex Elder 		bi_size = min_t(unsigned int, bi->bi_size - off, len);
1118f7760dadSAlex Elder 		bio = bio_clone_range(bi, off, bi_size, gfpmask);
1119f7760dadSAlex Elder 		if (!bio)
1120f7760dadSAlex Elder 			goto out_err;	/* ENOMEM */
1121f7760dadSAlex Elder 
1122f7760dadSAlex Elder 		*end = bio;
1123f7760dadSAlex Elder 		end = &bio->bi_next;
1124f7760dadSAlex Elder 
1125f7760dadSAlex Elder 		off += bi_size;
1126f7760dadSAlex Elder 		if (off == bi->bi_size) {
1127f7760dadSAlex Elder 			bi = bi->bi_next;
1128f7760dadSAlex Elder 			off = 0;
1129f7760dadSAlex Elder 		}
1130f7760dadSAlex Elder 		len -= bi_size;
1131f7760dadSAlex Elder 	}
1132f7760dadSAlex Elder 	*bio_src = bi;
1133f7760dadSAlex Elder 	*offset = off;
1134f7760dadSAlex Elder 
1135f7760dadSAlex Elder 	return chain;
1136f7760dadSAlex Elder out_err:
1137f7760dadSAlex Elder 	bio_chain_put(chain);
1138f7760dadSAlex Elder 
1139602adf40SYehuda Sadeh 	return NULL;
1140602adf40SYehuda Sadeh }
1141602adf40SYehuda Sadeh 
1142926f9b3fSAlex Elder /*
1143926f9b3fSAlex Elder  * The default/initial value for all object request flags is 0.  For
1144926f9b3fSAlex Elder  * each flag, once its value is set to 1 it is never reset to 0
1145926f9b3fSAlex Elder  * again.
1146926f9b3fSAlex Elder  */
11476365d33aSAlex Elder static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
11486365d33aSAlex Elder {
11496365d33aSAlex Elder 	if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
11506365d33aSAlex Elder 		struct rbd_device *rbd_dev;
11516365d33aSAlex Elder 
115257acbaa7SAlex Elder 		rbd_dev = obj_request->img_request->rbd_dev;
11536365d33aSAlex Elder 		rbd_warn(rbd_dev, "obj_request %p already marked img_data\n",
11546365d33aSAlex Elder 			obj_request);
11556365d33aSAlex Elder 	}
11566365d33aSAlex Elder }
11576365d33aSAlex Elder 
11586365d33aSAlex Elder static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
11596365d33aSAlex Elder {
11606365d33aSAlex Elder 	smp_mb();
11616365d33aSAlex Elder 	return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
11626365d33aSAlex Elder }
11636365d33aSAlex Elder 
116457acbaa7SAlex Elder static void obj_request_done_set(struct rbd_obj_request *obj_request)
116557acbaa7SAlex Elder {
116657acbaa7SAlex Elder 	if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
116757acbaa7SAlex Elder 		struct rbd_device *rbd_dev = NULL;
116857acbaa7SAlex Elder 
116957acbaa7SAlex Elder 		if (obj_request_img_data_test(obj_request))
117057acbaa7SAlex Elder 			rbd_dev = obj_request->img_request->rbd_dev;
117157acbaa7SAlex Elder 		rbd_warn(rbd_dev, "obj_request %p already marked done\n",
117257acbaa7SAlex Elder 			obj_request);
117357acbaa7SAlex Elder 	}
117457acbaa7SAlex Elder }
117557acbaa7SAlex Elder 
117657acbaa7SAlex Elder static bool obj_request_done_test(struct rbd_obj_request *obj_request)
117757acbaa7SAlex Elder {
117857acbaa7SAlex Elder 	smp_mb();
117957acbaa7SAlex Elder 	return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
118057acbaa7SAlex Elder }
118157acbaa7SAlex Elder 
11825679c59fSAlex Elder /*
11835679c59fSAlex Elder  * This sets the KNOWN flag after (possibly) setting the EXISTS
11845679c59fSAlex Elder  * flag.  The latter is set based on the "exists" value provided.
11855679c59fSAlex Elder  *
11865679c59fSAlex Elder  * Note that for our purposes once an object exists it never goes
11875679c59fSAlex Elder  * away again.  It's possible that the response from two existence
11885679c59fSAlex Elder  * checks are separated by the creation of the target object, and
11895679c59fSAlex Elder  * the first ("doesn't exist") response arrives *after* the second
11905679c59fSAlex Elder  * ("does exist").  In that case we ignore the second one.
11915679c59fSAlex Elder  */
11925679c59fSAlex Elder static void obj_request_existence_set(struct rbd_obj_request *obj_request,
11935679c59fSAlex Elder 				bool exists)
11945679c59fSAlex Elder {
11955679c59fSAlex Elder 	if (exists)
11965679c59fSAlex Elder 		set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
11975679c59fSAlex Elder 	set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
11985679c59fSAlex Elder 	smp_mb();
11995679c59fSAlex Elder }
12005679c59fSAlex Elder 
12015679c59fSAlex Elder static bool obj_request_known_test(struct rbd_obj_request *obj_request)
12025679c59fSAlex Elder {
12035679c59fSAlex Elder 	smp_mb();
12045679c59fSAlex Elder 	return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
12055679c59fSAlex Elder }
12065679c59fSAlex Elder 
12075679c59fSAlex Elder static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
12085679c59fSAlex Elder {
12095679c59fSAlex Elder 	smp_mb();
12105679c59fSAlex Elder 	return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
12115679c59fSAlex Elder }
12125679c59fSAlex Elder 
1213bf0d5f50SAlex Elder static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1214bf0d5f50SAlex Elder {
121537206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
121637206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1217bf0d5f50SAlex Elder 	kref_get(&obj_request->kref);
1218bf0d5f50SAlex Elder }
1219bf0d5f50SAlex Elder 
1220bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref);
1221bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1222bf0d5f50SAlex Elder {
1223bf0d5f50SAlex Elder 	rbd_assert(obj_request != NULL);
122437206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
122537206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1226bf0d5f50SAlex Elder 	kref_put(&obj_request->kref, rbd_obj_request_destroy);
1227bf0d5f50SAlex Elder }
1228bf0d5f50SAlex Elder 
1229bf0d5f50SAlex Elder static void rbd_img_request_get(struct rbd_img_request *img_request)
1230bf0d5f50SAlex Elder {
123137206ee5SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
123237206ee5SAlex Elder 		atomic_read(&img_request->kref.refcount));
1233bf0d5f50SAlex Elder 	kref_get(&img_request->kref);
1234bf0d5f50SAlex Elder }
1235bf0d5f50SAlex Elder 
1236bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref);
1237bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request)
1238bf0d5f50SAlex Elder {
1239bf0d5f50SAlex Elder 	rbd_assert(img_request != NULL);
124037206ee5SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
124137206ee5SAlex Elder 		atomic_read(&img_request->kref.refcount));
1242bf0d5f50SAlex Elder 	kref_put(&img_request->kref, rbd_img_request_destroy);
1243bf0d5f50SAlex Elder }
1244bf0d5f50SAlex Elder 
1245bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1246bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1247bf0d5f50SAlex Elder {
124825dcf954SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
124925dcf954SAlex Elder 
1250b155e86cSAlex Elder 	/* Image request now owns object's original reference */
1251bf0d5f50SAlex Elder 	obj_request->img_request = img_request;
125225dcf954SAlex Elder 	obj_request->which = img_request->obj_request_count;
12536365d33aSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
12546365d33aSAlex Elder 	obj_request_img_data_set(obj_request);
1255bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
125625dcf954SAlex Elder 	img_request->obj_request_count++;
125725dcf954SAlex Elder 	list_add_tail(&obj_request->links, &img_request->obj_requests);
125837206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
125937206ee5SAlex Elder 		obj_request->which);
1260bf0d5f50SAlex Elder }
1261bf0d5f50SAlex Elder 
1262bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1263bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1264bf0d5f50SAlex Elder {
1265bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
126625dcf954SAlex Elder 
126737206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
126837206ee5SAlex Elder 		obj_request->which);
1269bf0d5f50SAlex Elder 	list_del(&obj_request->links);
127025dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
127125dcf954SAlex Elder 	img_request->obj_request_count--;
127225dcf954SAlex Elder 	rbd_assert(obj_request->which == img_request->obj_request_count);
127325dcf954SAlex Elder 	obj_request->which = BAD_WHICH;
12746365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
1275bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == img_request);
1276bf0d5f50SAlex Elder 	obj_request->img_request = NULL;
127725dcf954SAlex Elder 	obj_request->callback = NULL;
1278bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
1279bf0d5f50SAlex Elder }
1280bf0d5f50SAlex Elder 
1281bf0d5f50SAlex Elder static bool obj_request_type_valid(enum obj_request_type type)
1282bf0d5f50SAlex Elder {
1283bf0d5f50SAlex Elder 	switch (type) {
12849969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
1285bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1286788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1287bf0d5f50SAlex Elder 		return true;
1288bf0d5f50SAlex Elder 	default:
1289bf0d5f50SAlex Elder 		return false;
1290bf0d5f50SAlex Elder 	}
1291bf0d5f50SAlex Elder }
1292bf0d5f50SAlex Elder 
1293bf0d5f50SAlex Elder static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1294bf0d5f50SAlex Elder 				struct rbd_obj_request *obj_request)
1295bf0d5f50SAlex Elder {
129637206ee5SAlex Elder 	dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);
129737206ee5SAlex Elder 
1298bf0d5f50SAlex Elder 	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1299bf0d5f50SAlex Elder }
1300bf0d5f50SAlex Elder 
1301bf0d5f50SAlex Elder static void rbd_img_request_complete(struct rbd_img_request *img_request)
1302bf0d5f50SAlex Elder {
130355f27e09SAlex Elder 
130437206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
130555f27e09SAlex Elder 
130655f27e09SAlex Elder 	/*
130755f27e09SAlex Elder 	 * If no error occurred, compute the aggregate transfer
130855f27e09SAlex Elder 	 * count for the image request.  We could instead use
130955f27e09SAlex Elder 	 * atomic64_cmpxchg() to update it as each object request
131055f27e09SAlex Elder 	 * completes; not clear which way is better off hand.
131155f27e09SAlex Elder 	 */
131255f27e09SAlex Elder 	if (!img_request->result) {
131355f27e09SAlex Elder 		struct rbd_obj_request *obj_request;
131455f27e09SAlex Elder 		u64 xferred = 0;
131555f27e09SAlex Elder 
131655f27e09SAlex Elder 		for_each_obj_request(img_request, obj_request)
131755f27e09SAlex Elder 			xferred += obj_request->xferred;
131855f27e09SAlex Elder 		img_request->xferred = xferred;
131955f27e09SAlex Elder 	}
132055f27e09SAlex Elder 
1321bf0d5f50SAlex Elder 	if (img_request->callback)
1322bf0d5f50SAlex Elder 		img_request->callback(img_request);
1323bf0d5f50SAlex Elder 	else
1324bf0d5f50SAlex Elder 		rbd_img_request_put(img_request);
1325bf0d5f50SAlex Elder }
1326bf0d5f50SAlex Elder 
1327788e2df3SAlex Elder /* Caller is responsible for rbd_obj_request_destroy(obj_request) */
1328788e2df3SAlex Elder 
1329788e2df3SAlex Elder static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
1330788e2df3SAlex Elder {
133137206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
133237206ee5SAlex Elder 
1333788e2df3SAlex Elder 	return wait_for_completion_interruptible(&obj_request->completion);
1334788e2df3SAlex Elder }
1335788e2df3SAlex Elder 
13360c425248SAlex Elder /*
13370c425248SAlex Elder  * The default/initial value for all image request flags is 0.  Each
13380c425248SAlex Elder  * is conditionally set to 1 at image request initialization time
13390c425248SAlex Elder  * and currently never change thereafter.
13400c425248SAlex Elder  */
13410c425248SAlex Elder static void img_request_write_set(struct rbd_img_request *img_request)
13420c425248SAlex Elder {
13430c425248SAlex Elder 	set_bit(IMG_REQ_WRITE, &img_request->flags);
13440c425248SAlex Elder 	smp_mb();
13450c425248SAlex Elder }
13460c425248SAlex Elder 
13470c425248SAlex Elder static bool img_request_write_test(struct rbd_img_request *img_request)
13480c425248SAlex Elder {
13490c425248SAlex Elder 	smp_mb();
13500c425248SAlex Elder 	return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
13510c425248SAlex Elder }
13520c425248SAlex Elder 
13539849e986SAlex Elder static void img_request_child_set(struct rbd_img_request *img_request)
13549849e986SAlex Elder {
13559849e986SAlex Elder 	set_bit(IMG_REQ_CHILD, &img_request->flags);
13569849e986SAlex Elder 	smp_mb();
13579849e986SAlex Elder }
13589849e986SAlex Elder 
13599849e986SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request)
13609849e986SAlex Elder {
13619849e986SAlex Elder 	smp_mb();
13629849e986SAlex Elder 	return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
13639849e986SAlex Elder }
13649849e986SAlex Elder 
1365d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request)
1366d0b2e944SAlex Elder {
1367d0b2e944SAlex Elder 	set_bit(IMG_REQ_LAYERED, &img_request->flags);
1368d0b2e944SAlex Elder 	smp_mb();
1369d0b2e944SAlex Elder }
1370d0b2e944SAlex Elder 
1371d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request)
1372d0b2e944SAlex Elder {
1373d0b2e944SAlex Elder 	smp_mb();
1374d0b2e944SAlex Elder 	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1375d0b2e944SAlex Elder }
1376d0b2e944SAlex Elder 
13776e2a4505SAlex Elder static void
13786e2a4505SAlex Elder rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
13796e2a4505SAlex Elder {
1380b9434c5bSAlex Elder 	u64 xferred = obj_request->xferred;
1381b9434c5bSAlex Elder 	u64 length = obj_request->length;
1382b9434c5bSAlex Elder 
13836e2a4505SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
13846e2a4505SAlex Elder 		obj_request, obj_request->img_request, obj_request->result,
1385b9434c5bSAlex Elder 		xferred, length);
13866e2a4505SAlex Elder 	/*
13876e2a4505SAlex Elder 	 * ENOENT means a hole in the image.  We zero-fill the
13886e2a4505SAlex Elder 	 * entire length of the request.  A short read also implies
13896e2a4505SAlex Elder 	 * zero-fill to the end of the request.  Either way we
13906e2a4505SAlex Elder 	 * update the xferred count to indicate the whole request
13916e2a4505SAlex Elder 	 * was satisfied.
13926e2a4505SAlex Elder 	 */
1393b9434c5bSAlex Elder 	rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
13946e2a4505SAlex Elder 	if (obj_request->result == -ENOENT) {
1395b9434c5bSAlex Elder 		if (obj_request->type == OBJ_REQUEST_BIO)
13966e2a4505SAlex Elder 			zero_bio_chain(obj_request->bio_list, 0);
1397b9434c5bSAlex Elder 		else
1398b9434c5bSAlex Elder 			zero_pages(obj_request->pages, 0, length);
13996e2a4505SAlex Elder 		obj_request->result = 0;
1400b9434c5bSAlex Elder 		obj_request->xferred = length;
1401b9434c5bSAlex Elder 	} else if (xferred < length && !obj_request->result) {
1402b9434c5bSAlex Elder 		if (obj_request->type == OBJ_REQUEST_BIO)
1403b9434c5bSAlex Elder 			zero_bio_chain(obj_request->bio_list, xferred);
1404b9434c5bSAlex Elder 		else
1405b9434c5bSAlex Elder 			zero_pages(obj_request->pages, xferred, length);
1406b9434c5bSAlex Elder 		obj_request->xferred = length;
14076e2a4505SAlex Elder 	}
14086e2a4505SAlex Elder 	obj_request_done_set(obj_request);
14096e2a4505SAlex Elder }
14106e2a4505SAlex Elder 
1411bf0d5f50SAlex Elder static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1412bf0d5f50SAlex Elder {
141337206ee5SAlex Elder 	dout("%s: obj %p cb %p\n", __func__, obj_request,
141437206ee5SAlex Elder 		obj_request->callback);
1415bf0d5f50SAlex Elder 	if (obj_request->callback)
1416bf0d5f50SAlex Elder 		obj_request->callback(obj_request);
1417788e2df3SAlex Elder 	else
1418788e2df3SAlex Elder 		complete_all(&obj_request->completion);
1419bf0d5f50SAlex Elder }
1420bf0d5f50SAlex Elder 
/* Completion handling for ops that need none: just mark the request done */
static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
142639bf2c5dSAlex Elder 
1427c47f9371SAlex Elder static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
1428bf0d5f50SAlex Elder {
142957acbaa7SAlex Elder 	struct rbd_img_request *img_request = NULL;
1430a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev = NULL;
143157acbaa7SAlex Elder 	bool layered = false;
143257acbaa7SAlex Elder 
143357acbaa7SAlex Elder 	if (obj_request_img_data_test(obj_request)) {
143457acbaa7SAlex Elder 		img_request = obj_request->img_request;
143557acbaa7SAlex Elder 		layered = img_request && img_request_layered_test(img_request);
1436a9e8ba2cSAlex Elder 		rbd_dev = img_request->rbd_dev;
143757acbaa7SAlex Elder 	}
14388b3e1a56SAlex Elder 
14398b3e1a56SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
14408b3e1a56SAlex Elder 		obj_request, img_request, obj_request->result,
14418b3e1a56SAlex Elder 		obj_request->xferred, obj_request->length);
1442a9e8ba2cSAlex Elder 	if (layered && obj_request->result == -ENOENT &&
1443a9e8ba2cSAlex Elder 			obj_request->img_offset < rbd_dev->parent_overlap)
14448b3e1a56SAlex Elder 		rbd_img_parent_read(obj_request);
14458b3e1a56SAlex Elder 	else if (img_request)
14466e2a4505SAlex Elder 		rbd_img_obj_request_read_callback(obj_request);
14476e2a4505SAlex Elder 	else
144807741308SAlex Elder 		obj_request_done_set(obj_request);
1449bf0d5f50SAlex Elder }
1450bf0d5f50SAlex Elder 
1451c47f9371SAlex Elder static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
1452bf0d5f50SAlex Elder {
14531b83bef2SSage Weil 	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
14541b83bef2SSage Weil 		obj_request->result, obj_request->length);
14551b83bef2SSage Weil 	/*
14568b3e1a56SAlex Elder 	 * There is no such thing as a successful short write.  Set
14578b3e1a56SAlex Elder 	 * it to our originally-requested length.
14581b83bef2SSage Weil 	 */
14591b83bef2SSage Weil 	obj_request->xferred = obj_request->length;
146007741308SAlex Elder 	obj_request_done_set(obj_request);
1461bf0d5f50SAlex Elder }
1462bf0d5f50SAlex Elder 
/*
 * Handle completion of an osd stat op.  For a simple stat call
 * there's nothing to do beyond marking the request done.  More will
 * be done here when a stat is part of a write sequence for a
 * layered image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
1472fbfab539SAlex Elder 
1473bf0d5f50SAlex Elder static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1474bf0d5f50SAlex Elder 				struct ceph_msg *msg)
1475bf0d5f50SAlex Elder {
1476bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = osd_req->r_priv;
1477bf0d5f50SAlex Elder 	u16 opcode;
1478bf0d5f50SAlex Elder 
147937206ee5SAlex Elder 	dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
1480bf0d5f50SAlex Elder 	rbd_assert(osd_req == obj_request->osd_req);
148157acbaa7SAlex Elder 	if (obj_request_img_data_test(obj_request)) {
148257acbaa7SAlex Elder 		rbd_assert(obj_request->img_request);
148357acbaa7SAlex Elder 		rbd_assert(obj_request->which != BAD_WHICH);
148457acbaa7SAlex Elder 	} else {
148557acbaa7SAlex Elder 		rbd_assert(obj_request->which == BAD_WHICH);
148657acbaa7SAlex Elder 	}
1487bf0d5f50SAlex Elder 
14881b83bef2SSage Weil 	if (osd_req->r_result < 0)
14891b83bef2SSage Weil 		obj_request->result = osd_req->r_result;
1490bf0d5f50SAlex Elder 	obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);
1491bf0d5f50SAlex Elder 
14920eefd470SAlex Elder 	BUG_ON(osd_req->r_num_ops > 2);
1493bf0d5f50SAlex Elder 
1494c47f9371SAlex Elder 	/*
1495c47f9371SAlex Elder 	 * We support a 64-bit length, but ultimately it has to be
1496c47f9371SAlex Elder 	 * passed to blk_end_request(), which takes an unsigned int.
1497c47f9371SAlex Elder 	 */
14981b83bef2SSage Weil 	obj_request->xferred = osd_req->r_reply_op_len[0];
1499c47f9371SAlex Elder 	rbd_assert(obj_request->xferred < (u64)UINT_MAX);
150079528734SAlex Elder 	opcode = osd_req->r_ops[0].op;
1501bf0d5f50SAlex Elder 	switch (opcode) {
1502bf0d5f50SAlex Elder 	case CEPH_OSD_OP_READ:
1503c47f9371SAlex Elder 		rbd_osd_read_callback(obj_request);
1504bf0d5f50SAlex Elder 		break;
1505bf0d5f50SAlex Elder 	case CEPH_OSD_OP_WRITE:
1506c47f9371SAlex Elder 		rbd_osd_write_callback(obj_request);
1507bf0d5f50SAlex Elder 		break;
1508fbfab539SAlex Elder 	case CEPH_OSD_OP_STAT:
1509c47f9371SAlex Elder 		rbd_osd_stat_callback(obj_request);
1510fbfab539SAlex Elder 		break;
151136be9a76SAlex Elder 	case CEPH_OSD_OP_CALL:
1512b8d70035SAlex Elder 	case CEPH_OSD_OP_NOTIFY_ACK:
15139969ebc5SAlex Elder 	case CEPH_OSD_OP_WATCH:
1514c47f9371SAlex Elder 		rbd_osd_trivial_callback(obj_request);
15159969ebc5SAlex Elder 		break;
1516bf0d5f50SAlex Elder 	default:
1517bf0d5f50SAlex Elder 		rbd_warn(NULL, "%s: unsupported op %hu\n",
1518bf0d5f50SAlex Elder 			obj_request->object_name, (unsigned short) opcode);
1519bf0d5f50SAlex Elder 		break;
1520bf0d5f50SAlex Elder 	}
1521bf0d5f50SAlex Elder 
152207741308SAlex Elder 	if (obj_request_done_test(obj_request))
1523bf0d5f50SAlex Elder 		rbd_obj_request_complete(obj_request);
1524bf0d5f50SAlex Elder }
1525bf0d5f50SAlex Elder 
15269d4df01fSAlex Elder static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
1527430c28c3SAlex Elder {
1528430c28c3SAlex Elder 	struct rbd_img_request *img_request = obj_request->img_request;
15298c042b0dSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
15309d4df01fSAlex Elder 	u64 snap_id;
1531430c28c3SAlex Elder 
15328c042b0dSAlex Elder 	rbd_assert(osd_req != NULL);
1533430c28c3SAlex Elder 
15349d4df01fSAlex Elder 	snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
15358c042b0dSAlex Elder 	ceph_osdc_build_request(osd_req, obj_request->offset,
15369d4df01fSAlex Elder 			NULL, snap_id, NULL);
15379d4df01fSAlex Elder }
15389d4df01fSAlex Elder 
15399d4df01fSAlex Elder static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
15409d4df01fSAlex Elder {
15419d4df01fSAlex Elder 	struct rbd_img_request *img_request = obj_request->img_request;
15429d4df01fSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
15439d4df01fSAlex Elder 	struct ceph_snap_context *snapc;
15449d4df01fSAlex Elder 	struct timespec mtime = CURRENT_TIME;
15459d4df01fSAlex Elder 
15469d4df01fSAlex Elder 	rbd_assert(osd_req != NULL);
15479d4df01fSAlex Elder 
15489d4df01fSAlex Elder 	snapc = img_request ? img_request->snapc : NULL;
15499d4df01fSAlex Elder 	ceph_osdc_build_request(osd_req, obj_request->offset,
15509d4df01fSAlex Elder 			snapc, CEPH_NOSNAP, &mtime);
1551430c28c3SAlex Elder }
1552430c28c3SAlex Elder 
1553bf0d5f50SAlex Elder static struct ceph_osd_request *rbd_osd_req_create(
1554bf0d5f50SAlex Elder 					struct rbd_device *rbd_dev,
1555bf0d5f50SAlex Elder 					bool write_request,
1556430c28c3SAlex Elder 					struct rbd_obj_request *obj_request)
1557bf0d5f50SAlex Elder {
1558bf0d5f50SAlex Elder 	struct ceph_snap_context *snapc = NULL;
1559bf0d5f50SAlex Elder 	struct ceph_osd_client *osdc;
1560bf0d5f50SAlex Elder 	struct ceph_osd_request *osd_req;
1561bf0d5f50SAlex Elder 
15626365d33aSAlex Elder 	if (obj_request_img_data_test(obj_request)) {
15636365d33aSAlex Elder 		struct rbd_img_request *img_request = obj_request->img_request;
15646365d33aSAlex Elder 
15650c425248SAlex Elder 		rbd_assert(write_request ==
15660c425248SAlex Elder 				img_request_write_test(img_request));
15670c425248SAlex Elder 		if (write_request)
1568bf0d5f50SAlex Elder 			snapc = img_request->snapc;
1569bf0d5f50SAlex Elder 	}
1570bf0d5f50SAlex Elder 
1571bf0d5f50SAlex Elder 	/* Allocate and initialize the request, for the single op */
1572bf0d5f50SAlex Elder 
1573bf0d5f50SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
1574bf0d5f50SAlex Elder 	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
1575bf0d5f50SAlex Elder 	if (!osd_req)
1576bf0d5f50SAlex Elder 		return NULL;	/* ENOMEM */
1577bf0d5f50SAlex Elder 
1578430c28c3SAlex Elder 	if (write_request)
1579bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1580430c28c3SAlex Elder 	else
1581bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_READ;
1582bf0d5f50SAlex Elder 
1583bf0d5f50SAlex Elder 	osd_req->r_callback = rbd_osd_req_callback;
1584bf0d5f50SAlex Elder 	osd_req->r_priv = obj_request;
1585bf0d5f50SAlex Elder 
1586bf0d5f50SAlex Elder 	osd_req->r_oid_len = strlen(obj_request->object_name);
1587bf0d5f50SAlex Elder 	rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1588bf0d5f50SAlex Elder 	memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1589bf0d5f50SAlex Elder 
1590bf0d5f50SAlex Elder 	osd_req->r_file_layout = rbd_dev->layout;	/* struct */
1591bf0d5f50SAlex Elder 
1592bf0d5f50SAlex Elder 	return osd_req;
1593bf0d5f50SAlex Elder }
1594bf0d5f50SAlex Elder 
15950eefd470SAlex Elder /*
15960eefd470SAlex Elder  * Create a copyup osd request based on the information in the
15970eefd470SAlex Elder  * object request supplied.  A copyup request has two osd ops,
15980eefd470SAlex Elder  * a copyup method call, and a "normal" write request.
15990eefd470SAlex Elder  */
16000eefd470SAlex Elder static struct ceph_osd_request *
16010eefd470SAlex Elder rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
16020eefd470SAlex Elder {
16030eefd470SAlex Elder 	struct rbd_img_request *img_request;
16040eefd470SAlex Elder 	struct ceph_snap_context *snapc;
16050eefd470SAlex Elder 	struct rbd_device *rbd_dev;
16060eefd470SAlex Elder 	struct ceph_osd_client *osdc;
16070eefd470SAlex Elder 	struct ceph_osd_request *osd_req;
16080eefd470SAlex Elder 
16090eefd470SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
16100eefd470SAlex Elder 	img_request = obj_request->img_request;
16110eefd470SAlex Elder 	rbd_assert(img_request);
16120eefd470SAlex Elder 	rbd_assert(img_request_write_test(img_request));
16130eefd470SAlex Elder 
16140eefd470SAlex Elder 	/* Allocate and initialize the request, for the two ops */
16150eefd470SAlex Elder 
16160eefd470SAlex Elder 	snapc = img_request->snapc;
16170eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
16180eefd470SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
16190eefd470SAlex Elder 	osd_req = ceph_osdc_alloc_request(osdc, snapc, 2, false, GFP_ATOMIC);
16200eefd470SAlex Elder 	if (!osd_req)
16210eefd470SAlex Elder 		return NULL;	/* ENOMEM */
16220eefd470SAlex Elder 
16230eefd470SAlex Elder 	osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
16240eefd470SAlex Elder 	osd_req->r_callback = rbd_osd_req_callback;
16250eefd470SAlex Elder 	osd_req->r_priv = obj_request;
16260eefd470SAlex Elder 
16270eefd470SAlex Elder 	osd_req->r_oid_len = strlen(obj_request->object_name);
16280eefd470SAlex Elder 	rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
16290eefd470SAlex Elder 	memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
16300eefd470SAlex Elder 
16310eefd470SAlex Elder 	osd_req->r_file_layout = rbd_dev->layout;	/* struct */
16320eefd470SAlex Elder 
16330eefd470SAlex Elder 	return osd_req;
16340eefd470SAlex Elder }
16350eefd470SAlex Elder 
16360eefd470SAlex Elder 
/* Drop a reference to an osd request */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
1641bf0d5f50SAlex Elder 
1642bf0d5f50SAlex Elder /* object_name is assumed to be a non-null pointer and NUL-terminated */
1643bf0d5f50SAlex Elder 
1644bf0d5f50SAlex Elder static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1645bf0d5f50SAlex Elder 						u64 offset, u64 length,
1646bf0d5f50SAlex Elder 						enum obj_request_type type)
1647bf0d5f50SAlex Elder {
1648bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1649bf0d5f50SAlex Elder 	size_t size;
1650bf0d5f50SAlex Elder 	char *name;
1651bf0d5f50SAlex Elder 
1652bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(type));
1653bf0d5f50SAlex Elder 
1654bf0d5f50SAlex Elder 	size = strlen(object_name) + 1;
1655bf0d5f50SAlex Elder 	obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
1656bf0d5f50SAlex Elder 	if (!obj_request)
1657bf0d5f50SAlex Elder 		return NULL;
1658bf0d5f50SAlex Elder 
1659bf0d5f50SAlex Elder 	name = (char *)(obj_request + 1);
1660bf0d5f50SAlex Elder 	obj_request->object_name = memcpy(name, object_name, size);
1661bf0d5f50SAlex Elder 	obj_request->offset = offset;
1662bf0d5f50SAlex Elder 	obj_request->length = length;
1663926f9b3fSAlex Elder 	obj_request->flags = 0;
1664bf0d5f50SAlex Elder 	obj_request->which = BAD_WHICH;
1665bf0d5f50SAlex Elder 	obj_request->type = type;
1666bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&obj_request->links);
1667788e2df3SAlex Elder 	init_completion(&obj_request->completion);
1668bf0d5f50SAlex Elder 	kref_init(&obj_request->kref);
1669bf0d5f50SAlex Elder 
167037206ee5SAlex Elder 	dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
167137206ee5SAlex Elder 		offset, length, (int)type, obj_request);
167237206ee5SAlex Elder 
1673bf0d5f50SAlex Elder 	return obj_request;
1674bf0d5f50SAlex Elder }
1675bf0d5f50SAlex Elder 
1676bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref)
1677bf0d5f50SAlex Elder {
1678bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1679bf0d5f50SAlex Elder 
1680bf0d5f50SAlex Elder 	obj_request = container_of(kref, struct rbd_obj_request, kref);
1681bf0d5f50SAlex Elder 
168237206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
168337206ee5SAlex Elder 
1684bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
1685bf0d5f50SAlex Elder 	rbd_assert(obj_request->which == BAD_WHICH);
1686bf0d5f50SAlex Elder 
1687bf0d5f50SAlex Elder 	if (obj_request->osd_req)
1688bf0d5f50SAlex Elder 		rbd_osd_req_destroy(obj_request->osd_req);
1689bf0d5f50SAlex Elder 
1690bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
1691bf0d5f50SAlex Elder 	switch (obj_request->type) {
16929969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
16939969ebc5SAlex Elder 		break;		/* Nothing to do */
1694bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1695bf0d5f50SAlex Elder 		if (obj_request->bio_list)
1696bf0d5f50SAlex Elder 			bio_chain_put(obj_request->bio_list);
1697bf0d5f50SAlex Elder 		break;
1698788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1699788e2df3SAlex Elder 		if (obj_request->pages)
1700788e2df3SAlex Elder 			ceph_release_page_vector(obj_request->pages,
1701788e2df3SAlex Elder 						obj_request->page_count);
1702788e2df3SAlex Elder 		break;
1703bf0d5f50SAlex Elder 	}
1704bf0d5f50SAlex Elder 
1705bf0d5f50SAlex Elder 	kfree(obj_request);
1706bf0d5f50SAlex Elder }
1707bf0d5f50SAlex Elder 
1708bf0d5f50SAlex Elder /*
1709bf0d5f50SAlex Elder  * Caller is responsible for filling in the list of object requests
1710bf0d5f50SAlex Elder  * that comprises the image request, and the Linux request pointer
1711bf0d5f50SAlex Elder  * (if there is one).
1712bf0d5f50SAlex Elder  */
1713cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create(
1714cc344fa1SAlex Elder 					struct rbd_device *rbd_dev,
1715bf0d5f50SAlex Elder 					u64 offset, u64 length,
17169849e986SAlex Elder 					bool write_request,
17179849e986SAlex Elder 					bool child_request)
1718bf0d5f50SAlex Elder {
1719bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
1720bf0d5f50SAlex Elder 
1721bf0d5f50SAlex Elder 	img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
1722bf0d5f50SAlex Elder 	if (!img_request)
1723bf0d5f50SAlex Elder 		return NULL;
1724bf0d5f50SAlex Elder 
1725bf0d5f50SAlex Elder 	if (write_request) {
1726bf0d5f50SAlex Elder 		down_read(&rbd_dev->header_rwsem);
1727812164f8SAlex Elder 		ceph_get_snap_context(rbd_dev->header.snapc);
1728bf0d5f50SAlex Elder 		up_read(&rbd_dev->header_rwsem);
1729bf0d5f50SAlex Elder 	}
1730bf0d5f50SAlex Elder 
1731bf0d5f50SAlex Elder 	img_request->rq = NULL;
1732bf0d5f50SAlex Elder 	img_request->rbd_dev = rbd_dev;
1733bf0d5f50SAlex Elder 	img_request->offset = offset;
1734bf0d5f50SAlex Elder 	img_request->length = length;
17350c425248SAlex Elder 	img_request->flags = 0;
17360c425248SAlex Elder 	if (write_request) {
17370c425248SAlex Elder 		img_request_write_set(img_request);
1738468521c1SAlex Elder 		img_request->snapc = rbd_dev->header.snapc;
17390c425248SAlex Elder 	} else {
1740bf0d5f50SAlex Elder 		img_request->snap_id = rbd_dev->spec->snap_id;
17410c425248SAlex Elder 	}
17429849e986SAlex Elder 	if (child_request)
17439849e986SAlex Elder 		img_request_child_set(img_request);
1744d0b2e944SAlex Elder 	if (rbd_dev->parent_spec)
1745d0b2e944SAlex Elder 		img_request_layered_set(img_request);
1746bf0d5f50SAlex Elder 	spin_lock_init(&img_request->completion_lock);
1747bf0d5f50SAlex Elder 	img_request->next_completion = 0;
1748bf0d5f50SAlex Elder 	img_request->callback = NULL;
1749a5a337d4SAlex Elder 	img_request->result = 0;
1750bf0d5f50SAlex Elder 	img_request->obj_request_count = 0;
1751bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&img_request->obj_requests);
1752bf0d5f50SAlex Elder 	kref_init(&img_request->kref);
1753bf0d5f50SAlex Elder 
1754bf0d5f50SAlex Elder 	rbd_img_request_get(img_request);	/* Avoid a warning */
1755bf0d5f50SAlex Elder 	rbd_img_request_put(img_request);	/* TEMPORARY */
1756bf0d5f50SAlex Elder 
175737206ee5SAlex Elder 	dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
175837206ee5SAlex Elder 		write_request ? "write" : "read", offset, length,
175937206ee5SAlex Elder 		img_request);
176037206ee5SAlex Elder 
1761bf0d5f50SAlex Elder 	return img_request;
1762bf0d5f50SAlex Elder }
1763bf0d5f50SAlex Elder 
1764bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref)
1765bf0d5f50SAlex Elder {
1766bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
1767bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1768bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
1769bf0d5f50SAlex Elder 
1770bf0d5f50SAlex Elder 	img_request = container_of(kref, struct rbd_img_request, kref);
1771bf0d5f50SAlex Elder 
177237206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
177337206ee5SAlex Elder 
1774bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1775bf0d5f50SAlex Elder 		rbd_img_obj_request_del(img_request, obj_request);
177625dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count == 0);
1777bf0d5f50SAlex Elder 
17780c425248SAlex Elder 	if (img_request_write_test(img_request))
1779812164f8SAlex Elder 		ceph_put_snap_context(img_request->snapc);
1780bf0d5f50SAlex Elder 
17818b3e1a56SAlex Elder 	if (img_request_child_test(img_request))
17828b3e1a56SAlex Elder 		rbd_obj_request_put(img_request->obj_request);
17838b3e1a56SAlex Elder 
1784bf0d5f50SAlex Elder 	kfree(img_request);
1785bf0d5f50SAlex Elder }
1786bf0d5f50SAlex Elder 
17871217857fSAlex Elder static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
17881217857fSAlex Elder {
17896365d33aSAlex Elder 	struct rbd_img_request *img_request;
17901217857fSAlex Elder 	unsigned int xferred;
17911217857fSAlex Elder 	int result;
17928b3e1a56SAlex Elder 	bool more;
17931217857fSAlex Elder 
17946365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
17956365d33aSAlex Elder 	img_request = obj_request->img_request;
17966365d33aSAlex Elder 
17971217857fSAlex Elder 	rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
17981217857fSAlex Elder 	xferred = (unsigned int)obj_request->xferred;
17991217857fSAlex Elder 	result = obj_request->result;
18001217857fSAlex Elder 	if (result) {
18011217857fSAlex Elder 		struct rbd_device *rbd_dev = img_request->rbd_dev;
18021217857fSAlex Elder 
18031217857fSAlex Elder 		rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n",
18041217857fSAlex Elder 			img_request_write_test(img_request) ? "write" : "read",
18051217857fSAlex Elder 			obj_request->length, obj_request->img_offset,
18061217857fSAlex Elder 			obj_request->offset);
18071217857fSAlex Elder 		rbd_warn(rbd_dev, "  result %d xferred %x\n",
18081217857fSAlex Elder 			result, xferred);
18091217857fSAlex Elder 		if (!img_request->result)
18101217857fSAlex Elder 			img_request->result = result;
18111217857fSAlex Elder 	}
18121217857fSAlex Elder 
1813f1a4739fSAlex Elder 	/* Image object requests don't own their page array */
1814f1a4739fSAlex Elder 
1815f1a4739fSAlex Elder 	if (obj_request->type == OBJ_REQUEST_PAGES) {
1816f1a4739fSAlex Elder 		obj_request->pages = NULL;
1817f1a4739fSAlex Elder 		obj_request->page_count = 0;
1818f1a4739fSAlex Elder 	}
1819f1a4739fSAlex Elder 
18208b3e1a56SAlex Elder 	if (img_request_child_test(img_request)) {
18218b3e1a56SAlex Elder 		rbd_assert(img_request->obj_request != NULL);
18228b3e1a56SAlex Elder 		more = obj_request->which < img_request->obj_request_count - 1;
18238b3e1a56SAlex Elder 	} else {
18248b3e1a56SAlex Elder 		rbd_assert(img_request->rq != NULL);
18258b3e1a56SAlex Elder 		more = blk_end_request(img_request->rq, result, xferred);
18268b3e1a56SAlex Elder 	}
18278b3e1a56SAlex Elder 
18288b3e1a56SAlex Elder 	return more;
18291217857fSAlex Elder }
18301217857fSAlex Elder 
18312169238dSAlex Elder static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
18322169238dSAlex Elder {
18332169238dSAlex Elder 	struct rbd_img_request *img_request;
18342169238dSAlex Elder 	u32 which = obj_request->which;
18352169238dSAlex Elder 	bool more = true;
18362169238dSAlex Elder 
18376365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
18382169238dSAlex Elder 	img_request = obj_request->img_request;
18392169238dSAlex Elder 
18402169238dSAlex Elder 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
18412169238dSAlex Elder 	rbd_assert(img_request != NULL);
18422169238dSAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
18432169238dSAlex Elder 	rbd_assert(which != BAD_WHICH);
18442169238dSAlex Elder 	rbd_assert(which < img_request->obj_request_count);
18452169238dSAlex Elder 	rbd_assert(which >= img_request->next_completion);
18462169238dSAlex Elder 
18472169238dSAlex Elder 	spin_lock_irq(&img_request->completion_lock);
18482169238dSAlex Elder 	if (which != img_request->next_completion)
18492169238dSAlex Elder 		goto out;
18502169238dSAlex Elder 
18512169238dSAlex Elder 	for_each_obj_request_from(img_request, obj_request) {
18522169238dSAlex Elder 		rbd_assert(more);
18532169238dSAlex Elder 		rbd_assert(which < img_request->obj_request_count);
18542169238dSAlex Elder 
18552169238dSAlex Elder 		if (!obj_request_done_test(obj_request))
18562169238dSAlex Elder 			break;
18571217857fSAlex Elder 		more = rbd_img_obj_end_request(obj_request);
18582169238dSAlex Elder 		which++;
18592169238dSAlex Elder 	}
18602169238dSAlex Elder 
18612169238dSAlex Elder 	rbd_assert(more ^ (which == img_request->obj_request_count));
18622169238dSAlex Elder 	img_request->next_completion = which;
18632169238dSAlex Elder out:
18642169238dSAlex Elder 	spin_unlock_irq(&img_request->completion_lock);
18652169238dSAlex Elder 
18662169238dSAlex Elder 	if (!more)
18672169238dSAlex Elder 		rbd_img_request_complete(img_request);
18682169238dSAlex Elder }
18692169238dSAlex Elder 
1870f1a4739fSAlex Elder /*
1871f1a4739fSAlex Elder  * Split up an image request into one or more object requests, each
1872f1a4739fSAlex Elder  * to a different object.  The "type" parameter indicates whether
1873f1a4739fSAlex Elder  * "data_desc" is the pointer to the head of a list of bio
1874f1a4739fSAlex Elder  * structures, or the base of a page array.  In either case this
1875f1a4739fSAlex Elder  * function assumes data_desc describes memory sufficient to hold
1876f1a4739fSAlex Elder  * all data described by the image request.
1877f1a4739fSAlex Elder  */
1878f1a4739fSAlex Elder static int rbd_img_request_fill(struct rbd_img_request *img_request,
1879f1a4739fSAlex Elder 					enum obj_request_type type,
1880f1a4739fSAlex Elder 					void *data_desc)
1881bf0d5f50SAlex Elder {
1882bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = img_request->rbd_dev;
1883bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = NULL;
1884bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
18850c425248SAlex Elder 	bool write_request = img_request_write_test(img_request);
1886f1a4739fSAlex Elder 	struct bio *bio_list;
1887f1a4739fSAlex Elder 	unsigned int bio_offset = 0;
1888f1a4739fSAlex Elder 	struct page **pages;
18897da22d29SAlex Elder 	u64 img_offset;
1890bf0d5f50SAlex Elder 	u64 resid;
1891bf0d5f50SAlex Elder 	u16 opcode;
1892bf0d5f50SAlex Elder 
1893f1a4739fSAlex Elder 	dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
1894f1a4739fSAlex Elder 		(int)type, data_desc);
189537206ee5SAlex Elder 
1896430c28c3SAlex Elder 	opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
18977da22d29SAlex Elder 	img_offset = img_request->offset;
1898bf0d5f50SAlex Elder 	resid = img_request->length;
18994dda41d3SAlex Elder 	rbd_assert(resid > 0);
1900f1a4739fSAlex Elder 
1901f1a4739fSAlex Elder 	if (type == OBJ_REQUEST_BIO) {
1902f1a4739fSAlex Elder 		bio_list = data_desc;
1903f1a4739fSAlex Elder 		rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT);
1904f1a4739fSAlex Elder 	} else {
1905f1a4739fSAlex Elder 		rbd_assert(type == OBJ_REQUEST_PAGES);
1906f1a4739fSAlex Elder 		pages = data_desc;
1907f1a4739fSAlex Elder 	}
1908f1a4739fSAlex Elder 
1909bf0d5f50SAlex Elder 	while (resid) {
19102fa12320SAlex Elder 		struct ceph_osd_request *osd_req;
1911bf0d5f50SAlex Elder 		const char *object_name;
1912bf0d5f50SAlex Elder 		u64 offset;
1913bf0d5f50SAlex Elder 		u64 length;
1914bf0d5f50SAlex Elder 
19157da22d29SAlex Elder 		object_name = rbd_segment_name(rbd_dev, img_offset);
1916bf0d5f50SAlex Elder 		if (!object_name)
1917bf0d5f50SAlex Elder 			goto out_unwind;
19187da22d29SAlex Elder 		offset = rbd_segment_offset(rbd_dev, img_offset);
19197da22d29SAlex Elder 		length = rbd_segment_length(rbd_dev, img_offset, resid);
1920bf0d5f50SAlex Elder 		obj_request = rbd_obj_request_create(object_name,
1921f1a4739fSAlex Elder 						offset, length, type);
1922bf0d5f50SAlex Elder 		kfree(object_name);	/* object request has its own copy */
1923bf0d5f50SAlex Elder 		if (!obj_request)
1924bf0d5f50SAlex Elder 			goto out_unwind;
1925bf0d5f50SAlex Elder 
1926f1a4739fSAlex Elder 		if (type == OBJ_REQUEST_BIO) {
1927f1a4739fSAlex Elder 			unsigned int clone_size;
1928f1a4739fSAlex Elder 
1929bf0d5f50SAlex Elder 			rbd_assert(length <= (u64)UINT_MAX);
1930bf0d5f50SAlex Elder 			clone_size = (unsigned int)length;
1931f1a4739fSAlex Elder 			obj_request->bio_list =
1932f1a4739fSAlex Elder 					bio_chain_clone_range(&bio_list,
1933f1a4739fSAlex Elder 								&bio_offset,
1934f1a4739fSAlex Elder 								clone_size,
1935bf0d5f50SAlex Elder 								GFP_ATOMIC);
1936bf0d5f50SAlex Elder 			if (!obj_request->bio_list)
1937bf0d5f50SAlex Elder 				goto out_partial;
1938f1a4739fSAlex Elder 		} else {
1939f1a4739fSAlex Elder 			unsigned int page_count;
1940f1a4739fSAlex Elder 
1941f1a4739fSAlex Elder 			obj_request->pages = pages;
1942f1a4739fSAlex Elder 			page_count = (u32)calc_pages_for(offset, length);
1943f1a4739fSAlex Elder 			obj_request->page_count = page_count;
1944f1a4739fSAlex Elder 			if ((offset + length) & ~PAGE_MASK)
1945f1a4739fSAlex Elder 				page_count--;	/* more on last page */
1946f1a4739fSAlex Elder 			pages += page_count;
1947f1a4739fSAlex Elder 		}
1948bf0d5f50SAlex Elder 
19492fa12320SAlex Elder 		osd_req = rbd_osd_req_create(rbd_dev, write_request,
19502fa12320SAlex Elder 						obj_request);
19512fa12320SAlex Elder 		if (!osd_req)
1952bf0d5f50SAlex Elder 			goto out_partial;
19532fa12320SAlex Elder 		obj_request->osd_req = osd_req;
19542169238dSAlex Elder 		obj_request->callback = rbd_img_obj_callback;
1955430c28c3SAlex Elder 
19562fa12320SAlex Elder 		osd_req_op_extent_init(osd_req, 0, opcode, offset, length,
19572fa12320SAlex Elder 						0, 0);
1958f1a4739fSAlex Elder 		if (type == OBJ_REQUEST_BIO)
1959406e2c9fSAlex Elder 			osd_req_op_extent_osd_data_bio(osd_req, 0,
1960f1a4739fSAlex Elder 					obj_request->bio_list, length);
1961f1a4739fSAlex Elder 		else
1962f1a4739fSAlex Elder 			osd_req_op_extent_osd_data_pages(osd_req, 0,
1963f1a4739fSAlex Elder 					obj_request->pages, length,
1964f1a4739fSAlex Elder 					offset & ~PAGE_MASK, false, false);
19659d4df01fSAlex Elder 
19669d4df01fSAlex Elder 		if (write_request)
19679d4df01fSAlex Elder 			rbd_osd_req_format_write(obj_request);
19689d4df01fSAlex Elder 		else
19699d4df01fSAlex Elder 			rbd_osd_req_format_read(obj_request);
1970430c28c3SAlex Elder 
19717da22d29SAlex Elder 		obj_request->img_offset = img_offset;
1972bf0d5f50SAlex Elder 		rbd_img_obj_request_add(img_request, obj_request);
1973bf0d5f50SAlex Elder 
19747da22d29SAlex Elder 		img_offset += length;
1975bf0d5f50SAlex Elder 		resid -= length;
1976bf0d5f50SAlex Elder 	}
1977bf0d5f50SAlex Elder 
1978bf0d5f50SAlex Elder 	return 0;
1979bf0d5f50SAlex Elder 
1980bf0d5f50SAlex Elder out_partial:
1981bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
1982bf0d5f50SAlex Elder out_unwind:
1983bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1984bf0d5f50SAlex Elder 		rbd_obj_request_put(obj_request);
1985bf0d5f50SAlex Elder 
1986bf0d5f50SAlex Elder 	return -ENOMEM;
1987bf0d5f50SAlex Elder }
1988bf0d5f50SAlex Elder 
19893d7efd18SAlex Elder static void
19900eefd470SAlex Elder rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request)
19910eefd470SAlex Elder {
19920eefd470SAlex Elder 	struct rbd_img_request *img_request;
19930eefd470SAlex Elder 	struct rbd_device *rbd_dev;
19940eefd470SAlex Elder 	u64 length;
19950eefd470SAlex Elder 	u32 page_count;
19960eefd470SAlex Elder 
19970eefd470SAlex Elder 	rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
19980eefd470SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
19990eefd470SAlex Elder 	img_request = obj_request->img_request;
20000eefd470SAlex Elder 	rbd_assert(img_request);
20010eefd470SAlex Elder 
20020eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
20030eefd470SAlex Elder 	rbd_assert(rbd_dev);
20040eefd470SAlex Elder 	length = (u64)1 << rbd_dev->header.obj_order;
20050eefd470SAlex Elder 	page_count = (u32)calc_pages_for(0, length);
20060eefd470SAlex Elder 
20070eefd470SAlex Elder 	rbd_assert(obj_request->copyup_pages);
20080eefd470SAlex Elder 	ceph_release_page_vector(obj_request->copyup_pages, page_count);
20090eefd470SAlex Elder 	obj_request->copyup_pages = NULL;
20100eefd470SAlex Elder 
20110eefd470SAlex Elder 	/*
20120eefd470SAlex Elder 	 * We want the transfer count to reflect the size of the
20130eefd470SAlex Elder 	 * original write request.  There is no such thing as a
20140eefd470SAlex Elder 	 * successful short write, so if the request was successful
20150eefd470SAlex Elder 	 * we can just set it to the originally-requested length.
20160eefd470SAlex Elder 	 */
20170eefd470SAlex Elder 	if (!obj_request->result)
20180eefd470SAlex Elder 		obj_request->xferred = obj_request->length;
20190eefd470SAlex Elder 
20200eefd470SAlex Elder 	/* Finish up with the normal image object callback */
20210eefd470SAlex Elder 
20220eefd470SAlex Elder 	rbd_img_obj_callback(obj_request);
20230eefd470SAlex Elder }
20240eefd470SAlex Elder 
20250eefd470SAlex Elder static void
20263d7efd18SAlex Elder rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
20273d7efd18SAlex Elder {
20283d7efd18SAlex Elder 	struct rbd_obj_request *orig_request;
20290eefd470SAlex Elder 	struct ceph_osd_request *osd_req;
20300eefd470SAlex Elder 	struct ceph_osd_client *osdc;
20310eefd470SAlex Elder 	struct rbd_device *rbd_dev;
20323d7efd18SAlex Elder 	struct page **pages;
20333d7efd18SAlex Elder 	int result;
20343d7efd18SAlex Elder 	u64 obj_size;
20353d7efd18SAlex Elder 	u64 xferred;
20363d7efd18SAlex Elder 
20373d7efd18SAlex Elder 	rbd_assert(img_request_child_test(img_request));
20383d7efd18SAlex Elder 
20393d7efd18SAlex Elder 	/* First get what we need from the image request */
20403d7efd18SAlex Elder 
20413d7efd18SAlex Elder 	pages = img_request->copyup_pages;
20423d7efd18SAlex Elder 	rbd_assert(pages != NULL);
20433d7efd18SAlex Elder 	img_request->copyup_pages = NULL;
20443d7efd18SAlex Elder 
20453d7efd18SAlex Elder 	orig_request = img_request->obj_request;
20463d7efd18SAlex Elder 	rbd_assert(orig_request != NULL);
20470eefd470SAlex Elder 	rbd_assert(orig_request->type == OBJ_REQUEST_BIO);
20483d7efd18SAlex Elder 	result = img_request->result;
20493d7efd18SAlex Elder 	obj_size = img_request->length;
20503d7efd18SAlex Elder 	xferred = img_request->xferred;
20513d7efd18SAlex Elder 
20520eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
20530eefd470SAlex Elder 	rbd_assert(rbd_dev);
20540eefd470SAlex Elder 	rbd_assert(obj_size == (u64)1 << rbd_dev->header.obj_order);
20550eefd470SAlex Elder 
20563d7efd18SAlex Elder 	rbd_img_request_put(img_request);
20573d7efd18SAlex Elder 
20580eefd470SAlex Elder 	if (result)
20590eefd470SAlex Elder 		goto out_err;
20603d7efd18SAlex Elder 
20610eefd470SAlex Elder 	/* Allocate the new copyup osd request for the original request */
20623d7efd18SAlex Elder 
20630eefd470SAlex Elder 	result = -ENOMEM;
20640eefd470SAlex Elder 	rbd_assert(!orig_request->osd_req);
20650eefd470SAlex Elder 	osd_req = rbd_osd_req_create_copyup(orig_request);
20660eefd470SAlex Elder 	if (!osd_req)
20670eefd470SAlex Elder 		goto out_err;
20680eefd470SAlex Elder 	orig_request->osd_req = osd_req;
20690eefd470SAlex Elder 	orig_request->copyup_pages = pages;
20703d7efd18SAlex Elder 
20710eefd470SAlex Elder 	/* Initialize the copyup op */
20720eefd470SAlex Elder 
20730eefd470SAlex Elder 	osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
20740eefd470SAlex Elder 	osd_req_op_cls_request_data_pages(osd_req, 0, pages, obj_size, 0,
20750eefd470SAlex Elder 						false, false);
20760eefd470SAlex Elder 
20770eefd470SAlex Elder 	/* Then the original write request op */
20780eefd470SAlex Elder 
20790eefd470SAlex Elder 	osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE,
20800eefd470SAlex Elder 					orig_request->offset,
20810eefd470SAlex Elder 					orig_request->length, 0, 0);
20820eefd470SAlex Elder 	osd_req_op_extent_osd_data_bio(osd_req, 1, orig_request->bio_list,
20830eefd470SAlex Elder 					orig_request->length);
20840eefd470SAlex Elder 
20850eefd470SAlex Elder 	rbd_osd_req_format_write(orig_request);
20860eefd470SAlex Elder 
20870eefd470SAlex Elder 	/* All set, send it off. */
20880eefd470SAlex Elder 
20890eefd470SAlex Elder 	orig_request->callback = rbd_img_obj_copyup_callback;
20900eefd470SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
20910eefd470SAlex Elder 	result = rbd_obj_request_submit(osdc, orig_request);
20920eefd470SAlex Elder 	if (!result)
20930eefd470SAlex Elder 		return;
20940eefd470SAlex Elder out_err:
20950eefd470SAlex Elder 	/* Record the error code and complete the request */
20960eefd470SAlex Elder 
20970eefd470SAlex Elder 	orig_request->result = result;
20980eefd470SAlex Elder 	orig_request->xferred = 0;
20993d7efd18SAlex Elder 	obj_request_done_set(orig_request);
21003d7efd18SAlex Elder 	rbd_obj_request_complete(orig_request);
21013d7efd18SAlex Elder }
21023d7efd18SAlex Elder 
21033d7efd18SAlex Elder /*
21043d7efd18SAlex Elder  * Read from the parent image the range of data that covers the
21053d7efd18SAlex Elder  * entire target of the given object request.  This is used for
21063d7efd18SAlex Elder  * satisfying a layered image write request when the target of an
21073d7efd18SAlex Elder  * object request from the image request does not exist.
21083d7efd18SAlex Elder  *
21093d7efd18SAlex Elder  * A page array big enough to hold the returned data is allocated
21103d7efd18SAlex Elder  * and supplied to rbd_img_request_fill() as the "data descriptor."
21113d7efd18SAlex Elder  * When the read completes, this page array will be transferred to
21123d7efd18SAlex Elder  * the original object request for the copyup operation.
21133d7efd18SAlex Elder  *
21143d7efd18SAlex Elder  * If an error occurs, record it as the result of the original
21153d7efd18SAlex Elder  * object request and mark it done so it gets completed.
21163d7efd18SAlex Elder  */
21173d7efd18SAlex Elder static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
21183d7efd18SAlex Elder {
21193d7efd18SAlex Elder 	struct rbd_img_request *img_request = NULL;
21203d7efd18SAlex Elder 	struct rbd_img_request *parent_request = NULL;
21213d7efd18SAlex Elder 	struct rbd_device *rbd_dev;
21223d7efd18SAlex Elder 	u64 img_offset;
21233d7efd18SAlex Elder 	u64 length;
21243d7efd18SAlex Elder 	struct page **pages = NULL;
21253d7efd18SAlex Elder 	u32 page_count;
21263d7efd18SAlex Elder 	int result;
21273d7efd18SAlex Elder 
21283d7efd18SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
21293d7efd18SAlex Elder 	rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
21303d7efd18SAlex Elder 
21313d7efd18SAlex Elder 	img_request = obj_request->img_request;
21323d7efd18SAlex Elder 	rbd_assert(img_request != NULL);
21333d7efd18SAlex Elder 	rbd_dev = img_request->rbd_dev;
21343d7efd18SAlex Elder 	rbd_assert(rbd_dev->parent != NULL);
21353d7efd18SAlex Elder 
21363d7efd18SAlex Elder 	/*
21370eefd470SAlex Elder 	 * First things first.  The original osd request is of no
21380eefd470SAlex Elder 	 * use to use any more, we'll need a new one that can hold
21390eefd470SAlex Elder 	 * the two ops in a copyup request.  We'll get that later,
21400eefd470SAlex Elder 	 * but for now we can release the old one.
21410eefd470SAlex Elder 	 */
21420eefd470SAlex Elder 	rbd_osd_req_destroy(obj_request->osd_req);
21430eefd470SAlex Elder 	obj_request->osd_req = NULL;
21440eefd470SAlex Elder 
21450eefd470SAlex Elder 	/*
21463d7efd18SAlex Elder 	 * Determine the byte range covered by the object in the
21473d7efd18SAlex Elder 	 * child image to which the original request was to be sent.
21483d7efd18SAlex Elder 	 */
21493d7efd18SAlex Elder 	img_offset = obj_request->img_offset - obj_request->offset;
21503d7efd18SAlex Elder 	length = (u64)1 << rbd_dev->header.obj_order;
21513d7efd18SAlex Elder 
21523d7efd18SAlex Elder 	/*
2153a9e8ba2cSAlex Elder 	 * There is no defined parent data beyond the parent
2154a9e8ba2cSAlex Elder 	 * overlap, so limit what we read at that boundary if
2155a9e8ba2cSAlex Elder 	 * necessary.
2156a9e8ba2cSAlex Elder 	 */
2157a9e8ba2cSAlex Elder 	if (img_offset + length > rbd_dev->parent_overlap) {
2158a9e8ba2cSAlex Elder 		rbd_assert(img_offset < rbd_dev->parent_overlap);
2159a9e8ba2cSAlex Elder 		length = rbd_dev->parent_overlap - img_offset;
2160a9e8ba2cSAlex Elder 	}
2161a9e8ba2cSAlex Elder 
2162a9e8ba2cSAlex Elder 	/*
21633d7efd18SAlex Elder 	 * Allocate a page array big enough to receive the data read
21643d7efd18SAlex Elder 	 * from the parent.
21653d7efd18SAlex Elder 	 */
21663d7efd18SAlex Elder 	page_count = (u32)calc_pages_for(0, length);
21673d7efd18SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
21683d7efd18SAlex Elder 	if (IS_ERR(pages)) {
21693d7efd18SAlex Elder 		result = PTR_ERR(pages);
21703d7efd18SAlex Elder 		pages = NULL;
21713d7efd18SAlex Elder 		goto out_err;
21723d7efd18SAlex Elder 	}
21733d7efd18SAlex Elder 
21743d7efd18SAlex Elder 	result = -ENOMEM;
21753d7efd18SAlex Elder 	parent_request = rbd_img_request_create(rbd_dev->parent,
21763d7efd18SAlex Elder 						img_offset, length,
21773d7efd18SAlex Elder 						false, true);
21783d7efd18SAlex Elder 	if (!parent_request)
21793d7efd18SAlex Elder 		goto out_err;
21803d7efd18SAlex Elder 	rbd_obj_request_get(obj_request);
21813d7efd18SAlex Elder 	parent_request->obj_request = obj_request;
21823d7efd18SAlex Elder 
21833d7efd18SAlex Elder 	result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
21843d7efd18SAlex Elder 	if (result)
21853d7efd18SAlex Elder 		goto out_err;
21863d7efd18SAlex Elder 	parent_request->copyup_pages = pages;
21873d7efd18SAlex Elder 
21883d7efd18SAlex Elder 	parent_request->callback = rbd_img_obj_parent_read_full_callback;
21893d7efd18SAlex Elder 	result = rbd_img_request_submit(parent_request);
21903d7efd18SAlex Elder 	if (!result)
21913d7efd18SAlex Elder 		return 0;
21923d7efd18SAlex Elder 
21933d7efd18SAlex Elder 	parent_request->copyup_pages = NULL;
21943d7efd18SAlex Elder 	parent_request->obj_request = NULL;
21953d7efd18SAlex Elder 	rbd_obj_request_put(obj_request);
21963d7efd18SAlex Elder out_err:
21973d7efd18SAlex Elder 	if (pages)
21983d7efd18SAlex Elder 		ceph_release_page_vector(pages, page_count);
21993d7efd18SAlex Elder 	if (parent_request)
22003d7efd18SAlex Elder 		rbd_img_request_put(parent_request);
22013d7efd18SAlex Elder 	obj_request->result = result;
22023d7efd18SAlex Elder 	obj_request->xferred = 0;
22033d7efd18SAlex Elder 	obj_request_done_set(obj_request);
22043d7efd18SAlex Elder 
22053d7efd18SAlex Elder 	return result;
22063d7efd18SAlex Elder }
22073d7efd18SAlex Elder 
2208c5b5ef6cSAlex Elder static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
2209c5b5ef6cSAlex Elder {
2210c5b5ef6cSAlex Elder 	struct rbd_obj_request *orig_request;
2211c5b5ef6cSAlex Elder 	int result;
2212c5b5ef6cSAlex Elder 
2213c5b5ef6cSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
2214c5b5ef6cSAlex Elder 
2215c5b5ef6cSAlex Elder 	/*
2216c5b5ef6cSAlex Elder 	 * All we need from the object request is the original
2217c5b5ef6cSAlex Elder 	 * request and the result of the STAT op.  Grab those, then
2218c5b5ef6cSAlex Elder 	 * we're done with the request.
2219c5b5ef6cSAlex Elder 	 */
2220c5b5ef6cSAlex Elder 	orig_request = obj_request->obj_request;
2221c5b5ef6cSAlex Elder 	obj_request->obj_request = NULL;
2222c5b5ef6cSAlex Elder 	rbd_assert(orig_request);
2223c5b5ef6cSAlex Elder 	rbd_assert(orig_request->img_request);
2224c5b5ef6cSAlex Elder 
2225c5b5ef6cSAlex Elder 	result = obj_request->result;
2226c5b5ef6cSAlex Elder 	obj_request->result = 0;
2227c5b5ef6cSAlex Elder 
2228c5b5ef6cSAlex Elder 	dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
2229c5b5ef6cSAlex Elder 		obj_request, orig_request, result,
2230c5b5ef6cSAlex Elder 		obj_request->xferred, obj_request->length);
2231c5b5ef6cSAlex Elder 	rbd_obj_request_put(obj_request);
2232c5b5ef6cSAlex Elder 
2233c5b5ef6cSAlex Elder 	rbd_assert(orig_request);
2234c5b5ef6cSAlex Elder 	rbd_assert(orig_request->img_request);
2235c5b5ef6cSAlex Elder 
2236c5b5ef6cSAlex Elder 	/*
2237c5b5ef6cSAlex Elder 	 * Our only purpose here is to determine whether the object
2238c5b5ef6cSAlex Elder 	 * exists, and we don't want to treat the non-existence as
2239c5b5ef6cSAlex Elder 	 * an error.  If something else comes back, transfer the
2240c5b5ef6cSAlex Elder 	 * error to the original request and complete it now.
2241c5b5ef6cSAlex Elder 	 */
2242c5b5ef6cSAlex Elder 	if (!result) {
2243c5b5ef6cSAlex Elder 		obj_request_existence_set(orig_request, true);
2244c5b5ef6cSAlex Elder 	} else if (result == -ENOENT) {
2245c5b5ef6cSAlex Elder 		obj_request_existence_set(orig_request, false);
2246c5b5ef6cSAlex Elder 	} else if (result) {
2247c5b5ef6cSAlex Elder 		orig_request->result = result;
22483d7efd18SAlex Elder 		goto out;
2249c5b5ef6cSAlex Elder 	}
2250c5b5ef6cSAlex Elder 
2251c5b5ef6cSAlex Elder 	/*
2252c5b5ef6cSAlex Elder 	 * Resubmit the original request now that we have recorded
2253c5b5ef6cSAlex Elder 	 * whether the target object exists.
2254c5b5ef6cSAlex Elder 	 */
2255b454e36dSAlex Elder 	orig_request->result = rbd_img_obj_request_submit(orig_request);
22563d7efd18SAlex Elder out:
2257c5b5ef6cSAlex Elder 	if (orig_request->result)
2258c5b5ef6cSAlex Elder 		rbd_obj_request_complete(orig_request);
2259c5b5ef6cSAlex Elder 	rbd_obj_request_put(orig_request);
2260c5b5ef6cSAlex Elder }
2261c5b5ef6cSAlex Elder 
2262c5b5ef6cSAlex Elder static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
2263c5b5ef6cSAlex Elder {
2264c5b5ef6cSAlex Elder 	struct rbd_obj_request *stat_request;
2265c5b5ef6cSAlex Elder 	struct rbd_device *rbd_dev;
2266c5b5ef6cSAlex Elder 	struct ceph_osd_client *osdc;
2267c5b5ef6cSAlex Elder 	struct page **pages = NULL;
2268c5b5ef6cSAlex Elder 	u32 page_count;
2269c5b5ef6cSAlex Elder 	size_t size;
2270c5b5ef6cSAlex Elder 	int ret;
2271c5b5ef6cSAlex Elder 
2272c5b5ef6cSAlex Elder 	/*
2273c5b5ef6cSAlex Elder 	 * The response data for a STAT call consists of:
2274c5b5ef6cSAlex Elder 	 *     le64 length;
2275c5b5ef6cSAlex Elder 	 *     struct {
2276c5b5ef6cSAlex Elder 	 *         le32 tv_sec;
2277c5b5ef6cSAlex Elder 	 *         le32 tv_nsec;
2278c5b5ef6cSAlex Elder 	 *     } mtime;
2279c5b5ef6cSAlex Elder 	 */
2280c5b5ef6cSAlex Elder 	size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
2281c5b5ef6cSAlex Elder 	page_count = (u32)calc_pages_for(0, size);
2282c5b5ef6cSAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2283c5b5ef6cSAlex Elder 	if (IS_ERR(pages))
2284c5b5ef6cSAlex Elder 		return PTR_ERR(pages);
2285c5b5ef6cSAlex Elder 
2286c5b5ef6cSAlex Elder 	ret = -ENOMEM;
2287c5b5ef6cSAlex Elder 	stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
2288c5b5ef6cSAlex Elder 							OBJ_REQUEST_PAGES);
2289c5b5ef6cSAlex Elder 	if (!stat_request)
2290c5b5ef6cSAlex Elder 		goto out;
2291c5b5ef6cSAlex Elder 
2292c5b5ef6cSAlex Elder 	rbd_obj_request_get(obj_request);
2293c5b5ef6cSAlex Elder 	stat_request->obj_request = obj_request;
2294c5b5ef6cSAlex Elder 	stat_request->pages = pages;
2295c5b5ef6cSAlex Elder 	stat_request->page_count = page_count;
2296c5b5ef6cSAlex Elder 
2297c5b5ef6cSAlex Elder 	rbd_assert(obj_request->img_request);
2298c5b5ef6cSAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
2299c5b5ef6cSAlex Elder 	stat_request->osd_req = rbd_osd_req_create(rbd_dev, false,
2300c5b5ef6cSAlex Elder 						stat_request);
2301c5b5ef6cSAlex Elder 	if (!stat_request->osd_req)
2302c5b5ef6cSAlex Elder 		goto out;
2303c5b5ef6cSAlex Elder 	stat_request->callback = rbd_img_obj_exists_callback;
2304c5b5ef6cSAlex Elder 
2305c5b5ef6cSAlex Elder 	osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT);
2306c5b5ef6cSAlex Elder 	osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
2307c5b5ef6cSAlex Elder 					false, false);
23089d4df01fSAlex Elder 	rbd_osd_req_format_read(stat_request);
2309c5b5ef6cSAlex Elder 
2310c5b5ef6cSAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
2311c5b5ef6cSAlex Elder 	ret = rbd_obj_request_submit(osdc, stat_request);
2312c5b5ef6cSAlex Elder out:
2313c5b5ef6cSAlex Elder 	if (ret)
2314c5b5ef6cSAlex Elder 		rbd_obj_request_put(obj_request);
2315c5b5ef6cSAlex Elder 
2316c5b5ef6cSAlex Elder 	return ret;
2317c5b5ef6cSAlex Elder }
2318c5b5ef6cSAlex Elder 
2319b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
2320b454e36dSAlex Elder {
2321b454e36dSAlex Elder 	struct rbd_img_request *img_request;
2322a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev;
23233d7efd18SAlex Elder 	bool known;
2324b454e36dSAlex Elder 
2325b454e36dSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
2326b454e36dSAlex Elder 
2327b454e36dSAlex Elder 	img_request = obj_request->img_request;
2328b454e36dSAlex Elder 	rbd_assert(img_request);
2329a9e8ba2cSAlex Elder 	rbd_dev = img_request->rbd_dev;
2330b454e36dSAlex Elder 
2331b454e36dSAlex Elder 	/*
2332a9e8ba2cSAlex Elder 	 * Only writes to layered images need special handling.
2333a9e8ba2cSAlex Elder 	 * Reads and non-layered writes are simple object requests.
2334a9e8ba2cSAlex Elder 	 * Layered writes that start beyond the end of the overlap
2335a9e8ba2cSAlex Elder 	 * with the parent have no parent data, so they too are
2336a9e8ba2cSAlex Elder 	 * simple object requests.  Finally, if the target object is
2337a9e8ba2cSAlex Elder 	 * known to already exist, its parent data has already been
2338a9e8ba2cSAlex Elder 	 * copied, so a write to the object can also be handled as a
2339a9e8ba2cSAlex Elder 	 * simple object request.
2340b454e36dSAlex Elder 	 */
2341b454e36dSAlex Elder 	if (!img_request_write_test(img_request) ||
2342b454e36dSAlex Elder 		!img_request_layered_test(img_request) ||
2343a9e8ba2cSAlex Elder 		rbd_dev->parent_overlap <= obj_request->img_offset ||
23443d7efd18SAlex Elder 		((known = obj_request_known_test(obj_request)) &&
23453d7efd18SAlex Elder 			obj_request_exists_test(obj_request))) {
2346b454e36dSAlex Elder 
2347b454e36dSAlex Elder 		struct rbd_device *rbd_dev;
2348b454e36dSAlex Elder 		struct ceph_osd_client *osdc;
2349b454e36dSAlex Elder 
2350b454e36dSAlex Elder 		rbd_dev = obj_request->img_request->rbd_dev;
2351b454e36dSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
2352b454e36dSAlex Elder 
2353b454e36dSAlex Elder 		return rbd_obj_request_submit(osdc, obj_request);
2354b454e36dSAlex Elder 	}
2355b454e36dSAlex Elder 
2356b454e36dSAlex Elder 	/*
23573d7efd18SAlex Elder 	 * It's a layered write.  The target object might exist but
23583d7efd18SAlex Elder 	 * we may not know that yet.  If we know it doesn't exist,
23593d7efd18SAlex Elder 	 * start by reading the data for the full target object from
23603d7efd18SAlex Elder 	 * the parent so we can use it for a copyup to the target.
2361b454e36dSAlex Elder 	 */
23623d7efd18SAlex Elder 	if (known)
23633d7efd18SAlex Elder 		return rbd_img_obj_parent_read_full(obj_request);
23643d7efd18SAlex Elder 
23653d7efd18SAlex Elder 	/* We don't know whether the target exists.  Go find out. */
2366b454e36dSAlex Elder 
2367b454e36dSAlex Elder 	return rbd_img_obj_exists_submit(obj_request);
2368b454e36dSAlex Elder }
2369b454e36dSAlex Elder 
2370bf0d5f50SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request)
2371bf0d5f50SAlex Elder {
2372bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
237346faeed4SAlex Elder 	struct rbd_obj_request *next_obj_request;
2374bf0d5f50SAlex Elder 
237537206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
237646faeed4SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
2377bf0d5f50SAlex Elder 		int ret;
2378bf0d5f50SAlex Elder 
2379b454e36dSAlex Elder 		ret = rbd_img_obj_request_submit(obj_request);
2380bf0d5f50SAlex Elder 		if (ret)
2381bf0d5f50SAlex Elder 			return ret;
2382bf0d5f50SAlex Elder 	}
2383bf0d5f50SAlex Elder 
2384bf0d5f50SAlex Elder 	return 0;
2385bf0d5f50SAlex Elder }
2386bf0d5f50SAlex Elder 
23878b3e1a56SAlex Elder static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
23888b3e1a56SAlex Elder {
23898b3e1a56SAlex Elder 	struct rbd_obj_request *obj_request;
2390a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev;
2391a9e8ba2cSAlex Elder 	u64 obj_end;
23928b3e1a56SAlex Elder 
23938b3e1a56SAlex Elder 	rbd_assert(img_request_child_test(img_request));
23948b3e1a56SAlex Elder 
23958b3e1a56SAlex Elder 	obj_request = img_request->obj_request;
2396a9e8ba2cSAlex Elder 	rbd_assert(obj_request);
2397a9e8ba2cSAlex Elder 	rbd_assert(obj_request->img_request);
23988b3e1a56SAlex Elder 
2399a9e8ba2cSAlex Elder 	obj_request->result = img_request->result;
2400a9e8ba2cSAlex Elder 	if (obj_request->result)
2401a9e8ba2cSAlex Elder 		goto out;
2402a9e8ba2cSAlex Elder 
2403a9e8ba2cSAlex Elder 	/*
2404a9e8ba2cSAlex Elder 	 * We need to zero anything beyond the parent overlap
2405a9e8ba2cSAlex Elder 	 * boundary.  Since rbd_img_obj_request_read_callback()
2406a9e8ba2cSAlex Elder 	 * will zero anything beyond the end of a short read, an
2407a9e8ba2cSAlex Elder 	 * easy way to do this is to pretend the data from the
2408a9e8ba2cSAlex Elder 	 * parent came up short--ending at the overlap boundary.
2409a9e8ba2cSAlex Elder 	 */
2410a9e8ba2cSAlex Elder 	rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
2411a9e8ba2cSAlex Elder 	obj_end = obj_request->img_offset + obj_request->length;
2412a9e8ba2cSAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
2413a9e8ba2cSAlex Elder 	if (obj_end > rbd_dev->parent_overlap) {
2414a9e8ba2cSAlex Elder 		u64 xferred = 0;
2415a9e8ba2cSAlex Elder 
2416a9e8ba2cSAlex Elder 		if (obj_request->img_offset < rbd_dev->parent_overlap)
2417a9e8ba2cSAlex Elder 			xferred = rbd_dev->parent_overlap -
2418a9e8ba2cSAlex Elder 					obj_request->img_offset;
2419a9e8ba2cSAlex Elder 
2420a9e8ba2cSAlex Elder 		obj_request->xferred = min(img_request->xferred, xferred);
2421a9e8ba2cSAlex Elder 	} else {
2422a9e8ba2cSAlex Elder 		obj_request->xferred = img_request->xferred;
2423a9e8ba2cSAlex Elder 	}
2424a9e8ba2cSAlex Elder out:
24258b3e1a56SAlex Elder 	rbd_img_obj_request_read_callback(obj_request);
24268b3e1a56SAlex Elder 	rbd_obj_request_complete(obj_request);
24278b3e1a56SAlex Elder }
24288b3e1a56SAlex Elder 
24298b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
24308b3e1a56SAlex Elder {
24318b3e1a56SAlex Elder 	struct rbd_device *rbd_dev;
24328b3e1a56SAlex Elder 	struct rbd_img_request *img_request;
24338b3e1a56SAlex Elder 	int result;
24348b3e1a56SAlex Elder 
24358b3e1a56SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
24368b3e1a56SAlex Elder 	rbd_assert(obj_request->img_request != NULL);
24378b3e1a56SAlex Elder 	rbd_assert(obj_request->result == (s32) -ENOENT);
24388b3e1a56SAlex Elder 	rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
24398b3e1a56SAlex Elder 
24408b3e1a56SAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
24418b3e1a56SAlex Elder 	rbd_assert(rbd_dev->parent != NULL);
24428b3e1a56SAlex Elder 	/* rbd_read_finish(obj_request, obj_request->length); */
24438b3e1a56SAlex Elder 	img_request = rbd_img_request_create(rbd_dev->parent,
24448b3e1a56SAlex Elder 						obj_request->img_offset,
24458b3e1a56SAlex Elder 						obj_request->length,
24468b3e1a56SAlex Elder 						false, true);
24478b3e1a56SAlex Elder 	result = -ENOMEM;
24488b3e1a56SAlex Elder 	if (!img_request)
24498b3e1a56SAlex Elder 		goto out_err;
24508b3e1a56SAlex Elder 
24518b3e1a56SAlex Elder 	rbd_obj_request_get(obj_request);
24528b3e1a56SAlex Elder 	img_request->obj_request = obj_request;
24538b3e1a56SAlex Elder 
2454f1a4739fSAlex Elder 	result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
2455f1a4739fSAlex Elder 					obj_request->bio_list);
24568b3e1a56SAlex Elder 	if (result)
24578b3e1a56SAlex Elder 		goto out_err;
24588b3e1a56SAlex Elder 
24598b3e1a56SAlex Elder 	img_request->callback = rbd_img_parent_read_callback;
24608b3e1a56SAlex Elder 	result = rbd_img_request_submit(img_request);
24618b3e1a56SAlex Elder 	if (result)
24628b3e1a56SAlex Elder 		goto out_err;
24638b3e1a56SAlex Elder 
24648b3e1a56SAlex Elder 	return;
24658b3e1a56SAlex Elder out_err:
24668b3e1a56SAlex Elder 	if (img_request)
24678b3e1a56SAlex Elder 		rbd_img_request_put(img_request);
24688b3e1a56SAlex Elder 	obj_request->result = result;
24698b3e1a56SAlex Elder 	obj_request->xferred = 0;
24708b3e1a56SAlex Elder 	obj_request_done_set(obj_request);
24718b3e1a56SAlex Elder }
24728b3e1a56SAlex Elder 
2473cf81b60eSAlex Elder static int rbd_obj_notify_ack(struct rbd_device *rbd_dev,
2474b8d70035SAlex Elder 				   u64 ver, u64 notify_id)
2475b8d70035SAlex Elder {
2476b8d70035SAlex Elder 	struct rbd_obj_request *obj_request;
24772169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2478b8d70035SAlex Elder 	int ret;
2479b8d70035SAlex Elder 
2480b8d70035SAlex Elder 	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
2481b8d70035SAlex Elder 							OBJ_REQUEST_NODATA);
2482b8d70035SAlex Elder 	if (!obj_request)
2483b8d70035SAlex Elder 		return -ENOMEM;
2484b8d70035SAlex Elder 
2485b8d70035SAlex Elder 	ret = -ENOMEM;
2486430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
2487b8d70035SAlex Elder 	if (!obj_request->osd_req)
2488b8d70035SAlex Elder 		goto out;
24892169238dSAlex Elder 	obj_request->callback = rbd_obj_request_put;
2490b8d70035SAlex Elder 
2491c99d2d4aSAlex Elder 	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
2492c99d2d4aSAlex Elder 					notify_id, ver, 0);
24939d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
2494430c28c3SAlex Elder 
2495b8d70035SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
2496b8d70035SAlex Elder out:
2497cf81b60eSAlex Elder 	if (ret)
2498b8d70035SAlex Elder 		rbd_obj_request_put(obj_request);
2499b8d70035SAlex Elder 
2500b8d70035SAlex Elder 	return ret;
2501b8d70035SAlex Elder }
2502b8d70035SAlex Elder 
2503b8d70035SAlex Elder static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
2504b8d70035SAlex Elder {
2505b8d70035SAlex Elder 	struct rbd_device *rbd_dev = (struct rbd_device *)data;
2506b8d70035SAlex Elder 	u64 hver;
2507b8d70035SAlex Elder 
2508b8d70035SAlex Elder 	if (!rbd_dev)
2509b8d70035SAlex Elder 		return;
2510b8d70035SAlex Elder 
251137206ee5SAlex Elder 	dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
2512b8d70035SAlex Elder 		rbd_dev->header_name, (unsigned long long) notify_id,
2513b8d70035SAlex Elder 		(unsigned int) opcode);
2514522a0cc0SAlex Elder 	(void)rbd_dev_refresh(rbd_dev, &hver);
2515b8d70035SAlex Elder 
2516cf81b60eSAlex Elder 	rbd_obj_notify_ack(rbd_dev, hver, notify_id);
2517b8d70035SAlex Elder }
2518b8d70035SAlex Elder 
25199969ebc5SAlex Elder /*
25209969ebc5SAlex Elder  * Request sync osd watch/unwatch.  The value of "start" determines
25219969ebc5SAlex Elder  * whether a watch request is being initiated or torn down.
25229969ebc5SAlex Elder  */
25239969ebc5SAlex Elder static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
25249969ebc5SAlex Elder {
25259969ebc5SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
25269969ebc5SAlex Elder 	struct rbd_obj_request *obj_request;
25279969ebc5SAlex Elder 	int ret;
25289969ebc5SAlex Elder 
25299969ebc5SAlex Elder 	rbd_assert(start ^ !!rbd_dev->watch_event);
25309969ebc5SAlex Elder 	rbd_assert(start ^ !!rbd_dev->watch_request);
25319969ebc5SAlex Elder 
25329969ebc5SAlex Elder 	if (start) {
25333c663bbdSAlex Elder 		ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
25349969ebc5SAlex Elder 						&rbd_dev->watch_event);
25359969ebc5SAlex Elder 		if (ret < 0)
25369969ebc5SAlex Elder 			return ret;
25378eb87565SAlex Elder 		rbd_assert(rbd_dev->watch_event != NULL);
25389969ebc5SAlex Elder 	}
25399969ebc5SAlex Elder 
25409969ebc5SAlex Elder 	ret = -ENOMEM;
25419969ebc5SAlex Elder 	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
25429969ebc5SAlex Elder 							OBJ_REQUEST_NODATA);
25439969ebc5SAlex Elder 	if (!obj_request)
25449969ebc5SAlex Elder 		goto out_cancel;
25459969ebc5SAlex Elder 
2546430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request);
2547430c28c3SAlex Elder 	if (!obj_request->osd_req)
2548430c28c3SAlex Elder 		goto out_cancel;
2549430c28c3SAlex Elder 
25508eb87565SAlex Elder 	if (start)
2551975241afSAlex Elder 		ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
25528eb87565SAlex Elder 	else
25536977c3f9SAlex Elder 		ceph_osdc_unregister_linger_request(osdc,
2554975241afSAlex Elder 					rbd_dev->watch_request->osd_req);
25552169238dSAlex Elder 
25562169238dSAlex Elder 	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
25572169238dSAlex Elder 				rbd_dev->watch_event->cookie,
25582169238dSAlex Elder 				rbd_dev->header.obj_version, start);
25599d4df01fSAlex Elder 	rbd_osd_req_format_write(obj_request);
25602169238dSAlex Elder 
25619969ebc5SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
25629969ebc5SAlex Elder 	if (ret)
25639969ebc5SAlex Elder 		goto out_cancel;
25649969ebc5SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
25659969ebc5SAlex Elder 	if (ret)
25669969ebc5SAlex Elder 		goto out_cancel;
25679969ebc5SAlex Elder 	ret = obj_request->result;
25689969ebc5SAlex Elder 	if (ret)
25699969ebc5SAlex Elder 		goto out_cancel;
25709969ebc5SAlex Elder 
25718eb87565SAlex Elder 	/*
25728eb87565SAlex Elder 	 * A watch request is set to linger, so the underlying osd
25738eb87565SAlex Elder 	 * request won't go away until we unregister it.  We retain
25748eb87565SAlex Elder 	 * a pointer to the object request during that time (in
25758eb87565SAlex Elder 	 * rbd_dev->watch_request), so we'll keep a reference to
25768eb87565SAlex Elder 	 * it.  We'll drop that reference (below) after we've
25778eb87565SAlex Elder 	 * unregistered it.
25788eb87565SAlex Elder 	 */
25798eb87565SAlex Elder 	if (start) {
25808eb87565SAlex Elder 		rbd_dev->watch_request = obj_request;
25818eb87565SAlex Elder 
25828eb87565SAlex Elder 		return 0;
25838eb87565SAlex Elder 	}
25848eb87565SAlex Elder 
25858eb87565SAlex Elder 	/* We have successfully torn down the watch request */
25868eb87565SAlex Elder 
25878eb87565SAlex Elder 	rbd_obj_request_put(rbd_dev->watch_request);
25888eb87565SAlex Elder 	rbd_dev->watch_request = NULL;
25899969ebc5SAlex Elder out_cancel:
25909969ebc5SAlex Elder 	/* Cancel the event if we're tearing down, or on error */
25919969ebc5SAlex Elder 	ceph_osdc_cancel_event(rbd_dev->watch_event);
25929969ebc5SAlex Elder 	rbd_dev->watch_event = NULL;
25939969ebc5SAlex Elder 	if (obj_request)
25949969ebc5SAlex Elder 		rbd_obj_request_put(obj_request);
25959969ebc5SAlex Elder 
25969969ebc5SAlex Elder 	return ret;
25979969ebc5SAlex Elder }
25989969ebc5SAlex Elder 
259936be9a76SAlex Elder /*
2600f40eb349SAlex Elder  * Synchronous osd object method call.  Returns the number of bytes
2601f40eb349SAlex Elder  * returned in the outbound buffer, or a negative error code.
260236be9a76SAlex Elder  */
260336be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
260436be9a76SAlex Elder 			     const char *object_name,
260536be9a76SAlex Elder 			     const char *class_name,
260636be9a76SAlex Elder 			     const char *method_name,
26074157976bSAlex Elder 			     const void *outbound,
260836be9a76SAlex Elder 			     size_t outbound_size,
26094157976bSAlex Elder 			     void *inbound,
261036be9a76SAlex Elder 			     size_t inbound_size,
261136be9a76SAlex Elder 			     u64 *version)
261236be9a76SAlex Elder {
26132169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
261436be9a76SAlex Elder 	struct rbd_obj_request *obj_request;
261536be9a76SAlex Elder 	struct page **pages;
261636be9a76SAlex Elder 	u32 page_count;
261736be9a76SAlex Elder 	int ret;
261836be9a76SAlex Elder 
261936be9a76SAlex Elder 	/*
26206010a451SAlex Elder 	 * Method calls are ultimately read operations.  The result
26216010a451SAlex Elder 	 * should placed into the inbound buffer provided.  They
26226010a451SAlex Elder 	 * also supply outbound data--parameters for the object
26236010a451SAlex Elder 	 * method.  Currently if this is present it will be a
26246010a451SAlex Elder 	 * snapshot id.
262536be9a76SAlex Elder 	 */
262636be9a76SAlex Elder 	page_count = (u32)calc_pages_for(0, inbound_size);
262736be9a76SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
262836be9a76SAlex Elder 	if (IS_ERR(pages))
262936be9a76SAlex Elder 		return PTR_ERR(pages);
263036be9a76SAlex Elder 
263136be9a76SAlex Elder 	ret = -ENOMEM;
26326010a451SAlex Elder 	obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
263336be9a76SAlex Elder 							OBJ_REQUEST_PAGES);
263436be9a76SAlex Elder 	if (!obj_request)
263536be9a76SAlex Elder 		goto out;
263636be9a76SAlex Elder 
263736be9a76SAlex Elder 	obj_request->pages = pages;
263836be9a76SAlex Elder 	obj_request->page_count = page_count;
263936be9a76SAlex Elder 
2640430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
264136be9a76SAlex Elder 	if (!obj_request->osd_req)
264236be9a76SAlex Elder 		goto out;
264336be9a76SAlex Elder 
2644c99d2d4aSAlex Elder 	osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
264504017e29SAlex Elder 					class_name, method_name);
264604017e29SAlex Elder 	if (outbound_size) {
264704017e29SAlex Elder 		struct ceph_pagelist *pagelist;
264804017e29SAlex Elder 
264904017e29SAlex Elder 		pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
265004017e29SAlex Elder 		if (!pagelist)
265104017e29SAlex Elder 			goto out;
265204017e29SAlex Elder 
265304017e29SAlex Elder 		ceph_pagelist_init(pagelist);
265404017e29SAlex Elder 		ceph_pagelist_append(pagelist, outbound, outbound_size);
265504017e29SAlex Elder 		osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
265604017e29SAlex Elder 						pagelist);
265704017e29SAlex Elder 	}
2658a4ce40a9SAlex Elder 	osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
2659a4ce40a9SAlex Elder 					obj_request->pages, inbound_size,
266044cd188dSAlex Elder 					0, false, false);
26619d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
2662430c28c3SAlex Elder 
266336be9a76SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
266436be9a76SAlex Elder 	if (ret)
266536be9a76SAlex Elder 		goto out;
266636be9a76SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
266736be9a76SAlex Elder 	if (ret)
266836be9a76SAlex Elder 		goto out;
266936be9a76SAlex Elder 
267036be9a76SAlex Elder 	ret = obj_request->result;
267136be9a76SAlex Elder 	if (ret < 0)
267236be9a76SAlex Elder 		goto out;
267357385b51SAlex Elder 
267457385b51SAlex Elder 	rbd_assert(obj_request->xferred < (u64)INT_MAX);
267557385b51SAlex Elder 	ret = (int)obj_request->xferred;
2676903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
267736be9a76SAlex Elder 	if (version)
267836be9a76SAlex Elder 		*version = obj_request->version;
267936be9a76SAlex Elder out:
268036be9a76SAlex Elder 	if (obj_request)
268136be9a76SAlex Elder 		rbd_obj_request_put(obj_request);
268236be9a76SAlex Elder 	else
268336be9a76SAlex Elder 		ceph_release_page_vector(pages, page_count);
268436be9a76SAlex Elder 
268536be9a76SAlex Elder 	return ret;
268636be9a76SAlex Elder }
268736be9a76SAlex Elder 
2688bf0d5f50SAlex Elder static void rbd_request_fn(struct request_queue *q)
2689cc344fa1SAlex Elder 		__releases(q->queue_lock) __acquires(q->queue_lock)
2690bf0d5f50SAlex Elder {
2691bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = q->queuedata;
2692bf0d5f50SAlex Elder 	bool read_only = rbd_dev->mapping.read_only;
2693bf0d5f50SAlex Elder 	struct request *rq;
2694bf0d5f50SAlex Elder 	int result;
2695bf0d5f50SAlex Elder 
2696bf0d5f50SAlex Elder 	while ((rq = blk_fetch_request(q))) {
2697bf0d5f50SAlex Elder 		bool write_request = rq_data_dir(rq) == WRITE;
2698bf0d5f50SAlex Elder 		struct rbd_img_request *img_request;
2699bf0d5f50SAlex Elder 		u64 offset;
2700bf0d5f50SAlex Elder 		u64 length;
2701bf0d5f50SAlex Elder 
2702bf0d5f50SAlex Elder 		/* Ignore any non-FS requests that filter through. */
2703bf0d5f50SAlex Elder 
2704bf0d5f50SAlex Elder 		if (rq->cmd_type != REQ_TYPE_FS) {
27054dda41d3SAlex Elder 			dout("%s: non-fs request type %d\n", __func__,
27064dda41d3SAlex Elder 				(int) rq->cmd_type);
27074dda41d3SAlex Elder 			__blk_end_request_all(rq, 0);
27084dda41d3SAlex Elder 			continue;
27094dda41d3SAlex Elder 		}
27104dda41d3SAlex Elder 
27114dda41d3SAlex Elder 		/* Ignore/skip any zero-length requests */
27124dda41d3SAlex Elder 
27134dda41d3SAlex Elder 		offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
27144dda41d3SAlex Elder 		length = (u64) blk_rq_bytes(rq);
27154dda41d3SAlex Elder 
27164dda41d3SAlex Elder 		if (!length) {
27174dda41d3SAlex Elder 			dout("%s: zero-length request\n", __func__);
2718bf0d5f50SAlex Elder 			__blk_end_request_all(rq, 0);
2719bf0d5f50SAlex Elder 			continue;
2720bf0d5f50SAlex Elder 		}
2721bf0d5f50SAlex Elder 
2722bf0d5f50SAlex Elder 		spin_unlock_irq(q->queue_lock);
2723bf0d5f50SAlex Elder 
2724bf0d5f50SAlex Elder 		/* Disallow writes to a read-only device */
2725bf0d5f50SAlex Elder 
2726bf0d5f50SAlex Elder 		if (write_request) {
2727bf0d5f50SAlex Elder 			result = -EROFS;
2728bf0d5f50SAlex Elder 			if (read_only)
2729bf0d5f50SAlex Elder 				goto end_request;
2730bf0d5f50SAlex Elder 			rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
2731bf0d5f50SAlex Elder 		}
2732bf0d5f50SAlex Elder 
27336d292906SAlex Elder 		/*
27346d292906SAlex Elder 		 * Quit early if the mapped snapshot no longer
27356d292906SAlex Elder 		 * exists.  It's still possible the snapshot will
27366d292906SAlex Elder 		 * have disappeared by the time our request arrives
27376d292906SAlex Elder 		 * at the osd, but there's no sense in sending it if
27386d292906SAlex Elder 		 * we already know.
27396d292906SAlex Elder 		 */
27406d292906SAlex Elder 		if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
2741bf0d5f50SAlex Elder 			dout("request for non-existent snapshot");
2742bf0d5f50SAlex Elder 			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
2743bf0d5f50SAlex Elder 			result = -ENXIO;
2744bf0d5f50SAlex Elder 			goto end_request;
2745bf0d5f50SAlex Elder 		}
2746bf0d5f50SAlex Elder 
2747bf0d5f50SAlex Elder 		result = -EINVAL;
2748c0cd10dbSAlex Elder 		if (offset && length > U64_MAX - offset + 1) {
2749c0cd10dbSAlex Elder 			rbd_warn(rbd_dev, "bad request range (%llu~%llu)\n",
2750c0cd10dbSAlex Elder 				offset, length);
2751bf0d5f50SAlex Elder 			goto end_request;	/* Shouldn't happen */
2752c0cd10dbSAlex Elder 		}
2753bf0d5f50SAlex Elder 
2754bf0d5f50SAlex Elder 		result = -ENOMEM;
2755bf0d5f50SAlex Elder 		img_request = rbd_img_request_create(rbd_dev, offset, length,
27569849e986SAlex Elder 							write_request, false);
2757bf0d5f50SAlex Elder 		if (!img_request)
2758bf0d5f50SAlex Elder 			goto end_request;
2759bf0d5f50SAlex Elder 
2760bf0d5f50SAlex Elder 		img_request->rq = rq;
2761bf0d5f50SAlex Elder 
2762f1a4739fSAlex Elder 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
2763f1a4739fSAlex Elder 						rq->bio);
2764bf0d5f50SAlex Elder 		if (!result)
2765bf0d5f50SAlex Elder 			result = rbd_img_request_submit(img_request);
2766bf0d5f50SAlex Elder 		if (result)
2767bf0d5f50SAlex Elder 			rbd_img_request_put(img_request);
2768bf0d5f50SAlex Elder end_request:
2769bf0d5f50SAlex Elder 		spin_lock_irq(q->queue_lock);
2770bf0d5f50SAlex Elder 		if (result < 0) {
27717da22d29SAlex Elder 			rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
27727da22d29SAlex Elder 				write_request ? "write" : "read",
27737da22d29SAlex Elder 				length, offset, result);
27747da22d29SAlex Elder 
2775bf0d5f50SAlex Elder 			__blk_end_request_all(rq, result);
2776bf0d5f50SAlex Elder 		}
2777bf0d5f50SAlex Elder 	}
2778bf0d5f50SAlex Elder }
2779bf0d5f50SAlex Elder 
2780602adf40SYehuda Sadeh /*
2781602adf40SYehuda Sadeh  * a queue callback. Makes sure that we don't create a bio that spans across
2782602adf40SYehuda Sadeh  * multiple osd objects. One exception would be with a single page bios,
2783f7760dadSAlex Elder  * which we handle later at bio_chain_clone_range()
2784602adf40SYehuda Sadeh  */
2785602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
2786602adf40SYehuda Sadeh 			  struct bio_vec *bvec)
2787602adf40SYehuda Sadeh {
2788602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
2789e5cfeed2SAlex Elder 	sector_t sector_offset;
2790e5cfeed2SAlex Elder 	sector_t sectors_per_obj;
2791e5cfeed2SAlex Elder 	sector_t obj_sector_offset;
2792e5cfeed2SAlex Elder 	int ret;
2793602adf40SYehuda Sadeh 
2794e5cfeed2SAlex Elder 	/*
2795e5cfeed2SAlex Elder 	 * Find how far into its rbd object the partition-relative
2796e5cfeed2SAlex Elder 	 * bio start sector is to offset relative to the enclosing
2797e5cfeed2SAlex Elder 	 * device.
2798e5cfeed2SAlex Elder 	 */
2799e5cfeed2SAlex Elder 	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
2800e5cfeed2SAlex Elder 	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
2801e5cfeed2SAlex Elder 	obj_sector_offset = sector_offset & (sectors_per_obj - 1);
2802593a9e7bSAlex Elder 
2803e5cfeed2SAlex Elder 	/*
2804e5cfeed2SAlex Elder 	 * Compute the number of bytes from that offset to the end
2805e5cfeed2SAlex Elder 	 * of the object.  Account for what's already used by the bio.
2806e5cfeed2SAlex Elder 	 */
2807e5cfeed2SAlex Elder 	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
2808e5cfeed2SAlex Elder 	if (ret > bmd->bi_size)
2809e5cfeed2SAlex Elder 		ret -= bmd->bi_size;
2810e5cfeed2SAlex Elder 	else
2811e5cfeed2SAlex Elder 		ret = 0;
2812e5cfeed2SAlex Elder 
2813e5cfeed2SAlex Elder 	/*
2814e5cfeed2SAlex Elder 	 * Don't send back more than was asked for.  And if the bio
2815e5cfeed2SAlex Elder 	 * was empty, let the whole thing through because:  "Note
2816e5cfeed2SAlex Elder 	 * that a block device *must* allow a single page to be
2817e5cfeed2SAlex Elder 	 * added to an empty bio."
2818e5cfeed2SAlex Elder 	 */
2819e5cfeed2SAlex Elder 	rbd_assert(bvec->bv_len <= PAGE_SIZE);
2820e5cfeed2SAlex Elder 	if (ret > (int) bvec->bv_len || !bmd->bi_size)
2821e5cfeed2SAlex Elder 		ret = (int) bvec->bv_len;
2822e5cfeed2SAlex Elder 
2823e5cfeed2SAlex Elder 	return ret;
2824602adf40SYehuda Sadeh }
2825602adf40SYehuda Sadeh 
2826602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
2827602adf40SYehuda Sadeh {
2828602adf40SYehuda Sadeh 	struct gendisk *disk = rbd_dev->disk;
2829602adf40SYehuda Sadeh 
2830602adf40SYehuda Sadeh 	if (!disk)
2831602adf40SYehuda Sadeh 		return;
2832602adf40SYehuda Sadeh 
2833a0cab924SAlex Elder 	rbd_dev->disk = NULL;
2834a0cab924SAlex Elder 	if (disk->flags & GENHD_FL_UP) {
2835602adf40SYehuda Sadeh 		del_gendisk(disk);
2836602adf40SYehuda Sadeh 		if (disk->queue)
2837602adf40SYehuda Sadeh 			blk_cleanup_queue(disk->queue);
2838a0cab924SAlex Elder 	}
2839602adf40SYehuda Sadeh 	put_disk(disk);
2840602adf40SYehuda Sadeh }
2841602adf40SYehuda Sadeh 
2842788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
2843788e2df3SAlex Elder 				const char *object_name,
2844788e2df3SAlex Elder 				u64 offset, u64 length,
284580ef15bfSAlex Elder 				void *buf, u64 *version)
2846788e2df3SAlex Elder 
2847788e2df3SAlex Elder {
28482169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2849788e2df3SAlex Elder 	struct rbd_obj_request *obj_request;
2850788e2df3SAlex Elder 	struct page **pages = NULL;
2851788e2df3SAlex Elder 	u32 page_count;
28521ceae7efSAlex Elder 	size_t size;
2853788e2df3SAlex Elder 	int ret;
2854788e2df3SAlex Elder 
2855788e2df3SAlex Elder 	page_count = (u32) calc_pages_for(offset, length);
2856788e2df3SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2857788e2df3SAlex Elder 	if (IS_ERR(pages))
2858788e2df3SAlex Elder 		ret = PTR_ERR(pages);
2859788e2df3SAlex Elder 
2860788e2df3SAlex Elder 	ret = -ENOMEM;
2861788e2df3SAlex Elder 	obj_request = rbd_obj_request_create(object_name, offset, length,
2862788e2df3SAlex Elder 							OBJ_REQUEST_PAGES);
2863788e2df3SAlex Elder 	if (!obj_request)
2864788e2df3SAlex Elder 		goto out;
2865788e2df3SAlex Elder 
2866788e2df3SAlex Elder 	obj_request->pages = pages;
2867788e2df3SAlex Elder 	obj_request->page_count = page_count;
2868788e2df3SAlex Elder 
2869430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
2870788e2df3SAlex Elder 	if (!obj_request->osd_req)
2871788e2df3SAlex Elder 		goto out;
2872788e2df3SAlex Elder 
2873c99d2d4aSAlex Elder 	osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
2874c99d2d4aSAlex Elder 					offset, length, 0, 0);
2875406e2c9fSAlex Elder 	osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
2876a4ce40a9SAlex Elder 					obj_request->pages,
287744cd188dSAlex Elder 					obj_request->length,
287844cd188dSAlex Elder 					obj_request->offset & ~PAGE_MASK,
287944cd188dSAlex Elder 					false, false);
28809d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
2881430c28c3SAlex Elder 
2882788e2df3SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
2883788e2df3SAlex Elder 	if (ret)
2884788e2df3SAlex Elder 		goto out;
2885788e2df3SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
2886788e2df3SAlex Elder 	if (ret)
2887788e2df3SAlex Elder 		goto out;
2888788e2df3SAlex Elder 
2889788e2df3SAlex Elder 	ret = obj_request->result;
2890788e2df3SAlex Elder 	if (ret < 0)
2891788e2df3SAlex Elder 		goto out;
28921ceae7efSAlex Elder 
28931ceae7efSAlex Elder 	rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
28941ceae7efSAlex Elder 	size = (size_t) obj_request->xferred;
2895903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, buf, 0, size);
289623ed6e13SAlex Elder 	rbd_assert(size <= (size_t) INT_MAX);
289723ed6e13SAlex Elder 	ret = (int) size;
2898788e2df3SAlex Elder 	if (version)
2899788e2df3SAlex Elder 		*version = obj_request->version;
2900788e2df3SAlex Elder out:
2901788e2df3SAlex Elder 	if (obj_request)
2902788e2df3SAlex Elder 		rbd_obj_request_put(obj_request);
2903788e2df3SAlex Elder 	else
2904788e2df3SAlex Elder 		ceph_release_page_vector(pages, page_count);
2905788e2df3SAlex Elder 
2906788e2df3SAlex Elder 	return ret;
2907788e2df3SAlex Elder }
2908788e2df3SAlex Elder 
2909602adf40SYehuda Sadeh /*
29104156d998SAlex Elder  * Read the complete header for the given rbd device.
29114156d998SAlex Elder  *
29124156d998SAlex Elder  * Returns a pointer to a dynamically-allocated buffer containing
29134156d998SAlex Elder  * the complete and validated header.  Caller can pass the address
29144156d998SAlex Elder  * of a variable that will be filled in with the version of the
29154156d998SAlex Elder  * header object at the time it was read.
29164156d998SAlex Elder  *
29174156d998SAlex Elder  * Returns a pointer-coded errno if a failure occurs.
29184156d998SAlex Elder  */
29194156d998SAlex Elder static struct rbd_image_header_ondisk *
29204156d998SAlex Elder rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
29214156d998SAlex Elder {
29224156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk = NULL;
29234156d998SAlex Elder 	u32 snap_count = 0;
29244156d998SAlex Elder 	u64 names_size = 0;
29254156d998SAlex Elder 	u32 want_count;
29264156d998SAlex Elder 	int ret;
29274156d998SAlex Elder 
29284156d998SAlex Elder 	/*
29294156d998SAlex Elder 	 * The complete header will include an array of its 64-bit
29304156d998SAlex Elder 	 * snapshot ids, followed by the names of those snapshots as
29314156d998SAlex Elder 	 * a contiguous block of NUL-terminated strings.  Note that
29324156d998SAlex Elder 	 * the number of snapshots could change by the time we read
29334156d998SAlex Elder 	 * it in, in which case we re-read it.
29344156d998SAlex Elder 	 */
29354156d998SAlex Elder 	do {
29364156d998SAlex Elder 		size_t size;
29374156d998SAlex Elder 
29384156d998SAlex Elder 		kfree(ondisk);
29394156d998SAlex Elder 
29404156d998SAlex Elder 		size = sizeof (*ondisk);
29414156d998SAlex Elder 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
29424156d998SAlex Elder 		size += names_size;
29434156d998SAlex Elder 		ondisk = kmalloc(size, GFP_KERNEL);
29444156d998SAlex Elder 		if (!ondisk)
29454156d998SAlex Elder 			return ERR_PTR(-ENOMEM);
29464156d998SAlex Elder 
2947788e2df3SAlex Elder 		ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
294880ef15bfSAlex Elder 				       0, size, ondisk, version);
29494156d998SAlex Elder 		if (ret < 0)
29504156d998SAlex Elder 			goto out_err;
2951c0cd10dbSAlex Elder 		if ((size_t)ret < size) {
29524156d998SAlex Elder 			ret = -ENXIO;
295306ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
295406ecc6cbSAlex Elder 				size, ret);
29554156d998SAlex Elder 			goto out_err;
29564156d998SAlex Elder 		}
29574156d998SAlex Elder 		if (!rbd_dev_ondisk_valid(ondisk)) {
29584156d998SAlex Elder 			ret = -ENXIO;
295906ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "invalid header");
29604156d998SAlex Elder 			goto out_err;
29614156d998SAlex Elder 		}
29624156d998SAlex Elder 
29634156d998SAlex Elder 		names_size = le64_to_cpu(ondisk->snap_names_len);
29644156d998SAlex Elder 		want_count = snap_count;
29654156d998SAlex Elder 		snap_count = le32_to_cpu(ondisk->snap_count);
29664156d998SAlex Elder 	} while (snap_count != want_count);
29674156d998SAlex Elder 
29684156d998SAlex Elder 	return ondisk;
29694156d998SAlex Elder 
29704156d998SAlex Elder out_err:
29714156d998SAlex Elder 	kfree(ondisk);
29724156d998SAlex Elder 
29734156d998SAlex Elder 	return ERR_PTR(ret);
29744156d998SAlex Elder }
29754156d998SAlex Elder 
29764156d998SAlex Elder /*
2977602adf40SYehuda Sadeh  * reload the ondisk the header
2978602adf40SYehuda Sadeh  */
2979602adf40SYehuda Sadeh static int rbd_read_header(struct rbd_device *rbd_dev,
2980602adf40SYehuda Sadeh 			   struct rbd_image_header *header)
2981602adf40SYehuda Sadeh {
29824156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk;
29834156d998SAlex Elder 	u64 ver = 0;
29844156d998SAlex Elder 	int ret;
2985602adf40SYehuda Sadeh 
29864156d998SAlex Elder 	ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
29874156d998SAlex Elder 	if (IS_ERR(ondisk))
29884156d998SAlex Elder 		return PTR_ERR(ondisk);
29894156d998SAlex Elder 	ret = rbd_header_from_disk(header, ondisk);
29904156d998SAlex Elder 	if (ret >= 0)
299159c2be1eSYehuda Sadeh 		header->obj_version = ver;
29924156d998SAlex Elder 	kfree(ondisk);
2993602adf40SYehuda Sadeh 
29944156d998SAlex Elder 	return ret;
2995602adf40SYehuda Sadeh }
2996602adf40SYehuda Sadeh 
299741f38c2bSAlex Elder static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
2998dfc5606dSYehuda Sadeh {
2999dfc5606dSYehuda Sadeh 	struct rbd_snap *snap;
3000a0593290SAlex Elder 	struct rbd_snap *next;
3001dfc5606dSYehuda Sadeh 
30026087b51bSAlex Elder 	list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node) {
30036087b51bSAlex Elder 		list_del(&snap->node);
30046087b51bSAlex Elder 		rbd_snap_destroy(snap);
30056087b51bSAlex Elder 	}
3006dfc5606dSYehuda Sadeh }
3007dfc5606dSYehuda Sadeh 
30089478554aSAlex Elder static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
30099478554aSAlex Elder {
30100d7dbfceSAlex Elder 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
30119478554aSAlex Elder 		return;
30129478554aSAlex Elder 
3013e28626a0SAlex Elder 	if (rbd_dev->mapping.size != rbd_dev->header.image_size) {
3014e28626a0SAlex Elder 		sector_t size;
3015e28626a0SAlex Elder 
3016e28626a0SAlex Elder 		rbd_dev->mapping.size = rbd_dev->header.image_size;
3017e28626a0SAlex Elder 		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
30189478554aSAlex Elder 		dout("setting size to %llu sectors", (unsigned long long)size);
30199478554aSAlex Elder 		set_capacity(rbd_dev->disk, size);
30209478554aSAlex Elder 	}
3021e28626a0SAlex Elder }
30229478554aSAlex Elder 
3023602adf40SYehuda Sadeh /*
3024602adf40SYehuda Sadeh  * only read the first part of the ondisk header, without the snaps info
3025602adf40SYehuda Sadeh  */
3026117973fbSAlex Elder static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
3027602adf40SYehuda Sadeh {
3028602adf40SYehuda Sadeh 	int ret;
3029602adf40SYehuda Sadeh 	struct rbd_image_header h;
3030602adf40SYehuda Sadeh 
3031602adf40SYehuda Sadeh 	ret = rbd_read_header(rbd_dev, &h);
3032602adf40SYehuda Sadeh 	if (ret < 0)
3033602adf40SYehuda Sadeh 		return ret;
3034602adf40SYehuda Sadeh 
3035a51aa0c0SJosh Durgin 	down_write(&rbd_dev->header_rwsem);
3036a51aa0c0SJosh Durgin 
30379478554aSAlex Elder 	/* Update image size, and check for resize of mapped image */
30389478554aSAlex Elder 	rbd_dev->header.image_size = h.image_size;
30399478554aSAlex Elder 	rbd_update_mapping_size(rbd_dev);
30409db4b3e3SSage Weil 
3041849b4260SAlex Elder 	/* rbd_dev->header.object_prefix shouldn't change */
3042602adf40SYehuda Sadeh 	kfree(rbd_dev->header.snap_sizes);
3043849b4260SAlex Elder 	kfree(rbd_dev->header.snap_names);
3044d1d25646SJosh Durgin 	/* osd requests may still refer to snapc */
3045812164f8SAlex Elder 	ceph_put_snap_context(rbd_dev->header.snapc);
3046602adf40SYehuda Sadeh 
3047b813623aSAlex Elder 	if (hver)
3048b813623aSAlex Elder 		*hver = h.obj_version;
3049a71b891bSJosh Durgin 	rbd_dev->header.obj_version = h.obj_version;
305093a24e08SJosh Durgin 	rbd_dev->header.image_size = h.image_size;
3051602adf40SYehuda Sadeh 	rbd_dev->header.snapc = h.snapc;
3052602adf40SYehuda Sadeh 	rbd_dev->header.snap_names = h.snap_names;
3053602adf40SYehuda Sadeh 	rbd_dev->header.snap_sizes = h.snap_sizes;
3054849b4260SAlex Elder 	/* Free the extra copy of the object prefix */
3055c0cd10dbSAlex Elder 	if (strcmp(rbd_dev->header.object_prefix, h.object_prefix))
3056c0cd10dbSAlex Elder 		rbd_warn(rbd_dev, "object prefix changed (ignoring)");
3057849b4260SAlex Elder 	kfree(h.object_prefix);
3058849b4260SAlex Elder 
3059304f6808SAlex Elder 	ret = rbd_dev_snaps_update(rbd_dev);
3060dfc5606dSYehuda Sadeh 
3061c666601aSJosh Durgin 	up_write(&rbd_dev->header_rwsem);
3062602adf40SYehuda Sadeh 
3063dfc5606dSYehuda Sadeh 	return ret;
3064602adf40SYehuda Sadeh }
3065602adf40SYehuda Sadeh 
3066117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
30671fe5e993SAlex Elder {
3068a3fbe5d4SAlex Elder 	u64 image_size;
30691fe5e993SAlex Elder 	int ret;
30701fe5e993SAlex Elder 
3071117973fbSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
3072a3fbe5d4SAlex Elder 	image_size = rbd_dev->header.image_size;
30731fe5e993SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3074117973fbSAlex Elder 	if (rbd_dev->image_format == 1)
3075117973fbSAlex Elder 		ret = rbd_dev_v1_refresh(rbd_dev, hver);
3076117973fbSAlex Elder 	else
3077117973fbSAlex Elder 		ret = rbd_dev_v2_refresh(rbd_dev, hver);
30781fe5e993SAlex Elder 	mutex_unlock(&ctl_mutex);
3079522a0cc0SAlex Elder 	if (ret)
3080522a0cc0SAlex Elder 		rbd_warn(rbd_dev, "got notification but failed to "
3081522a0cc0SAlex Elder 			   " update snaps: %d\n", ret);
3082a3fbe5d4SAlex Elder 	if (image_size != rbd_dev->header.image_size)
3083a3fbe5d4SAlex Elder 		revalidate_disk(rbd_dev->disk);
30841fe5e993SAlex Elder 
30851fe5e993SAlex Elder 	return ret;
30861fe5e993SAlex Elder }
30871fe5e993SAlex Elder 
3088602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
3089602adf40SYehuda Sadeh {
3090602adf40SYehuda Sadeh 	struct gendisk *disk;
3091602adf40SYehuda Sadeh 	struct request_queue *q;
3092593a9e7bSAlex Elder 	u64 segment_size;
3093602adf40SYehuda Sadeh 
3094602adf40SYehuda Sadeh 	/* create gendisk info */
3095602adf40SYehuda Sadeh 	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
3096602adf40SYehuda Sadeh 	if (!disk)
30971fcdb8aaSAlex Elder 		return -ENOMEM;
3098602adf40SYehuda Sadeh 
3099f0f8cef5SAlex Elder 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
3100de71a297SAlex Elder 		 rbd_dev->dev_id);
3101602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
3102602adf40SYehuda Sadeh 	disk->first_minor = 0;
3103602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
3104602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
3105602adf40SYehuda Sadeh 
3106bf0d5f50SAlex Elder 	q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
3107602adf40SYehuda Sadeh 	if (!q)
3108602adf40SYehuda Sadeh 		goto out_disk;
3109029bcbd8SJosh Durgin 
3110593a9e7bSAlex Elder 	/* We use the default size, but let's be explicit about it. */
3111593a9e7bSAlex Elder 	blk_queue_physical_block_size(q, SECTOR_SIZE);
3112593a9e7bSAlex Elder 
3113029bcbd8SJosh Durgin 	/* set io sizes to object size */
3114593a9e7bSAlex Elder 	segment_size = rbd_obj_bytes(&rbd_dev->header);
3115593a9e7bSAlex Elder 	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
3116593a9e7bSAlex Elder 	blk_queue_max_segment_size(q, segment_size);
3117593a9e7bSAlex Elder 	blk_queue_io_min(q, segment_size);
3118593a9e7bSAlex Elder 	blk_queue_io_opt(q, segment_size);
3119029bcbd8SJosh Durgin 
3120602adf40SYehuda Sadeh 	blk_queue_merge_bvec(q, rbd_merge_bvec);
3121602adf40SYehuda Sadeh 	disk->queue = q;
3122602adf40SYehuda Sadeh 
3123602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
3124602adf40SYehuda Sadeh 
3125602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
3126602adf40SYehuda Sadeh 
3127602adf40SYehuda Sadeh 	return 0;
3128602adf40SYehuda Sadeh out_disk:
3129602adf40SYehuda Sadeh 	put_disk(disk);
31301fcdb8aaSAlex Elder 
31311fcdb8aaSAlex Elder 	return -ENOMEM;
3132602adf40SYehuda Sadeh }
3133602adf40SYehuda Sadeh 
3134dfc5606dSYehuda Sadeh /*
3135dfc5606dSYehuda Sadeh   sysfs
3136dfc5606dSYehuda Sadeh */
3137602adf40SYehuda Sadeh 
3138593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
3139593a9e7bSAlex Elder {
3140593a9e7bSAlex Elder 	return container_of(dev, struct rbd_device, dev);
3141593a9e7bSAlex Elder }
3142593a9e7bSAlex Elder 
3143dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
3144dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3145602adf40SYehuda Sadeh {
3146593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3147dfc5606dSYehuda Sadeh 
3148fc71d833SAlex Elder 	return sprintf(buf, "%llu\n",
3149fc71d833SAlex Elder 		(unsigned long long)rbd_dev->mapping.size);
3150602adf40SYehuda Sadeh }
3151602adf40SYehuda Sadeh 
315234b13184SAlex Elder /*
315334b13184SAlex Elder  * Note this shows the features for whatever's mapped, which is not
315434b13184SAlex Elder  * necessarily the base image.
315534b13184SAlex Elder  */
315634b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev,
315734b13184SAlex Elder 			     struct device_attribute *attr, char *buf)
315834b13184SAlex Elder {
315934b13184SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
316034b13184SAlex Elder 
316134b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
316234b13184SAlex Elder 			(unsigned long long)rbd_dev->mapping.features);
316334b13184SAlex Elder }
316434b13184SAlex Elder 
3165dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
3166dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
3167602adf40SYehuda Sadeh {
3168593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3169dfc5606dSYehuda Sadeh 
3170fc71d833SAlex Elder 	if (rbd_dev->major)
3171dfc5606dSYehuda Sadeh 		return sprintf(buf, "%d\n", rbd_dev->major);
3172fc71d833SAlex Elder 
3173fc71d833SAlex Elder 	return sprintf(buf, "(none)\n");
3174fc71d833SAlex Elder 
3175dfc5606dSYehuda Sadeh }
3176dfc5606dSYehuda Sadeh 
3177dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
3178dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
3179dfc5606dSYehuda Sadeh {
3180593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3181dfc5606dSYehuda Sadeh 
31821dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
31831dbb4399SAlex Elder 			ceph_client_id(rbd_dev->rbd_client->client));
3184dfc5606dSYehuda Sadeh }
3185dfc5606dSYehuda Sadeh 
3186dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
3187dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3188dfc5606dSYehuda Sadeh {
3189593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3190dfc5606dSYehuda Sadeh 
31910d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
3192dfc5606dSYehuda Sadeh }
3193dfc5606dSYehuda Sadeh 
31949bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
31959bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
31969bb2f334SAlex Elder {
31979bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
31989bb2f334SAlex Elder 
31990d7dbfceSAlex Elder 	return sprintf(buf, "%llu\n",
32000d7dbfceSAlex Elder 			(unsigned long long) rbd_dev->spec->pool_id);
32019bb2f334SAlex Elder }
32029bb2f334SAlex Elder 
3203dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
3204dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3205dfc5606dSYehuda Sadeh {
3206593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3207dfc5606dSYehuda Sadeh 
3208a92ffdf8SAlex Elder 	if (rbd_dev->spec->image_name)
32090d7dbfceSAlex Elder 		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
3210a92ffdf8SAlex Elder 
3211a92ffdf8SAlex Elder 	return sprintf(buf, "(unknown)\n");
3212dfc5606dSYehuda Sadeh }
3213dfc5606dSYehuda Sadeh 
3214589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev,
3215589d30e0SAlex Elder 			     struct device_attribute *attr, char *buf)
3216589d30e0SAlex Elder {
3217589d30e0SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3218589d30e0SAlex Elder 
32190d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
3220589d30e0SAlex Elder }
3221589d30e0SAlex Elder 
322234b13184SAlex Elder /*
322334b13184SAlex Elder  * Shows the name of the currently-mapped snapshot (or
322434b13184SAlex Elder  * RBD_SNAP_HEAD_NAME for the base image).
322534b13184SAlex Elder  */
3226dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
3227dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
3228dfc5606dSYehuda Sadeh 			     char *buf)
3229dfc5606dSYehuda Sadeh {
3230593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3231dfc5606dSYehuda Sadeh 
32320d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
3233dfc5606dSYehuda Sadeh }
3234dfc5606dSYehuda Sadeh 
323586b00e0dSAlex Elder /*
323686b00e0dSAlex Elder  * For an rbd v2 image, shows the pool id, image id, and snapshot id
323786b00e0dSAlex Elder  * for the parent image.  If there is no parent, simply shows
323886b00e0dSAlex Elder  * "(no parent image)".
323986b00e0dSAlex Elder  */
324086b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev,
324186b00e0dSAlex Elder 			     struct device_attribute *attr,
324286b00e0dSAlex Elder 			     char *buf)
324386b00e0dSAlex Elder {
324486b00e0dSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
324586b00e0dSAlex Elder 	struct rbd_spec *spec = rbd_dev->parent_spec;
324686b00e0dSAlex Elder 	int count;
324786b00e0dSAlex Elder 	char *bufp = buf;
324886b00e0dSAlex Elder 
324986b00e0dSAlex Elder 	if (!spec)
325086b00e0dSAlex Elder 		return sprintf(buf, "(no parent image)\n");
325186b00e0dSAlex Elder 
325286b00e0dSAlex Elder 	count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
325386b00e0dSAlex Elder 			(unsigned long long) spec->pool_id, spec->pool_name);
325486b00e0dSAlex Elder 	if (count < 0)
325586b00e0dSAlex Elder 		return count;
325686b00e0dSAlex Elder 	bufp += count;
325786b00e0dSAlex Elder 
325886b00e0dSAlex Elder 	count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
325986b00e0dSAlex Elder 			spec->image_name ? spec->image_name : "(unknown)");
326086b00e0dSAlex Elder 	if (count < 0)
326186b00e0dSAlex Elder 		return count;
326286b00e0dSAlex Elder 	bufp += count;
326386b00e0dSAlex Elder 
326486b00e0dSAlex Elder 	count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
326586b00e0dSAlex Elder 			(unsigned long long) spec->snap_id, spec->snap_name);
326686b00e0dSAlex Elder 	if (count < 0)
326786b00e0dSAlex Elder 		return count;
326886b00e0dSAlex Elder 	bufp += count;
326986b00e0dSAlex Elder 
327086b00e0dSAlex Elder 	count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
327186b00e0dSAlex Elder 	if (count < 0)
327286b00e0dSAlex Elder 		return count;
327386b00e0dSAlex Elder 	bufp += count;
327486b00e0dSAlex Elder 
327586b00e0dSAlex Elder 	return (ssize_t) (bufp - buf);
327686b00e0dSAlex Elder }
327786b00e0dSAlex Elder 
3278dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
3279dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
3280dfc5606dSYehuda Sadeh 				 const char *buf,
3281dfc5606dSYehuda Sadeh 				 size_t size)
3282dfc5606dSYehuda Sadeh {
3283593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3284b813623aSAlex Elder 	int ret;
3285602adf40SYehuda Sadeh 
3286117973fbSAlex Elder 	ret = rbd_dev_refresh(rbd_dev, NULL);
3287b813623aSAlex Elder 
3288b813623aSAlex Elder 	return ret < 0 ? ret : size;
3289dfc5606dSYehuda Sadeh }
3290602adf40SYehuda Sadeh 
/* Read-only attributes (S_IRUGO), each backed by a *_show routine */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);

/* Write-only attribute (S_IWUSR): a write asks for a refresh */
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
3302dfc5606dSYehuda Sadeh 
3303dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = {
3304dfc5606dSYehuda Sadeh 	&dev_attr_size.attr,
330534b13184SAlex Elder 	&dev_attr_features.attr,
3306dfc5606dSYehuda Sadeh 	&dev_attr_major.attr,
3307dfc5606dSYehuda Sadeh 	&dev_attr_client_id.attr,
3308dfc5606dSYehuda Sadeh 	&dev_attr_pool.attr,
33099bb2f334SAlex Elder 	&dev_attr_pool_id.attr,
3310dfc5606dSYehuda Sadeh 	&dev_attr_name.attr,
3311589d30e0SAlex Elder 	&dev_attr_image_id.attr,
3312dfc5606dSYehuda Sadeh 	&dev_attr_current_snap.attr,
331386b00e0dSAlex Elder 	&dev_attr_parent.attr,
3314dfc5606dSYehuda Sadeh 	&dev_attr_refresh.attr,
3315dfc5606dSYehuda Sadeh 	NULL
3316dfc5606dSYehuda Sadeh };
3317dfc5606dSYehuda Sadeh 
3318dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = {
3319dfc5606dSYehuda Sadeh 	.attrs = rbd_attrs,
3320dfc5606dSYehuda Sadeh };
3321dfc5606dSYehuda Sadeh 
3322dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = {
3323dfc5606dSYehuda Sadeh 	&rbd_attr_group,
3324dfc5606dSYehuda Sadeh 	NULL
3325dfc5606dSYehuda Sadeh };
3326dfc5606dSYehuda Sadeh 
/* Release callback for rbd_device_type; intentionally does nothing. */
static void rbd_sysfs_dev_release(struct device *dev)
{
	/* nothing to do */
}
3330dfc5606dSYehuda Sadeh 
3331dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = {
3332dfc5606dSYehuda Sadeh 	.name		= "rbd",
3333dfc5606dSYehuda Sadeh 	.groups		= rbd_attr_groups,
3334dfc5606dSYehuda Sadeh 	.release	= rbd_sysfs_dev_release,
3335dfc5606dSYehuda Sadeh };
3336dfc5606dSYehuda Sadeh 
33378b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
33388b8fb99cSAlex Elder {
33398b8fb99cSAlex Elder 	kref_get(&spec->kref);
33408b8fb99cSAlex Elder 
33418b8fb99cSAlex Elder 	return spec;
33428b8fb99cSAlex Elder }
33438b8fb99cSAlex Elder 
33448b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref);
33458b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec)
33468b8fb99cSAlex Elder {
33478b8fb99cSAlex Elder 	if (spec)
33488b8fb99cSAlex Elder 		kref_put(&spec->kref, rbd_spec_free);
33498b8fb99cSAlex Elder }
33508b8fb99cSAlex Elder 
33518b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void)
33528b8fb99cSAlex Elder {
33538b8fb99cSAlex Elder 	struct rbd_spec *spec;
33548b8fb99cSAlex Elder 
33558b8fb99cSAlex Elder 	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
33568b8fb99cSAlex Elder 	if (!spec)
33578b8fb99cSAlex Elder 		return NULL;
33588b8fb99cSAlex Elder 	kref_init(&spec->kref);
33598b8fb99cSAlex Elder 
33608b8fb99cSAlex Elder 	return spec;
33618b8fb99cSAlex Elder }
33628b8fb99cSAlex Elder 
33638b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref)
33648b8fb99cSAlex Elder {
33658b8fb99cSAlex Elder 	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
33668b8fb99cSAlex Elder 
33678b8fb99cSAlex Elder 	kfree(spec->pool_name);
33688b8fb99cSAlex Elder 	kfree(spec->image_id);
33698b8fb99cSAlex Elder 	kfree(spec->image_name);
33708b8fb99cSAlex Elder 	kfree(spec->snap_name);
33718b8fb99cSAlex Elder 	kfree(spec);
33728b8fb99cSAlex Elder }
33738b8fb99cSAlex Elder 
3374cc344fa1SAlex Elder static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
3375c53d5893SAlex Elder 				struct rbd_spec *spec)
3376c53d5893SAlex Elder {
3377c53d5893SAlex Elder 	struct rbd_device *rbd_dev;
3378c53d5893SAlex Elder 
3379c53d5893SAlex Elder 	rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
3380c53d5893SAlex Elder 	if (!rbd_dev)
3381c53d5893SAlex Elder 		return NULL;
3382c53d5893SAlex Elder 
3383c53d5893SAlex Elder 	spin_lock_init(&rbd_dev->lock);
33846d292906SAlex Elder 	rbd_dev->flags = 0;
3385c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->node);
3386c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->snaps);
3387c53d5893SAlex Elder 	init_rwsem(&rbd_dev->header_rwsem);
3388c53d5893SAlex Elder 
3389c53d5893SAlex Elder 	rbd_dev->spec = spec;
3390c53d5893SAlex Elder 	rbd_dev->rbd_client = rbdc;
3391c53d5893SAlex Elder 
33920903e875SAlex Elder 	/* Initialize the layout used for all rbd requests */
33930903e875SAlex Elder 
33940903e875SAlex Elder 	rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
33950903e875SAlex Elder 	rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
33960903e875SAlex Elder 	rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
33970903e875SAlex Elder 	rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
33980903e875SAlex Elder 
3399c53d5893SAlex Elder 	return rbd_dev;
3400c53d5893SAlex Elder }
3401c53d5893SAlex Elder 
3402c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev)
3403c53d5893SAlex Elder {
3404c53d5893SAlex Elder 	rbd_put_client(rbd_dev->rbd_client);
3405c53d5893SAlex Elder 	rbd_spec_put(rbd_dev->spec);
3406c53d5893SAlex Elder 	kfree(rbd_dev);
3407c53d5893SAlex Elder }
3408c53d5893SAlex Elder 
34096087b51bSAlex Elder static void rbd_snap_destroy(struct rbd_snap *snap)
3410dfc5606dSYehuda Sadeh {
34113e83b65bSAlex Elder 	kfree(snap->name);
34123e83b65bSAlex Elder 	kfree(snap);
3413dfc5606dSYehuda Sadeh }
3414dfc5606dSYehuda Sadeh 
34156087b51bSAlex Elder static struct rbd_snap *rbd_snap_create(struct rbd_device *rbd_dev,
3416c8d18425SAlex Elder 						const char *snap_name,
341734b13184SAlex Elder 						u64 snap_id, u64 snap_size,
341834b13184SAlex Elder 						u64 snap_features)
3419dfc5606dSYehuda Sadeh {
34204e891e0aSAlex Elder 	struct rbd_snap *snap;
34214e891e0aSAlex Elder 
34224e891e0aSAlex Elder 	snap = kzalloc(sizeof (*snap), GFP_KERNEL);
3423dfc5606dSYehuda Sadeh 	if (!snap)
34244e891e0aSAlex Elder 		return ERR_PTR(-ENOMEM);
34254e891e0aSAlex Elder 
34266e584f52SAlex Elder 	snap->name = snap_name;
3427c8d18425SAlex Elder 	snap->id = snap_id;
3428c8d18425SAlex Elder 	snap->size = snap_size;
342934b13184SAlex Elder 	snap->features = snap_features;
34304e891e0aSAlex Elder 
34314e891e0aSAlex Elder 	return snap;
3432dfc5606dSYehuda Sadeh }
3433dfc5606dSYehuda Sadeh 
34346e584f52SAlex Elder /*
34356e584f52SAlex Elder  * Returns a dynamically-allocated snapshot name if successful, or a
34366e584f52SAlex Elder  * pointer-coded error otherwise.
34376e584f52SAlex Elder  */
3438cd892126SAlex Elder static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
3439cd892126SAlex Elder 		u64 *snap_size, u64 *snap_features)
3440cd892126SAlex Elder {
3441cd892126SAlex Elder 	char *snap_name;
34426e584f52SAlex Elder 	int i;
3443cd892126SAlex Elder 
3444cd892126SAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
3445cd892126SAlex Elder 
3446cd892126SAlex Elder 	/* Skip over names until we find the one we are looking for */
3447cd892126SAlex Elder 
3448cd892126SAlex Elder 	snap_name = rbd_dev->header.snap_names;
34496e584f52SAlex Elder 	for (i = 0; i < which; i++)
3450cd892126SAlex Elder 		snap_name += strlen(snap_name) + 1;
3451cd892126SAlex Elder 
34526e584f52SAlex Elder 	snap_name = kstrdup(snap_name, GFP_KERNEL);
34536e584f52SAlex Elder 	if (!snap_name)
34546e584f52SAlex Elder 		return ERR_PTR(-ENOMEM);
34556e584f52SAlex Elder 
34566e584f52SAlex Elder 	*snap_size = rbd_dev->header.snap_sizes[which];
34576e584f52SAlex Elder 	*snap_features = 0;	/* No features for v1 */
34586e584f52SAlex Elder 
3459cd892126SAlex Elder 	return snap_name;
3460cd892126SAlex Elder }
3461cd892126SAlex Elder 
3462dfc5606dSYehuda Sadeh /*
34639d475de5SAlex Elder  * Get the size and object order for an image snapshot, or if
34649d475de5SAlex Elder  * snap_id is CEPH_NOSNAP, gets this information for the base
34659d475de5SAlex Elder  * image.
34669d475de5SAlex Elder  */
34679d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
34689d475de5SAlex Elder 				u8 *order, u64 *snap_size)
34699d475de5SAlex Elder {
34709d475de5SAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
34719d475de5SAlex Elder 	int ret;
34729d475de5SAlex Elder 	struct {
34739d475de5SAlex Elder 		u8 order;
34749d475de5SAlex Elder 		__le64 size;
34759d475de5SAlex Elder 	} __attribute__ ((packed)) size_buf = { 0 };
34769d475de5SAlex Elder 
347736be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
34789d475de5SAlex Elder 				"rbd", "get_size",
34794157976bSAlex Elder 				&snapid, sizeof (snapid),
34804157976bSAlex Elder 				&size_buf, sizeof (size_buf), NULL);
348136be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
34829d475de5SAlex Elder 	if (ret < 0)
34839d475de5SAlex Elder 		return ret;
348457385b51SAlex Elder 	if (ret < sizeof (size_buf))
348557385b51SAlex Elder 		return -ERANGE;
34869d475de5SAlex Elder 
3487c86f86e9SAlex Elder 	if (order)
34889d475de5SAlex Elder 		*order = size_buf.order;
34899d475de5SAlex Elder 	*snap_size = le64_to_cpu(size_buf.size);
34909d475de5SAlex Elder 
34919d475de5SAlex Elder 	dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
34929d475de5SAlex Elder 		(unsigned long long)snap_id, (unsigned int)*order,
34939d475de5SAlex Elder 		(unsigned long long)*snap_size);
34949d475de5SAlex Elder 
34959d475de5SAlex Elder 	return 0;
34969d475de5SAlex Elder }
34979d475de5SAlex Elder 
34989d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
34999d475de5SAlex Elder {
35009d475de5SAlex Elder 	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
35019d475de5SAlex Elder 					&rbd_dev->header.obj_order,
35029d475de5SAlex Elder 					&rbd_dev->header.image_size);
35039d475de5SAlex Elder }
35049d475de5SAlex Elder 
35051e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
35061e130199SAlex Elder {
35071e130199SAlex Elder 	void *reply_buf;
35081e130199SAlex Elder 	int ret;
35091e130199SAlex Elder 	void *p;
35101e130199SAlex Elder 
35111e130199SAlex Elder 	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
35121e130199SAlex Elder 	if (!reply_buf)
35131e130199SAlex Elder 		return -ENOMEM;
35141e130199SAlex Elder 
351536be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
35164157976bSAlex Elder 				"rbd", "get_object_prefix", NULL, 0,
351707b2391fSAlex Elder 				reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
351836be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
35191e130199SAlex Elder 	if (ret < 0)
35201e130199SAlex Elder 		goto out;
35211e130199SAlex Elder 
35221e130199SAlex Elder 	p = reply_buf;
35231e130199SAlex Elder 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
352457385b51SAlex Elder 						p + ret, NULL, GFP_NOIO);
352557385b51SAlex Elder 	ret = 0;
35261e130199SAlex Elder 
35271e130199SAlex Elder 	if (IS_ERR(rbd_dev->header.object_prefix)) {
35281e130199SAlex Elder 		ret = PTR_ERR(rbd_dev->header.object_prefix);
35291e130199SAlex Elder 		rbd_dev->header.object_prefix = NULL;
35301e130199SAlex Elder 	} else {
35311e130199SAlex Elder 		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
35321e130199SAlex Elder 	}
35331e130199SAlex Elder out:
35341e130199SAlex Elder 	kfree(reply_buf);
35351e130199SAlex Elder 
35361e130199SAlex Elder 	return ret;
35371e130199SAlex Elder }
35381e130199SAlex Elder 
3539b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
3540b1b5402aSAlex Elder 		u64 *snap_features)
3541b1b5402aSAlex Elder {
3542b1b5402aSAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
3543b1b5402aSAlex Elder 	struct {
3544b1b5402aSAlex Elder 		__le64 features;
3545b1b5402aSAlex Elder 		__le64 incompat;
35464157976bSAlex Elder 	} __attribute__ ((packed)) features_buf = { 0 };
3547d889140cSAlex Elder 	u64 incompat;
3548b1b5402aSAlex Elder 	int ret;
3549b1b5402aSAlex Elder 
355036be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3551b1b5402aSAlex Elder 				"rbd", "get_features",
35524157976bSAlex Elder 				&snapid, sizeof (snapid),
35534157976bSAlex Elder 				&features_buf, sizeof (features_buf), NULL);
355436be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3555b1b5402aSAlex Elder 	if (ret < 0)
3556b1b5402aSAlex Elder 		return ret;
355757385b51SAlex Elder 	if (ret < sizeof (features_buf))
355857385b51SAlex Elder 		return -ERANGE;
3559d889140cSAlex Elder 
3560d889140cSAlex Elder 	incompat = le64_to_cpu(features_buf.incompat);
35615cbf6f12SAlex Elder 	if (incompat & ~RBD_FEATURES_SUPPORTED)
3562b8f5c6edSAlex Elder 		return -ENXIO;
3563d889140cSAlex Elder 
3564b1b5402aSAlex Elder 	*snap_features = le64_to_cpu(features_buf.features);
3565b1b5402aSAlex Elder 
3566b1b5402aSAlex Elder 	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
3567b1b5402aSAlex Elder 		(unsigned long long)snap_id,
3568b1b5402aSAlex Elder 		(unsigned long long)*snap_features,
3569b1b5402aSAlex Elder 		(unsigned long long)le64_to_cpu(features_buf.incompat));
3570b1b5402aSAlex Elder 
3571b1b5402aSAlex Elder 	return 0;
3572b1b5402aSAlex Elder }
3573b1b5402aSAlex Elder 
3574b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
3575b1b5402aSAlex Elder {
3576b1b5402aSAlex Elder 	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
3577b1b5402aSAlex Elder 						&rbd_dev->header.features);
3578b1b5402aSAlex Elder }
3579b1b5402aSAlex Elder 
358086b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
358186b00e0dSAlex Elder {
358286b00e0dSAlex Elder 	struct rbd_spec *parent_spec;
358386b00e0dSAlex Elder 	size_t size;
358486b00e0dSAlex Elder 	void *reply_buf = NULL;
358586b00e0dSAlex Elder 	__le64 snapid;
358686b00e0dSAlex Elder 	void *p;
358786b00e0dSAlex Elder 	void *end;
358886b00e0dSAlex Elder 	char *image_id;
358986b00e0dSAlex Elder 	u64 overlap;
359086b00e0dSAlex Elder 	int ret;
359186b00e0dSAlex Elder 
359286b00e0dSAlex Elder 	parent_spec = rbd_spec_alloc();
359386b00e0dSAlex Elder 	if (!parent_spec)
359486b00e0dSAlex Elder 		return -ENOMEM;
359586b00e0dSAlex Elder 
359686b00e0dSAlex Elder 	size = sizeof (__le64) +				/* pool_id */
359786b00e0dSAlex Elder 		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
359886b00e0dSAlex Elder 		sizeof (__le64) +				/* snap_id */
359986b00e0dSAlex Elder 		sizeof (__le64);				/* overlap */
360086b00e0dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
360186b00e0dSAlex Elder 	if (!reply_buf) {
360286b00e0dSAlex Elder 		ret = -ENOMEM;
360386b00e0dSAlex Elder 		goto out_err;
360486b00e0dSAlex Elder 	}
360586b00e0dSAlex Elder 
360686b00e0dSAlex Elder 	snapid = cpu_to_le64(CEPH_NOSNAP);
360736be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
360886b00e0dSAlex Elder 				"rbd", "get_parent",
36094157976bSAlex Elder 				&snapid, sizeof (snapid),
36104157976bSAlex Elder 				reply_buf, size, NULL);
361136be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
361286b00e0dSAlex Elder 	if (ret < 0)
361386b00e0dSAlex Elder 		goto out_err;
361486b00e0dSAlex Elder 
361586b00e0dSAlex Elder 	p = reply_buf;
361657385b51SAlex Elder 	end = reply_buf + ret;
361757385b51SAlex Elder 	ret = -ERANGE;
361886b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
361986b00e0dSAlex Elder 	if (parent_spec->pool_id == CEPH_NOPOOL)
362086b00e0dSAlex Elder 		goto out;	/* No parent?  No problem. */
362186b00e0dSAlex Elder 
36220903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
36230903e875SAlex Elder 
36240903e875SAlex Elder 	ret = -EIO;
3625c0cd10dbSAlex Elder 	if (parent_spec->pool_id > (u64)U32_MAX) {
3626c0cd10dbSAlex Elder 		rbd_warn(NULL, "parent pool id too large (%llu > %u)\n",
3627c0cd10dbSAlex Elder 			(unsigned long long)parent_spec->pool_id, U32_MAX);
362857385b51SAlex Elder 		goto out_err;
3629c0cd10dbSAlex Elder 	}
36300903e875SAlex Elder 
3631979ed480SAlex Elder 	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
363286b00e0dSAlex Elder 	if (IS_ERR(image_id)) {
363386b00e0dSAlex Elder 		ret = PTR_ERR(image_id);
363486b00e0dSAlex Elder 		goto out_err;
363586b00e0dSAlex Elder 	}
363686b00e0dSAlex Elder 	parent_spec->image_id = image_id;
363786b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
363886b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, overlap, out_err);
363986b00e0dSAlex Elder 
364086b00e0dSAlex Elder 	rbd_dev->parent_overlap = overlap;
364186b00e0dSAlex Elder 	rbd_dev->parent_spec = parent_spec;
364286b00e0dSAlex Elder 	parent_spec = NULL;	/* rbd_dev now owns this */
364386b00e0dSAlex Elder out:
364486b00e0dSAlex Elder 	ret = 0;
364586b00e0dSAlex Elder out_err:
364686b00e0dSAlex Elder 	kfree(reply_buf);
364786b00e0dSAlex Elder 	rbd_spec_put(parent_spec);
364886b00e0dSAlex Elder 
364986b00e0dSAlex Elder 	return ret;
365086b00e0dSAlex Elder }
365186b00e0dSAlex Elder 
3652cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
3653cc070d59SAlex Elder {
3654cc070d59SAlex Elder 	struct {
3655cc070d59SAlex Elder 		__le64 stripe_unit;
3656cc070d59SAlex Elder 		__le64 stripe_count;
3657cc070d59SAlex Elder 	} __attribute__ ((packed)) striping_info_buf = { 0 };
3658cc070d59SAlex Elder 	size_t size = sizeof (striping_info_buf);
3659cc070d59SAlex Elder 	void *p;
3660cc070d59SAlex Elder 	u64 obj_size;
3661cc070d59SAlex Elder 	u64 stripe_unit;
3662cc070d59SAlex Elder 	u64 stripe_count;
3663cc070d59SAlex Elder 	int ret;
3664cc070d59SAlex Elder 
3665cc070d59SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3666cc070d59SAlex Elder 				"rbd", "get_stripe_unit_count", NULL, 0,
3667cc070d59SAlex Elder 				(char *)&striping_info_buf, size, NULL);
3668cc070d59SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3669cc070d59SAlex Elder 	if (ret < 0)
3670cc070d59SAlex Elder 		return ret;
3671cc070d59SAlex Elder 	if (ret < size)
3672cc070d59SAlex Elder 		return -ERANGE;
3673cc070d59SAlex Elder 
3674cc070d59SAlex Elder 	/*
3675cc070d59SAlex Elder 	 * We don't actually support the "fancy striping" feature
3676cc070d59SAlex Elder 	 * (STRIPINGV2) yet, but if the striping sizes are the
3677cc070d59SAlex Elder 	 * defaults the behavior is the same as before.  So find
3678cc070d59SAlex Elder 	 * out, and only fail if the image has non-default values.
3679cc070d59SAlex Elder 	 */
3680cc070d59SAlex Elder 	ret = -EINVAL;
3681cc070d59SAlex Elder 	obj_size = (u64)1 << rbd_dev->header.obj_order;
3682cc070d59SAlex Elder 	p = &striping_info_buf;
3683cc070d59SAlex Elder 	stripe_unit = ceph_decode_64(&p);
3684cc070d59SAlex Elder 	if (stripe_unit != obj_size) {
3685cc070d59SAlex Elder 		rbd_warn(rbd_dev, "unsupported stripe unit "
3686cc070d59SAlex Elder 				"(got %llu want %llu)",
3687cc070d59SAlex Elder 				stripe_unit, obj_size);
3688cc070d59SAlex Elder 		return -EINVAL;
3689cc070d59SAlex Elder 	}
3690cc070d59SAlex Elder 	stripe_count = ceph_decode_64(&p);
3691cc070d59SAlex Elder 	if (stripe_count != 1) {
3692cc070d59SAlex Elder 		rbd_warn(rbd_dev, "unsupported stripe count "
3693cc070d59SAlex Elder 				"(got %llu want 1)", stripe_count);
3694cc070d59SAlex Elder 		return -EINVAL;
3695cc070d59SAlex Elder 	}
3696500d0c0fSAlex Elder 	rbd_dev->header.stripe_unit = stripe_unit;
3697500d0c0fSAlex Elder 	rbd_dev->header.stripe_count = stripe_count;
3698cc070d59SAlex Elder 
3699cc070d59SAlex Elder 	return 0;
3700cc070d59SAlex Elder }
3701cc070d59SAlex Elder 
37029e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
37039e15b77dSAlex Elder {
37049e15b77dSAlex Elder 	size_t image_id_size;
37059e15b77dSAlex Elder 	char *image_id;
37069e15b77dSAlex Elder 	void *p;
37079e15b77dSAlex Elder 	void *end;
37089e15b77dSAlex Elder 	size_t size;
37099e15b77dSAlex Elder 	void *reply_buf = NULL;
37109e15b77dSAlex Elder 	size_t len = 0;
37119e15b77dSAlex Elder 	char *image_name = NULL;
37129e15b77dSAlex Elder 	int ret;
37139e15b77dSAlex Elder 
37149e15b77dSAlex Elder 	rbd_assert(!rbd_dev->spec->image_name);
37159e15b77dSAlex Elder 
371669e7a02fSAlex Elder 	len = strlen(rbd_dev->spec->image_id);
371769e7a02fSAlex Elder 	image_id_size = sizeof (__le32) + len;
37189e15b77dSAlex Elder 	image_id = kmalloc(image_id_size, GFP_KERNEL);
37199e15b77dSAlex Elder 	if (!image_id)
37209e15b77dSAlex Elder 		return NULL;
37219e15b77dSAlex Elder 
37229e15b77dSAlex Elder 	p = image_id;
37234157976bSAlex Elder 	end = image_id + image_id_size;
372469e7a02fSAlex Elder 	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
37259e15b77dSAlex Elder 
37269e15b77dSAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
37279e15b77dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
37289e15b77dSAlex Elder 	if (!reply_buf)
37299e15b77dSAlex Elder 		goto out;
37309e15b77dSAlex Elder 
373136be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
37329e15b77dSAlex Elder 				"rbd", "dir_get_name",
37339e15b77dSAlex Elder 				image_id, image_id_size,
37344157976bSAlex Elder 				reply_buf, size, NULL);
37359e15b77dSAlex Elder 	if (ret < 0)
37369e15b77dSAlex Elder 		goto out;
37379e15b77dSAlex Elder 	p = reply_buf;
3738f40eb349SAlex Elder 	end = reply_buf + ret;
3739f40eb349SAlex Elder 
37409e15b77dSAlex Elder 	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
37419e15b77dSAlex Elder 	if (IS_ERR(image_name))
37429e15b77dSAlex Elder 		image_name = NULL;
37439e15b77dSAlex Elder 	else
37449e15b77dSAlex Elder 		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
37459e15b77dSAlex Elder out:
37469e15b77dSAlex Elder 	kfree(reply_buf);
37479e15b77dSAlex Elder 	kfree(image_id);
37489e15b77dSAlex Elder 
37499e15b77dSAlex Elder 	return image_name;
37509e15b77dSAlex Elder }
37519e15b77dSAlex Elder 
37529e15b77dSAlex Elder /*
37532e9f7f1cSAlex Elder  * When an rbd image has a parent image, it is identified by the
37542e9f7f1cSAlex Elder  * pool, image, and snapshot ids (not names).  This function fills
37552e9f7f1cSAlex Elder  * in the names for those ids.  (It's OK if we can't figure out the
37562e9f7f1cSAlex Elder  * name for an image id, but the pool and snapshot ids should always
37572e9f7f1cSAlex Elder  * exist and have names.)  All names in an rbd spec are dynamically
37582e9f7f1cSAlex Elder  * allocated.
3759e1d4213fSAlex Elder  *
3760e1d4213fSAlex Elder  * When an image being mapped (not a parent) is probed, we have the
3761e1d4213fSAlex Elder  * pool name and pool id, image name and image id, and the snapshot
3762e1d4213fSAlex Elder  * name.  The only thing we're missing is the snapshot id.
37632e9f7f1cSAlex Elder  *
37642e9f7f1cSAlex Elder  * The set of snapshots for an image is not known until they have
37652e9f7f1cSAlex Elder  * been read by rbd_dev_snaps_update(), so we can't completely fill
37662e9f7f1cSAlex Elder  * in this information until after that has been called.
37679e15b77dSAlex Elder  */
37682e9f7f1cSAlex Elder static int rbd_dev_spec_update(struct rbd_device *rbd_dev)
37699e15b77dSAlex Elder {
37702e9f7f1cSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
37712e9f7f1cSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
37722e9f7f1cSAlex Elder 	const char *pool_name;
37732e9f7f1cSAlex Elder 	const char *image_name;
37742e9f7f1cSAlex Elder 	const char *snap_name;
37759e15b77dSAlex Elder 	int ret;
37769e15b77dSAlex Elder 
3777e1d4213fSAlex Elder 	/*
3778e1d4213fSAlex Elder 	 * An image being mapped will have the pool name (etc.), but
3779e1d4213fSAlex Elder 	 * we need to look up the snapshot id.
3780e1d4213fSAlex Elder 	 */
37812e9f7f1cSAlex Elder 	if (spec->pool_name) {
37822e9f7f1cSAlex Elder 		if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
3783e1d4213fSAlex Elder 			struct rbd_snap *snap;
3784e1d4213fSAlex Elder 
37852e9f7f1cSAlex Elder 			snap = snap_by_name(rbd_dev, spec->snap_name);
3786e1d4213fSAlex Elder 			if (!snap)
3787e1d4213fSAlex Elder 				return -ENOENT;
37882e9f7f1cSAlex Elder 			spec->snap_id = snap->id;
3789e1d4213fSAlex Elder 		} else {
37902e9f7f1cSAlex Elder 			spec->snap_id = CEPH_NOSNAP;
3791e1d4213fSAlex Elder 		}
3792e1d4213fSAlex Elder 
3793e1d4213fSAlex Elder 		return 0;
3794e1d4213fSAlex Elder 	}
37959e15b77dSAlex Elder 
37962e9f7f1cSAlex Elder 	/* Get the pool name; we have to make our own copy of this */
37979e15b77dSAlex Elder 
37982e9f7f1cSAlex Elder 	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
37992e9f7f1cSAlex Elder 	if (!pool_name) {
38002e9f7f1cSAlex Elder 		rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
3801935dc89fSAlex Elder 		return -EIO;
3802935dc89fSAlex Elder 	}
38032e9f7f1cSAlex Elder 	pool_name = kstrdup(pool_name, GFP_KERNEL);
38042e9f7f1cSAlex Elder 	if (!pool_name)
38059e15b77dSAlex Elder 		return -ENOMEM;
38069e15b77dSAlex Elder 
38079e15b77dSAlex Elder 	/* Fetch the image name; tolerate failure here */
38089e15b77dSAlex Elder 
38092e9f7f1cSAlex Elder 	image_name = rbd_dev_image_name(rbd_dev);
38102e9f7f1cSAlex Elder 	if (!image_name)
381106ecc6cbSAlex Elder 		rbd_warn(rbd_dev, "unable to get image name");
38129e15b77dSAlex Elder 
38132e9f7f1cSAlex Elder 	/* Look up the snapshot name, and make a copy */
38149e15b77dSAlex Elder 
38152e9f7f1cSAlex Elder 	snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
38162e9f7f1cSAlex Elder 	if (!snap_name) {
38172e9f7f1cSAlex Elder 		rbd_warn(rbd_dev, "no snapshot with id %llu", spec->snap_id);
38189e15b77dSAlex Elder 		ret = -EIO;
38199e15b77dSAlex Elder 		goto out_err;
38209e15b77dSAlex Elder 	}
38212e9f7f1cSAlex Elder 	snap_name = kstrdup(snap_name, GFP_KERNEL);
38222e9f7f1cSAlex Elder 	if (!snap_name) {
38232e9f7f1cSAlex Elder 		ret = -ENOMEM;
38249e15b77dSAlex Elder 		goto out_err;
38252e9f7f1cSAlex Elder 	}
38262e9f7f1cSAlex Elder 
38272e9f7f1cSAlex Elder 	spec->pool_name = pool_name;
38282e9f7f1cSAlex Elder 	spec->image_name = image_name;
38292e9f7f1cSAlex Elder 	spec->snap_name = snap_name;
38309e15b77dSAlex Elder 
38319e15b77dSAlex Elder 	return 0;
38329e15b77dSAlex Elder out_err:
38332e9f7f1cSAlex Elder 	kfree(image_name);
38342e9f7f1cSAlex Elder 	kfree(pool_name);
38359e15b77dSAlex Elder 
38369e15b77dSAlex Elder 	return ret;
38379e15b77dSAlex Elder }
38389e15b77dSAlex Elder 
38396e14b1a6SAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
384035d489f9SAlex Elder {
384135d489f9SAlex Elder 	size_t size;
384235d489f9SAlex Elder 	int ret;
384335d489f9SAlex Elder 	void *reply_buf;
384435d489f9SAlex Elder 	void *p;
384535d489f9SAlex Elder 	void *end;
384635d489f9SAlex Elder 	u64 seq;
384735d489f9SAlex Elder 	u32 snap_count;
384835d489f9SAlex Elder 	struct ceph_snap_context *snapc;
384935d489f9SAlex Elder 	u32 i;
385035d489f9SAlex Elder 
385135d489f9SAlex Elder 	/*
385235d489f9SAlex Elder 	 * We'll need room for the seq value (maximum snapshot id),
385335d489f9SAlex Elder 	 * snapshot count, and array of that many snapshot ids.
385435d489f9SAlex Elder 	 * For now we have a fixed upper limit on the number we're
385535d489f9SAlex Elder 	 * prepared to receive.
385635d489f9SAlex Elder 	 */
385735d489f9SAlex Elder 	size = sizeof (__le64) + sizeof (__le32) +
385835d489f9SAlex Elder 			RBD_MAX_SNAP_COUNT * sizeof (__le64);
385935d489f9SAlex Elder 	reply_buf = kzalloc(size, GFP_KERNEL);
386035d489f9SAlex Elder 	if (!reply_buf)
386135d489f9SAlex Elder 		return -ENOMEM;
386235d489f9SAlex Elder 
386336be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
38644157976bSAlex Elder 				"rbd", "get_snapcontext", NULL, 0,
386507b2391fSAlex Elder 				reply_buf, size, ver);
386636be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
386735d489f9SAlex Elder 	if (ret < 0)
386835d489f9SAlex Elder 		goto out;
386935d489f9SAlex Elder 
387035d489f9SAlex Elder 	p = reply_buf;
387157385b51SAlex Elder 	end = reply_buf + ret;
387257385b51SAlex Elder 	ret = -ERANGE;
387335d489f9SAlex Elder 	ceph_decode_64_safe(&p, end, seq, out);
387435d489f9SAlex Elder 	ceph_decode_32_safe(&p, end, snap_count, out);
387535d489f9SAlex Elder 
387635d489f9SAlex Elder 	/*
387735d489f9SAlex Elder 	 * Make sure the reported number of snapshot ids wouldn't go
387835d489f9SAlex Elder 	 * beyond the end of our buffer.  But before checking that,
387935d489f9SAlex Elder 	 * make sure the computed size of the snapshot context we
388035d489f9SAlex Elder 	 * allocate is representable in a size_t.
388135d489f9SAlex Elder 	 */
388235d489f9SAlex Elder 	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
388335d489f9SAlex Elder 				 / sizeof (u64)) {
388435d489f9SAlex Elder 		ret = -EINVAL;
388535d489f9SAlex Elder 		goto out;
388635d489f9SAlex Elder 	}
388735d489f9SAlex Elder 	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
388835d489f9SAlex Elder 		goto out;
3889468521c1SAlex Elder 	ret = 0;
389035d489f9SAlex Elder 
3891812164f8SAlex Elder 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
389235d489f9SAlex Elder 	if (!snapc) {
389335d489f9SAlex Elder 		ret = -ENOMEM;
389435d489f9SAlex Elder 		goto out;
389535d489f9SAlex Elder 	}
389635d489f9SAlex Elder 	snapc->seq = seq;
389735d489f9SAlex Elder 	for (i = 0; i < snap_count; i++)
389835d489f9SAlex Elder 		snapc->snaps[i] = ceph_decode_64(&p);
389935d489f9SAlex Elder 
390035d489f9SAlex Elder 	rbd_dev->header.snapc = snapc;
390135d489f9SAlex Elder 
390235d489f9SAlex Elder 	dout("  snap context seq = %llu, snap_count = %u\n",
390335d489f9SAlex Elder 		(unsigned long long)seq, (unsigned int)snap_count);
390435d489f9SAlex Elder out:
390535d489f9SAlex Elder 	kfree(reply_buf);
390635d489f9SAlex Elder 
390757385b51SAlex Elder 	return ret;
390835d489f9SAlex Elder }
390935d489f9SAlex Elder 
3910b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
3911b8b1e2dbSAlex Elder {
3912b8b1e2dbSAlex Elder 	size_t size;
3913b8b1e2dbSAlex Elder 	void *reply_buf;
3914b8b1e2dbSAlex Elder 	__le64 snap_id;
3915b8b1e2dbSAlex Elder 	int ret;
3916b8b1e2dbSAlex Elder 	void *p;
3917b8b1e2dbSAlex Elder 	void *end;
3918b8b1e2dbSAlex Elder 	char *snap_name;
3919b8b1e2dbSAlex Elder 
3920b8b1e2dbSAlex Elder 	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
3921b8b1e2dbSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
3922b8b1e2dbSAlex Elder 	if (!reply_buf)
3923b8b1e2dbSAlex Elder 		return ERR_PTR(-ENOMEM);
3924b8b1e2dbSAlex Elder 
3925acb1b6caSAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
3926b8b1e2dbSAlex Elder 	snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
392736be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3928b8b1e2dbSAlex Elder 				"rbd", "get_snapshot_name",
39294157976bSAlex Elder 				&snap_id, sizeof (snap_id),
393007b2391fSAlex Elder 				reply_buf, size, NULL);
393136be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3932f40eb349SAlex Elder 	if (ret < 0) {
3933f40eb349SAlex Elder 		snap_name = ERR_PTR(ret);
3934b8b1e2dbSAlex Elder 		goto out;
3935f40eb349SAlex Elder 	}
3936b8b1e2dbSAlex Elder 
3937b8b1e2dbSAlex Elder 	p = reply_buf;
3938f40eb349SAlex Elder 	end = reply_buf + ret;
3939e5c35534SAlex Elder 	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
3940f40eb349SAlex Elder 	if (IS_ERR(snap_name))
3941b8b1e2dbSAlex Elder 		goto out;
3942f40eb349SAlex Elder 
3943b8b1e2dbSAlex Elder 	dout("  snap_id 0x%016llx snap_name = %s\n",
3944b8b1e2dbSAlex Elder 		(unsigned long long)le64_to_cpu(snap_id), snap_name);
3945b8b1e2dbSAlex Elder out:
3946b8b1e2dbSAlex Elder 	kfree(reply_buf);
3947b8b1e2dbSAlex Elder 
3948f40eb349SAlex Elder 	return snap_name;
3949b8b1e2dbSAlex Elder }
3950b8b1e2dbSAlex Elder 
3951b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
3952b8b1e2dbSAlex Elder 		u64 *snap_size, u64 *snap_features)
3953b8b1e2dbSAlex Elder {
3954e0b49868SAlex Elder 	u64 snap_id;
3955acb1b6caSAlex Elder 	u64 size;
3956acb1b6caSAlex Elder 	u64 features;
3957acb1b6caSAlex Elder 	char *snap_name;
3958b8b1e2dbSAlex Elder 	int ret;
3959b8b1e2dbSAlex Elder 
3960acb1b6caSAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
3961b8b1e2dbSAlex Elder 	snap_id = rbd_dev->header.snapc->snaps[which];
3962acb1b6caSAlex Elder 	ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
3963b8b1e2dbSAlex Elder 	if (ret)
3964acb1b6caSAlex Elder 		goto out_err;
3965b8b1e2dbSAlex Elder 
3966acb1b6caSAlex Elder 	ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
3967acb1b6caSAlex Elder 	if (ret)
3968acb1b6caSAlex Elder 		goto out_err;
3969acb1b6caSAlex Elder 
3970acb1b6caSAlex Elder 	snap_name = rbd_dev_v2_snap_name(rbd_dev, which);
3971acb1b6caSAlex Elder 	if (!IS_ERR(snap_name)) {
3972acb1b6caSAlex Elder 		*snap_size = size;
3973acb1b6caSAlex Elder 		*snap_features = features;
3974acb1b6caSAlex Elder 	}
3975acb1b6caSAlex Elder 
3976acb1b6caSAlex Elder 	return snap_name;
3977acb1b6caSAlex Elder out_err:
3978acb1b6caSAlex Elder 	return ERR_PTR(ret);
3979b8b1e2dbSAlex Elder }
3980b8b1e2dbSAlex Elder 
3981b8b1e2dbSAlex Elder static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
3982b8b1e2dbSAlex Elder 		u64 *snap_size, u64 *snap_features)
3983b8b1e2dbSAlex Elder {
3984b8b1e2dbSAlex Elder 	if (rbd_dev->image_format == 1)
3985b8b1e2dbSAlex Elder 		return rbd_dev_v1_snap_info(rbd_dev, which,
3986b8b1e2dbSAlex Elder 					snap_size, snap_features);
3987b8b1e2dbSAlex Elder 	if (rbd_dev->image_format == 2)
3988b8b1e2dbSAlex Elder 		return rbd_dev_v2_snap_info(rbd_dev, which,
3989b8b1e2dbSAlex Elder 					snap_size, snap_features);
3990b8b1e2dbSAlex Elder 	return ERR_PTR(-EINVAL);
3991b8b1e2dbSAlex Elder }
3992b8b1e2dbSAlex Elder 
3993117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
3994117973fbSAlex Elder {
3995117973fbSAlex Elder 	int ret;
3996117973fbSAlex Elder 
3997117973fbSAlex Elder 	down_write(&rbd_dev->header_rwsem);
3998117973fbSAlex Elder 
3999117973fbSAlex Elder 	ret = rbd_dev_v2_image_size(rbd_dev);
4000117973fbSAlex Elder 	if (ret)
4001117973fbSAlex Elder 		goto out;
4002117973fbSAlex Elder 	rbd_update_mapping_size(rbd_dev);
4003117973fbSAlex Elder 
4004117973fbSAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev, hver);
4005117973fbSAlex Elder 	dout("rbd_dev_v2_snap_context returned %d\n", ret);
4006117973fbSAlex Elder 	if (ret)
4007117973fbSAlex Elder 		goto out;
4008117973fbSAlex Elder 	ret = rbd_dev_snaps_update(rbd_dev);
4009117973fbSAlex Elder 	dout("rbd_dev_snaps_update returned %d\n", ret);
4010117973fbSAlex Elder 	if (ret)
4011117973fbSAlex Elder 		goto out;
4012117973fbSAlex Elder out:
4013117973fbSAlex Elder 	up_write(&rbd_dev->header_rwsem);
4014117973fbSAlex Elder 
4015117973fbSAlex Elder 	return ret;
4016117973fbSAlex Elder }
4017117973fbSAlex Elder 
40189d475de5SAlex Elder /*
401935938150SAlex Elder  * Scan the rbd device's current snapshot list and compare it to the
402035938150SAlex Elder  * newly-received snapshot context.  Remove any existing snapshots
402135938150SAlex Elder  * not present in the new snapshot context.  Add a new snapshot for
402235938150SAlex Elder  * any snaphots in the snapshot context not in the current list.
402335938150SAlex Elder  * And verify there are no changes to snapshots we already know
402435938150SAlex Elder  * about.
402535938150SAlex Elder  *
402635938150SAlex Elder  * Assumes the snapshots in the snapshot context are sorted by
402735938150SAlex Elder  * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
402835938150SAlex Elder  * are also maintained in that order.)
4029522a0cc0SAlex Elder  *
4030522a0cc0SAlex Elder  * Note that any error occurs while updating the snapshot list
4031522a0cc0SAlex Elder  * aborts the update, and the entire list is cleared.  The snapshot
4032522a0cc0SAlex Elder  * list becomes inconsistent at that point anyway, so it might as
4033522a0cc0SAlex Elder  * well be empty.
4034dfc5606dSYehuda Sadeh  */
4035304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
4036dfc5606dSYehuda Sadeh {
403735938150SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
403835938150SAlex Elder 	const u32 snap_count = snapc->num_snaps;
403935938150SAlex Elder 	struct list_head *head = &rbd_dev->snaps;
404035938150SAlex Elder 	struct list_head *links = head->next;
404135938150SAlex Elder 	u32 index = 0;
4042522a0cc0SAlex Elder 	int ret = 0;
4043dfc5606dSYehuda Sadeh 
40449fcbb800SAlex Elder 	dout("%s: snap count is %u\n", __func__, (unsigned int)snap_count);
404535938150SAlex Elder 	while (index < snap_count || links != head) {
404635938150SAlex Elder 		u64 snap_id;
404735938150SAlex Elder 		struct rbd_snap *snap;
4048cd892126SAlex Elder 		char *snap_name;
4049cd892126SAlex Elder 		u64 snap_size = 0;
4050cd892126SAlex Elder 		u64 snap_features = 0;
4051dfc5606dSYehuda Sadeh 
405235938150SAlex Elder 		snap_id = index < snap_count ? snapc->snaps[index]
405335938150SAlex Elder 					     : CEPH_NOSNAP;
405435938150SAlex Elder 		snap = links != head ? list_entry(links, struct rbd_snap, node)
405535938150SAlex Elder 				     : NULL;
4056aafb230eSAlex Elder 		rbd_assert(!snap || snap->id != CEPH_NOSNAP);
4057dfc5606dSYehuda Sadeh 
405835938150SAlex Elder 		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
405935938150SAlex Elder 			struct list_head *next = links->next;
4060dfc5606dSYehuda Sadeh 
40616d292906SAlex Elder 			/*
40626d292906SAlex Elder 			 * A previously-existing snapshot is not in
40636d292906SAlex Elder 			 * the new snap context.
40646d292906SAlex Elder 			 *
4065522a0cc0SAlex Elder 			 * If the now-missing snapshot is the one
4066522a0cc0SAlex Elder 			 * the image represents, clear its existence
4067522a0cc0SAlex Elder 			 * flag so we can avoid sending any more
4068522a0cc0SAlex Elder 			 * requests to it.
40696d292906SAlex Elder 			 */
40700d7dbfceSAlex Elder 			if (rbd_dev->spec->snap_id == snap->id)
40716d292906SAlex Elder 				clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
40723e83b65bSAlex Elder 			dout("removing %ssnap id %llu\n",
40730d7dbfceSAlex Elder 				rbd_dev->spec->snap_id == snap->id ?
40740d7dbfceSAlex Elder 							"mapped " : "",
40759fcbb800SAlex Elder 				(unsigned long long)snap->id);
40766087b51bSAlex Elder 
40776087b51bSAlex Elder 			list_del(&snap->node);
40786087b51bSAlex Elder 			rbd_snap_destroy(snap);
4079dfc5606dSYehuda Sadeh 
408035938150SAlex Elder 			/* Done with this list entry; advance */
408135938150SAlex Elder 
408235938150SAlex Elder 			links = next;
408335938150SAlex Elder 			continue;
4084dfc5606dSYehuda Sadeh 		}
408535938150SAlex Elder 
4086b8b1e2dbSAlex Elder 		snap_name = rbd_dev_snap_info(rbd_dev, index,
4087cd892126SAlex Elder 					&snap_size, &snap_features);
4088522a0cc0SAlex Elder 		if (IS_ERR(snap_name)) {
4089522a0cc0SAlex Elder 			ret = PTR_ERR(snap_name);
4090522a0cc0SAlex Elder 			dout("failed to get snap info, error %d\n", ret);
4091522a0cc0SAlex Elder 			goto out_err;
4092522a0cc0SAlex Elder 		}
4093cd892126SAlex Elder 
40949fcbb800SAlex Elder 		dout("entry %u: snap_id = %llu\n", (unsigned int)snap_count,
40959fcbb800SAlex Elder 			(unsigned long long)snap_id);
409635938150SAlex Elder 		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
409735938150SAlex Elder 			struct rbd_snap *new_snap;
409835938150SAlex Elder 
409935938150SAlex Elder 			/* We haven't seen this snapshot before */
410035938150SAlex Elder 
41016087b51bSAlex Elder 			new_snap = rbd_snap_create(rbd_dev, snap_name,
4102cd892126SAlex Elder 					snap_id, snap_size, snap_features);
41039fcbb800SAlex Elder 			if (IS_ERR(new_snap)) {
4104522a0cc0SAlex Elder 				ret = PTR_ERR(new_snap);
4105522a0cc0SAlex Elder 				dout("  failed to add dev, error %d\n", ret);
4106522a0cc0SAlex Elder 				goto out_err;
41079fcbb800SAlex Elder 			}
410835938150SAlex Elder 
410935938150SAlex Elder 			/* New goes before existing, or at end of list */
411035938150SAlex Elder 
41119fcbb800SAlex Elder 			dout("  added dev%s\n", snap ? "" : " at end\n");
411235938150SAlex Elder 			if (snap)
411335938150SAlex Elder 				list_add_tail(&new_snap->node, &snap->node);
411435938150SAlex Elder 			else
4115523f3258SAlex Elder 				list_add_tail(&new_snap->node, head);
411635938150SAlex Elder 		} else {
411735938150SAlex Elder 			/* Already have this one */
411835938150SAlex Elder 
41199fcbb800SAlex Elder 			dout("  already present\n");
41209fcbb800SAlex Elder 
4121cd892126SAlex Elder 			rbd_assert(snap->size == snap_size);
4122aafb230eSAlex Elder 			rbd_assert(!strcmp(snap->name, snap_name));
4123cd892126SAlex Elder 			rbd_assert(snap->features == snap_features);
412435938150SAlex Elder 
412535938150SAlex Elder 			/* Done with this list entry; advance */
412635938150SAlex Elder 
412735938150SAlex Elder 			links = links->next;
4128dfc5606dSYehuda Sadeh 		}
412935938150SAlex Elder 
413035938150SAlex Elder 		/* Advance to the next entry in the snapshot context */
413135938150SAlex Elder 
413235938150SAlex Elder 		index++;
4133dfc5606dSYehuda Sadeh 	}
41349fcbb800SAlex Elder 	dout("%s: done\n", __func__);
4135dfc5606dSYehuda Sadeh 
4136dfc5606dSYehuda Sadeh 	return 0;
4137522a0cc0SAlex Elder out_err:
4138522a0cc0SAlex Elder 	rbd_remove_all_snaps(rbd_dev);
4139522a0cc0SAlex Elder 
4140522a0cc0SAlex Elder 	return ret;
4141dfc5606dSYehuda Sadeh }
4142dfc5606dSYehuda Sadeh 
4143dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
4144dfc5606dSYehuda Sadeh {
4145dfc5606dSYehuda Sadeh 	struct device *dev;
4146cd789ab9SAlex Elder 	int ret;
4147dfc5606dSYehuda Sadeh 
4148dfc5606dSYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
4149dfc5606dSYehuda Sadeh 
4150cd789ab9SAlex Elder 	dev = &rbd_dev->dev;
4151dfc5606dSYehuda Sadeh 	dev->bus = &rbd_bus_type;
4152dfc5606dSYehuda Sadeh 	dev->type = &rbd_device_type;
4153dfc5606dSYehuda Sadeh 	dev->parent = &rbd_root_dev;
4154200a6a8bSAlex Elder 	dev->release = rbd_dev_device_release;
4155de71a297SAlex Elder 	dev_set_name(dev, "%d", rbd_dev->dev_id);
4156dfc5606dSYehuda Sadeh 	ret = device_register(dev);
4157dfc5606dSYehuda Sadeh 
4158dfc5606dSYehuda Sadeh 	mutex_unlock(&ctl_mutex);
4159cd789ab9SAlex Elder 
4160dfc5606dSYehuda Sadeh 	return ret;
4161602adf40SYehuda Sadeh }
4162602adf40SYehuda Sadeh 
4163dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
4164dfc5606dSYehuda Sadeh {
4165dfc5606dSYehuda Sadeh 	device_unregister(&rbd_dev->dev);
4166dfc5606dSYehuda Sadeh }
4167dfc5606dSYehuda Sadeh 
4168e2839308SAlex Elder static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
41691ddbe94eSAlex Elder 
41701ddbe94eSAlex Elder /*
4171499afd5bSAlex Elder  * Get a unique rbd identifier for the given new rbd_dev, and add
4172499afd5bSAlex Elder  * the rbd_dev to the global list.  The minimum rbd id is 1.
41731ddbe94eSAlex Elder  */
4174e2839308SAlex Elder static void rbd_dev_id_get(struct rbd_device *rbd_dev)
4175b7f23c36SAlex Elder {
4176e2839308SAlex Elder 	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
4177499afd5bSAlex Elder 
4178499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
4179499afd5bSAlex Elder 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
4180499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
4181e2839308SAlex Elder 	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
4182e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
4183b7f23c36SAlex Elder }
4184b7f23c36SAlex Elder 
41851ddbe94eSAlex Elder /*
4186499afd5bSAlex Elder  * Remove an rbd_dev from the global list, and record that its
4187499afd5bSAlex Elder  * identifier is no longer in use.
41881ddbe94eSAlex Elder  */
4189e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev)
41901ddbe94eSAlex Elder {
4191d184f6bfSAlex Elder 	struct list_head *tmp;
4192de71a297SAlex Elder 	int rbd_id = rbd_dev->dev_id;
4193d184f6bfSAlex Elder 	int max_id;
4194d184f6bfSAlex Elder 
4195aafb230eSAlex Elder 	rbd_assert(rbd_id > 0);
4196499afd5bSAlex Elder 
4197e2839308SAlex Elder 	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
4198e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
4199499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
4200499afd5bSAlex Elder 	list_del_init(&rbd_dev->node);
4201d184f6bfSAlex Elder 
4202d184f6bfSAlex Elder 	/*
4203d184f6bfSAlex Elder 	 * If the id being "put" is not the current maximum, there
4204d184f6bfSAlex Elder 	 * is nothing special we need to do.
4205d184f6bfSAlex Elder 	 */
4206e2839308SAlex Elder 	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
4207d184f6bfSAlex Elder 		spin_unlock(&rbd_dev_list_lock);
4208d184f6bfSAlex Elder 		return;
4209d184f6bfSAlex Elder 	}
4210d184f6bfSAlex Elder 
4211d184f6bfSAlex Elder 	/*
4212d184f6bfSAlex Elder 	 * We need to update the current maximum id.  Search the
4213d184f6bfSAlex Elder 	 * list to find out what it is.  We're more likely to find
4214d184f6bfSAlex Elder 	 * the maximum at the end, so search the list backward.
4215d184f6bfSAlex Elder 	 */
4216d184f6bfSAlex Elder 	max_id = 0;
4217d184f6bfSAlex Elder 	list_for_each_prev(tmp, &rbd_dev_list) {
4218d184f6bfSAlex Elder 		struct rbd_device *rbd_dev;
4219d184f6bfSAlex Elder 
4220d184f6bfSAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
4221b213e0b1SAlex Elder 		if (rbd_dev->dev_id > max_id)
4222b213e0b1SAlex Elder 			max_id = rbd_dev->dev_id;
4223d184f6bfSAlex Elder 	}
4224499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
42251ddbe94eSAlex Elder 
42261ddbe94eSAlex Elder 	/*
4227e2839308SAlex Elder 	 * The max id could have been updated by rbd_dev_id_get(), in
4228d184f6bfSAlex Elder 	 * which case it now accurately reflects the new maximum.
4229d184f6bfSAlex Elder 	 * Be careful not to overwrite the maximum value in that
4230d184f6bfSAlex Elder 	 * case.
42311ddbe94eSAlex Elder 	 */
4232e2839308SAlex Elder 	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
4233e2839308SAlex Elder 	dout("  max dev id has been reset\n");
4234b7f23c36SAlex Elder }
4235b7f23c36SAlex Elder 
/*
 * Advances *buf past any leading white space, so that it points at
 * the first non-space character (or at the terminating '\0' if
 * there is none).  Returns the length of the token, i.e. the run of
 * non-white space characters, that starts there.  The string at
 * *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the "C"
	 * and "POSIX" locales.
	 */
	static const char spaces[] = " \f\n\r\t\v";
	const char *start = *buf;

	start += strspn(start, spaces);		/* Skip to start of token */
	*buf = start;

	return strcspn(start, spaces);		/* Length of the token */
}
4254e28fff26SAlex Elder 
/*
 * Locates the next token in *buf and, provided it fits in the
 * supplied token buffer together with a terminating '\0', copies
 * it there.  The string at *buf must be terminated with '\0' on
 * entry.
 *
 * Returns the length of the token found, not counting the '\0'.
 * A return of 0 means there was no token; a return >= token_size
 * means the token did not fit and nothing was copied.
 *
 * In every case *buf is advanced to just past the end of the token,
 * even when the token was too big to copy.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	const size_t len = next_token(buf);
	const char *start = *buf;

	*buf = start + len;
	if (len >= token_size)
		return len;		/* Too big; leave token alone */

	memcpy(token, start, len);
	token[len] = '\0';

	return len;
}
4284e28fff26SAlex Elder 
4285e28fff26SAlex Elder /*
4286ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
4287ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
4288ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
4289ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
4290ea3352f4SAlex Elder  *
4291ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
4292ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
4293ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
4294ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
4295ea3352f4SAlex Elder  *
4296ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
4297ea3352f4SAlex Elder  * the end of the found token.
4298ea3352f4SAlex Elder  *
4299ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
4300ea3352f4SAlex Elder  */
4301ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
4302ea3352f4SAlex Elder {
4303ea3352f4SAlex Elder 	char *dup;
4304ea3352f4SAlex Elder 	size_t len;
4305ea3352f4SAlex Elder 
4306ea3352f4SAlex Elder 	len = next_token(buf);
43074caf35f9SAlex Elder 	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
4308ea3352f4SAlex Elder 	if (!dup)
4309ea3352f4SAlex Elder 		return NULL;
4310ea3352f4SAlex Elder 	*(dup + len) = '\0';
4311ea3352f4SAlex Elder 	*buf += len;
4312ea3352f4SAlex Elder 
4313ea3352f4SAlex Elder 	if (lenp)
4314ea3352f4SAlex Elder 		*lenp = len;
4315ea3352f4SAlex Elder 
4316ea3352f4SAlex Elder 	return dup;
4317ea3352f4SAlex Elder }
4318ea3352f4SAlex Elder 
4319ea3352f4SAlex Elder /*
4320859c31dfSAlex Elder  * Parse the options provided for an "rbd add" (i.e., rbd image
4321859c31dfSAlex Elder  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
4322859c31dfSAlex Elder  * and the data written is passed here via a NUL-terminated buffer.
4323859c31dfSAlex Elder  * Returns 0 if successful or an error code otherwise.
4324d22f76e7SAlex Elder  *
4325859c31dfSAlex Elder  * The information extracted from these options is recorded in
4326859c31dfSAlex Elder  * the other parameters which return dynamically-allocated
4327859c31dfSAlex Elder  * structures:
4328859c31dfSAlex Elder  *  ceph_opts
4329859c31dfSAlex Elder  *      The address of a pointer that will refer to a ceph options
4330859c31dfSAlex Elder  *      structure.  Caller must release the returned pointer using
4331859c31dfSAlex Elder  *      ceph_destroy_options() when it is no longer needed.
4332859c31dfSAlex Elder  *  rbd_opts
4333859c31dfSAlex Elder  *	Address of an rbd options pointer.  Fully initialized by
4334859c31dfSAlex Elder  *	this function; caller must release with kfree().
4335859c31dfSAlex Elder  *  spec
4336859c31dfSAlex Elder  *	Address of an rbd image specification pointer.  Fully
4337859c31dfSAlex Elder  *	initialized by this function based on parsed options.
4338859c31dfSAlex Elder  *	Caller must release with rbd_spec_put().
4339859c31dfSAlex Elder  *
4340859c31dfSAlex Elder  * The options passed take this form:
4341859c31dfSAlex Elder  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
4342859c31dfSAlex Elder  * where:
4343859c31dfSAlex Elder  *  <mon_addrs>
4344859c31dfSAlex Elder  *      A comma-separated list of one or more monitor addresses.
4345859c31dfSAlex Elder  *      A monitor address is an ip address, optionally followed
4346859c31dfSAlex Elder  *      by a port number (separated by a colon).
4347859c31dfSAlex Elder  *        I.e.:  ip1[:port1][,ip2[:port2]...]
4348859c31dfSAlex Elder  *  <options>
4349859c31dfSAlex Elder  *      A comma-separated list of ceph and/or rbd options.
4350859c31dfSAlex Elder  *  <pool_name>
4351859c31dfSAlex Elder  *      The name of the rados pool containing the rbd image.
4352859c31dfSAlex Elder  *  <image_name>
4353859c31dfSAlex Elder  *      The name of the image in that pool to map.
4354859c31dfSAlex Elder  *  <snap_id>
4355859c31dfSAlex Elder  *      An optional snapshot id.  If provided, the mapping will
4356859c31dfSAlex Elder  *      present data from the image at the time that snapshot was
4357859c31dfSAlex Elder  *      created.  The image head is used if no snapshot id is
4358859c31dfSAlex Elder  *      provided.  Snapshot mappings are always read-only.
4359a725f65eSAlex Elder  */
4360859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf,
4361dc79b113SAlex Elder 				struct ceph_options **ceph_opts,
4362859c31dfSAlex Elder 				struct rbd_options **opts,
4363859c31dfSAlex Elder 				struct rbd_spec **rbd_spec)
4364a725f65eSAlex Elder {
4365e28fff26SAlex Elder 	size_t len;
4366859c31dfSAlex Elder 	char *options;
43670ddebc0cSAlex Elder 	const char *mon_addrs;
4368ecb4dc22SAlex Elder 	char *snap_name;
43690ddebc0cSAlex Elder 	size_t mon_addrs_size;
4370859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
43714e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
4372859c31dfSAlex Elder 	struct ceph_options *copts;
4373dc79b113SAlex Elder 	int ret;
4374e28fff26SAlex Elder 
4375e28fff26SAlex Elder 	/* The first four tokens are required */
4376e28fff26SAlex Elder 
43777ef3214aSAlex Elder 	len = next_token(&buf);
43784fb5d671SAlex Elder 	if (!len) {
43794fb5d671SAlex Elder 		rbd_warn(NULL, "no monitor address(es) provided");
43804fb5d671SAlex Elder 		return -EINVAL;
43814fb5d671SAlex Elder 	}
43820ddebc0cSAlex Elder 	mon_addrs = buf;
4383f28e565aSAlex Elder 	mon_addrs_size = len + 1;
43847ef3214aSAlex Elder 	buf += len;
4385a725f65eSAlex Elder 
4386dc79b113SAlex Elder 	ret = -EINVAL;
4387f28e565aSAlex Elder 	options = dup_token(&buf, NULL);
4388f28e565aSAlex Elder 	if (!options)
4389dc79b113SAlex Elder 		return -ENOMEM;
43904fb5d671SAlex Elder 	if (!*options) {
43914fb5d671SAlex Elder 		rbd_warn(NULL, "no options provided");
43924fb5d671SAlex Elder 		goto out_err;
43934fb5d671SAlex Elder 	}
4394a725f65eSAlex Elder 
4395859c31dfSAlex Elder 	spec = rbd_spec_alloc();
4396859c31dfSAlex Elder 	if (!spec)
4397f28e565aSAlex Elder 		goto out_mem;
4398859c31dfSAlex Elder 
4399859c31dfSAlex Elder 	spec->pool_name = dup_token(&buf, NULL);
4400859c31dfSAlex Elder 	if (!spec->pool_name)
4401859c31dfSAlex Elder 		goto out_mem;
44024fb5d671SAlex Elder 	if (!*spec->pool_name) {
44034fb5d671SAlex Elder 		rbd_warn(NULL, "no pool name provided");
44044fb5d671SAlex Elder 		goto out_err;
44054fb5d671SAlex Elder 	}
4406e28fff26SAlex Elder 
440769e7a02fSAlex Elder 	spec->image_name = dup_token(&buf, NULL);
4408859c31dfSAlex Elder 	if (!spec->image_name)
4409f28e565aSAlex Elder 		goto out_mem;
44104fb5d671SAlex Elder 	if (!*spec->image_name) {
44114fb5d671SAlex Elder 		rbd_warn(NULL, "no image name provided");
44124fb5d671SAlex Elder 		goto out_err;
44134fb5d671SAlex Elder 	}
4414e28fff26SAlex Elder 
4415f28e565aSAlex Elder 	/*
4416f28e565aSAlex Elder 	 * Snapshot name is optional; default is to use "-"
4417f28e565aSAlex Elder 	 * (indicating the head/no snapshot).
4418f28e565aSAlex Elder 	 */
44193feeb894SAlex Elder 	len = next_token(&buf);
4420820a5f3eSAlex Elder 	if (!len) {
44213feeb894SAlex Elder 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
44223feeb894SAlex Elder 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
4423f28e565aSAlex Elder 	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
4424dc79b113SAlex Elder 		ret = -ENAMETOOLONG;
4425f28e565aSAlex Elder 		goto out_err;
4426849b4260SAlex Elder 	}
4427ecb4dc22SAlex Elder 	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
4428ecb4dc22SAlex Elder 	if (!snap_name)
4429f28e565aSAlex Elder 		goto out_mem;
4430ecb4dc22SAlex Elder 	*(snap_name + len) = '\0';
4431ecb4dc22SAlex Elder 	spec->snap_name = snap_name;
4432e5c35534SAlex Elder 
44330ddebc0cSAlex Elder 	/* Initialize all rbd options to the defaults */
4434e28fff26SAlex Elder 
44354e9afebaSAlex Elder 	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
44364e9afebaSAlex Elder 	if (!rbd_opts)
44374e9afebaSAlex Elder 		goto out_mem;
44384e9afebaSAlex Elder 
44394e9afebaSAlex Elder 	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
4440d22f76e7SAlex Elder 
4441859c31dfSAlex Elder 	copts = ceph_parse_options(options, mon_addrs,
44420ddebc0cSAlex Elder 					mon_addrs + mon_addrs_size - 1,
44434e9afebaSAlex Elder 					parse_rbd_opts_token, rbd_opts);
4444859c31dfSAlex Elder 	if (IS_ERR(copts)) {
4445859c31dfSAlex Elder 		ret = PTR_ERR(copts);
4446dc79b113SAlex Elder 		goto out_err;
4447dc79b113SAlex Elder 	}
4448859c31dfSAlex Elder 	kfree(options);
4449859c31dfSAlex Elder 
4450859c31dfSAlex Elder 	*ceph_opts = copts;
44514e9afebaSAlex Elder 	*opts = rbd_opts;
4452859c31dfSAlex Elder 	*rbd_spec = spec;
44530ddebc0cSAlex Elder 
4454dc79b113SAlex Elder 	return 0;
4455f28e565aSAlex Elder out_mem:
4456dc79b113SAlex Elder 	ret = -ENOMEM;
4457d22f76e7SAlex Elder out_err:
4458859c31dfSAlex Elder 	kfree(rbd_opts);
4459859c31dfSAlex Elder 	rbd_spec_put(spec);
4460f28e565aSAlex Elder 	kfree(options);
4461d22f76e7SAlex Elder 
4462dc79b113SAlex Elder 	return ret;
4463a725f65eSAlex Elder }
4464a725f65eSAlex Elder 
4465589d30e0SAlex Elder /*
4466589d30e0SAlex Elder  * An rbd format 2 image has a unique identifier, distinct from the
4467589d30e0SAlex Elder  * name given to it by the user.  Internally, that identifier is
4468589d30e0SAlex Elder  * what's used to specify the names of objects related to the image.
4469589d30e0SAlex Elder  *
4470589d30e0SAlex Elder  * A special "rbd id" object is used to map an rbd image name to its
4471589d30e0SAlex Elder  * id.  If that object doesn't exist, then there is no v2 rbd image
4472589d30e0SAlex Elder  * with the supplied name.
4473589d30e0SAlex Elder  *
4474589d30e0SAlex Elder  * This function will record the given rbd_dev's image_id field if
4475589d30e0SAlex Elder  * it can be determined, and in that case will return 0.  If any
4476589d30e0SAlex Elder  * errors occur a negative errno will be returned and the rbd_dev's
4477589d30e0SAlex Elder  * image_id field will be unchanged (and should be NULL).
4478589d30e0SAlex Elder  */
4479589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev)
4480589d30e0SAlex Elder {
4481589d30e0SAlex Elder 	int ret;
4482589d30e0SAlex Elder 	size_t size;
4483589d30e0SAlex Elder 	char *object_name;
4484589d30e0SAlex Elder 	void *response;
4485c0fba368SAlex Elder 	char *image_id;
44862f82ee54SAlex Elder 
4487589d30e0SAlex Elder 	/*
44882c0d0a10SAlex Elder 	 * When probing a parent image, the image id is already
44892c0d0a10SAlex Elder 	 * known (and the image name likely is not).  There's no
4490c0fba368SAlex Elder 	 * need to fetch the image id again in this case.  We
4491c0fba368SAlex Elder 	 * do still need to set the image format though.
44922c0d0a10SAlex Elder 	 */
4493c0fba368SAlex Elder 	if (rbd_dev->spec->image_id) {
4494c0fba368SAlex Elder 		rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
4495c0fba368SAlex Elder 
44962c0d0a10SAlex Elder 		return 0;
4497c0fba368SAlex Elder 	}
44982c0d0a10SAlex Elder 
44992c0d0a10SAlex Elder 	/*
4500589d30e0SAlex Elder 	 * First, see if the format 2 image id file exists, and if
4501589d30e0SAlex Elder 	 * so, get the image's persistent id from it.
4502589d30e0SAlex Elder 	 */
450369e7a02fSAlex Elder 	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
4504589d30e0SAlex Elder 	object_name = kmalloc(size, GFP_NOIO);
4505589d30e0SAlex Elder 	if (!object_name)
4506589d30e0SAlex Elder 		return -ENOMEM;
45070d7dbfceSAlex Elder 	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
4508589d30e0SAlex Elder 	dout("rbd id object name is %s\n", object_name);
4509589d30e0SAlex Elder 
4510589d30e0SAlex Elder 	/* Response will be an encoded string, which includes a length */
4511589d30e0SAlex Elder 
4512589d30e0SAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
4513589d30e0SAlex Elder 	response = kzalloc(size, GFP_NOIO);
4514589d30e0SAlex Elder 	if (!response) {
4515589d30e0SAlex Elder 		ret = -ENOMEM;
4516589d30e0SAlex Elder 		goto out;
4517589d30e0SAlex Elder 	}
4518589d30e0SAlex Elder 
4519c0fba368SAlex Elder 	/* If it doesn't exist we'll assume it's a format 1 image */
4520c0fba368SAlex Elder 
452136be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, object_name,
45224157976bSAlex Elder 				"rbd", "get_id", NULL, 0,
452307b2391fSAlex Elder 				response, RBD_IMAGE_ID_LEN_MAX, NULL);
452436be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4525c0fba368SAlex Elder 	if (ret == -ENOENT) {
4526c0fba368SAlex Elder 		image_id = kstrdup("", GFP_KERNEL);
4527c0fba368SAlex Elder 		ret = image_id ? 0 : -ENOMEM;
4528c0fba368SAlex Elder 		if (!ret)
4529c0fba368SAlex Elder 			rbd_dev->image_format = 1;
4530c0fba368SAlex Elder 	} else if (ret > sizeof (__le32)) {
4531c0fba368SAlex Elder 		void *p = response;
4532589d30e0SAlex Elder 
4533c0fba368SAlex Elder 		image_id = ceph_extract_encoded_string(&p, p + ret,
4534979ed480SAlex Elder 						NULL, GFP_NOIO);
4535c0fba368SAlex Elder 		ret = IS_ERR(image_id) ? PTR_ERR(image_id) : 0;
4536c0fba368SAlex Elder 		if (!ret)
4537c0fba368SAlex Elder 			rbd_dev->image_format = 2;
4538589d30e0SAlex Elder 	} else {
4539c0fba368SAlex Elder 		ret = -EINVAL;
4540c0fba368SAlex Elder 	}
4541c0fba368SAlex Elder 
4542c0fba368SAlex Elder 	if (!ret) {
4543c0fba368SAlex Elder 		rbd_dev->spec->image_id = image_id;
4544c0fba368SAlex Elder 		dout("image_id is %s\n", image_id);
4545589d30e0SAlex Elder 	}
4546589d30e0SAlex Elder out:
4547589d30e0SAlex Elder 	kfree(response);
4548589d30e0SAlex Elder 	kfree(object_name);
4549589d30e0SAlex Elder 
4550589d30e0SAlex Elder 	return ret;
4551589d30e0SAlex Elder }
4552589d30e0SAlex Elder 
45536fd48b3bSAlex Elder /* Undo whatever state changes are made by v1 or v2 image probe */
45546fd48b3bSAlex Elder 
45556fd48b3bSAlex Elder static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
45566fd48b3bSAlex Elder {
45576fd48b3bSAlex Elder 	struct rbd_image_header	*header;
45586fd48b3bSAlex Elder 
45596fd48b3bSAlex Elder 	rbd_dev_remove_parent(rbd_dev);
45606fd48b3bSAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
45616fd48b3bSAlex Elder 	rbd_dev->parent_spec = NULL;
45626fd48b3bSAlex Elder 	rbd_dev->parent_overlap = 0;
45636fd48b3bSAlex Elder 
45646fd48b3bSAlex Elder 	/* Free dynamic fields from the header, then zero it out */
45656fd48b3bSAlex Elder 
45666fd48b3bSAlex Elder 	header = &rbd_dev->header;
4567812164f8SAlex Elder 	ceph_put_snap_context(header->snapc);
45686fd48b3bSAlex Elder 	kfree(header->snap_sizes);
45696fd48b3bSAlex Elder 	kfree(header->snap_names);
45706fd48b3bSAlex Elder 	kfree(header->object_prefix);
45716fd48b3bSAlex Elder 	memset(header, 0, sizeof (*header));
45726fd48b3bSAlex Elder }
45736fd48b3bSAlex Elder 
4574a30b71b9SAlex Elder static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
4575a30b71b9SAlex Elder {
4576a30b71b9SAlex Elder 	int ret;
4577a30b71b9SAlex Elder 
4578a30b71b9SAlex Elder 	/* Populate rbd image metadata */
4579a30b71b9SAlex Elder 
4580a30b71b9SAlex Elder 	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
4581a30b71b9SAlex Elder 	if (ret < 0)
4582a30b71b9SAlex Elder 		goto out_err;
458386b00e0dSAlex Elder 
458486b00e0dSAlex Elder 	/* Version 1 images have no parent (no layering) */
458586b00e0dSAlex Elder 
458686b00e0dSAlex Elder 	rbd_dev->parent_spec = NULL;
458786b00e0dSAlex Elder 	rbd_dev->parent_overlap = 0;
458886b00e0dSAlex Elder 
4589a30b71b9SAlex Elder 	dout("discovered version 1 image, header name is %s\n",
4590a30b71b9SAlex Elder 		rbd_dev->header_name);
4591a30b71b9SAlex Elder 
4592a30b71b9SAlex Elder 	return 0;
4593a30b71b9SAlex Elder 
4594a30b71b9SAlex Elder out_err:
4595a30b71b9SAlex Elder 	kfree(rbd_dev->header_name);
4596a30b71b9SAlex Elder 	rbd_dev->header_name = NULL;
45970d7dbfceSAlex Elder 	kfree(rbd_dev->spec->image_id);
45980d7dbfceSAlex Elder 	rbd_dev->spec->image_id = NULL;
4599a30b71b9SAlex Elder 
4600a30b71b9SAlex Elder 	return ret;
4601a30b71b9SAlex Elder }
4602a30b71b9SAlex Elder 
4603a30b71b9SAlex Elder static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
4604a30b71b9SAlex Elder {
46059d475de5SAlex Elder 	int ret;
46066e14b1a6SAlex Elder 	u64 ver = 0;
4607a30b71b9SAlex Elder 
46089d475de5SAlex Elder 	ret = rbd_dev_v2_image_size(rbd_dev);
460957385b51SAlex Elder 	if (ret)
46109d475de5SAlex Elder 		goto out_err;
46111e130199SAlex Elder 
46121e130199SAlex Elder 	/* Get the object prefix (a.k.a. block_name) for the image */
46131e130199SAlex Elder 
46141e130199SAlex Elder 	ret = rbd_dev_v2_object_prefix(rbd_dev);
461557385b51SAlex Elder 	if (ret)
46161e130199SAlex Elder 		goto out_err;
4617b1b5402aSAlex Elder 
4618d889140cSAlex Elder 	/* Get the and check features for the image */
4619b1b5402aSAlex Elder 
4620b1b5402aSAlex Elder 	ret = rbd_dev_v2_features(rbd_dev);
462157385b51SAlex Elder 	if (ret)
4622b1b5402aSAlex Elder 		goto out_err;
462335d489f9SAlex Elder 
462486b00e0dSAlex Elder 	/* If the image supports layering, get the parent info */
462586b00e0dSAlex Elder 
462686b00e0dSAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
462786b00e0dSAlex Elder 		ret = rbd_dev_v2_parent_info(rbd_dev);
462857385b51SAlex Elder 		if (ret)
462986b00e0dSAlex Elder 			goto out_err;
463096882f55SAlex Elder 
463196882f55SAlex Elder 		/*
463296882f55SAlex Elder 		 * Don't print a warning for parent images.  We can
463396882f55SAlex Elder 		 * tell this point because we won't know its pool
463496882f55SAlex Elder 		 * name yet (just its pool id).
463596882f55SAlex Elder 		 */
463696882f55SAlex Elder 		if (rbd_dev->spec->pool_name)
463796882f55SAlex Elder 			rbd_warn(rbd_dev, "WARNING: kernel layering "
463896882f55SAlex Elder 					"is EXPERIMENTAL!");
463986b00e0dSAlex Elder 	}
464086b00e0dSAlex Elder 
4641cc070d59SAlex Elder 	/* If the image supports fancy striping, get its parameters */
4642cc070d59SAlex Elder 
4643cc070d59SAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
4644cc070d59SAlex Elder 		ret = rbd_dev_v2_striping_info(rbd_dev);
4645cc070d59SAlex Elder 		if (ret < 0)
4646cc070d59SAlex Elder 			goto out_err;
4647cc070d59SAlex Elder 	}
4648cc070d59SAlex Elder 
46496e14b1a6SAlex Elder 	/* crypto and compression type aren't (yet) supported for v2 images */
465035d489f9SAlex Elder 
46516e14b1a6SAlex Elder 	rbd_dev->header.crypt_type = 0;
46526e14b1a6SAlex Elder 	rbd_dev->header.comp_type = 0;
46536e14b1a6SAlex Elder 
46546e14b1a6SAlex Elder 	/* Get the snapshot context, plus the header version */
46556e14b1a6SAlex Elder 
46566e14b1a6SAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
465735d489f9SAlex Elder 	if (ret)
465835d489f9SAlex Elder 		goto out_err;
46596e14b1a6SAlex Elder 	rbd_dev->header.obj_version = ver;
46606e14b1a6SAlex Elder 
4661a30b71b9SAlex Elder 	dout("discovered version 2 image, header name is %s\n",
4662a30b71b9SAlex Elder 		rbd_dev->header_name);
4663a30b71b9SAlex Elder 
466435152979SAlex Elder 	return 0;
46659d475de5SAlex Elder out_err:
466686b00e0dSAlex Elder 	rbd_dev->parent_overlap = 0;
466786b00e0dSAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
466886b00e0dSAlex Elder 	rbd_dev->parent_spec = NULL;
46699d475de5SAlex Elder 	kfree(rbd_dev->header_name);
46709d475de5SAlex Elder 	rbd_dev->header_name = NULL;
46711e130199SAlex Elder 	kfree(rbd_dev->header.object_prefix);
46721e130199SAlex Elder 	rbd_dev->header.object_prefix = NULL;
46739d475de5SAlex Elder 
46749d475de5SAlex Elder 	return ret;
4675a30b71b9SAlex Elder }
4676a30b71b9SAlex Elder 
4677124afba2SAlex Elder static int rbd_dev_probe_parent(struct rbd_device *rbd_dev)
467883a06263SAlex Elder {
46792f82ee54SAlex Elder 	struct rbd_device *parent = NULL;
4680124afba2SAlex Elder 	struct rbd_spec *parent_spec;
4681124afba2SAlex Elder 	struct rbd_client *rbdc;
4682124afba2SAlex Elder 	int ret;
4683124afba2SAlex Elder 
4684124afba2SAlex Elder 	if (!rbd_dev->parent_spec)
4685124afba2SAlex Elder 		return 0;
4686124afba2SAlex Elder 	/*
4687124afba2SAlex Elder 	 * We need to pass a reference to the client and the parent
4688124afba2SAlex Elder 	 * spec when creating the parent rbd_dev.  Images related by
4689124afba2SAlex Elder 	 * parent/child relationships always share both.
4690124afba2SAlex Elder 	 */
4691124afba2SAlex Elder 	parent_spec = rbd_spec_get(rbd_dev->parent_spec);
4692124afba2SAlex Elder 	rbdc = __rbd_get_client(rbd_dev->rbd_client);
4693124afba2SAlex Elder 
4694124afba2SAlex Elder 	ret = -ENOMEM;
4695124afba2SAlex Elder 	parent = rbd_dev_create(rbdc, parent_spec);
4696124afba2SAlex Elder 	if (!parent)
4697124afba2SAlex Elder 		goto out_err;
4698124afba2SAlex Elder 
4699124afba2SAlex Elder 	ret = rbd_dev_image_probe(parent);
4700124afba2SAlex Elder 	if (ret < 0)
4701124afba2SAlex Elder 		goto out_err;
4702124afba2SAlex Elder 	rbd_dev->parent = parent;
4703124afba2SAlex Elder 
4704124afba2SAlex Elder 	return 0;
4705124afba2SAlex Elder out_err:
4706124afba2SAlex Elder 	if (parent) {
4707124afba2SAlex Elder 		rbd_spec_put(rbd_dev->parent_spec);
4708124afba2SAlex Elder 		kfree(rbd_dev->header_name);
4709124afba2SAlex Elder 		rbd_dev_destroy(parent);
4710124afba2SAlex Elder 	} else {
4711124afba2SAlex Elder 		rbd_put_client(rbdc);
4712124afba2SAlex Elder 		rbd_spec_put(parent_spec);
4713124afba2SAlex Elder 	}
4714124afba2SAlex Elder 
4715124afba2SAlex Elder 	return ret;
4716124afba2SAlex Elder }
4717124afba2SAlex Elder 
4718200a6a8bSAlex Elder static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
4719124afba2SAlex Elder {
472083a06263SAlex Elder 	int ret;
472183a06263SAlex Elder 
4722d1cf5788SAlex Elder 	ret = rbd_dev_mapping_set(rbd_dev);
472383a06263SAlex Elder 	if (ret)
47249bb81c9bSAlex Elder 		return ret;
47255de10f3bSAlex Elder 
472683a06263SAlex Elder 	/* generate unique id: find highest unique id, add one */
472783a06263SAlex Elder 	rbd_dev_id_get(rbd_dev);
472883a06263SAlex Elder 
472983a06263SAlex Elder 	/* Fill in the device name, now that we have its id. */
473083a06263SAlex Elder 	BUILD_BUG_ON(DEV_NAME_LEN
473183a06263SAlex Elder 			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
473283a06263SAlex Elder 	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
473383a06263SAlex Elder 
473483a06263SAlex Elder 	/* Get our block major device number. */
473583a06263SAlex Elder 
473683a06263SAlex Elder 	ret = register_blkdev(0, rbd_dev->name);
473783a06263SAlex Elder 	if (ret < 0)
473883a06263SAlex Elder 		goto err_out_id;
473983a06263SAlex Elder 	rbd_dev->major = ret;
474083a06263SAlex Elder 
474183a06263SAlex Elder 	/* Set up the blkdev mapping. */
474283a06263SAlex Elder 
474383a06263SAlex Elder 	ret = rbd_init_disk(rbd_dev);
474483a06263SAlex Elder 	if (ret)
474583a06263SAlex Elder 		goto err_out_blkdev;
474683a06263SAlex Elder 
474783a06263SAlex Elder 	ret = rbd_bus_add_dev(rbd_dev);
474883a06263SAlex Elder 	if (ret)
474983a06263SAlex Elder 		goto err_out_disk;
475083a06263SAlex Elder 
475183a06263SAlex Elder 	/* Everything's ready.  Announce the disk to the world. */
475283a06263SAlex Elder 
4753b5156e76SAlex Elder 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
4754129b79d4SAlex Elder 	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
475583a06263SAlex Elder 	add_disk(rbd_dev->disk);
475683a06263SAlex Elder 
475783a06263SAlex Elder 	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
475883a06263SAlex Elder 		(unsigned long long) rbd_dev->mapping.size);
475983a06263SAlex Elder 
476083a06263SAlex Elder 	return ret;
47612f82ee54SAlex Elder 
476283a06263SAlex Elder err_out_disk:
476383a06263SAlex Elder 	rbd_free_disk(rbd_dev);
476483a06263SAlex Elder err_out_blkdev:
476583a06263SAlex Elder 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
476683a06263SAlex Elder err_out_id:
476783a06263SAlex Elder 	rbd_dev_id_put(rbd_dev);
4768d1cf5788SAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
476983a06263SAlex Elder 
477083a06263SAlex Elder 	return ret;
477183a06263SAlex Elder }
477283a06263SAlex Elder 
4773332bb12dSAlex Elder static int rbd_dev_header_name(struct rbd_device *rbd_dev)
4774332bb12dSAlex Elder {
4775332bb12dSAlex Elder 	struct rbd_spec *spec = rbd_dev->spec;
4776332bb12dSAlex Elder 	size_t size;
4777332bb12dSAlex Elder 
4778332bb12dSAlex Elder 	/* Record the header object name for this rbd image. */
4779332bb12dSAlex Elder 
4780332bb12dSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
4781332bb12dSAlex Elder 
4782332bb12dSAlex Elder 	if (rbd_dev->image_format == 1)
4783332bb12dSAlex Elder 		size = strlen(spec->image_name) + sizeof (RBD_SUFFIX);
4784332bb12dSAlex Elder 	else
4785332bb12dSAlex Elder 		size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id);
4786332bb12dSAlex Elder 
4787332bb12dSAlex Elder 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
4788332bb12dSAlex Elder 	if (!rbd_dev->header_name)
4789332bb12dSAlex Elder 		return -ENOMEM;
4790332bb12dSAlex Elder 
4791332bb12dSAlex Elder 	if (rbd_dev->image_format == 1)
4792332bb12dSAlex Elder 		sprintf(rbd_dev->header_name, "%s%s",
4793332bb12dSAlex Elder 			spec->image_name, RBD_SUFFIX);
4794332bb12dSAlex Elder 	else
4795332bb12dSAlex Elder 		sprintf(rbd_dev->header_name, "%s%s",
4796332bb12dSAlex Elder 			RBD_HEADER_PREFIX, spec->image_id);
4797332bb12dSAlex Elder 	return 0;
4798332bb12dSAlex Elder }
4799332bb12dSAlex Elder 
4800200a6a8bSAlex Elder static void rbd_dev_image_release(struct rbd_device *rbd_dev)
4801200a6a8bSAlex Elder {
48026fd48b3bSAlex Elder 	int ret;
48036fd48b3bSAlex Elder 
48046fd48b3bSAlex Elder 	rbd_remove_all_snaps(rbd_dev);
48056fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
48066fd48b3bSAlex Elder 	ret = rbd_dev_header_watch_sync(rbd_dev, 0);
48076fd48b3bSAlex Elder 	if (ret)
48086fd48b3bSAlex Elder 		rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret);
4809200a6a8bSAlex Elder 	kfree(rbd_dev->header_name);
48106fd48b3bSAlex Elder 	rbd_dev->header_name = NULL;
48116fd48b3bSAlex Elder 	rbd_dev->image_format = 0;
48126fd48b3bSAlex Elder 	kfree(rbd_dev->spec->image_id);
48136fd48b3bSAlex Elder 	rbd_dev->spec->image_id = NULL;
48146fd48b3bSAlex Elder 
4815200a6a8bSAlex Elder 	rbd_dev_destroy(rbd_dev);
4816200a6a8bSAlex Elder }
4817200a6a8bSAlex Elder 
4818a30b71b9SAlex Elder /*
4819a30b71b9SAlex Elder  * Probe for the existence of the header object for the given rbd
4820a30b71b9SAlex Elder  * device.  For format 2 images this includes determining the image
4821a30b71b9SAlex Elder  * id.
4822a30b71b9SAlex Elder  */
482371f293e2SAlex Elder static int rbd_dev_image_probe(struct rbd_device *rbd_dev)
4824a30b71b9SAlex Elder {
4825a30b71b9SAlex Elder 	int ret;
4826b644de2bSAlex Elder 	int tmp;
4827a30b71b9SAlex Elder 
4828a30b71b9SAlex Elder 	/*
4829a30b71b9SAlex Elder 	 * Get the id from the image id object.  If it's not a
4830a30b71b9SAlex Elder 	 * format 2 image, we'll get ENOENT back, and we'll assume
4831a30b71b9SAlex Elder 	 * it's a format 1 image.
4832a30b71b9SAlex Elder 	 */
4833a30b71b9SAlex Elder 	ret = rbd_dev_image_id(rbd_dev);
4834a30b71b9SAlex Elder 	if (ret)
4835c0fba368SAlex Elder 		return ret;
4836c0fba368SAlex Elder 	rbd_assert(rbd_dev->spec->image_id);
4837c0fba368SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
4838c0fba368SAlex Elder 
4839332bb12dSAlex Elder 	ret = rbd_dev_header_name(rbd_dev);
4840332bb12dSAlex Elder 	if (ret)
4841332bb12dSAlex Elder 		goto err_out_format;
4842332bb12dSAlex Elder 
4843b644de2bSAlex Elder 	ret = rbd_dev_header_watch_sync(rbd_dev, 1);
4844b644de2bSAlex Elder 	if (ret)
4845b644de2bSAlex Elder 		goto out_header_name;
4846b644de2bSAlex Elder 
4847c0fba368SAlex Elder 	if (rbd_dev->image_format == 1)
4848a30b71b9SAlex Elder 		ret = rbd_dev_v1_probe(rbd_dev);
4849a30b71b9SAlex Elder 	else
4850a30b71b9SAlex Elder 		ret = rbd_dev_v2_probe(rbd_dev);
48515655c4d9SAlex Elder 	if (ret)
4852b644de2bSAlex Elder 		goto err_out_watch;
4853a30b71b9SAlex Elder 
48549bb81c9bSAlex Elder 	ret = rbd_dev_snaps_update(rbd_dev);
48559bb81c9bSAlex Elder 	if (ret)
48566fd48b3bSAlex Elder 		goto err_out_probe;
48579bb81c9bSAlex Elder 
48589bb81c9bSAlex Elder 	ret = rbd_dev_spec_update(rbd_dev);
48599bb81c9bSAlex Elder 	if (ret)
48609bb81c9bSAlex Elder 		goto err_out_snaps;
48619bb81c9bSAlex Elder 
48629bb81c9bSAlex Elder 	ret = rbd_dev_probe_parent(rbd_dev);
48636fd48b3bSAlex Elder 	if (!ret)
48646fd48b3bSAlex Elder 		return 0;
486583a06263SAlex Elder 
48669bb81c9bSAlex Elder err_out_snaps:
48679bb81c9bSAlex Elder 	rbd_remove_all_snaps(rbd_dev);
48686fd48b3bSAlex Elder err_out_probe:
48696fd48b3bSAlex Elder 	rbd_dev_unprobe(rbd_dev);
4870b644de2bSAlex Elder err_out_watch:
4871b644de2bSAlex Elder 	tmp = rbd_dev_header_watch_sync(rbd_dev, 0);
4872b644de2bSAlex Elder 	if (tmp)
4873b644de2bSAlex Elder 		rbd_warn(rbd_dev, "unable to tear down watch request\n");
4874332bb12dSAlex Elder out_header_name:
4875332bb12dSAlex Elder 	kfree(rbd_dev->header_name);
4876332bb12dSAlex Elder 	rbd_dev->header_name = NULL;
4877332bb12dSAlex Elder err_out_format:
4878332bb12dSAlex Elder 	rbd_dev->image_format = 0;
48795655c4d9SAlex Elder 	kfree(rbd_dev->spec->image_id);
48805655c4d9SAlex Elder 	rbd_dev->spec->image_id = NULL;
48815655c4d9SAlex Elder 
48825655c4d9SAlex Elder 	dout("probe failed, returning %d\n", ret);
48835655c4d9SAlex Elder 
48845655c4d9SAlex Elder 	return ret;
488583a06263SAlex Elder }
488683a06263SAlex Elder 
488759c2be1eSYehuda Sadeh static ssize_t rbd_add(struct bus_type *bus,
488859c2be1eSYehuda Sadeh 		       const char *buf,
488959c2be1eSYehuda Sadeh 		       size_t count)
4890602adf40SYehuda Sadeh {
4891cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
4892dc79b113SAlex Elder 	struct ceph_options *ceph_opts = NULL;
48934e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
4894859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
48959d3997fdSAlex Elder 	struct rbd_client *rbdc;
489627cc2594SAlex Elder 	struct ceph_osd_client *osdc;
489727cc2594SAlex Elder 	int rc = -ENOMEM;
4898602adf40SYehuda Sadeh 
4899602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
4900602adf40SYehuda Sadeh 		return -ENODEV;
4901602adf40SYehuda Sadeh 
4902a725f65eSAlex Elder 	/* parse add command */
4903859c31dfSAlex Elder 	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
4904dc79b113SAlex Elder 	if (rc < 0)
4905bd4ba655SAlex Elder 		goto err_out_module;
4906a725f65eSAlex Elder 
49079d3997fdSAlex Elder 	rbdc = rbd_get_client(ceph_opts);
49089d3997fdSAlex Elder 	if (IS_ERR(rbdc)) {
49099d3997fdSAlex Elder 		rc = PTR_ERR(rbdc);
49100ddebc0cSAlex Elder 		goto err_out_args;
49119d3997fdSAlex Elder 	}
4912c53d5893SAlex Elder 	ceph_opts = NULL;	/* rbd_dev client now owns this */
4913602adf40SYehuda Sadeh 
4914602adf40SYehuda Sadeh 	/* pick the pool */
49159d3997fdSAlex Elder 	osdc = &rbdc->client->osdc;
4916859c31dfSAlex Elder 	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
4917602adf40SYehuda Sadeh 	if (rc < 0)
4918602adf40SYehuda Sadeh 		goto err_out_client;
4919859c31dfSAlex Elder 	spec->pool_id = (u64)rc;
4920859c31dfSAlex Elder 
49210903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
49220903e875SAlex Elder 
4923c0cd10dbSAlex Elder 	if (spec->pool_id > (u64)U32_MAX) {
4924c0cd10dbSAlex Elder 		rbd_warn(NULL, "pool id too large (%llu > %u)\n",
4925c0cd10dbSAlex Elder 				(unsigned long long)spec->pool_id, U32_MAX);
49260903e875SAlex Elder 		rc = -EIO;
49270903e875SAlex Elder 		goto err_out_client;
49280903e875SAlex Elder 	}
49290903e875SAlex Elder 
4930c53d5893SAlex Elder 	rbd_dev = rbd_dev_create(rbdc, spec);
4931bd4ba655SAlex Elder 	if (!rbd_dev)
4932bd4ba655SAlex Elder 		goto err_out_client;
4933c53d5893SAlex Elder 	rbdc = NULL;		/* rbd_dev now owns this */
4934c53d5893SAlex Elder 	spec = NULL;		/* rbd_dev now owns this */
4935602adf40SYehuda Sadeh 
4936bd4ba655SAlex Elder 	rbd_dev->mapping.read_only = rbd_opts->read_only;
4937c53d5893SAlex Elder 	kfree(rbd_opts);
4938c53d5893SAlex Elder 	rbd_opts = NULL;	/* done with this */
4939bd4ba655SAlex Elder 
494071f293e2SAlex Elder 	rc = rbd_dev_image_probe(rbd_dev);
4941a30b71b9SAlex Elder 	if (rc < 0)
4942c53d5893SAlex Elder 		goto err_out_rbd_dev;
494305fd6f6fSAlex Elder 
4944b536f69aSAlex Elder 	rc = rbd_dev_device_setup(rbd_dev);
4945b536f69aSAlex Elder 	if (!rc)
4946602adf40SYehuda Sadeh 		return count;
4947b536f69aSAlex Elder 
4948b536f69aSAlex Elder 	rbd_dev_image_release(rbd_dev);
4949c53d5893SAlex Elder err_out_rbd_dev:
4950c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
4951bd4ba655SAlex Elder err_out_client:
49529d3997fdSAlex Elder 	rbd_put_client(rbdc);
49530ddebc0cSAlex Elder err_out_args:
495478cea76eSAlex Elder 	if (ceph_opts)
495578cea76eSAlex Elder 		ceph_destroy_options(ceph_opts);
49564e9afebaSAlex Elder 	kfree(rbd_opts);
4957859c31dfSAlex Elder 	rbd_spec_put(spec);
4958bd4ba655SAlex Elder err_out_module:
4959bd4ba655SAlex Elder 	module_put(THIS_MODULE);
496027cc2594SAlex Elder 
4961602adf40SYehuda Sadeh 	dout("Error adding device %s\n", buf);
496227cc2594SAlex Elder 
496327cc2594SAlex Elder 	return (ssize_t)rc;
4964602adf40SYehuda Sadeh }
4965602adf40SYehuda Sadeh 
4966de71a297SAlex Elder static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
4967602adf40SYehuda Sadeh {
4968602adf40SYehuda Sadeh 	struct list_head *tmp;
4969602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev;
4970602adf40SYehuda Sadeh 
4971e124a82fSAlex Elder 	spin_lock(&rbd_dev_list_lock);
4972602adf40SYehuda Sadeh 	list_for_each(tmp, &rbd_dev_list) {
4973602adf40SYehuda Sadeh 		rbd_dev = list_entry(tmp, struct rbd_device, node);
4974de71a297SAlex Elder 		if (rbd_dev->dev_id == dev_id) {
4975e124a82fSAlex Elder 			spin_unlock(&rbd_dev_list_lock);
4976602adf40SYehuda Sadeh 			return rbd_dev;
4977602adf40SYehuda Sadeh 		}
4978e124a82fSAlex Elder 	}
4979e124a82fSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
4980602adf40SYehuda Sadeh 	return NULL;
4981602adf40SYehuda Sadeh }
4982602adf40SYehuda Sadeh 
4983200a6a8bSAlex Elder static void rbd_dev_device_release(struct device *dev)
4984602adf40SYehuda Sadeh {
4985593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4986602adf40SYehuda Sadeh 
4987602adf40SYehuda Sadeh 	rbd_free_disk(rbd_dev);
4988200a6a8bSAlex Elder 	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
4989200a6a8bSAlex Elder 	rbd_dev_clear_mapping(rbd_dev);
4990602adf40SYehuda Sadeh 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
4991200a6a8bSAlex Elder 	rbd_dev->major = 0;
4992e2839308SAlex Elder 	rbd_dev_id_put(rbd_dev);
4993d1cf5788SAlex Elder 	rbd_dev_mapping_clear(rbd_dev);
4994602adf40SYehuda Sadeh }
4995602adf40SYehuda Sadeh 
499605a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
499705a46afdSAlex Elder {
4998ad945fc1SAlex Elder 	while (rbd_dev->parent) {
499905a46afdSAlex Elder 		struct rbd_device *first = rbd_dev;
500005a46afdSAlex Elder 		struct rbd_device *second = first->parent;
500105a46afdSAlex Elder 		struct rbd_device *third;
500205a46afdSAlex Elder 
500305a46afdSAlex Elder 		/*
500405a46afdSAlex Elder 		 * Follow to the parent with no grandparent and
500505a46afdSAlex Elder 		 * remove it.
500605a46afdSAlex Elder 		 */
500705a46afdSAlex Elder 		while (second && (third = second->parent)) {
500805a46afdSAlex Elder 			first = second;
500905a46afdSAlex Elder 			second = third;
501005a46afdSAlex Elder 		}
5011ad945fc1SAlex Elder 		rbd_assert(second);
50128ad42cd0SAlex Elder 		rbd_dev_image_release(second);
5013ad945fc1SAlex Elder 		first->parent = NULL;
5014ad945fc1SAlex Elder 		first->parent_overlap = 0;
5015ad945fc1SAlex Elder 
5016ad945fc1SAlex Elder 		rbd_assert(first->parent_spec);
501705a46afdSAlex Elder 		rbd_spec_put(first->parent_spec);
501805a46afdSAlex Elder 		first->parent_spec = NULL;
501905a46afdSAlex Elder 	}
502005a46afdSAlex Elder }
502105a46afdSAlex Elder 
5022dfc5606dSYehuda Sadeh static ssize_t rbd_remove(struct bus_type *bus,
5023602adf40SYehuda Sadeh 			  const char *buf,
5024602adf40SYehuda Sadeh 			  size_t count)
5025602adf40SYehuda Sadeh {
5026602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
50270d8189e1SAlex Elder 	int target_id;
5028602adf40SYehuda Sadeh 	unsigned long ul;
50290d8189e1SAlex Elder 	int ret;
5030602adf40SYehuda Sadeh 
50310d8189e1SAlex Elder 	ret = strict_strtoul(buf, 10, &ul);
50320d8189e1SAlex Elder 	if (ret)
50330d8189e1SAlex Elder 		return ret;
5034602adf40SYehuda Sadeh 
5035602adf40SYehuda Sadeh 	/* convert to int; abort if we lost anything in the conversion */
5036602adf40SYehuda Sadeh 	target_id = (int) ul;
5037602adf40SYehuda Sadeh 	if (target_id != ul)
5038602adf40SYehuda Sadeh 		return -EINVAL;
5039602adf40SYehuda Sadeh 
5040602adf40SYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
5041602adf40SYehuda Sadeh 
5042602adf40SYehuda Sadeh 	rbd_dev = __rbd_get_dev(target_id);
5043602adf40SYehuda Sadeh 	if (!rbd_dev) {
5044602adf40SYehuda Sadeh 		ret = -ENOENT;
5045602adf40SYehuda Sadeh 		goto done;
5046602adf40SYehuda Sadeh 	}
5047602adf40SYehuda Sadeh 
5048a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
5049b82d167bSAlex Elder 	if (rbd_dev->open_count)
505042382b70SAlex Elder 		ret = -EBUSY;
5051b82d167bSAlex Elder 	else
5052b82d167bSAlex Elder 		set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
5053a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
5054b82d167bSAlex Elder 	if (ret < 0)
505542382b70SAlex Elder 		goto done;
50560d8189e1SAlex Elder 	ret = count;
5057b480815aSAlex Elder 	rbd_bus_del_dev(rbd_dev);
50588ad42cd0SAlex Elder 	rbd_dev_image_release(rbd_dev);
505979ab7558SAlex Elder 	module_put(THIS_MODULE);
5060602adf40SYehuda Sadeh done:
5061602adf40SYehuda Sadeh 	mutex_unlock(&ctl_mutex);
5062aafb230eSAlex Elder 
5063602adf40SYehuda Sadeh 	return ret;
5064602adf40SYehuda Sadeh }
5065602adf40SYehuda Sadeh 
5066602adf40SYehuda Sadeh /*
5067602adf40SYehuda Sadeh  * create control files in sysfs
5068dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
5069602adf40SYehuda Sadeh  */
5070602adf40SYehuda Sadeh static int rbd_sysfs_init(void)
5071602adf40SYehuda Sadeh {
5072dfc5606dSYehuda Sadeh 	int ret;
5073602adf40SYehuda Sadeh 
5074fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
5075dfc5606dSYehuda Sadeh 	if (ret < 0)
5076dfc5606dSYehuda Sadeh 		return ret;
5077602adf40SYehuda Sadeh 
5078fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
5079fed4c143SAlex Elder 	if (ret < 0)
5080fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
5081602adf40SYehuda Sadeh 
5082602adf40SYehuda Sadeh 	return ret;
5083602adf40SYehuda Sadeh }
5084602adf40SYehuda Sadeh 
5085602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void)
5086602adf40SYehuda Sadeh {
5087dfc5606dSYehuda Sadeh 	bus_unregister(&rbd_bus_type);
5088fed4c143SAlex Elder 	device_unregister(&rbd_root_dev);
5089602adf40SYehuda Sadeh }
5090602adf40SYehuda Sadeh 
5091cc344fa1SAlex Elder static int __init rbd_init(void)
5092602adf40SYehuda Sadeh {
5093602adf40SYehuda Sadeh 	int rc;
5094602adf40SYehuda Sadeh 
50951e32d34cSAlex Elder 	if (!libceph_compatible(NULL)) {
50961e32d34cSAlex Elder 		rbd_warn(NULL, "libceph incompatibility (quitting)");
50971e32d34cSAlex Elder 
50981e32d34cSAlex Elder 		return -EINVAL;
50991e32d34cSAlex Elder 	}
5100602adf40SYehuda Sadeh 	rc = rbd_sysfs_init();
5101602adf40SYehuda Sadeh 	if (rc)
5102602adf40SYehuda Sadeh 		return rc;
5103f0f8cef5SAlex Elder 	pr_info("loaded " RBD_DRV_NAME_LONG "\n");
5104602adf40SYehuda Sadeh 	return 0;
5105602adf40SYehuda Sadeh }
5106602adf40SYehuda Sadeh 
5107cc344fa1SAlex Elder static void __exit rbd_exit(void)
5108602adf40SYehuda Sadeh {
5109602adf40SYehuda Sadeh 	rbd_sysfs_cleanup();
5110602adf40SYehuda Sadeh }
5111602adf40SYehuda Sadeh 
/* Register rbd_init() and rbd_exit() as the module's init and exit routines */
5112602adf40SYehuda Sadeh module_init(rbd_init);
5113602adf40SYehuda Sadeh module_exit(rbd_exit);
5114602adf40SYehuda Sadeh 
/* Module authorship and description */
5115602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
5116602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
5117602adf40SYehuda Sadeh MODULE_DESCRIPTION("rados block device");
5118602adf40SYehuda Sadeh 
5119602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */
5120602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
5121602adf40SYehuda Sadeh 
/* License declaration for the module */
5122602adf40SYehuda Sadeh MODULE_LICENSE("GPL");
5123