xref: /openbmc/linux/drivers/block/rbd.c (revision 9849e986)
/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

                 Documentation/ABI/testing/sysfs-bus-rbd

 */
29602adf40SYehuda Sadeh 
30602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
31602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
32602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3459c2be1eSYehuda Sadeh #include <linux/parser.h>
35602adf40SYehuda Sadeh 
36602adf40SYehuda Sadeh #include <linux/kernel.h>
37602adf40SYehuda Sadeh #include <linux/device.h>
38602adf40SYehuda Sadeh #include <linux/module.h>
39602adf40SYehuda Sadeh #include <linux/fs.h>
40602adf40SYehuda Sadeh #include <linux/blkdev.h>
41602adf40SYehuda Sadeh 
42602adf40SYehuda Sadeh #include "rbd_types.h"
43602adf40SYehuda Sadeh 
/* When defined, rbd_assert() performs a real check instead of nothing */
#define RBD_DEBUG

/*
 * Block I/O is expressed in sectors.  Several Linux layers (blk, bio,
 * genhd) interpret the sector, but the default size is universally
 * 512 bytes.  Named constants read slightly better than bare numbers.
 */
#define SECTOR_SHIFT		9
#define SECTOR_SIZE		(1ULL << SECTOR_SHIFT)

/* Short and long driver names */
#define RBD_DRV_NAME		"rbd"
#define RBD_DRV_NAME_LONG	"rbd (rados block device)"

/* max minors per blkdev */
#define RBD_MINORS_PER_MAJOR	256
59602adf40SYehuda Sadeh 
/*
 * Snapshot device names carry a fixed prefix, so the longest allowed
 * snapshot name is NAME_MAX less the prefix length (the "- 1" drops
 * the prefix literal's terminating NUL from the count).
 */
#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN \
		(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

/* allows max snapc to fit in 4KB */
#define RBD_MAX_SNAP_COUNT		510

#define RBD_SNAP_HEAD_NAME		"-"

/* Lets a single page hold an image name sent by an OSD */
#define RBD_IMAGE_NAME_LEN_MAX		(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX		64

#define RBD_OBJ_PREFIX_LEN_MAX		64
73589d30e0SAlex Elder 
/* Image feature bits */

#define RBD_FEATURE_LAYERING	(1<<0)
#define RBD_FEATURE_STRIPINGV2	(1<<1)
#define RBD_FEATURES_ALL \
		(RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)

/* Subset of the features this client implementation supports (none) */

#define RBD_FEATURES_SUPPORTED	(0)
84d889140cSAlex Elder 
/*
 * rbd devices are named "rbd#": RBD_DRV_NAME followed by a unique
 * integer id.  MAX_INT_FORMAT_WIDTH is used in ensuring that
 * DEV_NAME_LEN is large enough for every possible device name.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)
93602adf40SYehuda Sadeh 
94602adf40SYehuda Sadeh /*
95602adf40SYehuda Sadeh  * block device image metadata (in-memory version)
96602adf40SYehuda Sadeh  */
97602adf40SYehuda Sadeh struct rbd_image_header {
98f84344f3SAlex Elder 	/* These four fields never change for a given rbd image */
99849b4260SAlex Elder 	char *object_prefix;
10034b13184SAlex Elder 	u64 features;
101602adf40SYehuda Sadeh 	__u8 obj_order;
102602adf40SYehuda Sadeh 	__u8 crypt_type;
103602adf40SYehuda Sadeh 	__u8 comp_type;
104602adf40SYehuda Sadeh 
105f84344f3SAlex Elder 	/* The remaining fields need to be updated occasionally */
106f84344f3SAlex Elder 	u64 image_size;
107f84344f3SAlex Elder 	struct ceph_snap_context *snapc;
108602adf40SYehuda Sadeh 	char *snap_names;
109602adf40SYehuda Sadeh 	u64 *snap_sizes;
11059c2be1eSYehuda Sadeh 
11159c2be1eSYehuda Sadeh 	u64 obj_version;
11259c2be1eSYehuda Sadeh };
11359c2be1eSYehuda Sadeh 
1140d7dbfceSAlex Elder /*
1150d7dbfceSAlex Elder  * An rbd image specification.
1160d7dbfceSAlex Elder  *
1170d7dbfceSAlex Elder  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
118c66c6e0cSAlex Elder  * identify an image.  Each rbd_dev structure includes a pointer to
119c66c6e0cSAlex Elder  * an rbd_spec structure that encapsulates this identity.
120c66c6e0cSAlex Elder  *
121c66c6e0cSAlex Elder  * Each of the id's in an rbd_spec has an associated name.  For a
122c66c6e0cSAlex Elder  * user-mapped image, the names are supplied and the id's associated
123c66c6e0cSAlex Elder  * with them are looked up.  For a layered image, a parent image is
124c66c6e0cSAlex Elder  * defined by the tuple, and the names are looked up.
125c66c6e0cSAlex Elder  *
126c66c6e0cSAlex Elder  * An rbd_dev structure contains a parent_spec pointer which is
127c66c6e0cSAlex Elder  * non-null if the image it represents is a child in a layered
128c66c6e0cSAlex Elder  * image.  This pointer will refer to the rbd_spec structure used
129c66c6e0cSAlex Elder  * by the parent rbd_dev for its own identity (i.e., the structure
130c66c6e0cSAlex Elder  * is shared between the parent and child).
131c66c6e0cSAlex Elder  *
132c66c6e0cSAlex Elder  * Since these structures are populated once, during the discovery
133c66c6e0cSAlex Elder  * phase of image construction, they are effectively immutable so
134c66c6e0cSAlex Elder  * we make no effort to synchronize access to them.
135c66c6e0cSAlex Elder  *
136c66c6e0cSAlex Elder  * Note that code herein does not assume the image name is known (it
137c66c6e0cSAlex Elder  * could be a null pointer).
1380d7dbfceSAlex Elder  */
1390d7dbfceSAlex Elder struct rbd_spec {
1400d7dbfceSAlex Elder 	u64		pool_id;
1410d7dbfceSAlex Elder 	char		*pool_name;
1420d7dbfceSAlex Elder 
1430d7dbfceSAlex Elder 	char		*image_id;
1440d7dbfceSAlex Elder 	char		*image_name;
1450d7dbfceSAlex Elder 
1460d7dbfceSAlex Elder 	u64		snap_id;
1470d7dbfceSAlex Elder 	char		*snap_name;
1480d7dbfceSAlex Elder 
1490d7dbfceSAlex Elder 	struct kref	kref;
1500d7dbfceSAlex Elder };
1510d7dbfceSAlex Elder 
152602adf40SYehuda Sadeh /*
153f0f8cef5SAlex Elder  * an instance of the client.  multiple devices may share an rbd client.
154602adf40SYehuda Sadeh  */
155602adf40SYehuda Sadeh struct rbd_client {
156602adf40SYehuda Sadeh 	struct ceph_client	*client;
157602adf40SYehuda Sadeh 	struct kref		kref;
158602adf40SYehuda Sadeh 	struct list_head	node;
159602adf40SYehuda Sadeh };
160602adf40SYehuda Sadeh 
/* Callback types taking an image request or an object request */
struct rbd_img_request;
struct rbd_obj_request;

typedef void (*rbd_img_callback_t)(struct rbd_img_request *);
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);

/*
 * NOTE(review): presumably the "no valid position" value for
 * rbd_obj_request.which -- confirm against its users.
 */
#define	BAD_WHICH	U32_MAX		/* Good which or bad which, which? */

enum obj_request_type {
	OBJ_REQUEST_NODATA,
	OBJ_REQUEST_BIO,
	OBJ_REQUEST_PAGES
};
172bf0d5f50SAlex Elder 
173bf0d5f50SAlex Elder struct rbd_obj_request {
174bf0d5f50SAlex Elder 	const char		*object_name;
175bf0d5f50SAlex Elder 	u64			offset;		/* object start byte */
176bf0d5f50SAlex Elder 	u64			length;		/* bytes from offset */
177bf0d5f50SAlex Elder 
178bf0d5f50SAlex Elder 	struct rbd_img_request	*img_request;
1797da22d29SAlex Elder 	u64			img_offset;	/* image relative offset */
180bf0d5f50SAlex Elder 	struct list_head	links;		/* img_request->obj_requests */
181bf0d5f50SAlex Elder 	u32			which;		/* posn image request list */
182bf0d5f50SAlex Elder 
183bf0d5f50SAlex Elder 	enum obj_request_type	type;
184788e2df3SAlex Elder 	union {
185bf0d5f50SAlex Elder 		struct bio	*bio_list;
186788e2df3SAlex Elder 		struct {
187788e2df3SAlex Elder 			struct page	**pages;
188788e2df3SAlex Elder 			u32		page_count;
189788e2df3SAlex Elder 		};
190788e2df3SAlex Elder 	};
191bf0d5f50SAlex Elder 
192bf0d5f50SAlex Elder 	struct ceph_osd_request	*osd_req;
193bf0d5f50SAlex Elder 
194bf0d5f50SAlex Elder 	u64			xferred;	/* bytes transferred */
195bf0d5f50SAlex Elder 	u64			version;
1961b83bef2SSage Weil 	int			result;
197bf0d5f50SAlex Elder 	atomic_t		done;
198bf0d5f50SAlex Elder 
199bf0d5f50SAlex Elder 	rbd_obj_callback_t	callback;
200788e2df3SAlex Elder 	struct completion	completion;
201bf0d5f50SAlex Elder 
202bf0d5f50SAlex Elder 	struct kref		kref;
203bf0d5f50SAlex Elder };
204bf0d5f50SAlex Elder 
/* Bit numbers used in rbd_img_request.flags */
enum img_req_flags {
	/* I/O direction: read = 0, write = 1 */
	IMG_REQ_WRITE,
	/* initiator: block = 0, child image = 1 */
	IMG_REQ_CHILD,
};
2090c425248SAlex Elder 
210bf0d5f50SAlex Elder struct rbd_img_request {
211bf0d5f50SAlex Elder 	struct rbd_device	*rbd_dev;
212bf0d5f50SAlex Elder 	u64			offset;	/* starting image byte offset */
213bf0d5f50SAlex Elder 	u64			length;	/* byte count from offset */
2140c425248SAlex Elder 	unsigned long		flags;
215bf0d5f50SAlex Elder 	union {
216bf0d5f50SAlex Elder 		u64			snap_id;	/* for reads */
2179849e986SAlex Elder 		struct ceph_snap_context *snapc;	/* for writes */
2189849e986SAlex Elder 	};
2199849e986SAlex Elder 	union {
2209849e986SAlex Elder 		struct request		*rq;		/* block request */
2219849e986SAlex Elder 		struct rbd_obj_request	*obj_request;	/* obj req initiator */
222bf0d5f50SAlex Elder 	};
223bf0d5f50SAlex Elder 	spinlock_t		completion_lock;/* protects next_completion */
224bf0d5f50SAlex Elder 	u32			next_completion;
225bf0d5f50SAlex Elder 	rbd_img_callback_t	callback;
22655f27e09SAlex Elder 	u64			xferred;/* aggregate bytes transferred */
227a5a337d4SAlex Elder 	int			result;	/* first nonzero obj_request result */
228bf0d5f50SAlex Elder 
229bf0d5f50SAlex Elder 	u32			obj_request_count;
230bf0d5f50SAlex Elder 	struct list_head	obj_requests;	/* rbd_obj_request structs */
231bf0d5f50SAlex Elder 
232bf0d5f50SAlex Elder 	struct kref		kref;
233bf0d5f50SAlex Elder };
234bf0d5f50SAlex Elder 
/*
 * Iterators over the object requests linked into an image request.
 * The plain and "_from" forms walk forward; the "_safe" form walks the
 * list in REVERSE and tolerates removal of the current entry.
 */
#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->obj_requests, links)

#define for_each_obj_request_from(ireq, oreq) \
	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)

#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
241bf0d5f50SAlex Elder 
242dfc5606dSYehuda Sadeh struct rbd_snap {
243dfc5606dSYehuda Sadeh 	struct	device		dev;
244dfc5606dSYehuda Sadeh 	const char		*name;
2453591538fSJosh Durgin 	u64			size;
246dfc5606dSYehuda Sadeh 	struct list_head	node;
247dfc5606dSYehuda Sadeh 	u64			id;
24834b13184SAlex Elder 	u64			features;
249dfc5606dSYehuda Sadeh };
250dfc5606dSYehuda Sadeh 
251f84344f3SAlex Elder struct rbd_mapping {
25299c1f08fSAlex Elder 	u64                     size;
25334b13184SAlex Elder 	u64                     features;
254f84344f3SAlex Elder 	bool			read_only;
255f84344f3SAlex Elder };
256f84344f3SAlex Elder 
257602adf40SYehuda Sadeh /*
258602adf40SYehuda Sadeh  * a single device
259602adf40SYehuda Sadeh  */
260602adf40SYehuda Sadeh struct rbd_device {
261de71a297SAlex Elder 	int			dev_id;		/* blkdev unique id */
262602adf40SYehuda Sadeh 
263602adf40SYehuda Sadeh 	int			major;		/* blkdev assigned major */
264602adf40SYehuda Sadeh 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
265602adf40SYehuda Sadeh 
266a30b71b9SAlex Elder 	u32			image_format;	/* Either 1 or 2 */
267602adf40SYehuda Sadeh 	struct rbd_client	*rbd_client;
268602adf40SYehuda Sadeh 
269602adf40SYehuda Sadeh 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
270602adf40SYehuda Sadeh 
271b82d167bSAlex Elder 	spinlock_t		lock;		/* queue, flags, open_count */
272602adf40SYehuda Sadeh 
273602adf40SYehuda Sadeh 	struct rbd_image_header	header;
274b82d167bSAlex Elder 	unsigned long		flags;		/* possibly lock protected */
2750d7dbfceSAlex Elder 	struct rbd_spec		*spec;
276602adf40SYehuda Sadeh 
2770d7dbfceSAlex Elder 	char			*header_name;
278971f839aSAlex Elder 
2790903e875SAlex Elder 	struct ceph_file_layout	layout;
2800903e875SAlex Elder 
28159c2be1eSYehuda Sadeh 	struct ceph_osd_event   *watch_event;
282975241afSAlex Elder 	struct rbd_obj_request	*watch_request;
28359c2be1eSYehuda Sadeh 
28486b00e0dSAlex Elder 	struct rbd_spec		*parent_spec;
28586b00e0dSAlex Elder 	u64			parent_overlap;
28686b00e0dSAlex Elder 
287c666601aSJosh Durgin 	/* protects updating the header */
288c666601aSJosh Durgin 	struct rw_semaphore     header_rwsem;
289f84344f3SAlex Elder 
290f84344f3SAlex Elder 	struct rbd_mapping	mapping;
291602adf40SYehuda Sadeh 
292602adf40SYehuda Sadeh 	struct list_head	node;
293dfc5606dSYehuda Sadeh 
294dfc5606dSYehuda Sadeh 	/* list of snapshots */
295dfc5606dSYehuda Sadeh 	struct list_head	snaps;
296dfc5606dSYehuda Sadeh 
297dfc5606dSYehuda Sadeh 	/* sysfs related */
298dfc5606dSYehuda Sadeh 	struct device		dev;
299b82d167bSAlex Elder 	unsigned long		open_count;	/* protected by lock */
300dfc5606dSYehuda Sadeh };
301dfc5606dSYehuda Sadeh 
/*
 * Bit numbers for rbd_dev->flags.  Where atomic access is needed,
 * rbd_dev->lock protects it.
 *
 * At present only the "removing" flag needs atomic access; it is
 * coupled with the "open_count" field.
 */
enum rbd_dev_flags {
	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_EXISTS,
	/* this mapping is being removed */
	RBD_DEV_FLAG_REMOVING,
};
3136d292906SAlex Elder 
314602adf40SYehuda Sadeh static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */
315e124a82fSAlex Elder 
316602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list);    /* devices */
317e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock);
318e124a82fSAlex Elder 
319602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list);		/* clients */
320432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock);
321602adf40SYehuda Sadeh 
322304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
323304f6808SAlex Elder static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
324304f6808SAlex Elder 
325dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev);
32641f38c2bSAlex Elder static void rbd_remove_snap_dev(struct rbd_snap *snap);
327dfc5606dSYehuda Sadeh 
328f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf,
329f0f8cef5SAlex Elder 		       size_t count);
330f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
331f0f8cef5SAlex Elder 			  size_t count);
332f0f8cef5SAlex Elder 
333f0f8cef5SAlex Elder static struct bus_attribute rbd_bus_attrs[] = {
334f0f8cef5SAlex Elder 	__ATTR(add, S_IWUSR, NULL, rbd_add),
335f0f8cef5SAlex Elder 	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
336f0f8cef5SAlex Elder 	__ATTR_NULL
337f0f8cef5SAlex Elder };
338f0f8cef5SAlex Elder 
339f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = {
340f0f8cef5SAlex Elder 	.name		= "rbd",
341f0f8cef5SAlex Elder 	.bus_attrs	= rbd_bus_attrs,
342f0f8cef5SAlex Elder };
343f0f8cef5SAlex Elder 
344f0f8cef5SAlex Elder static void rbd_root_dev_release(struct device *dev)
345f0f8cef5SAlex Elder {
346f0f8cef5SAlex Elder }
347f0f8cef5SAlex Elder 
348f0f8cef5SAlex Elder static struct device rbd_root_dev = {
349f0f8cef5SAlex Elder 	.init_name =    "rbd",
350f0f8cef5SAlex Elder 	.release =      rbd_root_dev_release,
351f0f8cef5SAlex Elder };
352f0f8cef5SAlex Elder 
35306ecc6cbSAlex Elder static __printf(2, 3)
35406ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
35506ecc6cbSAlex Elder {
35606ecc6cbSAlex Elder 	struct va_format vaf;
35706ecc6cbSAlex Elder 	va_list args;
35806ecc6cbSAlex Elder 
35906ecc6cbSAlex Elder 	va_start(args, fmt);
36006ecc6cbSAlex Elder 	vaf.fmt = fmt;
36106ecc6cbSAlex Elder 	vaf.va = &args;
36206ecc6cbSAlex Elder 
36306ecc6cbSAlex Elder 	if (!rbd_dev)
36406ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
36506ecc6cbSAlex Elder 	else if (rbd_dev->disk)
36606ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %s: %pV\n",
36706ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
36806ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_name)
36906ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: image %s: %pV\n",
37006ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
37106ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_id)
37206ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: id %s: %pV\n",
37306ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
37406ecc6cbSAlex Elder 	else	/* punt */
37506ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
37606ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev, &vaf);
37706ecc6cbSAlex Elder 	va_end(args);
37806ecc6cbSAlex Elder }
37906ecc6cbSAlex Elder 
/*
 * rbd_assert(expr): with RBD_DEBUG defined, log the failing expression
 * together with the function name and line number, then BUG().  Without
 * RBD_DEBUG it expands to a no-op expression.
 *
 * The checking form is wrapped in do { } while (0) so that it acts as
 * a single statement: "if (x) rbd_assert(y); else ..." then parses as
 * written instead of breaking on the macro's own "if".
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
392dfc5606dSYehuda Sadeh 
393117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
394117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
39559c2be1eSYehuda Sadeh 
396602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
397602adf40SYehuda Sadeh {
398f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
399b82d167bSAlex Elder 	bool removing = false;
400602adf40SYehuda Sadeh 
401f84344f3SAlex Elder 	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
402602adf40SYehuda Sadeh 		return -EROFS;
403602adf40SYehuda Sadeh 
404a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
405b82d167bSAlex Elder 	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
406b82d167bSAlex Elder 		removing = true;
407b82d167bSAlex Elder 	else
408b82d167bSAlex Elder 		rbd_dev->open_count++;
409a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
410b82d167bSAlex Elder 	if (removing)
411b82d167bSAlex Elder 		return -ENOENT;
412b82d167bSAlex Elder 
41342382b70SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
414c3e946ceSAlex Elder 	(void) get_device(&rbd_dev->dev);
415f84344f3SAlex Elder 	set_device_ro(bdev, rbd_dev->mapping.read_only);
41642382b70SAlex Elder 	mutex_unlock(&ctl_mutex);
417340c7a2bSAlex Elder 
418602adf40SYehuda Sadeh 	return 0;
419602adf40SYehuda Sadeh }
420602adf40SYehuda Sadeh 
421dfc5606dSYehuda Sadeh static int rbd_release(struct gendisk *disk, fmode_t mode)
422dfc5606dSYehuda Sadeh {
423dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
424b82d167bSAlex Elder 	unsigned long open_count_before;
425b82d167bSAlex Elder 
426a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
427b82d167bSAlex Elder 	open_count_before = rbd_dev->open_count--;
428a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
429b82d167bSAlex Elder 	rbd_assert(open_count_before > 0);
430dfc5606dSYehuda Sadeh 
43142382b70SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
432c3e946ceSAlex Elder 	put_device(&rbd_dev->dev);
43342382b70SAlex Elder 	mutex_unlock(&ctl_mutex);
434dfc5606dSYehuda Sadeh 
435dfc5606dSYehuda Sadeh 	return 0;
436dfc5606dSYehuda Sadeh }
437dfc5606dSYehuda Sadeh 
438602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = {
439602adf40SYehuda Sadeh 	.owner			= THIS_MODULE,
440602adf40SYehuda Sadeh 	.open			= rbd_open,
441dfc5606dSYehuda Sadeh 	.release		= rbd_release,
442602adf40SYehuda Sadeh };
443602adf40SYehuda Sadeh 
444602adf40SYehuda Sadeh /*
445602adf40SYehuda Sadeh  * Initialize an rbd client instance.
44643ae4701SAlex Elder  * We own *ceph_opts.
447602adf40SYehuda Sadeh  */
448f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
449602adf40SYehuda Sadeh {
450602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
451602adf40SYehuda Sadeh 	int ret = -ENOMEM;
452602adf40SYehuda Sadeh 
45337206ee5SAlex Elder 	dout("%s:\n", __func__);
454602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
455602adf40SYehuda Sadeh 	if (!rbdc)
456602adf40SYehuda Sadeh 		goto out_opt;
457602adf40SYehuda Sadeh 
458602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
459602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
460602adf40SYehuda Sadeh 
461bc534d86SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
462bc534d86SAlex Elder 
46343ae4701SAlex Elder 	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
464602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
465bc534d86SAlex Elder 		goto out_mutex;
46643ae4701SAlex Elder 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
467602adf40SYehuda Sadeh 
468602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
469602adf40SYehuda Sadeh 	if (ret < 0)
470602adf40SYehuda Sadeh 		goto out_err;
471602adf40SYehuda Sadeh 
472432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
473602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
474432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
475602adf40SYehuda Sadeh 
476bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
47737206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
478bc534d86SAlex Elder 
479602adf40SYehuda Sadeh 	return rbdc;
480602adf40SYehuda Sadeh 
481602adf40SYehuda Sadeh out_err:
482602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
483bc534d86SAlex Elder out_mutex:
484bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
485602adf40SYehuda Sadeh 	kfree(rbdc);
486602adf40SYehuda Sadeh out_opt:
48743ae4701SAlex Elder 	if (ceph_opts)
48843ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
48937206ee5SAlex Elder 	dout("%s: error %d\n", __func__, ret);
49037206ee5SAlex Elder 
49128f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
492602adf40SYehuda Sadeh }
493602adf40SYehuda Sadeh 
494602adf40SYehuda Sadeh /*
4951f7ba331SAlex Elder  * Find a ceph client with specific addr and configuration.  If
4961f7ba331SAlex Elder  * found, bump its reference count.
497602adf40SYehuda Sadeh  */
4981f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
499602adf40SYehuda Sadeh {
500602adf40SYehuda Sadeh 	struct rbd_client *client_node;
5011f7ba331SAlex Elder 	bool found = false;
502602adf40SYehuda Sadeh 
50343ae4701SAlex Elder 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
504602adf40SYehuda Sadeh 		return NULL;
505602adf40SYehuda Sadeh 
5061f7ba331SAlex Elder 	spin_lock(&rbd_client_list_lock);
5071f7ba331SAlex Elder 	list_for_each_entry(client_node, &rbd_client_list, node) {
5081f7ba331SAlex Elder 		if (!ceph_compare_options(ceph_opts, client_node->client)) {
5091f7ba331SAlex Elder 			kref_get(&client_node->kref);
5101f7ba331SAlex Elder 			found = true;
5111f7ba331SAlex Elder 			break;
5121f7ba331SAlex Elder 		}
5131f7ba331SAlex Elder 	}
5141f7ba331SAlex Elder 	spin_unlock(&rbd_client_list_lock);
5151f7ba331SAlex Elder 
5161f7ba331SAlex Elder 	return found ? client_node : NULL;
517602adf40SYehuda Sadeh }
518602adf40SYehuda Sadeh 
519602adf40SYehuda Sadeh /*
52059c2be1eSYehuda Sadeh  * mount options
52159c2be1eSYehuda Sadeh  */
52259c2be1eSYehuda Sadeh enum {
52359c2be1eSYehuda Sadeh 	Opt_last_int,
52459c2be1eSYehuda Sadeh 	/* int args above */
52559c2be1eSYehuda Sadeh 	Opt_last_string,
52659c2be1eSYehuda Sadeh 	/* string args above */
527cc0538b6SAlex Elder 	Opt_read_only,
528cc0538b6SAlex Elder 	Opt_read_write,
529cc0538b6SAlex Elder 	/* Boolean args above */
530cc0538b6SAlex Elder 	Opt_last_bool,
53159c2be1eSYehuda Sadeh };
53259c2be1eSYehuda Sadeh 
53343ae4701SAlex Elder static match_table_t rbd_opts_tokens = {
53459c2be1eSYehuda Sadeh 	/* int args above */
53559c2be1eSYehuda Sadeh 	/* string args above */
536be466c1cSAlex Elder 	{Opt_read_only, "read_only"},
537cc0538b6SAlex Elder 	{Opt_read_only, "ro"},		/* Alternate spelling */
538cc0538b6SAlex Elder 	{Opt_read_write, "read_write"},
539cc0538b6SAlex Elder 	{Opt_read_write, "rw"},		/* Alternate spelling */
540cc0538b6SAlex Elder 	/* Boolean args above */
54159c2be1eSYehuda Sadeh 	{-1, NULL}
54259c2be1eSYehuda Sadeh };
54359c2be1eSYehuda Sadeh 
54498571b5aSAlex Elder struct rbd_options {
54598571b5aSAlex Elder 	bool	read_only;
54698571b5aSAlex Elder };
54798571b5aSAlex Elder 
54898571b5aSAlex Elder #define RBD_READ_ONLY_DEFAULT	false
54998571b5aSAlex Elder 
55059c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
55159c2be1eSYehuda Sadeh {
55243ae4701SAlex Elder 	struct rbd_options *rbd_opts = private;
55359c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
55459c2be1eSYehuda Sadeh 	int token, intval, ret;
55559c2be1eSYehuda Sadeh 
55643ae4701SAlex Elder 	token = match_token(c, rbd_opts_tokens, argstr);
55759c2be1eSYehuda Sadeh 	if (token < 0)
55859c2be1eSYehuda Sadeh 		return -EINVAL;
55959c2be1eSYehuda Sadeh 
56059c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
56159c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
56259c2be1eSYehuda Sadeh 		if (ret < 0) {
56359c2be1eSYehuda Sadeh 			pr_err("bad mount option arg (not int) "
56459c2be1eSYehuda Sadeh 			       "at '%s'\n", c);
56559c2be1eSYehuda Sadeh 			return ret;
56659c2be1eSYehuda Sadeh 		}
56759c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
56859c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
56959c2be1eSYehuda Sadeh 		dout("got string token %d val %s\n", token,
57059c2be1eSYehuda Sadeh 		     argstr[0].from);
571cc0538b6SAlex Elder 	} else if (token > Opt_last_string && token < Opt_last_bool) {
572cc0538b6SAlex Elder 		dout("got Boolean token %d\n", token);
57359c2be1eSYehuda Sadeh 	} else {
57459c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
57559c2be1eSYehuda Sadeh 	}
57659c2be1eSYehuda Sadeh 
57759c2be1eSYehuda Sadeh 	switch (token) {
578cc0538b6SAlex Elder 	case Opt_read_only:
579cc0538b6SAlex Elder 		rbd_opts->read_only = true;
580cc0538b6SAlex Elder 		break;
581cc0538b6SAlex Elder 	case Opt_read_write:
582cc0538b6SAlex Elder 		rbd_opts->read_only = false;
583cc0538b6SAlex Elder 		break;
58459c2be1eSYehuda Sadeh 	default:
585aafb230eSAlex Elder 		rbd_assert(false);
586aafb230eSAlex Elder 		break;
58759c2be1eSYehuda Sadeh 	}
58859c2be1eSYehuda Sadeh 	return 0;
58959c2be1eSYehuda Sadeh }
59059c2be1eSYehuda Sadeh 
/*
 * Return a client matching the given address and configuration,
 * creating one if none exists yet.  ceph_opts is consumed either way:
 * it is destroyed when an existing client is reused, and handed to
 * rbd_client_create() otherwise.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc = rbd_client_find(ceph_opts);

	if (!rbdc)
		return rbd_client_create(ceph_opts);

	/* using an existing client */
	ceph_destroy_options(ceph_opts);

	return rbdc;
}
607602adf40SYehuda Sadeh 
608602adf40SYehuda Sadeh /*
609602adf40SYehuda Sadeh  * Destroy ceph client
610d23a4b3fSAlex Elder  *
611432b8587SAlex Elder  * Caller must hold rbd_client_list_lock.
612602adf40SYehuda Sadeh  */
613602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
614602adf40SYehuda Sadeh {
615602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
616602adf40SYehuda Sadeh 
61737206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
618cd9d9f5dSAlex Elder 	spin_lock(&rbd_client_list_lock);
619602adf40SYehuda Sadeh 	list_del(&rbdc->node);
620cd9d9f5dSAlex Elder 	spin_unlock(&rbd_client_list_lock);
621602adf40SYehuda Sadeh 
622602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
623602adf40SYehuda Sadeh 	kfree(rbdc);
624602adf40SYehuda Sadeh }
625602adf40SYehuda Sadeh 
626602adf40SYehuda Sadeh /*
627602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
628602adf40SYehuda Sadeh  * it.
629602adf40SYehuda Sadeh  */
6309d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc)
631602adf40SYehuda Sadeh {
632c53d5893SAlex Elder 	if (rbdc)
6339d3997fdSAlex Elder 		kref_put(&rbdc->kref, rbd_client_release);
634602adf40SYehuda Sadeh }
635602adf40SYehuda Sadeh 
636a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format)
637a30b71b9SAlex Elder {
638a30b71b9SAlex Elder 	return image_format == 1 || image_format == 2;
639a30b71b9SAlex Elder }
640a30b71b9SAlex Elder 
6418e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
6428e94af8eSAlex Elder {
643103a150fSAlex Elder 	size_t size;
644103a150fSAlex Elder 	u32 snap_count;
645103a150fSAlex Elder 
646103a150fSAlex Elder 	/* The header has to start with the magic rbd header text */
647103a150fSAlex Elder 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
648103a150fSAlex Elder 		return false;
649103a150fSAlex Elder 
650db2388b6SAlex Elder 	/* The bio layer requires at least sector-sized I/O */
651db2388b6SAlex Elder 
652db2388b6SAlex Elder 	if (ondisk->options.order < SECTOR_SHIFT)
653db2388b6SAlex Elder 		return false;
654db2388b6SAlex Elder 
655db2388b6SAlex Elder 	/* If we use u64 in a few spots we may be able to loosen this */
656db2388b6SAlex Elder 
657db2388b6SAlex Elder 	if (ondisk->options.order > 8 * sizeof (int) - 1)
658db2388b6SAlex Elder 		return false;
659db2388b6SAlex Elder 
660103a150fSAlex Elder 	/*
661103a150fSAlex Elder 	 * The size of a snapshot header has to fit in a size_t, and
662103a150fSAlex Elder 	 * that limits the number of snapshots.
663103a150fSAlex Elder 	 */
664103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
665103a150fSAlex Elder 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
666103a150fSAlex Elder 	if (snap_count > size / sizeof (__le64))
667103a150fSAlex Elder 		return false;
668103a150fSAlex Elder 
669103a150fSAlex Elder 	/*
670103a150fSAlex Elder 	 * Not only that, but the size of the entire the snapshot
671103a150fSAlex Elder 	 * header must also be representable in a size_t.
672103a150fSAlex Elder 	 */
673103a150fSAlex Elder 	size -= snap_count * sizeof (__le64);
674103a150fSAlex Elder 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
675103a150fSAlex Elder 		return false;
676103a150fSAlex Elder 
677103a150fSAlex Elder 	return true;
6788e94af8eSAlex Elder }
6798e94af8eSAlex Elder 
680602adf40SYehuda Sadeh /*
681602adf40SYehuda Sadeh  * Create a new header structure, translate header format from the on-disk
682602adf40SYehuda Sadeh  * header.
683602adf40SYehuda Sadeh  */
684602adf40SYehuda Sadeh static int rbd_header_from_disk(struct rbd_image_header *header,
6854156d998SAlex Elder 				 struct rbd_image_header_ondisk *ondisk)
686602adf40SYehuda Sadeh {
687ccece235SAlex Elder 	u32 snap_count;
68858c17b0eSAlex Elder 	size_t len;
689d2bb24e5SAlex Elder 	size_t size;
690621901d6SAlex Elder 	u32 i;
691602adf40SYehuda Sadeh 
6926a52325fSAlex Elder 	memset(header, 0, sizeof (*header));
6936a52325fSAlex Elder 
694103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
695103a150fSAlex Elder 
69658c17b0eSAlex Elder 	len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
69758c17b0eSAlex Elder 	header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
6986a52325fSAlex Elder 	if (!header->object_prefix)
699602adf40SYehuda Sadeh 		return -ENOMEM;
70058c17b0eSAlex Elder 	memcpy(header->object_prefix, ondisk->object_prefix, len);
70158c17b0eSAlex Elder 	header->object_prefix[len] = '\0';
70200f1f36fSAlex Elder 
703602adf40SYehuda Sadeh 	if (snap_count) {
704f785cc1dSAlex Elder 		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
705f785cc1dSAlex Elder 
706621901d6SAlex Elder 		/* Save a copy of the snapshot names */
707621901d6SAlex Elder 
708f785cc1dSAlex Elder 		if (snap_names_len > (u64) SIZE_MAX)
709f785cc1dSAlex Elder 			return -EIO;
710f785cc1dSAlex Elder 		header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
711602adf40SYehuda Sadeh 		if (!header->snap_names)
7126a52325fSAlex Elder 			goto out_err;
713f785cc1dSAlex Elder 		/*
714f785cc1dSAlex Elder 		 * Note that rbd_dev_v1_header_read() guarantees
715f785cc1dSAlex Elder 		 * the ondisk buffer we're working with has
716f785cc1dSAlex Elder 		 * snap_names_len bytes beyond the end of the
717f785cc1dSAlex Elder 		 * snapshot id array, this memcpy() is safe.
718f785cc1dSAlex Elder 		 */
719f785cc1dSAlex Elder 		memcpy(header->snap_names, &ondisk->snaps[snap_count],
720f785cc1dSAlex Elder 			snap_names_len);
7216a52325fSAlex Elder 
722621901d6SAlex Elder 		/* Record each snapshot's size */
723621901d6SAlex Elder 
724d2bb24e5SAlex Elder 		size = snap_count * sizeof (*header->snap_sizes);
725d2bb24e5SAlex Elder 		header->snap_sizes = kmalloc(size, GFP_KERNEL);
726602adf40SYehuda Sadeh 		if (!header->snap_sizes)
7276a52325fSAlex Elder 			goto out_err;
728621901d6SAlex Elder 		for (i = 0; i < snap_count; i++)
729621901d6SAlex Elder 			header->snap_sizes[i] =
730621901d6SAlex Elder 				le64_to_cpu(ondisk->snaps[i].image_size);
731602adf40SYehuda Sadeh 	} else {
732ccece235SAlex Elder 		WARN_ON(ondisk->snap_names_len);
733602adf40SYehuda Sadeh 		header->snap_names = NULL;
734602adf40SYehuda Sadeh 		header->snap_sizes = NULL;
735602adf40SYehuda Sadeh 	}
736849b4260SAlex Elder 
73734b13184SAlex Elder 	header->features = 0;	/* No features support in v1 images */
738602adf40SYehuda Sadeh 	header->obj_order = ondisk->options.order;
739602adf40SYehuda Sadeh 	header->crypt_type = ondisk->options.crypt_type;
740602adf40SYehuda Sadeh 	header->comp_type = ondisk->options.comp_type;
7416a52325fSAlex Elder 
742621901d6SAlex Elder 	/* Allocate and fill in the snapshot context */
743621901d6SAlex Elder 
744f84344f3SAlex Elder 	header->image_size = le64_to_cpu(ondisk->image_size);
7456a52325fSAlex Elder 	size = sizeof (struct ceph_snap_context);
7466a52325fSAlex Elder 	size += snap_count * sizeof (header->snapc->snaps[0]);
7476a52325fSAlex Elder 	header->snapc = kzalloc(size, GFP_KERNEL);
7486a52325fSAlex Elder 	if (!header->snapc)
7496a52325fSAlex Elder 		goto out_err;
750602adf40SYehuda Sadeh 
751602adf40SYehuda Sadeh 	atomic_set(&header->snapc->nref, 1);
752505cbb9bSAlex Elder 	header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
753602adf40SYehuda Sadeh 	header->snapc->num_snaps = snap_count;
754621901d6SAlex Elder 	for (i = 0; i < snap_count; i++)
755602adf40SYehuda Sadeh 		header->snapc->snaps[i] =
756602adf40SYehuda Sadeh 			le64_to_cpu(ondisk->snaps[i].id);
757602adf40SYehuda Sadeh 
758602adf40SYehuda Sadeh 	return 0;
759602adf40SYehuda Sadeh 
7606a52325fSAlex Elder out_err:
761849b4260SAlex Elder 	kfree(header->snap_sizes);
762ccece235SAlex Elder 	header->snap_sizes = NULL;
763602adf40SYehuda Sadeh 	kfree(header->snap_names);
764ccece235SAlex Elder 	header->snap_names = NULL;
7656a52325fSAlex Elder 	kfree(header->object_prefix);
7666a52325fSAlex Elder 	header->object_prefix = NULL;
767ccece235SAlex Elder 
76800f1f36fSAlex Elder 	return -ENOMEM;
769602adf40SYehuda Sadeh }
770602adf40SYehuda Sadeh 
7719e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
7729e15b77dSAlex Elder {
7739e15b77dSAlex Elder 	struct rbd_snap *snap;
7749e15b77dSAlex Elder 
7759e15b77dSAlex Elder 	if (snap_id == CEPH_NOSNAP)
7769e15b77dSAlex Elder 		return RBD_SNAP_HEAD_NAME;
7779e15b77dSAlex Elder 
7789e15b77dSAlex Elder 	list_for_each_entry(snap, &rbd_dev->snaps, node)
7799e15b77dSAlex Elder 		if (snap_id == snap->id)
7809e15b77dSAlex Elder 			return snap->name;
7819e15b77dSAlex Elder 
7829e15b77dSAlex Elder 	return NULL;
7839e15b77dSAlex Elder }
7849e15b77dSAlex Elder 
7858836b995SAlex Elder static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
786602adf40SYehuda Sadeh {
787602adf40SYehuda Sadeh 
788e86924a8SAlex Elder 	struct rbd_snap *snap;
78900f1f36fSAlex Elder 
790e86924a8SAlex Elder 	list_for_each_entry(snap, &rbd_dev->snaps, node) {
791e86924a8SAlex Elder 		if (!strcmp(snap_name, snap->name)) {
7920d7dbfceSAlex Elder 			rbd_dev->spec->snap_id = snap->id;
793e86924a8SAlex Elder 			rbd_dev->mapping.size = snap->size;
79434b13184SAlex Elder 			rbd_dev->mapping.features = snap->features;
79500f1f36fSAlex Elder 
796e86924a8SAlex Elder 			return 0;
797602adf40SYehuda Sadeh 		}
79800f1f36fSAlex Elder 	}
799e86924a8SAlex Elder 
80000f1f36fSAlex Elder 	return -ENOENT;
80100f1f36fSAlex Elder }
802602adf40SYehuda Sadeh 
803819d52bfSAlex Elder static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
804602adf40SYehuda Sadeh {
80578dc447dSAlex Elder 	int ret;
806602adf40SYehuda Sadeh 
8070d7dbfceSAlex Elder 	if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
808cc9d734cSJosh Durgin 		    sizeof (RBD_SNAP_HEAD_NAME))) {
8090d7dbfceSAlex Elder 		rbd_dev->spec->snap_id = CEPH_NOSNAP;
81099c1f08fSAlex Elder 		rbd_dev->mapping.size = rbd_dev->header.image_size;
81134b13184SAlex Elder 		rbd_dev->mapping.features = rbd_dev->header.features;
812e86924a8SAlex Elder 		ret = 0;
813602adf40SYehuda Sadeh 	} else {
8140d7dbfceSAlex Elder 		ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
815602adf40SYehuda Sadeh 		if (ret < 0)
816602adf40SYehuda Sadeh 			goto done;
817f84344f3SAlex Elder 		rbd_dev->mapping.read_only = true;
818602adf40SYehuda Sadeh 	}
8196d292906SAlex Elder 	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
8206d292906SAlex Elder 
821602adf40SYehuda Sadeh done:
822602adf40SYehuda Sadeh 	return ret;
823602adf40SYehuda Sadeh }
824602adf40SYehuda Sadeh 
825602adf40SYehuda Sadeh static void rbd_header_free(struct rbd_image_header *header)
826602adf40SYehuda Sadeh {
827849b4260SAlex Elder 	kfree(header->object_prefix);
828d78fd7aeSAlex Elder 	header->object_prefix = NULL;
829602adf40SYehuda Sadeh 	kfree(header->snap_sizes);
830d78fd7aeSAlex Elder 	header->snap_sizes = NULL;
831849b4260SAlex Elder 	kfree(header->snap_names);
832d78fd7aeSAlex Elder 	header->snap_names = NULL;
833d1d25646SJosh Durgin 	ceph_put_snap_context(header->snapc);
834d78fd7aeSAlex Elder 	header->snapc = NULL;
835602adf40SYehuda Sadeh }
836602adf40SYehuda Sadeh 
83798571b5aSAlex Elder static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
838602adf40SYehuda Sadeh {
83965ccfe21SAlex Elder 	char *name;
84065ccfe21SAlex Elder 	u64 segment;
84165ccfe21SAlex Elder 	int ret;
842602adf40SYehuda Sadeh 
8432fd82b9eSAlex Elder 	name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
84465ccfe21SAlex Elder 	if (!name)
84565ccfe21SAlex Elder 		return NULL;
84665ccfe21SAlex Elder 	segment = offset >> rbd_dev->header.obj_order;
8472fd82b9eSAlex Elder 	ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
84865ccfe21SAlex Elder 			rbd_dev->header.object_prefix, segment);
8492fd82b9eSAlex Elder 	if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
85065ccfe21SAlex Elder 		pr_err("error formatting segment name for #%llu (%d)\n",
85165ccfe21SAlex Elder 			segment, ret);
85265ccfe21SAlex Elder 		kfree(name);
85365ccfe21SAlex Elder 		name = NULL;
85465ccfe21SAlex Elder 	}
855602adf40SYehuda Sadeh 
85665ccfe21SAlex Elder 	return name;
85765ccfe21SAlex Elder }
858602adf40SYehuda Sadeh 
85965ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
86065ccfe21SAlex Elder {
86165ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
862602adf40SYehuda Sadeh 
86365ccfe21SAlex Elder 	return offset & (segment_size - 1);
86465ccfe21SAlex Elder }
86565ccfe21SAlex Elder 
86665ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev,
86765ccfe21SAlex Elder 				u64 offset, u64 length)
86865ccfe21SAlex Elder {
86965ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
87065ccfe21SAlex Elder 
87165ccfe21SAlex Elder 	offset &= segment_size - 1;
87265ccfe21SAlex Elder 
873aafb230eSAlex Elder 	rbd_assert(length <= U64_MAX - offset);
87465ccfe21SAlex Elder 	if (offset + length > segment_size)
87565ccfe21SAlex Elder 		length = segment_size - offset;
87665ccfe21SAlex Elder 
87765ccfe21SAlex Elder 	return length;
878602adf40SYehuda Sadeh }
879602adf40SYehuda Sadeh 
880602adf40SYehuda Sadeh /*
881029bcbd8SJosh Durgin  * returns the size of an object in the image
882029bcbd8SJosh Durgin  */
883029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header)
884029bcbd8SJosh Durgin {
885029bcbd8SJosh Durgin 	return 1 << header->obj_order;
886029bcbd8SJosh Durgin }
887029bcbd8SJosh Durgin 
888029bcbd8SJosh Durgin /*
889602adf40SYehuda Sadeh  * bio helpers
890602adf40SYehuda Sadeh  */
891602adf40SYehuda Sadeh 
892602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain)
893602adf40SYehuda Sadeh {
894602adf40SYehuda Sadeh 	struct bio *tmp;
895602adf40SYehuda Sadeh 
896602adf40SYehuda Sadeh 	while (chain) {
897602adf40SYehuda Sadeh 		tmp = chain;
898602adf40SYehuda Sadeh 		chain = chain->bi_next;
899602adf40SYehuda Sadeh 		bio_put(tmp);
900602adf40SYehuda Sadeh 	}
901602adf40SYehuda Sadeh }
902602adf40SYehuda Sadeh 
903602adf40SYehuda Sadeh /*
904602adf40SYehuda Sadeh  * zeros a bio chain, starting at specific offset
905602adf40SYehuda Sadeh  */
906602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs)
907602adf40SYehuda Sadeh {
908602adf40SYehuda Sadeh 	struct bio_vec *bv;
909602adf40SYehuda Sadeh 	unsigned long flags;
910602adf40SYehuda Sadeh 	void *buf;
911602adf40SYehuda Sadeh 	int i;
912602adf40SYehuda Sadeh 	int pos = 0;
913602adf40SYehuda Sadeh 
914602adf40SYehuda Sadeh 	while (chain) {
915602adf40SYehuda Sadeh 		bio_for_each_segment(bv, chain, i) {
916602adf40SYehuda Sadeh 			if (pos + bv->bv_len > start_ofs) {
917602adf40SYehuda Sadeh 				int remainder = max(start_ofs - pos, 0);
918602adf40SYehuda Sadeh 				buf = bvec_kmap_irq(bv, &flags);
919602adf40SYehuda Sadeh 				memset(buf + remainder, 0,
920602adf40SYehuda Sadeh 				       bv->bv_len - remainder);
92185b5aaa6SDan Carpenter 				bvec_kunmap_irq(buf, &flags);
922602adf40SYehuda Sadeh 			}
923602adf40SYehuda Sadeh 			pos += bv->bv_len;
924602adf40SYehuda Sadeh 		}
925602adf40SYehuda Sadeh 
926602adf40SYehuda Sadeh 		chain = chain->bi_next;
927602adf40SYehuda Sadeh 	}
928602adf40SYehuda Sadeh }
929602adf40SYehuda Sadeh 
930602adf40SYehuda Sadeh /*
931f7760dadSAlex Elder  * Clone a portion of a bio, starting at the given byte offset
932f7760dadSAlex Elder  * and continuing for the number of bytes indicated.
933602adf40SYehuda Sadeh  */
934f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src,
935f7760dadSAlex Elder 					unsigned int offset,
936f7760dadSAlex Elder 					unsigned int len,
937f7760dadSAlex Elder 					gfp_t gfpmask)
938602adf40SYehuda Sadeh {
939f7760dadSAlex Elder 	struct bio_vec *bv;
940f7760dadSAlex Elder 	unsigned int resid;
941f7760dadSAlex Elder 	unsigned short idx;
942f7760dadSAlex Elder 	unsigned int voff;
943f7760dadSAlex Elder 	unsigned short end_idx;
944f7760dadSAlex Elder 	unsigned short vcnt;
945f7760dadSAlex Elder 	struct bio *bio;
946602adf40SYehuda Sadeh 
947f7760dadSAlex Elder 	/* Handle the easy case for the caller */
948f7760dadSAlex Elder 
949f7760dadSAlex Elder 	if (!offset && len == bio_src->bi_size)
950f7760dadSAlex Elder 		return bio_clone(bio_src, gfpmask);
951f7760dadSAlex Elder 
952f7760dadSAlex Elder 	if (WARN_ON_ONCE(!len))
953f7760dadSAlex Elder 		return NULL;
954f7760dadSAlex Elder 	if (WARN_ON_ONCE(len > bio_src->bi_size))
955f7760dadSAlex Elder 		return NULL;
956f7760dadSAlex Elder 	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
957f7760dadSAlex Elder 		return NULL;
958f7760dadSAlex Elder 
959f7760dadSAlex Elder 	/* Find first affected segment... */
960f7760dadSAlex Elder 
961f7760dadSAlex Elder 	resid = offset;
962f7760dadSAlex Elder 	__bio_for_each_segment(bv, bio_src, idx, 0) {
963f7760dadSAlex Elder 		if (resid < bv->bv_len)
964f7760dadSAlex Elder 			break;
965f7760dadSAlex Elder 		resid -= bv->bv_len;
966602adf40SYehuda Sadeh 	}
967f7760dadSAlex Elder 	voff = resid;
968602adf40SYehuda Sadeh 
969f7760dadSAlex Elder 	/* ...and the last affected segment */
970542582fcSAlex Elder 
971f7760dadSAlex Elder 	resid += len;
972f7760dadSAlex Elder 	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
973f7760dadSAlex Elder 		if (resid <= bv->bv_len)
974f7760dadSAlex Elder 			break;
975f7760dadSAlex Elder 		resid -= bv->bv_len;
976f7760dadSAlex Elder 	}
977f7760dadSAlex Elder 	vcnt = end_idx - idx + 1;
978602adf40SYehuda Sadeh 
979f7760dadSAlex Elder 	/* Build the clone */
980f7760dadSAlex Elder 
981f7760dadSAlex Elder 	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
982f7760dadSAlex Elder 	if (!bio)
983f7760dadSAlex Elder 		return NULL;	/* ENOMEM */
984f7760dadSAlex Elder 
985f7760dadSAlex Elder 	bio->bi_bdev = bio_src->bi_bdev;
986f7760dadSAlex Elder 	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
987f7760dadSAlex Elder 	bio->bi_rw = bio_src->bi_rw;
988f7760dadSAlex Elder 	bio->bi_flags |= 1 << BIO_CLONED;
989602adf40SYehuda Sadeh 
990602adf40SYehuda Sadeh 	/*
991f7760dadSAlex Elder 	 * Copy over our part of the bio_vec, then update the first
992f7760dadSAlex Elder 	 * and last (or only) entries.
993602adf40SYehuda Sadeh 	 */
994f7760dadSAlex Elder 	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
995f7760dadSAlex Elder 			vcnt * sizeof (struct bio_vec));
996f7760dadSAlex Elder 	bio->bi_io_vec[0].bv_offset += voff;
997f7760dadSAlex Elder 	if (vcnt > 1) {
998f7760dadSAlex Elder 		bio->bi_io_vec[0].bv_len -= voff;
999f7760dadSAlex Elder 		bio->bi_io_vec[vcnt - 1].bv_len = resid;
1000602adf40SYehuda Sadeh 	} else {
1001f7760dadSAlex Elder 		bio->bi_io_vec[0].bv_len = len;
1002602adf40SYehuda Sadeh 	}
1003602adf40SYehuda Sadeh 
1004f7760dadSAlex Elder 	bio->bi_vcnt = vcnt;
1005f7760dadSAlex Elder 	bio->bi_size = len;
1006f7760dadSAlex Elder 	bio->bi_idx = 0;
1007602adf40SYehuda Sadeh 
1008f7760dadSAlex Elder 	return bio;
1009602adf40SYehuda Sadeh }
1010602adf40SYehuda Sadeh 
1011f7760dadSAlex Elder /*
1012f7760dadSAlex Elder  * Clone a portion of a bio chain, starting at the given byte offset
1013f7760dadSAlex Elder  * into the first bio in the source chain and continuing for the
1014f7760dadSAlex Elder  * number of bytes indicated.  The result is another bio chain of
1015f7760dadSAlex Elder  * exactly the given length, or a null pointer on error.
1016f7760dadSAlex Elder  *
1017f7760dadSAlex Elder  * The bio_src and offset parameters are both in-out.  On entry they
1018f7760dadSAlex Elder  * refer to the first source bio and the offset into that bio where
1019f7760dadSAlex Elder  * the start of data to be cloned is located.
1020f7760dadSAlex Elder  *
1021f7760dadSAlex Elder  * On return, bio_src is updated to refer to the bio in the source
1022f7760dadSAlex Elder  * chain that contains first un-cloned byte, and *offset will
1023f7760dadSAlex Elder  * contain the offset of that byte within that bio.
1024f7760dadSAlex Elder  */
1025f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src,
1026f7760dadSAlex Elder 					unsigned int *offset,
1027f7760dadSAlex Elder 					unsigned int len,
1028f7760dadSAlex Elder 					gfp_t gfpmask)
1029f7760dadSAlex Elder {
1030f7760dadSAlex Elder 	struct bio *bi = *bio_src;
1031f7760dadSAlex Elder 	unsigned int off = *offset;
1032f7760dadSAlex Elder 	struct bio *chain = NULL;
1033f7760dadSAlex Elder 	struct bio **end;
1034602adf40SYehuda Sadeh 
1035f7760dadSAlex Elder 	/* Build up a chain of clone bios up to the limit */
1036602adf40SYehuda Sadeh 
1037f7760dadSAlex Elder 	if (!bi || off >= bi->bi_size || !len)
1038f7760dadSAlex Elder 		return NULL;		/* Nothing to clone */
1039602adf40SYehuda Sadeh 
1040f7760dadSAlex Elder 	end = &chain;
1041f7760dadSAlex Elder 	while (len) {
1042f7760dadSAlex Elder 		unsigned int bi_size;
1043f7760dadSAlex Elder 		struct bio *bio;
1044f7760dadSAlex Elder 
1045f5400b7aSAlex Elder 		if (!bi) {
1046f5400b7aSAlex Elder 			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
1047f7760dadSAlex Elder 			goto out_err;	/* EINVAL; ran out of bio's */
1048f5400b7aSAlex Elder 		}
1049f7760dadSAlex Elder 		bi_size = min_t(unsigned int, bi->bi_size - off, len);
1050f7760dadSAlex Elder 		bio = bio_clone_range(bi, off, bi_size, gfpmask);
1051f7760dadSAlex Elder 		if (!bio)
1052f7760dadSAlex Elder 			goto out_err;	/* ENOMEM */
1053f7760dadSAlex Elder 
1054f7760dadSAlex Elder 		*end = bio;
1055f7760dadSAlex Elder 		end = &bio->bi_next;
1056f7760dadSAlex Elder 
1057f7760dadSAlex Elder 		off += bi_size;
1058f7760dadSAlex Elder 		if (off == bi->bi_size) {
1059f7760dadSAlex Elder 			bi = bi->bi_next;
1060f7760dadSAlex Elder 			off = 0;
1061f7760dadSAlex Elder 		}
1062f7760dadSAlex Elder 		len -= bi_size;
1063f7760dadSAlex Elder 	}
1064f7760dadSAlex Elder 	*bio_src = bi;
1065f7760dadSAlex Elder 	*offset = off;
1066f7760dadSAlex Elder 
1067f7760dadSAlex Elder 	return chain;
1068f7760dadSAlex Elder out_err:
1069f7760dadSAlex Elder 	bio_chain_put(chain);
1070f7760dadSAlex Elder 
1071602adf40SYehuda Sadeh 	return NULL;
1072602adf40SYehuda Sadeh }
1073602adf40SYehuda Sadeh 
1074bf0d5f50SAlex Elder static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1075bf0d5f50SAlex Elder {
107637206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
107737206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1078bf0d5f50SAlex Elder 	kref_get(&obj_request->kref);
1079bf0d5f50SAlex Elder }
1080bf0d5f50SAlex Elder 
1081bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref);
1082bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1083bf0d5f50SAlex Elder {
1084bf0d5f50SAlex Elder 	rbd_assert(obj_request != NULL);
108537206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
108637206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1087bf0d5f50SAlex Elder 	kref_put(&obj_request->kref, rbd_obj_request_destroy);
1088bf0d5f50SAlex Elder }
1089bf0d5f50SAlex Elder 
1090bf0d5f50SAlex Elder static void rbd_img_request_get(struct rbd_img_request *img_request)
1091bf0d5f50SAlex Elder {
109237206ee5SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
109337206ee5SAlex Elder 		atomic_read(&img_request->kref.refcount));
1094bf0d5f50SAlex Elder 	kref_get(&img_request->kref);
1095bf0d5f50SAlex Elder }
1096bf0d5f50SAlex Elder 
1097bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref);
1098bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request)
1099bf0d5f50SAlex Elder {
1100bf0d5f50SAlex Elder 	rbd_assert(img_request != NULL);
110137206ee5SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
110237206ee5SAlex Elder 		atomic_read(&img_request->kref.refcount));
1103bf0d5f50SAlex Elder 	kref_put(&img_request->kref, rbd_img_request_destroy);
1104bf0d5f50SAlex Elder }
1105bf0d5f50SAlex Elder 
1106bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1107bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1108bf0d5f50SAlex Elder {
110925dcf954SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
111025dcf954SAlex Elder 
1111bf0d5f50SAlex Elder 	rbd_obj_request_get(obj_request);
1112bf0d5f50SAlex Elder 	obj_request->img_request = img_request;
111325dcf954SAlex Elder 	obj_request->which = img_request->obj_request_count;
1114bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
111525dcf954SAlex Elder 	img_request->obj_request_count++;
111625dcf954SAlex Elder 	list_add_tail(&obj_request->links, &img_request->obj_requests);
111737206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
111837206ee5SAlex Elder 		obj_request->which);
1119bf0d5f50SAlex Elder }
1120bf0d5f50SAlex Elder 
1121bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1122bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1123bf0d5f50SAlex Elder {
1124bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
112525dcf954SAlex Elder 
112637206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
112737206ee5SAlex Elder 		obj_request->which);
1128bf0d5f50SAlex Elder 	list_del(&obj_request->links);
112925dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
113025dcf954SAlex Elder 	img_request->obj_request_count--;
113125dcf954SAlex Elder 	rbd_assert(obj_request->which == img_request->obj_request_count);
113225dcf954SAlex Elder 	obj_request->which = BAD_WHICH;
1133bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == img_request);
1134bf0d5f50SAlex Elder 	obj_request->img_request = NULL;
113525dcf954SAlex Elder 	obj_request->callback = NULL;
1136bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
1137bf0d5f50SAlex Elder }
1138bf0d5f50SAlex Elder 
1139bf0d5f50SAlex Elder static bool obj_request_type_valid(enum obj_request_type type)
1140bf0d5f50SAlex Elder {
1141bf0d5f50SAlex Elder 	switch (type) {
11429969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
1143bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1144788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1145bf0d5f50SAlex Elder 		return true;
1146bf0d5f50SAlex Elder 	default:
1147bf0d5f50SAlex Elder 		return false;
1148bf0d5f50SAlex Elder 	}
1149bf0d5f50SAlex Elder }
1150bf0d5f50SAlex Elder 
1151bf0d5f50SAlex Elder static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1152bf0d5f50SAlex Elder 				struct rbd_obj_request *obj_request)
1153bf0d5f50SAlex Elder {
115437206ee5SAlex Elder 	dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);
115537206ee5SAlex Elder 
1156bf0d5f50SAlex Elder 	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1157bf0d5f50SAlex Elder }
1158bf0d5f50SAlex Elder 
1159bf0d5f50SAlex Elder static void rbd_img_request_complete(struct rbd_img_request *img_request)
1160bf0d5f50SAlex Elder {
116155f27e09SAlex Elder 
116237206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
116355f27e09SAlex Elder 
116455f27e09SAlex Elder 	/*
116555f27e09SAlex Elder 	 * If no error occurred, compute the aggregate transfer
116655f27e09SAlex Elder 	 * count for the image request.  We could instead use
116755f27e09SAlex Elder 	 * atomic64_cmpxchg() to update it as each object request
116855f27e09SAlex Elder 	 * completes; not clear which way is better off hand.
116955f27e09SAlex Elder 	 */
117055f27e09SAlex Elder 	if (!img_request->result) {
117155f27e09SAlex Elder 		struct rbd_obj_request *obj_request;
117255f27e09SAlex Elder 		u64 xferred = 0;
117355f27e09SAlex Elder 
117455f27e09SAlex Elder 		for_each_obj_request(img_request, obj_request)
117555f27e09SAlex Elder 			xferred += obj_request->xferred;
117655f27e09SAlex Elder 		img_request->xferred = xferred;
117755f27e09SAlex Elder 	}
117855f27e09SAlex Elder 
1179bf0d5f50SAlex Elder 	if (img_request->callback)
1180bf0d5f50SAlex Elder 		img_request->callback(img_request);
1181bf0d5f50SAlex Elder 	else
1182bf0d5f50SAlex Elder 		rbd_img_request_put(img_request);
1183bf0d5f50SAlex Elder }
1184bf0d5f50SAlex Elder 
1185788e2df3SAlex Elder /* Caller is responsible for rbd_obj_request_destroy(obj_request) */
1186788e2df3SAlex Elder 
1187788e2df3SAlex Elder static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
1188788e2df3SAlex Elder {
118937206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
119037206ee5SAlex Elder 
1191788e2df3SAlex Elder 	return wait_for_completion_interruptible(&obj_request->completion);
1192788e2df3SAlex Elder }
1193788e2df3SAlex Elder 
119407741308SAlex Elder static void obj_request_done_init(struct rbd_obj_request *obj_request)
119507741308SAlex Elder {
119607741308SAlex Elder 	atomic_set(&obj_request->done, 0);
119707741308SAlex Elder 	smp_wmb();
119807741308SAlex Elder }
119907741308SAlex Elder 
120007741308SAlex Elder static void obj_request_done_set(struct rbd_obj_request *obj_request)
120107741308SAlex Elder {
1202632b88caSAlex Elder 	int done;
1203632b88caSAlex Elder 
1204632b88caSAlex Elder 	done = atomic_inc_return(&obj_request->done);
1205632b88caSAlex Elder 	if (done > 1) {
1206632b88caSAlex Elder 		struct rbd_img_request *img_request = obj_request->img_request;
1207632b88caSAlex Elder 		struct rbd_device *rbd_dev;
1208632b88caSAlex Elder 
1209632b88caSAlex Elder 		rbd_dev = img_request ? img_request->rbd_dev : NULL;
1210632b88caSAlex Elder 		rbd_warn(rbd_dev, "obj_request %p was already done\n",
1211632b88caSAlex Elder 			obj_request);
1212632b88caSAlex Elder 	}
121307741308SAlex Elder }
121407741308SAlex Elder 
121507741308SAlex Elder static bool obj_request_done_test(struct rbd_obj_request *obj_request)
121607741308SAlex Elder {
1217632b88caSAlex Elder 	smp_mb();
121807741308SAlex Elder 	return atomic_read(&obj_request->done) != 0;
121907741308SAlex Elder }
122007741308SAlex Elder 
12210c425248SAlex Elder /*
12220c425248SAlex Elder  * The default/initial value for all image request flags is 0.  Each
12230c425248SAlex Elder  * is conditionally set to 1 at image request initialization time
12240c425248SAlex Elder  * and currently never change thereafter.
12250c425248SAlex Elder  */
12260c425248SAlex Elder static void img_request_write_set(struct rbd_img_request *img_request)
12270c425248SAlex Elder {
12280c425248SAlex Elder 	set_bit(IMG_REQ_WRITE, &img_request->flags);
12290c425248SAlex Elder 	smp_mb();
12300c425248SAlex Elder }
12310c425248SAlex Elder 
12320c425248SAlex Elder static bool img_request_write_test(struct rbd_img_request *img_request)
12330c425248SAlex Elder {
12340c425248SAlex Elder 	smp_mb();
12350c425248SAlex Elder 	return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
12360c425248SAlex Elder }
12370c425248SAlex Elder 
12389849e986SAlex Elder static void img_request_child_set(struct rbd_img_request *img_request)
12399849e986SAlex Elder {
12409849e986SAlex Elder 	set_bit(IMG_REQ_CHILD, &img_request->flags);
12419849e986SAlex Elder 	smp_mb();
12429849e986SAlex Elder }
12439849e986SAlex Elder 
12449849e986SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request)
12459849e986SAlex Elder {
12469849e986SAlex Elder 	smp_mb();
12479849e986SAlex Elder 	return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
12489849e986SAlex Elder }
12499849e986SAlex Elder 
12506e2a4505SAlex Elder static void
12516e2a4505SAlex Elder rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
12526e2a4505SAlex Elder {
12536e2a4505SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
12546e2a4505SAlex Elder 		obj_request, obj_request->img_request, obj_request->result,
12556e2a4505SAlex Elder 		obj_request->xferred, obj_request->length);
12566e2a4505SAlex Elder 	/*
12576e2a4505SAlex Elder 	 * ENOENT means a hole in the image.  We zero-fill the
12586e2a4505SAlex Elder 	 * entire length of the request.  A short read also implies
12596e2a4505SAlex Elder 	 * zero-fill to the end of the request.  Either way we
12606e2a4505SAlex Elder 	 * update the xferred count to indicate the whole request
12616e2a4505SAlex Elder 	 * was satisfied.
12626e2a4505SAlex Elder 	 */
12636e2a4505SAlex Elder 	BUG_ON(obj_request->type != OBJ_REQUEST_BIO);
12646e2a4505SAlex Elder 	if (obj_request->result == -ENOENT) {
12656e2a4505SAlex Elder 		zero_bio_chain(obj_request->bio_list, 0);
12666e2a4505SAlex Elder 		obj_request->result = 0;
12676e2a4505SAlex Elder 		obj_request->xferred = obj_request->length;
12686e2a4505SAlex Elder 	} else if (obj_request->xferred < obj_request->length &&
12696e2a4505SAlex Elder 			!obj_request->result) {
12706e2a4505SAlex Elder 		zero_bio_chain(obj_request->bio_list, obj_request->xferred);
12716e2a4505SAlex Elder 		obj_request->xferred = obj_request->length;
12726e2a4505SAlex Elder 	}
12736e2a4505SAlex Elder 	obj_request_done_set(obj_request);
12746e2a4505SAlex Elder }
12756e2a4505SAlex Elder 
1276bf0d5f50SAlex Elder static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1277bf0d5f50SAlex Elder {
127837206ee5SAlex Elder 	dout("%s: obj %p cb %p\n", __func__, obj_request,
127937206ee5SAlex Elder 		obj_request->callback);
1280bf0d5f50SAlex Elder 	if (obj_request->callback)
1281bf0d5f50SAlex Elder 		obj_request->callback(obj_request);
1282788e2df3SAlex Elder 	else
1283788e2df3SAlex Elder 		complete_all(&obj_request->completion);
1284bf0d5f50SAlex Elder }
1285bf0d5f50SAlex Elder 
/* Nothing to do beyond marking the object request done */
static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);
	obj_request_done_set(obj_request);
}
129139bf2c5dSAlex Elder 
1292c47f9371SAlex Elder static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
1293bf0d5f50SAlex Elder {
129437206ee5SAlex Elder 	dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request,
1295c47f9371SAlex Elder 		obj_request->result, obj_request->xferred, obj_request->length);
12966e2a4505SAlex Elder 	if (obj_request->img_request)
12976e2a4505SAlex Elder 		rbd_img_obj_request_read_callback(obj_request);
12986e2a4505SAlex Elder 	else
129907741308SAlex Elder 		obj_request_done_set(obj_request);
1300bf0d5f50SAlex Elder }
1301bf0d5f50SAlex Elder 
1302c47f9371SAlex Elder static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
1303bf0d5f50SAlex Elder {
13041b83bef2SSage Weil 	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
13051b83bef2SSage Weil 		obj_request->result, obj_request->length);
13061b83bef2SSage Weil 	/*
13071b83bef2SSage Weil 	 * There is no such thing as a successful short write.
13081b83bef2SSage Weil 	 * Our xferred value is the number of bytes transferred
13091b83bef2SSage Weil 	 * back.  Set it to our originally-requested length.
13101b83bef2SSage Weil 	 */
13111b83bef2SSage Weil 	obj_request->xferred = obj_request->length;
131207741308SAlex Elder 	obj_request_done_set(obj_request);
1313bf0d5f50SAlex Elder }
1314bf0d5f50SAlex Elder 
/*
 * Handler for a completed osd stat.  A plain stat call needs no
 * further work, so the object request is only flagged as done.  More
 * will happen here once stat is used as part of a write sequence for
 * a layered image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
1324fbfab539SAlex Elder 
1325bf0d5f50SAlex Elder static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1326bf0d5f50SAlex Elder 				struct ceph_msg *msg)
1327bf0d5f50SAlex Elder {
1328bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = osd_req->r_priv;
1329bf0d5f50SAlex Elder 	u16 opcode;
1330bf0d5f50SAlex Elder 
133137206ee5SAlex Elder 	dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
1332bf0d5f50SAlex Elder 	rbd_assert(osd_req == obj_request->osd_req);
1333bf0d5f50SAlex Elder 	rbd_assert(!!obj_request->img_request ^
1334bf0d5f50SAlex Elder 				(obj_request->which == BAD_WHICH));
1335bf0d5f50SAlex Elder 
13361b83bef2SSage Weil 	if (osd_req->r_result < 0)
13371b83bef2SSage Weil 		obj_request->result = osd_req->r_result;
1338bf0d5f50SAlex Elder 	obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);
1339bf0d5f50SAlex Elder 
13401b83bef2SSage Weil 	WARN_ON(osd_req->r_num_ops != 1);	/* For now */
1341bf0d5f50SAlex Elder 
1342c47f9371SAlex Elder 	/*
1343c47f9371SAlex Elder 	 * We support a 64-bit length, but ultimately it has to be
1344c47f9371SAlex Elder 	 * passed to blk_end_request(), which takes an unsigned int.
1345c47f9371SAlex Elder 	 */
13461b83bef2SSage Weil 	obj_request->xferred = osd_req->r_reply_op_len[0];
1347c47f9371SAlex Elder 	rbd_assert(obj_request->xferred < (u64) UINT_MAX);
134879528734SAlex Elder 	opcode = osd_req->r_ops[0].op;
1349bf0d5f50SAlex Elder 	switch (opcode) {
1350bf0d5f50SAlex Elder 	case CEPH_OSD_OP_READ:
1351c47f9371SAlex Elder 		rbd_osd_read_callback(obj_request);
1352bf0d5f50SAlex Elder 		break;
1353bf0d5f50SAlex Elder 	case CEPH_OSD_OP_WRITE:
1354c47f9371SAlex Elder 		rbd_osd_write_callback(obj_request);
1355bf0d5f50SAlex Elder 		break;
1356fbfab539SAlex Elder 	case CEPH_OSD_OP_STAT:
1357c47f9371SAlex Elder 		rbd_osd_stat_callback(obj_request);
1358fbfab539SAlex Elder 		break;
135936be9a76SAlex Elder 	case CEPH_OSD_OP_CALL:
1360b8d70035SAlex Elder 	case CEPH_OSD_OP_NOTIFY_ACK:
13619969ebc5SAlex Elder 	case CEPH_OSD_OP_WATCH:
1362c47f9371SAlex Elder 		rbd_osd_trivial_callback(obj_request);
13639969ebc5SAlex Elder 		break;
1364bf0d5f50SAlex Elder 	default:
1365bf0d5f50SAlex Elder 		rbd_warn(NULL, "%s: unsupported op %hu\n",
1366bf0d5f50SAlex Elder 			obj_request->object_name, (unsigned short) opcode);
1367bf0d5f50SAlex Elder 		break;
1368bf0d5f50SAlex Elder 	}
1369bf0d5f50SAlex Elder 
137007741308SAlex Elder 	if (obj_request_done_test(obj_request))
1371bf0d5f50SAlex Elder 		rbd_obj_request_complete(obj_request);
1372bf0d5f50SAlex Elder }
1373bf0d5f50SAlex Elder 
13742fa12320SAlex Elder static void rbd_osd_req_format(struct rbd_obj_request *obj_request,
137579528734SAlex Elder 					bool write_request)
1376430c28c3SAlex Elder {
1377430c28c3SAlex Elder 	struct rbd_img_request *img_request = obj_request->img_request;
13788c042b0dSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
1379430c28c3SAlex Elder 	struct ceph_snap_context *snapc = NULL;
1380430c28c3SAlex Elder 	u64 snap_id = CEPH_NOSNAP;
1381430c28c3SAlex Elder 	struct timespec *mtime = NULL;
1382430c28c3SAlex Elder 	struct timespec now;
1383430c28c3SAlex Elder 
13848c042b0dSAlex Elder 	rbd_assert(osd_req != NULL);
1385430c28c3SAlex Elder 
1386430c28c3SAlex Elder 	if (write_request) {
1387430c28c3SAlex Elder 		now = CURRENT_TIME;
1388430c28c3SAlex Elder 		mtime = &now;
1389430c28c3SAlex Elder 		if (img_request)
1390430c28c3SAlex Elder 			snapc = img_request->snapc;
13912fa12320SAlex Elder 	} else if (img_request) {
1392430c28c3SAlex Elder 		snap_id = img_request->snap_id;
1393430c28c3SAlex Elder 	}
13948c042b0dSAlex Elder 	ceph_osdc_build_request(osd_req, obj_request->offset,
139579528734SAlex Elder 			snapc, snap_id, mtime);
1396430c28c3SAlex Elder }
1397430c28c3SAlex Elder 
1398bf0d5f50SAlex Elder static struct ceph_osd_request *rbd_osd_req_create(
1399bf0d5f50SAlex Elder 					struct rbd_device *rbd_dev,
1400bf0d5f50SAlex Elder 					bool write_request,
1401430c28c3SAlex Elder 					struct rbd_obj_request *obj_request)
1402bf0d5f50SAlex Elder {
1403bf0d5f50SAlex Elder 	struct rbd_img_request *img_request = obj_request->img_request;
1404bf0d5f50SAlex Elder 	struct ceph_snap_context *snapc = NULL;
1405bf0d5f50SAlex Elder 	struct ceph_osd_client *osdc;
1406bf0d5f50SAlex Elder 	struct ceph_osd_request *osd_req;
1407bf0d5f50SAlex Elder 
1408bf0d5f50SAlex Elder 	if (img_request) {
14090c425248SAlex Elder 		rbd_assert(write_request ==
14100c425248SAlex Elder 				img_request_write_test(img_request));
14110c425248SAlex Elder 		if (write_request)
1412bf0d5f50SAlex Elder 			snapc = img_request->snapc;
1413bf0d5f50SAlex Elder 	}
1414bf0d5f50SAlex Elder 
1415bf0d5f50SAlex Elder 	/* Allocate and initialize the request, for the single op */
1416bf0d5f50SAlex Elder 
1417bf0d5f50SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
1418bf0d5f50SAlex Elder 	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
1419bf0d5f50SAlex Elder 	if (!osd_req)
1420bf0d5f50SAlex Elder 		return NULL;	/* ENOMEM */
1421bf0d5f50SAlex Elder 
1422430c28c3SAlex Elder 	if (write_request)
1423bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1424430c28c3SAlex Elder 	else
1425bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_READ;
1426bf0d5f50SAlex Elder 
1427bf0d5f50SAlex Elder 	osd_req->r_callback = rbd_osd_req_callback;
1428bf0d5f50SAlex Elder 	osd_req->r_priv = obj_request;
1429bf0d5f50SAlex Elder 
1430bf0d5f50SAlex Elder 	osd_req->r_oid_len = strlen(obj_request->object_name);
1431bf0d5f50SAlex Elder 	rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1432bf0d5f50SAlex Elder 	memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1433bf0d5f50SAlex Elder 
1434bf0d5f50SAlex Elder 	osd_req->r_file_layout = rbd_dev->layout;	/* struct */
1435bf0d5f50SAlex Elder 
1436bf0d5f50SAlex Elder 	return osd_req;
1437bf0d5f50SAlex Elder }
1438bf0d5f50SAlex Elder 
/* Drop a reference to an osd request via ceph_osdc_put_request() */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
1443bf0d5f50SAlex Elder 
1444bf0d5f50SAlex Elder /* object_name is assumed to be a non-null pointer and NUL-terminated */
1445bf0d5f50SAlex Elder 
1446bf0d5f50SAlex Elder static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1447bf0d5f50SAlex Elder 						u64 offset, u64 length,
1448bf0d5f50SAlex Elder 						enum obj_request_type type)
1449bf0d5f50SAlex Elder {
1450bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1451bf0d5f50SAlex Elder 	size_t size;
1452bf0d5f50SAlex Elder 	char *name;
1453bf0d5f50SAlex Elder 
1454bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(type));
1455bf0d5f50SAlex Elder 
1456bf0d5f50SAlex Elder 	size = strlen(object_name) + 1;
1457bf0d5f50SAlex Elder 	obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
1458bf0d5f50SAlex Elder 	if (!obj_request)
1459bf0d5f50SAlex Elder 		return NULL;
1460bf0d5f50SAlex Elder 
1461bf0d5f50SAlex Elder 	name = (char *)(obj_request + 1);
1462bf0d5f50SAlex Elder 	obj_request->object_name = memcpy(name, object_name, size);
1463bf0d5f50SAlex Elder 	obj_request->offset = offset;
1464bf0d5f50SAlex Elder 	obj_request->length = length;
1465bf0d5f50SAlex Elder 	obj_request->which = BAD_WHICH;
1466bf0d5f50SAlex Elder 	obj_request->type = type;
1467bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&obj_request->links);
146807741308SAlex Elder 	obj_request_done_init(obj_request);
1469788e2df3SAlex Elder 	init_completion(&obj_request->completion);
1470bf0d5f50SAlex Elder 	kref_init(&obj_request->kref);
1471bf0d5f50SAlex Elder 
147237206ee5SAlex Elder 	dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
147337206ee5SAlex Elder 		offset, length, (int)type, obj_request);
147437206ee5SAlex Elder 
1475bf0d5f50SAlex Elder 	return obj_request;
1476bf0d5f50SAlex Elder }
1477bf0d5f50SAlex Elder 
1478bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref)
1479bf0d5f50SAlex Elder {
1480bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1481bf0d5f50SAlex Elder 
1482bf0d5f50SAlex Elder 	obj_request = container_of(kref, struct rbd_obj_request, kref);
1483bf0d5f50SAlex Elder 
148437206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
148537206ee5SAlex Elder 
1486bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
1487bf0d5f50SAlex Elder 	rbd_assert(obj_request->which == BAD_WHICH);
1488bf0d5f50SAlex Elder 
1489bf0d5f50SAlex Elder 	if (obj_request->osd_req)
1490bf0d5f50SAlex Elder 		rbd_osd_req_destroy(obj_request->osd_req);
1491bf0d5f50SAlex Elder 
1492bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
1493bf0d5f50SAlex Elder 	switch (obj_request->type) {
14949969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
14959969ebc5SAlex Elder 		break;		/* Nothing to do */
1496bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1497bf0d5f50SAlex Elder 		if (obj_request->bio_list)
1498bf0d5f50SAlex Elder 			bio_chain_put(obj_request->bio_list);
1499bf0d5f50SAlex Elder 		break;
1500788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1501788e2df3SAlex Elder 		if (obj_request->pages)
1502788e2df3SAlex Elder 			ceph_release_page_vector(obj_request->pages,
1503788e2df3SAlex Elder 						obj_request->page_count);
1504788e2df3SAlex Elder 		break;
1505bf0d5f50SAlex Elder 	}
1506bf0d5f50SAlex Elder 
1507bf0d5f50SAlex Elder 	kfree(obj_request);
1508bf0d5f50SAlex Elder }
1509bf0d5f50SAlex Elder 
1510bf0d5f50SAlex Elder /*
1511bf0d5f50SAlex Elder  * Caller is responsible for filling in the list of object requests
1512bf0d5f50SAlex Elder  * that comprises the image request, and the Linux request pointer
1513bf0d5f50SAlex Elder  * (if there is one).
1514bf0d5f50SAlex Elder  */
1515cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create(
1516cc344fa1SAlex Elder 					struct rbd_device *rbd_dev,
1517bf0d5f50SAlex Elder 					u64 offset, u64 length,
15189849e986SAlex Elder 					bool write_request,
15199849e986SAlex Elder 					bool child_request)
1520bf0d5f50SAlex Elder {
1521bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
1522bf0d5f50SAlex Elder 	struct ceph_snap_context *snapc = NULL;
1523bf0d5f50SAlex Elder 
1524bf0d5f50SAlex Elder 	img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
1525bf0d5f50SAlex Elder 	if (!img_request)
1526bf0d5f50SAlex Elder 		return NULL;
1527bf0d5f50SAlex Elder 
1528bf0d5f50SAlex Elder 	if (write_request) {
1529bf0d5f50SAlex Elder 		down_read(&rbd_dev->header_rwsem);
1530bf0d5f50SAlex Elder 		snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1531bf0d5f50SAlex Elder 		up_read(&rbd_dev->header_rwsem);
1532bf0d5f50SAlex Elder 		if (WARN_ON(!snapc)) {
1533bf0d5f50SAlex Elder 			kfree(img_request);
1534bf0d5f50SAlex Elder 			return NULL;	/* Shouldn't happen */
1535bf0d5f50SAlex Elder 		}
15360c425248SAlex Elder 
1537bf0d5f50SAlex Elder 	}
1538bf0d5f50SAlex Elder 
1539bf0d5f50SAlex Elder 	img_request->rq = NULL;
1540bf0d5f50SAlex Elder 	img_request->rbd_dev = rbd_dev;
1541bf0d5f50SAlex Elder 	img_request->offset = offset;
1542bf0d5f50SAlex Elder 	img_request->length = length;
15430c425248SAlex Elder 	img_request->flags = 0;
15440c425248SAlex Elder 	if (write_request) {
15450c425248SAlex Elder 		img_request_write_set(img_request);
1546bf0d5f50SAlex Elder 		img_request->snapc = snapc;
15470c425248SAlex Elder 	} else {
1548bf0d5f50SAlex Elder 		img_request->snap_id = rbd_dev->spec->snap_id;
15490c425248SAlex Elder 	}
15509849e986SAlex Elder 	if (child_request)
15519849e986SAlex Elder 		img_request_child_set(img_request);
1552bf0d5f50SAlex Elder 	spin_lock_init(&img_request->completion_lock);
1553bf0d5f50SAlex Elder 	img_request->next_completion = 0;
1554bf0d5f50SAlex Elder 	img_request->callback = NULL;
1555a5a337d4SAlex Elder 	img_request->result = 0;
1556bf0d5f50SAlex Elder 	img_request->obj_request_count = 0;
1557bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&img_request->obj_requests);
1558bf0d5f50SAlex Elder 	kref_init(&img_request->kref);
1559bf0d5f50SAlex Elder 
1560bf0d5f50SAlex Elder 	rbd_img_request_get(img_request);	/* Avoid a warning */
1561bf0d5f50SAlex Elder 	rbd_img_request_put(img_request);	/* TEMPORARY */
1562bf0d5f50SAlex Elder 
156337206ee5SAlex Elder 	dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
156437206ee5SAlex Elder 		write_request ? "write" : "read", offset, length,
156537206ee5SAlex Elder 		img_request);
156637206ee5SAlex Elder 
1567bf0d5f50SAlex Elder 	return img_request;
1568bf0d5f50SAlex Elder }
1569bf0d5f50SAlex Elder 
1570bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref)
1571bf0d5f50SAlex Elder {
1572bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
1573bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1574bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
1575bf0d5f50SAlex Elder 
1576bf0d5f50SAlex Elder 	img_request = container_of(kref, struct rbd_img_request, kref);
1577bf0d5f50SAlex Elder 
157837206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
157937206ee5SAlex Elder 
1580bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1581bf0d5f50SAlex Elder 		rbd_img_obj_request_del(img_request, obj_request);
158225dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count == 0);
1583bf0d5f50SAlex Elder 
15840c425248SAlex Elder 	if (img_request_write_test(img_request))
1585bf0d5f50SAlex Elder 		ceph_put_snap_context(img_request->snapc);
1586bf0d5f50SAlex Elder 
1587bf0d5f50SAlex Elder 	kfree(img_request);
1588bf0d5f50SAlex Elder }
1589bf0d5f50SAlex Elder 
15902169238dSAlex Elder static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
15912169238dSAlex Elder {
15922169238dSAlex Elder 	struct rbd_img_request *img_request;
15932169238dSAlex Elder 	u32 which = obj_request->which;
15942169238dSAlex Elder 	bool more = true;
15952169238dSAlex Elder 
15962169238dSAlex Elder 	img_request = obj_request->img_request;
15972169238dSAlex Elder 
15982169238dSAlex Elder 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
15992169238dSAlex Elder 	rbd_assert(img_request != NULL);
16009849e986SAlex Elder 	rbd_assert(!img_request_child_test(img_request))
16012169238dSAlex Elder 	rbd_assert(img_request->rq != NULL);
16029849e986SAlex Elder 
16032169238dSAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
16042169238dSAlex Elder 	rbd_assert(which != BAD_WHICH);
16052169238dSAlex Elder 	rbd_assert(which < img_request->obj_request_count);
16062169238dSAlex Elder 	rbd_assert(which >= img_request->next_completion);
16072169238dSAlex Elder 
16082169238dSAlex Elder 	spin_lock_irq(&img_request->completion_lock);
16092169238dSAlex Elder 	if (which != img_request->next_completion)
16102169238dSAlex Elder 		goto out;
16112169238dSAlex Elder 
16122169238dSAlex Elder 	for_each_obj_request_from(img_request, obj_request) {
16132169238dSAlex Elder 		unsigned int xferred;
16142169238dSAlex Elder 		int result;
16152169238dSAlex Elder 
16162169238dSAlex Elder 		rbd_assert(more);
16172169238dSAlex Elder 		rbd_assert(which < img_request->obj_request_count);
16182169238dSAlex Elder 
16192169238dSAlex Elder 		if (!obj_request_done_test(obj_request))
16202169238dSAlex Elder 			break;
16212169238dSAlex Elder 
16222169238dSAlex Elder 		rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
16232169238dSAlex Elder 		xferred = (unsigned int)obj_request->xferred;
1624a5a337d4SAlex Elder 		result = obj_request->result;
1625a5a337d4SAlex Elder 		if (result) {
16267da22d29SAlex Elder 			struct rbd_device *rbd_dev = img_request->rbd_dev;
16277da22d29SAlex Elder 
16287da22d29SAlex Elder 			rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n",
16290c425248SAlex Elder 				img_request_write_test(img_request) ? "write"
16300c425248SAlex Elder 								    : "read",
16317da22d29SAlex Elder 				obj_request->length, obj_request->img_offset,
16327da22d29SAlex Elder 				obj_request->offset);
16337da22d29SAlex Elder 			rbd_warn(rbd_dev, "  result %d xferred %x\n",
16342169238dSAlex Elder 				result, xferred);
1635a5a337d4SAlex Elder 			if (!img_request->result)
1636a5a337d4SAlex Elder 				img_request->result = result;
1637a5a337d4SAlex Elder 		}
16382169238dSAlex Elder 
16392169238dSAlex Elder 		more = blk_end_request(img_request->rq, result, xferred);
16402169238dSAlex Elder 		which++;
16412169238dSAlex Elder 	}
16422169238dSAlex Elder 
16432169238dSAlex Elder 	rbd_assert(more ^ (which == img_request->obj_request_count));
16442169238dSAlex Elder 	img_request->next_completion = which;
16452169238dSAlex Elder out:
16462169238dSAlex Elder 	spin_unlock_irq(&img_request->completion_lock);
16472169238dSAlex Elder 
16482169238dSAlex Elder 	if (!more)
16492169238dSAlex Elder 		rbd_img_request_complete(img_request);
16502169238dSAlex Elder }
16512169238dSAlex Elder 
1652bf0d5f50SAlex Elder static int rbd_img_request_fill_bio(struct rbd_img_request *img_request,
1653bf0d5f50SAlex Elder 					struct bio *bio_list)
1654bf0d5f50SAlex Elder {
1655bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = img_request->rbd_dev;
1656bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = NULL;
1657bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
16580c425248SAlex Elder 	bool write_request = img_request_write_test(img_request);
1659bf0d5f50SAlex Elder 	unsigned int bio_offset;
16607da22d29SAlex Elder 	u64 img_offset;
1661bf0d5f50SAlex Elder 	u64 resid;
1662bf0d5f50SAlex Elder 	u16 opcode;
1663bf0d5f50SAlex Elder 
166437206ee5SAlex Elder 	dout("%s: img %p bio %p\n", __func__, img_request, bio_list);
166537206ee5SAlex Elder 
1666430c28c3SAlex Elder 	opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
1667bf0d5f50SAlex Elder 	bio_offset = 0;
16687da22d29SAlex Elder 	img_offset = img_request->offset;
16697da22d29SAlex Elder 	rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT);
1670bf0d5f50SAlex Elder 	resid = img_request->length;
16714dda41d3SAlex Elder 	rbd_assert(resid > 0);
1672bf0d5f50SAlex Elder 	while (resid) {
16732fa12320SAlex Elder 		struct ceph_osd_request *osd_req;
1674bf0d5f50SAlex Elder 		const char *object_name;
1675bf0d5f50SAlex Elder 		unsigned int clone_size;
1676bf0d5f50SAlex Elder 		u64 offset;
1677bf0d5f50SAlex Elder 		u64 length;
1678bf0d5f50SAlex Elder 
16797da22d29SAlex Elder 		object_name = rbd_segment_name(rbd_dev, img_offset);
1680bf0d5f50SAlex Elder 		if (!object_name)
1681bf0d5f50SAlex Elder 			goto out_unwind;
16827da22d29SAlex Elder 		offset = rbd_segment_offset(rbd_dev, img_offset);
16837da22d29SAlex Elder 		length = rbd_segment_length(rbd_dev, img_offset, resid);
1684bf0d5f50SAlex Elder 		obj_request = rbd_obj_request_create(object_name,
1685bf0d5f50SAlex Elder 						offset, length,
1686bf0d5f50SAlex Elder 						OBJ_REQUEST_BIO);
1687bf0d5f50SAlex Elder 		kfree(object_name);	/* object request has its own copy */
1688bf0d5f50SAlex Elder 		if (!obj_request)
1689bf0d5f50SAlex Elder 			goto out_unwind;
1690bf0d5f50SAlex Elder 
1691bf0d5f50SAlex Elder 		rbd_assert(length <= (u64) UINT_MAX);
1692bf0d5f50SAlex Elder 		clone_size = (unsigned int) length;
1693bf0d5f50SAlex Elder 		obj_request->bio_list = bio_chain_clone_range(&bio_list,
1694bf0d5f50SAlex Elder 						&bio_offset, clone_size,
1695bf0d5f50SAlex Elder 						GFP_ATOMIC);
1696bf0d5f50SAlex Elder 		if (!obj_request->bio_list)
1697bf0d5f50SAlex Elder 			goto out_partial;
1698bf0d5f50SAlex Elder 
16992fa12320SAlex Elder 		osd_req = rbd_osd_req_create(rbd_dev, write_request,
17002fa12320SAlex Elder 						obj_request);
17012fa12320SAlex Elder 		if (!osd_req)
1702bf0d5f50SAlex Elder 			goto out_partial;
17032fa12320SAlex Elder 		obj_request->osd_req = osd_req;
17042169238dSAlex Elder 		obj_request->callback = rbd_img_obj_callback;
1705430c28c3SAlex Elder 
17062fa12320SAlex Elder 		osd_req_op_extent_init(osd_req, 0, opcode, offset, length,
17072fa12320SAlex Elder 						0, 0);
1708a4ce40a9SAlex Elder 		osd_req_op_extent_osd_data_bio(osd_req, 0, write_request,
1709a4ce40a9SAlex Elder 				obj_request->bio_list, obj_request->length);
17102fa12320SAlex Elder 		rbd_osd_req_format(obj_request, write_request);
1711430c28c3SAlex Elder 
17127da22d29SAlex Elder 		obj_request->img_offset = img_offset;
1713bf0d5f50SAlex Elder 		rbd_img_obj_request_add(img_request, obj_request);
1714bf0d5f50SAlex Elder 
17157da22d29SAlex Elder 		img_offset += length;
1716bf0d5f50SAlex Elder 		resid -= length;
1717bf0d5f50SAlex Elder 	}
1718bf0d5f50SAlex Elder 
1719bf0d5f50SAlex Elder 	return 0;
1720bf0d5f50SAlex Elder 
1721bf0d5f50SAlex Elder out_partial:
1722bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
1723bf0d5f50SAlex Elder out_unwind:
1724bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1725bf0d5f50SAlex Elder 		rbd_obj_request_put(obj_request);
1726bf0d5f50SAlex Elder 
1727bf0d5f50SAlex Elder 	return -ENOMEM;
1728bf0d5f50SAlex Elder }
1729bf0d5f50SAlex Elder 
1730bf0d5f50SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request)
1731bf0d5f50SAlex Elder {
1732bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = img_request->rbd_dev;
1733bf0d5f50SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1734bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
173546faeed4SAlex Elder 	struct rbd_obj_request *next_obj_request;
1736bf0d5f50SAlex Elder 
173737206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
173846faeed4SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
1739bf0d5f50SAlex Elder 		int ret;
1740bf0d5f50SAlex Elder 
1741bf0d5f50SAlex Elder 		ret = rbd_obj_request_submit(osdc, obj_request);
1742bf0d5f50SAlex Elder 		if (ret)
1743bf0d5f50SAlex Elder 			return ret;
1744bf0d5f50SAlex Elder 		/*
1745bf0d5f50SAlex Elder 		 * The image request has its own reference to each
1746bf0d5f50SAlex Elder 		 * of its object requests, so we can safely drop the
1747bf0d5f50SAlex Elder 		 * initial one here.
1748bf0d5f50SAlex Elder 		 */
1749bf0d5f50SAlex Elder 		rbd_obj_request_put(obj_request);
1750bf0d5f50SAlex Elder 	}
1751bf0d5f50SAlex Elder 
1752bf0d5f50SAlex Elder 	return 0;
1753bf0d5f50SAlex Elder }
1754bf0d5f50SAlex Elder 
1755cf81b60eSAlex Elder static int rbd_obj_notify_ack(struct rbd_device *rbd_dev,
1756b8d70035SAlex Elder 				   u64 ver, u64 notify_id)
1757b8d70035SAlex Elder {
1758b8d70035SAlex Elder 	struct rbd_obj_request *obj_request;
17592169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1760b8d70035SAlex Elder 	int ret;
1761b8d70035SAlex Elder 
1762b8d70035SAlex Elder 	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
1763b8d70035SAlex Elder 							OBJ_REQUEST_NODATA);
1764b8d70035SAlex Elder 	if (!obj_request)
1765b8d70035SAlex Elder 		return -ENOMEM;
1766b8d70035SAlex Elder 
1767b8d70035SAlex Elder 	ret = -ENOMEM;
1768430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
1769b8d70035SAlex Elder 	if (!obj_request->osd_req)
1770b8d70035SAlex Elder 		goto out;
17712169238dSAlex Elder 	obj_request->callback = rbd_obj_request_put;
1772b8d70035SAlex Elder 
1773c99d2d4aSAlex Elder 	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
1774c99d2d4aSAlex Elder 					notify_id, ver, 0);
17752fa12320SAlex Elder 	rbd_osd_req_format(obj_request, false);
1776430c28c3SAlex Elder 
1777b8d70035SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
1778b8d70035SAlex Elder out:
1779cf81b60eSAlex Elder 	if (ret)
1780b8d70035SAlex Elder 		rbd_obj_request_put(obj_request);
1781b8d70035SAlex Elder 
1782b8d70035SAlex Elder 	return ret;
1783b8d70035SAlex Elder }
1784b8d70035SAlex Elder 
1785b8d70035SAlex Elder static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1786b8d70035SAlex Elder {
1787b8d70035SAlex Elder 	struct rbd_device *rbd_dev = (struct rbd_device *)data;
1788b8d70035SAlex Elder 	u64 hver;
1789b8d70035SAlex Elder 	int rc;
1790b8d70035SAlex Elder 
1791b8d70035SAlex Elder 	if (!rbd_dev)
1792b8d70035SAlex Elder 		return;
1793b8d70035SAlex Elder 
179437206ee5SAlex Elder 	dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
1795b8d70035SAlex Elder 		rbd_dev->header_name, (unsigned long long) notify_id,
1796b8d70035SAlex Elder 		(unsigned int) opcode);
1797b8d70035SAlex Elder 	rc = rbd_dev_refresh(rbd_dev, &hver);
1798b8d70035SAlex Elder 	if (rc)
1799b8d70035SAlex Elder 		rbd_warn(rbd_dev, "got notification but failed to "
1800b8d70035SAlex Elder 			   " update snaps: %d\n", rc);
1801b8d70035SAlex Elder 
1802cf81b60eSAlex Elder 	rbd_obj_notify_ack(rbd_dev, hver, notify_id);
1803b8d70035SAlex Elder }
1804b8d70035SAlex Elder 
18059969ebc5SAlex Elder /*
18069969ebc5SAlex Elder  * Request sync osd watch/unwatch.  The value of "start" determines
18079969ebc5SAlex Elder  * whether a watch request is being initiated or torn down.
18089969ebc5SAlex Elder  */
18099969ebc5SAlex Elder static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
18109969ebc5SAlex Elder {
18119969ebc5SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
18129969ebc5SAlex Elder 	struct rbd_obj_request *obj_request;
18139969ebc5SAlex Elder 	int ret;
18149969ebc5SAlex Elder 
18159969ebc5SAlex Elder 	rbd_assert(start ^ !!rbd_dev->watch_event);
18169969ebc5SAlex Elder 	rbd_assert(start ^ !!rbd_dev->watch_request);
18179969ebc5SAlex Elder 
18189969ebc5SAlex Elder 	if (start) {
18193c663bbdSAlex Elder 		ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
18209969ebc5SAlex Elder 						&rbd_dev->watch_event);
18219969ebc5SAlex Elder 		if (ret < 0)
18229969ebc5SAlex Elder 			return ret;
18238eb87565SAlex Elder 		rbd_assert(rbd_dev->watch_event != NULL);
18249969ebc5SAlex Elder 	}
18259969ebc5SAlex Elder 
18269969ebc5SAlex Elder 	ret = -ENOMEM;
18279969ebc5SAlex Elder 	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
18289969ebc5SAlex Elder 							OBJ_REQUEST_NODATA);
18299969ebc5SAlex Elder 	if (!obj_request)
18309969ebc5SAlex Elder 		goto out_cancel;
18319969ebc5SAlex Elder 
1832430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request);
1833430c28c3SAlex Elder 	if (!obj_request->osd_req)
1834430c28c3SAlex Elder 		goto out_cancel;
1835430c28c3SAlex Elder 
18368eb87565SAlex Elder 	if (start)
1837975241afSAlex Elder 		ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
18388eb87565SAlex Elder 	else
18396977c3f9SAlex Elder 		ceph_osdc_unregister_linger_request(osdc,
1840975241afSAlex Elder 					rbd_dev->watch_request->osd_req);
18412169238dSAlex Elder 
18422169238dSAlex Elder 	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
18432169238dSAlex Elder 				rbd_dev->watch_event->cookie,
18442169238dSAlex Elder 				rbd_dev->header.obj_version, start);
18452169238dSAlex Elder 	rbd_osd_req_format(obj_request, true);
18462169238dSAlex Elder 
18479969ebc5SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
18489969ebc5SAlex Elder 	if (ret)
18499969ebc5SAlex Elder 		goto out_cancel;
18509969ebc5SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
18519969ebc5SAlex Elder 	if (ret)
18529969ebc5SAlex Elder 		goto out_cancel;
18539969ebc5SAlex Elder 	ret = obj_request->result;
18549969ebc5SAlex Elder 	if (ret)
18559969ebc5SAlex Elder 		goto out_cancel;
18569969ebc5SAlex Elder 
18578eb87565SAlex Elder 	/*
18588eb87565SAlex Elder 	 * A watch request is set to linger, so the underlying osd
18598eb87565SAlex Elder 	 * request won't go away until we unregister it.  We retain
18608eb87565SAlex Elder 	 * a pointer to the object request during that time (in
18618eb87565SAlex Elder 	 * rbd_dev->watch_request), so we'll keep a reference to
18628eb87565SAlex Elder 	 * it.  We'll drop that reference (below) after we've
18638eb87565SAlex Elder 	 * unregistered it.
18648eb87565SAlex Elder 	 */
18658eb87565SAlex Elder 	if (start) {
18668eb87565SAlex Elder 		rbd_dev->watch_request = obj_request;
18678eb87565SAlex Elder 
18688eb87565SAlex Elder 		return 0;
18698eb87565SAlex Elder 	}
18708eb87565SAlex Elder 
18718eb87565SAlex Elder 	/* We have successfully torn down the watch request */
18728eb87565SAlex Elder 
18738eb87565SAlex Elder 	rbd_obj_request_put(rbd_dev->watch_request);
18748eb87565SAlex Elder 	rbd_dev->watch_request = NULL;
18759969ebc5SAlex Elder out_cancel:
18769969ebc5SAlex Elder 	/* Cancel the event if we're tearing down, or on error */
18779969ebc5SAlex Elder 	ceph_osdc_cancel_event(rbd_dev->watch_event);
18789969ebc5SAlex Elder 	rbd_dev->watch_event = NULL;
18799969ebc5SAlex Elder 	if (obj_request)
18809969ebc5SAlex Elder 		rbd_obj_request_put(obj_request);
18819969ebc5SAlex Elder 
18829969ebc5SAlex Elder 	return ret;
18839969ebc5SAlex Elder }
18849969ebc5SAlex Elder 
188536be9a76SAlex Elder /*
188636be9a76SAlex Elder  * Synchronous osd object method call
188736be9a76SAlex Elder  */
188836be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
188936be9a76SAlex Elder 			     const char *object_name,
189036be9a76SAlex Elder 			     const char *class_name,
189136be9a76SAlex Elder 			     const char *method_name,
189236be9a76SAlex Elder 			     const char *outbound,
189336be9a76SAlex Elder 			     size_t outbound_size,
189436be9a76SAlex Elder 			     char *inbound,
189536be9a76SAlex Elder 			     size_t inbound_size,
189636be9a76SAlex Elder 			     u64 *version)
189736be9a76SAlex Elder {
18982169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
189936be9a76SAlex Elder 	struct rbd_obj_request *obj_request;
190036be9a76SAlex Elder 	struct page **pages;
190136be9a76SAlex Elder 	u32 page_count;
190236be9a76SAlex Elder 	int ret;
190336be9a76SAlex Elder 
190436be9a76SAlex Elder 	/*
19056010a451SAlex Elder 	 * Method calls are ultimately read operations.  The result
19066010a451SAlex Elder 	 * should placed into the inbound buffer provided.  They
19076010a451SAlex Elder 	 * also supply outbound data--parameters for the object
19086010a451SAlex Elder 	 * method.  Currently if this is present it will be a
19096010a451SAlex Elder 	 * snapshot id.
191036be9a76SAlex Elder 	 */
191136be9a76SAlex Elder 	page_count = (u32) calc_pages_for(0, inbound_size);
191236be9a76SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
191336be9a76SAlex Elder 	if (IS_ERR(pages))
191436be9a76SAlex Elder 		return PTR_ERR(pages);
191536be9a76SAlex Elder 
191636be9a76SAlex Elder 	ret = -ENOMEM;
19176010a451SAlex Elder 	obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
191836be9a76SAlex Elder 							OBJ_REQUEST_PAGES);
191936be9a76SAlex Elder 	if (!obj_request)
192036be9a76SAlex Elder 		goto out;
192136be9a76SAlex Elder 
192236be9a76SAlex Elder 	obj_request->pages = pages;
192336be9a76SAlex Elder 	obj_request->page_count = page_count;
192436be9a76SAlex Elder 
1925430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
192636be9a76SAlex Elder 	if (!obj_request->osd_req)
192736be9a76SAlex Elder 		goto out;
192836be9a76SAlex Elder 
1929c99d2d4aSAlex Elder 	osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
193004017e29SAlex Elder 					class_name, method_name);
193104017e29SAlex Elder 	if (outbound_size) {
193204017e29SAlex Elder 		struct ceph_pagelist *pagelist;
193304017e29SAlex Elder 
193404017e29SAlex Elder 		pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
193504017e29SAlex Elder 		if (!pagelist)
193604017e29SAlex Elder 			goto out;
193704017e29SAlex Elder 
193804017e29SAlex Elder 		ceph_pagelist_init(pagelist);
193904017e29SAlex Elder 		ceph_pagelist_append(pagelist, outbound, outbound_size);
194004017e29SAlex Elder 		osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
194104017e29SAlex Elder 						pagelist);
194204017e29SAlex Elder 	}
1943a4ce40a9SAlex Elder 	osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
1944a4ce40a9SAlex Elder 					obj_request->pages, inbound_size,
194544cd188dSAlex Elder 					0, false, false);
19462fa12320SAlex Elder 	rbd_osd_req_format(obj_request, false);
1947430c28c3SAlex Elder 
194836be9a76SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
194936be9a76SAlex Elder 	if (ret)
195036be9a76SAlex Elder 		goto out;
195136be9a76SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
195236be9a76SAlex Elder 	if (ret)
195336be9a76SAlex Elder 		goto out;
195436be9a76SAlex Elder 
195536be9a76SAlex Elder 	ret = obj_request->result;
195636be9a76SAlex Elder 	if (ret < 0)
195736be9a76SAlex Elder 		goto out;
195823ed6e13SAlex Elder 	ret = 0;
1959903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
196036be9a76SAlex Elder 	if (version)
196136be9a76SAlex Elder 		*version = obj_request->version;
196236be9a76SAlex Elder out:
196336be9a76SAlex Elder 	if (obj_request)
196436be9a76SAlex Elder 		rbd_obj_request_put(obj_request);
196536be9a76SAlex Elder 	else
196636be9a76SAlex Elder 		ceph_release_page_vector(pages, page_count);
196736be9a76SAlex Elder 
196836be9a76SAlex Elder 	return ret;
196936be9a76SAlex Elder }
197036be9a76SAlex Elder 
/*
 * Request function for the rbd block queue.
 *
 * Fetches requests from @q one at a time and turns each into an image
 * request that is filled from the request's bio chain and submitted.
 * Per the sparse annotations the queue lock is held on entry and on
 * return; it is dropped while an image request is built and submitted
 * and re-taken before a failed request is completed.
 *
 * A request is completed here only when it is skipped (non-FS or
 * zero-length, both ended with status 0) or when it fails (ended with
 * the negative errno).  Successfully submitted requests are not ended
 * in this function; presumably they complete when the image request
 * finishes.
 */
static void rbd_request_fn(struct request_queue *q)
		__releases(q->queue_lock) __acquires(q->queue_lock)
{
	struct rbd_device *rbd_dev = q->queuedata;
	bool read_only = rbd_dev->mapping.read_only;
	struct request *rq;
	int result;

	while ((rq = blk_fetch_request(q))) {
		bool write_request = rq_data_dir(rq) == WRITE;
		struct rbd_img_request *img_request;
		u64 offset;
		u64 length;

		/* Ignore any non-FS requests that filter through. */

		if (rq->cmd_type != REQ_TYPE_FS) {
			dout("%s: non-fs request type %d\n", __func__,
				(int) rq->cmd_type);
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* Ignore/skip any zero-length requests */

		/* Byte offset and byte length of the request on the device */
		offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
		length = (u64) blk_rq_bytes(rq);

		if (!length) {
			dout("%s: zero-length request\n", __func__);
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* Everything up to end_request runs without the queue lock */
		spin_unlock_irq(q->queue_lock);

		/* Disallow writes to a read-only device */

		if (write_request) {
			result = -EROFS;
			if (read_only)
				goto end_request;
			rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
		}

		/*
		 * Quit early if the mapped snapshot no longer
		 * exists.  It's still possible the snapshot will
		 * have disappeared by the time our request arrives
		 * at the osd, but there's no sense in sending it if
		 * we already know.
		 */
		if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
			dout("request for non-existent snapshot");
			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
			result = -ENXIO;
			goto end_request;
		}

		/* Reject a range whose end would wrap past U64_MAX */
		result = -EINVAL;
		if (WARN_ON(offset && length > U64_MAX - offset + 1))
			goto end_request;	/* Shouldn't happen */

		result = -ENOMEM;
		img_request = rbd_img_request_create(rbd_dev, offset, length,
							write_request, false);
		if (!img_request)
			goto end_request;

		img_request->rq = rq;

		result = rbd_img_request_fill_bio(img_request, rq->bio);
		if (!result)
			result = rbd_img_request_submit(img_request);
		/* Drop our reference if either filling or submitting failed */
		if (result)
			rbd_img_request_put(img_request);
end_request:
		spin_lock_irq(q->queue_lock);
		/* Only failures are completed here, with the lock held again */
		if (result < 0) {
			rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
				write_request ? "write" : "read",
				length, offset, result);

			__blk_end_request_all(rq, result);
		}
	}
}
2058bf0d5f50SAlex Elder 
2059602adf40SYehuda Sadeh /*
2060602adf40SYehuda Sadeh  * a queue callback. Makes sure that we don't create a bio that spans across
2061602adf40SYehuda Sadeh  * multiple osd objects. One exception would be with a single page bios,
2062f7760dadSAlex Elder  * which we handle later at bio_chain_clone_range()
2063602adf40SYehuda Sadeh  */
2064602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
2065602adf40SYehuda Sadeh 			  struct bio_vec *bvec)
2066602adf40SYehuda Sadeh {
2067602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
2068e5cfeed2SAlex Elder 	sector_t sector_offset;
2069e5cfeed2SAlex Elder 	sector_t sectors_per_obj;
2070e5cfeed2SAlex Elder 	sector_t obj_sector_offset;
2071e5cfeed2SAlex Elder 	int ret;
2072602adf40SYehuda Sadeh 
2073e5cfeed2SAlex Elder 	/*
2074e5cfeed2SAlex Elder 	 * Find how far into its rbd object the partition-relative
2075e5cfeed2SAlex Elder 	 * bio start sector is to offset relative to the enclosing
2076e5cfeed2SAlex Elder 	 * device.
2077e5cfeed2SAlex Elder 	 */
2078e5cfeed2SAlex Elder 	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
2079e5cfeed2SAlex Elder 	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
2080e5cfeed2SAlex Elder 	obj_sector_offset = sector_offset & (sectors_per_obj - 1);
2081593a9e7bSAlex Elder 
2082e5cfeed2SAlex Elder 	/*
2083e5cfeed2SAlex Elder 	 * Compute the number of bytes from that offset to the end
2084e5cfeed2SAlex Elder 	 * of the object.  Account for what's already used by the bio.
2085e5cfeed2SAlex Elder 	 */
2086e5cfeed2SAlex Elder 	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
2087e5cfeed2SAlex Elder 	if (ret > bmd->bi_size)
2088e5cfeed2SAlex Elder 		ret -= bmd->bi_size;
2089e5cfeed2SAlex Elder 	else
2090e5cfeed2SAlex Elder 		ret = 0;
2091e5cfeed2SAlex Elder 
2092e5cfeed2SAlex Elder 	/*
2093e5cfeed2SAlex Elder 	 * Don't send back more than was asked for.  And if the bio
2094e5cfeed2SAlex Elder 	 * was empty, let the whole thing through because:  "Note
2095e5cfeed2SAlex Elder 	 * that a block device *must* allow a single page to be
2096e5cfeed2SAlex Elder 	 * added to an empty bio."
2097e5cfeed2SAlex Elder 	 */
2098e5cfeed2SAlex Elder 	rbd_assert(bvec->bv_len <= PAGE_SIZE);
2099e5cfeed2SAlex Elder 	if (ret > (int) bvec->bv_len || !bmd->bi_size)
2100e5cfeed2SAlex Elder 		ret = (int) bvec->bv_len;
2101e5cfeed2SAlex Elder 
2102e5cfeed2SAlex Elder 	return ret;
2103602adf40SYehuda Sadeh }
2104602adf40SYehuda Sadeh 
2105602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
2106602adf40SYehuda Sadeh {
2107602adf40SYehuda Sadeh 	struct gendisk *disk = rbd_dev->disk;
2108602adf40SYehuda Sadeh 
2109602adf40SYehuda Sadeh 	if (!disk)
2110602adf40SYehuda Sadeh 		return;
2111602adf40SYehuda Sadeh 
2112602adf40SYehuda Sadeh 	if (disk->flags & GENHD_FL_UP)
2113602adf40SYehuda Sadeh 		del_gendisk(disk);
2114602adf40SYehuda Sadeh 	if (disk->queue)
2115602adf40SYehuda Sadeh 		blk_cleanup_queue(disk->queue);
2116602adf40SYehuda Sadeh 	put_disk(disk);
2117602adf40SYehuda Sadeh }
2118602adf40SYehuda Sadeh 
2119788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
2120788e2df3SAlex Elder 				const char *object_name,
2121788e2df3SAlex Elder 				u64 offset, u64 length,
2122788e2df3SAlex Elder 				char *buf, u64 *version)
2123788e2df3SAlex Elder 
2124788e2df3SAlex Elder {
21252169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2126788e2df3SAlex Elder 	struct rbd_obj_request *obj_request;
2127788e2df3SAlex Elder 	struct page **pages = NULL;
2128788e2df3SAlex Elder 	u32 page_count;
21291ceae7efSAlex Elder 	size_t size;
2130788e2df3SAlex Elder 	int ret;
2131788e2df3SAlex Elder 
2132788e2df3SAlex Elder 	page_count = (u32) calc_pages_for(offset, length);
2133788e2df3SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2134788e2df3SAlex Elder 	if (IS_ERR(pages))
2135788e2df3SAlex Elder 		ret = PTR_ERR(pages);
2136788e2df3SAlex Elder 
2137788e2df3SAlex Elder 	ret = -ENOMEM;
2138788e2df3SAlex Elder 	obj_request = rbd_obj_request_create(object_name, offset, length,
2139788e2df3SAlex Elder 							OBJ_REQUEST_PAGES);
2140788e2df3SAlex Elder 	if (!obj_request)
2141788e2df3SAlex Elder 		goto out;
2142788e2df3SAlex Elder 
2143788e2df3SAlex Elder 	obj_request->pages = pages;
2144788e2df3SAlex Elder 	obj_request->page_count = page_count;
2145788e2df3SAlex Elder 
2146430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
2147788e2df3SAlex Elder 	if (!obj_request->osd_req)
2148788e2df3SAlex Elder 		goto out;
2149788e2df3SAlex Elder 
2150c99d2d4aSAlex Elder 	osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
2151c99d2d4aSAlex Elder 					offset, length, 0, 0);
2152a4ce40a9SAlex Elder 	osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0, false,
2153a4ce40a9SAlex Elder 					obj_request->pages,
215444cd188dSAlex Elder 					obj_request->length,
215544cd188dSAlex Elder 					obj_request->offset & ~PAGE_MASK,
215644cd188dSAlex Elder 					false, false);
21572fa12320SAlex Elder 	rbd_osd_req_format(obj_request, false);
2158430c28c3SAlex Elder 
2159788e2df3SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
2160788e2df3SAlex Elder 	if (ret)
2161788e2df3SAlex Elder 		goto out;
2162788e2df3SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
2163788e2df3SAlex Elder 	if (ret)
2164788e2df3SAlex Elder 		goto out;
2165788e2df3SAlex Elder 
2166788e2df3SAlex Elder 	ret = obj_request->result;
2167788e2df3SAlex Elder 	if (ret < 0)
2168788e2df3SAlex Elder 		goto out;
21691ceae7efSAlex Elder 
21701ceae7efSAlex Elder 	rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
21711ceae7efSAlex Elder 	size = (size_t) obj_request->xferred;
2172903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, buf, 0, size);
217323ed6e13SAlex Elder 	rbd_assert(size <= (size_t) INT_MAX);
217423ed6e13SAlex Elder 	ret = (int) size;
2175788e2df3SAlex Elder 	if (version)
2176788e2df3SAlex Elder 		*version = obj_request->version;
2177788e2df3SAlex Elder out:
2178788e2df3SAlex Elder 	if (obj_request)
2179788e2df3SAlex Elder 		rbd_obj_request_put(obj_request);
2180788e2df3SAlex Elder 	else
2181788e2df3SAlex Elder 		ceph_release_page_vector(pages, page_count);
2182788e2df3SAlex Elder 
2183788e2df3SAlex Elder 	return ret;
2184788e2df3SAlex Elder }
2185788e2df3SAlex Elder 
2186602adf40SYehuda Sadeh /*
21874156d998SAlex Elder  * Read the complete header for the given rbd device.
21884156d998SAlex Elder  *
21894156d998SAlex Elder  * Returns a pointer to a dynamically-allocated buffer containing
21904156d998SAlex Elder  * the complete and validated header.  Caller can pass the address
21914156d998SAlex Elder  * of a variable that will be filled in with the version of the
21924156d998SAlex Elder  * header object at the time it was read.
21934156d998SAlex Elder  *
21944156d998SAlex Elder  * Returns a pointer-coded errno if a failure occurs.
21954156d998SAlex Elder  */
21964156d998SAlex Elder static struct rbd_image_header_ondisk *
21974156d998SAlex Elder rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
21984156d998SAlex Elder {
21994156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk = NULL;
22004156d998SAlex Elder 	u32 snap_count = 0;
22014156d998SAlex Elder 	u64 names_size = 0;
22024156d998SAlex Elder 	u32 want_count;
22034156d998SAlex Elder 	int ret;
22044156d998SAlex Elder 
22054156d998SAlex Elder 	/*
22064156d998SAlex Elder 	 * The complete header will include an array of its 64-bit
22074156d998SAlex Elder 	 * snapshot ids, followed by the names of those snapshots as
22084156d998SAlex Elder 	 * a contiguous block of NUL-terminated strings.  Note that
22094156d998SAlex Elder 	 * the number of snapshots could change by the time we read
22104156d998SAlex Elder 	 * it in, in which case we re-read it.
22114156d998SAlex Elder 	 */
22124156d998SAlex Elder 	do {
22134156d998SAlex Elder 		size_t size;
22144156d998SAlex Elder 
22154156d998SAlex Elder 		kfree(ondisk);
22164156d998SAlex Elder 
22174156d998SAlex Elder 		size = sizeof (*ondisk);
22184156d998SAlex Elder 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
22194156d998SAlex Elder 		size += names_size;
22204156d998SAlex Elder 		ondisk = kmalloc(size, GFP_KERNEL);
22214156d998SAlex Elder 		if (!ondisk)
22224156d998SAlex Elder 			return ERR_PTR(-ENOMEM);
22234156d998SAlex Elder 
2224788e2df3SAlex Elder 		ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
22254156d998SAlex Elder 				       0, size,
22264156d998SAlex Elder 				       (char *) ondisk, version);
22274156d998SAlex Elder 		if (ret < 0)
22284156d998SAlex Elder 			goto out_err;
22294156d998SAlex Elder 		if (WARN_ON((size_t) ret < size)) {
22304156d998SAlex Elder 			ret = -ENXIO;
223106ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
223206ecc6cbSAlex Elder 				size, ret);
22334156d998SAlex Elder 			goto out_err;
22344156d998SAlex Elder 		}
22354156d998SAlex Elder 		if (!rbd_dev_ondisk_valid(ondisk)) {
22364156d998SAlex Elder 			ret = -ENXIO;
223706ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "invalid header");
22384156d998SAlex Elder 			goto out_err;
22394156d998SAlex Elder 		}
22404156d998SAlex Elder 
22414156d998SAlex Elder 		names_size = le64_to_cpu(ondisk->snap_names_len);
22424156d998SAlex Elder 		want_count = snap_count;
22434156d998SAlex Elder 		snap_count = le32_to_cpu(ondisk->snap_count);
22444156d998SAlex Elder 	} while (snap_count != want_count);
22454156d998SAlex Elder 
22464156d998SAlex Elder 	return ondisk;
22474156d998SAlex Elder 
22484156d998SAlex Elder out_err:
22494156d998SAlex Elder 	kfree(ondisk);
22504156d998SAlex Elder 
22514156d998SAlex Elder 	return ERR_PTR(ret);
22524156d998SAlex Elder }
22534156d998SAlex Elder 
22544156d998SAlex Elder /*
2255602adf40SYehuda Sadeh  * reload the ondisk the header
2256602adf40SYehuda Sadeh  */
2257602adf40SYehuda Sadeh static int rbd_read_header(struct rbd_device *rbd_dev,
2258602adf40SYehuda Sadeh 			   struct rbd_image_header *header)
2259602adf40SYehuda Sadeh {
22604156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk;
22614156d998SAlex Elder 	u64 ver = 0;
22624156d998SAlex Elder 	int ret;
2263602adf40SYehuda Sadeh 
22644156d998SAlex Elder 	ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
22654156d998SAlex Elder 	if (IS_ERR(ondisk))
22664156d998SAlex Elder 		return PTR_ERR(ondisk);
22674156d998SAlex Elder 	ret = rbd_header_from_disk(header, ondisk);
22684156d998SAlex Elder 	if (ret >= 0)
226959c2be1eSYehuda Sadeh 		header->obj_version = ver;
22704156d998SAlex Elder 	kfree(ondisk);
2271602adf40SYehuda Sadeh 
22724156d998SAlex Elder 	return ret;
2273602adf40SYehuda Sadeh }
2274602adf40SYehuda Sadeh 
227541f38c2bSAlex Elder static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
2276dfc5606dSYehuda Sadeh {
2277dfc5606dSYehuda Sadeh 	struct rbd_snap *snap;
2278a0593290SAlex Elder 	struct rbd_snap *next;
2279dfc5606dSYehuda Sadeh 
2280a0593290SAlex Elder 	list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
228141f38c2bSAlex Elder 		rbd_remove_snap_dev(snap);
2282dfc5606dSYehuda Sadeh }
2283dfc5606dSYehuda Sadeh 
22849478554aSAlex Elder static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
22859478554aSAlex Elder {
22869478554aSAlex Elder 	sector_t size;
22879478554aSAlex Elder 
22880d7dbfceSAlex Elder 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
22899478554aSAlex Elder 		return;
22909478554aSAlex Elder 
22919478554aSAlex Elder 	size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
22929478554aSAlex Elder 	dout("setting size to %llu sectors", (unsigned long long) size);
22939478554aSAlex Elder 	rbd_dev->mapping.size = (u64) size;
22949478554aSAlex Elder 	set_capacity(rbd_dev->disk, size);
22959478554aSAlex Elder }
22969478554aSAlex Elder 
/*
 * Re-read the format 1 header and install it in the rbd device.
 *
 * The header (including snapshot context, names and sizes) is read
 * before header_rwsem is taken; the in-core header is then replaced
 * and the snapshot list updated and registered with the semaphore
 * held for writing.  If @hver is non-NULL it is set to the version of
 * the header object that was read.
 *
 * Returns 0 on success or a negative errno.
 */
static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;
	struct rbd_image_header h;

	ret = rbd_read_header(rbd_dev, &h);
	if (ret < 0)
		return ret;

	down_write(&rbd_dev->header_rwsem);

	/* Update image size, and check for resize of mapped image */
	rbd_dev->header.image_size = h.image_size;
	rbd_update_mapping_size(rbd_dev);

	/* rbd_dev->header.object_prefix shouldn't change */
	kfree(rbd_dev->header.snap_sizes);
	kfree(rbd_dev->header.snap_names);
	/* osd requests may still refer to snapc */
	ceph_put_snap_context(rbd_dev->header.snapc);

	if (hver)
		*hver = h.obj_version;
	/* Take over the freshly read fields; their buffers now belong to rbd_dev */
	rbd_dev->header.obj_version = h.obj_version;
	rbd_dev->header.image_size = h.image_size;
	rbd_dev->header.snapc = h.snapc;
	rbd_dev->header.snap_names = h.snap_names;
	rbd_dev->header.snap_sizes = h.snap_sizes;
	/* Free the extra copy of the object prefix */
	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
	kfree(h.object_prefix);

	ret = rbd_dev_snaps_update(rbd_dev);
	if (!ret)
		ret = rbd_dev_snaps_register(rbd_dev);

	up_write(&rbd_dev->header_rwsem);

	return ret;
}
2340602adf40SYehuda Sadeh 
2341117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
23421fe5e993SAlex Elder {
23431fe5e993SAlex Elder 	int ret;
23441fe5e993SAlex Elder 
2345117973fbSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
23461fe5e993SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2347117973fbSAlex Elder 	if (rbd_dev->image_format == 1)
2348117973fbSAlex Elder 		ret = rbd_dev_v1_refresh(rbd_dev, hver);
2349117973fbSAlex Elder 	else
2350117973fbSAlex Elder 		ret = rbd_dev_v2_refresh(rbd_dev, hver);
23511fe5e993SAlex Elder 	mutex_unlock(&ctl_mutex);
23521fe5e993SAlex Elder 
23531fe5e993SAlex Elder 	return ret;
23541fe5e993SAlex Elder }
23551fe5e993SAlex Elder 
2356602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
2357602adf40SYehuda Sadeh {
2358602adf40SYehuda Sadeh 	struct gendisk *disk;
2359602adf40SYehuda Sadeh 	struct request_queue *q;
2360593a9e7bSAlex Elder 	u64 segment_size;
2361602adf40SYehuda Sadeh 
2362602adf40SYehuda Sadeh 	/* create gendisk info */
2363602adf40SYehuda Sadeh 	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
2364602adf40SYehuda Sadeh 	if (!disk)
23651fcdb8aaSAlex Elder 		return -ENOMEM;
2366602adf40SYehuda Sadeh 
2367f0f8cef5SAlex Elder 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
2368de71a297SAlex Elder 		 rbd_dev->dev_id);
2369602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
2370602adf40SYehuda Sadeh 	disk->first_minor = 0;
2371602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
2372602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
2373602adf40SYehuda Sadeh 
2374bf0d5f50SAlex Elder 	q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
2375602adf40SYehuda Sadeh 	if (!q)
2376602adf40SYehuda Sadeh 		goto out_disk;
2377029bcbd8SJosh Durgin 
2378593a9e7bSAlex Elder 	/* We use the default size, but let's be explicit about it. */
2379593a9e7bSAlex Elder 	blk_queue_physical_block_size(q, SECTOR_SIZE);
2380593a9e7bSAlex Elder 
2381029bcbd8SJosh Durgin 	/* set io sizes to object size */
2382593a9e7bSAlex Elder 	segment_size = rbd_obj_bytes(&rbd_dev->header);
2383593a9e7bSAlex Elder 	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
2384593a9e7bSAlex Elder 	blk_queue_max_segment_size(q, segment_size);
2385593a9e7bSAlex Elder 	blk_queue_io_min(q, segment_size);
2386593a9e7bSAlex Elder 	blk_queue_io_opt(q, segment_size);
2387029bcbd8SJosh Durgin 
2388602adf40SYehuda Sadeh 	blk_queue_merge_bvec(q, rbd_merge_bvec);
2389602adf40SYehuda Sadeh 	disk->queue = q;
2390602adf40SYehuda Sadeh 
2391602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
2392602adf40SYehuda Sadeh 
2393602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
2394602adf40SYehuda Sadeh 
239512f02944SAlex Elder 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
239612f02944SAlex Elder 
2397602adf40SYehuda Sadeh 	return 0;
2398602adf40SYehuda Sadeh out_disk:
2399602adf40SYehuda Sadeh 	put_disk(disk);
24001fcdb8aaSAlex Elder 
24011fcdb8aaSAlex Elder 	return -ENOMEM;
2402602adf40SYehuda Sadeh }
2403602adf40SYehuda Sadeh 
2404dfc5606dSYehuda Sadeh /*
2405dfc5606dSYehuda Sadeh   sysfs
2406dfc5606dSYehuda Sadeh */
2407602adf40SYehuda Sadeh 
2408593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
2409593a9e7bSAlex Elder {
2410593a9e7bSAlex Elder 	return container_of(dev, struct rbd_device, dev);
2411593a9e7bSAlex Elder }
2412593a9e7bSAlex Elder 
2413dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
2414dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
2415602adf40SYehuda Sadeh {
2416593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2417a51aa0c0SJosh Durgin 	sector_t size;
2418dfc5606dSYehuda Sadeh 
2419a51aa0c0SJosh Durgin 	down_read(&rbd_dev->header_rwsem);
2420a51aa0c0SJosh Durgin 	size = get_capacity(rbd_dev->disk);
2421a51aa0c0SJosh Durgin 	up_read(&rbd_dev->header_rwsem);
2422a51aa0c0SJosh Durgin 
2423a51aa0c0SJosh Durgin 	return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
2424602adf40SYehuda Sadeh }
2425602adf40SYehuda Sadeh 
242634b13184SAlex Elder /*
242734b13184SAlex Elder  * Note this shows the features for whatever's mapped, which is not
242834b13184SAlex Elder  * necessarily the base image.
242934b13184SAlex Elder  */
243034b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev,
243134b13184SAlex Elder 			     struct device_attribute *attr, char *buf)
243234b13184SAlex Elder {
243334b13184SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
243434b13184SAlex Elder 
243534b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
243634b13184SAlex Elder 			(unsigned long long) rbd_dev->mapping.features);
243734b13184SAlex Elder }
243834b13184SAlex Elder 
2439dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
2440dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
2441602adf40SYehuda Sadeh {
2442593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2443dfc5606dSYehuda Sadeh 
2444dfc5606dSYehuda Sadeh 	return sprintf(buf, "%d\n", rbd_dev->major);
2445dfc5606dSYehuda Sadeh }
2446dfc5606dSYehuda Sadeh 
2447dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
2448dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
2449dfc5606dSYehuda Sadeh {
2450593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2451dfc5606dSYehuda Sadeh 
24521dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
24531dbb4399SAlex Elder 			ceph_client_id(rbd_dev->rbd_client->client));
2454dfc5606dSYehuda Sadeh }
2455dfc5606dSYehuda Sadeh 
2456dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
2457dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
2458dfc5606dSYehuda Sadeh {
2459593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2460dfc5606dSYehuda Sadeh 
24610d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
2462dfc5606dSYehuda Sadeh }
2463dfc5606dSYehuda Sadeh 
24649bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
24659bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
24669bb2f334SAlex Elder {
24679bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
24689bb2f334SAlex Elder 
24690d7dbfceSAlex Elder 	return sprintf(buf, "%llu\n",
24700d7dbfceSAlex Elder 		(unsigned long long) rbd_dev->spec->pool_id);
24719bb2f334SAlex Elder }
24729bb2f334SAlex Elder 
2473dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
2474dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
2475dfc5606dSYehuda Sadeh {
2476593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2477dfc5606dSYehuda Sadeh 
2478a92ffdf8SAlex Elder 	if (rbd_dev->spec->image_name)
24790d7dbfceSAlex Elder 		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
2480a92ffdf8SAlex Elder 
2481a92ffdf8SAlex Elder 	return sprintf(buf, "(unknown)\n");
2482dfc5606dSYehuda Sadeh }
2483dfc5606dSYehuda Sadeh 
2484589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev,
2485589d30e0SAlex Elder 			     struct device_attribute *attr, char *buf)
2486589d30e0SAlex Elder {
2487589d30e0SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2488589d30e0SAlex Elder 
24890d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
2490589d30e0SAlex Elder }
2491589d30e0SAlex Elder 
249234b13184SAlex Elder /*
249334b13184SAlex Elder  * Shows the name of the currently-mapped snapshot (or
249434b13184SAlex Elder  * RBD_SNAP_HEAD_NAME for the base image).
249534b13184SAlex Elder  */
2496dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
2497dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
2498dfc5606dSYehuda Sadeh 			     char *buf)
2499dfc5606dSYehuda Sadeh {
2500593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2501dfc5606dSYehuda Sadeh 
25020d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
2503dfc5606dSYehuda Sadeh }
2504dfc5606dSYehuda Sadeh 
250586b00e0dSAlex Elder /*
250686b00e0dSAlex Elder  * For an rbd v2 image, shows the pool id, image id, and snapshot id
250786b00e0dSAlex Elder  * for the parent image.  If there is no parent, simply shows
250886b00e0dSAlex Elder  * "(no parent image)".
250986b00e0dSAlex Elder  */
251086b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev,
251186b00e0dSAlex Elder 			     struct device_attribute *attr,
251286b00e0dSAlex Elder 			     char *buf)
251386b00e0dSAlex Elder {
251486b00e0dSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
251586b00e0dSAlex Elder 	struct rbd_spec *spec = rbd_dev->parent_spec;
251686b00e0dSAlex Elder 	int count;
251786b00e0dSAlex Elder 	char *bufp = buf;
251886b00e0dSAlex Elder 
251986b00e0dSAlex Elder 	if (!spec)
252086b00e0dSAlex Elder 		return sprintf(buf, "(no parent image)\n");
252186b00e0dSAlex Elder 
252286b00e0dSAlex Elder 	count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
252386b00e0dSAlex Elder 			(unsigned long long) spec->pool_id, spec->pool_name);
252486b00e0dSAlex Elder 	if (count < 0)
252586b00e0dSAlex Elder 		return count;
252686b00e0dSAlex Elder 	bufp += count;
252786b00e0dSAlex Elder 
252886b00e0dSAlex Elder 	count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
252986b00e0dSAlex Elder 			spec->image_name ? spec->image_name : "(unknown)");
253086b00e0dSAlex Elder 	if (count < 0)
253186b00e0dSAlex Elder 		return count;
253286b00e0dSAlex Elder 	bufp += count;
253386b00e0dSAlex Elder 
253486b00e0dSAlex Elder 	count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
253586b00e0dSAlex Elder 			(unsigned long long) spec->snap_id, spec->snap_name);
253686b00e0dSAlex Elder 	if (count < 0)
253786b00e0dSAlex Elder 		return count;
253886b00e0dSAlex Elder 	bufp += count;
253986b00e0dSAlex Elder 
254086b00e0dSAlex Elder 	count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
254186b00e0dSAlex Elder 	if (count < 0)
254286b00e0dSAlex Elder 		return count;
254386b00e0dSAlex Elder 	bufp += count;
254486b00e0dSAlex Elder 
254586b00e0dSAlex Elder 	return (ssize_t) (bufp - buf);
254686b00e0dSAlex Elder }
254786b00e0dSAlex Elder 
2548dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
2549dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
2550dfc5606dSYehuda Sadeh 				 const char *buf,
2551dfc5606dSYehuda Sadeh 				 size_t size)
2552dfc5606dSYehuda Sadeh {
2553593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2554b813623aSAlex Elder 	int ret;
2555602adf40SYehuda Sadeh 
2556117973fbSAlex Elder 	ret = rbd_dev_refresh(rbd_dev, NULL);
2557b813623aSAlex Elder 
2558b813623aSAlex Elder 	return ret < 0 ? ret : size;
2559dfc5606dSYehuda Sadeh }
2560602adf40SYehuda Sadeh 
/*
 * Per-device sysfs attributes.  Every attribute is read-only for all
 * users (S_IRUGO, show handler only) except "refresh", which is
 * write-only for the owner (S_IWUSR, store handler only).
 */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
2572dfc5606dSYehuda Sadeh 
/*
 * NULL-terminated table of the rbd device attributes; it is the .attrs
 * member of rbd_attr_group.
 */
2573dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = {
2574dfc5606dSYehuda Sadeh 	&dev_attr_size.attr,
257534b13184SAlex Elder 	&dev_attr_features.attr,
2576dfc5606dSYehuda Sadeh 	&dev_attr_major.attr,
2577dfc5606dSYehuda Sadeh 	&dev_attr_client_id.attr,
2578dfc5606dSYehuda Sadeh 	&dev_attr_pool.attr,
25799bb2f334SAlex Elder 	&dev_attr_pool_id.attr,
2580dfc5606dSYehuda Sadeh 	&dev_attr_name.attr,
2581589d30e0SAlex Elder 	&dev_attr_image_id.attr,
2582dfc5606dSYehuda Sadeh 	&dev_attr_current_snap.attr,
258386b00e0dSAlex Elder 	&dev_attr_parent.attr,
2584dfc5606dSYehuda Sadeh 	&dev_attr_refresh.attr,
2585dfc5606dSYehuda Sadeh 	NULL
2586dfc5606dSYehuda Sadeh };
2587dfc5606dSYehuda Sadeh 
/* The single (unnamed) attribute group holding rbd_attrs[]. */
2588dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = {
2589dfc5606dSYehuda Sadeh 	.attrs = rbd_attrs,
2590dfc5606dSYehuda Sadeh };
2591dfc5606dSYehuda Sadeh 
/* NULL-terminated group list used as rbd_device_type.groups. */
2592dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = {
2593dfc5606dSYehuda Sadeh 	&rbd_attr_group,
2594dfc5606dSYehuda Sadeh 	NULL
2595dfc5606dSYehuda Sadeh };
2596dfc5606dSYehuda Sadeh 
/*
 * Release callback installed as rbd_device_type.release.  It does
 * nothing: no memory is freed here.
 * NOTE(review): presumably the rbd_device is freed through
 * rbd_dev_destroy() instead -- confirm against the callers.
 */
2597dfc5606dSYehuda Sadeh static void rbd_sysfs_dev_release(struct device *dev)
2598dfc5606dSYehuda Sadeh {
2599dfc5606dSYehuda Sadeh }
2600dfc5606dSYehuda Sadeh 
/*
 * Device type for rbd devices: names the type "rbd", attaches the
 * rbd_attr_groups attribute groups and the (empty) release callback.
 */
2601dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = {
2602dfc5606dSYehuda Sadeh 	.name		= "rbd",
2603dfc5606dSYehuda Sadeh 	.groups		= rbd_attr_groups,
2604dfc5606dSYehuda Sadeh 	.release	= rbd_sysfs_dev_release,
2605dfc5606dSYehuda Sadeh };
2606dfc5606dSYehuda Sadeh 
2607dfc5606dSYehuda Sadeh 
2608dfc5606dSYehuda Sadeh /*
2609dfc5606dSYehuda Sadeh   sysfs - snapshots
2610dfc5606dSYehuda Sadeh */
2611dfc5606dSYehuda Sadeh 
2612dfc5606dSYehuda Sadeh static ssize_t rbd_snap_size_show(struct device *dev,
2613dfc5606dSYehuda Sadeh 				  struct device_attribute *attr,
2614dfc5606dSYehuda Sadeh 				  char *buf)
2615dfc5606dSYehuda Sadeh {
2616dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2617dfc5606dSYehuda Sadeh 
26183591538fSJosh Durgin 	return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
2619dfc5606dSYehuda Sadeh }
2620dfc5606dSYehuda Sadeh 
2621dfc5606dSYehuda Sadeh static ssize_t rbd_snap_id_show(struct device *dev,
2622dfc5606dSYehuda Sadeh 				struct device_attribute *attr,
2623dfc5606dSYehuda Sadeh 				char *buf)
2624dfc5606dSYehuda Sadeh {
2625dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2626dfc5606dSYehuda Sadeh 
2627593a9e7bSAlex Elder 	return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
2628dfc5606dSYehuda Sadeh }
2629dfc5606dSYehuda Sadeh 
263034b13184SAlex Elder static ssize_t rbd_snap_features_show(struct device *dev,
263134b13184SAlex Elder 				struct device_attribute *attr,
263234b13184SAlex Elder 				char *buf)
263334b13184SAlex Elder {
263434b13184SAlex Elder 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
263534b13184SAlex Elder 
263634b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
263734b13184SAlex Elder 			(unsigned long long) snap->features);
263834b13184SAlex Elder }
263934b13184SAlex Elder 
/*
 * sysfs attributes of a snapshot device; all are read-only (S_IRUGO,
 * show handler only).  Each DEVICE_ATTR() use defines the
 * dev_attr_<name> object listed in rbd_snap_attrs[].
 */
2640dfc5606dSYehuda Sadeh static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
2641dfc5606dSYehuda Sadeh static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
264234b13184SAlex Elder static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
2643dfc5606dSYehuda Sadeh 
/* NULL-terminated table of the snapshot attributes. */
2644dfc5606dSYehuda Sadeh static struct attribute *rbd_snap_attrs[] = {
2645dfc5606dSYehuda Sadeh 	&dev_attr_snap_size.attr,
2646dfc5606dSYehuda Sadeh 	&dev_attr_snap_id.attr,
264734b13184SAlex Elder 	&dev_attr_snap_features.attr,
2648dfc5606dSYehuda Sadeh 	NULL,
2649dfc5606dSYehuda Sadeh };
2650dfc5606dSYehuda Sadeh 
/* The single (unnamed) attribute group holding rbd_snap_attrs[]. */
2651dfc5606dSYehuda Sadeh static struct attribute_group rbd_snap_attr_group = {
2652dfc5606dSYehuda Sadeh 	.attrs = rbd_snap_attrs,
2653dfc5606dSYehuda Sadeh };
2654dfc5606dSYehuda Sadeh 
2655dfc5606dSYehuda Sadeh static void rbd_snap_dev_release(struct device *dev)
2656dfc5606dSYehuda Sadeh {
2657dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2658dfc5606dSYehuda Sadeh 	kfree(snap->name);
2659dfc5606dSYehuda Sadeh 	kfree(snap);
2660dfc5606dSYehuda Sadeh }
2661dfc5606dSYehuda Sadeh 
/* NULL-terminated group list used as rbd_snap_device_type.groups. */
2662dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_snap_attr_groups[] = {
2663dfc5606dSYehuda Sadeh 	&rbd_snap_attr_group,
2664dfc5606dSYehuda Sadeh 	NULL
2665dfc5606dSYehuda Sadeh };
2666dfc5606dSYehuda Sadeh 
/*
 * Device type for snapshot devices: attaches the snapshot attribute
 * groups and rbd_snap_dev_release(), which frees the rbd_snap.
 */
2667dfc5606dSYehuda Sadeh static struct device_type rbd_snap_device_type = {
2668dfc5606dSYehuda Sadeh 	.groups		= rbd_snap_attr_groups,
2669dfc5606dSYehuda Sadeh 	.release	= rbd_snap_dev_release,
2670dfc5606dSYehuda Sadeh };
2671dfc5606dSYehuda Sadeh 
26728b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
26738b8fb99cSAlex Elder {
26748b8fb99cSAlex Elder 	kref_get(&spec->kref);
26758b8fb99cSAlex Elder 
26768b8fb99cSAlex Elder 	return spec;
26778b8fb99cSAlex Elder }
26788b8fb99cSAlex Elder 
26798b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref);
26808b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec)
26818b8fb99cSAlex Elder {
26828b8fb99cSAlex Elder 	if (spec)
26838b8fb99cSAlex Elder 		kref_put(&spec->kref, rbd_spec_free);
26848b8fb99cSAlex Elder }
26858b8fb99cSAlex Elder 
26868b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void)
26878b8fb99cSAlex Elder {
26888b8fb99cSAlex Elder 	struct rbd_spec *spec;
26898b8fb99cSAlex Elder 
26908b8fb99cSAlex Elder 	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
26918b8fb99cSAlex Elder 	if (!spec)
26928b8fb99cSAlex Elder 		return NULL;
26938b8fb99cSAlex Elder 	kref_init(&spec->kref);
26948b8fb99cSAlex Elder 
26958b8fb99cSAlex Elder 	rbd_spec_put(rbd_spec_get(spec));	/* TEMPORARY */
26968b8fb99cSAlex Elder 
26978b8fb99cSAlex Elder 	return spec;
26988b8fb99cSAlex Elder }
26998b8fb99cSAlex Elder 
27008b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref)
27018b8fb99cSAlex Elder {
27028b8fb99cSAlex Elder 	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
27038b8fb99cSAlex Elder 
27048b8fb99cSAlex Elder 	kfree(spec->pool_name);
27058b8fb99cSAlex Elder 	kfree(spec->image_id);
27068b8fb99cSAlex Elder 	kfree(spec->image_name);
27078b8fb99cSAlex Elder 	kfree(spec->snap_name);
27088b8fb99cSAlex Elder 	kfree(spec);
27098b8fb99cSAlex Elder }
27108b8fb99cSAlex Elder 
2711cc344fa1SAlex Elder static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
2712c53d5893SAlex Elder 				struct rbd_spec *spec)
2713c53d5893SAlex Elder {
2714c53d5893SAlex Elder 	struct rbd_device *rbd_dev;
2715c53d5893SAlex Elder 
2716c53d5893SAlex Elder 	rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
2717c53d5893SAlex Elder 	if (!rbd_dev)
2718c53d5893SAlex Elder 		return NULL;
2719c53d5893SAlex Elder 
2720c53d5893SAlex Elder 	spin_lock_init(&rbd_dev->lock);
27216d292906SAlex Elder 	rbd_dev->flags = 0;
2722c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->node);
2723c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->snaps);
2724c53d5893SAlex Elder 	init_rwsem(&rbd_dev->header_rwsem);
2725c53d5893SAlex Elder 
2726c53d5893SAlex Elder 	rbd_dev->spec = spec;
2727c53d5893SAlex Elder 	rbd_dev->rbd_client = rbdc;
2728c53d5893SAlex Elder 
27290903e875SAlex Elder 	/* Initialize the layout used for all rbd requests */
27300903e875SAlex Elder 
27310903e875SAlex Elder 	rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
27320903e875SAlex Elder 	rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
27330903e875SAlex Elder 	rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
27340903e875SAlex Elder 	rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
27350903e875SAlex Elder 
2736c53d5893SAlex Elder 	return rbd_dev;
2737c53d5893SAlex Elder }
2738c53d5893SAlex Elder 
2739c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev)
2740c53d5893SAlex Elder {
274186b00e0dSAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
2742c53d5893SAlex Elder 	kfree(rbd_dev->header_name);
2743c53d5893SAlex Elder 	rbd_put_client(rbd_dev->rbd_client);
2744c53d5893SAlex Elder 	rbd_spec_put(rbd_dev->spec);
2745c53d5893SAlex Elder 	kfree(rbd_dev);
2746c53d5893SAlex Elder }
2747c53d5893SAlex Elder 
2748304f6808SAlex Elder static bool rbd_snap_registered(struct rbd_snap *snap)
2749304f6808SAlex Elder {
2750304f6808SAlex Elder 	bool ret = snap->dev.type == &rbd_snap_device_type;
2751304f6808SAlex Elder 	bool reg = device_is_registered(&snap->dev);
2752304f6808SAlex Elder 
2753304f6808SAlex Elder 	rbd_assert(!ret ^ reg);
2754304f6808SAlex Elder 
2755304f6808SAlex Elder 	return ret;
2756304f6808SAlex Elder }
2757304f6808SAlex Elder 
275841f38c2bSAlex Elder static void rbd_remove_snap_dev(struct rbd_snap *snap)
2759dfc5606dSYehuda Sadeh {
2760dfc5606dSYehuda Sadeh 	list_del(&snap->node);
2761304f6808SAlex Elder 	if (device_is_registered(&snap->dev))
2762dfc5606dSYehuda Sadeh 		device_unregister(&snap->dev);
2763dfc5606dSYehuda Sadeh }
2764dfc5606dSYehuda Sadeh 
276514e7085dSAlex Elder static int rbd_register_snap_dev(struct rbd_snap *snap,
2766dfc5606dSYehuda Sadeh 				  struct device *parent)
2767dfc5606dSYehuda Sadeh {
2768dfc5606dSYehuda Sadeh 	struct device *dev = &snap->dev;
2769dfc5606dSYehuda Sadeh 	int ret;
2770dfc5606dSYehuda Sadeh 
2771dfc5606dSYehuda Sadeh 	dev->type = &rbd_snap_device_type;
2772dfc5606dSYehuda Sadeh 	dev->parent = parent;
2773dfc5606dSYehuda Sadeh 	dev->release = rbd_snap_dev_release;
2774d4b125e9SAlex Elder 	dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
2775304f6808SAlex Elder 	dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2776304f6808SAlex Elder 
2777dfc5606dSYehuda Sadeh 	ret = device_register(dev);
2778dfc5606dSYehuda Sadeh 
2779dfc5606dSYehuda Sadeh 	return ret;
2780dfc5606dSYehuda Sadeh }
2781dfc5606dSYehuda Sadeh 
27824e891e0aSAlex Elder static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
2783c8d18425SAlex Elder 						const char *snap_name,
278434b13184SAlex Elder 						u64 snap_id, u64 snap_size,
278534b13184SAlex Elder 						u64 snap_features)
2786dfc5606dSYehuda Sadeh {
27874e891e0aSAlex Elder 	struct rbd_snap *snap;
2788dfc5606dSYehuda Sadeh 	int ret;
27894e891e0aSAlex Elder 
27904e891e0aSAlex Elder 	snap = kzalloc(sizeof (*snap), GFP_KERNEL);
2791dfc5606dSYehuda Sadeh 	if (!snap)
27924e891e0aSAlex Elder 		return ERR_PTR(-ENOMEM);
27934e891e0aSAlex Elder 
27944e891e0aSAlex Elder 	ret = -ENOMEM;
2795c8d18425SAlex Elder 	snap->name = kstrdup(snap_name, GFP_KERNEL);
27964e891e0aSAlex Elder 	if (!snap->name)
27974e891e0aSAlex Elder 		goto err;
27984e891e0aSAlex Elder 
2799c8d18425SAlex Elder 	snap->id = snap_id;
2800c8d18425SAlex Elder 	snap->size = snap_size;
280134b13184SAlex Elder 	snap->features = snap_features;
28024e891e0aSAlex Elder 
28034e891e0aSAlex Elder 	return snap;
28044e891e0aSAlex Elder 
2805dfc5606dSYehuda Sadeh err:
2806dfc5606dSYehuda Sadeh 	kfree(snap->name);
2807dfc5606dSYehuda Sadeh 	kfree(snap);
28084e891e0aSAlex Elder 
28094e891e0aSAlex Elder 	return ERR_PTR(ret);
2810dfc5606dSYehuda Sadeh }
2811dfc5606dSYehuda Sadeh 
2812cd892126SAlex Elder static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2813cd892126SAlex Elder 		u64 *snap_size, u64 *snap_features)
2814cd892126SAlex Elder {
2815cd892126SAlex Elder 	char *snap_name;
2816cd892126SAlex Elder 
2817cd892126SAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2818cd892126SAlex Elder 
2819cd892126SAlex Elder 	*snap_size = rbd_dev->header.snap_sizes[which];
2820cd892126SAlex Elder 	*snap_features = 0;	/* No features for v1 */
2821cd892126SAlex Elder 
2822cd892126SAlex Elder 	/* Skip over names until we find the one we are looking for */
2823cd892126SAlex Elder 
2824cd892126SAlex Elder 	snap_name = rbd_dev->header.snap_names;
2825cd892126SAlex Elder 	while (which--)
2826cd892126SAlex Elder 		snap_name += strlen(snap_name) + 1;
2827cd892126SAlex Elder 
2828cd892126SAlex Elder 	return snap_name;
2829cd892126SAlex Elder }
2830cd892126SAlex Elder 
2831dfc5606dSYehuda Sadeh /*
28329d475de5SAlex Elder  * Get the size and object order for an image snapshot, or if
28339d475de5SAlex Elder  * snap_id is CEPH_NOSNAP, gets this information for the base
28349d475de5SAlex Elder  * image.
28359d475de5SAlex Elder  */
28369d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
28379d475de5SAlex Elder 				u8 *order, u64 *snap_size)
28389d475de5SAlex Elder {
28399d475de5SAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
28409d475de5SAlex Elder 	int ret;
28419d475de5SAlex Elder 	struct {
28429d475de5SAlex Elder 		u8 order;
28439d475de5SAlex Elder 		__le64 size;
28449d475de5SAlex Elder 	} __attribute__ ((packed)) size_buf = { 0 };
28459d475de5SAlex Elder 
284636be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
28479d475de5SAlex Elder 				"rbd", "get_size",
28489d475de5SAlex Elder 				(char *) &snapid, sizeof (snapid),
284907b2391fSAlex Elder 				(char *) &size_buf, sizeof (size_buf), NULL);
285036be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
28519d475de5SAlex Elder 	if (ret < 0)
28529d475de5SAlex Elder 		return ret;
28539d475de5SAlex Elder 
28549d475de5SAlex Elder 	*order = size_buf.order;
28559d475de5SAlex Elder 	*snap_size = le64_to_cpu(size_buf.size);
28569d475de5SAlex Elder 
28579d475de5SAlex Elder 	dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
28589d475de5SAlex Elder 		(unsigned long long) snap_id, (unsigned int) *order,
28599d475de5SAlex Elder 		(unsigned long long) *snap_size);
28609d475de5SAlex Elder 
28619d475de5SAlex Elder 	return 0;
28629d475de5SAlex Elder }
28639d475de5SAlex Elder 
28649d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
28659d475de5SAlex Elder {
28669d475de5SAlex Elder 	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
28679d475de5SAlex Elder 					&rbd_dev->header.obj_order,
28689d475de5SAlex Elder 					&rbd_dev->header.image_size);
28699d475de5SAlex Elder }
28709d475de5SAlex Elder 
28711e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
28721e130199SAlex Elder {
28731e130199SAlex Elder 	void *reply_buf;
28741e130199SAlex Elder 	int ret;
28751e130199SAlex Elder 	void *p;
28761e130199SAlex Elder 
28771e130199SAlex Elder 	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
28781e130199SAlex Elder 	if (!reply_buf)
28791e130199SAlex Elder 		return -ENOMEM;
28801e130199SAlex Elder 
288136be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
28821e130199SAlex Elder 				"rbd", "get_object_prefix",
28831e130199SAlex Elder 				NULL, 0,
288407b2391fSAlex Elder 				reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
288536be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
28861e130199SAlex Elder 	if (ret < 0)
28871e130199SAlex Elder 		goto out;
28881e130199SAlex Elder 
28891e130199SAlex Elder 	p = reply_buf;
28901e130199SAlex Elder 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
28911e130199SAlex Elder 						p + RBD_OBJ_PREFIX_LEN_MAX,
28921e130199SAlex Elder 						NULL, GFP_NOIO);
28931e130199SAlex Elder 
28941e130199SAlex Elder 	if (IS_ERR(rbd_dev->header.object_prefix)) {
28951e130199SAlex Elder 		ret = PTR_ERR(rbd_dev->header.object_prefix);
28961e130199SAlex Elder 		rbd_dev->header.object_prefix = NULL;
28971e130199SAlex Elder 	} else {
28981e130199SAlex Elder 		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
28991e130199SAlex Elder 	}
29001e130199SAlex Elder 
29011e130199SAlex Elder out:
29021e130199SAlex Elder 	kfree(reply_buf);
29031e130199SAlex Elder 
29041e130199SAlex Elder 	return ret;
29051e130199SAlex Elder }
29061e130199SAlex Elder 
2907b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
2908b1b5402aSAlex Elder 		u64 *snap_features)
2909b1b5402aSAlex Elder {
2910b1b5402aSAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
2911b1b5402aSAlex Elder 	struct {
2912b1b5402aSAlex Elder 		__le64 features;
2913b1b5402aSAlex Elder 		__le64 incompat;
2914b1b5402aSAlex Elder 	} features_buf = { 0 };
2915d889140cSAlex Elder 	u64 incompat;
2916b1b5402aSAlex Elder 	int ret;
2917b1b5402aSAlex Elder 
291836be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
2919b1b5402aSAlex Elder 				"rbd", "get_features",
2920b1b5402aSAlex Elder 				(char *) &snapid, sizeof (snapid),
2921b1b5402aSAlex Elder 				(char *) &features_buf, sizeof (features_buf),
292207b2391fSAlex Elder 				NULL);
292336be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
2924b1b5402aSAlex Elder 	if (ret < 0)
2925b1b5402aSAlex Elder 		return ret;
2926d889140cSAlex Elder 
2927d889140cSAlex Elder 	incompat = le64_to_cpu(features_buf.incompat);
29285cbf6f12SAlex Elder 	if (incompat & ~RBD_FEATURES_SUPPORTED)
2929b8f5c6edSAlex Elder 		return -ENXIO;
2930d889140cSAlex Elder 
2931b1b5402aSAlex Elder 	*snap_features = le64_to_cpu(features_buf.features);
2932b1b5402aSAlex Elder 
2933b1b5402aSAlex Elder 	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
2934b1b5402aSAlex Elder 		(unsigned long long) snap_id,
2935b1b5402aSAlex Elder 		(unsigned long long) *snap_features,
2936b1b5402aSAlex Elder 		(unsigned long long) le64_to_cpu(features_buf.incompat));
2937b1b5402aSAlex Elder 
2938b1b5402aSAlex Elder 	return 0;
2939b1b5402aSAlex Elder }
2940b1b5402aSAlex Elder 
2941b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
2942b1b5402aSAlex Elder {
2943b1b5402aSAlex Elder 	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
2944b1b5402aSAlex Elder 						&rbd_dev->header.features);
2945b1b5402aSAlex Elder }
2946b1b5402aSAlex Elder 
294786b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
294886b00e0dSAlex Elder {
294986b00e0dSAlex Elder 	struct rbd_spec *parent_spec;
295086b00e0dSAlex Elder 	size_t size;
295186b00e0dSAlex Elder 	void *reply_buf = NULL;
295286b00e0dSAlex Elder 	__le64 snapid;
295386b00e0dSAlex Elder 	void *p;
295486b00e0dSAlex Elder 	void *end;
295586b00e0dSAlex Elder 	char *image_id;
295686b00e0dSAlex Elder 	u64 overlap;
295786b00e0dSAlex Elder 	int ret;
295886b00e0dSAlex Elder 
295986b00e0dSAlex Elder 	parent_spec = rbd_spec_alloc();
296086b00e0dSAlex Elder 	if (!parent_spec)
296186b00e0dSAlex Elder 		return -ENOMEM;
296286b00e0dSAlex Elder 
296386b00e0dSAlex Elder 	size = sizeof (__le64) +				/* pool_id */
296486b00e0dSAlex Elder 		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
296586b00e0dSAlex Elder 		sizeof (__le64) +				/* snap_id */
296686b00e0dSAlex Elder 		sizeof (__le64);				/* overlap */
296786b00e0dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
296886b00e0dSAlex Elder 	if (!reply_buf) {
296986b00e0dSAlex Elder 		ret = -ENOMEM;
297086b00e0dSAlex Elder 		goto out_err;
297186b00e0dSAlex Elder 	}
297286b00e0dSAlex Elder 
297386b00e0dSAlex Elder 	snapid = cpu_to_le64(CEPH_NOSNAP);
297436be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
297586b00e0dSAlex Elder 				"rbd", "get_parent",
297686b00e0dSAlex Elder 				(char *) &snapid, sizeof (snapid),
297707b2391fSAlex Elder 				(char *) reply_buf, size, NULL);
297836be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
297986b00e0dSAlex Elder 	if (ret < 0)
298086b00e0dSAlex Elder 		goto out_err;
298186b00e0dSAlex Elder 
298286b00e0dSAlex Elder 	ret = -ERANGE;
298386b00e0dSAlex Elder 	p = reply_buf;
298486b00e0dSAlex Elder 	end = (char *) reply_buf + size;
298586b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
298686b00e0dSAlex Elder 	if (parent_spec->pool_id == CEPH_NOPOOL)
298786b00e0dSAlex Elder 		goto out;	/* No parent?  No problem. */
298886b00e0dSAlex Elder 
29890903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
29900903e875SAlex Elder 
29910903e875SAlex Elder 	ret = -EIO;
29920903e875SAlex Elder 	if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
29930903e875SAlex Elder 		goto out;
29940903e875SAlex Elder 
2995979ed480SAlex Elder 	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
299686b00e0dSAlex Elder 	if (IS_ERR(image_id)) {
299786b00e0dSAlex Elder 		ret = PTR_ERR(image_id);
299886b00e0dSAlex Elder 		goto out_err;
299986b00e0dSAlex Elder 	}
300086b00e0dSAlex Elder 	parent_spec->image_id = image_id;
300186b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
300286b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, overlap, out_err);
300386b00e0dSAlex Elder 
300486b00e0dSAlex Elder 	rbd_dev->parent_overlap = overlap;
300586b00e0dSAlex Elder 	rbd_dev->parent_spec = parent_spec;
300686b00e0dSAlex Elder 	parent_spec = NULL;	/* rbd_dev now owns this */
300786b00e0dSAlex Elder out:
300886b00e0dSAlex Elder 	ret = 0;
300986b00e0dSAlex Elder out_err:
301086b00e0dSAlex Elder 	kfree(reply_buf);
301186b00e0dSAlex Elder 	rbd_spec_put(parent_spec);
301286b00e0dSAlex Elder 
301386b00e0dSAlex Elder 	return ret;
301486b00e0dSAlex Elder }
301586b00e0dSAlex Elder 
30169e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
30179e15b77dSAlex Elder {
30189e15b77dSAlex Elder 	size_t image_id_size;
30199e15b77dSAlex Elder 	char *image_id;
30209e15b77dSAlex Elder 	void *p;
30219e15b77dSAlex Elder 	void *end;
30229e15b77dSAlex Elder 	size_t size;
30239e15b77dSAlex Elder 	void *reply_buf = NULL;
30249e15b77dSAlex Elder 	size_t len = 0;
30259e15b77dSAlex Elder 	char *image_name = NULL;
30269e15b77dSAlex Elder 	int ret;
30279e15b77dSAlex Elder 
30289e15b77dSAlex Elder 	rbd_assert(!rbd_dev->spec->image_name);
30299e15b77dSAlex Elder 
303069e7a02fSAlex Elder 	len = strlen(rbd_dev->spec->image_id);
303169e7a02fSAlex Elder 	image_id_size = sizeof (__le32) + len;
30329e15b77dSAlex Elder 	image_id = kmalloc(image_id_size, GFP_KERNEL);
30339e15b77dSAlex Elder 	if (!image_id)
30349e15b77dSAlex Elder 		return NULL;
30359e15b77dSAlex Elder 
30369e15b77dSAlex Elder 	p = image_id;
30379e15b77dSAlex Elder 	end = (char *) image_id + image_id_size;
303869e7a02fSAlex Elder 	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);
30399e15b77dSAlex Elder 
30409e15b77dSAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
30419e15b77dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
30429e15b77dSAlex Elder 	if (!reply_buf)
30439e15b77dSAlex Elder 		goto out;
30449e15b77dSAlex Elder 
304536be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
30469e15b77dSAlex Elder 				"rbd", "dir_get_name",
30479e15b77dSAlex Elder 				image_id, image_id_size,
304807b2391fSAlex Elder 				(char *) reply_buf, size, NULL);
30499e15b77dSAlex Elder 	if (ret < 0)
30509e15b77dSAlex Elder 		goto out;
30519e15b77dSAlex Elder 	p = reply_buf;
30529e15b77dSAlex Elder 	end = (char *) reply_buf + size;
30539e15b77dSAlex Elder 	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
30549e15b77dSAlex Elder 	if (IS_ERR(image_name))
30559e15b77dSAlex Elder 		image_name = NULL;
30569e15b77dSAlex Elder 	else
30579e15b77dSAlex Elder 		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
30589e15b77dSAlex Elder out:
30599e15b77dSAlex Elder 	kfree(reply_buf);
30609e15b77dSAlex Elder 	kfree(image_id);
30619e15b77dSAlex Elder 
30629e15b77dSAlex Elder 	return image_name;
30639e15b77dSAlex Elder }
30649e15b77dSAlex Elder 
30659e15b77dSAlex Elder /*
30669e15b77dSAlex Elder  * When a parent image gets probed, we only have the pool, image,
30679e15b77dSAlex Elder  * and snapshot ids but not the names of any of them.  This call
30689e15b77dSAlex Elder  * is made later to fill in those names.  It has to be done after
30699e15b77dSAlex Elder  * rbd_dev_snaps_update() has completed because some of the
30709e15b77dSAlex Elder  * information (in particular, snapshot name) is not available
30719e15b77dSAlex Elder  * until then.
30729e15b77dSAlex Elder  */
30739e15b77dSAlex Elder static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
30749e15b77dSAlex Elder {
30759e15b77dSAlex Elder 	struct ceph_osd_client *osdc;
30769e15b77dSAlex Elder 	const char *name;
30779e15b77dSAlex Elder 	void *reply_buf = NULL;
30789e15b77dSAlex Elder 	int ret;
30799e15b77dSAlex Elder 
30809e15b77dSAlex Elder 	if (rbd_dev->spec->pool_name)
30819e15b77dSAlex Elder 		return 0;	/* Already have the names */
30829e15b77dSAlex Elder 
30839e15b77dSAlex Elder 	/* Look up the pool name */
30849e15b77dSAlex Elder 
30859e15b77dSAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
30869e15b77dSAlex Elder 	name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
3087935dc89fSAlex Elder 	if (!name) {
3088935dc89fSAlex Elder 		rbd_warn(rbd_dev, "there is no pool with id %llu",
3089935dc89fSAlex Elder 			rbd_dev->spec->pool_id);	/* Really a BUG() */
3090935dc89fSAlex Elder 		return -EIO;
3091935dc89fSAlex Elder 	}
30929e15b77dSAlex Elder 
30939e15b77dSAlex Elder 	rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
30949e15b77dSAlex Elder 	if (!rbd_dev->spec->pool_name)
30959e15b77dSAlex Elder 		return -ENOMEM;
30969e15b77dSAlex Elder 
30979e15b77dSAlex Elder 	/* Fetch the image name; tolerate failure here */
30989e15b77dSAlex Elder 
30999e15b77dSAlex Elder 	name = rbd_dev_image_name(rbd_dev);
310069e7a02fSAlex Elder 	if (name)
31019e15b77dSAlex Elder 		rbd_dev->spec->image_name = (char *) name;
310269e7a02fSAlex Elder 	else
310306ecc6cbSAlex Elder 		rbd_warn(rbd_dev, "unable to get image name");
31049e15b77dSAlex Elder 
31059e15b77dSAlex Elder 	/* Look up the snapshot name. */
31069e15b77dSAlex Elder 
31079e15b77dSAlex Elder 	name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
31089e15b77dSAlex Elder 	if (!name) {
3109935dc89fSAlex Elder 		rbd_warn(rbd_dev, "no snapshot with id %llu",
3110935dc89fSAlex Elder 			rbd_dev->spec->snap_id);	/* Really a BUG() */
31119e15b77dSAlex Elder 		ret = -EIO;
31129e15b77dSAlex Elder 		goto out_err;
31139e15b77dSAlex Elder 	}
31149e15b77dSAlex Elder 	rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
31159e15b77dSAlex Elder 	if(!rbd_dev->spec->snap_name)
31169e15b77dSAlex Elder 		goto out_err;
31179e15b77dSAlex Elder 
31189e15b77dSAlex Elder 	return 0;
31199e15b77dSAlex Elder out_err:
31209e15b77dSAlex Elder 	kfree(reply_buf);
31219e15b77dSAlex Elder 	kfree(rbd_dev->spec->pool_name);
31229e15b77dSAlex Elder 	rbd_dev->spec->pool_name = NULL;
31239e15b77dSAlex Elder 
31249e15b77dSAlex Elder 	return ret;
31259e15b77dSAlex Elder }
31269e15b77dSAlex Elder 
31276e14b1a6SAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
312835d489f9SAlex Elder {
312935d489f9SAlex Elder 	size_t size;
313035d489f9SAlex Elder 	int ret;
313135d489f9SAlex Elder 	void *reply_buf;
313235d489f9SAlex Elder 	void *p;
313335d489f9SAlex Elder 	void *end;
313435d489f9SAlex Elder 	u64 seq;
313535d489f9SAlex Elder 	u32 snap_count;
313635d489f9SAlex Elder 	struct ceph_snap_context *snapc;
313735d489f9SAlex Elder 	u32 i;
313835d489f9SAlex Elder 
313935d489f9SAlex Elder 	/*
314035d489f9SAlex Elder 	 * We'll need room for the seq value (maximum snapshot id),
314135d489f9SAlex Elder 	 * snapshot count, and array of that many snapshot ids.
314235d489f9SAlex Elder 	 * For now we have a fixed upper limit on the number we're
314335d489f9SAlex Elder 	 * prepared to receive.
314435d489f9SAlex Elder 	 */
314535d489f9SAlex Elder 	size = sizeof (__le64) + sizeof (__le32) +
314635d489f9SAlex Elder 			RBD_MAX_SNAP_COUNT * sizeof (__le64);
314735d489f9SAlex Elder 	reply_buf = kzalloc(size, GFP_KERNEL);
314835d489f9SAlex Elder 	if (!reply_buf)
314935d489f9SAlex Elder 		return -ENOMEM;
315035d489f9SAlex Elder 
315136be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
315235d489f9SAlex Elder 				"rbd", "get_snapcontext",
315335d489f9SAlex Elder 				NULL, 0,
315407b2391fSAlex Elder 				reply_buf, size, ver);
315536be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
315635d489f9SAlex Elder 	if (ret < 0)
315735d489f9SAlex Elder 		goto out;
315835d489f9SAlex Elder 
315935d489f9SAlex Elder 	ret = -ERANGE;
316035d489f9SAlex Elder 	p = reply_buf;
316135d489f9SAlex Elder 	end = (char *) reply_buf + size;
316235d489f9SAlex Elder 	ceph_decode_64_safe(&p, end, seq, out);
316335d489f9SAlex Elder 	ceph_decode_32_safe(&p, end, snap_count, out);
316435d489f9SAlex Elder 
316535d489f9SAlex Elder 	/*
316635d489f9SAlex Elder 	 * Make sure the reported number of snapshot ids wouldn't go
316735d489f9SAlex Elder 	 * beyond the end of our buffer.  But before checking that,
316835d489f9SAlex Elder 	 * make sure the computed size of the snapshot context we
316935d489f9SAlex Elder 	 * allocate is representable in a size_t.
317035d489f9SAlex Elder 	 */
317135d489f9SAlex Elder 	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
317235d489f9SAlex Elder 				 / sizeof (u64)) {
317335d489f9SAlex Elder 		ret = -EINVAL;
317435d489f9SAlex Elder 		goto out;
317535d489f9SAlex Elder 	}
317635d489f9SAlex Elder 	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
317735d489f9SAlex Elder 		goto out;
317835d489f9SAlex Elder 
317935d489f9SAlex Elder 	size = sizeof (struct ceph_snap_context) +
318035d489f9SAlex Elder 				snap_count * sizeof (snapc->snaps[0]);
318135d489f9SAlex Elder 	snapc = kmalloc(size, GFP_KERNEL);
318235d489f9SAlex Elder 	if (!snapc) {
318335d489f9SAlex Elder 		ret = -ENOMEM;
318435d489f9SAlex Elder 		goto out;
318535d489f9SAlex Elder 	}
318635d489f9SAlex Elder 
318735d489f9SAlex Elder 	atomic_set(&snapc->nref, 1);
318835d489f9SAlex Elder 	snapc->seq = seq;
318935d489f9SAlex Elder 	snapc->num_snaps = snap_count;
319035d489f9SAlex Elder 	for (i = 0; i < snap_count; i++)
319135d489f9SAlex Elder 		snapc->snaps[i] = ceph_decode_64(&p);
319235d489f9SAlex Elder 
319335d489f9SAlex Elder 	rbd_dev->header.snapc = snapc;
319435d489f9SAlex Elder 
319535d489f9SAlex Elder 	dout("  snap context seq = %llu, snap_count = %u\n",
319635d489f9SAlex Elder 		(unsigned long long) seq, (unsigned int) snap_count);
319735d489f9SAlex Elder 
319835d489f9SAlex Elder out:
319935d489f9SAlex Elder 	kfree(reply_buf);
320035d489f9SAlex Elder 
320135d489f9SAlex Elder 	return 0;
320235d489f9SAlex Elder }
320335d489f9SAlex Elder 
3204b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
3205b8b1e2dbSAlex Elder {
3206b8b1e2dbSAlex Elder 	size_t size;
3207b8b1e2dbSAlex Elder 	void *reply_buf;
3208b8b1e2dbSAlex Elder 	__le64 snap_id;
3209b8b1e2dbSAlex Elder 	int ret;
3210b8b1e2dbSAlex Elder 	void *p;
3211b8b1e2dbSAlex Elder 	void *end;
3212b8b1e2dbSAlex Elder 	char *snap_name;
3213b8b1e2dbSAlex Elder 
3214b8b1e2dbSAlex Elder 	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
3215b8b1e2dbSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
3216b8b1e2dbSAlex Elder 	if (!reply_buf)
3217b8b1e2dbSAlex Elder 		return ERR_PTR(-ENOMEM);
3218b8b1e2dbSAlex Elder 
3219b8b1e2dbSAlex Elder 	snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
322036be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3221b8b1e2dbSAlex Elder 				"rbd", "get_snapshot_name",
3222b8b1e2dbSAlex Elder 				(char *) &snap_id, sizeof (snap_id),
322307b2391fSAlex Elder 				reply_buf, size, NULL);
322436be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3225b8b1e2dbSAlex Elder 	if (ret < 0)
3226b8b1e2dbSAlex Elder 		goto out;
3227b8b1e2dbSAlex Elder 
3228b8b1e2dbSAlex Elder 	p = reply_buf;
3229b8b1e2dbSAlex Elder 	end = (char *) reply_buf + size;
3230e5c35534SAlex Elder 	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
3231b8b1e2dbSAlex Elder 	if (IS_ERR(snap_name)) {
3232b8b1e2dbSAlex Elder 		ret = PTR_ERR(snap_name);
3233b8b1e2dbSAlex Elder 		goto out;
3234b8b1e2dbSAlex Elder 	} else {
3235b8b1e2dbSAlex Elder 		dout("  snap_id 0x%016llx snap_name = %s\n",
3236b8b1e2dbSAlex Elder 			(unsigned long long) le64_to_cpu(snap_id), snap_name);
3237b8b1e2dbSAlex Elder 	}
3238b8b1e2dbSAlex Elder 	kfree(reply_buf);
3239b8b1e2dbSAlex Elder 
3240b8b1e2dbSAlex Elder 	return snap_name;
3241b8b1e2dbSAlex Elder out:
3242b8b1e2dbSAlex Elder 	kfree(reply_buf);
3243b8b1e2dbSAlex Elder 
3244b8b1e2dbSAlex Elder 	return ERR_PTR(ret);
3245b8b1e2dbSAlex Elder }
3246b8b1e2dbSAlex Elder 
3247b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
3248b8b1e2dbSAlex Elder 		u64 *snap_size, u64 *snap_features)
3249b8b1e2dbSAlex Elder {
3250e0b49868SAlex Elder 	u64 snap_id;
3251b8b1e2dbSAlex Elder 	u8 order;
3252b8b1e2dbSAlex Elder 	int ret;
3253b8b1e2dbSAlex Elder 
3254b8b1e2dbSAlex Elder 	snap_id = rbd_dev->header.snapc->snaps[which];
3255b8b1e2dbSAlex Elder 	ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
3256b8b1e2dbSAlex Elder 	if (ret)
3257b8b1e2dbSAlex Elder 		return ERR_PTR(ret);
3258b8b1e2dbSAlex Elder 	ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
3259b8b1e2dbSAlex Elder 	if (ret)
3260b8b1e2dbSAlex Elder 		return ERR_PTR(ret);
3261b8b1e2dbSAlex Elder 
3262b8b1e2dbSAlex Elder 	return rbd_dev_v2_snap_name(rbd_dev, which);
3263b8b1e2dbSAlex Elder }
3264b8b1e2dbSAlex Elder 
3265b8b1e2dbSAlex Elder static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
3266b8b1e2dbSAlex Elder 		u64 *snap_size, u64 *snap_features)
3267b8b1e2dbSAlex Elder {
3268b8b1e2dbSAlex Elder 	if (rbd_dev->image_format == 1)
3269b8b1e2dbSAlex Elder 		return rbd_dev_v1_snap_info(rbd_dev, which,
3270b8b1e2dbSAlex Elder 					snap_size, snap_features);
3271b8b1e2dbSAlex Elder 	if (rbd_dev->image_format == 2)
3272b8b1e2dbSAlex Elder 		return rbd_dev_v2_snap_info(rbd_dev, which,
3273b8b1e2dbSAlex Elder 					snap_size, snap_features);
3274b8b1e2dbSAlex Elder 	return ERR_PTR(-EINVAL);
3275b8b1e2dbSAlex Elder }
3276b8b1e2dbSAlex Elder 
3277117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
3278117973fbSAlex Elder {
3279117973fbSAlex Elder 	int ret;
3280117973fbSAlex Elder 	__u8 obj_order;
3281117973fbSAlex Elder 
3282117973fbSAlex Elder 	down_write(&rbd_dev->header_rwsem);
3283117973fbSAlex Elder 
3284117973fbSAlex Elder 	/* Grab old order first, to see if it changes */
3285117973fbSAlex Elder 
3286117973fbSAlex Elder 	obj_order = rbd_dev->header.obj_order,
3287117973fbSAlex Elder 	ret = rbd_dev_v2_image_size(rbd_dev);
3288117973fbSAlex Elder 	if (ret)
3289117973fbSAlex Elder 		goto out;
3290117973fbSAlex Elder 	if (rbd_dev->header.obj_order != obj_order) {
3291117973fbSAlex Elder 		ret = -EIO;
3292117973fbSAlex Elder 		goto out;
3293117973fbSAlex Elder 	}
3294117973fbSAlex Elder 	rbd_update_mapping_size(rbd_dev);
3295117973fbSAlex Elder 
3296117973fbSAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev, hver);
3297117973fbSAlex Elder 	dout("rbd_dev_v2_snap_context returned %d\n", ret);
3298117973fbSAlex Elder 	if (ret)
3299117973fbSAlex Elder 		goto out;
3300117973fbSAlex Elder 	ret = rbd_dev_snaps_update(rbd_dev);
3301117973fbSAlex Elder 	dout("rbd_dev_snaps_update returned %d\n", ret);
3302117973fbSAlex Elder 	if (ret)
3303117973fbSAlex Elder 		goto out;
3304117973fbSAlex Elder 	ret = rbd_dev_snaps_register(rbd_dev);
3305117973fbSAlex Elder 	dout("rbd_dev_snaps_register returned %d\n", ret);
3306117973fbSAlex Elder out:
3307117973fbSAlex Elder 	up_write(&rbd_dev->header_rwsem);
3308117973fbSAlex Elder 
3309117973fbSAlex Elder 	return ret;
3310117973fbSAlex Elder }
3311117973fbSAlex Elder 
33129d475de5SAlex Elder /*
331335938150SAlex Elder  * Scan the rbd device's current snapshot list and compare it to the
331435938150SAlex Elder  * newly-received snapshot context.  Remove any existing snapshots
331535938150SAlex Elder  * not present in the new snapshot context.  Add a new snapshot for
331635938150SAlex Elder  * any snaphots in the snapshot context not in the current list.
331735938150SAlex Elder  * And verify there are no changes to snapshots we already know
331835938150SAlex Elder  * about.
331935938150SAlex Elder  *
332035938150SAlex Elder  * Assumes the snapshots in the snapshot context are sorted by
332135938150SAlex Elder  * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
332235938150SAlex Elder  * are also maintained in that order.)
3323dfc5606dSYehuda Sadeh  */
3324304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
3325dfc5606dSYehuda Sadeh {
332635938150SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
332735938150SAlex Elder 	const u32 snap_count = snapc->num_snaps;
332835938150SAlex Elder 	struct list_head *head = &rbd_dev->snaps;
332935938150SAlex Elder 	struct list_head *links = head->next;
333035938150SAlex Elder 	u32 index = 0;
3331dfc5606dSYehuda Sadeh 
33329fcbb800SAlex Elder 	dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
333335938150SAlex Elder 	while (index < snap_count || links != head) {
333435938150SAlex Elder 		u64 snap_id;
333535938150SAlex Elder 		struct rbd_snap *snap;
3336cd892126SAlex Elder 		char *snap_name;
3337cd892126SAlex Elder 		u64 snap_size = 0;
3338cd892126SAlex Elder 		u64 snap_features = 0;
3339dfc5606dSYehuda Sadeh 
334035938150SAlex Elder 		snap_id = index < snap_count ? snapc->snaps[index]
334135938150SAlex Elder 					     : CEPH_NOSNAP;
334235938150SAlex Elder 		snap = links != head ? list_entry(links, struct rbd_snap, node)
334335938150SAlex Elder 				     : NULL;
3344aafb230eSAlex Elder 		rbd_assert(!snap || snap->id != CEPH_NOSNAP);
3345dfc5606dSYehuda Sadeh 
334635938150SAlex Elder 		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
334735938150SAlex Elder 			struct list_head *next = links->next;
3348dfc5606dSYehuda Sadeh 
33496d292906SAlex Elder 			/*
33506d292906SAlex Elder 			 * A previously-existing snapshot is not in
33516d292906SAlex Elder 			 * the new snap context.
33526d292906SAlex Elder 			 *
33536d292906SAlex Elder 			 * If the now missing snapshot is the one the
33546d292906SAlex Elder 			 * image is mapped to, clear its exists flag
33556d292906SAlex Elder 			 * so we can avoid sending any more requests
33566d292906SAlex Elder 			 * to it.
33576d292906SAlex Elder 			 */
33580d7dbfceSAlex Elder 			if (rbd_dev->spec->snap_id == snap->id)
33596d292906SAlex Elder 				clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
336041f38c2bSAlex Elder 			rbd_remove_snap_dev(snap);
33619fcbb800SAlex Elder 			dout("%ssnap id %llu has been removed\n",
33620d7dbfceSAlex Elder 				rbd_dev->spec->snap_id == snap->id ?
33630d7dbfceSAlex Elder 							"mapped " : "",
33649fcbb800SAlex Elder 				(unsigned long long) snap->id);
3365dfc5606dSYehuda Sadeh 
336635938150SAlex Elder 			/* Done with this list entry; advance */
336735938150SAlex Elder 
336835938150SAlex Elder 			links = next;
336935938150SAlex Elder 			continue;
3370dfc5606dSYehuda Sadeh 		}
337135938150SAlex Elder 
3372b8b1e2dbSAlex Elder 		snap_name = rbd_dev_snap_info(rbd_dev, index,
3373cd892126SAlex Elder 					&snap_size, &snap_features);
3374cd892126SAlex Elder 		if (IS_ERR(snap_name))
3375cd892126SAlex Elder 			return PTR_ERR(snap_name);
3376cd892126SAlex Elder 
33779fcbb800SAlex Elder 		dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
33789fcbb800SAlex Elder 			(unsigned long long) snap_id);
337935938150SAlex Elder 		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
338035938150SAlex Elder 			struct rbd_snap *new_snap;
338135938150SAlex Elder 
338235938150SAlex Elder 			/* We haven't seen this snapshot before */
338335938150SAlex Elder 
3384c8d18425SAlex Elder 			new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
3385cd892126SAlex Elder 					snap_id, snap_size, snap_features);
33869fcbb800SAlex Elder 			if (IS_ERR(new_snap)) {
33879fcbb800SAlex Elder 				int err = PTR_ERR(new_snap);
33889fcbb800SAlex Elder 
33899fcbb800SAlex Elder 				dout("  failed to add dev, error %d\n", err);
33909fcbb800SAlex Elder 
33919fcbb800SAlex Elder 				return err;
33929fcbb800SAlex Elder 			}
339335938150SAlex Elder 
339435938150SAlex Elder 			/* New goes before existing, or at end of list */
339535938150SAlex Elder 
33969fcbb800SAlex Elder 			dout("  added dev%s\n", snap ? "" : " at end\n");
339735938150SAlex Elder 			if (snap)
339835938150SAlex Elder 				list_add_tail(&new_snap->node, &snap->node);
339935938150SAlex Elder 			else
3400523f3258SAlex Elder 				list_add_tail(&new_snap->node, head);
340135938150SAlex Elder 		} else {
340235938150SAlex Elder 			/* Already have this one */
340335938150SAlex Elder 
34049fcbb800SAlex Elder 			dout("  already present\n");
34059fcbb800SAlex Elder 
3406cd892126SAlex Elder 			rbd_assert(snap->size == snap_size);
3407aafb230eSAlex Elder 			rbd_assert(!strcmp(snap->name, snap_name));
3408cd892126SAlex Elder 			rbd_assert(snap->features == snap_features);
340935938150SAlex Elder 
341035938150SAlex Elder 			/* Done with this list entry; advance */
341135938150SAlex Elder 
341235938150SAlex Elder 			links = links->next;
3413dfc5606dSYehuda Sadeh 		}
341435938150SAlex Elder 
341535938150SAlex Elder 		/* Advance to the next entry in the snapshot context */
341635938150SAlex Elder 
341735938150SAlex Elder 		index++;
3418dfc5606dSYehuda Sadeh 	}
34199fcbb800SAlex Elder 	dout("%s: done\n", __func__);
3420dfc5606dSYehuda Sadeh 
3421dfc5606dSYehuda Sadeh 	return 0;
3422dfc5606dSYehuda Sadeh }
3423dfc5606dSYehuda Sadeh 
3424304f6808SAlex Elder /*
3425304f6808SAlex Elder  * Scan the list of snapshots and register the devices for any that
3426304f6808SAlex Elder  * have not already been registered.
3427304f6808SAlex Elder  */
3428304f6808SAlex Elder static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
3429304f6808SAlex Elder {
3430304f6808SAlex Elder 	struct rbd_snap *snap;
3431304f6808SAlex Elder 	int ret = 0;
3432304f6808SAlex Elder 
343337206ee5SAlex Elder 	dout("%s:\n", __func__);
343486ff77bbSAlex Elder 	if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
343586ff77bbSAlex Elder 		return -EIO;
3436304f6808SAlex Elder 
3437304f6808SAlex Elder 	list_for_each_entry(snap, &rbd_dev->snaps, node) {
3438304f6808SAlex Elder 		if (!rbd_snap_registered(snap)) {
3439304f6808SAlex Elder 			ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
3440304f6808SAlex Elder 			if (ret < 0)
3441304f6808SAlex Elder 				break;
3442304f6808SAlex Elder 		}
3443304f6808SAlex Elder 	}
3444304f6808SAlex Elder 	dout("%s: returning %d\n", __func__, ret);
3445304f6808SAlex Elder 
3446304f6808SAlex Elder 	return ret;
3447304f6808SAlex Elder }
3448304f6808SAlex Elder 
3449dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
3450dfc5606dSYehuda Sadeh {
3451dfc5606dSYehuda Sadeh 	struct device *dev;
3452cd789ab9SAlex Elder 	int ret;
3453dfc5606dSYehuda Sadeh 
3454dfc5606dSYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3455dfc5606dSYehuda Sadeh 
3456cd789ab9SAlex Elder 	dev = &rbd_dev->dev;
3457dfc5606dSYehuda Sadeh 	dev->bus = &rbd_bus_type;
3458dfc5606dSYehuda Sadeh 	dev->type = &rbd_device_type;
3459dfc5606dSYehuda Sadeh 	dev->parent = &rbd_root_dev;
3460dfc5606dSYehuda Sadeh 	dev->release = rbd_dev_release;
3461de71a297SAlex Elder 	dev_set_name(dev, "%d", rbd_dev->dev_id);
3462dfc5606dSYehuda Sadeh 	ret = device_register(dev);
3463dfc5606dSYehuda Sadeh 
3464dfc5606dSYehuda Sadeh 	mutex_unlock(&ctl_mutex);
3465cd789ab9SAlex Elder 
3466dfc5606dSYehuda Sadeh 	return ret;
3467602adf40SYehuda Sadeh }
3468602adf40SYehuda Sadeh 
3469dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
3470dfc5606dSYehuda Sadeh {
3471dfc5606dSYehuda Sadeh 	device_unregister(&rbd_dev->dev);
3472dfc5606dSYehuda Sadeh }
3473dfc5606dSYehuda Sadeh 
3474e2839308SAlex Elder static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
34751ddbe94eSAlex Elder 
34761ddbe94eSAlex Elder /*
3477499afd5bSAlex Elder  * Get a unique rbd identifier for the given new rbd_dev, and add
3478499afd5bSAlex Elder  * the rbd_dev to the global list.  The minimum rbd id is 1.
34791ddbe94eSAlex Elder  */
3480e2839308SAlex Elder static void rbd_dev_id_get(struct rbd_device *rbd_dev)
3481b7f23c36SAlex Elder {
3482e2839308SAlex Elder 	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
3483499afd5bSAlex Elder 
3484499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
3485499afd5bSAlex Elder 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
3486499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
3487e2839308SAlex Elder 	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
3488e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
3489b7f23c36SAlex Elder }
3490b7f23c36SAlex Elder 
34911ddbe94eSAlex Elder /*
3492499afd5bSAlex Elder  * Remove an rbd_dev from the global list, and record that its
3493499afd5bSAlex Elder  * identifier is no longer in use.
34941ddbe94eSAlex Elder  */
3495e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev)
34961ddbe94eSAlex Elder {
3497d184f6bfSAlex Elder 	struct list_head *tmp;
3498de71a297SAlex Elder 	int rbd_id = rbd_dev->dev_id;
3499d184f6bfSAlex Elder 	int max_id;
3500d184f6bfSAlex Elder 
3501aafb230eSAlex Elder 	rbd_assert(rbd_id > 0);
3502499afd5bSAlex Elder 
3503e2839308SAlex Elder 	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
3504e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
3505499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
3506499afd5bSAlex Elder 	list_del_init(&rbd_dev->node);
3507d184f6bfSAlex Elder 
3508d184f6bfSAlex Elder 	/*
3509d184f6bfSAlex Elder 	 * If the id being "put" is not the current maximum, there
3510d184f6bfSAlex Elder 	 * is nothing special we need to do.
3511d184f6bfSAlex Elder 	 */
3512e2839308SAlex Elder 	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
3513d184f6bfSAlex Elder 		spin_unlock(&rbd_dev_list_lock);
3514d184f6bfSAlex Elder 		return;
3515d184f6bfSAlex Elder 	}
3516d184f6bfSAlex Elder 
3517d184f6bfSAlex Elder 	/*
3518d184f6bfSAlex Elder 	 * We need to update the current maximum id.  Search the
3519d184f6bfSAlex Elder 	 * list to find out what it is.  We're more likely to find
3520d184f6bfSAlex Elder 	 * the maximum at the end, so search the list backward.
3521d184f6bfSAlex Elder 	 */
3522d184f6bfSAlex Elder 	max_id = 0;
3523d184f6bfSAlex Elder 	list_for_each_prev(tmp, &rbd_dev_list) {
3524d184f6bfSAlex Elder 		struct rbd_device *rbd_dev;
3525d184f6bfSAlex Elder 
3526d184f6bfSAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
3527b213e0b1SAlex Elder 		if (rbd_dev->dev_id > max_id)
3528b213e0b1SAlex Elder 			max_id = rbd_dev->dev_id;
3529d184f6bfSAlex Elder 	}
3530499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
35311ddbe94eSAlex Elder 
35321ddbe94eSAlex Elder 	/*
3533e2839308SAlex Elder 	 * The max id could have been updated by rbd_dev_id_get(), in
3534d184f6bfSAlex Elder 	 * which case it now accurately reflects the new maximum.
3535d184f6bfSAlex Elder 	 * Be careful not to overwrite the maximum value in that
3536d184f6bfSAlex Elder 	 * case.
35371ddbe94eSAlex Elder 	 */
3538e2839308SAlex Elder 	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
3539e2839308SAlex Elder 	dout("  max dev id has been reset\n");
3540b7f23c36SAlex Elder }
3541b7f23c36SAlex Elder 
/*
 * Advance *buf past any leading white space so that it points at
 * the first non-space character (or at the terminating '\0' if
 * there is none), and return the length of the token -- the run of
 * non-white space characters -- that starts there.  The string at
 * *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() returns nonzero in the
	 * "C" and "POSIX" locales.
	 */
	static const char whitespace[] = " \f\n\r\t\v";
	const char *start = *buf + strspn(*buf, whitespace);

	*buf = start;		/* Token (if any) begins here */

	return strcspn(start, whitespace);
}
3560e28fff26SAlex Elder 
/*
 * Locate the next token in *buf and, provided it fits (together
 * with a terminating '\0') in the supplied token buffer, copy it
 * there.  A copied token is always terminated with '\0'.  The
 * string at *buf must itself be terminated with '\0' on entry.
 *
 * Returns the length of the token found, not counting the '\0'.
 * A return value of 0 means no token was found; a return value
 * >= token_size means the token did not fit, in which case the
 * token buffer is left untouched.
 *
 * In every case *buf is left pointing just past the token found,
 * even when the token was too big to be copied.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	const size_t token_len = next_token(buf);

	if (token_len < token_size) {
		memcpy(token, *buf, token_len);
		token[token_len] = '\0';
	}
	*buf += token_len;

	return token_len;
}
3590e28fff26SAlex Elder 
3591e28fff26SAlex Elder /*
3592ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
3593ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
3594ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
3595ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
3596ea3352f4SAlex Elder  *
3597ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
3598ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
3599ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
3600ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
3601ea3352f4SAlex Elder  *
3602ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
3603ea3352f4SAlex Elder  * the end of the found token.
3604ea3352f4SAlex Elder  *
3605ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
3606ea3352f4SAlex Elder  */
3607ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
3608ea3352f4SAlex Elder {
3609ea3352f4SAlex Elder 	char *dup;
3610ea3352f4SAlex Elder 	size_t len;
3611ea3352f4SAlex Elder 
3612ea3352f4SAlex Elder 	len = next_token(buf);
36134caf35f9SAlex Elder 	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
3614ea3352f4SAlex Elder 	if (!dup)
3615ea3352f4SAlex Elder 		return NULL;
3616ea3352f4SAlex Elder 	*(dup + len) = '\0';
3617ea3352f4SAlex Elder 	*buf += len;
3618ea3352f4SAlex Elder 
3619ea3352f4SAlex Elder 	if (lenp)
3620ea3352f4SAlex Elder 		*lenp = len;
3621ea3352f4SAlex Elder 
3622ea3352f4SAlex Elder 	return dup;
3623ea3352f4SAlex Elder }
3624ea3352f4SAlex Elder 
3625ea3352f4SAlex Elder /*
3626859c31dfSAlex Elder  * Parse the options provided for an "rbd add" (i.e., rbd image
3627859c31dfSAlex Elder  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
3628859c31dfSAlex Elder  * and the data written is passed here via a NUL-terminated buffer.
3629859c31dfSAlex Elder  * Returns 0 if successful or an error code otherwise.
3630d22f76e7SAlex Elder  *
3631859c31dfSAlex Elder  * The information extracted from these options is recorded in
3632859c31dfSAlex Elder  * the other parameters which return dynamically-allocated
3633859c31dfSAlex Elder  * structures:
3634859c31dfSAlex Elder  *  ceph_opts
3635859c31dfSAlex Elder  *      The address of a pointer that will refer to a ceph options
3636859c31dfSAlex Elder  *      structure.  Caller must release the returned pointer using
3637859c31dfSAlex Elder  *      ceph_destroy_options() when it is no longer needed.
3638859c31dfSAlex Elder  *  rbd_opts
3639859c31dfSAlex Elder  *	Address of an rbd options pointer.  Fully initialized by
3640859c31dfSAlex Elder  *	this function; caller must release with kfree().
3641859c31dfSAlex Elder  *  spec
3642859c31dfSAlex Elder  *	Address of an rbd image specification pointer.  Fully
3643859c31dfSAlex Elder  *	initialized by this function based on parsed options.
3644859c31dfSAlex Elder  *	Caller must release with rbd_spec_put().
3645859c31dfSAlex Elder  *
3646859c31dfSAlex Elder  * The options passed take this form:
3647859c31dfSAlex Elder  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
3648859c31dfSAlex Elder  * where:
3649859c31dfSAlex Elder  *  <mon_addrs>
3650859c31dfSAlex Elder  *      A comma-separated list of one or more monitor addresses.
3651859c31dfSAlex Elder  *      A monitor address is an ip address, optionally followed
3652859c31dfSAlex Elder  *      by a port number (separated by a colon).
3653859c31dfSAlex Elder  *        I.e.:  ip1[:port1][,ip2[:port2]...]
3654859c31dfSAlex Elder  *  <options>
3655859c31dfSAlex Elder  *      A comma-separated list of ceph and/or rbd options.
3656859c31dfSAlex Elder  *  <pool_name>
3657859c31dfSAlex Elder  *      The name of the rados pool containing the rbd image.
3658859c31dfSAlex Elder  *  <image_name>
3659859c31dfSAlex Elder  *      The name of the image in that pool to map.
3660859c31dfSAlex Elder  *  <snap_id>
3661859c31dfSAlex Elder  *      An optional snapshot id.  If provided, the mapping will
3662859c31dfSAlex Elder  *      present data from the image at the time that snapshot was
3663859c31dfSAlex Elder  *      created.  The image head is used if no snapshot id is
3664859c31dfSAlex Elder  *      provided.  Snapshot mappings are always read-only.
3665a725f65eSAlex Elder  */
3666859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf,
3667dc79b113SAlex Elder 				struct ceph_options **ceph_opts,
3668859c31dfSAlex Elder 				struct rbd_options **opts,
3669859c31dfSAlex Elder 				struct rbd_spec **rbd_spec)
3670a725f65eSAlex Elder {
3671e28fff26SAlex Elder 	size_t len;
3672859c31dfSAlex Elder 	char *options;
36730ddebc0cSAlex Elder 	const char *mon_addrs;
36740ddebc0cSAlex Elder 	size_t mon_addrs_size;
3675859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
36764e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
3677859c31dfSAlex Elder 	struct ceph_options *copts;
3678dc79b113SAlex Elder 	int ret;
3679e28fff26SAlex Elder 
3680e28fff26SAlex Elder 	/* The first four tokens are required */
3681e28fff26SAlex Elder 
36827ef3214aSAlex Elder 	len = next_token(&buf);
36834fb5d671SAlex Elder 	if (!len) {
36844fb5d671SAlex Elder 		rbd_warn(NULL, "no monitor address(es) provided");
36854fb5d671SAlex Elder 		return -EINVAL;
36864fb5d671SAlex Elder 	}
36870ddebc0cSAlex Elder 	mon_addrs = buf;
3688f28e565aSAlex Elder 	mon_addrs_size = len + 1;
36897ef3214aSAlex Elder 	buf += len;
3690a725f65eSAlex Elder 
3691dc79b113SAlex Elder 	ret = -EINVAL;
3692f28e565aSAlex Elder 	options = dup_token(&buf, NULL);
3693f28e565aSAlex Elder 	if (!options)
3694dc79b113SAlex Elder 		return -ENOMEM;
36954fb5d671SAlex Elder 	if (!*options) {
36964fb5d671SAlex Elder 		rbd_warn(NULL, "no options provided");
36974fb5d671SAlex Elder 		goto out_err;
36984fb5d671SAlex Elder 	}
3699a725f65eSAlex Elder 
3700859c31dfSAlex Elder 	spec = rbd_spec_alloc();
3701859c31dfSAlex Elder 	if (!spec)
3702f28e565aSAlex Elder 		goto out_mem;
3703859c31dfSAlex Elder 
3704859c31dfSAlex Elder 	spec->pool_name = dup_token(&buf, NULL);
3705859c31dfSAlex Elder 	if (!spec->pool_name)
3706859c31dfSAlex Elder 		goto out_mem;
37074fb5d671SAlex Elder 	if (!*spec->pool_name) {
37084fb5d671SAlex Elder 		rbd_warn(NULL, "no pool name provided");
37094fb5d671SAlex Elder 		goto out_err;
37104fb5d671SAlex Elder 	}
3711e28fff26SAlex Elder 
371269e7a02fSAlex Elder 	spec->image_name = dup_token(&buf, NULL);
3713859c31dfSAlex Elder 	if (!spec->image_name)
3714f28e565aSAlex Elder 		goto out_mem;
37154fb5d671SAlex Elder 	if (!*spec->image_name) {
37164fb5d671SAlex Elder 		rbd_warn(NULL, "no image name provided");
37174fb5d671SAlex Elder 		goto out_err;
37184fb5d671SAlex Elder 	}
3719e28fff26SAlex Elder 
3720f28e565aSAlex Elder 	/*
3721f28e565aSAlex Elder 	 * Snapshot name is optional; default is to use "-"
3722f28e565aSAlex Elder 	 * (indicating the head/no snapshot).
3723f28e565aSAlex Elder 	 */
37243feeb894SAlex Elder 	len = next_token(&buf);
3725820a5f3eSAlex Elder 	if (!len) {
37263feeb894SAlex Elder 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
37273feeb894SAlex Elder 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
3728f28e565aSAlex Elder 	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
3729dc79b113SAlex Elder 		ret = -ENAMETOOLONG;
3730f28e565aSAlex Elder 		goto out_err;
3731849b4260SAlex Elder 	}
37324caf35f9SAlex Elder 	spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
3733859c31dfSAlex Elder 	if (!spec->snap_name)
3734f28e565aSAlex Elder 		goto out_mem;
3735859c31dfSAlex Elder 	*(spec->snap_name + len) = '\0';
3736e5c35534SAlex Elder 
37370ddebc0cSAlex Elder 	/* Initialize all rbd options to the defaults */
3738e28fff26SAlex Elder 
37394e9afebaSAlex Elder 	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
37404e9afebaSAlex Elder 	if (!rbd_opts)
37414e9afebaSAlex Elder 		goto out_mem;
37424e9afebaSAlex Elder 
37434e9afebaSAlex Elder 	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
3744d22f76e7SAlex Elder 
3745859c31dfSAlex Elder 	copts = ceph_parse_options(options, mon_addrs,
37460ddebc0cSAlex Elder 					mon_addrs + mon_addrs_size - 1,
37474e9afebaSAlex Elder 					parse_rbd_opts_token, rbd_opts);
3748859c31dfSAlex Elder 	if (IS_ERR(copts)) {
3749859c31dfSAlex Elder 		ret = PTR_ERR(copts);
3750dc79b113SAlex Elder 		goto out_err;
3751dc79b113SAlex Elder 	}
3752859c31dfSAlex Elder 	kfree(options);
3753859c31dfSAlex Elder 
3754859c31dfSAlex Elder 	*ceph_opts = copts;
37554e9afebaSAlex Elder 	*opts = rbd_opts;
3756859c31dfSAlex Elder 	*rbd_spec = spec;
37570ddebc0cSAlex Elder 
3758dc79b113SAlex Elder 	return 0;
3759f28e565aSAlex Elder out_mem:
3760dc79b113SAlex Elder 	ret = -ENOMEM;
3761d22f76e7SAlex Elder out_err:
3762859c31dfSAlex Elder 	kfree(rbd_opts);
3763859c31dfSAlex Elder 	rbd_spec_put(spec);
3764f28e565aSAlex Elder 	kfree(options);
3765d22f76e7SAlex Elder 
3766dc79b113SAlex Elder 	return ret;
3767a725f65eSAlex Elder }
3768a725f65eSAlex Elder 
3769589d30e0SAlex Elder /*
3770589d30e0SAlex Elder  * An rbd format 2 image has a unique identifier, distinct from the
3771589d30e0SAlex Elder  * name given to it by the user.  Internally, that identifier is
3772589d30e0SAlex Elder  * what's used to specify the names of objects related to the image.
3773589d30e0SAlex Elder  *
3774589d30e0SAlex Elder  * A special "rbd id" object is used to map an rbd image name to its
3775589d30e0SAlex Elder  * id.  If that object doesn't exist, then there is no v2 rbd image
3776589d30e0SAlex Elder  * with the supplied name.
3777589d30e0SAlex Elder  *
3778589d30e0SAlex Elder  * This function will record the given rbd_dev's image_id field if
3779589d30e0SAlex Elder  * it can be determined, and in that case will return 0.  If any
3780589d30e0SAlex Elder  * errors occur a negative errno will be returned and the rbd_dev's
3781589d30e0SAlex Elder  * image_id field will be unchanged (and should be NULL).
3782589d30e0SAlex Elder  */
3783589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev)
3784589d30e0SAlex Elder {
3785589d30e0SAlex Elder 	int ret;
3786589d30e0SAlex Elder 	size_t size;
3787589d30e0SAlex Elder 	char *object_name;
3788589d30e0SAlex Elder 	void *response;
3789589d30e0SAlex Elder 	void *p;
3790589d30e0SAlex Elder 
3791589d30e0SAlex Elder 	/*
37922c0d0a10SAlex Elder 	 * When probing a parent image, the image id is already
37932c0d0a10SAlex Elder 	 * known (and the image name likely is not).  There's no
37942c0d0a10SAlex Elder 	 * need to fetch the image id again in this case.
37952c0d0a10SAlex Elder 	 */
37962c0d0a10SAlex Elder 	if (rbd_dev->spec->image_id)
37972c0d0a10SAlex Elder 		return 0;
37982c0d0a10SAlex Elder 
37992c0d0a10SAlex Elder 	/*
3800589d30e0SAlex Elder 	 * First, see if the format 2 image id file exists, and if
3801589d30e0SAlex Elder 	 * so, get the image's persistent id from it.
3802589d30e0SAlex Elder 	 */
380369e7a02fSAlex Elder 	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
3804589d30e0SAlex Elder 	object_name = kmalloc(size, GFP_NOIO);
3805589d30e0SAlex Elder 	if (!object_name)
3806589d30e0SAlex Elder 		return -ENOMEM;
38070d7dbfceSAlex Elder 	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
3808589d30e0SAlex Elder 	dout("rbd id object name is %s\n", object_name);
3809589d30e0SAlex Elder 
3810589d30e0SAlex Elder 	/* Response will be an encoded string, which includes a length */
3811589d30e0SAlex Elder 
3812589d30e0SAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
3813589d30e0SAlex Elder 	response = kzalloc(size, GFP_NOIO);
3814589d30e0SAlex Elder 	if (!response) {
3815589d30e0SAlex Elder 		ret = -ENOMEM;
3816589d30e0SAlex Elder 		goto out;
3817589d30e0SAlex Elder 	}
3818589d30e0SAlex Elder 
381936be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, object_name,
3820589d30e0SAlex Elder 				"rbd", "get_id",
3821589d30e0SAlex Elder 				NULL, 0,
382207b2391fSAlex Elder 				response, RBD_IMAGE_ID_LEN_MAX, NULL);
382336be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3824589d30e0SAlex Elder 	if (ret < 0)
3825589d30e0SAlex Elder 		goto out;
3826589d30e0SAlex Elder 
3827589d30e0SAlex Elder 	p = response;
38280d7dbfceSAlex Elder 	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
3829589d30e0SAlex Elder 						p + RBD_IMAGE_ID_LEN_MAX,
3830979ed480SAlex Elder 						NULL, GFP_NOIO);
38310d7dbfceSAlex Elder 	if (IS_ERR(rbd_dev->spec->image_id)) {
38320d7dbfceSAlex Elder 		ret = PTR_ERR(rbd_dev->spec->image_id);
38330d7dbfceSAlex Elder 		rbd_dev->spec->image_id = NULL;
3834589d30e0SAlex Elder 	} else {
38350d7dbfceSAlex Elder 		dout("image_id is %s\n", rbd_dev->spec->image_id);
3836589d30e0SAlex Elder 	}
3837589d30e0SAlex Elder out:
3838589d30e0SAlex Elder 	kfree(response);
3839589d30e0SAlex Elder 	kfree(object_name);
3840589d30e0SAlex Elder 
3841589d30e0SAlex Elder 	return ret;
3842589d30e0SAlex Elder }
3843589d30e0SAlex Elder 
3844a30b71b9SAlex Elder static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
3845a30b71b9SAlex Elder {
3846a30b71b9SAlex Elder 	int ret;
3847a30b71b9SAlex Elder 	size_t size;
3848a30b71b9SAlex Elder 
3849a30b71b9SAlex Elder 	/* Version 1 images have no id; empty string is used */
3850a30b71b9SAlex Elder 
38510d7dbfceSAlex Elder 	rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
38520d7dbfceSAlex Elder 	if (!rbd_dev->spec->image_id)
3853a30b71b9SAlex Elder 		return -ENOMEM;
3854a30b71b9SAlex Elder 
3855a30b71b9SAlex Elder 	/* Record the header object name for this rbd image. */
3856a30b71b9SAlex Elder 
385769e7a02fSAlex Elder 	size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
3858a30b71b9SAlex Elder 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3859a30b71b9SAlex Elder 	if (!rbd_dev->header_name) {
3860a30b71b9SAlex Elder 		ret = -ENOMEM;
3861a30b71b9SAlex Elder 		goto out_err;
3862a30b71b9SAlex Elder 	}
38630d7dbfceSAlex Elder 	sprintf(rbd_dev->header_name, "%s%s",
38640d7dbfceSAlex Elder 		rbd_dev->spec->image_name, RBD_SUFFIX);
3865a30b71b9SAlex Elder 
3866a30b71b9SAlex Elder 	/* Populate rbd image metadata */
3867a30b71b9SAlex Elder 
3868a30b71b9SAlex Elder 	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
3869a30b71b9SAlex Elder 	if (ret < 0)
3870a30b71b9SAlex Elder 		goto out_err;
387186b00e0dSAlex Elder 
387286b00e0dSAlex Elder 	/* Version 1 images have no parent (no layering) */
387386b00e0dSAlex Elder 
387486b00e0dSAlex Elder 	rbd_dev->parent_spec = NULL;
387586b00e0dSAlex Elder 	rbd_dev->parent_overlap = 0;
387686b00e0dSAlex Elder 
3877a30b71b9SAlex Elder 	rbd_dev->image_format = 1;
3878a30b71b9SAlex Elder 
3879a30b71b9SAlex Elder 	dout("discovered version 1 image, header name is %s\n",
3880a30b71b9SAlex Elder 		rbd_dev->header_name);
3881a30b71b9SAlex Elder 
3882a30b71b9SAlex Elder 	return 0;
3883a30b71b9SAlex Elder 
3884a30b71b9SAlex Elder out_err:
3885a30b71b9SAlex Elder 	kfree(rbd_dev->header_name);
3886a30b71b9SAlex Elder 	rbd_dev->header_name = NULL;
38870d7dbfceSAlex Elder 	kfree(rbd_dev->spec->image_id);
38880d7dbfceSAlex Elder 	rbd_dev->spec->image_id = NULL;
3889a30b71b9SAlex Elder 
3890a30b71b9SAlex Elder 	return ret;
3891a30b71b9SAlex Elder }
3892a30b71b9SAlex Elder 
3893a30b71b9SAlex Elder static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
3894a30b71b9SAlex Elder {
3895a30b71b9SAlex Elder 	size_t size;
38969d475de5SAlex Elder 	int ret;
38976e14b1a6SAlex Elder 	u64 ver = 0;
3898a30b71b9SAlex Elder 
3899a30b71b9SAlex Elder 	/*
3900a30b71b9SAlex Elder 	 * Image id was filled in by the caller.  Record the header
3901a30b71b9SAlex Elder 	 * object name for this rbd image.
3902a30b71b9SAlex Elder 	 */
3903979ed480SAlex Elder 	size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
3904a30b71b9SAlex Elder 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3905a30b71b9SAlex Elder 	if (!rbd_dev->header_name)
3906a30b71b9SAlex Elder 		return -ENOMEM;
3907a30b71b9SAlex Elder 	sprintf(rbd_dev->header_name, "%s%s",
39080d7dbfceSAlex Elder 			RBD_HEADER_PREFIX, rbd_dev->spec->image_id);
39099d475de5SAlex Elder 
39109d475de5SAlex Elder 	/* Get the size and object order for the image */
39119d475de5SAlex Elder 
39129d475de5SAlex Elder 	ret = rbd_dev_v2_image_size(rbd_dev);
39139d475de5SAlex Elder 	if (ret < 0)
39149d475de5SAlex Elder 		goto out_err;
39151e130199SAlex Elder 
39161e130199SAlex Elder 	/* Get the object prefix (a.k.a. block_name) for the image */
39171e130199SAlex Elder 
39181e130199SAlex Elder 	ret = rbd_dev_v2_object_prefix(rbd_dev);
39191e130199SAlex Elder 	if (ret < 0)
39201e130199SAlex Elder 		goto out_err;
3921b1b5402aSAlex Elder 
3922d889140cSAlex Elder 	/* Get the and check features for the image */
3923b1b5402aSAlex Elder 
3924b1b5402aSAlex Elder 	ret = rbd_dev_v2_features(rbd_dev);
3925b1b5402aSAlex Elder 	if (ret < 0)
3926b1b5402aSAlex Elder 		goto out_err;
392735d489f9SAlex Elder 
392886b00e0dSAlex Elder 	/* If the image supports layering, get the parent info */
392986b00e0dSAlex Elder 
393086b00e0dSAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
393186b00e0dSAlex Elder 		ret = rbd_dev_v2_parent_info(rbd_dev);
393286b00e0dSAlex Elder 		if (ret < 0)
393386b00e0dSAlex Elder 			goto out_err;
393486b00e0dSAlex Elder 	}
393586b00e0dSAlex Elder 
39366e14b1a6SAlex Elder 	/* crypto and compression type aren't (yet) supported for v2 images */
393735d489f9SAlex Elder 
39386e14b1a6SAlex Elder 	rbd_dev->header.crypt_type = 0;
39396e14b1a6SAlex Elder 	rbd_dev->header.comp_type = 0;
39406e14b1a6SAlex Elder 
39416e14b1a6SAlex Elder 	/* Get the snapshot context, plus the header version */
39426e14b1a6SAlex Elder 
39436e14b1a6SAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
394435d489f9SAlex Elder 	if (ret)
394535d489f9SAlex Elder 		goto out_err;
39466e14b1a6SAlex Elder 	rbd_dev->header.obj_version = ver;
39476e14b1a6SAlex Elder 
3948a30b71b9SAlex Elder 	rbd_dev->image_format = 2;
3949a30b71b9SAlex Elder 
3950a30b71b9SAlex Elder 	dout("discovered version 2 image, header name is %s\n",
3951a30b71b9SAlex Elder 		rbd_dev->header_name);
3952a30b71b9SAlex Elder 
395335152979SAlex Elder 	return 0;
39549d475de5SAlex Elder out_err:
395586b00e0dSAlex Elder 	rbd_dev->parent_overlap = 0;
395686b00e0dSAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
395786b00e0dSAlex Elder 	rbd_dev->parent_spec = NULL;
39589d475de5SAlex Elder 	kfree(rbd_dev->header_name);
39599d475de5SAlex Elder 	rbd_dev->header_name = NULL;
39601e130199SAlex Elder 	kfree(rbd_dev->header.object_prefix);
39611e130199SAlex Elder 	rbd_dev->header.object_prefix = NULL;
39629d475de5SAlex Elder 
39639d475de5SAlex Elder 	return ret;
3964a30b71b9SAlex Elder }
3965a30b71b9SAlex Elder 
396683a06263SAlex Elder static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
396783a06263SAlex Elder {
396883a06263SAlex Elder 	int ret;
396983a06263SAlex Elder 
397083a06263SAlex Elder 	/* no need to lock here, as rbd_dev is not registered yet */
397183a06263SAlex Elder 	ret = rbd_dev_snaps_update(rbd_dev);
397283a06263SAlex Elder 	if (ret)
397383a06263SAlex Elder 		return ret;
397483a06263SAlex Elder 
39759e15b77dSAlex Elder 	ret = rbd_dev_probe_update_spec(rbd_dev);
39769e15b77dSAlex Elder 	if (ret)
39779e15b77dSAlex Elder 		goto err_out_snaps;
39789e15b77dSAlex Elder 
397983a06263SAlex Elder 	ret = rbd_dev_set_mapping(rbd_dev);
398083a06263SAlex Elder 	if (ret)
398183a06263SAlex Elder 		goto err_out_snaps;
398283a06263SAlex Elder 
398383a06263SAlex Elder 	/* generate unique id: find highest unique id, add one */
398483a06263SAlex Elder 	rbd_dev_id_get(rbd_dev);
398583a06263SAlex Elder 
398683a06263SAlex Elder 	/* Fill in the device name, now that we have its id. */
398783a06263SAlex Elder 	BUILD_BUG_ON(DEV_NAME_LEN
398883a06263SAlex Elder 			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
398983a06263SAlex Elder 	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
399083a06263SAlex Elder 
399183a06263SAlex Elder 	/* Get our block major device number. */
399283a06263SAlex Elder 
399383a06263SAlex Elder 	ret = register_blkdev(0, rbd_dev->name);
399483a06263SAlex Elder 	if (ret < 0)
399583a06263SAlex Elder 		goto err_out_id;
399683a06263SAlex Elder 	rbd_dev->major = ret;
399783a06263SAlex Elder 
399883a06263SAlex Elder 	/* Set up the blkdev mapping. */
399983a06263SAlex Elder 
400083a06263SAlex Elder 	ret = rbd_init_disk(rbd_dev);
400183a06263SAlex Elder 	if (ret)
400283a06263SAlex Elder 		goto err_out_blkdev;
400383a06263SAlex Elder 
400483a06263SAlex Elder 	ret = rbd_bus_add_dev(rbd_dev);
400583a06263SAlex Elder 	if (ret)
400683a06263SAlex Elder 		goto err_out_disk;
400783a06263SAlex Elder 
400883a06263SAlex Elder 	/*
400983a06263SAlex Elder 	 * At this point cleanup in the event of an error is the job
401083a06263SAlex Elder 	 * of the sysfs code (initiated by rbd_bus_del_dev()).
401183a06263SAlex Elder 	 */
401283a06263SAlex Elder 	down_write(&rbd_dev->header_rwsem);
401383a06263SAlex Elder 	ret = rbd_dev_snaps_register(rbd_dev);
401483a06263SAlex Elder 	up_write(&rbd_dev->header_rwsem);
401583a06263SAlex Elder 	if (ret)
401683a06263SAlex Elder 		goto err_out_bus;
401783a06263SAlex Elder 
40189969ebc5SAlex Elder 	ret = rbd_dev_header_watch_sync(rbd_dev, 1);
401983a06263SAlex Elder 	if (ret)
402083a06263SAlex Elder 		goto err_out_bus;
402183a06263SAlex Elder 
402283a06263SAlex Elder 	/* Everything's ready.  Announce the disk to the world. */
402383a06263SAlex Elder 
402483a06263SAlex Elder 	add_disk(rbd_dev->disk);
402583a06263SAlex Elder 
402683a06263SAlex Elder 	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
402783a06263SAlex Elder 		(unsigned long long) rbd_dev->mapping.size);
402883a06263SAlex Elder 
402983a06263SAlex Elder 	return ret;
403083a06263SAlex Elder err_out_bus:
403183a06263SAlex Elder 	/* this will also clean up rest of rbd_dev stuff */
403283a06263SAlex Elder 
403383a06263SAlex Elder 	rbd_bus_del_dev(rbd_dev);
403483a06263SAlex Elder 
403583a06263SAlex Elder 	return ret;
403683a06263SAlex Elder err_out_disk:
403783a06263SAlex Elder 	rbd_free_disk(rbd_dev);
403883a06263SAlex Elder err_out_blkdev:
403983a06263SAlex Elder 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
404083a06263SAlex Elder err_out_id:
404183a06263SAlex Elder 	rbd_dev_id_put(rbd_dev);
404283a06263SAlex Elder err_out_snaps:
404383a06263SAlex Elder 	rbd_remove_all_snaps(rbd_dev);
404483a06263SAlex Elder 
404583a06263SAlex Elder 	return ret;
404683a06263SAlex Elder }
404783a06263SAlex Elder 
4048a30b71b9SAlex Elder /*
4049a30b71b9SAlex Elder  * Probe for the existence of the header object for the given rbd
4050a30b71b9SAlex Elder  * device.  For format 2 images this includes determining the image
4051a30b71b9SAlex Elder  * id.
4052a30b71b9SAlex Elder  */
4053a30b71b9SAlex Elder static int rbd_dev_probe(struct rbd_device *rbd_dev)
4054a30b71b9SAlex Elder {
4055a30b71b9SAlex Elder 	int ret;
4056a30b71b9SAlex Elder 
4057a30b71b9SAlex Elder 	/*
4058a30b71b9SAlex Elder 	 * Get the id from the image id object.  If it's not a
4059a30b71b9SAlex Elder 	 * format 2 image, we'll get ENOENT back, and we'll assume
4060a30b71b9SAlex Elder 	 * it's a format 1 image.
4061a30b71b9SAlex Elder 	 */
4062a30b71b9SAlex Elder 	ret = rbd_dev_image_id(rbd_dev);
4063a30b71b9SAlex Elder 	if (ret)
4064a30b71b9SAlex Elder 		ret = rbd_dev_v1_probe(rbd_dev);
4065a30b71b9SAlex Elder 	else
4066a30b71b9SAlex Elder 		ret = rbd_dev_v2_probe(rbd_dev);
406783a06263SAlex Elder 	if (ret) {
4068a30b71b9SAlex Elder 		dout("probe failed, returning %d\n", ret);
4069a30b71b9SAlex Elder 
4070a30b71b9SAlex Elder 		return ret;
4071a30b71b9SAlex Elder 	}
4072a30b71b9SAlex Elder 
407383a06263SAlex Elder 	ret = rbd_dev_probe_finish(rbd_dev);
407483a06263SAlex Elder 	if (ret)
407583a06263SAlex Elder 		rbd_header_free(&rbd_dev->header);
407683a06263SAlex Elder 
407783a06263SAlex Elder 	return ret;
407883a06263SAlex Elder }
407983a06263SAlex Elder 
408059c2be1eSYehuda Sadeh static ssize_t rbd_add(struct bus_type *bus,
408159c2be1eSYehuda Sadeh 		       const char *buf,
408259c2be1eSYehuda Sadeh 		       size_t count)
4083602adf40SYehuda Sadeh {
4084cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
4085dc79b113SAlex Elder 	struct ceph_options *ceph_opts = NULL;
40864e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
4087859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
40889d3997fdSAlex Elder 	struct rbd_client *rbdc;
408927cc2594SAlex Elder 	struct ceph_osd_client *osdc;
409027cc2594SAlex Elder 	int rc = -ENOMEM;
4091602adf40SYehuda Sadeh 
4092602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
4093602adf40SYehuda Sadeh 		return -ENODEV;
4094602adf40SYehuda Sadeh 
4095a725f65eSAlex Elder 	/* parse add command */
4096859c31dfSAlex Elder 	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
4097dc79b113SAlex Elder 	if (rc < 0)
4098bd4ba655SAlex Elder 		goto err_out_module;
4099a725f65eSAlex Elder 
41009d3997fdSAlex Elder 	rbdc = rbd_get_client(ceph_opts);
41019d3997fdSAlex Elder 	if (IS_ERR(rbdc)) {
41029d3997fdSAlex Elder 		rc = PTR_ERR(rbdc);
41030ddebc0cSAlex Elder 		goto err_out_args;
41049d3997fdSAlex Elder 	}
4105c53d5893SAlex Elder 	ceph_opts = NULL;	/* rbd_dev client now owns this */
4106602adf40SYehuda Sadeh 
4107602adf40SYehuda Sadeh 	/* pick the pool */
41089d3997fdSAlex Elder 	osdc = &rbdc->client->osdc;
4109859c31dfSAlex Elder 	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
4110602adf40SYehuda Sadeh 	if (rc < 0)
4111602adf40SYehuda Sadeh 		goto err_out_client;
4112859c31dfSAlex Elder 	spec->pool_id = (u64) rc;
4113859c31dfSAlex Elder 
41140903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
41150903e875SAlex Elder 
41160903e875SAlex Elder 	if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
41170903e875SAlex Elder 		rc = -EIO;
41180903e875SAlex Elder 		goto err_out_client;
41190903e875SAlex Elder 	}
41200903e875SAlex Elder 
4121c53d5893SAlex Elder 	rbd_dev = rbd_dev_create(rbdc, spec);
4122bd4ba655SAlex Elder 	if (!rbd_dev)
4123bd4ba655SAlex Elder 		goto err_out_client;
4124c53d5893SAlex Elder 	rbdc = NULL;		/* rbd_dev now owns this */
4125c53d5893SAlex Elder 	spec = NULL;		/* rbd_dev now owns this */
4126602adf40SYehuda Sadeh 
4127bd4ba655SAlex Elder 	rbd_dev->mapping.read_only = rbd_opts->read_only;
4128c53d5893SAlex Elder 	kfree(rbd_opts);
4129c53d5893SAlex Elder 	rbd_opts = NULL;	/* done with this */
4130bd4ba655SAlex Elder 
4131a30b71b9SAlex Elder 	rc = rbd_dev_probe(rbd_dev);
4132a30b71b9SAlex Elder 	if (rc < 0)
4133c53d5893SAlex Elder 		goto err_out_rbd_dev;
413405fd6f6fSAlex Elder 
4135602adf40SYehuda Sadeh 	return count;
4136c53d5893SAlex Elder err_out_rbd_dev:
4137c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
4138bd4ba655SAlex Elder err_out_client:
41399d3997fdSAlex Elder 	rbd_put_client(rbdc);
41400ddebc0cSAlex Elder err_out_args:
414178cea76eSAlex Elder 	if (ceph_opts)
414278cea76eSAlex Elder 		ceph_destroy_options(ceph_opts);
41434e9afebaSAlex Elder 	kfree(rbd_opts);
4144859c31dfSAlex Elder 	rbd_spec_put(spec);
4145bd4ba655SAlex Elder err_out_module:
4146bd4ba655SAlex Elder 	module_put(THIS_MODULE);
414727cc2594SAlex Elder 
4148602adf40SYehuda Sadeh 	dout("Error adding device %s\n", buf);
414927cc2594SAlex Elder 
415027cc2594SAlex Elder 	return (ssize_t) rc;
4151602adf40SYehuda Sadeh }
4152602adf40SYehuda Sadeh 
4153de71a297SAlex Elder static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
4154602adf40SYehuda Sadeh {
4155602adf40SYehuda Sadeh 	struct list_head *tmp;
4156602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev;
4157602adf40SYehuda Sadeh 
4158e124a82fSAlex Elder 	spin_lock(&rbd_dev_list_lock);
4159602adf40SYehuda Sadeh 	list_for_each(tmp, &rbd_dev_list) {
4160602adf40SYehuda Sadeh 		rbd_dev = list_entry(tmp, struct rbd_device, node);
4161de71a297SAlex Elder 		if (rbd_dev->dev_id == dev_id) {
4162e124a82fSAlex Elder 			spin_unlock(&rbd_dev_list_lock);
4163602adf40SYehuda Sadeh 			return rbd_dev;
4164602adf40SYehuda Sadeh 		}
4165e124a82fSAlex Elder 	}
4166e124a82fSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
4167602adf40SYehuda Sadeh 	return NULL;
4168602adf40SYehuda Sadeh }
4169602adf40SYehuda Sadeh 
4170dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev)
4171602adf40SYehuda Sadeh {
4172593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4173602adf40SYehuda Sadeh 
417459c2be1eSYehuda Sadeh 	if (rbd_dev->watch_event)
41759969ebc5SAlex Elder 		rbd_dev_header_watch_sync(rbd_dev, 0);
4176602adf40SYehuda Sadeh 
4177602adf40SYehuda Sadeh 	/* clean up and free blkdev */
4178602adf40SYehuda Sadeh 	rbd_free_disk(rbd_dev);
4179602adf40SYehuda Sadeh 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
418032eec68dSAlex Elder 
41812ac4e75dSAlex Elder 	/* release allocated disk header fields */
41822ac4e75dSAlex Elder 	rbd_header_free(&rbd_dev->header);
41832ac4e75dSAlex Elder 
418432eec68dSAlex Elder 	/* done with the id, and with the rbd_dev */
4185e2839308SAlex Elder 	rbd_dev_id_put(rbd_dev);
4186c53d5893SAlex Elder 	rbd_assert(rbd_dev->rbd_client != NULL);
4187c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
4188602adf40SYehuda Sadeh 
4189602adf40SYehuda Sadeh 	/* release module ref */
4190602adf40SYehuda Sadeh 	module_put(THIS_MODULE);
4191602adf40SYehuda Sadeh }
4192602adf40SYehuda Sadeh 
4193dfc5606dSYehuda Sadeh static ssize_t rbd_remove(struct bus_type *bus,
4194602adf40SYehuda Sadeh 			  const char *buf,
4195602adf40SYehuda Sadeh 			  size_t count)
4196602adf40SYehuda Sadeh {
4197602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
4198602adf40SYehuda Sadeh 	int target_id, rc;
4199602adf40SYehuda Sadeh 	unsigned long ul;
4200602adf40SYehuda Sadeh 	int ret = count;
4201602adf40SYehuda Sadeh 
4202602adf40SYehuda Sadeh 	rc = strict_strtoul(buf, 10, &ul);
4203602adf40SYehuda Sadeh 	if (rc)
4204602adf40SYehuda Sadeh 		return rc;
4205602adf40SYehuda Sadeh 
4206602adf40SYehuda Sadeh 	/* convert to int; abort if we lost anything in the conversion */
4207602adf40SYehuda Sadeh 	target_id = (int) ul;
4208602adf40SYehuda Sadeh 	if (target_id != ul)
4209602adf40SYehuda Sadeh 		return -EINVAL;
4210602adf40SYehuda Sadeh 
4211602adf40SYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
4212602adf40SYehuda Sadeh 
4213602adf40SYehuda Sadeh 	rbd_dev = __rbd_get_dev(target_id);
4214602adf40SYehuda Sadeh 	if (!rbd_dev) {
4215602adf40SYehuda Sadeh 		ret = -ENOENT;
4216602adf40SYehuda Sadeh 		goto done;
4217602adf40SYehuda Sadeh 	}
4218602adf40SYehuda Sadeh 
4219a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
4220b82d167bSAlex Elder 	if (rbd_dev->open_count)
422142382b70SAlex Elder 		ret = -EBUSY;
4222b82d167bSAlex Elder 	else
4223b82d167bSAlex Elder 		set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
4224a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
4225b82d167bSAlex Elder 	if (ret < 0)
422642382b70SAlex Elder 		goto done;
422742382b70SAlex Elder 
422841f38c2bSAlex Elder 	rbd_remove_all_snaps(rbd_dev);
4229dfc5606dSYehuda Sadeh 	rbd_bus_del_dev(rbd_dev);
4230602adf40SYehuda Sadeh 
4231602adf40SYehuda Sadeh done:
4232602adf40SYehuda Sadeh 	mutex_unlock(&ctl_mutex);
4233aafb230eSAlex Elder 
4234602adf40SYehuda Sadeh 	return ret;
4235602adf40SYehuda Sadeh }
4236602adf40SYehuda Sadeh 
4237602adf40SYehuda Sadeh /*
4238602adf40SYehuda Sadeh  * create control files in sysfs
4239dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
4240602adf40SYehuda Sadeh  */
4241602adf40SYehuda Sadeh static int rbd_sysfs_init(void)
4242602adf40SYehuda Sadeh {
4243dfc5606dSYehuda Sadeh 	int ret;
4244602adf40SYehuda Sadeh 
4245fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
4246dfc5606dSYehuda Sadeh 	if (ret < 0)
4247dfc5606dSYehuda Sadeh 		return ret;
4248602adf40SYehuda Sadeh 
4249fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
4250fed4c143SAlex Elder 	if (ret < 0)
4251fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
4252602adf40SYehuda Sadeh 
4253602adf40SYehuda Sadeh 	return ret;
4254602adf40SYehuda Sadeh }
4255602adf40SYehuda Sadeh 
4256602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void)
4257602adf40SYehuda Sadeh {
4258dfc5606dSYehuda Sadeh 	bus_unregister(&rbd_bus_type);
4259fed4c143SAlex Elder 	device_unregister(&rbd_root_dev);
4260602adf40SYehuda Sadeh }
4261602adf40SYehuda Sadeh 
4262cc344fa1SAlex Elder static int __init rbd_init(void)
4263602adf40SYehuda Sadeh {
4264602adf40SYehuda Sadeh 	int rc;
4265602adf40SYehuda Sadeh 
42661e32d34cSAlex Elder 	if (!libceph_compatible(NULL)) {
42671e32d34cSAlex Elder 		rbd_warn(NULL, "libceph incompatibility (quitting)");
42681e32d34cSAlex Elder 
42691e32d34cSAlex Elder 		return -EINVAL;
42701e32d34cSAlex Elder 	}
4271602adf40SYehuda Sadeh 	rc = rbd_sysfs_init();
4272602adf40SYehuda Sadeh 	if (rc)
4273602adf40SYehuda Sadeh 		return rc;
4274f0f8cef5SAlex Elder 	pr_info("loaded " RBD_DRV_NAME_LONG "\n");
4275602adf40SYehuda Sadeh 	return 0;
4276602adf40SYehuda Sadeh }
4277602adf40SYehuda Sadeh 
4278cc344fa1SAlex Elder static void __exit rbd_exit(void)
4279602adf40SYehuda Sadeh {
4280602adf40SYehuda Sadeh 	rbd_sysfs_cleanup();
4281602adf40SYehuda Sadeh }
4282602adf40SYehuda Sadeh 
4283602adf40SYehuda Sadeh module_init(rbd_init);
4284602adf40SYehuda Sadeh module_exit(rbd_exit);
4285602adf40SYehuda Sadeh 
4286602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
4287602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
4288602adf40SYehuda Sadeh MODULE_DESCRIPTION("rados block device");
4289602adf40SYehuda Sadeh 
4290602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */
4291602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
4292602adf40SYehuda Sadeh 
4293602adf40SYehuda Sadeh MODULE_LICENSE("GPL");
4294