xref: /openbmc/linux/drivers/block/rbd.c (revision 23ed6e13)
1602adf40SYehuda Sadeh /*
2602adf40SYehuda Sadeh    rbd.c -- Export ceph rados objects as a Linux block device
3602adf40SYehuda Sadeh 
4602adf40SYehuda Sadeh 
5602adf40SYehuda Sadeh    based on drivers/block/osdblk.c:
6602adf40SYehuda Sadeh 
7602adf40SYehuda Sadeh    Copyright 2009 Red Hat, Inc.
8602adf40SYehuda Sadeh 
9602adf40SYehuda Sadeh    This program is free software; you can redistribute it and/or modify
10602adf40SYehuda Sadeh    it under the terms of the GNU General Public License as published by
11602adf40SYehuda Sadeh    the Free Software Foundation.
12602adf40SYehuda Sadeh 
13602adf40SYehuda Sadeh    This program is distributed in the hope that it will be useful,
14602adf40SYehuda Sadeh    but WITHOUT ANY WARRANTY; without even the implied warranty of
15602adf40SYehuda Sadeh    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16602adf40SYehuda Sadeh    GNU General Public License for more details.
17602adf40SYehuda Sadeh 
18602adf40SYehuda Sadeh    You should have received a copy of the GNU General Public License
19602adf40SYehuda Sadeh    along with this program; see the file COPYING.  If not, write to
20602adf40SYehuda Sadeh    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21602adf40SYehuda Sadeh 
22602adf40SYehuda Sadeh 
23602adf40SYehuda Sadeh 
24dfc5606dSYehuda Sadeh    For usage instructions, please refer to:
25602adf40SYehuda Sadeh 
26dfc5606dSYehuda Sadeh                  Documentation/ABI/testing/sysfs-bus-rbd
27602adf40SYehuda Sadeh 
28602adf40SYehuda Sadeh  */
29602adf40SYehuda Sadeh 
30602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
31602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
32602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3459c2be1eSYehuda Sadeh #include <linux/parser.h>
35602adf40SYehuda Sadeh 
36602adf40SYehuda Sadeh #include <linux/kernel.h>
37602adf40SYehuda Sadeh #include <linux/device.h>
38602adf40SYehuda Sadeh #include <linux/module.h>
39602adf40SYehuda Sadeh #include <linux/fs.h>
40602adf40SYehuda Sadeh #include <linux/blkdev.h>
41602adf40SYehuda Sadeh 
42602adf40SYehuda Sadeh #include "rbd_types.h"
43602adf40SYehuda Sadeh 
/* Defining RBD_DEBUG activates the rbd_assert() calls in this file */
#define RBD_DEBUG

/*
 * Block I/O is expressed in sectors.  Several Linux layers (blk, bio,
 * genhd) interpret that unit, but the default is universally 512
 * bytes.  These names are just slightly more meaningful than the
 * bare numbers they stand for.
 */
#define SECTOR_SHIFT	9
#define SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/*
 * Largest value of each fixed-width unsigned type.  It might be
 * useful to have these defined elsewhere.
 */
#define U8_MAX		((u8) (~0U))
#define U16_MAX		((u16) (~0U))
#define U32_MAX		((u32) (~0U))
#define U64_MAX		((u64) (~0ULL))
61df111be6SAlex Elder 
/* Short and long forms of the driver name */
#define RBD_DRV_NAME		"rbd"
#define RBD_DRV_NAME_LONG	"rbd (rados block device)"

/* max minors per blkdev */
#define RBD_MINORS_PER_MAJOR	256

/*
 * A snapshot name is limited to what remains of NAME_MAX once the
 * "snap_" prefix has been accounted for.
 */
#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN \
		(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

/* allows max snapc to fit in 4KB */
#define RBD_MAX_SNAP_COUNT	510

#define RBD_SNAP_HEAD_NAME	"-"

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

/* Feature bits */
#define RBD_FEATURE_LAYERING	1

/* Features supported by this (client software) implementation. */
#define RBD_FEATURES_ALL	(0)
88d889140cSAlex Elder 
/*
 * RBD device names have the form "rbd#": RBD_DRV_NAME followed by a
 * unique integer identifier.  MAX_INT_FORMAT_WIDTH is used in
 * ensuring DEV_NAME_LEN is big enough to hold every possible
 * device name.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)
97602adf40SYehuda Sadeh 
98602adf40SYehuda Sadeh /*
99602adf40SYehuda Sadeh  * block device image metadata (in-memory version)
100602adf40SYehuda Sadeh  */
101602adf40SYehuda Sadeh struct rbd_image_header {
102f84344f3SAlex Elder 	/* These four fields never change for a given rbd image */
103849b4260SAlex Elder 	char *object_prefix;
10434b13184SAlex Elder 	u64 features;
105602adf40SYehuda Sadeh 	__u8 obj_order;
106602adf40SYehuda Sadeh 	__u8 crypt_type;
107602adf40SYehuda Sadeh 	__u8 comp_type;
108602adf40SYehuda Sadeh 
109f84344f3SAlex Elder 	/* The remaining fields need to be updated occasionally */
110f84344f3SAlex Elder 	u64 image_size;
111f84344f3SAlex Elder 	struct ceph_snap_context *snapc;
112602adf40SYehuda Sadeh 	char *snap_names;
113602adf40SYehuda Sadeh 	u64 *snap_sizes;
11459c2be1eSYehuda Sadeh 
11559c2be1eSYehuda Sadeh 	u64 obj_version;
11659c2be1eSYehuda Sadeh };
11759c2be1eSYehuda Sadeh 
1180d7dbfceSAlex Elder /*
1190d7dbfceSAlex Elder  * An rbd image specification.
1200d7dbfceSAlex Elder  *
1210d7dbfceSAlex Elder  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
122c66c6e0cSAlex Elder  * identify an image.  Each rbd_dev structure includes a pointer to
123c66c6e0cSAlex Elder  * an rbd_spec structure that encapsulates this identity.
124c66c6e0cSAlex Elder  *
125c66c6e0cSAlex Elder  * Each of the id's in an rbd_spec has an associated name.  For a
126c66c6e0cSAlex Elder  * user-mapped image, the names are supplied and the id's associated
127c66c6e0cSAlex Elder  * with them are looked up.  For a layered image, a parent image is
128c66c6e0cSAlex Elder  * defined by the tuple, and the names are looked up.
129c66c6e0cSAlex Elder  *
130c66c6e0cSAlex Elder  * An rbd_dev structure contains a parent_spec pointer which is
131c66c6e0cSAlex Elder  * non-null if the image it represents is a child in a layered
132c66c6e0cSAlex Elder  * image.  This pointer will refer to the rbd_spec structure used
133c66c6e0cSAlex Elder  * by the parent rbd_dev for its own identity (i.e., the structure
134c66c6e0cSAlex Elder  * is shared between the parent and child).
135c66c6e0cSAlex Elder  *
136c66c6e0cSAlex Elder  * Since these structures are populated once, during the discovery
137c66c6e0cSAlex Elder  * phase of image construction, they are effectively immutable so
138c66c6e0cSAlex Elder  * we make no effort to synchronize access to them.
139c66c6e0cSAlex Elder  *
140c66c6e0cSAlex Elder  * Note that code herein does not assume the image name is known (it
141c66c6e0cSAlex Elder  * could be a null pointer).
1420d7dbfceSAlex Elder  */
1430d7dbfceSAlex Elder struct rbd_spec {
1440d7dbfceSAlex Elder 	u64		pool_id;
1450d7dbfceSAlex Elder 	char		*pool_name;
1460d7dbfceSAlex Elder 
1470d7dbfceSAlex Elder 	char		*image_id;
1480d7dbfceSAlex Elder 	char		*image_name;
1490d7dbfceSAlex Elder 
1500d7dbfceSAlex Elder 	u64		snap_id;
1510d7dbfceSAlex Elder 	char		*snap_name;
1520d7dbfceSAlex Elder 
1530d7dbfceSAlex Elder 	struct kref	kref;
1540d7dbfceSAlex Elder };
1550d7dbfceSAlex Elder 
156602adf40SYehuda Sadeh /*
157f0f8cef5SAlex Elder  * an instance of the client.  multiple devices may share an rbd client.
158602adf40SYehuda Sadeh  */
159602adf40SYehuda Sadeh struct rbd_client {
160602adf40SYehuda Sadeh 	struct ceph_client	*client;
161602adf40SYehuda Sadeh 	struct kref		kref;
162602adf40SYehuda Sadeh 	struct list_head	node;
163602adf40SYehuda Sadeh };
164602adf40SYehuda Sadeh 
/* Callback type stored in an image request */
struct rbd_img_request;
typedef void (*rbd_img_callback_t)(struct rbd_img_request *);

#define	BAD_WHICH	U32_MAX		/* Good which or bad which, which? */

/* Callback type stored in an object request */
struct rbd_obj_request;
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);

/* Selects which member of the data union in an object request is used */
enum obj_request_type {
	OBJ_REQUEST_NODATA,
	OBJ_REQUEST_BIO,
	OBJ_REQUEST_PAGES
};
176bf0d5f50SAlex Elder 
177bf0d5f50SAlex Elder struct rbd_obj_request {
178bf0d5f50SAlex Elder 	const char		*object_name;
179bf0d5f50SAlex Elder 	u64			offset;		/* object start byte */
180bf0d5f50SAlex Elder 	u64			length;		/* bytes from offset */
181bf0d5f50SAlex Elder 
182bf0d5f50SAlex Elder 	struct rbd_img_request	*img_request;
183bf0d5f50SAlex Elder 	struct list_head	links;		/* img_request->obj_requests */
184bf0d5f50SAlex Elder 	u32			which;		/* posn image request list */
185bf0d5f50SAlex Elder 
186bf0d5f50SAlex Elder 	enum obj_request_type	type;
187788e2df3SAlex Elder 	union {
188bf0d5f50SAlex Elder 		struct bio	*bio_list;
189788e2df3SAlex Elder 		struct {
190788e2df3SAlex Elder 			struct page	**pages;
191788e2df3SAlex Elder 			u32		page_count;
192788e2df3SAlex Elder 		};
193788e2df3SAlex Elder 	};
194bf0d5f50SAlex Elder 
195bf0d5f50SAlex Elder 	struct ceph_osd_request	*osd_req;
196bf0d5f50SAlex Elder 
197bf0d5f50SAlex Elder 	u64			xferred;	/* bytes transferred */
198bf0d5f50SAlex Elder 	u64			version;
199bf0d5f50SAlex Elder 	s32			result;
200bf0d5f50SAlex Elder 	atomic_t		done;
201bf0d5f50SAlex Elder 
202bf0d5f50SAlex Elder 	rbd_obj_callback_t	callback;
203788e2df3SAlex Elder 	struct completion	completion;
204bf0d5f50SAlex Elder 
205bf0d5f50SAlex Elder 	struct kref		kref;
206bf0d5f50SAlex Elder };
207bf0d5f50SAlex Elder 
208bf0d5f50SAlex Elder struct rbd_img_request {
209bf0d5f50SAlex Elder 	struct request		*rq;
210bf0d5f50SAlex Elder 	struct rbd_device	*rbd_dev;
211bf0d5f50SAlex Elder 	u64			offset;	/* starting image byte offset */
212bf0d5f50SAlex Elder 	u64			length;	/* byte count from offset */
213bf0d5f50SAlex Elder 	bool			write_request;	/* false for read */
214bf0d5f50SAlex Elder 	union {
215bf0d5f50SAlex Elder 		struct ceph_snap_context *snapc;	/* for writes */
216bf0d5f50SAlex Elder 		u64		snap_id;		/* for reads */
217bf0d5f50SAlex Elder 	};
218bf0d5f50SAlex Elder 	spinlock_t		completion_lock;/* protects next_completion */
219bf0d5f50SAlex Elder 	u32			next_completion;
220bf0d5f50SAlex Elder 	rbd_img_callback_t	callback;
221bf0d5f50SAlex Elder 
222bf0d5f50SAlex Elder 	u32			obj_request_count;
223bf0d5f50SAlex Elder 	struct list_head	obj_requests;	/* rbd_obj_request structs */
224bf0d5f50SAlex Elder 
225bf0d5f50SAlex Elder 	struct kref		kref;
226bf0d5f50SAlex Elder };
227bf0d5f50SAlex Elder 
/*
 * Iterators over the object requests linked on an image request.
 * Note that the "safe" variant walks the list in reverse order.
 */
#define for_each_obj_request(img_req, obj_req) \
	list_for_each_entry(obj_req, &(img_req)->obj_requests, links)
#define for_each_obj_request_from(img_req, obj_req) \
	list_for_each_entry_from(obj_req, &(img_req)->obj_requests, links)
#define for_each_obj_request_safe(img_req, obj_req, next) \
	list_for_each_entry_safe_reverse(obj_req, next, \
					 &(img_req)->obj_requests, links)
234bf0d5f50SAlex Elder 
235dfc5606dSYehuda Sadeh struct rbd_snap {
236dfc5606dSYehuda Sadeh 	struct	device		dev;
237dfc5606dSYehuda Sadeh 	const char		*name;
2383591538fSJosh Durgin 	u64			size;
239dfc5606dSYehuda Sadeh 	struct list_head	node;
240dfc5606dSYehuda Sadeh 	u64			id;
24134b13184SAlex Elder 	u64			features;
242dfc5606dSYehuda Sadeh };
243dfc5606dSYehuda Sadeh 
244f84344f3SAlex Elder struct rbd_mapping {
24599c1f08fSAlex Elder 	u64                     size;
24634b13184SAlex Elder 	u64                     features;
247f84344f3SAlex Elder 	bool			read_only;
248f84344f3SAlex Elder };
249f84344f3SAlex Elder 
250602adf40SYehuda Sadeh /*
251602adf40SYehuda Sadeh  * a single device
252602adf40SYehuda Sadeh  */
253602adf40SYehuda Sadeh struct rbd_device {
254de71a297SAlex Elder 	int			dev_id;		/* blkdev unique id */
255602adf40SYehuda Sadeh 
256602adf40SYehuda Sadeh 	int			major;		/* blkdev assigned major */
257602adf40SYehuda Sadeh 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
258602adf40SYehuda Sadeh 
259a30b71b9SAlex Elder 	u32			image_format;	/* Either 1 or 2 */
260602adf40SYehuda Sadeh 	struct rbd_client	*rbd_client;
261602adf40SYehuda Sadeh 
262602adf40SYehuda Sadeh 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
263602adf40SYehuda Sadeh 
264b82d167bSAlex Elder 	spinlock_t		lock;		/* queue, flags, open_count */
265602adf40SYehuda Sadeh 
266602adf40SYehuda Sadeh 	struct rbd_image_header	header;
267b82d167bSAlex Elder 	unsigned long		flags;		/* possibly lock protected */
2680d7dbfceSAlex Elder 	struct rbd_spec		*spec;
269602adf40SYehuda Sadeh 
2700d7dbfceSAlex Elder 	char			*header_name;
271971f839aSAlex Elder 
2720903e875SAlex Elder 	struct ceph_file_layout	layout;
2730903e875SAlex Elder 
27459c2be1eSYehuda Sadeh 	struct ceph_osd_event   *watch_event;
275975241afSAlex Elder 	struct rbd_obj_request	*watch_request;
27659c2be1eSYehuda Sadeh 
27786b00e0dSAlex Elder 	struct rbd_spec		*parent_spec;
27886b00e0dSAlex Elder 	u64			parent_overlap;
27986b00e0dSAlex Elder 
280c666601aSJosh Durgin 	/* protects updating the header */
281c666601aSJosh Durgin 	struct rw_semaphore     header_rwsem;
282f84344f3SAlex Elder 
283f84344f3SAlex Elder 	struct rbd_mapping	mapping;
284602adf40SYehuda Sadeh 
285602adf40SYehuda Sadeh 	struct list_head	node;
286dfc5606dSYehuda Sadeh 
287dfc5606dSYehuda Sadeh 	/* list of snapshots */
288dfc5606dSYehuda Sadeh 	struct list_head	snaps;
289dfc5606dSYehuda Sadeh 
290dfc5606dSYehuda Sadeh 	/* sysfs related */
291dfc5606dSYehuda Sadeh 	struct device		dev;
292b82d167bSAlex Elder 	unsigned long		open_count;	/* protected by lock */
293dfc5606dSYehuda Sadeh };
294dfc5606dSYehuda Sadeh 
/*
 * Bit numbers used in rbd_dev->flags.  Where atomicity is needed,
 * rbd_dev->lock protects access.
 *
 * At present only the "removing" flag, which is coupled with the
 * "open_count" field, needs atomic access.
 */
enum rbd_dev_flags {
	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_EXISTS,
	/* this mapping is being removed */
	RBD_DEV_FLAG_REMOVING,
};
3066d292906SAlex Elder 
307602adf40SYehuda Sadeh static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */
308e124a82fSAlex Elder 
309602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list);    /* devices */
310e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock);
311e124a82fSAlex Elder 
312602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list);		/* clients */
313432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock);
314602adf40SYehuda Sadeh 
315304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
316304f6808SAlex Elder static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
317304f6808SAlex Elder 
318dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev);
31941f38c2bSAlex Elder static void rbd_remove_snap_dev(struct rbd_snap *snap);
320dfc5606dSYehuda Sadeh 
321f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf,
322f0f8cef5SAlex Elder 		       size_t count);
323f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
324f0f8cef5SAlex Elder 			  size_t count);
325f0f8cef5SAlex Elder 
326f0f8cef5SAlex Elder static struct bus_attribute rbd_bus_attrs[] = {
327f0f8cef5SAlex Elder 	__ATTR(add, S_IWUSR, NULL, rbd_add),
328f0f8cef5SAlex Elder 	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
329f0f8cef5SAlex Elder 	__ATTR_NULL
330f0f8cef5SAlex Elder };
331f0f8cef5SAlex Elder 
332f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = {
333f0f8cef5SAlex Elder 	.name		= "rbd",
334f0f8cef5SAlex Elder 	.bus_attrs	= rbd_bus_attrs,
335f0f8cef5SAlex Elder };
336f0f8cef5SAlex Elder 
/*
 * Release callback for rbd_root_dev.  That device is a static
 * object, so there is nothing to do here.
 */
static void rbd_root_dev_release(struct device *dev)
{
	/* intentionally empty */
}
340f0f8cef5SAlex Elder 
341f0f8cef5SAlex Elder static struct device rbd_root_dev = {
342f0f8cef5SAlex Elder 	.init_name =    "rbd",
343f0f8cef5SAlex Elder 	.release =      rbd_root_dev_release,
344f0f8cef5SAlex Elder };
345f0f8cef5SAlex Elder 
34606ecc6cbSAlex Elder static __printf(2, 3)
34706ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
34806ecc6cbSAlex Elder {
34906ecc6cbSAlex Elder 	struct va_format vaf;
35006ecc6cbSAlex Elder 	va_list args;
35106ecc6cbSAlex Elder 
35206ecc6cbSAlex Elder 	va_start(args, fmt);
35306ecc6cbSAlex Elder 	vaf.fmt = fmt;
35406ecc6cbSAlex Elder 	vaf.va = &args;
35506ecc6cbSAlex Elder 
35606ecc6cbSAlex Elder 	if (!rbd_dev)
35706ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
35806ecc6cbSAlex Elder 	else if (rbd_dev->disk)
35906ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %s: %pV\n",
36006ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
36106ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_name)
36206ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: image %s: %pV\n",
36306ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
36406ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_id)
36506ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: id %s: %pV\n",
36606ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
36706ecc6cbSAlex Elder 	else	/* punt */
36806ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
36906ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev, &vaf);
37006ecc6cbSAlex Elder 	va_end(args);
37106ecc6cbSAlex Elder }
37206ecc6cbSAlex Elder 
/*
 * rbd_assert(expr) - with RBD_DEBUG defined, report the failed
 * expression (function, line and source text) and BUG() if expr is
 * false; otherwise it expands to a no-op.
 *
 * The debug form is wrapped in do { } while (0) so that it is a single
 * statement: it can be used as the unbraced body of an if/else and
 * cannot capture a following "else".  Both forms require the caller
 * to supply the trailing semicolon.
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
385dfc5606dSYehuda Sadeh 
386117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
387117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
38859c2be1eSYehuda Sadeh 
389602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
390602adf40SYehuda Sadeh {
391f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
392b82d167bSAlex Elder 	bool removing = false;
393602adf40SYehuda Sadeh 
394f84344f3SAlex Elder 	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
395602adf40SYehuda Sadeh 		return -EROFS;
396602adf40SYehuda Sadeh 
397a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
398b82d167bSAlex Elder 	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
399b82d167bSAlex Elder 		removing = true;
400b82d167bSAlex Elder 	else
401b82d167bSAlex Elder 		rbd_dev->open_count++;
402a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
403b82d167bSAlex Elder 	if (removing)
404b82d167bSAlex Elder 		return -ENOENT;
405b82d167bSAlex Elder 
40642382b70SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
407c3e946ceSAlex Elder 	(void) get_device(&rbd_dev->dev);
408f84344f3SAlex Elder 	set_device_ro(bdev, rbd_dev->mapping.read_only);
40942382b70SAlex Elder 	mutex_unlock(&ctl_mutex);
410340c7a2bSAlex Elder 
411602adf40SYehuda Sadeh 	return 0;
412602adf40SYehuda Sadeh }
413602adf40SYehuda Sadeh 
414dfc5606dSYehuda Sadeh static int rbd_release(struct gendisk *disk, fmode_t mode)
415dfc5606dSYehuda Sadeh {
416dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
417b82d167bSAlex Elder 	unsigned long open_count_before;
418b82d167bSAlex Elder 
419a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
420b82d167bSAlex Elder 	open_count_before = rbd_dev->open_count--;
421a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
422b82d167bSAlex Elder 	rbd_assert(open_count_before > 0);
423dfc5606dSYehuda Sadeh 
42442382b70SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
425c3e946ceSAlex Elder 	put_device(&rbd_dev->dev);
42642382b70SAlex Elder 	mutex_unlock(&ctl_mutex);
427dfc5606dSYehuda Sadeh 
428dfc5606dSYehuda Sadeh 	return 0;
429dfc5606dSYehuda Sadeh }
430dfc5606dSYehuda Sadeh 
431602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = {
432602adf40SYehuda Sadeh 	.owner			= THIS_MODULE,
433602adf40SYehuda Sadeh 	.open			= rbd_open,
434dfc5606dSYehuda Sadeh 	.release		= rbd_release,
435602adf40SYehuda Sadeh };
436602adf40SYehuda Sadeh 
437602adf40SYehuda Sadeh /*
438602adf40SYehuda Sadeh  * Initialize an rbd client instance.
43943ae4701SAlex Elder  * We own *ceph_opts.
440602adf40SYehuda Sadeh  */
441f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
442602adf40SYehuda Sadeh {
443602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
444602adf40SYehuda Sadeh 	int ret = -ENOMEM;
445602adf40SYehuda Sadeh 
446602adf40SYehuda Sadeh 	dout("rbd_client_create\n");
447602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
448602adf40SYehuda Sadeh 	if (!rbdc)
449602adf40SYehuda Sadeh 		goto out_opt;
450602adf40SYehuda Sadeh 
451602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
452602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
453602adf40SYehuda Sadeh 
454bc534d86SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
455bc534d86SAlex Elder 
45643ae4701SAlex Elder 	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
457602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
458bc534d86SAlex Elder 		goto out_mutex;
45943ae4701SAlex Elder 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
460602adf40SYehuda Sadeh 
461602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
462602adf40SYehuda Sadeh 	if (ret < 0)
463602adf40SYehuda Sadeh 		goto out_err;
464602adf40SYehuda Sadeh 
465432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
466602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
467432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
468602adf40SYehuda Sadeh 
469bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
470bc534d86SAlex Elder 
471602adf40SYehuda Sadeh 	dout("rbd_client_create created %p\n", rbdc);
472602adf40SYehuda Sadeh 	return rbdc;
473602adf40SYehuda Sadeh 
474602adf40SYehuda Sadeh out_err:
475602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
476bc534d86SAlex Elder out_mutex:
477bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
478602adf40SYehuda Sadeh 	kfree(rbdc);
479602adf40SYehuda Sadeh out_opt:
48043ae4701SAlex Elder 	if (ceph_opts)
48143ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
48228f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
483602adf40SYehuda Sadeh }
484602adf40SYehuda Sadeh 
485602adf40SYehuda Sadeh /*
4861f7ba331SAlex Elder  * Find a ceph client with specific addr and configuration.  If
4871f7ba331SAlex Elder  * found, bump its reference count.
488602adf40SYehuda Sadeh  */
4891f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
490602adf40SYehuda Sadeh {
491602adf40SYehuda Sadeh 	struct rbd_client *client_node;
4921f7ba331SAlex Elder 	bool found = false;
493602adf40SYehuda Sadeh 
49443ae4701SAlex Elder 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
495602adf40SYehuda Sadeh 		return NULL;
496602adf40SYehuda Sadeh 
4971f7ba331SAlex Elder 	spin_lock(&rbd_client_list_lock);
4981f7ba331SAlex Elder 	list_for_each_entry(client_node, &rbd_client_list, node) {
4991f7ba331SAlex Elder 		if (!ceph_compare_options(ceph_opts, client_node->client)) {
5001f7ba331SAlex Elder 			kref_get(&client_node->kref);
5011f7ba331SAlex Elder 			found = true;
5021f7ba331SAlex Elder 			break;
5031f7ba331SAlex Elder 		}
5041f7ba331SAlex Elder 	}
5051f7ba331SAlex Elder 	spin_unlock(&rbd_client_list_lock);
5061f7ba331SAlex Elder 
5071f7ba331SAlex Elder 	return found ? client_node : NULL;
508602adf40SYehuda Sadeh }
509602adf40SYehuda Sadeh 
/*
 * Mount option tokens.  The Opt_last_* values are markers: integer
 * options precede Opt_last_int, string options lie between it and
 * Opt_last_string, and Boolean options lie between Opt_last_string
 * and Opt_last_bool.
 */
enum {
	/* (no int options at present) */
	Opt_last_int,

	/* (no string options at present) */
	Opt_last_string,

	/* Boolean options */
	Opt_read_only,
	Opt_read_write,
	Opt_last_bool,
};
52359c2be1eSYehuda Sadeh 
52443ae4701SAlex Elder static match_table_t rbd_opts_tokens = {
52559c2be1eSYehuda Sadeh 	/* int args above */
52659c2be1eSYehuda Sadeh 	/* string args above */
527be466c1cSAlex Elder 	{Opt_read_only, "read_only"},
528cc0538b6SAlex Elder 	{Opt_read_only, "ro"},		/* Alternate spelling */
529cc0538b6SAlex Elder 	{Opt_read_write, "read_write"},
530cc0538b6SAlex Elder 	{Opt_read_write, "rw"},		/* Alternate spelling */
531cc0538b6SAlex Elder 	/* Boolean args above */
53259c2be1eSYehuda Sadeh 	{-1, NULL}
53359c2be1eSYehuda Sadeh };
53459c2be1eSYehuda Sadeh 
/* Options specific to rbd, filled in by parse_rbd_opts_token() */
struct rbd_options {
	bool	read_only;
};

/* Value of read_only to use by default */
#define RBD_READ_ONLY_DEFAULT	false
54098571b5aSAlex Elder 
54159c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
54259c2be1eSYehuda Sadeh {
54343ae4701SAlex Elder 	struct rbd_options *rbd_opts = private;
54459c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
54559c2be1eSYehuda Sadeh 	int token, intval, ret;
54659c2be1eSYehuda Sadeh 
54743ae4701SAlex Elder 	token = match_token(c, rbd_opts_tokens, argstr);
54859c2be1eSYehuda Sadeh 	if (token < 0)
54959c2be1eSYehuda Sadeh 		return -EINVAL;
55059c2be1eSYehuda Sadeh 
55159c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
55259c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
55359c2be1eSYehuda Sadeh 		if (ret < 0) {
55459c2be1eSYehuda Sadeh 			pr_err("bad mount option arg (not int) "
55559c2be1eSYehuda Sadeh 			       "at '%s'\n", c);
55659c2be1eSYehuda Sadeh 			return ret;
55759c2be1eSYehuda Sadeh 		}
55859c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
55959c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
56059c2be1eSYehuda Sadeh 		dout("got string token %d val %s\n", token,
56159c2be1eSYehuda Sadeh 		     argstr[0].from);
562cc0538b6SAlex Elder 	} else if (token > Opt_last_string && token < Opt_last_bool) {
563cc0538b6SAlex Elder 		dout("got Boolean token %d\n", token);
56459c2be1eSYehuda Sadeh 	} else {
56559c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
56659c2be1eSYehuda Sadeh 	}
56759c2be1eSYehuda Sadeh 
56859c2be1eSYehuda Sadeh 	switch (token) {
569cc0538b6SAlex Elder 	case Opt_read_only:
570cc0538b6SAlex Elder 		rbd_opts->read_only = true;
571cc0538b6SAlex Elder 		break;
572cc0538b6SAlex Elder 	case Opt_read_write:
573cc0538b6SAlex Elder 		rbd_opts->read_only = false;
574cc0538b6SAlex Elder 		break;
57559c2be1eSYehuda Sadeh 	default:
576aafb230eSAlex Elder 		rbd_assert(false);
577aafb230eSAlex Elder 		break;
57859c2be1eSYehuda Sadeh 	}
57959c2be1eSYehuda Sadeh 	return 0;
58059c2be1eSYehuda Sadeh }
58159c2be1eSYehuda Sadeh 
/*
 * Get an rbd client matching the given options: reuse an existing one
 * if rbd_client_find() locates it, otherwise create a new one.  When
 * an existing client is reused the supplied options are destroyed;
 * otherwise they are handed to rbd_client_create().
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc = rbd_client_find(ceph_opts);

	if (!rbdc)
		return rbd_client_create(ceph_opts);

	/* using an existing client */
	ceph_destroy_options(ceph_opts);

	return rbdc;
}
598602adf40SYehuda Sadeh 
599602adf40SYehuda Sadeh /*
600602adf40SYehuda Sadeh  * Destroy ceph client
601d23a4b3fSAlex Elder  *
602432b8587SAlex Elder  * Caller must hold rbd_client_list_lock.
603602adf40SYehuda Sadeh  */
604602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
605602adf40SYehuda Sadeh {
606602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
607602adf40SYehuda Sadeh 
608602adf40SYehuda Sadeh 	dout("rbd_release_client %p\n", rbdc);
609cd9d9f5dSAlex Elder 	spin_lock(&rbd_client_list_lock);
610602adf40SYehuda Sadeh 	list_del(&rbdc->node);
611cd9d9f5dSAlex Elder 	spin_unlock(&rbd_client_list_lock);
612602adf40SYehuda Sadeh 
613602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
614602adf40SYehuda Sadeh 	kfree(rbdc);
615602adf40SYehuda Sadeh }
616602adf40SYehuda Sadeh 
617602adf40SYehuda Sadeh /*
618602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
619602adf40SYehuda Sadeh  * it.
620602adf40SYehuda Sadeh  */
6219d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc)
622602adf40SYehuda Sadeh {
623c53d5893SAlex Elder 	if (rbdc)
6249d3997fdSAlex Elder 		kref_put(&rbdc->kref, rbd_client_release);
625602adf40SYehuda Sadeh }
626602adf40SYehuda Sadeh 
627a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format)
628a30b71b9SAlex Elder {
629a30b71b9SAlex Elder 	return image_format == 1 || image_format == 2;
630a30b71b9SAlex Elder }
631a30b71b9SAlex Elder 
6328e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
6338e94af8eSAlex Elder {
634103a150fSAlex Elder 	size_t size;
635103a150fSAlex Elder 	u32 snap_count;
636103a150fSAlex Elder 
637103a150fSAlex Elder 	/* The header has to start with the magic rbd header text */
638103a150fSAlex Elder 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
639103a150fSAlex Elder 		return false;
640103a150fSAlex Elder 
641db2388b6SAlex Elder 	/* The bio layer requires at least sector-sized I/O */
642db2388b6SAlex Elder 
643db2388b6SAlex Elder 	if (ondisk->options.order < SECTOR_SHIFT)
644db2388b6SAlex Elder 		return false;
645db2388b6SAlex Elder 
646db2388b6SAlex Elder 	/* If we use u64 in a few spots we may be able to loosen this */
647db2388b6SAlex Elder 
648db2388b6SAlex Elder 	if (ondisk->options.order > 8 * sizeof (int) - 1)
649db2388b6SAlex Elder 		return false;
650db2388b6SAlex Elder 
651103a150fSAlex Elder 	/*
652103a150fSAlex Elder 	 * The size of a snapshot header has to fit in a size_t, and
653103a150fSAlex Elder 	 * that limits the number of snapshots.
654103a150fSAlex Elder 	 */
655103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
656103a150fSAlex Elder 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
657103a150fSAlex Elder 	if (snap_count > size / sizeof (__le64))
658103a150fSAlex Elder 		return false;
659103a150fSAlex Elder 
660103a150fSAlex Elder 	/*
661103a150fSAlex Elder 	 * Not only that, but the size of the entire the snapshot
662103a150fSAlex Elder 	 * header must also be representable in a size_t.
663103a150fSAlex Elder 	 */
664103a150fSAlex Elder 	size -= snap_count * sizeof (__le64);
665103a150fSAlex Elder 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
666103a150fSAlex Elder 		return false;
667103a150fSAlex Elder 
668103a150fSAlex Elder 	return true;
6698e94af8eSAlex Elder }
6708e94af8eSAlex Elder 
671602adf40SYehuda Sadeh /*
672602adf40SYehuda Sadeh  * Create a new header structure, translate header format from the on-disk
673602adf40SYehuda Sadeh  * header.
674602adf40SYehuda Sadeh  */
675602adf40SYehuda Sadeh static int rbd_header_from_disk(struct rbd_image_header *header,
6764156d998SAlex Elder 				 struct rbd_image_header_ondisk *ondisk)
677602adf40SYehuda Sadeh {
678ccece235SAlex Elder 	u32 snap_count;
67958c17b0eSAlex Elder 	size_t len;
680d2bb24e5SAlex Elder 	size_t size;
681621901d6SAlex Elder 	u32 i;
682602adf40SYehuda Sadeh 
6836a52325fSAlex Elder 	memset(header, 0, sizeof (*header));
6846a52325fSAlex Elder 
685103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
686103a150fSAlex Elder 
68758c17b0eSAlex Elder 	len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
68858c17b0eSAlex Elder 	header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
6896a52325fSAlex Elder 	if (!header->object_prefix)
690602adf40SYehuda Sadeh 		return -ENOMEM;
69158c17b0eSAlex Elder 	memcpy(header->object_prefix, ondisk->object_prefix, len);
69258c17b0eSAlex Elder 	header->object_prefix[len] = '\0';
69300f1f36fSAlex Elder 
694602adf40SYehuda Sadeh 	if (snap_count) {
695f785cc1dSAlex Elder 		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
696f785cc1dSAlex Elder 
697621901d6SAlex Elder 		/* Save a copy of the snapshot names */
698621901d6SAlex Elder 
699f785cc1dSAlex Elder 		if (snap_names_len > (u64) SIZE_MAX)
700f785cc1dSAlex Elder 			return -EIO;
701f785cc1dSAlex Elder 		header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
702602adf40SYehuda Sadeh 		if (!header->snap_names)
7036a52325fSAlex Elder 			goto out_err;
704f785cc1dSAlex Elder 		/*
705f785cc1dSAlex Elder 		 * Note that rbd_dev_v1_header_read() guarantees
706f785cc1dSAlex Elder 		 * the ondisk buffer we're working with has
707f785cc1dSAlex Elder 		 * snap_names_len bytes beyond the end of the
708f785cc1dSAlex Elder 		 * snapshot id array, this memcpy() is safe.
709f785cc1dSAlex Elder 		 */
710f785cc1dSAlex Elder 		memcpy(header->snap_names, &ondisk->snaps[snap_count],
711f785cc1dSAlex Elder 			snap_names_len);
7126a52325fSAlex Elder 
713621901d6SAlex Elder 		/* Record each snapshot's size */
714621901d6SAlex Elder 
715d2bb24e5SAlex Elder 		size = snap_count * sizeof (*header->snap_sizes);
716d2bb24e5SAlex Elder 		header->snap_sizes = kmalloc(size, GFP_KERNEL);
717602adf40SYehuda Sadeh 		if (!header->snap_sizes)
7186a52325fSAlex Elder 			goto out_err;
719621901d6SAlex Elder 		for (i = 0; i < snap_count; i++)
720621901d6SAlex Elder 			header->snap_sizes[i] =
721621901d6SAlex Elder 				le64_to_cpu(ondisk->snaps[i].image_size);
722602adf40SYehuda Sadeh 	} else {
723ccece235SAlex Elder 		WARN_ON(ondisk->snap_names_len);
724602adf40SYehuda Sadeh 		header->snap_names = NULL;
725602adf40SYehuda Sadeh 		header->snap_sizes = NULL;
726602adf40SYehuda Sadeh 	}
727849b4260SAlex Elder 
72834b13184SAlex Elder 	header->features = 0;	/* No features support in v1 images */
729602adf40SYehuda Sadeh 	header->obj_order = ondisk->options.order;
730602adf40SYehuda Sadeh 	header->crypt_type = ondisk->options.crypt_type;
731602adf40SYehuda Sadeh 	header->comp_type = ondisk->options.comp_type;
7326a52325fSAlex Elder 
733621901d6SAlex Elder 	/* Allocate and fill in the snapshot context */
734621901d6SAlex Elder 
735f84344f3SAlex Elder 	header->image_size = le64_to_cpu(ondisk->image_size);
7366a52325fSAlex Elder 	size = sizeof (struct ceph_snap_context);
7376a52325fSAlex Elder 	size += snap_count * sizeof (header->snapc->snaps[0]);
7386a52325fSAlex Elder 	header->snapc = kzalloc(size, GFP_KERNEL);
7396a52325fSAlex Elder 	if (!header->snapc)
7406a52325fSAlex Elder 		goto out_err;
741602adf40SYehuda Sadeh 
742602adf40SYehuda Sadeh 	atomic_set(&header->snapc->nref, 1);
743505cbb9bSAlex Elder 	header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
744602adf40SYehuda Sadeh 	header->snapc->num_snaps = snap_count;
745621901d6SAlex Elder 	for (i = 0; i < snap_count; i++)
746602adf40SYehuda Sadeh 		header->snapc->snaps[i] =
747602adf40SYehuda Sadeh 			le64_to_cpu(ondisk->snaps[i].id);
748602adf40SYehuda Sadeh 
749602adf40SYehuda Sadeh 	return 0;
750602adf40SYehuda Sadeh 
7516a52325fSAlex Elder out_err:
752849b4260SAlex Elder 	kfree(header->snap_sizes);
753ccece235SAlex Elder 	header->snap_sizes = NULL;
754602adf40SYehuda Sadeh 	kfree(header->snap_names);
755ccece235SAlex Elder 	header->snap_names = NULL;
7566a52325fSAlex Elder 	kfree(header->object_prefix);
7576a52325fSAlex Elder 	header->object_prefix = NULL;
758ccece235SAlex Elder 
75900f1f36fSAlex Elder 	return -ENOMEM;
760602adf40SYehuda Sadeh }
761602adf40SYehuda Sadeh 
7629e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
7639e15b77dSAlex Elder {
7649e15b77dSAlex Elder 	struct rbd_snap *snap;
7659e15b77dSAlex Elder 
7669e15b77dSAlex Elder 	if (snap_id == CEPH_NOSNAP)
7679e15b77dSAlex Elder 		return RBD_SNAP_HEAD_NAME;
7689e15b77dSAlex Elder 
7699e15b77dSAlex Elder 	list_for_each_entry(snap, &rbd_dev->snaps, node)
7709e15b77dSAlex Elder 		if (snap_id == snap->id)
7719e15b77dSAlex Elder 			return snap->name;
7729e15b77dSAlex Elder 
7739e15b77dSAlex Elder 	return NULL;
7749e15b77dSAlex Elder }
7759e15b77dSAlex Elder 
7768836b995SAlex Elder static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
777602adf40SYehuda Sadeh {
778602adf40SYehuda Sadeh 
779e86924a8SAlex Elder 	struct rbd_snap *snap;
78000f1f36fSAlex Elder 
781e86924a8SAlex Elder 	list_for_each_entry(snap, &rbd_dev->snaps, node) {
782e86924a8SAlex Elder 		if (!strcmp(snap_name, snap->name)) {
7830d7dbfceSAlex Elder 			rbd_dev->spec->snap_id = snap->id;
784e86924a8SAlex Elder 			rbd_dev->mapping.size = snap->size;
78534b13184SAlex Elder 			rbd_dev->mapping.features = snap->features;
78600f1f36fSAlex Elder 
787e86924a8SAlex Elder 			return 0;
788602adf40SYehuda Sadeh 		}
78900f1f36fSAlex Elder 	}
790e86924a8SAlex Elder 
79100f1f36fSAlex Elder 	return -ENOENT;
79200f1f36fSAlex Elder }
793602adf40SYehuda Sadeh 
794819d52bfSAlex Elder static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
795602adf40SYehuda Sadeh {
79678dc447dSAlex Elder 	int ret;
797602adf40SYehuda Sadeh 
7980d7dbfceSAlex Elder 	if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
799cc9d734cSJosh Durgin 		    sizeof (RBD_SNAP_HEAD_NAME))) {
8000d7dbfceSAlex Elder 		rbd_dev->spec->snap_id = CEPH_NOSNAP;
80199c1f08fSAlex Elder 		rbd_dev->mapping.size = rbd_dev->header.image_size;
80234b13184SAlex Elder 		rbd_dev->mapping.features = rbd_dev->header.features;
803e86924a8SAlex Elder 		ret = 0;
804602adf40SYehuda Sadeh 	} else {
8050d7dbfceSAlex Elder 		ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
806602adf40SYehuda Sadeh 		if (ret < 0)
807602adf40SYehuda Sadeh 			goto done;
808f84344f3SAlex Elder 		rbd_dev->mapping.read_only = true;
809602adf40SYehuda Sadeh 	}
8106d292906SAlex Elder 	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
8116d292906SAlex Elder 
812602adf40SYehuda Sadeh done:
813602adf40SYehuda Sadeh 	return ret;
814602adf40SYehuda Sadeh }
815602adf40SYehuda Sadeh 
816602adf40SYehuda Sadeh static void rbd_header_free(struct rbd_image_header *header)
817602adf40SYehuda Sadeh {
818849b4260SAlex Elder 	kfree(header->object_prefix);
819d78fd7aeSAlex Elder 	header->object_prefix = NULL;
820602adf40SYehuda Sadeh 	kfree(header->snap_sizes);
821d78fd7aeSAlex Elder 	header->snap_sizes = NULL;
822849b4260SAlex Elder 	kfree(header->snap_names);
823d78fd7aeSAlex Elder 	header->snap_names = NULL;
824d1d25646SJosh Durgin 	ceph_put_snap_context(header->snapc);
825d78fd7aeSAlex Elder 	header->snapc = NULL;
826602adf40SYehuda Sadeh }
827602adf40SYehuda Sadeh 
82898571b5aSAlex Elder static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
829602adf40SYehuda Sadeh {
83065ccfe21SAlex Elder 	char *name;
83165ccfe21SAlex Elder 	u64 segment;
83265ccfe21SAlex Elder 	int ret;
833602adf40SYehuda Sadeh 
8342fd82b9eSAlex Elder 	name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
83565ccfe21SAlex Elder 	if (!name)
83665ccfe21SAlex Elder 		return NULL;
83765ccfe21SAlex Elder 	segment = offset >> rbd_dev->header.obj_order;
8382fd82b9eSAlex Elder 	ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
83965ccfe21SAlex Elder 			rbd_dev->header.object_prefix, segment);
8402fd82b9eSAlex Elder 	if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
84165ccfe21SAlex Elder 		pr_err("error formatting segment name for #%llu (%d)\n",
84265ccfe21SAlex Elder 			segment, ret);
84365ccfe21SAlex Elder 		kfree(name);
84465ccfe21SAlex Elder 		name = NULL;
84565ccfe21SAlex Elder 	}
846602adf40SYehuda Sadeh 
84765ccfe21SAlex Elder 	return name;
84865ccfe21SAlex Elder }
849602adf40SYehuda Sadeh 
85065ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
85165ccfe21SAlex Elder {
85265ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
853602adf40SYehuda Sadeh 
85465ccfe21SAlex Elder 	return offset & (segment_size - 1);
85565ccfe21SAlex Elder }
85665ccfe21SAlex Elder 
85765ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev,
85865ccfe21SAlex Elder 				u64 offset, u64 length)
85965ccfe21SAlex Elder {
86065ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
86165ccfe21SAlex Elder 
86265ccfe21SAlex Elder 	offset &= segment_size - 1;
86365ccfe21SAlex Elder 
864aafb230eSAlex Elder 	rbd_assert(length <= U64_MAX - offset);
86565ccfe21SAlex Elder 	if (offset + length > segment_size)
86665ccfe21SAlex Elder 		length = segment_size - offset;
86765ccfe21SAlex Elder 
86865ccfe21SAlex Elder 	return length;
869602adf40SYehuda Sadeh }
870602adf40SYehuda Sadeh 
871602adf40SYehuda Sadeh /*
872029bcbd8SJosh Durgin  * returns the size of an object in the image
873029bcbd8SJosh Durgin  */
874029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header)
875029bcbd8SJosh Durgin {
876029bcbd8SJosh Durgin 	return 1 << header->obj_order;
877029bcbd8SJosh Durgin }
878029bcbd8SJosh Durgin 
879029bcbd8SJosh Durgin /*
880602adf40SYehuda Sadeh  * bio helpers
881602adf40SYehuda Sadeh  */
882602adf40SYehuda Sadeh 
883602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain)
884602adf40SYehuda Sadeh {
885602adf40SYehuda Sadeh 	struct bio *tmp;
886602adf40SYehuda Sadeh 
887602adf40SYehuda Sadeh 	while (chain) {
888602adf40SYehuda Sadeh 		tmp = chain;
889602adf40SYehuda Sadeh 		chain = chain->bi_next;
890602adf40SYehuda Sadeh 		bio_put(tmp);
891602adf40SYehuda Sadeh 	}
892602adf40SYehuda Sadeh }
893602adf40SYehuda Sadeh 
894602adf40SYehuda Sadeh /*
895602adf40SYehuda Sadeh  * zeros a bio chain, starting at specific offset
896602adf40SYehuda Sadeh  */
897602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs)
898602adf40SYehuda Sadeh {
899602adf40SYehuda Sadeh 	struct bio_vec *bv;
900602adf40SYehuda Sadeh 	unsigned long flags;
901602adf40SYehuda Sadeh 	void *buf;
902602adf40SYehuda Sadeh 	int i;
903602adf40SYehuda Sadeh 	int pos = 0;
904602adf40SYehuda Sadeh 
905602adf40SYehuda Sadeh 	while (chain) {
906602adf40SYehuda Sadeh 		bio_for_each_segment(bv, chain, i) {
907602adf40SYehuda Sadeh 			if (pos + bv->bv_len > start_ofs) {
908602adf40SYehuda Sadeh 				int remainder = max(start_ofs - pos, 0);
909602adf40SYehuda Sadeh 				buf = bvec_kmap_irq(bv, &flags);
910602adf40SYehuda Sadeh 				memset(buf + remainder, 0,
911602adf40SYehuda Sadeh 				       bv->bv_len - remainder);
91285b5aaa6SDan Carpenter 				bvec_kunmap_irq(buf, &flags);
913602adf40SYehuda Sadeh 			}
914602adf40SYehuda Sadeh 			pos += bv->bv_len;
915602adf40SYehuda Sadeh 		}
916602adf40SYehuda Sadeh 
917602adf40SYehuda Sadeh 		chain = chain->bi_next;
918602adf40SYehuda Sadeh 	}
919602adf40SYehuda Sadeh }
920602adf40SYehuda Sadeh 
921602adf40SYehuda Sadeh /*
922f7760dadSAlex Elder  * Clone a portion of a bio, starting at the given byte offset
923f7760dadSAlex Elder  * and continuing for the number of bytes indicated.
924602adf40SYehuda Sadeh  */
925f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src,
926f7760dadSAlex Elder 					unsigned int offset,
927f7760dadSAlex Elder 					unsigned int len,
928f7760dadSAlex Elder 					gfp_t gfpmask)
929602adf40SYehuda Sadeh {
930f7760dadSAlex Elder 	struct bio_vec *bv;
931f7760dadSAlex Elder 	unsigned int resid;
932f7760dadSAlex Elder 	unsigned short idx;
933f7760dadSAlex Elder 	unsigned int voff;
934f7760dadSAlex Elder 	unsigned short end_idx;
935f7760dadSAlex Elder 	unsigned short vcnt;
936f7760dadSAlex Elder 	struct bio *bio;
937602adf40SYehuda Sadeh 
938f7760dadSAlex Elder 	/* Handle the easy case for the caller */
939f7760dadSAlex Elder 
940f7760dadSAlex Elder 	if (!offset && len == bio_src->bi_size)
941f7760dadSAlex Elder 		return bio_clone(bio_src, gfpmask);
942f7760dadSAlex Elder 
943f7760dadSAlex Elder 	if (WARN_ON_ONCE(!len))
944f7760dadSAlex Elder 		return NULL;
945f7760dadSAlex Elder 	if (WARN_ON_ONCE(len > bio_src->bi_size))
946f7760dadSAlex Elder 		return NULL;
947f7760dadSAlex Elder 	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
948f7760dadSAlex Elder 		return NULL;
949f7760dadSAlex Elder 
950f7760dadSAlex Elder 	/* Find first affected segment... */
951f7760dadSAlex Elder 
952f7760dadSAlex Elder 	resid = offset;
953f7760dadSAlex Elder 	__bio_for_each_segment(bv, bio_src, idx, 0) {
954f7760dadSAlex Elder 		if (resid < bv->bv_len)
955f7760dadSAlex Elder 			break;
956f7760dadSAlex Elder 		resid -= bv->bv_len;
957602adf40SYehuda Sadeh 	}
958f7760dadSAlex Elder 	voff = resid;
959602adf40SYehuda Sadeh 
960f7760dadSAlex Elder 	/* ...and the last affected segment */
961542582fcSAlex Elder 
962f7760dadSAlex Elder 	resid += len;
963f7760dadSAlex Elder 	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
964f7760dadSAlex Elder 		if (resid <= bv->bv_len)
965f7760dadSAlex Elder 			break;
966f7760dadSAlex Elder 		resid -= bv->bv_len;
967f7760dadSAlex Elder 	}
968f7760dadSAlex Elder 	vcnt = end_idx - idx + 1;
969602adf40SYehuda Sadeh 
970f7760dadSAlex Elder 	/* Build the clone */
971f7760dadSAlex Elder 
972f7760dadSAlex Elder 	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
973f7760dadSAlex Elder 	if (!bio)
974f7760dadSAlex Elder 		return NULL;	/* ENOMEM */
975f7760dadSAlex Elder 
976f7760dadSAlex Elder 	bio->bi_bdev = bio_src->bi_bdev;
977f7760dadSAlex Elder 	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
978f7760dadSAlex Elder 	bio->bi_rw = bio_src->bi_rw;
979f7760dadSAlex Elder 	bio->bi_flags |= 1 << BIO_CLONED;
980602adf40SYehuda Sadeh 
981602adf40SYehuda Sadeh 	/*
982f7760dadSAlex Elder 	 * Copy over our part of the bio_vec, then update the first
983f7760dadSAlex Elder 	 * and last (or only) entries.
984602adf40SYehuda Sadeh 	 */
985f7760dadSAlex Elder 	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
986f7760dadSAlex Elder 			vcnt * sizeof (struct bio_vec));
987f7760dadSAlex Elder 	bio->bi_io_vec[0].bv_offset += voff;
988f7760dadSAlex Elder 	if (vcnt > 1) {
989f7760dadSAlex Elder 		bio->bi_io_vec[0].bv_len -= voff;
990f7760dadSAlex Elder 		bio->bi_io_vec[vcnt - 1].bv_len = resid;
991602adf40SYehuda Sadeh 	} else {
992f7760dadSAlex Elder 		bio->bi_io_vec[0].bv_len = len;
993602adf40SYehuda Sadeh 	}
994602adf40SYehuda Sadeh 
995f7760dadSAlex Elder 	bio->bi_vcnt = vcnt;
996f7760dadSAlex Elder 	bio->bi_size = len;
997f7760dadSAlex Elder 	bio->bi_idx = 0;
998602adf40SYehuda Sadeh 
999f7760dadSAlex Elder 	return bio;
1000602adf40SYehuda Sadeh }
1001602adf40SYehuda Sadeh 
1002f7760dadSAlex Elder /*
1003f7760dadSAlex Elder  * Clone a portion of a bio chain, starting at the given byte offset
1004f7760dadSAlex Elder  * into the first bio in the source chain and continuing for the
1005f7760dadSAlex Elder  * number of bytes indicated.  The result is another bio chain of
1006f7760dadSAlex Elder  * exactly the given length, or a null pointer on error.
1007f7760dadSAlex Elder  *
1008f7760dadSAlex Elder  * The bio_src and offset parameters are both in-out.  On entry they
1009f7760dadSAlex Elder  * refer to the first source bio and the offset into that bio where
1010f7760dadSAlex Elder  * the start of data to be cloned is located.
1011f7760dadSAlex Elder  *
1012f7760dadSAlex Elder  * On return, bio_src is updated to refer to the bio in the source
1013f7760dadSAlex Elder  * chain that contains first un-cloned byte, and *offset will
1014f7760dadSAlex Elder  * contain the offset of that byte within that bio.
1015f7760dadSAlex Elder  */
1016f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src,
1017f7760dadSAlex Elder 					unsigned int *offset,
1018f7760dadSAlex Elder 					unsigned int len,
1019f7760dadSAlex Elder 					gfp_t gfpmask)
1020f7760dadSAlex Elder {
1021f7760dadSAlex Elder 	struct bio *bi = *bio_src;
1022f7760dadSAlex Elder 	unsigned int off = *offset;
1023f7760dadSAlex Elder 	struct bio *chain = NULL;
1024f7760dadSAlex Elder 	struct bio **end;
1025602adf40SYehuda Sadeh 
1026f7760dadSAlex Elder 	/* Build up a chain of clone bios up to the limit */
1027602adf40SYehuda Sadeh 
1028f7760dadSAlex Elder 	if (!bi || off >= bi->bi_size || !len)
1029f7760dadSAlex Elder 		return NULL;		/* Nothing to clone */
1030602adf40SYehuda Sadeh 
1031f7760dadSAlex Elder 	end = &chain;
1032f7760dadSAlex Elder 	while (len) {
1033f7760dadSAlex Elder 		unsigned int bi_size;
1034f7760dadSAlex Elder 		struct bio *bio;
1035f7760dadSAlex Elder 
1036f5400b7aSAlex Elder 		if (!bi) {
1037f5400b7aSAlex Elder 			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
1038f7760dadSAlex Elder 			goto out_err;	/* EINVAL; ran out of bio's */
1039f5400b7aSAlex Elder 		}
1040f7760dadSAlex Elder 		bi_size = min_t(unsigned int, bi->bi_size - off, len);
1041f7760dadSAlex Elder 		bio = bio_clone_range(bi, off, bi_size, gfpmask);
1042f7760dadSAlex Elder 		if (!bio)
1043f7760dadSAlex Elder 			goto out_err;	/* ENOMEM */
1044f7760dadSAlex Elder 
1045f7760dadSAlex Elder 		*end = bio;
1046f7760dadSAlex Elder 		end = &bio->bi_next;
1047f7760dadSAlex Elder 
1048f7760dadSAlex Elder 		off += bi_size;
1049f7760dadSAlex Elder 		if (off == bi->bi_size) {
1050f7760dadSAlex Elder 			bi = bi->bi_next;
1051f7760dadSAlex Elder 			off = 0;
1052f7760dadSAlex Elder 		}
1053f7760dadSAlex Elder 		len -= bi_size;
1054f7760dadSAlex Elder 	}
1055f7760dadSAlex Elder 	*bio_src = bi;
1056f7760dadSAlex Elder 	*offset = off;
1057f7760dadSAlex Elder 
1058f7760dadSAlex Elder 	return chain;
1059f7760dadSAlex Elder out_err:
1060f7760dadSAlex Elder 	bio_chain_put(chain);
1061f7760dadSAlex Elder 
1062602adf40SYehuda Sadeh 	return NULL;
1063602adf40SYehuda Sadeh }
1064602adf40SYehuda Sadeh 
1065bf0d5f50SAlex Elder static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1066bf0d5f50SAlex Elder {
1067bf0d5f50SAlex Elder 	kref_get(&obj_request->kref);
1068bf0d5f50SAlex Elder }
1069bf0d5f50SAlex Elder 
1070bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref);
1071bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1072bf0d5f50SAlex Elder {
1073bf0d5f50SAlex Elder 	rbd_assert(obj_request != NULL);
1074bf0d5f50SAlex Elder 	kref_put(&obj_request->kref, rbd_obj_request_destroy);
1075bf0d5f50SAlex Elder }
1076bf0d5f50SAlex Elder 
1077bf0d5f50SAlex Elder static void rbd_img_request_get(struct rbd_img_request *img_request)
1078bf0d5f50SAlex Elder {
1079bf0d5f50SAlex Elder 	kref_get(&img_request->kref);
1080bf0d5f50SAlex Elder }
1081bf0d5f50SAlex Elder 
1082bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref);
1083bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request)
1084bf0d5f50SAlex Elder {
1085bf0d5f50SAlex Elder 	rbd_assert(img_request != NULL);
1086bf0d5f50SAlex Elder 	kref_put(&img_request->kref, rbd_img_request_destroy);
1087bf0d5f50SAlex Elder }
1088bf0d5f50SAlex Elder 
1089bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1090bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1091bf0d5f50SAlex Elder {
109225dcf954SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
109325dcf954SAlex Elder 
1094bf0d5f50SAlex Elder 	rbd_obj_request_get(obj_request);
1095bf0d5f50SAlex Elder 	obj_request->img_request = img_request;
109625dcf954SAlex Elder 	obj_request->which = img_request->obj_request_count;
1097bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
109825dcf954SAlex Elder 	img_request->obj_request_count++;
109925dcf954SAlex Elder 	list_add_tail(&obj_request->links, &img_request->obj_requests);
1100bf0d5f50SAlex Elder }
1101bf0d5f50SAlex Elder 
1102bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1103bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1104bf0d5f50SAlex Elder {
1105bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
110625dcf954SAlex Elder 
1107bf0d5f50SAlex Elder 	list_del(&obj_request->links);
110825dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
110925dcf954SAlex Elder 	img_request->obj_request_count--;
111025dcf954SAlex Elder 	rbd_assert(obj_request->which == img_request->obj_request_count);
111125dcf954SAlex Elder 	obj_request->which = BAD_WHICH;
1112bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == img_request);
1113bf0d5f50SAlex Elder 	obj_request->img_request = NULL;
111425dcf954SAlex Elder 	obj_request->callback = NULL;
1115bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
1116bf0d5f50SAlex Elder }
1117bf0d5f50SAlex Elder 
1118bf0d5f50SAlex Elder static bool obj_request_type_valid(enum obj_request_type type)
1119bf0d5f50SAlex Elder {
1120bf0d5f50SAlex Elder 	switch (type) {
11219969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
1122bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1123788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1124bf0d5f50SAlex Elder 		return true;
1125bf0d5f50SAlex Elder 	default:
1126bf0d5f50SAlex Elder 		return false;
1127bf0d5f50SAlex Elder 	}
1128bf0d5f50SAlex Elder }
1129bf0d5f50SAlex Elder 
11308d23bf29SAlex Elder struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
11318d23bf29SAlex Elder {
11328d23bf29SAlex Elder 	struct ceph_osd_req_op *op;
11338d23bf29SAlex Elder 	va_list args;
11342647ba38SAlex Elder 	size_t size;
11358d23bf29SAlex Elder 
11368d23bf29SAlex Elder 	op = kzalloc(sizeof (*op), GFP_NOIO);
11378d23bf29SAlex Elder 	if (!op)
11388d23bf29SAlex Elder 		return NULL;
11398d23bf29SAlex Elder 	op->op = opcode;
11408d23bf29SAlex Elder 	va_start(args, opcode);
11418d23bf29SAlex Elder 	switch (opcode) {
11428d23bf29SAlex Elder 	case CEPH_OSD_OP_READ:
11438d23bf29SAlex Elder 	case CEPH_OSD_OP_WRITE:
11448d23bf29SAlex Elder 		/* rbd_osd_req_op_create(READ, offset, length) */
11458d23bf29SAlex Elder 		/* rbd_osd_req_op_create(WRITE, offset, length) */
11468d23bf29SAlex Elder 		op->extent.offset = va_arg(args, u64);
11478d23bf29SAlex Elder 		op->extent.length = va_arg(args, u64);
11488d23bf29SAlex Elder 		if (opcode == CEPH_OSD_OP_WRITE)
11498d23bf29SAlex Elder 			op->payload_len = op->extent.length;
11508d23bf29SAlex Elder 		break;
1151fbfab539SAlex Elder 	case CEPH_OSD_OP_STAT:
1152fbfab539SAlex Elder 		break;
11532647ba38SAlex Elder 	case CEPH_OSD_OP_CALL:
11542647ba38SAlex Elder 		/* rbd_osd_req_op_create(CALL, class, method, data, datalen) */
11552647ba38SAlex Elder 		op->cls.class_name = va_arg(args, char *);
11562647ba38SAlex Elder 		size = strlen(op->cls.class_name);
11572647ba38SAlex Elder 		rbd_assert(size <= (size_t) U8_MAX);
11582647ba38SAlex Elder 		op->cls.class_len = size;
11592647ba38SAlex Elder 		op->payload_len = size;
11602647ba38SAlex Elder 
11612647ba38SAlex Elder 		op->cls.method_name = va_arg(args, char *);
11622647ba38SAlex Elder 		size = strlen(op->cls.method_name);
11632647ba38SAlex Elder 		rbd_assert(size <= (size_t) U8_MAX);
11642647ba38SAlex Elder 		op->cls.method_len = size;
11652647ba38SAlex Elder 		op->payload_len += size;
11662647ba38SAlex Elder 
11672647ba38SAlex Elder 		op->cls.argc = 0;
11682647ba38SAlex Elder 		op->cls.indata = va_arg(args, void *);
11692647ba38SAlex Elder 		size = va_arg(args, size_t);
11702647ba38SAlex Elder 		rbd_assert(size <= (size_t) U32_MAX);
11712647ba38SAlex Elder 		op->cls.indata_len = (u32) size;
11722647ba38SAlex Elder 		op->payload_len += size;
11732647ba38SAlex Elder 		break;
11745efea49aSAlex Elder 	case CEPH_OSD_OP_NOTIFY_ACK:
11755efea49aSAlex Elder 	case CEPH_OSD_OP_WATCH:
11765efea49aSAlex Elder 		/* rbd_osd_req_op_create(NOTIFY_ACK, cookie, version) */
11775efea49aSAlex Elder 		/* rbd_osd_req_op_create(WATCH, cookie, version, flag) */
11785efea49aSAlex Elder 		op->watch.cookie = va_arg(args, u64);
11795efea49aSAlex Elder 		op->watch.ver = va_arg(args, u64);
11805efea49aSAlex Elder 		op->watch.ver = cpu_to_le64(op->watch.ver);
11815efea49aSAlex Elder 		if (opcode == CEPH_OSD_OP_WATCH && va_arg(args, int))
11825efea49aSAlex Elder 			op->watch.flag = (u8) 1;
11835efea49aSAlex Elder 		break;
11848d23bf29SAlex Elder 	default:
11858d23bf29SAlex Elder 		rbd_warn(NULL, "unsupported opcode %hu\n", opcode);
11868d23bf29SAlex Elder 		kfree(op);
11878d23bf29SAlex Elder 		op = NULL;
11888d23bf29SAlex Elder 		break;
11898d23bf29SAlex Elder 	}
11908d23bf29SAlex Elder 	va_end(args);
11918d23bf29SAlex Elder 
11928d23bf29SAlex Elder 	return op;
11938d23bf29SAlex Elder }
11948d23bf29SAlex Elder 
/*
 * Free an op created by rbd_osd_req_op_create().  Only the op itself
 * is freed, not anything it points to.
 */
static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op)
{
	kfree(op);
}
11998d23bf29SAlex Elder 
1200bf0d5f50SAlex Elder static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1201bf0d5f50SAlex Elder 				struct rbd_obj_request *obj_request)
1202bf0d5f50SAlex Elder {
1203bf0d5f50SAlex Elder 	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1204bf0d5f50SAlex Elder }
1205bf0d5f50SAlex Elder 
1206bf0d5f50SAlex Elder static void rbd_img_request_complete(struct rbd_img_request *img_request)
1207bf0d5f50SAlex Elder {
1208bf0d5f50SAlex Elder 	if (img_request->callback)
1209bf0d5f50SAlex Elder 		img_request->callback(img_request);
1210bf0d5f50SAlex Elder 	else
1211bf0d5f50SAlex Elder 		rbd_img_request_put(img_request);
1212bf0d5f50SAlex Elder }
1213bf0d5f50SAlex Elder 
1214788e2df3SAlex Elder /* Caller is responsible for rbd_obj_request_destroy(obj_request) */
1215788e2df3SAlex Elder 
1216788e2df3SAlex Elder static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
1217788e2df3SAlex Elder {
1218788e2df3SAlex Elder 	return wait_for_completion_interruptible(&obj_request->completion);
1219788e2df3SAlex Elder }
1220788e2df3SAlex Elder 
122107741308SAlex Elder static void obj_request_done_init(struct rbd_obj_request *obj_request)
122207741308SAlex Elder {
122307741308SAlex Elder 	atomic_set(&obj_request->done, 0);
122407741308SAlex Elder 	smp_wmb();
122507741308SAlex Elder }
122607741308SAlex Elder 
122707741308SAlex Elder static void obj_request_done_set(struct rbd_obj_request *obj_request)
122807741308SAlex Elder {
122907741308SAlex Elder 	atomic_set(&obj_request->done, 1);
123007741308SAlex Elder 	smp_wmb();
123107741308SAlex Elder }
123207741308SAlex Elder 
123307741308SAlex Elder static bool obj_request_done_test(struct rbd_obj_request *obj_request)
123407741308SAlex Elder {
123507741308SAlex Elder 	smp_rmb();
123607741308SAlex Elder 	return atomic_read(&obj_request->done) != 0;
123707741308SAlex Elder }
123807741308SAlex Elder 
/*
 * Completion handling for ops that need nothing beyond being marked
 * done; the op itself is not looked at.
 */
static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request,
				struct ceph_osd_op *op)
{
	/* "op" is intentionally unused */
	obj_request_done_set(obj_request);
}
12449969ebc5SAlex Elder 
1245bf0d5f50SAlex Elder static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1246bf0d5f50SAlex Elder {
1247bf0d5f50SAlex Elder 	if (obj_request->callback)
1248bf0d5f50SAlex Elder 		obj_request->callback(obj_request);
1249788e2df3SAlex Elder 	else
1250788e2df3SAlex Elder 		complete_all(&obj_request->completion);
1251bf0d5f50SAlex Elder }
1252bf0d5f50SAlex Elder 
1253bf0d5f50SAlex Elder static void rbd_osd_read_callback(struct rbd_obj_request *obj_request,
1254bf0d5f50SAlex Elder 				struct ceph_osd_op *op)
1255bf0d5f50SAlex Elder {
1256bf0d5f50SAlex Elder 	u64 xferred;
1257bf0d5f50SAlex Elder 
1258bf0d5f50SAlex Elder 	/*
1259bf0d5f50SAlex Elder 	 * We support a 64-bit length, but ultimately it has to be
1260bf0d5f50SAlex Elder 	 * passed to blk_end_request(), which takes an unsigned int.
1261bf0d5f50SAlex Elder 	 */
1262bf0d5f50SAlex Elder 	xferred = le64_to_cpu(op->extent.length);
1263bf0d5f50SAlex Elder 	rbd_assert(xferred < (u64) UINT_MAX);
1264bf0d5f50SAlex Elder 	if (obj_request->result == (s32) -ENOENT) {
1265bf0d5f50SAlex Elder 		zero_bio_chain(obj_request->bio_list, 0);
1266bf0d5f50SAlex Elder 		obj_request->result = 0;
1267bf0d5f50SAlex Elder 	} else if (xferred < obj_request->length && !obj_request->result) {
1268bf0d5f50SAlex Elder 		zero_bio_chain(obj_request->bio_list, xferred);
1269bf0d5f50SAlex Elder 		xferred = obj_request->length;
1270bf0d5f50SAlex Elder 	}
1271bf0d5f50SAlex Elder 	obj_request->xferred = xferred;
127207741308SAlex Elder 	obj_request_done_set(obj_request);
1273bf0d5f50SAlex Elder }
1274bf0d5f50SAlex Elder 
1275bf0d5f50SAlex Elder static void rbd_osd_write_callback(struct rbd_obj_request *obj_request,
1276bf0d5f50SAlex Elder 				struct ceph_osd_op *op)
1277bf0d5f50SAlex Elder {
1278bf0d5f50SAlex Elder 	obj_request->xferred = le64_to_cpu(op->extent.length);
127907741308SAlex Elder 	obj_request_done_set(obj_request);
1280bf0d5f50SAlex Elder }
1281bf0d5f50SAlex Elder 
/*
 * For a simple stat call there's nothing to do beyond marking the
 * request done.  We'll do more if this is part of a write sequence
 * for a layered image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request,
				struct ceph_osd_op *op)
{
	/* "op" is not examined (yet) */
	obj_request_done_set(obj_request);
}
1291fbfab539SAlex Elder 
1292bf0d5f50SAlex Elder static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1293bf0d5f50SAlex Elder 				struct ceph_msg *msg)
1294bf0d5f50SAlex Elder {
1295bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = osd_req->r_priv;
1296bf0d5f50SAlex Elder 	struct ceph_osd_reply_head *reply_head;
1297bf0d5f50SAlex Elder 	struct ceph_osd_op *op;
1298bf0d5f50SAlex Elder 	u32 num_ops;
1299bf0d5f50SAlex Elder 	u16 opcode;
1300bf0d5f50SAlex Elder 
1301bf0d5f50SAlex Elder 	rbd_assert(osd_req == obj_request->osd_req);
1302bf0d5f50SAlex Elder 	rbd_assert(!!obj_request->img_request ^
1303bf0d5f50SAlex Elder 				(obj_request->which == BAD_WHICH));
1304bf0d5f50SAlex Elder 
1305bf0d5f50SAlex Elder 	obj_request->xferred = le32_to_cpu(msg->hdr.data_len);
1306bf0d5f50SAlex Elder 	reply_head = msg->front.iov_base;
1307bf0d5f50SAlex Elder 	obj_request->result = (s32) le32_to_cpu(reply_head->result);
1308bf0d5f50SAlex Elder 	obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);
1309bf0d5f50SAlex Elder 
1310bf0d5f50SAlex Elder 	num_ops = le32_to_cpu(reply_head->num_ops);
1311bf0d5f50SAlex Elder 	WARN_ON(num_ops != 1);	/* For now */
1312bf0d5f50SAlex Elder 
1313bf0d5f50SAlex Elder 	op = &reply_head->ops[0];
1314bf0d5f50SAlex Elder 	opcode = le16_to_cpu(op->op);
1315bf0d5f50SAlex Elder 	switch (opcode) {
1316bf0d5f50SAlex Elder 	case CEPH_OSD_OP_READ:
1317bf0d5f50SAlex Elder 		rbd_osd_read_callback(obj_request, op);
1318bf0d5f50SAlex Elder 		break;
1319bf0d5f50SAlex Elder 	case CEPH_OSD_OP_WRITE:
1320bf0d5f50SAlex Elder 		rbd_osd_write_callback(obj_request, op);
1321bf0d5f50SAlex Elder 		break;
1322fbfab539SAlex Elder 	case CEPH_OSD_OP_STAT:
1323fbfab539SAlex Elder 		rbd_osd_stat_callback(obj_request, op);
1324fbfab539SAlex Elder 		break;
132536be9a76SAlex Elder 	case CEPH_OSD_OP_CALL:
1326b8d70035SAlex Elder 	case CEPH_OSD_OP_NOTIFY_ACK:
13279969ebc5SAlex Elder 	case CEPH_OSD_OP_WATCH:
13289969ebc5SAlex Elder 		rbd_osd_trivial_callback(obj_request, op);
13299969ebc5SAlex Elder 		break;
1330bf0d5f50SAlex Elder 	default:
1331bf0d5f50SAlex Elder 		rbd_warn(NULL, "%s: unsupported op %hu\n",
1332bf0d5f50SAlex Elder 			obj_request->object_name, (unsigned short) opcode);
1333bf0d5f50SAlex Elder 		break;
1334bf0d5f50SAlex Elder 	}
1335bf0d5f50SAlex Elder 
133607741308SAlex Elder 	if (obj_request_done_test(obj_request))
1337bf0d5f50SAlex Elder 		rbd_obj_request_complete(obj_request);
1338bf0d5f50SAlex Elder }
1339bf0d5f50SAlex Elder 
1340bf0d5f50SAlex Elder static struct ceph_osd_request *rbd_osd_req_create(
1341bf0d5f50SAlex Elder 					struct rbd_device *rbd_dev,
1342bf0d5f50SAlex Elder 					bool write_request,
1343bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request,
1344bf0d5f50SAlex Elder 					struct ceph_osd_req_op *op)
1345bf0d5f50SAlex Elder {
1346bf0d5f50SAlex Elder 	struct rbd_img_request *img_request = obj_request->img_request;
1347bf0d5f50SAlex Elder 	struct ceph_snap_context *snapc = NULL;
1348bf0d5f50SAlex Elder 	struct ceph_osd_client *osdc;
1349bf0d5f50SAlex Elder 	struct ceph_osd_request *osd_req;
1350bf0d5f50SAlex Elder 	struct timespec now;
1351bf0d5f50SAlex Elder 	struct timespec *mtime;
1352bf0d5f50SAlex Elder 	u64 snap_id = CEPH_NOSNAP;
1353bf0d5f50SAlex Elder 	u64 offset = obj_request->offset;
1354bf0d5f50SAlex Elder 	u64 length = obj_request->length;
1355bf0d5f50SAlex Elder 
1356bf0d5f50SAlex Elder 	if (img_request) {
1357bf0d5f50SAlex Elder 		rbd_assert(img_request->write_request == write_request);
1358bf0d5f50SAlex Elder 		if (img_request->write_request)
1359bf0d5f50SAlex Elder 			snapc = img_request->snapc;
1360bf0d5f50SAlex Elder 		else
1361bf0d5f50SAlex Elder 			snap_id = img_request->snap_id;
1362bf0d5f50SAlex Elder 	}
1363bf0d5f50SAlex Elder 
1364bf0d5f50SAlex Elder 	/* Allocate and initialize the request, for the single op */
1365bf0d5f50SAlex Elder 
1366bf0d5f50SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
1367bf0d5f50SAlex Elder 	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
1368bf0d5f50SAlex Elder 	if (!osd_req)
1369bf0d5f50SAlex Elder 		return NULL;	/* ENOMEM */
1370bf0d5f50SAlex Elder 
1371bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
1372bf0d5f50SAlex Elder 	switch (obj_request->type) {
13739969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
13749969ebc5SAlex Elder 		break;		/* Nothing to do */
1375bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1376bf0d5f50SAlex Elder 		rbd_assert(obj_request->bio_list != NULL);
1377bf0d5f50SAlex Elder 		osd_req->r_bio = obj_request->bio_list;
1378bf0d5f50SAlex Elder 		break;
1379788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1380788e2df3SAlex Elder 		osd_req->r_pages = obj_request->pages;
1381788e2df3SAlex Elder 		osd_req->r_num_pages = obj_request->page_count;
1382788e2df3SAlex Elder 		osd_req->r_page_alignment = offset & ~PAGE_MASK;
1383788e2df3SAlex Elder 		break;
1384bf0d5f50SAlex Elder 	}
1385bf0d5f50SAlex Elder 
1386bf0d5f50SAlex Elder 	if (write_request) {
1387bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1388bf0d5f50SAlex Elder 		now = CURRENT_TIME;
1389bf0d5f50SAlex Elder 		mtime = &now;
1390bf0d5f50SAlex Elder 	} else {
1391bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_READ;
1392bf0d5f50SAlex Elder 		mtime = NULL;	/* not needed for reads */
1393bf0d5f50SAlex Elder 		offset = 0;	/* These are not used... */
1394bf0d5f50SAlex Elder 		length = 0;	/* ...for osd read requests */
1395bf0d5f50SAlex Elder 	}
1396bf0d5f50SAlex Elder 
1397bf0d5f50SAlex Elder 	osd_req->r_callback = rbd_osd_req_callback;
1398bf0d5f50SAlex Elder 	osd_req->r_priv = obj_request;
1399bf0d5f50SAlex Elder 
1400bf0d5f50SAlex Elder 	osd_req->r_oid_len = strlen(obj_request->object_name);
1401bf0d5f50SAlex Elder 	rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1402bf0d5f50SAlex Elder 	memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1403bf0d5f50SAlex Elder 
1404bf0d5f50SAlex Elder 	osd_req->r_file_layout = rbd_dev->layout;	/* struct */
1405bf0d5f50SAlex Elder 
1406bf0d5f50SAlex Elder 	/* osd_req will get its own reference to snapc (if non-null) */
1407bf0d5f50SAlex Elder 
1408bf0d5f50SAlex Elder 	ceph_osdc_build_request(osd_req, offset, length, 1, op,
1409bf0d5f50SAlex Elder 				snapc, snap_id, mtime);
1410bf0d5f50SAlex Elder 
1411bf0d5f50SAlex Elder 	return osd_req;
1412bf0d5f50SAlex Elder }
1413bf0d5f50SAlex Elder 
/* Drop a reference to an osd request obtained via rbd_osd_req_create() */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
1418bf0d5f50SAlex Elder 
1419bf0d5f50SAlex Elder /* object_name is assumed to be a non-null pointer and NUL-terminated */
1420bf0d5f50SAlex Elder 
1421bf0d5f50SAlex Elder static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1422bf0d5f50SAlex Elder 						u64 offset, u64 length,
1423bf0d5f50SAlex Elder 						enum obj_request_type type)
1424bf0d5f50SAlex Elder {
1425bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1426bf0d5f50SAlex Elder 	size_t size;
1427bf0d5f50SAlex Elder 	char *name;
1428bf0d5f50SAlex Elder 
1429bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(type));
1430bf0d5f50SAlex Elder 
1431bf0d5f50SAlex Elder 	size = strlen(object_name) + 1;
1432bf0d5f50SAlex Elder 	obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
1433bf0d5f50SAlex Elder 	if (!obj_request)
1434bf0d5f50SAlex Elder 		return NULL;
1435bf0d5f50SAlex Elder 
1436bf0d5f50SAlex Elder 	name = (char *)(obj_request + 1);
1437bf0d5f50SAlex Elder 	obj_request->object_name = memcpy(name, object_name, size);
1438bf0d5f50SAlex Elder 	obj_request->offset = offset;
1439bf0d5f50SAlex Elder 	obj_request->length = length;
1440bf0d5f50SAlex Elder 	obj_request->which = BAD_WHICH;
1441bf0d5f50SAlex Elder 	obj_request->type = type;
1442bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&obj_request->links);
144307741308SAlex Elder 	obj_request_done_init(obj_request);
1444788e2df3SAlex Elder 	init_completion(&obj_request->completion);
1445bf0d5f50SAlex Elder 	kref_init(&obj_request->kref);
1446bf0d5f50SAlex Elder 
1447bf0d5f50SAlex Elder 	return obj_request;
1448bf0d5f50SAlex Elder }
1449bf0d5f50SAlex Elder 
1450bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref)
1451bf0d5f50SAlex Elder {
1452bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1453bf0d5f50SAlex Elder 
1454bf0d5f50SAlex Elder 	obj_request = container_of(kref, struct rbd_obj_request, kref);
1455bf0d5f50SAlex Elder 
1456bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
1457bf0d5f50SAlex Elder 	rbd_assert(obj_request->which == BAD_WHICH);
1458bf0d5f50SAlex Elder 
1459bf0d5f50SAlex Elder 	if (obj_request->osd_req)
1460bf0d5f50SAlex Elder 		rbd_osd_req_destroy(obj_request->osd_req);
1461bf0d5f50SAlex Elder 
1462bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
1463bf0d5f50SAlex Elder 	switch (obj_request->type) {
14649969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
14659969ebc5SAlex Elder 		break;		/* Nothing to do */
1466bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1467bf0d5f50SAlex Elder 		if (obj_request->bio_list)
1468bf0d5f50SAlex Elder 			bio_chain_put(obj_request->bio_list);
1469bf0d5f50SAlex Elder 		break;
1470788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1471788e2df3SAlex Elder 		if (obj_request->pages)
1472788e2df3SAlex Elder 			ceph_release_page_vector(obj_request->pages,
1473788e2df3SAlex Elder 						obj_request->page_count);
1474788e2df3SAlex Elder 		break;
1475bf0d5f50SAlex Elder 	}
1476bf0d5f50SAlex Elder 
1477bf0d5f50SAlex Elder 	kfree(obj_request);
1478bf0d5f50SAlex Elder }
1479bf0d5f50SAlex Elder 
1480bf0d5f50SAlex Elder /*
1481bf0d5f50SAlex Elder  * Caller is responsible for filling in the list of object requests
1482bf0d5f50SAlex Elder  * that comprises the image request, and the Linux request pointer
1483bf0d5f50SAlex Elder  * (if there is one).
1484bf0d5f50SAlex Elder  */
1485bf0d5f50SAlex Elder struct rbd_img_request *rbd_img_request_create(struct rbd_device *rbd_dev,
1486bf0d5f50SAlex Elder 					u64 offset, u64 length,
1487bf0d5f50SAlex Elder 					bool write_request)
1488bf0d5f50SAlex Elder {
1489bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
1490bf0d5f50SAlex Elder 	struct ceph_snap_context *snapc = NULL;
1491bf0d5f50SAlex Elder 
1492bf0d5f50SAlex Elder 	img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
1493bf0d5f50SAlex Elder 	if (!img_request)
1494bf0d5f50SAlex Elder 		return NULL;
1495bf0d5f50SAlex Elder 
1496bf0d5f50SAlex Elder 	if (write_request) {
1497bf0d5f50SAlex Elder 		down_read(&rbd_dev->header_rwsem);
1498bf0d5f50SAlex Elder 		snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1499bf0d5f50SAlex Elder 		up_read(&rbd_dev->header_rwsem);
1500bf0d5f50SAlex Elder 		if (WARN_ON(!snapc)) {
1501bf0d5f50SAlex Elder 			kfree(img_request);
1502bf0d5f50SAlex Elder 			return NULL;	/* Shouldn't happen */
1503bf0d5f50SAlex Elder 		}
1504bf0d5f50SAlex Elder 	}
1505bf0d5f50SAlex Elder 
1506bf0d5f50SAlex Elder 	img_request->rq = NULL;
1507bf0d5f50SAlex Elder 	img_request->rbd_dev = rbd_dev;
1508bf0d5f50SAlex Elder 	img_request->offset = offset;
1509bf0d5f50SAlex Elder 	img_request->length = length;
1510bf0d5f50SAlex Elder 	img_request->write_request = write_request;
1511bf0d5f50SAlex Elder 	if (write_request)
1512bf0d5f50SAlex Elder 		img_request->snapc = snapc;
1513bf0d5f50SAlex Elder 	else
1514bf0d5f50SAlex Elder 		img_request->snap_id = rbd_dev->spec->snap_id;
1515bf0d5f50SAlex Elder 	spin_lock_init(&img_request->completion_lock);
1516bf0d5f50SAlex Elder 	img_request->next_completion = 0;
1517bf0d5f50SAlex Elder 	img_request->callback = NULL;
1518bf0d5f50SAlex Elder 	img_request->obj_request_count = 0;
1519bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&img_request->obj_requests);
1520bf0d5f50SAlex Elder 	kref_init(&img_request->kref);
1521bf0d5f50SAlex Elder 
1522bf0d5f50SAlex Elder 	rbd_img_request_get(img_request);	/* Avoid a warning */
1523bf0d5f50SAlex Elder 	rbd_img_request_put(img_request);	/* TEMPORARY */
1524bf0d5f50SAlex Elder 
1525bf0d5f50SAlex Elder 	return img_request;
1526bf0d5f50SAlex Elder }
1527bf0d5f50SAlex Elder 
1528bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref)
1529bf0d5f50SAlex Elder {
1530bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
1531bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1532bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
1533bf0d5f50SAlex Elder 
1534bf0d5f50SAlex Elder 	img_request = container_of(kref, struct rbd_img_request, kref);
1535bf0d5f50SAlex Elder 
1536bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1537bf0d5f50SAlex Elder 		rbd_img_obj_request_del(img_request, obj_request);
153825dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count == 0);
1539bf0d5f50SAlex Elder 
1540bf0d5f50SAlex Elder 	if (img_request->write_request)
1541bf0d5f50SAlex Elder 		ceph_put_snap_context(img_request->snapc);
1542bf0d5f50SAlex Elder 
1543bf0d5f50SAlex Elder 	kfree(img_request);
1544bf0d5f50SAlex Elder }
1545bf0d5f50SAlex Elder 
1546bf0d5f50SAlex Elder static int rbd_img_request_fill_bio(struct rbd_img_request *img_request,
1547bf0d5f50SAlex Elder 					struct bio *bio_list)
1548bf0d5f50SAlex Elder {
1549bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = img_request->rbd_dev;
1550bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = NULL;
1551bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
1552bf0d5f50SAlex Elder 	unsigned int bio_offset;
1553bf0d5f50SAlex Elder 	u64 image_offset;
1554bf0d5f50SAlex Elder 	u64 resid;
1555bf0d5f50SAlex Elder 	u16 opcode;
1556bf0d5f50SAlex Elder 
1557bf0d5f50SAlex Elder 	opcode = img_request->write_request ? CEPH_OSD_OP_WRITE
1558bf0d5f50SAlex Elder 					      : CEPH_OSD_OP_READ;
1559bf0d5f50SAlex Elder 	bio_offset = 0;
1560bf0d5f50SAlex Elder 	image_offset = img_request->offset;
1561bf0d5f50SAlex Elder 	rbd_assert(image_offset == bio_list->bi_sector << SECTOR_SHIFT);
1562bf0d5f50SAlex Elder 	resid = img_request->length;
1563bf0d5f50SAlex Elder 	while (resid) {
1564bf0d5f50SAlex Elder 		const char *object_name;
1565bf0d5f50SAlex Elder 		unsigned int clone_size;
1566bf0d5f50SAlex Elder 		struct ceph_osd_req_op *op;
1567bf0d5f50SAlex Elder 		u64 offset;
1568bf0d5f50SAlex Elder 		u64 length;
1569bf0d5f50SAlex Elder 
1570bf0d5f50SAlex Elder 		object_name = rbd_segment_name(rbd_dev, image_offset);
1571bf0d5f50SAlex Elder 		if (!object_name)
1572bf0d5f50SAlex Elder 			goto out_unwind;
1573bf0d5f50SAlex Elder 		offset = rbd_segment_offset(rbd_dev, image_offset);
1574bf0d5f50SAlex Elder 		length = rbd_segment_length(rbd_dev, image_offset, resid);
1575bf0d5f50SAlex Elder 		obj_request = rbd_obj_request_create(object_name,
1576bf0d5f50SAlex Elder 						offset, length,
1577bf0d5f50SAlex Elder 						OBJ_REQUEST_BIO);
1578bf0d5f50SAlex Elder 		kfree(object_name);	/* object request has its own copy */
1579bf0d5f50SAlex Elder 		if (!obj_request)
1580bf0d5f50SAlex Elder 			goto out_unwind;
1581bf0d5f50SAlex Elder 
1582bf0d5f50SAlex Elder 		rbd_assert(length <= (u64) UINT_MAX);
1583bf0d5f50SAlex Elder 		clone_size = (unsigned int) length;
1584bf0d5f50SAlex Elder 		obj_request->bio_list = bio_chain_clone_range(&bio_list,
1585bf0d5f50SAlex Elder 						&bio_offset, clone_size,
1586bf0d5f50SAlex Elder 						GFP_ATOMIC);
1587bf0d5f50SAlex Elder 		if (!obj_request->bio_list)
1588bf0d5f50SAlex Elder 			goto out_partial;
1589bf0d5f50SAlex Elder 
1590bf0d5f50SAlex Elder 		/*
1591bf0d5f50SAlex Elder 		 * Build up the op to use in building the osd
1592bf0d5f50SAlex Elder 		 * request.  Note that the contents of the op are
1593bf0d5f50SAlex Elder 		 * copied by rbd_osd_req_create().
1594bf0d5f50SAlex Elder 		 */
1595bf0d5f50SAlex Elder 		op = rbd_osd_req_op_create(opcode, offset, length);
1596bf0d5f50SAlex Elder 		if (!op)
1597bf0d5f50SAlex Elder 			goto out_partial;
1598bf0d5f50SAlex Elder 		obj_request->osd_req = rbd_osd_req_create(rbd_dev,
1599bf0d5f50SAlex Elder 						img_request->write_request,
1600bf0d5f50SAlex Elder 						obj_request, op);
1601bf0d5f50SAlex Elder 		rbd_osd_req_op_destroy(op);
1602bf0d5f50SAlex Elder 		if (!obj_request->osd_req)
1603bf0d5f50SAlex Elder 			goto out_partial;
1604bf0d5f50SAlex Elder 		/* status and version are initially zero-filled */
1605bf0d5f50SAlex Elder 
1606bf0d5f50SAlex Elder 		rbd_img_obj_request_add(img_request, obj_request);
1607bf0d5f50SAlex Elder 
1608bf0d5f50SAlex Elder 		image_offset += length;
1609bf0d5f50SAlex Elder 		resid -= length;
1610bf0d5f50SAlex Elder 	}
1611bf0d5f50SAlex Elder 
1612bf0d5f50SAlex Elder 	return 0;
1613bf0d5f50SAlex Elder 
1614bf0d5f50SAlex Elder out_partial:
1615bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
1616bf0d5f50SAlex Elder out_unwind:
1617bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1618bf0d5f50SAlex Elder 		rbd_obj_request_put(obj_request);
1619bf0d5f50SAlex Elder 
1620bf0d5f50SAlex Elder 	return -ENOMEM;
1621bf0d5f50SAlex Elder }
1622bf0d5f50SAlex Elder 
1623bf0d5f50SAlex Elder static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
1624bf0d5f50SAlex Elder {
1625bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
1626bf0d5f50SAlex Elder 	u32 which = obj_request->which;
1627bf0d5f50SAlex Elder 	bool more = true;
1628bf0d5f50SAlex Elder 
1629bf0d5f50SAlex Elder 	img_request = obj_request->img_request;
1630bf0d5f50SAlex Elder 	rbd_assert(img_request != NULL);
1631bf0d5f50SAlex Elder 	rbd_assert(img_request->rq != NULL);
1632bf0d5f50SAlex Elder 	rbd_assert(which != BAD_WHICH);
1633bf0d5f50SAlex Elder 	rbd_assert(which < img_request->obj_request_count);
1634bf0d5f50SAlex Elder 	rbd_assert(which >= img_request->next_completion);
1635bf0d5f50SAlex Elder 
1636bf0d5f50SAlex Elder 	spin_lock_irq(&img_request->completion_lock);
1637bf0d5f50SAlex Elder 	if (which != img_request->next_completion)
1638bf0d5f50SAlex Elder 		goto out;
1639bf0d5f50SAlex Elder 
1640bf0d5f50SAlex Elder 	for_each_obj_request_from(img_request, obj_request) {
1641bf0d5f50SAlex Elder 		unsigned int xferred;
1642bf0d5f50SAlex Elder 		int result;
1643bf0d5f50SAlex Elder 
1644bf0d5f50SAlex Elder 		rbd_assert(more);
1645bf0d5f50SAlex Elder 		rbd_assert(which < img_request->obj_request_count);
1646bf0d5f50SAlex Elder 
164707741308SAlex Elder 		if (!obj_request_done_test(obj_request))
1648bf0d5f50SAlex Elder 			break;
1649bf0d5f50SAlex Elder 
1650bf0d5f50SAlex Elder 		rbd_assert(obj_request->xferred <= (u64) UINT_MAX);
1651bf0d5f50SAlex Elder 		xferred = (unsigned int) obj_request->xferred;
1652bf0d5f50SAlex Elder 		result = (int) obj_request->result;
1653bf0d5f50SAlex Elder 		if (result)
1654bf0d5f50SAlex Elder 			rbd_warn(NULL, "obj_request %s result %d xferred %u\n",
1655bf0d5f50SAlex Elder 				img_request->write_request ? "write" : "read",
1656bf0d5f50SAlex Elder 				result, xferred);
1657bf0d5f50SAlex Elder 
1658bf0d5f50SAlex Elder 		more = blk_end_request(img_request->rq, result, xferred);
1659bf0d5f50SAlex Elder 		which++;
1660bf0d5f50SAlex Elder 	}
1661bf0d5f50SAlex Elder 	rbd_assert(more ^ (which == img_request->obj_request_count));
1662bf0d5f50SAlex Elder 	img_request->next_completion = which;
1663bf0d5f50SAlex Elder out:
1664bf0d5f50SAlex Elder 	spin_unlock_irq(&img_request->completion_lock);
1665bf0d5f50SAlex Elder 
1666bf0d5f50SAlex Elder 	if (!more)
1667bf0d5f50SAlex Elder 		rbd_img_request_complete(img_request);
1668bf0d5f50SAlex Elder }
1669bf0d5f50SAlex Elder 
1670bf0d5f50SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request)
1671bf0d5f50SAlex Elder {
1672bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = img_request->rbd_dev;
1673bf0d5f50SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1674bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1675bf0d5f50SAlex Elder 
1676bf0d5f50SAlex Elder 	for_each_obj_request(img_request, obj_request) {
1677bf0d5f50SAlex Elder 		int ret;
1678bf0d5f50SAlex Elder 
1679bf0d5f50SAlex Elder 		obj_request->callback = rbd_img_obj_callback;
1680bf0d5f50SAlex Elder 		ret = rbd_obj_request_submit(osdc, obj_request);
1681bf0d5f50SAlex Elder 		if (ret)
1682bf0d5f50SAlex Elder 			return ret;
1683bf0d5f50SAlex Elder 		/*
1684bf0d5f50SAlex Elder 		 * The image request has its own reference to each
1685bf0d5f50SAlex Elder 		 * of its object requests, so we can safely drop the
1686bf0d5f50SAlex Elder 		 * initial one here.
1687bf0d5f50SAlex Elder 		 */
1688bf0d5f50SAlex Elder 		rbd_obj_request_put(obj_request);
1689bf0d5f50SAlex Elder 	}
1690bf0d5f50SAlex Elder 
1691bf0d5f50SAlex Elder 	return 0;
1692bf0d5f50SAlex Elder }
1693bf0d5f50SAlex Elder 
1694cf81b60eSAlex Elder static int rbd_obj_notify_ack(struct rbd_device *rbd_dev,
1695b8d70035SAlex Elder 				   u64 ver, u64 notify_id)
1696b8d70035SAlex Elder {
1697b8d70035SAlex Elder 	struct rbd_obj_request *obj_request;
1698b8d70035SAlex Elder 	struct ceph_osd_req_op *op;
1699b8d70035SAlex Elder 	struct ceph_osd_client *osdc;
1700b8d70035SAlex Elder 	int ret;
1701b8d70035SAlex Elder 
1702b8d70035SAlex Elder 	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
1703b8d70035SAlex Elder 							OBJ_REQUEST_NODATA);
1704b8d70035SAlex Elder 	if (!obj_request)
1705b8d70035SAlex Elder 		return -ENOMEM;
1706b8d70035SAlex Elder 
1707b8d70035SAlex Elder 	ret = -ENOMEM;
1708b8d70035SAlex Elder 	op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver);
1709b8d70035SAlex Elder 	if (!op)
1710b8d70035SAlex Elder 		goto out;
1711b8d70035SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
1712b8d70035SAlex Elder 						obj_request, op);
1713b8d70035SAlex Elder 	rbd_osd_req_op_destroy(op);
1714b8d70035SAlex Elder 	if (!obj_request->osd_req)
1715b8d70035SAlex Elder 		goto out;
1716b8d70035SAlex Elder 
1717b8d70035SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
1718cf81b60eSAlex Elder 	obj_request->callback = rbd_obj_request_put;
1719b8d70035SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
1720b8d70035SAlex Elder out:
1721cf81b60eSAlex Elder 	if (ret)
1722b8d70035SAlex Elder 		rbd_obj_request_put(obj_request);
1723b8d70035SAlex Elder 
1724b8d70035SAlex Elder 	return ret;
1725b8d70035SAlex Elder }
1726b8d70035SAlex Elder 
1727b8d70035SAlex Elder static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1728b8d70035SAlex Elder {
1729b8d70035SAlex Elder 	struct rbd_device *rbd_dev = (struct rbd_device *)data;
1730b8d70035SAlex Elder 	u64 hver;
1731b8d70035SAlex Elder 	int rc;
1732b8d70035SAlex Elder 
1733b8d70035SAlex Elder 	if (!rbd_dev)
1734b8d70035SAlex Elder 		return;
1735b8d70035SAlex Elder 
1736b8d70035SAlex Elder 	dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1737b8d70035SAlex Elder 		rbd_dev->header_name, (unsigned long long) notify_id,
1738b8d70035SAlex Elder 		(unsigned int) opcode);
1739b8d70035SAlex Elder 	rc = rbd_dev_refresh(rbd_dev, &hver);
1740b8d70035SAlex Elder 	if (rc)
1741b8d70035SAlex Elder 		rbd_warn(rbd_dev, "got notification but failed to "
1742b8d70035SAlex Elder 			   " update snaps: %d\n", rc);
1743b8d70035SAlex Elder 
1744cf81b60eSAlex Elder 	rbd_obj_notify_ack(rbd_dev, hver, notify_id);
1745b8d70035SAlex Elder }
1746b8d70035SAlex Elder 
17479969ebc5SAlex Elder /*
17489969ebc5SAlex Elder  * Request sync osd watch/unwatch.  The value of "start" determines
17499969ebc5SAlex Elder  * whether a watch request is being initiated or torn down.
17509969ebc5SAlex Elder  */
17519969ebc5SAlex Elder static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
17529969ebc5SAlex Elder {
17539969ebc5SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
17549969ebc5SAlex Elder 	struct rbd_obj_request *obj_request;
17559969ebc5SAlex Elder 	struct ceph_osd_req_op *op;
17569969ebc5SAlex Elder 	int ret;
17579969ebc5SAlex Elder 
17589969ebc5SAlex Elder 	rbd_assert(start ^ !!rbd_dev->watch_event);
17599969ebc5SAlex Elder 	rbd_assert(start ^ !!rbd_dev->watch_request);
17609969ebc5SAlex Elder 
17619969ebc5SAlex Elder 	if (start) {
17623c663bbdSAlex Elder 		ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
17639969ebc5SAlex Elder 						&rbd_dev->watch_event);
17649969ebc5SAlex Elder 		if (ret < 0)
17659969ebc5SAlex Elder 			return ret;
17668eb87565SAlex Elder 		rbd_assert(rbd_dev->watch_event != NULL);
17679969ebc5SAlex Elder 	}
17689969ebc5SAlex Elder 
17699969ebc5SAlex Elder 	ret = -ENOMEM;
17709969ebc5SAlex Elder 	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
17719969ebc5SAlex Elder 							OBJ_REQUEST_NODATA);
17729969ebc5SAlex Elder 	if (!obj_request)
17739969ebc5SAlex Elder 		goto out_cancel;
17749969ebc5SAlex Elder 
17759969ebc5SAlex Elder 	op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH,
17769969ebc5SAlex Elder 				rbd_dev->watch_event->cookie,
17779969ebc5SAlex Elder 				rbd_dev->header.obj_version, start);
17789969ebc5SAlex Elder 	if (!op)
17799969ebc5SAlex Elder 		goto out_cancel;
17809969ebc5SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, true,
17819969ebc5SAlex Elder 							obj_request, op);
17829969ebc5SAlex Elder 	rbd_osd_req_op_destroy(op);
17839969ebc5SAlex Elder 	if (!obj_request->osd_req)
17849969ebc5SAlex Elder 		goto out_cancel;
17859969ebc5SAlex Elder 
17868eb87565SAlex Elder 	if (start)
1787975241afSAlex Elder 		ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
17888eb87565SAlex Elder 	else
17896977c3f9SAlex Elder 		ceph_osdc_unregister_linger_request(osdc,
1790975241afSAlex Elder 					rbd_dev->watch_request->osd_req);
17919969ebc5SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
17929969ebc5SAlex Elder 	if (ret)
17939969ebc5SAlex Elder 		goto out_cancel;
17949969ebc5SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
17959969ebc5SAlex Elder 	if (ret)
17969969ebc5SAlex Elder 		goto out_cancel;
17979969ebc5SAlex Elder 	ret = obj_request->result;
17989969ebc5SAlex Elder 	if (ret)
17999969ebc5SAlex Elder 		goto out_cancel;
18009969ebc5SAlex Elder 
18018eb87565SAlex Elder 	/*
18028eb87565SAlex Elder 	 * A watch request is set to linger, so the underlying osd
18038eb87565SAlex Elder 	 * request won't go away until we unregister it.  We retain
18048eb87565SAlex Elder 	 * a pointer to the object request during that time (in
18058eb87565SAlex Elder 	 * rbd_dev->watch_request), so we'll keep a reference to
18068eb87565SAlex Elder 	 * it.  We'll drop that reference (below) after we've
18078eb87565SAlex Elder 	 * unregistered it.
18088eb87565SAlex Elder 	 */
18098eb87565SAlex Elder 	if (start) {
18108eb87565SAlex Elder 		rbd_dev->watch_request = obj_request;
18118eb87565SAlex Elder 
18128eb87565SAlex Elder 		return 0;
18138eb87565SAlex Elder 	}
18148eb87565SAlex Elder 
18158eb87565SAlex Elder 	/* We have successfully torn down the watch request */
18168eb87565SAlex Elder 
18178eb87565SAlex Elder 	rbd_obj_request_put(rbd_dev->watch_request);
18188eb87565SAlex Elder 	rbd_dev->watch_request = NULL;
18199969ebc5SAlex Elder out_cancel:
18209969ebc5SAlex Elder 	/* Cancel the event if we're tearing down, or on error */
18219969ebc5SAlex Elder 	ceph_osdc_cancel_event(rbd_dev->watch_event);
18229969ebc5SAlex Elder 	rbd_dev->watch_event = NULL;
18239969ebc5SAlex Elder 	if (obj_request)
18249969ebc5SAlex Elder 		rbd_obj_request_put(obj_request);
18259969ebc5SAlex Elder 
18269969ebc5SAlex Elder 	return ret;
18279969ebc5SAlex Elder }
18289969ebc5SAlex Elder 
182936be9a76SAlex Elder /*
183036be9a76SAlex Elder  * Synchronous osd object method call
183136be9a76SAlex Elder  */
183236be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
183336be9a76SAlex Elder 			     const char *object_name,
183436be9a76SAlex Elder 			     const char *class_name,
183536be9a76SAlex Elder 			     const char *method_name,
183636be9a76SAlex Elder 			     const char *outbound,
183736be9a76SAlex Elder 			     size_t outbound_size,
183836be9a76SAlex Elder 			     char *inbound,
183936be9a76SAlex Elder 			     size_t inbound_size,
184036be9a76SAlex Elder 			     u64 *version)
184136be9a76SAlex Elder {
184236be9a76SAlex Elder 	struct rbd_obj_request *obj_request;
184336be9a76SAlex Elder 	struct ceph_osd_client *osdc;
184436be9a76SAlex Elder 	struct ceph_osd_req_op *op;
184536be9a76SAlex Elder 	struct page **pages;
184636be9a76SAlex Elder 	u32 page_count;
184736be9a76SAlex Elder 	int ret;
184836be9a76SAlex Elder 
184936be9a76SAlex Elder 	/*
185036be9a76SAlex Elder 	 * Method calls are ultimately read operations but they
185136be9a76SAlex Elder 	 * don't involve object data (so no offset or length).
185236be9a76SAlex Elder 	 * The result should placed into the inbound buffer
185336be9a76SAlex Elder 	 * provided.  They also supply outbound data--parameters for
185436be9a76SAlex Elder 	 * the object method.  Currently if this is present it will
185536be9a76SAlex Elder 	 * be a snapshot id.
185636be9a76SAlex Elder 	 */
185736be9a76SAlex Elder 	page_count = (u32) calc_pages_for(0, inbound_size);
185836be9a76SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
185936be9a76SAlex Elder 	if (IS_ERR(pages))
186036be9a76SAlex Elder 		return PTR_ERR(pages);
186136be9a76SAlex Elder 
186236be9a76SAlex Elder 	ret = -ENOMEM;
186336be9a76SAlex Elder 	obj_request = rbd_obj_request_create(object_name, 0, 0,
186436be9a76SAlex Elder 							OBJ_REQUEST_PAGES);
186536be9a76SAlex Elder 	if (!obj_request)
186636be9a76SAlex Elder 		goto out;
186736be9a76SAlex Elder 
186836be9a76SAlex Elder 	obj_request->pages = pages;
186936be9a76SAlex Elder 	obj_request->page_count = page_count;
187036be9a76SAlex Elder 
187136be9a76SAlex Elder 	op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name,
187236be9a76SAlex Elder 					method_name, outbound, outbound_size);
187336be9a76SAlex Elder 	if (!op)
187436be9a76SAlex Elder 		goto out;
187536be9a76SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
187636be9a76SAlex Elder 						obj_request, op);
187736be9a76SAlex Elder 	rbd_osd_req_op_destroy(op);
187836be9a76SAlex Elder 	if (!obj_request->osd_req)
187936be9a76SAlex Elder 		goto out;
188036be9a76SAlex Elder 
188136be9a76SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
188236be9a76SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
188336be9a76SAlex Elder 	if (ret)
188436be9a76SAlex Elder 		goto out;
188536be9a76SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
188636be9a76SAlex Elder 	if (ret)
188736be9a76SAlex Elder 		goto out;
188836be9a76SAlex Elder 
188936be9a76SAlex Elder 	ret = obj_request->result;
189036be9a76SAlex Elder 	if (ret < 0)
189136be9a76SAlex Elder 		goto out;
189223ed6e13SAlex Elder 	ret = 0;
189323ed6e13SAlex Elder 	(void) ceph_copy_from_page_vector(pages, inbound, 0,
189436be9a76SAlex Elder 					obj_request->xferred);
189536be9a76SAlex Elder 	if (version)
189636be9a76SAlex Elder 		*version = obj_request->version;
189736be9a76SAlex Elder out:
189836be9a76SAlex Elder 	if (obj_request)
189936be9a76SAlex Elder 		rbd_obj_request_put(obj_request);
190036be9a76SAlex Elder 	else
190136be9a76SAlex Elder 		ceph_release_page_vector(pages, page_count);
190236be9a76SAlex Elder 
190336be9a76SAlex Elder 	return ret;
190436be9a76SAlex Elder }
190536be9a76SAlex Elder 
1906bf0d5f50SAlex Elder static void rbd_request_fn(struct request_queue *q)
1907bf0d5f50SAlex Elder {
1908bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = q->queuedata;
1909bf0d5f50SAlex Elder 	bool read_only = rbd_dev->mapping.read_only;
1910bf0d5f50SAlex Elder 	struct request *rq;
1911bf0d5f50SAlex Elder 	int result;
1912bf0d5f50SAlex Elder 
1913bf0d5f50SAlex Elder 	while ((rq = blk_fetch_request(q))) {
1914bf0d5f50SAlex Elder 		bool write_request = rq_data_dir(rq) == WRITE;
1915bf0d5f50SAlex Elder 		struct rbd_img_request *img_request;
1916bf0d5f50SAlex Elder 		u64 offset;
1917bf0d5f50SAlex Elder 		u64 length;
1918bf0d5f50SAlex Elder 
1919bf0d5f50SAlex Elder 		/* Ignore any non-FS requests that filter through. */
1920bf0d5f50SAlex Elder 
1921bf0d5f50SAlex Elder 		if (rq->cmd_type != REQ_TYPE_FS) {
1922bf0d5f50SAlex Elder 			__blk_end_request_all(rq, 0);
1923bf0d5f50SAlex Elder 			continue;
1924bf0d5f50SAlex Elder 		}
1925bf0d5f50SAlex Elder 
1926bf0d5f50SAlex Elder 		spin_unlock_irq(q->queue_lock);
1927bf0d5f50SAlex Elder 
1928bf0d5f50SAlex Elder 		/* Disallow writes to a read-only device */
1929bf0d5f50SAlex Elder 
1930bf0d5f50SAlex Elder 		if (write_request) {
1931bf0d5f50SAlex Elder 			result = -EROFS;
1932bf0d5f50SAlex Elder 			if (read_only)
1933bf0d5f50SAlex Elder 				goto end_request;
1934bf0d5f50SAlex Elder 			rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
1935bf0d5f50SAlex Elder 		}
1936bf0d5f50SAlex Elder 
19376d292906SAlex Elder 		/*
19386d292906SAlex Elder 		 * Quit early if the mapped snapshot no longer
19396d292906SAlex Elder 		 * exists.  It's still possible the snapshot will
19406d292906SAlex Elder 		 * have disappeared by the time our request arrives
19416d292906SAlex Elder 		 * at the osd, but there's no sense in sending it if
19426d292906SAlex Elder 		 * we already know.
19436d292906SAlex Elder 		 */
19446d292906SAlex Elder 		if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
1945bf0d5f50SAlex Elder 			dout("request for non-existent snapshot");
1946bf0d5f50SAlex Elder 			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
1947bf0d5f50SAlex Elder 			result = -ENXIO;
1948bf0d5f50SAlex Elder 			goto end_request;
1949bf0d5f50SAlex Elder 		}
1950bf0d5f50SAlex Elder 
1951bf0d5f50SAlex Elder 		offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
1952bf0d5f50SAlex Elder 		length = (u64) blk_rq_bytes(rq);
1953bf0d5f50SAlex Elder 
1954bf0d5f50SAlex Elder 		result = -EINVAL;
1955bf0d5f50SAlex Elder 		if (WARN_ON(offset && length > U64_MAX - offset + 1))
1956bf0d5f50SAlex Elder 			goto end_request;	/* Shouldn't happen */
1957bf0d5f50SAlex Elder 
1958bf0d5f50SAlex Elder 		result = -ENOMEM;
1959bf0d5f50SAlex Elder 		img_request = rbd_img_request_create(rbd_dev, offset, length,
1960bf0d5f50SAlex Elder 							write_request);
1961bf0d5f50SAlex Elder 		if (!img_request)
1962bf0d5f50SAlex Elder 			goto end_request;
1963bf0d5f50SAlex Elder 
1964bf0d5f50SAlex Elder 		img_request->rq = rq;
1965bf0d5f50SAlex Elder 
1966bf0d5f50SAlex Elder 		result = rbd_img_request_fill_bio(img_request, rq->bio);
1967bf0d5f50SAlex Elder 		if (!result)
1968bf0d5f50SAlex Elder 			result = rbd_img_request_submit(img_request);
1969bf0d5f50SAlex Elder 		if (result)
1970bf0d5f50SAlex Elder 			rbd_img_request_put(img_request);
1971bf0d5f50SAlex Elder end_request:
1972bf0d5f50SAlex Elder 		spin_lock_irq(q->queue_lock);
1973bf0d5f50SAlex Elder 		if (result < 0) {
1974bf0d5f50SAlex Elder 			rbd_warn(rbd_dev, "obj_request %s result %d\n",
1975bf0d5f50SAlex Elder 				write_request ? "write" : "read", result);
1976bf0d5f50SAlex Elder 			__blk_end_request_all(rq, result);
1977bf0d5f50SAlex Elder 		}
1978bf0d5f50SAlex Elder 	}
1979bf0d5f50SAlex Elder }
1980bf0d5f50SAlex Elder 
1981602adf40SYehuda Sadeh /*
1982602adf40SYehuda Sadeh  * a queue callback. Makes sure that we don't create a bio that spans across
1983602adf40SYehuda Sadeh  * multiple osd objects. One exception would be with a single page bios,
1984f7760dadSAlex Elder  * which we handle later at bio_chain_clone_range()
1985602adf40SYehuda Sadeh  */
1986602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1987602adf40SYehuda Sadeh 			  struct bio_vec *bvec)
1988602adf40SYehuda Sadeh {
1989602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
1990e5cfeed2SAlex Elder 	sector_t sector_offset;
1991e5cfeed2SAlex Elder 	sector_t sectors_per_obj;
1992e5cfeed2SAlex Elder 	sector_t obj_sector_offset;
1993e5cfeed2SAlex Elder 	int ret;
1994602adf40SYehuda Sadeh 
1995e5cfeed2SAlex Elder 	/*
1996e5cfeed2SAlex Elder 	 * Find how far into its rbd object the partition-relative
1997e5cfeed2SAlex Elder 	 * bio start sector is to offset relative to the enclosing
1998e5cfeed2SAlex Elder 	 * device.
1999e5cfeed2SAlex Elder 	 */
2000e5cfeed2SAlex Elder 	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
2001e5cfeed2SAlex Elder 	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
2002e5cfeed2SAlex Elder 	obj_sector_offset = sector_offset & (sectors_per_obj - 1);
2003593a9e7bSAlex Elder 
2004e5cfeed2SAlex Elder 	/*
2005e5cfeed2SAlex Elder 	 * Compute the number of bytes from that offset to the end
2006e5cfeed2SAlex Elder 	 * of the object.  Account for what's already used by the bio.
2007e5cfeed2SAlex Elder 	 */
2008e5cfeed2SAlex Elder 	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
2009e5cfeed2SAlex Elder 	if (ret > bmd->bi_size)
2010e5cfeed2SAlex Elder 		ret -= bmd->bi_size;
2011e5cfeed2SAlex Elder 	else
2012e5cfeed2SAlex Elder 		ret = 0;
2013e5cfeed2SAlex Elder 
2014e5cfeed2SAlex Elder 	/*
2015e5cfeed2SAlex Elder 	 * Don't send back more than was asked for.  And if the bio
2016e5cfeed2SAlex Elder 	 * was empty, let the whole thing through because:  "Note
2017e5cfeed2SAlex Elder 	 * that a block device *must* allow a single page to be
2018e5cfeed2SAlex Elder 	 * added to an empty bio."
2019e5cfeed2SAlex Elder 	 */
2020e5cfeed2SAlex Elder 	rbd_assert(bvec->bv_len <= PAGE_SIZE);
2021e5cfeed2SAlex Elder 	if (ret > (int) bvec->bv_len || !bmd->bi_size)
2022e5cfeed2SAlex Elder 		ret = (int) bvec->bv_len;
2023e5cfeed2SAlex Elder 
2024e5cfeed2SAlex Elder 	return ret;
2025602adf40SYehuda Sadeh }
2026602adf40SYehuda Sadeh 
2027602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
2028602adf40SYehuda Sadeh {
2029602adf40SYehuda Sadeh 	struct gendisk *disk = rbd_dev->disk;
2030602adf40SYehuda Sadeh 
2031602adf40SYehuda Sadeh 	if (!disk)
2032602adf40SYehuda Sadeh 		return;
2033602adf40SYehuda Sadeh 
2034602adf40SYehuda Sadeh 	if (disk->flags & GENHD_FL_UP)
2035602adf40SYehuda Sadeh 		del_gendisk(disk);
2036602adf40SYehuda Sadeh 	if (disk->queue)
2037602adf40SYehuda Sadeh 		blk_cleanup_queue(disk->queue);
2038602adf40SYehuda Sadeh 	put_disk(disk);
2039602adf40SYehuda Sadeh }
2040602adf40SYehuda Sadeh 
2041788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
2042788e2df3SAlex Elder 				const char *object_name,
2043788e2df3SAlex Elder 				u64 offset, u64 length,
2044788e2df3SAlex Elder 				char *buf, u64 *version)
2045788e2df3SAlex Elder 
2046788e2df3SAlex Elder {
2047788e2df3SAlex Elder 	struct ceph_osd_req_op *op;
2048788e2df3SAlex Elder 	struct rbd_obj_request *obj_request;
2049788e2df3SAlex Elder 	struct ceph_osd_client *osdc;
2050788e2df3SAlex Elder 	struct page **pages = NULL;
2051788e2df3SAlex Elder 	u32 page_count;
20521ceae7efSAlex Elder 	size_t size;
2053788e2df3SAlex Elder 	int ret;
2054788e2df3SAlex Elder 
2055788e2df3SAlex Elder 	page_count = (u32) calc_pages_for(offset, length);
2056788e2df3SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2057788e2df3SAlex Elder 	if (IS_ERR(pages))
2058788e2df3SAlex Elder 		ret = PTR_ERR(pages);
2059788e2df3SAlex Elder 
2060788e2df3SAlex Elder 	ret = -ENOMEM;
2061788e2df3SAlex Elder 	obj_request = rbd_obj_request_create(object_name, offset, length,
2062788e2df3SAlex Elder 							OBJ_REQUEST_PAGES);
2063788e2df3SAlex Elder 	if (!obj_request)
2064788e2df3SAlex Elder 		goto out;
2065788e2df3SAlex Elder 
2066788e2df3SAlex Elder 	obj_request->pages = pages;
2067788e2df3SAlex Elder 	obj_request->page_count = page_count;
2068788e2df3SAlex Elder 
2069788e2df3SAlex Elder 	op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, offset, length);
2070788e2df3SAlex Elder 	if (!op)
2071788e2df3SAlex Elder 		goto out;
2072788e2df3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
2073788e2df3SAlex Elder 						obj_request, op);
2074788e2df3SAlex Elder 	rbd_osd_req_op_destroy(op);
2075788e2df3SAlex Elder 	if (!obj_request->osd_req)
2076788e2df3SAlex Elder 		goto out;
2077788e2df3SAlex Elder 
2078788e2df3SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
2079788e2df3SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
2080788e2df3SAlex Elder 	if (ret)
2081788e2df3SAlex Elder 		goto out;
2082788e2df3SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
2083788e2df3SAlex Elder 	if (ret)
2084788e2df3SAlex Elder 		goto out;
2085788e2df3SAlex Elder 
2086788e2df3SAlex Elder 	ret = obj_request->result;
2087788e2df3SAlex Elder 	if (ret < 0)
2088788e2df3SAlex Elder 		goto out;
20891ceae7efSAlex Elder 
20901ceae7efSAlex Elder 	rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
20911ceae7efSAlex Elder 	size = (size_t) obj_request->xferred;
209223ed6e13SAlex Elder 	(void) ceph_copy_from_page_vector(pages, buf, 0, size);
209323ed6e13SAlex Elder 	rbd_assert(size <= (size_t) INT_MAX);
209423ed6e13SAlex Elder 	ret = (int) size;
2095788e2df3SAlex Elder 	if (version)
2096788e2df3SAlex Elder 		*version = obj_request->version;
2097788e2df3SAlex Elder out:
2098788e2df3SAlex Elder 	if (obj_request)
2099788e2df3SAlex Elder 		rbd_obj_request_put(obj_request);
2100788e2df3SAlex Elder 	else
2101788e2df3SAlex Elder 		ceph_release_page_vector(pages, page_count);
2102788e2df3SAlex Elder 
2103788e2df3SAlex Elder 	return ret;
2104788e2df3SAlex Elder }
2105788e2df3SAlex Elder 
2106602adf40SYehuda Sadeh /*
21074156d998SAlex Elder  * Read the complete header for the given rbd device.
21084156d998SAlex Elder  *
21094156d998SAlex Elder  * Returns a pointer to a dynamically-allocated buffer containing
21104156d998SAlex Elder  * the complete and validated header.  Caller can pass the address
21114156d998SAlex Elder  * of a variable that will be filled in with the version of the
21124156d998SAlex Elder  * header object at the time it was read.
21134156d998SAlex Elder  *
21144156d998SAlex Elder  * Returns a pointer-coded errno if a failure occurs.
21154156d998SAlex Elder  */
21164156d998SAlex Elder static struct rbd_image_header_ondisk *
21174156d998SAlex Elder rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
21184156d998SAlex Elder {
21194156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk = NULL;
21204156d998SAlex Elder 	u32 snap_count = 0;
21214156d998SAlex Elder 	u64 names_size = 0;
21224156d998SAlex Elder 	u32 want_count;
21234156d998SAlex Elder 	int ret;
21244156d998SAlex Elder 
21254156d998SAlex Elder 	/*
21264156d998SAlex Elder 	 * The complete header will include an array of its 64-bit
21274156d998SAlex Elder 	 * snapshot ids, followed by the names of those snapshots as
21284156d998SAlex Elder 	 * a contiguous block of NUL-terminated strings.  Note that
21294156d998SAlex Elder 	 * the number of snapshots could change by the time we read
21304156d998SAlex Elder 	 * it in, in which case we re-read it.
21314156d998SAlex Elder 	 */
21324156d998SAlex Elder 	do {
21334156d998SAlex Elder 		size_t size;
21344156d998SAlex Elder 
21354156d998SAlex Elder 		kfree(ondisk);
21364156d998SAlex Elder 
21374156d998SAlex Elder 		size = sizeof (*ondisk);
21384156d998SAlex Elder 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
21394156d998SAlex Elder 		size += names_size;
21404156d998SAlex Elder 		ondisk = kmalloc(size, GFP_KERNEL);
21414156d998SAlex Elder 		if (!ondisk)
21424156d998SAlex Elder 			return ERR_PTR(-ENOMEM);
21434156d998SAlex Elder 
2144788e2df3SAlex Elder 		ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
21454156d998SAlex Elder 				       0, size,
21464156d998SAlex Elder 				       (char *) ondisk, version);
21474156d998SAlex Elder 		if (ret < 0)
21484156d998SAlex Elder 			goto out_err;
21494156d998SAlex Elder 		if (WARN_ON((size_t) ret < size)) {
21504156d998SAlex Elder 			ret = -ENXIO;
215106ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
215206ecc6cbSAlex Elder 				size, ret);
21534156d998SAlex Elder 			goto out_err;
21544156d998SAlex Elder 		}
21554156d998SAlex Elder 		if (!rbd_dev_ondisk_valid(ondisk)) {
21564156d998SAlex Elder 			ret = -ENXIO;
215706ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "invalid header");
21584156d998SAlex Elder 			goto out_err;
21594156d998SAlex Elder 		}
21604156d998SAlex Elder 
21614156d998SAlex Elder 		names_size = le64_to_cpu(ondisk->snap_names_len);
21624156d998SAlex Elder 		want_count = snap_count;
21634156d998SAlex Elder 		snap_count = le32_to_cpu(ondisk->snap_count);
21644156d998SAlex Elder 	} while (snap_count != want_count);
21654156d998SAlex Elder 
21664156d998SAlex Elder 	return ondisk;
21674156d998SAlex Elder 
21684156d998SAlex Elder out_err:
21694156d998SAlex Elder 	kfree(ondisk);
21704156d998SAlex Elder 
21714156d998SAlex Elder 	return ERR_PTR(ret);
21724156d998SAlex Elder }
21734156d998SAlex Elder 
21744156d998SAlex Elder /*
2175602adf40SYehuda Sadeh  * reload the ondisk the header
2176602adf40SYehuda Sadeh  */
2177602adf40SYehuda Sadeh static int rbd_read_header(struct rbd_device *rbd_dev,
2178602adf40SYehuda Sadeh 			   struct rbd_image_header *header)
2179602adf40SYehuda Sadeh {
21804156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk;
21814156d998SAlex Elder 	u64 ver = 0;
21824156d998SAlex Elder 	int ret;
2183602adf40SYehuda Sadeh 
21844156d998SAlex Elder 	ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
21854156d998SAlex Elder 	if (IS_ERR(ondisk))
21864156d998SAlex Elder 		return PTR_ERR(ondisk);
21874156d998SAlex Elder 	ret = rbd_header_from_disk(header, ondisk);
21884156d998SAlex Elder 	if (ret >= 0)
218959c2be1eSYehuda Sadeh 		header->obj_version = ver;
21904156d998SAlex Elder 	kfree(ondisk);
2191602adf40SYehuda Sadeh 
21924156d998SAlex Elder 	return ret;
2193602adf40SYehuda Sadeh }
2194602adf40SYehuda Sadeh 
219541f38c2bSAlex Elder static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
2196dfc5606dSYehuda Sadeh {
2197dfc5606dSYehuda Sadeh 	struct rbd_snap *snap;
2198a0593290SAlex Elder 	struct rbd_snap *next;
2199dfc5606dSYehuda Sadeh 
2200a0593290SAlex Elder 	list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
220141f38c2bSAlex Elder 		rbd_remove_snap_dev(snap);
2202dfc5606dSYehuda Sadeh }
2203dfc5606dSYehuda Sadeh 
22049478554aSAlex Elder static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
22059478554aSAlex Elder {
22069478554aSAlex Elder 	sector_t size;
22079478554aSAlex Elder 
22080d7dbfceSAlex Elder 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
22099478554aSAlex Elder 		return;
22109478554aSAlex Elder 
22119478554aSAlex Elder 	size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
22129478554aSAlex Elder 	dout("setting size to %llu sectors", (unsigned long long) size);
22139478554aSAlex Elder 	rbd_dev->mapping.size = (u64) size;
22149478554aSAlex Elder 	set_capacity(rbd_dev->disk, size);
22159478554aSAlex Elder }
22169478554aSAlex Elder 
2217602adf40SYehuda Sadeh /*
2218602adf40SYehuda Sadeh  * only read the first part of the ondisk header, without the snaps info
2219602adf40SYehuda Sadeh  */
2220117973fbSAlex Elder static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
2221602adf40SYehuda Sadeh {
2222602adf40SYehuda Sadeh 	int ret;
2223602adf40SYehuda Sadeh 	struct rbd_image_header h;
2224602adf40SYehuda Sadeh 
2225602adf40SYehuda Sadeh 	ret = rbd_read_header(rbd_dev, &h);
2226602adf40SYehuda Sadeh 	if (ret < 0)
2227602adf40SYehuda Sadeh 		return ret;
2228602adf40SYehuda Sadeh 
2229a51aa0c0SJosh Durgin 	down_write(&rbd_dev->header_rwsem);
2230a51aa0c0SJosh Durgin 
22319478554aSAlex Elder 	/* Update image size, and check for resize of mapped image */
22329478554aSAlex Elder 	rbd_dev->header.image_size = h.image_size;
22339478554aSAlex Elder 	rbd_update_mapping_size(rbd_dev);
22349db4b3e3SSage Weil 
2235849b4260SAlex Elder 	/* rbd_dev->header.object_prefix shouldn't change */
2236602adf40SYehuda Sadeh 	kfree(rbd_dev->header.snap_sizes);
2237849b4260SAlex Elder 	kfree(rbd_dev->header.snap_names);
2238d1d25646SJosh Durgin 	/* osd requests may still refer to snapc */
2239d1d25646SJosh Durgin 	ceph_put_snap_context(rbd_dev->header.snapc);
2240602adf40SYehuda Sadeh 
2241b813623aSAlex Elder 	if (hver)
2242b813623aSAlex Elder 		*hver = h.obj_version;
2243a71b891bSJosh Durgin 	rbd_dev->header.obj_version = h.obj_version;
224493a24e08SJosh Durgin 	rbd_dev->header.image_size = h.image_size;
2245602adf40SYehuda Sadeh 	rbd_dev->header.snapc = h.snapc;
2246602adf40SYehuda Sadeh 	rbd_dev->header.snap_names = h.snap_names;
2247602adf40SYehuda Sadeh 	rbd_dev->header.snap_sizes = h.snap_sizes;
2248849b4260SAlex Elder 	/* Free the extra copy of the object prefix */
2249849b4260SAlex Elder 	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
2250849b4260SAlex Elder 	kfree(h.object_prefix);
2251849b4260SAlex Elder 
2252304f6808SAlex Elder 	ret = rbd_dev_snaps_update(rbd_dev);
2253304f6808SAlex Elder 	if (!ret)
2254304f6808SAlex Elder 		ret = rbd_dev_snaps_register(rbd_dev);
2255dfc5606dSYehuda Sadeh 
2256c666601aSJosh Durgin 	up_write(&rbd_dev->header_rwsem);
2257602adf40SYehuda Sadeh 
2258dfc5606dSYehuda Sadeh 	return ret;
2259602adf40SYehuda Sadeh }
2260602adf40SYehuda Sadeh 
2261117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
22621fe5e993SAlex Elder {
22631fe5e993SAlex Elder 	int ret;
22641fe5e993SAlex Elder 
2265117973fbSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
22661fe5e993SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2267117973fbSAlex Elder 	if (rbd_dev->image_format == 1)
2268117973fbSAlex Elder 		ret = rbd_dev_v1_refresh(rbd_dev, hver);
2269117973fbSAlex Elder 	else
2270117973fbSAlex Elder 		ret = rbd_dev_v2_refresh(rbd_dev, hver);
22711fe5e993SAlex Elder 	mutex_unlock(&ctl_mutex);
22721fe5e993SAlex Elder 
22731fe5e993SAlex Elder 	return ret;
22741fe5e993SAlex Elder }
22751fe5e993SAlex Elder 
2276602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
2277602adf40SYehuda Sadeh {
2278602adf40SYehuda Sadeh 	struct gendisk *disk;
2279602adf40SYehuda Sadeh 	struct request_queue *q;
2280593a9e7bSAlex Elder 	u64 segment_size;
2281602adf40SYehuda Sadeh 
2282602adf40SYehuda Sadeh 	/* create gendisk info */
2283602adf40SYehuda Sadeh 	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
2284602adf40SYehuda Sadeh 	if (!disk)
22851fcdb8aaSAlex Elder 		return -ENOMEM;
2286602adf40SYehuda Sadeh 
2287f0f8cef5SAlex Elder 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
2288de71a297SAlex Elder 		 rbd_dev->dev_id);
2289602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
2290602adf40SYehuda Sadeh 	disk->first_minor = 0;
2291602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
2292602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
2293602adf40SYehuda Sadeh 
2294bf0d5f50SAlex Elder 	q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
2295602adf40SYehuda Sadeh 	if (!q)
2296602adf40SYehuda Sadeh 		goto out_disk;
2297029bcbd8SJosh Durgin 
2298593a9e7bSAlex Elder 	/* We use the default size, but let's be explicit about it. */
2299593a9e7bSAlex Elder 	blk_queue_physical_block_size(q, SECTOR_SIZE);
2300593a9e7bSAlex Elder 
2301029bcbd8SJosh Durgin 	/* set io sizes to object size */
2302593a9e7bSAlex Elder 	segment_size = rbd_obj_bytes(&rbd_dev->header);
2303593a9e7bSAlex Elder 	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
2304593a9e7bSAlex Elder 	blk_queue_max_segment_size(q, segment_size);
2305593a9e7bSAlex Elder 	blk_queue_io_min(q, segment_size);
2306593a9e7bSAlex Elder 	blk_queue_io_opt(q, segment_size);
2307029bcbd8SJosh Durgin 
2308602adf40SYehuda Sadeh 	blk_queue_merge_bvec(q, rbd_merge_bvec);
2309602adf40SYehuda Sadeh 	disk->queue = q;
2310602adf40SYehuda Sadeh 
2311602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
2312602adf40SYehuda Sadeh 
2313602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
2314602adf40SYehuda Sadeh 
231512f02944SAlex Elder 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
231612f02944SAlex Elder 
2317602adf40SYehuda Sadeh 	return 0;
2318602adf40SYehuda Sadeh out_disk:
2319602adf40SYehuda Sadeh 	put_disk(disk);
23201fcdb8aaSAlex Elder 
23211fcdb8aaSAlex Elder 	return -ENOMEM;
2322602adf40SYehuda Sadeh }
2323602adf40SYehuda Sadeh 
2324dfc5606dSYehuda Sadeh /*
2325dfc5606dSYehuda Sadeh   sysfs
2326dfc5606dSYehuda Sadeh */
2327602adf40SYehuda Sadeh 
2328593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
2329593a9e7bSAlex Elder {
2330593a9e7bSAlex Elder 	return container_of(dev, struct rbd_device, dev);
2331593a9e7bSAlex Elder }
2332593a9e7bSAlex Elder 
2333dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
2334dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
2335602adf40SYehuda Sadeh {
2336593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2337a51aa0c0SJosh Durgin 	sector_t size;
2338dfc5606dSYehuda Sadeh 
2339a51aa0c0SJosh Durgin 	down_read(&rbd_dev->header_rwsem);
2340a51aa0c0SJosh Durgin 	size = get_capacity(rbd_dev->disk);
2341a51aa0c0SJosh Durgin 	up_read(&rbd_dev->header_rwsem);
2342a51aa0c0SJosh Durgin 
2343a51aa0c0SJosh Durgin 	return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
2344602adf40SYehuda Sadeh }
2345602adf40SYehuda Sadeh 
234634b13184SAlex Elder /*
234734b13184SAlex Elder  * Note this shows the features for whatever's mapped, which is not
234834b13184SAlex Elder  * necessarily the base image.
234934b13184SAlex Elder  */
235034b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev,
235134b13184SAlex Elder 			     struct device_attribute *attr, char *buf)
235234b13184SAlex Elder {
235334b13184SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
235434b13184SAlex Elder 
235534b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
235634b13184SAlex Elder 			(unsigned long long) rbd_dev->mapping.features);
235734b13184SAlex Elder }
235834b13184SAlex Elder 
2359dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
2360dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
2361602adf40SYehuda Sadeh {
2362593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2363dfc5606dSYehuda Sadeh 
2364dfc5606dSYehuda Sadeh 	return sprintf(buf, "%d\n", rbd_dev->major);
2365dfc5606dSYehuda Sadeh }
2366dfc5606dSYehuda Sadeh 
2367dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
2368dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
2369dfc5606dSYehuda Sadeh {
2370593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2371dfc5606dSYehuda Sadeh 
23721dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
23731dbb4399SAlex Elder 			ceph_client_id(rbd_dev->rbd_client->client));
2374dfc5606dSYehuda Sadeh }
2375dfc5606dSYehuda Sadeh 
2376dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
2377dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
2378dfc5606dSYehuda Sadeh {
2379593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2380dfc5606dSYehuda Sadeh 
23810d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
2382dfc5606dSYehuda Sadeh }
2383dfc5606dSYehuda Sadeh 
23849bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
23859bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
23869bb2f334SAlex Elder {
23879bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
23889bb2f334SAlex Elder 
23890d7dbfceSAlex Elder 	return sprintf(buf, "%llu\n",
23900d7dbfceSAlex Elder 		(unsigned long long) rbd_dev->spec->pool_id);
23919bb2f334SAlex Elder }
23929bb2f334SAlex Elder 
2393dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
2394dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
2395dfc5606dSYehuda Sadeh {
2396593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2397dfc5606dSYehuda Sadeh 
2398a92ffdf8SAlex Elder 	if (rbd_dev->spec->image_name)
23990d7dbfceSAlex Elder 		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
2400a92ffdf8SAlex Elder 
2401a92ffdf8SAlex Elder 	return sprintf(buf, "(unknown)\n");
2402dfc5606dSYehuda Sadeh }
2403dfc5606dSYehuda Sadeh 
2404589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev,
2405589d30e0SAlex Elder 			     struct device_attribute *attr, char *buf)
2406589d30e0SAlex Elder {
2407589d30e0SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2408589d30e0SAlex Elder 
24090d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
2410589d30e0SAlex Elder }
2411589d30e0SAlex Elder 
241234b13184SAlex Elder /*
241334b13184SAlex Elder  * Shows the name of the currently-mapped snapshot (or
241434b13184SAlex Elder  * RBD_SNAP_HEAD_NAME for the base image).
241534b13184SAlex Elder  */
2416dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
2417dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
2418dfc5606dSYehuda Sadeh 			     char *buf)
2419dfc5606dSYehuda Sadeh {
2420593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2421dfc5606dSYehuda Sadeh 
24220d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
2423dfc5606dSYehuda Sadeh }
2424dfc5606dSYehuda Sadeh 
242586b00e0dSAlex Elder /*
242686b00e0dSAlex Elder  * For an rbd v2 image, shows the pool id, image id, and snapshot id
242786b00e0dSAlex Elder  * for the parent image.  If there is no parent, simply shows
242886b00e0dSAlex Elder  * "(no parent image)".
242986b00e0dSAlex Elder  */
243086b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev,
243186b00e0dSAlex Elder 			     struct device_attribute *attr,
243286b00e0dSAlex Elder 			     char *buf)
243386b00e0dSAlex Elder {
243486b00e0dSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
243586b00e0dSAlex Elder 	struct rbd_spec *spec = rbd_dev->parent_spec;
243686b00e0dSAlex Elder 	int count;
243786b00e0dSAlex Elder 	char *bufp = buf;
243886b00e0dSAlex Elder 
243986b00e0dSAlex Elder 	if (!spec)
244086b00e0dSAlex Elder 		return sprintf(buf, "(no parent image)\n");
244186b00e0dSAlex Elder 
244286b00e0dSAlex Elder 	count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
244386b00e0dSAlex Elder 			(unsigned long long) spec->pool_id, spec->pool_name);
244486b00e0dSAlex Elder 	if (count < 0)
244586b00e0dSAlex Elder 		return count;
244686b00e0dSAlex Elder 	bufp += count;
244786b00e0dSAlex Elder 
244886b00e0dSAlex Elder 	count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
244986b00e0dSAlex Elder 			spec->image_name ? spec->image_name : "(unknown)");
245086b00e0dSAlex Elder 	if (count < 0)
245186b00e0dSAlex Elder 		return count;
245286b00e0dSAlex Elder 	bufp += count;
245386b00e0dSAlex Elder 
245486b00e0dSAlex Elder 	count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
245586b00e0dSAlex Elder 			(unsigned long long) spec->snap_id, spec->snap_name);
245686b00e0dSAlex Elder 	if (count < 0)
245786b00e0dSAlex Elder 		return count;
245886b00e0dSAlex Elder 	bufp += count;
245986b00e0dSAlex Elder 
246086b00e0dSAlex Elder 	count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
246186b00e0dSAlex Elder 	if (count < 0)
246286b00e0dSAlex Elder 		return count;
246386b00e0dSAlex Elder 	bufp += count;
246486b00e0dSAlex Elder 
246586b00e0dSAlex Elder 	return (ssize_t) (bufp - buf);
246686b00e0dSAlex Elder }
246786b00e0dSAlex Elder 
2468dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
2469dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
2470dfc5606dSYehuda Sadeh 				 const char *buf,
2471dfc5606dSYehuda Sadeh 				 size_t size)
2472dfc5606dSYehuda Sadeh {
2473593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2474b813623aSAlex Elder 	int ret;
2475602adf40SYehuda Sadeh 
2476117973fbSAlex Elder 	ret = rbd_dev_refresh(rbd_dev, NULL);
2477b813623aSAlex Elder 
2478b813623aSAlex Elder 	return ret < 0 ? ret : size;
2479dfc5606dSYehuda Sadeh }
2480602adf40SYehuda Sadeh 
2481dfc5606dSYehuda Sadeh static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
248234b13184SAlex Elder static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
2483dfc5606dSYehuda Sadeh static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
2484dfc5606dSYehuda Sadeh static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
2485dfc5606dSYehuda Sadeh static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
24869bb2f334SAlex Elder static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
2487dfc5606dSYehuda Sadeh static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
2488589d30e0SAlex Elder static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
2489dfc5606dSYehuda Sadeh static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
2490dfc5606dSYehuda Sadeh static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
249186b00e0dSAlex Elder static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
2492dfc5606dSYehuda Sadeh 
2493dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = {
2494dfc5606dSYehuda Sadeh 	&dev_attr_size.attr,
249534b13184SAlex Elder 	&dev_attr_features.attr,
2496dfc5606dSYehuda Sadeh 	&dev_attr_major.attr,
2497dfc5606dSYehuda Sadeh 	&dev_attr_client_id.attr,
2498dfc5606dSYehuda Sadeh 	&dev_attr_pool.attr,
24999bb2f334SAlex Elder 	&dev_attr_pool_id.attr,
2500dfc5606dSYehuda Sadeh 	&dev_attr_name.attr,
2501589d30e0SAlex Elder 	&dev_attr_image_id.attr,
2502dfc5606dSYehuda Sadeh 	&dev_attr_current_snap.attr,
250386b00e0dSAlex Elder 	&dev_attr_parent.attr,
2504dfc5606dSYehuda Sadeh 	&dev_attr_refresh.attr,
2505dfc5606dSYehuda Sadeh 	NULL
2506dfc5606dSYehuda Sadeh };
2507dfc5606dSYehuda Sadeh 
2508dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = {
2509dfc5606dSYehuda Sadeh 	.attrs = rbd_attrs,
2510dfc5606dSYehuda Sadeh };
2511dfc5606dSYehuda Sadeh 
2512dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = {
2513dfc5606dSYehuda Sadeh 	&rbd_attr_group,
2514dfc5606dSYehuda Sadeh 	NULL
2515dfc5606dSYehuda Sadeh };
2516dfc5606dSYehuda Sadeh 
2517dfc5606dSYehuda Sadeh static void rbd_sysfs_dev_release(struct device *dev)
2518dfc5606dSYehuda Sadeh {
2519dfc5606dSYehuda Sadeh }
2520dfc5606dSYehuda Sadeh 
2521dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = {
2522dfc5606dSYehuda Sadeh 	.name		= "rbd",
2523dfc5606dSYehuda Sadeh 	.groups		= rbd_attr_groups,
2524dfc5606dSYehuda Sadeh 	.release	= rbd_sysfs_dev_release,
2525dfc5606dSYehuda Sadeh };
2526dfc5606dSYehuda Sadeh 
2527dfc5606dSYehuda Sadeh 
2528dfc5606dSYehuda Sadeh /*
2529dfc5606dSYehuda Sadeh   sysfs - snapshots
2530dfc5606dSYehuda Sadeh */
2531dfc5606dSYehuda Sadeh 
2532dfc5606dSYehuda Sadeh static ssize_t rbd_snap_size_show(struct device *dev,
2533dfc5606dSYehuda Sadeh 				  struct device_attribute *attr,
2534dfc5606dSYehuda Sadeh 				  char *buf)
2535dfc5606dSYehuda Sadeh {
2536dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2537dfc5606dSYehuda Sadeh 
25383591538fSJosh Durgin 	return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
2539dfc5606dSYehuda Sadeh }
2540dfc5606dSYehuda Sadeh 
2541dfc5606dSYehuda Sadeh static ssize_t rbd_snap_id_show(struct device *dev,
2542dfc5606dSYehuda Sadeh 				struct device_attribute *attr,
2543dfc5606dSYehuda Sadeh 				char *buf)
2544dfc5606dSYehuda Sadeh {
2545dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2546dfc5606dSYehuda Sadeh 
2547593a9e7bSAlex Elder 	return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
2548dfc5606dSYehuda Sadeh }
2549dfc5606dSYehuda Sadeh 
255034b13184SAlex Elder static ssize_t rbd_snap_features_show(struct device *dev,
255134b13184SAlex Elder 				struct device_attribute *attr,
255234b13184SAlex Elder 				char *buf)
255334b13184SAlex Elder {
255434b13184SAlex Elder 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
255534b13184SAlex Elder 
255634b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
255734b13184SAlex Elder 			(unsigned long long) snap->features);
255834b13184SAlex Elder }
255934b13184SAlex Elder 
2560dfc5606dSYehuda Sadeh static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
2561dfc5606dSYehuda Sadeh static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
256234b13184SAlex Elder static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
2563dfc5606dSYehuda Sadeh 
2564dfc5606dSYehuda Sadeh static struct attribute *rbd_snap_attrs[] = {
2565dfc5606dSYehuda Sadeh 	&dev_attr_snap_size.attr,
2566dfc5606dSYehuda Sadeh 	&dev_attr_snap_id.attr,
256734b13184SAlex Elder 	&dev_attr_snap_features.attr,
2568dfc5606dSYehuda Sadeh 	NULL,
2569dfc5606dSYehuda Sadeh };
2570dfc5606dSYehuda Sadeh 
2571dfc5606dSYehuda Sadeh static struct attribute_group rbd_snap_attr_group = {
2572dfc5606dSYehuda Sadeh 	.attrs = rbd_snap_attrs,
2573dfc5606dSYehuda Sadeh };
2574dfc5606dSYehuda Sadeh 
2575dfc5606dSYehuda Sadeh static void rbd_snap_dev_release(struct device *dev)
2576dfc5606dSYehuda Sadeh {
2577dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2578dfc5606dSYehuda Sadeh 	kfree(snap->name);
2579dfc5606dSYehuda Sadeh 	kfree(snap);
2580dfc5606dSYehuda Sadeh }
2581dfc5606dSYehuda Sadeh 
2582dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_snap_attr_groups[] = {
2583dfc5606dSYehuda Sadeh 	&rbd_snap_attr_group,
2584dfc5606dSYehuda Sadeh 	NULL
2585dfc5606dSYehuda Sadeh };
2586dfc5606dSYehuda Sadeh 
2587dfc5606dSYehuda Sadeh static struct device_type rbd_snap_device_type = {
2588dfc5606dSYehuda Sadeh 	.groups		= rbd_snap_attr_groups,
2589dfc5606dSYehuda Sadeh 	.release	= rbd_snap_dev_release,
2590dfc5606dSYehuda Sadeh };
2591dfc5606dSYehuda Sadeh 
25928b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
25938b8fb99cSAlex Elder {
25948b8fb99cSAlex Elder 	kref_get(&spec->kref);
25958b8fb99cSAlex Elder 
25968b8fb99cSAlex Elder 	return spec;
25978b8fb99cSAlex Elder }
25988b8fb99cSAlex Elder 
25998b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref);
26008b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec)
26018b8fb99cSAlex Elder {
26028b8fb99cSAlex Elder 	if (spec)
26038b8fb99cSAlex Elder 		kref_put(&spec->kref, rbd_spec_free);
26048b8fb99cSAlex Elder }
26058b8fb99cSAlex Elder 
26068b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void)
26078b8fb99cSAlex Elder {
26088b8fb99cSAlex Elder 	struct rbd_spec *spec;
26098b8fb99cSAlex Elder 
26108b8fb99cSAlex Elder 	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
26118b8fb99cSAlex Elder 	if (!spec)
26128b8fb99cSAlex Elder 		return NULL;
26138b8fb99cSAlex Elder 	kref_init(&spec->kref);
26148b8fb99cSAlex Elder 
26158b8fb99cSAlex Elder 	rbd_spec_put(rbd_spec_get(spec));	/* TEMPORARY */
26168b8fb99cSAlex Elder 
26178b8fb99cSAlex Elder 	return spec;
26188b8fb99cSAlex Elder }
26198b8fb99cSAlex Elder 
26208b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref)
26218b8fb99cSAlex Elder {
26228b8fb99cSAlex Elder 	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
26238b8fb99cSAlex Elder 
26248b8fb99cSAlex Elder 	kfree(spec->pool_name);
26258b8fb99cSAlex Elder 	kfree(spec->image_id);
26268b8fb99cSAlex Elder 	kfree(spec->image_name);
26278b8fb99cSAlex Elder 	kfree(spec->snap_name);
26288b8fb99cSAlex Elder 	kfree(spec);
26298b8fb99cSAlex Elder }
26308b8fb99cSAlex Elder 
2631c53d5893SAlex Elder struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
2632c53d5893SAlex Elder 				struct rbd_spec *spec)
2633c53d5893SAlex Elder {
2634c53d5893SAlex Elder 	struct rbd_device *rbd_dev;
2635c53d5893SAlex Elder 
2636c53d5893SAlex Elder 	rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
2637c53d5893SAlex Elder 	if (!rbd_dev)
2638c53d5893SAlex Elder 		return NULL;
2639c53d5893SAlex Elder 
2640c53d5893SAlex Elder 	spin_lock_init(&rbd_dev->lock);
26416d292906SAlex Elder 	rbd_dev->flags = 0;
2642c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->node);
2643c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->snaps);
2644c53d5893SAlex Elder 	init_rwsem(&rbd_dev->header_rwsem);
2645c53d5893SAlex Elder 
2646c53d5893SAlex Elder 	rbd_dev->spec = spec;
2647c53d5893SAlex Elder 	rbd_dev->rbd_client = rbdc;
2648c53d5893SAlex Elder 
26490903e875SAlex Elder 	/* Initialize the layout used for all rbd requests */
26500903e875SAlex Elder 
26510903e875SAlex Elder 	rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
26520903e875SAlex Elder 	rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
26530903e875SAlex Elder 	rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
26540903e875SAlex Elder 	rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
26550903e875SAlex Elder 
2656c53d5893SAlex Elder 	return rbd_dev;
2657c53d5893SAlex Elder }
2658c53d5893SAlex Elder 
2659c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev)
2660c53d5893SAlex Elder {
266186b00e0dSAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
2662c53d5893SAlex Elder 	kfree(rbd_dev->header_name);
2663c53d5893SAlex Elder 	rbd_put_client(rbd_dev->rbd_client);
2664c53d5893SAlex Elder 	rbd_spec_put(rbd_dev->spec);
2665c53d5893SAlex Elder 	kfree(rbd_dev);
2666c53d5893SAlex Elder }
2667c53d5893SAlex Elder 
2668304f6808SAlex Elder static bool rbd_snap_registered(struct rbd_snap *snap)
2669304f6808SAlex Elder {
2670304f6808SAlex Elder 	bool ret = snap->dev.type == &rbd_snap_device_type;
2671304f6808SAlex Elder 	bool reg = device_is_registered(&snap->dev);
2672304f6808SAlex Elder 
2673304f6808SAlex Elder 	rbd_assert(!ret ^ reg);
2674304f6808SAlex Elder 
2675304f6808SAlex Elder 	return ret;
2676304f6808SAlex Elder }
2677304f6808SAlex Elder 
267841f38c2bSAlex Elder static void rbd_remove_snap_dev(struct rbd_snap *snap)
2679dfc5606dSYehuda Sadeh {
2680dfc5606dSYehuda Sadeh 	list_del(&snap->node);
2681304f6808SAlex Elder 	if (device_is_registered(&snap->dev))
2682dfc5606dSYehuda Sadeh 		device_unregister(&snap->dev);
2683dfc5606dSYehuda Sadeh }
2684dfc5606dSYehuda Sadeh 
268514e7085dSAlex Elder static int rbd_register_snap_dev(struct rbd_snap *snap,
2686dfc5606dSYehuda Sadeh 				  struct device *parent)
2687dfc5606dSYehuda Sadeh {
2688dfc5606dSYehuda Sadeh 	struct device *dev = &snap->dev;
2689dfc5606dSYehuda Sadeh 	int ret;
2690dfc5606dSYehuda Sadeh 
2691dfc5606dSYehuda Sadeh 	dev->type = &rbd_snap_device_type;
2692dfc5606dSYehuda Sadeh 	dev->parent = parent;
2693dfc5606dSYehuda Sadeh 	dev->release = rbd_snap_dev_release;
2694d4b125e9SAlex Elder 	dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
2695304f6808SAlex Elder 	dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2696304f6808SAlex Elder 
2697dfc5606dSYehuda Sadeh 	ret = device_register(dev);
2698dfc5606dSYehuda Sadeh 
2699dfc5606dSYehuda Sadeh 	return ret;
2700dfc5606dSYehuda Sadeh }
2701dfc5606dSYehuda Sadeh 
27024e891e0aSAlex Elder static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
2703c8d18425SAlex Elder 						const char *snap_name,
270434b13184SAlex Elder 						u64 snap_id, u64 snap_size,
270534b13184SAlex Elder 						u64 snap_features)
2706dfc5606dSYehuda Sadeh {
27074e891e0aSAlex Elder 	struct rbd_snap *snap;
2708dfc5606dSYehuda Sadeh 	int ret;
27094e891e0aSAlex Elder 
27104e891e0aSAlex Elder 	snap = kzalloc(sizeof (*snap), GFP_KERNEL);
2711dfc5606dSYehuda Sadeh 	if (!snap)
27124e891e0aSAlex Elder 		return ERR_PTR(-ENOMEM);
27134e891e0aSAlex Elder 
27144e891e0aSAlex Elder 	ret = -ENOMEM;
2715c8d18425SAlex Elder 	snap->name = kstrdup(snap_name, GFP_KERNEL);
27164e891e0aSAlex Elder 	if (!snap->name)
27174e891e0aSAlex Elder 		goto err;
27184e891e0aSAlex Elder 
2719c8d18425SAlex Elder 	snap->id = snap_id;
2720c8d18425SAlex Elder 	snap->size = snap_size;
272134b13184SAlex Elder 	snap->features = snap_features;
27224e891e0aSAlex Elder 
27234e891e0aSAlex Elder 	return snap;
27244e891e0aSAlex Elder 
2725dfc5606dSYehuda Sadeh err:
2726dfc5606dSYehuda Sadeh 	kfree(snap->name);
2727dfc5606dSYehuda Sadeh 	kfree(snap);
27284e891e0aSAlex Elder 
27294e891e0aSAlex Elder 	return ERR_PTR(ret);
2730dfc5606dSYehuda Sadeh }
2731dfc5606dSYehuda Sadeh 
2732cd892126SAlex Elder static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2733cd892126SAlex Elder 		u64 *snap_size, u64 *snap_features)
2734cd892126SAlex Elder {
2735cd892126SAlex Elder 	char *snap_name;
2736cd892126SAlex Elder 
2737cd892126SAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2738cd892126SAlex Elder 
2739cd892126SAlex Elder 	*snap_size = rbd_dev->header.snap_sizes[which];
2740cd892126SAlex Elder 	*snap_features = 0;	/* No features for v1 */
2741cd892126SAlex Elder 
2742cd892126SAlex Elder 	/* Skip over names until we find the one we are looking for */
2743cd892126SAlex Elder 
2744cd892126SAlex Elder 	snap_name = rbd_dev->header.snap_names;
2745cd892126SAlex Elder 	while (which--)
2746cd892126SAlex Elder 		snap_name += strlen(snap_name) + 1;
2747cd892126SAlex Elder 
2748cd892126SAlex Elder 	return snap_name;
2749cd892126SAlex Elder }
2750cd892126SAlex Elder 
2751dfc5606dSYehuda Sadeh /*
27529d475de5SAlex Elder  * Get the size and object order for an image snapshot, or if
27539d475de5SAlex Elder  * snap_id is CEPH_NOSNAP, gets this information for the base
27549d475de5SAlex Elder  * image.
27559d475de5SAlex Elder  */
27569d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
27579d475de5SAlex Elder 				u8 *order, u64 *snap_size)
27589d475de5SAlex Elder {
27599d475de5SAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
27609d475de5SAlex Elder 	int ret;
27619d475de5SAlex Elder 	struct {
27629d475de5SAlex Elder 		u8 order;
27639d475de5SAlex Elder 		__le64 size;
27649d475de5SAlex Elder 	} __attribute__ ((packed)) size_buf = { 0 };
27659d475de5SAlex Elder 
276636be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
27679d475de5SAlex Elder 				"rbd", "get_size",
27689d475de5SAlex Elder 				(char *) &snapid, sizeof (snapid),
276907b2391fSAlex Elder 				(char *) &size_buf, sizeof (size_buf), NULL);
277036be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
27719d475de5SAlex Elder 	if (ret < 0)
27729d475de5SAlex Elder 		return ret;
27739d475de5SAlex Elder 
27749d475de5SAlex Elder 	*order = size_buf.order;
27759d475de5SAlex Elder 	*snap_size = le64_to_cpu(size_buf.size);
27769d475de5SAlex Elder 
27779d475de5SAlex Elder 	dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
27789d475de5SAlex Elder 		(unsigned long long) snap_id, (unsigned int) *order,
27799d475de5SAlex Elder 		(unsigned long long) *snap_size);
27809d475de5SAlex Elder 
27819d475de5SAlex Elder 	return 0;
27829d475de5SAlex Elder }
27839d475de5SAlex Elder 
27849d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
27859d475de5SAlex Elder {
27869d475de5SAlex Elder 	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
27879d475de5SAlex Elder 					&rbd_dev->header.obj_order,
27889d475de5SAlex Elder 					&rbd_dev->header.image_size);
27899d475de5SAlex Elder }
27909d475de5SAlex Elder 
27911e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
27921e130199SAlex Elder {
27931e130199SAlex Elder 	void *reply_buf;
27941e130199SAlex Elder 	int ret;
27951e130199SAlex Elder 	void *p;
27961e130199SAlex Elder 
27971e130199SAlex Elder 	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
27981e130199SAlex Elder 	if (!reply_buf)
27991e130199SAlex Elder 		return -ENOMEM;
28001e130199SAlex Elder 
280136be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
28021e130199SAlex Elder 				"rbd", "get_object_prefix",
28031e130199SAlex Elder 				NULL, 0,
280407b2391fSAlex Elder 				reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
280536be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
28061e130199SAlex Elder 	if (ret < 0)
28071e130199SAlex Elder 		goto out;
28081e130199SAlex Elder 
28091e130199SAlex Elder 	p = reply_buf;
28101e130199SAlex Elder 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
28111e130199SAlex Elder 						p + RBD_OBJ_PREFIX_LEN_MAX,
28121e130199SAlex Elder 						NULL, GFP_NOIO);
28131e130199SAlex Elder 
28141e130199SAlex Elder 	if (IS_ERR(rbd_dev->header.object_prefix)) {
28151e130199SAlex Elder 		ret = PTR_ERR(rbd_dev->header.object_prefix);
28161e130199SAlex Elder 		rbd_dev->header.object_prefix = NULL;
28171e130199SAlex Elder 	} else {
28181e130199SAlex Elder 		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
28191e130199SAlex Elder 	}
28201e130199SAlex Elder 
28211e130199SAlex Elder out:
28221e130199SAlex Elder 	kfree(reply_buf);
28231e130199SAlex Elder 
28241e130199SAlex Elder 	return ret;
28251e130199SAlex Elder }
28261e130199SAlex Elder 
2827b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
2828b1b5402aSAlex Elder 		u64 *snap_features)
2829b1b5402aSAlex Elder {
2830b1b5402aSAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
2831b1b5402aSAlex Elder 	struct {
2832b1b5402aSAlex Elder 		__le64 features;
2833b1b5402aSAlex Elder 		__le64 incompat;
2834b1b5402aSAlex Elder 	} features_buf = { 0 };
2835d889140cSAlex Elder 	u64 incompat;
2836b1b5402aSAlex Elder 	int ret;
2837b1b5402aSAlex Elder 
283836be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
2839b1b5402aSAlex Elder 				"rbd", "get_features",
2840b1b5402aSAlex Elder 				(char *) &snapid, sizeof (snapid),
2841b1b5402aSAlex Elder 				(char *) &features_buf, sizeof (features_buf),
284207b2391fSAlex Elder 				NULL);
284336be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
2844b1b5402aSAlex Elder 	if (ret < 0)
2845b1b5402aSAlex Elder 		return ret;
2846d889140cSAlex Elder 
2847d889140cSAlex Elder 	incompat = le64_to_cpu(features_buf.incompat);
2848d889140cSAlex Elder 	if (incompat & ~RBD_FEATURES_ALL)
2849b8f5c6edSAlex Elder 		return -ENXIO;
2850d889140cSAlex Elder 
2851b1b5402aSAlex Elder 	*snap_features = le64_to_cpu(features_buf.features);
2852b1b5402aSAlex Elder 
2853b1b5402aSAlex Elder 	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
2854b1b5402aSAlex Elder 		(unsigned long long) snap_id,
2855b1b5402aSAlex Elder 		(unsigned long long) *snap_features,
2856b1b5402aSAlex Elder 		(unsigned long long) le64_to_cpu(features_buf.incompat));
2857b1b5402aSAlex Elder 
2858b1b5402aSAlex Elder 	return 0;
2859b1b5402aSAlex Elder }
2860b1b5402aSAlex Elder 
2861b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
2862b1b5402aSAlex Elder {
2863b1b5402aSAlex Elder 	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
2864b1b5402aSAlex Elder 						&rbd_dev->header.features);
2865b1b5402aSAlex Elder }
2866b1b5402aSAlex Elder 
286786b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
286886b00e0dSAlex Elder {
286986b00e0dSAlex Elder 	struct rbd_spec *parent_spec;
287086b00e0dSAlex Elder 	size_t size;
287186b00e0dSAlex Elder 	void *reply_buf = NULL;
287286b00e0dSAlex Elder 	__le64 snapid;
287386b00e0dSAlex Elder 	void *p;
287486b00e0dSAlex Elder 	void *end;
287586b00e0dSAlex Elder 	char *image_id;
287686b00e0dSAlex Elder 	u64 overlap;
287786b00e0dSAlex Elder 	int ret;
287886b00e0dSAlex Elder 
287986b00e0dSAlex Elder 	parent_spec = rbd_spec_alloc();
288086b00e0dSAlex Elder 	if (!parent_spec)
288186b00e0dSAlex Elder 		return -ENOMEM;
288286b00e0dSAlex Elder 
288386b00e0dSAlex Elder 	size = sizeof (__le64) +				/* pool_id */
288486b00e0dSAlex Elder 		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
288586b00e0dSAlex Elder 		sizeof (__le64) +				/* snap_id */
288686b00e0dSAlex Elder 		sizeof (__le64);				/* overlap */
288786b00e0dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
288886b00e0dSAlex Elder 	if (!reply_buf) {
288986b00e0dSAlex Elder 		ret = -ENOMEM;
289086b00e0dSAlex Elder 		goto out_err;
289186b00e0dSAlex Elder 	}
289286b00e0dSAlex Elder 
289386b00e0dSAlex Elder 	snapid = cpu_to_le64(CEPH_NOSNAP);
289436be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
289586b00e0dSAlex Elder 				"rbd", "get_parent",
289686b00e0dSAlex Elder 				(char *) &snapid, sizeof (snapid),
289707b2391fSAlex Elder 				(char *) reply_buf, size, NULL);
289836be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
289986b00e0dSAlex Elder 	if (ret < 0)
290086b00e0dSAlex Elder 		goto out_err;
290186b00e0dSAlex Elder 
290286b00e0dSAlex Elder 	ret = -ERANGE;
290386b00e0dSAlex Elder 	p = reply_buf;
290486b00e0dSAlex Elder 	end = (char *) reply_buf + size;
290586b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
290686b00e0dSAlex Elder 	if (parent_spec->pool_id == CEPH_NOPOOL)
290786b00e0dSAlex Elder 		goto out;	/* No parent?  No problem. */
290886b00e0dSAlex Elder 
29090903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
29100903e875SAlex Elder 
29110903e875SAlex Elder 	ret = -EIO;
29120903e875SAlex Elder 	if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
29130903e875SAlex Elder 		goto out;
29140903e875SAlex Elder 
2915979ed480SAlex Elder 	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
291686b00e0dSAlex Elder 	if (IS_ERR(image_id)) {
291786b00e0dSAlex Elder 		ret = PTR_ERR(image_id);
291886b00e0dSAlex Elder 		goto out_err;
291986b00e0dSAlex Elder 	}
292086b00e0dSAlex Elder 	parent_spec->image_id = image_id;
292186b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
292286b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, overlap, out_err);
292386b00e0dSAlex Elder 
292486b00e0dSAlex Elder 	rbd_dev->parent_overlap = overlap;
292586b00e0dSAlex Elder 	rbd_dev->parent_spec = parent_spec;
292686b00e0dSAlex Elder 	parent_spec = NULL;	/* rbd_dev now owns this */
292786b00e0dSAlex Elder out:
292886b00e0dSAlex Elder 	ret = 0;
292986b00e0dSAlex Elder out_err:
293086b00e0dSAlex Elder 	kfree(reply_buf);
293186b00e0dSAlex Elder 	rbd_spec_put(parent_spec);
293286b00e0dSAlex Elder 
293386b00e0dSAlex Elder 	return ret;
293486b00e0dSAlex Elder }
293586b00e0dSAlex Elder 
29369e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
29379e15b77dSAlex Elder {
29389e15b77dSAlex Elder 	size_t image_id_size;
29399e15b77dSAlex Elder 	char *image_id;
29409e15b77dSAlex Elder 	void *p;
29419e15b77dSAlex Elder 	void *end;
29429e15b77dSAlex Elder 	size_t size;
29439e15b77dSAlex Elder 	void *reply_buf = NULL;
29449e15b77dSAlex Elder 	size_t len = 0;
29459e15b77dSAlex Elder 	char *image_name = NULL;
29469e15b77dSAlex Elder 	int ret;
29479e15b77dSAlex Elder 
29489e15b77dSAlex Elder 	rbd_assert(!rbd_dev->spec->image_name);
29499e15b77dSAlex Elder 
295069e7a02fSAlex Elder 	len = strlen(rbd_dev->spec->image_id);
295169e7a02fSAlex Elder 	image_id_size = sizeof (__le32) + len;
29529e15b77dSAlex Elder 	image_id = kmalloc(image_id_size, GFP_KERNEL);
29539e15b77dSAlex Elder 	if (!image_id)
29549e15b77dSAlex Elder 		return NULL;
29559e15b77dSAlex Elder 
29569e15b77dSAlex Elder 	p = image_id;
29579e15b77dSAlex Elder 	end = (char *) image_id + image_id_size;
295869e7a02fSAlex Elder 	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);
29599e15b77dSAlex Elder 
29609e15b77dSAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
29619e15b77dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
29629e15b77dSAlex Elder 	if (!reply_buf)
29639e15b77dSAlex Elder 		goto out;
29649e15b77dSAlex Elder 
296536be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
29669e15b77dSAlex Elder 				"rbd", "dir_get_name",
29679e15b77dSAlex Elder 				image_id, image_id_size,
296807b2391fSAlex Elder 				(char *) reply_buf, size, NULL);
29699e15b77dSAlex Elder 	if (ret < 0)
29709e15b77dSAlex Elder 		goto out;
29719e15b77dSAlex Elder 	p = reply_buf;
29729e15b77dSAlex Elder 	end = (char *) reply_buf + size;
29739e15b77dSAlex Elder 	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
29749e15b77dSAlex Elder 	if (IS_ERR(image_name))
29759e15b77dSAlex Elder 		image_name = NULL;
29769e15b77dSAlex Elder 	else
29779e15b77dSAlex Elder 		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
29789e15b77dSAlex Elder out:
29799e15b77dSAlex Elder 	kfree(reply_buf);
29809e15b77dSAlex Elder 	kfree(image_id);
29819e15b77dSAlex Elder 
29829e15b77dSAlex Elder 	return image_name;
29839e15b77dSAlex Elder }
29849e15b77dSAlex Elder 
29859e15b77dSAlex Elder /*
29869e15b77dSAlex Elder  * When a parent image gets probed, we only have the pool, image,
29879e15b77dSAlex Elder  * and snapshot ids but not the names of any of them.  This call
29889e15b77dSAlex Elder  * is made later to fill in those names.  It has to be done after
29899e15b77dSAlex Elder  * rbd_dev_snaps_update() has completed because some of the
29909e15b77dSAlex Elder  * information (in particular, snapshot name) is not available
29919e15b77dSAlex Elder  * until then.
29929e15b77dSAlex Elder  */
29939e15b77dSAlex Elder static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
29949e15b77dSAlex Elder {
29959e15b77dSAlex Elder 	struct ceph_osd_client *osdc;
29969e15b77dSAlex Elder 	const char *name;
29979e15b77dSAlex Elder 	void *reply_buf = NULL;
29989e15b77dSAlex Elder 	int ret;
29999e15b77dSAlex Elder 
30009e15b77dSAlex Elder 	if (rbd_dev->spec->pool_name)
30019e15b77dSAlex Elder 		return 0;	/* Already have the names */
30029e15b77dSAlex Elder 
30039e15b77dSAlex Elder 	/* Look up the pool name */
30049e15b77dSAlex Elder 
30059e15b77dSAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
30069e15b77dSAlex Elder 	name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
3007935dc89fSAlex Elder 	if (!name) {
3008935dc89fSAlex Elder 		rbd_warn(rbd_dev, "there is no pool with id %llu",
3009935dc89fSAlex Elder 			rbd_dev->spec->pool_id);	/* Really a BUG() */
3010935dc89fSAlex Elder 		return -EIO;
3011935dc89fSAlex Elder 	}
30129e15b77dSAlex Elder 
30139e15b77dSAlex Elder 	rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
30149e15b77dSAlex Elder 	if (!rbd_dev->spec->pool_name)
30159e15b77dSAlex Elder 		return -ENOMEM;
30169e15b77dSAlex Elder 
30179e15b77dSAlex Elder 	/* Fetch the image name; tolerate failure here */
30189e15b77dSAlex Elder 
30199e15b77dSAlex Elder 	name = rbd_dev_image_name(rbd_dev);
302069e7a02fSAlex Elder 	if (name)
30219e15b77dSAlex Elder 		rbd_dev->spec->image_name = (char *) name;
302269e7a02fSAlex Elder 	else
302306ecc6cbSAlex Elder 		rbd_warn(rbd_dev, "unable to get image name");
30249e15b77dSAlex Elder 
30259e15b77dSAlex Elder 	/* Look up the snapshot name. */
30269e15b77dSAlex Elder 
30279e15b77dSAlex Elder 	name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
30289e15b77dSAlex Elder 	if (!name) {
3029935dc89fSAlex Elder 		rbd_warn(rbd_dev, "no snapshot with id %llu",
3030935dc89fSAlex Elder 			rbd_dev->spec->snap_id);	/* Really a BUG() */
30319e15b77dSAlex Elder 		ret = -EIO;
30329e15b77dSAlex Elder 		goto out_err;
30339e15b77dSAlex Elder 	}
30349e15b77dSAlex Elder 	rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
30359e15b77dSAlex Elder 	if(!rbd_dev->spec->snap_name)
30369e15b77dSAlex Elder 		goto out_err;
30379e15b77dSAlex Elder 
30389e15b77dSAlex Elder 	return 0;
30399e15b77dSAlex Elder out_err:
30409e15b77dSAlex Elder 	kfree(reply_buf);
30419e15b77dSAlex Elder 	kfree(rbd_dev->spec->pool_name);
30429e15b77dSAlex Elder 	rbd_dev->spec->pool_name = NULL;
30439e15b77dSAlex Elder 
30449e15b77dSAlex Elder 	return ret;
30459e15b77dSAlex Elder }
30469e15b77dSAlex Elder 
30476e14b1a6SAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
304835d489f9SAlex Elder {
304935d489f9SAlex Elder 	size_t size;
305035d489f9SAlex Elder 	int ret;
305135d489f9SAlex Elder 	void *reply_buf;
305235d489f9SAlex Elder 	void *p;
305335d489f9SAlex Elder 	void *end;
305435d489f9SAlex Elder 	u64 seq;
305535d489f9SAlex Elder 	u32 snap_count;
305635d489f9SAlex Elder 	struct ceph_snap_context *snapc;
305735d489f9SAlex Elder 	u32 i;
305835d489f9SAlex Elder 
305935d489f9SAlex Elder 	/*
306035d489f9SAlex Elder 	 * We'll need room for the seq value (maximum snapshot id),
306135d489f9SAlex Elder 	 * snapshot count, and array of that many snapshot ids.
306235d489f9SAlex Elder 	 * For now we have a fixed upper limit on the number we're
306335d489f9SAlex Elder 	 * prepared to receive.
306435d489f9SAlex Elder 	 */
306535d489f9SAlex Elder 	size = sizeof (__le64) + sizeof (__le32) +
306635d489f9SAlex Elder 			RBD_MAX_SNAP_COUNT * sizeof (__le64);
306735d489f9SAlex Elder 	reply_buf = kzalloc(size, GFP_KERNEL);
306835d489f9SAlex Elder 	if (!reply_buf)
306935d489f9SAlex Elder 		return -ENOMEM;
307035d489f9SAlex Elder 
307136be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
307235d489f9SAlex Elder 				"rbd", "get_snapcontext",
307335d489f9SAlex Elder 				NULL, 0,
307407b2391fSAlex Elder 				reply_buf, size, ver);
307536be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
307635d489f9SAlex Elder 	if (ret < 0)
307735d489f9SAlex Elder 		goto out;
307835d489f9SAlex Elder 
307935d489f9SAlex Elder 	ret = -ERANGE;
308035d489f9SAlex Elder 	p = reply_buf;
308135d489f9SAlex Elder 	end = (char *) reply_buf + size;
308235d489f9SAlex Elder 	ceph_decode_64_safe(&p, end, seq, out);
308335d489f9SAlex Elder 	ceph_decode_32_safe(&p, end, snap_count, out);
308435d489f9SAlex Elder 
308535d489f9SAlex Elder 	/*
308635d489f9SAlex Elder 	 * Make sure the reported number of snapshot ids wouldn't go
308735d489f9SAlex Elder 	 * beyond the end of our buffer.  But before checking that,
308835d489f9SAlex Elder 	 * make sure the computed size of the snapshot context we
308935d489f9SAlex Elder 	 * allocate is representable in a size_t.
309035d489f9SAlex Elder 	 */
309135d489f9SAlex Elder 	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
309235d489f9SAlex Elder 				 / sizeof (u64)) {
309335d489f9SAlex Elder 		ret = -EINVAL;
309435d489f9SAlex Elder 		goto out;
309535d489f9SAlex Elder 	}
309635d489f9SAlex Elder 	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
309735d489f9SAlex Elder 		goto out;
309835d489f9SAlex Elder 
309935d489f9SAlex Elder 	size = sizeof (struct ceph_snap_context) +
310035d489f9SAlex Elder 				snap_count * sizeof (snapc->snaps[0]);
310135d489f9SAlex Elder 	snapc = kmalloc(size, GFP_KERNEL);
310235d489f9SAlex Elder 	if (!snapc) {
310335d489f9SAlex Elder 		ret = -ENOMEM;
310435d489f9SAlex Elder 		goto out;
310535d489f9SAlex Elder 	}
310635d489f9SAlex Elder 
310735d489f9SAlex Elder 	atomic_set(&snapc->nref, 1);
310835d489f9SAlex Elder 	snapc->seq = seq;
310935d489f9SAlex Elder 	snapc->num_snaps = snap_count;
311035d489f9SAlex Elder 	for (i = 0; i < snap_count; i++)
311135d489f9SAlex Elder 		snapc->snaps[i] = ceph_decode_64(&p);
311235d489f9SAlex Elder 
311335d489f9SAlex Elder 	rbd_dev->header.snapc = snapc;
311435d489f9SAlex Elder 
311535d489f9SAlex Elder 	dout("  snap context seq = %llu, snap_count = %u\n",
311635d489f9SAlex Elder 		(unsigned long long) seq, (unsigned int) snap_count);
311735d489f9SAlex Elder 
311835d489f9SAlex Elder out:
311935d489f9SAlex Elder 	kfree(reply_buf);
312035d489f9SAlex Elder 
312135d489f9SAlex Elder 	return 0;
312235d489f9SAlex Elder }
312335d489f9SAlex Elder 
3124b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
3125b8b1e2dbSAlex Elder {
3126b8b1e2dbSAlex Elder 	size_t size;
3127b8b1e2dbSAlex Elder 	void *reply_buf;
3128b8b1e2dbSAlex Elder 	__le64 snap_id;
3129b8b1e2dbSAlex Elder 	int ret;
3130b8b1e2dbSAlex Elder 	void *p;
3131b8b1e2dbSAlex Elder 	void *end;
3132b8b1e2dbSAlex Elder 	char *snap_name;
3133b8b1e2dbSAlex Elder 
3134b8b1e2dbSAlex Elder 	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
3135b8b1e2dbSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
3136b8b1e2dbSAlex Elder 	if (!reply_buf)
3137b8b1e2dbSAlex Elder 		return ERR_PTR(-ENOMEM);
3138b8b1e2dbSAlex Elder 
3139b8b1e2dbSAlex Elder 	snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
314036be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3141b8b1e2dbSAlex Elder 				"rbd", "get_snapshot_name",
3142b8b1e2dbSAlex Elder 				(char *) &snap_id, sizeof (snap_id),
314307b2391fSAlex Elder 				reply_buf, size, NULL);
314436be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3145b8b1e2dbSAlex Elder 	if (ret < 0)
3146b8b1e2dbSAlex Elder 		goto out;
3147b8b1e2dbSAlex Elder 
3148b8b1e2dbSAlex Elder 	p = reply_buf;
3149b8b1e2dbSAlex Elder 	end = (char *) reply_buf + size;
3150e5c35534SAlex Elder 	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
3151b8b1e2dbSAlex Elder 	if (IS_ERR(snap_name)) {
3152b8b1e2dbSAlex Elder 		ret = PTR_ERR(snap_name);
3153b8b1e2dbSAlex Elder 		goto out;
3154b8b1e2dbSAlex Elder 	} else {
3155b8b1e2dbSAlex Elder 		dout("  snap_id 0x%016llx snap_name = %s\n",
3156b8b1e2dbSAlex Elder 			(unsigned long long) le64_to_cpu(snap_id), snap_name);
3157b8b1e2dbSAlex Elder 	}
3158b8b1e2dbSAlex Elder 	kfree(reply_buf);
3159b8b1e2dbSAlex Elder 
3160b8b1e2dbSAlex Elder 	return snap_name;
3161b8b1e2dbSAlex Elder out:
3162b8b1e2dbSAlex Elder 	kfree(reply_buf);
3163b8b1e2dbSAlex Elder 
3164b8b1e2dbSAlex Elder 	return ERR_PTR(ret);
3165b8b1e2dbSAlex Elder }
3166b8b1e2dbSAlex Elder 
3167b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
3168b8b1e2dbSAlex Elder 		u64 *snap_size, u64 *snap_features)
3169b8b1e2dbSAlex Elder {
3170e0b49868SAlex Elder 	u64 snap_id;
3171b8b1e2dbSAlex Elder 	u8 order;
3172b8b1e2dbSAlex Elder 	int ret;
3173b8b1e2dbSAlex Elder 
3174b8b1e2dbSAlex Elder 	snap_id = rbd_dev->header.snapc->snaps[which];
3175b8b1e2dbSAlex Elder 	ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
3176b8b1e2dbSAlex Elder 	if (ret)
3177b8b1e2dbSAlex Elder 		return ERR_PTR(ret);
3178b8b1e2dbSAlex Elder 	ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
3179b8b1e2dbSAlex Elder 	if (ret)
3180b8b1e2dbSAlex Elder 		return ERR_PTR(ret);
3181b8b1e2dbSAlex Elder 
3182b8b1e2dbSAlex Elder 	return rbd_dev_v2_snap_name(rbd_dev, which);
3183b8b1e2dbSAlex Elder }
3184b8b1e2dbSAlex Elder 
3185b8b1e2dbSAlex Elder static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
3186b8b1e2dbSAlex Elder 		u64 *snap_size, u64 *snap_features)
3187b8b1e2dbSAlex Elder {
3188b8b1e2dbSAlex Elder 	if (rbd_dev->image_format == 1)
3189b8b1e2dbSAlex Elder 		return rbd_dev_v1_snap_info(rbd_dev, which,
3190b8b1e2dbSAlex Elder 					snap_size, snap_features);
3191b8b1e2dbSAlex Elder 	if (rbd_dev->image_format == 2)
3192b8b1e2dbSAlex Elder 		return rbd_dev_v2_snap_info(rbd_dev, which,
3193b8b1e2dbSAlex Elder 					snap_size, snap_features);
3194b8b1e2dbSAlex Elder 	return ERR_PTR(-EINVAL);
3195b8b1e2dbSAlex Elder }
3196b8b1e2dbSAlex Elder 
3197117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
3198117973fbSAlex Elder {
3199117973fbSAlex Elder 	int ret;
3200117973fbSAlex Elder 	__u8 obj_order;
3201117973fbSAlex Elder 
3202117973fbSAlex Elder 	down_write(&rbd_dev->header_rwsem);
3203117973fbSAlex Elder 
3204117973fbSAlex Elder 	/* Grab old order first, to see if it changes */
3205117973fbSAlex Elder 
3206117973fbSAlex Elder 	obj_order = rbd_dev->header.obj_order,
3207117973fbSAlex Elder 	ret = rbd_dev_v2_image_size(rbd_dev);
3208117973fbSAlex Elder 	if (ret)
3209117973fbSAlex Elder 		goto out;
3210117973fbSAlex Elder 	if (rbd_dev->header.obj_order != obj_order) {
3211117973fbSAlex Elder 		ret = -EIO;
3212117973fbSAlex Elder 		goto out;
3213117973fbSAlex Elder 	}
3214117973fbSAlex Elder 	rbd_update_mapping_size(rbd_dev);
3215117973fbSAlex Elder 
3216117973fbSAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev, hver);
3217117973fbSAlex Elder 	dout("rbd_dev_v2_snap_context returned %d\n", ret);
3218117973fbSAlex Elder 	if (ret)
3219117973fbSAlex Elder 		goto out;
3220117973fbSAlex Elder 	ret = rbd_dev_snaps_update(rbd_dev);
3221117973fbSAlex Elder 	dout("rbd_dev_snaps_update returned %d\n", ret);
3222117973fbSAlex Elder 	if (ret)
3223117973fbSAlex Elder 		goto out;
3224117973fbSAlex Elder 	ret = rbd_dev_snaps_register(rbd_dev);
3225117973fbSAlex Elder 	dout("rbd_dev_snaps_register returned %d\n", ret);
3226117973fbSAlex Elder out:
3227117973fbSAlex Elder 	up_write(&rbd_dev->header_rwsem);
3228117973fbSAlex Elder 
3229117973fbSAlex Elder 	return ret;
3230117973fbSAlex Elder }
3231117973fbSAlex Elder 
32329d475de5SAlex Elder /*
323335938150SAlex Elder  * Scan the rbd device's current snapshot list and compare it to the
323435938150SAlex Elder  * newly-received snapshot context.  Remove any existing snapshots
323535938150SAlex Elder  * not present in the new snapshot context.  Add a new snapshot for
323635938150SAlex Elder  * any snaphots in the snapshot context not in the current list.
323735938150SAlex Elder  * And verify there are no changes to snapshots we already know
323835938150SAlex Elder  * about.
323935938150SAlex Elder  *
324035938150SAlex Elder  * Assumes the snapshots in the snapshot context are sorted by
324135938150SAlex Elder  * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
324235938150SAlex Elder  * are also maintained in that order.)
3243dfc5606dSYehuda Sadeh  */
3244304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
3245dfc5606dSYehuda Sadeh {
324635938150SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
324735938150SAlex Elder 	const u32 snap_count = snapc->num_snaps;
324835938150SAlex Elder 	struct list_head *head = &rbd_dev->snaps;
324935938150SAlex Elder 	struct list_head *links = head->next;
325035938150SAlex Elder 	u32 index = 0;
3251dfc5606dSYehuda Sadeh 
32529fcbb800SAlex Elder 	dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
325335938150SAlex Elder 	while (index < snap_count || links != head) {
325435938150SAlex Elder 		u64 snap_id;
325535938150SAlex Elder 		struct rbd_snap *snap;
3256cd892126SAlex Elder 		char *snap_name;
3257cd892126SAlex Elder 		u64 snap_size = 0;
3258cd892126SAlex Elder 		u64 snap_features = 0;
3259dfc5606dSYehuda Sadeh 
326035938150SAlex Elder 		snap_id = index < snap_count ? snapc->snaps[index]
326135938150SAlex Elder 					     : CEPH_NOSNAP;
326235938150SAlex Elder 		snap = links != head ? list_entry(links, struct rbd_snap, node)
326335938150SAlex Elder 				     : NULL;
3264aafb230eSAlex Elder 		rbd_assert(!snap || snap->id != CEPH_NOSNAP);
3265dfc5606dSYehuda Sadeh 
326635938150SAlex Elder 		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
326735938150SAlex Elder 			struct list_head *next = links->next;
3268dfc5606dSYehuda Sadeh 
32696d292906SAlex Elder 			/*
32706d292906SAlex Elder 			 * A previously-existing snapshot is not in
32716d292906SAlex Elder 			 * the new snap context.
32726d292906SAlex Elder 			 *
32736d292906SAlex Elder 			 * If the now missing snapshot is the one the
32746d292906SAlex Elder 			 * image is mapped to, clear its exists flag
32756d292906SAlex Elder 			 * so we can avoid sending any more requests
32766d292906SAlex Elder 			 * to it.
32776d292906SAlex Elder 			 */
32780d7dbfceSAlex Elder 			if (rbd_dev->spec->snap_id == snap->id)
32796d292906SAlex Elder 				clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
328041f38c2bSAlex Elder 			rbd_remove_snap_dev(snap);
32819fcbb800SAlex Elder 			dout("%ssnap id %llu has been removed\n",
32820d7dbfceSAlex Elder 				rbd_dev->spec->snap_id == snap->id ?
32830d7dbfceSAlex Elder 							"mapped " : "",
32849fcbb800SAlex Elder 				(unsigned long long) snap->id);
3285dfc5606dSYehuda Sadeh 
328635938150SAlex Elder 			/* Done with this list entry; advance */
328735938150SAlex Elder 
328835938150SAlex Elder 			links = next;
328935938150SAlex Elder 			continue;
3290dfc5606dSYehuda Sadeh 		}
329135938150SAlex Elder 
3292b8b1e2dbSAlex Elder 		snap_name = rbd_dev_snap_info(rbd_dev, index,
3293cd892126SAlex Elder 					&snap_size, &snap_features);
3294cd892126SAlex Elder 		if (IS_ERR(snap_name))
3295cd892126SAlex Elder 			return PTR_ERR(snap_name);
3296cd892126SAlex Elder 
32979fcbb800SAlex Elder 		dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
32989fcbb800SAlex Elder 			(unsigned long long) snap_id);
329935938150SAlex Elder 		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
330035938150SAlex Elder 			struct rbd_snap *new_snap;
330135938150SAlex Elder 
330235938150SAlex Elder 			/* We haven't seen this snapshot before */
330335938150SAlex Elder 
3304c8d18425SAlex Elder 			new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
3305cd892126SAlex Elder 					snap_id, snap_size, snap_features);
33069fcbb800SAlex Elder 			if (IS_ERR(new_snap)) {
33079fcbb800SAlex Elder 				int err = PTR_ERR(new_snap);
33089fcbb800SAlex Elder 
33099fcbb800SAlex Elder 				dout("  failed to add dev, error %d\n", err);
33109fcbb800SAlex Elder 
33119fcbb800SAlex Elder 				return err;
33129fcbb800SAlex Elder 			}
331335938150SAlex Elder 
331435938150SAlex Elder 			/* New goes before existing, or at end of list */
331535938150SAlex Elder 
33169fcbb800SAlex Elder 			dout("  added dev%s\n", snap ? "" : " at end\n");
331735938150SAlex Elder 			if (snap)
331835938150SAlex Elder 				list_add_tail(&new_snap->node, &snap->node);
331935938150SAlex Elder 			else
3320523f3258SAlex Elder 				list_add_tail(&new_snap->node, head);
332135938150SAlex Elder 		} else {
332235938150SAlex Elder 			/* Already have this one */
332335938150SAlex Elder 
33249fcbb800SAlex Elder 			dout("  already present\n");
33259fcbb800SAlex Elder 
3326cd892126SAlex Elder 			rbd_assert(snap->size == snap_size);
3327aafb230eSAlex Elder 			rbd_assert(!strcmp(snap->name, snap_name));
3328cd892126SAlex Elder 			rbd_assert(snap->features == snap_features);
332935938150SAlex Elder 
333035938150SAlex Elder 			/* Done with this list entry; advance */
333135938150SAlex Elder 
333235938150SAlex Elder 			links = links->next;
3333dfc5606dSYehuda Sadeh 		}
333435938150SAlex Elder 
333535938150SAlex Elder 		/* Advance to the next entry in the snapshot context */
333635938150SAlex Elder 
333735938150SAlex Elder 		index++;
3338dfc5606dSYehuda Sadeh 	}
33399fcbb800SAlex Elder 	dout("%s: done\n", __func__);
3340dfc5606dSYehuda Sadeh 
3341dfc5606dSYehuda Sadeh 	return 0;
3342dfc5606dSYehuda Sadeh }
3343dfc5606dSYehuda Sadeh 
3344304f6808SAlex Elder /*
3345304f6808SAlex Elder  * Scan the list of snapshots and register the devices for any that
3346304f6808SAlex Elder  * have not already been registered.
3347304f6808SAlex Elder  */
3348304f6808SAlex Elder static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
3349304f6808SAlex Elder {
3350304f6808SAlex Elder 	struct rbd_snap *snap;
3351304f6808SAlex Elder 	int ret = 0;
3352304f6808SAlex Elder 
3353304f6808SAlex Elder 	dout("%s called\n", __func__);
335486ff77bbSAlex Elder 	if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
335586ff77bbSAlex Elder 		return -EIO;
3356304f6808SAlex Elder 
3357304f6808SAlex Elder 	list_for_each_entry(snap, &rbd_dev->snaps, node) {
3358304f6808SAlex Elder 		if (!rbd_snap_registered(snap)) {
3359304f6808SAlex Elder 			ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
3360304f6808SAlex Elder 			if (ret < 0)
3361304f6808SAlex Elder 				break;
3362304f6808SAlex Elder 		}
3363304f6808SAlex Elder 	}
3364304f6808SAlex Elder 	dout("%s: returning %d\n", __func__, ret);
3365304f6808SAlex Elder 
3366304f6808SAlex Elder 	return ret;
3367304f6808SAlex Elder }
3368304f6808SAlex Elder 
3369dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
3370dfc5606dSYehuda Sadeh {
3371dfc5606dSYehuda Sadeh 	struct device *dev;
3372cd789ab9SAlex Elder 	int ret;
3373dfc5606dSYehuda Sadeh 
3374dfc5606dSYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3375dfc5606dSYehuda Sadeh 
3376cd789ab9SAlex Elder 	dev = &rbd_dev->dev;
3377dfc5606dSYehuda Sadeh 	dev->bus = &rbd_bus_type;
3378dfc5606dSYehuda Sadeh 	dev->type = &rbd_device_type;
3379dfc5606dSYehuda Sadeh 	dev->parent = &rbd_root_dev;
3380dfc5606dSYehuda Sadeh 	dev->release = rbd_dev_release;
3381de71a297SAlex Elder 	dev_set_name(dev, "%d", rbd_dev->dev_id);
3382dfc5606dSYehuda Sadeh 	ret = device_register(dev);
3383dfc5606dSYehuda Sadeh 
3384dfc5606dSYehuda Sadeh 	mutex_unlock(&ctl_mutex);
3385cd789ab9SAlex Elder 
3386dfc5606dSYehuda Sadeh 	return ret;
3387602adf40SYehuda Sadeh }
3388602adf40SYehuda Sadeh 
3389dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
3390dfc5606dSYehuda Sadeh {
3391dfc5606dSYehuda Sadeh 	device_unregister(&rbd_dev->dev);
3392dfc5606dSYehuda Sadeh }
3393dfc5606dSYehuda Sadeh 
3394e2839308SAlex Elder static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
33951ddbe94eSAlex Elder 
33961ddbe94eSAlex Elder /*
3397499afd5bSAlex Elder  * Get a unique rbd identifier for the given new rbd_dev, and add
3398499afd5bSAlex Elder  * the rbd_dev to the global list.  The minimum rbd id is 1.
33991ddbe94eSAlex Elder  */
3400e2839308SAlex Elder static void rbd_dev_id_get(struct rbd_device *rbd_dev)
3401b7f23c36SAlex Elder {
3402e2839308SAlex Elder 	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
3403499afd5bSAlex Elder 
3404499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
3405499afd5bSAlex Elder 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
3406499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
3407e2839308SAlex Elder 	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
3408e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
3409b7f23c36SAlex Elder }
3410b7f23c36SAlex Elder 
34111ddbe94eSAlex Elder /*
3412499afd5bSAlex Elder  * Remove an rbd_dev from the global list, and record that its
3413499afd5bSAlex Elder  * identifier is no longer in use.
34141ddbe94eSAlex Elder  */
3415e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev)
34161ddbe94eSAlex Elder {
3417d184f6bfSAlex Elder 	struct list_head *tmp;
3418de71a297SAlex Elder 	int rbd_id = rbd_dev->dev_id;
3419d184f6bfSAlex Elder 	int max_id;
3420d184f6bfSAlex Elder 
3421aafb230eSAlex Elder 	rbd_assert(rbd_id > 0);
3422499afd5bSAlex Elder 
3423e2839308SAlex Elder 	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
3424e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
3425499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
3426499afd5bSAlex Elder 	list_del_init(&rbd_dev->node);
3427d184f6bfSAlex Elder 
3428d184f6bfSAlex Elder 	/*
3429d184f6bfSAlex Elder 	 * If the id being "put" is not the current maximum, there
3430d184f6bfSAlex Elder 	 * is nothing special we need to do.
3431d184f6bfSAlex Elder 	 */
3432e2839308SAlex Elder 	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
3433d184f6bfSAlex Elder 		spin_unlock(&rbd_dev_list_lock);
3434d184f6bfSAlex Elder 		return;
3435d184f6bfSAlex Elder 	}
3436d184f6bfSAlex Elder 
3437d184f6bfSAlex Elder 	/*
3438d184f6bfSAlex Elder 	 * We need to update the current maximum id.  Search the
3439d184f6bfSAlex Elder 	 * list to find out what it is.  We're more likely to find
3440d184f6bfSAlex Elder 	 * the maximum at the end, so search the list backward.
3441d184f6bfSAlex Elder 	 */
3442d184f6bfSAlex Elder 	max_id = 0;
3443d184f6bfSAlex Elder 	list_for_each_prev(tmp, &rbd_dev_list) {
3444d184f6bfSAlex Elder 		struct rbd_device *rbd_dev;
3445d184f6bfSAlex Elder 
3446d184f6bfSAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
3447b213e0b1SAlex Elder 		if (rbd_dev->dev_id > max_id)
3448b213e0b1SAlex Elder 			max_id = rbd_dev->dev_id;
3449d184f6bfSAlex Elder 	}
3450499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
34511ddbe94eSAlex Elder 
34521ddbe94eSAlex Elder 	/*
3453e2839308SAlex Elder 	 * The max id could have been updated by rbd_dev_id_get(), in
3454d184f6bfSAlex Elder 	 * which case it now accurately reflects the new maximum.
3455d184f6bfSAlex Elder 	 * Be careful not to overwrite the maximum value in that
3456d184f6bfSAlex Elder 	 * case.
34571ddbe94eSAlex Elder 	 */
3458e2839308SAlex Elder 	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
3459e2839308SAlex Elder 	dout("  max dev id has been reset\n");
3460b7f23c36SAlex Elder }
3461b7f23c36SAlex Elder 
/*
 * Advance *buf past any leading white space so that it points at
 * the first non-space character (or at the terminating '\0' if
 * there is none).  Returns the length of the token -- the run of
 * non-white space characters -- that starts there; 0 means no
 * token was found.  The string at *buf must be '\0'-terminated.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * Exactly the characters for which isspace() is nonzero in
	 * the "C" and "POSIX" locales.
	 */
	static const char whitespace[] = " \f\n\r\t\v";
	const char *start = *buf;

	start += strspn(start, whitespace);	/* Skip to token start */
	*buf = start;

	return strcspn(start, whitespace);	/* Token length */
}
3480e28fff26SAlex Elder 
/*
 * Locate the next token in *buf and, provided it fits, copy it
 * into the caller's token buffer as a '\0'-terminated string.  The
 * string at *buf must be '\0'-terminated on entry.
 *
 * Returns the token's length (excluding the '\0').  A return of 0
 * means there was no token; a return >= token_size means the token
 * did not fit and the token buffer was left untouched.
 *
 * In every case *buf ends up pointing just past the token found,
 * even when the token was too long to be copied.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	const size_t token_len = next_token(buf);
	const char *start = *buf;

	/* Consume the token whether or not it can be copied */
	*buf = start + token_len;

	if (token_len >= token_size)
		return token_len;

	memcpy(token, start, token_len);
	token[token_len] = '\0';

	return token_len;
}
3510e28fff26SAlex Elder 
3511e28fff26SAlex Elder /*
3512ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
3513ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
3514ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
3515ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
3516ea3352f4SAlex Elder  *
3517ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
3518ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
3519ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
3520ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
3521ea3352f4SAlex Elder  *
3522ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
3523ea3352f4SAlex Elder  * the end of the found token.
3524ea3352f4SAlex Elder  *
3525ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
3526ea3352f4SAlex Elder  */
3527ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
3528ea3352f4SAlex Elder {
3529ea3352f4SAlex Elder 	char *dup;
3530ea3352f4SAlex Elder 	size_t len;
3531ea3352f4SAlex Elder 
3532ea3352f4SAlex Elder 	len = next_token(buf);
35334caf35f9SAlex Elder 	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
3534ea3352f4SAlex Elder 	if (!dup)
3535ea3352f4SAlex Elder 		return NULL;
3536ea3352f4SAlex Elder 	*(dup + len) = '\0';
3537ea3352f4SAlex Elder 	*buf += len;
3538ea3352f4SAlex Elder 
3539ea3352f4SAlex Elder 	if (lenp)
3540ea3352f4SAlex Elder 		*lenp = len;
3541ea3352f4SAlex Elder 
3542ea3352f4SAlex Elder 	return dup;
3543ea3352f4SAlex Elder }
3544ea3352f4SAlex Elder 
3545ea3352f4SAlex Elder /*
3546859c31dfSAlex Elder  * Parse the options provided for an "rbd add" (i.e., rbd image
3547859c31dfSAlex Elder  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
3548859c31dfSAlex Elder  * and the data written is passed here via a NUL-terminated buffer.
3549859c31dfSAlex Elder  * Returns 0 if successful or an error code otherwise.
3550d22f76e7SAlex Elder  *
3551859c31dfSAlex Elder  * The information extracted from these options is recorded in
3552859c31dfSAlex Elder  * the other parameters which return dynamically-allocated
3553859c31dfSAlex Elder  * structures:
3554859c31dfSAlex Elder  *  ceph_opts
3555859c31dfSAlex Elder  *      The address of a pointer that will refer to a ceph options
3556859c31dfSAlex Elder  *      structure.  Caller must release the returned pointer using
3557859c31dfSAlex Elder  *      ceph_destroy_options() when it is no longer needed.
3558859c31dfSAlex Elder  *  rbd_opts
3559859c31dfSAlex Elder  *	Address of an rbd options pointer.  Fully initialized by
3560859c31dfSAlex Elder  *	this function; caller must release with kfree().
3561859c31dfSAlex Elder  *  spec
3562859c31dfSAlex Elder  *	Address of an rbd image specification pointer.  Fully
3563859c31dfSAlex Elder  *	initialized by this function based on parsed options.
3564859c31dfSAlex Elder  *	Caller must release with rbd_spec_put().
3565859c31dfSAlex Elder  *
3566859c31dfSAlex Elder  * The options passed take this form:
3567859c31dfSAlex Elder  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
3568859c31dfSAlex Elder  * where:
3569859c31dfSAlex Elder  *  <mon_addrs>
3570859c31dfSAlex Elder  *      A comma-separated list of one or more monitor addresses.
3571859c31dfSAlex Elder  *      A monitor address is an ip address, optionally followed
3572859c31dfSAlex Elder  *      by a port number (separated by a colon).
3573859c31dfSAlex Elder  *        I.e.:  ip1[:port1][,ip2[:port2]...]
3574859c31dfSAlex Elder  *  <options>
3575859c31dfSAlex Elder  *      A comma-separated list of ceph and/or rbd options.
3576859c31dfSAlex Elder  *  <pool_name>
3577859c31dfSAlex Elder  *      The name of the rados pool containing the rbd image.
3578859c31dfSAlex Elder  *  <image_name>
3579859c31dfSAlex Elder  *      The name of the image in that pool to map.
3580859c31dfSAlex Elder  *  <snap_id>
3581859c31dfSAlex Elder  *      An optional snapshot id.  If provided, the mapping will
3582859c31dfSAlex Elder  *      present data from the image at the time that snapshot was
3583859c31dfSAlex Elder  *      created.  The image head is used if no snapshot id is
3584859c31dfSAlex Elder  *      provided.  Snapshot mappings are always read-only.
3585a725f65eSAlex Elder  */
3586859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf,
3587dc79b113SAlex Elder 				struct ceph_options **ceph_opts,
3588859c31dfSAlex Elder 				struct rbd_options **opts,
3589859c31dfSAlex Elder 				struct rbd_spec **rbd_spec)
3590a725f65eSAlex Elder {
3591e28fff26SAlex Elder 	size_t len;
3592859c31dfSAlex Elder 	char *options;
35930ddebc0cSAlex Elder 	const char *mon_addrs;
35940ddebc0cSAlex Elder 	size_t mon_addrs_size;
3595859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
35964e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
3597859c31dfSAlex Elder 	struct ceph_options *copts;
3598dc79b113SAlex Elder 	int ret;
3599e28fff26SAlex Elder 
3600e28fff26SAlex Elder 	/* The first four tokens are required */
3601e28fff26SAlex Elder 
36027ef3214aSAlex Elder 	len = next_token(&buf);
36034fb5d671SAlex Elder 	if (!len) {
36044fb5d671SAlex Elder 		rbd_warn(NULL, "no monitor address(es) provided");
36054fb5d671SAlex Elder 		return -EINVAL;
36064fb5d671SAlex Elder 	}
36070ddebc0cSAlex Elder 	mon_addrs = buf;
3608f28e565aSAlex Elder 	mon_addrs_size = len + 1;
36097ef3214aSAlex Elder 	buf += len;
3610a725f65eSAlex Elder 
3611dc79b113SAlex Elder 	ret = -EINVAL;
3612f28e565aSAlex Elder 	options = dup_token(&buf, NULL);
3613f28e565aSAlex Elder 	if (!options)
3614dc79b113SAlex Elder 		return -ENOMEM;
36154fb5d671SAlex Elder 	if (!*options) {
36164fb5d671SAlex Elder 		rbd_warn(NULL, "no options provided");
36174fb5d671SAlex Elder 		goto out_err;
36184fb5d671SAlex Elder 	}
3619a725f65eSAlex Elder 
3620859c31dfSAlex Elder 	spec = rbd_spec_alloc();
3621859c31dfSAlex Elder 	if (!spec)
3622f28e565aSAlex Elder 		goto out_mem;
3623859c31dfSAlex Elder 
3624859c31dfSAlex Elder 	spec->pool_name = dup_token(&buf, NULL);
3625859c31dfSAlex Elder 	if (!spec->pool_name)
3626859c31dfSAlex Elder 		goto out_mem;
36274fb5d671SAlex Elder 	if (!*spec->pool_name) {
36284fb5d671SAlex Elder 		rbd_warn(NULL, "no pool name provided");
36294fb5d671SAlex Elder 		goto out_err;
36304fb5d671SAlex Elder 	}
3631e28fff26SAlex Elder 
363269e7a02fSAlex Elder 	spec->image_name = dup_token(&buf, NULL);
3633859c31dfSAlex Elder 	if (!spec->image_name)
3634f28e565aSAlex Elder 		goto out_mem;
36354fb5d671SAlex Elder 	if (!*spec->image_name) {
36364fb5d671SAlex Elder 		rbd_warn(NULL, "no image name provided");
36374fb5d671SAlex Elder 		goto out_err;
36384fb5d671SAlex Elder 	}
3639e28fff26SAlex Elder 
3640f28e565aSAlex Elder 	/*
3641f28e565aSAlex Elder 	 * Snapshot name is optional; default is to use "-"
3642f28e565aSAlex Elder 	 * (indicating the head/no snapshot).
3643f28e565aSAlex Elder 	 */
36443feeb894SAlex Elder 	len = next_token(&buf);
3645820a5f3eSAlex Elder 	if (!len) {
36463feeb894SAlex Elder 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
36473feeb894SAlex Elder 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
3648f28e565aSAlex Elder 	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
3649dc79b113SAlex Elder 		ret = -ENAMETOOLONG;
3650f28e565aSAlex Elder 		goto out_err;
3651849b4260SAlex Elder 	}
36524caf35f9SAlex Elder 	spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
3653859c31dfSAlex Elder 	if (!spec->snap_name)
3654f28e565aSAlex Elder 		goto out_mem;
3655859c31dfSAlex Elder 	*(spec->snap_name + len) = '\0';
3656e5c35534SAlex Elder 
36570ddebc0cSAlex Elder 	/* Initialize all rbd options to the defaults */
3658e28fff26SAlex Elder 
36594e9afebaSAlex Elder 	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
36604e9afebaSAlex Elder 	if (!rbd_opts)
36614e9afebaSAlex Elder 		goto out_mem;
36624e9afebaSAlex Elder 
36634e9afebaSAlex Elder 	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
3664d22f76e7SAlex Elder 
3665859c31dfSAlex Elder 	copts = ceph_parse_options(options, mon_addrs,
36660ddebc0cSAlex Elder 					mon_addrs + mon_addrs_size - 1,
36674e9afebaSAlex Elder 					parse_rbd_opts_token, rbd_opts);
3668859c31dfSAlex Elder 	if (IS_ERR(copts)) {
3669859c31dfSAlex Elder 		ret = PTR_ERR(copts);
3670dc79b113SAlex Elder 		goto out_err;
3671dc79b113SAlex Elder 	}
3672859c31dfSAlex Elder 	kfree(options);
3673859c31dfSAlex Elder 
3674859c31dfSAlex Elder 	*ceph_opts = copts;
36754e9afebaSAlex Elder 	*opts = rbd_opts;
3676859c31dfSAlex Elder 	*rbd_spec = spec;
36770ddebc0cSAlex Elder 
3678dc79b113SAlex Elder 	return 0;
3679f28e565aSAlex Elder out_mem:
3680dc79b113SAlex Elder 	ret = -ENOMEM;
3681d22f76e7SAlex Elder out_err:
3682859c31dfSAlex Elder 	kfree(rbd_opts);
3683859c31dfSAlex Elder 	rbd_spec_put(spec);
3684f28e565aSAlex Elder 	kfree(options);
3685d22f76e7SAlex Elder 
3686dc79b113SAlex Elder 	return ret;
3687a725f65eSAlex Elder }
3688a725f65eSAlex Elder 
3689589d30e0SAlex Elder /*
3690589d30e0SAlex Elder  * An rbd format 2 image has a unique identifier, distinct from the
3691589d30e0SAlex Elder  * name given to it by the user.  Internally, that identifier is
3692589d30e0SAlex Elder  * what's used to specify the names of objects related to the image.
3693589d30e0SAlex Elder  *
3694589d30e0SAlex Elder  * A special "rbd id" object is used to map an rbd image name to its
3695589d30e0SAlex Elder  * id.  If that object doesn't exist, then there is no v2 rbd image
3696589d30e0SAlex Elder  * with the supplied name.
3697589d30e0SAlex Elder  *
3698589d30e0SAlex Elder  * This function will record the given rbd_dev's image_id field if
3699589d30e0SAlex Elder  * it can be determined, and in that case will return 0.  If any
3700589d30e0SAlex Elder  * errors occur a negative errno will be returned and the rbd_dev's
3701589d30e0SAlex Elder  * image_id field will be unchanged (and should be NULL).
3702589d30e0SAlex Elder  */
3703589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev)
3704589d30e0SAlex Elder {
3705589d30e0SAlex Elder 	int ret;
3706589d30e0SAlex Elder 	size_t size;
3707589d30e0SAlex Elder 	char *object_name;
3708589d30e0SAlex Elder 	void *response;
3709589d30e0SAlex Elder 	void *p;
3710589d30e0SAlex Elder 
3711589d30e0SAlex Elder 	/*
37122c0d0a10SAlex Elder 	 * When probing a parent image, the image id is already
37132c0d0a10SAlex Elder 	 * known (and the image name likely is not).  There's no
37142c0d0a10SAlex Elder 	 * need to fetch the image id again in this case.
37152c0d0a10SAlex Elder 	 */
37162c0d0a10SAlex Elder 	if (rbd_dev->spec->image_id)
37172c0d0a10SAlex Elder 		return 0;
37182c0d0a10SAlex Elder 
37192c0d0a10SAlex Elder 	/*
3720589d30e0SAlex Elder 	 * First, see if the format 2 image id file exists, and if
3721589d30e0SAlex Elder 	 * so, get the image's persistent id from it.
3722589d30e0SAlex Elder 	 */
372369e7a02fSAlex Elder 	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
3724589d30e0SAlex Elder 	object_name = kmalloc(size, GFP_NOIO);
3725589d30e0SAlex Elder 	if (!object_name)
3726589d30e0SAlex Elder 		return -ENOMEM;
37270d7dbfceSAlex Elder 	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
3728589d30e0SAlex Elder 	dout("rbd id object name is %s\n", object_name);
3729589d30e0SAlex Elder 
3730589d30e0SAlex Elder 	/* Response will be an encoded string, which includes a length */
3731589d30e0SAlex Elder 
3732589d30e0SAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
3733589d30e0SAlex Elder 	response = kzalloc(size, GFP_NOIO);
3734589d30e0SAlex Elder 	if (!response) {
3735589d30e0SAlex Elder 		ret = -ENOMEM;
3736589d30e0SAlex Elder 		goto out;
3737589d30e0SAlex Elder 	}
3738589d30e0SAlex Elder 
373936be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, object_name,
3740589d30e0SAlex Elder 				"rbd", "get_id",
3741589d30e0SAlex Elder 				NULL, 0,
374207b2391fSAlex Elder 				response, RBD_IMAGE_ID_LEN_MAX, NULL);
374336be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3744589d30e0SAlex Elder 	if (ret < 0)
3745589d30e0SAlex Elder 		goto out;
3746589d30e0SAlex Elder 
3747589d30e0SAlex Elder 	p = response;
37480d7dbfceSAlex Elder 	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
3749589d30e0SAlex Elder 						p + RBD_IMAGE_ID_LEN_MAX,
3750979ed480SAlex Elder 						NULL, GFP_NOIO);
37510d7dbfceSAlex Elder 	if (IS_ERR(rbd_dev->spec->image_id)) {
37520d7dbfceSAlex Elder 		ret = PTR_ERR(rbd_dev->spec->image_id);
37530d7dbfceSAlex Elder 		rbd_dev->spec->image_id = NULL;
3754589d30e0SAlex Elder 	} else {
37550d7dbfceSAlex Elder 		dout("image_id is %s\n", rbd_dev->spec->image_id);
3756589d30e0SAlex Elder 	}
3757589d30e0SAlex Elder out:
3758589d30e0SAlex Elder 	kfree(response);
3759589d30e0SAlex Elder 	kfree(object_name);
3760589d30e0SAlex Elder 
3761589d30e0SAlex Elder 	return ret;
3762589d30e0SAlex Elder }
3763589d30e0SAlex Elder 
3764a30b71b9SAlex Elder static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
3765a30b71b9SAlex Elder {
3766a30b71b9SAlex Elder 	int ret;
3767a30b71b9SAlex Elder 	size_t size;
3768a30b71b9SAlex Elder 
3769a30b71b9SAlex Elder 	/* Version 1 images have no id; empty string is used */
3770a30b71b9SAlex Elder 
37710d7dbfceSAlex Elder 	rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
37720d7dbfceSAlex Elder 	if (!rbd_dev->spec->image_id)
3773a30b71b9SAlex Elder 		return -ENOMEM;
3774a30b71b9SAlex Elder 
3775a30b71b9SAlex Elder 	/* Record the header object name for this rbd image. */
3776a30b71b9SAlex Elder 
377769e7a02fSAlex Elder 	size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
3778a30b71b9SAlex Elder 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3779a30b71b9SAlex Elder 	if (!rbd_dev->header_name) {
3780a30b71b9SAlex Elder 		ret = -ENOMEM;
3781a30b71b9SAlex Elder 		goto out_err;
3782a30b71b9SAlex Elder 	}
37830d7dbfceSAlex Elder 	sprintf(rbd_dev->header_name, "%s%s",
37840d7dbfceSAlex Elder 		rbd_dev->spec->image_name, RBD_SUFFIX);
3785a30b71b9SAlex Elder 
3786a30b71b9SAlex Elder 	/* Populate rbd image metadata */
3787a30b71b9SAlex Elder 
3788a30b71b9SAlex Elder 	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
3789a30b71b9SAlex Elder 	if (ret < 0)
3790a30b71b9SAlex Elder 		goto out_err;
379186b00e0dSAlex Elder 
379286b00e0dSAlex Elder 	/* Version 1 images have no parent (no layering) */
379386b00e0dSAlex Elder 
379486b00e0dSAlex Elder 	rbd_dev->parent_spec = NULL;
379586b00e0dSAlex Elder 	rbd_dev->parent_overlap = 0;
379686b00e0dSAlex Elder 
3797a30b71b9SAlex Elder 	rbd_dev->image_format = 1;
3798a30b71b9SAlex Elder 
3799a30b71b9SAlex Elder 	dout("discovered version 1 image, header name is %s\n",
3800a30b71b9SAlex Elder 		rbd_dev->header_name);
3801a30b71b9SAlex Elder 
3802a30b71b9SAlex Elder 	return 0;
3803a30b71b9SAlex Elder 
3804a30b71b9SAlex Elder out_err:
3805a30b71b9SAlex Elder 	kfree(rbd_dev->header_name);
3806a30b71b9SAlex Elder 	rbd_dev->header_name = NULL;
38070d7dbfceSAlex Elder 	kfree(rbd_dev->spec->image_id);
38080d7dbfceSAlex Elder 	rbd_dev->spec->image_id = NULL;
3809a30b71b9SAlex Elder 
3810a30b71b9SAlex Elder 	return ret;
3811a30b71b9SAlex Elder }
3812a30b71b9SAlex Elder 
3813a30b71b9SAlex Elder static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
3814a30b71b9SAlex Elder {
3815a30b71b9SAlex Elder 	size_t size;
38169d475de5SAlex Elder 	int ret;
38176e14b1a6SAlex Elder 	u64 ver = 0;
3818a30b71b9SAlex Elder 
3819a30b71b9SAlex Elder 	/*
3820a30b71b9SAlex Elder 	 * Image id was filled in by the caller.  Record the header
3821a30b71b9SAlex Elder 	 * object name for this rbd image.
3822a30b71b9SAlex Elder 	 */
3823979ed480SAlex Elder 	size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
3824a30b71b9SAlex Elder 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3825a30b71b9SAlex Elder 	if (!rbd_dev->header_name)
3826a30b71b9SAlex Elder 		return -ENOMEM;
3827a30b71b9SAlex Elder 	sprintf(rbd_dev->header_name, "%s%s",
38280d7dbfceSAlex Elder 			RBD_HEADER_PREFIX, rbd_dev->spec->image_id);
38299d475de5SAlex Elder 
38309d475de5SAlex Elder 	/* Get the size and object order for the image */
38319d475de5SAlex Elder 
38329d475de5SAlex Elder 	ret = rbd_dev_v2_image_size(rbd_dev);
38339d475de5SAlex Elder 	if (ret < 0)
38349d475de5SAlex Elder 		goto out_err;
38351e130199SAlex Elder 
38361e130199SAlex Elder 	/* Get the object prefix (a.k.a. block_name) for the image */
38371e130199SAlex Elder 
38381e130199SAlex Elder 	ret = rbd_dev_v2_object_prefix(rbd_dev);
38391e130199SAlex Elder 	if (ret < 0)
38401e130199SAlex Elder 		goto out_err;
3841b1b5402aSAlex Elder 
3842d889140cSAlex Elder 	/* Get the and check features for the image */
3843b1b5402aSAlex Elder 
3844b1b5402aSAlex Elder 	ret = rbd_dev_v2_features(rbd_dev);
3845b1b5402aSAlex Elder 	if (ret < 0)
3846b1b5402aSAlex Elder 		goto out_err;
384735d489f9SAlex Elder 
384886b00e0dSAlex Elder 	/* If the image supports layering, get the parent info */
384986b00e0dSAlex Elder 
385086b00e0dSAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
385186b00e0dSAlex Elder 		ret = rbd_dev_v2_parent_info(rbd_dev);
385286b00e0dSAlex Elder 		if (ret < 0)
385386b00e0dSAlex Elder 			goto out_err;
385486b00e0dSAlex Elder 	}
385586b00e0dSAlex Elder 
38566e14b1a6SAlex Elder 	/* crypto and compression type aren't (yet) supported for v2 images */
385735d489f9SAlex Elder 
38586e14b1a6SAlex Elder 	rbd_dev->header.crypt_type = 0;
38596e14b1a6SAlex Elder 	rbd_dev->header.comp_type = 0;
38606e14b1a6SAlex Elder 
38616e14b1a6SAlex Elder 	/* Get the snapshot context, plus the header version */
38626e14b1a6SAlex Elder 
38636e14b1a6SAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
386435d489f9SAlex Elder 	if (ret)
386535d489f9SAlex Elder 		goto out_err;
38666e14b1a6SAlex Elder 	rbd_dev->header.obj_version = ver;
38676e14b1a6SAlex Elder 
3868a30b71b9SAlex Elder 	rbd_dev->image_format = 2;
3869a30b71b9SAlex Elder 
3870a30b71b9SAlex Elder 	dout("discovered version 2 image, header name is %s\n",
3871a30b71b9SAlex Elder 		rbd_dev->header_name);
3872a30b71b9SAlex Elder 
387335152979SAlex Elder 	return 0;
38749d475de5SAlex Elder out_err:
387586b00e0dSAlex Elder 	rbd_dev->parent_overlap = 0;
387686b00e0dSAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
387786b00e0dSAlex Elder 	rbd_dev->parent_spec = NULL;
38789d475de5SAlex Elder 	kfree(rbd_dev->header_name);
38799d475de5SAlex Elder 	rbd_dev->header_name = NULL;
38801e130199SAlex Elder 	kfree(rbd_dev->header.object_prefix);
38811e130199SAlex Elder 	rbd_dev->header.object_prefix = NULL;
38829d475de5SAlex Elder 
38839d475de5SAlex Elder 	return ret;
3884a30b71b9SAlex Elder }
3885a30b71b9SAlex Elder 
388683a06263SAlex Elder static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
388783a06263SAlex Elder {
388883a06263SAlex Elder 	int ret;
388983a06263SAlex Elder 
389083a06263SAlex Elder 	/* no need to lock here, as rbd_dev is not registered yet */
389183a06263SAlex Elder 	ret = rbd_dev_snaps_update(rbd_dev);
389283a06263SAlex Elder 	if (ret)
389383a06263SAlex Elder 		return ret;
389483a06263SAlex Elder 
38959e15b77dSAlex Elder 	ret = rbd_dev_probe_update_spec(rbd_dev);
38969e15b77dSAlex Elder 	if (ret)
38979e15b77dSAlex Elder 		goto err_out_snaps;
38989e15b77dSAlex Elder 
389983a06263SAlex Elder 	ret = rbd_dev_set_mapping(rbd_dev);
390083a06263SAlex Elder 	if (ret)
390183a06263SAlex Elder 		goto err_out_snaps;
390283a06263SAlex Elder 
390383a06263SAlex Elder 	/* generate unique id: find highest unique id, add one */
390483a06263SAlex Elder 	rbd_dev_id_get(rbd_dev);
390583a06263SAlex Elder 
390683a06263SAlex Elder 	/* Fill in the device name, now that we have its id. */
390783a06263SAlex Elder 	BUILD_BUG_ON(DEV_NAME_LEN
390883a06263SAlex Elder 			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
390983a06263SAlex Elder 	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
391083a06263SAlex Elder 
391183a06263SAlex Elder 	/* Get our block major device number. */
391283a06263SAlex Elder 
391383a06263SAlex Elder 	ret = register_blkdev(0, rbd_dev->name);
391483a06263SAlex Elder 	if (ret < 0)
391583a06263SAlex Elder 		goto err_out_id;
391683a06263SAlex Elder 	rbd_dev->major = ret;
391783a06263SAlex Elder 
391883a06263SAlex Elder 	/* Set up the blkdev mapping. */
391983a06263SAlex Elder 
392083a06263SAlex Elder 	ret = rbd_init_disk(rbd_dev);
392183a06263SAlex Elder 	if (ret)
392283a06263SAlex Elder 		goto err_out_blkdev;
392383a06263SAlex Elder 
392483a06263SAlex Elder 	ret = rbd_bus_add_dev(rbd_dev);
392583a06263SAlex Elder 	if (ret)
392683a06263SAlex Elder 		goto err_out_disk;
392783a06263SAlex Elder 
392883a06263SAlex Elder 	/*
392983a06263SAlex Elder 	 * At this point cleanup in the event of an error is the job
393083a06263SAlex Elder 	 * of the sysfs code (initiated by rbd_bus_del_dev()).
393183a06263SAlex Elder 	 */
393283a06263SAlex Elder 	down_write(&rbd_dev->header_rwsem);
393383a06263SAlex Elder 	ret = rbd_dev_snaps_register(rbd_dev);
393483a06263SAlex Elder 	up_write(&rbd_dev->header_rwsem);
393583a06263SAlex Elder 	if (ret)
393683a06263SAlex Elder 		goto err_out_bus;
393783a06263SAlex Elder 
39389969ebc5SAlex Elder 	ret = rbd_dev_header_watch_sync(rbd_dev, 1);
393983a06263SAlex Elder 	if (ret)
394083a06263SAlex Elder 		goto err_out_bus;
394183a06263SAlex Elder 
394283a06263SAlex Elder 	/* Everything's ready.  Announce the disk to the world. */
394383a06263SAlex Elder 
394483a06263SAlex Elder 	add_disk(rbd_dev->disk);
394583a06263SAlex Elder 
394683a06263SAlex Elder 	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
394783a06263SAlex Elder 		(unsigned long long) rbd_dev->mapping.size);
394883a06263SAlex Elder 
394983a06263SAlex Elder 	return ret;
395083a06263SAlex Elder err_out_bus:
395183a06263SAlex Elder 	/* this will also clean up rest of rbd_dev stuff */
395283a06263SAlex Elder 
395383a06263SAlex Elder 	rbd_bus_del_dev(rbd_dev);
395483a06263SAlex Elder 
395583a06263SAlex Elder 	return ret;
395683a06263SAlex Elder err_out_disk:
395783a06263SAlex Elder 	rbd_free_disk(rbd_dev);
395883a06263SAlex Elder err_out_blkdev:
395983a06263SAlex Elder 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
396083a06263SAlex Elder err_out_id:
396183a06263SAlex Elder 	rbd_dev_id_put(rbd_dev);
396283a06263SAlex Elder err_out_snaps:
396383a06263SAlex Elder 	rbd_remove_all_snaps(rbd_dev);
396483a06263SAlex Elder 
396583a06263SAlex Elder 	return ret;
396683a06263SAlex Elder }
396783a06263SAlex Elder 
3968a30b71b9SAlex Elder /*
3969a30b71b9SAlex Elder  * Probe for the existence of the header object for the given rbd
3970a30b71b9SAlex Elder  * device.  For format 2 images this includes determining the image
3971a30b71b9SAlex Elder  * id.
3972a30b71b9SAlex Elder  */
3973a30b71b9SAlex Elder static int rbd_dev_probe(struct rbd_device *rbd_dev)
3974a30b71b9SAlex Elder {
3975a30b71b9SAlex Elder 	int ret;
3976a30b71b9SAlex Elder 
3977a30b71b9SAlex Elder 	/*
3978a30b71b9SAlex Elder 	 * Get the id from the image id object.  If it's not a
3979a30b71b9SAlex Elder 	 * format 2 image, we'll get ENOENT back, and we'll assume
3980a30b71b9SAlex Elder 	 * it's a format 1 image.
3981a30b71b9SAlex Elder 	 */
3982a30b71b9SAlex Elder 	ret = rbd_dev_image_id(rbd_dev);
3983a30b71b9SAlex Elder 	if (ret)
3984a30b71b9SAlex Elder 		ret = rbd_dev_v1_probe(rbd_dev);
3985a30b71b9SAlex Elder 	else
3986a30b71b9SAlex Elder 		ret = rbd_dev_v2_probe(rbd_dev);
398783a06263SAlex Elder 	if (ret) {
3988a30b71b9SAlex Elder 		dout("probe failed, returning %d\n", ret);
3989a30b71b9SAlex Elder 
3990a30b71b9SAlex Elder 		return ret;
3991a30b71b9SAlex Elder 	}
3992a30b71b9SAlex Elder 
399383a06263SAlex Elder 	ret = rbd_dev_probe_finish(rbd_dev);
399483a06263SAlex Elder 	if (ret)
399583a06263SAlex Elder 		rbd_header_free(&rbd_dev->header);
399683a06263SAlex Elder 
399783a06263SAlex Elder 	return ret;
399883a06263SAlex Elder }
399983a06263SAlex Elder 
400059c2be1eSYehuda Sadeh static ssize_t rbd_add(struct bus_type *bus,
400159c2be1eSYehuda Sadeh 		       const char *buf,
400259c2be1eSYehuda Sadeh 		       size_t count)
4003602adf40SYehuda Sadeh {
4004cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
4005dc79b113SAlex Elder 	struct ceph_options *ceph_opts = NULL;
40064e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
4007859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
40089d3997fdSAlex Elder 	struct rbd_client *rbdc;
400927cc2594SAlex Elder 	struct ceph_osd_client *osdc;
401027cc2594SAlex Elder 	int rc = -ENOMEM;
4011602adf40SYehuda Sadeh 
4012602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
4013602adf40SYehuda Sadeh 		return -ENODEV;
4014602adf40SYehuda Sadeh 
4015a725f65eSAlex Elder 	/* parse add command */
4016859c31dfSAlex Elder 	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
4017dc79b113SAlex Elder 	if (rc < 0)
4018bd4ba655SAlex Elder 		goto err_out_module;
4019a725f65eSAlex Elder 
40209d3997fdSAlex Elder 	rbdc = rbd_get_client(ceph_opts);
40219d3997fdSAlex Elder 	if (IS_ERR(rbdc)) {
40229d3997fdSAlex Elder 		rc = PTR_ERR(rbdc);
40230ddebc0cSAlex Elder 		goto err_out_args;
40249d3997fdSAlex Elder 	}
4025c53d5893SAlex Elder 	ceph_opts = NULL;	/* rbd_dev client now owns this */
4026602adf40SYehuda Sadeh 
4027602adf40SYehuda Sadeh 	/* pick the pool */
40289d3997fdSAlex Elder 	osdc = &rbdc->client->osdc;
4029859c31dfSAlex Elder 	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
4030602adf40SYehuda Sadeh 	if (rc < 0)
4031602adf40SYehuda Sadeh 		goto err_out_client;
4032859c31dfSAlex Elder 	spec->pool_id = (u64) rc;
4033859c31dfSAlex Elder 
40340903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
40350903e875SAlex Elder 
40360903e875SAlex Elder 	if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
40370903e875SAlex Elder 		rc = -EIO;
40380903e875SAlex Elder 		goto err_out_client;
40390903e875SAlex Elder 	}
40400903e875SAlex Elder 
4041c53d5893SAlex Elder 	rbd_dev = rbd_dev_create(rbdc, spec);
4042bd4ba655SAlex Elder 	if (!rbd_dev)
4043bd4ba655SAlex Elder 		goto err_out_client;
4044c53d5893SAlex Elder 	rbdc = NULL;		/* rbd_dev now owns this */
4045c53d5893SAlex Elder 	spec = NULL;		/* rbd_dev now owns this */
4046602adf40SYehuda Sadeh 
4047bd4ba655SAlex Elder 	rbd_dev->mapping.read_only = rbd_opts->read_only;
4048c53d5893SAlex Elder 	kfree(rbd_opts);
4049c53d5893SAlex Elder 	rbd_opts = NULL;	/* done with this */
4050bd4ba655SAlex Elder 
4051a30b71b9SAlex Elder 	rc = rbd_dev_probe(rbd_dev);
4052a30b71b9SAlex Elder 	if (rc < 0)
4053c53d5893SAlex Elder 		goto err_out_rbd_dev;
405405fd6f6fSAlex Elder 
4055602adf40SYehuda Sadeh 	return count;
4056c53d5893SAlex Elder err_out_rbd_dev:
4057c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
4058bd4ba655SAlex Elder err_out_client:
40599d3997fdSAlex Elder 	rbd_put_client(rbdc);
40600ddebc0cSAlex Elder err_out_args:
406178cea76eSAlex Elder 	if (ceph_opts)
406278cea76eSAlex Elder 		ceph_destroy_options(ceph_opts);
40634e9afebaSAlex Elder 	kfree(rbd_opts);
4064859c31dfSAlex Elder 	rbd_spec_put(spec);
4065bd4ba655SAlex Elder err_out_module:
4066bd4ba655SAlex Elder 	module_put(THIS_MODULE);
406727cc2594SAlex Elder 
4068602adf40SYehuda Sadeh 	dout("Error adding device %s\n", buf);
406927cc2594SAlex Elder 
407027cc2594SAlex Elder 	return (ssize_t) rc;
4071602adf40SYehuda Sadeh }
4072602adf40SYehuda Sadeh 
4073de71a297SAlex Elder static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
4074602adf40SYehuda Sadeh {
4075602adf40SYehuda Sadeh 	struct list_head *tmp;
4076602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev;
4077602adf40SYehuda Sadeh 
4078e124a82fSAlex Elder 	spin_lock(&rbd_dev_list_lock);
4079602adf40SYehuda Sadeh 	list_for_each(tmp, &rbd_dev_list) {
4080602adf40SYehuda Sadeh 		rbd_dev = list_entry(tmp, struct rbd_device, node);
4081de71a297SAlex Elder 		if (rbd_dev->dev_id == dev_id) {
4082e124a82fSAlex Elder 			spin_unlock(&rbd_dev_list_lock);
4083602adf40SYehuda Sadeh 			return rbd_dev;
4084602adf40SYehuda Sadeh 		}
4085e124a82fSAlex Elder 	}
4086e124a82fSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
4087602adf40SYehuda Sadeh 	return NULL;
4088602adf40SYehuda Sadeh }
4089602adf40SYehuda Sadeh 
4090dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev)
4091602adf40SYehuda Sadeh {
4092593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4093602adf40SYehuda Sadeh 
409459c2be1eSYehuda Sadeh 	if (rbd_dev->watch_event)
40959969ebc5SAlex Elder 		rbd_dev_header_watch_sync(rbd_dev, 0);
4096602adf40SYehuda Sadeh 
4097602adf40SYehuda Sadeh 	/* clean up and free blkdev */
4098602adf40SYehuda Sadeh 	rbd_free_disk(rbd_dev);
4099602adf40SYehuda Sadeh 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
410032eec68dSAlex Elder 
41012ac4e75dSAlex Elder 	/* release allocated disk header fields */
41022ac4e75dSAlex Elder 	rbd_header_free(&rbd_dev->header);
41032ac4e75dSAlex Elder 
410432eec68dSAlex Elder 	/* done with the id, and with the rbd_dev */
4105e2839308SAlex Elder 	rbd_dev_id_put(rbd_dev);
4106c53d5893SAlex Elder 	rbd_assert(rbd_dev->rbd_client != NULL);
4107c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
4108602adf40SYehuda Sadeh 
4109602adf40SYehuda Sadeh 	/* release module ref */
4110602adf40SYehuda Sadeh 	module_put(THIS_MODULE);
4111602adf40SYehuda Sadeh }
4112602adf40SYehuda Sadeh 
4113dfc5606dSYehuda Sadeh static ssize_t rbd_remove(struct bus_type *bus,
4114602adf40SYehuda Sadeh 			  const char *buf,
4115602adf40SYehuda Sadeh 			  size_t count)
4116602adf40SYehuda Sadeh {
4117602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
4118602adf40SYehuda Sadeh 	int target_id, rc;
4119602adf40SYehuda Sadeh 	unsigned long ul;
4120602adf40SYehuda Sadeh 	int ret = count;
4121602adf40SYehuda Sadeh 
4122602adf40SYehuda Sadeh 	rc = strict_strtoul(buf, 10, &ul);
4123602adf40SYehuda Sadeh 	if (rc)
4124602adf40SYehuda Sadeh 		return rc;
4125602adf40SYehuda Sadeh 
4126602adf40SYehuda Sadeh 	/* convert to int; abort if we lost anything in the conversion */
4127602adf40SYehuda Sadeh 	target_id = (int) ul;
4128602adf40SYehuda Sadeh 	if (target_id != ul)
4129602adf40SYehuda Sadeh 		return -EINVAL;
4130602adf40SYehuda Sadeh 
4131602adf40SYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
4132602adf40SYehuda Sadeh 
4133602adf40SYehuda Sadeh 	rbd_dev = __rbd_get_dev(target_id);
4134602adf40SYehuda Sadeh 	if (!rbd_dev) {
4135602adf40SYehuda Sadeh 		ret = -ENOENT;
4136602adf40SYehuda Sadeh 		goto done;
4137602adf40SYehuda Sadeh 	}
4138602adf40SYehuda Sadeh 
4139a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
4140b82d167bSAlex Elder 	if (rbd_dev->open_count)
414142382b70SAlex Elder 		ret = -EBUSY;
4142b82d167bSAlex Elder 	else
4143b82d167bSAlex Elder 		set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
4144a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
4145b82d167bSAlex Elder 	if (ret < 0)
414642382b70SAlex Elder 		goto done;
414742382b70SAlex Elder 
414841f38c2bSAlex Elder 	rbd_remove_all_snaps(rbd_dev);
4149dfc5606dSYehuda Sadeh 	rbd_bus_del_dev(rbd_dev);
4150602adf40SYehuda Sadeh 
4151602adf40SYehuda Sadeh done:
4152602adf40SYehuda Sadeh 	mutex_unlock(&ctl_mutex);
4153aafb230eSAlex Elder 
4154602adf40SYehuda Sadeh 	return ret;
4155602adf40SYehuda Sadeh }
4156602adf40SYehuda Sadeh 
4157602adf40SYehuda Sadeh /*
4158602adf40SYehuda Sadeh  * create control files in sysfs
4159dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
4160602adf40SYehuda Sadeh  */
4161602adf40SYehuda Sadeh static int rbd_sysfs_init(void)
4162602adf40SYehuda Sadeh {
4163dfc5606dSYehuda Sadeh 	int ret;
4164602adf40SYehuda Sadeh 
4165fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
4166dfc5606dSYehuda Sadeh 	if (ret < 0)
4167dfc5606dSYehuda Sadeh 		return ret;
4168602adf40SYehuda Sadeh 
4169fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
4170fed4c143SAlex Elder 	if (ret < 0)
4171fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
4172602adf40SYehuda Sadeh 
4173602adf40SYehuda Sadeh 	return ret;
4174602adf40SYehuda Sadeh }
4175602adf40SYehuda Sadeh 
4176602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void)
4177602adf40SYehuda Sadeh {
4178dfc5606dSYehuda Sadeh 	bus_unregister(&rbd_bus_type);
4179fed4c143SAlex Elder 	device_unregister(&rbd_root_dev);
4180602adf40SYehuda Sadeh }
4181602adf40SYehuda Sadeh 
4182602adf40SYehuda Sadeh int __init rbd_init(void)
4183602adf40SYehuda Sadeh {
4184602adf40SYehuda Sadeh 	int rc;
4185602adf40SYehuda Sadeh 
41861e32d34cSAlex Elder 	if (!libceph_compatible(NULL)) {
41871e32d34cSAlex Elder 		rbd_warn(NULL, "libceph incompatibility (quitting)");
41881e32d34cSAlex Elder 
41891e32d34cSAlex Elder 		return -EINVAL;
41901e32d34cSAlex Elder 	}
4191602adf40SYehuda Sadeh 	rc = rbd_sysfs_init();
4192602adf40SYehuda Sadeh 	if (rc)
4193602adf40SYehuda Sadeh 		return rc;
4194f0f8cef5SAlex Elder 	pr_info("loaded " RBD_DRV_NAME_LONG "\n");
4195602adf40SYehuda Sadeh 	return 0;
4196602adf40SYehuda Sadeh }
4197602adf40SYehuda Sadeh 
4198602adf40SYehuda Sadeh void __exit rbd_exit(void)
4199602adf40SYehuda Sadeh {
4200602adf40SYehuda Sadeh 	rbd_sysfs_cleanup();
4201602adf40SYehuda Sadeh }
4202602adf40SYehuda Sadeh 
4203602adf40SYehuda Sadeh module_init(rbd_init);
4204602adf40SYehuda Sadeh module_exit(rbd_exit);
4205602adf40SYehuda Sadeh 
4206602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
4207602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
4208602adf40SYehuda Sadeh MODULE_DESCRIPTION("rados block device");
4209602adf40SYehuda Sadeh 
4210602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */
4211602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
4212602adf40SYehuda Sadeh 
4213602adf40SYehuda Sadeh MODULE_LICENSE("GPL");
4214