xref: /openbmc/linux/drivers/block/rbd.c (revision cc344fa1)
1602adf40SYehuda Sadeh /*
2602adf40SYehuda Sadeh    rbd.c -- Export ceph rados objects as a Linux block device
3602adf40SYehuda Sadeh 
4602adf40SYehuda Sadeh 
5602adf40SYehuda Sadeh    based on drivers/block/osdblk.c:
6602adf40SYehuda Sadeh 
7602adf40SYehuda Sadeh    Copyright 2009 Red Hat, Inc.
8602adf40SYehuda Sadeh 
9602adf40SYehuda Sadeh    This program is free software; you can redistribute it and/or modify
10602adf40SYehuda Sadeh    it under the terms of the GNU General Public License as published by
11602adf40SYehuda Sadeh    the Free Software Foundation.
12602adf40SYehuda Sadeh 
13602adf40SYehuda Sadeh    This program is distributed in the hope that it will be useful,
14602adf40SYehuda Sadeh    but WITHOUT ANY WARRANTY; without even the implied warranty of
15602adf40SYehuda Sadeh    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16602adf40SYehuda Sadeh    GNU General Public License for more details.
17602adf40SYehuda Sadeh 
18602adf40SYehuda Sadeh    You should have received a copy of the GNU General Public License
19602adf40SYehuda Sadeh    along with this program; see the file COPYING.  If not, write to
20602adf40SYehuda Sadeh    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21602adf40SYehuda Sadeh 
22602adf40SYehuda Sadeh 
23602adf40SYehuda Sadeh 
24dfc5606dSYehuda Sadeh    For usage instructions, please refer to:
25602adf40SYehuda Sadeh 
26dfc5606dSYehuda Sadeh                  Documentation/ABI/testing/sysfs-bus-rbd
27602adf40SYehuda Sadeh 
28602adf40SYehuda Sadeh  */
29602adf40SYehuda Sadeh 
30602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
31602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
32602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3459c2be1eSYehuda Sadeh #include <linux/parser.h>
35602adf40SYehuda Sadeh 
36602adf40SYehuda Sadeh #include <linux/kernel.h>
37602adf40SYehuda Sadeh #include <linux/device.h>
38602adf40SYehuda Sadeh #include <linux/module.h>
39602adf40SYehuda Sadeh #include <linux/fs.h>
40602adf40SYehuda Sadeh #include <linux/blkdev.h>
41602adf40SYehuda Sadeh 
42602adf40SYehuda Sadeh #include "rbd_types.h"
43602adf40SYehuda Sadeh 
#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * Block I/O is counted in sectors.  The blk, bio and genhd layers
 * each have their own view of a sector, but the default size is
 * always 512 bytes.  These names simply read better than the raw
 * numbers.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/*
 * Largest values of the fixed-width unsigned types.  These are
 * candidates for a shared header rather than this file.
 */

#define	U8_MAX	((u8)	(~0U))
#define	U16_MAX	((u16)	(~0U))
#define	U32_MAX	((u32)	(~0U))
#define	U64_MAX	((u64)	(~0ULL))
61df111be6SAlex Elder 
/* Short and long names of the driver */
#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

/*
 * A snapshot name gets RBD_SNAP_DEV_NAME_PREFIX prepended, so the
 * longest snapshot name is NAME_MAX less the prefix length (the
 * terminating NUL counted by sizeof is excluded).
 */
#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

/* Sized so an image name sent by an OSD fits in a single page */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

/* Feature bits */

#define RBD_FEATURE_LAYERING      1

/* Feature bits this client implementation supports (none at present) */

#define RBD_FEATURES_ALL          (0)

/*
 * A device is named "rbd#": RBD_DRV_NAME followed by a unique
 * integer id.  MAX_INT_FORMAT_WIDTH is used in ensuring that
 * DEV_NAME_LEN is large enough for every possible device name.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)
97602adf40SYehuda Sadeh 
98602adf40SYehuda Sadeh /*
99602adf40SYehuda Sadeh  * block device image metadata (in-memory version)
100602adf40SYehuda Sadeh  */
101602adf40SYehuda Sadeh struct rbd_image_header {
102f84344f3SAlex Elder 	/* These four fields never change for a given rbd image */
103849b4260SAlex Elder 	char *object_prefix;
10434b13184SAlex Elder 	u64 features;
105602adf40SYehuda Sadeh 	__u8 obj_order;
106602adf40SYehuda Sadeh 	__u8 crypt_type;
107602adf40SYehuda Sadeh 	__u8 comp_type;
108602adf40SYehuda Sadeh 
109f84344f3SAlex Elder 	/* The remaining fields need to be updated occasionally */
110f84344f3SAlex Elder 	u64 image_size;
111f84344f3SAlex Elder 	struct ceph_snap_context *snapc;
112602adf40SYehuda Sadeh 	char *snap_names;
113602adf40SYehuda Sadeh 	u64 *snap_sizes;
11459c2be1eSYehuda Sadeh 
11559c2be1eSYehuda Sadeh 	u64 obj_version;
11659c2be1eSYehuda Sadeh };
11759c2be1eSYehuda Sadeh 
1180d7dbfceSAlex Elder /*
1190d7dbfceSAlex Elder  * An rbd image specification.
1200d7dbfceSAlex Elder  *
1210d7dbfceSAlex Elder  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
122c66c6e0cSAlex Elder  * identify an image.  Each rbd_dev structure includes a pointer to
123c66c6e0cSAlex Elder  * an rbd_spec structure that encapsulates this identity.
124c66c6e0cSAlex Elder  *
125c66c6e0cSAlex Elder  * Each of the id's in an rbd_spec has an associated name.  For a
126c66c6e0cSAlex Elder  * user-mapped image, the names are supplied and the id's associated
127c66c6e0cSAlex Elder  * with them are looked up.  For a layered image, a parent image is
128c66c6e0cSAlex Elder  * defined by the tuple, and the names are looked up.
129c66c6e0cSAlex Elder  *
130c66c6e0cSAlex Elder  * An rbd_dev structure contains a parent_spec pointer which is
131c66c6e0cSAlex Elder  * non-null if the image it represents is a child in a layered
132c66c6e0cSAlex Elder  * image.  This pointer will refer to the rbd_spec structure used
133c66c6e0cSAlex Elder  * by the parent rbd_dev for its own identity (i.e., the structure
134c66c6e0cSAlex Elder  * is shared between the parent and child).
135c66c6e0cSAlex Elder  *
136c66c6e0cSAlex Elder  * Since these structures are populated once, during the discovery
137c66c6e0cSAlex Elder  * phase of image construction, they are effectively immutable so
138c66c6e0cSAlex Elder  * we make no effort to synchronize access to them.
139c66c6e0cSAlex Elder  *
140c66c6e0cSAlex Elder  * Note that code herein does not assume the image name is known (it
141c66c6e0cSAlex Elder  * could be a null pointer).
1420d7dbfceSAlex Elder  */
1430d7dbfceSAlex Elder struct rbd_spec {
1440d7dbfceSAlex Elder 	u64		pool_id;
1450d7dbfceSAlex Elder 	char		*pool_name;
1460d7dbfceSAlex Elder 
1470d7dbfceSAlex Elder 	char		*image_id;
1480d7dbfceSAlex Elder 	char		*image_name;
1490d7dbfceSAlex Elder 
1500d7dbfceSAlex Elder 	u64		snap_id;
1510d7dbfceSAlex Elder 	char		*snap_name;
1520d7dbfceSAlex Elder 
1530d7dbfceSAlex Elder 	struct kref	kref;
1540d7dbfceSAlex Elder };
1550d7dbfceSAlex Elder 
156602adf40SYehuda Sadeh /*
157f0f8cef5SAlex Elder  * an instance of the client.  multiple devices may share an rbd client.
158602adf40SYehuda Sadeh  */
159602adf40SYehuda Sadeh struct rbd_client {
160602adf40SYehuda Sadeh 	struct ceph_client	*client;
161602adf40SYehuda Sadeh 	struct kref		kref;
162602adf40SYehuda Sadeh 	struct list_head	node;
163602adf40SYehuda Sadeh };
164602adf40SYehuda Sadeh 
/* Callback invoked with an image request */
struct rbd_img_request;
typedef void (*rbd_img_callback_t)(struct rbd_img_request *);

#define	BAD_WHICH	U32_MAX		/* Good which or bad which, which? */

/* Callback invoked with an object request */
struct rbd_obj_request;
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);

/* Kind of data an object request carries */
enum obj_request_type {
	OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
};
176bf0d5f50SAlex Elder 
177bf0d5f50SAlex Elder struct rbd_obj_request {
178bf0d5f50SAlex Elder 	const char		*object_name;
179bf0d5f50SAlex Elder 	u64			offset;		/* object start byte */
180bf0d5f50SAlex Elder 	u64			length;		/* bytes from offset */
181bf0d5f50SAlex Elder 
182bf0d5f50SAlex Elder 	struct rbd_img_request	*img_request;
183bf0d5f50SAlex Elder 	struct list_head	links;		/* img_request->obj_requests */
184bf0d5f50SAlex Elder 	u32			which;		/* posn image request list */
185bf0d5f50SAlex Elder 
186bf0d5f50SAlex Elder 	enum obj_request_type	type;
187788e2df3SAlex Elder 	union {
188bf0d5f50SAlex Elder 		struct bio	*bio_list;
189788e2df3SAlex Elder 		struct {
190788e2df3SAlex Elder 			struct page	**pages;
191788e2df3SAlex Elder 			u32		page_count;
192788e2df3SAlex Elder 		};
193788e2df3SAlex Elder 	};
194bf0d5f50SAlex Elder 
195bf0d5f50SAlex Elder 	struct ceph_osd_request	*osd_req;
196bf0d5f50SAlex Elder 
197bf0d5f50SAlex Elder 	u64			xferred;	/* bytes transferred */
198bf0d5f50SAlex Elder 	u64			version;
199bf0d5f50SAlex Elder 	s32			result;
200bf0d5f50SAlex Elder 	atomic_t		done;
201bf0d5f50SAlex Elder 
202bf0d5f50SAlex Elder 	rbd_obj_callback_t	callback;
203788e2df3SAlex Elder 	struct completion	completion;
204bf0d5f50SAlex Elder 
205bf0d5f50SAlex Elder 	struct kref		kref;
206bf0d5f50SAlex Elder };
207bf0d5f50SAlex Elder 
208bf0d5f50SAlex Elder struct rbd_img_request {
209bf0d5f50SAlex Elder 	struct request		*rq;
210bf0d5f50SAlex Elder 	struct rbd_device	*rbd_dev;
211bf0d5f50SAlex Elder 	u64			offset;	/* starting image byte offset */
212bf0d5f50SAlex Elder 	u64			length;	/* byte count from offset */
213bf0d5f50SAlex Elder 	bool			write_request;	/* false for read */
214bf0d5f50SAlex Elder 	union {
215bf0d5f50SAlex Elder 		struct ceph_snap_context *snapc;	/* for writes */
216bf0d5f50SAlex Elder 		u64		snap_id;		/* for reads */
217bf0d5f50SAlex Elder 	};
218bf0d5f50SAlex Elder 	spinlock_t		completion_lock;/* protects next_completion */
219bf0d5f50SAlex Elder 	u32			next_completion;
220bf0d5f50SAlex Elder 	rbd_img_callback_t	callback;
221bf0d5f50SAlex Elder 
222bf0d5f50SAlex Elder 	u32			obj_request_count;
223bf0d5f50SAlex Elder 	struct list_head	obj_requests;	/* rbd_obj_request structs */
224bf0d5f50SAlex Elder 
225bf0d5f50SAlex Elder 	struct kref		kref;
226bf0d5f50SAlex Elder };
227bf0d5f50SAlex Elder 
/*
 * Iterators over the object requests linked on an image request's
 * obj_requests list.  Note that the "safe" variant walks the list
 * in reverse order.
 */
#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq) \
	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
234bf0d5f50SAlex Elder 
235dfc5606dSYehuda Sadeh struct rbd_snap {
236dfc5606dSYehuda Sadeh 	struct	device		dev;
237dfc5606dSYehuda Sadeh 	const char		*name;
2383591538fSJosh Durgin 	u64			size;
239dfc5606dSYehuda Sadeh 	struct list_head	node;
240dfc5606dSYehuda Sadeh 	u64			id;
24134b13184SAlex Elder 	u64			features;
242dfc5606dSYehuda Sadeh };
243dfc5606dSYehuda Sadeh 
244f84344f3SAlex Elder struct rbd_mapping {
24599c1f08fSAlex Elder 	u64                     size;
24634b13184SAlex Elder 	u64                     features;
247f84344f3SAlex Elder 	bool			read_only;
248f84344f3SAlex Elder };
249f84344f3SAlex Elder 
250602adf40SYehuda Sadeh /*
251602adf40SYehuda Sadeh  * a single device
252602adf40SYehuda Sadeh  */
253602adf40SYehuda Sadeh struct rbd_device {
254de71a297SAlex Elder 	int			dev_id;		/* blkdev unique id */
255602adf40SYehuda Sadeh 
256602adf40SYehuda Sadeh 	int			major;		/* blkdev assigned major */
257602adf40SYehuda Sadeh 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
258602adf40SYehuda Sadeh 
259a30b71b9SAlex Elder 	u32			image_format;	/* Either 1 or 2 */
260602adf40SYehuda Sadeh 	struct rbd_client	*rbd_client;
261602adf40SYehuda Sadeh 
262602adf40SYehuda Sadeh 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
263602adf40SYehuda Sadeh 
264b82d167bSAlex Elder 	spinlock_t		lock;		/* queue, flags, open_count */
265602adf40SYehuda Sadeh 
266602adf40SYehuda Sadeh 	struct rbd_image_header	header;
267b82d167bSAlex Elder 	unsigned long		flags;		/* possibly lock protected */
2680d7dbfceSAlex Elder 	struct rbd_spec		*spec;
269602adf40SYehuda Sadeh 
2700d7dbfceSAlex Elder 	char			*header_name;
271971f839aSAlex Elder 
2720903e875SAlex Elder 	struct ceph_file_layout	layout;
2730903e875SAlex Elder 
27459c2be1eSYehuda Sadeh 	struct ceph_osd_event   *watch_event;
275975241afSAlex Elder 	struct rbd_obj_request	*watch_request;
27659c2be1eSYehuda Sadeh 
27786b00e0dSAlex Elder 	struct rbd_spec		*parent_spec;
27886b00e0dSAlex Elder 	u64			parent_overlap;
27986b00e0dSAlex Elder 
280c666601aSJosh Durgin 	/* protects updating the header */
281c666601aSJosh Durgin 	struct rw_semaphore     header_rwsem;
282f84344f3SAlex Elder 
283f84344f3SAlex Elder 	struct rbd_mapping	mapping;
284602adf40SYehuda Sadeh 
285602adf40SYehuda Sadeh 	struct list_head	node;
286dfc5606dSYehuda Sadeh 
287dfc5606dSYehuda Sadeh 	/* list of snapshots */
288dfc5606dSYehuda Sadeh 	struct list_head	snaps;
289dfc5606dSYehuda Sadeh 
290dfc5606dSYehuda Sadeh 	/* sysfs related */
291dfc5606dSYehuda Sadeh 	struct device		dev;
292b82d167bSAlex Elder 	unsigned long		open_count;	/* protected by lock */
293dfc5606dSYehuda Sadeh };
294dfc5606dSYehuda Sadeh 
/*
 * Bit numbers used in rbd_dev->flags.  Where atomic access is
 * needed, rbd_dev->lock provides it.
 *
 * At present only the "removing" flag needs that, because it is
 * coupled with the "open_count" field.
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
};
3066d292906SAlex Elder 
307602adf40SYehuda Sadeh static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */
308e124a82fSAlex Elder 
309602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list);    /* devices */
310e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock);
311e124a82fSAlex Elder 
312602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list);		/* clients */
313432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock);
314602adf40SYehuda Sadeh 
315304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
316304f6808SAlex Elder static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
317304f6808SAlex Elder 
318dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev);
31941f38c2bSAlex Elder static void rbd_remove_snap_dev(struct rbd_snap *snap);
320dfc5606dSYehuda Sadeh 
321f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf,
322f0f8cef5SAlex Elder 		       size_t count);
323f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
324f0f8cef5SAlex Elder 			  size_t count);
325f0f8cef5SAlex Elder 
326f0f8cef5SAlex Elder static struct bus_attribute rbd_bus_attrs[] = {
327f0f8cef5SAlex Elder 	__ATTR(add, S_IWUSR, NULL, rbd_add),
328f0f8cef5SAlex Elder 	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
329f0f8cef5SAlex Elder 	__ATTR_NULL
330f0f8cef5SAlex Elder };
331f0f8cef5SAlex Elder 
332f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = {
333f0f8cef5SAlex Elder 	.name		= "rbd",
334f0f8cef5SAlex Elder 	.bus_attrs	= rbd_bus_attrs,
335f0f8cef5SAlex Elder };
336f0f8cef5SAlex Elder 
/*
 * Release callback for rbd_root_dev.  Deliberately does nothing.
 */
static void rbd_root_dev_release(struct device *dev)
{
}
340f0f8cef5SAlex Elder 
341f0f8cef5SAlex Elder static struct device rbd_root_dev = {
342f0f8cef5SAlex Elder 	.init_name =    "rbd",
343f0f8cef5SAlex Elder 	.release =      rbd_root_dev_release,
344f0f8cef5SAlex Elder };
345f0f8cef5SAlex Elder 
34606ecc6cbSAlex Elder static __printf(2, 3)
34706ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
34806ecc6cbSAlex Elder {
34906ecc6cbSAlex Elder 	struct va_format vaf;
35006ecc6cbSAlex Elder 	va_list args;
35106ecc6cbSAlex Elder 
35206ecc6cbSAlex Elder 	va_start(args, fmt);
35306ecc6cbSAlex Elder 	vaf.fmt = fmt;
35406ecc6cbSAlex Elder 	vaf.va = &args;
35506ecc6cbSAlex Elder 
35606ecc6cbSAlex Elder 	if (!rbd_dev)
35706ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
35806ecc6cbSAlex Elder 	else if (rbd_dev->disk)
35906ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %s: %pV\n",
36006ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
36106ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_name)
36206ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: image %s: %pV\n",
36306ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
36406ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_id)
36506ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: id %s: %pV\n",
36606ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
36706ecc6cbSAlex Elder 	else	/* punt */
36806ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
36906ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev, &vaf);
37006ecc6cbSAlex Elder 	va_end(args);
37106ecc6cbSAlex Elder }
37206ecc6cbSAlex Elder 
/*
 * rbd_assert(expr) - check an invariant when RBD_DEBUG is defined.
 *
 * With RBD_DEBUG, a false expr logs the function name, line number and
 * the text of the expression at KERN_ERR, then calls BUG().  Without
 * RBD_DEBUG the macro expands to a no-op and expr is not evaluated.
 *
 * The checking form is wrapped in do { } while (0) so that it behaves
 * as one statement: it cannot capture an "else" that follows it, and
 * the caller's trailing semicolon does not add an empty statement.
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
		do {							\
			if (unlikely(!(expr))) {			\
				printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
				BUG();					\
			}						\
		} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
385dfc5606dSYehuda Sadeh 
386117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
387117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
38859c2be1eSYehuda Sadeh 
389602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
390602adf40SYehuda Sadeh {
391f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
392b82d167bSAlex Elder 	bool removing = false;
393602adf40SYehuda Sadeh 
394f84344f3SAlex Elder 	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
395602adf40SYehuda Sadeh 		return -EROFS;
396602adf40SYehuda Sadeh 
397a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
398b82d167bSAlex Elder 	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
399b82d167bSAlex Elder 		removing = true;
400b82d167bSAlex Elder 	else
401b82d167bSAlex Elder 		rbd_dev->open_count++;
402a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
403b82d167bSAlex Elder 	if (removing)
404b82d167bSAlex Elder 		return -ENOENT;
405b82d167bSAlex Elder 
40642382b70SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
407c3e946ceSAlex Elder 	(void) get_device(&rbd_dev->dev);
408f84344f3SAlex Elder 	set_device_ro(bdev, rbd_dev->mapping.read_only);
40942382b70SAlex Elder 	mutex_unlock(&ctl_mutex);
410340c7a2bSAlex Elder 
411602adf40SYehuda Sadeh 	return 0;
412602adf40SYehuda Sadeh }
413602adf40SYehuda Sadeh 
414dfc5606dSYehuda Sadeh static int rbd_release(struct gendisk *disk, fmode_t mode)
415dfc5606dSYehuda Sadeh {
416dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
417b82d167bSAlex Elder 	unsigned long open_count_before;
418b82d167bSAlex Elder 
419a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
420b82d167bSAlex Elder 	open_count_before = rbd_dev->open_count--;
421a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
422b82d167bSAlex Elder 	rbd_assert(open_count_before > 0);
423dfc5606dSYehuda Sadeh 
42442382b70SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
425c3e946ceSAlex Elder 	put_device(&rbd_dev->dev);
42642382b70SAlex Elder 	mutex_unlock(&ctl_mutex);
427dfc5606dSYehuda Sadeh 
428dfc5606dSYehuda Sadeh 	return 0;
429dfc5606dSYehuda Sadeh }
430dfc5606dSYehuda Sadeh 
431602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = {
432602adf40SYehuda Sadeh 	.owner			= THIS_MODULE,
433602adf40SYehuda Sadeh 	.open			= rbd_open,
434dfc5606dSYehuda Sadeh 	.release		= rbd_release,
435602adf40SYehuda Sadeh };
436602adf40SYehuda Sadeh 
437602adf40SYehuda Sadeh /*
438602adf40SYehuda Sadeh  * Initialize an rbd client instance.
43943ae4701SAlex Elder  * We own *ceph_opts.
440602adf40SYehuda Sadeh  */
441f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
442602adf40SYehuda Sadeh {
443602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
444602adf40SYehuda Sadeh 	int ret = -ENOMEM;
445602adf40SYehuda Sadeh 
44637206ee5SAlex Elder 	dout("%s:\n", __func__);
447602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
448602adf40SYehuda Sadeh 	if (!rbdc)
449602adf40SYehuda Sadeh 		goto out_opt;
450602adf40SYehuda Sadeh 
451602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
452602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
453602adf40SYehuda Sadeh 
454bc534d86SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
455bc534d86SAlex Elder 
45643ae4701SAlex Elder 	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
457602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
458bc534d86SAlex Elder 		goto out_mutex;
45943ae4701SAlex Elder 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
460602adf40SYehuda Sadeh 
461602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
462602adf40SYehuda Sadeh 	if (ret < 0)
463602adf40SYehuda Sadeh 		goto out_err;
464602adf40SYehuda Sadeh 
465432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
466602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
467432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
468602adf40SYehuda Sadeh 
469bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
47037206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
471bc534d86SAlex Elder 
472602adf40SYehuda Sadeh 	return rbdc;
473602adf40SYehuda Sadeh 
474602adf40SYehuda Sadeh out_err:
475602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
476bc534d86SAlex Elder out_mutex:
477bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
478602adf40SYehuda Sadeh 	kfree(rbdc);
479602adf40SYehuda Sadeh out_opt:
48043ae4701SAlex Elder 	if (ceph_opts)
48143ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
48237206ee5SAlex Elder 	dout("%s: error %d\n", __func__, ret);
48337206ee5SAlex Elder 
48428f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
485602adf40SYehuda Sadeh }
486602adf40SYehuda Sadeh 
487602adf40SYehuda Sadeh /*
4881f7ba331SAlex Elder  * Find a ceph client with specific addr and configuration.  If
4891f7ba331SAlex Elder  * found, bump its reference count.
490602adf40SYehuda Sadeh  */
4911f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
492602adf40SYehuda Sadeh {
493602adf40SYehuda Sadeh 	struct rbd_client *client_node;
4941f7ba331SAlex Elder 	bool found = false;
495602adf40SYehuda Sadeh 
49643ae4701SAlex Elder 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
497602adf40SYehuda Sadeh 		return NULL;
498602adf40SYehuda Sadeh 
4991f7ba331SAlex Elder 	spin_lock(&rbd_client_list_lock);
5001f7ba331SAlex Elder 	list_for_each_entry(client_node, &rbd_client_list, node) {
5011f7ba331SAlex Elder 		if (!ceph_compare_options(ceph_opts, client_node->client)) {
5021f7ba331SAlex Elder 			kref_get(&client_node->kref);
5031f7ba331SAlex Elder 			found = true;
5041f7ba331SAlex Elder 			break;
5051f7ba331SAlex Elder 		}
5061f7ba331SAlex Elder 	}
5071f7ba331SAlex Elder 	spin_unlock(&rbd_client_list_lock);
5081f7ba331SAlex Elder 
5091f7ba331SAlex Elder 	return found ? client_node : NULL;
510602adf40SYehuda Sadeh }
511602adf40SYehuda Sadeh 
512602adf40SYehuda Sadeh /*
51359c2be1eSYehuda Sadeh  * mount options
51459c2be1eSYehuda Sadeh  */
51559c2be1eSYehuda Sadeh enum {
51659c2be1eSYehuda Sadeh 	Opt_last_int,
51759c2be1eSYehuda Sadeh 	/* int args above */
51859c2be1eSYehuda Sadeh 	Opt_last_string,
51959c2be1eSYehuda Sadeh 	/* string args above */
520cc0538b6SAlex Elder 	Opt_read_only,
521cc0538b6SAlex Elder 	Opt_read_write,
522cc0538b6SAlex Elder 	/* Boolean args above */
523cc0538b6SAlex Elder 	Opt_last_bool,
52459c2be1eSYehuda Sadeh };
52559c2be1eSYehuda Sadeh 
52643ae4701SAlex Elder static match_table_t rbd_opts_tokens = {
52759c2be1eSYehuda Sadeh 	/* int args above */
52859c2be1eSYehuda Sadeh 	/* string args above */
529be466c1cSAlex Elder 	{Opt_read_only, "read_only"},
530cc0538b6SAlex Elder 	{Opt_read_only, "ro"},		/* Alternate spelling */
531cc0538b6SAlex Elder 	{Opt_read_write, "read_write"},
532cc0538b6SAlex Elder 	{Opt_read_write, "rw"},		/* Alternate spelling */
533cc0538b6SAlex Elder 	/* Boolean args above */
53459c2be1eSYehuda Sadeh 	{-1, NULL}
53559c2be1eSYehuda Sadeh };
53659c2be1eSYehuda Sadeh 
53798571b5aSAlex Elder struct rbd_options {
53898571b5aSAlex Elder 	bool	read_only;
53998571b5aSAlex Elder };
54098571b5aSAlex Elder 
54198571b5aSAlex Elder #define RBD_READ_ONLY_DEFAULT	false
54298571b5aSAlex Elder 
54359c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
54459c2be1eSYehuda Sadeh {
54543ae4701SAlex Elder 	struct rbd_options *rbd_opts = private;
54659c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
54759c2be1eSYehuda Sadeh 	int token, intval, ret;
54859c2be1eSYehuda Sadeh 
54943ae4701SAlex Elder 	token = match_token(c, rbd_opts_tokens, argstr);
55059c2be1eSYehuda Sadeh 	if (token < 0)
55159c2be1eSYehuda Sadeh 		return -EINVAL;
55259c2be1eSYehuda Sadeh 
55359c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
55459c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
55559c2be1eSYehuda Sadeh 		if (ret < 0) {
55659c2be1eSYehuda Sadeh 			pr_err("bad mount option arg (not int) "
55759c2be1eSYehuda Sadeh 			       "at '%s'\n", c);
55859c2be1eSYehuda Sadeh 			return ret;
55959c2be1eSYehuda Sadeh 		}
56059c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
56159c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
56259c2be1eSYehuda Sadeh 		dout("got string token %d val %s\n", token,
56359c2be1eSYehuda Sadeh 		     argstr[0].from);
564cc0538b6SAlex Elder 	} else if (token > Opt_last_string && token < Opt_last_bool) {
565cc0538b6SAlex Elder 		dout("got Boolean token %d\n", token);
56659c2be1eSYehuda Sadeh 	} else {
56759c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
56859c2be1eSYehuda Sadeh 	}
56959c2be1eSYehuda Sadeh 
57059c2be1eSYehuda Sadeh 	switch (token) {
571cc0538b6SAlex Elder 	case Opt_read_only:
572cc0538b6SAlex Elder 		rbd_opts->read_only = true;
573cc0538b6SAlex Elder 		break;
574cc0538b6SAlex Elder 	case Opt_read_write:
575cc0538b6SAlex Elder 		rbd_opts->read_only = false;
576cc0538b6SAlex Elder 		break;
57759c2be1eSYehuda Sadeh 	default:
578aafb230eSAlex Elder 		rbd_assert(false);
579aafb230eSAlex Elder 		break;
58059c2be1eSYehuda Sadeh 	}
58159c2be1eSYehuda Sadeh 	return 0;
58259c2be1eSYehuda Sadeh }
58359c2be1eSYehuda Sadeh 
/*
 * Return a client matching the given options, creating one if no
 * existing client matches.  When an existing client is reused the
 * options are destroyed here; otherwise they are handed to
 * rbd_client_create().
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc = rbd_client_find(ceph_opts);

	if (!rbdc)
		return rbd_client_create(ceph_opts);

	/* Reusing an existing client, so these options are not needed */
	ceph_destroy_options(ceph_opts);

	return rbdc;
}
600602adf40SYehuda Sadeh 
601602adf40SYehuda Sadeh /*
602602adf40SYehuda Sadeh  * Destroy ceph client
603d23a4b3fSAlex Elder  *
604432b8587SAlex Elder  * Caller must hold rbd_client_list_lock.
605602adf40SYehuda Sadeh  */
606602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
607602adf40SYehuda Sadeh {
608602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
609602adf40SYehuda Sadeh 
61037206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
611cd9d9f5dSAlex Elder 	spin_lock(&rbd_client_list_lock);
612602adf40SYehuda Sadeh 	list_del(&rbdc->node);
613cd9d9f5dSAlex Elder 	spin_unlock(&rbd_client_list_lock);
614602adf40SYehuda Sadeh 
615602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
616602adf40SYehuda Sadeh 	kfree(rbdc);
617602adf40SYehuda Sadeh }
618602adf40SYehuda Sadeh 
619602adf40SYehuda Sadeh /*
620602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
621602adf40SYehuda Sadeh  * it.
622602adf40SYehuda Sadeh  */
6239d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc)
624602adf40SYehuda Sadeh {
625c53d5893SAlex Elder 	if (rbdc)
6269d3997fdSAlex Elder 		kref_put(&rbdc->kref, rbd_client_release);
627602adf40SYehuda Sadeh }
628602adf40SYehuda Sadeh 
629a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format)
630a30b71b9SAlex Elder {
631a30b71b9SAlex Elder 	return image_format == 1 || image_format == 2;
632a30b71b9SAlex Elder }
633a30b71b9SAlex Elder 
6348e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
6358e94af8eSAlex Elder {
636103a150fSAlex Elder 	size_t size;
637103a150fSAlex Elder 	u32 snap_count;
638103a150fSAlex Elder 
639103a150fSAlex Elder 	/* The header has to start with the magic rbd header text */
640103a150fSAlex Elder 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
641103a150fSAlex Elder 		return false;
642103a150fSAlex Elder 
643db2388b6SAlex Elder 	/* The bio layer requires at least sector-sized I/O */
644db2388b6SAlex Elder 
645db2388b6SAlex Elder 	if (ondisk->options.order < SECTOR_SHIFT)
646db2388b6SAlex Elder 		return false;
647db2388b6SAlex Elder 
648db2388b6SAlex Elder 	/* If we use u64 in a few spots we may be able to loosen this */
649db2388b6SAlex Elder 
650db2388b6SAlex Elder 	if (ondisk->options.order > 8 * sizeof (int) - 1)
651db2388b6SAlex Elder 		return false;
652db2388b6SAlex Elder 
653103a150fSAlex Elder 	/*
654103a150fSAlex Elder 	 * The size of a snapshot header has to fit in a size_t, and
655103a150fSAlex Elder 	 * that limits the number of snapshots.
656103a150fSAlex Elder 	 */
657103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
658103a150fSAlex Elder 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
659103a150fSAlex Elder 	if (snap_count > size / sizeof (__le64))
660103a150fSAlex Elder 		return false;
661103a150fSAlex Elder 
662103a150fSAlex Elder 	/*
663103a150fSAlex Elder 	 * Not only that, but the size of the entire the snapshot
664103a150fSAlex Elder 	 * header must also be representable in a size_t.
665103a150fSAlex Elder 	 */
666103a150fSAlex Elder 	size -= snap_count * sizeof (__le64);
667103a150fSAlex Elder 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
668103a150fSAlex Elder 		return false;
669103a150fSAlex Elder 
670103a150fSAlex Elder 	return true;
6718e94af8eSAlex Elder }
6728e94af8eSAlex Elder 
673602adf40SYehuda Sadeh /*
674602adf40SYehuda Sadeh  * Create a new header structure, translate header format from the on-disk
675602adf40SYehuda Sadeh  * header.
676602adf40SYehuda Sadeh  */
677602adf40SYehuda Sadeh static int rbd_header_from_disk(struct rbd_image_header *header,
6784156d998SAlex Elder 				 struct rbd_image_header_ondisk *ondisk)
679602adf40SYehuda Sadeh {
680ccece235SAlex Elder 	u32 snap_count;
68158c17b0eSAlex Elder 	size_t len;
682d2bb24e5SAlex Elder 	size_t size;
683621901d6SAlex Elder 	u32 i;
684602adf40SYehuda Sadeh 
6856a52325fSAlex Elder 	memset(header, 0, sizeof (*header));
6866a52325fSAlex Elder 
687103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
688103a150fSAlex Elder 
68958c17b0eSAlex Elder 	len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
69058c17b0eSAlex Elder 	header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
6916a52325fSAlex Elder 	if (!header->object_prefix)
692602adf40SYehuda Sadeh 		return -ENOMEM;
69358c17b0eSAlex Elder 	memcpy(header->object_prefix, ondisk->object_prefix, len);
69458c17b0eSAlex Elder 	header->object_prefix[len] = '\0';
69500f1f36fSAlex Elder 
696602adf40SYehuda Sadeh 	if (snap_count) {
697f785cc1dSAlex Elder 		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
698f785cc1dSAlex Elder 
699621901d6SAlex Elder 		/* Save a copy of the snapshot names */
700621901d6SAlex Elder 
701f785cc1dSAlex Elder 		if (snap_names_len > (u64) SIZE_MAX)
702f785cc1dSAlex Elder 			return -EIO;
703f785cc1dSAlex Elder 		header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
704602adf40SYehuda Sadeh 		if (!header->snap_names)
7056a52325fSAlex Elder 			goto out_err;
706f785cc1dSAlex Elder 		/*
707f785cc1dSAlex Elder 		 * Note that rbd_dev_v1_header_read() guarantees
708f785cc1dSAlex Elder 		 * the ondisk buffer we're working with has
709f785cc1dSAlex Elder 		 * snap_names_len bytes beyond the end of the
710f785cc1dSAlex Elder 		 * snapshot id array, this memcpy() is safe.
711f785cc1dSAlex Elder 		 */
712f785cc1dSAlex Elder 		memcpy(header->snap_names, &ondisk->snaps[snap_count],
713f785cc1dSAlex Elder 			snap_names_len);
7146a52325fSAlex Elder 
715621901d6SAlex Elder 		/* Record each snapshot's size */
716621901d6SAlex Elder 
717d2bb24e5SAlex Elder 		size = snap_count * sizeof (*header->snap_sizes);
718d2bb24e5SAlex Elder 		header->snap_sizes = kmalloc(size, GFP_KERNEL);
719602adf40SYehuda Sadeh 		if (!header->snap_sizes)
7206a52325fSAlex Elder 			goto out_err;
721621901d6SAlex Elder 		for (i = 0; i < snap_count; i++)
722621901d6SAlex Elder 			header->snap_sizes[i] =
723621901d6SAlex Elder 				le64_to_cpu(ondisk->snaps[i].image_size);
724602adf40SYehuda Sadeh 	} else {
725ccece235SAlex Elder 		WARN_ON(ondisk->snap_names_len);
726602adf40SYehuda Sadeh 		header->snap_names = NULL;
727602adf40SYehuda Sadeh 		header->snap_sizes = NULL;
728602adf40SYehuda Sadeh 	}
729849b4260SAlex Elder 
73034b13184SAlex Elder 	header->features = 0;	/* No features support in v1 images */
731602adf40SYehuda Sadeh 	header->obj_order = ondisk->options.order;
732602adf40SYehuda Sadeh 	header->crypt_type = ondisk->options.crypt_type;
733602adf40SYehuda Sadeh 	header->comp_type = ondisk->options.comp_type;
7346a52325fSAlex Elder 
735621901d6SAlex Elder 	/* Allocate and fill in the snapshot context */
736621901d6SAlex Elder 
737f84344f3SAlex Elder 	header->image_size = le64_to_cpu(ondisk->image_size);
7386a52325fSAlex Elder 	size = sizeof (struct ceph_snap_context);
7396a52325fSAlex Elder 	size += snap_count * sizeof (header->snapc->snaps[0]);
7406a52325fSAlex Elder 	header->snapc = kzalloc(size, GFP_KERNEL);
7416a52325fSAlex Elder 	if (!header->snapc)
7426a52325fSAlex Elder 		goto out_err;
743602adf40SYehuda Sadeh 
744602adf40SYehuda Sadeh 	atomic_set(&header->snapc->nref, 1);
745505cbb9bSAlex Elder 	header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
746602adf40SYehuda Sadeh 	header->snapc->num_snaps = snap_count;
747621901d6SAlex Elder 	for (i = 0; i < snap_count; i++)
748602adf40SYehuda Sadeh 		header->snapc->snaps[i] =
749602adf40SYehuda Sadeh 			le64_to_cpu(ondisk->snaps[i].id);
750602adf40SYehuda Sadeh 
751602adf40SYehuda Sadeh 	return 0;
752602adf40SYehuda Sadeh 
7536a52325fSAlex Elder out_err:
754849b4260SAlex Elder 	kfree(header->snap_sizes);
755ccece235SAlex Elder 	header->snap_sizes = NULL;
756602adf40SYehuda Sadeh 	kfree(header->snap_names);
757ccece235SAlex Elder 	header->snap_names = NULL;
7586a52325fSAlex Elder 	kfree(header->object_prefix);
7596a52325fSAlex Elder 	header->object_prefix = NULL;
760ccece235SAlex Elder 
76100f1f36fSAlex Elder 	return -ENOMEM;
762602adf40SYehuda Sadeh }
763602adf40SYehuda Sadeh 
7649e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
7659e15b77dSAlex Elder {
7669e15b77dSAlex Elder 	struct rbd_snap *snap;
7679e15b77dSAlex Elder 
7689e15b77dSAlex Elder 	if (snap_id == CEPH_NOSNAP)
7699e15b77dSAlex Elder 		return RBD_SNAP_HEAD_NAME;
7709e15b77dSAlex Elder 
7719e15b77dSAlex Elder 	list_for_each_entry(snap, &rbd_dev->snaps, node)
7729e15b77dSAlex Elder 		if (snap_id == snap->id)
7739e15b77dSAlex Elder 			return snap->name;
7749e15b77dSAlex Elder 
7759e15b77dSAlex Elder 	return NULL;
7769e15b77dSAlex Elder }
7779e15b77dSAlex Elder 
7788836b995SAlex Elder static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
779602adf40SYehuda Sadeh {
780602adf40SYehuda Sadeh 
781e86924a8SAlex Elder 	struct rbd_snap *snap;
78200f1f36fSAlex Elder 
783e86924a8SAlex Elder 	list_for_each_entry(snap, &rbd_dev->snaps, node) {
784e86924a8SAlex Elder 		if (!strcmp(snap_name, snap->name)) {
7850d7dbfceSAlex Elder 			rbd_dev->spec->snap_id = snap->id;
786e86924a8SAlex Elder 			rbd_dev->mapping.size = snap->size;
78734b13184SAlex Elder 			rbd_dev->mapping.features = snap->features;
78800f1f36fSAlex Elder 
789e86924a8SAlex Elder 			return 0;
790602adf40SYehuda Sadeh 		}
79100f1f36fSAlex Elder 	}
792e86924a8SAlex Elder 
79300f1f36fSAlex Elder 	return -ENOENT;
79400f1f36fSAlex Elder }
795602adf40SYehuda Sadeh 
796819d52bfSAlex Elder static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
797602adf40SYehuda Sadeh {
79878dc447dSAlex Elder 	int ret;
799602adf40SYehuda Sadeh 
8000d7dbfceSAlex Elder 	if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
801cc9d734cSJosh Durgin 		    sizeof (RBD_SNAP_HEAD_NAME))) {
8020d7dbfceSAlex Elder 		rbd_dev->spec->snap_id = CEPH_NOSNAP;
80399c1f08fSAlex Elder 		rbd_dev->mapping.size = rbd_dev->header.image_size;
80434b13184SAlex Elder 		rbd_dev->mapping.features = rbd_dev->header.features;
805e86924a8SAlex Elder 		ret = 0;
806602adf40SYehuda Sadeh 	} else {
8070d7dbfceSAlex Elder 		ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
808602adf40SYehuda Sadeh 		if (ret < 0)
809602adf40SYehuda Sadeh 			goto done;
810f84344f3SAlex Elder 		rbd_dev->mapping.read_only = true;
811602adf40SYehuda Sadeh 	}
8126d292906SAlex Elder 	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
8136d292906SAlex Elder 
814602adf40SYehuda Sadeh done:
815602adf40SYehuda Sadeh 	return ret;
816602adf40SYehuda Sadeh }
817602adf40SYehuda Sadeh 
818602adf40SYehuda Sadeh static void rbd_header_free(struct rbd_image_header *header)
819602adf40SYehuda Sadeh {
820849b4260SAlex Elder 	kfree(header->object_prefix);
821d78fd7aeSAlex Elder 	header->object_prefix = NULL;
822602adf40SYehuda Sadeh 	kfree(header->snap_sizes);
823d78fd7aeSAlex Elder 	header->snap_sizes = NULL;
824849b4260SAlex Elder 	kfree(header->snap_names);
825d78fd7aeSAlex Elder 	header->snap_names = NULL;
826d1d25646SJosh Durgin 	ceph_put_snap_context(header->snapc);
827d78fd7aeSAlex Elder 	header->snapc = NULL;
828602adf40SYehuda Sadeh }
829602adf40SYehuda Sadeh 
83098571b5aSAlex Elder static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
831602adf40SYehuda Sadeh {
83265ccfe21SAlex Elder 	char *name;
83365ccfe21SAlex Elder 	u64 segment;
83465ccfe21SAlex Elder 	int ret;
835602adf40SYehuda Sadeh 
8362fd82b9eSAlex Elder 	name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
83765ccfe21SAlex Elder 	if (!name)
83865ccfe21SAlex Elder 		return NULL;
83965ccfe21SAlex Elder 	segment = offset >> rbd_dev->header.obj_order;
8402fd82b9eSAlex Elder 	ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
84165ccfe21SAlex Elder 			rbd_dev->header.object_prefix, segment);
8422fd82b9eSAlex Elder 	if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
84365ccfe21SAlex Elder 		pr_err("error formatting segment name for #%llu (%d)\n",
84465ccfe21SAlex Elder 			segment, ret);
84565ccfe21SAlex Elder 		kfree(name);
84665ccfe21SAlex Elder 		name = NULL;
84765ccfe21SAlex Elder 	}
848602adf40SYehuda Sadeh 
84965ccfe21SAlex Elder 	return name;
85065ccfe21SAlex Elder }
851602adf40SYehuda Sadeh 
85265ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
85365ccfe21SAlex Elder {
85465ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
855602adf40SYehuda Sadeh 
85665ccfe21SAlex Elder 	return offset & (segment_size - 1);
85765ccfe21SAlex Elder }
85865ccfe21SAlex Elder 
85965ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev,
86065ccfe21SAlex Elder 				u64 offset, u64 length)
86165ccfe21SAlex Elder {
86265ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
86365ccfe21SAlex Elder 
86465ccfe21SAlex Elder 	offset &= segment_size - 1;
86565ccfe21SAlex Elder 
866aafb230eSAlex Elder 	rbd_assert(length <= U64_MAX - offset);
86765ccfe21SAlex Elder 	if (offset + length > segment_size)
86865ccfe21SAlex Elder 		length = segment_size - offset;
86965ccfe21SAlex Elder 
87065ccfe21SAlex Elder 	return length;
871602adf40SYehuda Sadeh }
872602adf40SYehuda Sadeh 
873602adf40SYehuda Sadeh /*
874029bcbd8SJosh Durgin  * returns the size of an object in the image
875029bcbd8SJosh Durgin  */
876029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header)
877029bcbd8SJosh Durgin {
878029bcbd8SJosh Durgin 	return 1 << header->obj_order;
879029bcbd8SJosh Durgin }
880029bcbd8SJosh Durgin 
881029bcbd8SJosh Durgin /*
882602adf40SYehuda Sadeh  * bio helpers
883602adf40SYehuda Sadeh  */
884602adf40SYehuda Sadeh 
885602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain)
886602adf40SYehuda Sadeh {
887602adf40SYehuda Sadeh 	struct bio *tmp;
888602adf40SYehuda Sadeh 
889602adf40SYehuda Sadeh 	while (chain) {
890602adf40SYehuda Sadeh 		tmp = chain;
891602adf40SYehuda Sadeh 		chain = chain->bi_next;
892602adf40SYehuda Sadeh 		bio_put(tmp);
893602adf40SYehuda Sadeh 	}
894602adf40SYehuda Sadeh }
895602adf40SYehuda Sadeh 
896602adf40SYehuda Sadeh /*
897602adf40SYehuda Sadeh  * zeros a bio chain, starting at specific offset
898602adf40SYehuda Sadeh  */
899602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs)
900602adf40SYehuda Sadeh {
901602adf40SYehuda Sadeh 	struct bio_vec *bv;
902602adf40SYehuda Sadeh 	unsigned long flags;
903602adf40SYehuda Sadeh 	void *buf;
904602adf40SYehuda Sadeh 	int i;
905602adf40SYehuda Sadeh 	int pos = 0;
906602adf40SYehuda Sadeh 
907602adf40SYehuda Sadeh 	while (chain) {
908602adf40SYehuda Sadeh 		bio_for_each_segment(bv, chain, i) {
909602adf40SYehuda Sadeh 			if (pos + bv->bv_len > start_ofs) {
910602adf40SYehuda Sadeh 				int remainder = max(start_ofs - pos, 0);
911602adf40SYehuda Sadeh 				buf = bvec_kmap_irq(bv, &flags);
912602adf40SYehuda Sadeh 				memset(buf + remainder, 0,
913602adf40SYehuda Sadeh 				       bv->bv_len - remainder);
91485b5aaa6SDan Carpenter 				bvec_kunmap_irq(buf, &flags);
915602adf40SYehuda Sadeh 			}
916602adf40SYehuda Sadeh 			pos += bv->bv_len;
917602adf40SYehuda Sadeh 		}
918602adf40SYehuda Sadeh 
919602adf40SYehuda Sadeh 		chain = chain->bi_next;
920602adf40SYehuda Sadeh 	}
921602adf40SYehuda Sadeh }
922602adf40SYehuda Sadeh 
923602adf40SYehuda Sadeh /*
924f7760dadSAlex Elder  * Clone a portion of a bio, starting at the given byte offset
925f7760dadSAlex Elder  * and continuing for the number of bytes indicated.
926602adf40SYehuda Sadeh  */
927f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src,
928f7760dadSAlex Elder 					unsigned int offset,
929f7760dadSAlex Elder 					unsigned int len,
930f7760dadSAlex Elder 					gfp_t gfpmask)
931602adf40SYehuda Sadeh {
932f7760dadSAlex Elder 	struct bio_vec *bv;
933f7760dadSAlex Elder 	unsigned int resid;
934f7760dadSAlex Elder 	unsigned short idx;
935f7760dadSAlex Elder 	unsigned int voff;
936f7760dadSAlex Elder 	unsigned short end_idx;
937f7760dadSAlex Elder 	unsigned short vcnt;
938f7760dadSAlex Elder 	struct bio *bio;
939602adf40SYehuda Sadeh 
940f7760dadSAlex Elder 	/* Handle the easy case for the caller */
941f7760dadSAlex Elder 
942f7760dadSAlex Elder 	if (!offset && len == bio_src->bi_size)
943f7760dadSAlex Elder 		return bio_clone(bio_src, gfpmask);
944f7760dadSAlex Elder 
945f7760dadSAlex Elder 	if (WARN_ON_ONCE(!len))
946f7760dadSAlex Elder 		return NULL;
947f7760dadSAlex Elder 	if (WARN_ON_ONCE(len > bio_src->bi_size))
948f7760dadSAlex Elder 		return NULL;
949f7760dadSAlex Elder 	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
950f7760dadSAlex Elder 		return NULL;
951f7760dadSAlex Elder 
952f7760dadSAlex Elder 	/* Find first affected segment... */
953f7760dadSAlex Elder 
954f7760dadSAlex Elder 	resid = offset;
955f7760dadSAlex Elder 	__bio_for_each_segment(bv, bio_src, idx, 0) {
956f7760dadSAlex Elder 		if (resid < bv->bv_len)
957f7760dadSAlex Elder 			break;
958f7760dadSAlex Elder 		resid -= bv->bv_len;
959602adf40SYehuda Sadeh 	}
960f7760dadSAlex Elder 	voff = resid;
961602adf40SYehuda Sadeh 
962f7760dadSAlex Elder 	/* ...and the last affected segment */
963542582fcSAlex Elder 
964f7760dadSAlex Elder 	resid += len;
965f7760dadSAlex Elder 	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
966f7760dadSAlex Elder 		if (resid <= bv->bv_len)
967f7760dadSAlex Elder 			break;
968f7760dadSAlex Elder 		resid -= bv->bv_len;
969f7760dadSAlex Elder 	}
970f7760dadSAlex Elder 	vcnt = end_idx - idx + 1;
971602adf40SYehuda Sadeh 
972f7760dadSAlex Elder 	/* Build the clone */
973f7760dadSAlex Elder 
974f7760dadSAlex Elder 	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
975f7760dadSAlex Elder 	if (!bio)
976f7760dadSAlex Elder 		return NULL;	/* ENOMEM */
977f7760dadSAlex Elder 
978f7760dadSAlex Elder 	bio->bi_bdev = bio_src->bi_bdev;
979f7760dadSAlex Elder 	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
980f7760dadSAlex Elder 	bio->bi_rw = bio_src->bi_rw;
981f7760dadSAlex Elder 	bio->bi_flags |= 1 << BIO_CLONED;
982602adf40SYehuda Sadeh 
983602adf40SYehuda Sadeh 	/*
984f7760dadSAlex Elder 	 * Copy over our part of the bio_vec, then update the first
985f7760dadSAlex Elder 	 * and last (or only) entries.
986602adf40SYehuda Sadeh 	 */
987f7760dadSAlex Elder 	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
988f7760dadSAlex Elder 			vcnt * sizeof (struct bio_vec));
989f7760dadSAlex Elder 	bio->bi_io_vec[0].bv_offset += voff;
990f7760dadSAlex Elder 	if (vcnt > 1) {
991f7760dadSAlex Elder 		bio->bi_io_vec[0].bv_len -= voff;
992f7760dadSAlex Elder 		bio->bi_io_vec[vcnt - 1].bv_len = resid;
993602adf40SYehuda Sadeh 	} else {
994f7760dadSAlex Elder 		bio->bi_io_vec[0].bv_len = len;
995602adf40SYehuda Sadeh 	}
996602adf40SYehuda Sadeh 
997f7760dadSAlex Elder 	bio->bi_vcnt = vcnt;
998f7760dadSAlex Elder 	bio->bi_size = len;
999f7760dadSAlex Elder 	bio->bi_idx = 0;
1000602adf40SYehuda Sadeh 
1001f7760dadSAlex Elder 	return bio;
1002602adf40SYehuda Sadeh }
1003602adf40SYehuda Sadeh 
1004f7760dadSAlex Elder /*
1005f7760dadSAlex Elder  * Clone a portion of a bio chain, starting at the given byte offset
1006f7760dadSAlex Elder  * into the first bio in the source chain and continuing for the
1007f7760dadSAlex Elder  * number of bytes indicated.  The result is another bio chain of
1008f7760dadSAlex Elder  * exactly the given length, or a null pointer on error.
1009f7760dadSAlex Elder  *
1010f7760dadSAlex Elder  * The bio_src and offset parameters are both in-out.  On entry they
1011f7760dadSAlex Elder  * refer to the first source bio and the offset into that bio where
1012f7760dadSAlex Elder  * the start of data to be cloned is located.
1013f7760dadSAlex Elder  *
1014f7760dadSAlex Elder  * On return, bio_src is updated to refer to the bio in the source
1015f7760dadSAlex Elder  * chain that contains first un-cloned byte, and *offset will
1016f7760dadSAlex Elder  * contain the offset of that byte within that bio.
1017f7760dadSAlex Elder  */
1018f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src,
1019f7760dadSAlex Elder 					unsigned int *offset,
1020f7760dadSAlex Elder 					unsigned int len,
1021f7760dadSAlex Elder 					gfp_t gfpmask)
1022f7760dadSAlex Elder {
1023f7760dadSAlex Elder 	struct bio *bi = *bio_src;
1024f7760dadSAlex Elder 	unsigned int off = *offset;
1025f7760dadSAlex Elder 	struct bio *chain = NULL;
1026f7760dadSAlex Elder 	struct bio **end;
1027602adf40SYehuda Sadeh 
1028f7760dadSAlex Elder 	/* Build up a chain of clone bios up to the limit */
1029602adf40SYehuda Sadeh 
1030f7760dadSAlex Elder 	if (!bi || off >= bi->bi_size || !len)
1031f7760dadSAlex Elder 		return NULL;		/* Nothing to clone */
1032602adf40SYehuda Sadeh 
1033f7760dadSAlex Elder 	end = &chain;
1034f7760dadSAlex Elder 	while (len) {
1035f7760dadSAlex Elder 		unsigned int bi_size;
1036f7760dadSAlex Elder 		struct bio *bio;
1037f7760dadSAlex Elder 
1038f5400b7aSAlex Elder 		if (!bi) {
1039f5400b7aSAlex Elder 			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
1040f7760dadSAlex Elder 			goto out_err;	/* EINVAL; ran out of bio's */
1041f5400b7aSAlex Elder 		}
1042f7760dadSAlex Elder 		bi_size = min_t(unsigned int, bi->bi_size - off, len);
1043f7760dadSAlex Elder 		bio = bio_clone_range(bi, off, bi_size, gfpmask);
1044f7760dadSAlex Elder 		if (!bio)
1045f7760dadSAlex Elder 			goto out_err;	/* ENOMEM */
1046f7760dadSAlex Elder 
1047f7760dadSAlex Elder 		*end = bio;
1048f7760dadSAlex Elder 		end = &bio->bi_next;
1049f7760dadSAlex Elder 
1050f7760dadSAlex Elder 		off += bi_size;
1051f7760dadSAlex Elder 		if (off == bi->bi_size) {
1052f7760dadSAlex Elder 			bi = bi->bi_next;
1053f7760dadSAlex Elder 			off = 0;
1054f7760dadSAlex Elder 		}
1055f7760dadSAlex Elder 		len -= bi_size;
1056f7760dadSAlex Elder 	}
1057f7760dadSAlex Elder 	*bio_src = bi;
1058f7760dadSAlex Elder 	*offset = off;
1059f7760dadSAlex Elder 
1060f7760dadSAlex Elder 	return chain;
1061f7760dadSAlex Elder out_err:
1062f7760dadSAlex Elder 	bio_chain_put(chain);
1063f7760dadSAlex Elder 
1064602adf40SYehuda Sadeh 	return NULL;
1065602adf40SYehuda Sadeh }
1066602adf40SYehuda Sadeh 
1067bf0d5f50SAlex Elder static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1068bf0d5f50SAlex Elder {
106937206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
107037206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1071bf0d5f50SAlex Elder 	kref_get(&obj_request->kref);
1072bf0d5f50SAlex Elder }
1073bf0d5f50SAlex Elder 
1074bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref);
1075bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1076bf0d5f50SAlex Elder {
1077bf0d5f50SAlex Elder 	rbd_assert(obj_request != NULL);
107837206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
107937206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1080bf0d5f50SAlex Elder 	kref_put(&obj_request->kref, rbd_obj_request_destroy);
1081bf0d5f50SAlex Elder }
1082bf0d5f50SAlex Elder 
1083bf0d5f50SAlex Elder static void rbd_img_request_get(struct rbd_img_request *img_request)
1084bf0d5f50SAlex Elder {
108537206ee5SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
108637206ee5SAlex Elder 		atomic_read(&img_request->kref.refcount));
1087bf0d5f50SAlex Elder 	kref_get(&img_request->kref);
1088bf0d5f50SAlex Elder }
1089bf0d5f50SAlex Elder 
1090bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref);
1091bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request)
1092bf0d5f50SAlex Elder {
1093bf0d5f50SAlex Elder 	rbd_assert(img_request != NULL);
109437206ee5SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
109537206ee5SAlex Elder 		atomic_read(&img_request->kref.refcount));
1096bf0d5f50SAlex Elder 	kref_put(&img_request->kref, rbd_img_request_destroy);
1097bf0d5f50SAlex Elder }
1098bf0d5f50SAlex Elder 
1099bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1100bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1101bf0d5f50SAlex Elder {
110225dcf954SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
110325dcf954SAlex Elder 
1104bf0d5f50SAlex Elder 	rbd_obj_request_get(obj_request);
1105bf0d5f50SAlex Elder 	obj_request->img_request = img_request;
110625dcf954SAlex Elder 	obj_request->which = img_request->obj_request_count;
1107bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
110825dcf954SAlex Elder 	img_request->obj_request_count++;
110925dcf954SAlex Elder 	list_add_tail(&obj_request->links, &img_request->obj_requests);
111037206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
111137206ee5SAlex Elder 		obj_request->which);
1112bf0d5f50SAlex Elder }
1113bf0d5f50SAlex Elder 
1114bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1115bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1116bf0d5f50SAlex Elder {
1117bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
111825dcf954SAlex Elder 
111937206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
112037206ee5SAlex Elder 		obj_request->which);
1121bf0d5f50SAlex Elder 	list_del(&obj_request->links);
112225dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
112325dcf954SAlex Elder 	img_request->obj_request_count--;
112425dcf954SAlex Elder 	rbd_assert(obj_request->which == img_request->obj_request_count);
112525dcf954SAlex Elder 	obj_request->which = BAD_WHICH;
1126bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == img_request);
1127bf0d5f50SAlex Elder 	obj_request->img_request = NULL;
112825dcf954SAlex Elder 	obj_request->callback = NULL;
1129bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
1130bf0d5f50SAlex Elder }
1131bf0d5f50SAlex Elder 
1132bf0d5f50SAlex Elder static bool obj_request_type_valid(enum obj_request_type type)
1133bf0d5f50SAlex Elder {
1134bf0d5f50SAlex Elder 	switch (type) {
11359969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
1136bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1137788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1138bf0d5f50SAlex Elder 		return true;
1139bf0d5f50SAlex Elder 	default:
1140bf0d5f50SAlex Elder 		return false;
1141bf0d5f50SAlex Elder 	}
1142bf0d5f50SAlex Elder }
1143bf0d5f50SAlex Elder 
1144cc344fa1SAlex Elder static struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
11458d23bf29SAlex Elder {
11468d23bf29SAlex Elder 	struct ceph_osd_req_op *op;
11478d23bf29SAlex Elder 	va_list args;
11482647ba38SAlex Elder 	size_t size;
11498d23bf29SAlex Elder 
11508d23bf29SAlex Elder 	op = kzalloc(sizeof (*op), GFP_NOIO);
11518d23bf29SAlex Elder 	if (!op)
11528d23bf29SAlex Elder 		return NULL;
11538d23bf29SAlex Elder 	op->op = opcode;
11548d23bf29SAlex Elder 	va_start(args, opcode);
11558d23bf29SAlex Elder 	switch (opcode) {
11568d23bf29SAlex Elder 	case CEPH_OSD_OP_READ:
11578d23bf29SAlex Elder 	case CEPH_OSD_OP_WRITE:
11588d23bf29SAlex Elder 		/* rbd_osd_req_op_create(READ, offset, length) */
11598d23bf29SAlex Elder 		/* rbd_osd_req_op_create(WRITE, offset, length) */
11608d23bf29SAlex Elder 		op->extent.offset = va_arg(args, u64);
11618d23bf29SAlex Elder 		op->extent.length = va_arg(args, u64);
11628d23bf29SAlex Elder 		if (opcode == CEPH_OSD_OP_WRITE)
11638d23bf29SAlex Elder 			op->payload_len = op->extent.length;
11648d23bf29SAlex Elder 		break;
1165fbfab539SAlex Elder 	case CEPH_OSD_OP_STAT:
1166fbfab539SAlex Elder 		break;
11672647ba38SAlex Elder 	case CEPH_OSD_OP_CALL:
11682647ba38SAlex Elder 		/* rbd_osd_req_op_create(CALL, class, method, data, datalen) */
11692647ba38SAlex Elder 		op->cls.class_name = va_arg(args, char *);
11702647ba38SAlex Elder 		size = strlen(op->cls.class_name);
11712647ba38SAlex Elder 		rbd_assert(size <= (size_t) U8_MAX);
11722647ba38SAlex Elder 		op->cls.class_len = size;
11732647ba38SAlex Elder 		op->payload_len = size;
11742647ba38SAlex Elder 
11752647ba38SAlex Elder 		op->cls.method_name = va_arg(args, char *);
11762647ba38SAlex Elder 		size = strlen(op->cls.method_name);
11772647ba38SAlex Elder 		rbd_assert(size <= (size_t) U8_MAX);
11782647ba38SAlex Elder 		op->cls.method_len = size;
11792647ba38SAlex Elder 		op->payload_len += size;
11802647ba38SAlex Elder 
11812647ba38SAlex Elder 		op->cls.argc = 0;
11822647ba38SAlex Elder 		op->cls.indata = va_arg(args, void *);
11832647ba38SAlex Elder 		size = va_arg(args, size_t);
11842647ba38SAlex Elder 		rbd_assert(size <= (size_t) U32_MAX);
11852647ba38SAlex Elder 		op->cls.indata_len = (u32) size;
11862647ba38SAlex Elder 		op->payload_len += size;
11872647ba38SAlex Elder 		break;
11885efea49aSAlex Elder 	case CEPH_OSD_OP_NOTIFY_ACK:
11895efea49aSAlex Elder 	case CEPH_OSD_OP_WATCH:
11905efea49aSAlex Elder 		/* rbd_osd_req_op_create(NOTIFY_ACK, cookie, version) */
11915efea49aSAlex Elder 		/* rbd_osd_req_op_create(WATCH, cookie, version, flag) */
11925efea49aSAlex Elder 		op->watch.cookie = va_arg(args, u64);
11935efea49aSAlex Elder 		op->watch.ver = va_arg(args, u64);
11945efea49aSAlex Elder 		op->watch.ver = cpu_to_le64(op->watch.ver);
11955efea49aSAlex Elder 		if (opcode == CEPH_OSD_OP_WATCH && va_arg(args, int))
11965efea49aSAlex Elder 			op->watch.flag = (u8) 1;
11975efea49aSAlex Elder 		break;
11988d23bf29SAlex Elder 	default:
11998d23bf29SAlex Elder 		rbd_warn(NULL, "unsupported opcode %hu\n", opcode);
12008d23bf29SAlex Elder 		kfree(op);
12018d23bf29SAlex Elder 		op = NULL;
12028d23bf29SAlex Elder 		break;
12038d23bf29SAlex Elder 	}
12048d23bf29SAlex Elder 	va_end(args);
12058d23bf29SAlex Elder 
12068d23bf29SAlex Elder 	return op;
12078d23bf29SAlex Elder }
12088d23bf29SAlex Elder 
/*
 * Free an op allocated by rbd_osd_req_op_create().  Only the op
 * structure itself is freed, not anything it points to.
 */
static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op)
{
	kfree(op);
}
12138d23bf29SAlex Elder 
1214bf0d5f50SAlex Elder static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1215bf0d5f50SAlex Elder 				struct rbd_obj_request *obj_request)
1216bf0d5f50SAlex Elder {
121737206ee5SAlex Elder 	dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);
121837206ee5SAlex Elder 
1219bf0d5f50SAlex Elder 	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1220bf0d5f50SAlex Elder }
1221bf0d5f50SAlex Elder 
1222bf0d5f50SAlex Elder static void rbd_img_request_complete(struct rbd_img_request *img_request)
1223bf0d5f50SAlex Elder {
122437206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
1225bf0d5f50SAlex Elder 	if (img_request->callback)
1226bf0d5f50SAlex Elder 		img_request->callback(img_request);
1227bf0d5f50SAlex Elder 	else
1228bf0d5f50SAlex Elder 		rbd_img_request_put(img_request);
1229bf0d5f50SAlex Elder }
1230bf0d5f50SAlex Elder 
1231788e2df3SAlex Elder /* Caller is responsible for rbd_obj_request_destroy(obj_request) */
1232788e2df3SAlex Elder 
1233788e2df3SAlex Elder static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
1234788e2df3SAlex Elder {
123537206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
123637206ee5SAlex Elder 
1237788e2df3SAlex Elder 	return wait_for_completion_interruptible(&obj_request->completion);
1238788e2df3SAlex Elder }
1239788e2df3SAlex Elder 
124007741308SAlex Elder static void obj_request_done_init(struct rbd_obj_request *obj_request)
124107741308SAlex Elder {
124207741308SAlex Elder 	atomic_set(&obj_request->done, 0);
124307741308SAlex Elder 	smp_wmb();
124407741308SAlex Elder }
124507741308SAlex Elder 
124607741308SAlex Elder static void obj_request_done_set(struct rbd_obj_request *obj_request)
124707741308SAlex Elder {
1248632b88caSAlex Elder 	int done;
1249632b88caSAlex Elder 
1250632b88caSAlex Elder 	done = atomic_inc_return(&obj_request->done);
1251632b88caSAlex Elder 	if (done > 1) {
1252632b88caSAlex Elder 		struct rbd_img_request *img_request = obj_request->img_request;
1253632b88caSAlex Elder 		struct rbd_device *rbd_dev;
1254632b88caSAlex Elder 
1255632b88caSAlex Elder 		rbd_dev = img_request ? img_request->rbd_dev : NULL;
1256632b88caSAlex Elder 		rbd_warn(rbd_dev, "obj_request %p was already done\n",
1257632b88caSAlex Elder 			obj_request);
1258632b88caSAlex Elder 	}
125907741308SAlex Elder }
126007741308SAlex Elder 
126107741308SAlex Elder static bool obj_request_done_test(struct rbd_obj_request *obj_request)
126207741308SAlex Elder {
1263632b88caSAlex Elder 	smp_mb();
126407741308SAlex Elder 	return atomic_read(&obj_request->done) != 0;
126507741308SAlex Elder }
126607741308SAlex Elder 
/*
 * Completion handling for an op that needs nothing beyond marking
 * the object request done.  The op argument is not examined.
 */
static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request,
				struct ceph_osd_op *op)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
12739969ebc5SAlex Elder 
1274bf0d5f50SAlex Elder static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1275bf0d5f50SAlex Elder {
127637206ee5SAlex Elder 	dout("%s: obj %p cb %p\n", __func__, obj_request,
127737206ee5SAlex Elder 		obj_request->callback);
1278bf0d5f50SAlex Elder 	if (obj_request->callback)
1279bf0d5f50SAlex Elder 		obj_request->callback(obj_request);
1280788e2df3SAlex Elder 	else
1281788e2df3SAlex Elder 		complete_all(&obj_request->completion);
1282bf0d5f50SAlex Elder }
1283bf0d5f50SAlex Elder 
1284bf0d5f50SAlex Elder static void rbd_osd_read_callback(struct rbd_obj_request *obj_request,
1285bf0d5f50SAlex Elder 				struct ceph_osd_op *op)
1286bf0d5f50SAlex Elder {
1287bf0d5f50SAlex Elder 	u64 xferred;
1288bf0d5f50SAlex Elder 
1289bf0d5f50SAlex Elder 	/*
1290bf0d5f50SAlex Elder 	 * We support a 64-bit length, but ultimately it has to be
1291bf0d5f50SAlex Elder 	 * passed to blk_end_request(), which takes an unsigned int.
1292bf0d5f50SAlex Elder 	 */
1293bf0d5f50SAlex Elder 	xferred = le64_to_cpu(op->extent.length);
1294bf0d5f50SAlex Elder 	rbd_assert(xferred < (u64) UINT_MAX);
129537206ee5SAlex Elder 	dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request,
129637206ee5SAlex Elder 		obj_request->result, xferred, obj_request->length);
1297bf0d5f50SAlex Elder 	if (obj_request->result == (s32) -ENOENT) {
1298bf0d5f50SAlex Elder 		zero_bio_chain(obj_request->bio_list, 0);
1299bf0d5f50SAlex Elder 		obj_request->result = 0;
1300bf0d5f50SAlex Elder 	} else if (xferred < obj_request->length && !obj_request->result) {
1301bf0d5f50SAlex Elder 		zero_bio_chain(obj_request->bio_list, xferred);
1302bf0d5f50SAlex Elder 		xferred = obj_request->length;
1303bf0d5f50SAlex Elder 	}
1304bf0d5f50SAlex Elder 	obj_request->xferred = xferred;
130507741308SAlex Elder 	obj_request_done_set(obj_request);
1306bf0d5f50SAlex Elder }
1307bf0d5f50SAlex Elder 
1308bf0d5f50SAlex Elder static void rbd_osd_write_callback(struct rbd_obj_request *obj_request,
1309bf0d5f50SAlex Elder 				struct ceph_osd_op *op)
1310bf0d5f50SAlex Elder {
131137206ee5SAlex Elder 
1312bf0d5f50SAlex Elder 	obj_request->xferred = le64_to_cpu(op->extent.length);
131337206ee5SAlex Elder 	dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request,
131437206ee5SAlex Elder 		obj_request->result, obj_request->xferred, obj_request->length);
131537206ee5SAlex Elder 
131637206ee5SAlex Elder 	/* A short write really shouldn't occur.  Warn if we see one */
131737206ee5SAlex Elder 
131837206ee5SAlex Elder 	if (obj_request->xferred != obj_request->length) {
131937206ee5SAlex Elder 		struct rbd_img_request *img_request = obj_request->img_request;
132037206ee5SAlex Elder 		struct rbd_device *rbd_dev;
132137206ee5SAlex Elder 
132237206ee5SAlex Elder 		rbd_dev = img_request ? img_request->rbd_dev : NULL;
132337206ee5SAlex Elder 		rbd_warn(rbd_dev, "wrote %llu want %llu\n",
132437206ee5SAlex Elder 			obj_request->xferred, obj_request->length);
132537206ee5SAlex Elder 	}
132637206ee5SAlex Elder 
132707741308SAlex Elder 	obj_request_done_set(obj_request);
1328bf0d5f50SAlex Elder }
1329bf0d5f50SAlex Elder 
/*
 * Reply handler for a stat op.  A plain stat needs no processing
 * beyond marking the object request done.  More work will be needed
 * here once stat is used as part of a write sequence for a layered
 * image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request,
				struct ceph_osd_op *op)
{
	dout("%s: obj %p\n", __func__, obj_request);
	obj_request_done_set(obj_request);
}
1340fbfab539SAlex Elder 
1341bf0d5f50SAlex Elder static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1342bf0d5f50SAlex Elder 				struct ceph_msg *msg)
1343bf0d5f50SAlex Elder {
1344bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = osd_req->r_priv;
1345bf0d5f50SAlex Elder 	struct ceph_osd_reply_head *reply_head;
1346bf0d5f50SAlex Elder 	struct ceph_osd_op *op;
1347bf0d5f50SAlex Elder 	u32 num_ops;
1348bf0d5f50SAlex Elder 	u16 opcode;
1349bf0d5f50SAlex Elder 
135037206ee5SAlex Elder 	dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
1351bf0d5f50SAlex Elder 	rbd_assert(osd_req == obj_request->osd_req);
1352bf0d5f50SAlex Elder 	rbd_assert(!!obj_request->img_request ^
1353bf0d5f50SAlex Elder 				(obj_request->which == BAD_WHICH));
1354bf0d5f50SAlex Elder 
1355bf0d5f50SAlex Elder 	obj_request->xferred = le32_to_cpu(msg->hdr.data_len);
1356bf0d5f50SAlex Elder 	reply_head = msg->front.iov_base;
1357bf0d5f50SAlex Elder 	obj_request->result = (s32) le32_to_cpu(reply_head->result);
1358bf0d5f50SAlex Elder 	obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);
1359bf0d5f50SAlex Elder 
1360bf0d5f50SAlex Elder 	num_ops = le32_to_cpu(reply_head->num_ops);
1361bf0d5f50SAlex Elder 	WARN_ON(num_ops != 1);	/* For now */
1362bf0d5f50SAlex Elder 
1363bf0d5f50SAlex Elder 	op = &reply_head->ops[0];
1364bf0d5f50SAlex Elder 	opcode = le16_to_cpu(op->op);
1365bf0d5f50SAlex Elder 	switch (opcode) {
1366bf0d5f50SAlex Elder 	case CEPH_OSD_OP_READ:
1367bf0d5f50SAlex Elder 		rbd_osd_read_callback(obj_request, op);
1368bf0d5f50SAlex Elder 		break;
1369bf0d5f50SAlex Elder 	case CEPH_OSD_OP_WRITE:
1370bf0d5f50SAlex Elder 		rbd_osd_write_callback(obj_request, op);
1371bf0d5f50SAlex Elder 		break;
1372fbfab539SAlex Elder 	case CEPH_OSD_OP_STAT:
1373fbfab539SAlex Elder 		rbd_osd_stat_callback(obj_request, op);
1374fbfab539SAlex Elder 		break;
137536be9a76SAlex Elder 	case CEPH_OSD_OP_CALL:
1376b8d70035SAlex Elder 	case CEPH_OSD_OP_NOTIFY_ACK:
13779969ebc5SAlex Elder 	case CEPH_OSD_OP_WATCH:
13789969ebc5SAlex Elder 		rbd_osd_trivial_callback(obj_request, op);
13799969ebc5SAlex Elder 		break;
1380bf0d5f50SAlex Elder 	default:
1381bf0d5f50SAlex Elder 		rbd_warn(NULL, "%s: unsupported op %hu\n",
1382bf0d5f50SAlex Elder 			obj_request->object_name, (unsigned short) opcode);
1383bf0d5f50SAlex Elder 		break;
1384bf0d5f50SAlex Elder 	}
1385bf0d5f50SAlex Elder 
138607741308SAlex Elder 	if (obj_request_done_test(obj_request))
1387bf0d5f50SAlex Elder 		rbd_obj_request_complete(obj_request);
1388bf0d5f50SAlex Elder }
1389bf0d5f50SAlex Elder 
1390bf0d5f50SAlex Elder static struct ceph_osd_request *rbd_osd_req_create(
1391bf0d5f50SAlex Elder 					struct rbd_device *rbd_dev,
1392bf0d5f50SAlex Elder 					bool write_request,
1393bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request,
1394bf0d5f50SAlex Elder 					struct ceph_osd_req_op *op)
1395bf0d5f50SAlex Elder {
1396bf0d5f50SAlex Elder 	struct rbd_img_request *img_request = obj_request->img_request;
1397bf0d5f50SAlex Elder 	struct ceph_snap_context *snapc = NULL;
1398bf0d5f50SAlex Elder 	struct ceph_osd_client *osdc;
1399bf0d5f50SAlex Elder 	struct ceph_osd_request *osd_req;
1400bf0d5f50SAlex Elder 	struct timespec now;
1401bf0d5f50SAlex Elder 	struct timespec *mtime;
1402bf0d5f50SAlex Elder 	u64 snap_id = CEPH_NOSNAP;
1403bf0d5f50SAlex Elder 	u64 offset = obj_request->offset;
1404bf0d5f50SAlex Elder 	u64 length = obj_request->length;
1405bf0d5f50SAlex Elder 
1406bf0d5f50SAlex Elder 	if (img_request) {
1407bf0d5f50SAlex Elder 		rbd_assert(img_request->write_request == write_request);
1408bf0d5f50SAlex Elder 		if (img_request->write_request)
1409bf0d5f50SAlex Elder 			snapc = img_request->snapc;
1410bf0d5f50SAlex Elder 		else
1411bf0d5f50SAlex Elder 			snap_id = img_request->snap_id;
1412bf0d5f50SAlex Elder 	}
1413bf0d5f50SAlex Elder 
1414bf0d5f50SAlex Elder 	/* Allocate and initialize the request, for the single op */
1415bf0d5f50SAlex Elder 
1416bf0d5f50SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
1417bf0d5f50SAlex Elder 	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
1418bf0d5f50SAlex Elder 	if (!osd_req)
1419bf0d5f50SAlex Elder 		return NULL;	/* ENOMEM */
1420bf0d5f50SAlex Elder 
1421bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
1422bf0d5f50SAlex Elder 	switch (obj_request->type) {
14239969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
14249969ebc5SAlex Elder 		break;		/* Nothing to do */
1425bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1426bf0d5f50SAlex Elder 		rbd_assert(obj_request->bio_list != NULL);
1427bf0d5f50SAlex Elder 		osd_req->r_bio = obj_request->bio_list;
1428bf0d5f50SAlex Elder 		break;
1429788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1430788e2df3SAlex Elder 		osd_req->r_pages = obj_request->pages;
1431788e2df3SAlex Elder 		osd_req->r_num_pages = obj_request->page_count;
1432788e2df3SAlex Elder 		osd_req->r_page_alignment = offset & ~PAGE_MASK;
1433788e2df3SAlex Elder 		break;
1434bf0d5f50SAlex Elder 	}
1435bf0d5f50SAlex Elder 
1436bf0d5f50SAlex Elder 	if (write_request) {
1437bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1438bf0d5f50SAlex Elder 		now = CURRENT_TIME;
1439bf0d5f50SAlex Elder 		mtime = &now;
1440bf0d5f50SAlex Elder 	} else {
1441bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_READ;
1442bf0d5f50SAlex Elder 		mtime = NULL;	/* not needed for reads */
1443bf0d5f50SAlex Elder 		offset = 0;	/* These are not used... */
1444bf0d5f50SAlex Elder 		length = 0;	/* ...for osd read requests */
1445bf0d5f50SAlex Elder 	}
1446bf0d5f50SAlex Elder 
1447bf0d5f50SAlex Elder 	osd_req->r_callback = rbd_osd_req_callback;
1448bf0d5f50SAlex Elder 	osd_req->r_priv = obj_request;
1449bf0d5f50SAlex Elder 
1450bf0d5f50SAlex Elder 	osd_req->r_oid_len = strlen(obj_request->object_name);
1451bf0d5f50SAlex Elder 	rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1452bf0d5f50SAlex Elder 	memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1453bf0d5f50SAlex Elder 
1454bf0d5f50SAlex Elder 	osd_req->r_file_layout = rbd_dev->layout;	/* struct */
1455bf0d5f50SAlex Elder 
1456bf0d5f50SAlex Elder 	/* osd_req will get its own reference to snapc (if non-null) */
1457bf0d5f50SAlex Elder 
1458bf0d5f50SAlex Elder 	ceph_osdc_build_request(osd_req, offset, length, 1, op,
1459bf0d5f50SAlex Elder 				snapc, snap_id, mtime);
1460bf0d5f50SAlex Elder 
1461bf0d5f50SAlex Elder 	return osd_req;
1462bf0d5f50SAlex Elder }
1463bf0d5f50SAlex Elder 
/* Release an osd request by handing it to ceph_osdc_put_request() */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
1468bf0d5f50SAlex Elder 
1469bf0d5f50SAlex Elder /* object_name is assumed to be a non-null pointer and NUL-terminated */
1470bf0d5f50SAlex Elder 
1471bf0d5f50SAlex Elder static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1472bf0d5f50SAlex Elder 						u64 offset, u64 length,
1473bf0d5f50SAlex Elder 						enum obj_request_type type)
1474bf0d5f50SAlex Elder {
1475bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1476bf0d5f50SAlex Elder 	size_t size;
1477bf0d5f50SAlex Elder 	char *name;
1478bf0d5f50SAlex Elder 
1479bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(type));
1480bf0d5f50SAlex Elder 
1481bf0d5f50SAlex Elder 	size = strlen(object_name) + 1;
1482bf0d5f50SAlex Elder 	obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
1483bf0d5f50SAlex Elder 	if (!obj_request)
1484bf0d5f50SAlex Elder 		return NULL;
1485bf0d5f50SAlex Elder 
1486bf0d5f50SAlex Elder 	name = (char *)(obj_request + 1);
1487bf0d5f50SAlex Elder 	obj_request->object_name = memcpy(name, object_name, size);
1488bf0d5f50SAlex Elder 	obj_request->offset = offset;
1489bf0d5f50SAlex Elder 	obj_request->length = length;
1490bf0d5f50SAlex Elder 	obj_request->which = BAD_WHICH;
1491bf0d5f50SAlex Elder 	obj_request->type = type;
1492bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&obj_request->links);
149307741308SAlex Elder 	obj_request_done_init(obj_request);
1494788e2df3SAlex Elder 	init_completion(&obj_request->completion);
1495bf0d5f50SAlex Elder 	kref_init(&obj_request->kref);
1496bf0d5f50SAlex Elder 
149737206ee5SAlex Elder 	dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
149837206ee5SAlex Elder 		offset, length, (int)type, obj_request);
149937206ee5SAlex Elder 
1500bf0d5f50SAlex Elder 	return obj_request;
1501bf0d5f50SAlex Elder }
1502bf0d5f50SAlex Elder 
1503bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref)
1504bf0d5f50SAlex Elder {
1505bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1506bf0d5f50SAlex Elder 
1507bf0d5f50SAlex Elder 	obj_request = container_of(kref, struct rbd_obj_request, kref);
1508bf0d5f50SAlex Elder 
150937206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
151037206ee5SAlex Elder 
1511bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
1512bf0d5f50SAlex Elder 	rbd_assert(obj_request->which == BAD_WHICH);
1513bf0d5f50SAlex Elder 
1514bf0d5f50SAlex Elder 	if (obj_request->osd_req)
1515bf0d5f50SAlex Elder 		rbd_osd_req_destroy(obj_request->osd_req);
1516bf0d5f50SAlex Elder 
1517bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
1518bf0d5f50SAlex Elder 	switch (obj_request->type) {
15199969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
15209969ebc5SAlex Elder 		break;		/* Nothing to do */
1521bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1522bf0d5f50SAlex Elder 		if (obj_request->bio_list)
1523bf0d5f50SAlex Elder 			bio_chain_put(obj_request->bio_list);
1524bf0d5f50SAlex Elder 		break;
1525788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1526788e2df3SAlex Elder 		if (obj_request->pages)
1527788e2df3SAlex Elder 			ceph_release_page_vector(obj_request->pages,
1528788e2df3SAlex Elder 						obj_request->page_count);
1529788e2df3SAlex Elder 		break;
1530bf0d5f50SAlex Elder 	}
1531bf0d5f50SAlex Elder 
1532bf0d5f50SAlex Elder 	kfree(obj_request);
1533bf0d5f50SAlex Elder }
1534bf0d5f50SAlex Elder 
1535bf0d5f50SAlex Elder /*
1536bf0d5f50SAlex Elder  * Caller is responsible for filling in the list of object requests
1537bf0d5f50SAlex Elder  * that comprises the image request, and the Linux request pointer
1538bf0d5f50SAlex Elder  * (if there is one).
1539bf0d5f50SAlex Elder  */
1540cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create(
1541cc344fa1SAlex Elder 					struct rbd_device *rbd_dev,
1542bf0d5f50SAlex Elder 					u64 offset, u64 length,
1543bf0d5f50SAlex Elder 					bool write_request)
1544bf0d5f50SAlex Elder {
1545bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
1546bf0d5f50SAlex Elder 	struct ceph_snap_context *snapc = NULL;
1547bf0d5f50SAlex Elder 
1548bf0d5f50SAlex Elder 	img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
1549bf0d5f50SAlex Elder 	if (!img_request)
1550bf0d5f50SAlex Elder 		return NULL;
1551bf0d5f50SAlex Elder 
1552bf0d5f50SAlex Elder 	if (write_request) {
1553bf0d5f50SAlex Elder 		down_read(&rbd_dev->header_rwsem);
1554bf0d5f50SAlex Elder 		snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1555bf0d5f50SAlex Elder 		up_read(&rbd_dev->header_rwsem);
1556bf0d5f50SAlex Elder 		if (WARN_ON(!snapc)) {
1557bf0d5f50SAlex Elder 			kfree(img_request);
1558bf0d5f50SAlex Elder 			return NULL;	/* Shouldn't happen */
1559bf0d5f50SAlex Elder 		}
1560bf0d5f50SAlex Elder 	}
1561bf0d5f50SAlex Elder 
1562bf0d5f50SAlex Elder 	img_request->rq = NULL;
1563bf0d5f50SAlex Elder 	img_request->rbd_dev = rbd_dev;
1564bf0d5f50SAlex Elder 	img_request->offset = offset;
1565bf0d5f50SAlex Elder 	img_request->length = length;
1566bf0d5f50SAlex Elder 	img_request->write_request = write_request;
1567bf0d5f50SAlex Elder 	if (write_request)
1568bf0d5f50SAlex Elder 		img_request->snapc = snapc;
1569bf0d5f50SAlex Elder 	else
1570bf0d5f50SAlex Elder 		img_request->snap_id = rbd_dev->spec->snap_id;
1571bf0d5f50SAlex Elder 	spin_lock_init(&img_request->completion_lock);
1572bf0d5f50SAlex Elder 	img_request->next_completion = 0;
1573bf0d5f50SAlex Elder 	img_request->callback = NULL;
1574bf0d5f50SAlex Elder 	img_request->obj_request_count = 0;
1575bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&img_request->obj_requests);
1576bf0d5f50SAlex Elder 	kref_init(&img_request->kref);
1577bf0d5f50SAlex Elder 
1578bf0d5f50SAlex Elder 	rbd_img_request_get(img_request);	/* Avoid a warning */
1579bf0d5f50SAlex Elder 	rbd_img_request_put(img_request);	/* TEMPORARY */
1580bf0d5f50SAlex Elder 
158137206ee5SAlex Elder 	dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
158237206ee5SAlex Elder 		write_request ? "write" : "read", offset, length,
158337206ee5SAlex Elder 		img_request);
158437206ee5SAlex Elder 
1585bf0d5f50SAlex Elder 	return img_request;
1586bf0d5f50SAlex Elder }
1587bf0d5f50SAlex Elder 
1588bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref)
1589bf0d5f50SAlex Elder {
1590bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
1591bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1592bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
1593bf0d5f50SAlex Elder 
1594bf0d5f50SAlex Elder 	img_request = container_of(kref, struct rbd_img_request, kref);
1595bf0d5f50SAlex Elder 
159637206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
159737206ee5SAlex Elder 
1598bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1599bf0d5f50SAlex Elder 		rbd_img_obj_request_del(img_request, obj_request);
160025dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count == 0);
1601bf0d5f50SAlex Elder 
1602bf0d5f50SAlex Elder 	if (img_request->write_request)
1603bf0d5f50SAlex Elder 		ceph_put_snap_context(img_request->snapc);
1604bf0d5f50SAlex Elder 
1605bf0d5f50SAlex Elder 	kfree(img_request);
1606bf0d5f50SAlex Elder }
1607bf0d5f50SAlex Elder 
1608bf0d5f50SAlex Elder static int rbd_img_request_fill_bio(struct rbd_img_request *img_request,
1609bf0d5f50SAlex Elder 					struct bio *bio_list)
1610bf0d5f50SAlex Elder {
1611bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = img_request->rbd_dev;
1612bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = NULL;
1613bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
1614bf0d5f50SAlex Elder 	unsigned int bio_offset;
1615bf0d5f50SAlex Elder 	u64 image_offset;
1616bf0d5f50SAlex Elder 	u64 resid;
1617bf0d5f50SAlex Elder 	u16 opcode;
1618bf0d5f50SAlex Elder 
161937206ee5SAlex Elder 	dout("%s: img %p bio %p\n", __func__, img_request, bio_list);
162037206ee5SAlex Elder 
1621bf0d5f50SAlex Elder 	opcode = img_request->write_request ? CEPH_OSD_OP_WRITE
1622bf0d5f50SAlex Elder 					      : CEPH_OSD_OP_READ;
1623bf0d5f50SAlex Elder 	bio_offset = 0;
1624bf0d5f50SAlex Elder 	image_offset = img_request->offset;
1625bf0d5f50SAlex Elder 	rbd_assert(image_offset == bio_list->bi_sector << SECTOR_SHIFT);
1626bf0d5f50SAlex Elder 	resid = img_request->length;
16274dda41d3SAlex Elder 	rbd_assert(resid > 0);
1628bf0d5f50SAlex Elder 	while (resid) {
1629bf0d5f50SAlex Elder 		const char *object_name;
1630bf0d5f50SAlex Elder 		unsigned int clone_size;
1631bf0d5f50SAlex Elder 		struct ceph_osd_req_op *op;
1632bf0d5f50SAlex Elder 		u64 offset;
1633bf0d5f50SAlex Elder 		u64 length;
1634bf0d5f50SAlex Elder 
1635bf0d5f50SAlex Elder 		object_name = rbd_segment_name(rbd_dev, image_offset);
1636bf0d5f50SAlex Elder 		if (!object_name)
1637bf0d5f50SAlex Elder 			goto out_unwind;
1638bf0d5f50SAlex Elder 		offset = rbd_segment_offset(rbd_dev, image_offset);
1639bf0d5f50SAlex Elder 		length = rbd_segment_length(rbd_dev, image_offset, resid);
1640bf0d5f50SAlex Elder 		obj_request = rbd_obj_request_create(object_name,
1641bf0d5f50SAlex Elder 						offset, length,
1642bf0d5f50SAlex Elder 						OBJ_REQUEST_BIO);
1643bf0d5f50SAlex Elder 		kfree(object_name);	/* object request has its own copy */
1644bf0d5f50SAlex Elder 		if (!obj_request)
1645bf0d5f50SAlex Elder 			goto out_unwind;
1646bf0d5f50SAlex Elder 
1647bf0d5f50SAlex Elder 		rbd_assert(length <= (u64) UINT_MAX);
1648bf0d5f50SAlex Elder 		clone_size = (unsigned int) length;
1649bf0d5f50SAlex Elder 		obj_request->bio_list = bio_chain_clone_range(&bio_list,
1650bf0d5f50SAlex Elder 						&bio_offset, clone_size,
1651bf0d5f50SAlex Elder 						GFP_ATOMIC);
1652bf0d5f50SAlex Elder 		if (!obj_request->bio_list)
1653bf0d5f50SAlex Elder 			goto out_partial;
1654bf0d5f50SAlex Elder 
1655bf0d5f50SAlex Elder 		/*
1656bf0d5f50SAlex Elder 		 * Build up the op to use in building the osd
1657bf0d5f50SAlex Elder 		 * request.  Note that the contents of the op are
1658bf0d5f50SAlex Elder 		 * copied by rbd_osd_req_create().
1659bf0d5f50SAlex Elder 		 */
1660bf0d5f50SAlex Elder 		op = rbd_osd_req_op_create(opcode, offset, length);
1661bf0d5f50SAlex Elder 		if (!op)
1662bf0d5f50SAlex Elder 			goto out_partial;
1663bf0d5f50SAlex Elder 		obj_request->osd_req = rbd_osd_req_create(rbd_dev,
1664bf0d5f50SAlex Elder 						img_request->write_request,
1665bf0d5f50SAlex Elder 						obj_request, op);
1666bf0d5f50SAlex Elder 		rbd_osd_req_op_destroy(op);
1667bf0d5f50SAlex Elder 		if (!obj_request->osd_req)
1668bf0d5f50SAlex Elder 			goto out_partial;
1669bf0d5f50SAlex Elder 		/* status and version are initially zero-filled */
1670bf0d5f50SAlex Elder 
1671bf0d5f50SAlex Elder 		rbd_img_obj_request_add(img_request, obj_request);
1672bf0d5f50SAlex Elder 
1673bf0d5f50SAlex Elder 		image_offset += length;
1674bf0d5f50SAlex Elder 		resid -= length;
1675bf0d5f50SAlex Elder 	}
1676bf0d5f50SAlex Elder 
1677bf0d5f50SAlex Elder 	return 0;
1678bf0d5f50SAlex Elder 
1679bf0d5f50SAlex Elder out_partial:
1680bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
1681bf0d5f50SAlex Elder out_unwind:
1682bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1683bf0d5f50SAlex Elder 		rbd_obj_request_put(obj_request);
1684bf0d5f50SAlex Elder 
1685bf0d5f50SAlex Elder 	return -ENOMEM;
1686bf0d5f50SAlex Elder }
1687bf0d5f50SAlex Elder 
1688bf0d5f50SAlex Elder static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
1689bf0d5f50SAlex Elder {
1690bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
1691bf0d5f50SAlex Elder 	u32 which = obj_request->which;
1692bf0d5f50SAlex Elder 	bool more = true;
1693bf0d5f50SAlex Elder 
1694bf0d5f50SAlex Elder 	img_request = obj_request->img_request;
16954dda41d3SAlex Elder 
169637206ee5SAlex Elder 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
1697bf0d5f50SAlex Elder 	rbd_assert(img_request != NULL);
1698bf0d5f50SAlex Elder 	rbd_assert(img_request->rq != NULL);
16994dda41d3SAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
1700bf0d5f50SAlex Elder 	rbd_assert(which != BAD_WHICH);
1701bf0d5f50SAlex Elder 	rbd_assert(which < img_request->obj_request_count);
1702bf0d5f50SAlex Elder 	rbd_assert(which >= img_request->next_completion);
1703bf0d5f50SAlex Elder 
1704bf0d5f50SAlex Elder 	spin_lock_irq(&img_request->completion_lock);
1705bf0d5f50SAlex Elder 	if (which != img_request->next_completion)
1706bf0d5f50SAlex Elder 		goto out;
1707bf0d5f50SAlex Elder 
1708bf0d5f50SAlex Elder 	for_each_obj_request_from(img_request, obj_request) {
1709bf0d5f50SAlex Elder 		unsigned int xferred;
1710bf0d5f50SAlex Elder 		int result;
1711bf0d5f50SAlex Elder 
1712bf0d5f50SAlex Elder 		rbd_assert(more);
1713bf0d5f50SAlex Elder 		rbd_assert(which < img_request->obj_request_count);
1714bf0d5f50SAlex Elder 
171507741308SAlex Elder 		if (!obj_request_done_test(obj_request))
1716bf0d5f50SAlex Elder 			break;
1717bf0d5f50SAlex Elder 
1718bf0d5f50SAlex Elder 		rbd_assert(obj_request->xferred <= (u64) UINT_MAX);
1719bf0d5f50SAlex Elder 		xferred = (unsigned int) obj_request->xferred;
1720bf0d5f50SAlex Elder 		result = (int) obj_request->result;
1721bf0d5f50SAlex Elder 		if (result)
1722bf0d5f50SAlex Elder 			rbd_warn(NULL, "obj_request %s result %d xferred %u\n",
1723bf0d5f50SAlex Elder 				img_request->write_request ? "write" : "read",
1724bf0d5f50SAlex Elder 				result, xferred);
1725bf0d5f50SAlex Elder 
1726bf0d5f50SAlex Elder 		more = blk_end_request(img_request->rq, result, xferred);
1727bf0d5f50SAlex Elder 		which++;
1728bf0d5f50SAlex Elder 	}
1729bf0d5f50SAlex Elder 	rbd_assert(more ^ (which == img_request->obj_request_count));
1730bf0d5f50SAlex Elder 	img_request->next_completion = which;
1731bf0d5f50SAlex Elder out:
1732bf0d5f50SAlex Elder 	spin_unlock_irq(&img_request->completion_lock);
1733bf0d5f50SAlex Elder 
1734bf0d5f50SAlex Elder 	if (!more)
1735bf0d5f50SAlex Elder 		rbd_img_request_complete(img_request);
1736bf0d5f50SAlex Elder }
1737bf0d5f50SAlex Elder 
1738bf0d5f50SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request)
1739bf0d5f50SAlex Elder {
1740bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = img_request->rbd_dev;
1741bf0d5f50SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1742bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1743bf0d5f50SAlex Elder 
174437206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
1745bf0d5f50SAlex Elder 	for_each_obj_request(img_request, obj_request) {
1746bf0d5f50SAlex Elder 		int ret;
1747bf0d5f50SAlex Elder 
1748bf0d5f50SAlex Elder 		obj_request->callback = rbd_img_obj_callback;
1749bf0d5f50SAlex Elder 		ret = rbd_obj_request_submit(osdc, obj_request);
1750bf0d5f50SAlex Elder 		if (ret)
1751bf0d5f50SAlex Elder 			return ret;
1752bf0d5f50SAlex Elder 		/*
1753bf0d5f50SAlex Elder 		 * The image request has its own reference to each
1754bf0d5f50SAlex Elder 		 * of its object requests, so we can safely drop the
1755bf0d5f50SAlex Elder 		 * initial one here.
1756bf0d5f50SAlex Elder 		 */
1757bf0d5f50SAlex Elder 		rbd_obj_request_put(obj_request);
1758bf0d5f50SAlex Elder 	}
1759bf0d5f50SAlex Elder 
1760bf0d5f50SAlex Elder 	return 0;
1761bf0d5f50SAlex Elder }
1762bf0d5f50SAlex Elder 
1763cf81b60eSAlex Elder static int rbd_obj_notify_ack(struct rbd_device *rbd_dev,
1764b8d70035SAlex Elder 				   u64 ver, u64 notify_id)
1765b8d70035SAlex Elder {
1766b8d70035SAlex Elder 	struct rbd_obj_request *obj_request;
1767b8d70035SAlex Elder 	struct ceph_osd_req_op *op;
1768b8d70035SAlex Elder 	struct ceph_osd_client *osdc;
1769b8d70035SAlex Elder 	int ret;
1770b8d70035SAlex Elder 
1771b8d70035SAlex Elder 	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
1772b8d70035SAlex Elder 							OBJ_REQUEST_NODATA);
1773b8d70035SAlex Elder 	if (!obj_request)
1774b8d70035SAlex Elder 		return -ENOMEM;
1775b8d70035SAlex Elder 
1776b8d70035SAlex Elder 	ret = -ENOMEM;
1777b8d70035SAlex Elder 	op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver);
1778b8d70035SAlex Elder 	if (!op)
1779b8d70035SAlex Elder 		goto out;
1780b8d70035SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
1781b8d70035SAlex Elder 						obj_request, op);
1782b8d70035SAlex Elder 	rbd_osd_req_op_destroy(op);
1783b8d70035SAlex Elder 	if (!obj_request->osd_req)
1784b8d70035SAlex Elder 		goto out;
1785b8d70035SAlex Elder 
1786b8d70035SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
1787cf81b60eSAlex Elder 	obj_request->callback = rbd_obj_request_put;
1788b8d70035SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
1789b8d70035SAlex Elder out:
1790cf81b60eSAlex Elder 	if (ret)
1791b8d70035SAlex Elder 		rbd_obj_request_put(obj_request);
1792b8d70035SAlex Elder 
1793b8d70035SAlex Elder 	return ret;
1794b8d70035SAlex Elder }
1795b8d70035SAlex Elder 
1796b8d70035SAlex Elder static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1797b8d70035SAlex Elder {
1798b8d70035SAlex Elder 	struct rbd_device *rbd_dev = (struct rbd_device *)data;
1799b8d70035SAlex Elder 	u64 hver;
1800b8d70035SAlex Elder 	int rc;
1801b8d70035SAlex Elder 
1802b8d70035SAlex Elder 	if (!rbd_dev)
1803b8d70035SAlex Elder 		return;
1804b8d70035SAlex Elder 
180537206ee5SAlex Elder 	dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
1806b8d70035SAlex Elder 		rbd_dev->header_name, (unsigned long long) notify_id,
1807b8d70035SAlex Elder 		(unsigned int) opcode);
1808b8d70035SAlex Elder 	rc = rbd_dev_refresh(rbd_dev, &hver);
1809b8d70035SAlex Elder 	if (rc)
1810b8d70035SAlex Elder 		rbd_warn(rbd_dev, "got notification but failed to "
1811b8d70035SAlex Elder 			   " update snaps: %d\n", rc);
1812b8d70035SAlex Elder 
1813cf81b60eSAlex Elder 	rbd_obj_notify_ack(rbd_dev, hver, notify_id);
1814b8d70035SAlex Elder }
1815b8d70035SAlex Elder 
18169969ebc5SAlex Elder /*
18179969ebc5SAlex Elder  * Request sync osd watch/unwatch.  The value of "start" determines
18189969ebc5SAlex Elder  * whether a watch request is being initiated or torn down.
18199969ebc5SAlex Elder  */
18209969ebc5SAlex Elder static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
18219969ebc5SAlex Elder {
18229969ebc5SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
18239969ebc5SAlex Elder 	struct rbd_obj_request *obj_request;
18249969ebc5SAlex Elder 	struct ceph_osd_req_op *op;
18259969ebc5SAlex Elder 	int ret;
18269969ebc5SAlex Elder 
18279969ebc5SAlex Elder 	rbd_assert(start ^ !!rbd_dev->watch_event);
18289969ebc5SAlex Elder 	rbd_assert(start ^ !!rbd_dev->watch_request);
18299969ebc5SAlex Elder 
18309969ebc5SAlex Elder 	if (start) {
18313c663bbdSAlex Elder 		ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
18329969ebc5SAlex Elder 						&rbd_dev->watch_event);
18339969ebc5SAlex Elder 		if (ret < 0)
18349969ebc5SAlex Elder 			return ret;
18358eb87565SAlex Elder 		rbd_assert(rbd_dev->watch_event != NULL);
18369969ebc5SAlex Elder 	}
18379969ebc5SAlex Elder 
18389969ebc5SAlex Elder 	ret = -ENOMEM;
18399969ebc5SAlex Elder 	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
18409969ebc5SAlex Elder 							OBJ_REQUEST_NODATA);
18419969ebc5SAlex Elder 	if (!obj_request)
18429969ebc5SAlex Elder 		goto out_cancel;
18439969ebc5SAlex Elder 
18449969ebc5SAlex Elder 	op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH,
18459969ebc5SAlex Elder 				rbd_dev->watch_event->cookie,
18469969ebc5SAlex Elder 				rbd_dev->header.obj_version, start);
18479969ebc5SAlex Elder 	if (!op)
18489969ebc5SAlex Elder 		goto out_cancel;
18499969ebc5SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, true,
18509969ebc5SAlex Elder 							obj_request, op);
18519969ebc5SAlex Elder 	rbd_osd_req_op_destroy(op);
18529969ebc5SAlex Elder 	if (!obj_request->osd_req)
18539969ebc5SAlex Elder 		goto out_cancel;
18549969ebc5SAlex Elder 
18558eb87565SAlex Elder 	if (start)
1856975241afSAlex Elder 		ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
18578eb87565SAlex Elder 	else
18586977c3f9SAlex Elder 		ceph_osdc_unregister_linger_request(osdc,
1859975241afSAlex Elder 					rbd_dev->watch_request->osd_req);
18609969ebc5SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
18619969ebc5SAlex Elder 	if (ret)
18629969ebc5SAlex Elder 		goto out_cancel;
18639969ebc5SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
18649969ebc5SAlex Elder 	if (ret)
18659969ebc5SAlex Elder 		goto out_cancel;
18669969ebc5SAlex Elder 	ret = obj_request->result;
18679969ebc5SAlex Elder 	if (ret)
18689969ebc5SAlex Elder 		goto out_cancel;
18699969ebc5SAlex Elder 
18708eb87565SAlex Elder 	/*
18718eb87565SAlex Elder 	 * A watch request is set to linger, so the underlying osd
18728eb87565SAlex Elder 	 * request won't go away until we unregister it.  We retain
18738eb87565SAlex Elder 	 * a pointer to the object request during that time (in
18748eb87565SAlex Elder 	 * rbd_dev->watch_request), so we'll keep a reference to
18758eb87565SAlex Elder 	 * it.  We'll drop that reference (below) after we've
18768eb87565SAlex Elder 	 * unregistered it.
18778eb87565SAlex Elder 	 */
18788eb87565SAlex Elder 	if (start) {
18798eb87565SAlex Elder 		rbd_dev->watch_request = obj_request;
18808eb87565SAlex Elder 
18818eb87565SAlex Elder 		return 0;
18828eb87565SAlex Elder 	}
18838eb87565SAlex Elder 
18848eb87565SAlex Elder 	/* We have successfully torn down the watch request */
18858eb87565SAlex Elder 
18868eb87565SAlex Elder 	rbd_obj_request_put(rbd_dev->watch_request);
18878eb87565SAlex Elder 	rbd_dev->watch_request = NULL;
18889969ebc5SAlex Elder out_cancel:
18899969ebc5SAlex Elder 	/* Cancel the event if we're tearing down, or on error */
18909969ebc5SAlex Elder 	ceph_osdc_cancel_event(rbd_dev->watch_event);
18919969ebc5SAlex Elder 	rbd_dev->watch_event = NULL;
18929969ebc5SAlex Elder 	if (obj_request)
18939969ebc5SAlex Elder 		rbd_obj_request_put(obj_request);
18949969ebc5SAlex Elder 
18959969ebc5SAlex Elder 	return ret;
18969969ebc5SAlex Elder }
18979969ebc5SAlex Elder 
189836be9a76SAlex Elder /*
189936be9a76SAlex Elder  * Synchronous osd object method call
190036be9a76SAlex Elder  */
190136be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
190236be9a76SAlex Elder 			     const char *object_name,
190336be9a76SAlex Elder 			     const char *class_name,
190436be9a76SAlex Elder 			     const char *method_name,
190536be9a76SAlex Elder 			     const char *outbound,
190636be9a76SAlex Elder 			     size_t outbound_size,
190736be9a76SAlex Elder 			     char *inbound,
190836be9a76SAlex Elder 			     size_t inbound_size,
190936be9a76SAlex Elder 			     u64 *version)
191036be9a76SAlex Elder {
191136be9a76SAlex Elder 	struct rbd_obj_request *obj_request;
191236be9a76SAlex Elder 	struct ceph_osd_client *osdc;
191336be9a76SAlex Elder 	struct ceph_osd_req_op *op;
191436be9a76SAlex Elder 	struct page **pages;
191536be9a76SAlex Elder 	u32 page_count;
191636be9a76SAlex Elder 	int ret;
191736be9a76SAlex Elder 
191836be9a76SAlex Elder 	/*
191936be9a76SAlex Elder 	 * Method calls are ultimately read operations but they
192036be9a76SAlex Elder 	 * don't involve object data (so no offset or length).
192136be9a76SAlex Elder 	 * The result should placed into the inbound buffer
192236be9a76SAlex Elder 	 * provided.  They also supply outbound data--parameters for
192336be9a76SAlex Elder 	 * the object method.  Currently if this is present it will
192436be9a76SAlex Elder 	 * be a snapshot id.
192536be9a76SAlex Elder 	 */
192636be9a76SAlex Elder 	page_count = (u32) calc_pages_for(0, inbound_size);
192736be9a76SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
192836be9a76SAlex Elder 	if (IS_ERR(pages))
192936be9a76SAlex Elder 		return PTR_ERR(pages);
193036be9a76SAlex Elder 
193136be9a76SAlex Elder 	ret = -ENOMEM;
193236be9a76SAlex Elder 	obj_request = rbd_obj_request_create(object_name, 0, 0,
193336be9a76SAlex Elder 							OBJ_REQUEST_PAGES);
193436be9a76SAlex Elder 	if (!obj_request)
193536be9a76SAlex Elder 		goto out;
193636be9a76SAlex Elder 
193736be9a76SAlex Elder 	obj_request->pages = pages;
193836be9a76SAlex Elder 	obj_request->page_count = page_count;
193936be9a76SAlex Elder 
194036be9a76SAlex Elder 	op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name,
194136be9a76SAlex Elder 					method_name, outbound, outbound_size);
194236be9a76SAlex Elder 	if (!op)
194336be9a76SAlex Elder 		goto out;
194436be9a76SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
194536be9a76SAlex Elder 						obj_request, op);
194636be9a76SAlex Elder 	rbd_osd_req_op_destroy(op);
194736be9a76SAlex Elder 	if (!obj_request->osd_req)
194836be9a76SAlex Elder 		goto out;
194936be9a76SAlex Elder 
195036be9a76SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
195136be9a76SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
195236be9a76SAlex Elder 	if (ret)
195336be9a76SAlex Elder 		goto out;
195436be9a76SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
195536be9a76SAlex Elder 	if (ret)
195636be9a76SAlex Elder 		goto out;
195736be9a76SAlex Elder 
195836be9a76SAlex Elder 	ret = obj_request->result;
195936be9a76SAlex Elder 	if (ret < 0)
196036be9a76SAlex Elder 		goto out;
196123ed6e13SAlex Elder 	ret = 0;
1962903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
196336be9a76SAlex Elder 	if (version)
196436be9a76SAlex Elder 		*version = obj_request->version;
196536be9a76SAlex Elder out:
196636be9a76SAlex Elder 	if (obj_request)
196736be9a76SAlex Elder 		rbd_obj_request_put(obj_request);
196836be9a76SAlex Elder 	else
196936be9a76SAlex Elder 		ceph_release_page_vector(pages, page_count);
197036be9a76SAlex Elder 
197136be9a76SAlex Elder 	return ret;
197236be9a76SAlex Elder }
197336be9a76SAlex Elder 
1974bf0d5f50SAlex Elder static void rbd_request_fn(struct request_queue *q)
1975cc344fa1SAlex Elder 		__releases(q->queue_lock) __acquires(q->queue_lock)
1976bf0d5f50SAlex Elder {
1977bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = q->queuedata;
1978bf0d5f50SAlex Elder 	bool read_only = rbd_dev->mapping.read_only;
1979bf0d5f50SAlex Elder 	struct request *rq;
1980bf0d5f50SAlex Elder 	int result;
1981bf0d5f50SAlex Elder 
1982bf0d5f50SAlex Elder 	while ((rq = blk_fetch_request(q))) {
1983bf0d5f50SAlex Elder 		bool write_request = rq_data_dir(rq) == WRITE;
1984bf0d5f50SAlex Elder 		struct rbd_img_request *img_request;
1985bf0d5f50SAlex Elder 		u64 offset;
1986bf0d5f50SAlex Elder 		u64 length;
1987bf0d5f50SAlex Elder 
1988bf0d5f50SAlex Elder 		/* Ignore any non-FS requests that filter through. */
1989bf0d5f50SAlex Elder 
1990bf0d5f50SAlex Elder 		if (rq->cmd_type != REQ_TYPE_FS) {
19914dda41d3SAlex Elder 			dout("%s: non-fs request type %d\n", __func__,
19924dda41d3SAlex Elder 				(int) rq->cmd_type);
19934dda41d3SAlex Elder 			__blk_end_request_all(rq, 0);
19944dda41d3SAlex Elder 			continue;
19954dda41d3SAlex Elder 		}
19964dda41d3SAlex Elder 
19974dda41d3SAlex Elder 		/* Ignore/skip any zero-length requests */
19984dda41d3SAlex Elder 
19994dda41d3SAlex Elder 		offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
20004dda41d3SAlex Elder 		length = (u64) blk_rq_bytes(rq);
20014dda41d3SAlex Elder 
20024dda41d3SAlex Elder 		if (!length) {
20034dda41d3SAlex Elder 			dout("%s: zero-length request\n", __func__);
2004bf0d5f50SAlex Elder 			__blk_end_request_all(rq, 0);
2005bf0d5f50SAlex Elder 			continue;
2006bf0d5f50SAlex Elder 		}
2007bf0d5f50SAlex Elder 
2008bf0d5f50SAlex Elder 		spin_unlock_irq(q->queue_lock);
2009bf0d5f50SAlex Elder 
2010bf0d5f50SAlex Elder 		/* Disallow writes to a read-only device */
2011bf0d5f50SAlex Elder 
2012bf0d5f50SAlex Elder 		if (write_request) {
2013bf0d5f50SAlex Elder 			result = -EROFS;
2014bf0d5f50SAlex Elder 			if (read_only)
2015bf0d5f50SAlex Elder 				goto end_request;
2016bf0d5f50SAlex Elder 			rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
2017bf0d5f50SAlex Elder 		}
2018bf0d5f50SAlex Elder 
20196d292906SAlex Elder 		/*
20206d292906SAlex Elder 		 * Quit early if the mapped snapshot no longer
20216d292906SAlex Elder 		 * exists.  It's still possible the snapshot will
20226d292906SAlex Elder 		 * have disappeared by the time our request arrives
20236d292906SAlex Elder 		 * at the osd, but there's no sense in sending it if
20246d292906SAlex Elder 		 * we already know.
20256d292906SAlex Elder 		 */
20266d292906SAlex Elder 		if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
2027bf0d5f50SAlex Elder 			dout("request for non-existent snapshot");
2028bf0d5f50SAlex Elder 			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
2029bf0d5f50SAlex Elder 			result = -ENXIO;
2030bf0d5f50SAlex Elder 			goto end_request;
2031bf0d5f50SAlex Elder 		}
2032bf0d5f50SAlex Elder 
2033bf0d5f50SAlex Elder 		result = -EINVAL;
2034bf0d5f50SAlex Elder 		if (WARN_ON(offset && length > U64_MAX - offset + 1))
2035bf0d5f50SAlex Elder 			goto end_request;	/* Shouldn't happen */
2036bf0d5f50SAlex Elder 
2037bf0d5f50SAlex Elder 		result = -ENOMEM;
2038bf0d5f50SAlex Elder 		img_request = rbd_img_request_create(rbd_dev, offset, length,
2039bf0d5f50SAlex Elder 							write_request);
2040bf0d5f50SAlex Elder 		if (!img_request)
2041bf0d5f50SAlex Elder 			goto end_request;
2042bf0d5f50SAlex Elder 
2043bf0d5f50SAlex Elder 		img_request->rq = rq;
2044bf0d5f50SAlex Elder 
2045bf0d5f50SAlex Elder 		result = rbd_img_request_fill_bio(img_request, rq->bio);
2046bf0d5f50SAlex Elder 		if (!result)
2047bf0d5f50SAlex Elder 			result = rbd_img_request_submit(img_request);
2048bf0d5f50SAlex Elder 		if (result)
2049bf0d5f50SAlex Elder 			rbd_img_request_put(img_request);
2050bf0d5f50SAlex Elder end_request:
2051bf0d5f50SAlex Elder 		spin_lock_irq(q->queue_lock);
2052bf0d5f50SAlex Elder 		if (result < 0) {
2053bf0d5f50SAlex Elder 			rbd_warn(rbd_dev, "obj_request %s result %d\n",
2054bf0d5f50SAlex Elder 				write_request ? "write" : "read", result);
2055bf0d5f50SAlex Elder 			__blk_end_request_all(rq, result);
2056bf0d5f50SAlex Elder 		}
2057bf0d5f50SAlex Elder 	}
2058bf0d5f50SAlex Elder }
2059bf0d5f50SAlex Elder 
2060602adf40SYehuda Sadeh /*
2061602adf40SYehuda Sadeh  * a queue callback. Makes sure that we don't create a bio that spans across
2062602adf40SYehuda Sadeh  * multiple osd objects. One exception would be with a single page bios,
2063f7760dadSAlex Elder  * which we handle later at bio_chain_clone_range()
2064602adf40SYehuda Sadeh  */
2065602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
2066602adf40SYehuda Sadeh 			  struct bio_vec *bvec)
2067602adf40SYehuda Sadeh {
2068602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
2069e5cfeed2SAlex Elder 	sector_t sector_offset;
2070e5cfeed2SAlex Elder 	sector_t sectors_per_obj;
2071e5cfeed2SAlex Elder 	sector_t obj_sector_offset;
2072e5cfeed2SAlex Elder 	int ret;
2073602adf40SYehuda Sadeh 
2074e5cfeed2SAlex Elder 	/*
2075e5cfeed2SAlex Elder 	 * Find how far into its rbd object the partition-relative
2076e5cfeed2SAlex Elder 	 * bio start sector is to offset relative to the enclosing
2077e5cfeed2SAlex Elder 	 * device.
2078e5cfeed2SAlex Elder 	 */
2079e5cfeed2SAlex Elder 	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
2080e5cfeed2SAlex Elder 	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
2081e5cfeed2SAlex Elder 	obj_sector_offset = sector_offset & (sectors_per_obj - 1);
2082593a9e7bSAlex Elder 
2083e5cfeed2SAlex Elder 	/*
2084e5cfeed2SAlex Elder 	 * Compute the number of bytes from that offset to the end
2085e5cfeed2SAlex Elder 	 * of the object.  Account for what's already used by the bio.
2086e5cfeed2SAlex Elder 	 */
2087e5cfeed2SAlex Elder 	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
2088e5cfeed2SAlex Elder 	if (ret > bmd->bi_size)
2089e5cfeed2SAlex Elder 		ret -= bmd->bi_size;
2090e5cfeed2SAlex Elder 	else
2091e5cfeed2SAlex Elder 		ret = 0;
2092e5cfeed2SAlex Elder 
2093e5cfeed2SAlex Elder 	/*
2094e5cfeed2SAlex Elder 	 * Don't send back more than was asked for.  And if the bio
2095e5cfeed2SAlex Elder 	 * was empty, let the whole thing through because:  "Note
2096e5cfeed2SAlex Elder 	 * that a block device *must* allow a single page to be
2097e5cfeed2SAlex Elder 	 * added to an empty bio."
2098e5cfeed2SAlex Elder 	 */
2099e5cfeed2SAlex Elder 	rbd_assert(bvec->bv_len <= PAGE_SIZE);
2100e5cfeed2SAlex Elder 	if (ret > (int) bvec->bv_len || !bmd->bi_size)
2101e5cfeed2SAlex Elder 		ret = (int) bvec->bv_len;
2102e5cfeed2SAlex Elder 
2103e5cfeed2SAlex Elder 	return ret;
2104602adf40SYehuda Sadeh }
2105602adf40SYehuda Sadeh 
2106602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
2107602adf40SYehuda Sadeh {
2108602adf40SYehuda Sadeh 	struct gendisk *disk = rbd_dev->disk;
2109602adf40SYehuda Sadeh 
2110602adf40SYehuda Sadeh 	if (!disk)
2111602adf40SYehuda Sadeh 		return;
2112602adf40SYehuda Sadeh 
2113602adf40SYehuda Sadeh 	if (disk->flags & GENHD_FL_UP)
2114602adf40SYehuda Sadeh 		del_gendisk(disk);
2115602adf40SYehuda Sadeh 	if (disk->queue)
2116602adf40SYehuda Sadeh 		blk_cleanup_queue(disk->queue);
2117602adf40SYehuda Sadeh 	put_disk(disk);
2118602adf40SYehuda Sadeh }
2119602adf40SYehuda Sadeh 
2120788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
2121788e2df3SAlex Elder 				const char *object_name,
2122788e2df3SAlex Elder 				u64 offset, u64 length,
2123788e2df3SAlex Elder 				char *buf, u64 *version)
2124788e2df3SAlex Elder 
2125788e2df3SAlex Elder {
2126788e2df3SAlex Elder 	struct ceph_osd_req_op *op;
2127788e2df3SAlex Elder 	struct rbd_obj_request *obj_request;
2128788e2df3SAlex Elder 	struct ceph_osd_client *osdc;
2129788e2df3SAlex Elder 	struct page **pages = NULL;
2130788e2df3SAlex Elder 	u32 page_count;
21311ceae7efSAlex Elder 	size_t size;
2132788e2df3SAlex Elder 	int ret;
2133788e2df3SAlex Elder 
2134788e2df3SAlex Elder 	page_count = (u32) calc_pages_for(offset, length);
2135788e2df3SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2136788e2df3SAlex Elder 	if (IS_ERR(pages))
2137788e2df3SAlex Elder 		ret = PTR_ERR(pages);
2138788e2df3SAlex Elder 
2139788e2df3SAlex Elder 	ret = -ENOMEM;
2140788e2df3SAlex Elder 	obj_request = rbd_obj_request_create(object_name, offset, length,
2141788e2df3SAlex Elder 							OBJ_REQUEST_PAGES);
2142788e2df3SAlex Elder 	if (!obj_request)
2143788e2df3SAlex Elder 		goto out;
2144788e2df3SAlex Elder 
2145788e2df3SAlex Elder 	obj_request->pages = pages;
2146788e2df3SAlex Elder 	obj_request->page_count = page_count;
2147788e2df3SAlex Elder 
2148788e2df3SAlex Elder 	op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, offset, length);
2149788e2df3SAlex Elder 	if (!op)
2150788e2df3SAlex Elder 		goto out;
2151788e2df3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
2152788e2df3SAlex Elder 						obj_request, op);
2153788e2df3SAlex Elder 	rbd_osd_req_op_destroy(op);
2154788e2df3SAlex Elder 	if (!obj_request->osd_req)
2155788e2df3SAlex Elder 		goto out;
2156788e2df3SAlex Elder 
2157788e2df3SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
2158788e2df3SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
2159788e2df3SAlex Elder 	if (ret)
2160788e2df3SAlex Elder 		goto out;
2161788e2df3SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
2162788e2df3SAlex Elder 	if (ret)
2163788e2df3SAlex Elder 		goto out;
2164788e2df3SAlex Elder 
2165788e2df3SAlex Elder 	ret = obj_request->result;
2166788e2df3SAlex Elder 	if (ret < 0)
2167788e2df3SAlex Elder 		goto out;
21681ceae7efSAlex Elder 
21691ceae7efSAlex Elder 	rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
21701ceae7efSAlex Elder 	size = (size_t) obj_request->xferred;
2171903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, buf, 0, size);
217223ed6e13SAlex Elder 	rbd_assert(size <= (size_t) INT_MAX);
217323ed6e13SAlex Elder 	ret = (int) size;
2174788e2df3SAlex Elder 	if (version)
2175788e2df3SAlex Elder 		*version = obj_request->version;
2176788e2df3SAlex Elder out:
2177788e2df3SAlex Elder 	if (obj_request)
2178788e2df3SAlex Elder 		rbd_obj_request_put(obj_request);
2179788e2df3SAlex Elder 	else
2180788e2df3SAlex Elder 		ceph_release_page_vector(pages, page_count);
2181788e2df3SAlex Elder 
2182788e2df3SAlex Elder 	return ret;
2183788e2df3SAlex Elder }
2184788e2df3SAlex Elder 
2185602adf40SYehuda Sadeh /*
21864156d998SAlex Elder  * Read the complete header for the given rbd device.
21874156d998SAlex Elder  *
21884156d998SAlex Elder  * Returns a pointer to a dynamically-allocated buffer containing
21894156d998SAlex Elder  * the complete and validated header.  Caller can pass the address
21904156d998SAlex Elder  * of a variable that will be filled in with the version of the
21914156d998SAlex Elder  * header object at the time it was read.
21924156d998SAlex Elder  *
21934156d998SAlex Elder  * Returns a pointer-coded errno if a failure occurs.
21944156d998SAlex Elder  */
21954156d998SAlex Elder static struct rbd_image_header_ondisk *
21964156d998SAlex Elder rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
21974156d998SAlex Elder {
21984156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk = NULL;
21994156d998SAlex Elder 	u32 snap_count = 0;
22004156d998SAlex Elder 	u64 names_size = 0;
22014156d998SAlex Elder 	u32 want_count;
22024156d998SAlex Elder 	int ret;
22034156d998SAlex Elder 
22044156d998SAlex Elder 	/*
22054156d998SAlex Elder 	 * The complete header will include an array of its 64-bit
22064156d998SAlex Elder 	 * snapshot ids, followed by the names of those snapshots as
22074156d998SAlex Elder 	 * a contiguous block of NUL-terminated strings.  Note that
22084156d998SAlex Elder 	 * the number of snapshots could change by the time we read
22094156d998SAlex Elder 	 * it in, in which case we re-read it.
22104156d998SAlex Elder 	 */
22114156d998SAlex Elder 	do {
22124156d998SAlex Elder 		size_t size;
22134156d998SAlex Elder 
22144156d998SAlex Elder 		kfree(ondisk);
22154156d998SAlex Elder 
22164156d998SAlex Elder 		size = sizeof (*ondisk);
22174156d998SAlex Elder 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
22184156d998SAlex Elder 		size += names_size;
22194156d998SAlex Elder 		ondisk = kmalloc(size, GFP_KERNEL);
22204156d998SAlex Elder 		if (!ondisk)
22214156d998SAlex Elder 			return ERR_PTR(-ENOMEM);
22224156d998SAlex Elder 
2223788e2df3SAlex Elder 		ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
22244156d998SAlex Elder 				       0, size,
22254156d998SAlex Elder 				       (char *) ondisk, version);
22264156d998SAlex Elder 		if (ret < 0)
22274156d998SAlex Elder 			goto out_err;
22284156d998SAlex Elder 		if (WARN_ON((size_t) ret < size)) {
22294156d998SAlex Elder 			ret = -ENXIO;
223006ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
223106ecc6cbSAlex Elder 				size, ret);
22324156d998SAlex Elder 			goto out_err;
22334156d998SAlex Elder 		}
22344156d998SAlex Elder 		if (!rbd_dev_ondisk_valid(ondisk)) {
22354156d998SAlex Elder 			ret = -ENXIO;
223606ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "invalid header");
22374156d998SAlex Elder 			goto out_err;
22384156d998SAlex Elder 		}
22394156d998SAlex Elder 
22404156d998SAlex Elder 		names_size = le64_to_cpu(ondisk->snap_names_len);
22414156d998SAlex Elder 		want_count = snap_count;
22424156d998SAlex Elder 		snap_count = le32_to_cpu(ondisk->snap_count);
22434156d998SAlex Elder 	} while (snap_count != want_count);
22444156d998SAlex Elder 
22454156d998SAlex Elder 	return ondisk;
22464156d998SAlex Elder 
22474156d998SAlex Elder out_err:
22484156d998SAlex Elder 	kfree(ondisk);
22494156d998SAlex Elder 
22504156d998SAlex Elder 	return ERR_PTR(ret);
22514156d998SAlex Elder }
22524156d998SAlex Elder 
22534156d998SAlex Elder /*
2254602adf40SYehuda Sadeh  * reload the ondisk the header
2255602adf40SYehuda Sadeh  */
2256602adf40SYehuda Sadeh static int rbd_read_header(struct rbd_device *rbd_dev,
2257602adf40SYehuda Sadeh 			   struct rbd_image_header *header)
2258602adf40SYehuda Sadeh {
22594156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk;
22604156d998SAlex Elder 	u64 ver = 0;
22614156d998SAlex Elder 	int ret;
2262602adf40SYehuda Sadeh 
22634156d998SAlex Elder 	ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
22644156d998SAlex Elder 	if (IS_ERR(ondisk))
22654156d998SAlex Elder 		return PTR_ERR(ondisk);
22664156d998SAlex Elder 	ret = rbd_header_from_disk(header, ondisk);
22674156d998SAlex Elder 	if (ret >= 0)
226859c2be1eSYehuda Sadeh 		header->obj_version = ver;
22694156d998SAlex Elder 	kfree(ondisk);
2270602adf40SYehuda Sadeh 
22714156d998SAlex Elder 	return ret;
2272602adf40SYehuda Sadeh }
2273602adf40SYehuda Sadeh 
227441f38c2bSAlex Elder static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
2275dfc5606dSYehuda Sadeh {
2276dfc5606dSYehuda Sadeh 	struct rbd_snap *snap;
2277a0593290SAlex Elder 	struct rbd_snap *next;
2278dfc5606dSYehuda Sadeh 
2279a0593290SAlex Elder 	list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
228041f38c2bSAlex Elder 		rbd_remove_snap_dev(snap);
2281dfc5606dSYehuda Sadeh }
2282dfc5606dSYehuda Sadeh 
22839478554aSAlex Elder static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
22849478554aSAlex Elder {
22859478554aSAlex Elder 	sector_t size;
22869478554aSAlex Elder 
22870d7dbfceSAlex Elder 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
22889478554aSAlex Elder 		return;
22899478554aSAlex Elder 
22909478554aSAlex Elder 	size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
22919478554aSAlex Elder 	dout("setting size to %llu sectors", (unsigned long long) size);
22929478554aSAlex Elder 	rbd_dev->mapping.size = (u64) size;
22939478554aSAlex Elder 	set_capacity(rbd_dev->disk, size);
22949478554aSAlex Elder }
22959478554aSAlex Elder 
2296602adf40SYehuda Sadeh /*
2297602adf40SYehuda Sadeh  * only read the first part of the ondisk header, without the snaps info
2298602adf40SYehuda Sadeh  */
2299117973fbSAlex Elder static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
2300602adf40SYehuda Sadeh {
2301602adf40SYehuda Sadeh 	int ret;
2302602adf40SYehuda Sadeh 	struct rbd_image_header h;
2303602adf40SYehuda Sadeh 
2304602adf40SYehuda Sadeh 	ret = rbd_read_header(rbd_dev, &h);
2305602adf40SYehuda Sadeh 	if (ret < 0)
2306602adf40SYehuda Sadeh 		return ret;
2307602adf40SYehuda Sadeh 
2308a51aa0c0SJosh Durgin 	down_write(&rbd_dev->header_rwsem);
2309a51aa0c0SJosh Durgin 
23109478554aSAlex Elder 	/* Update image size, and check for resize of mapped image */
23119478554aSAlex Elder 	rbd_dev->header.image_size = h.image_size;
23129478554aSAlex Elder 	rbd_update_mapping_size(rbd_dev);
23139db4b3e3SSage Weil 
2314849b4260SAlex Elder 	/* rbd_dev->header.object_prefix shouldn't change */
2315602adf40SYehuda Sadeh 	kfree(rbd_dev->header.snap_sizes);
2316849b4260SAlex Elder 	kfree(rbd_dev->header.snap_names);
2317d1d25646SJosh Durgin 	/* osd requests may still refer to snapc */
2318d1d25646SJosh Durgin 	ceph_put_snap_context(rbd_dev->header.snapc);
2319602adf40SYehuda Sadeh 
2320b813623aSAlex Elder 	if (hver)
2321b813623aSAlex Elder 		*hver = h.obj_version;
2322a71b891bSJosh Durgin 	rbd_dev->header.obj_version = h.obj_version;
232393a24e08SJosh Durgin 	rbd_dev->header.image_size = h.image_size;
2324602adf40SYehuda Sadeh 	rbd_dev->header.snapc = h.snapc;
2325602adf40SYehuda Sadeh 	rbd_dev->header.snap_names = h.snap_names;
2326602adf40SYehuda Sadeh 	rbd_dev->header.snap_sizes = h.snap_sizes;
2327849b4260SAlex Elder 	/* Free the extra copy of the object prefix */
2328849b4260SAlex Elder 	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
2329849b4260SAlex Elder 	kfree(h.object_prefix);
2330849b4260SAlex Elder 
2331304f6808SAlex Elder 	ret = rbd_dev_snaps_update(rbd_dev);
2332304f6808SAlex Elder 	if (!ret)
2333304f6808SAlex Elder 		ret = rbd_dev_snaps_register(rbd_dev);
2334dfc5606dSYehuda Sadeh 
2335c666601aSJosh Durgin 	up_write(&rbd_dev->header_rwsem);
2336602adf40SYehuda Sadeh 
2337dfc5606dSYehuda Sadeh 	return ret;
2338602adf40SYehuda Sadeh }
2339602adf40SYehuda Sadeh 
2340117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
23411fe5e993SAlex Elder {
23421fe5e993SAlex Elder 	int ret;
23431fe5e993SAlex Elder 
2344117973fbSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
23451fe5e993SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2346117973fbSAlex Elder 	if (rbd_dev->image_format == 1)
2347117973fbSAlex Elder 		ret = rbd_dev_v1_refresh(rbd_dev, hver);
2348117973fbSAlex Elder 	else
2349117973fbSAlex Elder 		ret = rbd_dev_v2_refresh(rbd_dev, hver);
23501fe5e993SAlex Elder 	mutex_unlock(&ctl_mutex);
23511fe5e993SAlex Elder 
23521fe5e993SAlex Elder 	return ret;
23531fe5e993SAlex Elder }
23541fe5e993SAlex Elder 
2355602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
2356602adf40SYehuda Sadeh {
2357602adf40SYehuda Sadeh 	struct gendisk *disk;
2358602adf40SYehuda Sadeh 	struct request_queue *q;
2359593a9e7bSAlex Elder 	u64 segment_size;
2360602adf40SYehuda Sadeh 
2361602adf40SYehuda Sadeh 	/* create gendisk info */
2362602adf40SYehuda Sadeh 	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
2363602adf40SYehuda Sadeh 	if (!disk)
23641fcdb8aaSAlex Elder 		return -ENOMEM;
2365602adf40SYehuda Sadeh 
2366f0f8cef5SAlex Elder 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
2367de71a297SAlex Elder 		 rbd_dev->dev_id);
2368602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
2369602adf40SYehuda Sadeh 	disk->first_minor = 0;
2370602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
2371602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
2372602adf40SYehuda Sadeh 
2373bf0d5f50SAlex Elder 	q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
2374602adf40SYehuda Sadeh 	if (!q)
2375602adf40SYehuda Sadeh 		goto out_disk;
2376029bcbd8SJosh Durgin 
2377593a9e7bSAlex Elder 	/* We use the default size, but let's be explicit about it. */
2378593a9e7bSAlex Elder 	blk_queue_physical_block_size(q, SECTOR_SIZE);
2379593a9e7bSAlex Elder 
2380029bcbd8SJosh Durgin 	/* set io sizes to object size */
2381593a9e7bSAlex Elder 	segment_size = rbd_obj_bytes(&rbd_dev->header);
2382593a9e7bSAlex Elder 	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
2383593a9e7bSAlex Elder 	blk_queue_max_segment_size(q, segment_size);
2384593a9e7bSAlex Elder 	blk_queue_io_min(q, segment_size);
2385593a9e7bSAlex Elder 	blk_queue_io_opt(q, segment_size);
2386029bcbd8SJosh Durgin 
2387602adf40SYehuda Sadeh 	blk_queue_merge_bvec(q, rbd_merge_bvec);
2388602adf40SYehuda Sadeh 	disk->queue = q;
2389602adf40SYehuda Sadeh 
2390602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
2391602adf40SYehuda Sadeh 
2392602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
2393602adf40SYehuda Sadeh 
239412f02944SAlex Elder 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
239512f02944SAlex Elder 
2396602adf40SYehuda Sadeh 	return 0;
2397602adf40SYehuda Sadeh out_disk:
2398602adf40SYehuda Sadeh 	put_disk(disk);
23991fcdb8aaSAlex Elder 
24001fcdb8aaSAlex Elder 	return -ENOMEM;
2401602adf40SYehuda Sadeh }
2402602adf40SYehuda Sadeh 
2403dfc5606dSYehuda Sadeh /*
2404dfc5606dSYehuda Sadeh   sysfs
2405dfc5606dSYehuda Sadeh */
2406602adf40SYehuda Sadeh 
2407593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
2408593a9e7bSAlex Elder {
2409593a9e7bSAlex Elder 	return container_of(dev, struct rbd_device, dev);
2410593a9e7bSAlex Elder }
2411593a9e7bSAlex Elder 
2412dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
2413dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
2414602adf40SYehuda Sadeh {
2415593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2416a51aa0c0SJosh Durgin 	sector_t size;
2417dfc5606dSYehuda Sadeh 
2418a51aa0c0SJosh Durgin 	down_read(&rbd_dev->header_rwsem);
2419a51aa0c0SJosh Durgin 	size = get_capacity(rbd_dev->disk);
2420a51aa0c0SJosh Durgin 	up_read(&rbd_dev->header_rwsem);
2421a51aa0c0SJosh Durgin 
2422a51aa0c0SJosh Durgin 	return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
2423602adf40SYehuda Sadeh }
2424602adf40SYehuda Sadeh 
242534b13184SAlex Elder /*
242634b13184SAlex Elder  * Note this shows the features for whatever's mapped, which is not
242734b13184SAlex Elder  * necessarily the base image.
242834b13184SAlex Elder  */
242934b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev,
243034b13184SAlex Elder 			     struct device_attribute *attr, char *buf)
243134b13184SAlex Elder {
243234b13184SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
243334b13184SAlex Elder 
243434b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
243534b13184SAlex Elder 			(unsigned long long) rbd_dev->mapping.features);
243634b13184SAlex Elder }
243734b13184SAlex Elder 
2438dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
2439dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
2440602adf40SYehuda Sadeh {
2441593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2442dfc5606dSYehuda Sadeh 
2443dfc5606dSYehuda Sadeh 	return sprintf(buf, "%d\n", rbd_dev->major);
2444dfc5606dSYehuda Sadeh }
2445dfc5606dSYehuda Sadeh 
2446dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
2447dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
2448dfc5606dSYehuda Sadeh {
2449593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2450dfc5606dSYehuda Sadeh 
24511dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
24521dbb4399SAlex Elder 			ceph_client_id(rbd_dev->rbd_client->client));
2453dfc5606dSYehuda Sadeh }
2454dfc5606dSYehuda Sadeh 
2455dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
2456dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
2457dfc5606dSYehuda Sadeh {
2458593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2459dfc5606dSYehuda Sadeh 
24600d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
2461dfc5606dSYehuda Sadeh }
2462dfc5606dSYehuda Sadeh 
24639bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
24649bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
24659bb2f334SAlex Elder {
24669bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
24679bb2f334SAlex Elder 
24680d7dbfceSAlex Elder 	return sprintf(buf, "%llu\n",
24690d7dbfceSAlex Elder 		(unsigned long long) rbd_dev->spec->pool_id);
24709bb2f334SAlex Elder }
24719bb2f334SAlex Elder 
2472dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
2473dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
2474dfc5606dSYehuda Sadeh {
2475593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2476dfc5606dSYehuda Sadeh 
2477a92ffdf8SAlex Elder 	if (rbd_dev->spec->image_name)
24780d7dbfceSAlex Elder 		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
2479a92ffdf8SAlex Elder 
2480a92ffdf8SAlex Elder 	return sprintf(buf, "(unknown)\n");
2481dfc5606dSYehuda Sadeh }
2482dfc5606dSYehuda Sadeh 
2483589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev,
2484589d30e0SAlex Elder 			     struct device_attribute *attr, char *buf)
2485589d30e0SAlex Elder {
2486589d30e0SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2487589d30e0SAlex Elder 
24880d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
2489589d30e0SAlex Elder }
2490589d30e0SAlex Elder 
249134b13184SAlex Elder /*
249234b13184SAlex Elder  * Shows the name of the currently-mapped snapshot (or
249334b13184SAlex Elder  * RBD_SNAP_HEAD_NAME for the base image).
249434b13184SAlex Elder  */
2495dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
2496dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
2497dfc5606dSYehuda Sadeh 			     char *buf)
2498dfc5606dSYehuda Sadeh {
2499593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2500dfc5606dSYehuda Sadeh 
25010d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
2502dfc5606dSYehuda Sadeh }
2503dfc5606dSYehuda Sadeh 
250486b00e0dSAlex Elder /*
250586b00e0dSAlex Elder  * For an rbd v2 image, shows the pool id, image id, and snapshot id
250686b00e0dSAlex Elder  * for the parent image.  If there is no parent, simply shows
250786b00e0dSAlex Elder  * "(no parent image)".
250886b00e0dSAlex Elder  */
250986b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev,
251086b00e0dSAlex Elder 			     struct device_attribute *attr,
251186b00e0dSAlex Elder 			     char *buf)
251286b00e0dSAlex Elder {
251386b00e0dSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
251486b00e0dSAlex Elder 	struct rbd_spec *spec = rbd_dev->parent_spec;
251586b00e0dSAlex Elder 	int count;
251686b00e0dSAlex Elder 	char *bufp = buf;
251786b00e0dSAlex Elder 
251886b00e0dSAlex Elder 	if (!spec)
251986b00e0dSAlex Elder 		return sprintf(buf, "(no parent image)\n");
252086b00e0dSAlex Elder 
252186b00e0dSAlex Elder 	count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
252286b00e0dSAlex Elder 			(unsigned long long) spec->pool_id, spec->pool_name);
252386b00e0dSAlex Elder 	if (count < 0)
252486b00e0dSAlex Elder 		return count;
252586b00e0dSAlex Elder 	bufp += count;
252686b00e0dSAlex Elder 
252786b00e0dSAlex Elder 	count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
252886b00e0dSAlex Elder 			spec->image_name ? spec->image_name : "(unknown)");
252986b00e0dSAlex Elder 	if (count < 0)
253086b00e0dSAlex Elder 		return count;
253186b00e0dSAlex Elder 	bufp += count;
253286b00e0dSAlex Elder 
253386b00e0dSAlex Elder 	count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
253486b00e0dSAlex Elder 			(unsigned long long) spec->snap_id, spec->snap_name);
253586b00e0dSAlex Elder 	if (count < 0)
253686b00e0dSAlex Elder 		return count;
253786b00e0dSAlex Elder 	bufp += count;
253886b00e0dSAlex Elder 
253986b00e0dSAlex Elder 	count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
254086b00e0dSAlex Elder 	if (count < 0)
254186b00e0dSAlex Elder 		return count;
254286b00e0dSAlex Elder 	bufp += count;
254386b00e0dSAlex Elder 
254486b00e0dSAlex Elder 	return (ssize_t) (bufp - buf);
254586b00e0dSAlex Elder }
254686b00e0dSAlex Elder 
2547dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
2548dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
2549dfc5606dSYehuda Sadeh 				 const char *buf,
2550dfc5606dSYehuda Sadeh 				 size_t size)
2551dfc5606dSYehuda Sadeh {
2552593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2553b813623aSAlex Elder 	int ret;
2554602adf40SYehuda Sadeh 
2555117973fbSAlex Elder 	ret = rbd_dev_refresh(rbd_dev, NULL);
2556b813623aSAlex Elder 
2557b813623aSAlex Elder 	return ret < 0 ? ret : size;
2558dfc5606dSYehuda Sadeh }
2559602adf40SYehuda Sadeh 
/* Read-only attributes, each backed by the matching *_show() routine */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);

/* Write-only attribute: a write triggers rbd_image_refresh() */
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
2571dfc5606dSYehuda Sadeh 
/*
 * NULL-terminated list of the per-device attributes declared above;
 * referenced by rbd_attr_group.
 */
2572dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = {
2573dfc5606dSYehuda Sadeh 	&dev_attr_size.attr,
257434b13184SAlex Elder 	&dev_attr_features.attr,
2575dfc5606dSYehuda Sadeh 	&dev_attr_major.attr,
2576dfc5606dSYehuda Sadeh 	&dev_attr_client_id.attr,
2577dfc5606dSYehuda Sadeh 	&dev_attr_pool.attr,
25789bb2f334SAlex Elder 	&dev_attr_pool_id.attr,
2579dfc5606dSYehuda Sadeh 	&dev_attr_name.attr,
2580589d30e0SAlex Elder 	&dev_attr_image_id.attr,
2581dfc5606dSYehuda Sadeh 	&dev_attr_current_snap.attr,
258286b00e0dSAlex Elder 	&dev_attr_parent.attr,
2583dfc5606dSYehuda Sadeh 	&dev_attr_refresh.attr,
2584dfc5606dSYehuda Sadeh 	NULL
2585dfc5606dSYehuda Sadeh };
2586dfc5606dSYehuda Sadeh 
/* Unnamed attribute group holding the rbd_attrs list */
2587dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = {
2588dfc5606dSYehuda Sadeh 	.attrs = rbd_attrs,
2589dfc5606dSYehuda Sadeh };
2590dfc5606dSYehuda Sadeh 
/* NULL-terminated group list used as rbd_device_type.groups */
2591dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = {
2592dfc5606dSYehuda Sadeh 	&rbd_attr_group,
2593dfc5606dSYehuda Sadeh 	NULL
2594dfc5606dSYehuda Sadeh };
2595dfc5606dSYehuda Sadeh 
/*
 * Release callback installed in rbd_device_type.  The body is empty:
 * nothing is freed from this callback.
 * NOTE(review): presumably the rbd_device is freed elsewhere (see
 * rbd_dev_destroy()) -- confirm against the teardown path.
 */
2596dfc5606dSYehuda Sadeh static void rbd_sysfs_dev_release(struct device *dev)
2597dfc5606dSYehuda Sadeh {
2598dfc5606dSYehuda Sadeh }
2599dfc5606dSYehuda Sadeh 
/*
 * Device type for rbd devices: named "rbd", carries the per-device
 * attribute groups and the (empty) release callback.
 */
2600dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = {
2601dfc5606dSYehuda Sadeh 	.name		= "rbd",
2602dfc5606dSYehuda Sadeh 	.groups		= rbd_attr_groups,
2603dfc5606dSYehuda Sadeh 	.release	= rbd_sysfs_dev_release,
2604dfc5606dSYehuda Sadeh };
2605dfc5606dSYehuda Sadeh 
2606dfc5606dSYehuda Sadeh 
2607dfc5606dSYehuda Sadeh /*
2608dfc5606dSYehuda Sadeh   sysfs - snapshots
2609dfc5606dSYehuda Sadeh */
2610dfc5606dSYehuda Sadeh 
2611dfc5606dSYehuda Sadeh static ssize_t rbd_snap_size_show(struct device *dev,
2612dfc5606dSYehuda Sadeh 				  struct device_attribute *attr,
2613dfc5606dSYehuda Sadeh 				  char *buf)
2614dfc5606dSYehuda Sadeh {
2615dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2616dfc5606dSYehuda Sadeh 
26173591538fSJosh Durgin 	return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
2618dfc5606dSYehuda Sadeh }
2619dfc5606dSYehuda Sadeh 
2620dfc5606dSYehuda Sadeh static ssize_t rbd_snap_id_show(struct device *dev,
2621dfc5606dSYehuda Sadeh 				struct device_attribute *attr,
2622dfc5606dSYehuda Sadeh 				char *buf)
2623dfc5606dSYehuda Sadeh {
2624dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2625dfc5606dSYehuda Sadeh 
2626593a9e7bSAlex Elder 	return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
2627dfc5606dSYehuda Sadeh }
2628dfc5606dSYehuda Sadeh 
262934b13184SAlex Elder static ssize_t rbd_snap_features_show(struct device *dev,
263034b13184SAlex Elder 				struct device_attribute *attr,
263134b13184SAlex Elder 				char *buf)
263234b13184SAlex Elder {
263334b13184SAlex Elder 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
263434b13184SAlex Elder 
263534b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
263634b13184SAlex Elder 			(unsigned long long) snap->features);
263734b13184SAlex Elder }
263834b13184SAlex Elder 
/* Per-snapshot sysfs attributes; all read-only with only a show handler */
2639dfc5606dSYehuda Sadeh static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
2640dfc5606dSYehuda Sadeh static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
264134b13184SAlex Elder static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
2642dfc5606dSYehuda Sadeh 
/*
 * NULL-terminated list of the per-snapshot attributes; referenced by
 * rbd_snap_attr_group.
 */
2643dfc5606dSYehuda Sadeh static struct attribute *rbd_snap_attrs[] = {
2644dfc5606dSYehuda Sadeh 	&dev_attr_snap_size.attr,
2645dfc5606dSYehuda Sadeh 	&dev_attr_snap_id.attr,
264634b13184SAlex Elder 	&dev_attr_snap_features.attr,
2647dfc5606dSYehuda Sadeh 	NULL,
2648dfc5606dSYehuda Sadeh };
2649dfc5606dSYehuda Sadeh 
/* Unnamed attribute group holding the rbd_snap_attrs list */
2650dfc5606dSYehuda Sadeh static struct attribute_group rbd_snap_attr_group = {
2651dfc5606dSYehuda Sadeh 	.attrs = rbd_snap_attrs,
2652dfc5606dSYehuda Sadeh };
2653dfc5606dSYehuda Sadeh 
2654dfc5606dSYehuda Sadeh static void rbd_snap_dev_release(struct device *dev)
2655dfc5606dSYehuda Sadeh {
2656dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2657dfc5606dSYehuda Sadeh 	kfree(snap->name);
2658dfc5606dSYehuda Sadeh 	kfree(snap);
2659dfc5606dSYehuda Sadeh }
2660dfc5606dSYehuda Sadeh 
/* NULL-terminated group list used as rbd_snap_device_type.groups */
2661dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_snap_attr_groups[] = {
2662dfc5606dSYehuda Sadeh 	&rbd_snap_attr_group,
2663dfc5606dSYehuda Sadeh 	NULL
2664dfc5606dSYehuda Sadeh };
2665dfc5606dSYehuda Sadeh 
/*
 * Device type for snapshot devices: carries the per-snapshot
 * attribute groups and a release callback that frees the rbd_snap.
 * No .name is set.
 */
2666dfc5606dSYehuda Sadeh static struct device_type rbd_snap_device_type = {
2667dfc5606dSYehuda Sadeh 	.groups		= rbd_snap_attr_groups,
2668dfc5606dSYehuda Sadeh 	.release	= rbd_snap_dev_release,
2669dfc5606dSYehuda Sadeh };
2670dfc5606dSYehuda Sadeh 
26718b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
26728b8fb99cSAlex Elder {
26738b8fb99cSAlex Elder 	kref_get(&spec->kref);
26748b8fb99cSAlex Elder 
26758b8fb99cSAlex Elder 	return spec;
26768b8fb99cSAlex Elder }
26778b8fb99cSAlex Elder 
26788b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref);
26798b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec)
26808b8fb99cSAlex Elder {
26818b8fb99cSAlex Elder 	if (spec)
26828b8fb99cSAlex Elder 		kref_put(&spec->kref, rbd_spec_free);
26838b8fb99cSAlex Elder }
26848b8fb99cSAlex Elder 
26858b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void)
26868b8fb99cSAlex Elder {
26878b8fb99cSAlex Elder 	struct rbd_spec *spec;
26888b8fb99cSAlex Elder 
26898b8fb99cSAlex Elder 	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
26908b8fb99cSAlex Elder 	if (!spec)
26918b8fb99cSAlex Elder 		return NULL;
26928b8fb99cSAlex Elder 	kref_init(&spec->kref);
26938b8fb99cSAlex Elder 
26948b8fb99cSAlex Elder 	rbd_spec_put(rbd_spec_get(spec));	/* TEMPORARY */
26958b8fb99cSAlex Elder 
26968b8fb99cSAlex Elder 	return spec;
26978b8fb99cSAlex Elder }
26988b8fb99cSAlex Elder 
26998b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref)
27008b8fb99cSAlex Elder {
27018b8fb99cSAlex Elder 	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
27028b8fb99cSAlex Elder 
27038b8fb99cSAlex Elder 	kfree(spec->pool_name);
27048b8fb99cSAlex Elder 	kfree(spec->image_id);
27058b8fb99cSAlex Elder 	kfree(spec->image_name);
27068b8fb99cSAlex Elder 	kfree(spec->snap_name);
27078b8fb99cSAlex Elder 	kfree(spec);
27088b8fb99cSAlex Elder }
27098b8fb99cSAlex Elder 
2710cc344fa1SAlex Elder static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
2711c53d5893SAlex Elder 				struct rbd_spec *spec)
2712c53d5893SAlex Elder {
2713c53d5893SAlex Elder 	struct rbd_device *rbd_dev;
2714c53d5893SAlex Elder 
2715c53d5893SAlex Elder 	rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
2716c53d5893SAlex Elder 	if (!rbd_dev)
2717c53d5893SAlex Elder 		return NULL;
2718c53d5893SAlex Elder 
2719c53d5893SAlex Elder 	spin_lock_init(&rbd_dev->lock);
27206d292906SAlex Elder 	rbd_dev->flags = 0;
2721c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->node);
2722c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->snaps);
2723c53d5893SAlex Elder 	init_rwsem(&rbd_dev->header_rwsem);
2724c53d5893SAlex Elder 
2725c53d5893SAlex Elder 	rbd_dev->spec = spec;
2726c53d5893SAlex Elder 	rbd_dev->rbd_client = rbdc;
2727c53d5893SAlex Elder 
27280903e875SAlex Elder 	/* Initialize the layout used for all rbd requests */
27290903e875SAlex Elder 
27300903e875SAlex Elder 	rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
27310903e875SAlex Elder 	rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
27320903e875SAlex Elder 	rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
27330903e875SAlex Elder 	rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
27340903e875SAlex Elder 
2735c53d5893SAlex Elder 	return rbd_dev;
2736c53d5893SAlex Elder }
2737c53d5893SAlex Elder 
2738c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev)
2739c53d5893SAlex Elder {
274086b00e0dSAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
2741c53d5893SAlex Elder 	kfree(rbd_dev->header_name);
2742c53d5893SAlex Elder 	rbd_put_client(rbd_dev->rbd_client);
2743c53d5893SAlex Elder 	rbd_spec_put(rbd_dev->spec);
2744c53d5893SAlex Elder 	kfree(rbd_dev);
2745c53d5893SAlex Elder }
2746c53d5893SAlex Elder 
2747304f6808SAlex Elder static bool rbd_snap_registered(struct rbd_snap *snap)
2748304f6808SAlex Elder {
2749304f6808SAlex Elder 	bool ret = snap->dev.type == &rbd_snap_device_type;
2750304f6808SAlex Elder 	bool reg = device_is_registered(&snap->dev);
2751304f6808SAlex Elder 
2752304f6808SAlex Elder 	rbd_assert(!ret ^ reg);
2753304f6808SAlex Elder 
2754304f6808SAlex Elder 	return ret;
2755304f6808SAlex Elder }
2756304f6808SAlex Elder 
275741f38c2bSAlex Elder static void rbd_remove_snap_dev(struct rbd_snap *snap)
2758dfc5606dSYehuda Sadeh {
2759dfc5606dSYehuda Sadeh 	list_del(&snap->node);
2760304f6808SAlex Elder 	if (device_is_registered(&snap->dev))
2761dfc5606dSYehuda Sadeh 		device_unregister(&snap->dev);
2762dfc5606dSYehuda Sadeh }
2763dfc5606dSYehuda Sadeh 
276414e7085dSAlex Elder static int rbd_register_snap_dev(struct rbd_snap *snap,
2765dfc5606dSYehuda Sadeh 				  struct device *parent)
2766dfc5606dSYehuda Sadeh {
2767dfc5606dSYehuda Sadeh 	struct device *dev = &snap->dev;
2768dfc5606dSYehuda Sadeh 	int ret;
2769dfc5606dSYehuda Sadeh 
2770dfc5606dSYehuda Sadeh 	dev->type = &rbd_snap_device_type;
2771dfc5606dSYehuda Sadeh 	dev->parent = parent;
2772dfc5606dSYehuda Sadeh 	dev->release = rbd_snap_dev_release;
2773d4b125e9SAlex Elder 	dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
2774304f6808SAlex Elder 	dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2775304f6808SAlex Elder 
2776dfc5606dSYehuda Sadeh 	ret = device_register(dev);
2777dfc5606dSYehuda Sadeh 
2778dfc5606dSYehuda Sadeh 	return ret;
2779dfc5606dSYehuda Sadeh }
2780dfc5606dSYehuda Sadeh 
27814e891e0aSAlex Elder static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
2782c8d18425SAlex Elder 						const char *snap_name,
278334b13184SAlex Elder 						u64 snap_id, u64 snap_size,
278434b13184SAlex Elder 						u64 snap_features)
2785dfc5606dSYehuda Sadeh {
27864e891e0aSAlex Elder 	struct rbd_snap *snap;
2787dfc5606dSYehuda Sadeh 	int ret;
27884e891e0aSAlex Elder 
27894e891e0aSAlex Elder 	snap = kzalloc(sizeof (*snap), GFP_KERNEL);
2790dfc5606dSYehuda Sadeh 	if (!snap)
27914e891e0aSAlex Elder 		return ERR_PTR(-ENOMEM);
27924e891e0aSAlex Elder 
27934e891e0aSAlex Elder 	ret = -ENOMEM;
2794c8d18425SAlex Elder 	snap->name = kstrdup(snap_name, GFP_KERNEL);
27954e891e0aSAlex Elder 	if (!snap->name)
27964e891e0aSAlex Elder 		goto err;
27974e891e0aSAlex Elder 
2798c8d18425SAlex Elder 	snap->id = snap_id;
2799c8d18425SAlex Elder 	snap->size = snap_size;
280034b13184SAlex Elder 	snap->features = snap_features;
28014e891e0aSAlex Elder 
28024e891e0aSAlex Elder 	return snap;
28034e891e0aSAlex Elder 
2804dfc5606dSYehuda Sadeh err:
2805dfc5606dSYehuda Sadeh 	kfree(snap->name);
2806dfc5606dSYehuda Sadeh 	kfree(snap);
28074e891e0aSAlex Elder 
28084e891e0aSAlex Elder 	return ERR_PTR(ret);
2809dfc5606dSYehuda Sadeh }
2810dfc5606dSYehuda Sadeh 
2811cd892126SAlex Elder static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2812cd892126SAlex Elder 		u64 *snap_size, u64 *snap_features)
2813cd892126SAlex Elder {
2814cd892126SAlex Elder 	char *snap_name;
2815cd892126SAlex Elder 
2816cd892126SAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2817cd892126SAlex Elder 
2818cd892126SAlex Elder 	*snap_size = rbd_dev->header.snap_sizes[which];
2819cd892126SAlex Elder 	*snap_features = 0;	/* No features for v1 */
2820cd892126SAlex Elder 
2821cd892126SAlex Elder 	/* Skip over names until we find the one we are looking for */
2822cd892126SAlex Elder 
2823cd892126SAlex Elder 	snap_name = rbd_dev->header.snap_names;
2824cd892126SAlex Elder 	while (which--)
2825cd892126SAlex Elder 		snap_name += strlen(snap_name) + 1;
2826cd892126SAlex Elder 
2827cd892126SAlex Elder 	return snap_name;
2828cd892126SAlex Elder }
2829cd892126SAlex Elder 
2830dfc5606dSYehuda Sadeh /*
28319d475de5SAlex Elder  * Get the size and object order for an image snapshot, or if
28329d475de5SAlex Elder  * snap_id is CEPH_NOSNAP, gets this information for the base
28339d475de5SAlex Elder  * image.
28349d475de5SAlex Elder  */
28359d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
28369d475de5SAlex Elder 				u8 *order, u64 *snap_size)
28379d475de5SAlex Elder {
28389d475de5SAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
28399d475de5SAlex Elder 	int ret;
28409d475de5SAlex Elder 	struct {
28419d475de5SAlex Elder 		u8 order;
28429d475de5SAlex Elder 		__le64 size;
28439d475de5SAlex Elder 	} __attribute__ ((packed)) size_buf = { 0 };
28449d475de5SAlex Elder 
284536be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
28469d475de5SAlex Elder 				"rbd", "get_size",
28479d475de5SAlex Elder 				(char *) &snapid, sizeof (snapid),
284807b2391fSAlex Elder 				(char *) &size_buf, sizeof (size_buf), NULL);
284936be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
28509d475de5SAlex Elder 	if (ret < 0)
28519d475de5SAlex Elder 		return ret;
28529d475de5SAlex Elder 
28539d475de5SAlex Elder 	*order = size_buf.order;
28549d475de5SAlex Elder 	*snap_size = le64_to_cpu(size_buf.size);
28559d475de5SAlex Elder 
28569d475de5SAlex Elder 	dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
28579d475de5SAlex Elder 		(unsigned long long) snap_id, (unsigned int) *order,
28589d475de5SAlex Elder 		(unsigned long long) *snap_size);
28599d475de5SAlex Elder 
28609d475de5SAlex Elder 	return 0;
28619d475de5SAlex Elder }
28629d475de5SAlex Elder 
28639d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
28649d475de5SAlex Elder {
28659d475de5SAlex Elder 	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
28669d475de5SAlex Elder 					&rbd_dev->header.obj_order,
28679d475de5SAlex Elder 					&rbd_dev->header.image_size);
28689d475de5SAlex Elder }
28699d475de5SAlex Elder 
28701e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
28711e130199SAlex Elder {
28721e130199SAlex Elder 	void *reply_buf;
28731e130199SAlex Elder 	int ret;
28741e130199SAlex Elder 	void *p;
28751e130199SAlex Elder 
28761e130199SAlex Elder 	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
28771e130199SAlex Elder 	if (!reply_buf)
28781e130199SAlex Elder 		return -ENOMEM;
28791e130199SAlex Elder 
288036be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
28811e130199SAlex Elder 				"rbd", "get_object_prefix",
28821e130199SAlex Elder 				NULL, 0,
288307b2391fSAlex Elder 				reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
288436be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
28851e130199SAlex Elder 	if (ret < 0)
28861e130199SAlex Elder 		goto out;
28871e130199SAlex Elder 
28881e130199SAlex Elder 	p = reply_buf;
28891e130199SAlex Elder 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
28901e130199SAlex Elder 						p + RBD_OBJ_PREFIX_LEN_MAX,
28911e130199SAlex Elder 						NULL, GFP_NOIO);
28921e130199SAlex Elder 
28931e130199SAlex Elder 	if (IS_ERR(rbd_dev->header.object_prefix)) {
28941e130199SAlex Elder 		ret = PTR_ERR(rbd_dev->header.object_prefix);
28951e130199SAlex Elder 		rbd_dev->header.object_prefix = NULL;
28961e130199SAlex Elder 	} else {
28971e130199SAlex Elder 		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
28981e130199SAlex Elder 	}
28991e130199SAlex Elder 
29001e130199SAlex Elder out:
29011e130199SAlex Elder 	kfree(reply_buf);
29021e130199SAlex Elder 
29031e130199SAlex Elder 	return ret;
29041e130199SAlex Elder }
29051e130199SAlex Elder 
2906b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
2907b1b5402aSAlex Elder 		u64 *snap_features)
2908b1b5402aSAlex Elder {
2909b1b5402aSAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
2910b1b5402aSAlex Elder 	struct {
2911b1b5402aSAlex Elder 		__le64 features;
2912b1b5402aSAlex Elder 		__le64 incompat;
2913b1b5402aSAlex Elder 	} features_buf = { 0 };
2914d889140cSAlex Elder 	u64 incompat;
2915b1b5402aSAlex Elder 	int ret;
2916b1b5402aSAlex Elder 
291736be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
2918b1b5402aSAlex Elder 				"rbd", "get_features",
2919b1b5402aSAlex Elder 				(char *) &snapid, sizeof (snapid),
2920b1b5402aSAlex Elder 				(char *) &features_buf, sizeof (features_buf),
292107b2391fSAlex Elder 				NULL);
292236be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
2923b1b5402aSAlex Elder 	if (ret < 0)
2924b1b5402aSAlex Elder 		return ret;
2925d889140cSAlex Elder 
2926d889140cSAlex Elder 	incompat = le64_to_cpu(features_buf.incompat);
2927d889140cSAlex Elder 	if (incompat & ~RBD_FEATURES_ALL)
2928b8f5c6edSAlex Elder 		return -ENXIO;
2929d889140cSAlex Elder 
2930b1b5402aSAlex Elder 	*snap_features = le64_to_cpu(features_buf.features);
2931b1b5402aSAlex Elder 
2932b1b5402aSAlex Elder 	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
2933b1b5402aSAlex Elder 		(unsigned long long) snap_id,
2934b1b5402aSAlex Elder 		(unsigned long long) *snap_features,
2935b1b5402aSAlex Elder 		(unsigned long long) le64_to_cpu(features_buf.incompat));
2936b1b5402aSAlex Elder 
2937b1b5402aSAlex Elder 	return 0;
2938b1b5402aSAlex Elder }
2939b1b5402aSAlex Elder 
2940b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
2941b1b5402aSAlex Elder {
2942b1b5402aSAlex Elder 	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
2943b1b5402aSAlex Elder 						&rbd_dev->header.features);
2944b1b5402aSAlex Elder }
2945b1b5402aSAlex Elder 
294686b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
294786b00e0dSAlex Elder {
294886b00e0dSAlex Elder 	struct rbd_spec *parent_spec;
294986b00e0dSAlex Elder 	size_t size;
295086b00e0dSAlex Elder 	void *reply_buf = NULL;
295186b00e0dSAlex Elder 	__le64 snapid;
295286b00e0dSAlex Elder 	void *p;
295386b00e0dSAlex Elder 	void *end;
295486b00e0dSAlex Elder 	char *image_id;
295586b00e0dSAlex Elder 	u64 overlap;
295686b00e0dSAlex Elder 	int ret;
295786b00e0dSAlex Elder 
295886b00e0dSAlex Elder 	parent_spec = rbd_spec_alloc();
295986b00e0dSAlex Elder 	if (!parent_spec)
296086b00e0dSAlex Elder 		return -ENOMEM;
296186b00e0dSAlex Elder 
296286b00e0dSAlex Elder 	size = sizeof (__le64) +				/* pool_id */
296386b00e0dSAlex Elder 		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
296486b00e0dSAlex Elder 		sizeof (__le64) +				/* snap_id */
296586b00e0dSAlex Elder 		sizeof (__le64);				/* overlap */
296686b00e0dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
296786b00e0dSAlex Elder 	if (!reply_buf) {
296886b00e0dSAlex Elder 		ret = -ENOMEM;
296986b00e0dSAlex Elder 		goto out_err;
297086b00e0dSAlex Elder 	}
297186b00e0dSAlex Elder 
297286b00e0dSAlex Elder 	snapid = cpu_to_le64(CEPH_NOSNAP);
297336be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
297486b00e0dSAlex Elder 				"rbd", "get_parent",
297586b00e0dSAlex Elder 				(char *) &snapid, sizeof (snapid),
297607b2391fSAlex Elder 				(char *) reply_buf, size, NULL);
297736be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
297886b00e0dSAlex Elder 	if (ret < 0)
297986b00e0dSAlex Elder 		goto out_err;
298086b00e0dSAlex Elder 
298186b00e0dSAlex Elder 	ret = -ERANGE;
298286b00e0dSAlex Elder 	p = reply_buf;
298386b00e0dSAlex Elder 	end = (char *) reply_buf + size;
298486b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
298586b00e0dSAlex Elder 	if (parent_spec->pool_id == CEPH_NOPOOL)
298686b00e0dSAlex Elder 		goto out;	/* No parent?  No problem. */
298786b00e0dSAlex Elder 
29880903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
29890903e875SAlex Elder 
29900903e875SAlex Elder 	ret = -EIO;
29910903e875SAlex Elder 	if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
29920903e875SAlex Elder 		goto out;
29930903e875SAlex Elder 
2994979ed480SAlex Elder 	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
299586b00e0dSAlex Elder 	if (IS_ERR(image_id)) {
299686b00e0dSAlex Elder 		ret = PTR_ERR(image_id);
299786b00e0dSAlex Elder 		goto out_err;
299886b00e0dSAlex Elder 	}
299986b00e0dSAlex Elder 	parent_spec->image_id = image_id;
300086b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
300186b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, overlap, out_err);
300286b00e0dSAlex Elder 
300386b00e0dSAlex Elder 	rbd_dev->parent_overlap = overlap;
300486b00e0dSAlex Elder 	rbd_dev->parent_spec = parent_spec;
300586b00e0dSAlex Elder 	parent_spec = NULL;	/* rbd_dev now owns this */
300686b00e0dSAlex Elder out:
300786b00e0dSAlex Elder 	ret = 0;
300886b00e0dSAlex Elder out_err:
300986b00e0dSAlex Elder 	kfree(reply_buf);
301086b00e0dSAlex Elder 	rbd_spec_put(parent_spec);
301186b00e0dSAlex Elder 
301286b00e0dSAlex Elder 	return ret;
301386b00e0dSAlex Elder }
301486b00e0dSAlex Elder 
30159e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
30169e15b77dSAlex Elder {
30179e15b77dSAlex Elder 	size_t image_id_size;
30189e15b77dSAlex Elder 	char *image_id;
30199e15b77dSAlex Elder 	void *p;
30209e15b77dSAlex Elder 	void *end;
30219e15b77dSAlex Elder 	size_t size;
30229e15b77dSAlex Elder 	void *reply_buf = NULL;
30239e15b77dSAlex Elder 	size_t len = 0;
30249e15b77dSAlex Elder 	char *image_name = NULL;
30259e15b77dSAlex Elder 	int ret;
30269e15b77dSAlex Elder 
30279e15b77dSAlex Elder 	rbd_assert(!rbd_dev->spec->image_name);
30289e15b77dSAlex Elder 
302969e7a02fSAlex Elder 	len = strlen(rbd_dev->spec->image_id);
303069e7a02fSAlex Elder 	image_id_size = sizeof (__le32) + len;
30319e15b77dSAlex Elder 	image_id = kmalloc(image_id_size, GFP_KERNEL);
30329e15b77dSAlex Elder 	if (!image_id)
30339e15b77dSAlex Elder 		return NULL;
30349e15b77dSAlex Elder 
30359e15b77dSAlex Elder 	p = image_id;
30369e15b77dSAlex Elder 	end = (char *) image_id + image_id_size;
303769e7a02fSAlex Elder 	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);
30389e15b77dSAlex Elder 
30399e15b77dSAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
30409e15b77dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
30419e15b77dSAlex Elder 	if (!reply_buf)
30429e15b77dSAlex Elder 		goto out;
30439e15b77dSAlex Elder 
304436be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
30459e15b77dSAlex Elder 				"rbd", "dir_get_name",
30469e15b77dSAlex Elder 				image_id, image_id_size,
304707b2391fSAlex Elder 				(char *) reply_buf, size, NULL);
30489e15b77dSAlex Elder 	if (ret < 0)
30499e15b77dSAlex Elder 		goto out;
30509e15b77dSAlex Elder 	p = reply_buf;
30519e15b77dSAlex Elder 	end = (char *) reply_buf + size;
30529e15b77dSAlex Elder 	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
30539e15b77dSAlex Elder 	if (IS_ERR(image_name))
30549e15b77dSAlex Elder 		image_name = NULL;
30559e15b77dSAlex Elder 	else
30569e15b77dSAlex Elder 		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
30579e15b77dSAlex Elder out:
30589e15b77dSAlex Elder 	kfree(reply_buf);
30599e15b77dSAlex Elder 	kfree(image_id);
30609e15b77dSAlex Elder 
30619e15b77dSAlex Elder 	return image_name;
30629e15b77dSAlex Elder }
30639e15b77dSAlex Elder 
30649e15b77dSAlex Elder /*
30659e15b77dSAlex Elder  * When a parent image gets probed, we only have the pool, image,
30669e15b77dSAlex Elder  * and snapshot ids but not the names of any of them.  This call
30679e15b77dSAlex Elder  * is made later to fill in those names.  It has to be done after
30689e15b77dSAlex Elder  * rbd_dev_snaps_update() has completed because some of the
30699e15b77dSAlex Elder  * information (in particular, snapshot name) is not available
30709e15b77dSAlex Elder  * until then.
30719e15b77dSAlex Elder  */
30729e15b77dSAlex Elder static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
30739e15b77dSAlex Elder {
30749e15b77dSAlex Elder 	struct ceph_osd_client *osdc;
30759e15b77dSAlex Elder 	const char *name;
30769e15b77dSAlex Elder 	void *reply_buf = NULL;
30779e15b77dSAlex Elder 	int ret;
30789e15b77dSAlex Elder 
30799e15b77dSAlex Elder 	if (rbd_dev->spec->pool_name)
30809e15b77dSAlex Elder 		return 0;	/* Already have the names */
30819e15b77dSAlex Elder 
30829e15b77dSAlex Elder 	/* Look up the pool name */
30839e15b77dSAlex Elder 
30849e15b77dSAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
30859e15b77dSAlex Elder 	name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
3086935dc89fSAlex Elder 	if (!name) {
3087935dc89fSAlex Elder 		rbd_warn(rbd_dev, "there is no pool with id %llu",
3088935dc89fSAlex Elder 			rbd_dev->spec->pool_id);	/* Really a BUG() */
3089935dc89fSAlex Elder 		return -EIO;
3090935dc89fSAlex Elder 	}
30919e15b77dSAlex Elder 
30929e15b77dSAlex Elder 	rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
30939e15b77dSAlex Elder 	if (!rbd_dev->spec->pool_name)
30949e15b77dSAlex Elder 		return -ENOMEM;
30959e15b77dSAlex Elder 
30969e15b77dSAlex Elder 	/* Fetch the image name; tolerate failure here */
30979e15b77dSAlex Elder 
30989e15b77dSAlex Elder 	name = rbd_dev_image_name(rbd_dev);
309969e7a02fSAlex Elder 	if (name)
31009e15b77dSAlex Elder 		rbd_dev->spec->image_name = (char *) name;
310169e7a02fSAlex Elder 	else
310206ecc6cbSAlex Elder 		rbd_warn(rbd_dev, "unable to get image name");
31039e15b77dSAlex Elder 
31049e15b77dSAlex Elder 	/* Look up the snapshot name. */
31059e15b77dSAlex Elder 
31069e15b77dSAlex Elder 	name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
31079e15b77dSAlex Elder 	if (!name) {
3108935dc89fSAlex Elder 		rbd_warn(rbd_dev, "no snapshot with id %llu",
3109935dc89fSAlex Elder 			rbd_dev->spec->snap_id);	/* Really a BUG() */
31109e15b77dSAlex Elder 		ret = -EIO;
31119e15b77dSAlex Elder 		goto out_err;
31129e15b77dSAlex Elder 	}
31139e15b77dSAlex Elder 	rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
31149e15b77dSAlex Elder 	if(!rbd_dev->spec->snap_name)
31159e15b77dSAlex Elder 		goto out_err;
31169e15b77dSAlex Elder 
31179e15b77dSAlex Elder 	return 0;
31189e15b77dSAlex Elder out_err:
31199e15b77dSAlex Elder 	kfree(reply_buf);
31209e15b77dSAlex Elder 	kfree(rbd_dev->spec->pool_name);
31219e15b77dSAlex Elder 	rbd_dev->spec->pool_name = NULL;
31229e15b77dSAlex Elder 
31239e15b77dSAlex Elder 	return ret;
31249e15b77dSAlex Elder }
31259e15b77dSAlex Elder 
31266e14b1a6SAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
312735d489f9SAlex Elder {
312835d489f9SAlex Elder 	size_t size;
312935d489f9SAlex Elder 	int ret;
313035d489f9SAlex Elder 	void *reply_buf;
313135d489f9SAlex Elder 	void *p;
313235d489f9SAlex Elder 	void *end;
313335d489f9SAlex Elder 	u64 seq;
313435d489f9SAlex Elder 	u32 snap_count;
313535d489f9SAlex Elder 	struct ceph_snap_context *snapc;
313635d489f9SAlex Elder 	u32 i;
313735d489f9SAlex Elder 
313835d489f9SAlex Elder 	/*
313935d489f9SAlex Elder 	 * We'll need room for the seq value (maximum snapshot id),
314035d489f9SAlex Elder 	 * snapshot count, and array of that many snapshot ids.
314135d489f9SAlex Elder 	 * For now we have a fixed upper limit on the number we're
314235d489f9SAlex Elder 	 * prepared to receive.
314335d489f9SAlex Elder 	 */
314435d489f9SAlex Elder 	size = sizeof (__le64) + sizeof (__le32) +
314535d489f9SAlex Elder 			RBD_MAX_SNAP_COUNT * sizeof (__le64);
314635d489f9SAlex Elder 	reply_buf = kzalloc(size, GFP_KERNEL);
314735d489f9SAlex Elder 	if (!reply_buf)
314835d489f9SAlex Elder 		return -ENOMEM;
314935d489f9SAlex Elder 
315036be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
315135d489f9SAlex Elder 				"rbd", "get_snapcontext",
315235d489f9SAlex Elder 				NULL, 0,
315307b2391fSAlex Elder 				reply_buf, size, ver);
315436be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
315535d489f9SAlex Elder 	if (ret < 0)
315635d489f9SAlex Elder 		goto out;
315735d489f9SAlex Elder 
315835d489f9SAlex Elder 	ret = -ERANGE;
315935d489f9SAlex Elder 	p = reply_buf;
316035d489f9SAlex Elder 	end = (char *) reply_buf + size;
316135d489f9SAlex Elder 	ceph_decode_64_safe(&p, end, seq, out);
316235d489f9SAlex Elder 	ceph_decode_32_safe(&p, end, snap_count, out);
316335d489f9SAlex Elder 
316435d489f9SAlex Elder 	/*
316535d489f9SAlex Elder 	 * Make sure the reported number of snapshot ids wouldn't go
316635d489f9SAlex Elder 	 * beyond the end of our buffer.  But before checking that,
316735d489f9SAlex Elder 	 * make sure the computed size of the snapshot context we
316835d489f9SAlex Elder 	 * allocate is representable in a size_t.
316935d489f9SAlex Elder 	 */
317035d489f9SAlex Elder 	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
317135d489f9SAlex Elder 				 / sizeof (u64)) {
317235d489f9SAlex Elder 		ret = -EINVAL;
317335d489f9SAlex Elder 		goto out;
317435d489f9SAlex Elder 	}
317535d489f9SAlex Elder 	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
317635d489f9SAlex Elder 		goto out;
317735d489f9SAlex Elder 
317835d489f9SAlex Elder 	size = sizeof (struct ceph_snap_context) +
317935d489f9SAlex Elder 				snap_count * sizeof (snapc->snaps[0]);
318035d489f9SAlex Elder 	snapc = kmalloc(size, GFP_KERNEL);
318135d489f9SAlex Elder 	if (!snapc) {
318235d489f9SAlex Elder 		ret = -ENOMEM;
318335d489f9SAlex Elder 		goto out;
318435d489f9SAlex Elder 	}
318535d489f9SAlex Elder 
318635d489f9SAlex Elder 	atomic_set(&snapc->nref, 1);
318735d489f9SAlex Elder 	snapc->seq = seq;
318835d489f9SAlex Elder 	snapc->num_snaps = snap_count;
318935d489f9SAlex Elder 	for (i = 0; i < snap_count; i++)
319035d489f9SAlex Elder 		snapc->snaps[i] = ceph_decode_64(&p);
319135d489f9SAlex Elder 
319235d489f9SAlex Elder 	rbd_dev->header.snapc = snapc;
319335d489f9SAlex Elder 
319435d489f9SAlex Elder 	dout("  snap context seq = %llu, snap_count = %u\n",
319535d489f9SAlex Elder 		(unsigned long long) seq, (unsigned int) snap_count);
319635d489f9SAlex Elder 
319735d489f9SAlex Elder out:
319835d489f9SAlex Elder 	kfree(reply_buf);
319935d489f9SAlex Elder 
320035d489f9SAlex Elder 	return 0;
320135d489f9SAlex Elder }
320235d489f9SAlex Elder 
3203b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
3204b8b1e2dbSAlex Elder {
3205b8b1e2dbSAlex Elder 	size_t size;
3206b8b1e2dbSAlex Elder 	void *reply_buf;
3207b8b1e2dbSAlex Elder 	__le64 snap_id;
3208b8b1e2dbSAlex Elder 	int ret;
3209b8b1e2dbSAlex Elder 	void *p;
3210b8b1e2dbSAlex Elder 	void *end;
3211b8b1e2dbSAlex Elder 	char *snap_name;
3212b8b1e2dbSAlex Elder 
3213b8b1e2dbSAlex Elder 	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
3214b8b1e2dbSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
3215b8b1e2dbSAlex Elder 	if (!reply_buf)
3216b8b1e2dbSAlex Elder 		return ERR_PTR(-ENOMEM);
3217b8b1e2dbSAlex Elder 
3218b8b1e2dbSAlex Elder 	snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
321936be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3220b8b1e2dbSAlex Elder 				"rbd", "get_snapshot_name",
3221b8b1e2dbSAlex Elder 				(char *) &snap_id, sizeof (snap_id),
322207b2391fSAlex Elder 				reply_buf, size, NULL);
322336be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3224b8b1e2dbSAlex Elder 	if (ret < 0)
3225b8b1e2dbSAlex Elder 		goto out;
3226b8b1e2dbSAlex Elder 
3227b8b1e2dbSAlex Elder 	p = reply_buf;
3228b8b1e2dbSAlex Elder 	end = (char *) reply_buf + size;
3229e5c35534SAlex Elder 	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
3230b8b1e2dbSAlex Elder 	if (IS_ERR(snap_name)) {
3231b8b1e2dbSAlex Elder 		ret = PTR_ERR(snap_name);
3232b8b1e2dbSAlex Elder 		goto out;
3233b8b1e2dbSAlex Elder 	} else {
3234b8b1e2dbSAlex Elder 		dout("  snap_id 0x%016llx snap_name = %s\n",
3235b8b1e2dbSAlex Elder 			(unsigned long long) le64_to_cpu(snap_id), snap_name);
3236b8b1e2dbSAlex Elder 	}
3237b8b1e2dbSAlex Elder 	kfree(reply_buf);
3238b8b1e2dbSAlex Elder 
3239b8b1e2dbSAlex Elder 	return snap_name;
3240b8b1e2dbSAlex Elder out:
3241b8b1e2dbSAlex Elder 	kfree(reply_buf);
3242b8b1e2dbSAlex Elder 
3243b8b1e2dbSAlex Elder 	return ERR_PTR(ret);
3244b8b1e2dbSAlex Elder }
3245b8b1e2dbSAlex Elder 
3246b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
3247b8b1e2dbSAlex Elder 		u64 *snap_size, u64 *snap_features)
3248b8b1e2dbSAlex Elder {
3249e0b49868SAlex Elder 	u64 snap_id;
3250b8b1e2dbSAlex Elder 	u8 order;
3251b8b1e2dbSAlex Elder 	int ret;
3252b8b1e2dbSAlex Elder 
3253b8b1e2dbSAlex Elder 	snap_id = rbd_dev->header.snapc->snaps[which];
3254b8b1e2dbSAlex Elder 	ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
3255b8b1e2dbSAlex Elder 	if (ret)
3256b8b1e2dbSAlex Elder 		return ERR_PTR(ret);
3257b8b1e2dbSAlex Elder 	ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
3258b8b1e2dbSAlex Elder 	if (ret)
3259b8b1e2dbSAlex Elder 		return ERR_PTR(ret);
3260b8b1e2dbSAlex Elder 
3261b8b1e2dbSAlex Elder 	return rbd_dev_v2_snap_name(rbd_dev, which);
3262b8b1e2dbSAlex Elder }
3263b8b1e2dbSAlex Elder 
3264b8b1e2dbSAlex Elder static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
3265b8b1e2dbSAlex Elder 		u64 *snap_size, u64 *snap_features)
3266b8b1e2dbSAlex Elder {
3267b8b1e2dbSAlex Elder 	if (rbd_dev->image_format == 1)
3268b8b1e2dbSAlex Elder 		return rbd_dev_v1_snap_info(rbd_dev, which,
3269b8b1e2dbSAlex Elder 					snap_size, snap_features);
3270b8b1e2dbSAlex Elder 	if (rbd_dev->image_format == 2)
3271b8b1e2dbSAlex Elder 		return rbd_dev_v2_snap_info(rbd_dev, which,
3272b8b1e2dbSAlex Elder 					snap_size, snap_features);
3273b8b1e2dbSAlex Elder 	return ERR_PTR(-EINVAL);
3274b8b1e2dbSAlex Elder }
3275b8b1e2dbSAlex Elder 
3276117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
3277117973fbSAlex Elder {
3278117973fbSAlex Elder 	int ret;
3279117973fbSAlex Elder 	__u8 obj_order;
3280117973fbSAlex Elder 
3281117973fbSAlex Elder 	down_write(&rbd_dev->header_rwsem);
3282117973fbSAlex Elder 
3283117973fbSAlex Elder 	/* Grab old order first, to see if it changes */
3284117973fbSAlex Elder 
3285117973fbSAlex Elder 	obj_order = rbd_dev->header.obj_order,
3286117973fbSAlex Elder 	ret = rbd_dev_v2_image_size(rbd_dev);
3287117973fbSAlex Elder 	if (ret)
3288117973fbSAlex Elder 		goto out;
3289117973fbSAlex Elder 	if (rbd_dev->header.obj_order != obj_order) {
3290117973fbSAlex Elder 		ret = -EIO;
3291117973fbSAlex Elder 		goto out;
3292117973fbSAlex Elder 	}
3293117973fbSAlex Elder 	rbd_update_mapping_size(rbd_dev);
3294117973fbSAlex Elder 
3295117973fbSAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev, hver);
3296117973fbSAlex Elder 	dout("rbd_dev_v2_snap_context returned %d\n", ret);
3297117973fbSAlex Elder 	if (ret)
3298117973fbSAlex Elder 		goto out;
3299117973fbSAlex Elder 	ret = rbd_dev_snaps_update(rbd_dev);
3300117973fbSAlex Elder 	dout("rbd_dev_snaps_update returned %d\n", ret);
3301117973fbSAlex Elder 	if (ret)
3302117973fbSAlex Elder 		goto out;
3303117973fbSAlex Elder 	ret = rbd_dev_snaps_register(rbd_dev);
3304117973fbSAlex Elder 	dout("rbd_dev_snaps_register returned %d\n", ret);
3305117973fbSAlex Elder out:
3306117973fbSAlex Elder 	up_write(&rbd_dev->header_rwsem);
3307117973fbSAlex Elder 
3308117973fbSAlex Elder 	return ret;
3309117973fbSAlex Elder }
3310117973fbSAlex Elder 
33119d475de5SAlex Elder /*
331235938150SAlex Elder  * Scan the rbd device's current snapshot list and compare it to the
331335938150SAlex Elder  * newly-received snapshot context.  Remove any existing snapshots
331435938150SAlex Elder  * not present in the new snapshot context.  Add a new snapshot for
331535938150SAlex Elder  * any snaphots in the snapshot context not in the current list.
331635938150SAlex Elder  * And verify there are no changes to snapshots we already know
331735938150SAlex Elder  * about.
331835938150SAlex Elder  *
331935938150SAlex Elder  * Assumes the snapshots in the snapshot context are sorted by
332035938150SAlex Elder  * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
332135938150SAlex Elder  * are also maintained in that order.)
3322dfc5606dSYehuda Sadeh  */
3323304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
3324dfc5606dSYehuda Sadeh {
332535938150SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
332635938150SAlex Elder 	const u32 snap_count = snapc->num_snaps;
332735938150SAlex Elder 	struct list_head *head = &rbd_dev->snaps;
332835938150SAlex Elder 	struct list_head *links = head->next;
332935938150SAlex Elder 	u32 index = 0;
3330dfc5606dSYehuda Sadeh 
33319fcbb800SAlex Elder 	dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
333235938150SAlex Elder 	while (index < snap_count || links != head) {
333335938150SAlex Elder 		u64 snap_id;
333435938150SAlex Elder 		struct rbd_snap *snap;
3335cd892126SAlex Elder 		char *snap_name;
3336cd892126SAlex Elder 		u64 snap_size = 0;
3337cd892126SAlex Elder 		u64 snap_features = 0;
3338dfc5606dSYehuda Sadeh 
333935938150SAlex Elder 		snap_id = index < snap_count ? snapc->snaps[index]
334035938150SAlex Elder 					     : CEPH_NOSNAP;
334135938150SAlex Elder 		snap = links != head ? list_entry(links, struct rbd_snap, node)
334235938150SAlex Elder 				     : NULL;
3343aafb230eSAlex Elder 		rbd_assert(!snap || snap->id != CEPH_NOSNAP);
3344dfc5606dSYehuda Sadeh 
334535938150SAlex Elder 		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
334635938150SAlex Elder 			struct list_head *next = links->next;
3347dfc5606dSYehuda Sadeh 
33486d292906SAlex Elder 			/*
33496d292906SAlex Elder 			 * A previously-existing snapshot is not in
33506d292906SAlex Elder 			 * the new snap context.
33516d292906SAlex Elder 			 *
33526d292906SAlex Elder 			 * If the now missing snapshot is the one the
33536d292906SAlex Elder 			 * image is mapped to, clear its exists flag
33546d292906SAlex Elder 			 * so we can avoid sending any more requests
33556d292906SAlex Elder 			 * to it.
33566d292906SAlex Elder 			 */
33570d7dbfceSAlex Elder 			if (rbd_dev->spec->snap_id == snap->id)
33586d292906SAlex Elder 				clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
335941f38c2bSAlex Elder 			rbd_remove_snap_dev(snap);
33609fcbb800SAlex Elder 			dout("%ssnap id %llu has been removed\n",
33610d7dbfceSAlex Elder 				rbd_dev->spec->snap_id == snap->id ?
33620d7dbfceSAlex Elder 							"mapped " : "",
33639fcbb800SAlex Elder 				(unsigned long long) snap->id);
3364dfc5606dSYehuda Sadeh 
336535938150SAlex Elder 			/* Done with this list entry; advance */
336635938150SAlex Elder 
336735938150SAlex Elder 			links = next;
336835938150SAlex Elder 			continue;
3369dfc5606dSYehuda Sadeh 		}
337035938150SAlex Elder 
3371b8b1e2dbSAlex Elder 		snap_name = rbd_dev_snap_info(rbd_dev, index,
3372cd892126SAlex Elder 					&snap_size, &snap_features);
3373cd892126SAlex Elder 		if (IS_ERR(snap_name))
3374cd892126SAlex Elder 			return PTR_ERR(snap_name);
3375cd892126SAlex Elder 
33769fcbb800SAlex Elder 		dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
33779fcbb800SAlex Elder 			(unsigned long long) snap_id);
337835938150SAlex Elder 		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
337935938150SAlex Elder 			struct rbd_snap *new_snap;
338035938150SAlex Elder 
338135938150SAlex Elder 			/* We haven't seen this snapshot before */
338235938150SAlex Elder 
3383c8d18425SAlex Elder 			new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
3384cd892126SAlex Elder 					snap_id, snap_size, snap_features);
33859fcbb800SAlex Elder 			if (IS_ERR(new_snap)) {
33869fcbb800SAlex Elder 				int err = PTR_ERR(new_snap);
33879fcbb800SAlex Elder 
33889fcbb800SAlex Elder 				dout("  failed to add dev, error %d\n", err);
33899fcbb800SAlex Elder 
33909fcbb800SAlex Elder 				return err;
33919fcbb800SAlex Elder 			}
339235938150SAlex Elder 
339335938150SAlex Elder 			/* New goes before existing, or at end of list */
339435938150SAlex Elder 
33959fcbb800SAlex Elder 			dout("  added dev%s\n", snap ? "" : " at end\n");
339635938150SAlex Elder 			if (snap)
339735938150SAlex Elder 				list_add_tail(&new_snap->node, &snap->node);
339835938150SAlex Elder 			else
3399523f3258SAlex Elder 				list_add_tail(&new_snap->node, head);
340035938150SAlex Elder 		} else {
340135938150SAlex Elder 			/* Already have this one */
340235938150SAlex Elder 
34039fcbb800SAlex Elder 			dout("  already present\n");
34049fcbb800SAlex Elder 
3405cd892126SAlex Elder 			rbd_assert(snap->size == snap_size);
3406aafb230eSAlex Elder 			rbd_assert(!strcmp(snap->name, snap_name));
3407cd892126SAlex Elder 			rbd_assert(snap->features == snap_features);
340835938150SAlex Elder 
340935938150SAlex Elder 			/* Done with this list entry; advance */
341035938150SAlex Elder 
341135938150SAlex Elder 			links = links->next;
3412dfc5606dSYehuda Sadeh 		}
341335938150SAlex Elder 
341435938150SAlex Elder 		/* Advance to the next entry in the snapshot context */
341535938150SAlex Elder 
341635938150SAlex Elder 		index++;
3417dfc5606dSYehuda Sadeh 	}
34189fcbb800SAlex Elder 	dout("%s: done\n", __func__);
3419dfc5606dSYehuda Sadeh 
3420dfc5606dSYehuda Sadeh 	return 0;
3421dfc5606dSYehuda Sadeh }
3422dfc5606dSYehuda Sadeh 
3423304f6808SAlex Elder /*
3424304f6808SAlex Elder  * Scan the list of snapshots and register the devices for any that
3425304f6808SAlex Elder  * have not already been registered.
3426304f6808SAlex Elder  */
3427304f6808SAlex Elder static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
3428304f6808SAlex Elder {
3429304f6808SAlex Elder 	struct rbd_snap *snap;
3430304f6808SAlex Elder 	int ret = 0;
3431304f6808SAlex Elder 
343237206ee5SAlex Elder 	dout("%s:\n", __func__);
343386ff77bbSAlex Elder 	if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
343486ff77bbSAlex Elder 		return -EIO;
3435304f6808SAlex Elder 
3436304f6808SAlex Elder 	list_for_each_entry(snap, &rbd_dev->snaps, node) {
3437304f6808SAlex Elder 		if (!rbd_snap_registered(snap)) {
3438304f6808SAlex Elder 			ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
3439304f6808SAlex Elder 			if (ret < 0)
3440304f6808SAlex Elder 				break;
3441304f6808SAlex Elder 		}
3442304f6808SAlex Elder 	}
3443304f6808SAlex Elder 	dout("%s: returning %d\n", __func__, ret);
3444304f6808SAlex Elder 
3445304f6808SAlex Elder 	return ret;
3446304f6808SAlex Elder }
3447304f6808SAlex Elder 
3448dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
3449dfc5606dSYehuda Sadeh {
3450dfc5606dSYehuda Sadeh 	struct device *dev;
3451cd789ab9SAlex Elder 	int ret;
3452dfc5606dSYehuda Sadeh 
3453dfc5606dSYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3454dfc5606dSYehuda Sadeh 
3455cd789ab9SAlex Elder 	dev = &rbd_dev->dev;
3456dfc5606dSYehuda Sadeh 	dev->bus = &rbd_bus_type;
3457dfc5606dSYehuda Sadeh 	dev->type = &rbd_device_type;
3458dfc5606dSYehuda Sadeh 	dev->parent = &rbd_root_dev;
3459dfc5606dSYehuda Sadeh 	dev->release = rbd_dev_release;
3460de71a297SAlex Elder 	dev_set_name(dev, "%d", rbd_dev->dev_id);
3461dfc5606dSYehuda Sadeh 	ret = device_register(dev);
3462dfc5606dSYehuda Sadeh 
3463dfc5606dSYehuda Sadeh 	mutex_unlock(&ctl_mutex);
3464cd789ab9SAlex Elder 
3465dfc5606dSYehuda Sadeh 	return ret;
3466602adf40SYehuda Sadeh }
3467602adf40SYehuda Sadeh 
3468dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
3469dfc5606dSYehuda Sadeh {
3470dfc5606dSYehuda Sadeh 	device_unregister(&rbd_dev->dev);
3471dfc5606dSYehuda Sadeh }
3472dfc5606dSYehuda Sadeh 
3473e2839308SAlex Elder static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
34741ddbe94eSAlex Elder 
34751ddbe94eSAlex Elder /*
3476499afd5bSAlex Elder  * Get a unique rbd identifier for the given new rbd_dev, and add
3477499afd5bSAlex Elder  * the rbd_dev to the global list.  The minimum rbd id is 1.
34781ddbe94eSAlex Elder  */
3479e2839308SAlex Elder static void rbd_dev_id_get(struct rbd_device *rbd_dev)
3480b7f23c36SAlex Elder {
3481e2839308SAlex Elder 	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
3482499afd5bSAlex Elder 
3483499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
3484499afd5bSAlex Elder 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
3485499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
3486e2839308SAlex Elder 	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
3487e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
3488b7f23c36SAlex Elder }
3489b7f23c36SAlex Elder 
34901ddbe94eSAlex Elder /*
3491499afd5bSAlex Elder  * Remove an rbd_dev from the global list, and record that its
3492499afd5bSAlex Elder  * identifier is no longer in use.
34931ddbe94eSAlex Elder  */
3494e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev)
34951ddbe94eSAlex Elder {
3496d184f6bfSAlex Elder 	struct list_head *tmp;
3497de71a297SAlex Elder 	int rbd_id = rbd_dev->dev_id;
3498d184f6bfSAlex Elder 	int max_id;
3499d184f6bfSAlex Elder 
3500aafb230eSAlex Elder 	rbd_assert(rbd_id > 0);
3501499afd5bSAlex Elder 
3502e2839308SAlex Elder 	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
3503e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
3504499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
3505499afd5bSAlex Elder 	list_del_init(&rbd_dev->node);
3506d184f6bfSAlex Elder 
3507d184f6bfSAlex Elder 	/*
3508d184f6bfSAlex Elder 	 * If the id being "put" is not the current maximum, there
3509d184f6bfSAlex Elder 	 * is nothing special we need to do.
3510d184f6bfSAlex Elder 	 */
3511e2839308SAlex Elder 	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
3512d184f6bfSAlex Elder 		spin_unlock(&rbd_dev_list_lock);
3513d184f6bfSAlex Elder 		return;
3514d184f6bfSAlex Elder 	}
3515d184f6bfSAlex Elder 
3516d184f6bfSAlex Elder 	/*
3517d184f6bfSAlex Elder 	 * We need to update the current maximum id.  Search the
3518d184f6bfSAlex Elder 	 * list to find out what it is.  We're more likely to find
3519d184f6bfSAlex Elder 	 * the maximum at the end, so search the list backward.
3520d184f6bfSAlex Elder 	 */
3521d184f6bfSAlex Elder 	max_id = 0;
3522d184f6bfSAlex Elder 	list_for_each_prev(tmp, &rbd_dev_list) {
3523d184f6bfSAlex Elder 		struct rbd_device *rbd_dev;
3524d184f6bfSAlex Elder 
3525d184f6bfSAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
3526b213e0b1SAlex Elder 		if (rbd_dev->dev_id > max_id)
3527b213e0b1SAlex Elder 			max_id = rbd_dev->dev_id;
3528d184f6bfSAlex Elder 	}
3529499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
35301ddbe94eSAlex Elder 
35311ddbe94eSAlex Elder 	/*
3532e2839308SAlex Elder 	 * The max id could have been updated by rbd_dev_id_get(), in
3533d184f6bfSAlex Elder 	 * which case it now accurately reflects the new maximum.
3534d184f6bfSAlex Elder 	 * Be careful not to overwrite the maximum value in that
3535d184f6bfSAlex Elder 	 * case.
35361ddbe94eSAlex Elder 	 */
3537e2839308SAlex Elder 	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
3538e2839308SAlex Elder 	dout("  max dev id has been reset\n");
3539b7f23c36SAlex Elder }
3540b7f23c36SAlex Elder 
/*
 * Advance *buf past any leading white space so that it points at
 * the first non-space character (or at the terminating '\0' if
 * there is none).  Returns the length of the token -- the run of
 * non-white space characters -- that starts there; 0 means no
 * token was found.  *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * These are the characters that produce nonzero for
	 * isspace() in the "C" and "POSIX" locales.
	 */
	static const char whitespace[] = " \f\n\r\t\v";
	const char *start = *buf + strspn(*buf, whitespace);

	*buf = start;			/* Start of token */

	return strcspn(start, whitespace);	/* Token length */
}
3559e28fff26SAlex Elder 
/*
 * Locate the next token in *buf and, when it fits, copy it into the
 * caller's token buffer as a '\0'-terminated string.  *buf must be
 * terminated with '\0' on entry.
 *
 * Returns the token length, not counting the '\0'.  A return of 0
 * means no token was found; a return >= token_size means the token
 * did not fit and nothing was copied.
 *
 * On return *buf points just past the token that was found, whether
 * or not it was copied.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t token_len = next_token(buf);
	const char *start = *buf;

	*buf = start + token_len;
	if (token_len >= token_size)
		return token_len;	/* Won't fit; leave token alone */

	memcpy(token, start, token_len);
	token[token_len] = '\0';

	return token_len;
}
3589e28fff26SAlex Elder 
3590e28fff26SAlex Elder /*
3591ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
3592ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
3593ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
3594ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
3595ea3352f4SAlex Elder  *
3596ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
3597ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
3598ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
3599ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
3600ea3352f4SAlex Elder  *
3601ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
3602ea3352f4SAlex Elder  * the end of the found token.
3603ea3352f4SAlex Elder  *
3604ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
3605ea3352f4SAlex Elder  */
3606ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
3607ea3352f4SAlex Elder {
3608ea3352f4SAlex Elder 	char *dup;
3609ea3352f4SAlex Elder 	size_t len;
3610ea3352f4SAlex Elder 
3611ea3352f4SAlex Elder 	len = next_token(buf);
36124caf35f9SAlex Elder 	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
3613ea3352f4SAlex Elder 	if (!dup)
3614ea3352f4SAlex Elder 		return NULL;
3615ea3352f4SAlex Elder 	*(dup + len) = '\0';
3616ea3352f4SAlex Elder 	*buf += len;
3617ea3352f4SAlex Elder 
3618ea3352f4SAlex Elder 	if (lenp)
3619ea3352f4SAlex Elder 		*lenp = len;
3620ea3352f4SAlex Elder 
3621ea3352f4SAlex Elder 	return dup;
3622ea3352f4SAlex Elder }
3623ea3352f4SAlex Elder 
3624ea3352f4SAlex Elder /*
3625859c31dfSAlex Elder  * Parse the options provided for an "rbd add" (i.e., rbd image
3626859c31dfSAlex Elder  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
3627859c31dfSAlex Elder  * and the data written is passed here via a NUL-terminated buffer.
3628859c31dfSAlex Elder  * Returns 0 if successful or an error code otherwise.
3629d22f76e7SAlex Elder  *
3630859c31dfSAlex Elder  * The information extracted from these options is recorded in
3631859c31dfSAlex Elder  * the other parameters which return dynamically-allocated
3632859c31dfSAlex Elder  * structures:
3633859c31dfSAlex Elder  *  ceph_opts
3634859c31dfSAlex Elder  *      The address of a pointer that will refer to a ceph options
3635859c31dfSAlex Elder  *      structure.  Caller must release the returned pointer using
3636859c31dfSAlex Elder  *      ceph_destroy_options() when it is no longer needed.
3637859c31dfSAlex Elder  *  rbd_opts
3638859c31dfSAlex Elder  *	Address of an rbd options pointer.  Fully initialized by
3639859c31dfSAlex Elder  *	this function; caller must release with kfree().
3640859c31dfSAlex Elder  *  spec
3641859c31dfSAlex Elder  *	Address of an rbd image specification pointer.  Fully
3642859c31dfSAlex Elder  *	initialized by this function based on parsed options.
3643859c31dfSAlex Elder  *	Caller must release with rbd_spec_put().
3644859c31dfSAlex Elder  *
3645859c31dfSAlex Elder  * The options passed take this form:
3646859c31dfSAlex Elder  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
3647859c31dfSAlex Elder  * where:
3648859c31dfSAlex Elder  *  <mon_addrs>
3649859c31dfSAlex Elder  *      A comma-separated list of one or more monitor addresses.
3650859c31dfSAlex Elder  *      A monitor address is an ip address, optionally followed
3651859c31dfSAlex Elder  *      by a port number (separated by a colon).
3652859c31dfSAlex Elder  *        I.e.:  ip1[:port1][,ip2[:port2]...]
3653859c31dfSAlex Elder  *  <options>
3654859c31dfSAlex Elder  *      A comma-separated list of ceph and/or rbd options.
3655859c31dfSAlex Elder  *  <pool_name>
3656859c31dfSAlex Elder  *      The name of the rados pool containing the rbd image.
3657859c31dfSAlex Elder  *  <image_name>
3658859c31dfSAlex Elder  *      The name of the image in that pool to map.
3659859c31dfSAlex Elder  *  <snap_id>
3660859c31dfSAlex Elder  *      An optional snapshot id.  If provided, the mapping will
3661859c31dfSAlex Elder  *      present data from the image at the time that snapshot was
3662859c31dfSAlex Elder  *      created.  The image head is used if no snapshot id is
3663859c31dfSAlex Elder  *      provided.  Snapshot mappings are always read-only.
3664a725f65eSAlex Elder  */
3665859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf,
3666dc79b113SAlex Elder 				struct ceph_options **ceph_opts,
3667859c31dfSAlex Elder 				struct rbd_options **opts,
3668859c31dfSAlex Elder 				struct rbd_spec **rbd_spec)
3669a725f65eSAlex Elder {
3670e28fff26SAlex Elder 	size_t len;
3671859c31dfSAlex Elder 	char *options;
36720ddebc0cSAlex Elder 	const char *mon_addrs;
36730ddebc0cSAlex Elder 	size_t mon_addrs_size;
3674859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
36754e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
3676859c31dfSAlex Elder 	struct ceph_options *copts;
3677dc79b113SAlex Elder 	int ret;
3678e28fff26SAlex Elder 
3679e28fff26SAlex Elder 	/* The first four tokens are required */
3680e28fff26SAlex Elder 
36817ef3214aSAlex Elder 	len = next_token(&buf);
36824fb5d671SAlex Elder 	if (!len) {
36834fb5d671SAlex Elder 		rbd_warn(NULL, "no monitor address(es) provided");
36844fb5d671SAlex Elder 		return -EINVAL;
36854fb5d671SAlex Elder 	}
36860ddebc0cSAlex Elder 	mon_addrs = buf;
3687f28e565aSAlex Elder 	mon_addrs_size = len + 1;
36887ef3214aSAlex Elder 	buf += len;
3689a725f65eSAlex Elder 
3690dc79b113SAlex Elder 	ret = -EINVAL;
3691f28e565aSAlex Elder 	options = dup_token(&buf, NULL);
3692f28e565aSAlex Elder 	if (!options)
3693dc79b113SAlex Elder 		return -ENOMEM;
36944fb5d671SAlex Elder 	if (!*options) {
36954fb5d671SAlex Elder 		rbd_warn(NULL, "no options provided");
36964fb5d671SAlex Elder 		goto out_err;
36974fb5d671SAlex Elder 	}
3698a725f65eSAlex Elder 
3699859c31dfSAlex Elder 	spec = rbd_spec_alloc();
3700859c31dfSAlex Elder 	if (!spec)
3701f28e565aSAlex Elder 		goto out_mem;
3702859c31dfSAlex Elder 
3703859c31dfSAlex Elder 	spec->pool_name = dup_token(&buf, NULL);
3704859c31dfSAlex Elder 	if (!spec->pool_name)
3705859c31dfSAlex Elder 		goto out_mem;
37064fb5d671SAlex Elder 	if (!*spec->pool_name) {
37074fb5d671SAlex Elder 		rbd_warn(NULL, "no pool name provided");
37084fb5d671SAlex Elder 		goto out_err;
37094fb5d671SAlex Elder 	}
3710e28fff26SAlex Elder 
371169e7a02fSAlex Elder 	spec->image_name = dup_token(&buf, NULL);
3712859c31dfSAlex Elder 	if (!spec->image_name)
3713f28e565aSAlex Elder 		goto out_mem;
37144fb5d671SAlex Elder 	if (!*spec->image_name) {
37154fb5d671SAlex Elder 		rbd_warn(NULL, "no image name provided");
37164fb5d671SAlex Elder 		goto out_err;
37174fb5d671SAlex Elder 	}
3718e28fff26SAlex Elder 
3719f28e565aSAlex Elder 	/*
3720f28e565aSAlex Elder 	 * Snapshot name is optional; default is to use "-"
3721f28e565aSAlex Elder 	 * (indicating the head/no snapshot).
3722f28e565aSAlex Elder 	 */
37233feeb894SAlex Elder 	len = next_token(&buf);
3724820a5f3eSAlex Elder 	if (!len) {
37253feeb894SAlex Elder 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
37263feeb894SAlex Elder 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
3727f28e565aSAlex Elder 	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
3728dc79b113SAlex Elder 		ret = -ENAMETOOLONG;
3729f28e565aSAlex Elder 		goto out_err;
3730849b4260SAlex Elder 	}
37314caf35f9SAlex Elder 	spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
3732859c31dfSAlex Elder 	if (!spec->snap_name)
3733f28e565aSAlex Elder 		goto out_mem;
3734859c31dfSAlex Elder 	*(spec->snap_name + len) = '\0';
3735e5c35534SAlex Elder 
37360ddebc0cSAlex Elder 	/* Initialize all rbd options to the defaults */
3737e28fff26SAlex Elder 
37384e9afebaSAlex Elder 	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
37394e9afebaSAlex Elder 	if (!rbd_opts)
37404e9afebaSAlex Elder 		goto out_mem;
37414e9afebaSAlex Elder 
37424e9afebaSAlex Elder 	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
3743d22f76e7SAlex Elder 
3744859c31dfSAlex Elder 	copts = ceph_parse_options(options, mon_addrs,
37450ddebc0cSAlex Elder 					mon_addrs + mon_addrs_size - 1,
37464e9afebaSAlex Elder 					parse_rbd_opts_token, rbd_opts);
3747859c31dfSAlex Elder 	if (IS_ERR(copts)) {
3748859c31dfSAlex Elder 		ret = PTR_ERR(copts);
3749dc79b113SAlex Elder 		goto out_err;
3750dc79b113SAlex Elder 	}
3751859c31dfSAlex Elder 	kfree(options);
3752859c31dfSAlex Elder 
3753859c31dfSAlex Elder 	*ceph_opts = copts;
37544e9afebaSAlex Elder 	*opts = rbd_opts;
3755859c31dfSAlex Elder 	*rbd_spec = spec;
37560ddebc0cSAlex Elder 
3757dc79b113SAlex Elder 	return 0;
3758f28e565aSAlex Elder out_mem:
3759dc79b113SAlex Elder 	ret = -ENOMEM;
3760d22f76e7SAlex Elder out_err:
3761859c31dfSAlex Elder 	kfree(rbd_opts);
3762859c31dfSAlex Elder 	rbd_spec_put(spec);
3763f28e565aSAlex Elder 	kfree(options);
3764d22f76e7SAlex Elder 
3765dc79b113SAlex Elder 	return ret;
3766a725f65eSAlex Elder }
3767a725f65eSAlex Elder 
3768589d30e0SAlex Elder /*
3769589d30e0SAlex Elder  * An rbd format 2 image has a unique identifier, distinct from the
3770589d30e0SAlex Elder  * name given to it by the user.  Internally, that identifier is
3771589d30e0SAlex Elder  * what's used to specify the names of objects related to the image.
3772589d30e0SAlex Elder  *
3773589d30e0SAlex Elder  * A special "rbd id" object is used to map an rbd image name to its
3774589d30e0SAlex Elder  * id.  If that object doesn't exist, then there is no v2 rbd image
3775589d30e0SAlex Elder  * with the supplied name.
3776589d30e0SAlex Elder  *
3777589d30e0SAlex Elder  * This function will record the given rbd_dev's image_id field if
3778589d30e0SAlex Elder  * it can be determined, and in that case will return 0.  If any
3779589d30e0SAlex Elder  * errors occur a negative errno will be returned and the rbd_dev's
3780589d30e0SAlex Elder  * image_id field will be unchanged (and should be NULL).
3781589d30e0SAlex Elder  */
3782589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev)
3783589d30e0SAlex Elder {
3784589d30e0SAlex Elder 	int ret;
3785589d30e0SAlex Elder 	size_t size;
3786589d30e0SAlex Elder 	char *object_name;
3787589d30e0SAlex Elder 	void *response;
3788589d30e0SAlex Elder 	void *p;
3789589d30e0SAlex Elder 
3790589d30e0SAlex Elder 	/*
37912c0d0a10SAlex Elder 	 * When probing a parent image, the image id is already
37922c0d0a10SAlex Elder 	 * known (and the image name likely is not).  There's no
37932c0d0a10SAlex Elder 	 * need to fetch the image id again in this case.
37942c0d0a10SAlex Elder 	 */
37952c0d0a10SAlex Elder 	if (rbd_dev->spec->image_id)
37962c0d0a10SAlex Elder 		return 0;
37972c0d0a10SAlex Elder 
37982c0d0a10SAlex Elder 	/*
3799589d30e0SAlex Elder 	 * First, see if the format 2 image id file exists, and if
3800589d30e0SAlex Elder 	 * so, get the image's persistent id from it.
3801589d30e0SAlex Elder 	 */
380269e7a02fSAlex Elder 	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
3803589d30e0SAlex Elder 	object_name = kmalloc(size, GFP_NOIO);
3804589d30e0SAlex Elder 	if (!object_name)
3805589d30e0SAlex Elder 		return -ENOMEM;
38060d7dbfceSAlex Elder 	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
3807589d30e0SAlex Elder 	dout("rbd id object name is %s\n", object_name);
3808589d30e0SAlex Elder 
3809589d30e0SAlex Elder 	/* Response will be an encoded string, which includes a length */
3810589d30e0SAlex Elder 
3811589d30e0SAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
3812589d30e0SAlex Elder 	response = kzalloc(size, GFP_NOIO);
3813589d30e0SAlex Elder 	if (!response) {
3814589d30e0SAlex Elder 		ret = -ENOMEM;
3815589d30e0SAlex Elder 		goto out;
3816589d30e0SAlex Elder 	}
3817589d30e0SAlex Elder 
381836be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, object_name,
3819589d30e0SAlex Elder 				"rbd", "get_id",
3820589d30e0SAlex Elder 				NULL, 0,
382107b2391fSAlex Elder 				response, RBD_IMAGE_ID_LEN_MAX, NULL);
382236be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3823589d30e0SAlex Elder 	if (ret < 0)
3824589d30e0SAlex Elder 		goto out;
3825589d30e0SAlex Elder 
3826589d30e0SAlex Elder 	p = response;
38270d7dbfceSAlex Elder 	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
3828589d30e0SAlex Elder 						p + RBD_IMAGE_ID_LEN_MAX,
3829979ed480SAlex Elder 						NULL, GFP_NOIO);
38300d7dbfceSAlex Elder 	if (IS_ERR(rbd_dev->spec->image_id)) {
38310d7dbfceSAlex Elder 		ret = PTR_ERR(rbd_dev->spec->image_id);
38320d7dbfceSAlex Elder 		rbd_dev->spec->image_id = NULL;
3833589d30e0SAlex Elder 	} else {
38340d7dbfceSAlex Elder 		dout("image_id is %s\n", rbd_dev->spec->image_id);
3835589d30e0SAlex Elder 	}
3836589d30e0SAlex Elder out:
3837589d30e0SAlex Elder 	kfree(response);
3838589d30e0SAlex Elder 	kfree(object_name);
3839589d30e0SAlex Elder 
3840589d30e0SAlex Elder 	return ret;
3841589d30e0SAlex Elder }
3842589d30e0SAlex Elder 
3843a30b71b9SAlex Elder static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
3844a30b71b9SAlex Elder {
3845a30b71b9SAlex Elder 	int ret;
3846a30b71b9SAlex Elder 	size_t size;
3847a30b71b9SAlex Elder 
3848a30b71b9SAlex Elder 	/* Version 1 images have no id; empty string is used */
3849a30b71b9SAlex Elder 
38500d7dbfceSAlex Elder 	rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
38510d7dbfceSAlex Elder 	if (!rbd_dev->spec->image_id)
3852a30b71b9SAlex Elder 		return -ENOMEM;
3853a30b71b9SAlex Elder 
3854a30b71b9SAlex Elder 	/* Record the header object name for this rbd image. */
3855a30b71b9SAlex Elder 
385669e7a02fSAlex Elder 	size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
3857a30b71b9SAlex Elder 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3858a30b71b9SAlex Elder 	if (!rbd_dev->header_name) {
3859a30b71b9SAlex Elder 		ret = -ENOMEM;
3860a30b71b9SAlex Elder 		goto out_err;
3861a30b71b9SAlex Elder 	}
38620d7dbfceSAlex Elder 	sprintf(rbd_dev->header_name, "%s%s",
38630d7dbfceSAlex Elder 		rbd_dev->spec->image_name, RBD_SUFFIX);
3864a30b71b9SAlex Elder 
3865a30b71b9SAlex Elder 	/* Populate rbd image metadata */
3866a30b71b9SAlex Elder 
3867a30b71b9SAlex Elder 	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
3868a30b71b9SAlex Elder 	if (ret < 0)
3869a30b71b9SAlex Elder 		goto out_err;
387086b00e0dSAlex Elder 
387186b00e0dSAlex Elder 	/* Version 1 images have no parent (no layering) */
387286b00e0dSAlex Elder 
387386b00e0dSAlex Elder 	rbd_dev->parent_spec = NULL;
387486b00e0dSAlex Elder 	rbd_dev->parent_overlap = 0;
387586b00e0dSAlex Elder 
3876a30b71b9SAlex Elder 	rbd_dev->image_format = 1;
3877a30b71b9SAlex Elder 
3878a30b71b9SAlex Elder 	dout("discovered version 1 image, header name is %s\n",
3879a30b71b9SAlex Elder 		rbd_dev->header_name);
3880a30b71b9SAlex Elder 
3881a30b71b9SAlex Elder 	return 0;
3882a30b71b9SAlex Elder 
3883a30b71b9SAlex Elder out_err:
3884a30b71b9SAlex Elder 	kfree(rbd_dev->header_name);
3885a30b71b9SAlex Elder 	rbd_dev->header_name = NULL;
38860d7dbfceSAlex Elder 	kfree(rbd_dev->spec->image_id);
38870d7dbfceSAlex Elder 	rbd_dev->spec->image_id = NULL;
3888a30b71b9SAlex Elder 
3889a30b71b9SAlex Elder 	return ret;
3890a30b71b9SAlex Elder }
3891a30b71b9SAlex Elder 
3892a30b71b9SAlex Elder static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
3893a30b71b9SAlex Elder {
3894a30b71b9SAlex Elder 	size_t size;
38959d475de5SAlex Elder 	int ret;
38966e14b1a6SAlex Elder 	u64 ver = 0;
3897a30b71b9SAlex Elder 
3898a30b71b9SAlex Elder 	/*
3899a30b71b9SAlex Elder 	 * Image id was filled in by the caller.  Record the header
3900a30b71b9SAlex Elder 	 * object name for this rbd image.
3901a30b71b9SAlex Elder 	 */
3902979ed480SAlex Elder 	size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
3903a30b71b9SAlex Elder 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3904a30b71b9SAlex Elder 	if (!rbd_dev->header_name)
3905a30b71b9SAlex Elder 		return -ENOMEM;
3906a30b71b9SAlex Elder 	sprintf(rbd_dev->header_name, "%s%s",
39070d7dbfceSAlex Elder 			RBD_HEADER_PREFIX, rbd_dev->spec->image_id);
39089d475de5SAlex Elder 
39099d475de5SAlex Elder 	/* Get the size and object order for the image */
39109d475de5SAlex Elder 
39119d475de5SAlex Elder 	ret = rbd_dev_v2_image_size(rbd_dev);
39129d475de5SAlex Elder 	if (ret < 0)
39139d475de5SAlex Elder 		goto out_err;
39141e130199SAlex Elder 
39151e130199SAlex Elder 	/* Get the object prefix (a.k.a. block_name) for the image */
39161e130199SAlex Elder 
39171e130199SAlex Elder 	ret = rbd_dev_v2_object_prefix(rbd_dev);
39181e130199SAlex Elder 	if (ret < 0)
39191e130199SAlex Elder 		goto out_err;
3920b1b5402aSAlex Elder 
3921d889140cSAlex Elder 	/* Get the and check features for the image */
3922b1b5402aSAlex Elder 
3923b1b5402aSAlex Elder 	ret = rbd_dev_v2_features(rbd_dev);
3924b1b5402aSAlex Elder 	if (ret < 0)
3925b1b5402aSAlex Elder 		goto out_err;
392635d489f9SAlex Elder 
392786b00e0dSAlex Elder 	/* If the image supports layering, get the parent info */
392886b00e0dSAlex Elder 
392986b00e0dSAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
393086b00e0dSAlex Elder 		ret = rbd_dev_v2_parent_info(rbd_dev);
393186b00e0dSAlex Elder 		if (ret < 0)
393286b00e0dSAlex Elder 			goto out_err;
393386b00e0dSAlex Elder 	}
393486b00e0dSAlex Elder 
39356e14b1a6SAlex Elder 	/* crypto and compression type aren't (yet) supported for v2 images */
393635d489f9SAlex Elder 
39376e14b1a6SAlex Elder 	rbd_dev->header.crypt_type = 0;
39386e14b1a6SAlex Elder 	rbd_dev->header.comp_type = 0;
39396e14b1a6SAlex Elder 
39406e14b1a6SAlex Elder 	/* Get the snapshot context, plus the header version */
39416e14b1a6SAlex Elder 
39426e14b1a6SAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
394335d489f9SAlex Elder 	if (ret)
394435d489f9SAlex Elder 		goto out_err;
39456e14b1a6SAlex Elder 	rbd_dev->header.obj_version = ver;
39466e14b1a6SAlex Elder 
3947a30b71b9SAlex Elder 	rbd_dev->image_format = 2;
3948a30b71b9SAlex Elder 
3949a30b71b9SAlex Elder 	dout("discovered version 2 image, header name is %s\n",
3950a30b71b9SAlex Elder 		rbd_dev->header_name);
3951a30b71b9SAlex Elder 
395235152979SAlex Elder 	return 0;
39539d475de5SAlex Elder out_err:
395486b00e0dSAlex Elder 	rbd_dev->parent_overlap = 0;
395586b00e0dSAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
395686b00e0dSAlex Elder 	rbd_dev->parent_spec = NULL;
39579d475de5SAlex Elder 	kfree(rbd_dev->header_name);
39589d475de5SAlex Elder 	rbd_dev->header_name = NULL;
39591e130199SAlex Elder 	kfree(rbd_dev->header.object_prefix);
39601e130199SAlex Elder 	rbd_dev->header.object_prefix = NULL;
39619d475de5SAlex Elder 
39629d475de5SAlex Elder 	return ret;
3963a30b71b9SAlex Elder }
3964a30b71b9SAlex Elder 
396583a06263SAlex Elder static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
396683a06263SAlex Elder {
396783a06263SAlex Elder 	int ret;
396883a06263SAlex Elder 
396983a06263SAlex Elder 	/* no need to lock here, as rbd_dev is not registered yet */
397083a06263SAlex Elder 	ret = rbd_dev_snaps_update(rbd_dev);
397183a06263SAlex Elder 	if (ret)
397283a06263SAlex Elder 		return ret;
397383a06263SAlex Elder 
39749e15b77dSAlex Elder 	ret = rbd_dev_probe_update_spec(rbd_dev);
39759e15b77dSAlex Elder 	if (ret)
39769e15b77dSAlex Elder 		goto err_out_snaps;
39779e15b77dSAlex Elder 
397883a06263SAlex Elder 	ret = rbd_dev_set_mapping(rbd_dev);
397983a06263SAlex Elder 	if (ret)
398083a06263SAlex Elder 		goto err_out_snaps;
398183a06263SAlex Elder 
398283a06263SAlex Elder 	/* generate unique id: find highest unique id, add one */
398383a06263SAlex Elder 	rbd_dev_id_get(rbd_dev);
398483a06263SAlex Elder 
398583a06263SAlex Elder 	/* Fill in the device name, now that we have its id. */
398683a06263SAlex Elder 	BUILD_BUG_ON(DEV_NAME_LEN
398783a06263SAlex Elder 			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
398883a06263SAlex Elder 	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
398983a06263SAlex Elder 
399083a06263SAlex Elder 	/* Get our block major device number. */
399183a06263SAlex Elder 
399283a06263SAlex Elder 	ret = register_blkdev(0, rbd_dev->name);
399383a06263SAlex Elder 	if (ret < 0)
399483a06263SAlex Elder 		goto err_out_id;
399583a06263SAlex Elder 	rbd_dev->major = ret;
399683a06263SAlex Elder 
399783a06263SAlex Elder 	/* Set up the blkdev mapping. */
399883a06263SAlex Elder 
399983a06263SAlex Elder 	ret = rbd_init_disk(rbd_dev);
400083a06263SAlex Elder 	if (ret)
400183a06263SAlex Elder 		goto err_out_blkdev;
400283a06263SAlex Elder 
400383a06263SAlex Elder 	ret = rbd_bus_add_dev(rbd_dev);
400483a06263SAlex Elder 	if (ret)
400583a06263SAlex Elder 		goto err_out_disk;
400683a06263SAlex Elder 
400783a06263SAlex Elder 	/*
400883a06263SAlex Elder 	 * At this point cleanup in the event of an error is the job
400983a06263SAlex Elder 	 * of the sysfs code (initiated by rbd_bus_del_dev()).
401083a06263SAlex Elder 	 */
401183a06263SAlex Elder 	down_write(&rbd_dev->header_rwsem);
401283a06263SAlex Elder 	ret = rbd_dev_snaps_register(rbd_dev);
401383a06263SAlex Elder 	up_write(&rbd_dev->header_rwsem);
401483a06263SAlex Elder 	if (ret)
401583a06263SAlex Elder 		goto err_out_bus;
401683a06263SAlex Elder 
40179969ebc5SAlex Elder 	ret = rbd_dev_header_watch_sync(rbd_dev, 1);
401883a06263SAlex Elder 	if (ret)
401983a06263SAlex Elder 		goto err_out_bus;
402083a06263SAlex Elder 
402183a06263SAlex Elder 	/* Everything's ready.  Announce the disk to the world. */
402283a06263SAlex Elder 
402383a06263SAlex Elder 	add_disk(rbd_dev->disk);
402483a06263SAlex Elder 
402583a06263SAlex Elder 	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
402683a06263SAlex Elder 		(unsigned long long) rbd_dev->mapping.size);
402783a06263SAlex Elder 
402883a06263SAlex Elder 	return ret;
402983a06263SAlex Elder err_out_bus:
403083a06263SAlex Elder 	/* this will also clean up rest of rbd_dev stuff */
403183a06263SAlex Elder 
403283a06263SAlex Elder 	rbd_bus_del_dev(rbd_dev);
403383a06263SAlex Elder 
403483a06263SAlex Elder 	return ret;
403583a06263SAlex Elder err_out_disk:
403683a06263SAlex Elder 	rbd_free_disk(rbd_dev);
403783a06263SAlex Elder err_out_blkdev:
403883a06263SAlex Elder 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
403983a06263SAlex Elder err_out_id:
404083a06263SAlex Elder 	rbd_dev_id_put(rbd_dev);
404183a06263SAlex Elder err_out_snaps:
404283a06263SAlex Elder 	rbd_remove_all_snaps(rbd_dev);
404383a06263SAlex Elder 
404483a06263SAlex Elder 	return ret;
404583a06263SAlex Elder }
404683a06263SAlex Elder 
4047a30b71b9SAlex Elder /*
4048a30b71b9SAlex Elder  * Probe for the existence of the header object for the given rbd
4049a30b71b9SAlex Elder  * device.  For format 2 images this includes determining the image
4050a30b71b9SAlex Elder  * id.
4051a30b71b9SAlex Elder  */
4052a30b71b9SAlex Elder static int rbd_dev_probe(struct rbd_device *rbd_dev)
4053a30b71b9SAlex Elder {
4054a30b71b9SAlex Elder 	int ret;
4055a30b71b9SAlex Elder 
4056a30b71b9SAlex Elder 	/*
4057a30b71b9SAlex Elder 	 * Get the id from the image id object.  If it's not a
4058a30b71b9SAlex Elder 	 * format 2 image, we'll get ENOENT back, and we'll assume
4059a30b71b9SAlex Elder 	 * it's a format 1 image.
4060a30b71b9SAlex Elder 	 */
4061a30b71b9SAlex Elder 	ret = rbd_dev_image_id(rbd_dev);
4062a30b71b9SAlex Elder 	if (ret)
4063a30b71b9SAlex Elder 		ret = rbd_dev_v1_probe(rbd_dev);
4064a30b71b9SAlex Elder 	else
4065a30b71b9SAlex Elder 		ret = rbd_dev_v2_probe(rbd_dev);
406683a06263SAlex Elder 	if (ret) {
4067a30b71b9SAlex Elder 		dout("probe failed, returning %d\n", ret);
4068a30b71b9SAlex Elder 
4069a30b71b9SAlex Elder 		return ret;
4070a30b71b9SAlex Elder 	}
4071a30b71b9SAlex Elder 
407283a06263SAlex Elder 	ret = rbd_dev_probe_finish(rbd_dev);
407383a06263SAlex Elder 	if (ret)
407483a06263SAlex Elder 		rbd_header_free(&rbd_dev->header);
407583a06263SAlex Elder 
407683a06263SAlex Elder 	return ret;
407783a06263SAlex Elder }
407883a06263SAlex Elder 
407959c2be1eSYehuda Sadeh static ssize_t rbd_add(struct bus_type *bus,
408059c2be1eSYehuda Sadeh 		       const char *buf,
408159c2be1eSYehuda Sadeh 		       size_t count)
4082602adf40SYehuda Sadeh {
4083cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
4084dc79b113SAlex Elder 	struct ceph_options *ceph_opts = NULL;
40854e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
4086859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
40879d3997fdSAlex Elder 	struct rbd_client *rbdc;
408827cc2594SAlex Elder 	struct ceph_osd_client *osdc;
408927cc2594SAlex Elder 	int rc = -ENOMEM;
4090602adf40SYehuda Sadeh 
4091602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
4092602adf40SYehuda Sadeh 		return -ENODEV;
4093602adf40SYehuda Sadeh 
4094a725f65eSAlex Elder 	/* parse add command */
4095859c31dfSAlex Elder 	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
4096dc79b113SAlex Elder 	if (rc < 0)
4097bd4ba655SAlex Elder 		goto err_out_module;
4098a725f65eSAlex Elder 
40999d3997fdSAlex Elder 	rbdc = rbd_get_client(ceph_opts);
41009d3997fdSAlex Elder 	if (IS_ERR(rbdc)) {
41019d3997fdSAlex Elder 		rc = PTR_ERR(rbdc);
41020ddebc0cSAlex Elder 		goto err_out_args;
41039d3997fdSAlex Elder 	}
4104c53d5893SAlex Elder 	ceph_opts = NULL;	/* rbd_dev client now owns this */
4105602adf40SYehuda Sadeh 
4106602adf40SYehuda Sadeh 	/* pick the pool */
41079d3997fdSAlex Elder 	osdc = &rbdc->client->osdc;
4108859c31dfSAlex Elder 	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
4109602adf40SYehuda Sadeh 	if (rc < 0)
4110602adf40SYehuda Sadeh 		goto err_out_client;
4111859c31dfSAlex Elder 	spec->pool_id = (u64) rc;
4112859c31dfSAlex Elder 
41130903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
41140903e875SAlex Elder 
41150903e875SAlex Elder 	if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
41160903e875SAlex Elder 		rc = -EIO;
41170903e875SAlex Elder 		goto err_out_client;
41180903e875SAlex Elder 	}
41190903e875SAlex Elder 
4120c53d5893SAlex Elder 	rbd_dev = rbd_dev_create(rbdc, spec);
4121bd4ba655SAlex Elder 	if (!rbd_dev)
4122bd4ba655SAlex Elder 		goto err_out_client;
4123c53d5893SAlex Elder 	rbdc = NULL;		/* rbd_dev now owns this */
4124c53d5893SAlex Elder 	spec = NULL;		/* rbd_dev now owns this */
4125602adf40SYehuda Sadeh 
4126bd4ba655SAlex Elder 	rbd_dev->mapping.read_only = rbd_opts->read_only;
4127c53d5893SAlex Elder 	kfree(rbd_opts);
4128c53d5893SAlex Elder 	rbd_opts = NULL;	/* done with this */
4129bd4ba655SAlex Elder 
4130a30b71b9SAlex Elder 	rc = rbd_dev_probe(rbd_dev);
4131a30b71b9SAlex Elder 	if (rc < 0)
4132c53d5893SAlex Elder 		goto err_out_rbd_dev;
413305fd6f6fSAlex Elder 
4134602adf40SYehuda Sadeh 	return count;
4135c53d5893SAlex Elder err_out_rbd_dev:
4136c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
4137bd4ba655SAlex Elder err_out_client:
41389d3997fdSAlex Elder 	rbd_put_client(rbdc);
41390ddebc0cSAlex Elder err_out_args:
414078cea76eSAlex Elder 	if (ceph_opts)
414178cea76eSAlex Elder 		ceph_destroy_options(ceph_opts);
41424e9afebaSAlex Elder 	kfree(rbd_opts);
4143859c31dfSAlex Elder 	rbd_spec_put(spec);
4144bd4ba655SAlex Elder err_out_module:
4145bd4ba655SAlex Elder 	module_put(THIS_MODULE);
414627cc2594SAlex Elder 
4147602adf40SYehuda Sadeh 	dout("Error adding device %s\n", buf);
414827cc2594SAlex Elder 
414927cc2594SAlex Elder 	return (ssize_t) rc;
4150602adf40SYehuda Sadeh }
4151602adf40SYehuda Sadeh 
4152de71a297SAlex Elder static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
4153602adf40SYehuda Sadeh {
4154602adf40SYehuda Sadeh 	struct list_head *tmp;
4155602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev;
4156602adf40SYehuda Sadeh 
4157e124a82fSAlex Elder 	spin_lock(&rbd_dev_list_lock);
4158602adf40SYehuda Sadeh 	list_for_each(tmp, &rbd_dev_list) {
4159602adf40SYehuda Sadeh 		rbd_dev = list_entry(tmp, struct rbd_device, node);
4160de71a297SAlex Elder 		if (rbd_dev->dev_id == dev_id) {
4161e124a82fSAlex Elder 			spin_unlock(&rbd_dev_list_lock);
4162602adf40SYehuda Sadeh 			return rbd_dev;
4163602adf40SYehuda Sadeh 		}
4164e124a82fSAlex Elder 	}
4165e124a82fSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
4166602adf40SYehuda Sadeh 	return NULL;
4167602adf40SYehuda Sadeh }
4168602adf40SYehuda Sadeh 
4169dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev)
4170602adf40SYehuda Sadeh {
4171593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4172602adf40SYehuda Sadeh 
417359c2be1eSYehuda Sadeh 	if (rbd_dev->watch_event)
41749969ebc5SAlex Elder 		rbd_dev_header_watch_sync(rbd_dev, 0);
4175602adf40SYehuda Sadeh 
4176602adf40SYehuda Sadeh 	/* clean up and free blkdev */
4177602adf40SYehuda Sadeh 	rbd_free_disk(rbd_dev);
4178602adf40SYehuda Sadeh 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
417932eec68dSAlex Elder 
41802ac4e75dSAlex Elder 	/* release allocated disk header fields */
41812ac4e75dSAlex Elder 	rbd_header_free(&rbd_dev->header);
41822ac4e75dSAlex Elder 
418332eec68dSAlex Elder 	/* done with the id, and with the rbd_dev */
4184e2839308SAlex Elder 	rbd_dev_id_put(rbd_dev);
4185c53d5893SAlex Elder 	rbd_assert(rbd_dev->rbd_client != NULL);
4186c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
4187602adf40SYehuda Sadeh 
4188602adf40SYehuda Sadeh 	/* release module ref */
4189602adf40SYehuda Sadeh 	module_put(THIS_MODULE);
4190602adf40SYehuda Sadeh }
4191602adf40SYehuda Sadeh 
4192dfc5606dSYehuda Sadeh static ssize_t rbd_remove(struct bus_type *bus,
4193602adf40SYehuda Sadeh 			  const char *buf,
4194602adf40SYehuda Sadeh 			  size_t count)
4195602adf40SYehuda Sadeh {
4196602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
4197602adf40SYehuda Sadeh 	int target_id, rc;
4198602adf40SYehuda Sadeh 	unsigned long ul;
4199602adf40SYehuda Sadeh 	int ret = count;
4200602adf40SYehuda Sadeh 
4201602adf40SYehuda Sadeh 	rc = strict_strtoul(buf, 10, &ul);
4202602adf40SYehuda Sadeh 	if (rc)
4203602adf40SYehuda Sadeh 		return rc;
4204602adf40SYehuda Sadeh 
4205602adf40SYehuda Sadeh 	/* convert to int; abort if we lost anything in the conversion */
4206602adf40SYehuda Sadeh 	target_id = (int) ul;
4207602adf40SYehuda Sadeh 	if (target_id != ul)
4208602adf40SYehuda Sadeh 		return -EINVAL;
4209602adf40SYehuda Sadeh 
4210602adf40SYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
4211602adf40SYehuda Sadeh 
4212602adf40SYehuda Sadeh 	rbd_dev = __rbd_get_dev(target_id);
4213602adf40SYehuda Sadeh 	if (!rbd_dev) {
4214602adf40SYehuda Sadeh 		ret = -ENOENT;
4215602adf40SYehuda Sadeh 		goto done;
4216602adf40SYehuda Sadeh 	}
4217602adf40SYehuda Sadeh 
4218a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
4219b82d167bSAlex Elder 	if (rbd_dev->open_count)
422042382b70SAlex Elder 		ret = -EBUSY;
4221b82d167bSAlex Elder 	else
4222b82d167bSAlex Elder 		set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
4223a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
4224b82d167bSAlex Elder 	if (ret < 0)
422542382b70SAlex Elder 		goto done;
422642382b70SAlex Elder 
422741f38c2bSAlex Elder 	rbd_remove_all_snaps(rbd_dev);
4228dfc5606dSYehuda Sadeh 	rbd_bus_del_dev(rbd_dev);
4229602adf40SYehuda Sadeh 
4230602adf40SYehuda Sadeh done:
4231602adf40SYehuda Sadeh 	mutex_unlock(&ctl_mutex);
4232aafb230eSAlex Elder 
4233602adf40SYehuda Sadeh 	return ret;
4234602adf40SYehuda Sadeh }
4235602adf40SYehuda Sadeh 
4236602adf40SYehuda Sadeh /*
4237602adf40SYehuda Sadeh  * create control files in sysfs
4238dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
4239602adf40SYehuda Sadeh  */
4240602adf40SYehuda Sadeh static int rbd_sysfs_init(void)
4241602adf40SYehuda Sadeh {
4242dfc5606dSYehuda Sadeh 	int ret;
4243602adf40SYehuda Sadeh 
4244fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
4245dfc5606dSYehuda Sadeh 	if (ret < 0)
4246dfc5606dSYehuda Sadeh 		return ret;
4247602adf40SYehuda Sadeh 
4248fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
4249fed4c143SAlex Elder 	if (ret < 0)
4250fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
4251602adf40SYehuda Sadeh 
4252602adf40SYehuda Sadeh 	return ret;
4253602adf40SYehuda Sadeh }
4254602adf40SYehuda Sadeh 
4255602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void)
4256602adf40SYehuda Sadeh {
4257dfc5606dSYehuda Sadeh 	bus_unregister(&rbd_bus_type);
4258fed4c143SAlex Elder 	device_unregister(&rbd_root_dev);
4259602adf40SYehuda Sadeh }
4260602adf40SYehuda Sadeh 
4261cc344fa1SAlex Elder static int __init rbd_init(void)
4262602adf40SYehuda Sadeh {
4263602adf40SYehuda Sadeh 	int rc;
4264602adf40SYehuda Sadeh 
42651e32d34cSAlex Elder 	if (!libceph_compatible(NULL)) {
42661e32d34cSAlex Elder 		rbd_warn(NULL, "libceph incompatibility (quitting)");
42671e32d34cSAlex Elder 
42681e32d34cSAlex Elder 		return -EINVAL;
42691e32d34cSAlex Elder 	}
4270602adf40SYehuda Sadeh 	rc = rbd_sysfs_init();
4271602adf40SYehuda Sadeh 	if (rc)
4272602adf40SYehuda Sadeh 		return rc;
4273f0f8cef5SAlex Elder 	pr_info("loaded " RBD_DRV_NAME_LONG "\n");
4274602adf40SYehuda Sadeh 	return 0;
4275602adf40SYehuda Sadeh }
4276602adf40SYehuda Sadeh 
4277cc344fa1SAlex Elder static void __exit rbd_exit(void)
4278602adf40SYehuda Sadeh {
4279602adf40SYehuda Sadeh 	rbd_sysfs_cleanup();
4280602adf40SYehuda Sadeh }
4281602adf40SYehuda Sadeh 
4282602adf40SYehuda Sadeh module_init(rbd_init);
4283602adf40SYehuda Sadeh module_exit(rbd_exit);
4284602adf40SYehuda Sadeh 
4285602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
4286602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
4287602adf40SYehuda Sadeh MODULE_DESCRIPTION("rados block device");
4288602adf40SYehuda Sadeh 
4289602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */
4290602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
4291602adf40SYehuda Sadeh 
4292602adf40SYehuda Sadeh MODULE_LICENSE("GPL");
4293