xref: /openbmc/linux/drivers/block/rbd.c (revision b8f5c6ed)
1602adf40SYehuda Sadeh /*
2602adf40SYehuda Sadeh    rbd.c -- Export ceph rados objects as a Linux block device
3602adf40SYehuda Sadeh 
4602adf40SYehuda Sadeh 
5602adf40SYehuda Sadeh    based on drivers/block/osdblk.c:
6602adf40SYehuda Sadeh 
7602adf40SYehuda Sadeh    Copyright 2009 Red Hat, Inc.
8602adf40SYehuda Sadeh 
9602adf40SYehuda Sadeh    This program is free software; you can redistribute it and/or modify
10602adf40SYehuda Sadeh    it under the terms of the GNU General Public License as published by
11602adf40SYehuda Sadeh    the Free Software Foundation.
12602adf40SYehuda Sadeh 
13602adf40SYehuda Sadeh    This program is distributed in the hope that it will be useful,
14602adf40SYehuda Sadeh    but WITHOUT ANY WARRANTY; without even the implied warranty of
15602adf40SYehuda Sadeh    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16602adf40SYehuda Sadeh    GNU General Public License for more details.
17602adf40SYehuda Sadeh 
18602adf40SYehuda Sadeh    You should have received a copy of the GNU General Public License
19602adf40SYehuda Sadeh    along with this program; see the file COPYING.  If not, write to
20602adf40SYehuda Sadeh    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21602adf40SYehuda Sadeh 
22602adf40SYehuda Sadeh 
23602adf40SYehuda Sadeh 
24dfc5606dSYehuda Sadeh    For usage instructions, please refer to:
25602adf40SYehuda Sadeh 
26dfc5606dSYehuda Sadeh                  Documentation/ABI/testing/sysfs-bus-rbd
27602adf40SYehuda Sadeh 
28602adf40SYehuda Sadeh  */
29602adf40SYehuda Sadeh 
30602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
31602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
32602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3459c2be1eSYehuda Sadeh #include <linux/parser.h>
35602adf40SYehuda Sadeh 
36602adf40SYehuda Sadeh #include <linux/kernel.h>
37602adf40SYehuda Sadeh #include <linux/device.h>
38602adf40SYehuda Sadeh #include <linux/module.h>
39602adf40SYehuda Sadeh #include <linux/fs.h>
40602adf40SYehuda Sadeh #include <linux/blkdev.h>
41602adf40SYehuda Sadeh 
42602adf40SYehuda Sadeh #include "rbd_types.h"
43602adf40SYehuda Sadeh 
#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * Block I/O is expressed in sectors.  Linux interprets a sector in
 * several contexts (blk, bio, genhd), but the default size is
 * universally 512 bytes.  These names are just slightly more
 * meaningful than the bare numbers they stand for.
 */
#define SECTOR_SHIFT	9
#define SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/*
 * Largest value a u64 can hold.  It might be useful to have this
 * defined elsewhere too, so only provide it here when no other
 * header has already done so (avoids a macro redefinition).
 */
#ifndef U64_MAX
#define U64_MAX		((u64) (~0ULL))
#endif

#define RBD_DRV_NAME		"rbd"
#define RBD_DRV_NAME_LONG	"rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256	/* max minors per blkdev */

/* Snapshot name length is bounded by NAME_MAX less the "snap_" prefix */
#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN \
	(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */
#define RBD_MAX_OPT_LEN		1024

/* Snapshot name that denotes the image head rather than a snapshot */
#define RBD_SNAP_HEAD_NAME	"-"

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

/* Feature bits */

#define RBD_FEATURE_LAYERING	1

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_ALL	(0)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)

#define RBD_READ_ONLY_DEFAULT	false
9759c2be1eSYehuda Sadeh 
98602adf40SYehuda Sadeh /*
99602adf40SYehuda Sadeh  * block device image metadata (in-memory version)
100602adf40SYehuda Sadeh  */
101602adf40SYehuda Sadeh struct rbd_image_header {
102f84344f3SAlex Elder 	/* These four fields never change for a given rbd image */
103849b4260SAlex Elder 	char *object_prefix;
10434b13184SAlex Elder 	u64 features;
105602adf40SYehuda Sadeh 	__u8 obj_order;
106602adf40SYehuda Sadeh 	__u8 crypt_type;
107602adf40SYehuda Sadeh 	__u8 comp_type;
108602adf40SYehuda Sadeh 
109f84344f3SAlex Elder 	/* The remaining fields need to be updated occasionally */
110f84344f3SAlex Elder 	u64 image_size;
111f84344f3SAlex Elder 	struct ceph_snap_context *snapc;
112602adf40SYehuda Sadeh 	char *snap_names;
113602adf40SYehuda Sadeh 	u64 *snap_sizes;
11459c2be1eSYehuda Sadeh 
11559c2be1eSYehuda Sadeh 	u64 obj_version;
11659c2be1eSYehuda Sadeh };
11759c2be1eSYehuda Sadeh 
1180d7dbfceSAlex Elder /*
1190d7dbfceSAlex Elder  * An rbd image specification.
1200d7dbfceSAlex Elder  *
1210d7dbfceSAlex Elder  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
1220d7dbfceSAlex Elder  * identify an image.
1230d7dbfceSAlex Elder  */
1240d7dbfceSAlex Elder struct rbd_spec {
1250d7dbfceSAlex Elder 	u64		pool_id;
1260d7dbfceSAlex Elder 	char		*pool_name;
1270d7dbfceSAlex Elder 
1280d7dbfceSAlex Elder 	char		*image_id;
1290d7dbfceSAlex Elder 	size_t		image_id_len;
1300d7dbfceSAlex Elder 	char		*image_name;
1310d7dbfceSAlex Elder 	size_t		image_name_len;
1320d7dbfceSAlex Elder 
1330d7dbfceSAlex Elder 	u64		snap_id;
1340d7dbfceSAlex Elder 	char		*snap_name;
1350d7dbfceSAlex Elder 
1360d7dbfceSAlex Elder 	struct kref	kref;
1370d7dbfceSAlex Elder };
1380d7dbfceSAlex Elder 
13959c2be1eSYehuda Sadeh struct rbd_options {
140cc0538b6SAlex Elder 	bool	read_only;
141602adf40SYehuda Sadeh };
142602adf40SYehuda Sadeh 
143602adf40SYehuda Sadeh /*
144f0f8cef5SAlex Elder  * an instance of the client.  multiple devices may share an rbd client.
145602adf40SYehuda Sadeh  */
146602adf40SYehuda Sadeh struct rbd_client {
147602adf40SYehuda Sadeh 	struct ceph_client	*client;
148602adf40SYehuda Sadeh 	struct kref		kref;
149602adf40SYehuda Sadeh 	struct list_head	node;
150602adf40SYehuda Sadeh };
151602adf40SYehuda Sadeh 
152602adf40SYehuda Sadeh /*
153f0f8cef5SAlex Elder  * a request completion status
154602adf40SYehuda Sadeh  */
1551fec7093SYehuda Sadeh struct rbd_req_status {
1561fec7093SYehuda Sadeh 	int done;
1571fec7093SYehuda Sadeh 	int rc;
1581fec7093SYehuda Sadeh 	u64 bytes;
1591fec7093SYehuda Sadeh };
1601fec7093SYehuda Sadeh 
1611fec7093SYehuda Sadeh /*
1621fec7093SYehuda Sadeh  * a collection of requests
1631fec7093SYehuda Sadeh  */
1641fec7093SYehuda Sadeh struct rbd_req_coll {
1651fec7093SYehuda Sadeh 	int			total;
1661fec7093SYehuda Sadeh 	int			num_done;
1671fec7093SYehuda Sadeh 	struct kref		kref;
1681fec7093SYehuda Sadeh 	struct rbd_req_status	status[0];
169602adf40SYehuda Sadeh };
170602adf40SYehuda Sadeh 
171f0f8cef5SAlex Elder /*
172f0f8cef5SAlex Elder  * a single io request
173f0f8cef5SAlex Elder  */
174f0f8cef5SAlex Elder struct rbd_request {
175f0f8cef5SAlex Elder 	struct request		*rq;		/* blk layer request */
176f0f8cef5SAlex Elder 	struct bio		*bio;		/* cloned bio */
177f0f8cef5SAlex Elder 	struct page		**pages;	/* list of used pages */
178f0f8cef5SAlex Elder 	u64			len;
179f0f8cef5SAlex Elder 	int			coll_index;
180f0f8cef5SAlex Elder 	struct rbd_req_coll	*coll;
181f0f8cef5SAlex Elder };
182f0f8cef5SAlex Elder 
183dfc5606dSYehuda Sadeh struct rbd_snap {
184dfc5606dSYehuda Sadeh 	struct	device		dev;
185dfc5606dSYehuda Sadeh 	const char		*name;
1863591538fSJosh Durgin 	u64			size;
187dfc5606dSYehuda Sadeh 	struct list_head	node;
188dfc5606dSYehuda Sadeh 	u64			id;
18934b13184SAlex Elder 	u64			features;
190dfc5606dSYehuda Sadeh };
191dfc5606dSYehuda Sadeh 
192f84344f3SAlex Elder struct rbd_mapping {
19399c1f08fSAlex Elder 	u64                     size;
19434b13184SAlex Elder 	u64                     features;
195f84344f3SAlex Elder 	bool			read_only;
196f84344f3SAlex Elder };
197f84344f3SAlex Elder 
198602adf40SYehuda Sadeh /*
199602adf40SYehuda Sadeh  * a single device
200602adf40SYehuda Sadeh  */
201602adf40SYehuda Sadeh struct rbd_device {
202de71a297SAlex Elder 	int			dev_id;		/* blkdev unique id */
203602adf40SYehuda Sadeh 
204602adf40SYehuda Sadeh 	int			major;		/* blkdev assigned major */
205602adf40SYehuda Sadeh 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
206602adf40SYehuda Sadeh 
207a30b71b9SAlex Elder 	u32			image_format;	/* Either 1 or 2 */
208602adf40SYehuda Sadeh 	struct rbd_client	*rbd_client;
209602adf40SYehuda Sadeh 
210602adf40SYehuda Sadeh 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
211602adf40SYehuda Sadeh 
212602adf40SYehuda Sadeh 	spinlock_t		lock;		/* queue lock */
213602adf40SYehuda Sadeh 
214602adf40SYehuda Sadeh 	struct rbd_image_header	header;
215daba5fdbSAlex Elder 	bool                    exists;
2160d7dbfceSAlex Elder 	struct rbd_spec		*spec;
217602adf40SYehuda Sadeh 
2180d7dbfceSAlex Elder 	char			*header_name;
219971f839aSAlex Elder 
22059c2be1eSYehuda Sadeh 	struct ceph_osd_event   *watch_event;
22159c2be1eSYehuda Sadeh 	struct ceph_osd_request *watch_request;
22259c2be1eSYehuda Sadeh 
22386b00e0dSAlex Elder 	struct rbd_spec		*parent_spec;
22486b00e0dSAlex Elder 	u64			parent_overlap;
22586b00e0dSAlex Elder 
226c666601aSJosh Durgin 	/* protects updating the header */
227c666601aSJosh Durgin 	struct rw_semaphore     header_rwsem;
228f84344f3SAlex Elder 
229f84344f3SAlex Elder 	struct rbd_mapping	mapping;
230602adf40SYehuda Sadeh 
231602adf40SYehuda Sadeh 	struct list_head	node;
232dfc5606dSYehuda Sadeh 
233dfc5606dSYehuda Sadeh 	/* list of snapshots */
234dfc5606dSYehuda Sadeh 	struct list_head	snaps;
235dfc5606dSYehuda Sadeh 
236dfc5606dSYehuda Sadeh 	/* sysfs related */
237dfc5606dSYehuda Sadeh 	struct device		dev;
23842382b70SAlex Elder 	unsigned long		open_count;
239dfc5606dSYehuda Sadeh };
240dfc5606dSYehuda Sadeh 
241602adf40SYehuda Sadeh static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */
242e124a82fSAlex Elder 
243602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list);    /* devices */
244e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock);
245e124a82fSAlex Elder 
246602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list);		/* clients */
247432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock);
248602adf40SYehuda Sadeh 
249304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
250304f6808SAlex Elder static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
251304f6808SAlex Elder 
252dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev);
25341f38c2bSAlex Elder static void rbd_remove_snap_dev(struct rbd_snap *snap);
254dfc5606dSYehuda Sadeh 
255f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf,
256f0f8cef5SAlex Elder 		       size_t count);
257f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
258f0f8cef5SAlex Elder 			  size_t count);
259f0f8cef5SAlex Elder 
260f0f8cef5SAlex Elder static struct bus_attribute rbd_bus_attrs[] = {
261f0f8cef5SAlex Elder 	__ATTR(add, S_IWUSR, NULL, rbd_add),
262f0f8cef5SAlex Elder 	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
263f0f8cef5SAlex Elder 	__ATTR_NULL
264f0f8cef5SAlex Elder };
265f0f8cef5SAlex Elder 
266f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = {
267f0f8cef5SAlex Elder 	.name		= "rbd",
268f0f8cef5SAlex Elder 	.bus_attrs	= rbd_bus_attrs,
269f0f8cef5SAlex Elder };
270f0f8cef5SAlex Elder 
/*
 * Release callback for rbd_root_dev.  That device is a static object,
 * so there is nothing to free here.
 */
static void rbd_root_dev_release(struct device *dev)
{
}
274f0f8cef5SAlex Elder 
275f0f8cef5SAlex Elder static struct device rbd_root_dev = {
276f0f8cef5SAlex Elder 	.init_name =    "rbd",
277f0f8cef5SAlex Elder 	.release =      rbd_root_dev_release,
278f0f8cef5SAlex Elder };
279f0f8cef5SAlex Elder 
/*
 * rbd_assert(expr) - driver-local assertion.
 *
 * With RBD_DEBUG defined, a false @expr logs the function name, line
 * number and the text of the expression, then calls BUG().  Without
 * RBD_DEBUG the macro expands to a no-op and @expr is not evaluated.
 *
 * The debug form is wrapped in do { } while (0) so that it behaves as
 * a single statement: it needs a trailing semicolon (as the no-op form
 * always did) and cannot capture the "else" of an enclosing unbraced
 * "if".
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
292dfc5606dSYehuda Sadeh 
293dfc5606dSYehuda Sadeh static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
294dfc5606dSYehuda Sadeh {
295dfc5606dSYehuda Sadeh 	return get_device(&rbd_dev->dev);
296dfc5606dSYehuda Sadeh }
297dfc5606dSYehuda Sadeh 
298dfc5606dSYehuda Sadeh static void rbd_put_dev(struct rbd_device *rbd_dev)
299dfc5606dSYehuda Sadeh {
300dfc5606dSYehuda Sadeh 	put_device(&rbd_dev->dev);
301dfc5606dSYehuda Sadeh }
302602adf40SYehuda Sadeh 
303117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
304117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
30559c2be1eSYehuda Sadeh 
306602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
307602adf40SYehuda Sadeh {
308f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
309602adf40SYehuda Sadeh 
310f84344f3SAlex Elder 	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
311602adf40SYehuda Sadeh 		return -EROFS;
312602adf40SYehuda Sadeh 
31342382b70SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
314340c7a2bSAlex Elder 	rbd_get_dev(rbd_dev);
315f84344f3SAlex Elder 	set_device_ro(bdev, rbd_dev->mapping.read_only);
31642382b70SAlex Elder 	rbd_dev->open_count++;
31742382b70SAlex Elder 	mutex_unlock(&ctl_mutex);
318340c7a2bSAlex Elder 
319602adf40SYehuda Sadeh 	return 0;
320602adf40SYehuda Sadeh }
321602adf40SYehuda Sadeh 
322dfc5606dSYehuda Sadeh static int rbd_release(struct gendisk *disk, fmode_t mode)
323dfc5606dSYehuda Sadeh {
324dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
325dfc5606dSYehuda Sadeh 
32642382b70SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
32742382b70SAlex Elder 	rbd_assert(rbd_dev->open_count > 0);
32842382b70SAlex Elder 	rbd_dev->open_count--;
329dfc5606dSYehuda Sadeh 	rbd_put_dev(rbd_dev);
33042382b70SAlex Elder 	mutex_unlock(&ctl_mutex);
331dfc5606dSYehuda Sadeh 
332dfc5606dSYehuda Sadeh 	return 0;
333dfc5606dSYehuda Sadeh }
334dfc5606dSYehuda Sadeh 
335602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = {
336602adf40SYehuda Sadeh 	.owner			= THIS_MODULE,
337602adf40SYehuda Sadeh 	.open			= rbd_open,
338dfc5606dSYehuda Sadeh 	.release		= rbd_release,
339602adf40SYehuda Sadeh };
340602adf40SYehuda Sadeh 
341602adf40SYehuda Sadeh /*
342602adf40SYehuda Sadeh  * Initialize an rbd client instance.
34343ae4701SAlex Elder  * We own *ceph_opts.
344602adf40SYehuda Sadeh  */
345f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
346602adf40SYehuda Sadeh {
347602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
348602adf40SYehuda Sadeh 	int ret = -ENOMEM;
349602adf40SYehuda Sadeh 
350602adf40SYehuda Sadeh 	dout("rbd_client_create\n");
351602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
352602adf40SYehuda Sadeh 	if (!rbdc)
353602adf40SYehuda Sadeh 		goto out_opt;
354602adf40SYehuda Sadeh 
355602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
356602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
357602adf40SYehuda Sadeh 
358bc534d86SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
359bc534d86SAlex Elder 
36043ae4701SAlex Elder 	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
361602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
362bc534d86SAlex Elder 		goto out_mutex;
36343ae4701SAlex Elder 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
364602adf40SYehuda Sadeh 
365602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
366602adf40SYehuda Sadeh 	if (ret < 0)
367602adf40SYehuda Sadeh 		goto out_err;
368602adf40SYehuda Sadeh 
369432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
370602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
371432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
372602adf40SYehuda Sadeh 
373bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
374bc534d86SAlex Elder 
375602adf40SYehuda Sadeh 	dout("rbd_client_create created %p\n", rbdc);
376602adf40SYehuda Sadeh 	return rbdc;
377602adf40SYehuda Sadeh 
378602adf40SYehuda Sadeh out_err:
379602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
380bc534d86SAlex Elder out_mutex:
381bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
382602adf40SYehuda Sadeh 	kfree(rbdc);
383602adf40SYehuda Sadeh out_opt:
38443ae4701SAlex Elder 	if (ceph_opts)
38543ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
38628f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
387602adf40SYehuda Sadeh }
388602adf40SYehuda Sadeh 
389602adf40SYehuda Sadeh /*
3901f7ba331SAlex Elder  * Find a ceph client with specific addr and configuration.  If
3911f7ba331SAlex Elder  * found, bump its reference count.
392602adf40SYehuda Sadeh  */
3931f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
394602adf40SYehuda Sadeh {
395602adf40SYehuda Sadeh 	struct rbd_client *client_node;
3961f7ba331SAlex Elder 	bool found = false;
397602adf40SYehuda Sadeh 
39843ae4701SAlex Elder 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
399602adf40SYehuda Sadeh 		return NULL;
400602adf40SYehuda Sadeh 
4011f7ba331SAlex Elder 	spin_lock(&rbd_client_list_lock);
4021f7ba331SAlex Elder 	list_for_each_entry(client_node, &rbd_client_list, node) {
4031f7ba331SAlex Elder 		if (!ceph_compare_options(ceph_opts, client_node->client)) {
4041f7ba331SAlex Elder 			kref_get(&client_node->kref);
4051f7ba331SAlex Elder 			found = true;
4061f7ba331SAlex Elder 			break;
4071f7ba331SAlex Elder 		}
4081f7ba331SAlex Elder 	}
4091f7ba331SAlex Elder 	spin_unlock(&rbd_client_list_lock);
4101f7ba331SAlex Elder 
4111f7ba331SAlex Elder 	return found ? client_node : NULL;
412602adf40SYehuda Sadeh }
413602adf40SYehuda Sadeh 
/*
 * Mount option tokens.
 *
 * Tokens are grouped by argument type; each Opt_last_* entry is a
 * sentinel closing its group, which parse_rbd_opts_token() uses to
 * classify a token by range.
 */
enum {
	/* (no int-valued options at present) */
	Opt_last_int,
	/* int args above */

	/* (no string-valued options at present) */
	Opt_last_string,
	/* string args above */

	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};
42759c2be1eSYehuda Sadeh 
42843ae4701SAlex Elder static match_table_t rbd_opts_tokens = {
42959c2be1eSYehuda Sadeh 	/* int args above */
43059c2be1eSYehuda Sadeh 	/* string args above */
431be466c1cSAlex Elder 	{Opt_read_only, "read_only"},
432cc0538b6SAlex Elder 	{Opt_read_only, "ro"},		/* Alternate spelling */
433cc0538b6SAlex Elder 	{Opt_read_write, "read_write"},
434cc0538b6SAlex Elder 	{Opt_read_write, "rw"},		/* Alternate spelling */
435cc0538b6SAlex Elder 	/* Boolean args above */
43659c2be1eSYehuda Sadeh 	{-1, NULL}
43759c2be1eSYehuda Sadeh };
43859c2be1eSYehuda Sadeh 
43959c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
44059c2be1eSYehuda Sadeh {
44143ae4701SAlex Elder 	struct rbd_options *rbd_opts = private;
44259c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
44359c2be1eSYehuda Sadeh 	int token, intval, ret;
44459c2be1eSYehuda Sadeh 
44543ae4701SAlex Elder 	token = match_token(c, rbd_opts_tokens, argstr);
44659c2be1eSYehuda Sadeh 	if (token < 0)
44759c2be1eSYehuda Sadeh 		return -EINVAL;
44859c2be1eSYehuda Sadeh 
44959c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
45059c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
45159c2be1eSYehuda Sadeh 		if (ret < 0) {
45259c2be1eSYehuda Sadeh 			pr_err("bad mount option arg (not int) "
45359c2be1eSYehuda Sadeh 			       "at '%s'\n", c);
45459c2be1eSYehuda Sadeh 			return ret;
45559c2be1eSYehuda Sadeh 		}
45659c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
45759c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
45859c2be1eSYehuda Sadeh 		dout("got string token %d val %s\n", token,
45959c2be1eSYehuda Sadeh 		     argstr[0].from);
460cc0538b6SAlex Elder 	} else if (token > Opt_last_string && token < Opt_last_bool) {
461cc0538b6SAlex Elder 		dout("got Boolean token %d\n", token);
46259c2be1eSYehuda Sadeh 	} else {
46359c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
46459c2be1eSYehuda Sadeh 	}
46559c2be1eSYehuda Sadeh 
46659c2be1eSYehuda Sadeh 	switch (token) {
467cc0538b6SAlex Elder 	case Opt_read_only:
468cc0538b6SAlex Elder 		rbd_opts->read_only = true;
469cc0538b6SAlex Elder 		break;
470cc0538b6SAlex Elder 	case Opt_read_write:
471cc0538b6SAlex Elder 		rbd_opts->read_only = false;
472cc0538b6SAlex Elder 		break;
47359c2be1eSYehuda Sadeh 	default:
474aafb230eSAlex Elder 		rbd_assert(false);
475aafb230eSAlex Elder 		break;
47659c2be1eSYehuda Sadeh 	}
47759c2be1eSYehuda Sadeh 	return 0;
47859c2be1eSYehuda Sadeh }
47959c2be1eSYehuda Sadeh 
/*
 * Get an rbd client matching @ceph_opts, creating one if no existing
 * client can be shared.
 *
 * @ceph_opts is consumed either way: destroyed here when an existing
 * client is reused, or handed to rbd_client_create() otherwise.
 *
 * Returns the client, or the ERR_PTR() from rbd_client_create().
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc = rbd_client_find(ceph_opts);

	if (!rbdc)
		return rbd_client_create(ceph_opts);

	/* using an existing client */
	ceph_destroy_options(ceph_opts);

	return rbdc;
}
496602adf40SYehuda Sadeh 
497602adf40SYehuda Sadeh /*
498602adf40SYehuda Sadeh  * Destroy ceph client
499d23a4b3fSAlex Elder  *
500432b8587SAlex Elder  * Caller must hold rbd_client_list_lock.
501602adf40SYehuda Sadeh  */
502602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
503602adf40SYehuda Sadeh {
504602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
505602adf40SYehuda Sadeh 
506602adf40SYehuda Sadeh 	dout("rbd_release_client %p\n", rbdc);
507cd9d9f5dSAlex Elder 	spin_lock(&rbd_client_list_lock);
508602adf40SYehuda Sadeh 	list_del(&rbdc->node);
509cd9d9f5dSAlex Elder 	spin_unlock(&rbd_client_list_lock);
510602adf40SYehuda Sadeh 
511602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
512602adf40SYehuda Sadeh 	kfree(rbdc);
513602adf40SYehuda Sadeh }
514602adf40SYehuda Sadeh 
515602adf40SYehuda Sadeh /*
516602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
517602adf40SYehuda Sadeh  * it.
518602adf40SYehuda Sadeh  */
5199d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc)
520602adf40SYehuda Sadeh {
521c53d5893SAlex Elder 	if (rbdc)
5229d3997fdSAlex Elder 		kref_put(&rbdc->kref, rbd_client_release);
523602adf40SYehuda Sadeh }
524602adf40SYehuda Sadeh 
5251fec7093SYehuda Sadeh /*
5261fec7093SYehuda Sadeh  * Destroy requests collection
5271fec7093SYehuda Sadeh  */
5281fec7093SYehuda Sadeh static void rbd_coll_release(struct kref *kref)
5291fec7093SYehuda Sadeh {
5301fec7093SYehuda Sadeh 	struct rbd_req_coll *coll =
5311fec7093SYehuda Sadeh 		container_of(kref, struct rbd_req_coll, kref);
5321fec7093SYehuda Sadeh 
5331fec7093SYehuda Sadeh 	dout("rbd_coll_release %p\n", coll);
5341fec7093SYehuda Sadeh 	kfree(coll);
5351fec7093SYehuda Sadeh }
536602adf40SYehuda Sadeh 
537a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format)
538a30b71b9SAlex Elder {
539a30b71b9SAlex Elder 	return image_format == 1 || image_format == 2;
540a30b71b9SAlex Elder }
541a30b71b9SAlex Elder 
5428e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
5438e94af8eSAlex Elder {
544103a150fSAlex Elder 	size_t size;
545103a150fSAlex Elder 	u32 snap_count;
546103a150fSAlex Elder 
547103a150fSAlex Elder 	/* The header has to start with the magic rbd header text */
548103a150fSAlex Elder 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
549103a150fSAlex Elder 		return false;
550103a150fSAlex Elder 
551db2388b6SAlex Elder 	/* The bio layer requires at least sector-sized I/O */
552db2388b6SAlex Elder 
553db2388b6SAlex Elder 	if (ondisk->options.order < SECTOR_SHIFT)
554db2388b6SAlex Elder 		return false;
555db2388b6SAlex Elder 
556db2388b6SAlex Elder 	/* If we use u64 in a few spots we may be able to loosen this */
557db2388b6SAlex Elder 
558db2388b6SAlex Elder 	if (ondisk->options.order > 8 * sizeof (int) - 1)
559db2388b6SAlex Elder 		return false;
560db2388b6SAlex Elder 
561103a150fSAlex Elder 	/*
562103a150fSAlex Elder 	 * The size of a snapshot header has to fit in a size_t, and
563103a150fSAlex Elder 	 * that limits the number of snapshots.
564103a150fSAlex Elder 	 */
565103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
566103a150fSAlex Elder 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
567103a150fSAlex Elder 	if (snap_count > size / sizeof (__le64))
568103a150fSAlex Elder 		return false;
569103a150fSAlex Elder 
570103a150fSAlex Elder 	/*
571103a150fSAlex Elder 	 * Not only that, but the size of the entire the snapshot
572103a150fSAlex Elder 	 * header must also be representable in a size_t.
573103a150fSAlex Elder 	 */
574103a150fSAlex Elder 	size -= snap_count * sizeof (__le64);
575103a150fSAlex Elder 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
576103a150fSAlex Elder 		return false;
577103a150fSAlex Elder 
578103a150fSAlex Elder 	return true;
5798e94af8eSAlex Elder }
5808e94af8eSAlex Elder 
581602adf40SYehuda Sadeh /*
582602adf40SYehuda Sadeh  * Create a new header structure, translate header format from the on-disk
583602adf40SYehuda Sadeh  * header.
584602adf40SYehuda Sadeh  */
585602adf40SYehuda Sadeh static int rbd_header_from_disk(struct rbd_image_header *header,
5864156d998SAlex Elder 				 struct rbd_image_header_ondisk *ondisk)
587602adf40SYehuda Sadeh {
588ccece235SAlex Elder 	u32 snap_count;
58958c17b0eSAlex Elder 	size_t len;
590d2bb24e5SAlex Elder 	size_t size;
591621901d6SAlex Elder 	u32 i;
592602adf40SYehuda Sadeh 
5936a52325fSAlex Elder 	memset(header, 0, sizeof (*header));
5946a52325fSAlex Elder 
595103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
596103a150fSAlex Elder 
59758c17b0eSAlex Elder 	len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
59858c17b0eSAlex Elder 	header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
5996a52325fSAlex Elder 	if (!header->object_prefix)
600602adf40SYehuda Sadeh 		return -ENOMEM;
60158c17b0eSAlex Elder 	memcpy(header->object_prefix, ondisk->object_prefix, len);
60258c17b0eSAlex Elder 	header->object_prefix[len] = '\0';
60300f1f36fSAlex Elder 
604602adf40SYehuda Sadeh 	if (snap_count) {
605f785cc1dSAlex Elder 		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
606f785cc1dSAlex Elder 
607621901d6SAlex Elder 		/* Save a copy of the snapshot names */
608621901d6SAlex Elder 
609f785cc1dSAlex Elder 		if (snap_names_len > (u64) SIZE_MAX)
610f785cc1dSAlex Elder 			return -EIO;
611f785cc1dSAlex Elder 		header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
612602adf40SYehuda Sadeh 		if (!header->snap_names)
6136a52325fSAlex Elder 			goto out_err;
614f785cc1dSAlex Elder 		/*
615f785cc1dSAlex Elder 		 * Note that rbd_dev_v1_header_read() guarantees
616f785cc1dSAlex Elder 		 * the ondisk buffer we're working with has
617f785cc1dSAlex Elder 		 * snap_names_len bytes beyond the end of the
618f785cc1dSAlex Elder 		 * snapshot id array, this memcpy() is safe.
619f785cc1dSAlex Elder 		 */
620f785cc1dSAlex Elder 		memcpy(header->snap_names, &ondisk->snaps[snap_count],
621f785cc1dSAlex Elder 			snap_names_len);
6226a52325fSAlex Elder 
623621901d6SAlex Elder 		/* Record each snapshot's size */
624621901d6SAlex Elder 
625d2bb24e5SAlex Elder 		size = snap_count * sizeof (*header->snap_sizes);
626d2bb24e5SAlex Elder 		header->snap_sizes = kmalloc(size, GFP_KERNEL);
627602adf40SYehuda Sadeh 		if (!header->snap_sizes)
6286a52325fSAlex Elder 			goto out_err;
629621901d6SAlex Elder 		for (i = 0; i < snap_count; i++)
630621901d6SAlex Elder 			header->snap_sizes[i] =
631621901d6SAlex Elder 				le64_to_cpu(ondisk->snaps[i].image_size);
632602adf40SYehuda Sadeh 	} else {
633ccece235SAlex Elder 		WARN_ON(ondisk->snap_names_len);
634602adf40SYehuda Sadeh 		header->snap_names = NULL;
635602adf40SYehuda Sadeh 		header->snap_sizes = NULL;
636602adf40SYehuda Sadeh 	}
637849b4260SAlex Elder 
63834b13184SAlex Elder 	header->features = 0;	/* No features support in v1 images */
639602adf40SYehuda Sadeh 	header->obj_order = ondisk->options.order;
640602adf40SYehuda Sadeh 	header->crypt_type = ondisk->options.crypt_type;
641602adf40SYehuda Sadeh 	header->comp_type = ondisk->options.comp_type;
6426a52325fSAlex Elder 
643621901d6SAlex Elder 	/* Allocate and fill in the snapshot context */
644621901d6SAlex Elder 
645f84344f3SAlex Elder 	header->image_size = le64_to_cpu(ondisk->image_size);
6466a52325fSAlex Elder 	size = sizeof (struct ceph_snap_context);
6476a52325fSAlex Elder 	size += snap_count * sizeof (header->snapc->snaps[0]);
6486a52325fSAlex Elder 	header->snapc = kzalloc(size, GFP_KERNEL);
6496a52325fSAlex Elder 	if (!header->snapc)
6506a52325fSAlex Elder 		goto out_err;
651602adf40SYehuda Sadeh 
652602adf40SYehuda Sadeh 	atomic_set(&header->snapc->nref, 1);
653505cbb9bSAlex Elder 	header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
654602adf40SYehuda Sadeh 	header->snapc->num_snaps = snap_count;
655621901d6SAlex Elder 	for (i = 0; i < snap_count; i++)
656602adf40SYehuda Sadeh 		header->snapc->snaps[i] =
657602adf40SYehuda Sadeh 			le64_to_cpu(ondisk->snaps[i].id);
658602adf40SYehuda Sadeh 
659602adf40SYehuda Sadeh 	return 0;
660602adf40SYehuda Sadeh 
6616a52325fSAlex Elder out_err:
662849b4260SAlex Elder 	kfree(header->snap_sizes);
663ccece235SAlex Elder 	header->snap_sizes = NULL;
664602adf40SYehuda Sadeh 	kfree(header->snap_names);
665ccece235SAlex Elder 	header->snap_names = NULL;
6666a52325fSAlex Elder 	kfree(header->object_prefix);
6676a52325fSAlex Elder 	header->object_prefix = NULL;
668ccece235SAlex Elder 
66900f1f36fSAlex Elder 	return -ENOMEM;
670602adf40SYehuda Sadeh }
671602adf40SYehuda Sadeh 
6729e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
6739e15b77dSAlex Elder {
6749e15b77dSAlex Elder 	struct rbd_snap *snap;
6759e15b77dSAlex Elder 
6769e15b77dSAlex Elder 	if (snap_id == CEPH_NOSNAP)
6779e15b77dSAlex Elder 		return RBD_SNAP_HEAD_NAME;
6789e15b77dSAlex Elder 
6799e15b77dSAlex Elder 	list_for_each_entry(snap, &rbd_dev->snaps, node)
6809e15b77dSAlex Elder 		if (snap_id == snap->id)
6819e15b77dSAlex Elder 			return snap->name;
6829e15b77dSAlex Elder 
6839e15b77dSAlex Elder 	return NULL;
6849e15b77dSAlex Elder }
6859e15b77dSAlex Elder 
6868836b995SAlex Elder static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
687602adf40SYehuda Sadeh {
688602adf40SYehuda Sadeh 
689e86924a8SAlex Elder 	struct rbd_snap *snap;
69000f1f36fSAlex Elder 
691e86924a8SAlex Elder 	list_for_each_entry(snap, &rbd_dev->snaps, node) {
692e86924a8SAlex Elder 		if (!strcmp(snap_name, snap->name)) {
6930d7dbfceSAlex Elder 			rbd_dev->spec->snap_id = snap->id;
694e86924a8SAlex Elder 			rbd_dev->mapping.size = snap->size;
69534b13184SAlex Elder 			rbd_dev->mapping.features = snap->features;
69600f1f36fSAlex Elder 
697e86924a8SAlex Elder 			return 0;
698602adf40SYehuda Sadeh 		}
69900f1f36fSAlex Elder 	}
700e86924a8SAlex Elder 
70100f1f36fSAlex Elder 	return -ENOENT;
70200f1f36fSAlex Elder }
703602adf40SYehuda Sadeh 
704819d52bfSAlex Elder static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
705602adf40SYehuda Sadeh {
70678dc447dSAlex Elder 	int ret;
707602adf40SYehuda Sadeh 
7080d7dbfceSAlex Elder 	if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
709cc9d734cSJosh Durgin 		    sizeof (RBD_SNAP_HEAD_NAME))) {
7100d7dbfceSAlex Elder 		rbd_dev->spec->snap_id = CEPH_NOSNAP;
71199c1f08fSAlex Elder 		rbd_dev->mapping.size = rbd_dev->header.image_size;
71234b13184SAlex Elder 		rbd_dev->mapping.features = rbd_dev->header.features;
713e86924a8SAlex Elder 		ret = 0;
714602adf40SYehuda Sadeh 	} else {
7150d7dbfceSAlex Elder 		ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
716602adf40SYehuda Sadeh 		if (ret < 0)
717602adf40SYehuda Sadeh 			goto done;
718f84344f3SAlex Elder 		rbd_dev->mapping.read_only = true;
719602adf40SYehuda Sadeh 	}
720daba5fdbSAlex Elder 	rbd_dev->exists = true;
721602adf40SYehuda Sadeh done:
722602adf40SYehuda Sadeh 	return ret;
723602adf40SYehuda Sadeh }
724602adf40SYehuda Sadeh 
725602adf40SYehuda Sadeh static void rbd_header_free(struct rbd_image_header *header)
726602adf40SYehuda Sadeh {
727849b4260SAlex Elder 	kfree(header->object_prefix);
728d78fd7aeSAlex Elder 	header->object_prefix = NULL;
729602adf40SYehuda Sadeh 	kfree(header->snap_sizes);
730d78fd7aeSAlex Elder 	header->snap_sizes = NULL;
731849b4260SAlex Elder 	kfree(header->snap_names);
732d78fd7aeSAlex Elder 	header->snap_names = NULL;
733d1d25646SJosh Durgin 	ceph_put_snap_context(header->snapc);
734d78fd7aeSAlex Elder 	header->snapc = NULL;
735602adf40SYehuda Sadeh }
736602adf40SYehuda Sadeh 
73765ccfe21SAlex Elder static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
738602adf40SYehuda Sadeh {
73965ccfe21SAlex Elder 	char *name;
74065ccfe21SAlex Elder 	u64 segment;
74165ccfe21SAlex Elder 	int ret;
742602adf40SYehuda Sadeh 
7432fd82b9eSAlex Elder 	name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
74465ccfe21SAlex Elder 	if (!name)
74565ccfe21SAlex Elder 		return NULL;
74665ccfe21SAlex Elder 	segment = offset >> rbd_dev->header.obj_order;
7472fd82b9eSAlex Elder 	ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
74865ccfe21SAlex Elder 			rbd_dev->header.object_prefix, segment);
7492fd82b9eSAlex Elder 	if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
75065ccfe21SAlex Elder 		pr_err("error formatting segment name for #%llu (%d)\n",
75165ccfe21SAlex Elder 			segment, ret);
75265ccfe21SAlex Elder 		kfree(name);
75365ccfe21SAlex Elder 		name = NULL;
75465ccfe21SAlex Elder 	}
755602adf40SYehuda Sadeh 
75665ccfe21SAlex Elder 	return name;
75765ccfe21SAlex Elder }
758602adf40SYehuda Sadeh 
75965ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
76065ccfe21SAlex Elder {
76165ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
762602adf40SYehuda Sadeh 
76365ccfe21SAlex Elder 	return offset & (segment_size - 1);
76465ccfe21SAlex Elder }
76565ccfe21SAlex Elder 
76665ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev,
76765ccfe21SAlex Elder 				u64 offset, u64 length)
76865ccfe21SAlex Elder {
76965ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
77065ccfe21SAlex Elder 
77165ccfe21SAlex Elder 	offset &= segment_size - 1;
77265ccfe21SAlex Elder 
773aafb230eSAlex Elder 	rbd_assert(length <= U64_MAX - offset);
77465ccfe21SAlex Elder 	if (offset + length > segment_size)
77565ccfe21SAlex Elder 		length = segment_size - offset;
77665ccfe21SAlex Elder 
77765ccfe21SAlex Elder 	return length;
778602adf40SYehuda Sadeh }
779602adf40SYehuda Sadeh 
7801fec7093SYehuda Sadeh static int rbd_get_num_segments(struct rbd_image_header *header,
7811fec7093SYehuda Sadeh 				u64 ofs, u64 len)
7821fec7093SYehuda Sadeh {
783df111be6SAlex Elder 	u64 start_seg;
784df111be6SAlex Elder 	u64 end_seg;
785df111be6SAlex Elder 
786df111be6SAlex Elder 	if (!len)
787df111be6SAlex Elder 		return 0;
788df111be6SAlex Elder 	if (len - 1 > U64_MAX - ofs)
789df111be6SAlex Elder 		return -ERANGE;
790df111be6SAlex Elder 
791df111be6SAlex Elder 	start_seg = ofs >> header->obj_order;
792df111be6SAlex Elder 	end_seg = (ofs + len - 1) >> header->obj_order;
793df111be6SAlex Elder 
7941fec7093SYehuda Sadeh 	return end_seg - start_seg + 1;
7951fec7093SYehuda Sadeh }
7961fec7093SYehuda Sadeh 
797602adf40SYehuda Sadeh /*
798029bcbd8SJosh Durgin  * returns the size of an object in the image
799029bcbd8SJosh Durgin  */
800029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header)
801029bcbd8SJosh Durgin {
802029bcbd8SJosh Durgin 	return 1 << header->obj_order;
803029bcbd8SJosh Durgin }
804029bcbd8SJosh Durgin 
805029bcbd8SJosh Durgin /*
806602adf40SYehuda Sadeh  * bio helpers
807602adf40SYehuda Sadeh  */
808602adf40SYehuda Sadeh 
809602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain)
810602adf40SYehuda Sadeh {
811602adf40SYehuda Sadeh 	struct bio *tmp;
812602adf40SYehuda Sadeh 
813602adf40SYehuda Sadeh 	while (chain) {
814602adf40SYehuda Sadeh 		tmp = chain;
815602adf40SYehuda Sadeh 		chain = chain->bi_next;
816602adf40SYehuda Sadeh 		bio_put(tmp);
817602adf40SYehuda Sadeh 	}
818602adf40SYehuda Sadeh }
819602adf40SYehuda Sadeh 
820602adf40SYehuda Sadeh /*
821602adf40SYehuda Sadeh  * zeros a bio chain, starting at specific offset
822602adf40SYehuda Sadeh  */
823602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs)
824602adf40SYehuda Sadeh {
825602adf40SYehuda Sadeh 	struct bio_vec *bv;
826602adf40SYehuda Sadeh 	unsigned long flags;
827602adf40SYehuda Sadeh 	void *buf;
828602adf40SYehuda Sadeh 	int i;
829602adf40SYehuda Sadeh 	int pos = 0;
830602adf40SYehuda Sadeh 
831602adf40SYehuda Sadeh 	while (chain) {
832602adf40SYehuda Sadeh 		bio_for_each_segment(bv, chain, i) {
833602adf40SYehuda Sadeh 			if (pos + bv->bv_len > start_ofs) {
834602adf40SYehuda Sadeh 				int remainder = max(start_ofs - pos, 0);
835602adf40SYehuda Sadeh 				buf = bvec_kmap_irq(bv, &flags);
836602adf40SYehuda Sadeh 				memset(buf + remainder, 0,
837602adf40SYehuda Sadeh 				       bv->bv_len - remainder);
83885b5aaa6SDan Carpenter 				bvec_kunmap_irq(buf, &flags);
839602adf40SYehuda Sadeh 			}
840602adf40SYehuda Sadeh 			pos += bv->bv_len;
841602adf40SYehuda Sadeh 		}
842602adf40SYehuda Sadeh 
843602adf40SYehuda Sadeh 		chain = chain->bi_next;
844602adf40SYehuda Sadeh 	}
845602adf40SYehuda Sadeh }
846602adf40SYehuda Sadeh 
847602adf40SYehuda Sadeh /*
848f7760dadSAlex Elder  * Clone a portion of a bio, starting at the given byte offset
849f7760dadSAlex Elder  * and continuing for the number of bytes indicated.
850602adf40SYehuda Sadeh  */
851f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src,
852f7760dadSAlex Elder 					unsigned int offset,
853f7760dadSAlex Elder 					unsigned int len,
854f7760dadSAlex Elder 					gfp_t gfpmask)
855602adf40SYehuda Sadeh {
856f7760dadSAlex Elder 	struct bio_vec *bv;
857f7760dadSAlex Elder 	unsigned int resid;
858f7760dadSAlex Elder 	unsigned short idx;
859f7760dadSAlex Elder 	unsigned int voff;
860f7760dadSAlex Elder 	unsigned short end_idx;
861f7760dadSAlex Elder 	unsigned short vcnt;
862f7760dadSAlex Elder 	struct bio *bio;
863602adf40SYehuda Sadeh 
864f7760dadSAlex Elder 	/* Handle the easy case for the caller */
865f7760dadSAlex Elder 
866f7760dadSAlex Elder 	if (!offset && len == bio_src->bi_size)
867f7760dadSAlex Elder 		return bio_clone(bio_src, gfpmask);
868f7760dadSAlex Elder 
869f7760dadSAlex Elder 	if (WARN_ON_ONCE(!len))
870f7760dadSAlex Elder 		return NULL;
871f7760dadSAlex Elder 	if (WARN_ON_ONCE(len > bio_src->bi_size))
872f7760dadSAlex Elder 		return NULL;
873f7760dadSAlex Elder 	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
874f7760dadSAlex Elder 		return NULL;
875f7760dadSAlex Elder 
876f7760dadSAlex Elder 	/* Find first affected segment... */
877f7760dadSAlex Elder 
878f7760dadSAlex Elder 	resid = offset;
879f7760dadSAlex Elder 	__bio_for_each_segment(bv, bio_src, idx, 0) {
880f7760dadSAlex Elder 		if (resid < bv->bv_len)
881f7760dadSAlex Elder 			break;
882f7760dadSAlex Elder 		resid -= bv->bv_len;
883602adf40SYehuda Sadeh 	}
884f7760dadSAlex Elder 	voff = resid;
885602adf40SYehuda Sadeh 
886f7760dadSAlex Elder 	/* ...and the last affected segment */
887542582fcSAlex Elder 
888f7760dadSAlex Elder 	resid += len;
889f7760dadSAlex Elder 	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
890f7760dadSAlex Elder 		if (resid <= bv->bv_len)
891f7760dadSAlex Elder 			break;
892f7760dadSAlex Elder 		resid -= bv->bv_len;
893f7760dadSAlex Elder 	}
894f7760dadSAlex Elder 	vcnt = end_idx - idx + 1;
895602adf40SYehuda Sadeh 
896f7760dadSAlex Elder 	/* Build the clone */
897f7760dadSAlex Elder 
898f7760dadSAlex Elder 	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
899f7760dadSAlex Elder 	if (!bio)
900f7760dadSAlex Elder 		return NULL;	/* ENOMEM */
901f7760dadSAlex Elder 
902f7760dadSAlex Elder 	bio->bi_bdev = bio_src->bi_bdev;
903f7760dadSAlex Elder 	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
904f7760dadSAlex Elder 	bio->bi_rw = bio_src->bi_rw;
905f7760dadSAlex Elder 	bio->bi_flags |= 1 << BIO_CLONED;
906602adf40SYehuda Sadeh 
907602adf40SYehuda Sadeh 	/*
908f7760dadSAlex Elder 	 * Copy over our part of the bio_vec, then update the first
909f7760dadSAlex Elder 	 * and last (or only) entries.
910602adf40SYehuda Sadeh 	 */
911f7760dadSAlex Elder 	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
912f7760dadSAlex Elder 			vcnt * sizeof (struct bio_vec));
913f7760dadSAlex Elder 	bio->bi_io_vec[0].bv_offset += voff;
914f7760dadSAlex Elder 	if (vcnt > 1) {
915f7760dadSAlex Elder 		bio->bi_io_vec[0].bv_len -= voff;
916f7760dadSAlex Elder 		bio->bi_io_vec[vcnt - 1].bv_len = resid;
917602adf40SYehuda Sadeh 	} else {
918f7760dadSAlex Elder 		bio->bi_io_vec[0].bv_len = len;
919602adf40SYehuda Sadeh 	}
920602adf40SYehuda Sadeh 
921f7760dadSAlex Elder 	bio->bi_vcnt = vcnt;
922f7760dadSAlex Elder 	bio->bi_size = len;
923f7760dadSAlex Elder 	bio->bi_idx = 0;
924602adf40SYehuda Sadeh 
925f7760dadSAlex Elder 	return bio;
926602adf40SYehuda Sadeh }
927602adf40SYehuda Sadeh 
928f7760dadSAlex Elder /*
929f7760dadSAlex Elder  * Clone a portion of a bio chain, starting at the given byte offset
930f7760dadSAlex Elder  * into the first bio in the source chain and continuing for the
931f7760dadSAlex Elder  * number of bytes indicated.  The result is another bio chain of
932f7760dadSAlex Elder  * exactly the given length, or a null pointer on error.
933f7760dadSAlex Elder  *
934f7760dadSAlex Elder  * The bio_src and offset parameters are both in-out.  On entry they
935f7760dadSAlex Elder  * refer to the first source bio and the offset into that bio where
936f7760dadSAlex Elder  * the start of data to be cloned is located.
937f7760dadSAlex Elder  *
938f7760dadSAlex Elder  * On return, bio_src is updated to refer to the bio in the source
939f7760dadSAlex Elder  * chain that contains first un-cloned byte, and *offset will
940f7760dadSAlex Elder  * contain the offset of that byte within that bio.
941f7760dadSAlex Elder  */
942f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src,
943f7760dadSAlex Elder 					unsigned int *offset,
944f7760dadSAlex Elder 					unsigned int len,
945f7760dadSAlex Elder 					gfp_t gfpmask)
946f7760dadSAlex Elder {
947f7760dadSAlex Elder 	struct bio *bi = *bio_src;
948f7760dadSAlex Elder 	unsigned int off = *offset;
949f7760dadSAlex Elder 	struct bio *chain = NULL;
950f7760dadSAlex Elder 	struct bio **end;
951602adf40SYehuda Sadeh 
952f7760dadSAlex Elder 	/* Build up a chain of clone bios up to the limit */
953602adf40SYehuda Sadeh 
954f7760dadSAlex Elder 	if (!bi || off >= bi->bi_size || !len)
955f7760dadSAlex Elder 		return NULL;		/* Nothing to clone */
956602adf40SYehuda Sadeh 
957f7760dadSAlex Elder 	end = &chain;
958f7760dadSAlex Elder 	while (len) {
959f7760dadSAlex Elder 		unsigned int bi_size;
960f7760dadSAlex Elder 		struct bio *bio;
961f7760dadSAlex Elder 
962f7760dadSAlex Elder 		if (!bi)
963f7760dadSAlex Elder 			goto out_err;	/* EINVAL; ran out of bio's */
964f7760dadSAlex Elder 		bi_size = min_t(unsigned int, bi->bi_size - off, len);
965f7760dadSAlex Elder 		bio = bio_clone_range(bi, off, bi_size, gfpmask);
966f7760dadSAlex Elder 		if (!bio)
967f7760dadSAlex Elder 			goto out_err;	/* ENOMEM */
968f7760dadSAlex Elder 
969f7760dadSAlex Elder 		*end = bio;
970f7760dadSAlex Elder 		end = &bio->bi_next;
971f7760dadSAlex Elder 
972f7760dadSAlex Elder 		off += bi_size;
973f7760dadSAlex Elder 		if (off == bi->bi_size) {
974f7760dadSAlex Elder 			bi = bi->bi_next;
975f7760dadSAlex Elder 			off = 0;
976f7760dadSAlex Elder 		}
977f7760dadSAlex Elder 		len -= bi_size;
978f7760dadSAlex Elder 	}
979f7760dadSAlex Elder 	*bio_src = bi;
980f7760dadSAlex Elder 	*offset = off;
981f7760dadSAlex Elder 
982f7760dadSAlex Elder 	return chain;
983f7760dadSAlex Elder out_err:
984f7760dadSAlex Elder 	bio_chain_put(chain);
985f7760dadSAlex Elder 
986602adf40SYehuda Sadeh 	return NULL;
987602adf40SYehuda Sadeh }
988602adf40SYehuda Sadeh 
989602adf40SYehuda Sadeh /*
990602adf40SYehuda Sadeh  * helpers for osd request op vectors.
991602adf40SYehuda Sadeh  */
99257cfc106SAlex Elder static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
99357cfc106SAlex Elder 					int opcode, u32 payload_len)
994602adf40SYehuda Sadeh {
99557cfc106SAlex Elder 	struct ceph_osd_req_op *ops;
99657cfc106SAlex Elder 
99757cfc106SAlex Elder 	ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
99857cfc106SAlex Elder 	if (!ops)
99957cfc106SAlex Elder 		return NULL;
100057cfc106SAlex Elder 
100157cfc106SAlex Elder 	ops[0].op = opcode;
100257cfc106SAlex Elder 
1003602adf40SYehuda Sadeh 	/*
1004602adf40SYehuda Sadeh 	 * op extent offset and length will be set later on
1005602adf40SYehuda Sadeh 	 * in calc_raw_layout()
1006602adf40SYehuda Sadeh 	 */
100757cfc106SAlex Elder 	ops[0].payload_len = payload_len;
100857cfc106SAlex Elder 
100957cfc106SAlex Elder 	return ops;
1010602adf40SYehuda Sadeh }
1011602adf40SYehuda Sadeh 
/*
 * Free an op vector obtained from rbd_create_rw_ops().  The vector
 * is a single kmalloc-family allocation, so one kfree() releases it.
 */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
1016602adf40SYehuda Sadeh 
10171fec7093SYehuda Sadeh static void rbd_coll_end_req_index(struct request *rq,
10181fec7093SYehuda Sadeh 				   struct rbd_req_coll *coll,
10191fec7093SYehuda Sadeh 				   int index,
10201fec7093SYehuda Sadeh 				   int ret, u64 len)
10211fec7093SYehuda Sadeh {
10221fec7093SYehuda Sadeh 	struct request_queue *q;
10231fec7093SYehuda Sadeh 	int min, max, i;
10241fec7093SYehuda Sadeh 
1025bd919d45SAlex Elder 	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
1026bd919d45SAlex Elder 	     coll, index, ret, (unsigned long long) len);
10271fec7093SYehuda Sadeh 
10281fec7093SYehuda Sadeh 	if (!rq)
10291fec7093SYehuda Sadeh 		return;
10301fec7093SYehuda Sadeh 
10311fec7093SYehuda Sadeh 	if (!coll) {
10321fec7093SYehuda Sadeh 		blk_end_request(rq, ret, len);
10331fec7093SYehuda Sadeh 		return;
10341fec7093SYehuda Sadeh 	}
10351fec7093SYehuda Sadeh 
10361fec7093SYehuda Sadeh 	q = rq->q;
10371fec7093SYehuda Sadeh 
10381fec7093SYehuda Sadeh 	spin_lock_irq(q->queue_lock);
10391fec7093SYehuda Sadeh 	coll->status[index].done = 1;
10401fec7093SYehuda Sadeh 	coll->status[index].rc = ret;
10411fec7093SYehuda Sadeh 	coll->status[index].bytes = len;
10421fec7093SYehuda Sadeh 	max = min = coll->num_done;
10431fec7093SYehuda Sadeh 	while (max < coll->total && coll->status[max].done)
10441fec7093SYehuda Sadeh 		max++;
10451fec7093SYehuda Sadeh 
10461fec7093SYehuda Sadeh 	for (i = min; i<max; i++) {
10471fec7093SYehuda Sadeh 		__blk_end_request(rq, coll->status[i].rc,
10481fec7093SYehuda Sadeh 				  coll->status[i].bytes);
10491fec7093SYehuda Sadeh 		coll->num_done++;
10501fec7093SYehuda Sadeh 		kref_put(&coll->kref, rbd_coll_release);
10511fec7093SYehuda Sadeh 	}
10521fec7093SYehuda Sadeh 	spin_unlock_irq(q->queue_lock);
10531fec7093SYehuda Sadeh }
10541fec7093SYehuda Sadeh 
10551fec7093SYehuda Sadeh static void rbd_coll_end_req(struct rbd_request *req,
10561fec7093SYehuda Sadeh 			     int ret, u64 len)
10571fec7093SYehuda Sadeh {
10581fec7093SYehuda Sadeh 	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
10591fec7093SYehuda Sadeh }
10601fec7093SYehuda Sadeh 
1061602adf40SYehuda Sadeh /*
1062602adf40SYehuda Sadeh  * Send ceph osd request
1063602adf40SYehuda Sadeh  */
1064602adf40SYehuda Sadeh static int rbd_do_request(struct request *rq,
10650ce1a794SAlex Elder 			  struct rbd_device *rbd_dev,
1066602adf40SYehuda Sadeh 			  struct ceph_snap_context *snapc,
1067602adf40SYehuda Sadeh 			  u64 snapid,
1068aded07eaSAlex Elder 			  const char *object_name, u64 ofs, u64 len,
1069602adf40SYehuda Sadeh 			  struct bio *bio,
1070602adf40SYehuda Sadeh 			  struct page **pages,
1071602adf40SYehuda Sadeh 			  int num_pages,
1072602adf40SYehuda Sadeh 			  int flags,
1073602adf40SYehuda Sadeh 			  struct ceph_osd_req_op *ops,
10741fec7093SYehuda Sadeh 			  struct rbd_req_coll *coll,
10751fec7093SYehuda Sadeh 			  int coll_index,
1076602adf40SYehuda Sadeh 			  void (*rbd_cb)(struct ceph_osd_request *req,
107759c2be1eSYehuda Sadeh 					 struct ceph_msg *msg),
107859c2be1eSYehuda Sadeh 			  struct ceph_osd_request **linger_req,
107959c2be1eSYehuda Sadeh 			  u64 *ver)
1080602adf40SYehuda Sadeh {
1081602adf40SYehuda Sadeh 	struct ceph_osd_request *req;
1082602adf40SYehuda Sadeh 	struct ceph_file_layout *layout;
1083602adf40SYehuda Sadeh 	int ret;
1084602adf40SYehuda Sadeh 	u64 bno;
1085602adf40SYehuda Sadeh 	struct timespec mtime = CURRENT_TIME;
1086602adf40SYehuda Sadeh 	struct rbd_request *req_data;
1087602adf40SYehuda Sadeh 	struct ceph_osd_request_head *reqhead;
10881dbb4399SAlex Elder 	struct ceph_osd_client *osdc;
1089602adf40SYehuda Sadeh 
1090602adf40SYehuda Sadeh 	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
10911fec7093SYehuda Sadeh 	if (!req_data) {
10921fec7093SYehuda Sadeh 		if (coll)
10931fec7093SYehuda Sadeh 			rbd_coll_end_req_index(rq, coll, coll_index,
10941fec7093SYehuda Sadeh 					       -ENOMEM, len);
10951fec7093SYehuda Sadeh 		return -ENOMEM;
10961fec7093SYehuda Sadeh 	}
1097602adf40SYehuda Sadeh 
10981fec7093SYehuda Sadeh 	if (coll) {
10991fec7093SYehuda Sadeh 		req_data->coll = coll;
11001fec7093SYehuda Sadeh 		req_data->coll_index = coll_index;
11011fec7093SYehuda Sadeh 	}
11021fec7093SYehuda Sadeh 
1103f7760dadSAlex Elder 	dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
1104f7760dadSAlex Elder 		object_name, (unsigned long long) ofs,
1105f7760dadSAlex Elder 		(unsigned long long) len, coll, coll_index);
1106602adf40SYehuda Sadeh 
11070ce1a794SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
11081dbb4399SAlex Elder 	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
11091dbb4399SAlex Elder 					false, GFP_NOIO, pages, bio);
11104ad12621SSage Weil 	if (!req) {
11114ad12621SSage Weil 		ret = -ENOMEM;
1112602adf40SYehuda Sadeh 		goto done_pages;
1113602adf40SYehuda Sadeh 	}
1114602adf40SYehuda Sadeh 
1115602adf40SYehuda Sadeh 	req->r_callback = rbd_cb;
1116602adf40SYehuda Sadeh 
1117602adf40SYehuda Sadeh 	req_data->rq = rq;
1118602adf40SYehuda Sadeh 	req_data->bio = bio;
1119602adf40SYehuda Sadeh 	req_data->pages = pages;
1120602adf40SYehuda Sadeh 	req_data->len = len;
1121602adf40SYehuda Sadeh 
1122602adf40SYehuda Sadeh 	req->r_priv = req_data;
1123602adf40SYehuda Sadeh 
1124602adf40SYehuda Sadeh 	reqhead = req->r_request->front.iov_base;
1125602adf40SYehuda Sadeh 	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
1126602adf40SYehuda Sadeh 
1127aded07eaSAlex Elder 	strncpy(req->r_oid, object_name, sizeof(req->r_oid));
1128602adf40SYehuda Sadeh 	req->r_oid_len = strlen(req->r_oid);
1129602adf40SYehuda Sadeh 
1130602adf40SYehuda Sadeh 	layout = &req->r_file_layout;
1131602adf40SYehuda Sadeh 	memset(layout, 0, sizeof(*layout));
1132602adf40SYehuda Sadeh 	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
1133602adf40SYehuda Sadeh 	layout->fl_stripe_count = cpu_to_le32(1);
1134602adf40SYehuda Sadeh 	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
11350d7dbfceSAlex Elder 	layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->spec->pool_id);
11366cae3717SSage Weil 	ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
11371dbb4399SAlex Elder 				   req, ops);
11386cae3717SSage Weil 	rbd_assert(ret == 0);
1139602adf40SYehuda Sadeh 
1140602adf40SYehuda Sadeh 	ceph_osdc_build_request(req, ofs, &len,
1141602adf40SYehuda Sadeh 				ops,
1142602adf40SYehuda Sadeh 				snapc,
1143602adf40SYehuda Sadeh 				&mtime,
1144602adf40SYehuda Sadeh 				req->r_oid, req->r_oid_len);
1145602adf40SYehuda Sadeh 
114659c2be1eSYehuda Sadeh 	if (linger_req) {
11471dbb4399SAlex Elder 		ceph_osdc_set_request_linger(osdc, req);
114859c2be1eSYehuda Sadeh 		*linger_req = req;
114959c2be1eSYehuda Sadeh 	}
115059c2be1eSYehuda Sadeh 
11511dbb4399SAlex Elder 	ret = ceph_osdc_start_request(osdc, req, false);
1152602adf40SYehuda Sadeh 	if (ret < 0)
1153602adf40SYehuda Sadeh 		goto done_err;
1154602adf40SYehuda Sadeh 
1155602adf40SYehuda Sadeh 	if (!rbd_cb) {
11561dbb4399SAlex Elder 		ret = ceph_osdc_wait_request(osdc, req);
115759c2be1eSYehuda Sadeh 		if (ver)
115859c2be1eSYehuda Sadeh 			*ver = le64_to_cpu(req->r_reassert_version.version);
1159bd919d45SAlex Elder 		dout("reassert_ver=%llu\n",
1160bd919d45SAlex Elder 			(unsigned long long)
11611fec7093SYehuda Sadeh 				le64_to_cpu(req->r_reassert_version.version));
1162602adf40SYehuda Sadeh 		ceph_osdc_put_request(req);
1163602adf40SYehuda Sadeh 	}
1164602adf40SYehuda Sadeh 	return ret;
1165602adf40SYehuda Sadeh 
1166602adf40SYehuda Sadeh done_err:
1167602adf40SYehuda Sadeh 	bio_chain_put(req_data->bio);
1168602adf40SYehuda Sadeh 	ceph_osdc_put_request(req);
1169602adf40SYehuda Sadeh done_pages:
11701fec7093SYehuda Sadeh 	rbd_coll_end_req(req_data, ret, len);
1171602adf40SYehuda Sadeh 	kfree(req_data);
1172602adf40SYehuda Sadeh 	return ret;
1173602adf40SYehuda Sadeh }
1174602adf40SYehuda Sadeh 
1175602adf40SYehuda Sadeh /*
1176602adf40SYehuda Sadeh  * Ceph osd op callback
1177602adf40SYehuda Sadeh  */
1178602adf40SYehuda Sadeh static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
1179602adf40SYehuda Sadeh {
1180602adf40SYehuda Sadeh 	struct rbd_request *req_data = req->r_priv;
1181602adf40SYehuda Sadeh 	struct ceph_osd_reply_head *replyhead;
1182602adf40SYehuda Sadeh 	struct ceph_osd_op *op;
1183602adf40SYehuda Sadeh 	__s32 rc;
1184602adf40SYehuda Sadeh 	u64 bytes;
1185602adf40SYehuda Sadeh 	int read_op;
1186602adf40SYehuda Sadeh 
1187602adf40SYehuda Sadeh 	/* parse reply */
1188602adf40SYehuda Sadeh 	replyhead = msg->front.iov_base;
1189602adf40SYehuda Sadeh 	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
1190602adf40SYehuda Sadeh 	op = (void *)(replyhead + 1);
1191602adf40SYehuda Sadeh 	rc = le32_to_cpu(replyhead->result);
1192602adf40SYehuda Sadeh 	bytes = le64_to_cpu(op->extent.length);
1193895cfcc8SDan Carpenter 	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
1194602adf40SYehuda Sadeh 
1195bd919d45SAlex Elder 	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
1196bd919d45SAlex Elder 		(unsigned long long) bytes, read_op, (int) rc);
1197602adf40SYehuda Sadeh 
1198602adf40SYehuda Sadeh 	if (rc == -ENOENT && read_op) {
1199602adf40SYehuda Sadeh 		zero_bio_chain(req_data->bio, 0);
1200602adf40SYehuda Sadeh 		rc = 0;
1201602adf40SYehuda Sadeh 	} else if (rc == 0 && read_op && bytes < req_data->len) {
1202602adf40SYehuda Sadeh 		zero_bio_chain(req_data->bio, bytes);
1203602adf40SYehuda Sadeh 		bytes = req_data->len;
1204602adf40SYehuda Sadeh 	}
1205602adf40SYehuda Sadeh 
12061fec7093SYehuda Sadeh 	rbd_coll_end_req(req_data, rc, bytes);
1207602adf40SYehuda Sadeh 
1208602adf40SYehuda Sadeh 	if (req_data->bio)
1209602adf40SYehuda Sadeh 		bio_chain_put(req_data->bio);
1210602adf40SYehuda Sadeh 
1211602adf40SYehuda Sadeh 	ceph_osdc_put_request(req);
1212602adf40SYehuda Sadeh 	kfree(req_data);
1213602adf40SYehuda Sadeh }
1214602adf40SYehuda Sadeh 
/*
 * Minimal osd request callback: the reply message is ignored and the
 * request is simply handed to ceph_osdc_put_request().
 */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
121959c2be1eSYehuda Sadeh 
1220602adf40SYehuda Sadeh /*
1221602adf40SYehuda Sadeh  * Do a synchronous ceph osd operation
1222602adf40SYehuda Sadeh  */
12230ce1a794SAlex Elder static int rbd_req_sync_op(struct rbd_device *rbd_dev,
1224602adf40SYehuda Sadeh 			   struct ceph_snap_context *snapc,
1225602adf40SYehuda Sadeh 			   u64 snapid,
1226602adf40SYehuda Sadeh 			   int flags,
1227913d2fdcSAlex Elder 			   struct ceph_osd_req_op *ops,
1228aded07eaSAlex Elder 			   const char *object_name,
1229f8d4de6eSAlex Elder 			   u64 ofs, u64 inbound_size,
1230f8d4de6eSAlex Elder 			   char *inbound,
123159c2be1eSYehuda Sadeh 			   struct ceph_osd_request **linger_req,
123259c2be1eSYehuda Sadeh 			   u64 *ver)
1233602adf40SYehuda Sadeh {
1234602adf40SYehuda Sadeh 	int ret;
1235602adf40SYehuda Sadeh 	struct page **pages;
1236602adf40SYehuda Sadeh 	int num_pages;
1237913d2fdcSAlex Elder 
1238aafb230eSAlex Elder 	rbd_assert(ops != NULL);
1239602adf40SYehuda Sadeh 
1240f8d4de6eSAlex Elder 	num_pages = calc_pages_for(ofs, inbound_size);
1241602adf40SYehuda Sadeh 	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
1242b8d0638aSDan Carpenter 	if (IS_ERR(pages))
1243b8d0638aSDan Carpenter 		return PTR_ERR(pages);
1244602adf40SYehuda Sadeh 
12450ce1a794SAlex Elder 	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
1246f8d4de6eSAlex Elder 			  object_name, ofs, inbound_size, NULL,
1247602adf40SYehuda Sadeh 			  pages, num_pages,
1248602adf40SYehuda Sadeh 			  flags,
1249602adf40SYehuda Sadeh 			  ops,
12501fec7093SYehuda Sadeh 			  NULL, 0,
125159c2be1eSYehuda Sadeh 			  NULL,
125259c2be1eSYehuda Sadeh 			  linger_req, ver);
1253602adf40SYehuda Sadeh 	if (ret < 0)
1254913d2fdcSAlex Elder 		goto done;
1255602adf40SYehuda Sadeh 
1256f8d4de6eSAlex Elder 	if ((flags & CEPH_OSD_FLAG_READ) && inbound)
1257f8d4de6eSAlex Elder 		ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);
1258602adf40SYehuda Sadeh 
1259602adf40SYehuda Sadeh done:
1260602adf40SYehuda Sadeh 	ceph_release_page_vector(pages, num_pages);
1261602adf40SYehuda Sadeh 	return ret;
1262602adf40SYehuda Sadeh }
1263602adf40SYehuda Sadeh 
1264602adf40SYehuda Sadeh /*
1265602adf40SYehuda Sadeh  * Do an asynchronous ceph osd operation
1266602adf40SYehuda Sadeh  */
1267602adf40SYehuda Sadeh static int rbd_do_op(struct request *rq,
1268602adf40SYehuda Sadeh 		     struct rbd_device *rbd_dev,
1269602adf40SYehuda Sadeh 		     struct ceph_snap_context *snapc,
1270602adf40SYehuda Sadeh 		     u64 ofs, u64 len,
12711fec7093SYehuda Sadeh 		     struct bio *bio,
12721fec7093SYehuda Sadeh 		     struct rbd_req_coll *coll,
12731fec7093SYehuda Sadeh 		     int coll_index)
1274602adf40SYehuda Sadeh {
1275602adf40SYehuda Sadeh 	char *seg_name;
1276602adf40SYehuda Sadeh 	u64 seg_ofs;
1277602adf40SYehuda Sadeh 	u64 seg_len;
1278602adf40SYehuda Sadeh 	int ret;
1279602adf40SYehuda Sadeh 	struct ceph_osd_req_op *ops;
1280602adf40SYehuda Sadeh 	u32 payload_len;
1281ff2e4bb5SAlex Elder 	int opcode;
1282ff2e4bb5SAlex Elder 	int flags;
12834634246dSAlex Elder 	u64 snapid;
1284602adf40SYehuda Sadeh 
128565ccfe21SAlex Elder 	seg_name = rbd_segment_name(rbd_dev, ofs);
1286602adf40SYehuda Sadeh 	if (!seg_name)
1287602adf40SYehuda Sadeh 		return -ENOMEM;
128865ccfe21SAlex Elder 	seg_len = rbd_segment_length(rbd_dev, ofs, len);
128965ccfe21SAlex Elder 	seg_ofs = rbd_segment_offset(rbd_dev, ofs);
1290602adf40SYehuda Sadeh 
1291ff2e4bb5SAlex Elder 	if (rq_data_dir(rq) == WRITE) {
1292ff2e4bb5SAlex Elder 		opcode = CEPH_OSD_OP_WRITE;
1293ff2e4bb5SAlex Elder 		flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
12944634246dSAlex Elder 		snapid = CEPH_NOSNAP;
1295ff2e4bb5SAlex Elder 		payload_len = seg_len;
1296ff2e4bb5SAlex Elder 	} else {
1297ff2e4bb5SAlex Elder 		opcode = CEPH_OSD_OP_READ;
1298ff2e4bb5SAlex Elder 		flags = CEPH_OSD_FLAG_READ;
12994634246dSAlex Elder 		snapc = NULL;
13000d7dbfceSAlex Elder 		snapid = rbd_dev->spec->snap_id;
1301ff2e4bb5SAlex Elder 		payload_len = 0;
1302ff2e4bb5SAlex Elder 	}
1303602adf40SYehuda Sadeh 
130457cfc106SAlex Elder 	ret = -ENOMEM;
130557cfc106SAlex Elder 	ops = rbd_create_rw_ops(1, opcode, payload_len);
130657cfc106SAlex Elder 	if (!ops)
1307602adf40SYehuda Sadeh 		goto done;
1308602adf40SYehuda Sadeh 
1309602adf40SYehuda Sadeh 	/* we've taken care of segment sizes earlier when we
1310602adf40SYehuda Sadeh 	   cloned the bios. We should never have a segment
1311602adf40SYehuda Sadeh 	   truncated at this point */
1312aafb230eSAlex Elder 	rbd_assert(seg_len == len);
1313602adf40SYehuda Sadeh 
1314602adf40SYehuda Sadeh 	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1315602adf40SYehuda Sadeh 			     seg_name, seg_ofs, seg_len,
1316602adf40SYehuda Sadeh 			     bio,
1317602adf40SYehuda Sadeh 			     NULL, 0,
1318602adf40SYehuda Sadeh 			     flags,
1319602adf40SYehuda Sadeh 			     ops,
13201fec7093SYehuda Sadeh 			     coll, coll_index,
132159c2be1eSYehuda Sadeh 			     rbd_req_cb, 0, NULL);
132211f77002SSage Weil 
132311f77002SSage Weil 	rbd_destroy_ops(ops);
1324602adf40SYehuda Sadeh done:
1325602adf40SYehuda Sadeh 	kfree(seg_name);
1326602adf40SYehuda Sadeh 	return ret;
1327602adf40SYehuda Sadeh }
1328602adf40SYehuda Sadeh 
1329602adf40SYehuda Sadeh /*
1330602adf40SYehuda Sadeh  * Request sync osd read
1331602adf40SYehuda Sadeh  */
13320ce1a794SAlex Elder static int rbd_req_sync_read(struct rbd_device *rbd_dev,
1333602adf40SYehuda Sadeh 			  u64 snapid,
1334aded07eaSAlex Elder 			  const char *object_name,
1335602adf40SYehuda Sadeh 			  u64 ofs, u64 len,
133659c2be1eSYehuda Sadeh 			  char *buf,
133759c2be1eSYehuda Sadeh 			  u64 *ver)
1338602adf40SYehuda Sadeh {
1339913d2fdcSAlex Elder 	struct ceph_osd_req_op *ops;
1340913d2fdcSAlex Elder 	int ret;
1341913d2fdcSAlex Elder 
1342913d2fdcSAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1343913d2fdcSAlex Elder 	if (!ops)
1344913d2fdcSAlex Elder 		return -ENOMEM;
1345913d2fdcSAlex Elder 
1346913d2fdcSAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
1347b06e6a6bSJosh Durgin 			       snapid,
1348602adf40SYehuda Sadeh 			       CEPH_OSD_FLAG_READ,
1349913d2fdcSAlex Elder 			       ops, object_name, ofs, len, buf, NULL, ver);
1350913d2fdcSAlex Elder 	rbd_destroy_ops(ops);
1351913d2fdcSAlex Elder 
1352913d2fdcSAlex Elder 	return ret;
1353602adf40SYehuda Sadeh }
1354602adf40SYehuda Sadeh 
1355602adf40SYehuda Sadeh /*
135659c2be1eSYehuda Sadeh  * Request sync osd watch
135759c2be1eSYehuda Sadeh  */
13580ce1a794SAlex Elder static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
135959c2be1eSYehuda Sadeh 				   u64 ver,
13607f0a24d8SAlex Elder 				   u64 notify_id)
136159c2be1eSYehuda Sadeh {
136259c2be1eSYehuda Sadeh 	struct ceph_osd_req_op *ops;
136311f77002SSage Weil 	int ret;
136411f77002SSage Weil 
136557cfc106SAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
136657cfc106SAlex Elder 	if (!ops)
136757cfc106SAlex Elder 		return -ENOMEM;
136859c2be1eSYehuda Sadeh 
1369a71b891bSJosh Durgin 	ops[0].watch.ver = cpu_to_le64(ver);
137059c2be1eSYehuda Sadeh 	ops[0].watch.cookie = notify_id;
137159c2be1eSYehuda Sadeh 	ops[0].watch.flag = 0;
137259c2be1eSYehuda Sadeh 
13730ce1a794SAlex Elder 	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
13747f0a24d8SAlex Elder 			  rbd_dev->header_name, 0, 0, NULL,
1375ad4f232fSAlex Elder 			  NULL, 0,
137659c2be1eSYehuda Sadeh 			  CEPH_OSD_FLAG_READ,
137759c2be1eSYehuda Sadeh 			  ops,
13781fec7093SYehuda Sadeh 			  NULL, 0,
137959c2be1eSYehuda Sadeh 			  rbd_simple_req_cb, 0, NULL);
138059c2be1eSYehuda Sadeh 
138159c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
138259c2be1eSYehuda Sadeh 	return ret;
138359c2be1eSYehuda Sadeh }
138459c2be1eSYehuda Sadeh 
138559c2be1eSYehuda Sadeh static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
138659c2be1eSYehuda Sadeh {
13870ce1a794SAlex Elder 	struct rbd_device *rbd_dev = (struct rbd_device *)data;
1388a71b891bSJosh Durgin 	u64 hver;
138913143d2dSSage Weil 	int rc;
139013143d2dSSage Weil 
13910ce1a794SAlex Elder 	if (!rbd_dev)
139259c2be1eSYehuda Sadeh 		return;
139359c2be1eSYehuda Sadeh 
1394bd919d45SAlex Elder 	dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1395bd919d45SAlex Elder 		rbd_dev->header_name, (unsigned long long) notify_id,
1396bd919d45SAlex Elder 		(unsigned int) opcode);
1397117973fbSAlex Elder 	rc = rbd_dev_refresh(rbd_dev, &hver);
139813143d2dSSage Weil 	if (rc)
1399f0f8cef5SAlex Elder 		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
14000ce1a794SAlex Elder 			   " update snaps: %d\n", rbd_dev->major, rc);
140159c2be1eSYehuda Sadeh 
14027f0a24d8SAlex Elder 	rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
140359c2be1eSYehuda Sadeh }
140459c2be1eSYehuda Sadeh 
140559c2be1eSYehuda Sadeh /*
140659c2be1eSYehuda Sadeh  * Request sync osd watch
140759c2be1eSYehuda Sadeh  */
14080e6f322dSAlex Elder static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
140959c2be1eSYehuda Sadeh {
141059c2be1eSYehuda Sadeh 	struct ceph_osd_req_op *ops;
14110ce1a794SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
141257cfc106SAlex Elder 	int ret;
141359c2be1eSYehuda Sadeh 
141457cfc106SAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
141557cfc106SAlex Elder 	if (!ops)
141657cfc106SAlex Elder 		return -ENOMEM;
141759c2be1eSYehuda Sadeh 
141859c2be1eSYehuda Sadeh 	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
14190ce1a794SAlex Elder 				     (void *)rbd_dev, &rbd_dev->watch_event);
142059c2be1eSYehuda Sadeh 	if (ret < 0)
142159c2be1eSYehuda Sadeh 		goto fail;
142259c2be1eSYehuda Sadeh 
14230e6f322dSAlex Elder 	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
14240ce1a794SAlex Elder 	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
142559c2be1eSYehuda Sadeh 	ops[0].watch.flag = 1;
142659c2be1eSYehuda Sadeh 
14270ce1a794SAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
142859c2be1eSYehuda Sadeh 			      CEPH_NOSNAP,
142959c2be1eSYehuda Sadeh 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
143059c2be1eSYehuda Sadeh 			      ops,
14310e6f322dSAlex Elder 			      rbd_dev->header_name,
14320e6f322dSAlex Elder 			      0, 0, NULL,
14330ce1a794SAlex Elder 			      &rbd_dev->watch_request, NULL);
143459c2be1eSYehuda Sadeh 
143559c2be1eSYehuda Sadeh 	if (ret < 0)
143659c2be1eSYehuda Sadeh 		goto fail_event;
143759c2be1eSYehuda Sadeh 
143859c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
143959c2be1eSYehuda Sadeh 	return 0;
144059c2be1eSYehuda Sadeh 
144159c2be1eSYehuda Sadeh fail_event:
14420ce1a794SAlex Elder 	ceph_osdc_cancel_event(rbd_dev->watch_event);
14430ce1a794SAlex Elder 	rbd_dev->watch_event = NULL;
144459c2be1eSYehuda Sadeh fail:
144559c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
144659c2be1eSYehuda Sadeh 	return ret;
144759c2be1eSYehuda Sadeh }
144859c2be1eSYehuda Sadeh 
144979e3057cSYehuda Sadeh /*
145079e3057cSYehuda Sadeh  * Request sync osd unwatch
145179e3057cSYehuda Sadeh  */
1452070c633fSAlex Elder static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
145379e3057cSYehuda Sadeh {
145479e3057cSYehuda Sadeh 	struct ceph_osd_req_op *ops;
145557cfc106SAlex Elder 	int ret;
145679e3057cSYehuda Sadeh 
145757cfc106SAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
145857cfc106SAlex Elder 	if (!ops)
145957cfc106SAlex Elder 		return -ENOMEM;
146079e3057cSYehuda Sadeh 
146179e3057cSYehuda Sadeh 	ops[0].watch.ver = 0;
14620ce1a794SAlex Elder 	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
146379e3057cSYehuda Sadeh 	ops[0].watch.flag = 0;
146479e3057cSYehuda Sadeh 
14650ce1a794SAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
146679e3057cSYehuda Sadeh 			      CEPH_NOSNAP,
146779e3057cSYehuda Sadeh 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
146879e3057cSYehuda Sadeh 			      ops,
1469070c633fSAlex Elder 			      rbd_dev->header_name,
1470070c633fSAlex Elder 			      0, 0, NULL, NULL, NULL);
1471070c633fSAlex Elder 
147279e3057cSYehuda Sadeh 
147379e3057cSYehuda Sadeh 	rbd_destroy_ops(ops);
14740ce1a794SAlex Elder 	ceph_osdc_cancel_event(rbd_dev->watch_event);
14750ce1a794SAlex Elder 	rbd_dev->watch_event = NULL;
147679e3057cSYehuda Sadeh 	return ret;
147779e3057cSYehuda Sadeh }
147879e3057cSYehuda Sadeh 
147959c2be1eSYehuda Sadeh /*
14803cb4a687SAlex Elder  * Synchronous osd object method call
1481602adf40SYehuda Sadeh  */
14820ce1a794SAlex Elder static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
1483aded07eaSAlex Elder 			     const char *object_name,
1484aded07eaSAlex Elder 			     const char *class_name,
1485aded07eaSAlex Elder 			     const char *method_name,
14863cb4a687SAlex Elder 			     const char *outbound,
14873cb4a687SAlex Elder 			     size_t outbound_size,
1488f8d4de6eSAlex Elder 			     char *inbound,
1489f8d4de6eSAlex Elder 			     size_t inbound_size,
14903cb4a687SAlex Elder 			     int flags,
149159c2be1eSYehuda Sadeh 			     u64 *ver)
1492602adf40SYehuda Sadeh {
1493602adf40SYehuda Sadeh 	struct ceph_osd_req_op *ops;
1494aded07eaSAlex Elder 	int class_name_len = strlen(class_name);
1495aded07eaSAlex Elder 	int method_name_len = strlen(method_name);
14963cb4a687SAlex Elder 	int payload_size;
149757cfc106SAlex Elder 	int ret;
149857cfc106SAlex Elder 
14993cb4a687SAlex Elder 	/*
15003cb4a687SAlex Elder 	 * Any input parameters required by the method we're calling
15013cb4a687SAlex Elder 	 * will be sent along with the class and method names as
15023cb4a687SAlex Elder 	 * part of the message payload.  That data and its size are
15033cb4a687SAlex Elder 	 * supplied via the indata and indata_len fields (named from
15043cb4a687SAlex Elder 	 * the perspective of the server side) in the OSD request
15053cb4a687SAlex Elder 	 * operation.
15063cb4a687SAlex Elder 	 */
15073cb4a687SAlex Elder 	payload_size = class_name_len + method_name_len + outbound_size;
15083cb4a687SAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
150957cfc106SAlex Elder 	if (!ops)
151057cfc106SAlex Elder 		return -ENOMEM;
1511602adf40SYehuda Sadeh 
1512aded07eaSAlex Elder 	ops[0].cls.class_name = class_name;
1513aded07eaSAlex Elder 	ops[0].cls.class_len = (__u8) class_name_len;
1514aded07eaSAlex Elder 	ops[0].cls.method_name = method_name;
1515aded07eaSAlex Elder 	ops[0].cls.method_len = (__u8) method_name_len;
1516602adf40SYehuda Sadeh 	ops[0].cls.argc = 0;
15173cb4a687SAlex Elder 	ops[0].cls.indata = outbound;
15183cb4a687SAlex Elder 	ops[0].cls.indata_len = outbound_size;
1519602adf40SYehuda Sadeh 
15200ce1a794SAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
1521602adf40SYehuda Sadeh 			       CEPH_NOSNAP,
15223cb4a687SAlex Elder 			       flags, ops,
1523f8d4de6eSAlex Elder 			       object_name, 0, inbound_size, inbound,
1524f8d4de6eSAlex Elder 			       NULL, ver);
1525602adf40SYehuda Sadeh 
1526602adf40SYehuda Sadeh 	rbd_destroy_ops(ops);
1527602adf40SYehuda Sadeh 
1528602adf40SYehuda Sadeh 	dout("cls_exec returned %d\n", ret);
1529602adf40SYehuda Sadeh 	return ret;
1530602adf40SYehuda Sadeh }
1531602adf40SYehuda Sadeh 
15321fec7093SYehuda Sadeh static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
15331fec7093SYehuda Sadeh {
15341fec7093SYehuda Sadeh 	struct rbd_req_coll *coll =
15351fec7093SYehuda Sadeh 			kzalloc(sizeof(struct rbd_req_coll) +
15361fec7093SYehuda Sadeh 			        sizeof(struct rbd_req_status) * num_reqs,
15371fec7093SYehuda Sadeh 				GFP_ATOMIC);
15381fec7093SYehuda Sadeh 
15391fec7093SYehuda Sadeh 	if (!coll)
15401fec7093SYehuda Sadeh 		return NULL;
15411fec7093SYehuda Sadeh 	coll->total = num_reqs;
15421fec7093SYehuda Sadeh 	kref_init(&coll->kref);
15431fec7093SYehuda Sadeh 	return coll;
15441fec7093SYehuda Sadeh }
15451fec7093SYehuda Sadeh 
1546602adf40SYehuda Sadeh /*
1547602adf40SYehuda Sadeh  * block device queue callback
1548602adf40SYehuda Sadeh  */
1549602adf40SYehuda Sadeh static void rbd_rq_fn(struct request_queue *q)
1550602adf40SYehuda Sadeh {
1551602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
1552602adf40SYehuda Sadeh 	struct request *rq;
1553602adf40SYehuda Sadeh 
155400f1f36fSAlex Elder 	while ((rq = blk_fetch_request(q))) {
1555602adf40SYehuda Sadeh 		struct bio *bio;
1556602adf40SYehuda Sadeh 		bool do_write;
1557bd919d45SAlex Elder 		unsigned int size;
1558602adf40SYehuda Sadeh 		u64 ofs;
15591fec7093SYehuda Sadeh 		int num_segs, cur_seg = 0;
15601fec7093SYehuda Sadeh 		struct rbd_req_coll *coll;
1561d1d25646SJosh Durgin 		struct ceph_snap_context *snapc;
1562f7760dadSAlex Elder 		unsigned int bio_offset;
1563602adf40SYehuda Sadeh 
1564602adf40SYehuda Sadeh 		dout("fetched request\n");
1565602adf40SYehuda Sadeh 
1566602adf40SYehuda Sadeh 		/* filter out block requests we don't understand */
1567602adf40SYehuda Sadeh 		if ((rq->cmd_type != REQ_TYPE_FS)) {
1568602adf40SYehuda Sadeh 			__blk_end_request_all(rq, 0);
156900f1f36fSAlex Elder 			continue;
1570602adf40SYehuda Sadeh 		}
1571602adf40SYehuda Sadeh 
1572602adf40SYehuda Sadeh 		/* deduce our operation (read, write) */
1573602adf40SYehuda Sadeh 		do_write = (rq_data_dir(rq) == WRITE);
1574f84344f3SAlex Elder 		if (do_write && rbd_dev->mapping.read_only) {
1575602adf40SYehuda Sadeh 			__blk_end_request_all(rq, -EROFS);
157600f1f36fSAlex Elder 			continue;
1577602adf40SYehuda Sadeh 		}
1578602adf40SYehuda Sadeh 
1579602adf40SYehuda Sadeh 		spin_unlock_irq(q->queue_lock);
1580602adf40SYehuda Sadeh 
1581e88a36ecSJosh Durgin 		down_read(&rbd_dev->header_rwsem);
1582e88a36ecSJosh Durgin 
1583daba5fdbSAlex Elder 		if (!rbd_dev->exists) {
15840d7dbfceSAlex Elder 			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
1585d1d25646SJosh Durgin 			up_read(&rbd_dev->header_rwsem);
1586e88a36ecSJosh Durgin 			dout("request for non-existent snapshot");
1587e88a36ecSJosh Durgin 			spin_lock_irq(q->queue_lock);
1588e88a36ecSJosh Durgin 			__blk_end_request_all(rq, -ENXIO);
1589e88a36ecSJosh Durgin 			continue;
1590e88a36ecSJosh Durgin 		}
1591d1d25646SJosh Durgin 
1592d1d25646SJosh Durgin 		snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1593d1d25646SJosh Durgin 
1594d1d25646SJosh Durgin 		up_read(&rbd_dev->header_rwsem);
1595e88a36ecSJosh Durgin 
1596f7760dadSAlex Elder 		size = blk_rq_bytes(rq);
1597f7760dadSAlex Elder 		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
1598f7760dadSAlex Elder 		bio = rq->bio;
1599f7760dadSAlex Elder 
1600602adf40SYehuda Sadeh 		dout("%s 0x%x bytes at 0x%llx\n",
1601602adf40SYehuda Sadeh 		     do_write ? "write" : "read",
1602bd919d45SAlex Elder 		     size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
1603602adf40SYehuda Sadeh 
16041fec7093SYehuda Sadeh 		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
1605df111be6SAlex Elder 		if (num_segs <= 0) {
1606df111be6SAlex Elder 			spin_lock_irq(q->queue_lock);
1607df111be6SAlex Elder 			__blk_end_request_all(rq, num_segs);
1608df111be6SAlex Elder 			ceph_put_snap_context(snapc);
1609df111be6SAlex Elder 			continue;
1610df111be6SAlex Elder 		}
16111fec7093SYehuda Sadeh 		coll = rbd_alloc_coll(num_segs);
16121fec7093SYehuda Sadeh 		if (!coll) {
16131fec7093SYehuda Sadeh 			spin_lock_irq(q->queue_lock);
16141fec7093SYehuda Sadeh 			__blk_end_request_all(rq, -ENOMEM);
1615d1d25646SJosh Durgin 			ceph_put_snap_context(snapc);
161600f1f36fSAlex Elder 			continue;
16171fec7093SYehuda Sadeh 		}
16181fec7093SYehuda Sadeh 
1619f7760dadSAlex Elder 		bio_offset = 0;
1620602adf40SYehuda Sadeh 		do {
1621f7760dadSAlex Elder 			u64 limit = rbd_segment_length(rbd_dev, ofs, size);
1622f7760dadSAlex Elder 			unsigned int chain_size;
1623f7760dadSAlex Elder 			struct bio *bio_chain;
1624f7760dadSAlex Elder 
1625f7760dadSAlex Elder 			BUG_ON(limit > (u64) UINT_MAX);
1626f7760dadSAlex Elder 			chain_size = (unsigned int) limit;
1627bd919d45SAlex Elder 			dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
1628f7760dadSAlex Elder 
16291fec7093SYehuda Sadeh 			kref_get(&coll->kref);
1630f7760dadSAlex Elder 
1631f7760dadSAlex Elder 			/* Pass a cloned bio chain via an osd request */
1632f7760dadSAlex Elder 
1633f7760dadSAlex Elder 			bio_chain = bio_chain_clone_range(&bio,
1634f7760dadSAlex Elder 						&bio_offset, chain_size,
1635f7760dadSAlex Elder 						GFP_ATOMIC);
1636f7760dadSAlex Elder 			if (bio_chain)
16374634246dSAlex Elder 				(void) rbd_do_op(rq, rbd_dev, snapc,
1638f7760dadSAlex Elder 						ofs, chain_size,
1639f7760dadSAlex Elder 						bio_chain, coll, cur_seg);
16404634246dSAlex Elder 			else
16411fec7093SYehuda Sadeh 				rbd_coll_end_req_index(rq, coll, cur_seg,
1642f7760dadSAlex Elder 						       -ENOMEM, chain_size);
1643f7760dadSAlex Elder 			size -= chain_size;
1644f7760dadSAlex Elder 			ofs += chain_size;
1645602adf40SYehuda Sadeh 
16461fec7093SYehuda Sadeh 			cur_seg++;
1647602adf40SYehuda Sadeh 		} while (size > 0);
16481fec7093SYehuda Sadeh 		kref_put(&coll->kref, rbd_coll_release);
1649602adf40SYehuda Sadeh 
1650602adf40SYehuda Sadeh 		spin_lock_irq(q->queue_lock);
1651d1d25646SJosh Durgin 
1652d1d25646SJosh Durgin 		ceph_put_snap_context(snapc);
1653602adf40SYehuda Sadeh 	}
1654602adf40SYehuda Sadeh }
1655602adf40SYehuda Sadeh 
1656602adf40SYehuda Sadeh /*
1657602adf40SYehuda Sadeh  * a queue callback. Makes sure that we don't create a bio that spans across
1658602adf40SYehuda Sadeh  * multiple osd objects. One exception would be with a single page bios,
1659f7760dadSAlex Elder  * which we handle later at bio_chain_clone_range()
1660602adf40SYehuda Sadeh  */
1661602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1662602adf40SYehuda Sadeh 			  struct bio_vec *bvec)
1663602adf40SYehuda Sadeh {
1664602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
1665e5cfeed2SAlex Elder 	sector_t sector_offset;
1666e5cfeed2SAlex Elder 	sector_t sectors_per_obj;
1667e5cfeed2SAlex Elder 	sector_t obj_sector_offset;
1668e5cfeed2SAlex Elder 	int ret;
1669602adf40SYehuda Sadeh 
1670e5cfeed2SAlex Elder 	/*
1671e5cfeed2SAlex Elder 	 * Find how far into its rbd object the partition-relative
1672e5cfeed2SAlex Elder 	 * bio start sector is to offset relative to the enclosing
1673e5cfeed2SAlex Elder 	 * device.
1674e5cfeed2SAlex Elder 	 */
1675e5cfeed2SAlex Elder 	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
1676e5cfeed2SAlex Elder 	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1677e5cfeed2SAlex Elder 	obj_sector_offset = sector_offset & (sectors_per_obj - 1);
1678593a9e7bSAlex Elder 
1679e5cfeed2SAlex Elder 	/*
1680e5cfeed2SAlex Elder 	 * Compute the number of bytes from that offset to the end
1681e5cfeed2SAlex Elder 	 * of the object.  Account for what's already used by the bio.
1682e5cfeed2SAlex Elder 	 */
1683e5cfeed2SAlex Elder 	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
1684e5cfeed2SAlex Elder 	if (ret > bmd->bi_size)
1685e5cfeed2SAlex Elder 		ret -= bmd->bi_size;
1686e5cfeed2SAlex Elder 	else
1687e5cfeed2SAlex Elder 		ret = 0;
1688e5cfeed2SAlex Elder 
1689e5cfeed2SAlex Elder 	/*
1690e5cfeed2SAlex Elder 	 * Don't send back more than was asked for.  And if the bio
1691e5cfeed2SAlex Elder 	 * was empty, let the whole thing through because:  "Note
1692e5cfeed2SAlex Elder 	 * that a block device *must* allow a single page to be
1693e5cfeed2SAlex Elder 	 * added to an empty bio."
1694e5cfeed2SAlex Elder 	 */
1695e5cfeed2SAlex Elder 	rbd_assert(bvec->bv_len <= PAGE_SIZE);
1696e5cfeed2SAlex Elder 	if (ret > (int) bvec->bv_len || !bmd->bi_size)
1697e5cfeed2SAlex Elder 		ret = (int) bvec->bv_len;
1698e5cfeed2SAlex Elder 
1699e5cfeed2SAlex Elder 	return ret;
1700602adf40SYehuda Sadeh }
1701602adf40SYehuda Sadeh 
1702602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
1703602adf40SYehuda Sadeh {
1704602adf40SYehuda Sadeh 	struct gendisk *disk = rbd_dev->disk;
1705602adf40SYehuda Sadeh 
1706602adf40SYehuda Sadeh 	if (!disk)
1707602adf40SYehuda Sadeh 		return;
1708602adf40SYehuda Sadeh 
1709602adf40SYehuda Sadeh 	if (disk->flags & GENHD_FL_UP)
1710602adf40SYehuda Sadeh 		del_gendisk(disk);
1711602adf40SYehuda Sadeh 	if (disk->queue)
1712602adf40SYehuda Sadeh 		blk_cleanup_queue(disk->queue);
1713602adf40SYehuda Sadeh 	put_disk(disk);
1714602adf40SYehuda Sadeh }
1715602adf40SYehuda Sadeh 
1716602adf40SYehuda Sadeh /*
17174156d998SAlex Elder  * Read the complete header for the given rbd device.
17184156d998SAlex Elder  *
17194156d998SAlex Elder  * Returns a pointer to a dynamically-allocated buffer containing
17204156d998SAlex Elder  * the complete and validated header.  Caller can pass the address
17214156d998SAlex Elder  * of a variable that will be filled in with the version of the
17224156d998SAlex Elder  * header object at the time it was read.
17234156d998SAlex Elder  *
17244156d998SAlex Elder  * Returns a pointer-coded errno if a failure occurs.
17254156d998SAlex Elder  */
17264156d998SAlex Elder static struct rbd_image_header_ondisk *
17274156d998SAlex Elder rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
17284156d998SAlex Elder {
17294156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk = NULL;
17304156d998SAlex Elder 	u32 snap_count = 0;
17314156d998SAlex Elder 	u64 names_size = 0;
17324156d998SAlex Elder 	u32 want_count;
17334156d998SAlex Elder 	int ret;
17344156d998SAlex Elder 
17354156d998SAlex Elder 	/*
17364156d998SAlex Elder 	 * The complete header will include an array of its 64-bit
17374156d998SAlex Elder 	 * snapshot ids, followed by the names of those snapshots as
17384156d998SAlex Elder 	 * a contiguous block of NUL-terminated strings.  Note that
17394156d998SAlex Elder 	 * the number of snapshots could change by the time we read
17404156d998SAlex Elder 	 * it in, in which case we re-read it.
17414156d998SAlex Elder 	 */
17424156d998SAlex Elder 	do {
17434156d998SAlex Elder 		size_t size;
17444156d998SAlex Elder 
17454156d998SAlex Elder 		kfree(ondisk);
17464156d998SAlex Elder 
17474156d998SAlex Elder 		size = sizeof (*ondisk);
17484156d998SAlex Elder 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
17494156d998SAlex Elder 		size += names_size;
17504156d998SAlex Elder 		ondisk = kmalloc(size, GFP_KERNEL);
17514156d998SAlex Elder 		if (!ondisk)
17524156d998SAlex Elder 			return ERR_PTR(-ENOMEM);
17534156d998SAlex Elder 
17544156d998SAlex Elder 		ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
17554156d998SAlex Elder 				       rbd_dev->header_name,
17564156d998SAlex Elder 				       0, size,
17574156d998SAlex Elder 				       (char *) ondisk, version);
17584156d998SAlex Elder 
17594156d998SAlex Elder 		if (ret < 0)
17604156d998SAlex Elder 			goto out_err;
17614156d998SAlex Elder 		if (WARN_ON((size_t) ret < size)) {
17624156d998SAlex Elder 			ret = -ENXIO;
17634156d998SAlex Elder 			pr_warning("short header read for image %s"
17644156d998SAlex Elder 					" (want %zd got %d)\n",
17650d7dbfceSAlex Elder 				rbd_dev->spec->image_name, size, ret);
17664156d998SAlex Elder 			goto out_err;
17674156d998SAlex Elder 		}
17684156d998SAlex Elder 		if (!rbd_dev_ondisk_valid(ondisk)) {
17694156d998SAlex Elder 			ret = -ENXIO;
17704156d998SAlex Elder 			pr_warning("invalid header for image %s\n",
17710d7dbfceSAlex Elder 				rbd_dev->spec->image_name);
17724156d998SAlex Elder 			goto out_err;
17734156d998SAlex Elder 		}
17744156d998SAlex Elder 
17754156d998SAlex Elder 		names_size = le64_to_cpu(ondisk->snap_names_len);
17764156d998SAlex Elder 		want_count = snap_count;
17774156d998SAlex Elder 		snap_count = le32_to_cpu(ondisk->snap_count);
17784156d998SAlex Elder 	} while (snap_count != want_count);
17794156d998SAlex Elder 
17804156d998SAlex Elder 	return ondisk;
17814156d998SAlex Elder 
17824156d998SAlex Elder out_err:
17834156d998SAlex Elder 	kfree(ondisk);
17844156d998SAlex Elder 
17854156d998SAlex Elder 	return ERR_PTR(ret);
17864156d998SAlex Elder }
17874156d998SAlex Elder 
17884156d998SAlex Elder /*
1789602adf40SYehuda Sadeh  * reload the ondisk the header
1790602adf40SYehuda Sadeh  */
1791602adf40SYehuda Sadeh static int rbd_read_header(struct rbd_device *rbd_dev,
1792602adf40SYehuda Sadeh 			   struct rbd_image_header *header)
1793602adf40SYehuda Sadeh {
17944156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk;
17954156d998SAlex Elder 	u64 ver = 0;
17964156d998SAlex Elder 	int ret;
1797602adf40SYehuda Sadeh 
17984156d998SAlex Elder 	ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
17994156d998SAlex Elder 	if (IS_ERR(ondisk))
18004156d998SAlex Elder 		return PTR_ERR(ondisk);
18014156d998SAlex Elder 	ret = rbd_header_from_disk(header, ondisk);
18024156d998SAlex Elder 	if (ret >= 0)
180359c2be1eSYehuda Sadeh 		header->obj_version = ver;
18044156d998SAlex Elder 	kfree(ondisk);
1805602adf40SYehuda Sadeh 
18064156d998SAlex Elder 	return ret;
1807602adf40SYehuda Sadeh }
1808602adf40SYehuda Sadeh 
180941f38c2bSAlex Elder static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1810dfc5606dSYehuda Sadeh {
1811dfc5606dSYehuda Sadeh 	struct rbd_snap *snap;
1812a0593290SAlex Elder 	struct rbd_snap *next;
1813dfc5606dSYehuda Sadeh 
1814a0593290SAlex Elder 	list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
181541f38c2bSAlex Elder 		rbd_remove_snap_dev(snap);
1816dfc5606dSYehuda Sadeh }
1817dfc5606dSYehuda Sadeh 
18189478554aSAlex Elder static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
18199478554aSAlex Elder {
18209478554aSAlex Elder 	sector_t size;
18219478554aSAlex Elder 
18220d7dbfceSAlex Elder 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
18239478554aSAlex Elder 		return;
18249478554aSAlex Elder 
18259478554aSAlex Elder 	size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
18269478554aSAlex Elder 	dout("setting size to %llu sectors", (unsigned long long) size);
18279478554aSAlex Elder 	rbd_dev->mapping.size = (u64) size;
18289478554aSAlex Elder 	set_capacity(rbd_dev->disk, size);
18299478554aSAlex Elder }
18309478554aSAlex Elder 
1831602adf40SYehuda Sadeh /*
1832602adf40SYehuda Sadeh  * only read the first part of the ondisk header, without the snaps info
1833602adf40SYehuda Sadeh  */
1834117973fbSAlex Elder static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
1835602adf40SYehuda Sadeh {
1836602adf40SYehuda Sadeh 	int ret;
1837602adf40SYehuda Sadeh 	struct rbd_image_header h;
1838602adf40SYehuda Sadeh 
1839602adf40SYehuda Sadeh 	ret = rbd_read_header(rbd_dev, &h);
1840602adf40SYehuda Sadeh 	if (ret < 0)
1841602adf40SYehuda Sadeh 		return ret;
1842602adf40SYehuda Sadeh 
1843a51aa0c0SJosh Durgin 	down_write(&rbd_dev->header_rwsem);
1844a51aa0c0SJosh Durgin 
18459478554aSAlex Elder 	/* Update image size, and check for resize of mapped image */
18469478554aSAlex Elder 	rbd_dev->header.image_size = h.image_size;
18479478554aSAlex Elder 	rbd_update_mapping_size(rbd_dev);
18489db4b3e3SSage Weil 
1849849b4260SAlex Elder 	/* rbd_dev->header.object_prefix shouldn't change */
1850602adf40SYehuda Sadeh 	kfree(rbd_dev->header.snap_sizes);
1851849b4260SAlex Elder 	kfree(rbd_dev->header.snap_names);
1852d1d25646SJosh Durgin 	/* osd requests may still refer to snapc */
1853d1d25646SJosh Durgin 	ceph_put_snap_context(rbd_dev->header.snapc);
1854602adf40SYehuda Sadeh 
1855b813623aSAlex Elder 	if (hver)
1856b813623aSAlex Elder 		*hver = h.obj_version;
1857a71b891bSJosh Durgin 	rbd_dev->header.obj_version = h.obj_version;
185893a24e08SJosh Durgin 	rbd_dev->header.image_size = h.image_size;
1859602adf40SYehuda Sadeh 	rbd_dev->header.snapc = h.snapc;
1860602adf40SYehuda Sadeh 	rbd_dev->header.snap_names = h.snap_names;
1861602adf40SYehuda Sadeh 	rbd_dev->header.snap_sizes = h.snap_sizes;
1862849b4260SAlex Elder 	/* Free the extra copy of the object prefix */
1863849b4260SAlex Elder 	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1864849b4260SAlex Elder 	kfree(h.object_prefix);
1865849b4260SAlex Elder 
1866304f6808SAlex Elder 	ret = rbd_dev_snaps_update(rbd_dev);
1867304f6808SAlex Elder 	if (!ret)
1868304f6808SAlex Elder 		ret = rbd_dev_snaps_register(rbd_dev);
1869dfc5606dSYehuda Sadeh 
1870c666601aSJosh Durgin 	up_write(&rbd_dev->header_rwsem);
1871602adf40SYehuda Sadeh 
1872dfc5606dSYehuda Sadeh 	return ret;
1873602adf40SYehuda Sadeh }
1874602adf40SYehuda Sadeh 
1875117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
18761fe5e993SAlex Elder {
18771fe5e993SAlex Elder 	int ret;
18781fe5e993SAlex Elder 
1879117973fbSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
18801fe5e993SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1881117973fbSAlex Elder 	if (rbd_dev->image_format == 1)
1882117973fbSAlex Elder 		ret = rbd_dev_v1_refresh(rbd_dev, hver);
1883117973fbSAlex Elder 	else
1884117973fbSAlex Elder 		ret = rbd_dev_v2_refresh(rbd_dev, hver);
18851fe5e993SAlex Elder 	mutex_unlock(&ctl_mutex);
18861fe5e993SAlex Elder 
18871fe5e993SAlex Elder 	return ret;
18881fe5e993SAlex Elder }
18891fe5e993SAlex Elder 
1890602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
1891602adf40SYehuda Sadeh {
1892602adf40SYehuda Sadeh 	struct gendisk *disk;
1893602adf40SYehuda Sadeh 	struct request_queue *q;
1894593a9e7bSAlex Elder 	u64 segment_size;
1895602adf40SYehuda Sadeh 
1896602adf40SYehuda Sadeh 	/* create gendisk info */
1897602adf40SYehuda Sadeh 	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1898602adf40SYehuda Sadeh 	if (!disk)
18991fcdb8aaSAlex Elder 		return -ENOMEM;
1900602adf40SYehuda Sadeh 
1901f0f8cef5SAlex Elder 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
1902de71a297SAlex Elder 		 rbd_dev->dev_id);
1903602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
1904602adf40SYehuda Sadeh 	disk->first_minor = 0;
1905602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
1906602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
1907602adf40SYehuda Sadeh 
1908602adf40SYehuda Sadeh 	/* init rq */
1909602adf40SYehuda Sadeh 	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1910602adf40SYehuda Sadeh 	if (!q)
1911602adf40SYehuda Sadeh 		goto out_disk;
1912029bcbd8SJosh Durgin 
1913593a9e7bSAlex Elder 	/* We use the default size, but let's be explicit about it. */
1914593a9e7bSAlex Elder 	blk_queue_physical_block_size(q, SECTOR_SIZE);
1915593a9e7bSAlex Elder 
1916029bcbd8SJosh Durgin 	/* set io sizes to object size */
1917593a9e7bSAlex Elder 	segment_size = rbd_obj_bytes(&rbd_dev->header);
1918593a9e7bSAlex Elder 	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
1919593a9e7bSAlex Elder 	blk_queue_max_segment_size(q, segment_size);
1920593a9e7bSAlex Elder 	blk_queue_io_min(q, segment_size);
1921593a9e7bSAlex Elder 	blk_queue_io_opt(q, segment_size);
1922029bcbd8SJosh Durgin 
1923602adf40SYehuda Sadeh 	blk_queue_merge_bvec(q, rbd_merge_bvec);
1924602adf40SYehuda Sadeh 	disk->queue = q;
1925602adf40SYehuda Sadeh 
1926602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
1927602adf40SYehuda Sadeh 
1928602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
1929602adf40SYehuda Sadeh 
193012f02944SAlex Elder 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
193112f02944SAlex Elder 
1932602adf40SYehuda Sadeh 	return 0;
1933602adf40SYehuda Sadeh out_disk:
1934602adf40SYehuda Sadeh 	put_disk(disk);
19351fcdb8aaSAlex Elder 
19361fcdb8aaSAlex Elder 	return -ENOMEM;
1937602adf40SYehuda Sadeh }
1938602adf40SYehuda Sadeh 
1939dfc5606dSYehuda Sadeh /*
1940dfc5606dSYehuda Sadeh   sysfs
1941dfc5606dSYehuda Sadeh */
1942602adf40SYehuda Sadeh 
1943593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
1944593a9e7bSAlex Elder {
1945593a9e7bSAlex Elder 	return container_of(dev, struct rbd_device, dev);
1946593a9e7bSAlex Elder }
1947593a9e7bSAlex Elder 
1948dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
1949dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
1950602adf40SYehuda Sadeh {
1951593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1952a51aa0c0SJosh Durgin 	sector_t size;
1953dfc5606dSYehuda Sadeh 
1954a51aa0c0SJosh Durgin 	down_read(&rbd_dev->header_rwsem);
1955a51aa0c0SJosh Durgin 	size = get_capacity(rbd_dev->disk);
1956a51aa0c0SJosh Durgin 	up_read(&rbd_dev->header_rwsem);
1957a51aa0c0SJosh Durgin 
1958a51aa0c0SJosh Durgin 	return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
1959602adf40SYehuda Sadeh }
1960602adf40SYehuda Sadeh 
196134b13184SAlex Elder /*
196234b13184SAlex Elder  * Note this shows the features for whatever's mapped, which is not
196334b13184SAlex Elder  * necessarily the base image.
196434b13184SAlex Elder  */
196534b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev,
196634b13184SAlex Elder 			     struct device_attribute *attr, char *buf)
196734b13184SAlex Elder {
196834b13184SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
196934b13184SAlex Elder 
197034b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
197134b13184SAlex Elder 			(unsigned long long) rbd_dev->mapping.features);
197234b13184SAlex Elder }
197334b13184SAlex Elder 
1974dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
1975dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
1976602adf40SYehuda Sadeh {
1977593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1978dfc5606dSYehuda Sadeh 
1979dfc5606dSYehuda Sadeh 	return sprintf(buf, "%d\n", rbd_dev->major);
1980dfc5606dSYehuda Sadeh }
1981dfc5606dSYehuda Sadeh 
1982dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
1983dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
1984dfc5606dSYehuda Sadeh {
1985593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1986dfc5606dSYehuda Sadeh 
19871dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
19881dbb4399SAlex Elder 			ceph_client_id(rbd_dev->rbd_client->client));
1989dfc5606dSYehuda Sadeh }
1990dfc5606dSYehuda Sadeh 
1991dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
1992dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
1993dfc5606dSYehuda Sadeh {
1994593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1995dfc5606dSYehuda Sadeh 
19960d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
1997dfc5606dSYehuda Sadeh }
1998dfc5606dSYehuda Sadeh 
19999bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
20009bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
20019bb2f334SAlex Elder {
20029bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
20039bb2f334SAlex Elder 
20040d7dbfceSAlex Elder 	return sprintf(buf, "%llu\n",
20050d7dbfceSAlex Elder 		(unsigned long long) rbd_dev->spec->pool_id);
20069bb2f334SAlex Elder }
20079bb2f334SAlex Elder 
2008dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
2009dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
2010dfc5606dSYehuda Sadeh {
2011593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2012dfc5606dSYehuda Sadeh 
2013a92ffdf8SAlex Elder 	if (rbd_dev->spec->image_name)
20140d7dbfceSAlex Elder 		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
2015a92ffdf8SAlex Elder 
2016a92ffdf8SAlex Elder 	return sprintf(buf, "(unknown)\n");
2017dfc5606dSYehuda Sadeh }
2018dfc5606dSYehuda Sadeh 
2019589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev,
2020589d30e0SAlex Elder 			     struct device_attribute *attr, char *buf)
2021589d30e0SAlex Elder {
2022589d30e0SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2023589d30e0SAlex Elder 
20240d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
2025589d30e0SAlex Elder }
2026589d30e0SAlex Elder 
202734b13184SAlex Elder /*
202834b13184SAlex Elder  * Shows the name of the currently-mapped snapshot (or
202934b13184SAlex Elder  * RBD_SNAP_HEAD_NAME for the base image).
203034b13184SAlex Elder  */
2031dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
2032dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
2033dfc5606dSYehuda Sadeh 			     char *buf)
2034dfc5606dSYehuda Sadeh {
2035593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2036dfc5606dSYehuda Sadeh 
20370d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
2038dfc5606dSYehuda Sadeh }
2039dfc5606dSYehuda Sadeh 
204086b00e0dSAlex Elder /*
204186b00e0dSAlex Elder  * For an rbd v2 image, shows the pool id, image id, and snapshot id
204286b00e0dSAlex Elder  * for the parent image.  If there is no parent, simply shows
204386b00e0dSAlex Elder  * "(no parent image)".
204486b00e0dSAlex Elder  */
204586b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev,
204686b00e0dSAlex Elder 			     struct device_attribute *attr,
204786b00e0dSAlex Elder 			     char *buf)
204886b00e0dSAlex Elder {
204986b00e0dSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
205086b00e0dSAlex Elder 	struct rbd_spec *spec = rbd_dev->parent_spec;
205186b00e0dSAlex Elder 	int count;
205286b00e0dSAlex Elder 	char *bufp = buf;
205386b00e0dSAlex Elder 
205486b00e0dSAlex Elder 	if (!spec)
205586b00e0dSAlex Elder 		return sprintf(buf, "(no parent image)\n");
205686b00e0dSAlex Elder 
205786b00e0dSAlex Elder 	count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
205886b00e0dSAlex Elder 			(unsigned long long) spec->pool_id, spec->pool_name);
205986b00e0dSAlex Elder 	if (count < 0)
206086b00e0dSAlex Elder 		return count;
206186b00e0dSAlex Elder 	bufp += count;
206286b00e0dSAlex Elder 
206386b00e0dSAlex Elder 	count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
206486b00e0dSAlex Elder 			spec->image_name ? spec->image_name : "(unknown)");
206586b00e0dSAlex Elder 	if (count < 0)
206686b00e0dSAlex Elder 		return count;
206786b00e0dSAlex Elder 	bufp += count;
206886b00e0dSAlex Elder 
206986b00e0dSAlex Elder 	count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
207086b00e0dSAlex Elder 			(unsigned long long) spec->snap_id, spec->snap_name);
207186b00e0dSAlex Elder 	if (count < 0)
207286b00e0dSAlex Elder 		return count;
207386b00e0dSAlex Elder 	bufp += count;
207486b00e0dSAlex Elder 
207586b00e0dSAlex Elder 	count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
207686b00e0dSAlex Elder 	if (count < 0)
207786b00e0dSAlex Elder 		return count;
207886b00e0dSAlex Elder 	bufp += count;
207986b00e0dSAlex Elder 
208086b00e0dSAlex Elder 	return (ssize_t) (bufp - buf);
208186b00e0dSAlex Elder }
208286b00e0dSAlex Elder 
2083dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
2084dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
2085dfc5606dSYehuda Sadeh 				 const char *buf,
2086dfc5606dSYehuda Sadeh 				 size_t size)
2087dfc5606dSYehuda Sadeh {
2088593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2089b813623aSAlex Elder 	int ret;
2090602adf40SYehuda Sadeh 
2091117973fbSAlex Elder 	ret = rbd_dev_refresh(rbd_dev, NULL);
2092b813623aSAlex Elder 
2093b813623aSAlex Elder 	return ret < 0 ? ret : size;
2094dfc5606dSYehuda Sadeh }
2095602adf40SYehuda Sadeh 
/* Read-only attributes describing the mapped image */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);

/* Write-only (owner) attribute: writing to it refreshes the image */
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
2107dfc5606dSYehuda Sadeh 
2108dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = {
2109dfc5606dSYehuda Sadeh 	&dev_attr_size.attr,
211034b13184SAlex Elder 	&dev_attr_features.attr,
2111dfc5606dSYehuda Sadeh 	&dev_attr_major.attr,
2112dfc5606dSYehuda Sadeh 	&dev_attr_client_id.attr,
2113dfc5606dSYehuda Sadeh 	&dev_attr_pool.attr,
21149bb2f334SAlex Elder 	&dev_attr_pool_id.attr,
2115dfc5606dSYehuda Sadeh 	&dev_attr_name.attr,
2116589d30e0SAlex Elder 	&dev_attr_image_id.attr,
2117dfc5606dSYehuda Sadeh 	&dev_attr_current_snap.attr,
211886b00e0dSAlex Elder 	&dev_attr_parent.attr,
2119dfc5606dSYehuda Sadeh 	&dev_attr_refresh.attr,
2120dfc5606dSYehuda Sadeh 	NULL
2121dfc5606dSYehuda Sadeh };
2122dfc5606dSYehuda Sadeh 
2123dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = {
2124dfc5606dSYehuda Sadeh 	.attrs = rbd_attrs,
2125dfc5606dSYehuda Sadeh };
2126dfc5606dSYehuda Sadeh 
2127dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = {
2128dfc5606dSYehuda Sadeh 	&rbd_attr_group,
2129dfc5606dSYehuda Sadeh 	NULL
2130dfc5606dSYehuda Sadeh };
2131dfc5606dSYehuda Sadeh 
/*
 * Release callback installed in rbd_device_type.  It performs no
 * work.
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
	/* Nothing to do */
}
2135dfc5606dSYehuda Sadeh 
2136dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = {
2137dfc5606dSYehuda Sadeh 	.name		= "rbd",
2138dfc5606dSYehuda Sadeh 	.groups		= rbd_attr_groups,
2139dfc5606dSYehuda Sadeh 	.release	= rbd_sysfs_dev_release,
2140dfc5606dSYehuda Sadeh };
2141dfc5606dSYehuda Sadeh 
2142dfc5606dSYehuda Sadeh 
2143dfc5606dSYehuda Sadeh /*
2144dfc5606dSYehuda Sadeh   sysfs - snapshots
2145dfc5606dSYehuda Sadeh */
2146dfc5606dSYehuda Sadeh 
2147dfc5606dSYehuda Sadeh static ssize_t rbd_snap_size_show(struct device *dev,
2148dfc5606dSYehuda Sadeh 				  struct device_attribute *attr,
2149dfc5606dSYehuda Sadeh 				  char *buf)
2150dfc5606dSYehuda Sadeh {
2151dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2152dfc5606dSYehuda Sadeh 
21533591538fSJosh Durgin 	return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
2154dfc5606dSYehuda Sadeh }
2155dfc5606dSYehuda Sadeh 
2156dfc5606dSYehuda Sadeh static ssize_t rbd_snap_id_show(struct device *dev,
2157dfc5606dSYehuda Sadeh 				struct device_attribute *attr,
2158dfc5606dSYehuda Sadeh 				char *buf)
2159dfc5606dSYehuda Sadeh {
2160dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2161dfc5606dSYehuda Sadeh 
2162593a9e7bSAlex Elder 	return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
2163dfc5606dSYehuda Sadeh }
2164dfc5606dSYehuda Sadeh 
216534b13184SAlex Elder static ssize_t rbd_snap_features_show(struct device *dev,
216634b13184SAlex Elder 				struct device_attribute *attr,
216734b13184SAlex Elder 				char *buf)
216834b13184SAlex Elder {
216934b13184SAlex Elder 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
217034b13184SAlex Elder 
217134b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
217234b13184SAlex Elder 			(unsigned long long) snap->features);
217334b13184SAlex Elder }
217434b13184SAlex Elder 
/* Read-only attributes exposed for each snapshot device */
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
2178dfc5606dSYehuda Sadeh 
2179dfc5606dSYehuda Sadeh static struct attribute *rbd_snap_attrs[] = {
2180dfc5606dSYehuda Sadeh 	&dev_attr_snap_size.attr,
2181dfc5606dSYehuda Sadeh 	&dev_attr_snap_id.attr,
218234b13184SAlex Elder 	&dev_attr_snap_features.attr,
2183dfc5606dSYehuda Sadeh 	NULL,
2184dfc5606dSYehuda Sadeh };
2185dfc5606dSYehuda Sadeh 
2186dfc5606dSYehuda Sadeh static struct attribute_group rbd_snap_attr_group = {
2187dfc5606dSYehuda Sadeh 	.attrs = rbd_snap_attrs,
2188dfc5606dSYehuda Sadeh };
2189dfc5606dSYehuda Sadeh 
2190dfc5606dSYehuda Sadeh static void rbd_snap_dev_release(struct device *dev)
2191dfc5606dSYehuda Sadeh {
2192dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2193dfc5606dSYehuda Sadeh 	kfree(snap->name);
2194dfc5606dSYehuda Sadeh 	kfree(snap);
2195dfc5606dSYehuda Sadeh }
2196dfc5606dSYehuda Sadeh 
2197dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_snap_attr_groups[] = {
2198dfc5606dSYehuda Sadeh 	&rbd_snap_attr_group,
2199dfc5606dSYehuda Sadeh 	NULL
2200dfc5606dSYehuda Sadeh };
2201dfc5606dSYehuda Sadeh 
2202dfc5606dSYehuda Sadeh static struct device_type rbd_snap_device_type = {
2203dfc5606dSYehuda Sadeh 	.groups		= rbd_snap_attr_groups,
2204dfc5606dSYehuda Sadeh 	.release	= rbd_snap_dev_release,
2205dfc5606dSYehuda Sadeh };
2206dfc5606dSYehuda Sadeh 
22078b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
22088b8fb99cSAlex Elder {
22098b8fb99cSAlex Elder 	kref_get(&spec->kref);
22108b8fb99cSAlex Elder 
22118b8fb99cSAlex Elder 	return spec;
22128b8fb99cSAlex Elder }
22138b8fb99cSAlex Elder 
22148b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref);
22158b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec)
22168b8fb99cSAlex Elder {
22178b8fb99cSAlex Elder 	if (spec)
22188b8fb99cSAlex Elder 		kref_put(&spec->kref, rbd_spec_free);
22198b8fb99cSAlex Elder }
22208b8fb99cSAlex Elder 
22218b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void)
22228b8fb99cSAlex Elder {
22238b8fb99cSAlex Elder 	struct rbd_spec *spec;
22248b8fb99cSAlex Elder 
22258b8fb99cSAlex Elder 	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
22268b8fb99cSAlex Elder 	if (!spec)
22278b8fb99cSAlex Elder 		return NULL;
22288b8fb99cSAlex Elder 	kref_init(&spec->kref);
22298b8fb99cSAlex Elder 
22308b8fb99cSAlex Elder 	rbd_spec_put(rbd_spec_get(spec));	/* TEMPORARY */
22318b8fb99cSAlex Elder 
22328b8fb99cSAlex Elder 	return spec;
22338b8fb99cSAlex Elder }
22348b8fb99cSAlex Elder 
22358b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref)
22368b8fb99cSAlex Elder {
22378b8fb99cSAlex Elder 	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
22388b8fb99cSAlex Elder 
22398b8fb99cSAlex Elder 	kfree(spec->pool_name);
22408b8fb99cSAlex Elder 	kfree(spec->image_id);
22418b8fb99cSAlex Elder 	kfree(spec->image_name);
22428b8fb99cSAlex Elder 	kfree(spec->snap_name);
22438b8fb99cSAlex Elder 	kfree(spec);
22448b8fb99cSAlex Elder }
22458b8fb99cSAlex Elder 
2246c53d5893SAlex Elder struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
2247c53d5893SAlex Elder 				struct rbd_spec *spec)
2248c53d5893SAlex Elder {
2249c53d5893SAlex Elder 	struct rbd_device *rbd_dev;
2250c53d5893SAlex Elder 
2251c53d5893SAlex Elder 	rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
2252c53d5893SAlex Elder 	if (!rbd_dev)
2253c53d5893SAlex Elder 		return NULL;
2254c53d5893SAlex Elder 
2255c53d5893SAlex Elder 	spin_lock_init(&rbd_dev->lock);
2256c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->node);
2257c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->snaps);
2258c53d5893SAlex Elder 	init_rwsem(&rbd_dev->header_rwsem);
2259c53d5893SAlex Elder 
2260c53d5893SAlex Elder 	rbd_dev->spec = spec;
2261c53d5893SAlex Elder 	rbd_dev->rbd_client = rbdc;
2262c53d5893SAlex Elder 
2263c53d5893SAlex Elder 	return rbd_dev;
2264c53d5893SAlex Elder }
2265c53d5893SAlex Elder 
2266c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev)
2267c53d5893SAlex Elder {
226886b00e0dSAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
2269c53d5893SAlex Elder 	kfree(rbd_dev->header_name);
2270c53d5893SAlex Elder 	rbd_put_client(rbd_dev->rbd_client);
2271c53d5893SAlex Elder 	rbd_spec_put(rbd_dev->spec);
2272c53d5893SAlex Elder 	kfree(rbd_dev);
2273c53d5893SAlex Elder }
2274c53d5893SAlex Elder 
2275304f6808SAlex Elder static bool rbd_snap_registered(struct rbd_snap *snap)
2276304f6808SAlex Elder {
2277304f6808SAlex Elder 	bool ret = snap->dev.type == &rbd_snap_device_type;
2278304f6808SAlex Elder 	bool reg = device_is_registered(&snap->dev);
2279304f6808SAlex Elder 
2280304f6808SAlex Elder 	rbd_assert(!ret ^ reg);
2281304f6808SAlex Elder 
2282304f6808SAlex Elder 	return ret;
2283304f6808SAlex Elder }
2284304f6808SAlex Elder 
228541f38c2bSAlex Elder static void rbd_remove_snap_dev(struct rbd_snap *snap)
2286dfc5606dSYehuda Sadeh {
2287dfc5606dSYehuda Sadeh 	list_del(&snap->node);
2288304f6808SAlex Elder 	if (device_is_registered(&snap->dev))
2289dfc5606dSYehuda Sadeh 		device_unregister(&snap->dev);
2290dfc5606dSYehuda Sadeh }
2291dfc5606dSYehuda Sadeh 
229214e7085dSAlex Elder static int rbd_register_snap_dev(struct rbd_snap *snap,
2293dfc5606dSYehuda Sadeh 				  struct device *parent)
2294dfc5606dSYehuda Sadeh {
2295dfc5606dSYehuda Sadeh 	struct device *dev = &snap->dev;
2296dfc5606dSYehuda Sadeh 	int ret;
2297dfc5606dSYehuda Sadeh 
2298dfc5606dSYehuda Sadeh 	dev->type = &rbd_snap_device_type;
2299dfc5606dSYehuda Sadeh 	dev->parent = parent;
2300dfc5606dSYehuda Sadeh 	dev->release = rbd_snap_dev_release;
2301d4b125e9SAlex Elder 	dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
2302304f6808SAlex Elder 	dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2303304f6808SAlex Elder 
2304dfc5606dSYehuda Sadeh 	ret = device_register(dev);
2305dfc5606dSYehuda Sadeh 
2306dfc5606dSYehuda Sadeh 	return ret;
2307dfc5606dSYehuda Sadeh }
2308dfc5606dSYehuda Sadeh 
23094e891e0aSAlex Elder static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
2310c8d18425SAlex Elder 						const char *snap_name,
231134b13184SAlex Elder 						u64 snap_id, u64 snap_size,
231234b13184SAlex Elder 						u64 snap_features)
2313dfc5606dSYehuda Sadeh {
23144e891e0aSAlex Elder 	struct rbd_snap *snap;
2315dfc5606dSYehuda Sadeh 	int ret;
23164e891e0aSAlex Elder 
23174e891e0aSAlex Elder 	snap = kzalloc(sizeof (*snap), GFP_KERNEL);
2318dfc5606dSYehuda Sadeh 	if (!snap)
23194e891e0aSAlex Elder 		return ERR_PTR(-ENOMEM);
23204e891e0aSAlex Elder 
23214e891e0aSAlex Elder 	ret = -ENOMEM;
2322c8d18425SAlex Elder 	snap->name = kstrdup(snap_name, GFP_KERNEL);
23234e891e0aSAlex Elder 	if (!snap->name)
23244e891e0aSAlex Elder 		goto err;
23254e891e0aSAlex Elder 
2326c8d18425SAlex Elder 	snap->id = snap_id;
2327c8d18425SAlex Elder 	snap->size = snap_size;
232834b13184SAlex Elder 	snap->features = snap_features;
23294e891e0aSAlex Elder 
23304e891e0aSAlex Elder 	return snap;
23314e891e0aSAlex Elder 
2332dfc5606dSYehuda Sadeh err:
2333dfc5606dSYehuda Sadeh 	kfree(snap->name);
2334dfc5606dSYehuda Sadeh 	kfree(snap);
23354e891e0aSAlex Elder 
23364e891e0aSAlex Elder 	return ERR_PTR(ret);
2337dfc5606dSYehuda Sadeh }
2338dfc5606dSYehuda Sadeh 
2339cd892126SAlex Elder static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2340cd892126SAlex Elder 		u64 *snap_size, u64 *snap_features)
2341cd892126SAlex Elder {
2342cd892126SAlex Elder 	char *snap_name;
2343cd892126SAlex Elder 
2344cd892126SAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2345cd892126SAlex Elder 
2346cd892126SAlex Elder 	*snap_size = rbd_dev->header.snap_sizes[which];
2347cd892126SAlex Elder 	*snap_features = 0;	/* No features for v1 */
2348cd892126SAlex Elder 
2349cd892126SAlex Elder 	/* Skip over names until we find the one we are looking for */
2350cd892126SAlex Elder 
2351cd892126SAlex Elder 	snap_name = rbd_dev->header.snap_names;
2352cd892126SAlex Elder 	while (which--)
2353cd892126SAlex Elder 		snap_name += strlen(snap_name) + 1;
2354cd892126SAlex Elder 
2355cd892126SAlex Elder 	return snap_name;
2356cd892126SAlex Elder }
2357cd892126SAlex Elder 
2358dfc5606dSYehuda Sadeh /*
23599d475de5SAlex Elder  * Get the size and object order for an image snapshot, or if
23609d475de5SAlex Elder  * snap_id is CEPH_NOSNAP, gets this information for the base
23619d475de5SAlex Elder  * image.
23629d475de5SAlex Elder  */
23639d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
23649d475de5SAlex Elder 				u8 *order, u64 *snap_size)
23659d475de5SAlex Elder {
23669d475de5SAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
23679d475de5SAlex Elder 	int ret;
23689d475de5SAlex Elder 	struct {
23699d475de5SAlex Elder 		u8 order;
23709d475de5SAlex Elder 		__le64 size;
23719d475de5SAlex Elder 	} __attribute__ ((packed)) size_buf = { 0 };
23729d475de5SAlex Elder 
23739d475de5SAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
23749d475de5SAlex Elder 				"rbd", "get_size",
23759d475de5SAlex Elder 				(char *) &snapid, sizeof (snapid),
23769d475de5SAlex Elder 				(char *) &size_buf, sizeof (size_buf),
23779d475de5SAlex Elder 				CEPH_OSD_FLAG_READ, NULL);
23789d475de5SAlex Elder 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
23799d475de5SAlex Elder 	if (ret < 0)
23809d475de5SAlex Elder 		return ret;
23819d475de5SAlex Elder 
23829d475de5SAlex Elder 	*order = size_buf.order;
23839d475de5SAlex Elder 	*snap_size = le64_to_cpu(size_buf.size);
23849d475de5SAlex Elder 
23859d475de5SAlex Elder 	dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
23869d475de5SAlex Elder 		(unsigned long long) snap_id, (unsigned int) *order,
23879d475de5SAlex Elder 		(unsigned long long) *snap_size);
23889d475de5SAlex Elder 
23899d475de5SAlex Elder 	return 0;
23909d475de5SAlex Elder }
23919d475de5SAlex Elder 
23929d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
23939d475de5SAlex Elder {
23949d475de5SAlex Elder 	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
23959d475de5SAlex Elder 					&rbd_dev->header.obj_order,
23969d475de5SAlex Elder 					&rbd_dev->header.image_size);
23979d475de5SAlex Elder }
23989d475de5SAlex Elder 
23991e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
24001e130199SAlex Elder {
24011e130199SAlex Elder 	void *reply_buf;
24021e130199SAlex Elder 	int ret;
24031e130199SAlex Elder 	void *p;
24041e130199SAlex Elder 
24051e130199SAlex Elder 	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
24061e130199SAlex Elder 	if (!reply_buf)
24071e130199SAlex Elder 		return -ENOMEM;
24081e130199SAlex Elder 
24091e130199SAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
24101e130199SAlex Elder 				"rbd", "get_object_prefix",
24111e130199SAlex Elder 				NULL, 0,
24121e130199SAlex Elder 				reply_buf, RBD_OBJ_PREFIX_LEN_MAX,
24131e130199SAlex Elder 				CEPH_OSD_FLAG_READ, NULL);
24141e130199SAlex Elder 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
24151e130199SAlex Elder 	if (ret < 0)
24161e130199SAlex Elder 		goto out;
2417a0ea3a40SAlex Elder 	ret = 0;    /* rbd_req_sync_exec() can return positive */
24181e130199SAlex Elder 
24191e130199SAlex Elder 	p = reply_buf;
24201e130199SAlex Elder 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
24211e130199SAlex Elder 						p + RBD_OBJ_PREFIX_LEN_MAX,
24221e130199SAlex Elder 						NULL, GFP_NOIO);
24231e130199SAlex Elder 
24241e130199SAlex Elder 	if (IS_ERR(rbd_dev->header.object_prefix)) {
24251e130199SAlex Elder 		ret = PTR_ERR(rbd_dev->header.object_prefix);
24261e130199SAlex Elder 		rbd_dev->header.object_prefix = NULL;
24271e130199SAlex Elder 	} else {
24281e130199SAlex Elder 		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
24291e130199SAlex Elder 	}
24301e130199SAlex Elder 
24311e130199SAlex Elder out:
24321e130199SAlex Elder 	kfree(reply_buf);
24331e130199SAlex Elder 
24341e130199SAlex Elder 	return ret;
24351e130199SAlex Elder }
24361e130199SAlex Elder 
2437b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
2438b1b5402aSAlex Elder 		u64 *snap_features)
2439b1b5402aSAlex Elder {
2440b1b5402aSAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
2441b1b5402aSAlex Elder 	struct {
2442b1b5402aSAlex Elder 		__le64 features;
2443b1b5402aSAlex Elder 		__le64 incompat;
2444b1b5402aSAlex Elder 	} features_buf = { 0 };
2445d889140cSAlex Elder 	u64 incompat;
2446b1b5402aSAlex Elder 	int ret;
2447b1b5402aSAlex Elder 
2448b1b5402aSAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2449b1b5402aSAlex Elder 				"rbd", "get_features",
2450b1b5402aSAlex Elder 				(char *) &snapid, sizeof (snapid),
2451b1b5402aSAlex Elder 				(char *) &features_buf, sizeof (features_buf),
2452b1b5402aSAlex Elder 				CEPH_OSD_FLAG_READ, NULL);
2453b1b5402aSAlex Elder 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2454b1b5402aSAlex Elder 	if (ret < 0)
2455b1b5402aSAlex Elder 		return ret;
2456d889140cSAlex Elder 
2457d889140cSAlex Elder 	incompat = le64_to_cpu(features_buf.incompat);
2458d889140cSAlex Elder 	if (incompat & ~RBD_FEATURES_ALL)
2459b8f5c6edSAlex Elder 		return -ENXIO;
2460d889140cSAlex Elder 
2461b1b5402aSAlex Elder 	*snap_features = le64_to_cpu(features_buf.features);
2462b1b5402aSAlex Elder 
2463b1b5402aSAlex Elder 	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
2464b1b5402aSAlex Elder 		(unsigned long long) snap_id,
2465b1b5402aSAlex Elder 		(unsigned long long) *snap_features,
2466b1b5402aSAlex Elder 		(unsigned long long) le64_to_cpu(features_buf.incompat));
2467b1b5402aSAlex Elder 
2468b1b5402aSAlex Elder 	return 0;
2469b1b5402aSAlex Elder }
2470b1b5402aSAlex Elder 
2471b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
2472b1b5402aSAlex Elder {
2473b1b5402aSAlex Elder 	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
2474b1b5402aSAlex Elder 						&rbd_dev->header.features);
2475b1b5402aSAlex Elder }
2476b1b5402aSAlex Elder 
247786b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
247886b00e0dSAlex Elder {
247986b00e0dSAlex Elder 	struct rbd_spec *parent_spec;
248086b00e0dSAlex Elder 	size_t size;
248186b00e0dSAlex Elder 	void *reply_buf = NULL;
248286b00e0dSAlex Elder 	__le64 snapid;
248386b00e0dSAlex Elder 	void *p;
248486b00e0dSAlex Elder 	void *end;
248586b00e0dSAlex Elder 	char *image_id;
248686b00e0dSAlex Elder 	u64 overlap;
248786b00e0dSAlex Elder 	size_t len = 0;
248886b00e0dSAlex Elder 	int ret;
248986b00e0dSAlex Elder 
249086b00e0dSAlex Elder 	parent_spec = rbd_spec_alloc();
249186b00e0dSAlex Elder 	if (!parent_spec)
249286b00e0dSAlex Elder 		return -ENOMEM;
249386b00e0dSAlex Elder 
249486b00e0dSAlex Elder 	size = sizeof (__le64) +				/* pool_id */
249586b00e0dSAlex Elder 		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
249686b00e0dSAlex Elder 		sizeof (__le64) +				/* snap_id */
249786b00e0dSAlex Elder 		sizeof (__le64);				/* overlap */
249886b00e0dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
249986b00e0dSAlex Elder 	if (!reply_buf) {
250086b00e0dSAlex Elder 		ret = -ENOMEM;
250186b00e0dSAlex Elder 		goto out_err;
250286b00e0dSAlex Elder 	}
250386b00e0dSAlex Elder 
250486b00e0dSAlex Elder 	snapid = cpu_to_le64(CEPH_NOSNAP);
250586b00e0dSAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
250686b00e0dSAlex Elder 				"rbd", "get_parent",
250786b00e0dSAlex Elder 				(char *) &snapid, sizeof (snapid),
250886b00e0dSAlex Elder 				(char *) reply_buf, size,
250986b00e0dSAlex Elder 				CEPH_OSD_FLAG_READ, NULL);
251086b00e0dSAlex Elder 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
251186b00e0dSAlex Elder 	if (ret < 0)
251286b00e0dSAlex Elder 		goto out_err;
251386b00e0dSAlex Elder 
251486b00e0dSAlex Elder 	ret = -ERANGE;
251586b00e0dSAlex Elder 	p = reply_buf;
251686b00e0dSAlex Elder 	end = (char *) reply_buf + size;
251786b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
251886b00e0dSAlex Elder 	if (parent_spec->pool_id == CEPH_NOPOOL)
251986b00e0dSAlex Elder 		goto out;	/* No parent?  No problem. */
252086b00e0dSAlex Elder 
252186b00e0dSAlex Elder 	image_id = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
252286b00e0dSAlex Elder 	if (IS_ERR(image_id)) {
252386b00e0dSAlex Elder 		ret = PTR_ERR(image_id);
252486b00e0dSAlex Elder 		goto out_err;
252586b00e0dSAlex Elder 	}
252686b00e0dSAlex Elder 	parent_spec->image_id = image_id;
25279e15b77dSAlex Elder 	parent_spec->image_id_len = len;
252886b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
252986b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, overlap, out_err);
253086b00e0dSAlex Elder 
253186b00e0dSAlex Elder 	rbd_dev->parent_overlap = overlap;
253286b00e0dSAlex Elder 	rbd_dev->parent_spec = parent_spec;
253386b00e0dSAlex Elder 	parent_spec = NULL;	/* rbd_dev now owns this */
253486b00e0dSAlex Elder out:
253586b00e0dSAlex Elder 	ret = 0;
253686b00e0dSAlex Elder out_err:
253786b00e0dSAlex Elder 	kfree(reply_buf);
253886b00e0dSAlex Elder 	rbd_spec_put(parent_spec);
253986b00e0dSAlex Elder 
254086b00e0dSAlex Elder 	return ret;
254186b00e0dSAlex Elder }
254286b00e0dSAlex Elder 
25439e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
25449e15b77dSAlex Elder {
25459e15b77dSAlex Elder 	size_t image_id_size;
25469e15b77dSAlex Elder 	char *image_id;
25479e15b77dSAlex Elder 	void *p;
25489e15b77dSAlex Elder 	void *end;
25499e15b77dSAlex Elder 	size_t size;
25509e15b77dSAlex Elder 	void *reply_buf = NULL;
25519e15b77dSAlex Elder 	size_t len = 0;
25529e15b77dSAlex Elder 	char *image_name = NULL;
25539e15b77dSAlex Elder 	int ret;
25549e15b77dSAlex Elder 
25559e15b77dSAlex Elder 	rbd_assert(!rbd_dev->spec->image_name);
25569e15b77dSAlex Elder 
25579e15b77dSAlex Elder 	image_id_size = sizeof (__le32) + rbd_dev->spec->image_id_len;
25589e15b77dSAlex Elder 	image_id = kmalloc(image_id_size, GFP_KERNEL);
25599e15b77dSAlex Elder 	if (!image_id)
25609e15b77dSAlex Elder 		return NULL;
25619e15b77dSAlex Elder 
25629e15b77dSAlex Elder 	p = image_id;
25639e15b77dSAlex Elder 	end = (char *) image_id + image_id_size;
25649e15b77dSAlex Elder 	ceph_encode_string(&p, end, rbd_dev->spec->image_id,
25659e15b77dSAlex Elder 				(u32) rbd_dev->spec->image_id_len);
25669e15b77dSAlex Elder 
25679e15b77dSAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
25689e15b77dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
25699e15b77dSAlex Elder 	if (!reply_buf)
25709e15b77dSAlex Elder 		goto out;
25719e15b77dSAlex Elder 
25729e15b77dSAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, RBD_DIRECTORY,
25739e15b77dSAlex Elder 				"rbd", "dir_get_name",
25749e15b77dSAlex Elder 				image_id, image_id_size,
25759e15b77dSAlex Elder 				(char *) reply_buf, size,
25769e15b77dSAlex Elder 				CEPH_OSD_FLAG_READ, NULL);
25779e15b77dSAlex Elder 	if (ret < 0)
25789e15b77dSAlex Elder 		goto out;
25799e15b77dSAlex Elder 	p = reply_buf;
25809e15b77dSAlex Elder 	end = (char *) reply_buf + size;
25819e15b77dSAlex Elder 	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
25829e15b77dSAlex Elder 	if (IS_ERR(image_name))
25839e15b77dSAlex Elder 		image_name = NULL;
25849e15b77dSAlex Elder 	else
25859e15b77dSAlex Elder 		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
25869e15b77dSAlex Elder out:
25879e15b77dSAlex Elder 	kfree(reply_buf);
25889e15b77dSAlex Elder 	kfree(image_id);
25899e15b77dSAlex Elder 
25909e15b77dSAlex Elder 	return image_name;
25919e15b77dSAlex Elder }
25929e15b77dSAlex Elder 
25939e15b77dSAlex Elder /*
25949e15b77dSAlex Elder  * When a parent image gets probed, we only have the pool, image,
25959e15b77dSAlex Elder  * and snapshot ids but not the names of any of them.  This call
25969e15b77dSAlex Elder  * is made later to fill in those names.  It has to be done after
25979e15b77dSAlex Elder  * rbd_dev_snaps_update() has completed because some of the
25989e15b77dSAlex Elder  * information (in particular, snapshot name) is not available
25999e15b77dSAlex Elder  * until then.
26009e15b77dSAlex Elder  */
26019e15b77dSAlex Elder static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
26029e15b77dSAlex Elder {
26039e15b77dSAlex Elder 	struct ceph_osd_client *osdc;
26049e15b77dSAlex Elder 	const char *name;
26059e15b77dSAlex Elder 	void *reply_buf = NULL;
26069e15b77dSAlex Elder 	int ret;
26079e15b77dSAlex Elder 
26089e15b77dSAlex Elder 	if (rbd_dev->spec->pool_name)
26099e15b77dSAlex Elder 		return 0;	/* Already have the names */
26109e15b77dSAlex Elder 
26119e15b77dSAlex Elder 	/* Look up the pool name */
26129e15b77dSAlex Elder 
26139e15b77dSAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
26149e15b77dSAlex Elder 	name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
26159e15b77dSAlex Elder 	if (!name)
26169e15b77dSAlex Elder 		return -EIO;	/* pool id too large (>= 2^31) */
26179e15b77dSAlex Elder 
26189e15b77dSAlex Elder 	rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
26199e15b77dSAlex Elder 	if (!rbd_dev->spec->pool_name)
26209e15b77dSAlex Elder 		return -ENOMEM;
26219e15b77dSAlex Elder 
26229e15b77dSAlex Elder 	/* Fetch the image name; tolerate failure here */
26239e15b77dSAlex Elder 
26249e15b77dSAlex Elder 	name = rbd_dev_image_name(rbd_dev);
26259e15b77dSAlex Elder 	if (name) {
26269e15b77dSAlex Elder 		rbd_dev->spec->image_name_len = strlen(name);
26279e15b77dSAlex Elder 		rbd_dev->spec->image_name = (char *) name;
26289e15b77dSAlex Elder 	} else {
26299e15b77dSAlex Elder 		pr_warning(RBD_DRV_NAME "%d "
26309e15b77dSAlex Elder 			"unable to get image name for image id %s\n",
26319e15b77dSAlex Elder 			rbd_dev->major, rbd_dev->spec->image_id);
26329e15b77dSAlex Elder 	}
26339e15b77dSAlex Elder 
26349e15b77dSAlex Elder 	/* Look up the snapshot name. */
26359e15b77dSAlex Elder 
26369e15b77dSAlex Elder 	name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
26379e15b77dSAlex Elder 	if (!name) {
26389e15b77dSAlex Elder 		ret = -EIO;
26399e15b77dSAlex Elder 		goto out_err;
26409e15b77dSAlex Elder 	}
26419e15b77dSAlex Elder 	rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
26429e15b77dSAlex Elder 	if(!rbd_dev->spec->snap_name)
26439e15b77dSAlex Elder 		goto out_err;
26449e15b77dSAlex Elder 
26459e15b77dSAlex Elder 	return 0;
26469e15b77dSAlex Elder out_err:
26479e15b77dSAlex Elder 	kfree(reply_buf);
26489e15b77dSAlex Elder 	kfree(rbd_dev->spec->pool_name);
26499e15b77dSAlex Elder 	rbd_dev->spec->pool_name = NULL;
26509e15b77dSAlex Elder 
26519e15b77dSAlex Elder 	return ret;
26529e15b77dSAlex Elder }
26539e15b77dSAlex Elder 
26546e14b1a6SAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
265535d489f9SAlex Elder {
265635d489f9SAlex Elder 	size_t size;
265735d489f9SAlex Elder 	int ret;
265835d489f9SAlex Elder 	void *reply_buf;
265935d489f9SAlex Elder 	void *p;
266035d489f9SAlex Elder 	void *end;
266135d489f9SAlex Elder 	u64 seq;
266235d489f9SAlex Elder 	u32 snap_count;
266335d489f9SAlex Elder 	struct ceph_snap_context *snapc;
266435d489f9SAlex Elder 	u32 i;
266535d489f9SAlex Elder 
266635d489f9SAlex Elder 	/*
266735d489f9SAlex Elder 	 * We'll need room for the seq value (maximum snapshot id),
266835d489f9SAlex Elder 	 * snapshot count, and array of that many snapshot ids.
266935d489f9SAlex Elder 	 * For now we have a fixed upper limit on the number we're
267035d489f9SAlex Elder 	 * prepared to receive.
267135d489f9SAlex Elder 	 */
267235d489f9SAlex Elder 	size = sizeof (__le64) + sizeof (__le32) +
267335d489f9SAlex Elder 			RBD_MAX_SNAP_COUNT * sizeof (__le64);
267435d489f9SAlex Elder 	reply_buf = kzalloc(size, GFP_KERNEL);
267535d489f9SAlex Elder 	if (!reply_buf)
267635d489f9SAlex Elder 		return -ENOMEM;
267735d489f9SAlex Elder 
267835d489f9SAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
267935d489f9SAlex Elder 				"rbd", "get_snapcontext",
268035d489f9SAlex Elder 				NULL, 0,
268135d489f9SAlex Elder 				reply_buf, size,
26826e14b1a6SAlex Elder 				CEPH_OSD_FLAG_READ, ver);
268335d489f9SAlex Elder 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
268435d489f9SAlex Elder 	if (ret < 0)
268535d489f9SAlex Elder 		goto out;
268635d489f9SAlex Elder 
268735d489f9SAlex Elder 	ret = -ERANGE;
268835d489f9SAlex Elder 	p = reply_buf;
268935d489f9SAlex Elder 	end = (char *) reply_buf + size;
269035d489f9SAlex Elder 	ceph_decode_64_safe(&p, end, seq, out);
269135d489f9SAlex Elder 	ceph_decode_32_safe(&p, end, snap_count, out);
269235d489f9SAlex Elder 
269335d489f9SAlex Elder 	/*
269435d489f9SAlex Elder 	 * Make sure the reported number of snapshot ids wouldn't go
269535d489f9SAlex Elder 	 * beyond the end of our buffer.  But before checking that,
269635d489f9SAlex Elder 	 * make sure the computed size of the snapshot context we
269735d489f9SAlex Elder 	 * allocate is representable in a size_t.
269835d489f9SAlex Elder 	 */
269935d489f9SAlex Elder 	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
270035d489f9SAlex Elder 				 / sizeof (u64)) {
270135d489f9SAlex Elder 		ret = -EINVAL;
270235d489f9SAlex Elder 		goto out;
270335d489f9SAlex Elder 	}
270435d489f9SAlex Elder 	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
270535d489f9SAlex Elder 		goto out;
270635d489f9SAlex Elder 
270735d489f9SAlex Elder 	size = sizeof (struct ceph_snap_context) +
270835d489f9SAlex Elder 				snap_count * sizeof (snapc->snaps[0]);
270935d489f9SAlex Elder 	snapc = kmalloc(size, GFP_KERNEL);
271035d489f9SAlex Elder 	if (!snapc) {
271135d489f9SAlex Elder 		ret = -ENOMEM;
271235d489f9SAlex Elder 		goto out;
271335d489f9SAlex Elder 	}
271435d489f9SAlex Elder 
271535d489f9SAlex Elder 	atomic_set(&snapc->nref, 1);
271635d489f9SAlex Elder 	snapc->seq = seq;
271735d489f9SAlex Elder 	snapc->num_snaps = snap_count;
271835d489f9SAlex Elder 	for (i = 0; i < snap_count; i++)
271935d489f9SAlex Elder 		snapc->snaps[i] = ceph_decode_64(&p);
272035d489f9SAlex Elder 
272135d489f9SAlex Elder 	rbd_dev->header.snapc = snapc;
272235d489f9SAlex Elder 
272335d489f9SAlex Elder 	dout("  snap context seq = %llu, snap_count = %u\n",
272435d489f9SAlex Elder 		(unsigned long long) seq, (unsigned int) snap_count);
272535d489f9SAlex Elder 
272635d489f9SAlex Elder out:
272735d489f9SAlex Elder 	kfree(reply_buf);
272835d489f9SAlex Elder 
272935d489f9SAlex Elder 	return 0;
273035d489f9SAlex Elder }
273135d489f9SAlex Elder 
2732b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
2733b8b1e2dbSAlex Elder {
2734b8b1e2dbSAlex Elder 	size_t size;
2735b8b1e2dbSAlex Elder 	void *reply_buf;
2736b8b1e2dbSAlex Elder 	__le64 snap_id;
2737b8b1e2dbSAlex Elder 	int ret;
2738b8b1e2dbSAlex Elder 	void *p;
2739b8b1e2dbSAlex Elder 	void *end;
2740b8b1e2dbSAlex Elder 	char *snap_name;
2741b8b1e2dbSAlex Elder 
2742b8b1e2dbSAlex Elder 	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
2743b8b1e2dbSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
2744b8b1e2dbSAlex Elder 	if (!reply_buf)
2745b8b1e2dbSAlex Elder 		return ERR_PTR(-ENOMEM);
2746b8b1e2dbSAlex Elder 
2747b8b1e2dbSAlex Elder 	snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
2748b8b1e2dbSAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2749b8b1e2dbSAlex Elder 				"rbd", "get_snapshot_name",
2750b8b1e2dbSAlex Elder 				(char *) &snap_id, sizeof (snap_id),
2751b8b1e2dbSAlex Elder 				reply_buf, size,
2752b8b1e2dbSAlex Elder 				CEPH_OSD_FLAG_READ, NULL);
2753b8b1e2dbSAlex Elder 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2754b8b1e2dbSAlex Elder 	if (ret < 0)
2755b8b1e2dbSAlex Elder 		goto out;
2756b8b1e2dbSAlex Elder 
2757b8b1e2dbSAlex Elder 	p = reply_buf;
2758b8b1e2dbSAlex Elder 	end = (char *) reply_buf + size;
2759e5c35534SAlex Elder 	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
2760b8b1e2dbSAlex Elder 	if (IS_ERR(snap_name)) {
2761b8b1e2dbSAlex Elder 		ret = PTR_ERR(snap_name);
2762b8b1e2dbSAlex Elder 		goto out;
2763b8b1e2dbSAlex Elder 	} else {
2764b8b1e2dbSAlex Elder 		dout("  snap_id 0x%016llx snap_name = %s\n",
2765b8b1e2dbSAlex Elder 			(unsigned long long) le64_to_cpu(snap_id), snap_name);
2766b8b1e2dbSAlex Elder 	}
2767b8b1e2dbSAlex Elder 	kfree(reply_buf);
2768b8b1e2dbSAlex Elder 
2769b8b1e2dbSAlex Elder 	return snap_name;
2770b8b1e2dbSAlex Elder out:
2771b8b1e2dbSAlex Elder 	kfree(reply_buf);
2772b8b1e2dbSAlex Elder 
2773b8b1e2dbSAlex Elder 	return ERR_PTR(ret);
2774b8b1e2dbSAlex Elder }
2775b8b1e2dbSAlex Elder 
2776b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
2777b8b1e2dbSAlex Elder 		u64 *snap_size, u64 *snap_features)
2778b8b1e2dbSAlex Elder {
2779b8b1e2dbSAlex Elder 	__le64 snap_id;
2780b8b1e2dbSAlex Elder 	u8 order;
2781b8b1e2dbSAlex Elder 	int ret;
2782b8b1e2dbSAlex Elder 
2783b8b1e2dbSAlex Elder 	snap_id = rbd_dev->header.snapc->snaps[which];
2784b8b1e2dbSAlex Elder 	ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
2785b8b1e2dbSAlex Elder 	if (ret)
2786b8b1e2dbSAlex Elder 		return ERR_PTR(ret);
2787b8b1e2dbSAlex Elder 	ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
2788b8b1e2dbSAlex Elder 	if (ret)
2789b8b1e2dbSAlex Elder 		return ERR_PTR(ret);
2790b8b1e2dbSAlex Elder 
2791b8b1e2dbSAlex Elder 	return rbd_dev_v2_snap_name(rbd_dev, which);
2792b8b1e2dbSAlex Elder }
2793b8b1e2dbSAlex Elder 
2794b8b1e2dbSAlex Elder static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
2795b8b1e2dbSAlex Elder 		u64 *snap_size, u64 *snap_features)
2796b8b1e2dbSAlex Elder {
2797b8b1e2dbSAlex Elder 	if (rbd_dev->image_format == 1)
2798b8b1e2dbSAlex Elder 		return rbd_dev_v1_snap_info(rbd_dev, which,
2799b8b1e2dbSAlex Elder 					snap_size, snap_features);
2800b8b1e2dbSAlex Elder 	if (rbd_dev->image_format == 2)
2801b8b1e2dbSAlex Elder 		return rbd_dev_v2_snap_info(rbd_dev, which,
2802b8b1e2dbSAlex Elder 					snap_size, snap_features);
2803b8b1e2dbSAlex Elder 	return ERR_PTR(-EINVAL);
2804b8b1e2dbSAlex Elder }
2805b8b1e2dbSAlex Elder 
2806117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
2807117973fbSAlex Elder {
2808117973fbSAlex Elder 	int ret;
2809117973fbSAlex Elder 	__u8 obj_order;
2810117973fbSAlex Elder 
2811117973fbSAlex Elder 	down_write(&rbd_dev->header_rwsem);
2812117973fbSAlex Elder 
2813117973fbSAlex Elder 	/* Grab old order first, to see if it changes */
2814117973fbSAlex Elder 
2815117973fbSAlex Elder 	obj_order = rbd_dev->header.obj_order,
2816117973fbSAlex Elder 	ret = rbd_dev_v2_image_size(rbd_dev);
2817117973fbSAlex Elder 	if (ret)
2818117973fbSAlex Elder 		goto out;
2819117973fbSAlex Elder 	if (rbd_dev->header.obj_order != obj_order) {
2820117973fbSAlex Elder 		ret = -EIO;
2821117973fbSAlex Elder 		goto out;
2822117973fbSAlex Elder 	}
2823117973fbSAlex Elder 	rbd_update_mapping_size(rbd_dev);
2824117973fbSAlex Elder 
2825117973fbSAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev, hver);
2826117973fbSAlex Elder 	dout("rbd_dev_v2_snap_context returned %d\n", ret);
2827117973fbSAlex Elder 	if (ret)
2828117973fbSAlex Elder 		goto out;
2829117973fbSAlex Elder 	ret = rbd_dev_snaps_update(rbd_dev);
2830117973fbSAlex Elder 	dout("rbd_dev_snaps_update returned %d\n", ret);
2831117973fbSAlex Elder 	if (ret)
2832117973fbSAlex Elder 		goto out;
2833117973fbSAlex Elder 	ret = rbd_dev_snaps_register(rbd_dev);
2834117973fbSAlex Elder 	dout("rbd_dev_snaps_register returned %d\n", ret);
2835117973fbSAlex Elder out:
2836117973fbSAlex Elder 	up_write(&rbd_dev->header_rwsem);
2837117973fbSAlex Elder 
2838117973fbSAlex Elder 	return ret;
2839117973fbSAlex Elder }
2840117973fbSAlex Elder 
28419d475de5SAlex Elder /*
284235938150SAlex Elder  * Scan the rbd device's current snapshot list and compare it to the
284335938150SAlex Elder  * newly-received snapshot context.  Remove any existing snapshots
284435938150SAlex Elder  * not present in the new snapshot context.  Add a new snapshot for
284535938150SAlex Elder  * any snaphots in the snapshot context not in the current list.
284635938150SAlex Elder  * And verify there are no changes to snapshots we already know
284735938150SAlex Elder  * about.
284835938150SAlex Elder  *
284935938150SAlex Elder  * Assumes the snapshots in the snapshot context are sorted by
285035938150SAlex Elder  * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
285135938150SAlex Elder  * are also maintained in that order.)
2852dfc5606dSYehuda Sadeh  */
2853304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
2854dfc5606dSYehuda Sadeh {
285535938150SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
285635938150SAlex Elder 	const u32 snap_count = snapc->num_snaps;
285735938150SAlex Elder 	struct list_head *head = &rbd_dev->snaps;
285835938150SAlex Elder 	struct list_head *links = head->next;
285935938150SAlex Elder 	u32 index = 0;
2860dfc5606dSYehuda Sadeh 
28619fcbb800SAlex Elder 	dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
286235938150SAlex Elder 	while (index < snap_count || links != head) {
286335938150SAlex Elder 		u64 snap_id;
286435938150SAlex Elder 		struct rbd_snap *snap;
2865cd892126SAlex Elder 		char *snap_name;
2866cd892126SAlex Elder 		u64 snap_size = 0;
2867cd892126SAlex Elder 		u64 snap_features = 0;
2868dfc5606dSYehuda Sadeh 
286935938150SAlex Elder 		snap_id = index < snap_count ? snapc->snaps[index]
287035938150SAlex Elder 					     : CEPH_NOSNAP;
287135938150SAlex Elder 		snap = links != head ? list_entry(links, struct rbd_snap, node)
287235938150SAlex Elder 				     : NULL;
2873aafb230eSAlex Elder 		rbd_assert(!snap || snap->id != CEPH_NOSNAP);
2874dfc5606dSYehuda Sadeh 
287535938150SAlex Elder 		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
287635938150SAlex Elder 			struct list_head *next = links->next;
2877dfc5606dSYehuda Sadeh 
287835938150SAlex Elder 			/* Existing snapshot not in the new snap context */
2879dfc5606dSYehuda Sadeh 
28800d7dbfceSAlex Elder 			if (rbd_dev->spec->snap_id == snap->id)
2881daba5fdbSAlex Elder 				rbd_dev->exists = false;
288241f38c2bSAlex Elder 			rbd_remove_snap_dev(snap);
28839fcbb800SAlex Elder 			dout("%ssnap id %llu has been removed\n",
28840d7dbfceSAlex Elder 				rbd_dev->spec->snap_id == snap->id ?
28850d7dbfceSAlex Elder 							"mapped " : "",
28869fcbb800SAlex Elder 				(unsigned long long) snap->id);
2887dfc5606dSYehuda Sadeh 
288835938150SAlex Elder 			/* Done with this list entry; advance */
288935938150SAlex Elder 
289035938150SAlex Elder 			links = next;
289135938150SAlex Elder 			continue;
2892dfc5606dSYehuda Sadeh 		}
289335938150SAlex Elder 
2894b8b1e2dbSAlex Elder 		snap_name = rbd_dev_snap_info(rbd_dev, index,
2895cd892126SAlex Elder 					&snap_size, &snap_features);
2896cd892126SAlex Elder 		if (IS_ERR(snap_name))
2897cd892126SAlex Elder 			return PTR_ERR(snap_name);
2898cd892126SAlex Elder 
28999fcbb800SAlex Elder 		dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
29009fcbb800SAlex Elder 			(unsigned long long) snap_id);
290135938150SAlex Elder 		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
290235938150SAlex Elder 			struct rbd_snap *new_snap;
290335938150SAlex Elder 
290435938150SAlex Elder 			/* We haven't seen this snapshot before */
290535938150SAlex Elder 
2906c8d18425SAlex Elder 			new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
2907cd892126SAlex Elder 					snap_id, snap_size, snap_features);
29089fcbb800SAlex Elder 			if (IS_ERR(new_snap)) {
29099fcbb800SAlex Elder 				int err = PTR_ERR(new_snap);
29109fcbb800SAlex Elder 
29119fcbb800SAlex Elder 				dout("  failed to add dev, error %d\n", err);
29129fcbb800SAlex Elder 
29139fcbb800SAlex Elder 				return err;
29149fcbb800SAlex Elder 			}
291535938150SAlex Elder 
291635938150SAlex Elder 			/* New goes before existing, or at end of list */
291735938150SAlex Elder 
29189fcbb800SAlex Elder 			dout("  added dev%s\n", snap ? "" : " at end\n");
291935938150SAlex Elder 			if (snap)
292035938150SAlex Elder 				list_add_tail(&new_snap->node, &snap->node);
292135938150SAlex Elder 			else
2922523f3258SAlex Elder 				list_add_tail(&new_snap->node, head);
292335938150SAlex Elder 		} else {
292435938150SAlex Elder 			/* Already have this one */
292535938150SAlex Elder 
29269fcbb800SAlex Elder 			dout("  already present\n");
29279fcbb800SAlex Elder 
2928cd892126SAlex Elder 			rbd_assert(snap->size == snap_size);
2929aafb230eSAlex Elder 			rbd_assert(!strcmp(snap->name, snap_name));
2930cd892126SAlex Elder 			rbd_assert(snap->features == snap_features);
293135938150SAlex Elder 
293235938150SAlex Elder 			/* Done with this list entry; advance */
293335938150SAlex Elder 
293435938150SAlex Elder 			links = links->next;
2935dfc5606dSYehuda Sadeh 		}
293635938150SAlex Elder 
293735938150SAlex Elder 		/* Advance to the next entry in the snapshot context */
293835938150SAlex Elder 
293935938150SAlex Elder 		index++;
2940dfc5606dSYehuda Sadeh 	}
29419fcbb800SAlex Elder 	dout("%s: done\n", __func__);
2942dfc5606dSYehuda Sadeh 
2943dfc5606dSYehuda Sadeh 	return 0;
2944dfc5606dSYehuda Sadeh }
2945dfc5606dSYehuda Sadeh 
2946304f6808SAlex Elder /*
2947304f6808SAlex Elder  * Scan the list of snapshots and register the devices for any that
2948304f6808SAlex Elder  * have not already been registered.
2949304f6808SAlex Elder  */
2950304f6808SAlex Elder static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
2951304f6808SAlex Elder {
2952304f6808SAlex Elder 	struct rbd_snap *snap;
2953304f6808SAlex Elder 	int ret = 0;
2954304f6808SAlex Elder 
2955304f6808SAlex Elder 	dout("%s called\n", __func__);
295686ff77bbSAlex Elder 	if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
295786ff77bbSAlex Elder 		return -EIO;
2958304f6808SAlex Elder 
2959304f6808SAlex Elder 	list_for_each_entry(snap, &rbd_dev->snaps, node) {
2960304f6808SAlex Elder 		if (!rbd_snap_registered(snap)) {
2961304f6808SAlex Elder 			ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2962304f6808SAlex Elder 			if (ret < 0)
2963304f6808SAlex Elder 				break;
2964304f6808SAlex Elder 		}
2965304f6808SAlex Elder 	}
2966304f6808SAlex Elder 	dout("%s: returning %d\n", __func__, ret);
2967304f6808SAlex Elder 
2968304f6808SAlex Elder 	return ret;
2969304f6808SAlex Elder }
2970304f6808SAlex Elder 
2971dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2972dfc5606dSYehuda Sadeh {
2973dfc5606dSYehuda Sadeh 	struct device *dev;
2974cd789ab9SAlex Elder 	int ret;
2975dfc5606dSYehuda Sadeh 
2976dfc5606dSYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2977dfc5606dSYehuda Sadeh 
2978cd789ab9SAlex Elder 	dev = &rbd_dev->dev;
2979dfc5606dSYehuda Sadeh 	dev->bus = &rbd_bus_type;
2980dfc5606dSYehuda Sadeh 	dev->type = &rbd_device_type;
2981dfc5606dSYehuda Sadeh 	dev->parent = &rbd_root_dev;
2982dfc5606dSYehuda Sadeh 	dev->release = rbd_dev_release;
2983de71a297SAlex Elder 	dev_set_name(dev, "%d", rbd_dev->dev_id);
2984dfc5606dSYehuda Sadeh 	ret = device_register(dev);
2985dfc5606dSYehuda Sadeh 
2986dfc5606dSYehuda Sadeh 	mutex_unlock(&ctl_mutex);
2987cd789ab9SAlex Elder 
2988dfc5606dSYehuda Sadeh 	return ret;
2989602adf40SYehuda Sadeh }
2990602adf40SYehuda Sadeh 
2991dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
2992dfc5606dSYehuda Sadeh {
2993dfc5606dSYehuda Sadeh 	device_unregister(&rbd_dev->dev);
2994dfc5606dSYehuda Sadeh }
2995dfc5606dSYehuda Sadeh 
299659c2be1eSYehuda Sadeh static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
299759c2be1eSYehuda Sadeh {
299859c2be1eSYehuda Sadeh 	int ret, rc;
299959c2be1eSYehuda Sadeh 
300059c2be1eSYehuda Sadeh 	do {
30010e6f322dSAlex Elder 		ret = rbd_req_sync_watch(rbd_dev);
300259c2be1eSYehuda Sadeh 		if (ret == -ERANGE) {
3003117973fbSAlex Elder 			rc = rbd_dev_refresh(rbd_dev, NULL);
300459c2be1eSYehuda Sadeh 			if (rc < 0)
300559c2be1eSYehuda Sadeh 				return rc;
300659c2be1eSYehuda Sadeh 		}
300759c2be1eSYehuda Sadeh 	} while (ret == -ERANGE);
300859c2be1eSYehuda Sadeh 
300959c2be1eSYehuda Sadeh 	return ret;
301059c2be1eSYehuda Sadeh }
301159c2be1eSYehuda Sadeh 
3012e2839308SAlex Elder static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
30131ddbe94eSAlex Elder 
30141ddbe94eSAlex Elder /*
3015499afd5bSAlex Elder  * Get a unique rbd identifier for the given new rbd_dev, and add
3016499afd5bSAlex Elder  * the rbd_dev to the global list.  The minimum rbd id is 1.
30171ddbe94eSAlex Elder  */
3018e2839308SAlex Elder static void rbd_dev_id_get(struct rbd_device *rbd_dev)
3019b7f23c36SAlex Elder {
3020e2839308SAlex Elder 	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
3021499afd5bSAlex Elder 
3022499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
3023499afd5bSAlex Elder 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
3024499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
3025e2839308SAlex Elder 	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
3026e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
3027b7f23c36SAlex Elder }
3028b7f23c36SAlex Elder 
30291ddbe94eSAlex Elder /*
3030499afd5bSAlex Elder  * Remove an rbd_dev from the global list, and record that its
3031499afd5bSAlex Elder  * identifier is no longer in use.
30321ddbe94eSAlex Elder  */
3033e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev)
30341ddbe94eSAlex Elder {
3035d184f6bfSAlex Elder 	struct list_head *tmp;
3036de71a297SAlex Elder 	int rbd_id = rbd_dev->dev_id;
3037d184f6bfSAlex Elder 	int max_id;
3038d184f6bfSAlex Elder 
3039aafb230eSAlex Elder 	rbd_assert(rbd_id > 0);
3040499afd5bSAlex Elder 
3041e2839308SAlex Elder 	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
3042e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
3043499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
3044499afd5bSAlex Elder 	list_del_init(&rbd_dev->node);
3045d184f6bfSAlex Elder 
3046d184f6bfSAlex Elder 	/*
3047d184f6bfSAlex Elder 	 * If the id being "put" is not the current maximum, there
3048d184f6bfSAlex Elder 	 * is nothing special we need to do.
3049d184f6bfSAlex Elder 	 */
3050e2839308SAlex Elder 	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
3051d184f6bfSAlex Elder 		spin_unlock(&rbd_dev_list_lock);
3052d184f6bfSAlex Elder 		return;
3053d184f6bfSAlex Elder 	}
3054d184f6bfSAlex Elder 
3055d184f6bfSAlex Elder 	/*
3056d184f6bfSAlex Elder 	 * We need to update the current maximum id.  Search the
3057d184f6bfSAlex Elder 	 * list to find out what it is.  We're more likely to find
3058d184f6bfSAlex Elder 	 * the maximum at the end, so search the list backward.
3059d184f6bfSAlex Elder 	 */
3060d184f6bfSAlex Elder 	max_id = 0;
3061d184f6bfSAlex Elder 	list_for_each_prev(tmp, &rbd_dev_list) {
3062d184f6bfSAlex Elder 		struct rbd_device *rbd_dev;
3063d184f6bfSAlex Elder 
3064d184f6bfSAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
3065b213e0b1SAlex Elder 		if (rbd_dev->dev_id > max_id)
3066b213e0b1SAlex Elder 			max_id = rbd_dev->dev_id;
3067d184f6bfSAlex Elder 	}
3068499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
30691ddbe94eSAlex Elder 
30701ddbe94eSAlex Elder 	/*
3071e2839308SAlex Elder 	 * The max id could have been updated by rbd_dev_id_get(), in
3072d184f6bfSAlex Elder 	 * which case it now accurately reflects the new maximum.
3073d184f6bfSAlex Elder 	 * Be careful not to overwrite the maximum value in that
3074d184f6bfSAlex Elder 	 * case.
30751ddbe94eSAlex Elder 	 */
3076e2839308SAlex Elder 	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
3077e2839308SAlex Elder 	dout("  max dev id has been reset\n");
3078b7f23c36SAlex Elder }
3079b7f23c36SAlex Elder 
/*
 * Advance *buf past any leading white space so it points at the
 * first non-space character (or at the terminating '\0' if there is
 * none).  Returns the length of the token -- the run of non-white
 * space characters -- that starts there; 0 means no token was found.
 * The string at *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the "C"
	 * and "POSIX" locales.
	 */
	static const char whitespace[] = " \f\n\r\t\v";
	const char *start = *buf;

	start += strspn(start, whitespace);	/* skip to the token */
	*buf = start;

	return strcspn(start, whitespace);	/* measure the token */
}
3098e28fff26SAlex Elder 
/*
 * Locate the next token in *buf and, provided the supplied token
 * buffer has room for it plus a terminating '\0', copy it there.
 * A copied token is always '\0'-terminated.  The string at *buf
 * must itself be terminated with '\0' on entry.
 *
 * Returns the length of the token found, not counting the '\0'.
 * A return value of 0 means no token was found; a value >=
 * token_size means the token did not fit and was not copied.
 *
 * In every case *buf is left pointing just past the end of the
 * token that was found, even when the token was too big to copy.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	const size_t token_len = next_token(buf);
	const char *start = *buf;

	if (token_len < token_size) {
		memcpy(token, start, token_len);
		token[token_len] = '\0';
	}
	*buf = start + token_len;

	return token_len;
}
3128e28fff26SAlex Elder 
3129e28fff26SAlex Elder /*
3130ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
3131ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
3132ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
3133ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
3134ea3352f4SAlex Elder  *
3135ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
3136ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
3137ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
3138ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
3139ea3352f4SAlex Elder  *
3140ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
3141ea3352f4SAlex Elder  * the end of the found token.
3142ea3352f4SAlex Elder  *
3143ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
3144ea3352f4SAlex Elder  */
3145ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
3146ea3352f4SAlex Elder {
3147ea3352f4SAlex Elder 	char *dup;
3148ea3352f4SAlex Elder 	size_t len;
3149ea3352f4SAlex Elder 
3150ea3352f4SAlex Elder 	len = next_token(buf);
3151ea3352f4SAlex Elder 	dup = kmalloc(len + 1, GFP_KERNEL);
3152ea3352f4SAlex Elder 	if (!dup)
3153ea3352f4SAlex Elder 		return NULL;
3154ea3352f4SAlex Elder 
3155ea3352f4SAlex Elder 	memcpy(dup, *buf, len);
3156ea3352f4SAlex Elder 	*(dup + len) = '\0';
3157ea3352f4SAlex Elder 	*buf += len;
3158ea3352f4SAlex Elder 
3159ea3352f4SAlex Elder 	if (lenp)
3160ea3352f4SAlex Elder 		*lenp = len;
3161ea3352f4SAlex Elder 
3162ea3352f4SAlex Elder 	return dup;
3163ea3352f4SAlex Elder }
3164ea3352f4SAlex Elder 
3165ea3352f4SAlex Elder /*
3166859c31dfSAlex Elder  * Parse the options provided for an "rbd add" (i.e., rbd image
3167859c31dfSAlex Elder  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
3168859c31dfSAlex Elder  * and the data written is passed here via a NUL-terminated buffer.
3169859c31dfSAlex Elder  * Returns 0 if successful or an error code otherwise.
3170d22f76e7SAlex Elder  *
3171859c31dfSAlex Elder  * The information extracted from these options is recorded in
3172859c31dfSAlex Elder  * the other parameters which return dynamically-allocated
3173859c31dfSAlex Elder  * structures:
3174859c31dfSAlex Elder  *  ceph_opts
3175859c31dfSAlex Elder  *      The address of a pointer that will refer to a ceph options
3176859c31dfSAlex Elder  *      structure.  Caller must release the returned pointer using
3177859c31dfSAlex Elder  *      ceph_destroy_options() when it is no longer needed.
3178859c31dfSAlex Elder  *  rbd_opts
3179859c31dfSAlex Elder  *	Address of an rbd options pointer.  Fully initialized by
3180859c31dfSAlex Elder  *	this function; caller must release with kfree().
3181859c31dfSAlex Elder  *  spec
3182859c31dfSAlex Elder  *	Address of an rbd image specification pointer.  Fully
3183859c31dfSAlex Elder  *	initialized by this function based on parsed options.
3184859c31dfSAlex Elder  *	Caller must release with rbd_spec_put().
3185859c31dfSAlex Elder  *
3186859c31dfSAlex Elder  * The options passed take this form:
3187859c31dfSAlex Elder  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
3188859c31dfSAlex Elder  * where:
3189859c31dfSAlex Elder  *  <mon_addrs>
3190859c31dfSAlex Elder  *      A comma-separated list of one or more monitor addresses.
3191859c31dfSAlex Elder  *      A monitor address is an ip address, optionally followed
3192859c31dfSAlex Elder  *      by a port number (separated by a colon).
3193859c31dfSAlex Elder  *        I.e.:  ip1[:port1][,ip2[:port2]...]
3194859c31dfSAlex Elder  *  <options>
3195859c31dfSAlex Elder  *      A comma-separated list of ceph and/or rbd options.
3196859c31dfSAlex Elder  *  <pool_name>
3197859c31dfSAlex Elder  *      The name of the rados pool containing the rbd image.
3198859c31dfSAlex Elder  *  <image_name>
3199859c31dfSAlex Elder  *      The name of the image in that pool to map.
3200859c31dfSAlex Elder  *  <snap_id>
3201859c31dfSAlex Elder  *      An optional snapshot id.  If provided, the mapping will
3202859c31dfSAlex Elder  *      present data from the image at the time that snapshot was
3203859c31dfSAlex Elder  *      created.  The image head is used if no snapshot id is
3204859c31dfSAlex Elder  *      provided.  Snapshot mappings are always read-only.
3205a725f65eSAlex Elder  */
3206859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf,
3207dc79b113SAlex Elder 				struct ceph_options **ceph_opts,
3208859c31dfSAlex Elder 				struct rbd_options **opts,
3209859c31dfSAlex Elder 				struct rbd_spec **rbd_spec)
3210a725f65eSAlex Elder {
3211e28fff26SAlex Elder 	size_t len;
3212859c31dfSAlex Elder 	char *options;
32130ddebc0cSAlex Elder 	const char *mon_addrs;
32140ddebc0cSAlex Elder 	size_t mon_addrs_size;
3215859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
32164e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
3217859c31dfSAlex Elder 	struct ceph_options *copts;
3218dc79b113SAlex Elder 	int ret;
3219e28fff26SAlex Elder 
3220e28fff26SAlex Elder 	/* The first four tokens are required */
3221e28fff26SAlex Elder 
32227ef3214aSAlex Elder 	len = next_token(&buf);
32237ef3214aSAlex Elder 	if (!len)
3224dc79b113SAlex Elder 		return -EINVAL;	/* Missing monitor address(es) */
32250ddebc0cSAlex Elder 	mon_addrs = buf;
3226f28e565aSAlex Elder 	mon_addrs_size = len + 1;
32277ef3214aSAlex Elder 	buf += len;
3228a725f65eSAlex Elder 
3229dc79b113SAlex Elder 	ret = -EINVAL;
3230f28e565aSAlex Elder 	options = dup_token(&buf, NULL);
3231f28e565aSAlex Elder 	if (!options)
3232dc79b113SAlex Elder 		return -ENOMEM;
3233f28e565aSAlex Elder 	if (!*options)
3234f28e565aSAlex Elder 		goto out_err;	/* Missing options */
3235a725f65eSAlex Elder 
3236859c31dfSAlex Elder 	spec = rbd_spec_alloc();
3237859c31dfSAlex Elder 	if (!spec)
3238f28e565aSAlex Elder 		goto out_mem;
3239859c31dfSAlex Elder 
3240859c31dfSAlex Elder 	spec->pool_name = dup_token(&buf, NULL);
3241859c31dfSAlex Elder 	if (!spec->pool_name)
3242859c31dfSAlex Elder 		goto out_mem;
3243859c31dfSAlex Elder 	if (!*spec->pool_name)
3244f28e565aSAlex Elder 		goto out_err;	/* Missing pool name */
3245e28fff26SAlex Elder 
3246859c31dfSAlex Elder 	spec->image_name = dup_token(&buf, &spec->image_name_len);
3247859c31dfSAlex Elder 	if (!spec->image_name)
3248f28e565aSAlex Elder 		goto out_mem;
3249859c31dfSAlex Elder 	if (!*spec->image_name)
3250f28e565aSAlex Elder 		goto out_err;	/* Missing image name */
3251e28fff26SAlex Elder 
3252f28e565aSAlex Elder 	/*
3253f28e565aSAlex Elder 	 * Snapshot name is optional; default is to use "-"
3254f28e565aSAlex Elder 	 * (indicating the head/no snapshot).
3255f28e565aSAlex Elder 	 */
32563feeb894SAlex Elder 	len = next_token(&buf);
3257820a5f3eSAlex Elder 	if (!len) {
32583feeb894SAlex Elder 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
32593feeb894SAlex Elder 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
3260f28e565aSAlex Elder 	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
3261dc79b113SAlex Elder 		ret = -ENAMETOOLONG;
3262f28e565aSAlex Elder 		goto out_err;
3263849b4260SAlex Elder 	}
3264859c31dfSAlex Elder 	spec->snap_name = kmalloc(len + 1, GFP_KERNEL);
3265859c31dfSAlex Elder 	if (!spec->snap_name)
3266f28e565aSAlex Elder 		goto out_mem;
3267859c31dfSAlex Elder 	memcpy(spec->snap_name, buf, len);
3268859c31dfSAlex Elder 	*(spec->snap_name + len) = '\0';
3269e5c35534SAlex Elder 
32700ddebc0cSAlex Elder 	/* Initialize all rbd options to the defaults */
3271e28fff26SAlex Elder 
32724e9afebaSAlex Elder 	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
32734e9afebaSAlex Elder 	if (!rbd_opts)
32744e9afebaSAlex Elder 		goto out_mem;
32754e9afebaSAlex Elder 
32764e9afebaSAlex Elder 	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
3277d22f76e7SAlex Elder 
3278859c31dfSAlex Elder 	copts = ceph_parse_options(options, mon_addrs,
32790ddebc0cSAlex Elder 					mon_addrs + mon_addrs_size - 1,
32804e9afebaSAlex Elder 					parse_rbd_opts_token, rbd_opts);
3281859c31dfSAlex Elder 	if (IS_ERR(copts)) {
3282859c31dfSAlex Elder 		ret = PTR_ERR(copts);
3283dc79b113SAlex Elder 		goto out_err;
3284dc79b113SAlex Elder 	}
3285859c31dfSAlex Elder 	kfree(options);
3286859c31dfSAlex Elder 
3287859c31dfSAlex Elder 	*ceph_opts = copts;
32884e9afebaSAlex Elder 	*opts = rbd_opts;
3289859c31dfSAlex Elder 	*rbd_spec = spec;
32900ddebc0cSAlex Elder 
3291dc79b113SAlex Elder 	return 0;
3292f28e565aSAlex Elder out_mem:
3293dc79b113SAlex Elder 	ret = -ENOMEM;
3294d22f76e7SAlex Elder out_err:
3295859c31dfSAlex Elder 	kfree(rbd_opts);
3296859c31dfSAlex Elder 	rbd_spec_put(spec);
3297f28e565aSAlex Elder 	kfree(options);
3298d22f76e7SAlex Elder 
3299dc79b113SAlex Elder 	return ret;
3300a725f65eSAlex Elder }
3301a725f65eSAlex Elder 
3302589d30e0SAlex Elder /*
3303589d30e0SAlex Elder  * An rbd format 2 image has a unique identifier, distinct from the
3304589d30e0SAlex Elder  * name given to it by the user.  Internally, that identifier is
3305589d30e0SAlex Elder  * what's used to specify the names of objects related to the image.
3306589d30e0SAlex Elder  *
3307589d30e0SAlex Elder  * A special "rbd id" object is used to map an rbd image name to its
3308589d30e0SAlex Elder  * id.  If that object doesn't exist, then there is no v2 rbd image
3309589d30e0SAlex Elder  * with the supplied name.
3310589d30e0SAlex Elder  *
3311589d30e0SAlex Elder  * This function will record the given rbd_dev's image_id field if
3312589d30e0SAlex Elder  * it can be determined, and in that case will return 0.  If any
3313589d30e0SAlex Elder  * errors occur a negative errno will be returned and the rbd_dev's
3314589d30e0SAlex Elder  * image_id field will be unchanged (and should be NULL).
3315589d30e0SAlex Elder  */
3316589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev)
3317589d30e0SAlex Elder {
3318589d30e0SAlex Elder 	int ret;
3319589d30e0SAlex Elder 	size_t size;
3320589d30e0SAlex Elder 	char *object_name;
3321589d30e0SAlex Elder 	void *response;
3322589d30e0SAlex Elder 	void *p;
3323589d30e0SAlex Elder 
3324589d30e0SAlex Elder 	/*
33252c0d0a10SAlex Elder 	 * When probing a parent image, the image id is already
33262c0d0a10SAlex Elder 	 * known (and the image name likely is not).  There's no
33272c0d0a10SAlex Elder 	 * need to fetch the image id again in this case.
33282c0d0a10SAlex Elder 	 */
33292c0d0a10SAlex Elder 	if (rbd_dev->spec->image_id)
33302c0d0a10SAlex Elder 		return 0;
33312c0d0a10SAlex Elder 
33322c0d0a10SAlex Elder 	/*
3333589d30e0SAlex Elder 	 * First, see if the format 2 image id file exists, and if
3334589d30e0SAlex Elder 	 * so, get the image's persistent id from it.
3335589d30e0SAlex Elder 	 */
33360d7dbfceSAlex Elder 	size = sizeof (RBD_ID_PREFIX) + rbd_dev->spec->image_name_len;
3337589d30e0SAlex Elder 	object_name = kmalloc(size, GFP_NOIO);
3338589d30e0SAlex Elder 	if (!object_name)
3339589d30e0SAlex Elder 		return -ENOMEM;
33400d7dbfceSAlex Elder 	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
3341589d30e0SAlex Elder 	dout("rbd id object name is %s\n", object_name);
3342589d30e0SAlex Elder 
3343589d30e0SAlex Elder 	/* Response will be an encoded string, which includes a length */
3344589d30e0SAlex Elder 
3345589d30e0SAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
3346589d30e0SAlex Elder 	response = kzalloc(size, GFP_NOIO);
3347589d30e0SAlex Elder 	if (!response) {
3348589d30e0SAlex Elder 		ret = -ENOMEM;
3349589d30e0SAlex Elder 		goto out;
3350589d30e0SAlex Elder 	}
3351589d30e0SAlex Elder 
3352589d30e0SAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, object_name,
3353589d30e0SAlex Elder 				"rbd", "get_id",
3354589d30e0SAlex Elder 				NULL, 0,
3355589d30e0SAlex Elder 				response, RBD_IMAGE_ID_LEN_MAX,
3356589d30e0SAlex Elder 				CEPH_OSD_FLAG_READ, NULL);
3357589d30e0SAlex Elder 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
3358589d30e0SAlex Elder 	if (ret < 0)
3359589d30e0SAlex Elder 		goto out;
3360a0ea3a40SAlex Elder 	ret = 0;    /* rbd_req_sync_exec() can return positive */
3361589d30e0SAlex Elder 
3362589d30e0SAlex Elder 	p = response;
33630d7dbfceSAlex Elder 	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
3364589d30e0SAlex Elder 						p + RBD_IMAGE_ID_LEN_MAX,
33650d7dbfceSAlex Elder 						&rbd_dev->spec->image_id_len,
3366589d30e0SAlex Elder 						GFP_NOIO);
33670d7dbfceSAlex Elder 	if (IS_ERR(rbd_dev->spec->image_id)) {
33680d7dbfceSAlex Elder 		ret = PTR_ERR(rbd_dev->spec->image_id);
33690d7dbfceSAlex Elder 		rbd_dev->spec->image_id = NULL;
3370589d30e0SAlex Elder 	} else {
33710d7dbfceSAlex Elder 		dout("image_id is %s\n", rbd_dev->spec->image_id);
3372589d30e0SAlex Elder 	}
3373589d30e0SAlex Elder out:
3374589d30e0SAlex Elder 	kfree(response);
3375589d30e0SAlex Elder 	kfree(object_name);
3376589d30e0SAlex Elder 
3377589d30e0SAlex Elder 	return ret;
3378589d30e0SAlex Elder }
3379589d30e0SAlex Elder 
3380a30b71b9SAlex Elder static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
3381a30b71b9SAlex Elder {
3382a30b71b9SAlex Elder 	int ret;
3383a30b71b9SAlex Elder 	size_t size;
3384a30b71b9SAlex Elder 
3385a30b71b9SAlex Elder 	/* Version 1 images have no id; empty string is used */
3386a30b71b9SAlex Elder 
33870d7dbfceSAlex Elder 	rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
33880d7dbfceSAlex Elder 	if (!rbd_dev->spec->image_id)
3389a30b71b9SAlex Elder 		return -ENOMEM;
33900d7dbfceSAlex Elder 	rbd_dev->spec->image_id_len = 0;
3391a30b71b9SAlex Elder 
3392a30b71b9SAlex Elder 	/* Record the header object name for this rbd image. */
3393a30b71b9SAlex Elder 
33940d7dbfceSAlex Elder 	size = rbd_dev->spec->image_name_len + sizeof (RBD_SUFFIX);
3395a30b71b9SAlex Elder 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3396a30b71b9SAlex Elder 	if (!rbd_dev->header_name) {
3397a30b71b9SAlex Elder 		ret = -ENOMEM;
3398a30b71b9SAlex Elder 		goto out_err;
3399a30b71b9SAlex Elder 	}
34000d7dbfceSAlex Elder 	sprintf(rbd_dev->header_name, "%s%s",
34010d7dbfceSAlex Elder 		rbd_dev->spec->image_name, RBD_SUFFIX);
3402a30b71b9SAlex Elder 
3403a30b71b9SAlex Elder 	/* Populate rbd image metadata */
3404a30b71b9SAlex Elder 
3405a30b71b9SAlex Elder 	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
3406a30b71b9SAlex Elder 	if (ret < 0)
3407a30b71b9SAlex Elder 		goto out_err;
340886b00e0dSAlex Elder 
340986b00e0dSAlex Elder 	/* Version 1 images have no parent (no layering) */
341086b00e0dSAlex Elder 
341186b00e0dSAlex Elder 	rbd_dev->parent_spec = NULL;
341286b00e0dSAlex Elder 	rbd_dev->parent_overlap = 0;
341386b00e0dSAlex Elder 
3414a30b71b9SAlex Elder 	rbd_dev->image_format = 1;
3415a30b71b9SAlex Elder 
3416a30b71b9SAlex Elder 	dout("discovered version 1 image, header name is %s\n",
3417a30b71b9SAlex Elder 		rbd_dev->header_name);
3418a30b71b9SAlex Elder 
3419a30b71b9SAlex Elder 	return 0;
3420a30b71b9SAlex Elder 
3421a30b71b9SAlex Elder out_err:
3422a30b71b9SAlex Elder 	kfree(rbd_dev->header_name);
3423a30b71b9SAlex Elder 	rbd_dev->header_name = NULL;
34240d7dbfceSAlex Elder 	kfree(rbd_dev->spec->image_id);
34250d7dbfceSAlex Elder 	rbd_dev->spec->image_id = NULL;
3426a30b71b9SAlex Elder 
3427a30b71b9SAlex Elder 	return ret;
3428a30b71b9SAlex Elder }
3429a30b71b9SAlex Elder 
3430a30b71b9SAlex Elder static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
3431a30b71b9SAlex Elder {
3432a30b71b9SAlex Elder 	size_t size;
34339d475de5SAlex Elder 	int ret;
34346e14b1a6SAlex Elder 	u64 ver = 0;
3435a30b71b9SAlex Elder 
3436a30b71b9SAlex Elder 	/*
3437a30b71b9SAlex Elder 	 * Image id was filled in by the caller.  Record the header
3438a30b71b9SAlex Elder 	 * object name for this rbd image.
3439a30b71b9SAlex Elder 	 */
34400d7dbfceSAlex Elder 	size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->spec->image_id_len;
3441a30b71b9SAlex Elder 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3442a30b71b9SAlex Elder 	if (!rbd_dev->header_name)
3443a30b71b9SAlex Elder 		return -ENOMEM;
3444a30b71b9SAlex Elder 	sprintf(rbd_dev->header_name, "%s%s",
34450d7dbfceSAlex Elder 			RBD_HEADER_PREFIX, rbd_dev->spec->image_id);
34469d475de5SAlex Elder 
34479d475de5SAlex Elder 	/* Get the size and object order for the image */
34489d475de5SAlex Elder 
34499d475de5SAlex Elder 	ret = rbd_dev_v2_image_size(rbd_dev);
34509d475de5SAlex Elder 	if (ret < 0)
34519d475de5SAlex Elder 		goto out_err;
34521e130199SAlex Elder 
34531e130199SAlex Elder 	/* Get the object prefix (a.k.a. block_name) for the image */
34541e130199SAlex Elder 
34551e130199SAlex Elder 	ret = rbd_dev_v2_object_prefix(rbd_dev);
34561e130199SAlex Elder 	if (ret < 0)
34571e130199SAlex Elder 		goto out_err;
3458b1b5402aSAlex Elder 
3459d889140cSAlex Elder 	/* Get the and check features for the image */
3460b1b5402aSAlex Elder 
3461b1b5402aSAlex Elder 	ret = rbd_dev_v2_features(rbd_dev);
3462b1b5402aSAlex Elder 	if (ret < 0)
3463b1b5402aSAlex Elder 		goto out_err;
346435d489f9SAlex Elder 
346586b00e0dSAlex Elder 	/* If the image supports layering, get the parent info */
346686b00e0dSAlex Elder 
346786b00e0dSAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
346886b00e0dSAlex Elder 		ret = rbd_dev_v2_parent_info(rbd_dev);
346986b00e0dSAlex Elder 		if (ret < 0)
347086b00e0dSAlex Elder 			goto out_err;
347186b00e0dSAlex Elder 	}
347286b00e0dSAlex Elder 
34736e14b1a6SAlex Elder 	/* crypto and compression type aren't (yet) supported for v2 images */
347435d489f9SAlex Elder 
34756e14b1a6SAlex Elder 	rbd_dev->header.crypt_type = 0;
34766e14b1a6SAlex Elder 	rbd_dev->header.comp_type = 0;
34776e14b1a6SAlex Elder 
34786e14b1a6SAlex Elder 	/* Get the snapshot context, plus the header version */
34796e14b1a6SAlex Elder 
34806e14b1a6SAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
348135d489f9SAlex Elder 	if (ret)
348235d489f9SAlex Elder 		goto out_err;
34836e14b1a6SAlex Elder 	rbd_dev->header.obj_version = ver;
34846e14b1a6SAlex Elder 
3485a30b71b9SAlex Elder 	rbd_dev->image_format = 2;
3486a30b71b9SAlex Elder 
3487a30b71b9SAlex Elder 	dout("discovered version 2 image, header name is %s\n",
3488a30b71b9SAlex Elder 		rbd_dev->header_name);
3489a30b71b9SAlex Elder 
349035152979SAlex Elder 	return 0;
34919d475de5SAlex Elder out_err:
349286b00e0dSAlex Elder 	rbd_dev->parent_overlap = 0;
349386b00e0dSAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
349486b00e0dSAlex Elder 	rbd_dev->parent_spec = NULL;
34959d475de5SAlex Elder 	kfree(rbd_dev->header_name);
34969d475de5SAlex Elder 	rbd_dev->header_name = NULL;
34971e130199SAlex Elder 	kfree(rbd_dev->header.object_prefix);
34981e130199SAlex Elder 	rbd_dev->header.object_prefix = NULL;
34999d475de5SAlex Elder 
35009d475de5SAlex Elder 	return ret;
3501a30b71b9SAlex Elder }
3502a30b71b9SAlex Elder 
350383a06263SAlex Elder static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
350483a06263SAlex Elder {
350583a06263SAlex Elder 	int ret;
350683a06263SAlex Elder 
350783a06263SAlex Elder 	/* no need to lock here, as rbd_dev is not registered yet */
350883a06263SAlex Elder 	ret = rbd_dev_snaps_update(rbd_dev);
350983a06263SAlex Elder 	if (ret)
351083a06263SAlex Elder 		return ret;
351183a06263SAlex Elder 
35129e15b77dSAlex Elder 	ret = rbd_dev_probe_update_spec(rbd_dev);
35139e15b77dSAlex Elder 	if (ret)
35149e15b77dSAlex Elder 		goto err_out_snaps;
35159e15b77dSAlex Elder 
351683a06263SAlex Elder 	ret = rbd_dev_set_mapping(rbd_dev);
351783a06263SAlex Elder 	if (ret)
351883a06263SAlex Elder 		goto err_out_snaps;
351983a06263SAlex Elder 
352083a06263SAlex Elder 	/* generate unique id: find highest unique id, add one */
352183a06263SAlex Elder 	rbd_dev_id_get(rbd_dev);
352283a06263SAlex Elder 
352383a06263SAlex Elder 	/* Fill in the device name, now that we have its id. */
352483a06263SAlex Elder 	BUILD_BUG_ON(DEV_NAME_LEN
352583a06263SAlex Elder 			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
352683a06263SAlex Elder 	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
352783a06263SAlex Elder 
352883a06263SAlex Elder 	/* Get our block major device number. */
352983a06263SAlex Elder 
353083a06263SAlex Elder 	ret = register_blkdev(0, rbd_dev->name);
353183a06263SAlex Elder 	if (ret < 0)
353283a06263SAlex Elder 		goto err_out_id;
353383a06263SAlex Elder 	rbd_dev->major = ret;
353483a06263SAlex Elder 
353583a06263SAlex Elder 	/* Set up the blkdev mapping. */
353683a06263SAlex Elder 
353783a06263SAlex Elder 	ret = rbd_init_disk(rbd_dev);
353883a06263SAlex Elder 	if (ret)
353983a06263SAlex Elder 		goto err_out_blkdev;
354083a06263SAlex Elder 
354183a06263SAlex Elder 	ret = rbd_bus_add_dev(rbd_dev);
354283a06263SAlex Elder 	if (ret)
354383a06263SAlex Elder 		goto err_out_disk;
354483a06263SAlex Elder 
354583a06263SAlex Elder 	/*
354683a06263SAlex Elder 	 * At this point cleanup in the event of an error is the job
354783a06263SAlex Elder 	 * of the sysfs code (initiated by rbd_bus_del_dev()).
354883a06263SAlex Elder 	 */
354983a06263SAlex Elder 	down_write(&rbd_dev->header_rwsem);
355083a06263SAlex Elder 	ret = rbd_dev_snaps_register(rbd_dev);
355183a06263SAlex Elder 	up_write(&rbd_dev->header_rwsem);
355283a06263SAlex Elder 	if (ret)
355383a06263SAlex Elder 		goto err_out_bus;
355483a06263SAlex Elder 
355583a06263SAlex Elder 	ret = rbd_init_watch_dev(rbd_dev);
355683a06263SAlex Elder 	if (ret)
355783a06263SAlex Elder 		goto err_out_bus;
355883a06263SAlex Elder 
355983a06263SAlex Elder 	/* Everything's ready.  Announce the disk to the world. */
356083a06263SAlex Elder 
356183a06263SAlex Elder 	add_disk(rbd_dev->disk);
356283a06263SAlex Elder 
356383a06263SAlex Elder 	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
356483a06263SAlex Elder 		(unsigned long long) rbd_dev->mapping.size);
356583a06263SAlex Elder 
356683a06263SAlex Elder 	return ret;
356783a06263SAlex Elder err_out_bus:
356883a06263SAlex Elder 	/* this will also clean up rest of rbd_dev stuff */
356983a06263SAlex Elder 
357083a06263SAlex Elder 	rbd_bus_del_dev(rbd_dev);
357183a06263SAlex Elder 
357283a06263SAlex Elder 	return ret;
357383a06263SAlex Elder err_out_disk:
357483a06263SAlex Elder 	rbd_free_disk(rbd_dev);
357583a06263SAlex Elder err_out_blkdev:
357683a06263SAlex Elder 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
357783a06263SAlex Elder err_out_id:
357883a06263SAlex Elder 	rbd_dev_id_put(rbd_dev);
357983a06263SAlex Elder err_out_snaps:
358083a06263SAlex Elder 	rbd_remove_all_snaps(rbd_dev);
358183a06263SAlex Elder 
358283a06263SAlex Elder 	return ret;
358383a06263SAlex Elder }
358483a06263SAlex Elder 
3585a30b71b9SAlex Elder /*
3586a30b71b9SAlex Elder  * Probe for the existence of the header object for the given rbd
3587a30b71b9SAlex Elder  * device.  For format 2 images this includes determining the image
3588a30b71b9SAlex Elder  * id.
3589a30b71b9SAlex Elder  */
3590a30b71b9SAlex Elder static int rbd_dev_probe(struct rbd_device *rbd_dev)
3591a30b71b9SAlex Elder {
3592a30b71b9SAlex Elder 	int ret;
3593a30b71b9SAlex Elder 
3594a30b71b9SAlex Elder 	/*
3595a30b71b9SAlex Elder 	 * Get the id from the image id object.  If it's not a
3596a30b71b9SAlex Elder 	 * format 2 image, we'll get ENOENT back, and we'll assume
3597a30b71b9SAlex Elder 	 * it's a format 1 image.
3598a30b71b9SAlex Elder 	 */
3599a30b71b9SAlex Elder 	ret = rbd_dev_image_id(rbd_dev);
3600a30b71b9SAlex Elder 	if (ret)
3601a30b71b9SAlex Elder 		ret = rbd_dev_v1_probe(rbd_dev);
3602a30b71b9SAlex Elder 	else
3603a30b71b9SAlex Elder 		ret = rbd_dev_v2_probe(rbd_dev);
360483a06263SAlex Elder 	if (ret) {
3605a30b71b9SAlex Elder 		dout("probe failed, returning %d\n", ret);
3606a30b71b9SAlex Elder 
3607a30b71b9SAlex Elder 		return ret;
3608a30b71b9SAlex Elder 	}
3609a30b71b9SAlex Elder 
361083a06263SAlex Elder 	ret = rbd_dev_probe_finish(rbd_dev);
361183a06263SAlex Elder 	if (ret)
361283a06263SAlex Elder 		rbd_header_free(&rbd_dev->header);
361383a06263SAlex Elder 
361483a06263SAlex Elder 	return ret;
361583a06263SAlex Elder }
361683a06263SAlex Elder 
361759c2be1eSYehuda Sadeh static ssize_t rbd_add(struct bus_type *bus,
361859c2be1eSYehuda Sadeh 		       const char *buf,
361959c2be1eSYehuda Sadeh 		       size_t count)
3620602adf40SYehuda Sadeh {
3621cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
3622dc79b113SAlex Elder 	struct ceph_options *ceph_opts = NULL;
36234e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
3624859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
36259d3997fdSAlex Elder 	struct rbd_client *rbdc;
362627cc2594SAlex Elder 	struct ceph_osd_client *osdc;
362727cc2594SAlex Elder 	int rc = -ENOMEM;
3628602adf40SYehuda Sadeh 
3629602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
3630602adf40SYehuda Sadeh 		return -ENODEV;
3631602adf40SYehuda Sadeh 
3632a725f65eSAlex Elder 	/* parse add command */
3633859c31dfSAlex Elder 	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
3634dc79b113SAlex Elder 	if (rc < 0)
3635bd4ba655SAlex Elder 		goto err_out_module;
3636a725f65eSAlex Elder 
36379d3997fdSAlex Elder 	rbdc = rbd_get_client(ceph_opts);
36389d3997fdSAlex Elder 	if (IS_ERR(rbdc)) {
36399d3997fdSAlex Elder 		rc = PTR_ERR(rbdc);
36400ddebc0cSAlex Elder 		goto err_out_args;
36419d3997fdSAlex Elder 	}
3642c53d5893SAlex Elder 	ceph_opts = NULL;	/* rbd_dev client now owns this */
3643602adf40SYehuda Sadeh 
3644602adf40SYehuda Sadeh 	/* pick the pool */
36459d3997fdSAlex Elder 	osdc = &rbdc->client->osdc;
3646859c31dfSAlex Elder 	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
3647602adf40SYehuda Sadeh 	if (rc < 0)
3648602adf40SYehuda Sadeh 		goto err_out_client;
3649859c31dfSAlex Elder 	spec->pool_id = (u64) rc;
3650859c31dfSAlex Elder 
3651c53d5893SAlex Elder 	rbd_dev = rbd_dev_create(rbdc, spec);
3652bd4ba655SAlex Elder 	if (!rbd_dev)
3653bd4ba655SAlex Elder 		goto err_out_client;
3654c53d5893SAlex Elder 	rbdc = NULL;		/* rbd_dev now owns this */
3655c53d5893SAlex Elder 	spec = NULL;		/* rbd_dev now owns this */
3656602adf40SYehuda Sadeh 
3657bd4ba655SAlex Elder 	rbd_dev->mapping.read_only = rbd_opts->read_only;
3658c53d5893SAlex Elder 	kfree(rbd_opts);
3659c53d5893SAlex Elder 	rbd_opts = NULL;	/* done with this */
3660bd4ba655SAlex Elder 
3661a30b71b9SAlex Elder 	rc = rbd_dev_probe(rbd_dev);
3662a30b71b9SAlex Elder 	if (rc < 0)
3663c53d5893SAlex Elder 		goto err_out_rbd_dev;
366405fd6f6fSAlex Elder 
3665602adf40SYehuda Sadeh 	return count;
3666c53d5893SAlex Elder err_out_rbd_dev:
3667c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
3668bd4ba655SAlex Elder err_out_client:
36699d3997fdSAlex Elder 	rbd_put_client(rbdc);
36700ddebc0cSAlex Elder err_out_args:
367178cea76eSAlex Elder 	if (ceph_opts)
367278cea76eSAlex Elder 		ceph_destroy_options(ceph_opts);
36734e9afebaSAlex Elder 	kfree(rbd_opts);
3674859c31dfSAlex Elder 	rbd_spec_put(spec);
3675bd4ba655SAlex Elder err_out_module:
3676bd4ba655SAlex Elder 	module_put(THIS_MODULE);
367727cc2594SAlex Elder 
3678602adf40SYehuda Sadeh 	dout("Error adding device %s\n", buf);
367927cc2594SAlex Elder 
368027cc2594SAlex Elder 	return (ssize_t) rc;
3681602adf40SYehuda Sadeh }
3682602adf40SYehuda Sadeh 
3683de71a297SAlex Elder static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
3684602adf40SYehuda Sadeh {
3685602adf40SYehuda Sadeh 	struct list_head *tmp;
3686602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev;
3687602adf40SYehuda Sadeh 
3688e124a82fSAlex Elder 	spin_lock(&rbd_dev_list_lock);
3689602adf40SYehuda Sadeh 	list_for_each(tmp, &rbd_dev_list) {
3690602adf40SYehuda Sadeh 		rbd_dev = list_entry(tmp, struct rbd_device, node);
3691de71a297SAlex Elder 		if (rbd_dev->dev_id == dev_id) {
3692e124a82fSAlex Elder 			spin_unlock(&rbd_dev_list_lock);
3693602adf40SYehuda Sadeh 			return rbd_dev;
3694602adf40SYehuda Sadeh 		}
3695e124a82fSAlex Elder 	}
3696e124a82fSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
3697602adf40SYehuda Sadeh 	return NULL;
3698602adf40SYehuda Sadeh }
3699602adf40SYehuda Sadeh 
3700dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev)
3701602adf40SYehuda Sadeh {
3702593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3703602adf40SYehuda Sadeh 
37041dbb4399SAlex Elder 	if (rbd_dev->watch_request) {
37051dbb4399SAlex Elder 		struct ceph_client *client = rbd_dev->rbd_client->client;
37061dbb4399SAlex Elder 
37071dbb4399SAlex Elder 		ceph_osdc_unregister_linger_request(&client->osdc,
370859c2be1eSYehuda Sadeh 						    rbd_dev->watch_request);
37091dbb4399SAlex Elder 	}
371059c2be1eSYehuda Sadeh 	if (rbd_dev->watch_event)
3711070c633fSAlex Elder 		rbd_req_sync_unwatch(rbd_dev);
371259c2be1eSYehuda Sadeh 
3713602adf40SYehuda Sadeh 
3714602adf40SYehuda Sadeh 	/* clean up and free blkdev */
3715602adf40SYehuda Sadeh 	rbd_free_disk(rbd_dev);
3716602adf40SYehuda Sadeh 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
371732eec68dSAlex Elder 
37182ac4e75dSAlex Elder 	/* release allocated disk header fields */
37192ac4e75dSAlex Elder 	rbd_header_free(&rbd_dev->header);
37202ac4e75dSAlex Elder 
372132eec68dSAlex Elder 	/* done with the id, and with the rbd_dev */
3722e2839308SAlex Elder 	rbd_dev_id_put(rbd_dev);
3723c53d5893SAlex Elder 	rbd_assert(rbd_dev->rbd_client != NULL);
3724c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
3725602adf40SYehuda Sadeh 
3726602adf40SYehuda Sadeh 	/* release module ref */
3727602adf40SYehuda Sadeh 	module_put(THIS_MODULE);
3728602adf40SYehuda Sadeh }
3729602adf40SYehuda Sadeh 
3730dfc5606dSYehuda Sadeh static ssize_t rbd_remove(struct bus_type *bus,
3731602adf40SYehuda Sadeh 			  const char *buf,
3732602adf40SYehuda Sadeh 			  size_t count)
3733602adf40SYehuda Sadeh {
3734602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
3735602adf40SYehuda Sadeh 	int target_id, rc;
3736602adf40SYehuda Sadeh 	unsigned long ul;
3737602adf40SYehuda Sadeh 	int ret = count;
3738602adf40SYehuda Sadeh 
3739602adf40SYehuda Sadeh 	rc = strict_strtoul(buf, 10, &ul);
3740602adf40SYehuda Sadeh 	if (rc)
3741602adf40SYehuda Sadeh 		return rc;
3742602adf40SYehuda Sadeh 
3743602adf40SYehuda Sadeh 	/* convert to int; abort if we lost anything in the conversion */
3744602adf40SYehuda Sadeh 	target_id = (int) ul;
3745602adf40SYehuda Sadeh 	if (target_id != ul)
3746602adf40SYehuda Sadeh 		return -EINVAL;
3747602adf40SYehuda Sadeh 
3748602adf40SYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3749602adf40SYehuda Sadeh 
3750602adf40SYehuda Sadeh 	rbd_dev = __rbd_get_dev(target_id);
3751602adf40SYehuda Sadeh 	if (!rbd_dev) {
3752602adf40SYehuda Sadeh 		ret = -ENOENT;
3753602adf40SYehuda Sadeh 		goto done;
3754602adf40SYehuda Sadeh 	}
3755602adf40SYehuda Sadeh 
375642382b70SAlex Elder 	if (rbd_dev->open_count) {
375742382b70SAlex Elder 		ret = -EBUSY;
375842382b70SAlex Elder 		goto done;
375942382b70SAlex Elder 	}
376042382b70SAlex Elder 
376141f38c2bSAlex Elder 	rbd_remove_all_snaps(rbd_dev);
3762dfc5606dSYehuda Sadeh 	rbd_bus_del_dev(rbd_dev);
3763602adf40SYehuda Sadeh 
3764602adf40SYehuda Sadeh done:
3765602adf40SYehuda Sadeh 	mutex_unlock(&ctl_mutex);
3766aafb230eSAlex Elder 
3767602adf40SYehuda Sadeh 	return ret;
3768602adf40SYehuda Sadeh }
3769602adf40SYehuda Sadeh 
3770602adf40SYehuda Sadeh /*
3771602adf40SYehuda Sadeh  * create control files in sysfs
3772dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
3773602adf40SYehuda Sadeh  */
3774602adf40SYehuda Sadeh static int rbd_sysfs_init(void)
3775602adf40SYehuda Sadeh {
3776dfc5606dSYehuda Sadeh 	int ret;
3777602adf40SYehuda Sadeh 
3778fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
3779dfc5606dSYehuda Sadeh 	if (ret < 0)
3780dfc5606dSYehuda Sadeh 		return ret;
3781602adf40SYehuda Sadeh 
3782fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
3783fed4c143SAlex Elder 	if (ret < 0)
3784fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
3785602adf40SYehuda Sadeh 
3786602adf40SYehuda Sadeh 	return ret;
3787602adf40SYehuda Sadeh }
3788602adf40SYehuda Sadeh 
3789602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void)
3790602adf40SYehuda Sadeh {
3791dfc5606dSYehuda Sadeh 	bus_unregister(&rbd_bus_type);
3792fed4c143SAlex Elder 	device_unregister(&rbd_root_dev);
3793602adf40SYehuda Sadeh }
3794602adf40SYehuda Sadeh 
3795602adf40SYehuda Sadeh int __init rbd_init(void)
3796602adf40SYehuda Sadeh {
3797602adf40SYehuda Sadeh 	int rc;
3798602adf40SYehuda Sadeh 
3799602adf40SYehuda Sadeh 	rc = rbd_sysfs_init();
3800602adf40SYehuda Sadeh 	if (rc)
3801602adf40SYehuda Sadeh 		return rc;
3802f0f8cef5SAlex Elder 	pr_info("loaded " RBD_DRV_NAME_LONG "\n");
3803602adf40SYehuda Sadeh 	return 0;
3804602adf40SYehuda Sadeh }
3805602adf40SYehuda Sadeh 
3806602adf40SYehuda Sadeh void __exit rbd_exit(void)
3807602adf40SYehuda Sadeh {
3808602adf40SYehuda Sadeh 	rbd_sysfs_cleanup();
3809602adf40SYehuda Sadeh }
3810602adf40SYehuda Sadeh 
3811602adf40SYehuda Sadeh module_init(rbd_init);
3812602adf40SYehuda Sadeh module_exit(rbd_exit);
3813602adf40SYehuda Sadeh 
3814602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
3815602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
3816602adf40SYehuda Sadeh MODULE_DESCRIPTION("rados block device");
3817602adf40SYehuda Sadeh 
3818602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */
3819602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
3820602adf40SYehuda Sadeh 
3821602adf40SYehuda Sadeh MODULE_LICENSE("GPL");
3822