xref: /openbmc/linux/drivers/block/rbd.c (revision 1e130199)
1602adf40SYehuda Sadeh /*
2602adf40SYehuda Sadeh    rbd.c -- Export ceph rados objects as a Linux block device
3602adf40SYehuda Sadeh 
4602adf40SYehuda Sadeh 
5602adf40SYehuda Sadeh    based on drivers/block/osdblk.c:
6602adf40SYehuda Sadeh 
7602adf40SYehuda Sadeh    Copyright 2009 Red Hat, Inc.
8602adf40SYehuda Sadeh 
9602adf40SYehuda Sadeh    This program is free software; you can redistribute it and/or modify
10602adf40SYehuda Sadeh    it under the terms of the GNU General Public License as published by
11602adf40SYehuda Sadeh    the Free Software Foundation.
12602adf40SYehuda Sadeh 
13602adf40SYehuda Sadeh    This program is distributed in the hope that it will be useful,
14602adf40SYehuda Sadeh    but WITHOUT ANY WARRANTY; without even the implied warranty of
15602adf40SYehuda Sadeh    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16602adf40SYehuda Sadeh    GNU General Public License for more details.
17602adf40SYehuda Sadeh 
18602adf40SYehuda Sadeh    You should have received a copy of the GNU General Public License
19602adf40SYehuda Sadeh    along with this program; see the file COPYING.  If not, write to
20602adf40SYehuda Sadeh    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21602adf40SYehuda Sadeh 
22602adf40SYehuda Sadeh 
23602adf40SYehuda Sadeh 
24dfc5606dSYehuda Sadeh    For usage instructions, please refer to:
25602adf40SYehuda Sadeh 
26dfc5606dSYehuda Sadeh                  Documentation/ABI/testing/sysfs-bus-rbd
27602adf40SYehuda Sadeh 
28602adf40SYehuda Sadeh  */
29602adf40SYehuda Sadeh 
30602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
31602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
32602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3459c2be1eSYehuda Sadeh #include <linux/parser.h>
35602adf40SYehuda Sadeh 
36602adf40SYehuda Sadeh #include <linux/kernel.h>
37602adf40SYehuda Sadeh #include <linux/device.h>
38602adf40SYehuda Sadeh #include <linux/module.h>
39602adf40SYehuda Sadeh #include <linux/fs.h>
40602adf40SYehuda Sadeh #include <linux/blkdev.h>
41602adf40SYehuda Sadeh 
42602adf40SYehuda Sadeh #include "rbd_types.h"
43602adf40SYehuda Sadeh 
44aafb230eSAlex Elder #define RBD_DEBUG	/* Activate rbd_assert() calls */
45aafb230eSAlex Elder 
46593a9e7bSAlex Elder /*
47593a9e7bSAlex Elder  * The basic unit of block I/O is a sector.  It is interpreted in a
48593a9e7bSAlex Elder  * number of contexts in Linux (blk, bio, genhd), but the default is
49593a9e7bSAlex Elder  * universally 512 bytes.  These symbols are just slightly more
50593a9e7bSAlex Elder  * meaningful than the bare numbers they represent.
51593a9e7bSAlex Elder  */
52593a9e7bSAlex Elder #define	SECTOR_SHIFT	9
53593a9e7bSAlex Elder #define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)
54593a9e7bSAlex Elder 
55df111be6SAlex Elder /* It might be useful to have this defined elsewhere too */
56df111be6SAlex Elder 
57df111be6SAlex Elder #define	U64_MAX	((u64) (~0ULL))
58df111be6SAlex Elder 
59f0f8cef5SAlex Elder #define RBD_DRV_NAME "rbd"
60f0f8cef5SAlex Elder #define RBD_DRV_NAME_LONG "rbd (rados block device)"
61602adf40SYehuda Sadeh 
62602adf40SYehuda Sadeh #define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */
63602adf40SYehuda Sadeh 
64602adf40SYehuda Sadeh #define RBD_MAX_SNAP_NAME_LEN	32
65602adf40SYehuda Sadeh #define RBD_MAX_OPT_LEN		1024
66602adf40SYehuda Sadeh 
67602adf40SYehuda Sadeh #define RBD_SNAP_HEAD_NAME	"-"
68602adf40SYehuda Sadeh 
69589d30e0SAlex Elder #define RBD_IMAGE_ID_LEN_MAX	64
701e130199SAlex Elder #define RBD_OBJ_PREFIX_LEN_MAX	64
71589d30e0SAlex Elder 
7281a89793SAlex Elder /*
7381a89793SAlex Elder  * An RBD device name will be "rbd#", where the "rbd" comes from
7481a89793SAlex Elder  * RBD_DRV_NAME above, and # is a unique integer identifier.
7581a89793SAlex Elder  * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
7681a89793SAlex Elder  * enough to hold all possible device names.
7781a89793SAlex Elder  */
78602adf40SYehuda Sadeh #define DEV_NAME_LEN		32
7981a89793SAlex Elder #define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)
80602adf40SYehuda Sadeh 
81cc0538b6SAlex Elder #define RBD_READ_ONLY_DEFAULT		false
8259c2be1eSYehuda Sadeh 
83602adf40SYehuda Sadeh /*
84602adf40SYehuda Sadeh  * block device image metadata (in-memory version)
85602adf40SYehuda Sadeh  */
86602adf40SYehuda Sadeh struct rbd_image_header {
87f84344f3SAlex Elder 	/* These four fields never change for a given rbd image */
88849b4260SAlex Elder 	char *object_prefix;
8934b13184SAlex Elder 	u64 features;
90602adf40SYehuda Sadeh 	__u8 obj_order;
91602adf40SYehuda Sadeh 	__u8 crypt_type;
92602adf40SYehuda Sadeh 	__u8 comp_type;
93602adf40SYehuda Sadeh 
94f84344f3SAlex Elder 	/* The remaining fields need to be updated occasionally */
95f84344f3SAlex Elder 	u64 image_size;
96f84344f3SAlex Elder 	struct ceph_snap_context *snapc;
97602adf40SYehuda Sadeh 	char *snap_names;
98602adf40SYehuda Sadeh 	u64 *snap_sizes;
9959c2be1eSYehuda Sadeh 
10059c2be1eSYehuda Sadeh 	u64 obj_version;
10159c2be1eSYehuda Sadeh };
10259c2be1eSYehuda Sadeh 
10359c2be1eSYehuda Sadeh struct rbd_options {
104cc0538b6SAlex Elder 	bool	read_only;
105602adf40SYehuda Sadeh };
106602adf40SYehuda Sadeh 
107602adf40SYehuda Sadeh /*
108f0f8cef5SAlex Elder  * an instance of the client.  multiple devices may share an rbd client.
109602adf40SYehuda Sadeh  */
110602adf40SYehuda Sadeh struct rbd_client {
111602adf40SYehuda Sadeh 	struct ceph_client	*client;
112602adf40SYehuda Sadeh 	struct kref		kref;
113602adf40SYehuda Sadeh 	struct list_head	node;
114602adf40SYehuda Sadeh };
115602adf40SYehuda Sadeh 
116602adf40SYehuda Sadeh /*
117f0f8cef5SAlex Elder  * a request completion status
118602adf40SYehuda Sadeh  */
1191fec7093SYehuda Sadeh struct rbd_req_status {
1201fec7093SYehuda Sadeh 	int done;
1211fec7093SYehuda Sadeh 	int rc;
1221fec7093SYehuda Sadeh 	u64 bytes;
1231fec7093SYehuda Sadeh };
1241fec7093SYehuda Sadeh 
1251fec7093SYehuda Sadeh /*
1261fec7093SYehuda Sadeh  * a collection of requests
1271fec7093SYehuda Sadeh  */
1281fec7093SYehuda Sadeh struct rbd_req_coll {
1291fec7093SYehuda Sadeh 	int			total;
1301fec7093SYehuda Sadeh 	int			num_done;
1311fec7093SYehuda Sadeh 	struct kref		kref;
1321fec7093SYehuda Sadeh 	struct rbd_req_status	status[0];
133602adf40SYehuda Sadeh };
134602adf40SYehuda Sadeh 
135f0f8cef5SAlex Elder /*
136f0f8cef5SAlex Elder  * a single io request
137f0f8cef5SAlex Elder  */
138f0f8cef5SAlex Elder struct rbd_request {
139f0f8cef5SAlex Elder 	struct request		*rq;		/* blk layer request */
140f0f8cef5SAlex Elder 	struct bio		*bio;		/* cloned bio */
141f0f8cef5SAlex Elder 	struct page		**pages;	/* list of used pages */
142f0f8cef5SAlex Elder 	u64			len;
143f0f8cef5SAlex Elder 	int			coll_index;
144f0f8cef5SAlex Elder 	struct rbd_req_coll	*coll;
145f0f8cef5SAlex Elder };
146f0f8cef5SAlex Elder 
147dfc5606dSYehuda Sadeh struct rbd_snap {
148dfc5606dSYehuda Sadeh 	struct	device		dev;
149dfc5606dSYehuda Sadeh 	const char		*name;
1503591538fSJosh Durgin 	u64			size;
151dfc5606dSYehuda Sadeh 	struct list_head	node;
152dfc5606dSYehuda Sadeh 	u64			id;
15334b13184SAlex Elder 	u64			features;
154dfc5606dSYehuda Sadeh };
155dfc5606dSYehuda Sadeh 
156f84344f3SAlex Elder struct rbd_mapping {
157f84344f3SAlex Elder 	char                    *snap_name;
158f84344f3SAlex Elder 	u64                     snap_id;
15999c1f08fSAlex Elder 	u64                     size;
16034b13184SAlex Elder 	u64                     features;
161f84344f3SAlex Elder 	bool                    snap_exists;
162f84344f3SAlex Elder 	bool			read_only;
163f84344f3SAlex Elder };
164f84344f3SAlex Elder 
165602adf40SYehuda Sadeh /*
166602adf40SYehuda Sadeh  * a single device
167602adf40SYehuda Sadeh  */
168602adf40SYehuda Sadeh struct rbd_device {
169de71a297SAlex Elder 	int			dev_id;		/* blkdev unique id */
170602adf40SYehuda Sadeh 
171602adf40SYehuda Sadeh 	int			major;		/* blkdev assigned major */
172602adf40SYehuda Sadeh 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
173602adf40SYehuda Sadeh 
174a30b71b9SAlex Elder 	u32			image_format;	/* Either 1 or 2 */
175f8c38929SAlex Elder 	struct rbd_options	rbd_opts;
176602adf40SYehuda Sadeh 	struct rbd_client	*rbd_client;
177602adf40SYehuda Sadeh 
178602adf40SYehuda Sadeh 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
179602adf40SYehuda Sadeh 
180602adf40SYehuda Sadeh 	spinlock_t		lock;		/* queue lock */
181602adf40SYehuda Sadeh 
182602adf40SYehuda Sadeh 	struct rbd_image_header	header;
183589d30e0SAlex Elder 	char			*image_id;
184589d30e0SAlex Elder 	size_t			image_id_len;
1850bed54dcSAlex Elder 	char			*image_name;
1860bed54dcSAlex Elder 	size_t			image_name_len;
1870bed54dcSAlex Elder 	char			*header_name;
188d22f76e7SAlex Elder 	char			*pool_name;
1899bb2f334SAlex Elder 	int			pool_id;
190602adf40SYehuda Sadeh 
19159c2be1eSYehuda Sadeh 	struct ceph_osd_event   *watch_event;
19259c2be1eSYehuda Sadeh 	struct ceph_osd_request *watch_request;
19359c2be1eSYehuda Sadeh 
194c666601aSJosh Durgin 	/* protects updating the header */
195c666601aSJosh Durgin 	struct rw_semaphore     header_rwsem;
196f84344f3SAlex Elder 
197f84344f3SAlex Elder 	struct rbd_mapping	mapping;
198602adf40SYehuda Sadeh 
199602adf40SYehuda Sadeh 	struct list_head	node;
200dfc5606dSYehuda Sadeh 
201dfc5606dSYehuda Sadeh 	/* list of snapshots */
202dfc5606dSYehuda Sadeh 	struct list_head	snaps;
203dfc5606dSYehuda Sadeh 
204dfc5606dSYehuda Sadeh 	/* sysfs related */
205dfc5606dSYehuda Sadeh 	struct device		dev;
206dfc5606dSYehuda Sadeh };
207dfc5606dSYehuda Sadeh 
/* Serialize open/close/setup/teardown */
static DEFINE_MUTEX(ctl_mutex);

/* List of rbd devices and its companion spinlock */
static LIST_HEAD(rbd_dev_list);
static DEFINE_SPINLOCK(rbd_dev_list_lock);

/* List of rbd clients; walked and modified under rbd_client_list_lock */
static LIST_HEAD(rbd_client_list);
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Forward declarations for functions defined later in this file */
static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);

static void rbd_dev_release(struct device *dev);
static void __rbd_remove_snap_dev(struct rbd_snap *snap);
221dfc5606dSYehuda Sadeh 
222f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf,
223f0f8cef5SAlex Elder 		       size_t count);
224f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
225f0f8cef5SAlex Elder 			  size_t count);
226f0f8cef5SAlex Elder 
227f0f8cef5SAlex Elder static struct bus_attribute rbd_bus_attrs[] = {
228f0f8cef5SAlex Elder 	__ATTR(add, S_IWUSR, NULL, rbd_add),
229f0f8cef5SAlex Elder 	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
230f0f8cef5SAlex Elder 	__ATTR_NULL
231f0f8cef5SAlex Elder };
232f0f8cef5SAlex Elder 
233f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = {
234f0f8cef5SAlex Elder 	.name		= "rbd",
235f0f8cef5SAlex Elder 	.bus_attrs	= rbd_bus_attrs,
236f0f8cef5SAlex Elder };
237f0f8cef5SAlex Elder 
/*
 * Release callback for rbd_root_dev.  That device is a static object,
 * so there is nothing to free here.
 */
static void rbd_root_dev_release(struct device *dev)
{
	/* intentionally empty */
}
241f0f8cef5SAlex Elder 
242f0f8cef5SAlex Elder static struct device rbd_root_dev = {
243f0f8cef5SAlex Elder 	.init_name =    "rbd",
244f0f8cef5SAlex Elder 	.release =      rbd_root_dev_release,
245f0f8cef5SAlex Elder };
246f0f8cef5SAlex Elder 
/*
 * rbd_assert(expr) - with RBD_DEBUG defined, log the failing expression
 * together with the function name and line number, then BUG().
 * Without RBD_DEBUG it expands to a no-op and expr is not evaluated.
 *
 * The debug variant is wrapped in do { } while (0) so that it behaves
 * as a single statement: "if (x) rbd_assert(y); else ..." parses as
 * intended and the trailing semicolon is consumed by the macro.
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
259dfc5606dSYehuda Sadeh 
260dfc5606dSYehuda Sadeh static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
261dfc5606dSYehuda Sadeh {
262dfc5606dSYehuda Sadeh 	return get_device(&rbd_dev->dev);
263dfc5606dSYehuda Sadeh }
264dfc5606dSYehuda Sadeh 
265dfc5606dSYehuda Sadeh static void rbd_put_dev(struct rbd_device *rbd_dev)
266dfc5606dSYehuda Sadeh {
267dfc5606dSYehuda Sadeh 	put_device(&rbd_dev->dev);
268dfc5606dSYehuda Sadeh }
269602adf40SYehuda Sadeh 
2701fe5e993SAlex Elder static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver);
27159c2be1eSYehuda Sadeh 
272602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
273602adf40SYehuda Sadeh {
274f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
275602adf40SYehuda Sadeh 
276f84344f3SAlex Elder 	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
277602adf40SYehuda Sadeh 		return -EROFS;
278602adf40SYehuda Sadeh 
279340c7a2bSAlex Elder 	rbd_get_dev(rbd_dev);
280f84344f3SAlex Elder 	set_device_ro(bdev, rbd_dev->mapping.read_only);
281340c7a2bSAlex Elder 
282602adf40SYehuda Sadeh 	return 0;
283602adf40SYehuda Sadeh }
284602adf40SYehuda Sadeh 
285dfc5606dSYehuda Sadeh static int rbd_release(struct gendisk *disk, fmode_t mode)
286dfc5606dSYehuda Sadeh {
287dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
288dfc5606dSYehuda Sadeh 
289dfc5606dSYehuda Sadeh 	rbd_put_dev(rbd_dev);
290dfc5606dSYehuda Sadeh 
291dfc5606dSYehuda Sadeh 	return 0;
292dfc5606dSYehuda Sadeh }
293dfc5606dSYehuda Sadeh 
294602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = {
295602adf40SYehuda Sadeh 	.owner			= THIS_MODULE,
296602adf40SYehuda Sadeh 	.open			= rbd_open,
297dfc5606dSYehuda Sadeh 	.release		= rbd_release,
298602adf40SYehuda Sadeh };
299602adf40SYehuda Sadeh 
300602adf40SYehuda Sadeh /*
301602adf40SYehuda Sadeh  * Initialize an rbd client instance.
30243ae4701SAlex Elder  * We own *ceph_opts.
303602adf40SYehuda Sadeh  */
304f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
305602adf40SYehuda Sadeh {
306602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
307602adf40SYehuda Sadeh 	int ret = -ENOMEM;
308602adf40SYehuda Sadeh 
309602adf40SYehuda Sadeh 	dout("rbd_client_create\n");
310602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
311602adf40SYehuda Sadeh 	if (!rbdc)
312602adf40SYehuda Sadeh 		goto out_opt;
313602adf40SYehuda Sadeh 
314602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
315602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
316602adf40SYehuda Sadeh 
317bc534d86SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
318bc534d86SAlex Elder 
31943ae4701SAlex Elder 	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
320602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
321bc534d86SAlex Elder 		goto out_mutex;
32243ae4701SAlex Elder 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
323602adf40SYehuda Sadeh 
324602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
325602adf40SYehuda Sadeh 	if (ret < 0)
326602adf40SYehuda Sadeh 		goto out_err;
327602adf40SYehuda Sadeh 
328432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
329602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
330432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
331602adf40SYehuda Sadeh 
332bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
333bc534d86SAlex Elder 
334602adf40SYehuda Sadeh 	dout("rbd_client_create created %p\n", rbdc);
335602adf40SYehuda Sadeh 	return rbdc;
336602adf40SYehuda Sadeh 
337602adf40SYehuda Sadeh out_err:
338602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
339bc534d86SAlex Elder out_mutex:
340bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
341602adf40SYehuda Sadeh 	kfree(rbdc);
342602adf40SYehuda Sadeh out_opt:
34343ae4701SAlex Elder 	if (ceph_opts)
34443ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
34528f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
346602adf40SYehuda Sadeh }
347602adf40SYehuda Sadeh 
348602adf40SYehuda Sadeh /*
3491f7ba331SAlex Elder  * Find a ceph client with specific addr and configuration.  If
3501f7ba331SAlex Elder  * found, bump its reference count.
351602adf40SYehuda Sadeh  */
3521f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
353602adf40SYehuda Sadeh {
354602adf40SYehuda Sadeh 	struct rbd_client *client_node;
3551f7ba331SAlex Elder 	bool found = false;
356602adf40SYehuda Sadeh 
35743ae4701SAlex Elder 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
358602adf40SYehuda Sadeh 		return NULL;
359602adf40SYehuda Sadeh 
3601f7ba331SAlex Elder 	spin_lock(&rbd_client_list_lock);
3611f7ba331SAlex Elder 	list_for_each_entry(client_node, &rbd_client_list, node) {
3621f7ba331SAlex Elder 		if (!ceph_compare_options(ceph_opts, client_node->client)) {
3631f7ba331SAlex Elder 			kref_get(&client_node->kref);
3641f7ba331SAlex Elder 			found = true;
3651f7ba331SAlex Elder 			break;
3661f7ba331SAlex Elder 		}
3671f7ba331SAlex Elder 	}
3681f7ba331SAlex Elder 	spin_unlock(&rbd_client_list_lock);
3691f7ba331SAlex Elder 
3701f7ba331SAlex Elder 	return found ? client_node : NULL;
371602adf40SYehuda Sadeh }
372602adf40SYehuda Sadeh 
373602adf40SYehuda Sadeh /*
37459c2be1eSYehuda Sadeh  * mount options
37559c2be1eSYehuda Sadeh  */
37659c2be1eSYehuda Sadeh enum {
37759c2be1eSYehuda Sadeh 	Opt_last_int,
37859c2be1eSYehuda Sadeh 	/* int args above */
37959c2be1eSYehuda Sadeh 	Opt_last_string,
38059c2be1eSYehuda Sadeh 	/* string args above */
381cc0538b6SAlex Elder 	Opt_read_only,
382cc0538b6SAlex Elder 	Opt_read_write,
383cc0538b6SAlex Elder 	/* Boolean args above */
384cc0538b6SAlex Elder 	Opt_last_bool,
38559c2be1eSYehuda Sadeh };
38659c2be1eSYehuda Sadeh 
38743ae4701SAlex Elder static match_table_t rbd_opts_tokens = {
38859c2be1eSYehuda Sadeh 	/* int args above */
38959c2be1eSYehuda Sadeh 	/* string args above */
390f84344f3SAlex Elder 	{Opt_read_only, "mapping.read_only"},
391cc0538b6SAlex Elder 	{Opt_read_only, "ro"},		/* Alternate spelling */
392cc0538b6SAlex Elder 	{Opt_read_write, "read_write"},
393cc0538b6SAlex Elder 	{Opt_read_write, "rw"},		/* Alternate spelling */
394cc0538b6SAlex Elder 	/* Boolean args above */
39559c2be1eSYehuda Sadeh 	{-1, NULL}
39659c2be1eSYehuda Sadeh };
39759c2be1eSYehuda Sadeh 
39859c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
39959c2be1eSYehuda Sadeh {
40043ae4701SAlex Elder 	struct rbd_options *rbd_opts = private;
40159c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
40259c2be1eSYehuda Sadeh 	int token, intval, ret;
40359c2be1eSYehuda Sadeh 
40443ae4701SAlex Elder 	token = match_token(c, rbd_opts_tokens, argstr);
40559c2be1eSYehuda Sadeh 	if (token < 0)
40659c2be1eSYehuda Sadeh 		return -EINVAL;
40759c2be1eSYehuda Sadeh 
40859c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
40959c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
41059c2be1eSYehuda Sadeh 		if (ret < 0) {
41159c2be1eSYehuda Sadeh 			pr_err("bad mount option arg (not int) "
41259c2be1eSYehuda Sadeh 			       "at '%s'\n", c);
41359c2be1eSYehuda Sadeh 			return ret;
41459c2be1eSYehuda Sadeh 		}
41559c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
41659c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
41759c2be1eSYehuda Sadeh 		dout("got string token %d val %s\n", token,
41859c2be1eSYehuda Sadeh 		     argstr[0].from);
419cc0538b6SAlex Elder 	} else if (token > Opt_last_string && token < Opt_last_bool) {
420cc0538b6SAlex Elder 		dout("got Boolean token %d\n", token);
42159c2be1eSYehuda Sadeh 	} else {
42259c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
42359c2be1eSYehuda Sadeh 	}
42459c2be1eSYehuda Sadeh 
42559c2be1eSYehuda Sadeh 	switch (token) {
426cc0538b6SAlex Elder 	case Opt_read_only:
427cc0538b6SAlex Elder 		rbd_opts->read_only = true;
428cc0538b6SAlex Elder 		break;
429cc0538b6SAlex Elder 	case Opt_read_write:
430cc0538b6SAlex Elder 		rbd_opts->read_only = false;
431cc0538b6SAlex Elder 		break;
43259c2be1eSYehuda Sadeh 	default:
433aafb230eSAlex Elder 		rbd_assert(false);
434aafb230eSAlex Elder 		break;
43559c2be1eSYehuda Sadeh 	}
43659c2be1eSYehuda Sadeh 	return 0;
43759c2be1eSYehuda Sadeh }
43859c2be1eSYehuda Sadeh 
43959c2be1eSYehuda Sadeh /*
440602adf40SYehuda Sadeh  * Get a ceph client with specific addr and configuration, if one does
441602adf40SYehuda Sadeh  * not exist create it.
442602adf40SYehuda Sadeh  */
443f8c38929SAlex Elder static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
444f8c38929SAlex Elder 				size_t mon_addr_len, char *options)
445602adf40SYehuda Sadeh {
446f8c38929SAlex Elder 	struct rbd_options *rbd_opts = &rbd_dev->rbd_opts;
44743ae4701SAlex Elder 	struct ceph_options *ceph_opts;
448f8c38929SAlex Elder 	struct rbd_client *rbdc;
44959c2be1eSYehuda Sadeh 
450cc0538b6SAlex Elder 	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
451602adf40SYehuda Sadeh 
45243ae4701SAlex Elder 	ceph_opts = ceph_parse_options(options, mon_addr,
4535214ecc4SAlex Elder 					mon_addr + mon_addr_len,
45421079786SAlex Elder 					parse_rbd_opts_token, rbd_opts);
455f8c38929SAlex Elder 	if (IS_ERR(ceph_opts))
456f8c38929SAlex Elder 		return PTR_ERR(ceph_opts);
457602adf40SYehuda Sadeh 
4581f7ba331SAlex Elder 	rbdc = rbd_client_find(ceph_opts);
459602adf40SYehuda Sadeh 	if (rbdc) {
460e6994d3dSAlex Elder 		/* using an existing client */
46143ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
462f8c38929SAlex Elder 	} else {
463f8c38929SAlex Elder 		rbdc = rbd_client_create(ceph_opts);
464d720bcb0SAlex Elder 		if (IS_ERR(rbdc))
465f8c38929SAlex Elder 			return PTR_ERR(rbdc);
466f8c38929SAlex Elder 	}
467f8c38929SAlex Elder 	rbd_dev->rbd_client = rbdc;
468d720bcb0SAlex Elder 
469f8c38929SAlex Elder 	return 0;
470602adf40SYehuda Sadeh }
471602adf40SYehuda Sadeh 
472602adf40SYehuda Sadeh /*
473602adf40SYehuda Sadeh  * Destroy ceph client
474d23a4b3fSAlex Elder  *
475432b8587SAlex Elder  * Caller must hold rbd_client_list_lock.
476602adf40SYehuda Sadeh  */
477602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
478602adf40SYehuda Sadeh {
479602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
480602adf40SYehuda Sadeh 
481602adf40SYehuda Sadeh 	dout("rbd_release_client %p\n", rbdc);
482cd9d9f5dSAlex Elder 	spin_lock(&rbd_client_list_lock);
483602adf40SYehuda Sadeh 	list_del(&rbdc->node);
484cd9d9f5dSAlex Elder 	spin_unlock(&rbd_client_list_lock);
485602adf40SYehuda Sadeh 
486602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
487602adf40SYehuda Sadeh 	kfree(rbdc);
488602adf40SYehuda Sadeh }
489602adf40SYehuda Sadeh 
490602adf40SYehuda Sadeh /*
491602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
492602adf40SYehuda Sadeh  * it.
493602adf40SYehuda Sadeh  */
494602adf40SYehuda Sadeh static void rbd_put_client(struct rbd_device *rbd_dev)
495602adf40SYehuda Sadeh {
496602adf40SYehuda Sadeh 	kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
497602adf40SYehuda Sadeh 	rbd_dev->rbd_client = NULL;
498602adf40SYehuda Sadeh }
499602adf40SYehuda Sadeh 
5001fec7093SYehuda Sadeh /*
5011fec7093SYehuda Sadeh  * Destroy requests collection
5021fec7093SYehuda Sadeh  */
5031fec7093SYehuda Sadeh static void rbd_coll_release(struct kref *kref)
5041fec7093SYehuda Sadeh {
5051fec7093SYehuda Sadeh 	struct rbd_req_coll *coll =
5061fec7093SYehuda Sadeh 		container_of(kref, struct rbd_req_coll, kref);
5071fec7093SYehuda Sadeh 
5081fec7093SYehuda Sadeh 	dout("rbd_coll_release %p\n", coll);
5091fec7093SYehuda Sadeh 	kfree(coll);
5101fec7093SYehuda Sadeh }
511602adf40SYehuda Sadeh 
512a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format)
513a30b71b9SAlex Elder {
514a30b71b9SAlex Elder 	return image_format == 1 || image_format == 2;
515a30b71b9SAlex Elder }
516a30b71b9SAlex Elder 
5178e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
5188e94af8eSAlex Elder {
519103a150fSAlex Elder 	size_t size;
520103a150fSAlex Elder 	u32 snap_count;
521103a150fSAlex Elder 
522103a150fSAlex Elder 	/* The header has to start with the magic rbd header text */
523103a150fSAlex Elder 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
524103a150fSAlex Elder 		return false;
525103a150fSAlex Elder 
526103a150fSAlex Elder 	/*
527103a150fSAlex Elder 	 * The size of a snapshot header has to fit in a size_t, and
528103a150fSAlex Elder 	 * that limits the number of snapshots.
529103a150fSAlex Elder 	 */
530103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
531103a150fSAlex Elder 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
532103a150fSAlex Elder 	if (snap_count > size / sizeof (__le64))
533103a150fSAlex Elder 		return false;
534103a150fSAlex Elder 
535103a150fSAlex Elder 	/*
536103a150fSAlex Elder 	 * Not only that, but the size of the entire the snapshot
537103a150fSAlex Elder 	 * header must also be representable in a size_t.
538103a150fSAlex Elder 	 */
539103a150fSAlex Elder 	size -= snap_count * sizeof (__le64);
540103a150fSAlex Elder 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
541103a150fSAlex Elder 		return false;
542103a150fSAlex Elder 
543103a150fSAlex Elder 	return true;
5448e94af8eSAlex Elder }
5458e94af8eSAlex Elder 
546602adf40SYehuda Sadeh /*
547602adf40SYehuda Sadeh  * Create a new header structure, translate header format from the on-disk
548602adf40SYehuda Sadeh  * header.
549602adf40SYehuda Sadeh  */
550602adf40SYehuda Sadeh static int rbd_header_from_disk(struct rbd_image_header *header,
5514156d998SAlex Elder 				 struct rbd_image_header_ondisk *ondisk)
552602adf40SYehuda Sadeh {
553ccece235SAlex Elder 	u32 snap_count;
55458c17b0eSAlex Elder 	size_t len;
555d2bb24e5SAlex Elder 	size_t size;
556621901d6SAlex Elder 	u32 i;
557602adf40SYehuda Sadeh 
5586a52325fSAlex Elder 	memset(header, 0, sizeof (*header));
5596a52325fSAlex Elder 
560103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
561103a150fSAlex Elder 
56258c17b0eSAlex Elder 	len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
56358c17b0eSAlex Elder 	header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
5646a52325fSAlex Elder 	if (!header->object_prefix)
565602adf40SYehuda Sadeh 		return -ENOMEM;
56658c17b0eSAlex Elder 	memcpy(header->object_prefix, ondisk->object_prefix, len);
56758c17b0eSAlex Elder 	header->object_prefix[len] = '\0';
56800f1f36fSAlex Elder 
569602adf40SYehuda Sadeh 	if (snap_count) {
570f785cc1dSAlex Elder 		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
571f785cc1dSAlex Elder 
572621901d6SAlex Elder 		/* Save a copy of the snapshot names */
573621901d6SAlex Elder 
574f785cc1dSAlex Elder 		if (snap_names_len > (u64) SIZE_MAX)
575f785cc1dSAlex Elder 			return -EIO;
576f785cc1dSAlex Elder 		header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
577602adf40SYehuda Sadeh 		if (!header->snap_names)
5786a52325fSAlex Elder 			goto out_err;
579f785cc1dSAlex Elder 		/*
580f785cc1dSAlex Elder 		 * Note that rbd_dev_v1_header_read() guarantees
581f785cc1dSAlex Elder 		 * the ondisk buffer we're working with has
582f785cc1dSAlex Elder 		 * snap_names_len bytes beyond the end of the
583f785cc1dSAlex Elder 		 * snapshot id array, this memcpy() is safe.
584f785cc1dSAlex Elder 		 */
585f785cc1dSAlex Elder 		memcpy(header->snap_names, &ondisk->snaps[snap_count],
586f785cc1dSAlex Elder 			snap_names_len);
5876a52325fSAlex Elder 
588621901d6SAlex Elder 		/* Record each snapshot's size */
589621901d6SAlex Elder 
590d2bb24e5SAlex Elder 		size = snap_count * sizeof (*header->snap_sizes);
591d2bb24e5SAlex Elder 		header->snap_sizes = kmalloc(size, GFP_KERNEL);
592602adf40SYehuda Sadeh 		if (!header->snap_sizes)
5936a52325fSAlex Elder 			goto out_err;
594621901d6SAlex Elder 		for (i = 0; i < snap_count; i++)
595621901d6SAlex Elder 			header->snap_sizes[i] =
596621901d6SAlex Elder 				le64_to_cpu(ondisk->snaps[i].image_size);
597602adf40SYehuda Sadeh 	} else {
598ccece235SAlex Elder 		WARN_ON(ondisk->snap_names_len);
599602adf40SYehuda Sadeh 		header->snap_names = NULL;
600602adf40SYehuda Sadeh 		header->snap_sizes = NULL;
601602adf40SYehuda Sadeh 	}
602849b4260SAlex Elder 
60334b13184SAlex Elder 	header->features = 0;	/* No features support in v1 images */
604602adf40SYehuda Sadeh 	header->obj_order = ondisk->options.order;
605602adf40SYehuda Sadeh 	header->crypt_type = ondisk->options.crypt_type;
606602adf40SYehuda Sadeh 	header->comp_type = ondisk->options.comp_type;
6076a52325fSAlex Elder 
608621901d6SAlex Elder 	/* Allocate and fill in the snapshot context */
609621901d6SAlex Elder 
610f84344f3SAlex Elder 	header->image_size = le64_to_cpu(ondisk->image_size);
6116a52325fSAlex Elder 	size = sizeof (struct ceph_snap_context);
6126a52325fSAlex Elder 	size += snap_count * sizeof (header->snapc->snaps[0]);
6136a52325fSAlex Elder 	header->snapc = kzalloc(size, GFP_KERNEL);
6146a52325fSAlex Elder 	if (!header->snapc)
6156a52325fSAlex Elder 		goto out_err;
616602adf40SYehuda Sadeh 
617602adf40SYehuda Sadeh 	atomic_set(&header->snapc->nref, 1);
618505cbb9bSAlex Elder 	header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
619602adf40SYehuda Sadeh 	header->snapc->num_snaps = snap_count;
620621901d6SAlex Elder 	for (i = 0; i < snap_count; i++)
621602adf40SYehuda Sadeh 		header->snapc->snaps[i] =
622602adf40SYehuda Sadeh 			le64_to_cpu(ondisk->snaps[i].id);
623602adf40SYehuda Sadeh 
624602adf40SYehuda Sadeh 	return 0;
625602adf40SYehuda Sadeh 
6266a52325fSAlex Elder out_err:
627849b4260SAlex Elder 	kfree(header->snap_sizes);
628ccece235SAlex Elder 	header->snap_sizes = NULL;
629602adf40SYehuda Sadeh 	kfree(header->snap_names);
630ccece235SAlex Elder 	header->snap_names = NULL;
6316a52325fSAlex Elder 	kfree(header->object_prefix);
6326a52325fSAlex Elder 	header->object_prefix = NULL;
633ccece235SAlex Elder 
63400f1f36fSAlex Elder 	return -ENOMEM;
635602adf40SYehuda Sadeh }
636602adf40SYehuda Sadeh 
6378836b995SAlex Elder static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
638602adf40SYehuda Sadeh {
639602adf40SYehuda Sadeh 
640e86924a8SAlex Elder 	struct rbd_snap *snap;
64100f1f36fSAlex Elder 
642e86924a8SAlex Elder 	list_for_each_entry(snap, &rbd_dev->snaps, node) {
643e86924a8SAlex Elder 		if (!strcmp(snap_name, snap->name)) {
644e86924a8SAlex Elder 			rbd_dev->mapping.snap_id = snap->id;
645e86924a8SAlex Elder 			rbd_dev->mapping.size = snap->size;
64634b13184SAlex Elder 			rbd_dev->mapping.features = snap->features;
64700f1f36fSAlex Elder 
648e86924a8SAlex Elder 			return 0;
649602adf40SYehuda Sadeh 		}
65000f1f36fSAlex Elder 	}
651e86924a8SAlex Elder 
65200f1f36fSAlex Elder 	return -ENOENT;
65300f1f36fSAlex Elder }
654602adf40SYehuda Sadeh 
6555ed16177SAlex Elder static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name)
656602adf40SYehuda Sadeh {
65778dc447dSAlex Elder 	int ret;
658602adf40SYehuda Sadeh 
6594e1105a2SAlex Elder 	if (!memcmp(snap_name, RBD_SNAP_HEAD_NAME,
660cc9d734cSJosh Durgin 		    sizeof (RBD_SNAP_HEAD_NAME))) {
661f84344f3SAlex Elder 		rbd_dev->mapping.snap_id = CEPH_NOSNAP;
66299c1f08fSAlex Elder 		rbd_dev->mapping.size = rbd_dev->header.image_size;
66334b13184SAlex Elder 		rbd_dev->mapping.features = rbd_dev->header.features;
664f84344f3SAlex Elder 		rbd_dev->mapping.snap_exists = false;
665f84344f3SAlex Elder 		rbd_dev->mapping.read_only = rbd_dev->rbd_opts.read_only;
666e86924a8SAlex Elder 		ret = 0;
667602adf40SYehuda Sadeh 	} else {
6688836b995SAlex Elder 		ret = snap_by_name(rbd_dev, snap_name);
669602adf40SYehuda Sadeh 		if (ret < 0)
670602adf40SYehuda Sadeh 			goto done;
671f84344f3SAlex Elder 		rbd_dev->mapping.snap_exists = true;
672f84344f3SAlex Elder 		rbd_dev->mapping.read_only = true;
673602adf40SYehuda Sadeh 	}
6744e1105a2SAlex Elder 	rbd_dev->mapping.snap_name = snap_name;
675602adf40SYehuda Sadeh done:
676602adf40SYehuda Sadeh 	return ret;
677602adf40SYehuda Sadeh }
678602adf40SYehuda Sadeh 
679602adf40SYehuda Sadeh static void rbd_header_free(struct rbd_image_header *header)
680602adf40SYehuda Sadeh {
681849b4260SAlex Elder 	kfree(header->object_prefix);
682d78fd7aeSAlex Elder 	header->object_prefix = NULL;
683602adf40SYehuda Sadeh 	kfree(header->snap_sizes);
684d78fd7aeSAlex Elder 	header->snap_sizes = NULL;
685849b4260SAlex Elder 	kfree(header->snap_names);
686d78fd7aeSAlex Elder 	header->snap_names = NULL;
687d1d25646SJosh Durgin 	ceph_put_snap_context(header->snapc);
688d78fd7aeSAlex Elder 	header->snapc = NULL;
689602adf40SYehuda Sadeh }
690602adf40SYehuda Sadeh 
69165ccfe21SAlex Elder static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
692602adf40SYehuda Sadeh {
69365ccfe21SAlex Elder 	char *name;
69465ccfe21SAlex Elder 	u64 segment;
69565ccfe21SAlex Elder 	int ret;
696602adf40SYehuda Sadeh 
69765ccfe21SAlex Elder 	name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
69865ccfe21SAlex Elder 	if (!name)
69965ccfe21SAlex Elder 		return NULL;
70065ccfe21SAlex Elder 	segment = offset >> rbd_dev->header.obj_order;
70165ccfe21SAlex Elder 	ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
70265ccfe21SAlex Elder 			rbd_dev->header.object_prefix, segment);
70365ccfe21SAlex Elder 	if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
70465ccfe21SAlex Elder 		pr_err("error formatting segment name for #%llu (%d)\n",
70565ccfe21SAlex Elder 			segment, ret);
70665ccfe21SAlex Elder 		kfree(name);
70765ccfe21SAlex Elder 		name = NULL;
70865ccfe21SAlex Elder 	}
709602adf40SYehuda Sadeh 
71065ccfe21SAlex Elder 	return name;
71165ccfe21SAlex Elder }
712602adf40SYehuda Sadeh 
71365ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
71465ccfe21SAlex Elder {
71565ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
716602adf40SYehuda Sadeh 
71765ccfe21SAlex Elder 	return offset & (segment_size - 1);
71865ccfe21SAlex Elder }
71965ccfe21SAlex Elder 
72065ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev,
72165ccfe21SAlex Elder 				u64 offset, u64 length)
72265ccfe21SAlex Elder {
72365ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
72465ccfe21SAlex Elder 
72565ccfe21SAlex Elder 	offset &= segment_size - 1;
72665ccfe21SAlex Elder 
727aafb230eSAlex Elder 	rbd_assert(length <= U64_MAX - offset);
72865ccfe21SAlex Elder 	if (offset + length > segment_size)
72965ccfe21SAlex Elder 		length = segment_size - offset;
73065ccfe21SAlex Elder 
73165ccfe21SAlex Elder 	return length;
732602adf40SYehuda Sadeh }
733602adf40SYehuda Sadeh 
7341fec7093SYehuda Sadeh static int rbd_get_num_segments(struct rbd_image_header *header,
7351fec7093SYehuda Sadeh 				u64 ofs, u64 len)
7361fec7093SYehuda Sadeh {
737df111be6SAlex Elder 	u64 start_seg;
738df111be6SAlex Elder 	u64 end_seg;
739df111be6SAlex Elder 
740df111be6SAlex Elder 	if (!len)
741df111be6SAlex Elder 		return 0;
742df111be6SAlex Elder 	if (len - 1 > U64_MAX - ofs)
743df111be6SAlex Elder 		return -ERANGE;
744df111be6SAlex Elder 
745df111be6SAlex Elder 	start_seg = ofs >> header->obj_order;
746df111be6SAlex Elder 	end_seg = (ofs + len - 1) >> header->obj_order;
747df111be6SAlex Elder 
7481fec7093SYehuda Sadeh 	return end_seg - start_seg + 1;
7491fec7093SYehuda Sadeh }
7501fec7093SYehuda Sadeh 
751602adf40SYehuda Sadeh /*
752029bcbd8SJosh Durgin  * returns the size of an object in the image
753029bcbd8SJosh Durgin  */
754029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header)
755029bcbd8SJosh Durgin {
756029bcbd8SJosh Durgin 	return 1 << header->obj_order;
757029bcbd8SJosh Durgin }
758029bcbd8SJosh Durgin 
759029bcbd8SJosh Durgin /*
760602adf40SYehuda Sadeh  * bio helpers
761602adf40SYehuda Sadeh  */
762602adf40SYehuda Sadeh 
763602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain)
764602adf40SYehuda Sadeh {
765602adf40SYehuda Sadeh 	struct bio *tmp;
766602adf40SYehuda Sadeh 
767602adf40SYehuda Sadeh 	while (chain) {
768602adf40SYehuda Sadeh 		tmp = chain;
769602adf40SYehuda Sadeh 		chain = chain->bi_next;
770602adf40SYehuda Sadeh 		bio_put(tmp);
771602adf40SYehuda Sadeh 	}
772602adf40SYehuda Sadeh }
773602adf40SYehuda Sadeh 
774602adf40SYehuda Sadeh /*
775602adf40SYehuda Sadeh  * zeros a bio chain, starting at specific offset
776602adf40SYehuda Sadeh  */
777602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs)
778602adf40SYehuda Sadeh {
779602adf40SYehuda Sadeh 	struct bio_vec *bv;
780602adf40SYehuda Sadeh 	unsigned long flags;
781602adf40SYehuda Sadeh 	void *buf;
782602adf40SYehuda Sadeh 	int i;
783602adf40SYehuda Sadeh 	int pos = 0;
784602adf40SYehuda Sadeh 
785602adf40SYehuda Sadeh 	while (chain) {
786602adf40SYehuda Sadeh 		bio_for_each_segment(bv, chain, i) {
787602adf40SYehuda Sadeh 			if (pos + bv->bv_len > start_ofs) {
788602adf40SYehuda Sadeh 				int remainder = max(start_ofs - pos, 0);
789602adf40SYehuda Sadeh 				buf = bvec_kmap_irq(bv, &flags);
790602adf40SYehuda Sadeh 				memset(buf + remainder, 0,
791602adf40SYehuda Sadeh 				       bv->bv_len - remainder);
79285b5aaa6SDan Carpenter 				bvec_kunmap_irq(buf, &flags);
793602adf40SYehuda Sadeh 			}
794602adf40SYehuda Sadeh 			pos += bv->bv_len;
795602adf40SYehuda Sadeh 		}
796602adf40SYehuda Sadeh 
797602adf40SYehuda Sadeh 		chain = chain->bi_next;
798602adf40SYehuda Sadeh 	}
799602adf40SYehuda Sadeh }
800602adf40SYehuda Sadeh 
801602adf40SYehuda Sadeh /*
802602adf40SYehuda Sadeh  * bio_chain_clone - clone a chain of bios up to a certain length.
803602adf40SYehuda Sadeh  * might return a bio_pair that will need to be released.
804602adf40SYehuda Sadeh  */
805602adf40SYehuda Sadeh static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
806602adf40SYehuda Sadeh 				   struct bio_pair **bp,
807602adf40SYehuda Sadeh 				   int len, gfp_t gfpmask)
808602adf40SYehuda Sadeh {
809542582fcSAlex Elder 	struct bio *old_chain = *old;
810542582fcSAlex Elder 	struct bio *new_chain = NULL;
811542582fcSAlex Elder 	struct bio *tail;
812602adf40SYehuda Sadeh 	int total = 0;
813602adf40SYehuda Sadeh 
814602adf40SYehuda Sadeh 	if (*bp) {
815602adf40SYehuda Sadeh 		bio_pair_release(*bp);
816602adf40SYehuda Sadeh 		*bp = NULL;
817602adf40SYehuda Sadeh 	}
818602adf40SYehuda Sadeh 
819602adf40SYehuda Sadeh 	while (old_chain && (total < len)) {
820542582fcSAlex Elder 		struct bio *tmp;
821542582fcSAlex Elder 
822602adf40SYehuda Sadeh 		tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
823602adf40SYehuda Sadeh 		if (!tmp)
824602adf40SYehuda Sadeh 			goto err_out;
825542582fcSAlex Elder 		gfpmask &= ~__GFP_WAIT;	/* can't wait after the first */
826602adf40SYehuda Sadeh 
827602adf40SYehuda Sadeh 		if (total + old_chain->bi_size > len) {
828602adf40SYehuda Sadeh 			struct bio_pair *bp;
829602adf40SYehuda Sadeh 
830602adf40SYehuda Sadeh 			/*
831602adf40SYehuda Sadeh 			 * this split can only happen with a single paged bio,
832602adf40SYehuda Sadeh 			 * split_bio will BUG_ON if this is not the case
833602adf40SYehuda Sadeh 			 */
834602adf40SYehuda Sadeh 			dout("bio_chain_clone split! total=%d remaining=%d"
835bd919d45SAlex Elder 			     "bi_size=%u\n",
836bd919d45SAlex Elder 			     total, len - total, old_chain->bi_size);
837602adf40SYehuda Sadeh 
838602adf40SYehuda Sadeh 			/* split the bio. We'll release it either in the next
839602adf40SYehuda Sadeh 			   call, or it will have to be released outside */
840593a9e7bSAlex Elder 			bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
841602adf40SYehuda Sadeh 			if (!bp)
842602adf40SYehuda Sadeh 				goto err_out;
843602adf40SYehuda Sadeh 
844602adf40SYehuda Sadeh 			__bio_clone(tmp, &bp->bio1);
845602adf40SYehuda Sadeh 
846602adf40SYehuda Sadeh 			*next = &bp->bio2;
847602adf40SYehuda Sadeh 		} else {
848602adf40SYehuda Sadeh 			__bio_clone(tmp, old_chain);
849602adf40SYehuda Sadeh 			*next = old_chain->bi_next;
850602adf40SYehuda Sadeh 		}
851602adf40SYehuda Sadeh 
852602adf40SYehuda Sadeh 		tmp->bi_bdev = NULL;
853602adf40SYehuda Sadeh 		tmp->bi_next = NULL;
854542582fcSAlex Elder 		if (new_chain)
855602adf40SYehuda Sadeh 			tail->bi_next = tmp;
856542582fcSAlex Elder 		else
857542582fcSAlex Elder 			new_chain = tmp;
858602adf40SYehuda Sadeh 		tail = tmp;
859602adf40SYehuda Sadeh 		old_chain = old_chain->bi_next;
860602adf40SYehuda Sadeh 
861602adf40SYehuda Sadeh 		total += tmp->bi_size;
862602adf40SYehuda Sadeh 	}
863602adf40SYehuda Sadeh 
864aafb230eSAlex Elder 	rbd_assert(total == len);
865602adf40SYehuda Sadeh 
866602adf40SYehuda Sadeh 	*old = old_chain;
867602adf40SYehuda Sadeh 
868602adf40SYehuda Sadeh 	return new_chain;
869602adf40SYehuda Sadeh 
870602adf40SYehuda Sadeh err_out:
871602adf40SYehuda Sadeh 	dout("bio_chain_clone with err\n");
872602adf40SYehuda Sadeh 	bio_chain_put(new_chain);
873602adf40SYehuda Sadeh 	return NULL;
874602adf40SYehuda Sadeh }
875602adf40SYehuda Sadeh 
876602adf40SYehuda Sadeh /*
877602adf40SYehuda Sadeh  * helpers for osd request op vectors.
878602adf40SYehuda Sadeh  */
87957cfc106SAlex Elder static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
88057cfc106SAlex Elder 					int opcode, u32 payload_len)
881602adf40SYehuda Sadeh {
88257cfc106SAlex Elder 	struct ceph_osd_req_op *ops;
88357cfc106SAlex Elder 
88457cfc106SAlex Elder 	ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
88557cfc106SAlex Elder 	if (!ops)
88657cfc106SAlex Elder 		return NULL;
88757cfc106SAlex Elder 
88857cfc106SAlex Elder 	ops[0].op = opcode;
88957cfc106SAlex Elder 
890602adf40SYehuda Sadeh 	/*
891602adf40SYehuda Sadeh 	 * op extent offset and length will be set later on
892602adf40SYehuda Sadeh 	 * in calc_raw_layout()
893602adf40SYehuda Sadeh 	 */
89457cfc106SAlex Elder 	ops[0].payload_len = payload_len;
89557cfc106SAlex Elder 
89657cfc106SAlex Elder 	return ops;
897602adf40SYehuda Sadeh }
898602adf40SYehuda Sadeh 
/*
 * Free an op vector obtained from rbd_create_rw_ops().
 * The vector is a single allocation, so one kfree() releases it.
 */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
903602adf40SYehuda Sadeh 
9041fec7093SYehuda Sadeh static void rbd_coll_end_req_index(struct request *rq,
9051fec7093SYehuda Sadeh 				   struct rbd_req_coll *coll,
9061fec7093SYehuda Sadeh 				   int index,
9071fec7093SYehuda Sadeh 				   int ret, u64 len)
9081fec7093SYehuda Sadeh {
9091fec7093SYehuda Sadeh 	struct request_queue *q;
9101fec7093SYehuda Sadeh 	int min, max, i;
9111fec7093SYehuda Sadeh 
912bd919d45SAlex Elder 	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
913bd919d45SAlex Elder 	     coll, index, ret, (unsigned long long) len);
9141fec7093SYehuda Sadeh 
9151fec7093SYehuda Sadeh 	if (!rq)
9161fec7093SYehuda Sadeh 		return;
9171fec7093SYehuda Sadeh 
9181fec7093SYehuda Sadeh 	if (!coll) {
9191fec7093SYehuda Sadeh 		blk_end_request(rq, ret, len);
9201fec7093SYehuda Sadeh 		return;
9211fec7093SYehuda Sadeh 	}
9221fec7093SYehuda Sadeh 
9231fec7093SYehuda Sadeh 	q = rq->q;
9241fec7093SYehuda Sadeh 
9251fec7093SYehuda Sadeh 	spin_lock_irq(q->queue_lock);
9261fec7093SYehuda Sadeh 	coll->status[index].done = 1;
9271fec7093SYehuda Sadeh 	coll->status[index].rc = ret;
9281fec7093SYehuda Sadeh 	coll->status[index].bytes = len;
9291fec7093SYehuda Sadeh 	max = min = coll->num_done;
9301fec7093SYehuda Sadeh 	while (max < coll->total && coll->status[max].done)
9311fec7093SYehuda Sadeh 		max++;
9321fec7093SYehuda Sadeh 
9331fec7093SYehuda Sadeh 	for (i = min; i<max; i++) {
9341fec7093SYehuda Sadeh 		__blk_end_request(rq, coll->status[i].rc,
9351fec7093SYehuda Sadeh 				  coll->status[i].bytes);
9361fec7093SYehuda Sadeh 		coll->num_done++;
9371fec7093SYehuda Sadeh 		kref_put(&coll->kref, rbd_coll_release);
9381fec7093SYehuda Sadeh 	}
9391fec7093SYehuda Sadeh 	spin_unlock_irq(q->queue_lock);
9401fec7093SYehuda Sadeh }
9411fec7093SYehuda Sadeh 
9421fec7093SYehuda Sadeh static void rbd_coll_end_req(struct rbd_request *req,
9431fec7093SYehuda Sadeh 			     int ret, u64 len)
9441fec7093SYehuda Sadeh {
9451fec7093SYehuda Sadeh 	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
9461fec7093SYehuda Sadeh }
9471fec7093SYehuda Sadeh 
948602adf40SYehuda Sadeh /*
949602adf40SYehuda Sadeh  * Send ceph osd request
950602adf40SYehuda Sadeh  */
951602adf40SYehuda Sadeh static int rbd_do_request(struct request *rq,
9520ce1a794SAlex Elder 			  struct rbd_device *rbd_dev,
953602adf40SYehuda Sadeh 			  struct ceph_snap_context *snapc,
954602adf40SYehuda Sadeh 			  u64 snapid,
955aded07eaSAlex Elder 			  const char *object_name, u64 ofs, u64 len,
956602adf40SYehuda Sadeh 			  struct bio *bio,
957602adf40SYehuda Sadeh 			  struct page **pages,
958602adf40SYehuda Sadeh 			  int num_pages,
959602adf40SYehuda Sadeh 			  int flags,
960602adf40SYehuda Sadeh 			  struct ceph_osd_req_op *ops,
9611fec7093SYehuda Sadeh 			  struct rbd_req_coll *coll,
9621fec7093SYehuda Sadeh 			  int coll_index,
963602adf40SYehuda Sadeh 			  void (*rbd_cb)(struct ceph_osd_request *req,
96459c2be1eSYehuda Sadeh 					 struct ceph_msg *msg),
96559c2be1eSYehuda Sadeh 			  struct ceph_osd_request **linger_req,
96659c2be1eSYehuda Sadeh 			  u64 *ver)
967602adf40SYehuda Sadeh {
968602adf40SYehuda Sadeh 	struct ceph_osd_request *req;
969602adf40SYehuda Sadeh 	struct ceph_file_layout *layout;
970602adf40SYehuda Sadeh 	int ret;
971602adf40SYehuda Sadeh 	u64 bno;
972602adf40SYehuda Sadeh 	struct timespec mtime = CURRENT_TIME;
973602adf40SYehuda Sadeh 	struct rbd_request *req_data;
974602adf40SYehuda Sadeh 	struct ceph_osd_request_head *reqhead;
9751dbb4399SAlex Elder 	struct ceph_osd_client *osdc;
976602adf40SYehuda Sadeh 
977602adf40SYehuda Sadeh 	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
9781fec7093SYehuda Sadeh 	if (!req_data) {
9791fec7093SYehuda Sadeh 		if (coll)
9801fec7093SYehuda Sadeh 			rbd_coll_end_req_index(rq, coll, coll_index,
9811fec7093SYehuda Sadeh 					       -ENOMEM, len);
9821fec7093SYehuda Sadeh 		return -ENOMEM;
9831fec7093SYehuda Sadeh 	}
984602adf40SYehuda Sadeh 
9851fec7093SYehuda Sadeh 	if (coll) {
9861fec7093SYehuda Sadeh 		req_data->coll = coll;
9871fec7093SYehuda Sadeh 		req_data->coll_index = coll_index;
9881fec7093SYehuda Sadeh 	}
9891fec7093SYehuda Sadeh 
990bd919d45SAlex Elder 	dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name,
991bd919d45SAlex Elder 		(unsigned long long) ofs, (unsigned long long) len);
992602adf40SYehuda Sadeh 
9930ce1a794SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
9941dbb4399SAlex Elder 	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
9951dbb4399SAlex Elder 					false, GFP_NOIO, pages, bio);
9964ad12621SSage Weil 	if (!req) {
9974ad12621SSage Weil 		ret = -ENOMEM;
998602adf40SYehuda Sadeh 		goto done_pages;
999602adf40SYehuda Sadeh 	}
1000602adf40SYehuda Sadeh 
1001602adf40SYehuda Sadeh 	req->r_callback = rbd_cb;
1002602adf40SYehuda Sadeh 
1003602adf40SYehuda Sadeh 	req_data->rq = rq;
1004602adf40SYehuda Sadeh 	req_data->bio = bio;
1005602adf40SYehuda Sadeh 	req_data->pages = pages;
1006602adf40SYehuda Sadeh 	req_data->len = len;
1007602adf40SYehuda Sadeh 
1008602adf40SYehuda Sadeh 	req->r_priv = req_data;
1009602adf40SYehuda Sadeh 
1010602adf40SYehuda Sadeh 	reqhead = req->r_request->front.iov_base;
1011602adf40SYehuda Sadeh 	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
1012602adf40SYehuda Sadeh 
1013aded07eaSAlex Elder 	strncpy(req->r_oid, object_name, sizeof(req->r_oid));
1014602adf40SYehuda Sadeh 	req->r_oid_len = strlen(req->r_oid);
1015602adf40SYehuda Sadeh 
1016602adf40SYehuda Sadeh 	layout = &req->r_file_layout;
1017602adf40SYehuda Sadeh 	memset(layout, 0, sizeof(*layout));
1018602adf40SYehuda Sadeh 	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
1019602adf40SYehuda Sadeh 	layout->fl_stripe_count = cpu_to_le32(1);
1020602adf40SYehuda Sadeh 	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
10210ce1a794SAlex Elder 	layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
10221dbb4399SAlex Elder 	ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
10231dbb4399SAlex Elder 				req, ops);
1024602adf40SYehuda Sadeh 
1025602adf40SYehuda Sadeh 	ceph_osdc_build_request(req, ofs, &len,
1026602adf40SYehuda Sadeh 				ops,
1027602adf40SYehuda Sadeh 				snapc,
1028602adf40SYehuda Sadeh 				&mtime,
1029602adf40SYehuda Sadeh 				req->r_oid, req->r_oid_len);
1030602adf40SYehuda Sadeh 
103159c2be1eSYehuda Sadeh 	if (linger_req) {
10321dbb4399SAlex Elder 		ceph_osdc_set_request_linger(osdc, req);
103359c2be1eSYehuda Sadeh 		*linger_req = req;
103459c2be1eSYehuda Sadeh 	}
103559c2be1eSYehuda Sadeh 
10361dbb4399SAlex Elder 	ret = ceph_osdc_start_request(osdc, req, false);
1037602adf40SYehuda Sadeh 	if (ret < 0)
1038602adf40SYehuda Sadeh 		goto done_err;
1039602adf40SYehuda Sadeh 
1040602adf40SYehuda Sadeh 	if (!rbd_cb) {
10411dbb4399SAlex Elder 		ret = ceph_osdc_wait_request(osdc, req);
104259c2be1eSYehuda Sadeh 		if (ver)
104359c2be1eSYehuda Sadeh 			*ver = le64_to_cpu(req->r_reassert_version.version);
1044bd919d45SAlex Elder 		dout("reassert_ver=%llu\n",
1045bd919d45SAlex Elder 			(unsigned long long)
10461fec7093SYehuda Sadeh 				le64_to_cpu(req->r_reassert_version.version));
1047602adf40SYehuda Sadeh 		ceph_osdc_put_request(req);
1048602adf40SYehuda Sadeh 	}
1049602adf40SYehuda Sadeh 	return ret;
1050602adf40SYehuda Sadeh 
1051602adf40SYehuda Sadeh done_err:
1052602adf40SYehuda Sadeh 	bio_chain_put(req_data->bio);
1053602adf40SYehuda Sadeh 	ceph_osdc_put_request(req);
1054602adf40SYehuda Sadeh done_pages:
10551fec7093SYehuda Sadeh 	rbd_coll_end_req(req_data, ret, len);
1056602adf40SYehuda Sadeh 	kfree(req_data);
1057602adf40SYehuda Sadeh 	return ret;
1058602adf40SYehuda Sadeh }
1059602adf40SYehuda Sadeh 
1060602adf40SYehuda Sadeh /*
1061602adf40SYehuda Sadeh  * Ceph osd op callback
1062602adf40SYehuda Sadeh  */
1063602adf40SYehuda Sadeh static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
1064602adf40SYehuda Sadeh {
1065602adf40SYehuda Sadeh 	struct rbd_request *req_data = req->r_priv;
1066602adf40SYehuda Sadeh 	struct ceph_osd_reply_head *replyhead;
1067602adf40SYehuda Sadeh 	struct ceph_osd_op *op;
1068602adf40SYehuda Sadeh 	__s32 rc;
1069602adf40SYehuda Sadeh 	u64 bytes;
1070602adf40SYehuda Sadeh 	int read_op;
1071602adf40SYehuda Sadeh 
1072602adf40SYehuda Sadeh 	/* parse reply */
1073602adf40SYehuda Sadeh 	replyhead = msg->front.iov_base;
1074602adf40SYehuda Sadeh 	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
1075602adf40SYehuda Sadeh 	op = (void *)(replyhead + 1);
1076602adf40SYehuda Sadeh 	rc = le32_to_cpu(replyhead->result);
1077602adf40SYehuda Sadeh 	bytes = le64_to_cpu(op->extent.length);
1078895cfcc8SDan Carpenter 	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
1079602adf40SYehuda Sadeh 
1080bd919d45SAlex Elder 	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
1081bd919d45SAlex Elder 		(unsigned long long) bytes, read_op, (int) rc);
1082602adf40SYehuda Sadeh 
1083602adf40SYehuda Sadeh 	if (rc == -ENOENT && read_op) {
1084602adf40SYehuda Sadeh 		zero_bio_chain(req_data->bio, 0);
1085602adf40SYehuda Sadeh 		rc = 0;
1086602adf40SYehuda Sadeh 	} else if (rc == 0 && read_op && bytes < req_data->len) {
1087602adf40SYehuda Sadeh 		zero_bio_chain(req_data->bio, bytes);
1088602adf40SYehuda Sadeh 		bytes = req_data->len;
1089602adf40SYehuda Sadeh 	}
1090602adf40SYehuda Sadeh 
10911fec7093SYehuda Sadeh 	rbd_coll_end_req(req_data, rc, bytes);
1092602adf40SYehuda Sadeh 
1093602adf40SYehuda Sadeh 	if (req_data->bio)
1094602adf40SYehuda Sadeh 		bio_chain_put(req_data->bio);
1095602adf40SYehuda Sadeh 
1096602adf40SYehuda Sadeh 	ceph_osdc_put_request(req);
1097602adf40SYehuda Sadeh 	kfree(req_data);
1098602adf40SYehuda Sadeh }
1099602adf40SYehuda Sadeh 
110059c2be1eSYehuda Sadeh static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
110159c2be1eSYehuda Sadeh {
110259c2be1eSYehuda Sadeh 	ceph_osdc_put_request(req);
110359c2be1eSYehuda Sadeh }
110459c2be1eSYehuda Sadeh 
1105602adf40SYehuda Sadeh /*
1106602adf40SYehuda Sadeh  * Do a synchronous ceph osd operation
1107602adf40SYehuda Sadeh  */
11080ce1a794SAlex Elder static int rbd_req_sync_op(struct rbd_device *rbd_dev,
1109602adf40SYehuda Sadeh 			   struct ceph_snap_context *snapc,
1110602adf40SYehuda Sadeh 			   u64 snapid,
1111602adf40SYehuda Sadeh 			   int flags,
1112913d2fdcSAlex Elder 			   struct ceph_osd_req_op *ops,
1113aded07eaSAlex Elder 			   const char *object_name,
1114f8d4de6eSAlex Elder 			   u64 ofs, u64 inbound_size,
1115f8d4de6eSAlex Elder 			   char *inbound,
111659c2be1eSYehuda Sadeh 			   struct ceph_osd_request **linger_req,
111759c2be1eSYehuda Sadeh 			   u64 *ver)
1118602adf40SYehuda Sadeh {
1119602adf40SYehuda Sadeh 	int ret;
1120602adf40SYehuda Sadeh 	struct page **pages;
1121602adf40SYehuda Sadeh 	int num_pages;
1122913d2fdcSAlex Elder 
1123aafb230eSAlex Elder 	rbd_assert(ops != NULL);
1124602adf40SYehuda Sadeh 
1125f8d4de6eSAlex Elder 	num_pages = calc_pages_for(ofs, inbound_size);
1126602adf40SYehuda Sadeh 	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
1127b8d0638aSDan Carpenter 	if (IS_ERR(pages))
1128b8d0638aSDan Carpenter 		return PTR_ERR(pages);
1129602adf40SYehuda Sadeh 
11300ce1a794SAlex Elder 	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
1131f8d4de6eSAlex Elder 			  object_name, ofs, inbound_size, NULL,
1132602adf40SYehuda Sadeh 			  pages, num_pages,
1133602adf40SYehuda Sadeh 			  flags,
1134602adf40SYehuda Sadeh 			  ops,
11351fec7093SYehuda Sadeh 			  NULL, 0,
113659c2be1eSYehuda Sadeh 			  NULL,
113759c2be1eSYehuda Sadeh 			  linger_req, ver);
1138602adf40SYehuda Sadeh 	if (ret < 0)
1139913d2fdcSAlex Elder 		goto done;
1140602adf40SYehuda Sadeh 
1141f8d4de6eSAlex Elder 	if ((flags & CEPH_OSD_FLAG_READ) && inbound)
1142f8d4de6eSAlex Elder 		ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);
1143602adf40SYehuda Sadeh 
1144602adf40SYehuda Sadeh done:
1145602adf40SYehuda Sadeh 	ceph_release_page_vector(pages, num_pages);
1146602adf40SYehuda Sadeh 	return ret;
1147602adf40SYehuda Sadeh }
1148602adf40SYehuda Sadeh 
1149602adf40SYehuda Sadeh /*
1150602adf40SYehuda Sadeh  * Do an asynchronous ceph osd operation
1151602adf40SYehuda Sadeh  */
1152602adf40SYehuda Sadeh static int rbd_do_op(struct request *rq,
1153602adf40SYehuda Sadeh 		     struct rbd_device *rbd_dev,
1154602adf40SYehuda Sadeh 		     struct ceph_snap_context *snapc,
1155602adf40SYehuda Sadeh 		     u64 snapid,
1156d1f57ea6SAlex Elder 		     int opcode, int flags,
1157602adf40SYehuda Sadeh 		     u64 ofs, u64 len,
11581fec7093SYehuda Sadeh 		     struct bio *bio,
11591fec7093SYehuda Sadeh 		     struct rbd_req_coll *coll,
11601fec7093SYehuda Sadeh 		     int coll_index)
1161602adf40SYehuda Sadeh {
1162602adf40SYehuda Sadeh 	char *seg_name;
1163602adf40SYehuda Sadeh 	u64 seg_ofs;
1164602adf40SYehuda Sadeh 	u64 seg_len;
1165602adf40SYehuda Sadeh 	int ret;
1166602adf40SYehuda Sadeh 	struct ceph_osd_req_op *ops;
1167602adf40SYehuda Sadeh 	u32 payload_len;
1168602adf40SYehuda Sadeh 
116965ccfe21SAlex Elder 	seg_name = rbd_segment_name(rbd_dev, ofs);
1170602adf40SYehuda Sadeh 	if (!seg_name)
1171602adf40SYehuda Sadeh 		return -ENOMEM;
117265ccfe21SAlex Elder 	seg_len = rbd_segment_length(rbd_dev, ofs, len);
117365ccfe21SAlex Elder 	seg_ofs = rbd_segment_offset(rbd_dev, ofs);
1174602adf40SYehuda Sadeh 
1175602adf40SYehuda Sadeh 	payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
1176602adf40SYehuda Sadeh 
117757cfc106SAlex Elder 	ret = -ENOMEM;
117857cfc106SAlex Elder 	ops = rbd_create_rw_ops(1, opcode, payload_len);
117957cfc106SAlex Elder 	if (!ops)
1180602adf40SYehuda Sadeh 		goto done;
1181602adf40SYehuda Sadeh 
1182602adf40SYehuda Sadeh 	/* we've taken care of segment sizes earlier when we
1183602adf40SYehuda Sadeh 	   cloned the bios. We should never have a segment
1184602adf40SYehuda Sadeh 	   truncated at this point */
1185aafb230eSAlex Elder 	rbd_assert(seg_len == len);
1186602adf40SYehuda Sadeh 
1187602adf40SYehuda Sadeh 	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1188602adf40SYehuda Sadeh 			     seg_name, seg_ofs, seg_len,
1189602adf40SYehuda Sadeh 			     bio,
1190602adf40SYehuda Sadeh 			     NULL, 0,
1191602adf40SYehuda Sadeh 			     flags,
1192602adf40SYehuda Sadeh 			     ops,
11931fec7093SYehuda Sadeh 			     coll, coll_index,
119459c2be1eSYehuda Sadeh 			     rbd_req_cb, 0, NULL);
119511f77002SSage Weil 
119611f77002SSage Weil 	rbd_destroy_ops(ops);
1197602adf40SYehuda Sadeh done:
1198602adf40SYehuda Sadeh 	kfree(seg_name);
1199602adf40SYehuda Sadeh 	return ret;
1200602adf40SYehuda Sadeh }
1201602adf40SYehuda Sadeh 
1202602adf40SYehuda Sadeh /*
1203602adf40SYehuda Sadeh  * Request async osd write
1204602adf40SYehuda Sadeh  */
1205602adf40SYehuda Sadeh static int rbd_req_write(struct request *rq,
1206602adf40SYehuda Sadeh 			 struct rbd_device *rbd_dev,
1207602adf40SYehuda Sadeh 			 struct ceph_snap_context *snapc,
1208602adf40SYehuda Sadeh 			 u64 ofs, u64 len,
12091fec7093SYehuda Sadeh 			 struct bio *bio,
12101fec7093SYehuda Sadeh 			 struct rbd_req_coll *coll,
12111fec7093SYehuda Sadeh 			 int coll_index)
1212602adf40SYehuda Sadeh {
1213602adf40SYehuda Sadeh 	return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1214602adf40SYehuda Sadeh 			 CEPH_OSD_OP_WRITE,
1215602adf40SYehuda Sadeh 			 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
12161fec7093SYehuda Sadeh 			 ofs, len, bio, coll, coll_index);
1217602adf40SYehuda Sadeh }
1218602adf40SYehuda Sadeh 
1219602adf40SYehuda Sadeh /*
1220602adf40SYehuda Sadeh  * Request async osd read
1221602adf40SYehuda Sadeh  */
1222602adf40SYehuda Sadeh static int rbd_req_read(struct request *rq,
1223602adf40SYehuda Sadeh 			 struct rbd_device *rbd_dev,
1224602adf40SYehuda Sadeh 			 u64 snapid,
1225602adf40SYehuda Sadeh 			 u64 ofs, u64 len,
12261fec7093SYehuda Sadeh 			 struct bio *bio,
12271fec7093SYehuda Sadeh 			 struct rbd_req_coll *coll,
12281fec7093SYehuda Sadeh 			 int coll_index)
1229602adf40SYehuda Sadeh {
1230602adf40SYehuda Sadeh 	return rbd_do_op(rq, rbd_dev, NULL,
1231b06e6a6bSJosh Durgin 			 snapid,
1232602adf40SYehuda Sadeh 			 CEPH_OSD_OP_READ,
1233602adf40SYehuda Sadeh 			 CEPH_OSD_FLAG_READ,
12341fec7093SYehuda Sadeh 			 ofs, len, bio, coll, coll_index);
1235602adf40SYehuda Sadeh }
1236602adf40SYehuda Sadeh 
1237602adf40SYehuda Sadeh /*
1238602adf40SYehuda Sadeh  * Request sync osd read
1239602adf40SYehuda Sadeh  */
12400ce1a794SAlex Elder static int rbd_req_sync_read(struct rbd_device *rbd_dev,
1241602adf40SYehuda Sadeh 			  u64 snapid,
1242aded07eaSAlex Elder 			  const char *object_name,
1243602adf40SYehuda Sadeh 			  u64 ofs, u64 len,
124459c2be1eSYehuda Sadeh 			  char *buf,
124559c2be1eSYehuda Sadeh 			  u64 *ver)
1246602adf40SYehuda Sadeh {
1247913d2fdcSAlex Elder 	struct ceph_osd_req_op *ops;
1248913d2fdcSAlex Elder 	int ret;
1249913d2fdcSAlex Elder 
1250913d2fdcSAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1251913d2fdcSAlex Elder 	if (!ops)
1252913d2fdcSAlex Elder 		return -ENOMEM;
1253913d2fdcSAlex Elder 
1254913d2fdcSAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
1255b06e6a6bSJosh Durgin 			       snapid,
1256602adf40SYehuda Sadeh 			       CEPH_OSD_FLAG_READ,
1257913d2fdcSAlex Elder 			       ops, object_name, ofs, len, buf, NULL, ver);
1258913d2fdcSAlex Elder 	rbd_destroy_ops(ops);
1259913d2fdcSAlex Elder 
1260913d2fdcSAlex Elder 	return ret;
1261602adf40SYehuda Sadeh }
1262602adf40SYehuda Sadeh 
1263602adf40SYehuda Sadeh /*
126459c2be1eSYehuda Sadeh  * Request sync osd watch
126559c2be1eSYehuda Sadeh  */
12660ce1a794SAlex Elder static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
126759c2be1eSYehuda Sadeh 				   u64 ver,
12687f0a24d8SAlex Elder 				   u64 notify_id)
126959c2be1eSYehuda Sadeh {
127059c2be1eSYehuda Sadeh 	struct ceph_osd_req_op *ops;
127111f77002SSage Weil 	int ret;
127211f77002SSage Weil 
127357cfc106SAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
127457cfc106SAlex Elder 	if (!ops)
127557cfc106SAlex Elder 		return -ENOMEM;
127659c2be1eSYehuda Sadeh 
1277a71b891bSJosh Durgin 	ops[0].watch.ver = cpu_to_le64(ver);
127859c2be1eSYehuda Sadeh 	ops[0].watch.cookie = notify_id;
127959c2be1eSYehuda Sadeh 	ops[0].watch.flag = 0;
128059c2be1eSYehuda Sadeh 
12810ce1a794SAlex Elder 	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
12827f0a24d8SAlex Elder 			  rbd_dev->header_name, 0, 0, NULL,
1283ad4f232fSAlex Elder 			  NULL, 0,
128459c2be1eSYehuda Sadeh 			  CEPH_OSD_FLAG_READ,
128559c2be1eSYehuda Sadeh 			  ops,
12861fec7093SYehuda Sadeh 			  NULL, 0,
128759c2be1eSYehuda Sadeh 			  rbd_simple_req_cb, 0, NULL);
128859c2be1eSYehuda Sadeh 
128959c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
129059c2be1eSYehuda Sadeh 	return ret;
129159c2be1eSYehuda Sadeh }
129259c2be1eSYehuda Sadeh 
129359c2be1eSYehuda Sadeh static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
129459c2be1eSYehuda Sadeh {
12950ce1a794SAlex Elder 	struct rbd_device *rbd_dev = (struct rbd_device *)data;
1296a71b891bSJosh Durgin 	u64 hver;
129713143d2dSSage Weil 	int rc;
129813143d2dSSage Weil 
12990ce1a794SAlex Elder 	if (!rbd_dev)
130059c2be1eSYehuda Sadeh 		return;
130159c2be1eSYehuda Sadeh 
1302bd919d45SAlex Elder 	dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1303bd919d45SAlex Elder 		rbd_dev->header_name, (unsigned long long) notify_id,
1304bd919d45SAlex Elder 		(unsigned int) opcode);
13051fe5e993SAlex Elder 	rc = rbd_refresh_header(rbd_dev, &hver);
130613143d2dSSage Weil 	if (rc)
1307f0f8cef5SAlex Elder 		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
13080ce1a794SAlex Elder 			   " update snaps: %d\n", rbd_dev->major, rc);
130959c2be1eSYehuda Sadeh 
13107f0a24d8SAlex Elder 	rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
131159c2be1eSYehuda Sadeh }
131259c2be1eSYehuda Sadeh 
131359c2be1eSYehuda Sadeh /*
131459c2be1eSYehuda Sadeh  * Request sync osd watch
131559c2be1eSYehuda Sadeh  */
13160e6f322dSAlex Elder static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
131759c2be1eSYehuda Sadeh {
131859c2be1eSYehuda Sadeh 	struct ceph_osd_req_op *ops;
13190ce1a794SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
132057cfc106SAlex Elder 	int ret;
132159c2be1eSYehuda Sadeh 
132257cfc106SAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
132357cfc106SAlex Elder 	if (!ops)
132457cfc106SAlex Elder 		return -ENOMEM;
132559c2be1eSYehuda Sadeh 
132659c2be1eSYehuda Sadeh 	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
13270ce1a794SAlex Elder 				     (void *)rbd_dev, &rbd_dev->watch_event);
132859c2be1eSYehuda Sadeh 	if (ret < 0)
132959c2be1eSYehuda Sadeh 		goto fail;
133059c2be1eSYehuda Sadeh 
13310e6f322dSAlex Elder 	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
13320ce1a794SAlex Elder 	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
133359c2be1eSYehuda Sadeh 	ops[0].watch.flag = 1;
133459c2be1eSYehuda Sadeh 
13350ce1a794SAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
133659c2be1eSYehuda Sadeh 			      CEPH_NOSNAP,
133759c2be1eSYehuda Sadeh 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
133859c2be1eSYehuda Sadeh 			      ops,
13390e6f322dSAlex Elder 			      rbd_dev->header_name,
13400e6f322dSAlex Elder 			      0, 0, NULL,
13410ce1a794SAlex Elder 			      &rbd_dev->watch_request, NULL);
134259c2be1eSYehuda Sadeh 
134359c2be1eSYehuda Sadeh 	if (ret < 0)
134459c2be1eSYehuda Sadeh 		goto fail_event;
134559c2be1eSYehuda Sadeh 
134659c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
134759c2be1eSYehuda Sadeh 	return 0;
134859c2be1eSYehuda Sadeh 
134959c2be1eSYehuda Sadeh fail_event:
13500ce1a794SAlex Elder 	ceph_osdc_cancel_event(rbd_dev->watch_event);
13510ce1a794SAlex Elder 	rbd_dev->watch_event = NULL;
135259c2be1eSYehuda Sadeh fail:
135359c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
135459c2be1eSYehuda Sadeh 	return ret;
135559c2be1eSYehuda Sadeh }
135659c2be1eSYehuda Sadeh 
135779e3057cSYehuda Sadeh /*
135879e3057cSYehuda Sadeh  * Request sync osd unwatch
135979e3057cSYehuda Sadeh  */
1360070c633fSAlex Elder static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
136179e3057cSYehuda Sadeh {
136279e3057cSYehuda Sadeh 	struct ceph_osd_req_op *ops;
136357cfc106SAlex Elder 	int ret;
136479e3057cSYehuda Sadeh 
136557cfc106SAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
136657cfc106SAlex Elder 	if (!ops)
136757cfc106SAlex Elder 		return -ENOMEM;
136879e3057cSYehuda Sadeh 
136979e3057cSYehuda Sadeh 	ops[0].watch.ver = 0;
13700ce1a794SAlex Elder 	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
137179e3057cSYehuda Sadeh 	ops[0].watch.flag = 0;
137279e3057cSYehuda Sadeh 
13730ce1a794SAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
137479e3057cSYehuda Sadeh 			      CEPH_NOSNAP,
137579e3057cSYehuda Sadeh 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
137679e3057cSYehuda Sadeh 			      ops,
1377070c633fSAlex Elder 			      rbd_dev->header_name,
1378070c633fSAlex Elder 			      0, 0, NULL, NULL, NULL);
1379070c633fSAlex Elder 
138079e3057cSYehuda Sadeh 
138179e3057cSYehuda Sadeh 	rbd_destroy_ops(ops);
13820ce1a794SAlex Elder 	ceph_osdc_cancel_event(rbd_dev->watch_event);
13830ce1a794SAlex Elder 	rbd_dev->watch_event = NULL;
138479e3057cSYehuda Sadeh 	return ret;
138579e3057cSYehuda Sadeh }
138679e3057cSYehuda Sadeh 
138759c2be1eSYehuda Sadeh /*
13883cb4a687SAlex Elder  * Synchronous osd object method call
1389602adf40SYehuda Sadeh  */
13900ce1a794SAlex Elder static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
1391aded07eaSAlex Elder 			     const char *object_name,
1392aded07eaSAlex Elder 			     const char *class_name,
1393aded07eaSAlex Elder 			     const char *method_name,
13943cb4a687SAlex Elder 			     const char *outbound,
13953cb4a687SAlex Elder 			     size_t outbound_size,
1396f8d4de6eSAlex Elder 			     char *inbound,
1397f8d4de6eSAlex Elder 			     size_t inbound_size,
13983cb4a687SAlex Elder 			     int flags,
139959c2be1eSYehuda Sadeh 			     u64 *ver)
1400602adf40SYehuda Sadeh {
1401602adf40SYehuda Sadeh 	struct ceph_osd_req_op *ops;
1402aded07eaSAlex Elder 	int class_name_len = strlen(class_name);
1403aded07eaSAlex Elder 	int method_name_len = strlen(method_name);
14043cb4a687SAlex Elder 	int payload_size;
140557cfc106SAlex Elder 	int ret;
140657cfc106SAlex Elder 
14073cb4a687SAlex Elder 	/*
14083cb4a687SAlex Elder 	 * Any input parameters required by the method we're calling
14093cb4a687SAlex Elder 	 * will be sent along with the class and method names as
14103cb4a687SAlex Elder 	 * part of the message payload.  That data and its size are
14113cb4a687SAlex Elder 	 * supplied via the indata and indata_len fields (named from
14123cb4a687SAlex Elder 	 * the perspective of the server side) in the OSD request
14133cb4a687SAlex Elder 	 * operation.
14143cb4a687SAlex Elder 	 */
14153cb4a687SAlex Elder 	payload_size = class_name_len + method_name_len + outbound_size;
14163cb4a687SAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
141757cfc106SAlex Elder 	if (!ops)
141857cfc106SAlex Elder 		return -ENOMEM;
1419602adf40SYehuda Sadeh 
1420aded07eaSAlex Elder 	ops[0].cls.class_name = class_name;
1421aded07eaSAlex Elder 	ops[0].cls.class_len = (__u8) class_name_len;
1422aded07eaSAlex Elder 	ops[0].cls.method_name = method_name;
1423aded07eaSAlex Elder 	ops[0].cls.method_len = (__u8) method_name_len;
1424602adf40SYehuda Sadeh 	ops[0].cls.argc = 0;
14253cb4a687SAlex Elder 	ops[0].cls.indata = outbound;
14263cb4a687SAlex Elder 	ops[0].cls.indata_len = outbound_size;
1427602adf40SYehuda Sadeh 
14280ce1a794SAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
1429602adf40SYehuda Sadeh 			       CEPH_NOSNAP,
14303cb4a687SAlex Elder 			       flags, ops,
1431f8d4de6eSAlex Elder 			       object_name, 0, inbound_size, inbound,
1432f8d4de6eSAlex Elder 			       NULL, ver);
1433602adf40SYehuda Sadeh 
1434602adf40SYehuda Sadeh 	rbd_destroy_ops(ops);
1435602adf40SYehuda Sadeh 
1436602adf40SYehuda Sadeh 	dout("cls_exec returned %d\n", ret);
1437602adf40SYehuda Sadeh 	return ret;
1438602adf40SYehuda Sadeh }
1439602adf40SYehuda Sadeh 
14401fec7093SYehuda Sadeh static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
14411fec7093SYehuda Sadeh {
14421fec7093SYehuda Sadeh 	struct rbd_req_coll *coll =
14431fec7093SYehuda Sadeh 			kzalloc(sizeof(struct rbd_req_coll) +
14441fec7093SYehuda Sadeh 			        sizeof(struct rbd_req_status) * num_reqs,
14451fec7093SYehuda Sadeh 				GFP_ATOMIC);
14461fec7093SYehuda Sadeh 
14471fec7093SYehuda Sadeh 	if (!coll)
14481fec7093SYehuda Sadeh 		return NULL;
14491fec7093SYehuda Sadeh 	coll->total = num_reqs;
14501fec7093SYehuda Sadeh 	kref_init(&coll->kref);
14511fec7093SYehuda Sadeh 	return coll;
14521fec7093SYehuda Sadeh }
14531fec7093SYehuda Sadeh 
1454602adf40SYehuda Sadeh /*
1455602adf40SYehuda Sadeh  * block device queue callback
1456602adf40SYehuda Sadeh  */
1457602adf40SYehuda Sadeh static void rbd_rq_fn(struct request_queue *q)
1458602adf40SYehuda Sadeh {
1459602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
1460602adf40SYehuda Sadeh 	struct request *rq;
1461602adf40SYehuda Sadeh 	struct bio_pair *bp = NULL;
1462602adf40SYehuda Sadeh 
146300f1f36fSAlex Elder 	while ((rq = blk_fetch_request(q))) {
1464602adf40SYehuda Sadeh 		struct bio *bio;
1465602adf40SYehuda Sadeh 		struct bio *rq_bio, *next_bio = NULL;
1466602adf40SYehuda Sadeh 		bool do_write;
1467bd919d45SAlex Elder 		unsigned int size;
1468bd919d45SAlex Elder 		u64 op_size = 0;
1469602adf40SYehuda Sadeh 		u64 ofs;
14701fec7093SYehuda Sadeh 		int num_segs, cur_seg = 0;
14711fec7093SYehuda Sadeh 		struct rbd_req_coll *coll;
1472d1d25646SJosh Durgin 		struct ceph_snap_context *snapc;
1473602adf40SYehuda Sadeh 
1474602adf40SYehuda Sadeh 		dout("fetched request\n");
1475602adf40SYehuda Sadeh 
1476602adf40SYehuda Sadeh 		/* filter out block requests we don't understand */
1477602adf40SYehuda Sadeh 		if ((rq->cmd_type != REQ_TYPE_FS)) {
1478602adf40SYehuda Sadeh 			__blk_end_request_all(rq, 0);
147900f1f36fSAlex Elder 			continue;
1480602adf40SYehuda Sadeh 		}
1481602adf40SYehuda Sadeh 
1482602adf40SYehuda Sadeh 		/* deduce our operation (read, write) */
1483602adf40SYehuda Sadeh 		do_write = (rq_data_dir(rq) == WRITE);
1484602adf40SYehuda Sadeh 
1485602adf40SYehuda Sadeh 		size = blk_rq_bytes(rq);
1486593a9e7bSAlex Elder 		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
1487602adf40SYehuda Sadeh 		rq_bio = rq->bio;
1488f84344f3SAlex Elder 		if (do_write && rbd_dev->mapping.read_only) {
1489602adf40SYehuda Sadeh 			__blk_end_request_all(rq, -EROFS);
149000f1f36fSAlex Elder 			continue;
1491602adf40SYehuda Sadeh 		}
1492602adf40SYehuda Sadeh 
1493602adf40SYehuda Sadeh 		spin_unlock_irq(q->queue_lock);
1494602adf40SYehuda Sadeh 
1495e88a36ecSJosh Durgin 		down_read(&rbd_dev->header_rwsem);
1496e88a36ecSJosh Durgin 
1497f84344f3SAlex Elder 		if (rbd_dev->mapping.snap_id != CEPH_NOSNAP &&
1498f84344f3SAlex Elder 				!rbd_dev->mapping.snap_exists) {
1499d1d25646SJosh Durgin 			up_read(&rbd_dev->header_rwsem);
1500e88a36ecSJosh Durgin 			dout("request for non-existent snapshot");
1501e88a36ecSJosh Durgin 			spin_lock_irq(q->queue_lock);
1502e88a36ecSJosh Durgin 			__blk_end_request_all(rq, -ENXIO);
1503e88a36ecSJosh Durgin 			continue;
1504e88a36ecSJosh Durgin 		}
1505d1d25646SJosh Durgin 
1506d1d25646SJosh Durgin 		snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1507d1d25646SJosh Durgin 
1508d1d25646SJosh Durgin 		up_read(&rbd_dev->header_rwsem);
1509e88a36ecSJosh Durgin 
1510602adf40SYehuda Sadeh 		dout("%s 0x%x bytes at 0x%llx\n",
1511602adf40SYehuda Sadeh 		     do_write ? "write" : "read",
1512bd919d45SAlex Elder 		     size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
1513602adf40SYehuda Sadeh 
15141fec7093SYehuda Sadeh 		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
1515df111be6SAlex Elder 		if (num_segs <= 0) {
1516df111be6SAlex Elder 			spin_lock_irq(q->queue_lock);
1517df111be6SAlex Elder 			__blk_end_request_all(rq, num_segs);
1518df111be6SAlex Elder 			ceph_put_snap_context(snapc);
1519df111be6SAlex Elder 			continue;
1520df111be6SAlex Elder 		}
15211fec7093SYehuda Sadeh 		coll = rbd_alloc_coll(num_segs);
15221fec7093SYehuda Sadeh 		if (!coll) {
15231fec7093SYehuda Sadeh 			spin_lock_irq(q->queue_lock);
15241fec7093SYehuda Sadeh 			__blk_end_request_all(rq, -ENOMEM);
1525d1d25646SJosh Durgin 			ceph_put_snap_context(snapc);
152600f1f36fSAlex Elder 			continue;
15271fec7093SYehuda Sadeh 		}
15281fec7093SYehuda Sadeh 
1529602adf40SYehuda Sadeh 		do {
1530602adf40SYehuda Sadeh 			/* a bio clone to be passed down to OSD req */
1531bd919d45SAlex Elder 			dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
153265ccfe21SAlex Elder 			op_size = rbd_segment_length(rbd_dev, ofs, size);
15331fec7093SYehuda Sadeh 			kref_get(&coll->kref);
1534602adf40SYehuda Sadeh 			bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
1535602adf40SYehuda Sadeh 					      op_size, GFP_ATOMIC);
1536602adf40SYehuda Sadeh 			if (!bio) {
15371fec7093SYehuda Sadeh 				rbd_coll_end_req_index(rq, coll, cur_seg,
15381fec7093SYehuda Sadeh 						       -ENOMEM, op_size);
15391fec7093SYehuda Sadeh 				goto next_seg;
1540602adf40SYehuda Sadeh 			}
1541602adf40SYehuda Sadeh 
15421fec7093SYehuda Sadeh 
1543602adf40SYehuda Sadeh 			/* init OSD command: write or read */
1544602adf40SYehuda Sadeh 			if (do_write)
1545602adf40SYehuda Sadeh 				rbd_req_write(rq, rbd_dev,
1546d1d25646SJosh Durgin 					      snapc,
1547602adf40SYehuda Sadeh 					      ofs,
15481fec7093SYehuda Sadeh 					      op_size, bio,
15491fec7093SYehuda Sadeh 					      coll, cur_seg);
1550602adf40SYehuda Sadeh 			else
1551602adf40SYehuda Sadeh 				rbd_req_read(rq, rbd_dev,
1552f84344f3SAlex Elder 					     rbd_dev->mapping.snap_id,
1553602adf40SYehuda Sadeh 					     ofs,
15541fec7093SYehuda Sadeh 					     op_size, bio,
15551fec7093SYehuda Sadeh 					     coll, cur_seg);
1556602adf40SYehuda Sadeh 
15571fec7093SYehuda Sadeh next_seg:
1558602adf40SYehuda Sadeh 			size -= op_size;
1559602adf40SYehuda Sadeh 			ofs += op_size;
1560602adf40SYehuda Sadeh 
15611fec7093SYehuda Sadeh 			cur_seg++;
1562602adf40SYehuda Sadeh 			rq_bio = next_bio;
1563602adf40SYehuda Sadeh 		} while (size > 0);
15641fec7093SYehuda Sadeh 		kref_put(&coll->kref, rbd_coll_release);
1565602adf40SYehuda Sadeh 
1566602adf40SYehuda Sadeh 		if (bp)
1567602adf40SYehuda Sadeh 			bio_pair_release(bp);
1568602adf40SYehuda Sadeh 		spin_lock_irq(q->queue_lock);
1569d1d25646SJosh Durgin 
1570d1d25646SJosh Durgin 		ceph_put_snap_context(snapc);
1571602adf40SYehuda Sadeh 	}
1572602adf40SYehuda Sadeh }
1573602adf40SYehuda Sadeh 
1574602adf40SYehuda Sadeh /*
1575602adf40SYehuda Sadeh  * a queue callback. Makes sure that we don't create a bio that spans across
1576602adf40SYehuda Sadeh  * multiple osd objects. One exception would be with a single page bios,
1577602adf40SYehuda Sadeh  * which we handle later at bio_chain_clone
1578602adf40SYehuda Sadeh  */
1579602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1580602adf40SYehuda Sadeh 			  struct bio_vec *bvec)
1581602adf40SYehuda Sadeh {
1582602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
1583593a9e7bSAlex Elder 	unsigned int chunk_sectors;
1584593a9e7bSAlex Elder 	sector_t sector;
1585593a9e7bSAlex Elder 	unsigned int bio_sectors;
1586602adf40SYehuda Sadeh 	int max;
1587602adf40SYehuda Sadeh 
1588593a9e7bSAlex Elder 	chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1589593a9e7bSAlex Elder 	sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
1590593a9e7bSAlex Elder 	bio_sectors = bmd->bi_size >> SECTOR_SHIFT;
1591593a9e7bSAlex Elder 
1592602adf40SYehuda Sadeh 	max =  (chunk_sectors - ((sector & (chunk_sectors - 1))
1593593a9e7bSAlex Elder 				 + bio_sectors)) << SECTOR_SHIFT;
1594602adf40SYehuda Sadeh 	if (max < 0)
1595602adf40SYehuda Sadeh 		max = 0; /* bio_add cannot handle a negative return */
1596602adf40SYehuda Sadeh 	if (max <= bvec->bv_len && bio_sectors == 0)
1597602adf40SYehuda Sadeh 		return bvec->bv_len;
1598602adf40SYehuda Sadeh 	return max;
1599602adf40SYehuda Sadeh }
1600602adf40SYehuda Sadeh 
1601602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
1602602adf40SYehuda Sadeh {
1603602adf40SYehuda Sadeh 	struct gendisk *disk = rbd_dev->disk;
1604602adf40SYehuda Sadeh 
1605602adf40SYehuda Sadeh 	if (!disk)
1606602adf40SYehuda Sadeh 		return;
1607602adf40SYehuda Sadeh 
1608602adf40SYehuda Sadeh 	if (disk->flags & GENHD_FL_UP)
1609602adf40SYehuda Sadeh 		del_gendisk(disk);
1610602adf40SYehuda Sadeh 	if (disk->queue)
1611602adf40SYehuda Sadeh 		blk_cleanup_queue(disk->queue);
1612602adf40SYehuda Sadeh 	put_disk(disk);
1613602adf40SYehuda Sadeh }
1614602adf40SYehuda Sadeh 
1615602adf40SYehuda Sadeh /*
16164156d998SAlex Elder  * Read the complete header for the given rbd device.
16174156d998SAlex Elder  *
16184156d998SAlex Elder  * Returns a pointer to a dynamically-allocated buffer containing
16194156d998SAlex Elder  * the complete and validated header.  Caller can pass the address
16204156d998SAlex Elder  * of a variable that will be filled in with the version of the
16214156d998SAlex Elder  * header object at the time it was read.
16224156d998SAlex Elder  *
16234156d998SAlex Elder  * Returns a pointer-coded errno if a failure occurs.
16244156d998SAlex Elder  */
16254156d998SAlex Elder static struct rbd_image_header_ondisk *
16264156d998SAlex Elder rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
16274156d998SAlex Elder {
16284156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk = NULL;
16294156d998SAlex Elder 	u32 snap_count = 0;
16304156d998SAlex Elder 	u64 names_size = 0;
16314156d998SAlex Elder 	u32 want_count;
16324156d998SAlex Elder 	int ret;
16334156d998SAlex Elder 
16344156d998SAlex Elder 	/*
16354156d998SAlex Elder 	 * The complete header will include an array of its 64-bit
16364156d998SAlex Elder 	 * snapshot ids, followed by the names of those snapshots as
16374156d998SAlex Elder 	 * a contiguous block of NUL-terminated strings.  Note that
16384156d998SAlex Elder 	 * the number of snapshots could change by the time we read
16394156d998SAlex Elder 	 * it in, in which case we re-read it.
16404156d998SAlex Elder 	 */
16414156d998SAlex Elder 	do {
16424156d998SAlex Elder 		size_t size;
16434156d998SAlex Elder 
16444156d998SAlex Elder 		kfree(ondisk);
16454156d998SAlex Elder 
16464156d998SAlex Elder 		size = sizeof (*ondisk);
16474156d998SAlex Elder 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
16484156d998SAlex Elder 		size += names_size;
16494156d998SAlex Elder 		ondisk = kmalloc(size, GFP_KERNEL);
16504156d998SAlex Elder 		if (!ondisk)
16514156d998SAlex Elder 			return ERR_PTR(-ENOMEM);
16524156d998SAlex Elder 
16534156d998SAlex Elder 		ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
16544156d998SAlex Elder 				       rbd_dev->header_name,
16554156d998SAlex Elder 				       0, size,
16564156d998SAlex Elder 				       (char *) ondisk, version);
16574156d998SAlex Elder 
16584156d998SAlex Elder 		if (ret < 0)
16594156d998SAlex Elder 			goto out_err;
16604156d998SAlex Elder 		if (WARN_ON((size_t) ret < size)) {
16614156d998SAlex Elder 			ret = -ENXIO;
16624156d998SAlex Elder 			pr_warning("short header read for image %s"
16634156d998SAlex Elder 					" (want %zd got %d)\n",
16644156d998SAlex Elder 				rbd_dev->image_name, size, ret);
16654156d998SAlex Elder 			goto out_err;
16664156d998SAlex Elder 		}
16674156d998SAlex Elder 		if (!rbd_dev_ondisk_valid(ondisk)) {
16684156d998SAlex Elder 			ret = -ENXIO;
16694156d998SAlex Elder 			pr_warning("invalid header for image %s\n",
16704156d998SAlex Elder 				rbd_dev->image_name);
16714156d998SAlex Elder 			goto out_err;
16724156d998SAlex Elder 		}
16734156d998SAlex Elder 
16744156d998SAlex Elder 		names_size = le64_to_cpu(ondisk->snap_names_len);
16754156d998SAlex Elder 		want_count = snap_count;
16764156d998SAlex Elder 		snap_count = le32_to_cpu(ondisk->snap_count);
16774156d998SAlex Elder 	} while (snap_count != want_count);
16784156d998SAlex Elder 
16794156d998SAlex Elder 	return ondisk;
16804156d998SAlex Elder 
16814156d998SAlex Elder out_err:
16824156d998SAlex Elder 	kfree(ondisk);
16834156d998SAlex Elder 
16844156d998SAlex Elder 	return ERR_PTR(ret);
16854156d998SAlex Elder }
16864156d998SAlex Elder 
16874156d998SAlex Elder /*
1688602adf40SYehuda Sadeh  * reload the ondisk the header
1689602adf40SYehuda Sadeh  */
1690602adf40SYehuda Sadeh static int rbd_read_header(struct rbd_device *rbd_dev,
1691602adf40SYehuda Sadeh 			   struct rbd_image_header *header)
1692602adf40SYehuda Sadeh {
16934156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk;
16944156d998SAlex Elder 	u64 ver = 0;
16954156d998SAlex Elder 	int ret;
1696602adf40SYehuda Sadeh 
16974156d998SAlex Elder 	ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
16984156d998SAlex Elder 	if (IS_ERR(ondisk))
16994156d998SAlex Elder 		return PTR_ERR(ondisk);
17004156d998SAlex Elder 	ret = rbd_header_from_disk(header, ondisk);
17014156d998SAlex Elder 	if (ret >= 0)
170259c2be1eSYehuda Sadeh 		header->obj_version = ver;
17034156d998SAlex Elder 	kfree(ondisk);
1704602adf40SYehuda Sadeh 
17054156d998SAlex Elder 	return ret;
1706602adf40SYehuda Sadeh }
1707602adf40SYehuda Sadeh 
1708dfc5606dSYehuda Sadeh static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1709dfc5606dSYehuda Sadeh {
1710dfc5606dSYehuda Sadeh 	struct rbd_snap *snap;
1711a0593290SAlex Elder 	struct rbd_snap *next;
1712dfc5606dSYehuda Sadeh 
1713a0593290SAlex Elder 	list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
171414e7085dSAlex Elder 		__rbd_remove_snap_dev(snap);
1715dfc5606dSYehuda Sadeh }
1716dfc5606dSYehuda Sadeh 
1717602adf40SYehuda Sadeh /*
1718602adf40SYehuda Sadeh  * only read the first part of the ondisk header, without the snaps info
1719602adf40SYehuda Sadeh  */
1720b813623aSAlex Elder static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
1721602adf40SYehuda Sadeh {
1722602adf40SYehuda Sadeh 	int ret;
1723602adf40SYehuda Sadeh 	struct rbd_image_header h;
1724602adf40SYehuda Sadeh 
1725602adf40SYehuda Sadeh 	ret = rbd_read_header(rbd_dev, &h);
1726602adf40SYehuda Sadeh 	if (ret < 0)
1727602adf40SYehuda Sadeh 		return ret;
1728602adf40SYehuda Sadeh 
1729a51aa0c0SJosh Durgin 	down_write(&rbd_dev->header_rwsem);
1730a51aa0c0SJosh Durgin 
17319db4b3e3SSage Weil 	/* resized? */
1732f84344f3SAlex Elder 	if (rbd_dev->mapping.snap_id == CEPH_NOSNAP) {
1733474ef7ceSJosh Durgin 		sector_t size = (sector_t) h.image_size / SECTOR_SIZE;
1734474ef7ceSJosh Durgin 
173599c1f08fSAlex Elder 		if (size != (sector_t) rbd_dev->mapping.size) {
173699c1f08fSAlex Elder 			dout("setting size to %llu sectors",
173799c1f08fSAlex Elder 				(unsigned long long) size);
173899c1f08fSAlex Elder 			rbd_dev->mapping.size = (u64) size;
1739474ef7ceSJosh Durgin 			set_capacity(rbd_dev->disk, size);
1740474ef7ceSJosh Durgin 		}
174199c1f08fSAlex Elder 	}
17429db4b3e3SSage Weil 
1743849b4260SAlex Elder 	/* rbd_dev->header.object_prefix shouldn't change */
1744602adf40SYehuda Sadeh 	kfree(rbd_dev->header.snap_sizes);
1745849b4260SAlex Elder 	kfree(rbd_dev->header.snap_names);
1746d1d25646SJosh Durgin 	/* osd requests may still refer to snapc */
1747d1d25646SJosh Durgin 	ceph_put_snap_context(rbd_dev->header.snapc);
1748602adf40SYehuda Sadeh 
1749b813623aSAlex Elder 	if (hver)
1750b813623aSAlex Elder 		*hver = h.obj_version;
1751a71b891bSJosh Durgin 	rbd_dev->header.obj_version = h.obj_version;
175293a24e08SJosh Durgin 	rbd_dev->header.image_size = h.image_size;
1753602adf40SYehuda Sadeh 	rbd_dev->header.snapc = h.snapc;
1754602adf40SYehuda Sadeh 	rbd_dev->header.snap_names = h.snap_names;
1755602adf40SYehuda Sadeh 	rbd_dev->header.snap_sizes = h.snap_sizes;
1756849b4260SAlex Elder 	/* Free the extra copy of the object prefix */
1757849b4260SAlex Elder 	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1758849b4260SAlex Elder 	kfree(h.object_prefix);
1759849b4260SAlex Elder 
1760304f6808SAlex Elder 	ret = rbd_dev_snaps_update(rbd_dev);
1761304f6808SAlex Elder 	if (!ret)
1762304f6808SAlex Elder 		ret = rbd_dev_snaps_register(rbd_dev);
1763dfc5606dSYehuda Sadeh 
1764c666601aSJosh Durgin 	up_write(&rbd_dev->header_rwsem);
1765602adf40SYehuda Sadeh 
1766dfc5606dSYehuda Sadeh 	return ret;
1767602adf40SYehuda Sadeh }
1768602adf40SYehuda Sadeh 
17691fe5e993SAlex Elder static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
17701fe5e993SAlex Elder {
17711fe5e993SAlex Elder 	int ret;
17721fe5e993SAlex Elder 
17731fe5e993SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
17741fe5e993SAlex Elder 	ret = __rbd_refresh_header(rbd_dev, hver);
17751fe5e993SAlex Elder 	mutex_unlock(&ctl_mutex);
17761fe5e993SAlex Elder 
17771fe5e993SAlex Elder 	return ret;
17781fe5e993SAlex Elder }
17791fe5e993SAlex Elder 
1780602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
1781602adf40SYehuda Sadeh {
1782602adf40SYehuda Sadeh 	struct gendisk *disk;
1783602adf40SYehuda Sadeh 	struct request_queue *q;
1784593a9e7bSAlex Elder 	u64 segment_size;
1785602adf40SYehuda Sadeh 
1786602adf40SYehuda Sadeh 	/* create gendisk info */
1787602adf40SYehuda Sadeh 	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1788602adf40SYehuda Sadeh 	if (!disk)
17891fcdb8aaSAlex Elder 		return -ENOMEM;
1790602adf40SYehuda Sadeh 
1791f0f8cef5SAlex Elder 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
1792de71a297SAlex Elder 		 rbd_dev->dev_id);
1793602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
1794602adf40SYehuda Sadeh 	disk->first_minor = 0;
1795602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
1796602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
1797602adf40SYehuda Sadeh 
1798602adf40SYehuda Sadeh 	/* init rq */
1799602adf40SYehuda Sadeh 	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1800602adf40SYehuda Sadeh 	if (!q)
1801602adf40SYehuda Sadeh 		goto out_disk;
1802029bcbd8SJosh Durgin 
1803593a9e7bSAlex Elder 	/* We use the default size, but let's be explicit about it. */
1804593a9e7bSAlex Elder 	blk_queue_physical_block_size(q, SECTOR_SIZE);
1805593a9e7bSAlex Elder 
1806029bcbd8SJosh Durgin 	/* set io sizes to object size */
1807593a9e7bSAlex Elder 	segment_size = rbd_obj_bytes(&rbd_dev->header);
1808593a9e7bSAlex Elder 	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
1809593a9e7bSAlex Elder 	blk_queue_max_segment_size(q, segment_size);
1810593a9e7bSAlex Elder 	blk_queue_io_min(q, segment_size);
1811593a9e7bSAlex Elder 	blk_queue_io_opt(q, segment_size);
1812029bcbd8SJosh Durgin 
1813602adf40SYehuda Sadeh 	blk_queue_merge_bvec(q, rbd_merge_bvec);
1814602adf40SYehuda Sadeh 	disk->queue = q;
1815602adf40SYehuda Sadeh 
1816602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
1817602adf40SYehuda Sadeh 
1818602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
1819602adf40SYehuda Sadeh 
182012f02944SAlex Elder 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
182112f02944SAlex Elder 
1822602adf40SYehuda Sadeh 	return 0;
1823602adf40SYehuda Sadeh out_disk:
1824602adf40SYehuda Sadeh 	put_disk(disk);
18251fcdb8aaSAlex Elder 
18261fcdb8aaSAlex Elder 	return -ENOMEM;
1827602adf40SYehuda Sadeh }
1828602adf40SYehuda Sadeh 
1829dfc5606dSYehuda Sadeh /*
1830dfc5606dSYehuda Sadeh   sysfs
1831dfc5606dSYehuda Sadeh */
1832602adf40SYehuda Sadeh 
1833593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
1834593a9e7bSAlex Elder {
1835593a9e7bSAlex Elder 	return container_of(dev, struct rbd_device, dev);
1836593a9e7bSAlex Elder }
1837593a9e7bSAlex Elder 
1838dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
1839dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
1840602adf40SYehuda Sadeh {
1841593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1842a51aa0c0SJosh Durgin 	sector_t size;
1843dfc5606dSYehuda Sadeh 
1844a51aa0c0SJosh Durgin 	down_read(&rbd_dev->header_rwsem);
1845a51aa0c0SJosh Durgin 	size = get_capacity(rbd_dev->disk);
1846a51aa0c0SJosh Durgin 	up_read(&rbd_dev->header_rwsem);
1847a51aa0c0SJosh Durgin 
1848a51aa0c0SJosh Durgin 	return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
1849602adf40SYehuda Sadeh }
1850602adf40SYehuda Sadeh 
185134b13184SAlex Elder /*
185234b13184SAlex Elder  * Note this shows the features for whatever's mapped, which is not
185334b13184SAlex Elder  * necessarily the base image.
185434b13184SAlex Elder  */
185534b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev,
185634b13184SAlex Elder 			     struct device_attribute *attr, char *buf)
185734b13184SAlex Elder {
185834b13184SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
185934b13184SAlex Elder 
186034b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
186134b13184SAlex Elder 			(unsigned long long) rbd_dev->mapping.features);
186234b13184SAlex Elder }
186334b13184SAlex Elder 
1864dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
1865dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
1866602adf40SYehuda Sadeh {
1867593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1868dfc5606dSYehuda Sadeh 
1869dfc5606dSYehuda Sadeh 	return sprintf(buf, "%d\n", rbd_dev->major);
1870dfc5606dSYehuda Sadeh }
1871dfc5606dSYehuda Sadeh 
1872dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
1873dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
1874dfc5606dSYehuda Sadeh {
1875593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1876dfc5606dSYehuda Sadeh 
18771dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
18781dbb4399SAlex Elder 			ceph_client_id(rbd_dev->rbd_client->client));
1879dfc5606dSYehuda Sadeh }
1880dfc5606dSYehuda Sadeh 
1881dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
1882dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
1883dfc5606dSYehuda Sadeh {
1884593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1885dfc5606dSYehuda Sadeh 
1886dfc5606dSYehuda Sadeh 	return sprintf(buf, "%s\n", rbd_dev->pool_name);
1887dfc5606dSYehuda Sadeh }
1888dfc5606dSYehuda Sadeh 
18899bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
18909bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
18919bb2f334SAlex Elder {
18929bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
18939bb2f334SAlex Elder 
18949bb2f334SAlex Elder 	return sprintf(buf, "%d\n", rbd_dev->pool_id);
18959bb2f334SAlex Elder }
18969bb2f334SAlex Elder 
1897dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
1898dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
1899dfc5606dSYehuda Sadeh {
1900593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1901dfc5606dSYehuda Sadeh 
19020bed54dcSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->image_name);
1903dfc5606dSYehuda Sadeh }
1904dfc5606dSYehuda Sadeh 
1905589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev,
1906589d30e0SAlex Elder 			     struct device_attribute *attr, char *buf)
1907589d30e0SAlex Elder {
1908589d30e0SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1909589d30e0SAlex Elder 
1910589d30e0SAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->image_id);
1911589d30e0SAlex Elder }
1912589d30e0SAlex Elder 
191334b13184SAlex Elder /*
191434b13184SAlex Elder  * Shows the name of the currently-mapped snapshot (or
191534b13184SAlex Elder  * RBD_SNAP_HEAD_NAME for the base image).
191634b13184SAlex Elder  */
1917dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
1918dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
1919dfc5606dSYehuda Sadeh 			     char *buf)
1920dfc5606dSYehuda Sadeh {
1921593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1922dfc5606dSYehuda Sadeh 
1923f84344f3SAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->mapping.snap_name);
1924dfc5606dSYehuda Sadeh }
1925dfc5606dSYehuda Sadeh 
1926dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
1927dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
1928dfc5606dSYehuda Sadeh 				 const char *buf,
1929dfc5606dSYehuda Sadeh 				 size_t size)
1930dfc5606dSYehuda Sadeh {
1931593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1932b813623aSAlex Elder 	int ret;
1933602adf40SYehuda Sadeh 
19341fe5e993SAlex Elder 	ret = rbd_refresh_header(rbd_dev, NULL);
1935b813623aSAlex Elder 
1936b813623aSAlex Elder 	return ret < 0 ? ret : size;
1937dfc5606dSYehuda Sadeh }
1938602adf40SYehuda Sadeh 
1939dfc5606dSYehuda Sadeh static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
194034b13184SAlex Elder static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
1941dfc5606dSYehuda Sadeh static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
1942dfc5606dSYehuda Sadeh static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
1943dfc5606dSYehuda Sadeh static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
19449bb2f334SAlex Elder static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
1945dfc5606dSYehuda Sadeh static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
1946589d30e0SAlex Elder static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
1947dfc5606dSYehuda Sadeh static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
1948dfc5606dSYehuda Sadeh static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
1949dfc5606dSYehuda Sadeh 
1950dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = {
1951dfc5606dSYehuda Sadeh 	&dev_attr_size.attr,
195234b13184SAlex Elder 	&dev_attr_features.attr,
1953dfc5606dSYehuda Sadeh 	&dev_attr_major.attr,
1954dfc5606dSYehuda Sadeh 	&dev_attr_client_id.attr,
1955dfc5606dSYehuda Sadeh 	&dev_attr_pool.attr,
19569bb2f334SAlex Elder 	&dev_attr_pool_id.attr,
1957dfc5606dSYehuda Sadeh 	&dev_attr_name.attr,
1958589d30e0SAlex Elder 	&dev_attr_image_id.attr,
1959dfc5606dSYehuda Sadeh 	&dev_attr_current_snap.attr,
1960dfc5606dSYehuda Sadeh 	&dev_attr_refresh.attr,
1961dfc5606dSYehuda Sadeh 	NULL
1962dfc5606dSYehuda Sadeh };
1963dfc5606dSYehuda Sadeh 
1964dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = {
1965dfc5606dSYehuda Sadeh 	.attrs = rbd_attrs,
1966dfc5606dSYehuda Sadeh };
1967dfc5606dSYehuda Sadeh 
1968dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = {
1969dfc5606dSYehuda Sadeh 	&rbd_attr_group,
1970dfc5606dSYehuda Sadeh 	NULL
1971dfc5606dSYehuda Sadeh };
1972dfc5606dSYehuda Sadeh 
/*
 * Release callback installed in rbd_device_type.  It performs no
 * cleanup of its own.
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
	/* nothing to do */
}
1976dfc5606dSYehuda Sadeh 
1977dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = {
1978dfc5606dSYehuda Sadeh 	.name		= "rbd",
1979dfc5606dSYehuda Sadeh 	.groups		= rbd_attr_groups,
1980dfc5606dSYehuda Sadeh 	.release	= rbd_sysfs_dev_release,
1981dfc5606dSYehuda Sadeh };
1982dfc5606dSYehuda Sadeh 
1983dfc5606dSYehuda Sadeh 
1984dfc5606dSYehuda Sadeh /*
1985dfc5606dSYehuda Sadeh   sysfs - snapshots
1986dfc5606dSYehuda Sadeh */
1987dfc5606dSYehuda Sadeh 
1988dfc5606dSYehuda Sadeh static ssize_t rbd_snap_size_show(struct device *dev,
1989dfc5606dSYehuda Sadeh 				  struct device_attribute *attr,
1990dfc5606dSYehuda Sadeh 				  char *buf)
1991dfc5606dSYehuda Sadeh {
1992dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1993dfc5606dSYehuda Sadeh 
19943591538fSJosh Durgin 	return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
1995dfc5606dSYehuda Sadeh }
1996dfc5606dSYehuda Sadeh 
1997dfc5606dSYehuda Sadeh static ssize_t rbd_snap_id_show(struct device *dev,
1998dfc5606dSYehuda Sadeh 				struct device_attribute *attr,
1999dfc5606dSYehuda Sadeh 				char *buf)
2000dfc5606dSYehuda Sadeh {
2001dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2002dfc5606dSYehuda Sadeh 
2003593a9e7bSAlex Elder 	return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
2004dfc5606dSYehuda Sadeh }
2005dfc5606dSYehuda Sadeh 
200634b13184SAlex Elder static ssize_t rbd_snap_features_show(struct device *dev,
200734b13184SAlex Elder 				struct device_attribute *attr,
200834b13184SAlex Elder 				char *buf)
200934b13184SAlex Elder {
201034b13184SAlex Elder 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
201134b13184SAlex Elder 
201234b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
201334b13184SAlex Elder 			(unsigned long long) snap->features);
201434b13184SAlex Elder }
201534b13184SAlex Elder 
2016dfc5606dSYehuda Sadeh static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
2017dfc5606dSYehuda Sadeh static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
201834b13184SAlex Elder static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
2019dfc5606dSYehuda Sadeh 
2020dfc5606dSYehuda Sadeh static struct attribute *rbd_snap_attrs[] = {
2021dfc5606dSYehuda Sadeh 	&dev_attr_snap_size.attr,
2022dfc5606dSYehuda Sadeh 	&dev_attr_snap_id.attr,
202334b13184SAlex Elder 	&dev_attr_snap_features.attr,
2024dfc5606dSYehuda Sadeh 	NULL,
2025dfc5606dSYehuda Sadeh };
2026dfc5606dSYehuda Sadeh 
2027dfc5606dSYehuda Sadeh static struct attribute_group rbd_snap_attr_group = {
2028dfc5606dSYehuda Sadeh 	.attrs = rbd_snap_attrs,
2029dfc5606dSYehuda Sadeh };
2030dfc5606dSYehuda Sadeh 
2031dfc5606dSYehuda Sadeh static void rbd_snap_dev_release(struct device *dev)
2032dfc5606dSYehuda Sadeh {
2033dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2034dfc5606dSYehuda Sadeh 	kfree(snap->name);
2035dfc5606dSYehuda Sadeh 	kfree(snap);
2036dfc5606dSYehuda Sadeh }
2037dfc5606dSYehuda Sadeh 
2038dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_snap_attr_groups[] = {
2039dfc5606dSYehuda Sadeh 	&rbd_snap_attr_group,
2040dfc5606dSYehuda Sadeh 	NULL
2041dfc5606dSYehuda Sadeh };
2042dfc5606dSYehuda Sadeh 
2043dfc5606dSYehuda Sadeh static struct device_type rbd_snap_device_type = {
2044dfc5606dSYehuda Sadeh 	.groups		= rbd_snap_attr_groups,
2045dfc5606dSYehuda Sadeh 	.release	= rbd_snap_dev_release,
2046dfc5606dSYehuda Sadeh };
2047dfc5606dSYehuda Sadeh 
2048304f6808SAlex Elder static bool rbd_snap_registered(struct rbd_snap *snap)
2049304f6808SAlex Elder {
2050304f6808SAlex Elder 	bool ret = snap->dev.type == &rbd_snap_device_type;
2051304f6808SAlex Elder 	bool reg = device_is_registered(&snap->dev);
2052304f6808SAlex Elder 
2053304f6808SAlex Elder 	rbd_assert(!ret ^ reg);
2054304f6808SAlex Elder 
2055304f6808SAlex Elder 	return ret;
2056304f6808SAlex Elder }
2057304f6808SAlex Elder 
205814e7085dSAlex Elder static void __rbd_remove_snap_dev(struct rbd_snap *snap)
2059dfc5606dSYehuda Sadeh {
2060dfc5606dSYehuda Sadeh 	list_del(&snap->node);
2061304f6808SAlex Elder 	if (device_is_registered(&snap->dev))
2062dfc5606dSYehuda Sadeh 		device_unregister(&snap->dev);
2063dfc5606dSYehuda Sadeh }
2064dfc5606dSYehuda Sadeh 
206514e7085dSAlex Elder static int rbd_register_snap_dev(struct rbd_snap *snap,
2066dfc5606dSYehuda Sadeh 				  struct device *parent)
2067dfc5606dSYehuda Sadeh {
2068dfc5606dSYehuda Sadeh 	struct device *dev = &snap->dev;
2069dfc5606dSYehuda Sadeh 	int ret;
2070dfc5606dSYehuda Sadeh 
2071dfc5606dSYehuda Sadeh 	dev->type = &rbd_snap_device_type;
2072dfc5606dSYehuda Sadeh 	dev->parent = parent;
2073dfc5606dSYehuda Sadeh 	dev->release = rbd_snap_dev_release;
2074dfc5606dSYehuda Sadeh 	dev_set_name(dev, "snap_%s", snap->name);
2075304f6808SAlex Elder 	dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2076304f6808SAlex Elder 
2077dfc5606dSYehuda Sadeh 	ret = device_register(dev);
2078dfc5606dSYehuda Sadeh 
2079dfc5606dSYehuda Sadeh 	return ret;
2080dfc5606dSYehuda Sadeh }
2081dfc5606dSYehuda Sadeh 
20824e891e0aSAlex Elder static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
2083c8d18425SAlex Elder 						const char *snap_name,
208434b13184SAlex Elder 						u64 snap_id, u64 snap_size,
208534b13184SAlex Elder 						u64 snap_features)
2086dfc5606dSYehuda Sadeh {
20874e891e0aSAlex Elder 	struct rbd_snap *snap;
2088dfc5606dSYehuda Sadeh 	int ret;
20894e891e0aSAlex Elder 
20904e891e0aSAlex Elder 	snap = kzalloc(sizeof (*snap), GFP_KERNEL);
2091dfc5606dSYehuda Sadeh 	if (!snap)
20924e891e0aSAlex Elder 		return ERR_PTR(-ENOMEM);
20934e891e0aSAlex Elder 
20944e891e0aSAlex Elder 	ret = -ENOMEM;
2095c8d18425SAlex Elder 	snap->name = kstrdup(snap_name, GFP_KERNEL);
20964e891e0aSAlex Elder 	if (!snap->name)
20974e891e0aSAlex Elder 		goto err;
20984e891e0aSAlex Elder 
2099c8d18425SAlex Elder 	snap->id = snap_id;
2100c8d18425SAlex Elder 	snap->size = snap_size;
210134b13184SAlex Elder 	snap->features = snap_features;
21024e891e0aSAlex Elder 
21034e891e0aSAlex Elder 	return snap;
21044e891e0aSAlex Elder 
2105dfc5606dSYehuda Sadeh err:
2106dfc5606dSYehuda Sadeh 	kfree(snap->name);
2107dfc5606dSYehuda Sadeh 	kfree(snap);
21084e891e0aSAlex Elder 
21094e891e0aSAlex Elder 	return ERR_PTR(ret);
2110dfc5606dSYehuda Sadeh }
2111dfc5606dSYehuda Sadeh 
2112cd892126SAlex Elder static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2113cd892126SAlex Elder 		u64 *snap_size, u64 *snap_features)
2114cd892126SAlex Elder {
2115cd892126SAlex Elder 	char *snap_name;
2116cd892126SAlex Elder 
2117cd892126SAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2118cd892126SAlex Elder 
2119cd892126SAlex Elder 	*snap_size = rbd_dev->header.snap_sizes[which];
2120cd892126SAlex Elder 	*snap_features = 0;	/* No features for v1 */
2121cd892126SAlex Elder 
2122cd892126SAlex Elder 	/* Skip over names until we find the one we are looking for */
2123cd892126SAlex Elder 
2124cd892126SAlex Elder 	snap_name = rbd_dev->header.snap_names;
2125cd892126SAlex Elder 	while (which--)
2126cd892126SAlex Elder 		snap_name += strlen(snap_name) + 1;
2127cd892126SAlex Elder 
2128cd892126SAlex Elder 	return snap_name;
2129cd892126SAlex Elder }
2130cd892126SAlex Elder 
2131dfc5606dSYehuda Sadeh /*
21329d475de5SAlex Elder  * Get the size and object order for an image snapshot, or if
21339d475de5SAlex Elder  * snap_id is CEPH_NOSNAP, gets this information for the base
21349d475de5SAlex Elder  * image.
21359d475de5SAlex Elder  */
21369d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
21379d475de5SAlex Elder 				u8 *order, u64 *snap_size)
21389d475de5SAlex Elder {
21399d475de5SAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
21409d475de5SAlex Elder 	int ret;
21419d475de5SAlex Elder 	struct {
21429d475de5SAlex Elder 		u8 order;
21439d475de5SAlex Elder 		__le64 size;
21449d475de5SAlex Elder 	} __attribute__ ((packed)) size_buf = { 0 };
21459d475de5SAlex Elder 
21469d475de5SAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
21479d475de5SAlex Elder 				"rbd", "get_size",
21489d475de5SAlex Elder 				(char *) &snapid, sizeof (snapid),
21499d475de5SAlex Elder 				(char *) &size_buf, sizeof (size_buf),
21509d475de5SAlex Elder 				CEPH_OSD_FLAG_READ, NULL);
21519d475de5SAlex Elder 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
21529d475de5SAlex Elder 	if (ret < 0)
21539d475de5SAlex Elder 		return ret;
21549d475de5SAlex Elder 
21559d475de5SAlex Elder 	*order = size_buf.order;
21569d475de5SAlex Elder 	*snap_size = le64_to_cpu(size_buf.size);
21579d475de5SAlex Elder 
21589d475de5SAlex Elder 	dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
21599d475de5SAlex Elder 		(unsigned long long) snap_id, (unsigned int) *order,
21609d475de5SAlex Elder 		(unsigned long long) *snap_size);
21619d475de5SAlex Elder 
21629d475de5SAlex Elder 	return 0;
21639d475de5SAlex Elder }
21649d475de5SAlex Elder 
21659d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
21669d475de5SAlex Elder {
21679d475de5SAlex Elder 	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
21689d475de5SAlex Elder 					&rbd_dev->header.obj_order,
21699d475de5SAlex Elder 					&rbd_dev->header.image_size);
21709d475de5SAlex Elder }
21719d475de5SAlex Elder 
21721e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
21731e130199SAlex Elder {
21741e130199SAlex Elder 	void *reply_buf;
21751e130199SAlex Elder 	int ret;
21761e130199SAlex Elder 	void *p;
21771e130199SAlex Elder 
21781e130199SAlex Elder 	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
21791e130199SAlex Elder 	if (!reply_buf)
21801e130199SAlex Elder 		return -ENOMEM;
21811e130199SAlex Elder 
21821e130199SAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
21831e130199SAlex Elder 				"rbd", "get_object_prefix",
21841e130199SAlex Elder 				NULL, 0,
21851e130199SAlex Elder 				reply_buf, RBD_OBJ_PREFIX_LEN_MAX,
21861e130199SAlex Elder 				CEPH_OSD_FLAG_READ, NULL);
21871e130199SAlex Elder 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
21881e130199SAlex Elder 	if (ret < 0)
21891e130199SAlex Elder 		goto out;
21901e130199SAlex Elder 
21911e130199SAlex Elder 	p = reply_buf;
21921e130199SAlex Elder 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
21931e130199SAlex Elder 						p + RBD_OBJ_PREFIX_LEN_MAX,
21941e130199SAlex Elder 						NULL, GFP_NOIO);
21951e130199SAlex Elder 
21961e130199SAlex Elder 	if (IS_ERR(rbd_dev->header.object_prefix)) {
21971e130199SAlex Elder 		ret = PTR_ERR(rbd_dev->header.object_prefix);
21981e130199SAlex Elder 		rbd_dev->header.object_prefix = NULL;
21991e130199SAlex Elder 	} else {
22001e130199SAlex Elder 		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
22011e130199SAlex Elder 	}
22021e130199SAlex Elder 
22031e130199SAlex Elder out:
22041e130199SAlex Elder 	kfree(reply_buf);
22051e130199SAlex Elder 
22061e130199SAlex Elder 	return ret;
22071e130199SAlex Elder }
22081e130199SAlex Elder 
22099d475de5SAlex Elder /*
221035938150SAlex Elder  * Scan the rbd device's current snapshot list and compare it to the
221135938150SAlex Elder  * newly-received snapshot context.  Remove any existing snapshots
221235938150SAlex Elder  * not present in the new snapshot context.  Add a new snapshot for
221335938150SAlex Elder  * any snaphots in the snapshot context not in the current list.
221435938150SAlex Elder  * And verify there are no changes to snapshots we already know
221535938150SAlex Elder  * about.
221635938150SAlex Elder  *
221735938150SAlex Elder  * Assumes the snapshots in the snapshot context are sorted by
221835938150SAlex Elder  * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
221935938150SAlex Elder  * are also maintained in that order.)
2220dfc5606dSYehuda Sadeh  */
2221304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
2222dfc5606dSYehuda Sadeh {
222335938150SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
222435938150SAlex Elder 	const u32 snap_count = snapc->num_snaps;
222535938150SAlex Elder 	struct list_head *head = &rbd_dev->snaps;
222635938150SAlex Elder 	struct list_head *links = head->next;
222735938150SAlex Elder 	u32 index = 0;
2228dfc5606dSYehuda Sadeh 
22299fcbb800SAlex Elder 	dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
223035938150SAlex Elder 	while (index < snap_count || links != head) {
223135938150SAlex Elder 		u64 snap_id;
223235938150SAlex Elder 		struct rbd_snap *snap;
2233cd892126SAlex Elder 		char *snap_name;
2234cd892126SAlex Elder 		u64 snap_size = 0;
2235cd892126SAlex Elder 		u64 snap_features = 0;
2236dfc5606dSYehuda Sadeh 
223735938150SAlex Elder 		snap_id = index < snap_count ? snapc->snaps[index]
223835938150SAlex Elder 					     : CEPH_NOSNAP;
223935938150SAlex Elder 		snap = links != head ? list_entry(links, struct rbd_snap, node)
224035938150SAlex Elder 				     : NULL;
2241aafb230eSAlex Elder 		rbd_assert(!snap || snap->id != CEPH_NOSNAP);
2242dfc5606dSYehuda Sadeh 
224335938150SAlex Elder 		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
224435938150SAlex Elder 			struct list_head *next = links->next;
2245dfc5606dSYehuda Sadeh 
224635938150SAlex Elder 			/* Existing snapshot not in the new snap context */
2247dfc5606dSYehuda Sadeh 
2248f84344f3SAlex Elder 			if (rbd_dev->mapping.snap_id == snap->id)
2249f84344f3SAlex Elder 				rbd_dev->mapping.snap_exists = false;
225035938150SAlex Elder 			__rbd_remove_snap_dev(snap);
22519fcbb800SAlex Elder 			dout("%ssnap id %llu has been removed\n",
2252f84344f3SAlex Elder 				rbd_dev->mapping.snap_id == snap->id ?
2253f84344f3SAlex Elder 								"mapped " : "",
22549fcbb800SAlex Elder 				(unsigned long long) snap->id);
2255dfc5606dSYehuda Sadeh 
225635938150SAlex Elder 			/* Done with this list entry; advance */
225735938150SAlex Elder 
225835938150SAlex Elder 			links = next;
225935938150SAlex Elder 			continue;
2260dfc5606dSYehuda Sadeh 		}
226135938150SAlex Elder 
2262cd892126SAlex Elder 		snap_name = rbd_dev_v1_snap_info(rbd_dev, index,
2263cd892126SAlex Elder 						&snap_size, &snap_features);
2264cd892126SAlex Elder 		if (IS_ERR(snap_name))
2265cd892126SAlex Elder 			return PTR_ERR(snap_name);
2266cd892126SAlex Elder 
22679fcbb800SAlex Elder 		dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
22689fcbb800SAlex Elder 			(unsigned long long) snap_id);
226935938150SAlex Elder 		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
227035938150SAlex Elder 			struct rbd_snap *new_snap;
227135938150SAlex Elder 
227235938150SAlex Elder 			/* We haven't seen this snapshot before */
227335938150SAlex Elder 
2274c8d18425SAlex Elder 			new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
2275cd892126SAlex Elder 					snap_id, snap_size, snap_features);
22769fcbb800SAlex Elder 			if (IS_ERR(new_snap)) {
22779fcbb800SAlex Elder 				int err = PTR_ERR(new_snap);
22789fcbb800SAlex Elder 
22799fcbb800SAlex Elder 				dout("  failed to add dev, error %d\n", err);
22809fcbb800SAlex Elder 
22819fcbb800SAlex Elder 				return err;
22829fcbb800SAlex Elder 			}
228335938150SAlex Elder 
228435938150SAlex Elder 			/* New goes before existing, or at end of list */
228535938150SAlex Elder 
22869fcbb800SAlex Elder 			dout("  added dev%s\n", snap ? "" : " at end\n");
228735938150SAlex Elder 			if (snap)
228835938150SAlex Elder 				list_add_tail(&new_snap->node, &snap->node);
228935938150SAlex Elder 			else
2290523f3258SAlex Elder 				list_add_tail(&new_snap->node, head);
229135938150SAlex Elder 		} else {
229235938150SAlex Elder 			/* Already have this one */
229335938150SAlex Elder 
22949fcbb800SAlex Elder 			dout("  already present\n");
22959fcbb800SAlex Elder 
2296cd892126SAlex Elder 			rbd_assert(snap->size == snap_size);
2297aafb230eSAlex Elder 			rbd_assert(!strcmp(snap->name, snap_name));
2298cd892126SAlex Elder 			rbd_assert(snap->features == snap_features);
229935938150SAlex Elder 
230035938150SAlex Elder 			/* Done with this list entry; advance */
230135938150SAlex Elder 
230235938150SAlex Elder 			links = links->next;
2303dfc5606dSYehuda Sadeh 		}
230435938150SAlex Elder 
230535938150SAlex Elder 		/* Advance to the next entry in the snapshot context */
230635938150SAlex Elder 
230735938150SAlex Elder 		index++;
2308dfc5606dSYehuda Sadeh 	}
23099fcbb800SAlex Elder 	dout("%s: done\n", __func__);
2310dfc5606dSYehuda Sadeh 
2311dfc5606dSYehuda Sadeh 	return 0;
2312dfc5606dSYehuda Sadeh }
2313dfc5606dSYehuda Sadeh 
2314304f6808SAlex Elder /*
2315304f6808SAlex Elder  * Scan the list of snapshots and register the devices for any that
2316304f6808SAlex Elder  * have not already been registered.
2317304f6808SAlex Elder  */
2318304f6808SAlex Elder static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
2319304f6808SAlex Elder {
2320304f6808SAlex Elder 	struct rbd_snap *snap;
2321304f6808SAlex Elder 	int ret = 0;
2322304f6808SAlex Elder 
2323304f6808SAlex Elder 	dout("%s called\n", __func__);
232486ff77bbSAlex Elder 	if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
232586ff77bbSAlex Elder 		return -EIO;
2326304f6808SAlex Elder 
2327304f6808SAlex Elder 	list_for_each_entry(snap, &rbd_dev->snaps, node) {
2328304f6808SAlex Elder 		if (!rbd_snap_registered(snap)) {
2329304f6808SAlex Elder 			ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2330304f6808SAlex Elder 			if (ret < 0)
2331304f6808SAlex Elder 				break;
2332304f6808SAlex Elder 		}
2333304f6808SAlex Elder 	}
2334304f6808SAlex Elder 	dout("%s: returning %d\n", __func__, ret);
2335304f6808SAlex Elder 
2336304f6808SAlex Elder 	return ret;
2337304f6808SAlex Elder }
2338304f6808SAlex Elder 
2339dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2340dfc5606dSYehuda Sadeh {
2341dfc5606dSYehuda Sadeh 	struct device *dev;
2342cd789ab9SAlex Elder 	int ret;
2343dfc5606dSYehuda Sadeh 
2344dfc5606dSYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2345dfc5606dSYehuda Sadeh 
2346cd789ab9SAlex Elder 	dev = &rbd_dev->dev;
2347dfc5606dSYehuda Sadeh 	dev->bus = &rbd_bus_type;
2348dfc5606dSYehuda Sadeh 	dev->type = &rbd_device_type;
2349dfc5606dSYehuda Sadeh 	dev->parent = &rbd_root_dev;
2350dfc5606dSYehuda Sadeh 	dev->release = rbd_dev_release;
2351de71a297SAlex Elder 	dev_set_name(dev, "%d", rbd_dev->dev_id);
2352dfc5606dSYehuda Sadeh 	ret = device_register(dev);
2353dfc5606dSYehuda Sadeh 
2354dfc5606dSYehuda Sadeh 	mutex_unlock(&ctl_mutex);
2355cd789ab9SAlex Elder 
2356dfc5606dSYehuda Sadeh 	return ret;
2357602adf40SYehuda Sadeh }
2358602adf40SYehuda Sadeh 
2359dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
2360dfc5606dSYehuda Sadeh {
2361dfc5606dSYehuda Sadeh 	device_unregister(&rbd_dev->dev);
2362dfc5606dSYehuda Sadeh }
2363dfc5606dSYehuda Sadeh 
236459c2be1eSYehuda Sadeh static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
236559c2be1eSYehuda Sadeh {
236659c2be1eSYehuda Sadeh 	int ret, rc;
236759c2be1eSYehuda Sadeh 
236859c2be1eSYehuda Sadeh 	do {
23690e6f322dSAlex Elder 		ret = rbd_req_sync_watch(rbd_dev);
237059c2be1eSYehuda Sadeh 		if (ret == -ERANGE) {
23711fe5e993SAlex Elder 			rc = rbd_refresh_header(rbd_dev, NULL);
237259c2be1eSYehuda Sadeh 			if (rc < 0)
237359c2be1eSYehuda Sadeh 				return rc;
237459c2be1eSYehuda Sadeh 		}
237559c2be1eSYehuda Sadeh 	} while (ret == -ERANGE);
237659c2be1eSYehuda Sadeh 
237759c2be1eSYehuda Sadeh 	return ret;
237859c2be1eSYehuda Sadeh }
237959c2be1eSYehuda Sadeh 
2380e2839308SAlex Elder static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
23811ddbe94eSAlex Elder 
23821ddbe94eSAlex Elder /*
2383499afd5bSAlex Elder  * Get a unique rbd identifier for the given new rbd_dev, and add
2384499afd5bSAlex Elder  * the rbd_dev to the global list.  The minimum rbd id is 1.
23851ddbe94eSAlex Elder  */
2386e2839308SAlex Elder static void rbd_dev_id_get(struct rbd_device *rbd_dev)
2387b7f23c36SAlex Elder {
2388e2839308SAlex Elder 	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
2389499afd5bSAlex Elder 
2390499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
2391499afd5bSAlex Elder 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
2392499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
2393e2839308SAlex Elder 	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
2394e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
2395b7f23c36SAlex Elder }
2396b7f23c36SAlex Elder 
23971ddbe94eSAlex Elder /*
2398499afd5bSAlex Elder  * Remove an rbd_dev from the global list, and record that its
2399499afd5bSAlex Elder  * identifier is no longer in use.
24001ddbe94eSAlex Elder  */
2401e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev)
24021ddbe94eSAlex Elder {
2403d184f6bfSAlex Elder 	struct list_head *tmp;
2404de71a297SAlex Elder 	int rbd_id = rbd_dev->dev_id;
2405d184f6bfSAlex Elder 	int max_id;
2406d184f6bfSAlex Elder 
2407aafb230eSAlex Elder 	rbd_assert(rbd_id > 0);
2408499afd5bSAlex Elder 
2409e2839308SAlex Elder 	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
2410e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
2411499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
2412499afd5bSAlex Elder 	list_del_init(&rbd_dev->node);
2413d184f6bfSAlex Elder 
2414d184f6bfSAlex Elder 	/*
2415d184f6bfSAlex Elder 	 * If the id being "put" is not the current maximum, there
2416d184f6bfSAlex Elder 	 * is nothing special we need to do.
2417d184f6bfSAlex Elder 	 */
2418e2839308SAlex Elder 	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
2419d184f6bfSAlex Elder 		spin_unlock(&rbd_dev_list_lock);
2420d184f6bfSAlex Elder 		return;
2421d184f6bfSAlex Elder 	}
2422d184f6bfSAlex Elder 
2423d184f6bfSAlex Elder 	/*
2424d184f6bfSAlex Elder 	 * We need to update the current maximum id.  Search the
2425d184f6bfSAlex Elder 	 * list to find out what it is.  We're more likely to find
2426d184f6bfSAlex Elder 	 * the maximum at the end, so search the list backward.
2427d184f6bfSAlex Elder 	 */
2428d184f6bfSAlex Elder 	max_id = 0;
2429d184f6bfSAlex Elder 	list_for_each_prev(tmp, &rbd_dev_list) {
2430d184f6bfSAlex Elder 		struct rbd_device *rbd_dev;
2431d184f6bfSAlex Elder 
2432d184f6bfSAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
2433d184f6bfSAlex Elder 		if (rbd_id > max_id)
2434d184f6bfSAlex Elder 			max_id = rbd_id;
2435d184f6bfSAlex Elder 	}
2436499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
24371ddbe94eSAlex Elder 
24381ddbe94eSAlex Elder 	/*
2439e2839308SAlex Elder 	 * The max id could have been updated by rbd_dev_id_get(), in
2440d184f6bfSAlex Elder 	 * which case it now accurately reflects the new maximum.
2441d184f6bfSAlex Elder 	 * Be careful not to overwrite the maximum value in that
2442d184f6bfSAlex Elder 	 * case.
24431ddbe94eSAlex Elder 	 */
2444e2839308SAlex Elder 	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
2445e2839308SAlex Elder 	dout("  max dev id has been reset\n");
2446b7f23c36SAlex Elder }
2447b7f23c36SAlex Elder 
2448a725f65eSAlex Elder /*
2449e28fff26SAlex Elder  * Skips over white space at *buf, and updates *buf to point to the
2450e28fff26SAlex Elder  * first found non-space character (if any). Returns the length of
2451593a9e7bSAlex Elder  * the token (string of non-white space characters) found.  Note
2452593a9e7bSAlex Elder  * that *buf must be terminated with '\0'.
2453e28fff26SAlex Elder  */
static inline size_t next_token(const char **buf)
{
	/*
	 * The set of characters for which isspace() is nonzero in
	 * the "C" and "POSIX" locales.
	 */
	const char *spaces = " \f\n\r\t\v";
	const char *start;

	/* Skip leading white space to reach the start of the token */
	start = *buf + strspn(*buf, spaces);
	*buf = start;

	/* The token extends up to the next white space or the '\0' */
	return strcspn(start, spaces);
}
2466e28fff26SAlex Elder 
2467e28fff26SAlex Elder /*
2468e28fff26SAlex Elder  * Finds the next token in *buf, and if the provided token buffer is
2469e28fff26SAlex Elder  * big enough, copies the found token into it.  The result, if
2470593a9e7bSAlex Elder  * copied, is guaranteed to be terminated with '\0'.  Note that *buf
2471593a9e7bSAlex Elder  * must be terminated with '\0' on entry.
2472e28fff26SAlex Elder  *
2473e28fff26SAlex Elder  * Returns the length of the token found (not including the '\0').
2474e28fff26SAlex Elder  * Return value will be 0 if no token is found, and it will be >=
2475e28fff26SAlex Elder  * token_size if the token would not fit.
2476e28fff26SAlex Elder  *
2477593a9e7bSAlex Elder  * The *buf pointer will be updated to point beyond the end of the
2478e28fff26SAlex Elder  * found token.  Note that this occurs even if the token buffer is
2479e28fff26SAlex Elder  * too small to hold it.
2480e28fff26SAlex Elder  */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	const char *start;
	size_t len;

	len = next_token(buf);
	start = *buf;

	/* The caller's position advances whether or not the token fits */
	*buf = start + len;

	/* Copy only if there is room for the token plus its '\0' */
	if (len < token_size) {
		memcpy(token, start, len);
		token[len] = '\0';
	}

	return len;
}
2496e28fff26SAlex Elder 
2497e28fff26SAlex Elder /*
2498ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
2499ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
2500ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
2501ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
2502ea3352f4SAlex Elder  *
2503ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
2504ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
2505ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
2506ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
2507ea3352f4SAlex Elder  *
2508ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
2509ea3352f4SAlex Elder  * the end of the found token.
2510ea3352f4SAlex Elder  *
2511ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
2512ea3352f4SAlex Elder  */
2513ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
2514ea3352f4SAlex Elder {
2515ea3352f4SAlex Elder 	char *dup;
2516ea3352f4SAlex Elder 	size_t len;
2517ea3352f4SAlex Elder 
2518ea3352f4SAlex Elder 	len = next_token(buf);
2519ea3352f4SAlex Elder 	dup = kmalloc(len + 1, GFP_KERNEL);
2520ea3352f4SAlex Elder 	if (!dup)
2521ea3352f4SAlex Elder 		return NULL;
2522ea3352f4SAlex Elder 
2523ea3352f4SAlex Elder 	memcpy(dup, *buf, len);
2524ea3352f4SAlex Elder 	*(dup + len) = '\0';
2525ea3352f4SAlex Elder 	*buf += len;
2526ea3352f4SAlex Elder 
2527ea3352f4SAlex Elder 	if (lenp)
2528ea3352f4SAlex Elder 		*lenp = len;
2529ea3352f4SAlex Elder 
2530ea3352f4SAlex Elder 	return dup;
2531ea3352f4SAlex Elder }
2532ea3352f4SAlex Elder 
2533ea3352f4SAlex Elder /*
25343feeb894SAlex Elder  * This fills in the pool_name, image_name, image_name_len, rbd_dev,
25353feeb894SAlex Elder  * rbd_md_name, and name fields of the given rbd_dev, based on the
25363feeb894SAlex Elder  * list of monitor addresses and other options provided via
25373feeb894SAlex Elder  * /sys/bus/rbd/add.  Returns a pointer to a dynamically-allocated
25383feeb894SAlex Elder  * copy of the snapshot name to map if successful, or a
25393feeb894SAlex Elder  * pointer-coded error otherwise.
2540d22f76e7SAlex Elder  *
2541d22f76e7SAlex Elder  * Note: rbd_dev is assumed to have been initially zero-filled.
2542a725f65eSAlex Elder  */
25433feeb894SAlex Elder static char *rbd_add_parse_args(struct rbd_device *rbd_dev,
2544a725f65eSAlex Elder 				const char *buf,
25457ef3214aSAlex Elder 				const char **mon_addrs,
25465214ecc4SAlex Elder 				size_t *mon_addrs_size,
2547e28fff26SAlex Elder 				char *options,
2548e28fff26SAlex Elder 				size_t options_size)
2549a725f65eSAlex Elder {
2550e28fff26SAlex Elder 	size_t len;
25513feeb894SAlex Elder 	char *err_ptr = ERR_PTR(-EINVAL);
25523feeb894SAlex Elder 	char *snap_name;
2553e28fff26SAlex Elder 
2554e28fff26SAlex Elder 	/* The first four tokens are required */
2555e28fff26SAlex Elder 
25567ef3214aSAlex Elder 	len = next_token(&buf);
25577ef3214aSAlex Elder 	if (!len)
25583feeb894SAlex Elder 		return err_ptr;
25595214ecc4SAlex Elder 	*mon_addrs_size = len + 1;
25607ef3214aSAlex Elder 	*mon_addrs = buf;
25617ef3214aSAlex Elder 
25627ef3214aSAlex Elder 	buf += len;
2563a725f65eSAlex Elder 
2564e28fff26SAlex Elder 	len = copy_token(&buf, options, options_size);
2565e28fff26SAlex Elder 	if (!len || len >= options_size)
25663feeb894SAlex Elder 		return err_ptr;
2567a725f65eSAlex Elder 
25683feeb894SAlex Elder 	err_ptr = ERR_PTR(-ENOMEM);
2569d22f76e7SAlex Elder 	rbd_dev->pool_name = dup_token(&buf, NULL);
2570d22f76e7SAlex Elder 	if (!rbd_dev->pool_name)
2571d22f76e7SAlex Elder 		goto out_err;
2572e28fff26SAlex Elder 
25730bed54dcSAlex Elder 	rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
25740bed54dcSAlex Elder 	if (!rbd_dev->image_name)
2575bf3e5ae1SAlex Elder 		goto out_err;
2576e28fff26SAlex Elder 
25773feeb894SAlex Elder 	/* Snapshot name is optional */
25783feeb894SAlex Elder 	len = next_token(&buf);
2579820a5f3eSAlex Elder 	if (!len) {
25803feeb894SAlex Elder 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
25813feeb894SAlex Elder 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
2582849b4260SAlex Elder 	}
25833feeb894SAlex Elder 	snap_name = kmalloc(len + 1, GFP_KERNEL);
25843feeb894SAlex Elder 	if (!snap_name)
25853feeb894SAlex Elder 		goto out_err;
25863feeb894SAlex Elder 	memcpy(snap_name, buf, len);
25873feeb894SAlex Elder 	*(snap_name + len) = '\0';
2588e28fff26SAlex Elder 
25893feeb894SAlex Elder dout("    SNAP_NAME is <%s>, len is %zd\n", snap_name, len);
25903feeb894SAlex Elder 
25913feeb894SAlex Elder 	return snap_name;
2592d22f76e7SAlex Elder 
2593d22f76e7SAlex Elder out_err:
25940bed54dcSAlex Elder 	kfree(rbd_dev->image_name);
2595d78fd7aeSAlex Elder 	rbd_dev->image_name = NULL;
2596d78fd7aeSAlex Elder 	rbd_dev->image_name_len = 0;
2597d22f76e7SAlex Elder 	kfree(rbd_dev->pool_name);
2598d22f76e7SAlex Elder 	rbd_dev->pool_name = NULL;
2599d22f76e7SAlex Elder 
26003feeb894SAlex Elder 	return err_ptr;
2601a725f65eSAlex Elder }
2602a725f65eSAlex Elder 
2603589d30e0SAlex Elder /*
2604589d30e0SAlex Elder  * An rbd format 2 image has a unique identifier, distinct from the
2605589d30e0SAlex Elder  * name given to it by the user.  Internally, that identifier is
2606589d30e0SAlex Elder  * what's used to specify the names of objects related to the image.
2607589d30e0SAlex Elder  *
2608589d30e0SAlex Elder  * A special "rbd id" object is used to map an rbd image name to its
2609589d30e0SAlex Elder  * id.  If that object doesn't exist, then there is no v2 rbd image
2610589d30e0SAlex Elder  * with the supplied name.
2611589d30e0SAlex Elder  *
2612589d30e0SAlex Elder  * This function will record the given rbd_dev's image_id field if
2613589d30e0SAlex Elder  * it can be determined, and in that case will return 0.  If any
2614589d30e0SAlex Elder  * errors occur a negative errno will be returned and the rbd_dev's
2615589d30e0SAlex Elder  * image_id field will be unchanged (and should be NULL).
2616589d30e0SAlex Elder  */
2617589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev)
2618589d30e0SAlex Elder {
2619589d30e0SAlex Elder 	int ret;
2620589d30e0SAlex Elder 	size_t size;
2621589d30e0SAlex Elder 	char *object_name;
2622589d30e0SAlex Elder 	void *response;
2623589d30e0SAlex Elder 	void *p;
2624589d30e0SAlex Elder 
2625589d30e0SAlex Elder 	/*
2626589d30e0SAlex Elder 	 * First, see if the format 2 image id file exists, and if
2627589d30e0SAlex Elder 	 * so, get the image's persistent id from it.
2628589d30e0SAlex Elder 	 */
2629589d30e0SAlex Elder 	size = sizeof (RBD_ID_PREFIX) + rbd_dev->image_name_len;
2630589d30e0SAlex Elder 	object_name = kmalloc(size, GFP_NOIO);
2631589d30e0SAlex Elder 	if (!object_name)
2632589d30e0SAlex Elder 		return -ENOMEM;
2633589d30e0SAlex Elder 	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->image_name);
2634589d30e0SAlex Elder 	dout("rbd id object name is %s\n", object_name);
2635589d30e0SAlex Elder 
2636589d30e0SAlex Elder 	/* Response will be an encoded string, which includes a length */
2637589d30e0SAlex Elder 
2638589d30e0SAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
2639589d30e0SAlex Elder 	response = kzalloc(size, GFP_NOIO);
2640589d30e0SAlex Elder 	if (!response) {
2641589d30e0SAlex Elder 		ret = -ENOMEM;
2642589d30e0SAlex Elder 		goto out;
2643589d30e0SAlex Elder 	}
2644589d30e0SAlex Elder 
2645589d30e0SAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, object_name,
2646589d30e0SAlex Elder 				"rbd", "get_id",
2647589d30e0SAlex Elder 				NULL, 0,
2648589d30e0SAlex Elder 				response, RBD_IMAGE_ID_LEN_MAX,
2649589d30e0SAlex Elder 				CEPH_OSD_FLAG_READ, NULL);
2650589d30e0SAlex Elder 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2651589d30e0SAlex Elder 	if (ret < 0)
2652589d30e0SAlex Elder 		goto out;
2653589d30e0SAlex Elder 
2654589d30e0SAlex Elder 	p = response;
2655589d30e0SAlex Elder 	rbd_dev->image_id = ceph_extract_encoded_string(&p,
2656589d30e0SAlex Elder 						p + RBD_IMAGE_ID_LEN_MAX,
2657589d30e0SAlex Elder 						&rbd_dev->image_id_len,
2658589d30e0SAlex Elder 						GFP_NOIO);
2659589d30e0SAlex Elder 	if (IS_ERR(rbd_dev->image_id)) {
2660589d30e0SAlex Elder 		ret = PTR_ERR(rbd_dev->image_id);
2661589d30e0SAlex Elder 		rbd_dev->image_id = NULL;
2662589d30e0SAlex Elder 	} else {
2663589d30e0SAlex Elder 		dout("image_id is %s\n", rbd_dev->image_id);
2664589d30e0SAlex Elder 	}
2665589d30e0SAlex Elder out:
2666589d30e0SAlex Elder 	kfree(response);
2667589d30e0SAlex Elder 	kfree(object_name);
2668589d30e0SAlex Elder 
2669589d30e0SAlex Elder 	return ret;
2670589d30e0SAlex Elder }
2671589d30e0SAlex Elder 
2672a30b71b9SAlex Elder static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
2673a30b71b9SAlex Elder {
2674a30b71b9SAlex Elder 	int ret;
2675a30b71b9SAlex Elder 	size_t size;
2676a30b71b9SAlex Elder 
2677a30b71b9SAlex Elder 	/* Version 1 images have no id; empty string is used */
2678a30b71b9SAlex Elder 
2679a30b71b9SAlex Elder 	rbd_dev->image_id = kstrdup("", GFP_KERNEL);
2680a30b71b9SAlex Elder 	if (!rbd_dev->image_id)
2681a30b71b9SAlex Elder 		return -ENOMEM;
2682a30b71b9SAlex Elder 	rbd_dev->image_id_len = 0;
2683a30b71b9SAlex Elder 
2684a30b71b9SAlex Elder 	/* Record the header object name for this rbd image. */
2685a30b71b9SAlex Elder 
2686a30b71b9SAlex Elder 	size = rbd_dev->image_name_len + sizeof (RBD_SUFFIX);
2687a30b71b9SAlex Elder 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
2688a30b71b9SAlex Elder 	if (!rbd_dev->header_name) {
2689a30b71b9SAlex Elder 		ret = -ENOMEM;
2690a30b71b9SAlex Elder 		goto out_err;
2691a30b71b9SAlex Elder 	}
2692a30b71b9SAlex Elder 	sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);
2693a30b71b9SAlex Elder 
2694a30b71b9SAlex Elder 	/* Populate rbd image metadata */
2695a30b71b9SAlex Elder 
2696a30b71b9SAlex Elder 	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
2697a30b71b9SAlex Elder 	if (ret < 0)
2698a30b71b9SAlex Elder 		goto out_err;
2699a30b71b9SAlex Elder 	rbd_dev->image_format = 1;
2700a30b71b9SAlex Elder 
2701a30b71b9SAlex Elder 	dout("discovered version 1 image, header name is %s\n",
2702a30b71b9SAlex Elder 		rbd_dev->header_name);
2703a30b71b9SAlex Elder 
2704a30b71b9SAlex Elder 	return 0;
2705a30b71b9SAlex Elder 
2706a30b71b9SAlex Elder out_err:
2707a30b71b9SAlex Elder 	kfree(rbd_dev->header_name);
2708a30b71b9SAlex Elder 	rbd_dev->header_name = NULL;
2709a30b71b9SAlex Elder 	kfree(rbd_dev->image_id);
2710a30b71b9SAlex Elder 	rbd_dev->image_id = NULL;
2711a30b71b9SAlex Elder 
2712a30b71b9SAlex Elder 	return ret;
2713a30b71b9SAlex Elder }
2714a30b71b9SAlex Elder 
2715a30b71b9SAlex Elder static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
2716a30b71b9SAlex Elder {
2717a30b71b9SAlex Elder 	size_t size;
27189d475de5SAlex Elder 	int ret;
2719a30b71b9SAlex Elder 
2720a30b71b9SAlex Elder 	/*
2721a30b71b9SAlex Elder 	 * Image id was filled in by the caller.  Record the header
2722a30b71b9SAlex Elder 	 * object name for this rbd image.
2723a30b71b9SAlex Elder 	 */
2724a30b71b9SAlex Elder 	size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->image_id_len;
2725a30b71b9SAlex Elder 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
2726a30b71b9SAlex Elder 	if (!rbd_dev->header_name)
2727a30b71b9SAlex Elder 		return -ENOMEM;
2728a30b71b9SAlex Elder 	sprintf(rbd_dev->header_name, "%s%s",
2729a30b71b9SAlex Elder 			RBD_HEADER_PREFIX, rbd_dev->image_id);
27309d475de5SAlex Elder 
27319d475de5SAlex Elder 	/* Get the size and object order for the image */
27329d475de5SAlex Elder 
27339d475de5SAlex Elder 	ret = rbd_dev_v2_image_size(rbd_dev);
27349d475de5SAlex Elder 	if (ret < 0)
27359d475de5SAlex Elder 		goto out_err;
27361e130199SAlex Elder 
27371e130199SAlex Elder 	/* Get the object prefix (a.k.a. block_name) for the image */
27381e130199SAlex Elder 
27391e130199SAlex Elder 	ret = rbd_dev_v2_object_prefix(rbd_dev);
27401e130199SAlex Elder 	if (ret < 0)
27411e130199SAlex Elder 		goto out_err;
2742a30b71b9SAlex Elder 	rbd_dev->image_format = 2;
2743a30b71b9SAlex Elder 
2744a30b71b9SAlex Elder 	dout("discovered version 2 image, header name is %s\n",
2745a30b71b9SAlex Elder 		rbd_dev->header_name);
2746a30b71b9SAlex Elder 
2747a30b71b9SAlex Elder 	return -ENOTSUPP;
27489d475de5SAlex Elder out_err:
27499d475de5SAlex Elder 	kfree(rbd_dev->header_name);
27509d475de5SAlex Elder 	rbd_dev->header_name = NULL;
27511e130199SAlex Elder 	kfree(rbd_dev->header.object_prefix);
27521e130199SAlex Elder 	rbd_dev->header.object_prefix = NULL;
27539d475de5SAlex Elder 
27549d475de5SAlex Elder 	return ret;
2755a30b71b9SAlex Elder }
2756a30b71b9SAlex Elder 
/*
 * Probe for the existence of the header object for the given rbd
 * device.  For format 2 images this includes determining the image
 * id.
 *
 * The image id lookup is attempted first.  Any non-zero result from
 * it (not only a missing id object) selects the format 1 probe; a
 * zero result selects the format 2 probe.  The chosen probe's return
 * value is passed back to the caller.
 */
static int rbd_dev_probe(struct rbd_device *rbd_dev)
{
	int ret = rbd_dev_image_id(rbd_dev);

	ret = ret ? rbd_dev_v1_probe(rbd_dev) : rbd_dev_v2_probe(rbd_dev);
	if (ret)
		dout("probe failed, returning %d\n", ret);

	return ret;
}
2781a30b71b9SAlex Elder 
278259c2be1eSYehuda Sadeh static ssize_t rbd_add(struct bus_type *bus,
278359c2be1eSYehuda Sadeh 		       const char *buf,
278459c2be1eSYehuda Sadeh 		       size_t count)
2785602adf40SYehuda Sadeh {
2786cb8627c7SAlex Elder 	char *options;
2787cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
27887ef3214aSAlex Elder 	const char *mon_addrs = NULL;
27897ef3214aSAlex Elder 	size_t mon_addrs_size = 0;
279027cc2594SAlex Elder 	struct ceph_osd_client *osdc;
279127cc2594SAlex Elder 	int rc = -ENOMEM;
27923feeb894SAlex Elder 	char *snap_name;
2793602adf40SYehuda Sadeh 
2794602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
2795602adf40SYehuda Sadeh 		return -ENODEV;
2796602adf40SYehuda Sadeh 
279727cc2594SAlex Elder 	options = kmalloc(count, GFP_KERNEL);
279827cc2594SAlex Elder 	if (!options)
279985ae8926SAlex Elder 		goto err_out_mem;
2800cb8627c7SAlex Elder 	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2801cb8627c7SAlex Elder 	if (!rbd_dev)
280285ae8926SAlex Elder 		goto err_out_mem;
2803602adf40SYehuda Sadeh 
2804602adf40SYehuda Sadeh 	/* static rbd_device initialization */
2805602adf40SYehuda Sadeh 	spin_lock_init(&rbd_dev->lock);
2806602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbd_dev->node);
2807dfc5606dSYehuda Sadeh 	INIT_LIST_HEAD(&rbd_dev->snaps);
2808c666601aSJosh Durgin 	init_rwsem(&rbd_dev->header_rwsem);
2809602adf40SYehuda Sadeh 
2810a725f65eSAlex Elder 	/* parse add command */
28113feeb894SAlex Elder 	snap_name = rbd_add_parse_args(rbd_dev, buf,
28123feeb894SAlex Elder 				&mon_addrs, &mon_addrs_size, options, count);
28133feeb894SAlex Elder 	if (IS_ERR(snap_name)) {
28143feeb894SAlex Elder 		rc = PTR_ERR(snap_name);
281585ae8926SAlex Elder 		goto err_out_mem;
28163feeb894SAlex Elder 	}
2817a725f65eSAlex Elder 
2818f8c38929SAlex Elder 	rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options);
2819f8c38929SAlex Elder 	if (rc < 0)
282085ae8926SAlex Elder 		goto err_out_args;
2821602adf40SYehuda Sadeh 
2822602adf40SYehuda Sadeh 	/* pick the pool */
28231dbb4399SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
2824602adf40SYehuda Sadeh 	rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
2825602adf40SYehuda Sadeh 	if (rc < 0)
2826602adf40SYehuda Sadeh 		goto err_out_client;
28279bb2f334SAlex Elder 	rbd_dev->pool_id = rc;
2828602adf40SYehuda Sadeh 
2829a30b71b9SAlex Elder 	rc = rbd_dev_probe(rbd_dev);
2830a30b71b9SAlex Elder 	if (rc < 0)
2831589d30e0SAlex Elder 		goto err_out_client;
2832a30b71b9SAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
283305fd6f6fSAlex Elder 
283405fd6f6fSAlex Elder 	/* no need to lock here, as rbd_dev is not registered yet */
283505fd6f6fSAlex Elder 	rc = rbd_dev_snaps_update(rbd_dev);
283605fd6f6fSAlex Elder 	if (rc)
283705fd6f6fSAlex Elder 		goto err_out_header;
283805fd6f6fSAlex Elder 
283905fd6f6fSAlex Elder 	rc = rbd_dev_set_mapping(rbd_dev, snap_name);
284005fd6f6fSAlex Elder 	if (rc)
284105fd6f6fSAlex Elder 		goto err_out_header;
284205fd6f6fSAlex Elder 
284385ae8926SAlex Elder 	/* generate unique id: find highest unique id, add one */
284485ae8926SAlex Elder 	rbd_dev_id_get(rbd_dev);
284585ae8926SAlex Elder 
284685ae8926SAlex Elder 	/* Fill in the device name, now that we have its id. */
284785ae8926SAlex Elder 	BUILD_BUG_ON(DEV_NAME_LEN
284885ae8926SAlex Elder 			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
284985ae8926SAlex Elder 	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
285085ae8926SAlex Elder 
285185ae8926SAlex Elder 	/* Get our block major device number. */
285285ae8926SAlex Elder 
285327cc2594SAlex Elder 	rc = register_blkdev(0, rbd_dev->name);
285427cc2594SAlex Elder 	if (rc < 0)
285585ae8926SAlex Elder 		goto err_out_id;
285627cc2594SAlex Elder 	rbd_dev->major = rc;
2857602adf40SYehuda Sadeh 
28580f308a31SAlex Elder 	/* Set up the blkdev mapping. */
28590f308a31SAlex Elder 
28600f308a31SAlex Elder 	rc = rbd_init_disk(rbd_dev);
2861dfc5606dSYehuda Sadeh 	if (rc)
2862766fc439SYehuda Sadeh 		goto err_out_blkdev;
2863766fc439SYehuda Sadeh 
28640f308a31SAlex Elder 	rc = rbd_bus_add_dev(rbd_dev);
28650f308a31SAlex Elder 	if (rc)
28660f308a31SAlex Elder 		goto err_out_disk;
28670f308a31SAlex Elder 
286832eec68dSAlex Elder 	/*
286932eec68dSAlex Elder 	 * At this point cleanup in the event of an error is the job
287032eec68dSAlex Elder 	 * of the sysfs code (initiated by rbd_bus_del_dev()).
287132eec68dSAlex Elder 	 */
28722ac4e75dSAlex Elder 
28734bb1f1edSAlex Elder 	down_write(&rbd_dev->header_rwsem);
28745ed16177SAlex Elder 	rc = rbd_dev_snaps_register(rbd_dev);
28754bb1f1edSAlex Elder 	up_write(&rbd_dev->header_rwsem);
28762ac4e75dSAlex Elder 	if (rc)
28772ac4e75dSAlex Elder 		goto err_out_bus;
28782ac4e75dSAlex Elder 
287959c2be1eSYehuda Sadeh 	rc = rbd_init_watch_dev(rbd_dev);
288059c2be1eSYehuda Sadeh 	if (rc)
288159c2be1eSYehuda Sadeh 		goto err_out_bus;
288259c2be1eSYehuda Sadeh 
28833ee4001eSAlex Elder 	/* Everything's ready.  Announce the disk to the world. */
28843ee4001eSAlex Elder 
28853ee4001eSAlex Elder 	add_disk(rbd_dev->disk);
28863ee4001eSAlex Elder 
28873ee4001eSAlex Elder 	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
28883ee4001eSAlex Elder 		(unsigned long long) rbd_dev->mapping.size);
28893ee4001eSAlex Elder 
2890602adf40SYehuda Sadeh 	return count;
2891602adf40SYehuda Sadeh 
2892766fc439SYehuda Sadeh err_out_bus:
2893766fc439SYehuda Sadeh 	/* this will also clean up rest of rbd_dev stuff */
2894766fc439SYehuda Sadeh 
2895766fc439SYehuda Sadeh 	rbd_bus_del_dev(rbd_dev);
2896766fc439SYehuda Sadeh 	kfree(options);
2897766fc439SYehuda Sadeh 	return rc;
2898766fc439SYehuda Sadeh 
28990f308a31SAlex Elder err_out_disk:
29000f308a31SAlex Elder 	rbd_free_disk(rbd_dev);
2901602adf40SYehuda Sadeh err_out_blkdev:
2902602adf40SYehuda Sadeh 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
290385ae8926SAlex Elder err_out_id:
290485ae8926SAlex Elder 	rbd_dev_id_put(rbd_dev);
290505fd6f6fSAlex Elder err_out_header:
290605fd6f6fSAlex Elder 	rbd_header_free(&rbd_dev->header);
2907602adf40SYehuda Sadeh err_out_client:
29083fcf2581SAlex Elder 	kfree(rbd_dev->header_name);
2909602adf40SYehuda Sadeh 	rbd_put_client(rbd_dev);
2910589d30e0SAlex Elder 	kfree(rbd_dev->image_id);
291185ae8926SAlex Elder err_out_args:
2912f84344f3SAlex Elder 	kfree(rbd_dev->mapping.snap_name);
29130bed54dcSAlex Elder 	kfree(rbd_dev->image_name);
2914d22f76e7SAlex Elder 	kfree(rbd_dev->pool_name);
291585ae8926SAlex Elder err_out_mem:
291627cc2594SAlex Elder 	kfree(rbd_dev);
2917cb8627c7SAlex Elder 	kfree(options);
291827cc2594SAlex Elder 
2919602adf40SYehuda Sadeh 	dout("Error adding device %s\n", buf);
2920602adf40SYehuda Sadeh 	module_put(THIS_MODULE);
292127cc2594SAlex Elder 
292227cc2594SAlex Elder 	return (ssize_t) rc;
2923602adf40SYehuda Sadeh }
2924602adf40SYehuda Sadeh 
2925de71a297SAlex Elder static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
2926602adf40SYehuda Sadeh {
2927602adf40SYehuda Sadeh 	struct list_head *tmp;
2928602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev;
2929602adf40SYehuda Sadeh 
2930e124a82fSAlex Elder 	spin_lock(&rbd_dev_list_lock);
2931602adf40SYehuda Sadeh 	list_for_each(tmp, &rbd_dev_list) {
2932602adf40SYehuda Sadeh 		rbd_dev = list_entry(tmp, struct rbd_device, node);
2933de71a297SAlex Elder 		if (rbd_dev->dev_id == dev_id) {
2934e124a82fSAlex Elder 			spin_unlock(&rbd_dev_list_lock);
2935602adf40SYehuda Sadeh 			return rbd_dev;
2936602adf40SYehuda Sadeh 		}
2937e124a82fSAlex Elder 	}
2938e124a82fSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
2939602adf40SYehuda Sadeh 	return NULL;
2940602adf40SYehuda Sadeh }
2941602adf40SYehuda Sadeh 
2942dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev)
2943602adf40SYehuda Sadeh {
2944593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2945602adf40SYehuda Sadeh 
29461dbb4399SAlex Elder 	if (rbd_dev->watch_request) {
29471dbb4399SAlex Elder 		struct ceph_client *client = rbd_dev->rbd_client->client;
29481dbb4399SAlex Elder 
29491dbb4399SAlex Elder 		ceph_osdc_unregister_linger_request(&client->osdc,
295059c2be1eSYehuda Sadeh 						    rbd_dev->watch_request);
29511dbb4399SAlex Elder 	}
295259c2be1eSYehuda Sadeh 	if (rbd_dev->watch_event)
2953070c633fSAlex Elder 		rbd_req_sync_unwatch(rbd_dev);
295459c2be1eSYehuda Sadeh 
2955602adf40SYehuda Sadeh 	rbd_put_client(rbd_dev);
2956602adf40SYehuda Sadeh 
2957602adf40SYehuda Sadeh 	/* clean up and free blkdev */
2958602adf40SYehuda Sadeh 	rbd_free_disk(rbd_dev);
2959602adf40SYehuda Sadeh 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
296032eec68dSAlex Elder 
29612ac4e75dSAlex Elder 	/* release allocated disk header fields */
29622ac4e75dSAlex Elder 	rbd_header_free(&rbd_dev->header);
29632ac4e75dSAlex Elder 
296432eec68dSAlex Elder 	/* done with the id, and with the rbd_dev */
2965f84344f3SAlex Elder 	kfree(rbd_dev->mapping.snap_name);
2966589d30e0SAlex Elder 	kfree(rbd_dev->image_id);
29670bed54dcSAlex Elder 	kfree(rbd_dev->header_name);
2968d22f76e7SAlex Elder 	kfree(rbd_dev->pool_name);
29690bed54dcSAlex Elder 	kfree(rbd_dev->image_name);
2970e2839308SAlex Elder 	rbd_dev_id_put(rbd_dev);
2971602adf40SYehuda Sadeh 	kfree(rbd_dev);
2972602adf40SYehuda Sadeh 
2973602adf40SYehuda Sadeh 	/* release module ref */
2974602adf40SYehuda Sadeh 	module_put(THIS_MODULE);
2975602adf40SYehuda Sadeh }
2976602adf40SYehuda Sadeh 
2977dfc5606dSYehuda Sadeh static ssize_t rbd_remove(struct bus_type *bus,
2978602adf40SYehuda Sadeh 			  const char *buf,
2979602adf40SYehuda Sadeh 			  size_t count)
2980602adf40SYehuda Sadeh {
2981602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
2982602adf40SYehuda Sadeh 	int target_id, rc;
2983602adf40SYehuda Sadeh 	unsigned long ul;
2984602adf40SYehuda Sadeh 	int ret = count;
2985602adf40SYehuda Sadeh 
2986602adf40SYehuda Sadeh 	rc = strict_strtoul(buf, 10, &ul);
2987602adf40SYehuda Sadeh 	if (rc)
2988602adf40SYehuda Sadeh 		return rc;
2989602adf40SYehuda Sadeh 
2990602adf40SYehuda Sadeh 	/* convert to int; abort if we lost anything in the conversion */
2991602adf40SYehuda Sadeh 	target_id = (int) ul;
2992602adf40SYehuda Sadeh 	if (target_id != ul)
2993602adf40SYehuda Sadeh 		return -EINVAL;
2994602adf40SYehuda Sadeh 
2995602adf40SYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2996602adf40SYehuda Sadeh 
2997602adf40SYehuda Sadeh 	rbd_dev = __rbd_get_dev(target_id);
2998602adf40SYehuda Sadeh 	if (!rbd_dev) {
2999602adf40SYehuda Sadeh 		ret = -ENOENT;
3000602adf40SYehuda Sadeh 		goto done;
3001602adf40SYehuda Sadeh 	}
3002602adf40SYehuda Sadeh 
3003dfc5606dSYehuda Sadeh 	__rbd_remove_all_snaps(rbd_dev);
3004dfc5606dSYehuda Sadeh 	rbd_bus_del_dev(rbd_dev);
3005602adf40SYehuda Sadeh 
3006602adf40SYehuda Sadeh done:
3007602adf40SYehuda Sadeh 	mutex_unlock(&ctl_mutex);
3008aafb230eSAlex Elder 
3009602adf40SYehuda Sadeh 	return ret;
3010602adf40SYehuda Sadeh }
3011602adf40SYehuda Sadeh 
3012602adf40SYehuda Sadeh /*
3013602adf40SYehuda Sadeh  * create control files in sysfs
3014dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
3015602adf40SYehuda Sadeh  */
3016602adf40SYehuda Sadeh static int rbd_sysfs_init(void)
3017602adf40SYehuda Sadeh {
3018dfc5606dSYehuda Sadeh 	int ret;
3019602adf40SYehuda Sadeh 
3020fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
3021dfc5606dSYehuda Sadeh 	if (ret < 0)
3022dfc5606dSYehuda Sadeh 		return ret;
3023602adf40SYehuda Sadeh 
3024fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
3025fed4c143SAlex Elder 	if (ret < 0)
3026fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
3027602adf40SYehuda Sadeh 
3028602adf40SYehuda Sadeh 	return ret;
3029602adf40SYehuda Sadeh }
3030602adf40SYehuda Sadeh 
3031602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void)
3032602adf40SYehuda Sadeh {
3033dfc5606dSYehuda Sadeh 	bus_unregister(&rbd_bus_type);
3034fed4c143SAlex Elder 	device_unregister(&rbd_root_dev);
3035602adf40SYehuda Sadeh }
3036602adf40SYehuda Sadeh 
3037602adf40SYehuda Sadeh int __init rbd_init(void)
3038602adf40SYehuda Sadeh {
3039602adf40SYehuda Sadeh 	int rc;
3040602adf40SYehuda Sadeh 
3041602adf40SYehuda Sadeh 	rc = rbd_sysfs_init();
3042602adf40SYehuda Sadeh 	if (rc)
3043602adf40SYehuda Sadeh 		return rc;
3044f0f8cef5SAlex Elder 	pr_info("loaded " RBD_DRV_NAME_LONG "\n");
3045602adf40SYehuda Sadeh 	return 0;
3046602adf40SYehuda Sadeh }
3047602adf40SYehuda Sadeh 
3048602adf40SYehuda Sadeh void __exit rbd_exit(void)
3049602adf40SYehuda Sadeh {
3050602adf40SYehuda Sadeh 	rbd_sysfs_cleanup();
3051602adf40SYehuda Sadeh }
3052602adf40SYehuda Sadeh 
3053602adf40SYehuda Sadeh module_init(rbd_init);
3054602adf40SYehuda Sadeh module_exit(rbd_exit);
3055602adf40SYehuda Sadeh 
3056602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
3057602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
3058602adf40SYehuda Sadeh MODULE_DESCRIPTION("rados block device");
3059602adf40SYehuda Sadeh 
3060602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */
3061602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
3062602adf40SYehuda Sadeh 
3063602adf40SYehuda Sadeh MODULE_LICENSE("GPL");
3064