xref: /openbmc/linux/drivers/block/rbd.c (revision f7760dad)
1602adf40SYehuda Sadeh /*
2602adf40SYehuda Sadeh    rbd.c -- Export ceph rados objects as a Linux block device
3602adf40SYehuda Sadeh 
4602adf40SYehuda Sadeh 
5602adf40SYehuda Sadeh    based on drivers/block/osdblk.c:
6602adf40SYehuda Sadeh 
7602adf40SYehuda Sadeh    Copyright 2009 Red Hat, Inc.
8602adf40SYehuda Sadeh 
9602adf40SYehuda Sadeh    This program is free software; you can redistribute it and/or modify
10602adf40SYehuda Sadeh    it under the terms of the GNU General Public License as published by
11602adf40SYehuda Sadeh    the Free Software Foundation.
12602adf40SYehuda Sadeh 
13602adf40SYehuda Sadeh    This program is distributed in the hope that it will be useful,
14602adf40SYehuda Sadeh    but WITHOUT ANY WARRANTY; without even the implied warranty of
15602adf40SYehuda Sadeh    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16602adf40SYehuda Sadeh    GNU General Public License for more details.
17602adf40SYehuda Sadeh 
18602adf40SYehuda Sadeh    You should have received a copy of the GNU General Public License
19602adf40SYehuda Sadeh    along with this program; see the file COPYING.  If not, write to
20602adf40SYehuda Sadeh    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21602adf40SYehuda Sadeh 
22602adf40SYehuda Sadeh 
23602adf40SYehuda Sadeh 
24dfc5606dSYehuda Sadeh    For usage instructions, please refer to:
25602adf40SYehuda Sadeh 
26dfc5606dSYehuda Sadeh                  Documentation/ABI/testing/sysfs-bus-rbd
27602adf40SYehuda Sadeh 
28602adf40SYehuda Sadeh  */
29602adf40SYehuda Sadeh 
30602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
31602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
32602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3459c2be1eSYehuda Sadeh #include <linux/parser.h>
35602adf40SYehuda Sadeh 
36602adf40SYehuda Sadeh #include <linux/kernel.h>
37602adf40SYehuda Sadeh #include <linux/device.h>
38602adf40SYehuda Sadeh #include <linux/module.h>
39602adf40SYehuda Sadeh #include <linux/fs.h>
40602adf40SYehuda Sadeh #include <linux/blkdev.h>
41602adf40SYehuda Sadeh 
42602adf40SYehuda Sadeh #include "rbd_types.h"
43602adf40SYehuda Sadeh 
#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * Block I/O is expressed in sectors.  Linux interprets a sector in
 * several contexts (blk, bio, genhd), but everywhere the default is
 * 512 bytes.  These names are only slightly more descriptive than
 * the raw numbers they stand for.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/* It might be useful to have this defined elsewhere too */

#define	U64_MAX	((u64) (~0ULL))

#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

/* A snapshot device name is this prefix followed by the snapshot name */
#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */
#define RBD_MAX_OPT_LEN		1024

/* Snapshot name that selects the base image rather than a snapshot */
#define RBD_SNAP_HEAD_NAME	"-"

#define RBD_IMAGE_ID_LEN_MAX	64
#define RBD_OBJ_PREFIX_LEN_MAX	64

/* Feature bits */

#define RBD_FEATURE_LAYERING      1

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_ALL          (0)

/*
 * An rbd device is named "rbd#": the "rbd" part is RBD_DRV_NAME
 * (above) and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)

#define RBD_READ_ONLY_DEFAULT		false
9459c2be1eSYehuda Sadeh 
95602adf40SYehuda Sadeh /*
96602adf40SYehuda Sadeh  * block device image metadata (in-memory version)
97602adf40SYehuda Sadeh  */
98602adf40SYehuda Sadeh struct rbd_image_header {
99f84344f3SAlex Elder 	/* These four fields never change for a given rbd image */
100849b4260SAlex Elder 	char *object_prefix;
10134b13184SAlex Elder 	u64 features;
102602adf40SYehuda Sadeh 	__u8 obj_order;
103602adf40SYehuda Sadeh 	__u8 crypt_type;
104602adf40SYehuda Sadeh 	__u8 comp_type;
105602adf40SYehuda Sadeh 
106f84344f3SAlex Elder 	/* The remaining fields need to be updated occasionally */
107f84344f3SAlex Elder 	u64 image_size;
108f84344f3SAlex Elder 	struct ceph_snap_context *snapc;
109602adf40SYehuda Sadeh 	char *snap_names;
110602adf40SYehuda Sadeh 	u64 *snap_sizes;
11159c2be1eSYehuda Sadeh 
11259c2be1eSYehuda Sadeh 	u64 obj_version;
11359c2be1eSYehuda Sadeh };
11459c2be1eSYehuda Sadeh 
/* rbd-specific options; filled in by parse_rbd_opts_token() */
struct rbd_options {
	bool	read_only;
};
118602adf40SYehuda Sadeh 
119602adf40SYehuda Sadeh /*
120f0f8cef5SAlex Elder  * an instance of the client.  multiple devices may share an rbd client.
121602adf40SYehuda Sadeh  */
122602adf40SYehuda Sadeh struct rbd_client {
123602adf40SYehuda Sadeh 	struct ceph_client	*client;
124602adf40SYehuda Sadeh 	struct kref		kref;
125602adf40SYehuda Sadeh 	struct list_head	node;
126602adf40SYehuda Sadeh };
127602adf40SYehuda Sadeh 
128602adf40SYehuda Sadeh /*
129f0f8cef5SAlex Elder  * a request completion status
130602adf40SYehuda Sadeh  */
1311fec7093SYehuda Sadeh struct rbd_req_status {
1321fec7093SYehuda Sadeh 	int done;
1331fec7093SYehuda Sadeh 	int rc;
1341fec7093SYehuda Sadeh 	u64 bytes;
1351fec7093SYehuda Sadeh };
1361fec7093SYehuda Sadeh 
1371fec7093SYehuda Sadeh /*
1381fec7093SYehuda Sadeh  * a collection of requests
1391fec7093SYehuda Sadeh  */
1401fec7093SYehuda Sadeh struct rbd_req_coll {
1411fec7093SYehuda Sadeh 	int			total;
1421fec7093SYehuda Sadeh 	int			num_done;
1431fec7093SYehuda Sadeh 	struct kref		kref;
1441fec7093SYehuda Sadeh 	struct rbd_req_status	status[0];
145602adf40SYehuda Sadeh };
146602adf40SYehuda Sadeh 
147f0f8cef5SAlex Elder /*
148f0f8cef5SAlex Elder  * a single io request
149f0f8cef5SAlex Elder  */
150f0f8cef5SAlex Elder struct rbd_request {
151f0f8cef5SAlex Elder 	struct request		*rq;		/* blk layer request */
152f0f8cef5SAlex Elder 	struct bio		*bio;		/* cloned bio */
153f0f8cef5SAlex Elder 	struct page		**pages;	/* list of used pages */
154f0f8cef5SAlex Elder 	u64			len;
155f0f8cef5SAlex Elder 	int			coll_index;
156f0f8cef5SAlex Elder 	struct rbd_req_coll	*coll;
157f0f8cef5SAlex Elder };
158f0f8cef5SAlex Elder 
159dfc5606dSYehuda Sadeh struct rbd_snap {
160dfc5606dSYehuda Sadeh 	struct	device		dev;
161dfc5606dSYehuda Sadeh 	const char		*name;
1623591538fSJosh Durgin 	u64			size;
163dfc5606dSYehuda Sadeh 	struct list_head	node;
164dfc5606dSYehuda Sadeh 	u64			id;
16534b13184SAlex Elder 	u64			features;
166dfc5606dSYehuda Sadeh };
167dfc5606dSYehuda Sadeh 
168f84344f3SAlex Elder struct rbd_mapping {
169f84344f3SAlex Elder 	char                    *snap_name;
170f84344f3SAlex Elder 	u64                     snap_id;
17199c1f08fSAlex Elder 	u64                     size;
17234b13184SAlex Elder 	u64                     features;
173f84344f3SAlex Elder 	bool                    snap_exists;
174f84344f3SAlex Elder 	bool			read_only;
175f84344f3SAlex Elder };
176f84344f3SAlex Elder 
177602adf40SYehuda Sadeh /*
178602adf40SYehuda Sadeh  * a single device
179602adf40SYehuda Sadeh  */
180602adf40SYehuda Sadeh struct rbd_device {
181de71a297SAlex Elder 	int			dev_id;		/* blkdev unique id */
182602adf40SYehuda Sadeh 
183602adf40SYehuda Sadeh 	int			major;		/* blkdev assigned major */
184602adf40SYehuda Sadeh 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
185602adf40SYehuda Sadeh 
186a30b71b9SAlex Elder 	u32			image_format;	/* Either 1 or 2 */
187602adf40SYehuda Sadeh 	struct rbd_client	*rbd_client;
188602adf40SYehuda Sadeh 
189602adf40SYehuda Sadeh 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
190602adf40SYehuda Sadeh 
191602adf40SYehuda Sadeh 	spinlock_t		lock;		/* queue lock */
192602adf40SYehuda Sadeh 
193602adf40SYehuda Sadeh 	struct rbd_image_header	header;
194589d30e0SAlex Elder 	char			*image_id;
195589d30e0SAlex Elder 	size_t			image_id_len;
1960bed54dcSAlex Elder 	char			*image_name;
1970bed54dcSAlex Elder 	size_t			image_name_len;
1980bed54dcSAlex Elder 	char			*header_name;
199d22f76e7SAlex Elder 	char			*pool_name;
2009bb2f334SAlex Elder 	int			pool_id;
201602adf40SYehuda Sadeh 
20259c2be1eSYehuda Sadeh 	struct ceph_osd_event   *watch_event;
20359c2be1eSYehuda Sadeh 	struct ceph_osd_request *watch_request;
20459c2be1eSYehuda Sadeh 
205c666601aSJosh Durgin 	/* protects updating the header */
206c666601aSJosh Durgin 	struct rw_semaphore     header_rwsem;
207f84344f3SAlex Elder 
208f84344f3SAlex Elder 	struct rbd_mapping	mapping;
209602adf40SYehuda Sadeh 
210602adf40SYehuda Sadeh 	struct list_head	node;
211dfc5606dSYehuda Sadeh 
212dfc5606dSYehuda Sadeh 	/* list of snapshots */
213dfc5606dSYehuda Sadeh 	struct list_head	snaps;
214dfc5606dSYehuda Sadeh 
215dfc5606dSYehuda Sadeh 	/* sysfs related */
216dfc5606dSYehuda Sadeh 	struct device		dev;
217dfc5606dSYehuda Sadeh };
218dfc5606dSYehuda Sadeh 
/* Serialize open/close/setup/teardown */
static DEFINE_MUTEX(ctl_mutex);

/* All rbd devices, and the spinlock paired with that list */
static LIST_HEAD(rbd_dev_list);
static DEFINE_SPINLOCK(rbd_dev_list_lock);

/* All rbd clients; rbd_client_list_lock is held while walking or editing */
static LIST_HEAD(rbd_client_list);
static DEFINE_SPINLOCK(rbd_client_list_lock);
226602adf40SYehuda Sadeh 
227304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
228304f6808SAlex Elder static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
229304f6808SAlex Elder 
230dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev);
23114e7085dSAlex Elder static void __rbd_remove_snap_dev(struct rbd_snap *snap);
232dfc5606dSYehuda Sadeh 
233f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf,
234f0f8cef5SAlex Elder 		       size_t count);
235f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
236f0f8cef5SAlex Elder 			  size_t count);
237f0f8cef5SAlex Elder 
238f0f8cef5SAlex Elder static struct bus_attribute rbd_bus_attrs[] = {
239f0f8cef5SAlex Elder 	__ATTR(add, S_IWUSR, NULL, rbd_add),
240f0f8cef5SAlex Elder 	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
241f0f8cef5SAlex Elder 	__ATTR_NULL
242f0f8cef5SAlex Elder };
243f0f8cef5SAlex Elder 
244f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = {
245f0f8cef5SAlex Elder 	.name		= "rbd",
246f0f8cef5SAlex Elder 	.bus_attrs	= rbd_bus_attrs,
247f0f8cef5SAlex Elder };
248f0f8cef5SAlex Elder 
/*
 * Release callback for rbd_root_dev.  That device is a static object,
 * so there is nothing to free here.
 */
static void rbd_root_dev_release(struct device *dev)
{
	/* intentionally empty */
}
252f0f8cef5SAlex Elder 
253f0f8cef5SAlex Elder static struct device rbd_root_dev = {
254f0f8cef5SAlex Elder 	.init_name =    "rbd",
255f0f8cef5SAlex Elder 	.release =      rbd_root_dev_release,
256f0f8cef5SAlex Elder };
257f0f8cef5SAlex Elder 
/*
 * rbd_assert(expr) - with RBD_DEBUG defined, log the failed expression
 * together with the function name and line number, then BUG().  Without
 * RBD_DEBUG it expands to a no-op and expr is not evaluated.
 *
 * The debug variant is wrapped in do { } while (0) so that the macro
 * always behaves as exactly one statement; a bare "if" would swallow
 * the "else" of an enclosing unbraced if/else.
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
270dfc5606dSYehuda Sadeh 
271dfc5606dSYehuda Sadeh static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
272dfc5606dSYehuda Sadeh {
273dfc5606dSYehuda Sadeh 	return get_device(&rbd_dev->dev);
274dfc5606dSYehuda Sadeh }
275dfc5606dSYehuda Sadeh 
276dfc5606dSYehuda Sadeh static void rbd_put_dev(struct rbd_device *rbd_dev)
277dfc5606dSYehuda Sadeh {
278dfc5606dSYehuda Sadeh 	put_device(&rbd_dev->dev);
279dfc5606dSYehuda Sadeh }
280602adf40SYehuda Sadeh 
281117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
282117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
28359c2be1eSYehuda Sadeh 
284602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
285602adf40SYehuda Sadeh {
286f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
287602adf40SYehuda Sadeh 
288f84344f3SAlex Elder 	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
289602adf40SYehuda Sadeh 		return -EROFS;
290602adf40SYehuda Sadeh 
291340c7a2bSAlex Elder 	rbd_get_dev(rbd_dev);
292f84344f3SAlex Elder 	set_device_ro(bdev, rbd_dev->mapping.read_only);
293340c7a2bSAlex Elder 
294602adf40SYehuda Sadeh 	return 0;
295602adf40SYehuda Sadeh }
296602adf40SYehuda Sadeh 
297dfc5606dSYehuda Sadeh static int rbd_release(struct gendisk *disk, fmode_t mode)
298dfc5606dSYehuda Sadeh {
299dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
300dfc5606dSYehuda Sadeh 
301dfc5606dSYehuda Sadeh 	rbd_put_dev(rbd_dev);
302dfc5606dSYehuda Sadeh 
303dfc5606dSYehuda Sadeh 	return 0;
304dfc5606dSYehuda Sadeh }
305dfc5606dSYehuda Sadeh 
306602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = {
307602adf40SYehuda Sadeh 	.owner			= THIS_MODULE,
308602adf40SYehuda Sadeh 	.open			= rbd_open,
309dfc5606dSYehuda Sadeh 	.release		= rbd_release,
310602adf40SYehuda Sadeh };
311602adf40SYehuda Sadeh 
312602adf40SYehuda Sadeh /*
313602adf40SYehuda Sadeh  * Initialize an rbd client instance.
31443ae4701SAlex Elder  * We own *ceph_opts.
315602adf40SYehuda Sadeh  */
316f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
317602adf40SYehuda Sadeh {
318602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
319602adf40SYehuda Sadeh 	int ret = -ENOMEM;
320602adf40SYehuda Sadeh 
321602adf40SYehuda Sadeh 	dout("rbd_client_create\n");
322602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
323602adf40SYehuda Sadeh 	if (!rbdc)
324602adf40SYehuda Sadeh 		goto out_opt;
325602adf40SYehuda Sadeh 
326602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
327602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
328602adf40SYehuda Sadeh 
329bc534d86SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
330bc534d86SAlex Elder 
33143ae4701SAlex Elder 	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
332602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
333bc534d86SAlex Elder 		goto out_mutex;
33443ae4701SAlex Elder 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
335602adf40SYehuda Sadeh 
336602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
337602adf40SYehuda Sadeh 	if (ret < 0)
338602adf40SYehuda Sadeh 		goto out_err;
339602adf40SYehuda Sadeh 
340432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
341602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
342432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
343602adf40SYehuda Sadeh 
344bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
345bc534d86SAlex Elder 
346602adf40SYehuda Sadeh 	dout("rbd_client_create created %p\n", rbdc);
347602adf40SYehuda Sadeh 	return rbdc;
348602adf40SYehuda Sadeh 
349602adf40SYehuda Sadeh out_err:
350602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
351bc534d86SAlex Elder out_mutex:
352bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
353602adf40SYehuda Sadeh 	kfree(rbdc);
354602adf40SYehuda Sadeh out_opt:
35543ae4701SAlex Elder 	if (ceph_opts)
35643ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
35728f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
358602adf40SYehuda Sadeh }
359602adf40SYehuda Sadeh 
360602adf40SYehuda Sadeh /*
3611f7ba331SAlex Elder  * Find a ceph client with specific addr and configuration.  If
3621f7ba331SAlex Elder  * found, bump its reference count.
363602adf40SYehuda Sadeh  */
3641f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
365602adf40SYehuda Sadeh {
366602adf40SYehuda Sadeh 	struct rbd_client *client_node;
3671f7ba331SAlex Elder 	bool found = false;
368602adf40SYehuda Sadeh 
36943ae4701SAlex Elder 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
370602adf40SYehuda Sadeh 		return NULL;
371602adf40SYehuda Sadeh 
3721f7ba331SAlex Elder 	spin_lock(&rbd_client_list_lock);
3731f7ba331SAlex Elder 	list_for_each_entry(client_node, &rbd_client_list, node) {
3741f7ba331SAlex Elder 		if (!ceph_compare_options(ceph_opts, client_node->client)) {
3751f7ba331SAlex Elder 			kref_get(&client_node->kref);
3761f7ba331SAlex Elder 			found = true;
3771f7ba331SAlex Elder 			break;
3781f7ba331SAlex Elder 		}
3791f7ba331SAlex Elder 	}
3801f7ba331SAlex Elder 	spin_unlock(&rbd_client_list_lock);
3811f7ba331SAlex Elder 
3821f7ba331SAlex Elder 	return found ? client_node : NULL;
383602adf40SYehuda Sadeh }
384602adf40SYehuda Sadeh 
/*
 * mount options
 *
 * The Opt_last_* entries are markers separating the option classes;
 * parse_rbd_opts_token() compares a token against them to decide
 * which kind of argument it has.
 */
enum {
	/* int args go above this marker */
	Opt_last_int,
	/* string args go above this marker */
	Opt_last_string,
	Opt_read_only,
	Opt_read_write,
	/* Boolean args go above this marker */
	Opt_last_bool,
};
39859c2be1eSYehuda Sadeh 
39943ae4701SAlex Elder static match_table_t rbd_opts_tokens = {
40059c2be1eSYehuda Sadeh 	/* int args above */
40159c2be1eSYehuda Sadeh 	/* string args above */
402be466c1cSAlex Elder 	{Opt_read_only, "read_only"},
403cc0538b6SAlex Elder 	{Opt_read_only, "ro"},		/* Alternate spelling */
404cc0538b6SAlex Elder 	{Opt_read_write, "read_write"},
405cc0538b6SAlex Elder 	{Opt_read_write, "rw"},		/* Alternate spelling */
406cc0538b6SAlex Elder 	/* Boolean args above */
40759c2be1eSYehuda Sadeh 	{-1, NULL}
40859c2be1eSYehuda Sadeh };
40959c2be1eSYehuda Sadeh 
41059c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
41159c2be1eSYehuda Sadeh {
41243ae4701SAlex Elder 	struct rbd_options *rbd_opts = private;
41359c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
41459c2be1eSYehuda Sadeh 	int token, intval, ret;
41559c2be1eSYehuda Sadeh 
41643ae4701SAlex Elder 	token = match_token(c, rbd_opts_tokens, argstr);
41759c2be1eSYehuda Sadeh 	if (token < 0)
41859c2be1eSYehuda Sadeh 		return -EINVAL;
41959c2be1eSYehuda Sadeh 
42059c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
42159c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
42259c2be1eSYehuda Sadeh 		if (ret < 0) {
42359c2be1eSYehuda Sadeh 			pr_err("bad mount option arg (not int) "
42459c2be1eSYehuda Sadeh 			       "at '%s'\n", c);
42559c2be1eSYehuda Sadeh 			return ret;
42659c2be1eSYehuda Sadeh 		}
42759c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
42859c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
42959c2be1eSYehuda Sadeh 		dout("got string token %d val %s\n", token,
43059c2be1eSYehuda Sadeh 		     argstr[0].from);
431cc0538b6SAlex Elder 	} else if (token > Opt_last_string && token < Opt_last_bool) {
432cc0538b6SAlex Elder 		dout("got Boolean token %d\n", token);
43359c2be1eSYehuda Sadeh 	} else {
43459c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
43559c2be1eSYehuda Sadeh 	}
43659c2be1eSYehuda Sadeh 
43759c2be1eSYehuda Sadeh 	switch (token) {
438cc0538b6SAlex Elder 	case Opt_read_only:
439cc0538b6SAlex Elder 		rbd_opts->read_only = true;
440cc0538b6SAlex Elder 		break;
441cc0538b6SAlex Elder 	case Opt_read_write:
442cc0538b6SAlex Elder 		rbd_opts->read_only = false;
443cc0538b6SAlex Elder 		break;
44459c2be1eSYehuda Sadeh 	default:
445aafb230eSAlex Elder 		rbd_assert(false);
446aafb230eSAlex Elder 		break;
44759c2be1eSYehuda Sadeh 	}
44859c2be1eSYehuda Sadeh 	return 0;
44959c2be1eSYehuda Sadeh }
45059c2be1eSYehuda Sadeh 
45159c2be1eSYehuda Sadeh /*
452602adf40SYehuda Sadeh  * Get a ceph client with specific addr and configuration, if one does
453602adf40SYehuda Sadeh  * not exist create it.
454602adf40SYehuda Sadeh  */
455f8c38929SAlex Elder static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
456f8c38929SAlex Elder 				size_t mon_addr_len, char *options)
457602adf40SYehuda Sadeh {
458069a4b56SAlex Elder 	struct rbd_options rbd_opts;
45943ae4701SAlex Elder 	struct ceph_options *ceph_opts;
460f8c38929SAlex Elder 	struct rbd_client *rbdc;
46159c2be1eSYehuda Sadeh 
462069a4b56SAlex Elder 	/* Initialize all rbd options to the defaults */
463069a4b56SAlex Elder 
464069a4b56SAlex Elder 	rbd_opts.read_only = RBD_READ_ONLY_DEFAULT;
465602adf40SYehuda Sadeh 
46643ae4701SAlex Elder 	ceph_opts = ceph_parse_options(options, mon_addr,
4675214ecc4SAlex Elder 					mon_addr + mon_addr_len,
468069a4b56SAlex Elder 					parse_rbd_opts_token, &rbd_opts);
469f8c38929SAlex Elder 	if (IS_ERR(ceph_opts))
470f8c38929SAlex Elder 		return PTR_ERR(ceph_opts);
471602adf40SYehuda Sadeh 
472069a4b56SAlex Elder 	/* Record the parsed rbd options */
473069a4b56SAlex Elder 
474069a4b56SAlex Elder 	rbd_dev->mapping.read_only = rbd_opts.read_only;
475069a4b56SAlex Elder 
4761f7ba331SAlex Elder 	rbdc = rbd_client_find(ceph_opts);
477602adf40SYehuda Sadeh 	if (rbdc) {
478e6994d3dSAlex Elder 		/* using an existing client */
47943ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
480f8c38929SAlex Elder 	} else {
481f8c38929SAlex Elder 		rbdc = rbd_client_create(ceph_opts);
482d720bcb0SAlex Elder 		if (IS_ERR(rbdc))
483f8c38929SAlex Elder 			return PTR_ERR(rbdc);
484f8c38929SAlex Elder 	}
485f8c38929SAlex Elder 	rbd_dev->rbd_client = rbdc;
486d720bcb0SAlex Elder 
487f8c38929SAlex Elder 	return 0;
488602adf40SYehuda Sadeh }
489602adf40SYehuda Sadeh 
490602adf40SYehuda Sadeh /*
491602adf40SYehuda Sadeh  * Destroy ceph client
492d23a4b3fSAlex Elder  *
493432b8587SAlex Elder  * Caller must hold rbd_client_list_lock.
494602adf40SYehuda Sadeh  */
495602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
496602adf40SYehuda Sadeh {
497602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
498602adf40SYehuda Sadeh 
499602adf40SYehuda Sadeh 	dout("rbd_release_client %p\n", rbdc);
500cd9d9f5dSAlex Elder 	spin_lock(&rbd_client_list_lock);
501602adf40SYehuda Sadeh 	list_del(&rbdc->node);
502cd9d9f5dSAlex Elder 	spin_unlock(&rbd_client_list_lock);
503602adf40SYehuda Sadeh 
504602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
505602adf40SYehuda Sadeh 	kfree(rbdc);
506602adf40SYehuda Sadeh }
507602adf40SYehuda Sadeh 
508602adf40SYehuda Sadeh /*
509602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
510602adf40SYehuda Sadeh  * it.
511602adf40SYehuda Sadeh  */
512602adf40SYehuda Sadeh static void rbd_put_client(struct rbd_device *rbd_dev)
513602adf40SYehuda Sadeh {
514602adf40SYehuda Sadeh 	kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
515602adf40SYehuda Sadeh 	rbd_dev->rbd_client = NULL;
516602adf40SYehuda Sadeh }
517602adf40SYehuda Sadeh 
5181fec7093SYehuda Sadeh /*
5191fec7093SYehuda Sadeh  * Destroy requests collection
5201fec7093SYehuda Sadeh  */
5211fec7093SYehuda Sadeh static void rbd_coll_release(struct kref *kref)
5221fec7093SYehuda Sadeh {
5231fec7093SYehuda Sadeh 	struct rbd_req_coll *coll =
5241fec7093SYehuda Sadeh 		container_of(kref, struct rbd_req_coll, kref);
5251fec7093SYehuda Sadeh 
5261fec7093SYehuda Sadeh 	dout("rbd_coll_release %p\n", coll);
5271fec7093SYehuda Sadeh 	kfree(coll);
5281fec7093SYehuda Sadeh }
529602adf40SYehuda Sadeh 
530a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format)
531a30b71b9SAlex Elder {
532a30b71b9SAlex Elder 	return image_format == 1 || image_format == 2;
533a30b71b9SAlex Elder }
534a30b71b9SAlex Elder 
5358e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
5368e94af8eSAlex Elder {
537103a150fSAlex Elder 	size_t size;
538103a150fSAlex Elder 	u32 snap_count;
539103a150fSAlex Elder 
540103a150fSAlex Elder 	/* The header has to start with the magic rbd header text */
541103a150fSAlex Elder 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
542103a150fSAlex Elder 		return false;
543103a150fSAlex Elder 
544db2388b6SAlex Elder 	/* The bio layer requires at least sector-sized I/O */
545db2388b6SAlex Elder 
546db2388b6SAlex Elder 	if (ondisk->options.order < SECTOR_SHIFT)
547db2388b6SAlex Elder 		return false;
548db2388b6SAlex Elder 
549db2388b6SAlex Elder 	/* If we use u64 in a few spots we may be able to loosen this */
550db2388b6SAlex Elder 
551db2388b6SAlex Elder 	if (ondisk->options.order > 8 * sizeof (int) - 1)
552db2388b6SAlex Elder 		return false;
553db2388b6SAlex Elder 
554103a150fSAlex Elder 	/*
555103a150fSAlex Elder 	 * The size of a snapshot header has to fit in a size_t, and
556103a150fSAlex Elder 	 * that limits the number of snapshots.
557103a150fSAlex Elder 	 */
558103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
559103a150fSAlex Elder 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
560103a150fSAlex Elder 	if (snap_count > size / sizeof (__le64))
561103a150fSAlex Elder 		return false;
562103a150fSAlex Elder 
563103a150fSAlex Elder 	/*
564103a150fSAlex Elder 	 * Not only that, but the size of the entire the snapshot
565103a150fSAlex Elder 	 * header must also be representable in a size_t.
566103a150fSAlex Elder 	 */
567103a150fSAlex Elder 	size -= snap_count * sizeof (__le64);
568103a150fSAlex Elder 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
569103a150fSAlex Elder 		return false;
570103a150fSAlex Elder 
571103a150fSAlex Elder 	return true;
5728e94af8eSAlex Elder }
5738e94af8eSAlex Elder 
574602adf40SYehuda Sadeh /*
575602adf40SYehuda Sadeh  * Create a new header structure, translate header format from the on-disk
576602adf40SYehuda Sadeh  * header.
577602adf40SYehuda Sadeh  */
578602adf40SYehuda Sadeh static int rbd_header_from_disk(struct rbd_image_header *header,
5794156d998SAlex Elder 				 struct rbd_image_header_ondisk *ondisk)
580602adf40SYehuda Sadeh {
581ccece235SAlex Elder 	u32 snap_count;
58258c17b0eSAlex Elder 	size_t len;
583d2bb24e5SAlex Elder 	size_t size;
584621901d6SAlex Elder 	u32 i;
585602adf40SYehuda Sadeh 
5866a52325fSAlex Elder 	memset(header, 0, sizeof (*header));
5876a52325fSAlex Elder 
588103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
589103a150fSAlex Elder 
59058c17b0eSAlex Elder 	len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
59158c17b0eSAlex Elder 	header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
5926a52325fSAlex Elder 	if (!header->object_prefix)
593602adf40SYehuda Sadeh 		return -ENOMEM;
59458c17b0eSAlex Elder 	memcpy(header->object_prefix, ondisk->object_prefix, len);
59558c17b0eSAlex Elder 	header->object_prefix[len] = '\0';
59600f1f36fSAlex Elder 
597602adf40SYehuda Sadeh 	if (snap_count) {
598f785cc1dSAlex Elder 		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
599f785cc1dSAlex Elder 
600621901d6SAlex Elder 		/* Save a copy of the snapshot names */
601621901d6SAlex Elder 
602f785cc1dSAlex Elder 		if (snap_names_len > (u64) SIZE_MAX)
603f785cc1dSAlex Elder 			return -EIO;
604f785cc1dSAlex Elder 		header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
605602adf40SYehuda Sadeh 		if (!header->snap_names)
6066a52325fSAlex Elder 			goto out_err;
607f785cc1dSAlex Elder 		/*
608f785cc1dSAlex Elder 		 * Note that rbd_dev_v1_header_read() guarantees
609f785cc1dSAlex Elder 		 * the ondisk buffer we're working with has
610f785cc1dSAlex Elder 		 * snap_names_len bytes beyond the end of the
611f785cc1dSAlex Elder 		 * snapshot id array, this memcpy() is safe.
612f785cc1dSAlex Elder 		 */
613f785cc1dSAlex Elder 		memcpy(header->snap_names, &ondisk->snaps[snap_count],
614f785cc1dSAlex Elder 			snap_names_len);
6156a52325fSAlex Elder 
616621901d6SAlex Elder 		/* Record each snapshot's size */
617621901d6SAlex Elder 
618d2bb24e5SAlex Elder 		size = snap_count * sizeof (*header->snap_sizes);
619d2bb24e5SAlex Elder 		header->snap_sizes = kmalloc(size, GFP_KERNEL);
620602adf40SYehuda Sadeh 		if (!header->snap_sizes)
6216a52325fSAlex Elder 			goto out_err;
622621901d6SAlex Elder 		for (i = 0; i < snap_count; i++)
623621901d6SAlex Elder 			header->snap_sizes[i] =
624621901d6SAlex Elder 				le64_to_cpu(ondisk->snaps[i].image_size);
625602adf40SYehuda Sadeh 	} else {
626ccece235SAlex Elder 		WARN_ON(ondisk->snap_names_len);
627602adf40SYehuda Sadeh 		header->snap_names = NULL;
628602adf40SYehuda Sadeh 		header->snap_sizes = NULL;
629602adf40SYehuda Sadeh 	}
630849b4260SAlex Elder 
63134b13184SAlex Elder 	header->features = 0;	/* No features support in v1 images */
632602adf40SYehuda Sadeh 	header->obj_order = ondisk->options.order;
633602adf40SYehuda Sadeh 	header->crypt_type = ondisk->options.crypt_type;
634602adf40SYehuda Sadeh 	header->comp_type = ondisk->options.comp_type;
6356a52325fSAlex Elder 
636621901d6SAlex Elder 	/* Allocate and fill in the snapshot context */
637621901d6SAlex Elder 
638f84344f3SAlex Elder 	header->image_size = le64_to_cpu(ondisk->image_size);
6396a52325fSAlex Elder 	size = sizeof (struct ceph_snap_context);
6406a52325fSAlex Elder 	size += snap_count * sizeof (header->snapc->snaps[0]);
6416a52325fSAlex Elder 	header->snapc = kzalloc(size, GFP_KERNEL);
6426a52325fSAlex Elder 	if (!header->snapc)
6436a52325fSAlex Elder 		goto out_err;
644602adf40SYehuda Sadeh 
645602adf40SYehuda Sadeh 	atomic_set(&header->snapc->nref, 1);
646505cbb9bSAlex Elder 	header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
647602adf40SYehuda Sadeh 	header->snapc->num_snaps = snap_count;
648621901d6SAlex Elder 	for (i = 0; i < snap_count; i++)
649602adf40SYehuda Sadeh 		header->snapc->snaps[i] =
650602adf40SYehuda Sadeh 			le64_to_cpu(ondisk->snaps[i].id);
651602adf40SYehuda Sadeh 
652602adf40SYehuda Sadeh 	return 0;
653602adf40SYehuda Sadeh 
6546a52325fSAlex Elder out_err:
655849b4260SAlex Elder 	kfree(header->snap_sizes);
656ccece235SAlex Elder 	header->snap_sizes = NULL;
657602adf40SYehuda Sadeh 	kfree(header->snap_names);
658ccece235SAlex Elder 	header->snap_names = NULL;
6596a52325fSAlex Elder 	kfree(header->object_prefix);
6606a52325fSAlex Elder 	header->object_prefix = NULL;
661ccece235SAlex Elder 
66200f1f36fSAlex Elder 	return -ENOMEM;
663602adf40SYehuda Sadeh }
664602adf40SYehuda Sadeh 
6658836b995SAlex Elder static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
666602adf40SYehuda Sadeh {
667602adf40SYehuda Sadeh 
668e86924a8SAlex Elder 	struct rbd_snap *snap;
66900f1f36fSAlex Elder 
670e86924a8SAlex Elder 	list_for_each_entry(snap, &rbd_dev->snaps, node) {
671e86924a8SAlex Elder 		if (!strcmp(snap_name, snap->name)) {
672e86924a8SAlex Elder 			rbd_dev->mapping.snap_id = snap->id;
673e86924a8SAlex Elder 			rbd_dev->mapping.size = snap->size;
67434b13184SAlex Elder 			rbd_dev->mapping.features = snap->features;
67500f1f36fSAlex Elder 
676e86924a8SAlex Elder 			return 0;
677602adf40SYehuda Sadeh 		}
67800f1f36fSAlex Elder 	}
679e86924a8SAlex Elder 
68000f1f36fSAlex Elder 	return -ENOENT;
68100f1f36fSAlex Elder }
682602adf40SYehuda Sadeh 
6835ed16177SAlex Elder static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name)
684602adf40SYehuda Sadeh {
68578dc447dSAlex Elder 	int ret;
686602adf40SYehuda Sadeh 
6874e1105a2SAlex Elder 	if (!memcmp(snap_name, RBD_SNAP_HEAD_NAME,
688cc9d734cSJosh Durgin 		    sizeof (RBD_SNAP_HEAD_NAME))) {
689f84344f3SAlex Elder 		rbd_dev->mapping.snap_id = CEPH_NOSNAP;
69099c1f08fSAlex Elder 		rbd_dev->mapping.size = rbd_dev->header.image_size;
69134b13184SAlex Elder 		rbd_dev->mapping.features = rbd_dev->header.features;
692f84344f3SAlex Elder 		rbd_dev->mapping.snap_exists = false;
693e86924a8SAlex Elder 		ret = 0;
694602adf40SYehuda Sadeh 	} else {
6958836b995SAlex Elder 		ret = snap_by_name(rbd_dev, snap_name);
696602adf40SYehuda Sadeh 		if (ret < 0)
697602adf40SYehuda Sadeh 			goto done;
698f84344f3SAlex Elder 		rbd_dev->mapping.snap_exists = true;
699f84344f3SAlex Elder 		rbd_dev->mapping.read_only = true;
700602adf40SYehuda Sadeh 	}
7014e1105a2SAlex Elder 	rbd_dev->mapping.snap_name = snap_name;
702602adf40SYehuda Sadeh done:
703602adf40SYehuda Sadeh 	return ret;
704602adf40SYehuda Sadeh }
705602adf40SYehuda Sadeh 
706602adf40SYehuda Sadeh static void rbd_header_free(struct rbd_image_header *header)
707602adf40SYehuda Sadeh {
708849b4260SAlex Elder 	kfree(header->object_prefix);
709d78fd7aeSAlex Elder 	header->object_prefix = NULL;
710602adf40SYehuda Sadeh 	kfree(header->snap_sizes);
711d78fd7aeSAlex Elder 	header->snap_sizes = NULL;
712849b4260SAlex Elder 	kfree(header->snap_names);
713d78fd7aeSAlex Elder 	header->snap_names = NULL;
714d1d25646SJosh Durgin 	ceph_put_snap_context(header->snapc);
715d78fd7aeSAlex Elder 	header->snapc = NULL;
716602adf40SYehuda Sadeh }
717602adf40SYehuda Sadeh 
71865ccfe21SAlex Elder static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
719602adf40SYehuda Sadeh {
72065ccfe21SAlex Elder 	char *name;
72165ccfe21SAlex Elder 	u64 segment;
72265ccfe21SAlex Elder 	int ret;
723602adf40SYehuda Sadeh 
72465ccfe21SAlex Elder 	name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
72565ccfe21SAlex Elder 	if (!name)
72665ccfe21SAlex Elder 		return NULL;
72765ccfe21SAlex Elder 	segment = offset >> rbd_dev->header.obj_order;
72865ccfe21SAlex Elder 	ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
72965ccfe21SAlex Elder 			rbd_dev->header.object_prefix, segment);
73065ccfe21SAlex Elder 	if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
73165ccfe21SAlex Elder 		pr_err("error formatting segment name for #%llu (%d)\n",
73265ccfe21SAlex Elder 			segment, ret);
73365ccfe21SAlex Elder 		kfree(name);
73465ccfe21SAlex Elder 		name = NULL;
73565ccfe21SAlex Elder 	}
736602adf40SYehuda Sadeh 
73765ccfe21SAlex Elder 	return name;
73865ccfe21SAlex Elder }
739602adf40SYehuda Sadeh 
74065ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
74165ccfe21SAlex Elder {
74265ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
743602adf40SYehuda Sadeh 
74465ccfe21SAlex Elder 	return offset & (segment_size - 1);
74565ccfe21SAlex Elder }
74665ccfe21SAlex Elder 
74765ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev,
74865ccfe21SAlex Elder 				u64 offset, u64 length)
74965ccfe21SAlex Elder {
75065ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
75165ccfe21SAlex Elder 
75265ccfe21SAlex Elder 	offset &= segment_size - 1;
75365ccfe21SAlex Elder 
754aafb230eSAlex Elder 	rbd_assert(length <= U64_MAX - offset);
75565ccfe21SAlex Elder 	if (offset + length > segment_size)
75665ccfe21SAlex Elder 		length = segment_size - offset;
75765ccfe21SAlex Elder 
75865ccfe21SAlex Elder 	return length;
759602adf40SYehuda Sadeh }
760602adf40SYehuda Sadeh 
7611fec7093SYehuda Sadeh static int rbd_get_num_segments(struct rbd_image_header *header,
7621fec7093SYehuda Sadeh 				u64 ofs, u64 len)
7631fec7093SYehuda Sadeh {
764df111be6SAlex Elder 	u64 start_seg;
765df111be6SAlex Elder 	u64 end_seg;
766df111be6SAlex Elder 
767df111be6SAlex Elder 	if (!len)
768df111be6SAlex Elder 		return 0;
769df111be6SAlex Elder 	if (len - 1 > U64_MAX - ofs)
770df111be6SAlex Elder 		return -ERANGE;
771df111be6SAlex Elder 
772df111be6SAlex Elder 	start_seg = ofs >> header->obj_order;
773df111be6SAlex Elder 	end_seg = (ofs + len - 1) >> header->obj_order;
774df111be6SAlex Elder 
7751fec7093SYehuda Sadeh 	return end_seg - start_seg + 1;
7761fec7093SYehuda Sadeh }
7771fec7093SYehuda Sadeh 
778602adf40SYehuda Sadeh /*
779029bcbd8SJosh Durgin  * returns the size of an object in the image
780029bcbd8SJosh Durgin  */
781029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header)
782029bcbd8SJosh Durgin {
783029bcbd8SJosh Durgin 	return 1 << header->obj_order;
784029bcbd8SJosh Durgin }
785029bcbd8SJosh Durgin 
786029bcbd8SJosh Durgin /*
787602adf40SYehuda Sadeh  * bio helpers
788602adf40SYehuda Sadeh  */
789602adf40SYehuda Sadeh 
790602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain)
791602adf40SYehuda Sadeh {
792602adf40SYehuda Sadeh 	struct bio *tmp;
793602adf40SYehuda Sadeh 
794602adf40SYehuda Sadeh 	while (chain) {
795602adf40SYehuda Sadeh 		tmp = chain;
796602adf40SYehuda Sadeh 		chain = chain->bi_next;
797602adf40SYehuda Sadeh 		bio_put(tmp);
798602adf40SYehuda Sadeh 	}
799602adf40SYehuda Sadeh }
800602adf40SYehuda Sadeh 
801602adf40SYehuda Sadeh /*
802602adf40SYehuda Sadeh  * zeros a bio chain, starting at specific offset
803602adf40SYehuda Sadeh  */
804602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs)
805602adf40SYehuda Sadeh {
806602adf40SYehuda Sadeh 	struct bio_vec *bv;
807602adf40SYehuda Sadeh 	unsigned long flags;
808602adf40SYehuda Sadeh 	void *buf;
809602adf40SYehuda Sadeh 	int i;
810602adf40SYehuda Sadeh 	int pos = 0;
811602adf40SYehuda Sadeh 
812602adf40SYehuda Sadeh 	while (chain) {
813602adf40SYehuda Sadeh 		bio_for_each_segment(bv, chain, i) {
814602adf40SYehuda Sadeh 			if (pos + bv->bv_len > start_ofs) {
815602adf40SYehuda Sadeh 				int remainder = max(start_ofs - pos, 0);
816602adf40SYehuda Sadeh 				buf = bvec_kmap_irq(bv, &flags);
817602adf40SYehuda Sadeh 				memset(buf + remainder, 0,
818602adf40SYehuda Sadeh 				       bv->bv_len - remainder);
81985b5aaa6SDan Carpenter 				bvec_kunmap_irq(buf, &flags);
820602adf40SYehuda Sadeh 			}
821602adf40SYehuda Sadeh 			pos += bv->bv_len;
822602adf40SYehuda Sadeh 		}
823602adf40SYehuda Sadeh 
824602adf40SYehuda Sadeh 		chain = chain->bi_next;
825602adf40SYehuda Sadeh 	}
826602adf40SYehuda Sadeh }
827602adf40SYehuda Sadeh 
828602adf40SYehuda Sadeh /*
829f7760dadSAlex Elder  * Clone a portion of a bio, starting at the given byte offset
830f7760dadSAlex Elder  * and continuing for the number of bytes indicated.
831602adf40SYehuda Sadeh  */
832f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src,
833f7760dadSAlex Elder 					unsigned int offset,
834f7760dadSAlex Elder 					unsigned int len,
835f7760dadSAlex Elder 					gfp_t gfpmask)
836602adf40SYehuda Sadeh {
837f7760dadSAlex Elder 	struct bio_vec *bv;
838f7760dadSAlex Elder 	unsigned int resid;
839f7760dadSAlex Elder 	unsigned short idx;
840f7760dadSAlex Elder 	unsigned int voff;
841f7760dadSAlex Elder 	unsigned short end_idx;
842f7760dadSAlex Elder 	unsigned short vcnt;
843f7760dadSAlex Elder 	struct bio *bio;
844602adf40SYehuda Sadeh 
845f7760dadSAlex Elder 	/* Handle the easy case for the caller */
846f7760dadSAlex Elder 
847f7760dadSAlex Elder 	if (!offset && len == bio_src->bi_size)
848f7760dadSAlex Elder 		return bio_clone(bio_src, gfpmask);
849f7760dadSAlex Elder 
850f7760dadSAlex Elder 	if (WARN_ON_ONCE(!len))
851f7760dadSAlex Elder 		return NULL;
852f7760dadSAlex Elder 	if (WARN_ON_ONCE(len > bio_src->bi_size))
853f7760dadSAlex Elder 		return NULL;
854f7760dadSAlex Elder 	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
855f7760dadSAlex Elder 		return NULL;
856f7760dadSAlex Elder 
857f7760dadSAlex Elder 	/* Find first affected segment... */
858f7760dadSAlex Elder 
859f7760dadSAlex Elder 	resid = offset;
860f7760dadSAlex Elder 	__bio_for_each_segment(bv, bio_src, idx, 0) {
861f7760dadSAlex Elder 		if (resid < bv->bv_len)
862f7760dadSAlex Elder 			break;
863f7760dadSAlex Elder 		resid -= bv->bv_len;
864602adf40SYehuda Sadeh 	}
865f7760dadSAlex Elder 	voff = resid;
866602adf40SYehuda Sadeh 
867f7760dadSAlex Elder 	/* ...and the last affected segment */
868542582fcSAlex Elder 
869f7760dadSAlex Elder 	resid += len;
870f7760dadSAlex Elder 	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
871f7760dadSAlex Elder 		if (resid <= bv->bv_len)
872f7760dadSAlex Elder 			break;
873f7760dadSAlex Elder 		resid -= bv->bv_len;
874f7760dadSAlex Elder 	}
875f7760dadSAlex Elder 	vcnt = end_idx - idx + 1;
876602adf40SYehuda Sadeh 
877f7760dadSAlex Elder 	/* Build the clone */
878f7760dadSAlex Elder 
879f7760dadSAlex Elder 	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
880f7760dadSAlex Elder 	if (!bio)
881f7760dadSAlex Elder 		return NULL;	/* ENOMEM */
882f7760dadSAlex Elder 
883f7760dadSAlex Elder 	bio->bi_bdev = bio_src->bi_bdev;
884f7760dadSAlex Elder 	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
885f7760dadSAlex Elder 	bio->bi_rw = bio_src->bi_rw;
886f7760dadSAlex Elder 	bio->bi_flags |= 1 << BIO_CLONED;
887602adf40SYehuda Sadeh 
888602adf40SYehuda Sadeh 	/*
889f7760dadSAlex Elder 	 * Copy over our part of the bio_vec, then update the first
890f7760dadSAlex Elder 	 * and last (or only) entries.
891602adf40SYehuda Sadeh 	 */
892f7760dadSAlex Elder 	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
893f7760dadSAlex Elder 			vcnt * sizeof (struct bio_vec));
894f7760dadSAlex Elder 	bio->bi_io_vec[0].bv_offset += voff;
895f7760dadSAlex Elder 	if (vcnt > 1) {
896f7760dadSAlex Elder 		bio->bi_io_vec[0].bv_len -= voff;
897f7760dadSAlex Elder 		bio->bi_io_vec[vcnt - 1].bv_len = resid;
898602adf40SYehuda Sadeh 	} else {
899f7760dadSAlex Elder 		bio->bi_io_vec[0].bv_len = len;
900602adf40SYehuda Sadeh 	}
901602adf40SYehuda Sadeh 
902f7760dadSAlex Elder 	bio->bi_vcnt = vcnt;
903f7760dadSAlex Elder 	bio->bi_size = len;
904f7760dadSAlex Elder 	bio->bi_idx = 0;
905602adf40SYehuda Sadeh 
906f7760dadSAlex Elder 	return bio;
907602adf40SYehuda Sadeh }
908602adf40SYehuda Sadeh 
909f7760dadSAlex Elder /*
910f7760dadSAlex Elder  * Clone a portion of a bio chain, starting at the given byte offset
911f7760dadSAlex Elder  * into the first bio in the source chain and continuing for the
912f7760dadSAlex Elder  * number of bytes indicated.  The result is another bio chain of
913f7760dadSAlex Elder  * exactly the given length, or a null pointer on error.
914f7760dadSAlex Elder  *
915f7760dadSAlex Elder  * The bio_src and offset parameters are both in-out.  On entry they
916f7760dadSAlex Elder  * refer to the first source bio and the offset into that bio where
917f7760dadSAlex Elder  * the start of data to be cloned is located.
918f7760dadSAlex Elder  *
919f7760dadSAlex Elder  * On return, bio_src is updated to refer to the bio in the source
920f7760dadSAlex Elder  * chain that contains first un-cloned byte, and *offset will
921f7760dadSAlex Elder  * contain the offset of that byte within that bio.
922f7760dadSAlex Elder  */
923f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src,
924f7760dadSAlex Elder 					unsigned int *offset,
925f7760dadSAlex Elder 					unsigned int len,
926f7760dadSAlex Elder 					gfp_t gfpmask)
927f7760dadSAlex Elder {
928f7760dadSAlex Elder 	struct bio *bi = *bio_src;
929f7760dadSAlex Elder 	unsigned int off = *offset;
930f7760dadSAlex Elder 	struct bio *chain = NULL;
931f7760dadSAlex Elder 	struct bio **end;
932602adf40SYehuda Sadeh 
933f7760dadSAlex Elder 	/* Build up a chain of clone bios up to the limit */
934602adf40SYehuda Sadeh 
935f7760dadSAlex Elder 	if (!bi || off >= bi->bi_size || !len)
936f7760dadSAlex Elder 		return NULL;		/* Nothing to clone */
937602adf40SYehuda Sadeh 
938f7760dadSAlex Elder 	end = &chain;
939f7760dadSAlex Elder 	while (len) {
940f7760dadSAlex Elder 		unsigned int bi_size;
941f7760dadSAlex Elder 		struct bio *bio;
942f7760dadSAlex Elder 
943f7760dadSAlex Elder 		if (!bi)
944f7760dadSAlex Elder 			goto out_err;	/* EINVAL; ran out of bio's */
945f7760dadSAlex Elder 		bi_size = min_t(unsigned int, bi->bi_size - off, len);
946f7760dadSAlex Elder 		bio = bio_clone_range(bi, off, bi_size, gfpmask);
947f7760dadSAlex Elder 		if (!bio)
948f7760dadSAlex Elder 			goto out_err;	/* ENOMEM */
949f7760dadSAlex Elder 
950f7760dadSAlex Elder 		*end = bio;
951f7760dadSAlex Elder 		end = &bio->bi_next;
952f7760dadSAlex Elder 
953f7760dadSAlex Elder 		off += bi_size;
954f7760dadSAlex Elder 		if (off == bi->bi_size) {
955f7760dadSAlex Elder 			bi = bi->bi_next;
956f7760dadSAlex Elder 			off = 0;
957f7760dadSAlex Elder 		}
958f7760dadSAlex Elder 		len -= bi_size;
959f7760dadSAlex Elder 	}
960f7760dadSAlex Elder 	*bio_src = bi;
961f7760dadSAlex Elder 	*offset = off;
962f7760dadSAlex Elder 
963f7760dadSAlex Elder 	return chain;
964f7760dadSAlex Elder out_err:
965f7760dadSAlex Elder 	bio_chain_put(chain);
966f7760dadSAlex Elder 
967602adf40SYehuda Sadeh 	return NULL;
968602adf40SYehuda Sadeh }
969602adf40SYehuda Sadeh 
970602adf40SYehuda Sadeh /*
971602adf40SYehuda Sadeh  * helpers for osd request op vectors.
972602adf40SYehuda Sadeh  */
97357cfc106SAlex Elder static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
97457cfc106SAlex Elder 					int opcode, u32 payload_len)
975602adf40SYehuda Sadeh {
97657cfc106SAlex Elder 	struct ceph_osd_req_op *ops;
97757cfc106SAlex Elder 
97857cfc106SAlex Elder 	ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
97957cfc106SAlex Elder 	if (!ops)
98057cfc106SAlex Elder 		return NULL;
98157cfc106SAlex Elder 
98257cfc106SAlex Elder 	ops[0].op = opcode;
98357cfc106SAlex Elder 
984602adf40SYehuda Sadeh 	/*
985602adf40SYehuda Sadeh 	 * op extent offset and length will be set later on
986602adf40SYehuda Sadeh 	 * in calc_raw_layout()
987602adf40SYehuda Sadeh 	 */
98857cfc106SAlex Elder 	ops[0].payload_len = payload_len;
98957cfc106SAlex Elder 
99057cfc106SAlex Elder 	return ops;
991602adf40SYehuda Sadeh }
992602adf40SYehuda Sadeh 
/* Free an op vector obtained from rbd_create_rw_ops() */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
997602adf40SYehuda Sadeh 
9981fec7093SYehuda Sadeh static void rbd_coll_end_req_index(struct request *rq,
9991fec7093SYehuda Sadeh 				   struct rbd_req_coll *coll,
10001fec7093SYehuda Sadeh 				   int index,
10011fec7093SYehuda Sadeh 				   int ret, u64 len)
10021fec7093SYehuda Sadeh {
10031fec7093SYehuda Sadeh 	struct request_queue *q;
10041fec7093SYehuda Sadeh 	int min, max, i;
10051fec7093SYehuda Sadeh 
1006bd919d45SAlex Elder 	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
1007bd919d45SAlex Elder 	     coll, index, ret, (unsigned long long) len);
10081fec7093SYehuda Sadeh 
10091fec7093SYehuda Sadeh 	if (!rq)
10101fec7093SYehuda Sadeh 		return;
10111fec7093SYehuda Sadeh 
10121fec7093SYehuda Sadeh 	if (!coll) {
10131fec7093SYehuda Sadeh 		blk_end_request(rq, ret, len);
10141fec7093SYehuda Sadeh 		return;
10151fec7093SYehuda Sadeh 	}
10161fec7093SYehuda Sadeh 
10171fec7093SYehuda Sadeh 	q = rq->q;
10181fec7093SYehuda Sadeh 
10191fec7093SYehuda Sadeh 	spin_lock_irq(q->queue_lock);
10201fec7093SYehuda Sadeh 	coll->status[index].done = 1;
10211fec7093SYehuda Sadeh 	coll->status[index].rc = ret;
10221fec7093SYehuda Sadeh 	coll->status[index].bytes = len;
10231fec7093SYehuda Sadeh 	max = min = coll->num_done;
10241fec7093SYehuda Sadeh 	while (max < coll->total && coll->status[max].done)
10251fec7093SYehuda Sadeh 		max++;
10261fec7093SYehuda Sadeh 
10271fec7093SYehuda Sadeh 	for (i = min; i<max; i++) {
10281fec7093SYehuda Sadeh 		__blk_end_request(rq, coll->status[i].rc,
10291fec7093SYehuda Sadeh 				  coll->status[i].bytes);
10301fec7093SYehuda Sadeh 		coll->num_done++;
10311fec7093SYehuda Sadeh 		kref_put(&coll->kref, rbd_coll_release);
10321fec7093SYehuda Sadeh 	}
10331fec7093SYehuda Sadeh 	spin_unlock_irq(q->queue_lock);
10341fec7093SYehuda Sadeh }
10351fec7093SYehuda Sadeh 
10361fec7093SYehuda Sadeh static void rbd_coll_end_req(struct rbd_request *req,
10371fec7093SYehuda Sadeh 			     int ret, u64 len)
10381fec7093SYehuda Sadeh {
10391fec7093SYehuda Sadeh 	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
10401fec7093SYehuda Sadeh }
10411fec7093SYehuda Sadeh 
1042602adf40SYehuda Sadeh /*
1043602adf40SYehuda Sadeh  * Send ceph osd request
1044602adf40SYehuda Sadeh  */
1045602adf40SYehuda Sadeh static int rbd_do_request(struct request *rq,
10460ce1a794SAlex Elder 			  struct rbd_device *rbd_dev,
1047602adf40SYehuda Sadeh 			  struct ceph_snap_context *snapc,
1048602adf40SYehuda Sadeh 			  u64 snapid,
1049aded07eaSAlex Elder 			  const char *object_name, u64 ofs, u64 len,
1050602adf40SYehuda Sadeh 			  struct bio *bio,
1051602adf40SYehuda Sadeh 			  struct page **pages,
1052602adf40SYehuda Sadeh 			  int num_pages,
1053602adf40SYehuda Sadeh 			  int flags,
1054602adf40SYehuda Sadeh 			  struct ceph_osd_req_op *ops,
10551fec7093SYehuda Sadeh 			  struct rbd_req_coll *coll,
10561fec7093SYehuda Sadeh 			  int coll_index,
1057602adf40SYehuda Sadeh 			  void (*rbd_cb)(struct ceph_osd_request *req,
105859c2be1eSYehuda Sadeh 					 struct ceph_msg *msg),
105959c2be1eSYehuda Sadeh 			  struct ceph_osd_request **linger_req,
106059c2be1eSYehuda Sadeh 			  u64 *ver)
1061602adf40SYehuda Sadeh {
1062602adf40SYehuda Sadeh 	struct ceph_osd_request *req;
1063602adf40SYehuda Sadeh 	struct ceph_file_layout *layout;
1064602adf40SYehuda Sadeh 	int ret;
1065602adf40SYehuda Sadeh 	u64 bno;
1066602adf40SYehuda Sadeh 	struct timespec mtime = CURRENT_TIME;
1067602adf40SYehuda Sadeh 	struct rbd_request *req_data;
1068602adf40SYehuda Sadeh 	struct ceph_osd_request_head *reqhead;
10691dbb4399SAlex Elder 	struct ceph_osd_client *osdc;
1070602adf40SYehuda Sadeh 
1071602adf40SYehuda Sadeh 	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
10721fec7093SYehuda Sadeh 	if (!req_data) {
10731fec7093SYehuda Sadeh 		if (coll)
10741fec7093SYehuda Sadeh 			rbd_coll_end_req_index(rq, coll, coll_index,
10751fec7093SYehuda Sadeh 					       -ENOMEM, len);
10761fec7093SYehuda Sadeh 		return -ENOMEM;
10771fec7093SYehuda Sadeh 	}
1078602adf40SYehuda Sadeh 
10791fec7093SYehuda Sadeh 	if (coll) {
10801fec7093SYehuda Sadeh 		req_data->coll = coll;
10811fec7093SYehuda Sadeh 		req_data->coll_index = coll_index;
10821fec7093SYehuda Sadeh 	}
10831fec7093SYehuda Sadeh 
1084f7760dadSAlex Elder 	dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
1085f7760dadSAlex Elder 		object_name, (unsigned long long) ofs,
1086f7760dadSAlex Elder 		(unsigned long long) len, coll, coll_index);
1087602adf40SYehuda Sadeh 
10880ce1a794SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
10891dbb4399SAlex Elder 	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
10901dbb4399SAlex Elder 					false, GFP_NOIO, pages, bio);
10914ad12621SSage Weil 	if (!req) {
10924ad12621SSage Weil 		ret = -ENOMEM;
1093602adf40SYehuda Sadeh 		goto done_pages;
1094602adf40SYehuda Sadeh 	}
1095602adf40SYehuda Sadeh 
1096602adf40SYehuda Sadeh 	req->r_callback = rbd_cb;
1097602adf40SYehuda Sadeh 
1098602adf40SYehuda Sadeh 	req_data->rq = rq;
1099602adf40SYehuda Sadeh 	req_data->bio = bio;
1100602adf40SYehuda Sadeh 	req_data->pages = pages;
1101602adf40SYehuda Sadeh 	req_data->len = len;
1102602adf40SYehuda Sadeh 
1103602adf40SYehuda Sadeh 	req->r_priv = req_data;
1104602adf40SYehuda Sadeh 
1105602adf40SYehuda Sadeh 	reqhead = req->r_request->front.iov_base;
1106602adf40SYehuda Sadeh 	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
1107602adf40SYehuda Sadeh 
1108aded07eaSAlex Elder 	strncpy(req->r_oid, object_name, sizeof(req->r_oid));
1109602adf40SYehuda Sadeh 	req->r_oid_len = strlen(req->r_oid);
1110602adf40SYehuda Sadeh 
1111602adf40SYehuda Sadeh 	layout = &req->r_file_layout;
1112602adf40SYehuda Sadeh 	memset(layout, 0, sizeof(*layout));
1113602adf40SYehuda Sadeh 	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
1114602adf40SYehuda Sadeh 	layout->fl_stripe_count = cpu_to_le32(1);
1115602adf40SYehuda Sadeh 	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
11160ce1a794SAlex Elder 	layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
11176cae3717SSage Weil 	ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
11181dbb4399SAlex Elder 				   req, ops);
11196cae3717SSage Weil 	rbd_assert(ret == 0);
1120602adf40SYehuda Sadeh 
1121602adf40SYehuda Sadeh 	ceph_osdc_build_request(req, ofs, &len,
1122602adf40SYehuda Sadeh 				ops,
1123602adf40SYehuda Sadeh 				snapc,
1124602adf40SYehuda Sadeh 				&mtime,
1125602adf40SYehuda Sadeh 				req->r_oid, req->r_oid_len);
1126602adf40SYehuda Sadeh 
112759c2be1eSYehuda Sadeh 	if (linger_req) {
11281dbb4399SAlex Elder 		ceph_osdc_set_request_linger(osdc, req);
112959c2be1eSYehuda Sadeh 		*linger_req = req;
113059c2be1eSYehuda Sadeh 	}
113159c2be1eSYehuda Sadeh 
11321dbb4399SAlex Elder 	ret = ceph_osdc_start_request(osdc, req, false);
1133602adf40SYehuda Sadeh 	if (ret < 0)
1134602adf40SYehuda Sadeh 		goto done_err;
1135602adf40SYehuda Sadeh 
1136602adf40SYehuda Sadeh 	if (!rbd_cb) {
11371dbb4399SAlex Elder 		ret = ceph_osdc_wait_request(osdc, req);
113859c2be1eSYehuda Sadeh 		if (ver)
113959c2be1eSYehuda Sadeh 			*ver = le64_to_cpu(req->r_reassert_version.version);
1140bd919d45SAlex Elder 		dout("reassert_ver=%llu\n",
1141bd919d45SAlex Elder 			(unsigned long long)
11421fec7093SYehuda Sadeh 				le64_to_cpu(req->r_reassert_version.version));
1143602adf40SYehuda Sadeh 		ceph_osdc_put_request(req);
1144602adf40SYehuda Sadeh 	}
1145602adf40SYehuda Sadeh 	return ret;
1146602adf40SYehuda Sadeh 
1147602adf40SYehuda Sadeh done_err:
1148602adf40SYehuda Sadeh 	bio_chain_put(req_data->bio);
1149602adf40SYehuda Sadeh 	ceph_osdc_put_request(req);
1150602adf40SYehuda Sadeh done_pages:
11511fec7093SYehuda Sadeh 	rbd_coll_end_req(req_data, ret, len);
1152602adf40SYehuda Sadeh 	kfree(req_data);
1153602adf40SYehuda Sadeh 	return ret;
1154602adf40SYehuda Sadeh }
1155602adf40SYehuda Sadeh 
1156602adf40SYehuda Sadeh /*
1157602adf40SYehuda Sadeh  * Ceph osd op callback
1158602adf40SYehuda Sadeh  */
1159602adf40SYehuda Sadeh static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
1160602adf40SYehuda Sadeh {
1161602adf40SYehuda Sadeh 	struct rbd_request *req_data = req->r_priv;
1162602adf40SYehuda Sadeh 	struct ceph_osd_reply_head *replyhead;
1163602adf40SYehuda Sadeh 	struct ceph_osd_op *op;
1164602adf40SYehuda Sadeh 	__s32 rc;
1165602adf40SYehuda Sadeh 	u64 bytes;
1166602adf40SYehuda Sadeh 	int read_op;
1167602adf40SYehuda Sadeh 
1168602adf40SYehuda Sadeh 	/* parse reply */
1169602adf40SYehuda Sadeh 	replyhead = msg->front.iov_base;
1170602adf40SYehuda Sadeh 	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
1171602adf40SYehuda Sadeh 	op = (void *)(replyhead + 1);
1172602adf40SYehuda Sadeh 	rc = le32_to_cpu(replyhead->result);
1173602adf40SYehuda Sadeh 	bytes = le64_to_cpu(op->extent.length);
1174895cfcc8SDan Carpenter 	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
1175602adf40SYehuda Sadeh 
1176bd919d45SAlex Elder 	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
1177bd919d45SAlex Elder 		(unsigned long long) bytes, read_op, (int) rc);
1178602adf40SYehuda Sadeh 
1179602adf40SYehuda Sadeh 	if (rc == -ENOENT && read_op) {
1180602adf40SYehuda Sadeh 		zero_bio_chain(req_data->bio, 0);
1181602adf40SYehuda Sadeh 		rc = 0;
1182602adf40SYehuda Sadeh 	} else if (rc == 0 && read_op && bytes < req_data->len) {
1183602adf40SYehuda Sadeh 		zero_bio_chain(req_data->bio, bytes);
1184602adf40SYehuda Sadeh 		bytes = req_data->len;
1185602adf40SYehuda Sadeh 	}
1186602adf40SYehuda Sadeh 
11871fec7093SYehuda Sadeh 	rbd_coll_end_req(req_data, rc, bytes);
1188602adf40SYehuda Sadeh 
1189602adf40SYehuda Sadeh 	if (req_data->bio)
1190602adf40SYehuda Sadeh 		bio_chain_put(req_data->bio);
1191602adf40SYehuda Sadeh 
1192602adf40SYehuda Sadeh 	ceph_osdc_put_request(req);
1193602adf40SYehuda Sadeh 	kfree(req_data);
1194602adf40SYehuda Sadeh }
1195602adf40SYehuda Sadeh 
119659c2be1eSYehuda Sadeh static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
119759c2be1eSYehuda Sadeh {
119859c2be1eSYehuda Sadeh 	ceph_osdc_put_request(req);
119959c2be1eSYehuda Sadeh }
120059c2be1eSYehuda Sadeh 
1201602adf40SYehuda Sadeh /*
1202602adf40SYehuda Sadeh  * Do a synchronous ceph osd operation
1203602adf40SYehuda Sadeh  */
12040ce1a794SAlex Elder static int rbd_req_sync_op(struct rbd_device *rbd_dev,
1205602adf40SYehuda Sadeh 			   struct ceph_snap_context *snapc,
1206602adf40SYehuda Sadeh 			   u64 snapid,
1207602adf40SYehuda Sadeh 			   int flags,
1208913d2fdcSAlex Elder 			   struct ceph_osd_req_op *ops,
1209aded07eaSAlex Elder 			   const char *object_name,
1210f8d4de6eSAlex Elder 			   u64 ofs, u64 inbound_size,
1211f8d4de6eSAlex Elder 			   char *inbound,
121259c2be1eSYehuda Sadeh 			   struct ceph_osd_request **linger_req,
121359c2be1eSYehuda Sadeh 			   u64 *ver)
1214602adf40SYehuda Sadeh {
1215602adf40SYehuda Sadeh 	int ret;
1216602adf40SYehuda Sadeh 	struct page **pages;
1217602adf40SYehuda Sadeh 	int num_pages;
1218913d2fdcSAlex Elder 
1219aafb230eSAlex Elder 	rbd_assert(ops != NULL);
1220602adf40SYehuda Sadeh 
1221f8d4de6eSAlex Elder 	num_pages = calc_pages_for(ofs, inbound_size);
1222602adf40SYehuda Sadeh 	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
1223b8d0638aSDan Carpenter 	if (IS_ERR(pages))
1224b8d0638aSDan Carpenter 		return PTR_ERR(pages);
1225602adf40SYehuda Sadeh 
12260ce1a794SAlex Elder 	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
1227f8d4de6eSAlex Elder 			  object_name, ofs, inbound_size, NULL,
1228602adf40SYehuda Sadeh 			  pages, num_pages,
1229602adf40SYehuda Sadeh 			  flags,
1230602adf40SYehuda Sadeh 			  ops,
12311fec7093SYehuda Sadeh 			  NULL, 0,
123259c2be1eSYehuda Sadeh 			  NULL,
123359c2be1eSYehuda Sadeh 			  linger_req, ver);
1234602adf40SYehuda Sadeh 	if (ret < 0)
1235913d2fdcSAlex Elder 		goto done;
1236602adf40SYehuda Sadeh 
1237f8d4de6eSAlex Elder 	if ((flags & CEPH_OSD_FLAG_READ) && inbound)
1238f8d4de6eSAlex Elder 		ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);
1239602adf40SYehuda Sadeh 
1240602adf40SYehuda Sadeh done:
1241602adf40SYehuda Sadeh 	ceph_release_page_vector(pages, num_pages);
1242602adf40SYehuda Sadeh 	return ret;
1243602adf40SYehuda Sadeh }
1244602adf40SYehuda Sadeh 
1245602adf40SYehuda Sadeh /*
1246602adf40SYehuda Sadeh  * Do an asynchronous ceph osd operation
1247602adf40SYehuda Sadeh  */
1248602adf40SYehuda Sadeh static int rbd_do_op(struct request *rq,
1249602adf40SYehuda Sadeh 		     struct rbd_device *rbd_dev,
1250602adf40SYehuda Sadeh 		     struct ceph_snap_context *snapc,
1251602adf40SYehuda Sadeh 		     u64 ofs, u64 len,
12521fec7093SYehuda Sadeh 		     struct bio *bio,
12531fec7093SYehuda Sadeh 		     struct rbd_req_coll *coll,
12541fec7093SYehuda Sadeh 		     int coll_index)
1255602adf40SYehuda Sadeh {
1256602adf40SYehuda Sadeh 	char *seg_name;
1257602adf40SYehuda Sadeh 	u64 seg_ofs;
1258602adf40SYehuda Sadeh 	u64 seg_len;
1259602adf40SYehuda Sadeh 	int ret;
1260602adf40SYehuda Sadeh 	struct ceph_osd_req_op *ops;
1261602adf40SYehuda Sadeh 	u32 payload_len;
1262ff2e4bb5SAlex Elder 	int opcode;
1263ff2e4bb5SAlex Elder 	int flags;
12644634246dSAlex Elder 	u64 snapid;
1265602adf40SYehuda Sadeh 
126665ccfe21SAlex Elder 	seg_name = rbd_segment_name(rbd_dev, ofs);
1267602adf40SYehuda Sadeh 	if (!seg_name)
1268602adf40SYehuda Sadeh 		return -ENOMEM;
126965ccfe21SAlex Elder 	seg_len = rbd_segment_length(rbd_dev, ofs, len);
127065ccfe21SAlex Elder 	seg_ofs = rbd_segment_offset(rbd_dev, ofs);
1271602adf40SYehuda Sadeh 
1272ff2e4bb5SAlex Elder 	if (rq_data_dir(rq) == WRITE) {
1273ff2e4bb5SAlex Elder 		opcode = CEPH_OSD_OP_WRITE;
1274ff2e4bb5SAlex Elder 		flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
12754634246dSAlex Elder 		snapid = CEPH_NOSNAP;
1276ff2e4bb5SAlex Elder 		payload_len = seg_len;
1277ff2e4bb5SAlex Elder 	} else {
1278ff2e4bb5SAlex Elder 		opcode = CEPH_OSD_OP_READ;
1279ff2e4bb5SAlex Elder 		flags = CEPH_OSD_FLAG_READ;
12804634246dSAlex Elder 		snapc = NULL;
12814634246dSAlex Elder 		snapid = rbd_dev->mapping.snap_id;
1282ff2e4bb5SAlex Elder 		payload_len = 0;
1283ff2e4bb5SAlex Elder 	}
1284602adf40SYehuda Sadeh 
128557cfc106SAlex Elder 	ret = -ENOMEM;
128657cfc106SAlex Elder 	ops = rbd_create_rw_ops(1, opcode, payload_len);
128757cfc106SAlex Elder 	if (!ops)
1288602adf40SYehuda Sadeh 		goto done;
1289602adf40SYehuda Sadeh 
1290602adf40SYehuda Sadeh 	/* we've taken care of segment sizes earlier when we
1291602adf40SYehuda Sadeh 	   cloned the bios. We should never have a segment
1292602adf40SYehuda Sadeh 	   truncated at this point */
1293aafb230eSAlex Elder 	rbd_assert(seg_len == len);
1294602adf40SYehuda Sadeh 
1295602adf40SYehuda Sadeh 	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1296602adf40SYehuda Sadeh 			     seg_name, seg_ofs, seg_len,
1297602adf40SYehuda Sadeh 			     bio,
1298602adf40SYehuda Sadeh 			     NULL, 0,
1299602adf40SYehuda Sadeh 			     flags,
1300602adf40SYehuda Sadeh 			     ops,
13011fec7093SYehuda Sadeh 			     coll, coll_index,
130259c2be1eSYehuda Sadeh 			     rbd_req_cb, 0, NULL);
130311f77002SSage Weil 
130411f77002SSage Weil 	rbd_destroy_ops(ops);
1305602adf40SYehuda Sadeh done:
1306602adf40SYehuda Sadeh 	kfree(seg_name);
1307602adf40SYehuda Sadeh 	return ret;
1308602adf40SYehuda Sadeh }
1309602adf40SYehuda Sadeh 
1310602adf40SYehuda Sadeh /*
1311602adf40SYehuda Sadeh  * Request sync osd read
1312602adf40SYehuda Sadeh  */
13130ce1a794SAlex Elder static int rbd_req_sync_read(struct rbd_device *rbd_dev,
1314602adf40SYehuda Sadeh 			  u64 snapid,
1315aded07eaSAlex Elder 			  const char *object_name,
1316602adf40SYehuda Sadeh 			  u64 ofs, u64 len,
131759c2be1eSYehuda Sadeh 			  char *buf,
131859c2be1eSYehuda Sadeh 			  u64 *ver)
1319602adf40SYehuda Sadeh {
1320913d2fdcSAlex Elder 	struct ceph_osd_req_op *ops;
1321913d2fdcSAlex Elder 	int ret;
1322913d2fdcSAlex Elder 
1323913d2fdcSAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1324913d2fdcSAlex Elder 	if (!ops)
1325913d2fdcSAlex Elder 		return -ENOMEM;
1326913d2fdcSAlex Elder 
1327913d2fdcSAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
1328b06e6a6bSJosh Durgin 			       snapid,
1329602adf40SYehuda Sadeh 			       CEPH_OSD_FLAG_READ,
1330913d2fdcSAlex Elder 			       ops, object_name, ofs, len, buf, NULL, ver);
1331913d2fdcSAlex Elder 	rbd_destroy_ops(ops);
1332913d2fdcSAlex Elder 
1333913d2fdcSAlex Elder 	return ret;
1334602adf40SYehuda Sadeh }
1335602adf40SYehuda Sadeh 
1336602adf40SYehuda Sadeh /*
133759c2be1eSYehuda Sadeh  * Request sync osd watch
133859c2be1eSYehuda Sadeh  */
13390ce1a794SAlex Elder static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
134059c2be1eSYehuda Sadeh 				   u64 ver,
13417f0a24d8SAlex Elder 				   u64 notify_id)
134259c2be1eSYehuda Sadeh {
134359c2be1eSYehuda Sadeh 	struct ceph_osd_req_op *ops;
134411f77002SSage Weil 	int ret;
134511f77002SSage Weil 
134657cfc106SAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
134757cfc106SAlex Elder 	if (!ops)
134857cfc106SAlex Elder 		return -ENOMEM;
134959c2be1eSYehuda Sadeh 
1350a71b891bSJosh Durgin 	ops[0].watch.ver = cpu_to_le64(ver);
135159c2be1eSYehuda Sadeh 	ops[0].watch.cookie = notify_id;
135259c2be1eSYehuda Sadeh 	ops[0].watch.flag = 0;
135359c2be1eSYehuda Sadeh 
13540ce1a794SAlex Elder 	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
13557f0a24d8SAlex Elder 			  rbd_dev->header_name, 0, 0, NULL,
1356ad4f232fSAlex Elder 			  NULL, 0,
135759c2be1eSYehuda Sadeh 			  CEPH_OSD_FLAG_READ,
135859c2be1eSYehuda Sadeh 			  ops,
13591fec7093SYehuda Sadeh 			  NULL, 0,
136059c2be1eSYehuda Sadeh 			  rbd_simple_req_cb, 0, NULL);
136159c2be1eSYehuda Sadeh 
136259c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
136359c2be1eSYehuda Sadeh 	return ret;
136459c2be1eSYehuda Sadeh }
136559c2be1eSYehuda Sadeh 
136659c2be1eSYehuda Sadeh static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
136759c2be1eSYehuda Sadeh {
13680ce1a794SAlex Elder 	struct rbd_device *rbd_dev = (struct rbd_device *)data;
1369a71b891bSJosh Durgin 	u64 hver;
137013143d2dSSage Weil 	int rc;
137113143d2dSSage Weil 
13720ce1a794SAlex Elder 	if (!rbd_dev)
137359c2be1eSYehuda Sadeh 		return;
137459c2be1eSYehuda Sadeh 
1375bd919d45SAlex Elder 	dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1376bd919d45SAlex Elder 		rbd_dev->header_name, (unsigned long long) notify_id,
1377bd919d45SAlex Elder 		(unsigned int) opcode);
1378117973fbSAlex Elder 	rc = rbd_dev_refresh(rbd_dev, &hver);
137913143d2dSSage Weil 	if (rc)
1380f0f8cef5SAlex Elder 		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
13810ce1a794SAlex Elder 			   " update snaps: %d\n", rbd_dev->major, rc);
138259c2be1eSYehuda Sadeh 
13837f0a24d8SAlex Elder 	rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
138459c2be1eSYehuda Sadeh }
138559c2be1eSYehuda Sadeh 
138659c2be1eSYehuda Sadeh /*
138759c2be1eSYehuda Sadeh  * Request sync osd watch
138859c2be1eSYehuda Sadeh  */
13890e6f322dSAlex Elder static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
139059c2be1eSYehuda Sadeh {
139159c2be1eSYehuda Sadeh 	struct ceph_osd_req_op *ops;
13920ce1a794SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
139357cfc106SAlex Elder 	int ret;
139459c2be1eSYehuda Sadeh 
139557cfc106SAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
139657cfc106SAlex Elder 	if (!ops)
139757cfc106SAlex Elder 		return -ENOMEM;
139859c2be1eSYehuda Sadeh 
139959c2be1eSYehuda Sadeh 	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
14000ce1a794SAlex Elder 				     (void *)rbd_dev, &rbd_dev->watch_event);
140159c2be1eSYehuda Sadeh 	if (ret < 0)
140259c2be1eSYehuda Sadeh 		goto fail;
140359c2be1eSYehuda Sadeh 
14040e6f322dSAlex Elder 	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
14050ce1a794SAlex Elder 	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
140659c2be1eSYehuda Sadeh 	ops[0].watch.flag = 1;
140759c2be1eSYehuda Sadeh 
14080ce1a794SAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
140959c2be1eSYehuda Sadeh 			      CEPH_NOSNAP,
141059c2be1eSYehuda Sadeh 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
141159c2be1eSYehuda Sadeh 			      ops,
14120e6f322dSAlex Elder 			      rbd_dev->header_name,
14130e6f322dSAlex Elder 			      0, 0, NULL,
14140ce1a794SAlex Elder 			      &rbd_dev->watch_request, NULL);
141559c2be1eSYehuda Sadeh 
141659c2be1eSYehuda Sadeh 	if (ret < 0)
141759c2be1eSYehuda Sadeh 		goto fail_event;
141859c2be1eSYehuda Sadeh 
141959c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
142059c2be1eSYehuda Sadeh 	return 0;
142159c2be1eSYehuda Sadeh 
142259c2be1eSYehuda Sadeh fail_event:
14230ce1a794SAlex Elder 	ceph_osdc_cancel_event(rbd_dev->watch_event);
14240ce1a794SAlex Elder 	rbd_dev->watch_event = NULL;
142559c2be1eSYehuda Sadeh fail:
142659c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
142759c2be1eSYehuda Sadeh 	return ret;
142859c2be1eSYehuda Sadeh }
142959c2be1eSYehuda Sadeh 
143079e3057cSYehuda Sadeh /*
143179e3057cSYehuda Sadeh  * Request sync osd unwatch
143279e3057cSYehuda Sadeh  */
1433070c633fSAlex Elder static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
143479e3057cSYehuda Sadeh {
143579e3057cSYehuda Sadeh 	struct ceph_osd_req_op *ops;
143657cfc106SAlex Elder 	int ret;
143779e3057cSYehuda Sadeh 
143857cfc106SAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
143957cfc106SAlex Elder 	if (!ops)
144057cfc106SAlex Elder 		return -ENOMEM;
144179e3057cSYehuda Sadeh 
144279e3057cSYehuda Sadeh 	ops[0].watch.ver = 0;
14430ce1a794SAlex Elder 	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
144479e3057cSYehuda Sadeh 	ops[0].watch.flag = 0;
144579e3057cSYehuda Sadeh 
14460ce1a794SAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
144779e3057cSYehuda Sadeh 			      CEPH_NOSNAP,
144879e3057cSYehuda Sadeh 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
144979e3057cSYehuda Sadeh 			      ops,
1450070c633fSAlex Elder 			      rbd_dev->header_name,
1451070c633fSAlex Elder 			      0, 0, NULL, NULL, NULL);
1452070c633fSAlex Elder 
145379e3057cSYehuda Sadeh 
145479e3057cSYehuda Sadeh 	rbd_destroy_ops(ops);
14550ce1a794SAlex Elder 	ceph_osdc_cancel_event(rbd_dev->watch_event);
14560ce1a794SAlex Elder 	rbd_dev->watch_event = NULL;
145779e3057cSYehuda Sadeh 	return ret;
145879e3057cSYehuda Sadeh }
145979e3057cSYehuda Sadeh 
146059c2be1eSYehuda Sadeh /*
14613cb4a687SAlex Elder  * Synchronous osd object method call
1462602adf40SYehuda Sadeh  */
14630ce1a794SAlex Elder static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
1464aded07eaSAlex Elder 			     const char *object_name,
1465aded07eaSAlex Elder 			     const char *class_name,
1466aded07eaSAlex Elder 			     const char *method_name,
14673cb4a687SAlex Elder 			     const char *outbound,
14683cb4a687SAlex Elder 			     size_t outbound_size,
1469f8d4de6eSAlex Elder 			     char *inbound,
1470f8d4de6eSAlex Elder 			     size_t inbound_size,
14713cb4a687SAlex Elder 			     int flags,
147259c2be1eSYehuda Sadeh 			     u64 *ver)
1473602adf40SYehuda Sadeh {
1474602adf40SYehuda Sadeh 	struct ceph_osd_req_op *ops;
1475aded07eaSAlex Elder 	int class_name_len = strlen(class_name);
1476aded07eaSAlex Elder 	int method_name_len = strlen(method_name);
14773cb4a687SAlex Elder 	int payload_size;
147857cfc106SAlex Elder 	int ret;
147957cfc106SAlex Elder 
14803cb4a687SAlex Elder 	/*
14813cb4a687SAlex Elder 	 * Any input parameters required by the method we're calling
14823cb4a687SAlex Elder 	 * will be sent along with the class and method names as
14833cb4a687SAlex Elder 	 * part of the message payload.  That data and its size are
14843cb4a687SAlex Elder 	 * supplied via the indata and indata_len fields (named from
14853cb4a687SAlex Elder 	 * the perspective of the server side) in the OSD request
14863cb4a687SAlex Elder 	 * operation.
14873cb4a687SAlex Elder 	 */
14883cb4a687SAlex Elder 	payload_size = class_name_len + method_name_len + outbound_size;
14893cb4a687SAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
149057cfc106SAlex Elder 	if (!ops)
149157cfc106SAlex Elder 		return -ENOMEM;
1492602adf40SYehuda Sadeh 
1493aded07eaSAlex Elder 	ops[0].cls.class_name = class_name;
1494aded07eaSAlex Elder 	ops[0].cls.class_len = (__u8) class_name_len;
1495aded07eaSAlex Elder 	ops[0].cls.method_name = method_name;
1496aded07eaSAlex Elder 	ops[0].cls.method_len = (__u8) method_name_len;
1497602adf40SYehuda Sadeh 	ops[0].cls.argc = 0;
14983cb4a687SAlex Elder 	ops[0].cls.indata = outbound;
14993cb4a687SAlex Elder 	ops[0].cls.indata_len = outbound_size;
1500602adf40SYehuda Sadeh 
15010ce1a794SAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
1502602adf40SYehuda Sadeh 			       CEPH_NOSNAP,
15033cb4a687SAlex Elder 			       flags, ops,
1504f8d4de6eSAlex Elder 			       object_name, 0, inbound_size, inbound,
1505f8d4de6eSAlex Elder 			       NULL, ver);
1506602adf40SYehuda Sadeh 
1507602adf40SYehuda Sadeh 	rbd_destroy_ops(ops);
1508602adf40SYehuda Sadeh 
1509602adf40SYehuda Sadeh 	dout("cls_exec returned %d\n", ret);
1510602adf40SYehuda Sadeh 	return ret;
1511602adf40SYehuda Sadeh }
1512602adf40SYehuda Sadeh 
15131fec7093SYehuda Sadeh static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
15141fec7093SYehuda Sadeh {
15151fec7093SYehuda Sadeh 	struct rbd_req_coll *coll =
15161fec7093SYehuda Sadeh 			kzalloc(sizeof(struct rbd_req_coll) +
15171fec7093SYehuda Sadeh 			        sizeof(struct rbd_req_status) * num_reqs,
15181fec7093SYehuda Sadeh 				GFP_ATOMIC);
15191fec7093SYehuda Sadeh 
15201fec7093SYehuda Sadeh 	if (!coll)
15211fec7093SYehuda Sadeh 		return NULL;
15221fec7093SYehuda Sadeh 	coll->total = num_reqs;
15231fec7093SYehuda Sadeh 	kref_init(&coll->kref);
15241fec7093SYehuda Sadeh 	return coll;
15251fec7093SYehuda Sadeh }
15261fec7093SYehuda Sadeh 
1527602adf40SYehuda Sadeh /*
1528602adf40SYehuda Sadeh  * block device queue callback
 *
 * Drains the request queue.  Each request is split into pieces of at
 * most rbd_segment_length() bytes; every piece gets its own cloned
 * bio chain and is handed to rbd_do_op().  Completion of the pieces
 * is tracked through a shared rbd_req_coll.
 *
 * NOTE(review): appears to be entered with q->queue_lock held - the
 * lock is only dropped around the per-request work below and is
 * re-taken on every path that goes back to blk_fetch_request().
1529602adf40SYehuda Sadeh  */
1530602adf40SYehuda Sadeh static void rbd_rq_fn(struct request_queue *q)
1531602adf40SYehuda Sadeh {
1532602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
1533602adf40SYehuda Sadeh 	struct request *rq;
1534602adf40SYehuda Sadeh 
153500f1f36fSAlex Elder 	while ((rq = blk_fetch_request(q))) {
1536602adf40SYehuda Sadeh 		struct bio *bio;
1537602adf40SYehuda Sadeh 		bool do_write;
1538bd919d45SAlex Elder 		unsigned int size;
1539602adf40SYehuda Sadeh 		u64 ofs;
15401fec7093SYehuda Sadeh 		int num_segs, cur_seg = 0;
15411fec7093SYehuda Sadeh 		struct rbd_req_coll *coll;
1542d1d25646SJosh Durgin 		struct ceph_snap_context *snapc;
1543f7760dadSAlex Elder 		unsigned int bio_offset;
1544602adf40SYehuda Sadeh 
1545602adf40SYehuda Sadeh 		dout("fetched request\n");
1546602adf40SYehuda Sadeh 
1547602adf40SYehuda Sadeh 		/* filter out block requests we don't understand */
1548602adf40SYehuda Sadeh 		if ((rq->cmd_type != REQ_TYPE_FS)) {
1549602adf40SYehuda Sadeh 			__blk_end_request_all(rq, 0);
155000f1f36fSAlex Elder 			continue;
1551602adf40SYehuda Sadeh 		}
1552602adf40SYehuda Sadeh 
1553602adf40SYehuda Sadeh 		/* deduce our operation (read, write) */
1554602adf40SYehuda Sadeh 		do_write = (rq_data_dir(rq) == WRITE);
1555f84344f3SAlex Elder 		if (do_write && rbd_dev->mapping.read_only) {
1556602adf40SYehuda Sadeh 			__blk_end_request_all(rq, -EROFS);
155700f1f36fSAlex Elder 			continue;
1558602adf40SYehuda Sadeh 		}
1559602adf40SYehuda Sadeh 
		/*
		 * Drop the queue lock for the rest of the per-request
		 * work; each path back to the top of the loop takes it
		 * again first.
		 */
1560602adf40SYehuda Sadeh 		spin_unlock_irq(q->queue_lock);
1561602adf40SYehuda Sadeh 
1562e88a36ecSJosh Durgin 		down_read(&rbd_dev->header_rwsem);
1563e88a36ecSJosh Durgin 
		/* A mapped snapshot that is flagged as gone fails with -ENXIO */
1564f84344f3SAlex Elder 		if (rbd_dev->mapping.snap_id != CEPH_NOSNAP &&
1565f84344f3SAlex Elder 				!rbd_dev->mapping.snap_exists) {
1566d1d25646SJosh Durgin 			up_read(&rbd_dev->header_rwsem);
1567e88a36ecSJosh Durgin 			dout("request for non-existent snapshot");
1568e88a36ecSJosh Durgin 			spin_lock_irq(q->queue_lock);
1569e88a36ecSJosh Durgin 			__blk_end_request_all(rq, -ENXIO);
1570e88a36ecSJosh Durgin 			continue;
1571e88a36ecSJosh Durgin 		}
1572d1d25646SJosh Durgin 
		/*
		 * Take our own reference on the current snapshot context
		 * while the header semaphore is held; it is put again
		 * once this request has been dealt with.
		 */
1573d1d25646SJosh Durgin 		snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1574d1d25646SJosh Durgin 
1575d1d25646SJosh Durgin 		up_read(&rbd_dev->header_rwsem);
1576e88a36ecSJosh Durgin 
1577f7760dadSAlex Elder 		size = blk_rq_bytes(rq);
1578f7760dadSAlex Elder 		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
1579f7760dadSAlex Elder 		bio = rq->bio;
1580f7760dadSAlex Elder 
1581602adf40SYehuda Sadeh 		dout("%s 0x%x bytes at 0x%llx\n",
1582602adf40SYehuda Sadeh 		     do_write ? "write" : "read",
1583bd919d45SAlex Elder 		     size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
1584602adf40SYehuda Sadeh 
15851fec7093SYehuda Sadeh 		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
		/* A non-positive count is passed on as the completion status */
1586df111be6SAlex Elder 		if (num_segs <= 0) {
1587df111be6SAlex Elder 			spin_lock_irq(q->queue_lock);
1588df111be6SAlex Elder 			__blk_end_request_all(rq, num_segs);
1589df111be6SAlex Elder 			ceph_put_snap_context(snapc);
1590df111be6SAlex Elder 			continue;
1591df111be6SAlex Elder 		}
15921fec7093SYehuda Sadeh 		coll = rbd_alloc_coll(num_segs);
15931fec7093SYehuda Sadeh 		if (!coll) {
15941fec7093SYehuda Sadeh 			spin_lock_irq(q->queue_lock);
15951fec7093SYehuda Sadeh 			__blk_end_request_all(rq, -ENOMEM);
1596d1d25646SJosh Durgin 			ceph_put_snap_context(snapc);
159700f1f36fSAlex Elder 			continue;
15981fec7093SYehuda Sadeh 		}
15991fec7093SYehuda Sadeh 
1600f7760dadSAlex Elder 		bio_offset = 0;
1601602adf40SYehuda Sadeh 		do {
1602f7760dadSAlex Elder 			u64 limit = rbd_segment_length(rbd_dev, ofs, size);
1603f7760dadSAlex Elder 			unsigned int chain_size;
1604f7760dadSAlex Elder 			struct bio *bio_chain;
1605f7760dadSAlex Elder 
1606f7760dadSAlex Elder 			BUG_ON(limit > (u64) UINT_MAX);
1607f7760dadSAlex Elder 			chain_size = (unsigned int) limit;
1608bd919d45SAlex Elder 			dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
1609f7760dadSAlex Elder 
			/* One collection reference is taken per segment */
16101fec7093SYehuda Sadeh 			kref_get(&coll->kref);
1611f7760dadSAlex Elder 
1612f7760dadSAlex Elder 			/* Pass a cloned bio chain via an osd request */
1613f7760dadSAlex Elder 
1614f7760dadSAlex Elder 			bio_chain = bio_chain_clone_range(&bio,
1615f7760dadSAlex Elder 						&bio_offset, chain_size,
1616f7760dadSAlex Elder 						GFP_ATOMIC);
1617f7760dadSAlex Elder 			if (bio_chain)
16184634246dSAlex Elder 				(void) rbd_do_op(rq, rbd_dev, snapc,
1619f7760dadSAlex Elder 						ofs, chain_size,
1620f7760dadSAlex Elder 						bio_chain, coll, cur_seg);
16214634246dSAlex Elder 			else
				/* No clone: end this segment's slot with -ENOMEM */
16221fec7093SYehuda Sadeh 				rbd_coll_end_req_index(rq, coll, cur_seg,
1623f7760dadSAlex Elder 						       -ENOMEM, chain_size);
1624f7760dadSAlex Elder 			size -= chain_size;
1625f7760dadSAlex Elder 			ofs += chain_size;
1626602adf40SYehuda Sadeh 
16271fec7093SYehuda Sadeh 			cur_seg++;
1628602adf40SYehuda Sadeh 		} while (size > 0);
		/* Drop the initial reference set up by rbd_alloc_coll() */
16291fec7093SYehuda Sadeh 		kref_put(&coll->kref, rbd_coll_release);
1630602adf40SYehuda Sadeh 
1631602adf40SYehuda Sadeh 		spin_lock_irq(q->queue_lock);
1632d1d25646SJosh Durgin 
1633d1d25646SJosh Durgin 		ceph_put_snap_context(snapc);
1634602adf40SYehuda Sadeh 	}
1635602adf40SYehuda Sadeh }
1636602adf40SYehuda Sadeh 
1637602adf40SYehuda Sadeh /*
1638602adf40SYehuda Sadeh  * a queue callback. Makes sure that we don't create a bio that spans across
1639602adf40SYehuda Sadeh  * multiple osd objects. One exception would be with a single page bios,
1640f7760dadSAlex Elder  * which we handle later at bio_chain_clone_range()
1641602adf40SYehuda Sadeh  */
1642602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1643602adf40SYehuda Sadeh 			  struct bio_vec *bvec)
1644602adf40SYehuda Sadeh {
1645602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
1646e5cfeed2SAlex Elder 	sector_t sector_offset;
1647e5cfeed2SAlex Elder 	sector_t sectors_per_obj;
1648e5cfeed2SAlex Elder 	sector_t obj_sector_offset;
1649e5cfeed2SAlex Elder 	int ret;
1650602adf40SYehuda Sadeh 
1651e5cfeed2SAlex Elder 	/*
1652e5cfeed2SAlex Elder 	 * Find how far into its rbd object the partition-relative
1653e5cfeed2SAlex Elder 	 * bio start sector is to offset relative to the enclosing
1654e5cfeed2SAlex Elder 	 * device.
1655e5cfeed2SAlex Elder 	 */
1656e5cfeed2SAlex Elder 	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
1657e5cfeed2SAlex Elder 	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1658e5cfeed2SAlex Elder 	obj_sector_offset = sector_offset & (sectors_per_obj - 1);
1659593a9e7bSAlex Elder 
1660e5cfeed2SAlex Elder 	/*
1661e5cfeed2SAlex Elder 	 * Compute the number of bytes from that offset to the end
1662e5cfeed2SAlex Elder 	 * of the object.  Account for what's already used by the bio.
1663e5cfeed2SAlex Elder 	 */
1664e5cfeed2SAlex Elder 	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
1665e5cfeed2SAlex Elder 	if (ret > bmd->bi_size)
1666e5cfeed2SAlex Elder 		ret -= bmd->bi_size;
1667e5cfeed2SAlex Elder 	else
1668e5cfeed2SAlex Elder 		ret = 0;
1669e5cfeed2SAlex Elder 
1670e5cfeed2SAlex Elder 	/*
1671e5cfeed2SAlex Elder 	 * Don't send back more than was asked for.  And if the bio
1672e5cfeed2SAlex Elder 	 * was empty, let the whole thing through because:  "Note
1673e5cfeed2SAlex Elder 	 * that a block device *must* allow a single page to be
1674e5cfeed2SAlex Elder 	 * added to an empty bio."
1675e5cfeed2SAlex Elder 	 */
1676e5cfeed2SAlex Elder 	rbd_assert(bvec->bv_len <= PAGE_SIZE);
1677e5cfeed2SAlex Elder 	if (ret > (int) bvec->bv_len || !bmd->bi_size)
1678e5cfeed2SAlex Elder 		ret = (int) bvec->bv_len;
1679e5cfeed2SAlex Elder 
1680e5cfeed2SAlex Elder 	return ret;
1681602adf40SYehuda Sadeh }
1682602adf40SYehuda Sadeh 
1683602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
1684602adf40SYehuda Sadeh {
1685602adf40SYehuda Sadeh 	struct gendisk *disk = rbd_dev->disk;
1686602adf40SYehuda Sadeh 
1687602adf40SYehuda Sadeh 	if (!disk)
1688602adf40SYehuda Sadeh 		return;
1689602adf40SYehuda Sadeh 
1690602adf40SYehuda Sadeh 	if (disk->flags & GENHD_FL_UP)
1691602adf40SYehuda Sadeh 		del_gendisk(disk);
1692602adf40SYehuda Sadeh 	if (disk->queue)
1693602adf40SYehuda Sadeh 		blk_cleanup_queue(disk->queue);
1694602adf40SYehuda Sadeh 	put_disk(disk);
1695602adf40SYehuda Sadeh }
1696602adf40SYehuda Sadeh 
1697602adf40SYehuda Sadeh /*
16984156d998SAlex Elder  * Read the complete header for the given rbd device.
16994156d998SAlex Elder  *
17004156d998SAlex Elder  * Returns a pointer to a dynamically-allocated buffer containing
17014156d998SAlex Elder  * the complete and validated header.  Caller can pass the address
17024156d998SAlex Elder  * of a variable that will be filled in with the version of the
17034156d998SAlex Elder  * header object at the time it was read.
17044156d998SAlex Elder  *
17054156d998SAlex Elder  * Returns a pointer-coded errno if a failure occurs.
17064156d998SAlex Elder  */
17074156d998SAlex Elder static struct rbd_image_header_ondisk *
17084156d998SAlex Elder rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
17094156d998SAlex Elder {
17104156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk = NULL;
17114156d998SAlex Elder 	u32 snap_count = 0;
17124156d998SAlex Elder 	u64 names_size = 0;
17134156d998SAlex Elder 	u32 want_count;
17144156d998SAlex Elder 	int ret;
17154156d998SAlex Elder 
17164156d998SAlex Elder 	/*
17174156d998SAlex Elder 	 * The complete header will include an array of its 64-bit
17184156d998SAlex Elder 	 * snapshot ids, followed by the names of those snapshots as
17194156d998SAlex Elder 	 * a contiguous block of NUL-terminated strings.  Note that
17204156d998SAlex Elder 	 * the number of snapshots could change by the time we read
17214156d998SAlex Elder 	 * it in, in which case we re-read it.
17224156d998SAlex Elder 	 */
17234156d998SAlex Elder 	do {
17244156d998SAlex Elder 		size_t size;
17254156d998SAlex Elder 
17264156d998SAlex Elder 		kfree(ondisk);
17274156d998SAlex Elder 
17284156d998SAlex Elder 		size = sizeof (*ondisk);
17294156d998SAlex Elder 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
17304156d998SAlex Elder 		size += names_size;
17314156d998SAlex Elder 		ondisk = kmalloc(size, GFP_KERNEL);
17324156d998SAlex Elder 		if (!ondisk)
17334156d998SAlex Elder 			return ERR_PTR(-ENOMEM);
17344156d998SAlex Elder 
17354156d998SAlex Elder 		ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
17364156d998SAlex Elder 				       rbd_dev->header_name,
17374156d998SAlex Elder 				       0, size,
17384156d998SAlex Elder 				       (char *) ondisk, version);
17394156d998SAlex Elder 
17404156d998SAlex Elder 		if (ret < 0)
17414156d998SAlex Elder 			goto out_err;
17424156d998SAlex Elder 		if (WARN_ON((size_t) ret < size)) {
17434156d998SAlex Elder 			ret = -ENXIO;
17444156d998SAlex Elder 			pr_warning("short header read for image %s"
17454156d998SAlex Elder 					" (want %zd got %d)\n",
17464156d998SAlex Elder 				rbd_dev->image_name, size, ret);
17474156d998SAlex Elder 			goto out_err;
17484156d998SAlex Elder 		}
17494156d998SAlex Elder 		if (!rbd_dev_ondisk_valid(ondisk)) {
17504156d998SAlex Elder 			ret = -ENXIO;
17514156d998SAlex Elder 			pr_warning("invalid header for image %s\n",
17524156d998SAlex Elder 				rbd_dev->image_name);
17534156d998SAlex Elder 			goto out_err;
17544156d998SAlex Elder 		}
17554156d998SAlex Elder 
17564156d998SAlex Elder 		names_size = le64_to_cpu(ondisk->snap_names_len);
17574156d998SAlex Elder 		want_count = snap_count;
17584156d998SAlex Elder 		snap_count = le32_to_cpu(ondisk->snap_count);
17594156d998SAlex Elder 	} while (snap_count != want_count);
17604156d998SAlex Elder 
17614156d998SAlex Elder 	return ondisk;
17624156d998SAlex Elder 
17634156d998SAlex Elder out_err:
17644156d998SAlex Elder 	kfree(ondisk);
17654156d998SAlex Elder 
17664156d998SAlex Elder 	return ERR_PTR(ret);
17674156d998SAlex Elder }
17684156d998SAlex Elder 
17694156d998SAlex Elder /*
1770602adf40SYehuda Sadeh  * reload the ondisk the header
1771602adf40SYehuda Sadeh  */
1772602adf40SYehuda Sadeh static int rbd_read_header(struct rbd_device *rbd_dev,
1773602adf40SYehuda Sadeh 			   struct rbd_image_header *header)
1774602adf40SYehuda Sadeh {
17754156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk;
17764156d998SAlex Elder 	u64 ver = 0;
17774156d998SAlex Elder 	int ret;
1778602adf40SYehuda Sadeh 
17794156d998SAlex Elder 	ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
17804156d998SAlex Elder 	if (IS_ERR(ondisk))
17814156d998SAlex Elder 		return PTR_ERR(ondisk);
17824156d998SAlex Elder 	ret = rbd_header_from_disk(header, ondisk);
17834156d998SAlex Elder 	if (ret >= 0)
178459c2be1eSYehuda Sadeh 		header->obj_version = ver;
17854156d998SAlex Elder 	kfree(ondisk);
1786602adf40SYehuda Sadeh 
17874156d998SAlex Elder 	return ret;
1788602adf40SYehuda Sadeh }
1789602adf40SYehuda Sadeh 
1790dfc5606dSYehuda Sadeh static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1791dfc5606dSYehuda Sadeh {
1792dfc5606dSYehuda Sadeh 	struct rbd_snap *snap;
1793a0593290SAlex Elder 	struct rbd_snap *next;
1794dfc5606dSYehuda Sadeh 
1795a0593290SAlex Elder 	list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
179614e7085dSAlex Elder 		__rbd_remove_snap_dev(snap);
1797dfc5606dSYehuda Sadeh }
1798dfc5606dSYehuda Sadeh 
17999478554aSAlex Elder static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
18009478554aSAlex Elder {
18019478554aSAlex Elder 	sector_t size;
18029478554aSAlex Elder 
18039478554aSAlex Elder 	if (rbd_dev->mapping.snap_id != CEPH_NOSNAP)
18049478554aSAlex Elder 		return;
18059478554aSAlex Elder 
18069478554aSAlex Elder 	size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
18079478554aSAlex Elder 	dout("setting size to %llu sectors", (unsigned long long) size);
18089478554aSAlex Elder 	rbd_dev->mapping.size = (u64) size;
18099478554aSAlex Elder 	set_capacity(rbd_dev->disk, size);
18109478554aSAlex Elder }
18119478554aSAlex Elder 
1812602adf40SYehuda Sadeh /*
1813602adf40SYehuda Sadeh  * only read the first part of the ondisk header, without the snaps info
1814602adf40SYehuda Sadeh  */
1815117973fbSAlex Elder static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
1816602adf40SYehuda Sadeh {
1817602adf40SYehuda Sadeh 	int ret;
1818602adf40SYehuda Sadeh 	struct rbd_image_header h;
1819602adf40SYehuda Sadeh 
1820602adf40SYehuda Sadeh 	ret = rbd_read_header(rbd_dev, &h);
1821602adf40SYehuda Sadeh 	if (ret < 0)
1822602adf40SYehuda Sadeh 		return ret;
1823602adf40SYehuda Sadeh 
1824a51aa0c0SJosh Durgin 	down_write(&rbd_dev->header_rwsem);
1825a51aa0c0SJosh Durgin 
18269478554aSAlex Elder 	/* Update image size, and check for resize of mapped image */
18279478554aSAlex Elder 	rbd_dev->header.image_size = h.image_size;
18289478554aSAlex Elder 	rbd_update_mapping_size(rbd_dev);
18299db4b3e3SSage Weil 
1830849b4260SAlex Elder 	/* rbd_dev->header.object_prefix shouldn't change */
1831602adf40SYehuda Sadeh 	kfree(rbd_dev->header.snap_sizes);
1832849b4260SAlex Elder 	kfree(rbd_dev->header.snap_names);
1833d1d25646SJosh Durgin 	/* osd requests may still refer to snapc */
1834d1d25646SJosh Durgin 	ceph_put_snap_context(rbd_dev->header.snapc);
1835602adf40SYehuda Sadeh 
1836b813623aSAlex Elder 	if (hver)
1837b813623aSAlex Elder 		*hver = h.obj_version;
1838a71b891bSJosh Durgin 	rbd_dev->header.obj_version = h.obj_version;
183993a24e08SJosh Durgin 	rbd_dev->header.image_size = h.image_size;
1840602adf40SYehuda Sadeh 	rbd_dev->header.snapc = h.snapc;
1841602adf40SYehuda Sadeh 	rbd_dev->header.snap_names = h.snap_names;
1842602adf40SYehuda Sadeh 	rbd_dev->header.snap_sizes = h.snap_sizes;
1843849b4260SAlex Elder 	/* Free the extra copy of the object prefix */
1844849b4260SAlex Elder 	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1845849b4260SAlex Elder 	kfree(h.object_prefix);
1846849b4260SAlex Elder 
1847304f6808SAlex Elder 	ret = rbd_dev_snaps_update(rbd_dev);
1848304f6808SAlex Elder 	if (!ret)
1849304f6808SAlex Elder 		ret = rbd_dev_snaps_register(rbd_dev);
1850dfc5606dSYehuda Sadeh 
1851c666601aSJosh Durgin 	up_write(&rbd_dev->header_rwsem);
1852602adf40SYehuda Sadeh 
1853dfc5606dSYehuda Sadeh 	return ret;
1854602adf40SYehuda Sadeh }
1855602adf40SYehuda Sadeh 
1856117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
18571fe5e993SAlex Elder {
18581fe5e993SAlex Elder 	int ret;
18591fe5e993SAlex Elder 
1860117973fbSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
18611fe5e993SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1862117973fbSAlex Elder 	if (rbd_dev->image_format == 1)
1863117973fbSAlex Elder 		ret = rbd_dev_v1_refresh(rbd_dev, hver);
1864117973fbSAlex Elder 	else
1865117973fbSAlex Elder 		ret = rbd_dev_v2_refresh(rbd_dev, hver);
18661fe5e993SAlex Elder 	mutex_unlock(&ctl_mutex);
18671fe5e993SAlex Elder 
18681fe5e993SAlex Elder 	return ret;
18691fe5e993SAlex Elder }
18701fe5e993SAlex Elder 
1871602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
1872602adf40SYehuda Sadeh {
1873602adf40SYehuda Sadeh 	struct gendisk *disk;
1874602adf40SYehuda Sadeh 	struct request_queue *q;
1875593a9e7bSAlex Elder 	u64 segment_size;
1876602adf40SYehuda Sadeh 
1877602adf40SYehuda Sadeh 	/* create gendisk info */
1878602adf40SYehuda Sadeh 	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1879602adf40SYehuda Sadeh 	if (!disk)
18801fcdb8aaSAlex Elder 		return -ENOMEM;
1881602adf40SYehuda Sadeh 
1882f0f8cef5SAlex Elder 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
1883de71a297SAlex Elder 		 rbd_dev->dev_id);
1884602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
1885602adf40SYehuda Sadeh 	disk->first_minor = 0;
1886602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
1887602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
1888602adf40SYehuda Sadeh 
1889602adf40SYehuda Sadeh 	/* init rq */
1890602adf40SYehuda Sadeh 	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1891602adf40SYehuda Sadeh 	if (!q)
1892602adf40SYehuda Sadeh 		goto out_disk;
1893029bcbd8SJosh Durgin 
1894593a9e7bSAlex Elder 	/* We use the default size, but let's be explicit about it. */
1895593a9e7bSAlex Elder 	blk_queue_physical_block_size(q, SECTOR_SIZE);
1896593a9e7bSAlex Elder 
1897029bcbd8SJosh Durgin 	/* set io sizes to object size */
1898593a9e7bSAlex Elder 	segment_size = rbd_obj_bytes(&rbd_dev->header);
1899593a9e7bSAlex Elder 	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
1900593a9e7bSAlex Elder 	blk_queue_max_segment_size(q, segment_size);
1901593a9e7bSAlex Elder 	blk_queue_io_min(q, segment_size);
1902593a9e7bSAlex Elder 	blk_queue_io_opt(q, segment_size);
1903029bcbd8SJosh Durgin 
1904602adf40SYehuda Sadeh 	blk_queue_merge_bvec(q, rbd_merge_bvec);
1905602adf40SYehuda Sadeh 	disk->queue = q;
1906602adf40SYehuda Sadeh 
1907602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
1908602adf40SYehuda Sadeh 
1909602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
1910602adf40SYehuda Sadeh 
191112f02944SAlex Elder 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
191212f02944SAlex Elder 
1913602adf40SYehuda Sadeh 	return 0;
1914602adf40SYehuda Sadeh out_disk:
1915602adf40SYehuda Sadeh 	put_disk(disk);
19161fcdb8aaSAlex Elder 
19171fcdb8aaSAlex Elder 	return -ENOMEM;
1918602adf40SYehuda Sadeh }
1919602adf40SYehuda Sadeh 
1920dfc5606dSYehuda Sadeh /*
1921dfc5606dSYehuda Sadeh   sysfs
1922dfc5606dSYehuda Sadeh */
1923602adf40SYehuda Sadeh 
1924593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
1925593a9e7bSAlex Elder {
1926593a9e7bSAlex Elder 	return container_of(dev, struct rbd_device, dev);
1927593a9e7bSAlex Elder }
1928593a9e7bSAlex Elder 
1929dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
1930dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
1931602adf40SYehuda Sadeh {
1932593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1933a51aa0c0SJosh Durgin 	sector_t size;
1934dfc5606dSYehuda Sadeh 
1935a51aa0c0SJosh Durgin 	down_read(&rbd_dev->header_rwsem);
1936a51aa0c0SJosh Durgin 	size = get_capacity(rbd_dev->disk);
1937a51aa0c0SJosh Durgin 	up_read(&rbd_dev->header_rwsem);
1938a51aa0c0SJosh Durgin 
1939a51aa0c0SJosh Durgin 	return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
1940602adf40SYehuda Sadeh }
1941602adf40SYehuda Sadeh 
194234b13184SAlex Elder /*
194334b13184SAlex Elder  * Note this shows the features for whatever's mapped, which is not
194434b13184SAlex Elder  * necessarily the base image.
194534b13184SAlex Elder  */
194634b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev,
194734b13184SAlex Elder 			     struct device_attribute *attr, char *buf)
194834b13184SAlex Elder {
194934b13184SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
195034b13184SAlex Elder 
195134b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
195234b13184SAlex Elder 			(unsigned long long) rbd_dev->mapping.features);
195334b13184SAlex Elder }
195434b13184SAlex Elder 
1955dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
1956dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
1957602adf40SYehuda Sadeh {
1958593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1959dfc5606dSYehuda Sadeh 
1960dfc5606dSYehuda Sadeh 	return sprintf(buf, "%d\n", rbd_dev->major);
1961dfc5606dSYehuda Sadeh }
1962dfc5606dSYehuda Sadeh 
1963dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
1964dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
1965dfc5606dSYehuda Sadeh {
1966593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1967dfc5606dSYehuda Sadeh 
19681dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
19691dbb4399SAlex Elder 			ceph_client_id(rbd_dev->rbd_client->client));
1970dfc5606dSYehuda Sadeh }
1971dfc5606dSYehuda Sadeh 
1972dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
1973dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
1974dfc5606dSYehuda Sadeh {
1975593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1976dfc5606dSYehuda Sadeh 
1977dfc5606dSYehuda Sadeh 	return sprintf(buf, "%s\n", rbd_dev->pool_name);
1978dfc5606dSYehuda Sadeh }
1979dfc5606dSYehuda Sadeh 
19809bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
19819bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
19829bb2f334SAlex Elder {
19839bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
19849bb2f334SAlex Elder 
19859bb2f334SAlex Elder 	return sprintf(buf, "%d\n", rbd_dev->pool_id);
19869bb2f334SAlex Elder }
19879bb2f334SAlex Elder 
1988dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
1989dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
1990dfc5606dSYehuda Sadeh {
1991593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1992dfc5606dSYehuda Sadeh 
19930bed54dcSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->image_name);
1994dfc5606dSYehuda Sadeh }
1995dfc5606dSYehuda Sadeh 
1996589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev,
1997589d30e0SAlex Elder 			     struct device_attribute *attr, char *buf)
1998589d30e0SAlex Elder {
1999589d30e0SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2000589d30e0SAlex Elder 
2001589d30e0SAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->image_id);
2002589d30e0SAlex Elder }
2003589d30e0SAlex Elder 
200434b13184SAlex Elder /*
200534b13184SAlex Elder  * Shows the name of the currently-mapped snapshot (or
200634b13184SAlex Elder  * RBD_SNAP_HEAD_NAME for the base image).
200734b13184SAlex Elder  */
2008dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
2009dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
2010dfc5606dSYehuda Sadeh 			     char *buf)
2011dfc5606dSYehuda Sadeh {
2012593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2013dfc5606dSYehuda Sadeh 
2014f84344f3SAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->mapping.snap_name);
2015dfc5606dSYehuda Sadeh }
2016dfc5606dSYehuda Sadeh 
2017dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
2018dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
2019dfc5606dSYehuda Sadeh 				 const char *buf,
2020dfc5606dSYehuda Sadeh 				 size_t size)
2021dfc5606dSYehuda Sadeh {
2022593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2023b813623aSAlex Elder 	int ret;
2024602adf40SYehuda Sadeh 
2025117973fbSAlex Elder 	ret = rbd_dev_refresh(rbd_dev, NULL);
2026b813623aSAlex Elder 
2027b813623aSAlex Elder 	return ret < 0 ? ret : size;
2028dfc5606dSYehuda Sadeh }
2029602adf40SYehuda Sadeh 
2030dfc5606dSYehuda Sadeh static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
203134b13184SAlex Elder static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
2032dfc5606dSYehuda Sadeh static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
2033dfc5606dSYehuda Sadeh static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
2034dfc5606dSYehuda Sadeh static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
20359bb2f334SAlex Elder static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
2036dfc5606dSYehuda Sadeh static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
2037589d30e0SAlex Elder static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
2038dfc5606dSYehuda Sadeh static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
2039dfc5606dSYehuda Sadeh static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
2040dfc5606dSYehuda Sadeh 
2041dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = {
2042dfc5606dSYehuda Sadeh 	&dev_attr_size.attr,
204334b13184SAlex Elder 	&dev_attr_features.attr,
2044dfc5606dSYehuda Sadeh 	&dev_attr_major.attr,
2045dfc5606dSYehuda Sadeh 	&dev_attr_client_id.attr,
2046dfc5606dSYehuda Sadeh 	&dev_attr_pool.attr,
20479bb2f334SAlex Elder 	&dev_attr_pool_id.attr,
2048dfc5606dSYehuda Sadeh 	&dev_attr_name.attr,
2049589d30e0SAlex Elder 	&dev_attr_image_id.attr,
2050dfc5606dSYehuda Sadeh 	&dev_attr_current_snap.attr,
2051dfc5606dSYehuda Sadeh 	&dev_attr_refresh.attr,
2052dfc5606dSYehuda Sadeh 	NULL
2053dfc5606dSYehuda Sadeh };
2054dfc5606dSYehuda Sadeh 
2055dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = {
2056dfc5606dSYehuda Sadeh 	.attrs = rbd_attrs,
2057dfc5606dSYehuda Sadeh };
2058dfc5606dSYehuda Sadeh 
2059dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = {
2060dfc5606dSYehuda Sadeh 	&rbd_attr_group,
2061dfc5606dSYehuda Sadeh 	NULL
2062dfc5606dSYehuda Sadeh };
2063dfc5606dSYehuda Sadeh 
2064dfc5606dSYehuda Sadeh static void rbd_sysfs_dev_release(struct device *dev)
2065dfc5606dSYehuda Sadeh {
2066dfc5606dSYehuda Sadeh }
2067dfc5606dSYehuda Sadeh 
2068dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = {
2069dfc5606dSYehuda Sadeh 	.name		= "rbd",
2070dfc5606dSYehuda Sadeh 	.groups		= rbd_attr_groups,
2071dfc5606dSYehuda Sadeh 	.release	= rbd_sysfs_dev_release,
2072dfc5606dSYehuda Sadeh };
2073dfc5606dSYehuda Sadeh 
2074dfc5606dSYehuda Sadeh 
2075dfc5606dSYehuda Sadeh /*
2076dfc5606dSYehuda Sadeh   sysfs - snapshots
2077dfc5606dSYehuda Sadeh */
2078dfc5606dSYehuda Sadeh 
2079dfc5606dSYehuda Sadeh static ssize_t rbd_snap_size_show(struct device *dev,
2080dfc5606dSYehuda Sadeh 				  struct device_attribute *attr,
2081dfc5606dSYehuda Sadeh 				  char *buf)
2082dfc5606dSYehuda Sadeh {
2083dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2084dfc5606dSYehuda Sadeh 
20853591538fSJosh Durgin 	return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
2086dfc5606dSYehuda Sadeh }
2087dfc5606dSYehuda Sadeh 
2088dfc5606dSYehuda Sadeh static ssize_t rbd_snap_id_show(struct device *dev,
2089dfc5606dSYehuda Sadeh 				struct device_attribute *attr,
2090dfc5606dSYehuda Sadeh 				char *buf)
2091dfc5606dSYehuda Sadeh {
2092dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2093dfc5606dSYehuda Sadeh 
2094593a9e7bSAlex Elder 	return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
2095dfc5606dSYehuda Sadeh }
2096dfc5606dSYehuda Sadeh 
209734b13184SAlex Elder static ssize_t rbd_snap_features_show(struct device *dev,
209834b13184SAlex Elder 				struct device_attribute *attr,
209934b13184SAlex Elder 				char *buf)
210034b13184SAlex Elder {
210134b13184SAlex Elder 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
210234b13184SAlex Elder 
210334b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
210434b13184SAlex Elder 			(unsigned long long) snap->features);
210534b13184SAlex Elder }
210634b13184SAlex Elder 
2107dfc5606dSYehuda Sadeh static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
2108dfc5606dSYehuda Sadeh static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
210934b13184SAlex Elder static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
2110dfc5606dSYehuda Sadeh 
2111dfc5606dSYehuda Sadeh static struct attribute *rbd_snap_attrs[] = {
2112dfc5606dSYehuda Sadeh 	&dev_attr_snap_size.attr,
2113dfc5606dSYehuda Sadeh 	&dev_attr_snap_id.attr,
211434b13184SAlex Elder 	&dev_attr_snap_features.attr,
2115dfc5606dSYehuda Sadeh 	NULL,
2116dfc5606dSYehuda Sadeh };
2117dfc5606dSYehuda Sadeh 
2118dfc5606dSYehuda Sadeh static struct attribute_group rbd_snap_attr_group = {
2119dfc5606dSYehuda Sadeh 	.attrs = rbd_snap_attrs,
2120dfc5606dSYehuda Sadeh };
2121dfc5606dSYehuda Sadeh 
2122dfc5606dSYehuda Sadeh static void rbd_snap_dev_release(struct device *dev)
2123dfc5606dSYehuda Sadeh {
2124dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2125dfc5606dSYehuda Sadeh 	kfree(snap->name);
2126dfc5606dSYehuda Sadeh 	kfree(snap);
2127dfc5606dSYehuda Sadeh }
2128dfc5606dSYehuda Sadeh 
2129dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_snap_attr_groups[] = {
2130dfc5606dSYehuda Sadeh 	&rbd_snap_attr_group,
2131dfc5606dSYehuda Sadeh 	NULL
2132dfc5606dSYehuda Sadeh };
2133dfc5606dSYehuda Sadeh 
2134dfc5606dSYehuda Sadeh static struct device_type rbd_snap_device_type = {
2135dfc5606dSYehuda Sadeh 	.groups		= rbd_snap_attr_groups,
2136dfc5606dSYehuda Sadeh 	.release	= rbd_snap_dev_release,
2137dfc5606dSYehuda Sadeh };
2138dfc5606dSYehuda Sadeh 
2139304f6808SAlex Elder static bool rbd_snap_registered(struct rbd_snap *snap)
2140304f6808SAlex Elder {
2141304f6808SAlex Elder 	bool ret = snap->dev.type == &rbd_snap_device_type;
2142304f6808SAlex Elder 	bool reg = device_is_registered(&snap->dev);
2143304f6808SAlex Elder 
2144304f6808SAlex Elder 	rbd_assert(!ret ^ reg);
2145304f6808SAlex Elder 
2146304f6808SAlex Elder 	return ret;
2147304f6808SAlex Elder }
2148304f6808SAlex Elder 
214914e7085dSAlex Elder static void __rbd_remove_snap_dev(struct rbd_snap *snap)
2150dfc5606dSYehuda Sadeh {
2151dfc5606dSYehuda Sadeh 	list_del(&snap->node);
2152304f6808SAlex Elder 	if (device_is_registered(&snap->dev))
2153dfc5606dSYehuda Sadeh 		device_unregister(&snap->dev);
2154dfc5606dSYehuda Sadeh }
2155dfc5606dSYehuda Sadeh 
215614e7085dSAlex Elder static int rbd_register_snap_dev(struct rbd_snap *snap,
2157dfc5606dSYehuda Sadeh 				  struct device *parent)
2158dfc5606dSYehuda Sadeh {
2159dfc5606dSYehuda Sadeh 	struct device *dev = &snap->dev;
2160dfc5606dSYehuda Sadeh 	int ret;
2161dfc5606dSYehuda Sadeh 
2162dfc5606dSYehuda Sadeh 	dev->type = &rbd_snap_device_type;
2163dfc5606dSYehuda Sadeh 	dev->parent = parent;
2164dfc5606dSYehuda Sadeh 	dev->release = rbd_snap_dev_release;
2165d4b125e9SAlex Elder 	dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
2166304f6808SAlex Elder 	dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2167304f6808SAlex Elder 
2168dfc5606dSYehuda Sadeh 	ret = device_register(dev);
2169dfc5606dSYehuda Sadeh 
2170dfc5606dSYehuda Sadeh 	return ret;
2171dfc5606dSYehuda Sadeh }
2172dfc5606dSYehuda Sadeh 
21734e891e0aSAlex Elder static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
2174c8d18425SAlex Elder 						const char *snap_name,
217534b13184SAlex Elder 						u64 snap_id, u64 snap_size,
217634b13184SAlex Elder 						u64 snap_features)
2177dfc5606dSYehuda Sadeh {
21784e891e0aSAlex Elder 	struct rbd_snap *snap;
2179dfc5606dSYehuda Sadeh 	int ret;
21804e891e0aSAlex Elder 
21814e891e0aSAlex Elder 	snap = kzalloc(sizeof (*snap), GFP_KERNEL);
2182dfc5606dSYehuda Sadeh 	if (!snap)
21834e891e0aSAlex Elder 		return ERR_PTR(-ENOMEM);
21844e891e0aSAlex Elder 
21854e891e0aSAlex Elder 	ret = -ENOMEM;
2186c8d18425SAlex Elder 	snap->name = kstrdup(snap_name, GFP_KERNEL);
21874e891e0aSAlex Elder 	if (!snap->name)
21884e891e0aSAlex Elder 		goto err;
21894e891e0aSAlex Elder 
2190c8d18425SAlex Elder 	snap->id = snap_id;
2191c8d18425SAlex Elder 	snap->size = snap_size;
219234b13184SAlex Elder 	snap->features = snap_features;
21934e891e0aSAlex Elder 
21944e891e0aSAlex Elder 	return snap;
21954e891e0aSAlex Elder 
2196dfc5606dSYehuda Sadeh err:
2197dfc5606dSYehuda Sadeh 	kfree(snap->name);
2198dfc5606dSYehuda Sadeh 	kfree(snap);
21994e891e0aSAlex Elder 
22004e891e0aSAlex Elder 	return ERR_PTR(ret);
2201dfc5606dSYehuda Sadeh }
2202dfc5606dSYehuda Sadeh 
2203cd892126SAlex Elder static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2204cd892126SAlex Elder 		u64 *snap_size, u64 *snap_features)
2205cd892126SAlex Elder {
2206cd892126SAlex Elder 	char *snap_name;
2207cd892126SAlex Elder 
2208cd892126SAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2209cd892126SAlex Elder 
2210cd892126SAlex Elder 	*snap_size = rbd_dev->header.snap_sizes[which];
2211cd892126SAlex Elder 	*snap_features = 0;	/* No features for v1 */
2212cd892126SAlex Elder 
2213cd892126SAlex Elder 	/* Skip over names until we find the one we are looking for */
2214cd892126SAlex Elder 
2215cd892126SAlex Elder 	snap_name = rbd_dev->header.snap_names;
2216cd892126SAlex Elder 	while (which--)
2217cd892126SAlex Elder 		snap_name += strlen(snap_name) + 1;
2218cd892126SAlex Elder 
2219cd892126SAlex Elder 	return snap_name;
2220cd892126SAlex Elder }
2221cd892126SAlex Elder 
2222dfc5606dSYehuda Sadeh /*
22239d475de5SAlex Elder  * Get the size and object order for an image snapshot, or if
22249d475de5SAlex Elder  * snap_id is CEPH_NOSNAP, gets this information for the base
22259d475de5SAlex Elder  * image.
22269d475de5SAlex Elder  */
22279d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
22289d475de5SAlex Elder 				u8 *order, u64 *snap_size)
22299d475de5SAlex Elder {
22309d475de5SAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
22319d475de5SAlex Elder 	int ret;
22329d475de5SAlex Elder 	struct {
22339d475de5SAlex Elder 		u8 order;
22349d475de5SAlex Elder 		__le64 size;
22359d475de5SAlex Elder 	} __attribute__ ((packed)) size_buf = { 0 };
22369d475de5SAlex Elder 
22379d475de5SAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
22389d475de5SAlex Elder 				"rbd", "get_size",
22399d475de5SAlex Elder 				(char *) &snapid, sizeof (snapid),
22409d475de5SAlex Elder 				(char *) &size_buf, sizeof (size_buf),
22419d475de5SAlex Elder 				CEPH_OSD_FLAG_READ, NULL);
22429d475de5SAlex Elder 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
22439d475de5SAlex Elder 	if (ret < 0)
22449d475de5SAlex Elder 		return ret;
22459d475de5SAlex Elder 
22469d475de5SAlex Elder 	*order = size_buf.order;
22479d475de5SAlex Elder 	*snap_size = le64_to_cpu(size_buf.size);
22489d475de5SAlex Elder 
22499d475de5SAlex Elder 	dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
22509d475de5SAlex Elder 		(unsigned long long) snap_id, (unsigned int) *order,
22519d475de5SAlex Elder 		(unsigned long long) *snap_size);
22529d475de5SAlex Elder 
22539d475de5SAlex Elder 	return 0;
22549d475de5SAlex Elder }
22559d475de5SAlex Elder 
22569d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
22579d475de5SAlex Elder {
22589d475de5SAlex Elder 	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
22599d475de5SAlex Elder 					&rbd_dev->header.obj_order,
22609d475de5SAlex Elder 					&rbd_dev->header.image_size);
22619d475de5SAlex Elder }
22629d475de5SAlex Elder 
22631e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
22641e130199SAlex Elder {
22651e130199SAlex Elder 	void *reply_buf;
22661e130199SAlex Elder 	int ret;
22671e130199SAlex Elder 	void *p;
22681e130199SAlex Elder 
22691e130199SAlex Elder 	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
22701e130199SAlex Elder 	if (!reply_buf)
22711e130199SAlex Elder 		return -ENOMEM;
22721e130199SAlex Elder 
22731e130199SAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
22741e130199SAlex Elder 				"rbd", "get_object_prefix",
22751e130199SAlex Elder 				NULL, 0,
22761e130199SAlex Elder 				reply_buf, RBD_OBJ_PREFIX_LEN_MAX,
22771e130199SAlex Elder 				CEPH_OSD_FLAG_READ, NULL);
22781e130199SAlex Elder 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
22791e130199SAlex Elder 	if (ret < 0)
22801e130199SAlex Elder 		goto out;
2281a0ea3a40SAlex Elder 	ret = 0;    /* rbd_req_sync_exec() can return positive */
22821e130199SAlex Elder 
22831e130199SAlex Elder 	p = reply_buf;
22841e130199SAlex Elder 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
22851e130199SAlex Elder 						p + RBD_OBJ_PREFIX_LEN_MAX,
22861e130199SAlex Elder 						NULL, GFP_NOIO);
22871e130199SAlex Elder 
22881e130199SAlex Elder 	if (IS_ERR(rbd_dev->header.object_prefix)) {
22891e130199SAlex Elder 		ret = PTR_ERR(rbd_dev->header.object_prefix);
22901e130199SAlex Elder 		rbd_dev->header.object_prefix = NULL;
22911e130199SAlex Elder 	} else {
22921e130199SAlex Elder 		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
22931e130199SAlex Elder 	}
22941e130199SAlex Elder 
22951e130199SAlex Elder out:
22961e130199SAlex Elder 	kfree(reply_buf);
22971e130199SAlex Elder 
22981e130199SAlex Elder 	return ret;
22991e130199SAlex Elder }
23001e130199SAlex Elder 
2301b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
2302b1b5402aSAlex Elder 		u64 *snap_features)
2303b1b5402aSAlex Elder {
2304b1b5402aSAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
2305b1b5402aSAlex Elder 	struct {
2306b1b5402aSAlex Elder 		__le64 features;
2307b1b5402aSAlex Elder 		__le64 incompat;
2308b1b5402aSAlex Elder 	} features_buf = { 0 };
2309d889140cSAlex Elder 	u64 incompat;
2310b1b5402aSAlex Elder 	int ret;
2311b1b5402aSAlex Elder 
2312b1b5402aSAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2313b1b5402aSAlex Elder 				"rbd", "get_features",
2314b1b5402aSAlex Elder 				(char *) &snapid, sizeof (snapid),
2315b1b5402aSAlex Elder 				(char *) &features_buf, sizeof (features_buf),
2316b1b5402aSAlex Elder 				CEPH_OSD_FLAG_READ, NULL);
2317b1b5402aSAlex Elder 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2318b1b5402aSAlex Elder 	if (ret < 0)
2319b1b5402aSAlex Elder 		return ret;
2320d889140cSAlex Elder 
2321d889140cSAlex Elder 	incompat = le64_to_cpu(features_buf.incompat);
2322d889140cSAlex Elder 	if (incompat & ~RBD_FEATURES_ALL)
2323d889140cSAlex Elder 		return -ENOTSUPP;
2324d889140cSAlex Elder 
2325b1b5402aSAlex Elder 	*snap_features = le64_to_cpu(features_buf.features);
2326b1b5402aSAlex Elder 
2327b1b5402aSAlex Elder 	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
2328b1b5402aSAlex Elder 		(unsigned long long) snap_id,
2329b1b5402aSAlex Elder 		(unsigned long long) *snap_features,
2330b1b5402aSAlex Elder 		(unsigned long long) le64_to_cpu(features_buf.incompat));
2331b1b5402aSAlex Elder 
2332b1b5402aSAlex Elder 	return 0;
2333b1b5402aSAlex Elder }
2334b1b5402aSAlex Elder 
2335b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
2336b1b5402aSAlex Elder {
2337b1b5402aSAlex Elder 	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
2338b1b5402aSAlex Elder 						&rbd_dev->header.features);
2339b1b5402aSAlex Elder }
2340b1b5402aSAlex Elder 
23416e14b1a6SAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
234235d489f9SAlex Elder {
234335d489f9SAlex Elder 	size_t size;
234435d489f9SAlex Elder 	int ret;
234535d489f9SAlex Elder 	void *reply_buf;
234635d489f9SAlex Elder 	void *p;
234735d489f9SAlex Elder 	void *end;
234835d489f9SAlex Elder 	u64 seq;
234935d489f9SAlex Elder 	u32 snap_count;
235035d489f9SAlex Elder 	struct ceph_snap_context *snapc;
235135d489f9SAlex Elder 	u32 i;
235235d489f9SAlex Elder 
235335d489f9SAlex Elder 	/*
235435d489f9SAlex Elder 	 * We'll need room for the seq value (maximum snapshot id),
235535d489f9SAlex Elder 	 * snapshot count, and array of that many snapshot ids.
235635d489f9SAlex Elder 	 * For now we have a fixed upper limit on the number we're
235735d489f9SAlex Elder 	 * prepared to receive.
235835d489f9SAlex Elder 	 */
235935d489f9SAlex Elder 	size = sizeof (__le64) + sizeof (__le32) +
236035d489f9SAlex Elder 			RBD_MAX_SNAP_COUNT * sizeof (__le64);
236135d489f9SAlex Elder 	reply_buf = kzalloc(size, GFP_KERNEL);
236235d489f9SAlex Elder 	if (!reply_buf)
236335d489f9SAlex Elder 		return -ENOMEM;
236435d489f9SAlex Elder 
236535d489f9SAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
236635d489f9SAlex Elder 				"rbd", "get_snapcontext",
236735d489f9SAlex Elder 				NULL, 0,
236835d489f9SAlex Elder 				reply_buf, size,
23696e14b1a6SAlex Elder 				CEPH_OSD_FLAG_READ, ver);
237035d489f9SAlex Elder 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
237135d489f9SAlex Elder 	if (ret < 0)
237235d489f9SAlex Elder 		goto out;
237335d489f9SAlex Elder 
237435d489f9SAlex Elder 	ret = -ERANGE;
237535d489f9SAlex Elder 	p = reply_buf;
237635d489f9SAlex Elder 	end = (char *) reply_buf + size;
237735d489f9SAlex Elder 	ceph_decode_64_safe(&p, end, seq, out);
237835d489f9SAlex Elder 	ceph_decode_32_safe(&p, end, snap_count, out);
237935d489f9SAlex Elder 
238035d489f9SAlex Elder 	/*
238135d489f9SAlex Elder 	 * Make sure the reported number of snapshot ids wouldn't go
238235d489f9SAlex Elder 	 * beyond the end of our buffer.  But before checking that,
238335d489f9SAlex Elder 	 * make sure the computed size of the snapshot context we
238435d489f9SAlex Elder 	 * allocate is representable in a size_t.
238535d489f9SAlex Elder 	 */
238635d489f9SAlex Elder 	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
238735d489f9SAlex Elder 				 / sizeof (u64)) {
238835d489f9SAlex Elder 		ret = -EINVAL;
238935d489f9SAlex Elder 		goto out;
239035d489f9SAlex Elder 	}
239135d489f9SAlex Elder 	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
239235d489f9SAlex Elder 		goto out;
239335d489f9SAlex Elder 
239435d489f9SAlex Elder 	size = sizeof (struct ceph_snap_context) +
239535d489f9SAlex Elder 				snap_count * sizeof (snapc->snaps[0]);
239635d489f9SAlex Elder 	snapc = kmalloc(size, GFP_KERNEL);
239735d489f9SAlex Elder 	if (!snapc) {
239835d489f9SAlex Elder 		ret = -ENOMEM;
239935d489f9SAlex Elder 		goto out;
240035d489f9SAlex Elder 	}
240135d489f9SAlex Elder 
240235d489f9SAlex Elder 	atomic_set(&snapc->nref, 1);
240335d489f9SAlex Elder 	snapc->seq = seq;
240435d489f9SAlex Elder 	snapc->num_snaps = snap_count;
240535d489f9SAlex Elder 	for (i = 0; i < snap_count; i++)
240635d489f9SAlex Elder 		snapc->snaps[i] = ceph_decode_64(&p);
240735d489f9SAlex Elder 
240835d489f9SAlex Elder 	rbd_dev->header.snapc = snapc;
240935d489f9SAlex Elder 
241035d489f9SAlex Elder 	dout("  snap context seq = %llu, snap_count = %u\n",
241135d489f9SAlex Elder 		(unsigned long long) seq, (unsigned int) snap_count);
241235d489f9SAlex Elder 
241335d489f9SAlex Elder out:
241435d489f9SAlex Elder 	kfree(reply_buf);
241535d489f9SAlex Elder 
241635d489f9SAlex Elder 	return 0;
241735d489f9SAlex Elder }
241835d489f9SAlex Elder 
2419b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
2420b8b1e2dbSAlex Elder {
2421b8b1e2dbSAlex Elder 	size_t size;
2422b8b1e2dbSAlex Elder 	void *reply_buf;
2423b8b1e2dbSAlex Elder 	__le64 snap_id;
2424b8b1e2dbSAlex Elder 	int ret;
2425b8b1e2dbSAlex Elder 	void *p;
2426b8b1e2dbSAlex Elder 	void *end;
2427b8b1e2dbSAlex Elder 	size_t snap_name_len;
2428b8b1e2dbSAlex Elder 	char *snap_name;
2429b8b1e2dbSAlex Elder 
2430b8b1e2dbSAlex Elder 	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
2431b8b1e2dbSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
2432b8b1e2dbSAlex Elder 	if (!reply_buf)
2433b8b1e2dbSAlex Elder 		return ERR_PTR(-ENOMEM);
2434b8b1e2dbSAlex Elder 
2435b8b1e2dbSAlex Elder 	snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
2436b8b1e2dbSAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2437b8b1e2dbSAlex Elder 				"rbd", "get_snapshot_name",
2438b8b1e2dbSAlex Elder 				(char *) &snap_id, sizeof (snap_id),
2439b8b1e2dbSAlex Elder 				reply_buf, size,
2440b8b1e2dbSAlex Elder 				CEPH_OSD_FLAG_READ, NULL);
2441b8b1e2dbSAlex Elder 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2442b8b1e2dbSAlex Elder 	if (ret < 0)
2443b8b1e2dbSAlex Elder 		goto out;
2444b8b1e2dbSAlex Elder 
2445b8b1e2dbSAlex Elder 	p = reply_buf;
2446b8b1e2dbSAlex Elder 	end = (char *) reply_buf + size;
2447b8b1e2dbSAlex Elder 	snap_name_len = 0;
2448b8b1e2dbSAlex Elder 	snap_name = ceph_extract_encoded_string(&p, end, &snap_name_len,
2449b8b1e2dbSAlex Elder 				GFP_KERNEL);
2450b8b1e2dbSAlex Elder 	if (IS_ERR(snap_name)) {
2451b8b1e2dbSAlex Elder 		ret = PTR_ERR(snap_name);
2452b8b1e2dbSAlex Elder 		goto out;
2453b8b1e2dbSAlex Elder 	} else {
2454b8b1e2dbSAlex Elder 		dout("  snap_id 0x%016llx snap_name = %s\n",
2455b8b1e2dbSAlex Elder 			(unsigned long long) le64_to_cpu(snap_id), snap_name);
2456b8b1e2dbSAlex Elder 	}
2457b8b1e2dbSAlex Elder 	kfree(reply_buf);
2458b8b1e2dbSAlex Elder 
2459b8b1e2dbSAlex Elder 	return snap_name;
2460b8b1e2dbSAlex Elder out:
2461b8b1e2dbSAlex Elder 	kfree(reply_buf);
2462b8b1e2dbSAlex Elder 
2463b8b1e2dbSAlex Elder 	return ERR_PTR(ret);
2464b8b1e2dbSAlex Elder }
2465b8b1e2dbSAlex Elder 
2466b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
2467b8b1e2dbSAlex Elder 		u64 *snap_size, u64 *snap_features)
2468b8b1e2dbSAlex Elder {
2469b8b1e2dbSAlex Elder 	__le64 snap_id;
2470b8b1e2dbSAlex Elder 	u8 order;
2471b8b1e2dbSAlex Elder 	int ret;
2472b8b1e2dbSAlex Elder 
2473b8b1e2dbSAlex Elder 	snap_id = rbd_dev->header.snapc->snaps[which];
2474b8b1e2dbSAlex Elder 	ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
2475b8b1e2dbSAlex Elder 	if (ret)
2476b8b1e2dbSAlex Elder 		return ERR_PTR(ret);
2477b8b1e2dbSAlex Elder 	ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
2478b8b1e2dbSAlex Elder 	if (ret)
2479b8b1e2dbSAlex Elder 		return ERR_PTR(ret);
2480b8b1e2dbSAlex Elder 
2481b8b1e2dbSAlex Elder 	return rbd_dev_v2_snap_name(rbd_dev, which);
2482b8b1e2dbSAlex Elder }
2483b8b1e2dbSAlex Elder 
2484b8b1e2dbSAlex Elder static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
2485b8b1e2dbSAlex Elder 		u64 *snap_size, u64 *snap_features)
2486b8b1e2dbSAlex Elder {
2487b8b1e2dbSAlex Elder 	if (rbd_dev->image_format == 1)
2488b8b1e2dbSAlex Elder 		return rbd_dev_v1_snap_info(rbd_dev, which,
2489b8b1e2dbSAlex Elder 					snap_size, snap_features);
2490b8b1e2dbSAlex Elder 	if (rbd_dev->image_format == 2)
2491b8b1e2dbSAlex Elder 		return rbd_dev_v2_snap_info(rbd_dev, which,
2492b8b1e2dbSAlex Elder 					snap_size, snap_features);
2493b8b1e2dbSAlex Elder 	return ERR_PTR(-EINVAL);
2494b8b1e2dbSAlex Elder }
2495b8b1e2dbSAlex Elder 
2496117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
2497117973fbSAlex Elder {
2498117973fbSAlex Elder 	int ret;
2499117973fbSAlex Elder 	__u8 obj_order;
2500117973fbSAlex Elder 
2501117973fbSAlex Elder 	down_write(&rbd_dev->header_rwsem);
2502117973fbSAlex Elder 
2503117973fbSAlex Elder 	/* Grab old order first, to see if it changes */
2504117973fbSAlex Elder 
2505117973fbSAlex Elder 	obj_order = rbd_dev->header.obj_order,
2506117973fbSAlex Elder 	ret = rbd_dev_v2_image_size(rbd_dev);
2507117973fbSAlex Elder 	if (ret)
2508117973fbSAlex Elder 		goto out;
2509117973fbSAlex Elder 	if (rbd_dev->header.obj_order != obj_order) {
2510117973fbSAlex Elder 		ret = -EIO;
2511117973fbSAlex Elder 		goto out;
2512117973fbSAlex Elder 	}
2513117973fbSAlex Elder 	rbd_update_mapping_size(rbd_dev);
2514117973fbSAlex Elder 
2515117973fbSAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev, hver);
2516117973fbSAlex Elder 	dout("rbd_dev_v2_snap_context returned %d\n", ret);
2517117973fbSAlex Elder 	if (ret)
2518117973fbSAlex Elder 		goto out;
2519117973fbSAlex Elder 	ret = rbd_dev_snaps_update(rbd_dev);
2520117973fbSAlex Elder 	dout("rbd_dev_snaps_update returned %d\n", ret);
2521117973fbSAlex Elder 	if (ret)
2522117973fbSAlex Elder 		goto out;
2523117973fbSAlex Elder 	ret = rbd_dev_snaps_register(rbd_dev);
2524117973fbSAlex Elder 	dout("rbd_dev_snaps_register returned %d\n", ret);
2525117973fbSAlex Elder out:
2526117973fbSAlex Elder 	up_write(&rbd_dev->header_rwsem);
2527117973fbSAlex Elder 
2528117973fbSAlex Elder 	return ret;
2529117973fbSAlex Elder }
2530117973fbSAlex Elder 
25319d475de5SAlex Elder /*
253235938150SAlex Elder  * Scan the rbd device's current snapshot list and compare it to the
253335938150SAlex Elder  * newly-received snapshot context.  Remove any existing snapshots
253435938150SAlex Elder  * not present in the new snapshot context.  Add a new snapshot for
253535938150SAlex Elder  * any snaphots in the snapshot context not in the current list.
253635938150SAlex Elder  * And verify there are no changes to snapshots we already know
253735938150SAlex Elder  * about.
253835938150SAlex Elder  *
253935938150SAlex Elder  * Assumes the snapshots in the snapshot context are sorted by
254035938150SAlex Elder  * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
254135938150SAlex Elder  * are also maintained in that order.)
2542dfc5606dSYehuda Sadeh  */
2543304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
2544dfc5606dSYehuda Sadeh {
254535938150SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
254635938150SAlex Elder 	const u32 snap_count = snapc->num_snaps;
254735938150SAlex Elder 	struct list_head *head = &rbd_dev->snaps;
254835938150SAlex Elder 	struct list_head *links = head->next;
254935938150SAlex Elder 	u32 index = 0;
2550dfc5606dSYehuda Sadeh 
25519fcbb800SAlex Elder 	dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
255235938150SAlex Elder 	while (index < snap_count || links != head) {
255335938150SAlex Elder 		u64 snap_id;
255435938150SAlex Elder 		struct rbd_snap *snap;
2555cd892126SAlex Elder 		char *snap_name;
2556cd892126SAlex Elder 		u64 snap_size = 0;
2557cd892126SAlex Elder 		u64 snap_features = 0;
2558dfc5606dSYehuda Sadeh 
255935938150SAlex Elder 		snap_id = index < snap_count ? snapc->snaps[index]
256035938150SAlex Elder 					     : CEPH_NOSNAP;
256135938150SAlex Elder 		snap = links != head ? list_entry(links, struct rbd_snap, node)
256235938150SAlex Elder 				     : NULL;
2563aafb230eSAlex Elder 		rbd_assert(!snap || snap->id != CEPH_NOSNAP);
2564dfc5606dSYehuda Sadeh 
256535938150SAlex Elder 		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
256635938150SAlex Elder 			struct list_head *next = links->next;
2567dfc5606dSYehuda Sadeh 
256835938150SAlex Elder 			/* Existing snapshot not in the new snap context */
2569dfc5606dSYehuda Sadeh 
2570f84344f3SAlex Elder 			if (rbd_dev->mapping.snap_id == snap->id)
2571f84344f3SAlex Elder 				rbd_dev->mapping.snap_exists = false;
257235938150SAlex Elder 			__rbd_remove_snap_dev(snap);
25739fcbb800SAlex Elder 			dout("%ssnap id %llu has been removed\n",
2574f84344f3SAlex Elder 				rbd_dev->mapping.snap_id == snap->id ?
2575f84344f3SAlex Elder 								"mapped " : "",
25769fcbb800SAlex Elder 				(unsigned long long) snap->id);
2577dfc5606dSYehuda Sadeh 
257835938150SAlex Elder 			/* Done with this list entry; advance */
257935938150SAlex Elder 
258035938150SAlex Elder 			links = next;
258135938150SAlex Elder 			continue;
2582dfc5606dSYehuda Sadeh 		}
258335938150SAlex Elder 
2584b8b1e2dbSAlex Elder 		snap_name = rbd_dev_snap_info(rbd_dev, index,
2585cd892126SAlex Elder 					&snap_size, &snap_features);
2586cd892126SAlex Elder 		if (IS_ERR(snap_name))
2587cd892126SAlex Elder 			return PTR_ERR(snap_name);
2588cd892126SAlex Elder 
25899fcbb800SAlex Elder 		dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
25909fcbb800SAlex Elder 			(unsigned long long) snap_id);
259135938150SAlex Elder 		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
259235938150SAlex Elder 			struct rbd_snap *new_snap;
259335938150SAlex Elder 
259435938150SAlex Elder 			/* We haven't seen this snapshot before */
259535938150SAlex Elder 
2596c8d18425SAlex Elder 			new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
2597cd892126SAlex Elder 					snap_id, snap_size, snap_features);
25989fcbb800SAlex Elder 			if (IS_ERR(new_snap)) {
25999fcbb800SAlex Elder 				int err = PTR_ERR(new_snap);
26009fcbb800SAlex Elder 
26019fcbb800SAlex Elder 				dout("  failed to add dev, error %d\n", err);
26029fcbb800SAlex Elder 
26039fcbb800SAlex Elder 				return err;
26049fcbb800SAlex Elder 			}
260535938150SAlex Elder 
260635938150SAlex Elder 			/* New goes before existing, or at end of list */
260735938150SAlex Elder 
26089fcbb800SAlex Elder 			dout("  added dev%s\n", snap ? "" : " at end\n");
260935938150SAlex Elder 			if (snap)
261035938150SAlex Elder 				list_add_tail(&new_snap->node, &snap->node);
261135938150SAlex Elder 			else
2612523f3258SAlex Elder 				list_add_tail(&new_snap->node, head);
261335938150SAlex Elder 		} else {
261435938150SAlex Elder 			/* Already have this one */
261535938150SAlex Elder 
26169fcbb800SAlex Elder 			dout("  already present\n");
26179fcbb800SAlex Elder 
2618cd892126SAlex Elder 			rbd_assert(snap->size == snap_size);
2619aafb230eSAlex Elder 			rbd_assert(!strcmp(snap->name, snap_name));
2620cd892126SAlex Elder 			rbd_assert(snap->features == snap_features);
262135938150SAlex Elder 
262235938150SAlex Elder 			/* Done with this list entry; advance */
262335938150SAlex Elder 
262435938150SAlex Elder 			links = links->next;
2625dfc5606dSYehuda Sadeh 		}
262635938150SAlex Elder 
262735938150SAlex Elder 		/* Advance to the next entry in the snapshot context */
262835938150SAlex Elder 
262935938150SAlex Elder 		index++;
2630dfc5606dSYehuda Sadeh 	}
26319fcbb800SAlex Elder 	dout("%s: done\n", __func__);
2632dfc5606dSYehuda Sadeh 
2633dfc5606dSYehuda Sadeh 	return 0;
2634dfc5606dSYehuda Sadeh }
2635dfc5606dSYehuda Sadeh 
2636304f6808SAlex Elder /*
2637304f6808SAlex Elder  * Scan the list of snapshots and register the devices for any that
2638304f6808SAlex Elder  * have not already been registered.
2639304f6808SAlex Elder  */
2640304f6808SAlex Elder static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
2641304f6808SAlex Elder {
2642304f6808SAlex Elder 	struct rbd_snap *snap;
2643304f6808SAlex Elder 	int ret = 0;
2644304f6808SAlex Elder 
2645304f6808SAlex Elder 	dout("%s called\n", __func__);
264686ff77bbSAlex Elder 	if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
264786ff77bbSAlex Elder 		return -EIO;
2648304f6808SAlex Elder 
2649304f6808SAlex Elder 	list_for_each_entry(snap, &rbd_dev->snaps, node) {
2650304f6808SAlex Elder 		if (!rbd_snap_registered(snap)) {
2651304f6808SAlex Elder 			ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2652304f6808SAlex Elder 			if (ret < 0)
2653304f6808SAlex Elder 				break;
2654304f6808SAlex Elder 		}
2655304f6808SAlex Elder 	}
2656304f6808SAlex Elder 	dout("%s: returning %d\n", __func__, ret);
2657304f6808SAlex Elder 
2658304f6808SAlex Elder 	return ret;
2659304f6808SAlex Elder }
2660304f6808SAlex Elder 
2661dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2662dfc5606dSYehuda Sadeh {
2663dfc5606dSYehuda Sadeh 	struct device *dev;
2664cd789ab9SAlex Elder 	int ret;
2665dfc5606dSYehuda Sadeh 
2666dfc5606dSYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2667dfc5606dSYehuda Sadeh 
2668cd789ab9SAlex Elder 	dev = &rbd_dev->dev;
2669dfc5606dSYehuda Sadeh 	dev->bus = &rbd_bus_type;
2670dfc5606dSYehuda Sadeh 	dev->type = &rbd_device_type;
2671dfc5606dSYehuda Sadeh 	dev->parent = &rbd_root_dev;
2672dfc5606dSYehuda Sadeh 	dev->release = rbd_dev_release;
2673de71a297SAlex Elder 	dev_set_name(dev, "%d", rbd_dev->dev_id);
2674dfc5606dSYehuda Sadeh 	ret = device_register(dev);
2675dfc5606dSYehuda Sadeh 
2676dfc5606dSYehuda Sadeh 	mutex_unlock(&ctl_mutex);
2677cd789ab9SAlex Elder 
2678dfc5606dSYehuda Sadeh 	return ret;
2679602adf40SYehuda Sadeh }
2680602adf40SYehuda Sadeh 
2681dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
2682dfc5606dSYehuda Sadeh {
2683dfc5606dSYehuda Sadeh 	device_unregister(&rbd_dev->dev);
2684dfc5606dSYehuda Sadeh }
2685dfc5606dSYehuda Sadeh 
268659c2be1eSYehuda Sadeh static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
268759c2be1eSYehuda Sadeh {
268859c2be1eSYehuda Sadeh 	int ret, rc;
268959c2be1eSYehuda Sadeh 
269059c2be1eSYehuda Sadeh 	do {
26910e6f322dSAlex Elder 		ret = rbd_req_sync_watch(rbd_dev);
269259c2be1eSYehuda Sadeh 		if (ret == -ERANGE) {
2693117973fbSAlex Elder 			rc = rbd_dev_refresh(rbd_dev, NULL);
269459c2be1eSYehuda Sadeh 			if (rc < 0)
269559c2be1eSYehuda Sadeh 				return rc;
269659c2be1eSYehuda Sadeh 		}
269759c2be1eSYehuda Sadeh 	} while (ret == -ERANGE);
269859c2be1eSYehuda Sadeh 
269959c2be1eSYehuda Sadeh 	return ret;
270059c2be1eSYehuda Sadeh }
270159c2be1eSYehuda Sadeh 
2702e2839308SAlex Elder static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
27031ddbe94eSAlex Elder 
27041ddbe94eSAlex Elder /*
2705499afd5bSAlex Elder  * Get a unique rbd identifier for the given new rbd_dev, and add
2706499afd5bSAlex Elder  * the rbd_dev to the global list.  The minimum rbd id is 1.
27071ddbe94eSAlex Elder  */
2708e2839308SAlex Elder static void rbd_dev_id_get(struct rbd_device *rbd_dev)
2709b7f23c36SAlex Elder {
2710e2839308SAlex Elder 	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
2711499afd5bSAlex Elder 
2712499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
2713499afd5bSAlex Elder 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
2714499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
2715e2839308SAlex Elder 	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
2716e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
2717b7f23c36SAlex Elder }
2718b7f23c36SAlex Elder 
27191ddbe94eSAlex Elder /*
2720499afd5bSAlex Elder  * Remove an rbd_dev from the global list, and record that its
2721499afd5bSAlex Elder  * identifier is no longer in use.
27221ddbe94eSAlex Elder  */
2723e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev)
27241ddbe94eSAlex Elder {
2725d184f6bfSAlex Elder 	struct list_head *tmp;
2726de71a297SAlex Elder 	int rbd_id = rbd_dev->dev_id;
2727d184f6bfSAlex Elder 	int max_id;
2728d184f6bfSAlex Elder 
2729aafb230eSAlex Elder 	rbd_assert(rbd_id > 0);
2730499afd5bSAlex Elder 
2731e2839308SAlex Elder 	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
2732e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
2733499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
2734499afd5bSAlex Elder 	list_del_init(&rbd_dev->node);
2735d184f6bfSAlex Elder 
2736d184f6bfSAlex Elder 	/*
2737d184f6bfSAlex Elder 	 * If the id being "put" is not the current maximum, there
2738d184f6bfSAlex Elder 	 * is nothing special we need to do.
2739d184f6bfSAlex Elder 	 */
2740e2839308SAlex Elder 	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
2741d184f6bfSAlex Elder 		spin_unlock(&rbd_dev_list_lock);
2742d184f6bfSAlex Elder 		return;
2743d184f6bfSAlex Elder 	}
2744d184f6bfSAlex Elder 
2745d184f6bfSAlex Elder 	/*
2746d184f6bfSAlex Elder 	 * We need to update the current maximum id.  Search the
2747d184f6bfSAlex Elder 	 * list to find out what it is.  We're more likely to find
2748d184f6bfSAlex Elder 	 * the maximum at the end, so search the list backward.
2749d184f6bfSAlex Elder 	 */
2750d184f6bfSAlex Elder 	max_id = 0;
2751d184f6bfSAlex Elder 	list_for_each_prev(tmp, &rbd_dev_list) {
2752d184f6bfSAlex Elder 		struct rbd_device *rbd_dev;
2753d184f6bfSAlex Elder 
2754d184f6bfSAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
2755b213e0b1SAlex Elder 		if (rbd_dev->dev_id > max_id)
2756b213e0b1SAlex Elder 			max_id = rbd_dev->dev_id;
2757d184f6bfSAlex Elder 	}
2758499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
27591ddbe94eSAlex Elder 
27601ddbe94eSAlex Elder 	/*
2761e2839308SAlex Elder 	 * The max id could have been updated by rbd_dev_id_get(), in
2762d184f6bfSAlex Elder 	 * which case it now accurately reflects the new maximum.
2763d184f6bfSAlex Elder 	 * Be careful not to overwrite the maximum value in that
2764d184f6bfSAlex Elder 	 * case.
27651ddbe94eSAlex Elder 	 */
2766e2839308SAlex Elder 	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
2767e2839308SAlex Elder 	dout("  max dev id has been reset\n");
2768b7f23c36SAlex Elder }
2769b7f23c36SAlex Elder 
/*
 * Advance *buf past any leading white space so that it points at
 * the first non-space character (or at the terminating '\0' if
 * there is none).  Returns the number of consecutive non-white
 * space characters found there, i.e. the length of the token.
 * The string at *buf must be '\0'-terminated.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * Exactly the characters for which isspace() is nonzero in
	 * the "C" and "POSIX" locales.
	 */
	static const char whitespace[] = " \f\n\r\t\v";
	const char *start = *buf;

	start += strspn(start, whitespace);	/* Skip to the token */
	*buf = start;

	return strcspn(start, whitespace);	/* Measure the token */
}
2788e28fff26SAlex Elder 
/*
 * Locate the next token in *buf and, provided it fits (together
 * with a terminating '\0') in the token buffer of token_size bytes,
 * copy it there.  A copied token is always '\0'-terminated.  The
 * string at *buf must be '\0'-terminated on entry.
 *
 * Returns the length of the token (excluding the '\0').  A return
 * value of 0 means no token was found; a value >= token_size means
 * the token did not fit and nothing was copied.
 *
 * In every case, including when the token does not fit, *buf is
 * left pointing just past the end of the token.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t token_len = next_token(buf);
	const char *start = *buf;

	*buf = start + token_len;
	if (token_len >= token_size)
		return token_len;	/* No room; nothing copied */

	memcpy(token, start, token_len);
	token[token_len] = '\0';

	return token_len;
}
2818e28fff26SAlex Elder 
2819e28fff26SAlex Elder /*
2820ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
2821ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
2822ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
2823ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
2824ea3352f4SAlex Elder  *
2825ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
2826ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
2827ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
2828ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
2829ea3352f4SAlex Elder  *
2830ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
2831ea3352f4SAlex Elder  * the end of the found token.
2832ea3352f4SAlex Elder  *
2833ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
2834ea3352f4SAlex Elder  */
2835ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
2836ea3352f4SAlex Elder {
2837ea3352f4SAlex Elder 	char *dup;
2838ea3352f4SAlex Elder 	size_t len;
2839ea3352f4SAlex Elder 
2840ea3352f4SAlex Elder 	len = next_token(buf);
2841ea3352f4SAlex Elder 	dup = kmalloc(len + 1, GFP_KERNEL);
2842ea3352f4SAlex Elder 	if (!dup)
2843ea3352f4SAlex Elder 		return NULL;
2844ea3352f4SAlex Elder 
2845ea3352f4SAlex Elder 	memcpy(dup, *buf, len);
2846ea3352f4SAlex Elder 	*(dup + len) = '\0';
2847ea3352f4SAlex Elder 	*buf += len;
2848ea3352f4SAlex Elder 
2849ea3352f4SAlex Elder 	if (lenp)
2850ea3352f4SAlex Elder 		*lenp = len;
2851ea3352f4SAlex Elder 
2852ea3352f4SAlex Elder 	return dup;
2853ea3352f4SAlex Elder }
2854ea3352f4SAlex Elder 
2855ea3352f4SAlex Elder /*
28563feeb894SAlex Elder  * This fills in the pool_name, image_name, image_name_len, rbd_dev,
28573feeb894SAlex Elder  * rbd_md_name, and name fields of the given rbd_dev, based on the
28583feeb894SAlex Elder  * list of monitor addresses and other options provided via
28593feeb894SAlex Elder  * /sys/bus/rbd/add.  Returns a pointer to a dynamically-allocated
28603feeb894SAlex Elder  * copy of the snapshot name to map if successful, or a
28613feeb894SAlex Elder  * pointer-coded error otherwise.
2862d22f76e7SAlex Elder  *
2863d22f76e7SAlex Elder  * Note: rbd_dev is assumed to have been initially zero-filled.
2864a725f65eSAlex Elder  */
28653feeb894SAlex Elder static char *rbd_add_parse_args(struct rbd_device *rbd_dev,
2866a725f65eSAlex Elder 				const char *buf,
28677ef3214aSAlex Elder 				const char **mon_addrs,
28685214ecc4SAlex Elder 				size_t *mon_addrs_size,
2869e28fff26SAlex Elder 				char *options,
2870e28fff26SAlex Elder 				size_t options_size)
2871a725f65eSAlex Elder {
2872e28fff26SAlex Elder 	size_t len;
28733feeb894SAlex Elder 	char *err_ptr = ERR_PTR(-EINVAL);
28743feeb894SAlex Elder 	char *snap_name;
2875e28fff26SAlex Elder 
2876e28fff26SAlex Elder 	/* The first four tokens are required */
2877e28fff26SAlex Elder 
28787ef3214aSAlex Elder 	len = next_token(&buf);
28797ef3214aSAlex Elder 	if (!len)
28803feeb894SAlex Elder 		return err_ptr;
28815214ecc4SAlex Elder 	*mon_addrs_size = len + 1;
28827ef3214aSAlex Elder 	*mon_addrs = buf;
28837ef3214aSAlex Elder 
28847ef3214aSAlex Elder 	buf += len;
2885a725f65eSAlex Elder 
2886e28fff26SAlex Elder 	len = copy_token(&buf, options, options_size);
2887e28fff26SAlex Elder 	if (!len || len >= options_size)
28883feeb894SAlex Elder 		return err_ptr;
2889a725f65eSAlex Elder 
28903feeb894SAlex Elder 	err_ptr = ERR_PTR(-ENOMEM);
2891d22f76e7SAlex Elder 	rbd_dev->pool_name = dup_token(&buf, NULL);
2892d22f76e7SAlex Elder 	if (!rbd_dev->pool_name)
2893d22f76e7SAlex Elder 		goto out_err;
2894e28fff26SAlex Elder 
28950bed54dcSAlex Elder 	rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
28960bed54dcSAlex Elder 	if (!rbd_dev->image_name)
2897bf3e5ae1SAlex Elder 		goto out_err;
2898e28fff26SAlex Elder 
2899d4b125e9SAlex Elder 	/* Snapshot name is optional; default is to use "head" */
2900d4b125e9SAlex Elder 
29013feeb894SAlex Elder 	len = next_token(&buf);
2902d4b125e9SAlex Elder 	if (len > RBD_MAX_SNAP_NAME_LEN) {
2903d4b125e9SAlex Elder 		err_ptr = ERR_PTR(-ENAMETOOLONG);
2904d4b125e9SAlex Elder 		goto out_err;
2905d4b125e9SAlex Elder 	}
2906820a5f3eSAlex Elder 	if (!len) {
29073feeb894SAlex Elder 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
29083feeb894SAlex Elder 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
2909849b4260SAlex Elder 	}
29103feeb894SAlex Elder 	snap_name = kmalloc(len + 1, GFP_KERNEL);
29113feeb894SAlex Elder 	if (!snap_name)
29123feeb894SAlex Elder 		goto out_err;
29133feeb894SAlex Elder 	memcpy(snap_name, buf, len);
29143feeb894SAlex Elder 	*(snap_name + len) = '\0';
2915e28fff26SAlex Elder 
29163feeb894SAlex Elder 	return snap_name;
2917d22f76e7SAlex Elder 
2918d22f76e7SAlex Elder out_err:
29190bed54dcSAlex Elder 	kfree(rbd_dev->image_name);
2920d78fd7aeSAlex Elder 	rbd_dev->image_name = NULL;
2921d78fd7aeSAlex Elder 	rbd_dev->image_name_len = 0;
2922d22f76e7SAlex Elder 	kfree(rbd_dev->pool_name);
2923d22f76e7SAlex Elder 	rbd_dev->pool_name = NULL;
2924d22f76e7SAlex Elder 
29253feeb894SAlex Elder 	return err_ptr;
2926a725f65eSAlex Elder }
2927a725f65eSAlex Elder 
2928589d30e0SAlex Elder /*
2929589d30e0SAlex Elder  * An rbd format 2 image has a unique identifier, distinct from the
2930589d30e0SAlex Elder  * name given to it by the user.  Internally, that identifier is
2931589d30e0SAlex Elder  * what's used to specify the names of objects related to the image.
2932589d30e0SAlex Elder  *
2933589d30e0SAlex Elder  * A special "rbd id" object is used to map an rbd image name to its
2934589d30e0SAlex Elder  * id.  If that object doesn't exist, then there is no v2 rbd image
2935589d30e0SAlex Elder  * with the supplied name.
2936589d30e0SAlex Elder  *
2937589d30e0SAlex Elder  * This function will record the given rbd_dev's image_id field if
2938589d30e0SAlex Elder  * it can be determined, and in that case will return 0.  If any
2939589d30e0SAlex Elder  * errors occur a negative errno will be returned and the rbd_dev's
2940589d30e0SAlex Elder  * image_id field will be unchanged (and should be NULL).
2941589d30e0SAlex Elder  */
2942589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev)
2943589d30e0SAlex Elder {
2944589d30e0SAlex Elder 	int ret;
2945589d30e0SAlex Elder 	size_t size;
2946589d30e0SAlex Elder 	char *object_name;
2947589d30e0SAlex Elder 	void *response;
2948589d30e0SAlex Elder 	void *p;
2949589d30e0SAlex Elder 
2950589d30e0SAlex Elder 	/*
2951589d30e0SAlex Elder 	 * First, see if the format 2 image id file exists, and if
2952589d30e0SAlex Elder 	 * so, get the image's persistent id from it.
2953589d30e0SAlex Elder 	 */
2954589d30e0SAlex Elder 	size = sizeof (RBD_ID_PREFIX) + rbd_dev->image_name_len;
2955589d30e0SAlex Elder 	object_name = kmalloc(size, GFP_NOIO);
2956589d30e0SAlex Elder 	if (!object_name)
2957589d30e0SAlex Elder 		return -ENOMEM;
2958589d30e0SAlex Elder 	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->image_name);
2959589d30e0SAlex Elder 	dout("rbd id object name is %s\n", object_name);
2960589d30e0SAlex Elder 
2961589d30e0SAlex Elder 	/* Response will be an encoded string, which includes a length */
2962589d30e0SAlex Elder 
2963589d30e0SAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
2964589d30e0SAlex Elder 	response = kzalloc(size, GFP_NOIO);
2965589d30e0SAlex Elder 	if (!response) {
2966589d30e0SAlex Elder 		ret = -ENOMEM;
2967589d30e0SAlex Elder 		goto out;
2968589d30e0SAlex Elder 	}
2969589d30e0SAlex Elder 
2970589d30e0SAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, object_name,
2971589d30e0SAlex Elder 				"rbd", "get_id",
2972589d30e0SAlex Elder 				NULL, 0,
2973589d30e0SAlex Elder 				response, RBD_IMAGE_ID_LEN_MAX,
2974589d30e0SAlex Elder 				CEPH_OSD_FLAG_READ, NULL);
2975589d30e0SAlex Elder 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2976589d30e0SAlex Elder 	if (ret < 0)
2977589d30e0SAlex Elder 		goto out;
2978a0ea3a40SAlex Elder 	ret = 0;    /* rbd_req_sync_exec() can return positive */
2979589d30e0SAlex Elder 
2980589d30e0SAlex Elder 	p = response;
2981589d30e0SAlex Elder 	rbd_dev->image_id = ceph_extract_encoded_string(&p,
2982589d30e0SAlex Elder 						p + RBD_IMAGE_ID_LEN_MAX,
2983589d30e0SAlex Elder 						&rbd_dev->image_id_len,
2984589d30e0SAlex Elder 						GFP_NOIO);
2985589d30e0SAlex Elder 	if (IS_ERR(rbd_dev->image_id)) {
2986589d30e0SAlex Elder 		ret = PTR_ERR(rbd_dev->image_id);
2987589d30e0SAlex Elder 		rbd_dev->image_id = NULL;
2988589d30e0SAlex Elder 	} else {
2989589d30e0SAlex Elder 		dout("image_id is %s\n", rbd_dev->image_id);
2990589d30e0SAlex Elder 	}
2991589d30e0SAlex Elder out:
2992589d30e0SAlex Elder 	kfree(response);
2993589d30e0SAlex Elder 	kfree(object_name);
2994589d30e0SAlex Elder 
2995589d30e0SAlex Elder 	return ret;
2996589d30e0SAlex Elder }
2997589d30e0SAlex Elder 
2998a30b71b9SAlex Elder static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
2999a30b71b9SAlex Elder {
3000a30b71b9SAlex Elder 	int ret;
3001a30b71b9SAlex Elder 	size_t size;
3002a30b71b9SAlex Elder 
3003a30b71b9SAlex Elder 	/* Version 1 images have no id; empty string is used */
3004a30b71b9SAlex Elder 
3005a30b71b9SAlex Elder 	rbd_dev->image_id = kstrdup("", GFP_KERNEL);
3006a30b71b9SAlex Elder 	if (!rbd_dev->image_id)
3007a30b71b9SAlex Elder 		return -ENOMEM;
3008a30b71b9SAlex Elder 	rbd_dev->image_id_len = 0;
3009a30b71b9SAlex Elder 
3010a30b71b9SAlex Elder 	/* Record the header object name for this rbd image. */
3011a30b71b9SAlex Elder 
3012a30b71b9SAlex Elder 	size = rbd_dev->image_name_len + sizeof (RBD_SUFFIX);
3013a30b71b9SAlex Elder 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3014a30b71b9SAlex Elder 	if (!rbd_dev->header_name) {
3015a30b71b9SAlex Elder 		ret = -ENOMEM;
3016a30b71b9SAlex Elder 		goto out_err;
3017a30b71b9SAlex Elder 	}
3018a30b71b9SAlex Elder 	sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);
3019a30b71b9SAlex Elder 
3020a30b71b9SAlex Elder 	/* Populate rbd image metadata */
3021a30b71b9SAlex Elder 
3022a30b71b9SAlex Elder 	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
3023a30b71b9SAlex Elder 	if (ret < 0)
3024a30b71b9SAlex Elder 		goto out_err;
3025a30b71b9SAlex Elder 	rbd_dev->image_format = 1;
3026a30b71b9SAlex Elder 
3027a30b71b9SAlex Elder 	dout("discovered version 1 image, header name is %s\n",
3028a30b71b9SAlex Elder 		rbd_dev->header_name);
3029a30b71b9SAlex Elder 
3030a30b71b9SAlex Elder 	return 0;
3031a30b71b9SAlex Elder 
3032a30b71b9SAlex Elder out_err:
3033a30b71b9SAlex Elder 	kfree(rbd_dev->header_name);
3034a30b71b9SAlex Elder 	rbd_dev->header_name = NULL;
3035a30b71b9SAlex Elder 	kfree(rbd_dev->image_id);
3036a30b71b9SAlex Elder 	rbd_dev->image_id = NULL;
3037a30b71b9SAlex Elder 
3038a30b71b9SAlex Elder 	return ret;
3039a30b71b9SAlex Elder }
3040a30b71b9SAlex Elder 
3041a30b71b9SAlex Elder static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
3042a30b71b9SAlex Elder {
3043a30b71b9SAlex Elder 	size_t size;
30449d475de5SAlex Elder 	int ret;
30456e14b1a6SAlex Elder 	u64 ver = 0;
3046a30b71b9SAlex Elder 
3047a30b71b9SAlex Elder 	/*
3048a30b71b9SAlex Elder 	 * Image id was filled in by the caller.  Record the header
3049a30b71b9SAlex Elder 	 * object name for this rbd image.
3050a30b71b9SAlex Elder 	 */
3051a30b71b9SAlex Elder 	size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->image_id_len;
3052a30b71b9SAlex Elder 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3053a30b71b9SAlex Elder 	if (!rbd_dev->header_name)
3054a30b71b9SAlex Elder 		return -ENOMEM;
3055a30b71b9SAlex Elder 	sprintf(rbd_dev->header_name, "%s%s",
3056a30b71b9SAlex Elder 			RBD_HEADER_PREFIX, rbd_dev->image_id);
30579d475de5SAlex Elder 
30589d475de5SAlex Elder 	/* Get the size and object order for the image */
30599d475de5SAlex Elder 
30609d475de5SAlex Elder 	ret = rbd_dev_v2_image_size(rbd_dev);
30619d475de5SAlex Elder 	if (ret < 0)
30629d475de5SAlex Elder 		goto out_err;
30631e130199SAlex Elder 
30641e130199SAlex Elder 	/* Get the object prefix (a.k.a. block_name) for the image */
30651e130199SAlex Elder 
30661e130199SAlex Elder 	ret = rbd_dev_v2_object_prefix(rbd_dev);
30671e130199SAlex Elder 	if (ret < 0)
30681e130199SAlex Elder 		goto out_err;
3069b1b5402aSAlex Elder 
3070d889140cSAlex Elder 	/* Get the and check features for the image */
3071b1b5402aSAlex Elder 
3072b1b5402aSAlex Elder 	ret = rbd_dev_v2_features(rbd_dev);
3073b1b5402aSAlex Elder 	if (ret < 0)
3074b1b5402aSAlex Elder 		goto out_err;
307535d489f9SAlex Elder 
30766e14b1a6SAlex Elder 	/* crypto and compression type aren't (yet) supported for v2 images */
307735d489f9SAlex Elder 
30786e14b1a6SAlex Elder 	rbd_dev->header.crypt_type = 0;
30796e14b1a6SAlex Elder 	rbd_dev->header.comp_type = 0;
30806e14b1a6SAlex Elder 
30816e14b1a6SAlex Elder 	/* Get the snapshot context, plus the header version */
30826e14b1a6SAlex Elder 
30836e14b1a6SAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
308435d489f9SAlex Elder 	if (ret)
308535d489f9SAlex Elder 		goto out_err;
30866e14b1a6SAlex Elder 	rbd_dev->header.obj_version = ver;
30876e14b1a6SAlex Elder 
3088a30b71b9SAlex Elder 	rbd_dev->image_format = 2;
3089a30b71b9SAlex Elder 
3090a30b71b9SAlex Elder 	dout("discovered version 2 image, header name is %s\n",
3091a30b71b9SAlex Elder 		rbd_dev->header_name);
3092a30b71b9SAlex Elder 
309335152979SAlex Elder 	return 0;
30949d475de5SAlex Elder out_err:
30959d475de5SAlex Elder 	kfree(rbd_dev->header_name);
30969d475de5SAlex Elder 	rbd_dev->header_name = NULL;
30971e130199SAlex Elder 	kfree(rbd_dev->header.object_prefix);
30981e130199SAlex Elder 	rbd_dev->header.object_prefix = NULL;
30999d475de5SAlex Elder 
31009d475de5SAlex Elder 	return ret;
3101a30b71b9SAlex Elder }
3102a30b71b9SAlex Elder 
/*
 * Probe for the header object of the given rbd device, working out
 * the image format along the way.
 *
 * The image id is looked up first.  If that succeeds the image is
 * probed as format 2.  If it fails for any reason (an image that is
 * not format 2 is expected to yield ENOENT) the image is assumed to
 * be format 1 and probed as such.
 *
 * Returns the result of whichever format-specific probe was run.
 */
static int rbd_dev_probe(struct rbd_device *rbd_dev)
{
	int ret;

	if (rbd_dev_image_id(rbd_dev))
		ret = rbd_dev_v1_probe(rbd_dev);	/* No id: format 1 */
	else
		ret = rbd_dev_v2_probe(rbd_dev);	/* Have id: format 2 */

	if (ret)
		dout("probe failed, returning %d\n", ret);

	return ret;
}
3127a30b71b9SAlex Elder 
312859c2be1eSYehuda Sadeh static ssize_t rbd_add(struct bus_type *bus,
312959c2be1eSYehuda Sadeh 		       const char *buf,
313059c2be1eSYehuda Sadeh 		       size_t count)
3131602adf40SYehuda Sadeh {
3132cb8627c7SAlex Elder 	char *options;
3133cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
31347ef3214aSAlex Elder 	const char *mon_addrs = NULL;
31357ef3214aSAlex Elder 	size_t mon_addrs_size = 0;
313627cc2594SAlex Elder 	struct ceph_osd_client *osdc;
313727cc2594SAlex Elder 	int rc = -ENOMEM;
31383feeb894SAlex Elder 	char *snap_name;
3139602adf40SYehuda Sadeh 
3140602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
3141602adf40SYehuda Sadeh 		return -ENODEV;
3142602adf40SYehuda Sadeh 
314327cc2594SAlex Elder 	options = kmalloc(count, GFP_KERNEL);
314427cc2594SAlex Elder 	if (!options)
314585ae8926SAlex Elder 		goto err_out_mem;
3146cb8627c7SAlex Elder 	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
3147cb8627c7SAlex Elder 	if (!rbd_dev)
314885ae8926SAlex Elder 		goto err_out_mem;
3149602adf40SYehuda Sadeh 
3150602adf40SYehuda Sadeh 	/* static rbd_device initialization */
3151602adf40SYehuda Sadeh 	spin_lock_init(&rbd_dev->lock);
3152602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbd_dev->node);
3153dfc5606dSYehuda Sadeh 	INIT_LIST_HEAD(&rbd_dev->snaps);
3154c666601aSJosh Durgin 	init_rwsem(&rbd_dev->header_rwsem);
3155602adf40SYehuda Sadeh 
3156a725f65eSAlex Elder 	/* parse add command */
31573feeb894SAlex Elder 	snap_name = rbd_add_parse_args(rbd_dev, buf,
31583feeb894SAlex Elder 				&mon_addrs, &mon_addrs_size, options, count);
31593feeb894SAlex Elder 	if (IS_ERR(snap_name)) {
31603feeb894SAlex Elder 		rc = PTR_ERR(snap_name);
316185ae8926SAlex Elder 		goto err_out_mem;
31623feeb894SAlex Elder 	}
3163a725f65eSAlex Elder 
3164f8c38929SAlex Elder 	rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options);
3165f8c38929SAlex Elder 	if (rc < 0)
316685ae8926SAlex Elder 		goto err_out_args;
3167602adf40SYehuda Sadeh 
3168602adf40SYehuda Sadeh 	/* pick the pool */
31691dbb4399SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
3170602adf40SYehuda Sadeh 	rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
3171602adf40SYehuda Sadeh 	if (rc < 0)
3172602adf40SYehuda Sadeh 		goto err_out_client;
31739bb2f334SAlex Elder 	rbd_dev->pool_id = rc;
3174602adf40SYehuda Sadeh 
3175a30b71b9SAlex Elder 	rc = rbd_dev_probe(rbd_dev);
3176a30b71b9SAlex Elder 	if (rc < 0)
3177589d30e0SAlex Elder 		goto err_out_client;
317805fd6f6fSAlex Elder 
317905fd6f6fSAlex Elder 	/* no need to lock here, as rbd_dev is not registered yet */
318005fd6f6fSAlex Elder 	rc = rbd_dev_snaps_update(rbd_dev);
318105fd6f6fSAlex Elder 	if (rc)
318205fd6f6fSAlex Elder 		goto err_out_header;
318305fd6f6fSAlex Elder 
318405fd6f6fSAlex Elder 	rc = rbd_dev_set_mapping(rbd_dev, snap_name);
318505fd6f6fSAlex Elder 	if (rc)
318605fd6f6fSAlex Elder 		goto err_out_header;
318705fd6f6fSAlex Elder 
318885ae8926SAlex Elder 	/* generate unique id: find highest unique id, add one */
318985ae8926SAlex Elder 	rbd_dev_id_get(rbd_dev);
319085ae8926SAlex Elder 
319185ae8926SAlex Elder 	/* Fill in the device name, now that we have its id. */
319285ae8926SAlex Elder 	BUILD_BUG_ON(DEV_NAME_LEN
319385ae8926SAlex Elder 			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
319485ae8926SAlex Elder 	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
319585ae8926SAlex Elder 
319685ae8926SAlex Elder 	/* Get our block major device number. */
319785ae8926SAlex Elder 
319827cc2594SAlex Elder 	rc = register_blkdev(0, rbd_dev->name);
319927cc2594SAlex Elder 	if (rc < 0)
320085ae8926SAlex Elder 		goto err_out_id;
320127cc2594SAlex Elder 	rbd_dev->major = rc;
3202602adf40SYehuda Sadeh 
32030f308a31SAlex Elder 	/* Set up the blkdev mapping. */
32040f308a31SAlex Elder 
32050f308a31SAlex Elder 	rc = rbd_init_disk(rbd_dev);
3206dfc5606dSYehuda Sadeh 	if (rc)
3207766fc439SYehuda Sadeh 		goto err_out_blkdev;
3208766fc439SYehuda Sadeh 
32090f308a31SAlex Elder 	rc = rbd_bus_add_dev(rbd_dev);
32100f308a31SAlex Elder 	if (rc)
32110f308a31SAlex Elder 		goto err_out_disk;
32120f308a31SAlex Elder 
321332eec68dSAlex Elder 	/*
321432eec68dSAlex Elder 	 * At this point cleanup in the event of an error is the job
321532eec68dSAlex Elder 	 * of the sysfs code (initiated by rbd_bus_del_dev()).
321632eec68dSAlex Elder 	 */
32172ac4e75dSAlex Elder 
32184bb1f1edSAlex Elder 	down_write(&rbd_dev->header_rwsem);
32195ed16177SAlex Elder 	rc = rbd_dev_snaps_register(rbd_dev);
32204bb1f1edSAlex Elder 	up_write(&rbd_dev->header_rwsem);
32212ac4e75dSAlex Elder 	if (rc)
32222ac4e75dSAlex Elder 		goto err_out_bus;
32232ac4e75dSAlex Elder 
322459c2be1eSYehuda Sadeh 	rc = rbd_init_watch_dev(rbd_dev);
322559c2be1eSYehuda Sadeh 	if (rc)
322659c2be1eSYehuda Sadeh 		goto err_out_bus;
322759c2be1eSYehuda Sadeh 
32283ee4001eSAlex Elder 	/* Everything's ready.  Announce the disk to the world. */
32293ee4001eSAlex Elder 
32303ee4001eSAlex Elder 	add_disk(rbd_dev->disk);
32313ee4001eSAlex Elder 
32323ee4001eSAlex Elder 	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
32333ee4001eSAlex Elder 		(unsigned long long) rbd_dev->mapping.size);
32343ee4001eSAlex Elder 
3235602adf40SYehuda Sadeh 	return count;
3236602adf40SYehuda Sadeh 
3237766fc439SYehuda Sadeh err_out_bus:
3238766fc439SYehuda Sadeh 	/* this will also clean up rest of rbd_dev stuff */
3239766fc439SYehuda Sadeh 
3240766fc439SYehuda Sadeh 	rbd_bus_del_dev(rbd_dev);
3241766fc439SYehuda Sadeh 	kfree(options);
3242766fc439SYehuda Sadeh 	return rc;
3243766fc439SYehuda Sadeh 
32440f308a31SAlex Elder err_out_disk:
32450f308a31SAlex Elder 	rbd_free_disk(rbd_dev);
3246602adf40SYehuda Sadeh err_out_blkdev:
3247602adf40SYehuda Sadeh 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
324885ae8926SAlex Elder err_out_id:
324985ae8926SAlex Elder 	rbd_dev_id_put(rbd_dev);
325005fd6f6fSAlex Elder err_out_header:
325105fd6f6fSAlex Elder 	rbd_header_free(&rbd_dev->header);
3252602adf40SYehuda Sadeh err_out_client:
32533fcf2581SAlex Elder 	kfree(rbd_dev->header_name);
3254602adf40SYehuda Sadeh 	rbd_put_client(rbd_dev);
3255589d30e0SAlex Elder 	kfree(rbd_dev->image_id);
325685ae8926SAlex Elder err_out_args:
3257f84344f3SAlex Elder 	kfree(rbd_dev->mapping.snap_name);
32580bed54dcSAlex Elder 	kfree(rbd_dev->image_name);
3259d22f76e7SAlex Elder 	kfree(rbd_dev->pool_name);
326085ae8926SAlex Elder err_out_mem:
326127cc2594SAlex Elder 	kfree(rbd_dev);
3262cb8627c7SAlex Elder 	kfree(options);
326327cc2594SAlex Elder 
3264602adf40SYehuda Sadeh 	dout("Error adding device %s\n", buf);
3265602adf40SYehuda Sadeh 	module_put(THIS_MODULE);
326627cc2594SAlex Elder 
326727cc2594SAlex Elder 	return (ssize_t) rc;
3268602adf40SYehuda Sadeh }
3269602adf40SYehuda Sadeh 
3270de71a297SAlex Elder static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
3271602adf40SYehuda Sadeh {
3272602adf40SYehuda Sadeh 	struct list_head *tmp;
3273602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev;
3274602adf40SYehuda Sadeh 
3275e124a82fSAlex Elder 	spin_lock(&rbd_dev_list_lock);
3276602adf40SYehuda Sadeh 	list_for_each(tmp, &rbd_dev_list) {
3277602adf40SYehuda Sadeh 		rbd_dev = list_entry(tmp, struct rbd_device, node);
3278de71a297SAlex Elder 		if (rbd_dev->dev_id == dev_id) {
3279e124a82fSAlex Elder 			spin_unlock(&rbd_dev_list_lock);
3280602adf40SYehuda Sadeh 			return rbd_dev;
3281602adf40SYehuda Sadeh 		}
3282e124a82fSAlex Elder 	}
3283e124a82fSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
3284602adf40SYehuda Sadeh 	return NULL;
3285602adf40SYehuda Sadeh }
3286602adf40SYehuda Sadeh 
3287dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev)
3288602adf40SYehuda Sadeh {
3289593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3290602adf40SYehuda Sadeh 
32911dbb4399SAlex Elder 	if (rbd_dev->watch_request) {
32921dbb4399SAlex Elder 		struct ceph_client *client = rbd_dev->rbd_client->client;
32931dbb4399SAlex Elder 
32941dbb4399SAlex Elder 		ceph_osdc_unregister_linger_request(&client->osdc,
329559c2be1eSYehuda Sadeh 						    rbd_dev->watch_request);
32961dbb4399SAlex Elder 	}
329759c2be1eSYehuda Sadeh 	if (rbd_dev->watch_event)
3298070c633fSAlex Elder 		rbd_req_sync_unwatch(rbd_dev);
329959c2be1eSYehuda Sadeh 
3300602adf40SYehuda Sadeh 	rbd_put_client(rbd_dev);
3301602adf40SYehuda Sadeh 
3302602adf40SYehuda Sadeh 	/* clean up and free blkdev */
3303602adf40SYehuda Sadeh 	rbd_free_disk(rbd_dev);
3304602adf40SYehuda Sadeh 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
330532eec68dSAlex Elder 
33062ac4e75dSAlex Elder 	/* release allocated disk header fields */
33072ac4e75dSAlex Elder 	rbd_header_free(&rbd_dev->header);
33082ac4e75dSAlex Elder 
330932eec68dSAlex Elder 	/* done with the id, and with the rbd_dev */
3310f84344f3SAlex Elder 	kfree(rbd_dev->mapping.snap_name);
3311589d30e0SAlex Elder 	kfree(rbd_dev->image_id);
33120bed54dcSAlex Elder 	kfree(rbd_dev->header_name);
3313d22f76e7SAlex Elder 	kfree(rbd_dev->pool_name);
33140bed54dcSAlex Elder 	kfree(rbd_dev->image_name);
3315e2839308SAlex Elder 	rbd_dev_id_put(rbd_dev);
3316602adf40SYehuda Sadeh 	kfree(rbd_dev);
3317602adf40SYehuda Sadeh 
3318602adf40SYehuda Sadeh 	/* release module ref */
3319602adf40SYehuda Sadeh 	module_put(THIS_MODULE);
3320602adf40SYehuda Sadeh }
3321602adf40SYehuda Sadeh 
3322dfc5606dSYehuda Sadeh static ssize_t rbd_remove(struct bus_type *bus,
3323602adf40SYehuda Sadeh 			  const char *buf,
3324602adf40SYehuda Sadeh 			  size_t count)
3325602adf40SYehuda Sadeh {
3326602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
3327602adf40SYehuda Sadeh 	int target_id, rc;
3328602adf40SYehuda Sadeh 	unsigned long ul;
3329602adf40SYehuda Sadeh 	int ret = count;
3330602adf40SYehuda Sadeh 
3331602adf40SYehuda Sadeh 	rc = strict_strtoul(buf, 10, &ul);
3332602adf40SYehuda Sadeh 	if (rc)
3333602adf40SYehuda Sadeh 		return rc;
3334602adf40SYehuda Sadeh 
3335602adf40SYehuda Sadeh 	/* convert to int; abort if we lost anything in the conversion */
3336602adf40SYehuda Sadeh 	target_id = (int) ul;
3337602adf40SYehuda Sadeh 	if (target_id != ul)
3338602adf40SYehuda Sadeh 		return -EINVAL;
3339602adf40SYehuda Sadeh 
3340602adf40SYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3341602adf40SYehuda Sadeh 
3342602adf40SYehuda Sadeh 	rbd_dev = __rbd_get_dev(target_id);
3343602adf40SYehuda Sadeh 	if (!rbd_dev) {
3344602adf40SYehuda Sadeh 		ret = -ENOENT;
3345602adf40SYehuda Sadeh 		goto done;
3346602adf40SYehuda Sadeh 	}
3347602adf40SYehuda Sadeh 
3348dfc5606dSYehuda Sadeh 	__rbd_remove_all_snaps(rbd_dev);
3349dfc5606dSYehuda Sadeh 	rbd_bus_del_dev(rbd_dev);
3350602adf40SYehuda Sadeh 
3351602adf40SYehuda Sadeh done:
3352602adf40SYehuda Sadeh 	mutex_unlock(&ctl_mutex);
3353aafb230eSAlex Elder 
3354602adf40SYehuda Sadeh 	return ret;
3355602adf40SYehuda Sadeh }
3356602adf40SYehuda Sadeh 
3357602adf40SYehuda Sadeh /*
3358602adf40SYehuda Sadeh  * create control files in sysfs
3359dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
3360602adf40SYehuda Sadeh  */
3361602adf40SYehuda Sadeh static int rbd_sysfs_init(void)
3362602adf40SYehuda Sadeh {
3363dfc5606dSYehuda Sadeh 	int ret;
3364602adf40SYehuda Sadeh 
3365fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
3366dfc5606dSYehuda Sadeh 	if (ret < 0)
3367dfc5606dSYehuda Sadeh 		return ret;
3368602adf40SYehuda Sadeh 
3369fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
3370fed4c143SAlex Elder 	if (ret < 0)
3371fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
3372602adf40SYehuda Sadeh 
3373602adf40SYehuda Sadeh 	return ret;
3374602adf40SYehuda Sadeh }
3375602adf40SYehuda Sadeh 
3376602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void)
3377602adf40SYehuda Sadeh {
3378dfc5606dSYehuda Sadeh 	bus_unregister(&rbd_bus_type);
3379fed4c143SAlex Elder 	device_unregister(&rbd_root_dev);
3380602adf40SYehuda Sadeh }
3381602adf40SYehuda Sadeh 
3382602adf40SYehuda Sadeh int __init rbd_init(void)
3383602adf40SYehuda Sadeh {
3384602adf40SYehuda Sadeh 	int rc;
3385602adf40SYehuda Sadeh 
3386602adf40SYehuda Sadeh 	rc = rbd_sysfs_init();
3387602adf40SYehuda Sadeh 	if (rc)
3388602adf40SYehuda Sadeh 		return rc;
3389f0f8cef5SAlex Elder 	pr_info("loaded " RBD_DRV_NAME_LONG "\n");
3390602adf40SYehuda Sadeh 	return 0;
3391602adf40SYehuda Sadeh }
3392602adf40SYehuda Sadeh 
3393602adf40SYehuda Sadeh void __exit rbd_exit(void)
3394602adf40SYehuda Sadeh {
3395602adf40SYehuda Sadeh 	rbd_sysfs_cleanup();
3396602adf40SYehuda Sadeh }
3397602adf40SYehuda Sadeh 
3398602adf40SYehuda Sadeh module_init(rbd_init);
3399602adf40SYehuda Sadeh module_exit(rbd_exit);
3400602adf40SYehuda Sadeh 
3401602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
3402602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
3403602adf40SYehuda Sadeh MODULE_DESCRIPTION("rados block device");
3404602adf40SYehuda Sadeh 
3405602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */
3406602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
3407602adf40SYehuda Sadeh 
3408602adf40SYehuda Sadeh MODULE_LICENSE("GPL");
3409