xref: /openbmc/linux/drivers/block/rbd.c (revision 9d3997fd)
1602adf40SYehuda Sadeh /*
2602adf40SYehuda Sadeh    rbd.c -- Export ceph rados objects as a Linux block device
3602adf40SYehuda Sadeh 
4602adf40SYehuda Sadeh 
5602adf40SYehuda Sadeh    based on drivers/block/osdblk.c:
6602adf40SYehuda Sadeh 
7602adf40SYehuda Sadeh    Copyright 2009 Red Hat, Inc.
8602adf40SYehuda Sadeh 
9602adf40SYehuda Sadeh    This program is free software; you can redistribute it and/or modify
10602adf40SYehuda Sadeh    it under the terms of the GNU General Public License as published by
11602adf40SYehuda Sadeh    the Free Software Foundation.
12602adf40SYehuda Sadeh 
13602adf40SYehuda Sadeh    This program is distributed in the hope that it will be useful,
14602adf40SYehuda Sadeh    but WITHOUT ANY WARRANTY; without even the implied warranty of
15602adf40SYehuda Sadeh    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16602adf40SYehuda Sadeh    GNU General Public License for more details.
17602adf40SYehuda Sadeh 
18602adf40SYehuda Sadeh    You should have received a copy of the GNU General Public License
19602adf40SYehuda Sadeh    along with this program; see the file COPYING.  If not, write to
20602adf40SYehuda Sadeh    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21602adf40SYehuda Sadeh 
22602adf40SYehuda Sadeh 
23602adf40SYehuda Sadeh 
24dfc5606dSYehuda Sadeh    For usage instructions, please refer to:
25602adf40SYehuda Sadeh 
26dfc5606dSYehuda Sadeh                  Documentation/ABI/testing/sysfs-bus-rbd
27602adf40SYehuda Sadeh 
28602adf40SYehuda Sadeh  */
29602adf40SYehuda Sadeh 
30602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
31602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
32602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3459c2be1eSYehuda Sadeh #include <linux/parser.h>
35602adf40SYehuda Sadeh 
36602adf40SYehuda Sadeh #include <linux/kernel.h>
37602adf40SYehuda Sadeh #include <linux/device.h>
38602adf40SYehuda Sadeh #include <linux/module.h>
39602adf40SYehuda Sadeh #include <linux/fs.h>
40602adf40SYehuda Sadeh #include <linux/blkdev.h>
41602adf40SYehuda Sadeh 
42602adf40SYehuda Sadeh #include "rbd_types.h"
43602adf40SYehuda Sadeh 
/* Defining RBD_DEBUG makes rbd_assert() perform its check. */
#define RBD_DEBUG

/*
 * Block I/O is counted in sectors.  Several Linux layers (blk, bio,
 * genhd) each have their own notion of a sector, but the default is
 * always 512 bytes.  Named constants read better than bare 9 and 512.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/* Largest value a u64 can hold; arguably belongs in a shared header. */
#define	U64_MAX	((u64) (~0ULL))

/* Short and long driver names */
#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

/*
 * Snapshot device names are the prefix followed by the snapshot name,
 * so the snapshot name may use whatever NAME_MAX leaves after the prefix.
 */
#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */
#define RBD_MAX_OPT_LEN		1024

/* Snapshot name that designates the image head */
#define RBD_SNAP_HEAD_NAME	"-"

#define RBD_IMAGE_ID_LEN_MAX	64
#define RBD_OBJ_PREFIX_LEN_MAX	64

/* Feature bits */
#define RBD_FEATURE_LAYERING      1

/* Mask of the features this client implementation supports */
#define RBD_FEATURES_ALL          (0)

/*
 * Device names have the form "rbd#": RBD_DRV_NAME followed by a unique
 * integer id.  MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN
 * is large enough for every possible device name.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)

/* Mappings are writable unless an option says otherwise */
#define RBD_READ_ONLY_DEFAULT		false
9459c2be1eSYehuda Sadeh 
95602adf40SYehuda Sadeh /*
96602adf40SYehuda Sadeh  * block device image metadata (in-memory version)
97602adf40SYehuda Sadeh  */
98602adf40SYehuda Sadeh struct rbd_image_header {
99f84344f3SAlex Elder 	/* These four fields never change for a given rbd image */
100849b4260SAlex Elder 	char *object_prefix;
10134b13184SAlex Elder 	u64 features;
102602adf40SYehuda Sadeh 	__u8 obj_order;
103602adf40SYehuda Sadeh 	__u8 crypt_type;
104602adf40SYehuda Sadeh 	__u8 comp_type;
105602adf40SYehuda Sadeh 
106f84344f3SAlex Elder 	/* The remaining fields need to be updated occasionally */
107f84344f3SAlex Elder 	u64 image_size;
108f84344f3SAlex Elder 	struct ceph_snap_context *snapc;
109602adf40SYehuda Sadeh 	char *snap_names;
110602adf40SYehuda Sadeh 	u64 *snap_sizes;
11159c2be1eSYehuda Sadeh 
11259c2be1eSYehuda Sadeh 	u64 obj_version;
11359c2be1eSYehuda Sadeh };
11459c2be1eSYehuda Sadeh 
1150d7dbfceSAlex Elder /*
1160d7dbfceSAlex Elder  * An rbd image specification.
1170d7dbfceSAlex Elder  *
1180d7dbfceSAlex Elder  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
1190d7dbfceSAlex Elder  * identify an image.
1200d7dbfceSAlex Elder  */
1210d7dbfceSAlex Elder struct rbd_spec {
1220d7dbfceSAlex Elder 	u64		pool_id;
1230d7dbfceSAlex Elder 	char		*pool_name;
1240d7dbfceSAlex Elder 
1250d7dbfceSAlex Elder 	char		*image_id;
1260d7dbfceSAlex Elder 	size_t		image_id_len;
1270d7dbfceSAlex Elder 	char		*image_name;
1280d7dbfceSAlex Elder 	size_t		image_name_len;
1290d7dbfceSAlex Elder 
1300d7dbfceSAlex Elder 	u64		snap_id;
1310d7dbfceSAlex Elder 	char		*snap_name;
1320d7dbfceSAlex Elder 
1330d7dbfceSAlex Elder 	struct kref	kref;
1340d7dbfceSAlex Elder };
1350d7dbfceSAlex Elder 
13659c2be1eSYehuda Sadeh struct rbd_options {
137cc0538b6SAlex Elder 	bool	read_only;
138602adf40SYehuda Sadeh };
139602adf40SYehuda Sadeh 
140602adf40SYehuda Sadeh /*
141f0f8cef5SAlex Elder  * an instance of the client.  multiple devices may share an rbd client.
142602adf40SYehuda Sadeh  */
143602adf40SYehuda Sadeh struct rbd_client {
144602adf40SYehuda Sadeh 	struct ceph_client	*client;
145602adf40SYehuda Sadeh 	struct kref		kref;
146602adf40SYehuda Sadeh 	struct list_head	node;
147602adf40SYehuda Sadeh };
148602adf40SYehuda Sadeh 
149602adf40SYehuda Sadeh /*
150f0f8cef5SAlex Elder  * a request completion status
151602adf40SYehuda Sadeh  */
1521fec7093SYehuda Sadeh struct rbd_req_status {
1531fec7093SYehuda Sadeh 	int done;
1541fec7093SYehuda Sadeh 	int rc;
1551fec7093SYehuda Sadeh 	u64 bytes;
1561fec7093SYehuda Sadeh };
1571fec7093SYehuda Sadeh 
1581fec7093SYehuda Sadeh /*
1591fec7093SYehuda Sadeh  * a collection of requests
1601fec7093SYehuda Sadeh  */
1611fec7093SYehuda Sadeh struct rbd_req_coll {
1621fec7093SYehuda Sadeh 	int			total;
1631fec7093SYehuda Sadeh 	int			num_done;
1641fec7093SYehuda Sadeh 	struct kref		kref;
1651fec7093SYehuda Sadeh 	struct rbd_req_status	status[0];
166602adf40SYehuda Sadeh };
167602adf40SYehuda Sadeh 
168f0f8cef5SAlex Elder /*
169f0f8cef5SAlex Elder  * a single io request
170f0f8cef5SAlex Elder  */
171f0f8cef5SAlex Elder struct rbd_request {
172f0f8cef5SAlex Elder 	struct request		*rq;		/* blk layer request */
173f0f8cef5SAlex Elder 	struct bio		*bio;		/* cloned bio */
174f0f8cef5SAlex Elder 	struct page		**pages;	/* list of used pages */
175f0f8cef5SAlex Elder 	u64			len;
176f0f8cef5SAlex Elder 	int			coll_index;
177f0f8cef5SAlex Elder 	struct rbd_req_coll	*coll;
178f0f8cef5SAlex Elder };
179f0f8cef5SAlex Elder 
180dfc5606dSYehuda Sadeh struct rbd_snap {
181dfc5606dSYehuda Sadeh 	struct	device		dev;
182dfc5606dSYehuda Sadeh 	const char		*name;
1833591538fSJosh Durgin 	u64			size;
184dfc5606dSYehuda Sadeh 	struct list_head	node;
185dfc5606dSYehuda Sadeh 	u64			id;
18634b13184SAlex Elder 	u64			features;
187dfc5606dSYehuda Sadeh };
188dfc5606dSYehuda Sadeh 
189f84344f3SAlex Elder struct rbd_mapping {
19099c1f08fSAlex Elder 	u64                     size;
19134b13184SAlex Elder 	u64                     features;
192f84344f3SAlex Elder 	bool			read_only;
193f84344f3SAlex Elder };
194f84344f3SAlex Elder 
195602adf40SYehuda Sadeh /*
196602adf40SYehuda Sadeh  * a single device
197602adf40SYehuda Sadeh  */
198602adf40SYehuda Sadeh struct rbd_device {
199de71a297SAlex Elder 	int			dev_id;		/* blkdev unique id */
200602adf40SYehuda Sadeh 
201602adf40SYehuda Sadeh 	int			major;		/* blkdev assigned major */
202602adf40SYehuda Sadeh 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
203602adf40SYehuda Sadeh 
204a30b71b9SAlex Elder 	u32			image_format;	/* Either 1 or 2 */
205602adf40SYehuda Sadeh 	struct rbd_client	*rbd_client;
206602adf40SYehuda Sadeh 
207602adf40SYehuda Sadeh 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
208602adf40SYehuda Sadeh 
209602adf40SYehuda Sadeh 	spinlock_t		lock;		/* queue lock */
210602adf40SYehuda Sadeh 
211602adf40SYehuda Sadeh 	struct rbd_image_header	header;
212daba5fdbSAlex Elder 	bool                    exists;
2130d7dbfceSAlex Elder 	struct rbd_spec		*spec;
214602adf40SYehuda Sadeh 
2150d7dbfceSAlex Elder 	char			*header_name;
216971f839aSAlex Elder 
21759c2be1eSYehuda Sadeh 	struct ceph_osd_event   *watch_event;
21859c2be1eSYehuda Sadeh 	struct ceph_osd_request *watch_request;
21959c2be1eSYehuda Sadeh 
220c666601aSJosh Durgin 	/* protects updating the header */
221c666601aSJosh Durgin 	struct rw_semaphore     header_rwsem;
222f84344f3SAlex Elder 
223f84344f3SAlex Elder 	struct rbd_mapping	mapping;
224602adf40SYehuda Sadeh 
225602adf40SYehuda Sadeh 	struct list_head	node;
226dfc5606dSYehuda Sadeh 
227dfc5606dSYehuda Sadeh 	/* list of snapshots */
228dfc5606dSYehuda Sadeh 	struct list_head	snaps;
229dfc5606dSYehuda Sadeh 
230dfc5606dSYehuda Sadeh 	/* sysfs related */
231dfc5606dSYehuda Sadeh 	struct device		dev;
232dfc5606dSYehuda Sadeh };
233dfc5606dSYehuda Sadeh 
234602adf40SYehuda Sadeh static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */
235e124a82fSAlex Elder 
236602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list);    /* devices */
237e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock);
238e124a82fSAlex Elder 
239602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list);		/* clients */
240432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock);
241602adf40SYehuda Sadeh 
242304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
243304f6808SAlex Elder static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
244304f6808SAlex Elder 
245dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev);
24641f38c2bSAlex Elder static void rbd_remove_snap_dev(struct rbd_snap *snap);
247dfc5606dSYehuda Sadeh 
248f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf,
249f0f8cef5SAlex Elder 		       size_t count);
250f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
251f0f8cef5SAlex Elder 			  size_t count);
252f0f8cef5SAlex Elder 
253f0f8cef5SAlex Elder static struct bus_attribute rbd_bus_attrs[] = {
254f0f8cef5SAlex Elder 	__ATTR(add, S_IWUSR, NULL, rbd_add),
255f0f8cef5SAlex Elder 	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
256f0f8cef5SAlex Elder 	__ATTR_NULL
257f0f8cef5SAlex Elder };
258f0f8cef5SAlex Elder 
259f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = {
260f0f8cef5SAlex Elder 	.name		= "rbd",
261f0f8cef5SAlex Elder 	.bus_attrs	= rbd_bus_attrs,
262f0f8cef5SAlex Elder };
263f0f8cef5SAlex Elder 
264f0f8cef5SAlex Elder static void rbd_root_dev_release(struct device *dev)
265f0f8cef5SAlex Elder {
266f0f8cef5SAlex Elder }
267f0f8cef5SAlex Elder 
268f0f8cef5SAlex Elder static struct device rbd_root_dev = {
269f0f8cef5SAlex Elder 	.init_name =    "rbd",
270f0f8cef5SAlex Elder 	.release =      rbd_root_dev_release,
271f0f8cef5SAlex Elder };
272f0f8cef5SAlex Elder 
/*
 * rbd_assert(expr) - check an invariant.
 *
 * With RBD_DEBUG defined, a false expr logs the function name, line
 * number and expression text, then calls BUG().  Without RBD_DEBUG the
 * macro expands to a no-op and expr is not evaluated.
 *
 * The debug variant is wrapped in do { } while (0) so that it behaves
 * as a single statement: it requires the trailing semicolon (as the
 * no-op variant already does) and cannot capture a following "else".
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
285dfc5606dSYehuda Sadeh 
286dfc5606dSYehuda Sadeh static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
287dfc5606dSYehuda Sadeh {
288dfc5606dSYehuda Sadeh 	return get_device(&rbd_dev->dev);
289dfc5606dSYehuda Sadeh }
290dfc5606dSYehuda Sadeh 
291dfc5606dSYehuda Sadeh static void rbd_put_dev(struct rbd_device *rbd_dev)
292dfc5606dSYehuda Sadeh {
293dfc5606dSYehuda Sadeh 	put_device(&rbd_dev->dev);
294dfc5606dSYehuda Sadeh }
295602adf40SYehuda Sadeh 
296117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
297117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
29859c2be1eSYehuda Sadeh 
299602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
300602adf40SYehuda Sadeh {
301f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
302602adf40SYehuda Sadeh 
303f84344f3SAlex Elder 	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
304602adf40SYehuda Sadeh 		return -EROFS;
305602adf40SYehuda Sadeh 
306340c7a2bSAlex Elder 	rbd_get_dev(rbd_dev);
307f84344f3SAlex Elder 	set_device_ro(bdev, rbd_dev->mapping.read_only);
308340c7a2bSAlex Elder 
309602adf40SYehuda Sadeh 	return 0;
310602adf40SYehuda Sadeh }
311602adf40SYehuda Sadeh 
312dfc5606dSYehuda Sadeh static int rbd_release(struct gendisk *disk, fmode_t mode)
313dfc5606dSYehuda Sadeh {
314dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
315dfc5606dSYehuda Sadeh 
316dfc5606dSYehuda Sadeh 	rbd_put_dev(rbd_dev);
317dfc5606dSYehuda Sadeh 
318dfc5606dSYehuda Sadeh 	return 0;
319dfc5606dSYehuda Sadeh }
320dfc5606dSYehuda Sadeh 
321602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = {
322602adf40SYehuda Sadeh 	.owner			= THIS_MODULE,
323602adf40SYehuda Sadeh 	.open			= rbd_open,
324dfc5606dSYehuda Sadeh 	.release		= rbd_release,
325602adf40SYehuda Sadeh };
326602adf40SYehuda Sadeh 
327602adf40SYehuda Sadeh /*
328602adf40SYehuda Sadeh  * Initialize an rbd client instance.
32943ae4701SAlex Elder  * We own *ceph_opts.
330602adf40SYehuda Sadeh  */
331f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
332602adf40SYehuda Sadeh {
333602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
334602adf40SYehuda Sadeh 	int ret = -ENOMEM;
335602adf40SYehuda Sadeh 
336602adf40SYehuda Sadeh 	dout("rbd_client_create\n");
337602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
338602adf40SYehuda Sadeh 	if (!rbdc)
339602adf40SYehuda Sadeh 		goto out_opt;
340602adf40SYehuda Sadeh 
341602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
342602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
343602adf40SYehuda Sadeh 
344bc534d86SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
345bc534d86SAlex Elder 
34643ae4701SAlex Elder 	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
347602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
348bc534d86SAlex Elder 		goto out_mutex;
34943ae4701SAlex Elder 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
350602adf40SYehuda Sadeh 
351602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
352602adf40SYehuda Sadeh 	if (ret < 0)
353602adf40SYehuda Sadeh 		goto out_err;
354602adf40SYehuda Sadeh 
355432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
356602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
357432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
358602adf40SYehuda Sadeh 
359bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
360bc534d86SAlex Elder 
361602adf40SYehuda Sadeh 	dout("rbd_client_create created %p\n", rbdc);
362602adf40SYehuda Sadeh 	return rbdc;
363602adf40SYehuda Sadeh 
364602adf40SYehuda Sadeh out_err:
365602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
366bc534d86SAlex Elder out_mutex:
367bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
368602adf40SYehuda Sadeh 	kfree(rbdc);
369602adf40SYehuda Sadeh out_opt:
37043ae4701SAlex Elder 	if (ceph_opts)
37143ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
37228f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
373602adf40SYehuda Sadeh }
374602adf40SYehuda Sadeh 
375602adf40SYehuda Sadeh /*
3761f7ba331SAlex Elder  * Find a ceph client with specific addr and configuration.  If
3771f7ba331SAlex Elder  * found, bump its reference count.
378602adf40SYehuda Sadeh  */
3791f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
380602adf40SYehuda Sadeh {
381602adf40SYehuda Sadeh 	struct rbd_client *client_node;
3821f7ba331SAlex Elder 	bool found = false;
383602adf40SYehuda Sadeh 
38443ae4701SAlex Elder 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
385602adf40SYehuda Sadeh 		return NULL;
386602adf40SYehuda Sadeh 
3871f7ba331SAlex Elder 	spin_lock(&rbd_client_list_lock);
3881f7ba331SAlex Elder 	list_for_each_entry(client_node, &rbd_client_list, node) {
3891f7ba331SAlex Elder 		if (!ceph_compare_options(ceph_opts, client_node->client)) {
3901f7ba331SAlex Elder 			kref_get(&client_node->kref);
3911f7ba331SAlex Elder 			found = true;
3921f7ba331SAlex Elder 			break;
3931f7ba331SAlex Elder 		}
3941f7ba331SAlex Elder 	}
3951f7ba331SAlex Elder 	spin_unlock(&rbd_client_list_lock);
3961f7ba331SAlex Elder 
3971f7ba331SAlex Elder 	return found ? client_node : NULL;
398602adf40SYehuda Sadeh }
399602adf40SYehuda Sadeh 
400602adf40SYehuda Sadeh /*
40159c2be1eSYehuda Sadeh  * mount options
40259c2be1eSYehuda Sadeh  */
40359c2be1eSYehuda Sadeh enum {
40459c2be1eSYehuda Sadeh 	Opt_last_int,
40559c2be1eSYehuda Sadeh 	/* int args above */
40659c2be1eSYehuda Sadeh 	Opt_last_string,
40759c2be1eSYehuda Sadeh 	/* string args above */
408cc0538b6SAlex Elder 	Opt_read_only,
409cc0538b6SAlex Elder 	Opt_read_write,
410cc0538b6SAlex Elder 	/* Boolean args above */
411cc0538b6SAlex Elder 	Opt_last_bool,
41259c2be1eSYehuda Sadeh };
41359c2be1eSYehuda Sadeh 
41443ae4701SAlex Elder static match_table_t rbd_opts_tokens = {
41559c2be1eSYehuda Sadeh 	/* int args above */
41659c2be1eSYehuda Sadeh 	/* string args above */
417be466c1cSAlex Elder 	{Opt_read_only, "read_only"},
418cc0538b6SAlex Elder 	{Opt_read_only, "ro"},		/* Alternate spelling */
419cc0538b6SAlex Elder 	{Opt_read_write, "read_write"},
420cc0538b6SAlex Elder 	{Opt_read_write, "rw"},		/* Alternate spelling */
421cc0538b6SAlex Elder 	/* Boolean args above */
42259c2be1eSYehuda Sadeh 	{-1, NULL}
42359c2be1eSYehuda Sadeh };
42459c2be1eSYehuda Sadeh 
42559c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
42659c2be1eSYehuda Sadeh {
42743ae4701SAlex Elder 	struct rbd_options *rbd_opts = private;
42859c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
42959c2be1eSYehuda Sadeh 	int token, intval, ret;
43059c2be1eSYehuda Sadeh 
43143ae4701SAlex Elder 	token = match_token(c, rbd_opts_tokens, argstr);
43259c2be1eSYehuda Sadeh 	if (token < 0)
43359c2be1eSYehuda Sadeh 		return -EINVAL;
43459c2be1eSYehuda Sadeh 
43559c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
43659c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
43759c2be1eSYehuda Sadeh 		if (ret < 0) {
43859c2be1eSYehuda Sadeh 			pr_err("bad mount option arg (not int) "
43959c2be1eSYehuda Sadeh 			       "at '%s'\n", c);
44059c2be1eSYehuda Sadeh 			return ret;
44159c2be1eSYehuda Sadeh 		}
44259c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
44359c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
44459c2be1eSYehuda Sadeh 		dout("got string token %d val %s\n", token,
44559c2be1eSYehuda Sadeh 		     argstr[0].from);
446cc0538b6SAlex Elder 	} else if (token > Opt_last_string && token < Opt_last_bool) {
447cc0538b6SAlex Elder 		dout("got Boolean token %d\n", token);
44859c2be1eSYehuda Sadeh 	} else {
44959c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
45059c2be1eSYehuda Sadeh 	}
45159c2be1eSYehuda Sadeh 
45259c2be1eSYehuda Sadeh 	switch (token) {
453cc0538b6SAlex Elder 	case Opt_read_only:
454cc0538b6SAlex Elder 		rbd_opts->read_only = true;
455cc0538b6SAlex Elder 		break;
456cc0538b6SAlex Elder 	case Opt_read_write:
457cc0538b6SAlex Elder 		rbd_opts->read_only = false;
458cc0538b6SAlex Elder 		break;
45959c2be1eSYehuda Sadeh 	default:
460aafb230eSAlex Elder 		rbd_assert(false);
461aafb230eSAlex Elder 		break;
46259c2be1eSYehuda Sadeh 	}
46359c2be1eSYehuda Sadeh 	return 0;
46459c2be1eSYehuda Sadeh }
46559c2be1eSYehuda Sadeh 
/*
 * Return an rbd client for the given options: an existing one when
 * rbd_client_find() locates a match, otherwise a newly created one.
 *
 * ceph_opts is consumed either way: it is destroyed here when an
 * existing client is reused, and handed to rbd_client_create()
 * otherwise.  The result of rbd_client_create() is returned unchanged,
 * so it may be an ERR_PTR().
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc = rbd_client_find(ceph_opts);

	if (!rbdc)
		return rbd_client_create(ceph_opts);

	/* Reusing an existing client; these options are not needed */
	ceph_destroy_options(ceph_opts);

	return rbdc;
}
482602adf40SYehuda Sadeh 
483602adf40SYehuda Sadeh /*
484602adf40SYehuda Sadeh  * Destroy ceph client
485d23a4b3fSAlex Elder  *
486432b8587SAlex Elder  * Caller must hold rbd_client_list_lock.
487602adf40SYehuda Sadeh  */
488602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
489602adf40SYehuda Sadeh {
490602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
491602adf40SYehuda Sadeh 
492602adf40SYehuda Sadeh 	dout("rbd_release_client %p\n", rbdc);
493cd9d9f5dSAlex Elder 	spin_lock(&rbd_client_list_lock);
494602adf40SYehuda Sadeh 	list_del(&rbdc->node);
495cd9d9f5dSAlex Elder 	spin_unlock(&rbd_client_list_lock);
496602adf40SYehuda Sadeh 
497602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
498602adf40SYehuda Sadeh 	kfree(rbdc);
499602adf40SYehuda Sadeh }
500602adf40SYehuda Sadeh 
501602adf40SYehuda Sadeh /*
502602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
503602adf40SYehuda Sadeh  * it.
504602adf40SYehuda Sadeh  */
5059d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc)
506602adf40SYehuda Sadeh {
5079d3997fdSAlex Elder 	kref_put(&rbdc->kref, rbd_client_release);
508602adf40SYehuda Sadeh }
509602adf40SYehuda Sadeh 
5101fec7093SYehuda Sadeh /*
5111fec7093SYehuda Sadeh  * Destroy requests collection
5121fec7093SYehuda Sadeh  */
5131fec7093SYehuda Sadeh static void rbd_coll_release(struct kref *kref)
5141fec7093SYehuda Sadeh {
5151fec7093SYehuda Sadeh 	struct rbd_req_coll *coll =
5161fec7093SYehuda Sadeh 		container_of(kref, struct rbd_req_coll, kref);
5171fec7093SYehuda Sadeh 
5181fec7093SYehuda Sadeh 	dout("rbd_coll_release %p\n", coll);
5191fec7093SYehuda Sadeh 	kfree(coll);
5201fec7093SYehuda Sadeh }
521602adf40SYehuda Sadeh 
/* Only image formats 1 and 2 are recognized */
static bool rbd_image_format_valid(u32 image_format)
{
	switch (image_format) {
	case 1:
	case 2:
		return true;
	default:
		return false;
	}
}
526a30b71b9SAlex Elder 
5278e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
5288e94af8eSAlex Elder {
529103a150fSAlex Elder 	size_t size;
530103a150fSAlex Elder 	u32 snap_count;
531103a150fSAlex Elder 
532103a150fSAlex Elder 	/* The header has to start with the magic rbd header text */
533103a150fSAlex Elder 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
534103a150fSAlex Elder 		return false;
535103a150fSAlex Elder 
536db2388b6SAlex Elder 	/* The bio layer requires at least sector-sized I/O */
537db2388b6SAlex Elder 
538db2388b6SAlex Elder 	if (ondisk->options.order < SECTOR_SHIFT)
539db2388b6SAlex Elder 		return false;
540db2388b6SAlex Elder 
541db2388b6SAlex Elder 	/* If we use u64 in a few spots we may be able to loosen this */
542db2388b6SAlex Elder 
543db2388b6SAlex Elder 	if (ondisk->options.order > 8 * sizeof (int) - 1)
544db2388b6SAlex Elder 		return false;
545db2388b6SAlex Elder 
546103a150fSAlex Elder 	/*
547103a150fSAlex Elder 	 * The size of a snapshot header has to fit in a size_t, and
548103a150fSAlex Elder 	 * that limits the number of snapshots.
549103a150fSAlex Elder 	 */
550103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
551103a150fSAlex Elder 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
552103a150fSAlex Elder 	if (snap_count > size / sizeof (__le64))
553103a150fSAlex Elder 		return false;
554103a150fSAlex Elder 
555103a150fSAlex Elder 	/*
556103a150fSAlex Elder 	 * Not only that, but the size of the entire the snapshot
557103a150fSAlex Elder 	 * header must also be representable in a size_t.
558103a150fSAlex Elder 	 */
559103a150fSAlex Elder 	size -= snap_count * sizeof (__le64);
560103a150fSAlex Elder 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
561103a150fSAlex Elder 		return false;
562103a150fSAlex Elder 
563103a150fSAlex Elder 	return true;
5648e94af8eSAlex Elder }
5658e94af8eSAlex Elder 
566602adf40SYehuda Sadeh /*
567602adf40SYehuda Sadeh  * Create a new header structure, translate header format from the on-disk
568602adf40SYehuda Sadeh  * header.
569602adf40SYehuda Sadeh  */
570602adf40SYehuda Sadeh static int rbd_header_from_disk(struct rbd_image_header *header,
5714156d998SAlex Elder 				 struct rbd_image_header_ondisk *ondisk)
572602adf40SYehuda Sadeh {
573ccece235SAlex Elder 	u32 snap_count;
57458c17b0eSAlex Elder 	size_t len;
575d2bb24e5SAlex Elder 	size_t size;
576621901d6SAlex Elder 	u32 i;
577602adf40SYehuda Sadeh 
5786a52325fSAlex Elder 	memset(header, 0, sizeof (*header));
5796a52325fSAlex Elder 
580103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
581103a150fSAlex Elder 
58258c17b0eSAlex Elder 	len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
58358c17b0eSAlex Elder 	header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
5846a52325fSAlex Elder 	if (!header->object_prefix)
585602adf40SYehuda Sadeh 		return -ENOMEM;
58658c17b0eSAlex Elder 	memcpy(header->object_prefix, ondisk->object_prefix, len);
58758c17b0eSAlex Elder 	header->object_prefix[len] = '\0';
58800f1f36fSAlex Elder 
589602adf40SYehuda Sadeh 	if (snap_count) {
590f785cc1dSAlex Elder 		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
591f785cc1dSAlex Elder 
592621901d6SAlex Elder 		/* Save a copy of the snapshot names */
593621901d6SAlex Elder 
594f785cc1dSAlex Elder 		if (snap_names_len > (u64) SIZE_MAX)
595f785cc1dSAlex Elder 			return -EIO;
596f785cc1dSAlex Elder 		header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
597602adf40SYehuda Sadeh 		if (!header->snap_names)
5986a52325fSAlex Elder 			goto out_err;
599f785cc1dSAlex Elder 		/*
600f785cc1dSAlex Elder 		 * Note that rbd_dev_v1_header_read() guarantees
601f785cc1dSAlex Elder 		 * the ondisk buffer we're working with has
602f785cc1dSAlex Elder 		 * snap_names_len bytes beyond the end of the
603f785cc1dSAlex Elder 		 * snapshot id array, this memcpy() is safe.
604f785cc1dSAlex Elder 		 */
605f785cc1dSAlex Elder 		memcpy(header->snap_names, &ondisk->snaps[snap_count],
606f785cc1dSAlex Elder 			snap_names_len);
6076a52325fSAlex Elder 
608621901d6SAlex Elder 		/* Record each snapshot's size */
609621901d6SAlex Elder 
610d2bb24e5SAlex Elder 		size = snap_count * sizeof (*header->snap_sizes);
611d2bb24e5SAlex Elder 		header->snap_sizes = kmalloc(size, GFP_KERNEL);
612602adf40SYehuda Sadeh 		if (!header->snap_sizes)
6136a52325fSAlex Elder 			goto out_err;
614621901d6SAlex Elder 		for (i = 0; i < snap_count; i++)
615621901d6SAlex Elder 			header->snap_sizes[i] =
616621901d6SAlex Elder 				le64_to_cpu(ondisk->snaps[i].image_size);
617602adf40SYehuda Sadeh 	} else {
618ccece235SAlex Elder 		WARN_ON(ondisk->snap_names_len);
619602adf40SYehuda Sadeh 		header->snap_names = NULL;
620602adf40SYehuda Sadeh 		header->snap_sizes = NULL;
621602adf40SYehuda Sadeh 	}
622849b4260SAlex Elder 
62334b13184SAlex Elder 	header->features = 0;	/* No features support in v1 images */
624602adf40SYehuda Sadeh 	header->obj_order = ondisk->options.order;
625602adf40SYehuda Sadeh 	header->crypt_type = ondisk->options.crypt_type;
626602adf40SYehuda Sadeh 	header->comp_type = ondisk->options.comp_type;
6276a52325fSAlex Elder 
628621901d6SAlex Elder 	/* Allocate and fill in the snapshot context */
629621901d6SAlex Elder 
630f84344f3SAlex Elder 	header->image_size = le64_to_cpu(ondisk->image_size);
6316a52325fSAlex Elder 	size = sizeof (struct ceph_snap_context);
6326a52325fSAlex Elder 	size += snap_count * sizeof (header->snapc->snaps[0]);
6336a52325fSAlex Elder 	header->snapc = kzalloc(size, GFP_KERNEL);
6346a52325fSAlex Elder 	if (!header->snapc)
6356a52325fSAlex Elder 		goto out_err;
636602adf40SYehuda Sadeh 
637602adf40SYehuda Sadeh 	atomic_set(&header->snapc->nref, 1);
638505cbb9bSAlex Elder 	header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
639602adf40SYehuda Sadeh 	header->snapc->num_snaps = snap_count;
640621901d6SAlex Elder 	for (i = 0; i < snap_count; i++)
641602adf40SYehuda Sadeh 		header->snapc->snaps[i] =
642602adf40SYehuda Sadeh 			le64_to_cpu(ondisk->snaps[i].id);
643602adf40SYehuda Sadeh 
644602adf40SYehuda Sadeh 	return 0;
645602adf40SYehuda Sadeh 
6466a52325fSAlex Elder out_err:
647849b4260SAlex Elder 	kfree(header->snap_sizes);
648ccece235SAlex Elder 	header->snap_sizes = NULL;
649602adf40SYehuda Sadeh 	kfree(header->snap_names);
650ccece235SAlex Elder 	header->snap_names = NULL;
6516a52325fSAlex Elder 	kfree(header->object_prefix);
6526a52325fSAlex Elder 	header->object_prefix = NULL;
653ccece235SAlex Elder 
65400f1f36fSAlex Elder 	return -ENOMEM;
655602adf40SYehuda Sadeh }
656602adf40SYehuda Sadeh 
6578836b995SAlex Elder static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
658602adf40SYehuda Sadeh {
659602adf40SYehuda Sadeh 
660e86924a8SAlex Elder 	struct rbd_snap *snap;
66100f1f36fSAlex Elder 
662e86924a8SAlex Elder 	list_for_each_entry(snap, &rbd_dev->snaps, node) {
663e86924a8SAlex Elder 		if (!strcmp(snap_name, snap->name)) {
6640d7dbfceSAlex Elder 			rbd_dev->spec->snap_id = snap->id;
665e86924a8SAlex Elder 			rbd_dev->mapping.size = snap->size;
66634b13184SAlex Elder 			rbd_dev->mapping.features = snap->features;
66700f1f36fSAlex Elder 
668e86924a8SAlex Elder 			return 0;
669602adf40SYehuda Sadeh 		}
67000f1f36fSAlex Elder 	}
671e86924a8SAlex Elder 
67200f1f36fSAlex Elder 	return -ENOENT;
67300f1f36fSAlex Elder }
674602adf40SYehuda Sadeh 
675819d52bfSAlex Elder static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
676602adf40SYehuda Sadeh {
67778dc447dSAlex Elder 	int ret;
678602adf40SYehuda Sadeh 
6790d7dbfceSAlex Elder 	if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
680cc9d734cSJosh Durgin 		    sizeof (RBD_SNAP_HEAD_NAME))) {
6810d7dbfceSAlex Elder 		rbd_dev->spec->snap_id = CEPH_NOSNAP;
68299c1f08fSAlex Elder 		rbd_dev->mapping.size = rbd_dev->header.image_size;
68334b13184SAlex Elder 		rbd_dev->mapping.features = rbd_dev->header.features;
684e86924a8SAlex Elder 		ret = 0;
685602adf40SYehuda Sadeh 	} else {
6860d7dbfceSAlex Elder 		ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
687602adf40SYehuda Sadeh 		if (ret < 0)
688602adf40SYehuda Sadeh 			goto done;
689f84344f3SAlex Elder 		rbd_dev->mapping.read_only = true;
690602adf40SYehuda Sadeh 	}
691daba5fdbSAlex Elder 	rbd_dev->exists = true;
692602adf40SYehuda Sadeh done:
693602adf40SYehuda Sadeh 	return ret;
694602adf40SYehuda Sadeh }
695602adf40SYehuda Sadeh 
696602adf40SYehuda Sadeh static void rbd_header_free(struct rbd_image_header *header)
697602adf40SYehuda Sadeh {
698849b4260SAlex Elder 	kfree(header->object_prefix);
699d78fd7aeSAlex Elder 	header->object_prefix = NULL;
700602adf40SYehuda Sadeh 	kfree(header->snap_sizes);
701d78fd7aeSAlex Elder 	header->snap_sizes = NULL;
702849b4260SAlex Elder 	kfree(header->snap_names);
703d78fd7aeSAlex Elder 	header->snap_names = NULL;
704d1d25646SJosh Durgin 	ceph_put_snap_context(header->snapc);
705d78fd7aeSAlex Elder 	header->snapc = NULL;
706602adf40SYehuda Sadeh }
707602adf40SYehuda Sadeh 
70865ccfe21SAlex Elder static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
709602adf40SYehuda Sadeh {
71065ccfe21SAlex Elder 	char *name;
71165ccfe21SAlex Elder 	u64 segment;
71265ccfe21SAlex Elder 	int ret;
713602adf40SYehuda Sadeh 
71465ccfe21SAlex Elder 	name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
71565ccfe21SAlex Elder 	if (!name)
71665ccfe21SAlex Elder 		return NULL;
71765ccfe21SAlex Elder 	segment = offset >> rbd_dev->header.obj_order;
71865ccfe21SAlex Elder 	ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
71965ccfe21SAlex Elder 			rbd_dev->header.object_prefix, segment);
72065ccfe21SAlex Elder 	if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
72165ccfe21SAlex Elder 		pr_err("error formatting segment name for #%llu (%d)\n",
72265ccfe21SAlex Elder 			segment, ret);
72365ccfe21SAlex Elder 		kfree(name);
72465ccfe21SAlex Elder 		name = NULL;
72565ccfe21SAlex Elder 	}
726602adf40SYehuda Sadeh 
72765ccfe21SAlex Elder 	return name;
72865ccfe21SAlex Elder }
729602adf40SYehuda Sadeh 
73065ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
73165ccfe21SAlex Elder {
73265ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
733602adf40SYehuda Sadeh 
73465ccfe21SAlex Elder 	return offset & (segment_size - 1);
73565ccfe21SAlex Elder }
73665ccfe21SAlex Elder 
73765ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev,
73865ccfe21SAlex Elder 				u64 offset, u64 length)
73965ccfe21SAlex Elder {
74065ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
74165ccfe21SAlex Elder 
74265ccfe21SAlex Elder 	offset &= segment_size - 1;
74365ccfe21SAlex Elder 
744aafb230eSAlex Elder 	rbd_assert(length <= U64_MAX - offset);
74565ccfe21SAlex Elder 	if (offset + length > segment_size)
74665ccfe21SAlex Elder 		length = segment_size - offset;
74765ccfe21SAlex Elder 
74865ccfe21SAlex Elder 	return length;
749602adf40SYehuda Sadeh }
750602adf40SYehuda Sadeh 
7511fec7093SYehuda Sadeh static int rbd_get_num_segments(struct rbd_image_header *header,
7521fec7093SYehuda Sadeh 				u64 ofs, u64 len)
7531fec7093SYehuda Sadeh {
754df111be6SAlex Elder 	u64 start_seg;
755df111be6SAlex Elder 	u64 end_seg;
756df111be6SAlex Elder 
757df111be6SAlex Elder 	if (!len)
758df111be6SAlex Elder 		return 0;
759df111be6SAlex Elder 	if (len - 1 > U64_MAX - ofs)
760df111be6SAlex Elder 		return -ERANGE;
761df111be6SAlex Elder 
762df111be6SAlex Elder 	start_seg = ofs >> header->obj_order;
763df111be6SAlex Elder 	end_seg = (ofs + len - 1) >> header->obj_order;
764df111be6SAlex Elder 
7651fec7093SYehuda Sadeh 	return end_seg - start_seg + 1;
7661fec7093SYehuda Sadeh }
7671fec7093SYehuda Sadeh 
768602adf40SYehuda Sadeh /*
769029bcbd8SJosh Durgin  * returns the size of an object in the image
770029bcbd8SJosh Durgin  */
771029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header)
772029bcbd8SJosh Durgin {
773029bcbd8SJosh Durgin 	return 1 << header->obj_order;
774029bcbd8SJosh Durgin }
775029bcbd8SJosh Durgin 
776029bcbd8SJosh Durgin /*
777602adf40SYehuda Sadeh  * bio helpers
778602adf40SYehuda Sadeh  */
779602adf40SYehuda Sadeh 
780602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain)
781602adf40SYehuda Sadeh {
782602adf40SYehuda Sadeh 	struct bio *tmp;
783602adf40SYehuda Sadeh 
784602adf40SYehuda Sadeh 	while (chain) {
785602adf40SYehuda Sadeh 		tmp = chain;
786602adf40SYehuda Sadeh 		chain = chain->bi_next;
787602adf40SYehuda Sadeh 		bio_put(tmp);
788602adf40SYehuda Sadeh 	}
789602adf40SYehuda Sadeh }
790602adf40SYehuda Sadeh 
791602adf40SYehuda Sadeh /*
792602adf40SYehuda Sadeh  * zeros a bio chain, starting at specific offset
793602adf40SYehuda Sadeh  */
794602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs)
795602adf40SYehuda Sadeh {
796602adf40SYehuda Sadeh 	struct bio_vec *bv;
797602adf40SYehuda Sadeh 	unsigned long flags;
798602adf40SYehuda Sadeh 	void *buf;
799602adf40SYehuda Sadeh 	int i;
800602adf40SYehuda Sadeh 	int pos = 0;
801602adf40SYehuda Sadeh 
802602adf40SYehuda Sadeh 	while (chain) {
803602adf40SYehuda Sadeh 		bio_for_each_segment(bv, chain, i) {
804602adf40SYehuda Sadeh 			if (pos + bv->bv_len > start_ofs) {
805602adf40SYehuda Sadeh 				int remainder = max(start_ofs - pos, 0);
806602adf40SYehuda Sadeh 				buf = bvec_kmap_irq(bv, &flags);
807602adf40SYehuda Sadeh 				memset(buf + remainder, 0,
808602adf40SYehuda Sadeh 				       bv->bv_len - remainder);
80985b5aaa6SDan Carpenter 				bvec_kunmap_irq(buf, &flags);
810602adf40SYehuda Sadeh 			}
811602adf40SYehuda Sadeh 			pos += bv->bv_len;
812602adf40SYehuda Sadeh 		}
813602adf40SYehuda Sadeh 
814602adf40SYehuda Sadeh 		chain = chain->bi_next;
815602adf40SYehuda Sadeh 	}
816602adf40SYehuda Sadeh }
817602adf40SYehuda Sadeh 
818602adf40SYehuda Sadeh /*
819f7760dadSAlex Elder  * Clone a portion of a bio, starting at the given byte offset
820f7760dadSAlex Elder  * and continuing for the number of bytes indicated.
821602adf40SYehuda Sadeh  */
822f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src,
823f7760dadSAlex Elder 					unsigned int offset,
824f7760dadSAlex Elder 					unsigned int len,
825f7760dadSAlex Elder 					gfp_t gfpmask)
826602adf40SYehuda Sadeh {
827f7760dadSAlex Elder 	struct bio_vec *bv;
828f7760dadSAlex Elder 	unsigned int resid;
829f7760dadSAlex Elder 	unsigned short idx;
830f7760dadSAlex Elder 	unsigned int voff;
831f7760dadSAlex Elder 	unsigned short end_idx;
832f7760dadSAlex Elder 	unsigned short vcnt;
833f7760dadSAlex Elder 	struct bio *bio;
834602adf40SYehuda Sadeh 
835f7760dadSAlex Elder 	/* Handle the easy case for the caller */
836f7760dadSAlex Elder 
837f7760dadSAlex Elder 	if (!offset && len == bio_src->bi_size)
838f7760dadSAlex Elder 		return bio_clone(bio_src, gfpmask);
839f7760dadSAlex Elder 
840f7760dadSAlex Elder 	if (WARN_ON_ONCE(!len))
841f7760dadSAlex Elder 		return NULL;
842f7760dadSAlex Elder 	if (WARN_ON_ONCE(len > bio_src->bi_size))
843f7760dadSAlex Elder 		return NULL;
844f7760dadSAlex Elder 	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
845f7760dadSAlex Elder 		return NULL;
846f7760dadSAlex Elder 
847f7760dadSAlex Elder 	/* Find first affected segment... */
848f7760dadSAlex Elder 
849f7760dadSAlex Elder 	resid = offset;
850f7760dadSAlex Elder 	__bio_for_each_segment(bv, bio_src, idx, 0) {
851f7760dadSAlex Elder 		if (resid < bv->bv_len)
852f7760dadSAlex Elder 			break;
853f7760dadSAlex Elder 		resid -= bv->bv_len;
854602adf40SYehuda Sadeh 	}
855f7760dadSAlex Elder 	voff = resid;
856602adf40SYehuda Sadeh 
857f7760dadSAlex Elder 	/* ...and the last affected segment */
858542582fcSAlex Elder 
859f7760dadSAlex Elder 	resid += len;
860f7760dadSAlex Elder 	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
861f7760dadSAlex Elder 		if (resid <= bv->bv_len)
862f7760dadSAlex Elder 			break;
863f7760dadSAlex Elder 		resid -= bv->bv_len;
864f7760dadSAlex Elder 	}
865f7760dadSAlex Elder 	vcnt = end_idx - idx + 1;
866602adf40SYehuda Sadeh 
867f7760dadSAlex Elder 	/* Build the clone */
868f7760dadSAlex Elder 
869f7760dadSAlex Elder 	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
870f7760dadSAlex Elder 	if (!bio)
871f7760dadSAlex Elder 		return NULL;	/* ENOMEM */
872f7760dadSAlex Elder 
873f7760dadSAlex Elder 	bio->bi_bdev = bio_src->bi_bdev;
874f7760dadSAlex Elder 	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
875f7760dadSAlex Elder 	bio->bi_rw = bio_src->bi_rw;
876f7760dadSAlex Elder 	bio->bi_flags |= 1 << BIO_CLONED;
877602adf40SYehuda Sadeh 
878602adf40SYehuda Sadeh 	/*
879f7760dadSAlex Elder 	 * Copy over our part of the bio_vec, then update the first
880f7760dadSAlex Elder 	 * and last (or only) entries.
881602adf40SYehuda Sadeh 	 */
882f7760dadSAlex Elder 	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
883f7760dadSAlex Elder 			vcnt * sizeof (struct bio_vec));
884f7760dadSAlex Elder 	bio->bi_io_vec[0].bv_offset += voff;
885f7760dadSAlex Elder 	if (vcnt > 1) {
886f7760dadSAlex Elder 		bio->bi_io_vec[0].bv_len -= voff;
887f7760dadSAlex Elder 		bio->bi_io_vec[vcnt - 1].bv_len = resid;
888602adf40SYehuda Sadeh 	} else {
889f7760dadSAlex Elder 		bio->bi_io_vec[0].bv_len = len;
890602adf40SYehuda Sadeh 	}
891602adf40SYehuda Sadeh 
892f7760dadSAlex Elder 	bio->bi_vcnt = vcnt;
893f7760dadSAlex Elder 	bio->bi_size = len;
894f7760dadSAlex Elder 	bio->bi_idx = 0;
895602adf40SYehuda Sadeh 
896f7760dadSAlex Elder 	return bio;
897602adf40SYehuda Sadeh }
898602adf40SYehuda Sadeh 
899f7760dadSAlex Elder /*
900f7760dadSAlex Elder  * Clone a portion of a bio chain, starting at the given byte offset
901f7760dadSAlex Elder  * into the first bio in the source chain and continuing for the
902f7760dadSAlex Elder  * number of bytes indicated.  The result is another bio chain of
903f7760dadSAlex Elder  * exactly the given length, or a null pointer on error.
904f7760dadSAlex Elder  *
905f7760dadSAlex Elder  * The bio_src and offset parameters are both in-out.  On entry they
906f7760dadSAlex Elder  * refer to the first source bio and the offset into that bio where
907f7760dadSAlex Elder  * the start of data to be cloned is located.
908f7760dadSAlex Elder  *
909f7760dadSAlex Elder  * On return, bio_src is updated to refer to the bio in the source
910f7760dadSAlex Elder  * chain that contains first un-cloned byte, and *offset will
911f7760dadSAlex Elder  * contain the offset of that byte within that bio.
912f7760dadSAlex Elder  */
913f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src,
914f7760dadSAlex Elder 					unsigned int *offset,
915f7760dadSAlex Elder 					unsigned int len,
916f7760dadSAlex Elder 					gfp_t gfpmask)
917f7760dadSAlex Elder {
918f7760dadSAlex Elder 	struct bio *bi = *bio_src;
919f7760dadSAlex Elder 	unsigned int off = *offset;
920f7760dadSAlex Elder 	struct bio *chain = NULL;
921f7760dadSAlex Elder 	struct bio **end;
922602adf40SYehuda Sadeh 
923f7760dadSAlex Elder 	/* Build up a chain of clone bios up to the limit */
924602adf40SYehuda Sadeh 
925f7760dadSAlex Elder 	if (!bi || off >= bi->bi_size || !len)
926f7760dadSAlex Elder 		return NULL;		/* Nothing to clone */
927602adf40SYehuda Sadeh 
928f7760dadSAlex Elder 	end = &chain;
929f7760dadSAlex Elder 	while (len) {
930f7760dadSAlex Elder 		unsigned int bi_size;
931f7760dadSAlex Elder 		struct bio *bio;
932f7760dadSAlex Elder 
933f7760dadSAlex Elder 		if (!bi)
934f7760dadSAlex Elder 			goto out_err;	/* EINVAL; ran out of bio's */
935f7760dadSAlex Elder 		bi_size = min_t(unsigned int, bi->bi_size - off, len);
936f7760dadSAlex Elder 		bio = bio_clone_range(bi, off, bi_size, gfpmask);
937f7760dadSAlex Elder 		if (!bio)
938f7760dadSAlex Elder 			goto out_err;	/* ENOMEM */
939f7760dadSAlex Elder 
940f7760dadSAlex Elder 		*end = bio;
941f7760dadSAlex Elder 		end = &bio->bi_next;
942f7760dadSAlex Elder 
943f7760dadSAlex Elder 		off += bi_size;
944f7760dadSAlex Elder 		if (off == bi->bi_size) {
945f7760dadSAlex Elder 			bi = bi->bi_next;
946f7760dadSAlex Elder 			off = 0;
947f7760dadSAlex Elder 		}
948f7760dadSAlex Elder 		len -= bi_size;
949f7760dadSAlex Elder 	}
950f7760dadSAlex Elder 	*bio_src = bi;
951f7760dadSAlex Elder 	*offset = off;
952f7760dadSAlex Elder 
953f7760dadSAlex Elder 	return chain;
954f7760dadSAlex Elder out_err:
955f7760dadSAlex Elder 	bio_chain_put(chain);
956f7760dadSAlex Elder 
957602adf40SYehuda Sadeh 	return NULL;
958602adf40SYehuda Sadeh }
959602adf40SYehuda Sadeh 
960602adf40SYehuda Sadeh /*
961602adf40SYehuda Sadeh  * helpers for osd request op vectors.
962602adf40SYehuda Sadeh  */
96357cfc106SAlex Elder static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
96457cfc106SAlex Elder 					int opcode, u32 payload_len)
965602adf40SYehuda Sadeh {
96657cfc106SAlex Elder 	struct ceph_osd_req_op *ops;
96757cfc106SAlex Elder 
96857cfc106SAlex Elder 	ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
96957cfc106SAlex Elder 	if (!ops)
97057cfc106SAlex Elder 		return NULL;
97157cfc106SAlex Elder 
97257cfc106SAlex Elder 	ops[0].op = opcode;
97357cfc106SAlex Elder 
974602adf40SYehuda Sadeh 	/*
975602adf40SYehuda Sadeh 	 * op extent offset and length will be set later on
976602adf40SYehuda Sadeh 	 * in calc_raw_layout()
977602adf40SYehuda Sadeh 	 */
97857cfc106SAlex Elder 	ops[0].payload_len = payload_len;
97957cfc106SAlex Elder 
98057cfc106SAlex Elder 	return ops;
981602adf40SYehuda Sadeh }
982602adf40SYehuda Sadeh 
/* Free an op vector; a null pointer is accepted (kfree() semantics). */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
987602adf40SYehuda Sadeh 
9881fec7093SYehuda Sadeh static void rbd_coll_end_req_index(struct request *rq,
9891fec7093SYehuda Sadeh 				   struct rbd_req_coll *coll,
9901fec7093SYehuda Sadeh 				   int index,
9911fec7093SYehuda Sadeh 				   int ret, u64 len)
9921fec7093SYehuda Sadeh {
9931fec7093SYehuda Sadeh 	struct request_queue *q;
9941fec7093SYehuda Sadeh 	int min, max, i;
9951fec7093SYehuda Sadeh 
996bd919d45SAlex Elder 	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
997bd919d45SAlex Elder 	     coll, index, ret, (unsigned long long) len);
9981fec7093SYehuda Sadeh 
9991fec7093SYehuda Sadeh 	if (!rq)
10001fec7093SYehuda Sadeh 		return;
10011fec7093SYehuda Sadeh 
10021fec7093SYehuda Sadeh 	if (!coll) {
10031fec7093SYehuda Sadeh 		blk_end_request(rq, ret, len);
10041fec7093SYehuda Sadeh 		return;
10051fec7093SYehuda Sadeh 	}
10061fec7093SYehuda Sadeh 
10071fec7093SYehuda Sadeh 	q = rq->q;
10081fec7093SYehuda Sadeh 
10091fec7093SYehuda Sadeh 	spin_lock_irq(q->queue_lock);
10101fec7093SYehuda Sadeh 	coll->status[index].done = 1;
10111fec7093SYehuda Sadeh 	coll->status[index].rc = ret;
10121fec7093SYehuda Sadeh 	coll->status[index].bytes = len;
10131fec7093SYehuda Sadeh 	max = min = coll->num_done;
10141fec7093SYehuda Sadeh 	while (max < coll->total && coll->status[max].done)
10151fec7093SYehuda Sadeh 		max++;
10161fec7093SYehuda Sadeh 
10171fec7093SYehuda Sadeh 	for (i = min; i<max; i++) {
10181fec7093SYehuda Sadeh 		__blk_end_request(rq, coll->status[i].rc,
10191fec7093SYehuda Sadeh 				  coll->status[i].bytes);
10201fec7093SYehuda Sadeh 		coll->num_done++;
10211fec7093SYehuda Sadeh 		kref_put(&coll->kref, rbd_coll_release);
10221fec7093SYehuda Sadeh 	}
10231fec7093SYehuda Sadeh 	spin_unlock_irq(q->queue_lock);
10241fec7093SYehuda Sadeh }
10251fec7093SYehuda Sadeh 
10261fec7093SYehuda Sadeh static void rbd_coll_end_req(struct rbd_request *req,
10271fec7093SYehuda Sadeh 			     int ret, u64 len)
10281fec7093SYehuda Sadeh {
10291fec7093SYehuda Sadeh 	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
10301fec7093SYehuda Sadeh }
10311fec7093SYehuda Sadeh 
1032602adf40SYehuda Sadeh /*
1033602adf40SYehuda Sadeh  * Send ceph osd request
1034602adf40SYehuda Sadeh  */
1035602adf40SYehuda Sadeh static int rbd_do_request(struct request *rq,
10360ce1a794SAlex Elder 			  struct rbd_device *rbd_dev,
1037602adf40SYehuda Sadeh 			  struct ceph_snap_context *snapc,
1038602adf40SYehuda Sadeh 			  u64 snapid,
1039aded07eaSAlex Elder 			  const char *object_name, u64 ofs, u64 len,
1040602adf40SYehuda Sadeh 			  struct bio *bio,
1041602adf40SYehuda Sadeh 			  struct page **pages,
1042602adf40SYehuda Sadeh 			  int num_pages,
1043602adf40SYehuda Sadeh 			  int flags,
1044602adf40SYehuda Sadeh 			  struct ceph_osd_req_op *ops,
10451fec7093SYehuda Sadeh 			  struct rbd_req_coll *coll,
10461fec7093SYehuda Sadeh 			  int coll_index,
1047602adf40SYehuda Sadeh 			  void (*rbd_cb)(struct ceph_osd_request *req,
104859c2be1eSYehuda Sadeh 					 struct ceph_msg *msg),
104959c2be1eSYehuda Sadeh 			  struct ceph_osd_request **linger_req,
105059c2be1eSYehuda Sadeh 			  u64 *ver)
1051602adf40SYehuda Sadeh {
1052602adf40SYehuda Sadeh 	struct ceph_osd_request *req;
1053602adf40SYehuda Sadeh 	struct ceph_file_layout *layout;
1054602adf40SYehuda Sadeh 	int ret;
1055602adf40SYehuda Sadeh 	u64 bno;
1056602adf40SYehuda Sadeh 	struct timespec mtime = CURRENT_TIME;
1057602adf40SYehuda Sadeh 	struct rbd_request *req_data;
1058602adf40SYehuda Sadeh 	struct ceph_osd_request_head *reqhead;
10591dbb4399SAlex Elder 	struct ceph_osd_client *osdc;
1060602adf40SYehuda Sadeh 
1061602adf40SYehuda Sadeh 	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
10621fec7093SYehuda Sadeh 	if (!req_data) {
10631fec7093SYehuda Sadeh 		if (coll)
10641fec7093SYehuda Sadeh 			rbd_coll_end_req_index(rq, coll, coll_index,
10651fec7093SYehuda Sadeh 					       -ENOMEM, len);
10661fec7093SYehuda Sadeh 		return -ENOMEM;
10671fec7093SYehuda Sadeh 	}
1068602adf40SYehuda Sadeh 
10691fec7093SYehuda Sadeh 	if (coll) {
10701fec7093SYehuda Sadeh 		req_data->coll = coll;
10711fec7093SYehuda Sadeh 		req_data->coll_index = coll_index;
10721fec7093SYehuda Sadeh 	}
10731fec7093SYehuda Sadeh 
1074f7760dadSAlex Elder 	dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
1075f7760dadSAlex Elder 		object_name, (unsigned long long) ofs,
1076f7760dadSAlex Elder 		(unsigned long long) len, coll, coll_index);
1077602adf40SYehuda Sadeh 
10780ce1a794SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
10791dbb4399SAlex Elder 	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
10801dbb4399SAlex Elder 					false, GFP_NOIO, pages, bio);
10814ad12621SSage Weil 	if (!req) {
10824ad12621SSage Weil 		ret = -ENOMEM;
1083602adf40SYehuda Sadeh 		goto done_pages;
1084602adf40SYehuda Sadeh 	}
1085602adf40SYehuda Sadeh 
1086602adf40SYehuda Sadeh 	req->r_callback = rbd_cb;
1087602adf40SYehuda Sadeh 
1088602adf40SYehuda Sadeh 	req_data->rq = rq;
1089602adf40SYehuda Sadeh 	req_data->bio = bio;
1090602adf40SYehuda Sadeh 	req_data->pages = pages;
1091602adf40SYehuda Sadeh 	req_data->len = len;
1092602adf40SYehuda Sadeh 
1093602adf40SYehuda Sadeh 	req->r_priv = req_data;
1094602adf40SYehuda Sadeh 
1095602adf40SYehuda Sadeh 	reqhead = req->r_request->front.iov_base;
1096602adf40SYehuda Sadeh 	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
1097602adf40SYehuda Sadeh 
1098aded07eaSAlex Elder 	strncpy(req->r_oid, object_name, sizeof(req->r_oid));
1099602adf40SYehuda Sadeh 	req->r_oid_len = strlen(req->r_oid);
1100602adf40SYehuda Sadeh 
1101602adf40SYehuda Sadeh 	layout = &req->r_file_layout;
1102602adf40SYehuda Sadeh 	memset(layout, 0, sizeof(*layout));
1103602adf40SYehuda Sadeh 	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
1104602adf40SYehuda Sadeh 	layout->fl_stripe_count = cpu_to_le32(1);
1105602adf40SYehuda Sadeh 	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
11060d7dbfceSAlex Elder 	layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->spec->pool_id);
11076cae3717SSage Weil 	ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
11081dbb4399SAlex Elder 				   req, ops);
11096cae3717SSage Weil 	rbd_assert(ret == 0);
1110602adf40SYehuda Sadeh 
1111602adf40SYehuda Sadeh 	ceph_osdc_build_request(req, ofs, &len,
1112602adf40SYehuda Sadeh 				ops,
1113602adf40SYehuda Sadeh 				snapc,
1114602adf40SYehuda Sadeh 				&mtime,
1115602adf40SYehuda Sadeh 				req->r_oid, req->r_oid_len);
1116602adf40SYehuda Sadeh 
111759c2be1eSYehuda Sadeh 	if (linger_req) {
11181dbb4399SAlex Elder 		ceph_osdc_set_request_linger(osdc, req);
111959c2be1eSYehuda Sadeh 		*linger_req = req;
112059c2be1eSYehuda Sadeh 	}
112159c2be1eSYehuda Sadeh 
11221dbb4399SAlex Elder 	ret = ceph_osdc_start_request(osdc, req, false);
1123602adf40SYehuda Sadeh 	if (ret < 0)
1124602adf40SYehuda Sadeh 		goto done_err;
1125602adf40SYehuda Sadeh 
1126602adf40SYehuda Sadeh 	if (!rbd_cb) {
11271dbb4399SAlex Elder 		ret = ceph_osdc_wait_request(osdc, req);
112859c2be1eSYehuda Sadeh 		if (ver)
112959c2be1eSYehuda Sadeh 			*ver = le64_to_cpu(req->r_reassert_version.version);
1130bd919d45SAlex Elder 		dout("reassert_ver=%llu\n",
1131bd919d45SAlex Elder 			(unsigned long long)
11321fec7093SYehuda Sadeh 				le64_to_cpu(req->r_reassert_version.version));
1133602adf40SYehuda Sadeh 		ceph_osdc_put_request(req);
1134602adf40SYehuda Sadeh 	}
1135602adf40SYehuda Sadeh 	return ret;
1136602adf40SYehuda Sadeh 
1137602adf40SYehuda Sadeh done_err:
1138602adf40SYehuda Sadeh 	bio_chain_put(req_data->bio);
1139602adf40SYehuda Sadeh 	ceph_osdc_put_request(req);
1140602adf40SYehuda Sadeh done_pages:
11411fec7093SYehuda Sadeh 	rbd_coll_end_req(req_data, ret, len);
1142602adf40SYehuda Sadeh 	kfree(req_data);
1143602adf40SYehuda Sadeh 	return ret;
1144602adf40SYehuda Sadeh }
1145602adf40SYehuda Sadeh 
1146602adf40SYehuda Sadeh /*
1147602adf40SYehuda Sadeh  * Ceph osd op callback
1148602adf40SYehuda Sadeh  */
1149602adf40SYehuda Sadeh static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
1150602adf40SYehuda Sadeh {
1151602adf40SYehuda Sadeh 	struct rbd_request *req_data = req->r_priv;
1152602adf40SYehuda Sadeh 	struct ceph_osd_reply_head *replyhead;
1153602adf40SYehuda Sadeh 	struct ceph_osd_op *op;
1154602adf40SYehuda Sadeh 	__s32 rc;
1155602adf40SYehuda Sadeh 	u64 bytes;
1156602adf40SYehuda Sadeh 	int read_op;
1157602adf40SYehuda Sadeh 
1158602adf40SYehuda Sadeh 	/* parse reply */
1159602adf40SYehuda Sadeh 	replyhead = msg->front.iov_base;
1160602adf40SYehuda Sadeh 	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
1161602adf40SYehuda Sadeh 	op = (void *)(replyhead + 1);
1162602adf40SYehuda Sadeh 	rc = le32_to_cpu(replyhead->result);
1163602adf40SYehuda Sadeh 	bytes = le64_to_cpu(op->extent.length);
1164895cfcc8SDan Carpenter 	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
1165602adf40SYehuda Sadeh 
1166bd919d45SAlex Elder 	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
1167bd919d45SAlex Elder 		(unsigned long long) bytes, read_op, (int) rc);
1168602adf40SYehuda Sadeh 
1169602adf40SYehuda Sadeh 	if (rc == -ENOENT && read_op) {
1170602adf40SYehuda Sadeh 		zero_bio_chain(req_data->bio, 0);
1171602adf40SYehuda Sadeh 		rc = 0;
1172602adf40SYehuda Sadeh 	} else if (rc == 0 && read_op && bytes < req_data->len) {
1173602adf40SYehuda Sadeh 		zero_bio_chain(req_data->bio, bytes);
1174602adf40SYehuda Sadeh 		bytes = req_data->len;
1175602adf40SYehuda Sadeh 	}
1176602adf40SYehuda Sadeh 
11771fec7093SYehuda Sadeh 	rbd_coll_end_req(req_data, rc, bytes);
1178602adf40SYehuda Sadeh 
1179602adf40SYehuda Sadeh 	if (req_data->bio)
1180602adf40SYehuda Sadeh 		bio_chain_put(req_data->bio);
1181602adf40SYehuda Sadeh 
1182602adf40SYehuda Sadeh 	ceph_osdc_put_request(req);
1183602adf40SYehuda Sadeh 	kfree(req_data);
1184602adf40SYehuda Sadeh }
1185602adf40SYehuda Sadeh 
118659c2be1eSYehuda Sadeh static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
118759c2be1eSYehuda Sadeh {
118859c2be1eSYehuda Sadeh 	ceph_osdc_put_request(req);
118959c2be1eSYehuda Sadeh }
119059c2be1eSYehuda Sadeh 
1191602adf40SYehuda Sadeh /*
1192602adf40SYehuda Sadeh  * Do a synchronous ceph osd operation
1193602adf40SYehuda Sadeh  */
11940ce1a794SAlex Elder static int rbd_req_sync_op(struct rbd_device *rbd_dev,
1195602adf40SYehuda Sadeh 			   struct ceph_snap_context *snapc,
1196602adf40SYehuda Sadeh 			   u64 snapid,
1197602adf40SYehuda Sadeh 			   int flags,
1198913d2fdcSAlex Elder 			   struct ceph_osd_req_op *ops,
1199aded07eaSAlex Elder 			   const char *object_name,
1200f8d4de6eSAlex Elder 			   u64 ofs, u64 inbound_size,
1201f8d4de6eSAlex Elder 			   char *inbound,
120259c2be1eSYehuda Sadeh 			   struct ceph_osd_request **linger_req,
120359c2be1eSYehuda Sadeh 			   u64 *ver)
1204602adf40SYehuda Sadeh {
1205602adf40SYehuda Sadeh 	int ret;
1206602adf40SYehuda Sadeh 	struct page **pages;
1207602adf40SYehuda Sadeh 	int num_pages;
1208913d2fdcSAlex Elder 
1209aafb230eSAlex Elder 	rbd_assert(ops != NULL);
1210602adf40SYehuda Sadeh 
1211f8d4de6eSAlex Elder 	num_pages = calc_pages_for(ofs, inbound_size);
1212602adf40SYehuda Sadeh 	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
1213b8d0638aSDan Carpenter 	if (IS_ERR(pages))
1214b8d0638aSDan Carpenter 		return PTR_ERR(pages);
1215602adf40SYehuda Sadeh 
12160ce1a794SAlex Elder 	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
1217f8d4de6eSAlex Elder 			  object_name, ofs, inbound_size, NULL,
1218602adf40SYehuda Sadeh 			  pages, num_pages,
1219602adf40SYehuda Sadeh 			  flags,
1220602adf40SYehuda Sadeh 			  ops,
12211fec7093SYehuda Sadeh 			  NULL, 0,
122259c2be1eSYehuda Sadeh 			  NULL,
122359c2be1eSYehuda Sadeh 			  linger_req, ver);
1224602adf40SYehuda Sadeh 	if (ret < 0)
1225913d2fdcSAlex Elder 		goto done;
1226602adf40SYehuda Sadeh 
1227f8d4de6eSAlex Elder 	if ((flags & CEPH_OSD_FLAG_READ) && inbound)
1228f8d4de6eSAlex Elder 		ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);
1229602adf40SYehuda Sadeh 
1230602adf40SYehuda Sadeh done:
1231602adf40SYehuda Sadeh 	ceph_release_page_vector(pages, num_pages);
1232602adf40SYehuda Sadeh 	return ret;
1233602adf40SYehuda Sadeh }
1234602adf40SYehuda Sadeh 
1235602adf40SYehuda Sadeh /*
1236602adf40SYehuda Sadeh  * Do an asynchronous ceph osd operation
1237602adf40SYehuda Sadeh  */
1238602adf40SYehuda Sadeh static int rbd_do_op(struct request *rq,
1239602adf40SYehuda Sadeh 		     struct rbd_device *rbd_dev,
1240602adf40SYehuda Sadeh 		     struct ceph_snap_context *snapc,
1241602adf40SYehuda Sadeh 		     u64 ofs, u64 len,
12421fec7093SYehuda Sadeh 		     struct bio *bio,
12431fec7093SYehuda Sadeh 		     struct rbd_req_coll *coll,
12441fec7093SYehuda Sadeh 		     int coll_index)
1245602adf40SYehuda Sadeh {
1246602adf40SYehuda Sadeh 	char *seg_name;
1247602adf40SYehuda Sadeh 	u64 seg_ofs;
1248602adf40SYehuda Sadeh 	u64 seg_len;
1249602adf40SYehuda Sadeh 	int ret;
1250602adf40SYehuda Sadeh 	struct ceph_osd_req_op *ops;
1251602adf40SYehuda Sadeh 	u32 payload_len;
1252ff2e4bb5SAlex Elder 	int opcode;
1253ff2e4bb5SAlex Elder 	int flags;
12544634246dSAlex Elder 	u64 snapid;
1255602adf40SYehuda Sadeh 
125665ccfe21SAlex Elder 	seg_name = rbd_segment_name(rbd_dev, ofs);
1257602adf40SYehuda Sadeh 	if (!seg_name)
1258602adf40SYehuda Sadeh 		return -ENOMEM;
125965ccfe21SAlex Elder 	seg_len = rbd_segment_length(rbd_dev, ofs, len);
126065ccfe21SAlex Elder 	seg_ofs = rbd_segment_offset(rbd_dev, ofs);
1261602adf40SYehuda Sadeh 
1262ff2e4bb5SAlex Elder 	if (rq_data_dir(rq) == WRITE) {
1263ff2e4bb5SAlex Elder 		opcode = CEPH_OSD_OP_WRITE;
1264ff2e4bb5SAlex Elder 		flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
12654634246dSAlex Elder 		snapid = CEPH_NOSNAP;
1266ff2e4bb5SAlex Elder 		payload_len = seg_len;
1267ff2e4bb5SAlex Elder 	} else {
1268ff2e4bb5SAlex Elder 		opcode = CEPH_OSD_OP_READ;
1269ff2e4bb5SAlex Elder 		flags = CEPH_OSD_FLAG_READ;
12704634246dSAlex Elder 		snapc = NULL;
12710d7dbfceSAlex Elder 		snapid = rbd_dev->spec->snap_id;
1272ff2e4bb5SAlex Elder 		payload_len = 0;
1273ff2e4bb5SAlex Elder 	}
1274602adf40SYehuda Sadeh 
127557cfc106SAlex Elder 	ret = -ENOMEM;
127657cfc106SAlex Elder 	ops = rbd_create_rw_ops(1, opcode, payload_len);
127757cfc106SAlex Elder 	if (!ops)
1278602adf40SYehuda Sadeh 		goto done;
1279602adf40SYehuda Sadeh 
1280602adf40SYehuda Sadeh 	/* we've taken care of segment sizes earlier when we
1281602adf40SYehuda Sadeh 	   cloned the bios. We should never have a segment
1282602adf40SYehuda Sadeh 	   truncated at this point */
1283aafb230eSAlex Elder 	rbd_assert(seg_len == len);
1284602adf40SYehuda Sadeh 
1285602adf40SYehuda Sadeh 	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1286602adf40SYehuda Sadeh 			     seg_name, seg_ofs, seg_len,
1287602adf40SYehuda Sadeh 			     bio,
1288602adf40SYehuda Sadeh 			     NULL, 0,
1289602adf40SYehuda Sadeh 			     flags,
1290602adf40SYehuda Sadeh 			     ops,
12911fec7093SYehuda Sadeh 			     coll, coll_index,
129259c2be1eSYehuda Sadeh 			     rbd_req_cb, 0, NULL);
129311f77002SSage Weil 
129411f77002SSage Weil 	rbd_destroy_ops(ops);
1295602adf40SYehuda Sadeh done:
1296602adf40SYehuda Sadeh 	kfree(seg_name);
1297602adf40SYehuda Sadeh 	return ret;
1298602adf40SYehuda Sadeh }
1299602adf40SYehuda Sadeh 
1300602adf40SYehuda Sadeh /*
1301602adf40SYehuda Sadeh  * Request sync osd read
1302602adf40SYehuda Sadeh  */
13030ce1a794SAlex Elder static int rbd_req_sync_read(struct rbd_device *rbd_dev,
1304602adf40SYehuda Sadeh 			  u64 snapid,
1305aded07eaSAlex Elder 			  const char *object_name,
1306602adf40SYehuda Sadeh 			  u64 ofs, u64 len,
130759c2be1eSYehuda Sadeh 			  char *buf,
130859c2be1eSYehuda Sadeh 			  u64 *ver)
1309602adf40SYehuda Sadeh {
1310913d2fdcSAlex Elder 	struct ceph_osd_req_op *ops;
1311913d2fdcSAlex Elder 	int ret;
1312913d2fdcSAlex Elder 
1313913d2fdcSAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1314913d2fdcSAlex Elder 	if (!ops)
1315913d2fdcSAlex Elder 		return -ENOMEM;
1316913d2fdcSAlex Elder 
1317913d2fdcSAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
1318b06e6a6bSJosh Durgin 			       snapid,
1319602adf40SYehuda Sadeh 			       CEPH_OSD_FLAG_READ,
1320913d2fdcSAlex Elder 			       ops, object_name, ofs, len, buf, NULL, ver);
1321913d2fdcSAlex Elder 	rbd_destroy_ops(ops);
1322913d2fdcSAlex Elder 
1323913d2fdcSAlex Elder 	return ret;
1324602adf40SYehuda Sadeh }
1325602adf40SYehuda Sadeh 
1326602adf40SYehuda Sadeh /*
132759c2be1eSYehuda Sadeh  * Request sync osd watch
132859c2be1eSYehuda Sadeh  */
13290ce1a794SAlex Elder static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
133059c2be1eSYehuda Sadeh 				   u64 ver,
13317f0a24d8SAlex Elder 				   u64 notify_id)
133259c2be1eSYehuda Sadeh {
133359c2be1eSYehuda Sadeh 	struct ceph_osd_req_op *ops;
133411f77002SSage Weil 	int ret;
133511f77002SSage Weil 
133657cfc106SAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
133757cfc106SAlex Elder 	if (!ops)
133857cfc106SAlex Elder 		return -ENOMEM;
133959c2be1eSYehuda Sadeh 
1340a71b891bSJosh Durgin 	ops[0].watch.ver = cpu_to_le64(ver);
134159c2be1eSYehuda Sadeh 	ops[0].watch.cookie = notify_id;
134259c2be1eSYehuda Sadeh 	ops[0].watch.flag = 0;
134359c2be1eSYehuda Sadeh 
13440ce1a794SAlex Elder 	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
13457f0a24d8SAlex Elder 			  rbd_dev->header_name, 0, 0, NULL,
1346ad4f232fSAlex Elder 			  NULL, 0,
134759c2be1eSYehuda Sadeh 			  CEPH_OSD_FLAG_READ,
134859c2be1eSYehuda Sadeh 			  ops,
13491fec7093SYehuda Sadeh 			  NULL, 0,
135059c2be1eSYehuda Sadeh 			  rbd_simple_req_cb, 0, NULL);
135159c2be1eSYehuda Sadeh 
135259c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
135359c2be1eSYehuda Sadeh 	return ret;
135459c2be1eSYehuda Sadeh }
135559c2be1eSYehuda Sadeh 
135659c2be1eSYehuda Sadeh static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
135759c2be1eSYehuda Sadeh {
13580ce1a794SAlex Elder 	struct rbd_device *rbd_dev = (struct rbd_device *)data;
1359a71b891bSJosh Durgin 	u64 hver;
136013143d2dSSage Weil 	int rc;
136113143d2dSSage Weil 
13620ce1a794SAlex Elder 	if (!rbd_dev)
136359c2be1eSYehuda Sadeh 		return;
136459c2be1eSYehuda Sadeh 
1365bd919d45SAlex Elder 	dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1366bd919d45SAlex Elder 		rbd_dev->header_name, (unsigned long long) notify_id,
1367bd919d45SAlex Elder 		(unsigned int) opcode);
1368117973fbSAlex Elder 	rc = rbd_dev_refresh(rbd_dev, &hver);
136913143d2dSSage Weil 	if (rc)
1370f0f8cef5SAlex Elder 		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
13710ce1a794SAlex Elder 			   " update snaps: %d\n", rbd_dev->major, rc);
137259c2be1eSYehuda Sadeh 
13737f0a24d8SAlex Elder 	rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
137459c2be1eSYehuda Sadeh }
137559c2be1eSYehuda Sadeh 
137659c2be1eSYehuda Sadeh /*
137759c2be1eSYehuda Sadeh  * Request sync osd watch
137859c2be1eSYehuda Sadeh  */
13790e6f322dSAlex Elder static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
138059c2be1eSYehuda Sadeh {
138159c2be1eSYehuda Sadeh 	struct ceph_osd_req_op *ops;
13820ce1a794SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
138357cfc106SAlex Elder 	int ret;
138459c2be1eSYehuda Sadeh 
138557cfc106SAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
138657cfc106SAlex Elder 	if (!ops)
138757cfc106SAlex Elder 		return -ENOMEM;
138859c2be1eSYehuda Sadeh 
138959c2be1eSYehuda Sadeh 	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
13900ce1a794SAlex Elder 				     (void *)rbd_dev, &rbd_dev->watch_event);
139159c2be1eSYehuda Sadeh 	if (ret < 0)
139259c2be1eSYehuda Sadeh 		goto fail;
139359c2be1eSYehuda Sadeh 
13940e6f322dSAlex Elder 	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
13950ce1a794SAlex Elder 	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
139659c2be1eSYehuda Sadeh 	ops[0].watch.flag = 1;
139759c2be1eSYehuda Sadeh 
13980ce1a794SAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
139959c2be1eSYehuda Sadeh 			      CEPH_NOSNAP,
140059c2be1eSYehuda Sadeh 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
140159c2be1eSYehuda Sadeh 			      ops,
14020e6f322dSAlex Elder 			      rbd_dev->header_name,
14030e6f322dSAlex Elder 			      0, 0, NULL,
14040ce1a794SAlex Elder 			      &rbd_dev->watch_request, NULL);
140559c2be1eSYehuda Sadeh 
140659c2be1eSYehuda Sadeh 	if (ret < 0)
140759c2be1eSYehuda Sadeh 		goto fail_event;
140859c2be1eSYehuda Sadeh 
140959c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
141059c2be1eSYehuda Sadeh 	return 0;
141159c2be1eSYehuda Sadeh 
141259c2be1eSYehuda Sadeh fail_event:
14130ce1a794SAlex Elder 	ceph_osdc_cancel_event(rbd_dev->watch_event);
14140ce1a794SAlex Elder 	rbd_dev->watch_event = NULL;
141559c2be1eSYehuda Sadeh fail:
141659c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
141759c2be1eSYehuda Sadeh 	return ret;
141859c2be1eSYehuda Sadeh }
141959c2be1eSYehuda Sadeh 
142079e3057cSYehuda Sadeh /*
142179e3057cSYehuda Sadeh  * Request sync osd unwatch
142279e3057cSYehuda Sadeh  */
1423070c633fSAlex Elder static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
142479e3057cSYehuda Sadeh {
142579e3057cSYehuda Sadeh 	struct ceph_osd_req_op *ops;
142657cfc106SAlex Elder 	int ret;
142779e3057cSYehuda Sadeh 
142857cfc106SAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
142957cfc106SAlex Elder 	if (!ops)
143057cfc106SAlex Elder 		return -ENOMEM;
143179e3057cSYehuda Sadeh 
143279e3057cSYehuda Sadeh 	ops[0].watch.ver = 0;
14330ce1a794SAlex Elder 	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
143479e3057cSYehuda Sadeh 	ops[0].watch.flag = 0;
143579e3057cSYehuda Sadeh 
14360ce1a794SAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
143779e3057cSYehuda Sadeh 			      CEPH_NOSNAP,
143879e3057cSYehuda Sadeh 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
143979e3057cSYehuda Sadeh 			      ops,
1440070c633fSAlex Elder 			      rbd_dev->header_name,
1441070c633fSAlex Elder 			      0, 0, NULL, NULL, NULL);
1442070c633fSAlex Elder 
144379e3057cSYehuda Sadeh 
144479e3057cSYehuda Sadeh 	rbd_destroy_ops(ops);
14450ce1a794SAlex Elder 	ceph_osdc_cancel_event(rbd_dev->watch_event);
14460ce1a794SAlex Elder 	rbd_dev->watch_event = NULL;
144779e3057cSYehuda Sadeh 	return ret;
144879e3057cSYehuda Sadeh }
144979e3057cSYehuda Sadeh 
145059c2be1eSYehuda Sadeh /*
14513cb4a687SAlex Elder  * Synchronous osd object method call
1452602adf40SYehuda Sadeh  */
14530ce1a794SAlex Elder static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
1454aded07eaSAlex Elder 			     const char *object_name,
1455aded07eaSAlex Elder 			     const char *class_name,
1456aded07eaSAlex Elder 			     const char *method_name,
14573cb4a687SAlex Elder 			     const char *outbound,
14583cb4a687SAlex Elder 			     size_t outbound_size,
1459f8d4de6eSAlex Elder 			     char *inbound,
1460f8d4de6eSAlex Elder 			     size_t inbound_size,
14613cb4a687SAlex Elder 			     int flags,
146259c2be1eSYehuda Sadeh 			     u64 *ver)
1463602adf40SYehuda Sadeh {
1464602adf40SYehuda Sadeh 	struct ceph_osd_req_op *ops;
1465aded07eaSAlex Elder 	int class_name_len = strlen(class_name);
1466aded07eaSAlex Elder 	int method_name_len = strlen(method_name);
14673cb4a687SAlex Elder 	int payload_size;
146857cfc106SAlex Elder 	int ret;
146957cfc106SAlex Elder 
14703cb4a687SAlex Elder 	/*
14713cb4a687SAlex Elder 	 * Any input parameters required by the method we're calling
14723cb4a687SAlex Elder 	 * will be sent along with the class and method names as
14733cb4a687SAlex Elder 	 * part of the message payload.  That data and its size are
14743cb4a687SAlex Elder 	 * supplied via the indata and indata_len fields (named from
14753cb4a687SAlex Elder 	 * the perspective of the server side) in the OSD request
14763cb4a687SAlex Elder 	 * operation.
14773cb4a687SAlex Elder 	 */
14783cb4a687SAlex Elder 	payload_size = class_name_len + method_name_len + outbound_size;
14793cb4a687SAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
148057cfc106SAlex Elder 	if (!ops)
148157cfc106SAlex Elder 		return -ENOMEM;
1482602adf40SYehuda Sadeh 
1483aded07eaSAlex Elder 	ops[0].cls.class_name = class_name;
1484aded07eaSAlex Elder 	ops[0].cls.class_len = (__u8) class_name_len;
1485aded07eaSAlex Elder 	ops[0].cls.method_name = method_name;
1486aded07eaSAlex Elder 	ops[0].cls.method_len = (__u8) method_name_len;
1487602adf40SYehuda Sadeh 	ops[0].cls.argc = 0;
14883cb4a687SAlex Elder 	ops[0].cls.indata = outbound;
14893cb4a687SAlex Elder 	ops[0].cls.indata_len = outbound_size;
1490602adf40SYehuda Sadeh 
14910ce1a794SAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
1492602adf40SYehuda Sadeh 			       CEPH_NOSNAP,
14933cb4a687SAlex Elder 			       flags, ops,
1494f8d4de6eSAlex Elder 			       object_name, 0, inbound_size, inbound,
1495f8d4de6eSAlex Elder 			       NULL, ver);
1496602adf40SYehuda Sadeh 
1497602adf40SYehuda Sadeh 	rbd_destroy_ops(ops);
1498602adf40SYehuda Sadeh 
1499602adf40SYehuda Sadeh 	dout("cls_exec returned %d\n", ret);
1500602adf40SYehuda Sadeh 	return ret;
1501602adf40SYehuda Sadeh }
1502602adf40SYehuda Sadeh 
15031fec7093SYehuda Sadeh static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
15041fec7093SYehuda Sadeh {
15051fec7093SYehuda Sadeh 	struct rbd_req_coll *coll =
15061fec7093SYehuda Sadeh 			kzalloc(sizeof(struct rbd_req_coll) +
15071fec7093SYehuda Sadeh 			        sizeof(struct rbd_req_status) * num_reqs,
15081fec7093SYehuda Sadeh 				GFP_ATOMIC);
15091fec7093SYehuda Sadeh 
15101fec7093SYehuda Sadeh 	if (!coll)
15111fec7093SYehuda Sadeh 		return NULL;
15121fec7093SYehuda Sadeh 	coll->total = num_reqs;
15131fec7093SYehuda Sadeh 	kref_init(&coll->kref);
15141fec7093SYehuda Sadeh 	return coll;
15151fec7093SYehuda Sadeh }
15161fec7093SYehuda Sadeh 
1517602adf40SYehuda Sadeh /*
1518602adf40SYehuda Sadeh  * block device queue callback
1519602adf40SYehuda Sadeh  */
1520602adf40SYehuda Sadeh static void rbd_rq_fn(struct request_queue *q)
1521602adf40SYehuda Sadeh {
1522602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
1523602adf40SYehuda Sadeh 	struct request *rq;
1524602adf40SYehuda Sadeh 
152500f1f36fSAlex Elder 	while ((rq = blk_fetch_request(q))) {
1526602adf40SYehuda Sadeh 		struct bio *bio;
1527602adf40SYehuda Sadeh 		bool do_write;
1528bd919d45SAlex Elder 		unsigned int size;
1529602adf40SYehuda Sadeh 		u64 ofs;
15301fec7093SYehuda Sadeh 		int num_segs, cur_seg = 0;
15311fec7093SYehuda Sadeh 		struct rbd_req_coll *coll;
1532d1d25646SJosh Durgin 		struct ceph_snap_context *snapc;
1533f7760dadSAlex Elder 		unsigned int bio_offset;
1534602adf40SYehuda Sadeh 
1535602adf40SYehuda Sadeh 		dout("fetched request\n");
1536602adf40SYehuda Sadeh 
1537602adf40SYehuda Sadeh 		/* filter out block requests we don't understand */
1538602adf40SYehuda Sadeh 		if ((rq->cmd_type != REQ_TYPE_FS)) {
1539602adf40SYehuda Sadeh 			__blk_end_request_all(rq, 0);
154000f1f36fSAlex Elder 			continue;
1541602adf40SYehuda Sadeh 		}
1542602adf40SYehuda Sadeh 
1543602adf40SYehuda Sadeh 		/* deduce our operation (read, write) */
1544602adf40SYehuda Sadeh 		do_write = (rq_data_dir(rq) == WRITE);
1545f84344f3SAlex Elder 		if (do_write && rbd_dev->mapping.read_only) {
1546602adf40SYehuda Sadeh 			__blk_end_request_all(rq, -EROFS);
154700f1f36fSAlex Elder 			continue;
1548602adf40SYehuda Sadeh 		}
1549602adf40SYehuda Sadeh 
1550602adf40SYehuda Sadeh 		spin_unlock_irq(q->queue_lock);
1551602adf40SYehuda Sadeh 
1552e88a36ecSJosh Durgin 		down_read(&rbd_dev->header_rwsem);
1553e88a36ecSJosh Durgin 
1554daba5fdbSAlex Elder 		if (!rbd_dev->exists) {
15550d7dbfceSAlex Elder 			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
1556d1d25646SJosh Durgin 			up_read(&rbd_dev->header_rwsem);
1557e88a36ecSJosh Durgin 			dout("request for non-existent snapshot");
1558e88a36ecSJosh Durgin 			spin_lock_irq(q->queue_lock);
1559e88a36ecSJosh Durgin 			__blk_end_request_all(rq, -ENXIO);
1560e88a36ecSJosh Durgin 			continue;
1561e88a36ecSJosh Durgin 		}
1562d1d25646SJosh Durgin 
1563d1d25646SJosh Durgin 		snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1564d1d25646SJosh Durgin 
1565d1d25646SJosh Durgin 		up_read(&rbd_dev->header_rwsem);
1566e88a36ecSJosh Durgin 
1567f7760dadSAlex Elder 		size = blk_rq_bytes(rq);
1568f7760dadSAlex Elder 		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
1569f7760dadSAlex Elder 		bio = rq->bio;
1570f7760dadSAlex Elder 
1571602adf40SYehuda Sadeh 		dout("%s 0x%x bytes at 0x%llx\n",
1572602adf40SYehuda Sadeh 		     do_write ? "write" : "read",
1573bd919d45SAlex Elder 		     size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
1574602adf40SYehuda Sadeh 
15751fec7093SYehuda Sadeh 		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
1576df111be6SAlex Elder 		if (num_segs <= 0) {
1577df111be6SAlex Elder 			spin_lock_irq(q->queue_lock);
1578df111be6SAlex Elder 			__blk_end_request_all(rq, num_segs);
1579df111be6SAlex Elder 			ceph_put_snap_context(snapc);
1580df111be6SAlex Elder 			continue;
1581df111be6SAlex Elder 		}
15821fec7093SYehuda Sadeh 		coll = rbd_alloc_coll(num_segs);
15831fec7093SYehuda Sadeh 		if (!coll) {
15841fec7093SYehuda Sadeh 			spin_lock_irq(q->queue_lock);
15851fec7093SYehuda Sadeh 			__blk_end_request_all(rq, -ENOMEM);
1586d1d25646SJosh Durgin 			ceph_put_snap_context(snapc);
158700f1f36fSAlex Elder 			continue;
15881fec7093SYehuda Sadeh 		}
15891fec7093SYehuda Sadeh 
1590f7760dadSAlex Elder 		bio_offset = 0;
1591602adf40SYehuda Sadeh 		do {
1592f7760dadSAlex Elder 			u64 limit = rbd_segment_length(rbd_dev, ofs, size);
1593f7760dadSAlex Elder 			unsigned int chain_size;
1594f7760dadSAlex Elder 			struct bio *bio_chain;
1595f7760dadSAlex Elder 
1596f7760dadSAlex Elder 			BUG_ON(limit > (u64) UINT_MAX);
1597f7760dadSAlex Elder 			chain_size = (unsigned int) limit;
1598bd919d45SAlex Elder 			dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
1599f7760dadSAlex Elder 
16001fec7093SYehuda Sadeh 			kref_get(&coll->kref);
1601f7760dadSAlex Elder 
1602f7760dadSAlex Elder 			/* Pass a cloned bio chain via an osd request */
1603f7760dadSAlex Elder 
1604f7760dadSAlex Elder 			bio_chain = bio_chain_clone_range(&bio,
1605f7760dadSAlex Elder 						&bio_offset, chain_size,
1606f7760dadSAlex Elder 						GFP_ATOMIC);
1607f7760dadSAlex Elder 			if (bio_chain)
16084634246dSAlex Elder 				(void) rbd_do_op(rq, rbd_dev, snapc,
1609f7760dadSAlex Elder 						ofs, chain_size,
1610f7760dadSAlex Elder 						bio_chain, coll, cur_seg);
16114634246dSAlex Elder 			else
16121fec7093SYehuda Sadeh 				rbd_coll_end_req_index(rq, coll, cur_seg,
1613f7760dadSAlex Elder 						       -ENOMEM, chain_size);
1614f7760dadSAlex Elder 			size -= chain_size;
1615f7760dadSAlex Elder 			ofs += chain_size;
1616602adf40SYehuda Sadeh 
16171fec7093SYehuda Sadeh 			cur_seg++;
1618602adf40SYehuda Sadeh 		} while (size > 0);
16191fec7093SYehuda Sadeh 		kref_put(&coll->kref, rbd_coll_release);
1620602adf40SYehuda Sadeh 
1621602adf40SYehuda Sadeh 		spin_lock_irq(q->queue_lock);
1622d1d25646SJosh Durgin 
1623d1d25646SJosh Durgin 		ceph_put_snap_context(snapc);
1624602adf40SYehuda Sadeh 	}
1625602adf40SYehuda Sadeh }
1626602adf40SYehuda Sadeh 
1627602adf40SYehuda Sadeh /*
1628602adf40SYehuda Sadeh  * a queue callback. Makes sure that we don't create a bio that spans across
1629602adf40SYehuda Sadeh  * multiple osd objects. One exception would be with a single page bios,
1630f7760dadSAlex Elder  * which we handle later at bio_chain_clone_range()
1631602adf40SYehuda Sadeh  */
1632602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1633602adf40SYehuda Sadeh 			  struct bio_vec *bvec)
1634602adf40SYehuda Sadeh {
1635602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
1636e5cfeed2SAlex Elder 	sector_t sector_offset;
1637e5cfeed2SAlex Elder 	sector_t sectors_per_obj;
1638e5cfeed2SAlex Elder 	sector_t obj_sector_offset;
1639e5cfeed2SAlex Elder 	int ret;
1640602adf40SYehuda Sadeh 
1641e5cfeed2SAlex Elder 	/*
1642e5cfeed2SAlex Elder 	 * Find how far into its rbd object the partition-relative
1643e5cfeed2SAlex Elder 	 * bio start sector is to offset relative to the enclosing
1644e5cfeed2SAlex Elder 	 * device.
1645e5cfeed2SAlex Elder 	 */
1646e5cfeed2SAlex Elder 	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
1647e5cfeed2SAlex Elder 	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1648e5cfeed2SAlex Elder 	obj_sector_offset = sector_offset & (sectors_per_obj - 1);
1649593a9e7bSAlex Elder 
1650e5cfeed2SAlex Elder 	/*
1651e5cfeed2SAlex Elder 	 * Compute the number of bytes from that offset to the end
1652e5cfeed2SAlex Elder 	 * of the object.  Account for what's already used by the bio.
1653e5cfeed2SAlex Elder 	 */
1654e5cfeed2SAlex Elder 	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
1655e5cfeed2SAlex Elder 	if (ret > bmd->bi_size)
1656e5cfeed2SAlex Elder 		ret -= bmd->bi_size;
1657e5cfeed2SAlex Elder 	else
1658e5cfeed2SAlex Elder 		ret = 0;
1659e5cfeed2SAlex Elder 
1660e5cfeed2SAlex Elder 	/*
1661e5cfeed2SAlex Elder 	 * Don't send back more than was asked for.  And if the bio
1662e5cfeed2SAlex Elder 	 * was empty, let the whole thing through because:  "Note
1663e5cfeed2SAlex Elder 	 * that a block device *must* allow a single page to be
1664e5cfeed2SAlex Elder 	 * added to an empty bio."
1665e5cfeed2SAlex Elder 	 */
1666e5cfeed2SAlex Elder 	rbd_assert(bvec->bv_len <= PAGE_SIZE);
1667e5cfeed2SAlex Elder 	if (ret > (int) bvec->bv_len || !bmd->bi_size)
1668e5cfeed2SAlex Elder 		ret = (int) bvec->bv_len;
1669e5cfeed2SAlex Elder 
1670e5cfeed2SAlex Elder 	return ret;
1671602adf40SYehuda Sadeh }
1672602adf40SYehuda Sadeh 
1673602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
1674602adf40SYehuda Sadeh {
1675602adf40SYehuda Sadeh 	struct gendisk *disk = rbd_dev->disk;
1676602adf40SYehuda Sadeh 
1677602adf40SYehuda Sadeh 	if (!disk)
1678602adf40SYehuda Sadeh 		return;
1679602adf40SYehuda Sadeh 
1680602adf40SYehuda Sadeh 	if (disk->flags & GENHD_FL_UP)
1681602adf40SYehuda Sadeh 		del_gendisk(disk);
1682602adf40SYehuda Sadeh 	if (disk->queue)
1683602adf40SYehuda Sadeh 		blk_cleanup_queue(disk->queue);
1684602adf40SYehuda Sadeh 	put_disk(disk);
1685602adf40SYehuda Sadeh }
1686602adf40SYehuda Sadeh 
1687602adf40SYehuda Sadeh /*
16884156d998SAlex Elder  * Read the complete header for the given rbd device.
16894156d998SAlex Elder  *
16904156d998SAlex Elder  * Returns a pointer to a dynamically-allocated buffer containing
16914156d998SAlex Elder  * the complete and validated header.  Caller can pass the address
16924156d998SAlex Elder  * of a variable that will be filled in with the version of the
16934156d998SAlex Elder  * header object at the time it was read.
16944156d998SAlex Elder  *
16954156d998SAlex Elder  * Returns a pointer-coded errno if a failure occurs.
16964156d998SAlex Elder  */
16974156d998SAlex Elder static struct rbd_image_header_ondisk *
16984156d998SAlex Elder rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
16994156d998SAlex Elder {
17004156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk = NULL;
17014156d998SAlex Elder 	u32 snap_count = 0;
17024156d998SAlex Elder 	u64 names_size = 0;
17034156d998SAlex Elder 	u32 want_count;
17044156d998SAlex Elder 	int ret;
17054156d998SAlex Elder 
17064156d998SAlex Elder 	/*
17074156d998SAlex Elder 	 * The complete header will include an array of its 64-bit
17084156d998SAlex Elder 	 * snapshot ids, followed by the names of those snapshots as
17094156d998SAlex Elder 	 * a contiguous block of NUL-terminated strings.  Note that
17104156d998SAlex Elder 	 * the number of snapshots could change by the time we read
17114156d998SAlex Elder 	 * it in, in which case we re-read it.
17124156d998SAlex Elder 	 */
17134156d998SAlex Elder 	do {
17144156d998SAlex Elder 		size_t size;
17154156d998SAlex Elder 
17164156d998SAlex Elder 		kfree(ondisk);
17174156d998SAlex Elder 
17184156d998SAlex Elder 		size = sizeof (*ondisk);
17194156d998SAlex Elder 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
17204156d998SAlex Elder 		size += names_size;
17214156d998SAlex Elder 		ondisk = kmalloc(size, GFP_KERNEL);
17224156d998SAlex Elder 		if (!ondisk)
17234156d998SAlex Elder 			return ERR_PTR(-ENOMEM);
17244156d998SAlex Elder 
17254156d998SAlex Elder 		ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
17264156d998SAlex Elder 				       rbd_dev->header_name,
17274156d998SAlex Elder 				       0, size,
17284156d998SAlex Elder 				       (char *) ondisk, version);
17294156d998SAlex Elder 
17304156d998SAlex Elder 		if (ret < 0)
17314156d998SAlex Elder 			goto out_err;
17324156d998SAlex Elder 		if (WARN_ON((size_t) ret < size)) {
17334156d998SAlex Elder 			ret = -ENXIO;
17344156d998SAlex Elder 			pr_warning("short header read for image %s"
17354156d998SAlex Elder 					" (want %zd got %d)\n",
17360d7dbfceSAlex Elder 				rbd_dev->spec->image_name, size, ret);
17374156d998SAlex Elder 			goto out_err;
17384156d998SAlex Elder 		}
17394156d998SAlex Elder 		if (!rbd_dev_ondisk_valid(ondisk)) {
17404156d998SAlex Elder 			ret = -ENXIO;
17414156d998SAlex Elder 			pr_warning("invalid header for image %s\n",
17420d7dbfceSAlex Elder 				rbd_dev->spec->image_name);
17434156d998SAlex Elder 			goto out_err;
17444156d998SAlex Elder 		}
17454156d998SAlex Elder 
17464156d998SAlex Elder 		names_size = le64_to_cpu(ondisk->snap_names_len);
17474156d998SAlex Elder 		want_count = snap_count;
17484156d998SAlex Elder 		snap_count = le32_to_cpu(ondisk->snap_count);
17494156d998SAlex Elder 	} while (snap_count != want_count);
17504156d998SAlex Elder 
17514156d998SAlex Elder 	return ondisk;
17524156d998SAlex Elder 
17534156d998SAlex Elder out_err:
17544156d998SAlex Elder 	kfree(ondisk);
17554156d998SAlex Elder 
17564156d998SAlex Elder 	return ERR_PTR(ret);
17574156d998SAlex Elder }
17584156d998SAlex Elder 
17594156d998SAlex Elder /*
1760602adf40SYehuda Sadeh  * reload the ondisk the header
1761602adf40SYehuda Sadeh  */
1762602adf40SYehuda Sadeh static int rbd_read_header(struct rbd_device *rbd_dev,
1763602adf40SYehuda Sadeh 			   struct rbd_image_header *header)
1764602adf40SYehuda Sadeh {
17654156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk;
17664156d998SAlex Elder 	u64 ver = 0;
17674156d998SAlex Elder 	int ret;
1768602adf40SYehuda Sadeh 
17694156d998SAlex Elder 	ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
17704156d998SAlex Elder 	if (IS_ERR(ondisk))
17714156d998SAlex Elder 		return PTR_ERR(ondisk);
17724156d998SAlex Elder 	ret = rbd_header_from_disk(header, ondisk);
17734156d998SAlex Elder 	if (ret >= 0)
177459c2be1eSYehuda Sadeh 		header->obj_version = ver;
17754156d998SAlex Elder 	kfree(ondisk);
1776602adf40SYehuda Sadeh 
17774156d998SAlex Elder 	return ret;
1778602adf40SYehuda Sadeh }
1779602adf40SYehuda Sadeh 
178041f38c2bSAlex Elder static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1781dfc5606dSYehuda Sadeh {
1782dfc5606dSYehuda Sadeh 	struct rbd_snap *snap;
1783a0593290SAlex Elder 	struct rbd_snap *next;
1784dfc5606dSYehuda Sadeh 
1785a0593290SAlex Elder 	list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
178641f38c2bSAlex Elder 		rbd_remove_snap_dev(snap);
1787dfc5606dSYehuda Sadeh }
1788dfc5606dSYehuda Sadeh 
17899478554aSAlex Elder static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
17909478554aSAlex Elder {
17919478554aSAlex Elder 	sector_t size;
17929478554aSAlex Elder 
17930d7dbfceSAlex Elder 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
17949478554aSAlex Elder 		return;
17959478554aSAlex Elder 
17969478554aSAlex Elder 	size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
17979478554aSAlex Elder 	dout("setting size to %llu sectors", (unsigned long long) size);
17989478554aSAlex Elder 	rbd_dev->mapping.size = (u64) size;
17999478554aSAlex Elder 	set_capacity(rbd_dev->disk, size);
18009478554aSAlex Elder }
18019478554aSAlex Elder 
1802602adf40SYehuda Sadeh /*
1803602adf40SYehuda Sadeh  * only read the first part of the ondisk header, without the snaps info
1804602adf40SYehuda Sadeh  */
1805117973fbSAlex Elder static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
1806602adf40SYehuda Sadeh {
1807602adf40SYehuda Sadeh 	int ret;
1808602adf40SYehuda Sadeh 	struct rbd_image_header h;
1809602adf40SYehuda Sadeh 
1810602adf40SYehuda Sadeh 	ret = rbd_read_header(rbd_dev, &h);
1811602adf40SYehuda Sadeh 	if (ret < 0)
1812602adf40SYehuda Sadeh 		return ret;
1813602adf40SYehuda Sadeh 
1814a51aa0c0SJosh Durgin 	down_write(&rbd_dev->header_rwsem);
1815a51aa0c0SJosh Durgin 
18169478554aSAlex Elder 	/* Update image size, and check for resize of mapped image */
18179478554aSAlex Elder 	rbd_dev->header.image_size = h.image_size;
18189478554aSAlex Elder 	rbd_update_mapping_size(rbd_dev);
18199db4b3e3SSage Weil 
1820849b4260SAlex Elder 	/* rbd_dev->header.object_prefix shouldn't change */
1821602adf40SYehuda Sadeh 	kfree(rbd_dev->header.snap_sizes);
1822849b4260SAlex Elder 	kfree(rbd_dev->header.snap_names);
1823d1d25646SJosh Durgin 	/* osd requests may still refer to snapc */
1824d1d25646SJosh Durgin 	ceph_put_snap_context(rbd_dev->header.snapc);
1825602adf40SYehuda Sadeh 
1826b813623aSAlex Elder 	if (hver)
1827b813623aSAlex Elder 		*hver = h.obj_version;
1828a71b891bSJosh Durgin 	rbd_dev->header.obj_version = h.obj_version;
182993a24e08SJosh Durgin 	rbd_dev->header.image_size = h.image_size;
1830602adf40SYehuda Sadeh 	rbd_dev->header.snapc = h.snapc;
1831602adf40SYehuda Sadeh 	rbd_dev->header.snap_names = h.snap_names;
1832602adf40SYehuda Sadeh 	rbd_dev->header.snap_sizes = h.snap_sizes;
1833849b4260SAlex Elder 	/* Free the extra copy of the object prefix */
1834849b4260SAlex Elder 	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1835849b4260SAlex Elder 	kfree(h.object_prefix);
1836849b4260SAlex Elder 
1837304f6808SAlex Elder 	ret = rbd_dev_snaps_update(rbd_dev);
1838304f6808SAlex Elder 	if (!ret)
1839304f6808SAlex Elder 		ret = rbd_dev_snaps_register(rbd_dev);
1840dfc5606dSYehuda Sadeh 
1841c666601aSJosh Durgin 	up_write(&rbd_dev->header_rwsem);
1842602adf40SYehuda Sadeh 
1843dfc5606dSYehuda Sadeh 	return ret;
1844602adf40SYehuda Sadeh }
1845602adf40SYehuda Sadeh 
1846117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
18471fe5e993SAlex Elder {
18481fe5e993SAlex Elder 	int ret;
18491fe5e993SAlex Elder 
1850117973fbSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
18511fe5e993SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1852117973fbSAlex Elder 	if (rbd_dev->image_format == 1)
1853117973fbSAlex Elder 		ret = rbd_dev_v1_refresh(rbd_dev, hver);
1854117973fbSAlex Elder 	else
1855117973fbSAlex Elder 		ret = rbd_dev_v2_refresh(rbd_dev, hver);
18561fe5e993SAlex Elder 	mutex_unlock(&ctl_mutex);
18571fe5e993SAlex Elder 
18581fe5e993SAlex Elder 	return ret;
18591fe5e993SAlex Elder }
18601fe5e993SAlex Elder 
1861602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
1862602adf40SYehuda Sadeh {
1863602adf40SYehuda Sadeh 	struct gendisk *disk;
1864602adf40SYehuda Sadeh 	struct request_queue *q;
1865593a9e7bSAlex Elder 	u64 segment_size;
1866602adf40SYehuda Sadeh 
1867602adf40SYehuda Sadeh 	/* create gendisk info */
1868602adf40SYehuda Sadeh 	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1869602adf40SYehuda Sadeh 	if (!disk)
18701fcdb8aaSAlex Elder 		return -ENOMEM;
1871602adf40SYehuda Sadeh 
1872f0f8cef5SAlex Elder 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
1873de71a297SAlex Elder 		 rbd_dev->dev_id);
1874602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
1875602adf40SYehuda Sadeh 	disk->first_minor = 0;
1876602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
1877602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
1878602adf40SYehuda Sadeh 
1879602adf40SYehuda Sadeh 	/* init rq */
1880602adf40SYehuda Sadeh 	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1881602adf40SYehuda Sadeh 	if (!q)
1882602adf40SYehuda Sadeh 		goto out_disk;
1883029bcbd8SJosh Durgin 
1884593a9e7bSAlex Elder 	/* We use the default size, but let's be explicit about it. */
1885593a9e7bSAlex Elder 	blk_queue_physical_block_size(q, SECTOR_SIZE);
1886593a9e7bSAlex Elder 
1887029bcbd8SJosh Durgin 	/* set io sizes to object size */
1888593a9e7bSAlex Elder 	segment_size = rbd_obj_bytes(&rbd_dev->header);
1889593a9e7bSAlex Elder 	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
1890593a9e7bSAlex Elder 	blk_queue_max_segment_size(q, segment_size);
1891593a9e7bSAlex Elder 	blk_queue_io_min(q, segment_size);
1892593a9e7bSAlex Elder 	blk_queue_io_opt(q, segment_size);
1893029bcbd8SJosh Durgin 
1894602adf40SYehuda Sadeh 	blk_queue_merge_bvec(q, rbd_merge_bvec);
1895602adf40SYehuda Sadeh 	disk->queue = q;
1896602adf40SYehuda Sadeh 
1897602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
1898602adf40SYehuda Sadeh 
1899602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
1900602adf40SYehuda Sadeh 
190112f02944SAlex Elder 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
190212f02944SAlex Elder 
1903602adf40SYehuda Sadeh 	return 0;
1904602adf40SYehuda Sadeh out_disk:
1905602adf40SYehuda Sadeh 	put_disk(disk);
19061fcdb8aaSAlex Elder 
19071fcdb8aaSAlex Elder 	return -ENOMEM;
1908602adf40SYehuda Sadeh }
1909602adf40SYehuda Sadeh 
1910dfc5606dSYehuda Sadeh /*
1911dfc5606dSYehuda Sadeh   sysfs
1912dfc5606dSYehuda Sadeh */
1913602adf40SYehuda Sadeh 
1914593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
1915593a9e7bSAlex Elder {
1916593a9e7bSAlex Elder 	return container_of(dev, struct rbd_device, dev);
1917593a9e7bSAlex Elder }
1918593a9e7bSAlex Elder 
1919dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
1920dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
1921602adf40SYehuda Sadeh {
1922593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1923a51aa0c0SJosh Durgin 	sector_t size;
1924dfc5606dSYehuda Sadeh 
1925a51aa0c0SJosh Durgin 	down_read(&rbd_dev->header_rwsem);
1926a51aa0c0SJosh Durgin 	size = get_capacity(rbd_dev->disk);
1927a51aa0c0SJosh Durgin 	up_read(&rbd_dev->header_rwsem);
1928a51aa0c0SJosh Durgin 
1929a51aa0c0SJosh Durgin 	return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
1930602adf40SYehuda Sadeh }
1931602adf40SYehuda Sadeh 
193234b13184SAlex Elder /*
193334b13184SAlex Elder  * Note this shows the features for whatever's mapped, which is not
193434b13184SAlex Elder  * necessarily the base image.
193534b13184SAlex Elder  */
193634b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev,
193734b13184SAlex Elder 			     struct device_attribute *attr, char *buf)
193834b13184SAlex Elder {
193934b13184SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
194034b13184SAlex Elder 
194134b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
194234b13184SAlex Elder 			(unsigned long long) rbd_dev->mapping.features);
194334b13184SAlex Elder }
194434b13184SAlex Elder 
1945dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
1946dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
1947602adf40SYehuda Sadeh {
1948593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1949dfc5606dSYehuda Sadeh 
1950dfc5606dSYehuda Sadeh 	return sprintf(buf, "%d\n", rbd_dev->major);
1951dfc5606dSYehuda Sadeh }
1952dfc5606dSYehuda Sadeh 
1953dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
1954dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
1955dfc5606dSYehuda Sadeh {
1956593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1957dfc5606dSYehuda Sadeh 
19581dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
19591dbb4399SAlex Elder 			ceph_client_id(rbd_dev->rbd_client->client));
1960dfc5606dSYehuda Sadeh }
1961dfc5606dSYehuda Sadeh 
1962dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
1963dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
1964dfc5606dSYehuda Sadeh {
1965593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1966dfc5606dSYehuda Sadeh 
19670d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
1968dfc5606dSYehuda Sadeh }
1969dfc5606dSYehuda Sadeh 
19709bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
19719bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
19729bb2f334SAlex Elder {
19739bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
19749bb2f334SAlex Elder 
19750d7dbfceSAlex Elder 	return sprintf(buf, "%llu\n",
19760d7dbfceSAlex Elder 		(unsigned long long) rbd_dev->spec->pool_id);
19779bb2f334SAlex Elder }
19789bb2f334SAlex Elder 
1979dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
1980dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
1981dfc5606dSYehuda Sadeh {
1982593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1983dfc5606dSYehuda Sadeh 
19840d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
1985dfc5606dSYehuda Sadeh }
1986dfc5606dSYehuda Sadeh 
1987589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev,
1988589d30e0SAlex Elder 			     struct device_attribute *attr, char *buf)
1989589d30e0SAlex Elder {
1990589d30e0SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1991589d30e0SAlex Elder 
19920d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
1993589d30e0SAlex Elder }
1994589d30e0SAlex Elder 
199534b13184SAlex Elder /*
199634b13184SAlex Elder  * Shows the name of the currently-mapped snapshot (or
199734b13184SAlex Elder  * RBD_SNAP_HEAD_NAME for the base image).
199834b13184SAlex Elder  */
1999dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
2000dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
2001dfc5606dSYehuda Sadeh 			     char *buf)
2002dfc5606dSYehuda Sadeh {
2003593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2004dfc5606dSYehuda Sadeh 
20050d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
2006dfc5606dSYehuda Sadeh }
2007dfc5606dSYehuda Sadeh 
2008dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
2009dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
2010dfc5606dSYehuda Sadeh 				 const char *buf,
2011dfc5606dSYehuda Sadeh 				 size_t size)
2012dfc5606dSYehuda Sadeh {
2013593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2014b813623aSAlex Elder 	int ret;
2015602adf40SYehuda Sadeh 
2016117973fbSAlex Elder 	ret = rbd_dev_refresh(rbd_dev, NULL);
2017b813623aSAlex Elder 
2018b813623aSAlex Elder 	return ret < 0 ? ret : size;
2019dfc5606dSYehuda Sadeh }
2020602adf40SYehuda Sadeh 
/* Read-only attributes of a mapped image device */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
/* Write-only (owner) trigger: has a store handler and no show handler */
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);

2031dfc5606dSYehuda Sadeh 
2032dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = {
2033dfc5606dSYehuda Sadeh 	&dev_attr_size.attr,
203434b13184SAlex Elder 	&dev_attr_features.attr,
2035dfc5606dSYehuda Sadeh 	&dev_attr_major.attr,
2036dfc5606dSYehuda Sadeh 	&dev_attr_client_id.attr,
2037dfc5606dSYehuda Sadeh 	&dev_attr_pool.attr,
20389bb2f334SAlex Elder 	&dev_attr_pool_id.attr,
2039dfc5606dSYehuda Sadeh 	&dev_attr_name.attr,
2040589d30e0SAlex Elder 	&dev_attr_image_id.attr,
2041dfc5606dSYehuda Sadeh 	&dev_attr_current_snap.attr,
2042dfc5606dSYehuda Sadeh 	&dev_attr_refresh.attr,
2043dfc5606dSYehuda Sadeh 	NULL
2044dfc5606dSYehuda Sadeh };
2045dfc5606dSYehuda Sadeh 
2046dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = {
2047dfc5606dSYehuda Sadeh 	.attrs = rbd_attrs,
2048dfc5606dSYehuda Sadeh };
2049dfc5606dSYehuda Sadeh 
2050dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = {
2051dfc5606dSYehuda Sadeh 	&rbd_attr_group,
2052dfc5606dSYehuda Sadeh 	NULL
2053dfc5606dSYehuda Sadeh };
2054dfc5606dSYehuda Sadeh 
/*
 * Release callback installed in rbd_device_type.  Intentionally
 * empty: nothing is freed here.
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
	/* nothing to do */
}
2058dfc5606dSYehuda Sadeh 
2059dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = {
2060dfc5606dSYehuda Sadeh 	.name		= "rbd",
2061dfc5606dSYehuda Sadeh 	.groups		= rbd_attr_groups,
2062dfc5606dSYehuda Sadeh 	.release	= rbd_sysfs_dev_release,
2063dfc5606dSYehuda Sadeh };
2064dfc5606dSYehuda Sadeh 
2065dfc5606dSYehuda Sadeh 
2066dfc5606dSYehuda Sadeh /*
2067dfc5606dSYehuda Sadeh   sysfs - snapshots
2068dfc5606dSYehuda Sadeh */
2069dfc5606dSYehuda Sadeh 
2070dfc5606dSYehuda Sadeh static ssize_t rbd_snap_size_show(struct device *dev,
2071dfc5606dSYehuda Sadeh 				  struct device_attribute *attr,
2072dfc5606dSYehuda Sadeh 				  char *buf)
2073dfc5606dSYehuda Sadeh {
2074dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2075dfc5606dSYehuda Sadeh 
20763591538fSJosh Durgin 	return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
2077dfc5606dSYehuda Sadeh }
2078dfc5606dSYehuda Sadeh 
2079dfc5606dSYehuda Sadeh static ssize_t rbd_snap_id_show(struct device *dev,
2080dfc5606dSYehuda Sadeh 				struct device_attribute *attr,
2081dfc5606dSYehuda Sadeh 				char *buf)
2082dfc5606dSYehuda Sadeh {
2083dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2084dfc5606dSYehuda Sadeh 
2085593a9e7bSAlex Elder 	return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
2086dfc5606dSYehuda Sadeh }
2087dfc5606dSYehuda Sadeh 
208834b13184SAlex Elder static ssize_t rbd_snap_features_show(struct device *dev,
208934b13184SAlex Elder 				struct device_attribute *attr,
209034b13184SAlex Elder 				char *buf)
209134b13184SAlex Elder {
209234b13184SAlex Elder 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
209334b13184SAlex Elder 
209434b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
209534b13184SAlex Elder 			(unsigned long long) snap->features);
209634b13184SAlex Elder }
209734b13184SAlex Elder 
/* Read-only attributes of a snapshot device */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
2101dfc5606dSYehuda Sadeh 
2102dfc5606dSYehuda Sadeh static struct attribute *rbd_snap_attrs[] = {
2103dfc5606dSYehuda Sadeh 	&dev_attr_snap_size.attr,
2104dfc5606dSYehuda Sadeh 	&dev_attr_snap_id.attr,
210534b13184SAlex Elder 	&dev_attr_snap_features.attr,
2106dfc5606dSYehuda Sadeh 	NULL,
2107dfc5606dSYehuda Sadeh };
2108dfc5606dSYehuda Sadeh 
2109dfc5606dSYehuda Sadeh static struct attribute_group rbd_snap_attr_group = {
2110dfc5606dSYehuda Sadeh 	.attrs = rbd_snap_attrs,
2111dfc5606dSYehuda Sadeh };
2112dfc5606dSYehuda Sadeh 
2113dfc5606dSYehuda Sadeh static void rbd_snap_dev_release(struct device *dev)
2114dfc5606dSYehuda Sadeh {
2115dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2116dfc5606dSYehuda Sadeh 	kfree(snap->name);
2117dfc5606dSYehuda Sadeh 	kfree(snap);
2118dfc5606dSYehuda Sadeh }
2119dfc5606dSYehuda Sadeh 
2120dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_snap_attr_groups[] = {
2121dfc5606dSYehuda Sadeh 	&rbd_snap_attr_group,
2122dfc5606dSYehuda Sadeh 	NULL
2123dfc5606dSYehuda Sadeh };
2124dfc5606dSYehuda Sadeh 
2125dfc5606dSYehuda Sadeh static struct device_type rbd_snap_device_type = {
2126dfc5606dSYehuda Sadeh 	.groups		= rbd_snap_attr_groups,
2127dfc5606dSYehuda Sadeh 	.release	= rbd_snap_dev_release,
2128dfc5606dSYehuda Sadeh };
2129dfc5606dSYehuda Sadeh 
21308b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
21318b8fb99cSAlex Elder {
21328b8fb99cSAlex Elder 	kref_get(&spec->kref);
21338b8fb99cSAlex Elder 
21348b8fb99cSAlex Elder 	return spec;
21358b8fb99cSAlex Elder }
21368b8fb99cSAlex Elder 
21378b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref);
21388b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec)
21398b8fb99cSAlex Elder {
21408b8fb99cSAlex Elder 	if (spec)
21418b8fb99cSAlex Elder 		kref_put(&spec->kref, rbd_spec_free);
21428b8fb99cSAlex Elder }
21438b8fb99cSAlex Elder 
21448b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void)
21458b8fb99cSAlex Elder {
21468b8fb99cSAlex Elder 	struct rbd_spec *spec;
21478b8fb99cSAlex Elder 
21488b8fb99cSAlex Elder 	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
21498b8fb99cSAlex Elder 	if (!spec)
21508b8fb99cSAlex Elder 		return NULL;
21518b8fb99cSAlex Elder 	kref_init(&spec->kref);
21528b8fb99cSAlex Elder 
21538b8fb99cSAlex Elder 	rbd_spec_put(rbd_spec_get(spec));	/* TEMPORARY */
21548b8fb99cSAlex Elder 
21558b8fb99cSAlex Elder 	return spec;
21568b8fb99cSAlex Elder }
21578b8fb99cSAlex Elder 
21588b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref)
21598b8fb99cSAlex Elder {
21608b8fb99cSAlex Elder 	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
21618b8fb99cSAlex Elder 
21628b8fb99cSAlex Elder 	kfree(spec->pool_name);
21638b8fb99cSAlex Elder 	kfree(spec->image_id);
21648b8fb99cSAlex Elder 	kfree(spec->image_name);
21658b8fb99cSAlex Elder 	kfree(spec->snap_name);
21668b8fb99cSAlex Elder 	kfree(spec);
21678b8fb99cSAlex Elder }
21688b8fb99cSAlex Elder 
2169304f6808SAlex Elder static bool rbd_snap_registered(struct rbd_snap *snap)
2170304f6808SAlex Elder {
2171304f6808SAlex Elder 	bool ret = snap->dev.type == &rbd_snap_device_type;
2172304f6808SAlex Elder 	bool reg = device_is_registered(&snap->dev);
2173304f6808SAlex Elder 
2174304f6808SAlex Elder 	rbd_assert(!ret ^ reg);
2175304f6808SAlex Elder 
2176304f6808SAlex Elder 	return ret;
2177304f6808SAlex Elder }
2178304f6808SAlex Elder 
217941f38c2bSAlex Elder static void rbd_remove_snap_dev(struct rbd_snap *snap)
2180dfc5606dSYehuda Sadeh {
2181dfc5606dSYehuda Sadeh 	list_del(&snap->node);
2182304f6808SAlex Elder 	if (device_is_registered(&snap->dev))
2183dfc5606dSYehuda Sadeh 		device_unregister(&snap->dev);
2184dfc5606dSYehuda Sadeh }
2185dfc5606dSYehuda Sadeh 
218614e7085dSAlex Elder static int rbd_register_snap_dev(struct rbd_snap *snap,
2187dfc5606dSYehuda Sadeh 				  struct device *parent)
2188dfc5606dSYehuda Sadeh {
2189dfc5606dSYehuda Sadeh 	struct device *dev = &snap->dev;
2190dfc5606dSYehuda Sadeh 	int ret;
2191dfc5606dSYehuda Sadeh 
2192dfc5606dSYehuda Sadeh 	dev->type = &rbd_snap_device_type;
2193dfc5606dSYehuda Sadeh 	dev->parent = parent;
2194dfc5606dSYehuda Sadeh 	dev->release = rbd_snap_dev_release;
2195d4b125e9SAlex Elder 	dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
2196304f6808SAlex Elder 	dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2197304f6808SAlex Elder 
2198dfc5606dSYehuda Sadeh 	ret = device_register(dev);
2199dfc5606dSYehuda Sadeh 
2200dfc5606dSYehuda Sadeh 	return ret;
2201dfc5606dSYehuda Sadeh }
2202dfc5606dSYehuda Sadeh 
22034e891e0aSAlex Elder static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
2204c8d18425SAlex Elder 						const char *snap_name,
220534b13184SAlex Elder 						u64 snap_id, u64 snap_size,
220634b13184SAlex Elder 						u64 snap_features)
2207dfc5606dSYehuda Sadeh {
22084e891e0aSAlex Elder 	struct rbd_snap *snap;
2209dfc5606dSYehuda Sadeh 	int ret;
22104e891e0aSAlex Elder 
22114e891e0aSAlex Elder 	snap = kzalloc(sizeof (*snap), GFP_KERNEL);
2212dfc5606dSYehuda Sadeh 	if (!snap)
22134e891e0aSAlex Elder 		return ERR_PTR(-ENOMEM);
22144e891e0aSAlex Elder 
22154e891e0aSAlex Elder 	ret = -ENOMEM;
2216c8d18425SAlex Elder 	snap->name = kstrdup(snap_name, GFP_KERNEL);
22174e891e0aSAlex Elder 	if (!snap->name)
22184e891e0aSAlex Elder 		goto err;
22194e891e0aSAlex Elder 
2220c8d18425SAlex Elder 	snap->id = snap_id;
2221c8d18425SAlex Elder 	snap->size = snap_size;
222234b13184SAlex Elder 	snap->features = snap_features;
22234e891e0aSAlex Elder 
22244e891e0aSAlex Elder 	return snap;
22254e891e0aSAlex Elder 
2226dfc5606dSYehuda Sadeh err:
2227dfc5606dSYehuda Sadeh 	kfree(snap->name);
2228dfc5606dSYehuda Sadeh 	kfree(snap);
22294e891e0aSAlex Elder 
22304e891e0aSAlex Elder 	return ERR_PTR(ret);
2231dfc5606dSYehuda Sadeh }
2232dfc5606dSYehuda Sadeh 
2233cd892126SAlex Elder static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2234cd892126SAlex Elder 		u64 *snap_size, u64 *snap_features)
2235cd892126SAlex Elder {
2236cd892126SAlex Elder 	char *snap_name;
2237cd892126SAlex Elder 
2238cd892126SAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2239cd892126SAlex Elder 
2240cd892126SAlex Elder 	*snap_size = rbd_dev->header.snap_sizes[which];
2241cd892126SAlex Elder 	*snap_features = 0;	/* No features for v1 */
2242cd892126SAlex Elder 
2243cd892126SAlex Elder 	/* Skip over names until we find the one we are looking for */
2244cd892126SAlex Elder 
2245cd892126SAlex Elder 	snap_name = rbd_dev->header.snap_names;
2246cd892126SAlex Elder 	while (which--)
2247cd892126SAlex Elder 		snap_name += strlen(snap_name) + 1;
2248cd892126SAlex Elder 
2249cd892126SAlex Elder 	return snap_name;
2250cd892126SAlex Elder }
2251cd892126SAlex Elder 
2252dfc5606dSYehuda Sadeh /*
22539d475de5SAlex Elder  * Get the size and object order for an image snapshot, or if
22549d475de5SAlex Elder  * snap_id is CEPH_NOSNAP, gets this information for the base
22559d475de5SAlex Elder  * image.
22569d475de5SAlex Elder  */
22579d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
22589d475de5SAlex Elder 				u8 *order, u64 *snap_size)
22599d475de5SAlex Elder {
22609d475de5SAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
22619d475de5SAlex Elder 	int ret;
22629d475de5SAlex Elder 	struct {
22639d475de5SAlex Elder 		u8 order;
22649d475de5SAlex Elder 		__le64 size;
22659d475de5SAlex Elder 	} __attribute__ ((packed)) size_buf = { 0 };
22669d475de5SAlex Elder 
22679d475de5SAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
22689d475de5SAlex Elder 				"rbd", "get_size",
22699d475de5SAlex Elder 				(char *) &snapid, sizeof (snapid),
22709d475de5SAlex Elder 				(char *) &size_buf, sizeof (size_buf),
22719d475de5SAlex Elder 				CEPH_OSD_FLAG_READ, NULL);
22729d475de5SAlex Elder 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
22739d475de5SAlex Elder 	if (ret < 0)
22749d475de5SAlex Elder 		return ret;
22759d475de5SAlex Elder 
22769d475de5SAlex Elder 	*order = size_buf.order;
22779d475de5SAlex Elder 	*snap_size = le64_to_cpu(size_buf.size);
22789d475de5SAlex Elder 
22799d475de5SAlex Elder 	dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
22809d475de5SAlex Elder 		(unsigned long long) snap_id, (unsigned int) *order,
22819d475de5SAlex Elder 		(unsigned long long) *snap_size);
22829d475de5SAlex Elder 
22839d475de5SAlex Elder 	return 0;
22849d475de5SAlex Elder }
22859d475de5SAlex Elder 
22869d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
22879d475de5SAlex Elder {
22889d475de5SAlex Elder 	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
22899d475de5SAlex Elder 					&rbd_dev->header.obj_order,
22909d475de5SAlex Elder 					&rbd_dev->header.image_size);
22919d475de5SAlex Elder }
22929d475de5SAlex Elder 
22931e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
22941e130199SAlex Elder {
22951e130199SAlex Elder 	void *reply_buf;
22961e130199SAlex Elder 	int ret;
22971e130199SAlex Elder 	void *p;
22981e130199SAlex Elder 
22991e130199SAlex Elder 	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
23001e130199SAlex Elder 	if (!reply_buf)
23011e130199SAlex Elder 		return -ENOMEM;
23021e130199SAlex Elder 
23031e130199SAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
23041e130199SAlex Elder 				"rbd", "get_object_prefix",
23051e130199SAlex Elder 				NULL, 0,
23061e130199SAlex Elder 				reply_buf, RBD_OBJ_PREFIX_LEN_MAX,
23071e130199SAlex Elder 				CEPH_OSD_FLAG_READ, NULL);
23081e130199SAlex Elder 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
23091e130199SAlex Elder 	if (ret < 0)
23101e130199SAlex Elder 		goto out;
2311a0ea3a40SAlex Elder 	ret = 0;    /* rbd_req_sync_exec() can return positive */
23121e130199SAlex Elder 
23131e130199SAlex Elder 	p = reply_buf;
23141e130199SAlex Elder 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
23151e130199SAlex Elder 						p + RBD_OBJ_PREFIX_LEN_MAX,
23161e130199SAlex Elder 						NULL, GFP_NOIO);
23171e130199SAlex Elder 
23181e130199SAlex Elder 	if (IS_ERR(rbd_dev->header.object_prefix)) {
23191e130199SAlex Elder 		ret = PTR_ERR(rbd_dev->header.object_prefix);
23201e130199SAlex Elder 		rbd_dev->header.object_prefix = NULL;
23211e130199SAlex Elder 	} else {
23221e130199SAlex Elder 		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
23231e130199SAlex Elder 	}
23241e130199SAlex Elder 
23251e130199SAlex Elder out:
23261e130199SAlex Elder 	kfree(reply_buf);
23271e130199SAlex Elder 
23281e130199SAlex Elder 	return ret;
23291e130199SAlex Elder }
23301e130199SAlex Elder 
2331b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
2332b1b5402aSAlex Elder 		u64 *snap_features)
2333b1b5402aSAlex Elder {
2334b1b5402aSAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
2335b1b5402aSAlex Elder 	struct {
2336b1b5402aSAlex Elder 		__le64 features;
2337b1b5402aSAlex Elder 		__le64 incompat;
2338b1b5402aSAlex Elder 	} features_buf = { 0 };
2339d889140cSAlex Elder 	u64 incompat;
2340b1b5402aSAlex Elder 	int ret;
2341b1b5402aSAlex Elder 
2342b1b5402aSAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2343b1b5402aSAlex Elder 				"rbd", "get_features",
2344b1b5402aSAlex Elder 				(char *) &snapid, sizeof (snapid),
2345b1b5402aSAlex Elder 				(char *) &features_buf, sizeof (features_buf),
2346b1b5402aSAlex Elder 				CEPH_OSD_FLAG_READ, NULL);
2347b1b5402aSAlex Elder 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2348b1b5402aSAlex Elder 	if (ret < 0)
2349b1b5402aSAlex Elder 		return ret;
2350d889140cSAlex Elder 
2351d889140cSAlex Elder 	incompat = le64_to_cpu(features_buf.incompat);
2352d889140cSAlex Elder 	if (incompat & ~RBD_FEATURES_ALL)
2353d889140cSAlex Elder 		return -ENOTSUPP;
2354d889140cSAlex Elder 
2355b1b5402aSAlex Elder 	*snap_features = le64_to_cpu(features_buf.features);
2356b1b5402aSAlex Elder 
2357b1b5402aSAlex Elder 	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
2358b1b5402aSAlex Elder 		(unsigned long long) snap_id,
2359b1b5402aSAlex Elder 		(unsigned long long) *snap_features,
2360b1b5402aSAlex Elder 		(unsigned long long) le64_to_cpu(features_buf.incompat));
2361b1b5402aSAlex Elder 
2362b1b5402aSAlex Elder 	return 0;
2363b1b5402aSAlex Elder }
2364b1b5402aSAlex Elder 
2365b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
2366b1b5402aSAlex Elder {
2367b1b5402aSAlex Elder 	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
2368b1b5402aSAlex Elder 						&rbd_dev->header.features);
2369b1b5402aSAlex Elder }
2370b1b5402aSAlex Elder 
23716e14b1a6SAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
237235d489f9SAlex Elder {
237335d489f9SAlex Elder 	size_t size;
237435d489f9SAlex Elder 	int ret;
237535d489f9SAlex Elder 	void *reply_buf;
237635d489f9SAlex Elder 	void *p;
237735d489f9SAlex Elder 	void *end;
237835d489f9SAlex Elder 	u64 seq;
237935d489f9SAlex Elder 	u32 snap_count;
238035d489f9SAlex Elder 	struct ceph_snap_context *snapc;
238135d489f9SAlex Elder 	u32 i;
238235d489f9SAlex Elder 
238335d489f9SAlex Elder 	/*
238435d489f9SAlex Elder 	 * We'll need room for the seq value (maximum snapshot id),
238535d489f9SAlex Elder 	 * snapshot count, and array of that many snapshot ids.
238635d489f9SAlex Elder 	 * For now we have a fixed upper limit on the number we're
238735d489f9SAlex Elder 	 * prepared to receive.
238835d489f9SAlex Elder 	 */
238935d489f9SAlex Elder 	size = sizeof (__le64) + sizeof (__le32) +
239035d489f9SAlex Elder 			RBD_MAX_SNAP_COUNT * sizeof (__le64);
239135d489f9SAlex Elder 	reply_buf = kzalloc(size, GFP_KERNEL);
239235d489f9SAlex Elder 	if (!reply_buf)
239335d489f9SAlex Elder 		return -ENOMEM;
239435d489f9SAlex Elder 
239535d489f9SAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
239635d489f9SAlex Elder 				"rbd", "get_snapcontext",
239735d489f9SAlex Elder 				NULL, 0,
239835d489f9SAlex Elder 				reply_buf, size,
23996e14b1a6SAlex Elder 				CEPH_OSD_FLAG_READ, ver);
240035d489f9SAlex Elder 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
240135d489f9SAlex Elder 	if (ret < 0)
240235d489f9SAlex Elder 		goto out;
240335d489f9SAlex Elder 
240435d489f9SAlex Elder 	ret = -ERANGE;
240535d489f9SAlex Elder 	p = reply_buf;
240635d489f9SAlex Elder 	end = (char *) reply_buf + size;
240735d489f9SAlex Elder 	ceph_decode_64_safe(&p, end, seq, out);
240835d489f9SAlex Elder 	ceph_decode_32_safe(&p, end, snap_count, out);
240935d489f9SAlex Elder 
241035d489f9SAlex Elder 	/*
241135d489f9SAlex Elder 	 * Make sure the reported number of snapshot ids wouldn't go
241235d489f9SAlex Elder 	 * beyond the end of our buffer.  But before checking that,
241335d489f9SAlex Elder 	 * make sure the computed size of the snapshot context we
241435d489f9SAlex Elder 	 * allocate is representable in a size_t.
241535d489f9SAlex Elder 	 */
241635d489f9SAlex Elder 	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
241735d489f9SAlex Elder 				 / sizeof (u64)) {
241835d489f9SAlex Elder 		ret = -EINVAL;
241935d489f9SAlex Elder 		goto out;
242035d489f9SAlex Elder 	}
242135d489f9SAlex Elder 	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
242235d489f9SAlex Elder 		goto out;
242335d489f9SAlex Elder 
242435d489f9SAlex Elder 	size = sizeof (struct ceph_snap_context) +
242535d489f9SAlex Elder 				snap_count * sizeof (snapc->snaps[0]);
242635d489f9SAlex Elder 	snapc = kmalloc(size, GFP_KERNEL);
242735d489f9SAlex Elder 	if (!snapc) {
242835d489f9SAlex Elder 		ret = -ENOMEM;
242935d489f9SAlex Elder 		goto out;
243035d489f9SAlex Elder 	}
243135d489f9SAlex Elder 
243235d489f9SAlex Elder 	atomic_set(&snapc->nref, 1);
243335d489f9SAlex Elder 	snapc->seq = seq;
243435d489f9SAlex Elder 	snapc->num_snaps = snap_count;
243535d489f9SAlex Elder 	for (i = 0; i < snap_count; i++)
243635d489f9SAlex Elder 		snapc->snaps[i] = ceph_decode_64(&p);
243735d489f9SAlex Elder 
243835d489f9SAlex Elder 	rbd_dev->header.snapc = snapc;
243935d489f9SAlex Elder 
244035d489f9SAlex Elder 	dout("  snap context seq = %llu, snap_count = %u\n",
244135d489f9SAlex Elder 		(unsigned long long) seq, (unsigned int) snap_count);
244235d489f9SAlex Elder 
244335d489f9SAlex Elder out:
244435d489f9SAlex Elder 	kfree(reply_buf);
244535d489f9SAlex Elder 
244635d489f9SAlex Elder 	return 0;
244735d489f9SAlex Elder }
244835d489f9SAlex Elder 
2449b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
2450b8b1e2dbSAlex Elder {
2451b8b1e2dbSAlex Elder 	size_t size;
2452b8b1e2dbSAlex Elder 	void *reply_buf;
2453b8b1e2dbSAlex Elder 	__le64 snap_id;
2454b8b1e2dbSAlex Elder 	int ret;
2455b8b1e2dbSAlex Elder 	void *p;
2456b8b1e2dbSAlex Elder 	void *end;
2457b8b1e2dbSAlex Elder 	char *snap_name;
2458b8b1e2dbSAlex Elder 
2459b8b1e2dbSAlex Elder 	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
2460b8b1e2dbSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
2461b8b1e2dbSAlex Elder 	if (!reply_buf)
2462b8b1e2dbSAlex Elder 		return ERR_PTR(-ENOMEM);
2463b8b1e2dbSAlex Elder 
2464b8b1e2dbSAlex Elder 	snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
2465b8b1e2dbSAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2466b8b1e2dbSAlex Elder 				"rbd", "get_snapshot_name",
2467b8b1e2dbSAlex Elder 				(char *) &snap_id, sizeof (snap_id),
2468b8b1e2dbSAlex Elder 				reply_buf, size,
2469b8b1e2dbSAlex Elder 				CEPH_OSD_FLAG_READ, NULL);
2470b8b1e2dbSAlex Elder 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2471b8b1e2dbSAlex Elder 	if (ret < 0)
2472b8b1e2dbSAlex Elder 		goto out;
2473b8b1e2dbSAlex Elder 
2474b8b1e2dbSAlex Elder 	p = reply_buf;
2475b8b1e2dbSAlex Elder 	end = (char *) reply_buf + size;
2476e5c35534SAlex Elder 	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
2477b8b1e2dbSAlex Elder 	if (IS_ERR(snap_name)) {
2478b8b1e2dbSAlex Elder 		ret = PTR_ERR(snap_name);
2479b8b1e2dbSAlex Elder 		goto out;
2480b8b1e2dbSAlex Elder 	} else {
2481b8b1e2dbSAlex Elder 		dout("  snap_id 0x%016llx snap_name = %s\n",
2482b8b1e2dbSAlex Elder 			(unsigned long long) le64_to_cpu(snap_id), snap_name);
2483b8b1e2dbSAlex Elder 	}
2484b8b1e2dbSAlex Elder 	kfree(reply_buf);
2485b8b1e2dbSAlex Elder 
2486b8b1e2dbSAlex Elder 	return snap_name;
2487b8b1e2dbSAlex Elder out:
2488b8b1e2dbSAlex Elder 	kfree(reply_buf);
2489b8b1e2dbSAlex Elder 
2490b8b1e2dbSAlex Elder 	return ERR_PTR(ret);
2491b8b1e2dbSAlex Elder }
2492b8b1e2dbSAlex Elder 
2493b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
2494b8b1e2dbSAlex Elder 		u64 *snap_size, u64 *snap_features)
2495b8b1e2dbSAlex Elder {
2496b8b1e2dbSAlex Elder 	__le64 snap_id;
2497b8b1e2dbSAlex Elder 	u8 order;
2498b8b1e2dbSAlex Elder 	int ret;
2499b8b1e2dbSAlex Elder 
2500b8b1e2dbSAlex Elder 	snap_id = rbd_dev->header.snapc->snaps[which];
2501b8b1e2dbSAlex Elder 	ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
2502b8b1e2dbSAlex Elder 	if (ret)
2503b8b1e2dbSAlex Elder 		return ERR_PTR(ret);
2504b8b1e2dbSAlex Elder 	ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
2505b8b1e2dbSAlex Elder 	if (ret)
2506b8b1e2dbSAlex Elder 		return ERR_PTR(ret);
2507b8b1e2dbSAlex Elder 
2508b8b1e2dbSAlex Elder 	return rbd_dev_v2_snap_name(rbd_dev, which);
2509b8b1e2dbSAlex Elder }
2510b8b1e2dbSAlex Elder 
2511b8b1e2dbSAlex Elder static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
2512b8b1e2dbSAlex Elder 		u64 *snap_size, u64 *snap_features)
2513b8b1e2dbSAlex Elder {
2514b8b1e2dbSAlex Elder 	if (rbd_dev->image_format == 1)
2515b8b1e2dbSAlex Elder 		return rbd_dev_v1_snap_info(rbd_dev, which,
2516b8b1e2dbSAlex Elder 					snap_size, snap_features);
2517b8b1e2dbSAlex Elder 	if (rbd_dev->image_format == 2)
2518b8b1e2dbSAlex Elder 		return rbd_dev_v2_snap_info(rbd_dev, which,
2519b8b1e2dbSAlex Elder 					snap_size, snap_features);
2520b8b1e2dbSAlex Elder 	return ERR_PTR(-EINVAL);
2521b8b1e2dbSAlex Elder }
2522b8b1e2dbSAlex Elder 
2523117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
2524117973fbSAlex Elder {
2525117973fbSAlex Elder 	int ret;
2526117973fbSAlex Elder 	__u8 obj_order;
2527117973fbSAlex Elder 
2528117973fbSAlex Elder 	down_write(&rbd_dev->header_rwsem);
2529117973fbSAlex Elder 
2530117973fbSAlex Elder 	/* Grab old order first, to see if it changes */
2531117973fbSAlex Elder 
2532117973fbSAlex Elder 	obj_order = rbd_dev->header.obj_order,
2533117973fbSAlex Elder 	ret = rbd_dev_v2_image_size(rbd_dev);
2534117973fbSAlex Elder 	if (ret)
2535117973fbSAlex Elder 		goto out;
2536117973fbSAlex Elder 	if (rbd_dev->header.obj_order != obj_order) {
2537117973fbSAlex Elder 		ret = -EIO;
2538117973fbSAlex Elder 		goto out;
2539117973fbSAlex Elder 	}
2540117973fbSAlex Elder 	rbd_update_mapping_size(rbd_dev);
2541117973fbSAlex Elder 
2542117973fbSAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev, hver);
2543117973fbSAlex Elder 	dout("rbd_dev_v2_snap_context returned %d\n", ret);
2544117973fbSAlex Elder 	if (ret)
2545117973fbSAlex Elder 		goto out;
2546117973fbSAlex Elder 	ret = rbd_dev_snaps_update(rbd_dev);
2547117973fbSAlex Elder 	dout("rbd_dev_snaps_update returned %d\n", ret);
2548117973fbSAlex Elder 	if (ret)
2549117973fbSAlex Elder 		goto out;
2550117973fbSAlex Elder 	ret = rbd_dev_snaps_register(rbd_dev);
2551117973fbSAlex Elder 	dout("rbd_dev_snaps_register returned %d\n", ret);
2552117973fbSAlex Elder out:
2553117973fbSAlex Elder 	up_write(&rbd_dev->header_rwsem);
2554117973fbSAlex Elder 
2555117973fbSAlex Elder 	return ret;
2556117973fbSAlex Elder }
2557117973fbSAlex Elder 
25589d475de5SAlex Elder /*
255935938150SAlex Elder  * Scan the rbd device's current snapshot list and compare it to the
256035938150SAlex Elder  * newly-received snapshot context.  Remove any existing snapshots
256135938150SAlex Elder  * not present in the new snapshot context.  Add a new snapshot for
256235938150SAlex Elder  * any snaphots in the snapshot context not in the current list.
256335938150SAlex Elder  * And verify there are no changes to snapshots we already know
256435938150SAlex Elder  * about.
256535938150SAlex Elder  *
256635938150SAlex Elder  * Assumes the snapshots in the snapshot context are sorted by
256735938150SAlex Elder  * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
256835938150SAlex Elder  * are also maintained in that order.)
2569dfc5606dSYehuda Sadeh  */
2570304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
2571dfc5606dSYehuda Sadeh {
257235938150SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
257335938150SAlex Elder 	const u32 snap_count = snapc->num_snaps;
257435938150SAlex Elder 	struct list_head *head = &rbd_dev->snaps;
257535938150SAlex Elder 	struct list_head *links = head->next;
257635938150SAlex Elder 	u32 index = 0;
2577dfc5606dSYehuda Sadeh 
25789fcbb800SAlex Elder 	dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
257935938150SAlex Elder 	while (index < snap_count || links != head) {
258035938150SAlex Elder 		u64 snap_id;
258135938150SAlex Elder 		struct rbd_snap *snap;
2582cd892126SAlex Elder 		char *snap_name;
2583cd892126SAlex Elder 		u64 snap_size = 0;
2584cd892126SAlex Elder 		u64 snap_features = 0;
2585dfc5606dSYehuda Sadeh 
258635938150SAlex Elder 		snap_id = index < snap_count ? snapc->snaps[index]
258735938150SAlex Elder 					     : CEPH_NOSNAP;
258835938150SAlex Elder 		snap = links != head ? list_entry(links, struct rbd_snap, node)
258935938150SAlex Elder 				     : NULL;
2590aafb230eSAlex Elder 		rbd_assert(!snap || snap->id != CEPH_NOSNAP);
2591dfc5606dSYehuda Sadeh 
259235938150SAlex Elder 		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
259335938150SAlex Elder 			struct list_head *next = links->next;
2594dfc5606dSYehuda Sadeh 
259535938150SAlex Elder 			/* Existing snapshot not in the new snap context */
2596dfc5606dSYehuda Sadeh 
25970d7dbfceSAlex Elder 			if (rbd_dev->spec->snap_id == snap->id)
2598daba5fdbSAlex Elder 				rbd_dev->exists = false;
259941f38c2bSAlex Elder 			rbd_remove_snap_dev(snap);
26009fcbb800SAlex Elder 			dout("%ssnap id %llu has been removed\n",
26010d7dbfceSAlex Elder 				rbd_dev->spec->snap_id == snap->id ?
26020d7dbfceSAlex Elder 							"mapped " : "",
26039fcbb800SAlex Elder 				(unsigned long long) snap->id);
2604dfc5606dSYehuda Sadeh 
260535938150SAlex Elder 			/* Done with this list entry; advance */
260635938150SAlex Elder 
260735938150SAlex Elder 			links = next;
260835938150SAlex Elder 			continue;
2609dfc5606dSYehuda Sadeh 		}
261035938150SAlex Elder 
2611b8b1e2dbSAlex Elder 		snap_name = rbd_dev_snap_info(rbd_dev, index,
2612cd892126SAlex Elder 					&snap_size, &snap_features);
2613cd892126SAlex Elder 		if (IS_ERR(snap_name))
2614cd892126SAlex Elder 			return PTR_ERR(snap_name);
2615cd892126SAlex Elder 
26169fcbb800SAlex Elder 		dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
26179fcbb800SAlex Elder 			(unsigned long long) snap_id);
261835938150SAlex Elder 		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
261935938150SAlex Elder 			struct rbd_snap *new_snap;
262035938150SAlex Elder 
262135938150SAlex Elder 			/* We haven't seen this snapshot before */
262235938150SAlex Elder 
2623c8d18425SAlex Elder 			new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
2624cd892126SAlex Elder 					snap_id, snap_size, snap_features);
26259fcbb800SAlex Elder 			if (IS_ERR(new_snap)) {
26269fcbb800SAlex Elder 				int err = PTR_ERR(new_snap);
26279fcbb800SAlex Elder 
26289fcbb800SAlex Elder 				dout("  failed to add dev, error %d\n", err);
26299fcbb800SAlex Elder 
26309fcbb800SAlex Elder 				return err;
26319fcbb800SAlex Elder 			}
263235938150SAlex Elder 
263335938150SAlex Elder 			/* New goes before existing, or at end of list */
263435938150SAlex Elder 
26359fcbb800SAlex Elder 			dout("  added dev%s\n", snap ? "" : " at end\n");
263635938150SAlex Elder 			if (snap)
263735938150SAlex Elder 				list_add_tail(&new_snap->node, &snap->node);
263835938150SAlex Elder 			else
2639523f3258SAlex Elder 				list_add_tail(&new_snap->node, head);
264035938150SAlex Elder 		} else {
264135938150SAlex Elder 			/* Already have this one */
264235938150SAlex Elder 
26439fcbb800SAlex Elder 			dout("  already present\n");
26449fcbb800SAlex Elder 
2645cd892126SAlex Elder 			rbd_assert(snap->size == snap_size);
2646aafb230eSAlex Elder 			rbd_assert(!strcmp(snap->name, snap_name));
2647cd892126SAlex Elder 			rbd_assert(snap->features == snap_features);
264835938150SAlex Elder 
264935938150SAlex Elder 			/* Done with this list entry; advance */
265035938150SAlex Elder 
265135938150SAlex Elder 			links = links->next;
2652dfc5606dSYehuda Sadeh 		}
265335938150SAlex Elder 
265435938150SAlex Elder 		/* Advance to the next entry in the snapshot context */
265535938150SAlex Elder 
265635938150SAlex Elder 		index++;
2657dfc5606dSYehuda Sadeh 	}
26589fcbb800SAlex Elder 	dout("%s: done\n", __func__);
2659dfc5606dSYehuda Sadeh 
2660dfc5606dSYehuda Sadeh 	return 0;
2661dfc5606dSYehuda Sadeh }
2662dfc5606dSYehuda Sadeh 
2663304f6808SAlex Elder /*
2664304f6808SAlex Elder  * Scan the list of snapshots and register the devices for any that
2665304f6808SAlex Elder  * have not already been registered.
2666304f6808SAlex Elder  */
2667304f6808SAlex Elder static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
2668304f6808SAlex Elder {
2669304f6808SAlex Elder 	struct rbd_snap *snap;
2670304f6808SAlex Elder 	int ret = 0;
2671304f6808SAlex Elder 
2672304f6808SAlex Elder 	dout("%s called\n", __func__);
267386ff77bbSAlex Elder 	if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
267486ff77bbSAlex Elder 		return -EIO;
2675304f6808SAlex Elder 
2676304f6808SAlex Elder 	list_for_each_entry(snap, &rbd_dev->snaps, node) {
2677304f6808SAlex Elder 		if (!rbd_snap_registered(snap)) {
2678304f6808SAlex Elder 			ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2679304f6808SAlex Elder 			if (ret < 0)
2680304f6808SAlex Elder 				break;
2681304f6808SAlex Elder 		}
2682304f6808SAlex Elder 	}
2683304f6808SAlex Elder 	dout("%s: returning %d\n", __func__, ret);
2684304f6808SAlex Elder 
2685304f6808SAlex Elder 	return ret;
2686304f6808SAlex Elder }
2687304f6808SAlex Elder 
2688dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2689dfc5606dSYehuda Sadeh {
2690dfc5606dSYehuda Sadeh 	struct device *dev;
2691cd789ab9SAlex Elder 	int ret;
2692dfc5606dSYehuda Sadeh 
2693dfc5606dSYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2694dfc5606dSYehuda Sadeh 
2695cd789ab9SAlex Elder 	dev = &rbd_dev->dev;
2696dfc5606dSYehuda Sadeh 	dev->bus = &rbd_bus_type;
2697dfc5606dSYehuda Sadeh 	dev->type = &rbd_device_type;
2698dfc5606dSYehuda Sadeh 	dev->parent = &rbd_root_dev;
2699dfc5606dSYehuda Sadeh 	dev->release = rbd_dev_release;
2700de71a297SAlex Elder 	dev_set_name(dev, "%d", rbd_dev->dev_id);
2701dfc5606dSYehuda Sadeh 	ret = device_register(dev);
2702dfc5606dSYehuda Sadeh 
2703dfc5606dSYehuda Sadeh 	mutex_unlock(&ctl_mutex);
2704cd789ab9SAlex Elder 
2705dfc5606dSYehuda Sadeh 	return ret;
2706602adf40SYehuda Sadeh }
2707602adf40SYehuda Sadeh 
2708dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
2709dfc5606dSYehuda Sadeh {
2710dfc5606dSYehuda Sadeh 	device_unregister(&rbd_dev->dev);
2711dfc5606dSYehuda Sadeh }
2712dfc5606dSYehuda Sadeh 
271359c2be1eSYehuda Sadeh static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
271459c2be1eSYehuda Sadeh {
271559c2be1eSYehuda Sadeh 	int ret, rc;
271659c2be1eSYehuda Sadeh 
271759c2be1eSYehuda Sadeh 	do {
27180e6f322dSAlex Elder 		ret = rbd_req_sync_watch(rbd_dev);
271959c2be1eSYehuda Sadeh 		if (ret == -ERANGE) {
2720117973fbSAlex Elder 			rc = rbd_dev_refresh(rbd_dev, NULL);
272159c2be1eSYehuda Sadeh 			if (rc < 0)
272259c2be1eSYehuda Sadeh 				return rc;
272359c2be1eSYehuda Sadeh 		}
272459c2be1eSYehuda Sadeh 	} while (ret == -ERANGE);
272559c2be1eSYehuda Sadeh 
272659c2be1eSYehuda Sadeh 	return ret;
272759c2be1eSYehuda Sadeh }
272859c2be1eSYehuda Sadeh 
2729e2839308SAlex Elder static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
27301ddbe94eSAlex Elder 
27311ddbe94eSAlex Elder /*
2732499afd5bSAlex Elder  * Get a unique rbd identifier for the given new rbd_dev, and add
2733499afd5bSAlex Elder  * the rbd_dev to the global list.  The minimum rbd id is 1.
27341ddbe94eSAlex Elder  */
2735e2839308SAlex Elder static void rbd_dev_id_get(struct rbd_device *rbd_dev)
2736b7f23c36SAlex Elder {
2737e2839308SAlex Elder 	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
2738499afd5bSAlex Elder 
2739499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
2740499afd5bSAlex Elder 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
2741499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
2742e2839308SAlex Elder 	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
2743e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
2744b7f23c36SAlex Elder }
2745b7f23c36SAlex Elder 
27461ddbe94eSAlex Elder /*
2747499afd5bSAlex Elder  * Remove an rbd_dev from the global list, and record that its
2748499afd5bSAlex Elder  * identifier is no longer in use.
27491ddbe94eSAlex Elder  */
2750e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev)
27511ddbe94eSAlex Elder {
2752d184f6bfSAlex Elder 	struct list_head *tmp;
2753de71a297SAlex Elder 	int rbd_id = rbd_dev->dev_id;
2754d184f6bfSAlex Elder 	int max_id;
2755d184f6bfSAlex Elder 
2756aafb230eSAlex Elder 	rbd_assert(rbd_id > 0);
2757499afd5bSAlex Elder 
2758e2839308SAlex Elder 	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
2759e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
2760499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
2761499afd5bSAlex Elder 	list_del_init(&rbd_dev->node);
2762d184f6bfSAlex Elder 
2763d184f6bfSAlex Elder 	/*
2764d184f6bfSAlex Elder 	 * If the id being "put" is not the current maximum, there
2765d184f6bfSAlex Elder 	 * is nothing special we need to do.
2766d184f6bfSAlex Elder 	 */
2767e2839308SAlex Elder 	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
2768d184f6bfSAlex Elder 		spin_unlock(&rbd_dev_list_lock);
2769d184f6bfSAlex Elder 		return;
2770d184f6bfSAlex Elder 	}
2771d184f6bfSAlex Elder 
2772d184f6bfSAlex Elder 	/*
2773d184f6bfSAlex Elder 	 * We need to update the current maximum id.  Search the
2774d184f6bfSAlex Elder 	 * list to find out what it is.  We're more likely to find
2775d184f6bfSAlex Elder 	 * the maximum at the end, so search the list backward.
2776d184f6bfSAlex Elder 	 */
2777d184f6bfSAlex Elder 	max_id = 0;
2778d184f6bfSAlex Elder 	list_for_each_prev(tmp, &rbd_dev_list) {
2779d184f6bfSAlex Elder 		struct rbd_device *rbd_dev;
2780d184f6bfSAlex Elder 
2781d184f6bfSAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
2782b213e0b1SAlex Elder 		if (rbd_dev->dev_id > max_id)
2783b213e0b1SAlex Elder 			max_id = rbd_dev->dev_id;
2784d184f6bfSAlex Elder 	}
2785499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
27861ddbe94eSAlex Elder 
27871ddbe94eSAlex Elder 	/*
2788e2839308SAlex Elder 	 * The max id could have been updated by rbd_dev_id_get(), in
2789d184f6bfSAlex Elder 	 * which case it now accurately reflects the new maximum.
2790d184f6bfSAlex Elder 	 * Be careful not to overwrite the maximum value in that
2791d184f6bfSAlex Elder 	 * case.
27921ddbe94eSAlex Elder 	 */
2793e2839308SAlex Elder 	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
2794e2839308SAlex Elder 	dout("  max dev id has been reset\n");
2795b7f23c36SAlex Elder }
2796b7f23c36SAlex Elder 
/*
 * Advance *buf past any leading white space so that it points at the
 * first non-space character (or at the terminating '\0' if there is
 * none), and return the length of the token found there, i.e. the
 * run of non-white-space characters starting at the new *buf.
 *
 * The string at *buf must be '\0'-terminated.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the "C"
	 * and "POSIX" locales.
	 */
	static const char whitespace[] = " \f\n\r\t\v";
	const char *start = *buf + strspn(*buf, whitespace);

	*buf = start;			/* Start of token */

	return strcspn(start, whitespace);	/* Token length */
}
2815e28fff26SAlex Elder 
/*
 * Locate the next token in *buf and, provided it fits (with its
 * terminating '\0') in the token buffer of token_size bytes, copy it
 * there.  A copied token is always '\0'-terminated.  The string at
 * *buf must be '\0'-terminated on entry.
 *
 * Returns the token's length, not counting the '\0'.  A return of 0
 * means no token was found; a return >= token_size means the token
 * did not fit and the token buffer was left untouched.
 *
 * In every case *buf is left pointing just past the token that was
 * found, including when the token was too long to copy.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t token_len = next_token(buf);
	const char *start = *buf;

	if (token_len < token_size) {
		memcpy(token, start, token_len);
		token[token_len] = '\0';
	}
	*buf = start + token_len;

	return token_len;
}
2845e28fff26SAlex Elder 
2846e28fff26SAlex Elder /*
2847ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
2848ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
2849ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
2850ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
2851ea3352f4SAlex Elder  *
2852ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
2853ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
2854ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
2855ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
2856ea3352f4SAlex Elder  *
2857ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
2858ea3352f4SAlex Elder  * the end of the found token.
2859ea3352f4SAlex Elder  *
2860ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
2861ea3352f4SAlex Elder  */
2862ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
2863ea3352f4SAlex Elder {
2864ea3352f4SAlex Elder 	char *dup;
2865ea3352f4SAlex Elder 	size_t len;
2866ea3352f4SAlex Elder 
2867ea3352f4SAlex Elder 	len = next_token(buf);
2868ea3352f4SAlex Elder 	dup = kmalloc(len + 1, GFP_KERNEL);
2869ea3352f4SAlex Elder 	if (!dup)
2870ea3352f4SAlex Elder 		return NULL;
2871ea3352f4SAlex Elder 
2872ea3352f4SAlex Elder 	memcpy(dup, *buf, len);
2873ea3352f4SAlex Elder 	*(dup + len) = '\0';
2874ea3352f4SAlex Elder 	*buf += len;
2875ea3352f4SAlex Elder 
2876ea3352f4SAlex Elder 	if (lenp)
2877ea3352f4SAlex Elder 		*lenp = len;
2878ea3352f4SAlex Elder 
2879ea3352f4SAlex Elder 	return dup;
2880ea3352f4SAlex Elder }
2881ea3352f4SAlex Elder 
2882ea3352f4SAlex Elder /*
2883859c31dfSAlex Elder  * Parse the options provided for an "rbd add" (i.e., rbd image
2884859c31dfSAlex Elder  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
2885859c31dfSAlex Elder  * and the data written is passed here via a NUL-terminated buffer.
2886859c31dfSAlex Elder  * Returns 0 if successful or an error code otherwise.
2887d22f76e7SAlex Elder  *
2888859c31dfSAlex Elder  * The information extracted from these options is recorded in
2889859c31dfSAlex Elder  * the other parameters which return dynamically-allocated
2890859c31dfSAlex Elder  * structures:
2891859c31dfSAlex Elder  *  ceph_opts
2892859c31dfSAlex Elder  *      The address of a pointer that will refer to a ceph options
2893859c31dfSAlex Elder  *      structure.  Caller must release the returned pointer using
2894859c31dfSAlex Elder  *      ceph_destroy_options() when it is no longer needed.
2895859c31dfSAlex Elder  *  rbd_opts
2896859c31dfSAlex Elder  *	Address of an rbd options pointer.  Fully initialized by
2897859c31dfSAlex Elder  *	this function; caller must release with kfree().
2898859c31dfSAlex Elder  *  spec
2899859c31dfSAlex Elder  *	Address of an rbd image specification pointer.  Fully
2900859c31dfSAlex Elder  *	initialized by this function based on parsed options.
2901859c31dfSAlex Elder  *	Caller must release with rbd_spec_put().
2902859c31dfSAlex Elder  *
2903859c31dfSAlex Elder  * The options passed take this form:
2904859c31dfSAlex Elder  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
2905859c31dfSAlex Elder  * where:
2906859c31dfSAlex Elder  *  <mon_addrs>
2907859c31dfSAlex Elder  *      A comma-separated list of one or more monitor addresses.
2908859c31dfSAlex Elder  *      A monitor address is an ip address, optionally followed
2909859c31dfSAlex Elder  *      by a port number (separated by a colon).
2910859c31dfSAlex Elder  *        I.e.:  ip1[:port1][,ip2[:port2]...]
2911859c31dfSAlex Elder  *  <options>
2912859c31dfSAlex Elder  *      A comma-separated list of ceph and/or rbd options.
2913859c31dfSAlex Elder  *  <pool_name>
2914859c31dfSAlex Elder  *      The name of the rados pool containing the rbd image.
2915859c31dfSAlex Elder  *  <image_name>
2916859c31dfSAlex Elder  *      The name of the image in that pool to map.
2917859c31dfSAlex Elder  *  <snap_id>
2918859c31dfSAlex Elder  *      An optional snapshot id.  If provided, the mapping will
2919859c31dfSAlex Elder  *      present data from the image at the time that snapshot was
2920859c31dfSAlex Elder  *      created.  The image head is used if no snapshot id is
2921859c31dfSAlex Elder  *      provided.  Snapshot mappings are always read-only.
2922a725f65eSAlex Elder  */
2923859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf,
2924dc79b113SAlex Elder 				struct ceph_options **ceph_opts,
2925859c31dfSAlex Elder 				struct rbd_options **opts,
2926859c31dfSAlex Elder 				struct rbd_spec **rbd_spec)
2927a725f65eSAlex Elder {
2928e28fff26SAlex Elder 	size_t len;
2929859c31dfSAlex Elder 	char *options;
29300ddebc0cSAlex Elder 	const char *mon_addrs;
29310ddebc0cSAlex Elder 	size_t mon_addrs_size;
2932859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
29334e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
2934859c31dfSAlex Elder 	struct ceph_options *copts;
2935dc79b113SAlex Elder 	int ret;
2936e28fff26SAlex Elder 
2937e28fff26SAlex Elder 	/* The first four tokens are required */
2938e28fff26SAlex Elder 
29397ef3214aSAlex Elder 	len = next_token(&buf);
29407ef3214aSAlex Elder 	if (!len)
2941dc79b113SAlex Elder 		return -EINVAL;	/* Missing monitor address(es) */
29420ddebc0cSAlex Elder 	mon_addrs = buf;
2943f28e565aSAlex Elder 	mon_addrs_size = len + 1;
29447ef3214aSAlex Elder 	buf += len;
2945a725f65eSAlex Elder 
2946dc79b113SAlex Elder 	ret = -EINVAL;
2947f28e565aSAlex Elder 	options = dup_token(&buf, NULL);
2948f28e565aSAlex Elder 	if (!options)
2949dc79b113SAlex Elder 		return -ENOMEM;
2950f28e565aSAlex Elder 	if (!*options)
2951f28e565aSAlex Elder 		goto out_err;	/* Missing options */
2952a725f65eSAlex Elder 
2953859c31dfSAlex Elder 	spec = rbd_spec_alloc();
2954859c31dfSAlex Elder 	if (!spec)
2955f28e565aSAlex Elder 		goto out_mem;
2956859c31dfSAlex Elder 
2957859c31dfSAlex Elder 	spec->pool_name = dup_token(&buf, NULL);
2958859c31dfSAlex Elder 	if (!spec->pool_name)
2959859c31dfSAlex Elder 		goto out_mem;
2960859c31dfSAlex Elder 	if (!*spec->pool_name)
2961f28e565aSAlex Elder 		goto out_err;	/* Missing pool name */
2962e28fff26SAlex Elder 
2963859c31dfSAlex Elder 	spec->image_name = dup_token(&buf, &spec->image_name_len);
2964859c31dfSAlex Elder 	if (!spec->image_name)
2965f28e565aSAlex Elder 		goto out_mem;
2966859c31dfSAlex Elder 	if (!*spec->image_name)
2967f28e565aSAlex Elder 		goto out_err;	/* Missing image name */
2968e28fff26SAlex Elder 
2969f28e565aSAlex Elder 	/*
2970f28e565aSAlex Elder 	 * Snapshot name is optional; default is to use "-"
2971f28e565aSAlex Elder 	 * (indicating the head/no snapshot).
2972f28e565aSAlex Elder 	 */
29733feeb894SAlex Elder 	len = next_token(&buf);
2974820a5f3eSAlex Elder 	if (!len) {
29753feeb894SAlex Elder 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
29763feeb894SAlex Elder 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
2977f28e565aSAlex Elder 	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
2978dc79b113SAlex Elder 		ret = -ENAMETOOLONG;
2979f28e565aSAlex Elder 		goto out_err;
2980849b4260SAlex Elder 	}
2981859c31dfSAlex Elder 	spec->snap_name = kmalloc(len + 1, GFP_KERNEL);
2982859c31dfSAlex Elder 	if (!spec->snap_name)
2983f28e565aSAlex Elder 		goto out_mem;
2984859c31dfSAlex Elder 	memcpy(spec->snap_name, buf, len);
2985859c31dfSAlex Elder 	*(spec->snap_name + len) = '\0';
2986e5c35534SAlex Elder 
29870ddebc0cSAlex Elder 	/* Initialize all rbd options to the defaults */
2988e28fff26SAlex Elder 
29894e9afebaSAlex Elder 	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
29904e9afebaSAlex Elder 	if (!rbd_opts)
29914e9afebaSAlex Elder 		goto out_mem;
29924e9afebaSAlex Elder 
29934e9afebaSAlex Elder 	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
2994d22f76e7SAlex Elder 
2995859c31dfSAlex Elder 	copts = ceph_parse_options(options, mon_addrs,
29960ddebc0cSAlex Elder 					mon_addrs + mon_addrs_size - 1,
29974e9afebaSAlex Elder 					parse_rbd_opts_token, rbd_opts);
2998859c31dfSAlex Elder 	if (IS_ERR(copts)) {
2999859c31dfSAlex Elder 		ret = PTR_ERR(copts);
3000dc79b113SAlex Elder 		goto out_err;
3001dc79b113SAlex Elder 	}
3002859c31dfSAlex Elder 	kfree(options);
3003859c31dfSAlex Elder 
3004859c31dfSAlex Elder 	*ceph_opts = copts;
30054e9afebaSAlex Elder 	*opts = rbd_opts;
3006859c31dfSAlex Elder 	*rbd_spec = spec;
30070ddebc0cSAlex Elder 
3008dc79b113SAlex Elder 	return 0;
3009f28e565aSAlex Elder out_mem:
3010dc79b113SAlex Elder 	ret = -ENOMEM;
3011d22f76e7SAlex Elder out_err:
3012859c31dfSAlex Elder 	kfree(rbd_opts);
3013859c31dfSAlex Elder 	rbd_spec_put(spec);
3014f28e565aSAlex Elder 	kfree(options);
3015d22f76e7SAlex Elder 
3016dc79b113SAlex Elder 	return ret;
3017a725f65eSAlex Elder }
3018a725f65eSAlex Elder 
3019589d30e0SAlex Elder /*
3020589d30e0SAlex Elder  * An rbd format 2 image has a unique identifier, distinct from the
3021589d30e0SAlex Elder  * name given to it by the user.  Internally, that identifier is
3022589d30e0SAlex Elder  * what's used to specify the names of objects related to the image.
3023589d30e0SAlex Elder  *
3024589d30e0SAlex Elder  * A special "rbd id" object is used to map an rbd image name to its
3025589d30e0SAlex Elder  * id.  If that object doesn't exist, then there is no v2 rbd image
3026589d30e0SAlex Elder  * with the supplied name.
3027589d30e0SAlex Elder  *
3028589d30e0SAlex Elder  * This function will record the given rbd_dev's image_id field if
3029589d30e0SAlex Elder  * it can be determined, and in that case will return 0.  If any
3030589d30e0SAlex Elder  * errors occur a negative errno will be returned and the rbd_dev's
3031589d30e0SAlex Elder  * image_id field will be unchanged (and should be NULL).
3032589d30e0SAlex Elder  */
3033589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev)
3034589d30e0SAlex Elder {
3035589d30e0SAlex Elder 	int ret;
3036589d30e0SAlex Elder 	size_t size;
3037589d30e0SAlex Elder 	char *object_name;
3038589d30e0SAlex Elder 	void *response;
3039589d30e0SAlex Elder 	void *p;
3040589d30e0SAlex Elder 
3041589d30e0SAlex Elder 	/*
3042589d30e0SAlex Elder 	 * First, see if the format 2 image id file exists, and if
3043589d30e0SAlex Elder 	 * so, get the image's persistent id from it.
3044589d30e0SAlex Elder 	 */
30450d7dbfceSAlex Elder 	size = sizeof (RBD_ID_PREFIX) + rbd_dev->spec->image_name_len;
3046589d30e0SAlex Elder 	object_name = kmalloc(size, GFP_NOIO);
3047589d30e0SAlex Elder 	if (!object_name)
3048589d30e0SAlex Elder 		return -ENOMEM;
30490d7dbfceSAlex Elder 	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
3050589d30e0SAlex Elder 	dout("rbd id object name is %s\n", object_name);
3051589d30e0SAlex Elder 
3052589d30e0SAlex Elder 	/* Response will be an encoded string, which includes a length */
3053589d30e0SAlex Elder 
3054589d30e0SAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
3055589d30e0SAlex Elder 	response = kzalloc(size, GFP_NOIO);
3056589d30e0SAlex Elder 	if (!response) {
3057589d30e0SAlex Elder 		ret = -ENOMEM;
3058589d30e0SAlex Elder 		goto out;
3059589d30e0SAlex Elder 	}
3060589d30e0SAlex Elder 
3061589d30e0SAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, object_name,
3062589d30e0SAlex Elder 				"rbd", "get_id",
3063589d30e0SAlex Elder 				NULL, 0,
3064589d30e0SAlex Elder 				response, RBD_IMAGE_ID_LEN_MAX,
3065589d30e0SAlex Elder 				CEPH_OSD_FLAG_READ, NULL);
3066589d30e0SAlex Elder 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
3067589d30e0SAlex Elder 	if (ret < 0)
3068589d30e0SAlex Elder 		goto out;
3069a0ea3a40SAlex Elder 	ret = 0;    /* rbd_req_sync_exec() can return positive */
3070589d30e0SAlex Elder 
3071589d30e0SAlex Elder 	p = response;
30720d7dbfceSAlex Elder 	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
3073589d30e0SAlex Elder 						p + RBD_IMAGE_ID_LEN_MAX,
30740d7dbfceSAlex Elder 						&rbd_dev->spec->image_id_len,
3075589d30e0SAlex Elder 						GFP_NOIO);
30760d7dbfceSAlex Elder 	if (IS_ERR(rbd_dev->spec->image_id)) {
30770d7dbfceSAlex Elder 		ret = PTR_ERR(rbd_dev->spec->image_id);
30780d7dbfceSAlex Elder 		rbd_dev->spec->image_id = NULL;
3079589d30e0SAlex Elder 	} else {
30800d7dbfceSAlex Elder 		dout("image_id is %s\n", rbd_dev->spec->image_id);
3081589d30e0SAlex Elder 	}
3082589d30e0SAlex Elder out:
3083589d30e0SAlex Elder 	kfree(response);
3084589d30e0SAlex Elder 	kfree(object_name);
3085589d30e0SAlex Elder 
3086589d30e0SAlex Elder 	return ret;
3087589d30e0SAlex Elder }
3088589d30e0SAlex Elder 
3089a30b71b9SAlex Elder static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
3090a30b71b9SAlex Elder {
3091a30b71b9SAlex Elder 	int ret;
3092a30b71b9SAlex Elder 	size_t size;
3093a30b71b9SAlex Elder 
3094a30b71b9SAlex Elder 	/* Version 1 images have no id; empty string is used */
3095a30b71b9SAlex Elder 
30960d7dbfceSAlex Elder 	rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
30970d7dbfceSAlex Elder 	if (!rbd_dev->spec->image_id)
3098a30b71b9SAlex Elder 		return -ENOMEM;
30990d7dbfceSAlex Elder 	rbd_dev->spec->image_id_len = 0;
3100a30b71b9SAlex Elder 
3101a30b71b9SAlex Elder 	/* Record the header object name for this rbd image. */
3102a30b71b9SAlex Elder 
31030d7dbfceSAlex Elder 	size = rbd_dev->spec->image_name_len + sizeof (RBD_SUFFIX);
3104a30b71b9SAlex Elder 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3105a30b71b9SAlex Elder 	if (!rbd_dev->header_name) {
3106a30b71b9SAlex Elder 		ret = -ENOMEM;
3107a30b71b9SAlex Elder 		goto out_err;
3108a30b71b9SAlex Elder 	}
31090d7dbfceSAlex Elder 	sprintf(rbd_dev->header_name, "%s%s",
31100d7dbfceSAlex Elder 		rbd_dev->spec->image_name, RBD_SUFFIX);
3111a30b71b9SAlex Elder 
3112a30b71b9SAlex Elder 	/* Populate rbd image metadata */
3113a30b71b9SAlex Elder 
3114a30b71b9SAlex Elder 	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
3115a30b71b9SAlex Elder 	if (ret < 0)
3116a30b71b9SAlex Elder 		goto out_err;
3117a30b71b9SAlex Elder 	rbd_dev->image_format = 1;
3118a30b71b9SAlex Elder 
3119a30b71b9SAlex Elder 	dout("discovered version 1 image, header name is %s\n",
3120a30b71b9SAlex Elder 		rbd_dev->header_name);
3121a30b71b9SAlex Elder 
3122a30b71b9SAlex Elder 	return 0;
3123a30b71b9SAlex Elder 
3124a30b71b9SAlex Elder out_err:
3125a30b71b9SAlex Elder 	kfree(rbd_dev->header_name);
3126a30b71b9SAlex Elder 	rbd_dev->header_name = NULL;
31270d7dbfceSAlex Elder 	kfree(rbd_dev->spec->image_id);
31280d7dbfceSAlex Elder 	rbd_dev->spec->image_id = NULL;
3129a30b71b9SAlex Elder 
3130a30b71b9SAlex Elder 	return ret;
3131a30b71b9SAlex Elder }
3132a30b71b9SAlex Elder 
3133a30b71b9SAlex Elder static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
3134a30b71b9SAlex Elder {
3135a30b71b9SAlex Elder 	size_t size;
31369d475de5SAlex Elder 	int ret;
31376e14b1a6SAlex Elder 	u64 ver = 0;
3138a30b71b9SAlex Elder 
3139a30b71b9SAlex Elder 	/*
3140a30b71b9SAlex Elder 	 * Image id was filled in by the caller.  Record the header
3141a30b71b9SAlex Elder 	 * object name for this rbd image.
3142a30b71b9SAlex Elder 	 */
31430d7dbfceSAlex Elder 	size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->spec->image_id_len;
3144a30b71b9SAlex Elder 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3145a30b71b9SAlex Elder 	if (!rbd_dev->header_name)
3146a30b71b9SAlex Elder 		return -ENOMEM;
3147a30b71b9SAlex Elder 	sprintf(rbd_dev->header_name, "%s%s",
31480d7dbfceSAlex Elder 			RBD_HEADER_PREFIX, rbd_dev->spec->image_id);
31499d475de5SAlex Elder 
31509d475de5SAlex Elder 	/* Get the size and object order for the image */
31519d475de5SAlex Elder 
31529d475de5SAlex Elder 	ret = rbd_dev_v2_image_size(rbd_dev);
31539d475de5SAlex Elder 	if (ret < 0)
31549d475de5SAlex Elder 		goto out_err;
31551e130199SAlex Elder 
31561e130199SAlex Elder 	/* Get the object prefix (a.k.a. block_name) for the image */
31571e130199SAlex Elder 
31581e130199SAlex Elder 	ret = rbd_dev_v2_object_prefix(rbd_dev);
31591e130199SAlex Elder 	if (ret < 0)
31601e130199SAlex Elder 		goto out_err;
3161b1b5402aSAlex Elder 
3162d889140cSAlex Elder 	/* Get the and check features for the image */
3163b1b5402aSAlex Elder 
3164b1b5402aSAlex Elder 	ret = rbd_dev_v2_features(rbd_dev);
3165b1b5402aSAlex Elder 	if (ret < 0)
3166b1b5402aSAlex Elder 		goto out_err;
316735d489f9SAlex Elder 
31686e14b1a6SAlex Elder 	/* crypto and compression type aren't (yet) supported for v2 images */
316935d489f9SAlex Elder 
31706e14b1a6SAlex Elder 	rbd_dev->header.crypt_type = 0;
31716e14b1a6SAlex Elder 	rbd_dev->header.comp_type = 0;
31726e14b1a6SAlex Elder 
31736e14b1a6SAlex Elder 	/* Get the snapshot context, plus the header version */
31746e14b1a6SAlex Elder 
31756e14b1a6SAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
317635d489f9SAlex Elder 	if (ret)
317735d489f9SAlex Elder 		goto out_err;
31786e14b1a6SAlex Elder 	rbd_dev->header.obj_version = ver;
31796e14b1a6SAlex Elder 
3180a30b71b9SAlex Elder 	rbd_dev->image_format = 2;
3181a30b71b9SAlex Elder 
3182a30b71b9SAlex Elder 	dout("discovered version 2 image, header name is %s\n",
3183a30b71b9SAlex Elder 		rbd_dev->header_name);
3184a30b71b9SAlex Elder 
318535152979SAlex Elder 	return 0;
31869d475de5SAlex Elder out_err:
31879d475de5SAlex Elder 	kfree(rbd_dev->header_name);
31889d475de5SAlex Elder 	rbd_dev->header_name = NULL;
31891e130199SAlex Elder 	kfree(rbd_dev->header.object_prefix);
31901e130199SAlex Elder 	rbd_dev->header.object_prefix = NULL;
31919d475de5SAlex Elder 
31929d475de5SAlex Elder 	return ret;
3193a30b71b9SAlex Elder }
3194a30b71b9SAlex Elder 
/*
 * Probe for the existence of the header object for the given rbd
 * device.  For format 2 images this includes determining the image
 * id.
 *
 * Returns the result of whichever format-specific probe was run.
 */
static int rbd_dev_probe(struct rbd_device *rbd_dev)
{
	int ret;

	/*
	 * Get the id from the image id object.  If that succeeds the
	 * image is format 2.  A missing id object (ENOENT) means it
	 * is not a format 2 image; note that in fact any nonzero
	 * result from rbd_dev_image_id() leads to the format 1 probe.
	 */
	if (rbd_dev_image_id(rbd_dev) == 0)
		ret = rbd_dev_v2_probe(rbd_dev);
	else
		ret = rbd_dev_v1_probe(rbd_dev);

	if (ret)
		dout("probe failed, returning %d\n", ret);

	return ret;
}
3219a30b71b9SAlex Elder 
322059c2be1eSYehuda Sadeh static ssize_t rbd_add(struct bus_type *bus,
322159c2be1eSYehuda Sadeh 		       const char *buf,
322259c2be1eSYehuda Sadeh 		       size_t count)
3223602adf40SYehuda Sadeh {
3224cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
3225dc79b113SAlex Elder 	struct ceph_options *ceph_opts = NULL;
32264e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
3227859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
32289d3997fdSAlex Elder 	struct rbd_client *rbdc;
322927cc2594SAlex Elder 	struct ceph_osd_client *osdc;
323027cc2594SAlex Elder 	int rc = -ENOMEM;
3231602adf40SYehuda Sadeh 
3232602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
3233602adf40SYehuda Sadeh 		return -ENODEV;
3234602adf40SYehuda Sadeh 
3235cb8627c7SAlex Elder 	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
3236cb8627c7SAlex Elder 	if (!rbd_dev)
32374e9afebaSAlex Elder 		return -ENOMEM;
3238602adf40SYehuda Sadeh 
3239602adf40SYehuda Sadeh 	/* static rbd_device initialization */
3240602adf40SYehuda Sadeh 	spin_lock_init(&rbd_dev->lock);
3241602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbd_dev->node);
3242dfc5606dSYehuda Sadeh 	INIT_LIST_HEAD(&rbd_dev->snaps);
3243c666601aSJosh Durgin 	init_rwsem(&rbd_dev->header_rwsem);
3244602adf40SYehuda Sadeh 
3245a725f65eSAlex Elder 	/* parse add command */
3246859c31dfSAlex Elder 	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
3247dc79b113SAlex Elder 	if (rc < 0)
324885ae8926SAlex Elder 		goto err_out_mem;
3249859c31dfSAlex Elder 
32504e9afebaSAlex Elder 	rbd_dev->mapping.read_only = rbd_opts->read_only;
3251a725f65eSAlex Elder 
32529d3997fdSAlex Elder 	rbdc = rbd_get_client(ceph_opts);
32539d3997fdSAlex Elder 	if (IS_ERR(rbdc)) {
32549d3997fdSAlex Elder 		rc = PTR_ERR(rbdc);
32550ddebc0cSAlex Elder 		goto err_out_args;
32569d3997fdSAlex Elder 	}
32579d3997fdSAlex Elder 	rbd_dev->rbd_client = rbdc;
325878cea76eSAlex Elder 	ceph_opts = NULL;	/* ceph_opts now owned by rbd_dev client */
3259602adf40SYehuda Sadeh 
3260602adf40SYehuda Sadeh 	/* pick the pool */
32619d3997fdSAlex Elder 	osdc = &rbdc->client->osdc;
3262859c31dfSAlex Elder 	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
3263602adf40SYehuda Sadeh 	if (rc < 0)
3264602adf40SYehuda Sadeh 		goto err_out_client;
3265859c31dfSAlex Elder 	spec->pool_id = (u64) rc;
3266859c31dfSAlex Elder 
3267859c31dfSAlex Elder 	rbd_dev->spec = spec;
3268602adf40SYehuda Sadeh 
3269a30b71b9SAlex Elder 	rc = rbd_dev_probe(rbd_dev);
3270a30b71b9SAlex Elder 	if (rc < 0)
3271589d30e0SAlex Elder 		goto err_out_client;
327205fd6f6fSAlex Elder 
327305fd6f6fSAlex Elder 	/* no need to lock here, as rbd_dev is not registered yet */
327405fd6f6fSAlex Elder 	rc = rbd_dev_snaps_update(rbd_dev);
327505fd6f6fSAlex Elder 	if (rc)
327641f38c2bSAlex Elder 		goto err_out_probe;
327705fd6f6fSAlex Elder 
3278819d52bfSAlex Elder 	rc = rbd_dev_set_mapping(rbd_dev);
327905fd6f6fSAlex Elder 	if (rc)
328041f38c2bSAlex Elder 		goto err_out_snaps;
328105fd6f6fSAlex Elder 
328285ae8926SAlex Elder 	/* generate unique id: find highest unique id, add one */
328385ae8926SAlex Elder 	rbd_dev_id_get(rbd_dev);
328485ae8926SAlex Elder 
328585ae8926SAlex Elder 	/* Fill in the device name, now that we have its id. */
328685ae8926SAlex Elder 	BUILD_BUG_ON(DEV_NAME_LEN
328785ae8926SAlex Elder 			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
328885ae8926SAlex Elder 	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
328985ae8926SAlex Elder 
329085ae8926SAlex Elder 	/* Get our block major device number. */
329185ae8926SAlex Elder 
329227cc2594SAlex Elder 	rc = register_blkdev(0, rbd_dev->name);
329327cc2594SAlex Elder 	if (rc < 0)
329485ae8926SAlex Elder 		goto err_out_id;
329527cc2594SAlex Elder 	rbd_dev->major = rc;
3296602adf40SYehuda Sadeh 
32970f308a31SAlex Elder 	/* Set up the blkdev mapping. */
32980f308a31SAlex Elder 
32990f308a31SAlex Elder 	rc = rbd_init_disk(rbd_dev);
3300dfc5606dSYehuda Sadeh 	if (rc)
3301766fc439SYehuda Sadeh 		goto err_out_blkdev;
3302766fc439SYehuda Sadeh 
33030f308a31SAlex Elder 	rc = rbd_bus_add_dev(rbd_dev);
33040f308a31SAlex Elder 	if (rc)
33050f308a31SAlex Elder 		goto err_out_disk;
33060f308a31SAlex Elder 
330732eec68dSAlex Elder 	/*
330832eec68dSAlex Elder 	 * At this point cleanup in the event of an error is the job
330932eec68dSAlex Elder 	 * of the sysfs code (initiated by rbd_bus_del_dev()).
331032eec68dSAlex Elder 	 */
33112ac4e75dSAlex Elder 
33124bb1f1edSAlex Elder 	down_write(&rbd_dev->header_rwsem);
33135ed16177SAlex Elder 	rc = rbd_dev_snaps_register(rbd_dev);
33144bb1f1edSAlex Elder 	up_write(&rbd_dev->header_rwsem);
33152ac4e75dSAlex Elder 	if (rc)
33162ac4e75dSAlex Elder 		goto err_out_bus;
33172ac4e75dSAlex Elder 
331859c2be1eSYehuda Sadeh 	rc = rbd_init_watch_dev(rbd_dev);
331959c2be1eSYehuda Sadeh 	if (rc)
332059c2be1eSYehuda Sadeh 		goto err_out_bus;
332159c2be1eSYehuda Sadeh 
33224e9afebaSAlex Elder 	kfree(rbd_opts);
33234e9afebaSAlex Elder 
33243ee4001eSAlex Elder 	/* Everything's ready.  Announce the disk to the world. */
33253ee4001eSAlex Elder 
33263ee4001eSAlex Elder 	add_disk(rbd_dev->disk);
33273ee4001eSAlex Elder 
33283ee4001eSAlex Elder 	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
33293ee4001eSAlex Elder 		(unsigned long long) rbd_dev->mapping.size);
33303ee4001eSAlex Elder 
3331602adf40SYehuda Sadeh 	return count;
3332602adf40SYehuda Sadeh 
3333766fc439SYehuda Sadeh err_out_bus:
3334766fc439SYehuda Sadeh 	/* this will also clean up rest of rbd_dev stuff */
3335766fc439SYehuda Sadeh 
3336766fc439SYehuda Sadeh 	rbd_bus_del_dev(rbd_dev);
33374e9afebaSAlex Elder 	kfree(rbd_opts);
33384e9afebaSAlex Elder 
3339766fc439SYehuda Sadeh 	return rc;
3340766fc439SYehuda Sadeh 
33410f308a31SAlex Elder err_out_disk:
33420f308a31SAlex Elder 	rbd_free_disk(rbd_dev);
3343602adf40SYehuda Sadeh err_out_blkdev:
3344602adf40SYehuda Sadeh 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
334585ae8926SAlex Elder err_out_id:
334685ae8926SAlex Elder 	rbd_dev_id_put(rbd_dev);
334741f38c2bSAlex Elder err_out_snaps:
334841f38c2bSAlex Elder 	rbd_remove_all_snaps(rbd_dev);
334941f38c2bSAlex Elder err_out_probe:
335005fd6f6fSAlex Elder 	rbd_header_free(&rbd_dev->header);
3351602adf40SYehuda Sadeh err_out_client:
33523fcf2581SAlex Elder 	kfree(rbd_dev->header_name);
33539d3997fdSAlex Elder 	rbd_put_client(rbdc);
33540ddebc0cSAlex Elder err_out_args:
335578cea76eSAlex Elder 	if (ceph_opts)
335678cea76eSAlex Elder 		ceph_destroy_options(ceph_opts);
33574e9afebaSAlex Elder 	kfree(rbd_opts);
3358859c31dfSAlex Elder 	rbd_spec_put(spec);
335985ae8926SAlex Elder err_out_mem:
336027cc2594SAlex Elder 	kfree(rbd_dev);
336127cc2594SAlex Elder 
3362602adf40SYehuda Sadeh 	dout("Error adding device %s\n", buf);
3363602adf40SYehuda Sadeh 	module_put(THIS_MODULE);
336427cc2594SAlex Elder 
336527cc2594SAlex Elder 	return (ssize_t) rc;
3366602adf40SYehuda Sadeh }
3367602adf40SYehuda Sadeh 
3368de71a297SAlex Elder static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
3369602adf40SYehuda Sadeh {
3370602adf40SYehuda Sadeh 	struct list_head *tmp;
3371602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev;
3372602adf40SYehuda Sadeh 
3373e124a82fSAlex Elder 	spin_lock(&rbd_dev_list_lock);
3374602adf40SYehuda Sadeh 	list_for_each(tmp, &rbd_dev_list) {
3375602adf40SYehuda Sadeh 		rbd_dev = list_entry(tmp, struct rbd_device, node);
3376de71a297SAlex Elder 		if (rbd_dev->dev_id == dev_id) {
3377e124a82fSAlex Elder 			spin_unlock(&rbd_dev_list_lock);
3378602adf40SYehuda Sadeh 			return rbd_dev;
3379602adf40SYehuda Sadeh 		}
3380e124a82fSAlex Elder 	}
3381e124a82fSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
3382602adf40SYehuda Sadeh 	return NULL;
3383602adf40SYehuda Sadeh }
3384602adf40SYehuda Sadeh 
3385dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev)
3386602adf40SYehuda Sadeh {
3387593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3388602adf40SYehuda Sadeh 
33891dbb4399SAlex Elder 	if (rbd_dev->watch_request) {
33901dbb4399SAlex Elder 		struct ceph_client *client = rbd_dev->rbd_client->client;
33911dbb4399SAlex Elder 
33921dbb4399SAlex Elder 		ceph_osdc_unregister_linger_request(&client->osdc,
339359c2be1eSYehuda Sadeh 						    rbd_dev->watch_request);
33941dbb4399SAlex Elder 	}
339559c2be1eSYehuda Sadeh 	if (rbd_dev->watch_event)
3396070c633fSAlex Elder 		rbd_req_sync_unwatch(rbd_dev);
339759c2be1eSYehuda Sadeh 
33989d3997fdSAlex Elder 	rbd_put_client(rbd_dev->rbd_client);
3399602adf40SYehuda Sadeh 
3400602adf40SYehuda Sadeh 	/* clean up and free blkdev */
3401602adf40SYehuda Sadeh 	rbd_free_disk(rbd_dev);
3402602adf40SYehuda Sadeh 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
340332eec68dSAlex Elder 
34042ac4e75dSAlex Elder 	/* release allocated disk header fields */
34052ac4e75dSAlex Elder 	rbd_header_free(&rbd_dev->header);
34062ac4e75dSAlex Elder 
340732eec68dSAlex Elder 	/* done with the id, and with the rbd_dev */
34080bed54dcSAlex Elder 	kfree(rbd_dev->header_name);
3409e2839308SAlex Elder 	rbd_dev_id_put(rbd_dev);
34108b8fb99cSAlex Elder 	rbd_spec_put(rbd_dev->spec);
3411602adf40SYehuda Sadeh 	kfree(rbd_dev);
3412602adf40SYehuda Sadeh 
3413602adf40SYehuda Sadeh 	/* release module ref */
3414602adf40SYehuda Sadeh 	module_put(THIS_MODULE);
3415602adf40SYehuda Sadeh }
3416602adf40SYehuda Sadeh 
3417dfc5606dSYehuda Sadeh static ssize_t rbd_remove(struct bus_type *bus,
3418602adf40SYehuda Sadeh 			  const char *buf,
3419602adf40SYehuda Sadeh 			  size_t count)
3420602adf40SYehuda Sadeh {
3421602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
3422602adf40SYehuda Sadeh 	int target_id, rc;
3423602adf40SYehuda Sadeh 	unsigned long ul;
3424602adf40SYehuda Sadeh 	int ret = count;
3425602adf40SYehuda Sadeh 
3426602adf40SYehuda Sadeh 	rc = strict_strtoul(buf, 10, &ul);
3427602adf40SYehuda Sadeh 	if (rc)
3428602adf40SYehuda Sadeh 		return rc;
3429602adf40SYehuda Sadeh 
3430602adf40SYehuda Sadeh 	/* convert to int; abort if we lost anything in the conversion */
3431602adf40SYehuda Sadeh 	target_id = (int) ul;
3432602adf40SYehuda Sadeh 	if (target_id != ul)
3433602adf40SYehuda Sadeh 		return -EINVAL;
3434602adf40SYehuda Sadeh 
3435602adf40SYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3436602adf40SYehuda Sadeh 
3437602adf40SYehuda Sadeh 	rbd_dev = __rbd_get_dev(target_id);
3438602adf40SYehuda Sadeh 	if (!rbd_dev) {
3439602adf40SYehuda Sadeh 		ret = -ENOENT;
3440602adf40SYehuda Sadeh 		goto done;
3441602adf40SYehuda Sadeh 	}
3442602adf40SYehuda Sadeh 
344341f38c2bSAlex Elder 	rbd_remove_all_snaps(rbd_dev);
3444dfc5606dSYehuda Sadeh 	rbd_bus_del_dev(rbd_dev);
3445602adf40SYehuda Sadeh 
3446602adf40SYehuda Sadeh done:
3447602adf40SYehuda Sadeh 	mutex_unlock(&ctl_mutex);
3448aafb230eSAlex Elder 
3449602adf40SYehuda Sadeh 	return ret;
3450602adf40SYehuda Sadeh }
3451602adf40SYehuda Sadeh 
3452602adf40SYehuda Sadeh /*
3453602adf40SYehuda Sadeh  * create control files in sysfs
3454dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
3455602adf40SYehuda Sadeh  */
3456602adf40SYehuda Sadeh static int rbd_sysfs_init(void)
3457602adf40SYehuda Sadeh {
3458dfc5606dSYehuda Sadeh 	int ret;
3459602adf40SYehuda Sadeh 
3460fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
3461dfc5606dSYehuda Sadeh 	if (ret < 0)
3462dfc5606dSYehuda Sadeh 		return ret;
3463602adf40SYehuda Sadeh 
3464fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
3465fed4c143SAlex Elder 	if (ret < 0)
3466fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
3467602adf40SYehuda Sadeh 
3468602adf40SYehuda Sadeh 	return ret;
3469602adf40SYehuda Sadeh }
3470602adf40SYehuda Sadeh 
3471602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void)
3472602adf40SYehuda Sadeh {
3473dfc5606dSYehuda Sadeh 	bus_unregister(&rbd_bus_type);
3474fed4c143SAlex Elder 	device_unregister(&rbd_root_dev);
3475602adf40SYehuda Sadeh }
3476602adf40SYehuda Sadeh 
3477602adf40SYehuda Sadeh int __init rbd_init(void)
3478602adf40SYehuda Sadeh {
3479602adf40SYehuda Sadeh 	int rc;
3480602adf40SYehuda Sadeh 
3481602adf40SYehuda Sadeh 	rc = rbd_sysfs_init();
3482602adf40SYehuda Sadeh 	if (rc)
3483602adf40SYehuda Sadeh 		return rc;
3484f0f8cef5SAlex Elder 	pr_info("loaded " RBD_DRV_NAME_LONG "\n");
3485602adf40SYehuda Sadeh 	return 0;
3486602adf40SYehuda Sadeh }
3487602adf40SYehuda Sadeh 
3488602adf40SYehuda Sadeh void __exit rbd_exit(void)
3489602adf40SYehuda Sadeh {
3490602adf40SYehuda Sadeh 	rbd_sysfs_cleanup();
3491602adf40SYehuda Sadeh }
3492602adf40SYehuda Sadeh 
3493602adf40SYehuda Sadeh module_init(rbd_init);
3494602adf40SYehuda Sadeh module_exit(rbd_exit);
3495602adf40SYehuda Sadeh 
3496602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
3497602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
3498602adf40SYehuda Sadeh MODULE_DESCRIPTION("rados block device");
3499602adf40SYehuda Sadeh 
3500602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */
3501602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
3502602adf40SYehuda Sadeh 
3503602adf40SYehuda Sadeh MODULE_LICENSE("GPL");
3504