xref: /openbmc/linux/drivers/block/rbd.c (revision 99c1f08f)
1602adf40SYehuda Sadeh /*
2602adf40SYehuda Sadeh    rbd.c -- Export ceph rados objects as a Linux block device
3602adf40SYehuda Sadeh 
4602adf40SYehuda Sadeh 
5602adf40SYehuda Sadeh    based on drivers/block/osdblk.c:
6602adf40SYehuda Sadeh 
7602adf40SYehuda Sadeh    Copyright 2009 Red Hat, Inc.
8602adf40SYehuda Sadeh 
9602adf40SYehuda Sadeh    This program is free software; you can redistribute it and/or modify
10602adf40SYehuda Sadeh    it under the terms of the GNU General Public License as published by
11602adf40SYehuda Sadeh    the Free Software Foundation.
12602adf40SYehuda Sadeh 
13602adf40SYehuda Sadeh    This program is distributed in the hope that it will be useful,
14602adf40SYehuda Sadeh    but WITHOUT ANY WARRANTY; without even the implied warranty of
15602adf40SYehuda Sadeh    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16602adf40SYehuda Sadeh    GNU General Public License for more details.
17602adf40SYehuda Sadeh 
18602adf40SYehuda Sadeh    You should have received a copy of the GNU General Public License
19602adf40SYehuda Sadeh    along with this program; see the file COPYING.  If not, write to
20602adf40SYehuda Sadeh    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21602adf40SYehuda Sadeh 
22602adf40SYehuda Sadeh 
23602adf40SYehuda Sadeh 
24dfc5606dSYehuda Sadeh    For usage instructions, please refer to:
25602adf40SYehuda Sadeh 
26dfc5606dSYehuda Sadeh                  Documentation/ABI/testing/sysfs-bus-rbd
27602adf40SYehuda Sadeh 
28602adf40SYehuda Sadeh  */
29602adf40SYehuda Sadeh 
30602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
31602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
32602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3459c2be1eSYehuda Sadeh #include <linux/parser.h>
35602adf40SYehuda Sadeh 
36602adf40SYehuda Sadeh #include <linux/kernel.h>
37602adf40SYehuda Sadeh #include <linux/device.h>
38602adf40SYehuda Sadeh #include <linux/module.h>
39602adf40SYehuda Sadeh #include <linux/fs.h>
40602adf40SYehuda Sadeh #include <linux/blkdev.h>
41602adf40SYehuda Sadeh 
42602adf40SYehuda Sadeh #include "rbd_types.h"
43602adf40SYehuda Sadeh 
/* Defining RBD_DEBUG activates the rbd_assert() calls */
#define RBD_DEBUG

/*
 * Block I/O is expressed in sectors.  Several Linux layers (blk, bio,
 * genhd) interpret that unit, but the default is 512 bytes everywhere.
 * These names are only slightly more readable than the raw numbers.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/* Largest u64 value; this might be useful to have defined elsewhere too */
#define	U64_MAX	((u64) (~0ULL))

#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

#define RBD_MAX_SNAP_NAME_LEN	32
#define RBD_MAX_OPT_LEN		1024

/* Snapshot name that selects the image head instead of a snapshot */
#define RBD_SNAP_HEAD_NAME	"-"

/*
 * RBD devices are named "rbd#": "rbd" is RBD_DRV_NAME (above) and #
 * is a unique integer identifier.  MAX_INT_FORMAT_WIDTH is used in
 * ensuring DEV_NAME_LEN is large enough for every possible device
 * name.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)

#define RBD_READ_ONLY_DEFAULT		false
7959c2be1eSYehuda Sadeh 
80602adf40SYehuda Sadeh /*
81602adf40SYehuda Sadeh  * block device image metadata (in-memory version)
82602adf40SYehuda Sadeh  */
83602adf40SYehuda Sadeh struct rbd_image_header {
84f84344f3SAlex Elder 	/* These four fields never change for a given rbd image */
85849b4260SAlex Elder 	char *object_prefix;
86602adf40SYehuda Sadeh 	__u8 obj_order;
87602adf40SYehuda Sadeh 	__u8 crypt_type;
88602adf40SYehuda Sadeh 	__u8 comp_type;
89602adf40SYehuda Sadeh 
90f84344f3SAlex Elder 	/* The remaining fields need to be updated occasionally */
91f84344f3SAlex Elder 	u64 image_size;
92f84344f3SAlex Elder 	struct ceph_snap_context *snapc;
93602adf40SYehuda Sadeh 	char *snap_names;
94602adf40SYehuda Sadeh 	u64 *snap_sizes;
9559c2be1eSYehuda Sadeh 
9659c2be1eSYehuda Sadeh 	u64 obj_version;
9759c2be1eSYehuda Sadeh };
9859c2be1eSYehuda Sadeh 
/* rbd-specific options picked out of the option string at map time */
struct rbd_options {
	bool	read_only;	/* set by "ro"-style options, cleared by "rw" */
};
102602adf40SYehuda Sadeh 
103602adf40SYehuda Sadeh /*
104f0f8cef5SAlex Elder  * an instance of the client.  multiple devices may share an rbd client.
105602adf40SYehuda Sadeh  */
106602adf40SYehuda Sadeh struct rbd_client {
107602adf40SYehuda Sadeh 	struct ceph_client	*client;
108602adf40SYehuda Sadeh 	struct kref		kref;
109602adf40SYehuda Sadeh 	struct list_head	node;
110602adf40SYehuda Sadeh };
111602adf40SYehuda Sadeh 
112602adf40SYehuda Sadeh /*
113f0f8cef5SAlex Elder  * a request completion status
114602adf40SYehuda Sadeh  */
1151fec7093SYehuda Sadeh struct rbd_req_status {
1161fec7093SYehuda Sadeh 	int done;
1171fec7093SYehuda Sadeh 	int rc;
1181fec7093SYehuda Sadeh 	u64 bytes;
1191fec7093SYehuda Sadeh };
1201fec7093SYehuda Sadeh 
1211fec7093SYehuda Sadeh /*
1221fec7093SYehuda Sadeh  * a collection of requests
1231fec7093SYehuda Sadeh  */
1241fec7093SYehuda Sadeh struct rbd_req_coll {
1251fec7093SYehuda Sadeh 	int			total;
1261fec7093SYehuda Sadeh 	int			num_done;
1271fec7093SYehuda Sadeh 	struct kref		kref;
1281fec7093SYehuda Sadeh 	struct rbd_req_status	status[0];
129602adf40SYehuda Sadeh };
130602adf40SYehuda Sadeh 
131f0f8cef5SAlex Elder /*
132f0f8cef5SAlex Elder  * a single io request
133f0f8cef5SAlex Elder  */
134f0f8cef5SAlex Elder struct rbd_request {
135f0f8cef5SAlex Elder 	struct request		*rq;		/* blk layer request */
136f0f8cef5SAlex Elder 	struct bio		*bio;		/* cloned bio */
137f0f8cef5SAlex Elder 	struct page		**pages;	/* list of used pages */
138f0f8cef5SAlex Elder 	u64			len;
139f0f8cef5SAlex Elder 	int			coll_index;
140f0f8cef5SAlex Elder 	struct rbd_req_coll	*coll;
141f0f8cef5SAlex Elder };
142f0f8cef5SAlex Elder 
143dfc5606dSYehuda Sadeh struct rbd_snap {
144dfc5606dSYehuda Sadeh 	struct	device		dev;
145dfc5606dSYehuda Sadeh 	const char		*name;
1463591538fSJosh Durgin 	u64			size;
147dfc5606dSYehuda Sadeh 	struct list_head	node;
148dfc5606dSYehuda Sadeh 	u64			id;
149dfc5606dSYehuda Sadeh };
150dfc5606dSYehuda Sadeh 
151f84344f3SAlex Elder struct rbd_mapping {
152f84344f3SAlex Elder 	char                    *snap_name;
153f84344f3SAlex Elder 	u64                     snap_id;
15499c1f08fSAlex Elder 	u64                     size;
155f84344f3SAlex Elder 	bool                    snap_exists;
156f84344f3SAlex Elder 	bool			read_only;
157f84344f3SAlex Elder };
158f84344f3SAlex Elder 
159602adf40SYehuda Sadeh /*
160602adf40SYehuda Sadeh  * a single device
161602adf40SYehuda Sadeh  */
162602adf40SYehuda Sadeh struct rbd_device {
163de71a297SAlex Elder 	int			dev_id;		/* blkdev unique id */
164602adf40SYehuda Sadeh 
165602adf40SYehuda Sadeh 	int			major;		/* blkdev assigned major */
166602adf40SYehuda Sadeh 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
167602adf40SYehuda Sadeh 
168f8c38929SAlex Elder 	struct rbd_options	rbd_opts;
169602adf40SYehuda Sadeh 	struct rbd_client	*rbd_client;
170602adf40SYehuda Sadeh 
171602adf40SYehuda Sadeh 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
172602adf40SYehuda Sadeh 
173602adf40SYehuda Sadeh 	spinlock_t		lock;		/* queue lock */
174602adf40SYehuda Sadeh 
175602adf40SYehuda Sadeh 	struct rbd_image_header	header;
1760bed54dcSAlex Elder 	char			*image_name;
1770bed54dcSAlex Elder 	size_t			image_name_len;
1780bed54dcSAlex Elder 	char			*header_name;
179d22f76e7SAlex Elder 	char			*pool_name;
1809bb2f334SAlex Elder 	int			pool_id;
181602adf40SYehuda Sadeh 
18259c2be1eSYehuda Sadeh 	struct ceph_osd_event   *watch_event;
18359c2be1eSYehuda Sadeh 	struct ceph_osd_request *watch_request;
18459c2be1eSYehuda Sadeh 
185c666601aSJosh Durgin 	/* protects updating the header */
186c666601aSJosh Durgin 	struct rw_semaphore     header_rwsem;
187f84344f3SAlex Elder 
188f84344f3SAlex Elder 	struct rbd_mapping	mapping;
189602adf40SYehuda Sadeh 
190602adf40SYehuda Sadeh 	struct list_head	node;
191dfc5606dSYehuda Sadeh 
192dfc5606dSYehuda Sadeh 	/* list of snapshots */
193dfc5606dSYehuda Sadeh 	struct list_head	snaps;
194dfc5606dSYehuda Sadeh 
195dfc5606dSYehuda Sadeh 	/* sysfs related */
196dfc5606dSYehuda Sadeh 	struct device		dev;
197dfc5606dSYehuda Sadeh };
198dfc5606dSYehuda Sadeh 
/* Serialize open/close/setup/teardown */
static DEFINE_MUTEX(ctl_mutex);

/* All rbd devices (rbd_dev_list_lock presumably guards this list) */
static LIST_HEAD(rbd_dev_list);
static DEFINE_SPINLOCK(rbd_dev_list_lock);

/* All rbd clients; list changes and lookups take rbd_client_list_lock */
static LIST_HEAD(rbd_client_list);
static DEFINE_SPINLOCK(rbd_client_list_lock);
206602adf40SYehuda Sadeh 
2079fcbb800SAlex Elder static int rbd_dev_snap_devs_update(struct rbd_device *rbd_dev);
208dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev);
209dfc5606dSYehuda Sadeh static ssize_t rbd_snap_add(struct device *dev,
210dfc5606dSYehuda Sadeh 			    struct device_attribute *attr,
211dfc5606dSYehuda Sadeh 			    const char *buf,
212dfc5606dSYehuda Sadeh 			    size_t count);
21314e7085dSAlex Elder static void __rbd_remove_snap_dev(struct rbd_snap *snap);
214dfc5606dSYehuda Sadeh 
215f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf,
216f0f8cef5SAlex Elder 		       size_t count);
217f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
218f0f8cef5SAlex Elder 			  size_t count);
219f0f8cef5SAlex Elder 
220f0f8cef5SAlex Elder static struct bus_attribute rbd_bus_attrs[] = {
221f0f8cef5SAlex Elder 	__ATTR(add, S_IWUSR, NULL, rbd_add),
222f0f8cef5SAlex Elder 	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
223f0f8cef5SAlex Elder 	__ATTR_NULL
224f0f8cef5SAlex Elder };
225f0f8cef5SAlex Elder 
226f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = {
227f0f8cef5SAlex Elder 	.name		= "rbd",
228f0f8cef5SAlex Elder 	.bus_attrs	= rbd_bus_attrs,
229f0f8cef5SAlex Elder };
230f0f8cef5SAlex Elder 
/*
 * Release callback for rbd_root_dev.  That device is a static object,
 * so this intentionally does nothing.
 */
static void rbd_root_dev_release(struct device *dev)
{
}
234f0f8cef5SAlex Elder 
235f0f8cef5SAlex Elder static struct device rbd_root_dev = {
236f0f8cef5SAlex Elder 	.init_name =    "rbd",
237f0f8cef5SAlex Elder 	.release =      rbd_root_dev_release,
238f0f8cef5SAlex Elder };
239f0f8cef5SAlex Elder 
/*
 * rbd_assert(expr) - check an invariant when RBD_DEBUG is defined.
 *
 * If expr is false, log the enclosing function, line number and the
 * text of the expression, then BUG().  Without RBD_DEBUG the macro
 * expands to a no-op and expr is not evaluated.
 *
 * The debug form is wrapped in do { } while (0) so the macro acts as
 * exactly one statement: it needs a trailing semicolon and cannot
 * capture a following "else" when used in an unbraced if/else.
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
252dfc5606dSYehuda Sadeh 
253dfc5606dSYehuda Sadeh static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
254dfc5606dSYehuda Sadeh {
255dfc5606dSYehuda Sadeh 	return get_device(&rbd_dev->dev);
256dfc5606dSYehuda Sadeh }
257dfc5606dSYehuda Sadeh 
258dfc5606dSYehuda Sadeh static void rbd_put_dev(struct rbd_device *rbd_dev)
259dfc5606dSYehuda Sadeh {
260dfc5606dSYehuda Sadeh 	put_device(&rbd_dev->dev);
261dfc5606dSYehuda Sadeh }
262602adf40SYehuda Sadeh 
2631fe5e993SAlex Elder static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver);
26459c2be1eSYehuda Sadeh 
265602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
266602adf40SYehuda Sadeh {
267f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
268602adf40SYehuda Sadeh 
269f84344f3SAlex Elder 	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
270602adf40SYehuda Sadeh 		return -EROFS;
271602adf40SYehuda Sadeh 
272340c7a2bSAlex Elder 	rbd_get_dev(rbd_dev);
273f84344f3SAlex Elder 	set_device_ro(bdev, rbd_dev->mapping.read_only);
274340c7a2bSAlex Elder 
275602adf40SYehuda Sadeh 	return 0;
276602adf40SYehuda Sadeh }
277602adf40SYehuda Sadeh 
278dfc5606dSYehuda Sadeh static int rbd_release(struct gendisk *disk, fmode_t mode)
279dfc5606dSYehuda Sadeh {
280dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
281dfc5606dSYehuda Sadeh 
282dfc5606dSYehuda Sadeh 	rbd_put_dev(rbd_dev);
283dfc5606dSYehuda Sadeh 
284dfc5606dSYehuda Sadeh 	return 0;
285dfc5606dSYehuda Sadeh }
286dfc5606dSYehuda Sadeh 
287602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = {
288602adf40SYehuda Sadeh 	.owner			= THIS_MODULE,
289602adf40SYehuda Sadeh 	.open			= rbd_open,
290dfc5606dSYehuda Sadeh 	.release		= rbd_release,
291602adf40SYehuda Sadeh };
292602adf40SYehuda Sadeh 
293602adf40SYehuda Sadeh /*
294602adf40SYehuda Sadeh  * Initialize an rbd client instance.
29543ae4701SAlex Elder  * We own *ceph_opts.
296602adf40SYehuda Sadeh  */
297f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
298602adf40SYehuda Sadeh {
299602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
300602adf40SYehuda Sadeh 	int ret = -ENOMEM;
301602adf40SYehuda Sadeh 
302602adf40SYehuda Sadeh 	dout("rbd_client_create\n");
303602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
304602adf40SYehuda Sadeh 	if (!rbdc)
305602adf40SYehuda Sadeh 		goto out_opt;
306602adf40SYehuda Sadeh 
307602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
308602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
309602adf40SYehuda Sadeh 
310bc534d86SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
311bc534d86SAlex Elder 
31243ae4701SAlex Elder 	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
313602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
314bc534d86SAlex Elder 		goto out_mutex;
31543ae4701SAlex Elder 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
316602adf40SYehuda Sadeh 
317602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
318602adf40SYehuda Sadeh 	if (ret < 0)
319602adf40SYehuda Sadeh 		goto out_err;
320602adf40SYehuda Sadeh 
321432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
322602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
323432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
324602adf40SYehuda Sadeh 
325bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
326bc534d86SAlex Elder 
327602adf40SYehuda Sadeh 	dout("rbd_client_create created %p\n", rbdc);
328602adf40SYehuda Sadeh 	return rbdc;
329602adf40SYehuda Sadeh 
330602adf40SYehuda Sadeh out_err:
331602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
332bc534d86SAlex Elder out_mutex:
333bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
334602adf40SYehuda Sadeh 	kfree(rbdc);
335602adf40SYehuda Sadeh out_opt:
33643ae4701SAlex Elder 	if (ceph_opts)
33743ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
33828f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
339602adf40SYehuda Sadeh }
340602adf40SYehuda Sadeh 
341602adf40SYehuda Sadeh /*
3421f7ba331SAlex Elder  * Find a ceph client with specific addr and configuration.  If
3431f7ba331SAlex Elder  * found, bump its reference count.
344602adf40SYehuda Sadeh  */
3451f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
346602adf40SYehuda Sadeh {
347602adf40SYehuda Sadeh 	struct rbd_client *client_node;
3481f7ba331SAlex Elder 	bool found = false;
349602adf40SYehuda Sadeh 
35043ae4701SAlex Elder 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
351602adf40SYehuda Sadeh 		return NULL;
352602adf40SYehuda Sadeh 
3531f7ba331SAlex Elder 	spin_lock(&rbd_client_list_lock);
3541f7ba331SAlex Elder 	list_for_each_entry(client_node, &rbd_client_list, node) {
3551f7ba331SAlex Elder 		if (!ceph_compare_options(ceph_opts, client_node->client)) {
3561f7ba331SAlex Elder 			kref_get(&client_node->kref);
3571f7ba331SAlex Elder 			found = true;
3581f7ba331SAlex Elder 			break;
3591f7ba331SAlex Elder 		}
3601f7ba331SAlex Elder 	}
3611f7ba331SAlex Elder 	spin_unlock(&rbd_client_list_lock);
3621f7ba331SAlex Elder 
3631f7ba331SAlex Elder 	return found ? client_node : NULL;
364602adf40SYehuda Sadeh }
365602adf40SYehuda Sadeh 
/*
 * Mount option tokens.  Tokens are grouped by the kind of argument
 * they take; each Opt_last_* value is a boundary marker that
 * parse_rbd_opts_token() compares against to classify a token.
 */
enum {
	/* tokens taking an int argument go before this marker */
	Opt_last_int,
	/* tokens taking a string argument go before this marker */
	Opt_last_string,
	Opt_read_only,
	Opt_read_write,
	/* Boolean tokens go before this marker */
	Opt_last_bool,
};
37959c2be1eSYehuda Sadeh 
38043ae4701SAlex Elder static match_table_t rbd_opts_tokens = {
38159c2be1eSYehuda Sadeh 	/* int args above */
38259c2be1eSYehuda Sadeh 	/* string args above */
383f84344f3SAlex Elder 	{Opt_read_only, "mapping.read_only"},
384cc0538b6SAlex Elder 	{Opt_read_only, "ro"},		/* Alternate spelling */
385cc0538b6SAlex Elder 	{Opt_read_write, "read_write"},
386cc0538b6SAlex Elder 	{Opt_read_write, "rw"},		/* Alternate spelling */
387cc0538b6SAlex Elder 	/* Boolean args above */
38859c2be1eSYehuda Sadeh 	{-1, NULL}
38959c2be1eSYehuda Sadeh };
39059c2be1eSYehuda Sadeh 
39159c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
39259c2be1eSYehuda Sadeh {
39343ae4701SAlex Elder 	struct rbd_options *rbd_opts = private;
39459c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
39559c2be1eSYehuda Sadeh 	int token, intval, ret;
39659c2be1eSYehuda Sadeh 
39743ae4701SAlex Elder 	token = match_token(c, rbd_opts_tokens, argstr);
39859c2be1eSYehuda Sadeh 	if (token < 0)
39959c2be1eSYehuda Sadeh 		return -EINVAL;
40059c2be1eSYehuda Sadeh 
40159c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
40259c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
40359c2be1eSYehuda Sadeh 		if (ret < 0) {
40459c2be1eSYehuda Sadeh 			pr_err("bad mount option arg (not int) "
40559c2be1eSYehuda Sadeh 			       "at '%s'\n", c);
40659c2be1eSYehuda Sadeh 			return ret;
40759c2be1eSYehuda Sadeh 		}
40859c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
40959c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
41059c2be1eSYehuda Sadeh 		dout("got string token %d val %s\n", token,
41159c2be1eSYehuda Sadeh 		     argstr[0].from);
412cc0538b6SAlex Elder 	} else if (token > Opt_last_string && token < Opt_last_bool) {
413cc0538b6SAlex Elder 		dout("got Boolean token %d\n", token);
41459c2be1eSYehuda Sadeh 	} else {
41559c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
41659c2be1eSYehuda Sadeh 	}
41759c2be1eSYehuda Sadeh 
41859c2be1eSYehuda Sadeh 	switch (token) {
419cc0538b6SAlex Elder 	case Opt_read_only:
420cc0538b6SAlex Elder 		rbd_opts->read_only = true;
421cc0538b6SAlex Elder 		break;
422cc0538b6SAlex Elder 	case Opt_read_write:
423cc0538b6SAlex Elder 		rbd_opts->read_only = false;
424cc0538b6SAlex Elder 		break;
42559c2be1eSYehuda Sadeh 	default:
426aafb230eSAlex Elder 		rbd_assert(false);
427aafb230eSAlex Elder 		break;
42859c2be1eSYehuda Sadeh 	}
42959c2be1eSYehuda Sadeh 	return 0;
43059c2be1eSYehuda Sadeh }
43159c2be1eSYehuda Sadeh 
43259c2be1eSYehuda Sadeh /*
433602adf40SYehuda Sadeh  * Get a ceph client with specific addr and configuration, if one does
434602adf40SYehuda Sadeh  * not exist create it.
435602adf40SYehuda Sadeh  */
436f8c38929SAlex Elder static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
437f8c38929SAlex Elder 				size_t mon_addr_len, char *options)
438602adf40SYehuda Sadeh {
439f8c38929SAlex Elder 	struct rbd_options *rbd_opts = &rbd_dev->rbd_opts;
44043ae4701SAlex Elder 	struct ceph_options *ceph_opts;
441f8c38929SAlex Elder 	struct rbd_client *rbdc;
44259c2be1eSYehuda Sadeh 
443cc0538b6SAlex Elder 	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
444602adf40SYehuda Sadeh 
44543ae4701SAlex Elder 	ceph_opts = ceph_parse_options(options, mon_addr,
4465214ecc4SAlex Elder 					mon_addr + mon_addr_len,
44721079786SAlex Elder 					parse_rbd_opts_token, rbd_opts);
448f8c38929SAlex Elder 	if (IS_ERR(ceph_opts))
449f8c38929SAlex Elder 		return PTR_ERR(ceph_opts);
450602adf40SYehuda Sadeh 
4511f7ba331SAlex Elder 	rbdc = rbd_client_find(ceph_opts);
452602adf40SYehuda Sadeh 	if (rbdc) {
453e6994d3dSAlex Elder 		/* using an existing client */
45443ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
455f8c38929SAlex Elder 	} else {
456f8c38929SAlex Elder 		rbdc = rbd_client_create(ceph_opts);
457d720bcb0SAlex Elder 		if (IS_ERR(rbdc))
458f8c38929SAlex Elder 			return PTR_ERR(rbdc);
459f8c38929SAlex Elder 	}
460f8c38929SAlex Elder 	rbd_dev->rbd_client = rbdc;
461d720bcb0SAlex Elder 
462f8c38929SAlex Elder 	return 0;
463602adf40SYehuda Sadeh }
464602adf40SYehuda Sadeh 
465602adf40SYehuda Sadeh /*
466602adf40SYehuda Sadeh  * Destroy ceph client
467d23a4b3fSAlex Elder  *
468432b8587SAlex Elder  * Caller must hold rbd_client_list_lock.
469602adf40SYehuda Sadeh  */
470602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
471602adf40SYehuda Sadeh {
472602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
473602adf40SYehuda Sadeh 
474602adf40SYehuda Sadeh 	dout("rbd_release_client %p\n", rbdc);
475cd9d9f5dSAlex Elder 	spin_lock(&rbd_client_list_lock);
476602adf40SYehuda Sadeh 	list_del(&rbdc->node);
477cd9d9f5dSAlex Elder 	spin_unlock(&rbd_client_list_lock);
478602adf40SYehuda Sadeh 
479602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
480602adf40SYehuda Sadeh 	kfree(rbdc);
481602adf40SYehuda Sadeh }
482602adf40SYehuda Sadeh 
483602adf40SYehuda Sadeh /*
484602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
485602adf40SYehuda Sadeh  * it.
486602adf40SYehuda Sadeh  */
487602adf40SYehuda Sadeh static void rbd_put_client(struct rbd_device *rbd_dev)
488602adf40SYehuda Sadeh {
489602adf40SYehuda Sadeh 	kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
490602adf40SYehuda Sadeh 	rbd_dev->rbd_client = NULL;
491602adf40SYehuda Sadeh }
492602adf40SYehuda Sadeh 
4931fec7093SYehuda Sadeh /*
4941fec7093SYehuda Sadeh  * Destroy requests collection
4951fec7093SYehuda Sadeh  */
4961fec7093SYehuda Sadeh static void rbd_coll_release(struct kref *kref)
4971fec7093SYehuda Sadeh {
4981fec7093SYehuda Sadeh 	struct rbd_req_coll *coll =
4991fec7093SYehuda Sadeh 		container_of(kref, struct rbd_req_coll, kref);
5001fec7093SYehuda Sadeh 
5011fec7093SYehuda Sadeh 	dout("rbd_coll_release %p\n", coll);
5021fec7093SYehuda Sadeh 	kfree(coll);
5031fec7093SYehuda Sadeh }
504602adf40SYehuda Sadeh 
5058e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
5068e94af8eSAlex Elder {
507103a150fSAlex Elder 	size_t size;
508103a150fSAlex Elder 	u32 snap_count;
509103a150fSAlex Elder 
510103a150fSAlex Elder 	/* The header has to start with the magic rbd header text */
511103a150fSAlex Elder 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
512103a150fSAlex Elder 		return false;
513103a150fSAlex Elder 
514103a150fSAlex Elder 	/*
515103a150fSAlex Elder 	 * The size of a snapshot header has to fit in a size_t, and
516103a150fSAlex Elder 	 * that limits the number of snapshots.
517103a150fSAlex Elder 	 */
518103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
519103a150fSAlex Elder 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
520103a150fSAlex Elder 	if (snap_count > size / sizeof (__le64))
521103a150fSAlex Elder 		return false;
522103a150fSAlex Elder 
523103a150fSAlex Elder 	/*
524103a150fSAlex Elder 	 * Not only that, but the size of the entire the snapshot
525103a150fSAlex Elder 	 * header must also be representable in a size_t.
526103a150fSAlex Elder 	 */
527103a150fSAlex Elder 	size -= snap_count * sizeof (__le64);
528103a150fSAlex Elder 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
529103a150fSAlex Elder 		return false;
530103a150fSAlex Elder 
531103a150fSAlex Elder 	return true;
5328e94af8eSAlex Elder }
5338e94af8eSAlex Elder 
534602adf40SYehuda Sadeh /*
535602adf40SYehuda Sadeh  * Create a new header structure, translate header format from the on-disk
536602adf40SYehuda Sadeh  * header.
537602adf40SYehuda Sadeh  */
538602adf40SYehuda Sadeh static int rbd_header_from_disk(struct rbd_image_header *header,
5394156d998SAlex Elder 				 struct rbd_image_header_ondisk *ondisk)
540602adf40SYehuda Sadeh {
541ccece235SAlex Elder 	u32 snap_count;
54258c17b0eSAlex Elder 	size_t len;
543d2bb24e5SAlex Elder 	size_t size;
544621901d6SAlex Elder 	u32 i;
545602adf40SYehuda Sadeh 
5466a52325fSAlex Elder 	memset(header, 0, sizeof (*header));
5476a52325fSAlex Elder 
548103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
549103a150fSAlex Elder 
55058c17b0eSAlex Elder 	len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
55158c17b0eSAlex Elder 	header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
5526a52325fSAlex Elder 	if (!header->object_prefix)
553602adf40SYehuda Sadeh 		return -ENOMEM;
55458c17b0eSAlex Elder 	memcpy(header->object_prefix, ondisk->object_prefix, len);
55558c17b0eSAlex Elder 	header->object_prefix[len] = '\0';
55600f1f36fSAlex Elder 
557602adf40SYehuda Sadeh 	if (snap_count) {
558f785cc1dSAlex Elder 		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
559f785cc1dSAlex Elder 
560621901d6SAlex Elder 		/* Save a copy of the snapshot names */
561621901d6SAlex Elder 
562f785cc1dSAlex Elder 		if (snap_names_len > (u64) SIZE_MAX)
563f785cc1dSAlex Elder 			return -EIO;
564f785cc1dSAlex Elder 		header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
565602adf40SYehuda Sadeh 		if (!header->snap_names)
5666a52325fSAlex Elder 			goto out_err;
567f785cc1dSAlex Elder 		/*
568f785cc1dSAlex Elder 		 * Note that rbd_dev_v1_header_read() guarantees
569f785cc1dSAlex Elder 		 * the ondisk buffer we're working with has
570f785cc1dSAlex Elder 		 * snap_names_len bytes beyond the end of the
571f785cc1dSAlex Elder 		 * snapshot id array, this memcpy() is safe.
572f785cc1dSAlex Elder 		 */
573f785cc1dSAlex Elder 		memcpy(header->snap_names, &ondisk->snaps[snap_count],
574f785cc1dSAlex Elder 			snap_names_len);
5756a52325fSAlex Elder 
576621901d6SAlex Elder 		/* Record each snapshot's size */
577621901d6SAlex Elder 
578d2bb24e5SAlex Elder 		size = snap_count * sizeof (*header->snap_sizes);
579d2bb24e5SAlex Elder 		header->snap_sizes = kmalloc(size, GFP_KERNEL);
580602adf40SYehuda Sadeh 		if (!header->snap_sizes)
5816a52325fSAlex Elder 			goto out_err;
582621901d6SAlex Elder 		for (i = 0; i < snap_count; i++)
583621901d6SAlex Elder 			header->snap_sizes[i] =
584621901d6SAlex Elder 				le64_to_cpu(ondisk->snaps[i].image_size);
585602adf40SYehuda Sadeh 	} else {
586ccece235SAlex Elder 		WARN_ON(ondisk->snap_names_len);
587602adf40SYehuda Sadeh 		header->snap_names = NULL;
588602adf40SYehuda Sadeh 		header->snap_sizes = NULL;
589602adf40SYehuda Sadeh 	}
590849b4260SAlex Elder 
591602adf40SYehuda Sadeh 	header->obj_order = ondisk->options.order;
592602adf40SYehuda Sadeh 	header->crypt_type = ondisk->options.crypt_type;
593602adf40SYehuda Sadeh 	header->comp_type = ondisk->options.comp_type;
5946a52325fSAlex Elder 
595621901d6SAlex Elder 	/* Allocate and fill in the snapshot context */
596621901d6SAlex Elder 
597f84344f3SAlex Elder 	header->image_size = le64_to_cpu(ondisk->image_size);
5986a52325fSAlex Elder 	size = sizeof (struct ceph_snap_context);
5996a52325fSAlex Elder 	size += snap_count * sizeof (header->snapc->snaps[0]);
6006a52325fSAlex Elder 	header->snapc = kzalloc(size, GFP_KERNEL);
6016a52325fSAlex Elder 	if (!header->snapc)
6026a52325fSAlex Elder 		goto out_err;
603602adf40SYehuda Sadeh 
604602adf40SYehuda Sadeh 	atomic_set(&header->snapc->nref, 1);
605505cbb9bSAlex Elder 	header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
606602adf40SYehuda Sadeh 	header->snapc->num_snaps = snap_count;
607621901d6SAlex Elder 	for (i = 0; i < snap_count; i++)
608602adf40SYehuda Sadeh 		header->snapc->snaps[i] =
609602adf40SYehuda Sadeh 			le64_to_cpu(ondisk->snaps[i].id);
610602adf40SYehuda Sadeh 
611602adf40SYehuda Sadeh 	return 0;
612602adf40SYehuda Sadeh 
6136a52325fSAlex Elder out_err:
614849b4260SAlex Elder 	kfree(header->snap_sizes);
615ccece235SAlex Elder 	header->snap_sizes = NULL;
616602adf40SYehuda Sadeh 	kfree(header->snap_names);
617ccece235SAlex Elder 	header->snap_names = NULL;
6186a52325fSAlex Elder 	kfree(header->object_prefix);
6196a52325fSAlex Elder 	header->object_prefix = NULL;
620ccece235SAlex Elder 
62100f1f36fSAlex Elder 	return -ENOMEM;
622602adf40SYehuda Sadeh }
623602adf40SYehuda Sadeh 
624602adf40SYehuda Sadeh static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
625602adf40SYehuda Sadeh 			u64 *seq, u64 *size)
626602adf40SYehuda Sadeh {
627602adf40SYehuda Sadeh 	int i;
628602adf40SYehuda Sadeh 	char *p = header->snap_names;
629602adf40SYehuda Sadeh 
630c9aadfe7SAlex Elder 	rbd_assert(header->snapc != NULL);
631c9aadfe7SAlex Elder 	for (i = 0; i < header->snapc->num_snaps; i++) {
63200f1f36fSAlex Elder 		if (!strcmp(snap_name, p)) {
63300f1f36fSAlex Elder 
63400f1f36fSAlex Elder 			/* Found it.  Pass back its id and/or size */
63500f1f36fSAlex Elder 
636602adf40SYehuda Sadeh 			if (seq)
637602adf40SYehuda Sadeh 				*seq = header->snapc->snaps[i];
638602adf40SYehuda Sadeh 			if (size)
639602adf40SYehuda Sadeh 				*size = header->snap_sizes[i];
640602adf40SYehuda Sadeh 			return i;
641602adf40SYehuda Sadeh 		}
64200f1f36fSAlex Elder 		p += strlen(p) + 1;	/* Skip ahead to the next name */
64300f1f36fSAlex Elder 	}
64400f1f36fSAlex Elder 	return -ENOENT;
64500f1f36fSAlex Elder }
646602adf40SYehuda Sadeh 
64799c1f08fSAlex Elder static int rbd_header_set_snap(struct rbd_device *rbd_dev)
648602adf40SYehuda Sadeh {
64978dc447dSAlex Elder 	int ret;
650602adf40SYehuda Sadeh 
6510ce1a794SAlex Elder 	down_write(&rbd_dev->header_rwsem);
652602adf40SYehuda Sadeh 
653f84344f3SAlex Elder 	if (!memcmp(rbd_dev->mapping.snap_name, RBD_SNAP_HEAD_NAME,
654cc9d734cSJosh Durgin 		    sizeof (RBD_SNAP_HEAD_NAME))) {
655f84344f3SAlex Elder 		rbd_dev->mapping.snap_id = CEPH_NOSNAP;
65699c1f08fSAlex Elder 		rbd_dev->mapping.size = rbd_dev->header.image_size;
657f84344f3SAlex Elder 		rbd_dev->mapping.snap_exists = false;
658f84344f3SAlex Elder 		rbd_dev->mapping.read_only = rbd_dev->rbd_opts.read_only;
659602adf40SYehuda Sadeh 	} else {
660f84344f3SAlex Elder 		ret = snap_by_name(&rbd_dev->header,
661f84344f3SAlex Elder 					rbd_dev->mapping.snap_name,
66299c1f08fSAlex Elder 					&rbd_dev->mapping.snap_id,
66399c1f08fSAlex Elder 					&rbd_dev->mapping.size);
664602adf40SYehuda Sadeh 		if (ret < 0)
665602adf40SYehuda Sadeh 			goto done;
666f84344f3SAlex Elder 		rbd_dev->mapping.snap_exists = true;
667f84344f3SAlex Elder 		rbd_dev->mapping.read_only = true;
668602adf40SYehuda Sadeh 	}
669602adf40SYehuda Sadeh 
670602adf40SYehuda Sadeh 	ret = 0;
671602adf40SYehuda Sadeh done:
6720ce1a794SAlex Elder 	up_write(&rbd_dev->header_rwsem);
673602adf40SYehuda Sadeh 	return ret;
674602adf40SYehuda Sadeh }
675602adf40SYehuda Sadeh 
676602adf40SYehuda Sadeh static void rbd_header_free(struct rbd_image_header *header)
677602adf40SYehuda Sadeh {
678849b4260SAlex Elder 	kfree(header->object_prefix);
679d78fd7aeSAlex Elder 	header->object_prefix = NULL;
680602adf40SYehuda Sadeh 	kfree(header->snap_sizes);
681d78fd7aeSAlex Elder 	header->snap_sizes = NULL;
682849b4260SAlex Elder 	kfree(header->snap_names);
683d78fd7aeSAlex Elder 	header->snap_names = NULL;
684d1d25646SJosh Durgin 	ceph_put_snap_context(header->snapc);
685d78fd7aeSAlex Elder 	header->snapc = NULL;
686602adf40SYehuda Sadeh }
687602adf40SYehuda Sadeh 
68865ccfe21SAlex Elder static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
689602adf40SYehuda Sadeh {
69065ccfe21SAlex Elder 	char *name;
69165ccfe21SAlex Elder 	u64 segment;
69265ccfe21SAlex Elder 	int ret;
693602adf40SYehuda Sadeh 
69465ccfe21SAlex Elder 	name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
69565ccfe21SAlex Elder 	if (!name)
69665ccfe21SAlex Elder 		return NULL;
69765ccfe21SAlex Elder 	segment = offset >> rbd_dev->header.obj_order;
69865ccfe21SAlex Elder 	ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
69965ccfe21SAlex Elder 			rbd_dev->header.object_prefix, segment);
70065ccfe21SAlex Elder 	if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
70165ccfe21SAlex Elder 		pr_err("error formatting segment name for #%llu (%d)\n",
70265ccfe21SAlex Elder 			segment, ret);
70365ccfe21SAlex Elder 		kfree(name);
70465ccfe21SAlex Elder 		name = NULL;
70565ccfe21SAlex Elder 	}
706602adf40SYehuda Sadeh 
70765ccfe21SAlex Elder 	return name;
70865ccfe21SAlex Elder }
709602adf40SYehuda Sadeh 
71065ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
71165ccfe21SAlex Elder {
71265ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
713602adf40SYehuda Sadeh 
71465ccfe21SAlex Elder 	return offset & (segment_size - 1);
71565ccfe21SAlex Elder }
71665ccfe21SAlex Elder 
71765ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev,
71865ccfe21SAlex Elder 				u64 offset, u64 length)
71965ccfe21SAlex Elder {
72065ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
72165ccfe21SAlex Elder 
72265ccfe21SAlex Elder 	offset &= segment_size - 1;
72365ccfe21SAlex Elder 
724aafb230eSAlex Elder 	rbd_assert(length <= U64_MAX - offset);
72565ccfe21SAlex Elder 	if (offset + length > segment_size)
72665ccfe21SAlex Elder 		length = segment_size - offset;
72765ccfe21SAlex Elder 
72865ccfe21SAlex Elder 	return length;
729602adf40SYehuda Sadeh }
730602adf40SYehuda Sadeh 
7311fec7093SYehuda Sadeh static int rbd_get_num_segments(struct rbd_image_header *header,
7321fec7093SYehuda Sadeh 				u64 ofs, u64 len)
7331fec7093SYehuda Sadeh {
734df111be6SAlex Elder 	u64 start_seg;
735df111be6SAlex Elder 	u64 end_seg;
736df111be6SAlex Elder 
737df111be6SAlex Elder 	if (!len)
738df111be6SAlex Elder 		return 0;
739df111be6SAlex Elder 	if (len - 1 > U64_MAX - ofs)
740df111be6SAlex Elder 		return -ERANGE;
741df111be6SAlex Elder 
742df111be6SAlex Elder 	start_seg = ofs >> header->obj_order;
743df111be6SAlex Elder 	end_seg = (ofs + len - 1) >> header->obj_order;
744df111be6SAlex Elder 
7451fec7093SYehuda Sadeh 	return end_seg - start_seg + 1;
7461fec7093SYehuda Sadeh }
7471fec7093SYehuda Sadeh 
748602adf40SYehuda Sadeh /*
749029bcbd8SJosh Durgin  * returns the size of an object in the image
750029bcbd8SJosh Durgin  */
751029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header)
752029bcbd8SJosh Durgin {
753029bcbd8SJosh Durgin 	return 1 << header->obj_order;
754029bcbd8SJosh Durgin }
755029bcbd8SJosh Durgin 
756029bcbd8SJosh Durgin /*
757602adf40SYehuda Sadeh  * bio helpers
758602adf40SYehuda Sadeh  */
759602adf40SYehuda Sadeh 
760602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain)
761602adf40SYehuda Sadeh {
762602adf40SYehuda Sadeh 	struct bio *tmp;
763602adf40SYehuda Sadeh 
764602adf40SYehuda Sadeh 	while (chain) {
765602adf40SYehuda Sadeh 		tmp = chain;
766602adf40SYehuda Sadeh 		chain = chain->bi_next;
767602adf40SYehuda Sadeh 		bio_put(tmp);
768602adf40SYehuda Sadeh 	}
769602adf40SYehuda Sadeh }
770602adf40SYehuda Sadeh 
771602adf40SYehuda Sadeh /*
772602adf40SYehuda Sadeh  * zeros a bio chain, starting at specific offset
773602adf40SYehuda Sadeh  */
774602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs)
775602adf40SYehuda Sadeh {
776602adf40SYehuda Sadeh 	struct bio_vec *bv;
777602adf40SYehuda Sadeh 	unsigned long flags;
778602adf40SYehuda Sadeh 	void *buf;
779602adf40SYehuda Sadeh 	int i;
780602adf40SYehuda Sadeh 	int pos = 0;
781602adf40SYehuda Sadeh 
782602adf40SYehuda Sadeh 	while (chain) {
783602adf40SYehuda Sadeh 		bio_for_each_segment(bv, chain, i) {
784602adf40SYehuda Sadeh 			if (pos + bv->bv_len > start_ofs) {
785602adf40SYehuda Sadeh 				int remainder = max(start_ofs - pos, 0);
786602adf40SYehuda Sadeh 				buf = bvec_kmap_irq(bv, &flags);
787602adf40SYehuda Sadeh 				memset(buf + remainder, 0,
788602adf40SYehuda Sadeh 				       bv->bv_len - remainder);
78985b5aaa6SDan Carpenter 				bvec_kunmap_irq(buf, &flags);
790602adf40SYehuda Sadeh 			}
791602adf40SYehuda Sadeh 			pos += bv->bv_len;
792602adf40SYehuda Sadeh 		}
793602adf40SYehuda Sadeh 
794602adf40SYehuda Sadeh 		chain = chain->bi_next;
795602adf40SYehuda Sadeh 	}
796602adf40SYehuda Sadeh }
797602adf40SYehuda Sadeh 
798602adf40SYehuda Sadeh /*
799602adf40SYehuda Sadeh  * bio_chain_clone - clone a chain of bios up to a certain length.
800602adf40SYehuda Sadeh  * might return a bio_pair that will need to be released.
801602adf40SYehuda Sadeh  */
802602adf40SYehuda Sadeh static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
803602adf40SYehuda Sadeh 				   struct bio_pair **bp,
804602adf40SYehuda Sadeh 				   int len, gfp_t gfpmask)
805602adf40SYehuda Sadeh {
806542582fcSAlex Elder 	struct bio *old_chain = *old;
807542582fcSAlex Elder 	struct bio *new_chain = NULL;
808542582fcSAlex Elder 	struct bio *tail;
809602adf40SYehuda Sadeh 	int total = 0;
810602adf40SYehuda Sadeh 
811602adf40SYehuda Sadeh 	if (*bp) {
812602adf40SYehuda Sadeh 		bio_pair_release(*bp);
813602adf40SYehuda Sadeh 		*bp = NULL;
814602adf40SYehuda Sadeh 	}
815602adf40SYehuda Sadeh 
816602adf40SYehuda Sadeh 	while (old_chain && (total < len)) {
817542582fcSAlex Elder 		struct bio *tmp;
818542582fcSAlex Elder 
819602adf40SYehuda Sadeh 		tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
820602adf40SYehuda Sadeh 		if (!tmp)
821602adf40SYehuda Sadeh 			goto err_out;
822542582fcSAlex Elder 		gfpmask &= ~__GFP_WAIT;	/* can't wait after the first */
823602adf40SYehuda Sadeh 
824602adf40SYehuda Sadeh 		if (total + old_chain->bi_size > len) {
825602adf40SYehuda Sadeh 			struct bio_pair *bp;
826602adf40SYehuda Sadeh 
827602adf40SYehuda Sadeh 			/*
828602adf40SYehuda Sadeh 			 * this split can only happen with a single paged bio,
829602adf40SYehuda Sadeh 			 * split_bio will BUG_ON if this is not the case
830602adf40SYehuda Sadeh 			 */
831602adf40SYehuda Sadeh 			dout("bio_chain_clone split! total=%d remaining=%d"
832bd919d45SAlex Elder 			     "bi_size=%u\n",
833bd919d45SAlex Elder 			     total, len - total, old_chain->bi_size);
834602adf40SYehuda Sadeh 
835602adf40SYehuda Sadeh 			/* split the bio. We'll release it either in the next
836602adf40SYehuda Sadeh 			   call, or it will have to be released outside */
837593a9e7bSAlex Elder 			bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
838602adf40SYehuda Sadeh 			if (!bp)
839602adf40SYehuda Sadeh 				goto err_out;
840602adf40SYehuda Sadeh 
841602adf40SYehuda Sadeh 			__bio_clone(tmp, &bp->bio1);
842602adf40SYehuda Sadeh 
843602adf40SYehuda Sadeh 			*next = &bp->bio2;
844602adf40SYehuda Sadeh 		} else {
845602adf40SYehuda Sadeh 			__bio_clone(tmp, old_chain);
846602adf40SYehuda Sadeh 			*next = old_chain->bi_next;
847602adf40SYehuda Sadeh 		}
848602adf40SYehuda Sadeh 
849602adf40SYehuda Sadeh 		tmp->bi_bdev = NULL;
850602adf40SYehuda Sadeh 		tmp->bi_next = NULL;
851542582fcSAlex Elder 		if (new_chain)
852602adf40SYehuda Sadeh 			tail->bi_next = tmp;
853542582fcSAlex Elder 		else
854542582fcSAlex Elder 			new_chain = tmp;
855602adf40SYehuda Sadeh 		tail = tmp;
856602adf40SYehuda Sadeh 		old_chain = old_chain->bi_next;
857602adf40SYehuda Sadeh 
858602adf40SYehuda Sadeh 		total += tmp->bi_size;
859602adf40SYehuda Sadeh 	}
860602adf40SYehuda Sadeh 
861aafb230eSAlex Elder 	rbd_assert(total == len);
862602adf40SYehuda Sadeh 
863602adf40SYehuda Sadeh 	*old = old_chain;
864602adf40SYehuda Sadeh 
865602adf40SYehuda Sadeh 	return new_chain;
866602adf40SYehuda Sadeh 
867602adf40SYehuda Sadeh err_out:
868602adf40SYehuda Sadeh 	dout("bio_chain_clone with err\n");
869602adf40SYehuda Sadeh 	bio_chain_put(new_chain);
870602adf40SYehuda Sadeh 	return NULL;
871602adf40SYehuda Sadeh }
872602adf40SYehuda Sadeh 
873602adf40SYehuda Sadeh /*
874602adf40SYehuda Sadeh  * helpers for osd request op vectors.
875602adf40SYehuda Sadeh  */
87657cfc106SAlex Elder static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
87757cfc106SAlex Elder 					int opcode, u32 payload_len)
878602adf40SYehuda Sadeh {
87957cfc106SAlex Elder 	struct ceph_osd_req_op *ops;
88057cfc106SAlex Elder 
88157cfc106SAlex Elder 	ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
88257cfc106SAlex Elder 	if (!ops)
88357cfc106SAlex Elder 		return NULL;
88457cfc106SAlex Elder 
88557cfc106SAlex Elder 	ops[0].op = opcode;
88657cfc106SAlex Elder 
887602adf40SYehuda Sadeh 	/*
888602adf40SYehuda Sadeh 	 * op extent offset and length will be set later on
889602adf40SYehuda Sadeh 	 * in calc_raw_layout()
890602adf40SYehuda Sadeh 	 */
89157cfc106SAlex Elder 	ops[0].payload_len = payload_len;
89257cfc106SAlex Elder 
89357cfc106SAlex Elder 	return ops;
894602adf40SYehuda Sadeh }
895602adf40SYehuda Sadeh 
/* Free an op vector obtained from rbd_create_rw_ops(). */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
900602adf40SYehuda Sadeh 
9011fec7093SYehuda Sadeh static void rbd_coll_end_req_index(struct request *rq,
9021fec7093SYehuda Sadeh 				   struct rbd_req_coll *coll,
9031fec7093SYehuda Sadeh 				   int index,
9041fec7093SYehuda Sadeh 				   int ret, u64 len)
9051fec7093SYehuda Sadeh {
9061fec7093SYehuda Sadeh 	struct request_queue *q;
9071fec7093SYehuda Sadeh 	int min, max, i;
9081fec7093SYehuda Sadeh 
909bd919d45SAlex Elder 	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
910bd919d45SAlex Elder 	     coll, index, ret, (unsigned long long) len);
9111fec7093SYehuda Sadeh 
9121fec7093SYehuda Sadeh 	if (!rq)
9131fec7093SYehuda Sadeh 		return;
9141fec7093SYehuda Sadeh 
9151fec7093SYehuda Sadeh 	if (!coll) {
9161fec7093SYehuda Sadeh 		blk_end_request(rq, ret, len);
9171fec7093SYehuda Sadeh 		return;
9181fec7093SYehuda Sadeh 	}
9191fec7093SYehuda Sadeh 
9201fec7093SYehuda Sadeh 	q = rq->q;
9211fec7093SYehuda Sadeh 
9221fec7093SYehuda Sadeh 	spin_lock_irq(q->queue_lock);
9231fec7093SYehuda Sadeh 	coll->status[index].done = 1;
9241fec7093SYehuda Sadeh 	coll->status[index].rc = ret;
9251fec7093SYehuda Sadeh 	coll->status[index].bytes = len;
9261fec7093SYehuda Sadeh 	max = min = coll->num_done;
9271fec7093SYehuda Sadeh 	while (max < coll->total && coll->status[max].done)
9281fec7093SYehuda Sadeh 		max++;
9291fec7093SYehuda Sadeh 
9301fec7093SYehuda Sadeh 	for (i = min; i<max; i++) {
9311fec7093SYehuda Sadeh 		__blk_end_request(rq, coll->status[i].rc,
9321fec7093SYehuda Sadeh 				  coll->status[i].bytes);
9331fec7093SYehuda Sadeh 		coll->num_done++;
9341fec7093SYehuda Sadeh 		kref_put(&coll->kref, rbd_coll_release);
9351fec7093SYehuda Sadeh 	}
9361fec7093SYehuda Sadeh 	spin_unlock_irq(q->queue_lock);
9371fec7093SYehuda Sadeh }
9381fec7093SYehuda Sadeh 
9391fec7093SYehuda Sadeh static void rbd_coll_end_req(struct rbd_request *req,
9401fec7093SYehuda Sadeh 			     int ret, u64 len)
9411fec7093SYehuda Sadeh {
9421fec7093SYehuda Sadeh 	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
9431fec7093SYehuda Sadeh }
9441fec7093SYehuda Sadeh 
945602adf40SYehuda Sadeh /*
946602adf40SYehuda Sadeh  * Send ceph osd request
947602adf40SYehuda Sadeh  */
948602adf40SYehuda Sadeh static int rbd_do_request(struct request *rq,
9490ce1a794SAlex Elder 			  struct rbd_device *rbd_dev,
950602adf40SYehuda Sadeh 			  struct ceph_snap_context *snapc,
951602adf40SYehuda Sadeh 			  u64 snapid,
952aded07eaSAlex Elder 			  const char *object_name, u64 ofs, u64 len,
953602adf40SYehuda Sadeh 			  struct bio *bio,
954602adf40SYehuda Sadeh 			  struct page **pages,
955602adf40SYehuda Sadeh 			  int num_pages,
956602adf40SYehuda Sadeh 			  int flags,
957602adf40SYehuda Sadeh 			  struct ceph_osd_req_op *ops,
9581fec7093SYehuda Sadeh 			  struct rbd_req_coll *coll,
9591fec7093SYehuda Sadeh 			  int coll_index,
960602adf40SYehuda Sadeh 			  void (*rbd_cb)(struct ceph_osd_request *req,
96159c2be1eSYehuda Sadeh 					 struct ceph_msg *msg),
96259c2be1eSYehuda Sadeh 			  struct ceph_osd_request **linger_req,
96359c2be1eSYehuda Sadeh 			  u64 *ver)
964602adf40SYehuda Sadeh {
965602adf40SYehuda Sadeh 	struct ceph_osd_request *req;
966602adf40SYehuda Sadeh 	struct ceph_file_layout *layout;
967602adf40SYehuda Sadeh 	int ret;
968602adf40SYehuda Sadeh 	u64 bno;
969602adf40SYehuda Sadeh 	struct timespec mtime = CURRENT_TIME;
970602adf40SYehuda Sadeh 	struct rbd_request *req_data;
971602adf40SYehuda Sadeh 	struct ceph_osd_request_head *reqhead;
9721dbb4399SAlex Elder 	struct ceph_osd_client *osdc;
973602adf40SYehuda Sadeh 
974602adf40SYehuda Sadeh 	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
9751fec7093SYehuda Sadeh 	if (!req_data) {
9761fec7093SYehuda Sadeh 		if (coll)
9771fec7093SYehuda Sadeh 			rbd_coll_end_req_index(rq, coll, coll_index,
9781fec7093SYehuda Sadeh 					       -ENOMEM, len);
9791fec7093SYehuda Sadeh 		return -ENOMEM;
9801fec7093SYehuda Sadeh 	}
981602adf40SYehuda Sadeh 
9821fec7093SYehuda Sadeh 	if (coll) {
9831fec7093SYehuda Sadeh 		req_data->coll = coll;
9841fec7093SYehuda Sadeh 		req_data->coll_index = coll_index;
9851fec7093SYehuda Sadeh 	}
9861fec7093SYehuda Sadeh 
987bd919d45SAlex Elder 	dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name,
988bd919d45SAlex Elder 		(unsigned long long) ofs, (unsigned long long) len);
989602adf40SYehuda Sadeh 
9900ce1a794SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
9911dbb4399SAlex Elder 	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
9921dbb4399SAlex Elder 					false, GFP_NOIO, pages, bio);
9934ad12621SSage Weil 	if (!req) {
9944ad12621SSage Weil 		ret = -ENOMEM;
995602adf40SYehuda Sadeh 		goto done_pages;
996602adf40SYehuda Sadeh 	}
997602adf40SYehuda Sadeh 
998602adf40SYehuda Sadeh 	req->r_callback = rbd_cb;
999602adf40SYehuda Sadeh 
1000602adf40SYehuda Sadeh 	req_data->rq = rq;
1001602adf40SYehuda Sadeh 	req_data->bio = bio;
1002602adf40SYehuda Sadeh 	req_data->pages = pages;
1003602adf40SYehuda Sadeh 	req_data->len = len;
1004602adf40SYehuda Sadeh 
1005602adf40SYehuda Sadeh 	req->r_priv = req_data;
1006602adf40SYehuda Sadeh 
1007602adf40SYehuda Sadeh 	reqhead = req->r_request->front.iov_base;
1008602adf40SYehuda Sadeh 	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
1009602adf40SYehuda Sadeh 
1010aded07eaSAlex Elder 	strncpy(req->r_oid, object_name, sizeof(req->r_oid));
1011602adf40SYehuda Sadeh 	req->r_oid_len = strlen(req->r_oid);
1012602adf40SYehuda Sadeh 
1013602adf40SYehuda Sadeh 	layout = &req->r_file_layout;
1014602adf40SYehuda Sadeh 	memset(layout, 0, sizeof(*layout));
1015602adf40SYehuda Sadeh 	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
1016602adf40SYehuda Sadeh 	layout->fl_stripe_count = cpu_to_le32(1);
1017602adf40SYehuda Sadeh 	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
10180ce1a794SAlex Elder 	layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
10191dbb4399SAlex Elder 	ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
10201dbb4399SAlex Elder 				req, ops);
1021602adf40SYehuda Sadeh 
1022602adf40SYehuda Sadeh 	ceph_osdc_build_request(req, ofs, &len,
1023602adf40SYehuda Sadeh 				ops,
1024602adf40SYehuda Sadeh 				snapc,
1025602adf40SYehuda Sadeh 				&mtime,
1026602adf40SYehuda Sadeh 				req->r_oid, req->r_oid_len);
1027602adf40SYehuda Sadeh 
102859c2be1eSYehuda Sadeh 	if (linger_req) {
10291dbb4399SAlex Elder 		ceph_osdc_set_request_linger(osdc, req);
103059c2be1eSYehuda Sadeh 		*linger_req = req;
103159c2be1eSYehuda Sadeh 	}
103259c2be1eSYehuda Sadeh 
10331dbb4399SAlex Elder 	ret = ceph_osdc_start_request(osdc, req, false);
1034602adf40SYehuda Sadeh 	if (ret < 0)
1035602adf40SYehuda Sadeh 		goto done_err;
1036602adf40SYehuda Sadeh 
1037602adf40SYehuda Sadeh 	if (!rbd_cb) {
10381dbb4399SAlex Elder 		ret = ceph_osdc_wait_request(osdc, req);
103959c2be1eSYehuda Sadeh 		if (ver)
104059c2be1eSYehuda Sadeh 			*ver = le64_to_cpu(req->r_reassert_version.version);
1041bd919d45SAlex Elder 		dout("reassert_ver=%llu\n",
1042bd919d45SAlex Elder 			(unsigned long long)
10431fec7093SYehuda Sadeh 				le64_to_cpu(req->r_reassert_version.version));
1044602adf40SYehuda Sadeh 		ceph_osdc_put_request(req);
1045602adf40SYehuda Sadeh 	}
1046602adf40SYehuda Sadeh 	return ret;
1047602adf40SYehuda Sadeh 
1048602adf40SYehuda Sadeh done_err:
1049602adf40SYehuda Sadeh 	bio_chain_put(req_data->bio);
1050602adf40SYehuda Sadeh 	ceph_osdc_put_request(req);
1051602adf40SYehuda Sadeh done_pages:
10521fec7093SYehuda Sadeh 	rbd_coll_end_req(req_data, ret, len);
1053602adf40SYehuda Sadeh 	kfree(req_data);
1054602adf40SYehuda Sadeh 	return ret;
1055602adf40SYehuda Sadeh }
1056602adf40SYehuda Sadeh 
1057602adf40SYehuda Sadeh /*
1058602adf40SYehuda Sadeh  * Ceph osd op callback
1059602adf40SYehuda Sadeh  */
1060602adf40SYehuda Sadeh static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
1061602adf40SYehuda Sadeh {
1062602adf40SYehuda Sadeh 	struct rbd_request *req_data = req->r_priv;
1063602adf40SYehuda Sadeh 	struct ceph_osd_reply_head *replyhead;
1064602adf40SYehuda Sadeh 	struct ceph_osd_op *op;
1065602adf40SYehuda Sadeh 	__s32 rc;
1066602adf40SYehuda Sadeh 	u64 bytes;
1067602adf40SYehuda Sadeh 	int read_op;
1068602adf40SYehuda Sadeh 
1069602adf40SYehuda Sadeh 	/* parse reply */
1070602adf40SYehuda Sadeh 	replyhead = msg->front.iov_base;
1071602adf40SYehuda Sadeh 	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
1072602adf40SYehuda Sadeh 	op = (void *)(replyhead + 1);
1073602adf40SYehuda Sadeh 	rc = le32_to_cpu(replyhead->result);
1074602adf40SYehuda Sadeh 	bytes = le64_to_cpu(op->extent.length);
1075895cfcc8SDan Carpenter 	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
1076602adf40SYehuda Sadeh 
1077bd919d45SAlex Elder 	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
1078bd919d45SAlex Elder 		(unsigned long long) bytes, read_op, (int) rc);
1079602adf40SYehuda Sadeh 
1080602adf40SYehuda Sadeh 	if (rc == -ENOENT && read_op) {
1081602adf40SYehuda Sadeh 		zero_bio_chain(req_data->bio, 0);
1082602adf40SYehuda Sadeh 		rc = 0;
1083602adf40SYehuda Sadeh 	} else if (rc == 0 && read_op && bytes < req_data->len) {
1084602adf40SYehuda Sadeh 		zero_bio_chain(req_data->bio, bytes);
1085602adf40SYehuda Sadeh 		bytes = req_data->len;
1086602adf40SYehuda Sadeh 	}
1087602adf40SYehuda Sadeh 
10881fec7093SYehuda Sadeh 	rbd_coll_end_req(req_data, rc, bytes);
1089602adf40SYehuda Sadeh 
1090602adf40SYehuda Sadeh 	if (req_data->bio)
1091602adf40SYehuda Sadeh 		bio_chain_put(req_data->bio);
1092602adf40SYehuda Sadeh 
1093602adf40SYehuda Sadeh 	ceph_osdc_put_request(req);
1094602adf40SYehuda Sadeh 	kfree(req_data);
1095602adf40SYehuda Sadeh }
1096602adf40SYehuda Sadeh 
/*
 * Minimal osd request callback: the reply is ignored and the request
 * is simply dropped.
 */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
110159c2be1eSYehuda Sadeh 
1102602adf40SYehuda Sadeh /*
1103602adf40SYehuda Sadeh  * Do a synchronous ceph osd operation
1104602adf40SYehuda Sadeh  */
11050ce1a794SAlex Elder static int rbd_req_sync_op(struct rbd_device *rbd_dev,
1106602adf40SYehuda Sadeh 			   struct ceph_snap_context *snapc,
1107602adf40SYehuda Sadeh 			   u64 snapid,
1108602adf40SYehuda Sadeh 			   int flags,
1109913d2fdcSAlex Elder 			   struct ceph_osd_req_op *ops,
1110aded07eaSAlex Elder 			   const char *object_name,
1111602adf40SYehuda Sadeh 			   u64 ofs, u64 len,
111259c2be1eSYehuda Sadeh 			   char *buf,
111359c2be1eSYehuda Sadeh 			   struct ceph_osd_request **linger_req,
111459c2be1eSYehuda Sadeh 			   u64 *ver)
1115602adf40SYehuda Sadeh {
1116602adf40SYehuda Sadeh 	int ret;
1117602adf40SYehuda Sadeh 	struct page **pages;
1118602adf40SYehuda Sadeh 	int num_pages;
1119913d2fdcSAlex Elder 
1120aafb230eSAlex Elder 	rbd_assert(ops != NULL);
1121602adf40SYehuda Sadeh 
1122602adf40SYehuda Sadeh 	num_pages = calc_pages_for(ofs , len);
1123602adf40SYehuda Sadeh 	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
1124b8d0638aSDan Carpenter 	if (IS_ERR(pages))
1125b8d0638aSDan Carpenter 		return PTR_ERR(pages);
1126602adf40SYehuda Sadeh 
11270ce1a794SAlex Elder 	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
1128aded07eaSAlex Elder 			  object_name, ofs, len, NULL,
1129602adf40SYehuda Sadeh 			  pages, num_pages,
1130602adf40SYehuda Sadeh 			  flags,
1131602adf40SYehuda Sadeh 			  ops,
11321fec7093SYehuda Sadeh 			  NULL, 0,
113359c2be1eSYehuda Sadeh 			  NULL,
113459c2be1eSYehuda Sadeh 			  linger_req, ver);
1135602adf40SYehuda Sadeh 	if (ret < 0)
1136913d2fdcSAlex Elder 		goto done;
1137602adf40SYehuda Sadeh 
1138602adf40SYehuda Sadeh 	if ((flags & CEPH_OSD_FLAG_READ) && buf)
1139602adf40SYehuda Sadeh 		ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);
1140602adf40SYehuda Sadeh 
1141602adf40SYehuda Sadeh done:
1142602adf40SYehuda Sadeh 	ceph_release_page_vector(pages, num_pages);
1143602adf40SYehuda Sadeh 	return ret;
1144602adf40SYehuda Sadeh }
1145602adf40SYehuda Sadeh 
1146602adf40SYehuda Sadeh /*
1147602adf40SYehuda Sadeh  * Do an asynchronous ceph osd operation
1148602adf40SYehuda Sadeh  */
1149602adf40SYehuda Sadeh static int rbd_do_op(struct request *rq,
1150602adf40SYehuda Sadeh 		     struct rbd_device *rbd_dev,
1151602adf40SYehuda Sadeh 		     struct ceph_snap_context *snapc,
1152602adf40SYehuda Sadeh 		     u64 snapid,
1153d1f57ea6SAlex Elder 		     int opcode, int flags,
1154602adf40SYehuda Sadeh 		     u64 ofs, u64 len,
11551fec7093SYehuda Sadeh 		     struct bio *bio,
11561fec7093SYehuda Sadeh 		     struct rbd_req_coll *coll,
11571fec7093SYehuda Sadeh 		     int coll_index)
1158602adf40SYehuda Sadeh {
1159602adf40SYehuda Sadeh 	char *seg_name;
1160602adf40SYehuda Sadeh 	u64 seg_ofs;
1161602adf40SYehuda Sadeh 	u64 seg_len;
1162602adf40SYehuda Sadeh 	int ret;
1163602adf40SYehuda Sadeh 	struct ceph_osd_req_op *ops;
1164602adf40SYehuda Sadeh 	u32 payload_len;
1165602adf40SYehuda Sadeh 
116665ccfe21SAlex Elder 	seg_name = rbd_segment_name(rbd_dev, ofs);
1167602adf40SYehuda Sadeh 	if (!seg_name)
1168602adf40SYehuda Sadeh 		return -ENOMEM;
116965ccfe21SAlex Elder 	seg_len = rbd_segment_length(rbd_dev, ofs, len);
117065ccfe21SAlex Elder 	seg_ofs = rbd_segment_offset(rbd_dev, ofs);
1171602adf40SYehuda Sadeh 
1172602adf40SYehuda Sadeh 	payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
1173602adf40SYehuda Sadeh 
117457cfc106SAlex Elder 	ret = -ENOMEM;
117557cfc106SAlex Elder 	ops = rbd_create_rw_ops(1, opcode, payload_len);
117657cfc106SAlex Elder 	if (!ops)
1177602adf40SYehuda Sadeh 		goto done;
1178602adf40SYehuda Sadeh 
1179602adf40SYehuda Sadeh 	/* we've taken care of segment sizes earlier when we
1180602adf40SYehuda Sadeh 	   cloned the bios. We should never have a segment
1181602adf40SYehuda Sadeh 	   truncated at this point */
1182aafb230eSAlex Elder 	rbd_assert(seg_len == len);
1183602adf40SYehuda Sadeh 
1184602adf40SYehuda Sadeh 	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1185602adf40SYehuda Sadeh 			     seg_name, seg_ofs, seg_len,
1186602adf40SYehuda Sadeh 			     bio,
1187602adf40SYehuda Sadeh 			     NULL, 0,
1188602adf40SYehuda Sadeh 			     flags,
1189602adf40SYehuda Sadeh 			     ops,
11901fec7093SYehuda Sadeh 			     coll, coll_index,
119159c2be1eSYehuda Sadeh 			     rbd_req_cb, 0, NULL);
119211f77002SSage Weil 
119311f77002SSage Weil 	rbd_destroy_ops(ops);
1194602adf40SYehuda Sadeh done:
1195602adf40SYehuda Sadeh 	kfree(seg_name);
1196602adf40SYehuda Sadeh 	return ret;
1197602adf40SYehuda Sadeh }
1198602adf40SYehuda Sadeh 
1199602adf40SYehuda Sadeh /*
1200602adf40SYehuda Sadeh  * Request async osd write
1201602adf40SYehuda Sadeh  */
1202602adf40SYehuda Sadeh static int rbd_req_write(struct request *rq,
1203602adf40SYehuda Sadeh 			 struct rbd_device *rbd_dev,
1204602adf40SYehuda Sadeh 			 struct ceph_snap_context *snapc,
1205602adf40SYehuda Sadeh 			 u64 ofs, u64 len,
12061fec7093SYehuda Sadeh 			 struct bio *bio,
12071fec7093SYehuda Sadeh 			 struct rbd_req_coll *coll,
12081fec7093SYehuda Sadeh 			 int coll_index)
1209602adf40SYehuda Sadeh {
1210602adf40SYehuda Sadeh 	return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1211602adf40SYehuda Sadeh 			 CEPH_OSD_OP_WRITE,
1212602adf40SYehuda Sadeh 			 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
12131fec7093SYehuda Sadeh 			 ofs, len, bio, coll, coll_index);
1214602adf40SYehuda Sadeh }
1215602adf40SYehuda Sadeh 
1216602adf40SYehuda Sadeh /*
1217602adf40SYehuda Sadeh  * Request async osd read
1218602adf40SYehuda Sadeh  */
1219602adf40SYehuda Sadeh static int rbd_req_read(struct request *rq,
1220602adf40SYehuda Sadeh 			 struct rbd_device *rbd_dev,
1221602adf40SYehuda Sadeh 			 u64 snapid,
1222602adf40SYehuda Sadeh 			 u64 ofs, u64 len,
12231fec7093SYehuda Sadeh 			 struct bio *bio,
12241fec7093SYehuda Sadeh 			 struct rbd_req_coll *coll,
12251fec7093SYehuda Sadeh 			 int coll_index)
1226602adf40SYehuda Sadeh {
1227602adf40SYehuda Sadeh 	return rbd_do_op(rq, rbd_dev, NULL,
1228b06e6a6bSJosh Durgin 			 snapid,
1229602adf40SYehuda Sadeh 			 CEPH_OSD_OP_READ,
1230602adf40SYehuda Sadeh 			 CEPH_OSD_FLAG_READ,
12311fec7093SYehuda Sadeh 			 ofs, len, bio, coll, coll_index);
1232602adf40SYehuda Sadeh }
1233602adf40SYehuda Sadeh 
1234602adf40SYehuda Sadeh /*
1235602adf40SYehuda Sadeh  * Request sync osd read
1236602adf40SYehuda Sadeh  */
12370ce1a794SAlex Elder static int rbd_req_sync_read(struct rbd_device *rbd_dev,
1238602adf40SYehuda Sadeh 			  u64 snapid,
1239aded07eaSAlex Elder 			  const char *object_name,
1240602adf40SYehuda Sadeh 			  u64 ofs, u64 len,
124159c2be1eSYehuda Sadeh 			  char *buf,
124259c2be1eSYehuda Sadeh 			  u64 *ver)
1243602adf40SYehuda Sadeh {
1244913d2fdcSAlex Elder 	struct ceph_osd_req_op *ops;
1245913d2fdcSAlex Elder 	int ret;
1246913d2fdcSAlex Elder 
1247913d2fdcSAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1248913d2fdcSAlex Elder 	if (!ops)
1249913d2fdcSAlex Elder 		return -ENOMEM;
1250913d2fdcSAlex Elder 
1251913d2fdcSAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
1252b06e6a6bSJosh Durgin 			       snapid,
1253602adf40SYehuda Sadeh 			       CEPH_OSD_FLAG_READ,
1254913d2fdcSAlex Elder 			       ops, object_name, ofs, len, buf, NULL, ver);
1255913d2fdcSAlex Elder 	rbd_destroy_ops(ops);
1256913d2fdcSAlex Elder 
1257913d2fdcSAlex Elder 	return ret;
1258602adf40SYehuda Sadeh }
1259602adf40SYehuda Sadeh 
1260602adf40SYehuda Sadeh /*
126159c2be1eSYehuda Sadeh  * Request sync osd watch
126259c2be1eSYehuda Sadeh  */
12630ce1a794SAlex Elder static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
126459c2be1eSYehuda Sadeh 				   u64 ver,
12657f0a24d8SAlex Elder 				   u64 notify_id)
126659c2be1eSYehuda Sadeh {
126759c2be1eSYehuda Sadeh 	struct ceph_osd_req_op *ops;
126811f77002SSage Weil 	int ret;
126911f77002SSage Weil 
127057cfc106SAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
127157cfc106SAlex Elder 	if (!ops)
127257cfc106SAlex Elder 		return -ENOMEM;
127359c2be1eSYehuda Sadeh 
1274a71b891bSJosh Durgin 	ops[0].watch.ver = cpu_to_le64(ver);
127559c2be1eSYehuda Sadeh 	ops[0].watch.cookie = notify_id;
127659c2be1eSYehuda Sadeh 	ops[0].watch.flag = 0;
127759c2be1eSYehuda Sadeh 
12780ce1a794SAlex Elder 	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
12797f0a24d8SAlex Elder 			  rbd_dev->header_name, 0, 0, NULL,
1280ad4f232fSAlex Elder 			  NULL, 0,
128159c2be1eSYehuda Sadeh 			  CEPH_OSD_FLAG_READ,
128259c2be1eSYehuda Sadeh 			  ops,
12831fec7093SYehuda Sadeh 			  NULL, 0,
128459c2be1eSYehuda Sadeh 			  rbd_simple_req_cb, 0, NULL);
128559c2be1eSYehuda Sadeh 
128659c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
128759c2be1eSYehuda Sadeh 	return ret;
128859c2be1eSYehuda Sadeh }
128959c2be1eSYehuda Sadeh 
129059c2be1eSYehuda Sadeh static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
129159c2be1eSYehuda Sadeh {
12920ce1a794SAlex Elder 	struct rbd_device *rbd_dev = (struct rbd_device *)data;
1293a71b891bSJosh Durgin 	u64 hver;
129413143d2dSSage Weil 	int rc;
129513143d2dSSage Weil 
12960ce1a794SAlex Elder 	if (!rbd_dev)
129759c2be1eSYehuda Sadeh 		return;
129859c2be1eSYehuda Sadeh 
1299bd919d45SAlex Elder 	dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1300bd919d45SAlex Elder 		rbd_dev->header_name, (unsigned long long) notify_id,
1301bd919d45SAlex Elder 		(unsigned int) opcode);
13021fe5e993SAlex Elder 	rc = rbd_refresh_header(rbd_dev, &hver);
130313143d2dSSage Weil 	if (rc)
1304f0f8cef5SAlex Elder 		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
13050ce1a794SAlex Elder 			   " update snaps: %d\n", rbd_dev->major, rc);
130659c2be1eSYehuda Sadeh 
13077f0a24d8SAlex Elder 	rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
130859c2be1eSYehuda Sadeh }
130959c2be1eSYehuda Sadeh 
131059c2be1eSYehuda Sadeh /*
131159c2be1eSYehuda Sadeh  * Request sync osd watch
131259c2be1eSYehuda Sadeh  */
13130e6f322dSAlex Elder static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
131459c2be1eSYehuda Sadeh {
131559c2be1eSYehuda Sadeh 	struct ceph_osd_req_op *ops;
13160ce1a794SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
131757cfc106SAlex Elder 	int ret;
131859c2be1eSYehuda Sadeh 
131957cfc106SAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
132057cfc106SAlex Elder 	if (!ops)
132157cfc106SAlex Elder 		return -ENOMEM;
132259c2be1eSYehuda Sadeh 
132359c2be1eSYehuda Sadeh 	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
13240ce1a794SAlex Elder 				     (void *)rbd_dev, &rbd_dev->watch_event);
132559c2be1eSYehuda Sadeh 	if (ret < 0)
132659c2be1eSYehuda Sadeh 		goto fail;
132759c2be1eSYehuda Sadeh 
13280e6f322dSAlex Elder 	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
13290ce1a794SAlex Elder 	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
133059c2be1eSYehuda Sadeh 	ops[0].watch.flag = 1;
133159c2be1eSYehuda Sadeh 
13320ce1a794SAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
133359c2be1eSYehuda Sadeh 			      CEPH_NOSNAP,
133459c2be1eSYehuda Sadeh 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
133559c2be1eSYehuda Sadeh 			      ops,
13360e6f322dSAlex Elder 			      rbd_dev->header_name,
13370e6f322dSAlex Elder 			      0, 0, NULL,
13380ce1a794SAlex Elder 			      &rbd_dev->watch_request, NULL);
133959c2be1eSYehuda Sadeh 
134059c2be1eSYehuda Sadeh 	if (ret < 0)
134159c2be1eSYehuda Sadeh 		goto fail_event;
134259c2be1eSYehuda Sadeh 
134359c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
134459c2be1eSYehuda Sadeh 	return 0;
134559c2be1eSYehuda Sadeh 
134659c2be1eSYehuda Sadeh fail_event:
13470ce1a794SAlex Elder 	ceph_osdc_cancel_event(rbd_dev->watch_event);
13480ce1a794SAlex Elder 	rbd_dev->watch_event = NULL;
134959c2be1eSYehuda Sadeh fail:
135059c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
135159c2be1eSYehuda Sadeh 	return ret;
135259c2be1eSYehuda Sadeh }
135359c2be1eSYehuda Sadeh 
135479e3057cSYehuda Sadeh /*
135579e3057cSYehuda Sadeh  * Request sync osd unwatch
135679e3057cSYehuda Sadeh  */
1357070c633fSAlex Elder static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
135879e3057cSYehuda Sadeh {
135979e3057cSYehuda Sadeh 	struct ceph_osd_req_op *ops;
136057cfc106SAlex Elder 	int ret;
136179e3057cSYehuda Sadeh 
136257cfc106SAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
136357cfc106SAlex Elder 	if (!ops)
136457cfc106SAlex Elder 		return -ENOMEM;
136579e3057cSYehuda Sadeh 
136679e3057cSYehuda Sadeh 	ops[0].watch.ver = 0;
13670ce1a794SAlex Elder 	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
136879e3057cSYehuda Sadeh 	ops[0].watch.flag = 0;
136979e3057cSYehuda Sadeh 
13700ce1a794SAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
137179e3057cSYehuda Sadeh 			      CEPH_NOSNAP,
137279e3057cSYehuda Sadeh 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
137379e3057cSYehuda Sadeh 			      ops,
1374070c633fSAlex Elder 			      rbd_dev->header_name,
1375070c633fSAlex Elder 			      0, 0, NULL, NULL, NULL);
1376070c633fSAlex Elder 
137779e3057cSYehuda Sadeh 
137879e3057cSYehuda Sadeh 	rbd_destroy_ops(ops);
13790ce1a794SAlex Elder 	ceph_osdc_cancel_event(rbd_dev->watch_event);
13800ce1a794SAlex Elder 	rbd_dev->watch_event = NULL;
138179e3057cSYehuda Sadeh 	return ret;
138279e3057cSYehuda Sadeh }
138379e3057cSYehuda Sadeh 
/* Wraps a pointer to the rbd device that a notify request concerns. */
struct rbd_notify_info {
	struct rbd_device	*rbd_dev;
};
138759c2be1eSYehuda Sadeh 
138859c2be1eSYehuda Sadeh static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
138959c2be1eSYehuda Sadeh {
13900ce1a794SAlex Elder 	struct rbd_device *rbd_dev = (struct rbd_device *)data;
13910ce1a794SAlex Elder 	if (!rbd_dev)
139259c2be1eSYehuda Sadeh 		return;
139359c2be1eSYehuda Sadeh 
1394bd919d45SAlex Elder 	dout("rbd_notify_cb %s notify_id=%llu opcode=%u\n",
1395bd919d45SAlex Elder 			rbd_dev->header_name, (unsigned long long) notify_id,
1396bd919d45SAlex Elder 			(unsigned int) opcode);
139759c2be1eSYehuda Sadeh }
139859c2be1eSYehuda Sadeh 
139959c2be1eSYehuda Sadeh /*
140059c2be1eSYehuda Sadeh  * Request sync osd notify
140159c2be1eSYehuda Sadeh  */
14024cb16250SAlex Elder static int rbd_req_sync_notify(struct rbd_device *rbd_dev)
140359c2be1eSYehuda Sadeh {
140459c2be1eSYehuda Sadeh 	struct ceph_osd_req_op *ops;
14050ce1a794SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
140659c2be1eSYehuda Sadeh 	struct ceph_osd_event *event;
140759c2be1eSYehuda Sadeh 	struct rbd_notify_info info;
140859c2be1eSYehuda Sadeh 	int payload_len = sizeof(u32) + sizeof(u32);
140959c2be1eSYehuda Sadeh 	int ret;
141059c2be1eSYehuda Sadeh 
141157cfc106SAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY, payload_len);
141257cfc106SAlex Elder 	if (!ops)
141357cfc106SAlex Elder 		return -ENOMEM;
141459c2be1eSYehuda Sadeh 
14150ce1a794SAlex Elder 	info.rbd_dev = rbd_dev;
141659c2be1eSYehuda Sadeh 
141759c2be1eSYehuda Sadeh 	ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
141859c2be1eSYehuda Sadeh 				     (void *)&info, &event);
141959c2be1eSYehuda Sadeh 	if (ret < 0)
142059c2be1eSYehuda Sadeh 		goto fail;
142159c2be1eSYehuda Sadeh 
142259c2be1eSYehuda Sadeh 	ops[0].watch.ver = 1;
142359c2be1eSYehuda Sadeh 	ops[0].watch.flag = 1;
142459c2be1eSYehuda Sadeh 	ops[0].watch.cookie = event->cookie;
142559c2be1eSYehuda Sadeh 	ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
142659c2be1eSYehuda Sadeh 	ops[0].watch.timeout = 12;
142759c2be1eSYehuda Sadeh 
14280ce1a794SAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
142959c2be1eSYehuda Sadeh 			       CEPH_NOSNAP,
143059c2be1eSYehuda Sadeh 			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
143159c2be1eSYehuda Sadeh 			       ops,
14324cb16250SAlex Elder 			       rbd_dev->header_name,
14334cb16250SAlex Elder 			       0, 0, NULL, NULL, NULL);
143459c2be1eSYehuda Sadeh 	if (ret < 0)
143559c2be1eSYehuda Sadeh 		goto fail_event;
143659c2be1eSYehuda Sadeh 
143759c2be1eSYehuda Sadeh 	ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
143859c2be1eSYehuda Sadeh 	dout("ceph_osdc_wait_event returned %d\n", ret);
143959c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
144059c2be1eSYehuda Sadeh 	return 0;
144159c2be1eSYehuda Sadeh 
144259c2be1eSYehuda Sadeh fail_event:
144359c2be1eSYehuda Sadeh 	ceph_osdc_cancel_event(event);
144459c2be1eSYehuda Sadeh fail:
144559c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
144659c2be1eSYehuda Sadeh 	return ret;
144759c2be1eSYehuda Sadeh }
144859c2be1eSYehuda Sadeh 
144959c2be1eSYehuda Sadeh /*
1450602adf40SYehuda Sadeh  * Request sync osd read
1451602adf40SYehuda Sadeh  */
14520ce1a794SAlex Elder static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
1453aded07eaSAlex Elder 			     const char *object_name,
1454aded07eaSAlex Elder 			     const char *class_name,
1455aded07eaSAlex Elder 			     const char *method_name,
1456602adf40SYehuda Sadeh 			     const char *data,
145759c2be1eSYehuda Sadeh 			     int len,
145859c2be1eSYehuda Sadeh 			     u64 *ver)
1459602adf40SYehuda Sadeh {
1460602adf40SYehuda Sadeh 	struct ceph_osd_req_op *ops;
1461aded07eaSAlex Elder 	int class_name_len = strlen(class_name);
1462aded07eaSAlex Elder 	int method_name_len = strlen(method_name);
146357cfc106SAlex Elder 	int ret;
146457cfc106SAlex Elder 
146557cfc106SAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL,
1466aded07eaSAlex Elder 				    class_name_len + method_name_len + len);
146757cfc106SAlex Elder 	if (!ops)
146857cfc106SAlex Elder 		return -ENOMEM;
1469602adf40SYehuda Sadeh 
1470aded07eaSAlex Elder 	ops[0].cls.class_name = class_name;
1471aded07eaSAlex Elder 	ops[0].cls.class_len = (__u8) class_name_len;
1472aded07eaSAlex Elder 	ops[0].cls.method_name = method_name;
1473aded07eaSAlex Elder 	ops[0].cls.method_len = (__u8) method_name_len;
1474602adf40SYehuda Sadeh 	ops[0].cls.argc = 0;
1475602adf40SYehuda Sadeh 	ops[0].cls.indata = data;
1476602adf40SYehuda Sadeh 	ops[0].cls.indata_len = len;
1477602adf40SYehuda Sadeh 
14780ce1a794SAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
1479602adf40SYehuda Sadeh 			       CEPH_NOSNAP,
1480602adf40SYehuda Sadeh 			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1481602adf40SYehuda Sadeh 			       ops,
1482d1f57ea6SAlex Elder 			       object_name, 0, 0, NULL, NULL, ver);
1483602adf40SYehuda Sadeh 
1484602adf40SYehuda Sadeh 	rbd_destroy_ops(ops);
1485602adf40SYehuda Sadeh 
1486602adf40SYehuda Sadeh 	dout("cls_exec returned %d\n", ret);
1487602adf40SYehuda Sadeh 	return ret;
1488602adf40SYehuda Sadeh }
1489602adf40SYehuda Sadeh 
14901fec7093SYehuda Sadeh static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
14911fec7093SYehuda Sadeh {
14921fec7093SYehuda Sadeh 	struct rbd_req_coll *coll =
14931fec7093SYehuda Sadeh 			kzalloc(sizeof(struct rbd_req_coll) +
14941fec7093SYehuda Sadeh 			        sizeof(struct rbd_req_status) * num_reqs,
14951fec7093SYehuda Sadeh 				GFP_ATOMIC);
14961fec7093SYehuda Sadeh 
14971fec7093SYehuda Sadeh 	if (!coll)
14981fec7093SYehuda Sadeh 		return NULL;
14991fec7093SYehuda Sadeh 	coll->total = num_reqs;
15001fec7093SYehuda Sadeh 	kref_init(&coll->kref);
15011fec7093SYehuda Sadeh 	return coll;
15021fec7093SYehuda Sadeh }
15031fec7093SYehuda Sadeh 
1504602adf40SYehuda Sadeh /*
1505602adf40SYehuda Sadeh  * block device queue callback
1506602adf40SYehuda Sadeh  */
1507602adf40SYehuda Sadeh static void rbd_rq_fn(struct request_queue *q)
1508602adf40SYehuda Sadeh {
1509602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
1510602adf40SYehuda Sadeh 	struct request *rq;
1511602adf40SYehuda Sadeh 	struct bio_pair *bp = NULL;
1512602adf40SYehuda Sadeh 
151300f1f36fSAlex Elder 	while ((rq = blk_fetch_request(q))) {
1514602adf40SYehuda Sadeh 		struct bio *bio;
1515602adf40SYehuda Sadeh 		struct bio *rq_bio, *next_bio = NULL;
1516602adf40SYehuda Sadeh 		bool do_write;
1517bd919d45SAlex Elder 		unsigned int size;
1518bd919d45SAlex Elder 		u64 op_size = 0;
1519602adf40SYehuda Sadeh 		u64 ofs;
15201fec7093SYehuda Sadeh 		int num_segs, cur_seg = 0;
15211fec7093SYehuda Sadeh 		struct rbd_req_coll *coll;
1522d1d25646SJosh Durgin 		struct ceph_snap_context *snapc;
1523602adf40SYehuda Sadeh 
1524602adf40SYehuda Sadeh 		dout("fetched request\n");
1525602adf40SYehuda Sadeh 
1526602adf40SYehuda Sadeh 		/* filter out block requests we don't understand */
1527602adf40SYehuda Sadeh 		if ((rq->cmd_type != REQ_TYPE_FS)) {
1528602adf40SYehuda Sadeh 			__blk_end_request_all(rq, 0);
152900f1f36fSAlex Elder 			continue;
1530602adf40SYehuda Sadeh 		}
1531602adf40SYehuda Sadeh 
1532602adf40SYehuda Sadeh 		/* deduce our operation (read, write) */
1533602adf40SYehuda Sadeh 		do_write = (rq_data_dir(rq) == WRITE);
1534602adf40SYehuda Sadeh 
1535602adf40SYehuda Sadeh 		size = blk_rq_bytes(rq);
1536593a9e7bSAlex Elder 		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
1537602adf40SYehuda Sadeh 		rq_bio = rq->bio;
1538f84344f3SAlex Elder 		if (do_write && rbd_dev->mapping.read_only) {
1539602adf40SYehuda Sadeh 			__blk_end_request_all(rq, -EROFS);
154000f1f36fSAlex Elder 			continue;
1541602adf40SYehuda Sadeh 		}
1542602adf40SYehuda Sadeh 
1543602adf40SYehuda Sadeh 		spin_unlock_irq(q->queue_lock);
1544602adf40SYehuda Sadeh 
1545e88a36ecSJosh Durgin 		down_read(&rbd_dev->header_rwsem);
1546e88a36ecSJosh Durgin 
1547f84344f3SAlex Elder 		if (rbd_dev->mapping.snap_id != CEPH_NOSNAP &&
1548f84344f3SAlex Elder 				!rbd_dev->mapping.snap_exists) {
1549d1d25646SJosh Durgin 			up_read(&rbd_dev->header_rwsem);
1550e88a36ecSJosh Durgin 			dout("request for non-existent snapshot");
1551e88a36ecSJosh Durgin 			spin_lock_irq(q->queue_lock);
1552e88a36ecSJosh Durgin 			__blk_end_request_all(rq, -ENXIO);
1553e88a36ecSJosh Durgin 			continue;
1554e88a36ecSJosh Durgin 		}
1555d1d25646SJosh Durgin 
1556d1d25646SJosh Durgin 		snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1557d1d25646SJosh Durgin 
1558d1d25646SJosh Durgin 		up_read(&rbd_dev->header_rwsem);
1559e88a36ecSJosh Durgin 
1560602adf40SYehuda Sadeh 		dout("%s 0x%x bytes at 0x%llx\n",
1561602adf40SYehuda Sadeh 		     do_write ? "write" : "read",
1562bd919d45SAlex Elder 		     size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
1563602adf40SYehuda Sadeh 
15641fec7093SYehuda Sadeh 		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
1565df111be6SAlex Elder 		if (num_segs <= 0) {
1566df111be6SAlex Elder 			spin_lock_irq(q->queue_lock);
1567df111be6SAlex Elder 			__blk_end_request_all(rq, num_segs);
1568df111be6SAlex Elder 			ceph_put_snap_context(snapc);
1569df111be6SAlex Elder 			continue;
1570df111be6SAlex Elder 		}
15711fec7093SYehuda Sadeh 		coll = rbd_alloc_coll(num_segs);
15721fec7093SYehuda Sadeh 		if (!coll) {
15731fec7093SYehuda Sadeh 			spin_lock_irq(q->queue_lock);
15741fec7093SYehuda Sadeh 			__blk_end_request_all(rq, -ENOMEM);
1575d1d25646SJosh Durgin 			ceph_put_snap_context(snapc);
157600f1f36fSAlex Elder 			continue;
15771fec7093SYehuda Sadeh 		}
15781fec7093SYehuda Sadeh 
1579602adf40SYehuda Sadeh 		do {
1580602adf40SYehuda Sadeh 			/* a bio clone to be passed down to OSD req */
1581bd919d45SAlex Elder 			dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
158265ccfe21SAlex Elder 			op_size = rbd_segment_length(rbd_dev, ofs, size);
15831fec7093SYehuda Sadeh 			kref_get(&coll->kref);
1584602adf40SYehuda Sadeh 			bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
1585602adf40SYehuda Sadeh 					      op_size, GFP_ATOMIC);
1586602adf40SYehuda Sadeh 			if (!bio) {
15871fec7093SYehuda Sadeh 				rbd_coll_end_req_index(rq, coll, cur_seg,
15881fec7093SYehuda Sadeh 						       -ENOMEM, op_size);
15891fec7093SYehuda Sadeh 				goto next_seg;
1590602adf40SYehuda Sadeh 			}
1591602adf40SYehuda Sadeh 
15921fec7093SYehuda Sadeh 
1593602adf40SYehuda Sadeh 			/* init OSD command: write or read */
1594602adf40SYehuda Sadeh 			if (do_write)
1595602adf40SYehuda Sadeh 				rbd_req_write(rq, rbd_dev,
1596d1d25646SJosh Durgin 					      snapc,
1597602adf40SYehuda Sadeh 					      ofs,
15981fec7093SYehuda Sadeh 					      op_size, bio,
15991fec7093SYehuda Sadeh 					      coll, cur_seg);
1600602adf40SYehuda Sadeh 			else
1601602adf40SYehuda Sadeh 				rbd_req_read(rq, rbd_dev,
1602f84344f3SAlex Elder 					     rbd_dev->mapping.snap_id,
1603602adf40SYehuda Sadeh 					     ofs,
16041fec7093SYehuda Sadeh 					     op_size, bio,
16051fec7093SYehuda Sadeh 					     coll, cur_seg);
1606602adf40SYehuda Sadeh 
16071fec7093SYehuda Sadeh next_seg:
1608602adf40SYehuda Sadeh 			size -= op_size;
1609602adf40SYehuda Sadeh 			ofs += op_size;
1610602adf40SYehuda Sadeh 
16111fec7093SYehuda Sadeh 			cur_seg++;
1612602adf40SYehuda Sadeh 			rq_bio = next_bio;
1613602adf40SYehuda Sadeh 		} while (size > 0);
16141fec7093SYehuda Sadeh 		kref_put(&coll->kref, rbd_coll_release);
1615602adf40SYehuda Sadeh 
1616602adf40SYehuda Sadeh 		if (bp)
1617602adf40SYehuda Sadeh 			bio_pair_release(bp);
1618602adf40SYehuda Sadeh 		spin_lock_irq(q->queue_lock);
1619d1d25646SJosh Durgin 
1620d1d25646SJosh Durgin 		ceph_put_snap_context(snapc);
1621602adf40SYehuda Sadeh 	}
1622602adf40SYehuda Sadeh }
1623602adf40SYehuda Sadeh 
1624602adf40SYehuda Sadeh /*
1625602adf40SYehuda Sadeh  * a queue callback. Makes sure that we don't create a bio that spans across
1626602adf40SYehuda Sadeh  * multiple osd objects. One exception would be with a single page bios,
1627602adf40SYehuda Sadeh  * which we handle later at bio_chain_clone
1628602adf40SYehuda Sadeh  */
1629602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1630602adf40SYehuda Sadeh 			  struct bio_vec *bvec)
1631602adf40SYehuda Sadeh {
1632602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
1633593a9e7bSAlex Elder 	unsigned int chunk_sectors;
1634593a9e7bSAlex Elder 	sector_t sector;
1635593a9e7bSAlex Elder 	unsigned int bio_sectors;
1636602adf40SYehuda Sadeh 	int max;
1637602adf40SYehuda Sadeh 
1638593a9e7bSAlex Elder 	chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1639593a9e7bSAlex Elder 	sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
1640593a9e7bSAlex Elder 	bio_sectors = bmd->bi_size >> SECTOR_SHIFT;
1641593a9e7bSAlex Elder 
1642602adf40SYehuda Sadeh 	max =  (chunk_sectors - ((sector & (chunk_sectors - 1))
1643593a9e7bSAlex Elder 				 + bio_sectors)) << SECTOR_SHIFT;
1644602adf40SYehuda Sadeh 	if (max < 0)
1645602adf40SYehuda Sadeh 		max = 0; /* bio_add cannot handle a negative return */
1646602adf40SYehuda Sadeh 	if (max <= bvec->bv_len && bio_sectors == 0)
1647602adf40SYehuda Sadeh 		return bvec->bv_len;
1648602adf40SYehuda Sadeh 	return max;
1649602adf40SYehuda Sadeh }
1650602adf40SYehuda Sadeh 
1651602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
1652602adf40SYehuda Sadeh {
1653602adf40SYehuda Sadeh 	struct gendisk *disk = rbd_dev->disk;
1654602adf40SYehuda Sadeh 
1655602adf40SYehuda Sadeh 	if (!disk)
1656602adf40SYehuda Sadeh 		return;
1657602adf40SYehuda Sadeh 
1658602adf40SYehuda Sadeh 	rbd_header_free(&rbd_dev->header);
1659602adf40SYehuda Sadeh 
1660602adf40SYehuda Sadeh 	if (disk->flags & GENHD_FL_UP)
1661602adf40SYehuda Sadeh 		del_gendisk(disk);
1662602adf40SYehuda Sadeh 	if (disk->queue)
1663602adf40SYehuda Sadeh 		blk_cleanup_queue(disk->queue);
1664602adf40SYehuda Sadeh 	put_disk(disk);
1665602adf40SYehuda Sadeh }
1666602adf40SYehuda Sadeh 
1667602adf40SYehuda Sadeh /*
16684156d998SAlex Elder  * Read the complete header for the given rbd device.
16694156d998SAlex Elder  *
16704156d998SAlex Elder  * Returns a pointer to a dynamically-allocated buffer containing
16714156d998SAlex Elder  * the complete and validated header.  Caller can pass the address
16724156d998SAlex Elder  * of a variable that will be filled in with the version of the
16734156d998SAlex Elder  * header object at the time it was read.
16744156d998SAlex Elder  *
16754156d998SAlex Elder  * Returns a pointer-coded errno if a failure occurs.
16764156d998SAlex Elder  */
16774156d998SAlex Elder static struct rbd_image_header_ondisk *
16784156d998SAlex Elder rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
16794156d998SAlex Elder {
16804156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk = NULL;
16814156d998SAlex Elder 	u32 snap_count = 0;
16824156d998SAlex Elder 	u64 names_size = 0;
16834156d998SAlex Elder 	u32 want_count;
16844156d998SAlex Elder 	int ret;
16854156d998SAlex Elder 
16864156d998SAlex Elder 	/*
16874156d998SAlex Elder 	 * The complete header will include an array of its 64-bit
16884156d998SAlex Elder 	 * snapshot ids, followed by the names of those snapshots as
16894156d998SAlex Elder 	 * a contiguous block of NUL-terminated strings.  Note that
16904156d998SAlex Elder 	 * the number of snapshots could change by the time we read
16914156d998SAlex Elder 	 * it in, in which case we re-read it.
16924156d998SAlex Elder 	 */
16934156d998SAlex Elder 	do {
16944156d998SAlex Elder 		size_t size;
16954156d998SAlex Elder 
16964156d998SAlex Elder 		kfree(ondisk);
16974156d998SAlex Elder 
16984156d998SAlex Elder 		size = sizeof (*ondisk);
16994156d998SAlex Elder 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
17004156d998SAlex Elder 		size += names_size;
17014156d998SAlex Elder 		ondisk = kmalloc(size, GFP_KERNEL);
17024156d998SAlex Elder 		if (!ondisk)
17034156d998SAlex Elder 			return ERR_PTR(-ENOMEM);
17044156d998SAlex Elder 
17054156d998SAlex Elder 		ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
17064156d998SAlex Elder 				       rbd_dev->header_name,
17074156d998SAlex Elder 				       0, size,
17084156d998SAlex Elder 				       (char *) ondisk, version);
17094156d998SAlex Elder 
17104156d998SAlex Elder 		if (ret < 0)
17114156d998SAlex Elder 			goto out_err;
17124156d998SAlex Elder 		if (WARN_ON((size_t) ret < size)) {
17134156d998SAlex Elder 			ret = -ENXIO;
17144156d998SAlex Elder 			pr_warning("short header read for image %s"
17154156d998SAlex Elder 					" (want %zd got %d)\n",
17164156d998SAlex Elder 				rbd_dev->image_name, size, ret);
17174156d998SAlex Elder 			goto out_err;
17184156d998SAlex Elder 		}
17194156d998SAlex Elder 		if (!rbd_dev_ondisk_valid(ondisk)) {
17204156d998SAlex Elder 			ret = -ENXIO;
17214156d998SAlex Elder 			pr_warning("invalid header for image %s\n",
17224156d998SAlex Elder 				rbd_dev->image_name);
17234156d998SAlex Elder 			goto out_err;
17244156d998SAlex Elder 		}
17254156d998SAlex Elder 
17264156d998SAlex Elder 		names_size = le64_to_cpu(ondisk->snap_names_len);
17274156d998SAlex Elder 		want_count = snap_count;
17284156d998SAlex Elder 		snap_count = le32_to_cpu(ondisk->snap_count);
17294156d998SAlex Elder 	} while (snap_count != want_count);
17304156d998SAlex Elder 
17314156d998SAlex Elder 	return ondisk;
17324156d998SAlex Elder 
17334156d998SAlex Elder out_err:
17344156d998SAlex Elder 	kfree(ondisk);
17354156d998SAlex Elder 
17364156d998SAlex Elder 	return ERR_PTR(ret);
17374156d998SAlex Elder }
17384156d998SAlex Elder 
17394156d998SAlex Elder /*
1740602adf40SYehuda Sadeh  * reload the ondisk the header
1741602adf40SYehuda Sadeh  */
1742602adf40SYehuda Sadeh static int rbd_read_header(struct rbd_device *rbd_dev,
1743602adf40SYehuda Sadeh 			   struct rbd_image_header *header)
1744602adf40SYehuda Sadeh {
17454156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk;
17464156d998SAlex Elder 	u64 ver = 0;
17474156d998SAlex Elder 	int ret;
1748602adf40SYehuda Sadeh 
17494156d998SAlex Elder 	ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
17504156d998SAlex Elder 	if (IS_ERR(ondisk))
17514156d998SAlex Elder 		return PTR_ERR(ondisk);
17524156d998SAlex Elder 	ret = rbd_header_from_disk(header, ondisk);
17534156d998SAlex Elder 	if (ret >= 0)
175459c2be1eSYehuda Sadeh 		header->obj_version = ver;
17554156d998SAlex Elder 	kfree(ondisk);
1756602adf40SYehuda Sadeh 
17574156d998SAlex Elder 	return ret;
1758602adf40SYehuda Sadeh }
1759602adf40SYehuda Sadeh 
1760602adf40SYehuda Sadeh /*
1761602adf40SYehuda Sadeh  * create a snapshot
1762602adf40SYehuda Sadeh  */
17630ce1a794SAlex Elder static int rbd_header_add_snap(struct rbd_device *rbd_dev,
1764602adf40SYehuda Sadeh 			       const char *snap_name,
1765602adf40SYehuda Sadeh 			       gfp_t gfp_flags)
1766602adf40SYehuda Sadeh {
1767602adf40SYehuda Sadeh 	int name_len = strlen(snap_name);
1768602adf40SYehuda Sadeh 	u64 new_snapid;
1769602adf40SYehuda Sadeh 	int ret;
1770916d4d67SSage Weil 	void *data, *p, *e;
17711dbb4399SAlex Elder 	struct ceph_mon_client *monc;
1772602adf40SYehuda Sadeh 
1773602adf40SYehuda Sadeh 	/* we should create a snapshot only if we're pointing at the head */
1774f84344f3SAlex Elder 	if (rbd_dev->mapping.snap_id != CEPH_NOSNAP)
1775602adf40SYehuda Sadeh 		return -EINVAL;
1776602adf40SYehuda Sadeh 
17770ce1a794SAlex Elder 	monc = &rbd_dev->rbd_client->client->monc;
17780ce1a794SAlex Elder 	ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid);
1779bd919d45SAlex Elder 	dout("created snapid=%llu\n", (unsigned long long) new_snapid);
1780602adf40SYehuda Sadeh 	if (ret < 0)
1781602adf40SYehuda Sadeh 		return ret;
1782602adf40SYehuda Sadeh 
1783602adf40SYehuda Sadeh 	data = kmalloc(name_len + 16, gfp_flags);
1784602adf40SYehuda Sadeh 	if (!data)
1785602adf40SYehuda Sadeh 		return -ENOMEM;
1786602adf40SYehuda Sadeh 
1787916d4d67SSage Weil 	p = data;
1788916d4d67SSage Weil 	e = data + name_len + 16;
1789602adf40SYehuda Sadeh 
1790916d4d67SSage Weil 	ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
1791916d4d67SSage Weil 	ceph_encode_64_safe(&p, e, new_snapid, bad);
1792602adf40SYehuda Sadeh 
17930bed54dcSAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
17940ce1a794SAlex Elder 				"rbd", "snap_add",
1795d67d4be5SAlex Elder 				data, p - data, NULL);
1796602adf40SYehuda Sadeh 
1797916d4d67SSage Weil 	kfree(data);
1798602adf40SYehuda Sadeh 
1799505cbb9bSAlex Elder 	return ret < 0 ? ret : 0;
1800602adf40SYehuda Sadeh bad:
1801602adf40SYehuda Sadeh 	return -ERANGE;
1802602adf40SYehuda Sadeh }
1803602adf40SYehuda Sadeh 
1804dfc5606dSYehuda Sadeh static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1805dfc5606dSYehuda Sadeh {
1806dfc5606dSYehuda Sadeh 	struct rbd_snap *snap;
1807a0593290SAlex Elder 	struct rbd_snap *next;
1808dfc5606dSYehuda Sadeh 
1809a0593290SAlex Elder 	list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
181014e7085dSAlex Elder 		__rbd_remove_snap_dev(snap);
1811dfc5606dSYehuda Sadeh }
1812dfc5606dSYehuda Sadeh 
1813602adf40SYehuda Sadeh /*
1814602adf40SYehuda Sadeh  * only read the first part of the ondisk header, without the snaps info
1815602adf40SYehuda Sadeh  */
1816b813623aSAlex Elder static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
1817602adf40SYehuda Sadeh {
1818602adf40SYehuda Sadeh 	int ret;
1819602adf40SYehuda Sadeh 	struct rbd_image_header h;
1820602adf40SYehuda Sadeh 
1821602adf40SYehuda Sadeh 	ret = rbd_read_header(rbd_dev, &h);
1822602adf40SYehuda Sadeh 	if (ret < 0)
1823602adf40SYehuda Sadeh 		return ret;
1824602adf40SYehuda Sadeh 
1825a51aa0c0SJosh Durgin 	down_write(&rbd_dev->header_rwsem);
1826a51aa0c0SJosh Durgin 
18279db4b3e3SSage Weil 	/* resized? */
1828f84344f3SAlex Elder 	if (rbd_dev->mapping.snap_id == CEPH_NOSNAP) {
1829474ef7ceSJosh Durgin 		sector_t size = (sector_t) h.image_size / SECTOR_SIZE;
1830474ef7ceSJosh Durgin 
183199c1f08fSAlex Elder 		if (size != (sector_t) rbd_dev->mapping.size) {
183299c1f08fSAlex Elder 			dout("setting size to %llu sectors",
183399c1f08fSAlex Elder 				(unsigned long long) size);
183499c1f08fSAlex Elder 			rbd_dev->mapping.size = (u64) size;
1835474ef7ceSJosh Durgin 			set_capacity(rbd_dev->disk, size);
1836474ef7ceSJosh Durgin 		}
183799c1f08fSAlex Elder 	}
18389db4b3e3SSage Weil 
1839849b4260SAlex Elder 	/* rbd_dev->header.object_prefix shouldn't change */
1840602adf40SYehuda Sadeh 	kfree(rbd_dev->header.snap_sizes);
1841849b4260SAlex Elder 	kfree(rbd_dev->header.snap_names);
1842d1d25646SJosh Durgin 	/* osd requests may still refer to snapc */
1843d1d25646SJosh Durgin 	ceph_put_snap_context(rbd_dev->header.snapc);
1844602adf40SYehuda Sadeh 
1845b813623aSAlex Elder 	if (hver)
1846b813623aSAlex Elder 		*hver = h.obj_version;
1847a71b891bSJosh Durgin 	rbd_dev->header.obj_version = h.obj_version;
184893a24e08SJosh Durgin 	rbd_dev->header.image_size = h.image_size;
1849602adf40SYehuda Sadeh 	rbd_dev->header.snapc = h.snapc;
1850602adf40SYehuda Sadeh 	rbd_dev->header.snap_names = h.snap_names;
1851602adf40SYehuda Sadeh 	rbd_dev->header.snap_sizes = h.snap_sizes;
1852849b4260SAlex Elder 	/* Free the extra copy of the object prefix */
1853849b4260SAlex Elder 	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1854849b4260SAlex Elder 	kfree(h.object_prefix);
1855849b4260SAlex Elder 
18569fcbb800SAlex Elder 	ret = rbd_dev_snap_devs_update(rbd_dev);
1857dfc5606dSYehuda Sadeh 
1858c666601aSJosh Durgin 	up_write(&rbd_dev->header_rwsem);
1859602adf40SYehuda Sadeh 
1860dfc5606dSYehuda Sadeh 	return ret;
1861602adf40SYehuda Sadeh }
1862602adf40SYehuda Sadeh 
18631fe5e993SAlex Elder static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
18641fe5e993SAlex Elder {
18651fe5e993SAlex Elder 	int ret;
18661fe5e993SAlex Elder 
18671fe5e993SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
18681fe5e993SAlex Elder 	ret = __rbd_refresh_header(rbd_dev, hver);
18691fe5e993SAlex Elder 	mutex_unlock(&ctl_mutex);
18701fe5e993SAlex Elder 
18711fe5e993SAlex Elder 	return ret;
18721fe5e993SAlex Elder }
18731fe5e993SAlex Elder 
1874602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
1875602adf40SYehuda Sadeh {
1876602adf40SYehuda Sadeh 	struct gendisk *disk;
1877602adf40SYehuda Sadeh 	struct request_queue *q;
1878602adf40SYehuda Sadeh 	int rc;
1879593a9e7bSAlex Elder 	u64 segment_size;
1880602adf40SYehuda Sadeh 
1881602adf40SYehuda Sadeh 	/* contact OSD, request size info about the object being mapped */
1882602adf40SYehuda Sadeh 	rc = rbd_read_header(rbd_dev, &rbd_dev->header);
1883602adf40SYehuda Sadeh 	if (rc)
1884602adf40SYehuda Sadeh 		return rc;
1885602adf40SYehuda Sadeh 
1886dfc5606dSYehuda Sadeh 	/* no need to lock here, as rbd_dev is not registered yet */
18879fcbb800SAlex Elder 	rc = rbd_dev_snap_devs_update(rbd_dev);
1888dfc5606dSYehuda Sadeh 	if (rc)
1889dfc5606dSYehuda Sadeh 		return rc;
1890dfc5606dSYehuda Sadeh 
189199c1f08fSAlex Elder 	rc = rbd_header_set_snap(rbd_dev);
1892602adf40SYehuda Sadeh 	if (rc)
1893602adf40SYehuda Sadeh 		return rc;
1894602adf40SYehuda Sadeh 
1895602adf40SYehuda Sadeh 	/* create gendisk info */
1896602adf40SYehuda Sadeh 	rc = -ENOMEM;
1897602adf40SYehuda Sadeh 	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1898602adf40SYehuda Sadeh 	if (!disk)
1899602adf40SYehuda Sadeh 		goto out;
1900602adf40SYehuda Sadeh 
1901f0f8cef5SAlex Elder 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
1902de71a297SAlex Elder 		 rbd_dev->dev_id);
1903602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
1904602adf40SYehuda Sadeh 	disk->first_minor = 0;
1905602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
1906602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
1907602adf40SYehuda Sadeh 
1908602adf40SYehuda Sadeh 	/* init rq */
1909602adf40SYehuda Sadeh 	rc = -ENOMEM;
1910602adf40SYehuda Sadeh 	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1911602adf40SYehuda Sadeh 	if (!q)
1912602adf40SYehuda Sadeh 		goto out_disk;
1913029bcbd8SJosh Durgin 
1914593a9e7bSAlex Elder 	/* We use the default size, but let's be explicit about it. */
1915593a9e7bSAlex Elder 	blk_queue_physical_block_size(q, SECTOR_SIZE);
1916593a9e7bSAlex Elder 
1917029bcbd8SJosh Durgin 	/* set io sizes to object size */
1918593a9e7bSAlex Elder 	segment_size = rbd_obj_bytes(&rbd_dev->header);
1919593a9e7bSAlex Elder 	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
1920593a9e7bSAlex Elder 	blk_queue_max_segment_size(q, segment_size);
1921593a9e7bSAlex Elder 	blk_queue_io_min(q, segment_size);
1922593a9e7bSAlex Elder 	blk_queue_io_opt(q, segment_size);
1923029bcbd8SJosh Durgin 
1924602adf40SYehuda Sadeh 	blk_queue_merge_bvec(q, rbd_merge_bvec);
1925602adf40SYehuda Sadeh 	disk->queue = q;
1926602adf40SYehuda Sadeh 
1927602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
1928602adf40SYehuda Sadeh 
1929602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
1930602adf40SYehuda Sadeh 
1931602adf40SYehuda Sadeh 	/* finally, announce the disk to the world */
193299c1f08fSAlex Elder 	set_capacity(disk, (sector_t) rbd_dev->mapping.size / SECTOR_SIZE);
1933602adf40SYehuda Sadeh 	add_disk(disk);
1934602adf40SYehuda Sadeh 
1935602adf40SYehuda Sadeh 	pr_info("%s: added with size 0x%llx\n",
193699c1f08fSAlex Elder 		disk->disk_name, (unsigned long long) rbd_dev->mapping.size);
1937602adf40SYehuda Sadeh 	return 0;
1938602adf40SYehuda Sadeh 
1939602adf40SYehuda Sadeh out_disk:
1940602adf40SYehuda Sadeh 	put_disk(disk);
1941602adf40SYehuda Sadeh out:
1942602adf40SYehuda Sadeh 	return rc;
1943602adf40SYehuda Sadeh }
1944602adf40SYehuda Sadeh 
1945dfc5606dSYehuda Sadeh /*
1946dfc5606dSYehuda Sadeh   sysfs
1947dfc5606dSYehuda Sadeh */
1948602adf40SYehuda Sadeh 
1949593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
1950593a9e7bSAlex Elder {
1951593a9e7bSAlex Elder 	return container_of(dev, struct rbd_device, dev);
1952593a9e7bSAlex Elder }
1953593a9e7bSAlex Elder 
1954dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
1955dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
1956602adf40SYehuda Sadeh {
1957593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1958a51aa0c0SJosh Durgin 	sector_t size;
1959dfc5606dSYehuda Sadeh 
1960a51aa0c0SJosh Durgin 	down_read(&rbd_dev->header_rwsem);
1961a51aa0c0SJosh Durgin 	size = get_capacity(rbd_dev->disk);
1962a51aa0c0SJosh Durgin 	up_read(&rbd_dev->header_rwsem);
1963a51aa0c0SJosh Durgin 
1964a51aa0c0SJosh Durgin 	return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
1965602adf40SYehuda Sadeh }
1966602adf40SYehuda Sadeh 
1967dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
1968dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
1969602adf40SYehuda Sadeh {
1970593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1971dfc5606dSYehuda Sadeh 
1972dfc5606dSYehuda Sadeh 	return sprintf(buf, "%d\n", rbd_dev->major);
1973dfc5606dSYehuda Sadeh }
1974dfc5606dSYehuda Sadeh 
1975dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
1976dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
1977dfc5606dSYehuda Sadeh {
1978593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1979dfc5606dSYehuda Sadeh 
19801dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
19811dbb4399SAlex Elder 			ceph_client_id(rbd_dev->rbd_client->client));
1982dfc5606dSYehuda Sadeh }
1983dfc5606dSYehuda Sadeh 
1984dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
1985dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
1986dfc5606dSYehuda Sadeh {
1987593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1988dfc5606dSYehuda Sadeh 
1989dfc5606dSYehuda Sadeh 	return sprintf(buf, "%s\n", rbd_dev->pool_name);
1990dfc5606dSYehuda Sadeh }
1991dfc5606dSYehuda Sadeh 
19929bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
19939bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
19949bb2f334SAlex Elder {
19959bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
19969bb2f334SAlex Elder 
19979bb2f334SAlex Elder 	return sprintf(buf, "%d\n", rbd_dev->pool_id);
19989bb2f334SAlex Elder }
19999bb2f334SAlex Elder 
2000dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
2001dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
2002dfc5606dSYehuda Sadeh {
2003593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2004dfc5606dSYehuda Sadeh 
20050bed54dcSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->image_name);
2006dfc5606dSYehuda Sadeh }
2007dfc5606dSYehuda Sadeh 
2008dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
2009dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
2010dfc5606dSYehuda Sadeh 			     char *buf)
2011dfc5606dSYehuda Sadeh {
2012593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2013dfc5606dSYehuda Sadeh 
2014f84344f3SAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->mapping.snap_name);
2015dfc5606dSYehuda Sadeh }
2016dfc5606dSYehuda Sadeh 
2017dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
2018dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
2019dfc5606dSYehuda Sadeh 				 const char *buf,
2020dfc5606dSYehuda Sadeh 				 size_t size)
2021dfc5606dSYehuda Sadeh {
2022593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2023b813623aSAlex Elder 	int ret;
2024602adf40SYehuda Sadeh 
20251fe5e993SAlex Elder 	ret = rbd_refresh_header(rbd_dev, NULL);
2026b813623aSAlex Elder 
2027b813623aSAlex Elder 	return ret < 0 ? ret : size;
2028dfc5606dSYehuda Sadeh }
2029602adf40SYehuda Sadeh 
/* Read-only attributes of a mapped rbd device */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);

/* Write-only control attributes (owner may write, nobody may read) */
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);
2039dfc5606dSYehuda Sadeh 
/*
 * NULL-terminated table of the attributes defined above for an rbd
 * device; referenced by rbd_attr_group below.
 */
static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_refresh.attr,
	&dev_attr_create_snap.attr,
	NULL
};

/* Unnamed attribute group wrapping rbd_attrs */
static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

/* NULL-terminated group list installed via rbd_device_type.groups */
static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};
2061dfc5606dSYehuda Sadeh 
/*
 * Release callback installed in rbd_device_type; deliberately does
 * nothing.
 *
 * NOTE(review): rbd_bus_add_dev() sets dev->release to
 * rbd_dev_release for every rbd device, so presumably the real
 * teardown happens there and this stub only satisfies the device
 * type -- confirm.
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
}
2065dfc5606dSYehuda Sadeh 
/*
 * Device type assigned to each rbd device in rbd_bus_add_dev(); it
 * carries the sysfs attribute groups defined above.
 */
static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
2071dfc5606dSYehuda Sadeh 
2072dfc5606dSYehuda Sadeh 
2073dfc5606dSYehuda Sadeh /*
2074dfc5606dSYehuda Sadeh   sysfs - snapshots
2075dfc5606dSYehuda Sadeh */
2076dfc5606dSYehuda Sadeh 
2077dfc5606dSYehuda Sadeh static ssize_t rbd_snap_size_show(struct device *dev,
2078dfc5606dSYehuda Sadeh 				  struct device_attribute *attr,
2079dfc5606dSYehuda Sadeh 				  char *buf)
2080dfc5606dSYehuda Sadeh {
2081dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2082dfc5606dSYehuda Sadeh 
20833591538fSJosh Durgin 	return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
2084dfc5606dSYehuda Sadeh }
2085dfc5606dSYehuda Sadeh 
2086dfc5606dSYehuda Sadeh static ssize_t rbd_snap_id_show(struct device *dev,
2087dfc5606dSYehuda Sadeh 				struct device_attribute *attr,
2088dfc5606dSYehuda Sadeh 				char *buf)
2089dfc5606dSYehuda Sadeh {
2090dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2091dfc5606dSYehuda Sadeh 
2092593a9e7bSAlex Elder 	return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
2093dfc5606dSYehuda Sadeh }
2094dfc5606dSYehuda Sadeh 
/* Read-only attributes of a snapshot device */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);

/* NULL-terminated table of the snapshot attributes above */
static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	NULL,
};

/* Unnamed attribute group wrapping rbd_snap_attrs */
static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};
2107dfc5606dSYehuda Sadeh 
2108dfc5606dSYehuda Sadeh static void rbd_snap_dev_release(struct device *dev)
2109dfc5606dSYehuda Sadeh {
2110dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2111dfc5606dSYehuda Sadeh 	kfree(snap->name);
2112dfc5606dSYehuda Sadeh 	kfree(snap);
2113dfc5606dSYehuda Sadeh }
2114dfc5606dSYehuda Sadeh 
/* NULL-terminated group list installed via rbd_snap_device_type.groups */
static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

/*
 * Device type assigned to each snapshot device in
 * rbd_register_snap_dev(); its release hook frees the rbd_snap.
 */
static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2124dfc5606dSYehuda Sadeh 
212514e7085dSAlex Elder static void __rbd_remove_snap_dev(struct rbd_snap *snap)
2126dfc5606dSYehuda Sadeh {
2127dfc5606dSYehuda Sadeh 	list_del(&snap->node);
2128dfc5606dSYehuda Sadeh 	device_unregister(&snap->dev);
2129dfc5606dSYehuda Sadeh }
2130dfc5606dSYehuda Sadeh 
213114e7085dSAlex Elder static int rbd_register_snap_dev(struct rbd_snap *snap,
2132dfc5606dSYehuda Sadeh 				  struct device *parent)
2133dfc5606dSYehuda Sadeh {
2134dfc5606dSYehuda Sadeh 	struct device *dev = &snap->dev;
2135dfc5606dSYehuda Sadeh 	int ret;
2136dfc5606dSYehuda Sadeh 
2137dfc5606dSYehuda Sadeh 	dev->type = &rbd_snap_device_type;
2138dfc5606dSYehuda Sadeh 	dev->parent = parent;
2139dfc5606dSYehuda Sadeh 	dev->release = rbd_snap_dev_release;
2140dfc5606dSYehuda Sadeh 	dev_set_name(dev, "snap_%s", snap->name);
2141dfc5606dSYehuda Sadeh 	ret = device_register(dev);
2142dfc5606dSYehuda Sadeh 
2143dfc5606dSYehuda Sadeh 	return ret;
2144dfc5606dSYehuda Sadeh }
2145dfc5606dSYehuda Sadeh 
21464e891e0aSAlex Elder static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
21474e891e0aSAlex Elder 					      int i, const char *name)
2148dfc5606dSYehuda Sadeh {
21494e891e0aSAlex Elder 	struct rbd_snap *snap;
2150dfc5606dSYehuda Sadeh 	int ret;
21514e891e0aSAlex Elder 
21524e891e0aSAlex Elder 	snap = kzalloc(sizeof (*snap), GFP_KERNEL);
2153dfc5606dSYehuda Sadeh 	if (!snap)
21544e891e0aSAlex Elder 		return ERR_PTR(-ENOMEM);
21554e891e0aSAlex Elder 
21564e891e0aSAlex Elder 	ret = -ENOMEM;
2157dfc5606dSYehuda Sadeh 	snap->name = kstrdup(name, GFP_KERNEL);
21584e891e0aSAlex Elder 	if (!snap->name)
21594e891e0aSAlex Elder 		goto err;
21604e891e0aSAlex Elder 
2161dfc5606dSYehuda Sadeh 	snap->size = rbd_dev->header.snap_sizes[i];
2162dfc5606dSYehuda Sadeh 	snap->id = rbd_dev->header.snapc->snaps[i];
2163dfc5606dSYehuda Sadeh 	if (device_is_registered(&rbd_dev->dev)) {
216414e7085dSAlex Elder 		ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2165dfc5606dSYehuda Sadeh 		if (ret < 0)
2166dfc5606dSYehuda Sadeh 			goto err;
2167dfc5606dSYehuda Sadeh 	}
21684e891e0aSAlex Elder 
21694e891e0aSAlex Elder 	return snap;
21704e891e0aSAlex Elder 
2171dfc5606dSYehuda Sadeh err:
2172dfc5606dSYehuda Sadeh 	kfree(snap->name);
2173dfc5606dSYehuda Sadeh 	kfree(snap);
21744e891e0aSAlex Elder 
21754e891e0aSAlex Elder 	return ERR_PTR(ret);
2176dfc5606dSYehuda Sadeh }
2177dfc5606dSYehuda Sadeh 
2178dfc5606dSYehuda Sadeh /*
217935938150SAlex Elder  * Scan the rbd device's current snapshot list and compare it to the
218035938150SAlex Elder  * newly-received snapshot context.  Remove any existing snapshots
218135938150SAlex Elder  * not present in the new snapshot context.  Add a new snapshot for
218235938150SAlex Elder  * any snaphots in the snapshot context not in the current list.
218335938150SAlex Elder  * And verify there are no changes to snapshots we already know
218435938150SAlex Elder  * about.
218535938150SAlex Elder  *
218635938150SAlex Elder  * Assumes the snapshots in the snapshot context are sorted by
218735938150SAlex Elder  * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
218835938150SAlex Elder  * are also maintained in that order.)
2189dfc5606dSYehuda Sadeh  */
21909fcbb800SAlex Elder static int rbd_dev_snap_devs_update(struct rbd_device *rbd_dev)
2191dfc5606dSYehuda Sadeh {
219235938150SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
219335938150SAlex Elder 	const u32 snap_count = snapc->num_snaps;
219435938150SAlex Elder 	char *snap_name = rbd_dev->header.snap_names;
219535938150SAlex Elder 	struct list_head *head = &rbd_dev->snaps;
219635938150SAlex Elder 	struct list_head *links = head->next;
219735938150SAlex Elder 	u32 index = 0;
2198dfc5606dSYehuda Sadeh 
21999fcbb800SAlex Elder 	dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
220035938150SAlex Elder 	while (index < snap_count || links != head) {
220135938150SAlex Elder 		u64 snap_id;
220235938150SAlex Elder 		struct rbd_snap *snap;
2203dfc5606dSYehuda Sadeh 
220435938150SAlex Elder 		snap_id = index < snap_count ? snapc->snaps[index]
220535938150SAlex Elder 					     : CEPH_NOSNAP;
220635938150SAlex Elder 		snap = links != head ? list_entry(links, struct rbd_snap, node)
220735938150SAlex Elder 				     : NULL;
2208aafb230eSAlex Elder 		rbd_assert(!snap || snap->id != CEPH_NOSNAP);
2209dfc5606dSYehuda Sadeh 
221035938150SAlex Elder 		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
221135938150SAlex Elder 			struct list_head *next = links->next;
2212dfc5606dSYehuda Sadeh 
221335938150SAlex Elder 			/* Existing snapshot not in the new snap context */
2214dfc5606dSYehuda Sadeh 
2215f84344f3SAlex Elder 			if (rbd_dev->mapping.snap_id == snap->id)
2216f84344f3SAlex Elder 				rbd_dev->mapping.snap_exists = false;
221735938150SAlex Elder 			__rbd_remove_snap_dev(snap);
22189fcbb800SAlex Elder 			dout("%ssnap id %llu has been removed\n",
2219f84344f3SAlex Elder 				rbd_dev->mapping.snap_id == snap->id ?
2220f84344f3SAlex Elder 								"mapped " : "",
22219fcbb800SAlex Elder 				(unsigned long long) snap->id);
2222dfc5606dSYehuda Sadeh 
222335938150SAlex Elder 			/* Done with this list entry; advance */
222435938150SAlex Elder 
222535938150SAlex Elder 			links = next;
222635938150SAlex Elder 			continue;
2227dfc5606dSYehuda Sadeh 		}
222835938150SAlex Elder 
22299fcbb800SAlex Elder 		dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
22309fcbb800SAlex Elder 			(unsigned long long) snap_id);
223135938150SAlex Elder 		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
223235938150SAlex Elder 			struct rbd_snap *new_snap;
223335938150SAlex Elder 
223435938150SAlex Elder 			/* We haven't seen this snapshot before */
223535938150SAlex Elder 
223635938150SAlex Elder 			new_snap = __rbd_add_snap_dev(rbd_dev, index,
223735938150SAlex Elder 							snap_name);
22389fcbb800SAlex Elder 			if (IS_ERR(new_snap)) {
22399fcbb800SAlex Elder 				int err = PTR_ERR(new_snap);
22409fcbb800SAlex Elder 
22419fcbb800SAlex Elder 				dout("  failed to add dev, error %d\n", err);
22429fcbb800SAlex Elder 
22439fcbb800SAlex Elder 				return err;
22449fcbb800SAlex Elder 			}
224535938150SAlex Elder 
224635938150SAlex Elder 			/* New goes before existing, or at end of list */
224735938150SAlex Elder 
22489fcbb800SAlex Elder 			dout("  added dev%s\n", snap ? "" : " at end\n");
224935938150SAlex Elder 			if (snap)
225035938150SAlex Elder 				list_add_tail(&new_snap->node, &snap->node);
225135938150SAlex Elder 			else
2252523f3258SAlex Elder 				list_add_tail(&new_snap->node, head);
225335938150SAlex Elder 		} else {
225435938150SAlex Elder 			/* Already have this one */
225535938150SAlex Elder 
22569fcbb800SAlex Elder 			dout("  already present\n");
22579fcbb800SAlex Elder 
2258aafb230eSAlex Elder 			rbd_assert(snap->size ==
2259aafb230eSAlex Elder 					rbd_dev->header.snap_sizes[index]);
2260aafb230eSAlex Elder 			rbd_assert(!strcmp(snap->name, snap_name));
226135938150SAlex Elder 
226235938150SAlex Elder 			/* Done with this list entry; advance */
226335938150SAlex Elder 
226435938150SAlex Elder 			links = links->next;
2265dfc5606dSYehuda Sadeh 		}
226635938150SAlex Elder 
226735938150SAlex Elder 		/* Advance to the next entry in the snapshot context */
226835938150SAlex Elder 
226935938150SAlex Elder 		index++;
227035938150SAlex Elder 		snap_name += strlen(snap_name) + 1;
2271dfc5606dSYehuda Sadeh 	}
22729fcbb800SAlex Elder 	dout("%s: done\n", __func__);
2273dfc5606dSYehuda Sadeh 
2274dfc5606dSYehuda Sadeh 	return 0;
2275dfc5606dSYehuda Sadeh }
2276dfc5606dSYehuda Sadeh 
2277dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2278dfc5606dSYehuda Sadeh {
2279f0f8cef5SAlex Elder 	int ret;
2280dfc5606dSYehuda Sadeh 	struct device *dev;
2281dfc5606dSYehuda Sadeh 	struct rbd_snap *snap;
2282dfc5606dSYehuda Sadeh 
2283dfc5606dSYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2284dfc5606dSYehuda Sadeh 	dev = &rbd_dev->dev;
2285dfc5606dSYehuda Sadeh 
2286dfc5606dSYehuda Sadeh 	dev->bus = &rbd_bus_type;
2287dfc5606dSYehuda Sadeh 	dev->type = &rbd_device_type;
2288dfc5606dSYehuda Sadeh 	dev->parent = &rbd_root_dev;
2289dfc5606dSYehuda Sadeh 	dev->release = rbd_dev_release;
2290de71a297SAlex Elder 	dev_set_name(dev, "%d", rbd_dev->dev_id);
2291dfc5606dSYehuda Sadeh 	ret = device_register(dev);
2292dfc5606dSYehuda Sadeh 	if (ret < 0)
2293f0f8cef5SAlex Elder 		goto out;
2294dfc5606dSYehuda Sadeh 
2295dfc5606dSYehuda Sadeh 	list_for_each_entry(snap, &rbd_dev->snaps, node) {
229614e7085dSAlex Elder 		ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2297dfc5606dSYehuda Sadeh 		if (ret < 0)
2298602adf40SYehuda Sadeh 			break;
2299602adf40SYehuda Sadeh 	}
2300f0f8cef5SAlex Elder out:
2301dfc5606dSYehuda Sadeh 	mutex_unlock(&ctl_mutex);
2302dfc5606dSYehuda Sadeh 	return ret;
2303602adf40SYehuda Sadeh }
2304602adf40SYehuda Sadeh 
2305dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
2306dfc5606dSYehuda Sadeh {
2307dfc5606dSYehuda Sadeh 	device_unregister(&rbd_dev->dev);
2308dfc5606dSYehuda Sadeh }
2309dfc5606dSYehuda Sadeh 
231059c2be1eSYehuda Sadeh static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
231159c2be1eSYehuda Sadeh {
231259c2be1eSYehuda Sadeh 	int ret, rc;
231359c2be1eSYehuda Sadeh 
231459c2be1eSYehuda Sadeh 	do {
23150e6f322dSAlex Elder 		ret = rbd_req_sync_watch(rbd_dev);
231659c2be1eSYehuda Sadeh 		if (ret == -ERANGE) {
23171fe5e993SAlex Elder 			rc = rbd_refresh_header(rbd_dev, NULL);
231859c2be1eSYehuda Sadeh 			if (rc < 0)
231959c2be1eSYehuda Sadeh 				return rc;
232059c2be1eSYehuda Sadeh 		}
232159c2be1eSYehuda Sadeh 	} while (ret == -ERANGE);
232259c2be1eSYehuda Sadeh 
232359c2be1eSYehuda Sadeh 	return ret;
232459c2be1eSYehuda Sadeh }
232559c2be1eSYehuda Sadeh 
/* Highest device id handed out so far; 0 means none is in use */
static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);

/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list.  The minimum rbd id is 1.
 *
 * The id comes from atomically incrementing rbd_dev_id_max; the list
 * insertion is done under rbd_dev_list_lock.
 */
static void rbd_dev_id_get(struct rbd_device *rbd_dev)
{
	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
}
2342b7f23c36SAlex Elder 
23431ddbe94eSAlex Elder /*
2344499afd5bSAlex Elder  * Remove an rbd_dev from the global list, and record that its
2345499afd5bSAlex Elder  * identifier is no longer in use.
23461ddbe94eSAlex Elder  */
2347e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev)
23481ddbe94eSAlex Elder {
2349d184f6bfSAlex Elder 	struct list_head *tmp;
2350de71a297SAlex Elder 	int rbd_id = rbd_dev->dev_id;
2351d184f6bfSAlex Elder 	int max_id;
2352d184f6bfSAlex Elder 
2353aafb230eSAlex Elder 	rbd_assert(rbd_id > 0);
2354499afd5bSAlex Elder 
2355e2839308SAlex Elder 	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
2356e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
2357499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
2358499afd5bSAlex Elder 	list_del_init(&rbd_dev->node);
2359d184f6bfSAlex Elder 
2360d184f6bfSAlex Elder 	/*
2361d184f6bfSAlex Elder 	 * If the id being "put" is not the current maximum, there
2362d184f6bfSAlex Elder 	 * is nothing special we need to do.
2363d184f6bfSAlex Elder 	 */
2364e2839308SAlex Elder 	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
2365d184f6bfSAlex Elder 		spin_unlock(&rbd_dev_list_lock);
2366d184f6bfSAlex Elder 		return;
2367d184f6bfSAlex Elder 	}
2368d184f6bfSAlex Elder 
2369d184f6bfSAlex Elder 	/*
2370d184f6bfSAlex Elder 	 * We need to update the current maximum id.  Search the
2371d184f6bfSAlex Elder 	 * list to find out what it is.  We're more likely to find
2372d184f6bfSAlex Elder 	 * the maximum at the end, so search the list backward.
2373d184f6bfSAlex Elder 	 */
2374d184f6bfSAlex Elder 	max_id = 0;
2375d184f6bfSAlex Elder 	list_for_each_prev(tmp, &rbd_dev_list) {
2376d184f6bfSAlex Elder 		struct rbd_device *rbd_dev;
2377d184f6bfSAlex Elder 
2378d184f6bfSAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
2379d184f6bfSAlex Elder 		if (rbd_id > max_id)
2380d184f6bfSAlex Elder 			max_id = rbd_id;
2381d184f6bfSAlex Elder 	}
2382499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
23831ddbe94eSAlex Elder 
23841ddbe94eSAlex Elder 	/*
2385e2839308SAlex Elder 	 * The max id could have been updated by rbd_dev_id_get(), in
2386d184f6bfSAlex Elder 	 * which case it now accurately reflects the new maximum.
2387d184f6bfSAlex Elder 	 * Be careful not to overwrite the maximum value in that
2388d184f6bfSAlex Elder 	 * case.
23891ddbe94eSAlex Elder 	 */
2390e2839308SAlex Elder 	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
2391e2839308SAlex Elder 	dout("  max dev id has been reset\n");
2392b7f23c36SAlex Elder }
2393b7f23c36SAlex Elder 
/*
 * Advance *buf past any leading white space so that it points at the
 * first non-space character (or at the terminating '\0' if there is
 * none), and return the length of the token -- the run of non-white
 * space characters -- that starts there.  *buf must be terminated
 * with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the "C"
	 * and "POSIX" locales.
	 */
	static const char whitespace[] = " \f\n\r\t\v";
	const char *start = *buf + strspn(*buf, whitespace);

	*buf = start;			/* Start of token */

	return strcspn(start, whitespace);	/* Token length */
}
2412e28fff26SAlex Elder 
/*
 * Locate the next token in *buf and, when it fits (token length less
 * than token_size), copy it into token and terminate the copy with
 * '\0'.  When it does not fit, token is left untouched.  *buf must
 * be terminated with '\0' on entry.
 *
 * Returns the length of the token found, not counting the '\0':
 * 0 if there was no token, and a value >= token_size if the token
 * was too long to be copied.
 *
 * In every case *buf is left pointing just past the end of the
 * token that was found.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);
	const char *start = *buf;

	*buf = start + len;
	if (len >= token_size)
		return len;		/* Too big; nothing copied */

	memcpy(token, start, len);
	token[len] = '\0';

	return len;
}
2442e28fff26SAlex Elder 
2443e28fff26SAlex Elder /*
2444ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
2445ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
2446ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
2447ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
2448ea3352f4SAlex Elder  *
2449ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
2450ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
2451ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
2452ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
2453ea3352f4SAlex Elder  *
2454ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
2455ea3352f4SAlex Elder  * the end of the found token.
2456ea3352f4SAlex Elder  *
2457ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
2458ea3352f4SAlex Elder  */
2459ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
2460ea3352f4SAlex Elder {
2461ea3352f4SAlex Elder 	char *dup;
2462ea3352f4SAlex Elder 	size_t len;
2463ea3352f4SAlex Elder 
2464ea3352f4SAlex Elder 	len = next_token(buf);
2465ea3352f4SAlex Elder 	dup = kmalloc(len + 1, GFP_KERNEL);
2466ea3352f4SAlex Elder 	if (!dup)
2467ea3352f4SAlex Elder 		return NULL;
2468ea3352f4SAlex Elder 
2469ea3352f4SAlex Elder 	memcpy(dup, *buf, len);
2470ea3352f4SAlex Elder 	*(dup + len) = '\0';
2471ea3352f4SAlex Elder 	*buf += len;
2472ea3352f4SAlex Elder 
2473ea3352f4SAlex Elder 	if (lenp)
2474ea3352f4SAlex Elder 		*lenp = len;
2475ea3352f4SAlex Elder 
2476ea3352f4SAlex Elder 	return dup;
2477ea3352f4SAlex Elder }
2478ea3352f4SAlex Elder 
2479ea3352f4SAlex Elder /*
24800bed54dcSAlex Elder  * This fills in the pool_name, image_name, image_name_len, snap_name,
2481a725f65eSAlex Elder  * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based
2482a725f65eSAlex Elder  * on the list of monitor addresses and other options provided via
2483a725f65eSAlex Elder  * /sys/bus/rbd/add.
2484d22f76e7SAlex Elder  *
2485d22f76e7SAlex Elder  * Note: rbd_dev is assumed to have been initially zero-filled.
2486a725f65eSAlex Elder  */
2487a725f65eSAlex Elder static int rbd_add_parse_args(struct rbd_device *rbd_dev,
2488a725f65eSAlex Elder 			      const char *buf,
24897ef3214aSAlex Elder 			      const char **mon_addrs,
24905214ecc4SAlex Elder 			      size_t *mon_addrs_size,
2491e28fff26SAlex Elder 			      char *options,
2492e28fff26SAlex Elder 			     size_t options_size)
2493a725f65eSAlex Elder {
2494e28fff26SAlex Elder 	size_t len;
2495d22f76e7SAlex Elder 	int ret;
2496e28fff26SAlex Elder 
2497e28fff26SAlex Elder 	/* The first four tokens are required */
2498e28fff26SAlex Elder 
24997ef3214aSAlex Elder 	len = next_token(&buf);
25007ef3214aSAlex Elder 	if (!len)
2501a725f65eSAlex Elder 		return -EINVAL;
25025214ecc4SAlex Elder 	*mon_addrs_size = len + 1;
25037ef3214aSAlex Elder 	*mon_addrs = buf;
25047ef3214aSAlex Elder 
25057ef3214aSAlex Elder 	buf += len;
2506a725f65eSAlex Elder 
2507e28fff26SAlex Elder 	len = copy_token(&buf, options, options_size);
2508e28fff26SAlex Elder 	if (!len || len >= options_size)
2509e28fff26SAlex Elder 		return -EINVAL;
2510a725f65eSAlex Elder 
2511bf3e5ae1SAlex Elder 	ret = -ENOMEM;
2512d22f76e7SAlex Elder 	rbd_dev->pool_name = dup_token(&buf, NULL);
2513d22f76e7SAlex Elder 	if (!rbd_dev->pool_name)
2514d22f76e7SAlex Elder 		goto out_err;
2515e28fff26SAlex Elder 
25160bed54dcSAlex Elder 	rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
25170bed54dcSAlex Elder 	if (!rbd_dev->image_name)
2518bf3e5ae1SAlex Elder 		goto out_err;
2519e28fff26SAlex Elder 
2520cb8627c7SAlex Elder 	/* Create the name of the header object */
2521cb8627c7SAlex Elder 
25220bed54dcSAlex Elder 	rbd_dev->header_name = kmalloc(rbd_dev->image_name_len
2523bf3e5ae1SAlex Elder 						+ sizeof (RBD_SUFFIX),
2524bf3e5ae1SAlex Elder 					GFP_KERNEL);
25250bed54dcSAlex Elder 	if (!rbd_dev->header_name)
2526cb8627c7SAlex Elder 		goto out_err;
25270bed54dcSAlex Elder 	sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);
2528a725f65eSAlex Elder 
2529e28fff26SAlex Elder 	/*
2530820a5f3eSAlex Elder 	 * The snapshot name is optional.  If none is is supplied,
2531820a5f3eSAlex Elder 	 * we use the default value.
2532e28fff26SAlex Elder 	 */
2533f84344f3SAlex Elder 	rbd_dev->mapping.snap_name = dup_token(&buf, &len);
2534f84344f3SAlex Elder 	if (!rbd_dev->mapping.snap_name)
2535820a5f3eSAlex Elder 		goto out_err;
2536820a5f3eSAlex Elder 	if (!len) {
2537820a5f3eSAlex Elder 		/* Replace the empty name with the default */
2538f84344f3SAlex Elder 		kfree(rbd_dev->mapping.snap_name);
2539f84344f3SAlex Elder 		rbd_dev->mapping.snap_name
2540820a5f3eSAlex Elder 			= kmalloc(sizeof (RBD_SNAP_HEAD_NAME), GFP_KERNEL);
2541f84344f3SAlex Elder 		if (!rbd_dev->mapping.snap_name)
2542820a5f3eSAlex Elder 			goto out_err;
2543820a5f3eSAlex Elder 
2544f84344f3SAlex Elder 		memcpy(rbd_dev->mapping.snap_name, RBD_SNAP_HEAD_NAME,
2545e28fff26SAlex Elder 			sizeof (RBD_SNAP_HEAD_NAME));
2546849b4260SAlex Elder 	}
2547e28fff26SAlex Elder 
2548a725f65eSAlex Elder 	return 0;
2549d22f76e7SAlex Elder 
2550d22f76e7SAlex Elder out_err:
25510bed54dcSAlex Elder 	kfree(rbd_dev->header_name);
2552d78fd7aeSAlex Elder 	rbd_dev->header_name = NULL;
25530bed54dcSAlex Elder 	kfree(rbd_dev->image_name);
2554d78fd7aeSAlex Elder 	rbd_dev->image_name = NULL;
2555d78fd7aeSAlex Elder 	rbd_dev->image_name_len = 0;
2556d22f76e7SAlex Elder 	kfree(rbd_dev->pool_name);
2557d22f76e7SAlex Elder 	rbd_dev->pool_name = NULL;
2558d22f76e7SAlex Elder 
2559d22f76e7SAlex Elder 	return ret;
2560a725f65eSAlex Elder }
2561a725f65eSAlex Elder 
256259c2be1eSYehuda Sadeh static ssize_t rbd_add(struct bus_type *bus,
256359c2be1eSYehuda Sadeh 		       const char *buf,
256459c2be1eSYehuda Sadeh 		       size_t count)
2565602adf40SYehuda Sadeh {
2566cb8627c7SAlex Elder 	char *options;
2567cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
25687ef3214aSAlex Elder 	const char *mon_addrs = NULL;
25697ef3214aSAlex Elder 	size_t mon_addrs_size = 0;
257027cc2594SAlex Elder 	struct ceph_osd_client *osdc;
257127cc2594SAlex Elder 	int rc = -ENOMEM;
2572602adf40SYehuda Sadeh 
2573602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
2574602adf40SYehuda Sadeh 		return -ENODEV;
2575602adf40SYehuda Sadeh 
257627cc2594SAlex Elder 	options = kmalloc(count, GFP_KERNEL);
257727cc2594SAlex Elder 	if (!options)
257827cc2594SAlex Elder 		goto err_nomem;
2579cb8627c7SAlex Elder 	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2580cb8627c7SAlex Elder 	if (!rbd_dev)
2581cb8627c7SAlex Elder 		goto err_nomem;
2582602adf40SYehuda Sadeh 
2583602adf40SYehuda Sadeh 	/* static rbd_device initialization */
2584602adf40SYehuda Sadeh 	spin_lock_init(&rbd_dev->lock);
2585602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbd_dev->node);
2586dfc5606dSYehuda Sadeh 	INIT_LIST_HEAD(&rbd_dev->snaps);
2587c666601aSJosh Durgin 	init_rwsem(&rbd_dev->header_rwsem);
2588602adf40SYehuda Sadeh 
2589d184f6bfSAlex Elder 	/* generate unique id: find highest unique id, add one */
2590e2839308SAlex Elder 	rbd_dev_id_get(rbd_dev);
2591602adf40SYehuda Sadeh 
2592a725f65eSAlex Elder 	/* Fill in the device name, now that we have its id. */
259381a89793SAlex Elder 	BUILD_BUG_ON(DEV_NAME_LEN
259481a89793SAlex Elder 			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
2595de71a297SAlex Elder 	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
2596e124a82fSAlex Elder 
2597a725f65eSAlex Elder 	/* parse add command */
25987ef3214aSAlex Elder 	rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size,
2599e28fff26SAlex Elder 				options, count);
2600a725f65eSAlex Elder 	if (rc)
2601a725f65eSAlex Elder 		goto err_put_id;
2602a725f65eSAlex Elder 
2603f8c38929SAlex Elder 	rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options);
2604f8c38929SAlex Elder 	if (rc < 0)
2605f0f8cef5SAlex Elder 		goto err_put_id;
2606602adf40SYehuda Sadeh 
2607602adf40SYehuda Sadeh 	/* pick the pool */
26081dbb4399SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
2609602adf40SYehuda Sadeh 	rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
2610602adf40SYehuda Sadeh 	if (rc < 0)
2611602adf40SYehuda Sadeh 		goto err_out_client;
26129bb2f334SAlex Elder 	rbd_dev->pool_id = rc;
2613602adf40SYehuda Sadeh 
2614602adf40SYehuda Sadeh 	/* register our block device */
261527cc2594SAlex Elder 	rc = register_blkdev(0, rbd_dev->name);
261627cc2594SAlex Elder 	if (rc < 0)
2617602adf40SYehuda Sadeh 		goto err_out_client;
261827cc2594SAlex Elder 	rbd_dev->major = rc;
2619602adf40SYehuda Sadeh 
2620dfc5606dSYehuda Sadeh 	rc = rbd_bus_add_dev(rbd_dev);
2621dfc5606dSYehuda Sadeh 	if (rc)
2622766fc439SYehuda Sadeh 		goto err_out_blkdev;
2623766fc439SYehuda Sadeh 
262432eec68dSAlex Elder 	/*
262532eec68dSAlex Elder 	 * At this point cleanup in the event of an error is the job
262632eec68dSAlex Elder 	 * of the sysfs code (initiated by rbd_bus_del_dev()).
262732eec68dSAlex Elder 	 *
262832eec68dSAlex Elder 	 * Set up and announce blkdev mapping.
262932eec68dSAlex Elder 	 */
2630602adf40SYehuda Sadeh 	rc = rbd_init_disk(rbd_dev);
2631602adf40SYehuda Sadeh 	if (rc)
2632766fc439SYehuda Sadeh 		goto err_out_bus;
2633602adf40SYehuda Sadeh 
263459c2be1eSYehuda Sadeh 	rc = rbd_init_watch_dev(rbd_dev);
263559c2be1eSYehuda Sadeh 	if (rc)
263659c2be1eSYehuda Sadeh 		goto err_out_bus;
263759c2be1eSYehuda Sadeh 
2638602adf40SYehuda Sadeh 	return count;
2639602adf40SYehuda Sadeh 
2640766fc439SYehuda Sadeh err_out_bus:
2641766fc439SYehuda Sadeh 	/* this will also clean up rest of rbd_dev stuff */
2642766fc439SYehuda Sadeh 
2643766fc439SYehuda Sadeh 	rbd_bus_del_dev(rbd_dev);
2644766fc439SYehuda Sadeh 	kfree(options);
2645766fc439SYehuda Sadeh 	return rc;
2646766fc439SYehuda Sadeh 
2647602adf40SYehuda Sadeh err_out_blkdev:
2648602adf40SYehuda Sadeh 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
2649602adf40SYehuda Sadeh err_out_client:
2650602adf40SYehuda Sadeh 	rbd_put_client(rbd_dev);
2651f0f8cef5SAlex Elder err_put_id:
2652cb8627c7SAlex Elder 	if (rbd_dev->pool_name) {
2653f84344f3SAlex Elder 		kfree(rbd_dev->mapping.snap_name);
26540bed54dcSAlex Elder 		kfree(rbd_dev->header_name);
26550bed54dcSAlex Elder 		kfree(rbd_dev->image_name);
2656d22f76e7SAlex Elder 		kfree(rbd_dev->pool_name);
2657cb8627c7SAlex Elder 	}
2658e2839308SAlex Elder 	rbd_dev_id_put(rbd_dev);
265927cc2594SAlex Elder err_nomem:
266027cc2594SAlex Elder 	kfree(rbd_dev);
2661cb8627c7SAlex Elder 	kfree(options);
266227cc2594SAlex Elder 
2663602adf40SYehuda Sadeh 	dout("Error adding device %s\n", buf);
2664602adf40SYehuda Sadeh 	module_put(THIS_MODULE);
266527cc2594SAlex Elder 
266627cc2594SAlex Elder 	return (ssize_t) rc;
2667602adf40SYehuda Sadeh }
2668602adf40SYehuda Sadeh 
2669de71a297SAlex Elder static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
2670602adf40SYehuda Sadeh {
2671602adf40SYehuda Sadeh 	struct list_head *tmp;
2672602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev;
2673602adf40SYehuda Sadeh 
2674e124a82fSAlex Elder 	spin_lock(&rbd_dev_list_lock);
2675602adf40SYehuda Sadeh 	list_for_each(tmp, &rbd_dev_list) {
2676602adf40SYehuda Sadeh 		rbd_dev = list_entry(tmp, struct rbd_device, node);
2677de71a297SAlex Elder 		if (rbd_dev->dev_id == dev_id) {
2678e124a82fSAlex Elder 			spin_unlock(&rbd_dev_list_lock);
2679602adf40SYehuda Sadeh 			return rbd_dev;
2680602adf40SYehuda Sadeh 		}
2681e124a82fSAlex Elder 	}
2682e124a82fSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
2683602adf40SYehuda Sadeh 	return NULL;
2684602adf40SYehuda Sadeh }
2685602adf40SYehuda Sadeh 
2686dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev)
2687602adf40SYehuda Sadeh {
2688593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2689602adf40SYehuda Sadeh 
26901dbb4399SAlex Elder 	if (rbd_dev->watch_request) {
26911dbb4399SAlex Elder 		struct ceph_client *client = rbd_dev->rbd_client->client;
26921dbb4399SAlex Elder 
26931dbb4399SAlex Elder 		ceph_osdc_unregister_linger_request(&client->osdc,
269459c2be1eSYehuda Sadeh 						    rbd_dev->watch_request);
26951dbb4399SAlex Elder 	}
269659c2be1eSYehuda Sadeh 	if (rbd_dev->watch_event)
2697070c633fSAlex Elder 		rbd_req_sync_unwatch(rbd_dev);
269859c2be1eSYehuda Sadeh 
2699602adf40SYehuda Sadeh 	rbd_put_client(rbd_dev);
2700602adf40SYehuda Sadeh 
2701602adf40SYehuda Sadeh 	/* clean up and free blkdev */
2702602adf40SYehuda Sadeh 	rbd_free_disk(rbd_dev);
2703602adf40SYehuda Sadeh 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
270432eec68dSAlex Elder 
270532eec68dSAlex Elder 	/* done with the id, and with the rbd_dev */
2706f84344f3SAlex Elder 	kfree(rbd_dev->mapping.snap_name);
27070bed54dcSAlex Elder 	kfree(rbd_dev->header_name);
2708d22f76e7SAlex Elder 	kfree(rbd_dev->pool_name);
27090bed54dcSAlex Elder 	kfree(rbd_dev->image_name);
2710e2839308SAlex Elder 	rbd_dev_id_put(rbd_dev);
2711602adf40SYehuda Sadeh 	kfree(rbd_dev);
2712602adf40SYehuda Sadeh 
2713602adf40SYehuda Sadeh 	/* release module ref */
2714602adf40SYehuda Sadeh 	module_put(THIS_MODULE);
2715602adf40SYehuda Sadeh }
2716602adf40SYehuda Sadeh 
2717dfc5606dSYehuda Sadeh static ssize_t rbd_remove(struct bus_type *bus,
2718602adf40SYehuda Sadeh 			  const char *buf,
2719602adf40SYehuda Sadeh 			  size_t count)
2720602adf40SYehuda Sadeh {
2721602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
2722602adf40SYehuda Sadeh 	int target_id, rc;
2723602adf40SYehuda Sadeh 	unsigned long ul;
2724602adf40SYehuda Sadeh 	int ret = count;
2725602adf40SYehuda Sadeh 
2726602adf40SYehuda Sadeh 	rc = strict_strtoul(buf, 10, &ul);
2727602adf40SYehuda Sadeh 	if (rc)
2728602adf40SYehuda Sadeh 		return rc;
2729602adf40SYehuda Sadeh 
2730602adf40SYehuda Sadeh 	/* convert to int; abort if we lost anything in the conversion */
2731602adf40SYehuda Sadeh 	target_id = (int) ul;
2732602adf40SYehuda Sadeh 	if (target_id != ul)
2733602adf40SYehuda Sadeh 		return -EINVAL;
2734602adf40SYehuda Sadeh 
2735602adf40SYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2736602adf40SYehuda Sadeh 
2737602adf40SYehuda Sadeh 	rbd_dev = __rbd_get_dev(target_id);
2738602adf40SYehuda Sadeh 	if (!rbd_dev) {
2739602adf40SYehuda Sadeh 		ret = -ENOENT;
2740602adf40SYehuda Sadeh 		goto done;
2741602adf40SYehuda Sadeh 	}
2742602adf40SYehuda Sadeh 
2743dfc5606dSYehuda Sadeh 	__rbd_remove_all_snaps(rbd_dev);
2744dfc5606dSYehuda Sadeh 	rbd_bus_del_dev(rbd_dev);
2745602adf40SYehuda Sadeh 
2746602adf40SYehuda Sadeh done:
2747602adf40SYehuda Sadeh 	mutex_unlock(&ctl_mutex);
2748aafb230eSAlex Elder 
2749602adf40SYehuda Sadeh 	return ret;
2750602adf40SYehuda Sadeh }
2751602adf40SYehuda Sadeh 
2752dfc5606dSYehuda Sadeh static ssize_t rbd_snap_add(struct device *dev,
2753dfc5606dSYehuda Sadeh 			    struct device_attribute *attr,
2754602adf40SYehuda Sadeh 			    const char *buf,
2755602adf40SYehuda Sadeh 			    size_t count)
2756602adf40SYehuda Sadeh {
2757593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2758dfc5606dSYehuda Sadeh 	int ret;
2759dfc5606dSYehuda Sadeh 	char *name = kmalloc(count + 1, GFP_KERNEL);
2760602adf40SYehuda Sadeh 	if (!name)
2761602adf40SYehuda Sadeh 		return -ENOMEM;
2762602adf40SYehuda Sadeh 
2763dfc5606dSYehuda Sadeh 	snprintf(name, count, "%s", buf);
2764602adf40SYehuda Sadeh 
2765602adf40SYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2766602adf40SYehuda Sadeh 
2767602adf40SYehuda Sadeh 	ret = rbd_header_add_snap(rbd_dev,
2768602adf40SYehuda Sadeh 				  name, GFP_KERNEL);
2769602adf40SYehuda Sadeh 	if (ret < 0)
277059c2be1eSYehuda Sadeh 		goto err_unlock;
2771602adf40SYehuda Sadeh 
2772b813623aSAlex Elder 	ret = __rbd_refresh_header(rbd_dev, NULL);
2773602adf40SYehuda Sadeh 	if (ret < 0)
277459c2be1eSYehuda Sadeh 		goto err_unlock;
277559c2be1eSYehuda Sadeh 
277659c2be1eSYehuda Sadeh 	/* shouldn't hold ctl_mutex when notifying.. notify might
277759c2be1eSYehuda Sadeh 	   trigger a watch callback that would need to get that mutex */
277859c2be1eSYehuda Sadeh 	mutex_unlock(&ctl_mutex);
277959c2be1eSYehuda Sadeh 
278059c2be1eSYehuda Sadeh 	/* make a best effort, don't error if failed */
27814cb16250SAlex Elder 	rbd_req_sync_notify(rbd_dev);
2782602adf40SYehuda Sadeh 
2783602adf40SYehuda Sadeh 	ret = count;
278459c2be1eSYehuda Sadeh 	kfree(name);
278559c2be1eSYehuda Sadeh 	return ret;
278659c2be1eSYehuda Sadeh 
278759c2be1eSYehuda Sadeh err_unlock:
2788602adf40SYehuda Sadeh 	mutex_unlock(&ctl_mutex);
2789602adf40SYehuda Sadeh 	kfree(name);
2790602adf40SYehuda Sadeh 	return ret;
2791602adf40SYehuda Sadeh }
2792602adf40SYehuda Sadeh 
2793602adf40SYehuda Sadeh /*
2794602adf40SYehuda Sadeh  * create control files in sysfs
2795dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
2796602adf40SYehuda Sadeh  */
2797602adf40SYehuda Sadeh static int rbd_sysfs_init(void)
2798602adf40SYehuda Sadeh {
2799dfc5606dSYehuda Sadeh 	int ret;
2800602adf40SYehuda Sadeh 
2801fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
2802dfc5606dSYehuda Sadeh 	if (ret < 0)
2803dfc5606dSYehuda Sadeh 		return ret;
2804602adf40SYehuda Sadeh 
2805fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
2806fed4c143SAlex Elder 	if (ret < 0)
2807fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
2808602adf40SYehuda Sadeh 
2809602adf40SYehuda Sadeh 	return ret;
2810602adf40SYehuda Sadeh }
2811602adf40SYehuda Sadeh 
2812602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void)
2813602adf40SYehuda Sadeh {
2814dfc5606dSYehuda Sadeh 	bus_unregister(&rbd_bus_type);
2815fed4c143SAlex Elder 	device_unregister(&rbd_root_dev);
2816602adf40SYehuda Sadeh }
2817602adf40SYehuda Sadeh 
2818602adf40SYehuda Sadeh int __init rbd_init(void)
2819602adf40SYehuda Sadeh {
2820602adf40SYehuda Sadeh 	int rc;
2821602adf40SYehuda Sadeh 
2822602adf40SYehuda Sadeh 	rc = rbd_sysfs_init();
2823602adf40SYehuda Sadeh 	if (rc)
2824602adf40SYehuda Sadeh 		return rc;
2825f0f8cef5SAlex Elder 	pr_info("loaded " RBD_DRV_NAME_LONG "\n");
2826602adf40SYehuda Sadeh 	return 0;
2827602adf40SYehuda Sadeh }
2828602adf40SYehuda Sadeh 
2829602adf40SYehuda Sadeh void __exit rbd_exit(void)
2830602adf40SYehuda Sadeh {
2831602adf40SYehuda Sadeh 	rbd_sysfs_cleanup();
2832602adf40SYehuda Sadeh }
2833602adf40SYehuda Sadeh 
2834602adf40SYehuda Sadeh module_init(rbd_init);
2835602adf40SYehuda Sadeh module_exit(rbd_exit);
2836602adf40SYehuda Sadeh 
2837602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
2838602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
2839602adf40SYehuda Sadeh MODULE_DESCRIPTION("rados block device");
2840602adf40SYehuda Sadeh 
2841602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */
2842602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
2843602adf40SYehuda Sadeh 
2844602adf40SYehuda Sadeh MODULE_LICENSE("GPL");
2845