xref: /openbmc/linux/drivers/block/rbd.c (revision 9fcbb800)
1602adf40SYehuda Sadeh /*
2602adf40SYehuda Sadeh    rbd.c -- Export ceph rados objects as a Linux block device
3602adf40SYehuda Sadeh 
4602adf40SYehuda Sadeh 
5602adf40SYehuda Sadeh    based on drivers/block/osdblk.c:
6602adf40SYehuda Sadeh 
7602adf40SYehuda Sadeh    Copyright 2009 Red Hat, Inc.
8602adf40SYehuda Sadeh 
9602adf40SYehuda Sadeh    This program is free software; you can redistribute it and/or modify
10602adf40SYehuda Sadeh    it under the terms of the GNU General Public License as published by
11602adf40SYehuda Sadeh    the Free Software Foundation.
12602adf40SYehuda Sadeh 
13602adf40SYehuda Sadeh    This program is distributed in the hope that it will be useful,
14602adf40SYehuda Sadeh    but WITHOUT ANY WARRANTY; without even the implied warranty of
15602adf40SYehuda Sadeh    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16602adf40SYehuda Sadeh    GNU General Public License for more details.
17602adf40SYehuda Sadeh 
18602adf40SYehuda Sadeh    You should have received a copy of the GNU General Public License
19602adf40SYehuda Sadeh    along with this program; see the file COPYING.  If not, write to
20602adf40SYehuda Sadeh    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21602adf40SYehuda Sadeh 
22602adf40SYehuda Sadeh 
23602adf40SYehuda Sadeh 
24dfc5606dSYehuda Sadeh    For usage instructions, please refer to:
25602adf40SYehuda Sadeh 
26dfc5606dSYehuda Sadeh                  Documentation/ABI/testing/sysfs-bus-rbd
27602adf40SYehuda Sadeh 
28602adf40SYehuda Sadeh  */
29602adf40SYehuda Sadeh 
30602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
31602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
32602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3459c2be1eSYehuda Sadeh #include <linux/parser.h>
35602adf40SYehuda Sadeh 
36602adf40SYehuda Sadeh #include <linux/kernel.h>
37602adf40SYehuda Sadeh #include <linux/device.h>
38602adf40SYehuda Sadeh #include <linux/module.h>
39602adf40SYehuda Sadeh #include <linux/fs.h>
40602adf40SYehuda Sadeh #include <linux/blkdev.h>
41602adf40SYehuda Sadeh 
42602adf40SYehuda Sadeh #include "rbd_types.h"
43602adf40SYehuda Sadeh 
#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * Block I/O is counted in sectors.  Several Linux layers (blk, bio,
 * genhd) interpret the term, but the default sector is always 512
 * bytes.  Named symbols read better than the raw numbers.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/* Largest u64 value; a candidate for a shared header some day */

#define	U64_MAX	((u64) (~0ULL))

/* Short and long driver names */
#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

#define RBD_MAX_SNAP_NAME_LEN	32
#define RBD_MAX_OPT_LEN		1024

/* Snapshot name that selects the image head rather than a snapshot */
#define RBD_SNAP_HEAD_NAME	"-"

/*
 * RBD device names have the form "rbd#": RBD_DRV_NAME followed by
 * a unique integer identifier.  MAX_INT_FORMAT_WIDTH is used in
 * ensuring DEV_NAME_LEN is large enough for every possible name.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)

/* Devices are mapped read-write unless an option says otherwise */
#define RBD_READ_ONLY_DEFAULT		false
7959c2be1eSYehuda Sadeh 
80602adf40SYehuda Sadeh /*
81602adf40SYehuda Sadeh  * block device image metadata (in-memory version)
82602adf40SYehuda Sadeh  */
83602adf40SYehuda Sadeh struct rbd_image_header {
84602adf40SYehuda Sadeh 	u64 image_size;
85849b4260SAlex Elder 	char *object_prefix;
86602adf40SYehuda Sadeh 	__u8 obj_order;
87602adf40SYehuda Sadeh 	__u8 crypt_type;
88602adf40SYehuda Sadeh 	__u8 comp_type;
89602adf40SYehuda Sadeh 	struct ceph_snap_context *snapc;
90602adf40SYehuda Sadeh 	u32 total_snaps;
91602adf40SYehuda Sadeh 
92602adf40SYehuda Sadeh 	char *snap_names;
93602adf40SYehuda Sadeh 	u64 *snap_sizes;
9459c2be1eSYehuda Sadeh 
9559c2be1eSYehuda Sadeh 	u64 obj_version;
9659c2be1eSYehuda Sadeh };
9759c2be1eSYehuda Sadeh 
9859c2be1eSYehuda Sadeh struct rbd_options {
99cc0538b6SAlex Elder 	bool	read_only;
100602adf40SYehuda Sadeh };
101602adf40SYehuda Sadeh 
102602adf40SYehuda Sadeh /*
103f0f8cef5SAlex Elder  * an instance of the client.  multiple devices may share an rbd client.
104602adf40SYehuda Sadeh  */
105602adf40SYehuda Sadeh struct rbd_client {
106602adf40SYehuda Sadeh 	struct ceph_client	*client;
107602adf40SYehuda Sadeh 	struct kref		kref;
108602adf40SYehuda Sadeh 	struct list_head	node;
109602adf40SYehuda Sadeh };
110602adf40SYehuda Sadeh 
111602adf40SYehuda Sadeh /*
112f0f8cef5SAlex Elder  * a request completion status
113602adf40SYehuda Sadeh  */
1141fec7093SYehuda Sadeh struct rbd_req_status {
1151fec7093SYehuda Sadeh 	int done;
1161fec7093SYehuda Sadeh 	int rc;
1171fec7093SYehuda Sadeh 	u64 bytes;
1181fec7093SYehuda Sadeh };
1191fec7093SYehuda Sadeh 
1201fec7093SYehuda Sadeh /*
1211fec7093SYehuda Sadeh  * a collection of requests
1221fec7093SYehuda Sadeh  */
1231fec7093SYehuda Sadeh struct rbd_req_coll {
1241fec7093SYehuda Sadeh 	int			total;
1251fec7093SYehuda Sadeh 	int			num_done;
1261fec7093SYehuda Sadeh 	struct kref		kref;
1271fec7093SYehuda Sadeh 	struct rbd_req_status	status[0];
128602adf40SYehuda Sadeh };
129602adf40SYehuda Sadeh 
130f0f8cef5SAlex Elder /*
131f0f8cef5SAlex Elder  * a single io request
132f0f8cef5SAlex Elder  */
133f0f8cef5SAlex Elder struct rbd_request {
134f0f8cef5SAlex Elder 	struct request		*rq;		/* blk layer request */
135f0f8cef5SAlex Elder 	struct bio		*bio;		/* cloned bio */
136f0f8cef5SAlex Elder 	struct page		**pages;	/* list of used pages */
137f0f8cef5SAlex Elder 	u64			len;
138f0f8cef5SAlex Elder 	int			coll_index;
139f0f8cef5SAlex Elder 	struct rbd_req_coll	*coll;
140f0f8cef5SAlex Elder };
141f0f8cef5SAlex Elder 
142dfc5606dSYehuda Sadeh struct rbd_snap {
143dfc5606dSYehuda Sadeh 	struct	device		dev;
144dfc5606dSYehuda Sadeh 	const char		*name;
1453591538fSJosh Durgin 	u64			size;
146dfc5606dSYehuda Sadeh 	struct list_head	node;
147dfc5606dSYehuda Sadeh 	u64			id;
148dfc5606dSYehuda Sadeh };
149dfc5606dSYehuda Sadeh 
150602adf40SYehuda Sadeh /*
151602adf40SYehuda Sadeh  * a single device
152602adf40SYehuda Sadeh  */
153602adf40SYehuda Sadeh struct rbd_device {
154de71a297SAlex Elder 	int			dev_id;		/* blkdev unique id */
155602adf40SYehuda Sadeh 
156602adf40SYehuda Sadeh 	int			major;		/* blkdev assigned major */
157602adf40SYehuda Sadeh 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
158602adf40SYehuda Sadeh 	struct request_queue	*q;
159602adf40SYehuda Sadeh 
160f8c38929SAlex Elder 	struct rbd_options	rbd_opts;
161602adf40SYehuda Sadeh 	struct rbd_client	*rbd_client;
162602adf40SYehuda Sadeh 
163602adf40SYehuda Sadeh 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
164602adf40SYehuda Sadeh 
165602adf40SYehuda Sadeh 	spinlock_t		lock;		/* queue lock */
166602adf40SYehuda Sadeh 
167602adf40SYehuda Sadeh 	struct rbd_image_header	header;
1680bed54dcSAlex Elder 	char			*image_name;
1690bed54dcSAlex Elder 	size_t			image_name_len;
1700bed54dcSAlex Elder 	char			*header_name;
171d22f76e7SAlex Elder 	char			*pool_name;
1729bb2f334SAlex Elder 	int			pool_id;
173602adf40SYehuda Sadeh 
17459c2be1eSYehuda Sadeh 	struct ceph_osd_event   *watch_event;
17559c2be1eSYehuda Sadeh 	struct ceph_osd_request *watch_request;
17659c2be1eSYehuda Sadeh 
177c666601aSJosh Durgin 	/* protects updating the header */
178c666601aSJosh Durgin 	struct rw_semaphore     header_rwsem;
179e88a36ecSJosh Durgin 	/* name of the snapshot this device reads from */
180820a5f3eSAlex Elder 	char                    *snap_name;
181e88a36ecSJosh Durgin 	/* id of the snapshot this device reads from */
18277dfe99fSJosh Durgin 	u64                     snap_id;	/* current snapshot id */
183e88a36ecSJosh Durgin 	/* whether the snap_id this device reads from still exists */
184e88a36ecSJosh Durgin 	bool                    snap_exists;
185cc0538b6SAlex Elder 	bool			read_only;
186602adf40SYehuda Sadeh 
187602adf40SYehuda Sadeh 	struct list_head	node;
188dfc5606dSYehuda Sadeh 
189dfc5606dSYehuda Sadeh 	/* list of snapshots */
190dfc5606dSYehuda Sadeh 	struct list_head	snaps;
191dfc5606dSYehuda Sadeh 
192dfc5606dSYehuda Sadeh 	/* sysfs related */
193dfc5606dSYehuda Sadeh 	struct device		dev;
194dfc5606dSYehuda Sadeh };
195dfc5606dSYehuda Sadeh 
/* Serialize open/close/setup/teardown */
static DEFINE_MUTEX(ctl_mutex);

/* All rbd devices, and the lock for that list */
static LIST_HEAD(rbd_dev_list);
static DEFINE_SPINLOCK(rbd_dev_list_lock);

/* All rbd clients, and the lock guarding that list */
static LIST_HEAD(rbd_client_list);
static DEFINE_SPINLOCK(rbd_client_list_lock);
203602adf40SYehuda Sadeh 
2049fcbb800SAlex Elder static int rbd_dev_snap_devs_update(struct rbd_device *rbd_dev);
205dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev);
206dfc5606dSYehuda Sadeh static ssize_t rbd_snap_add(struct device *dev,
207dfc5606dSYehuda Sadeh 			    struct device_attribute *attr,
208dfc5606dSYehuda Sadeh 			    const char *buf,
209dfc5606dSYehuda Sadeh 			    size_t count);
21014e7085dSAlex Elder static void __rbd_remove_snap_dev(struct rbd_snap *snap);
211dfc5606dSYehuda Sadeh 
212f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf,
213f0f8cef5SAlex Elder 		       size_t count);
214f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
215f0f8cef5SAlex Elder 			  size_t count);
216f0f8cef5SAlex Elder 
217f0f8cef5SAlex Elder static struct bus_attribute rbd_bus_attrs[] = {
218f0f8cef5SAlex Elder 	__ATTR(add, S_IWUSR, NULL, rbd_add),
219f0f8cef5SAlex Elder 	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
220f0f8cef5SAlex Elder 	__ATTR_NULL
221f0f8cef5SAlex Elder };
222f0f8cef5SAlex Elder 
223f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = {
224f0f8cef5SAlex Elder 	.name		= "rbd",
225f0f8cef5SAlex Elder 	.bus_attrs	= rbd_bus_attrs,
226f0f8cef5SAlex Elder };
227f0f8cef5SAlex Elder 
/* Release method for rbd_root_dev; intentionally does nothing */
static void rbd_root_dev_release(struct device *dev)
{
}
231f0f8cef5SAlex Elder 
232f0f8cef5SAlex Elder static struct device rbd_root_dev = {
233f0f8cef5SAlex Elder 	.init_name =    "rbd",
234f0f8cef5SAlex Elder 	.release =      rbd_root_dev_release,
235f0f8cef5SAlex Elder };
236f0f8cef5SAlex Elder 
#ifdef RBD_DEBUG
/*
 * rbd_assert(expr) - when RBD_DEBUG is defined, report a failed
 * assertion (function, line and expression text) and call BUG().
 *
 * The body is wrapped in do { } while (0) so the macro expands to
 * exactly one statement.  That keeps it safe inside an unbraced
 * if/else, where a bare "if (...) { ... }" expansion followed by
 * the caller's semicolon would detach the else branch.
 */
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
/* Without RBD_DEBUG the assertion compiles away; expr is not evaluated */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
249dfc5606dSYehuda Sadeh 
250dfc5606dSYehuda Sadeh static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
251dfc5606dSYehuda Sadeh {
252dfc5606dSYehuda Sadeh 	return get_device(&rbd_dev->dev);
253dfc5606dSYehuda Sadeh }
254dfc5606dSYehuda Sadeh 
255dfc5606dSYehuda Sadeh static void rbd_put_dev(struct rbd_device *rbd_dev)
256dfc5606dSYehuda Sadeh {
257dfc5606dSYehuda Sadeh 	put_device(&rbd_dev->dev);
258dfc5606dSYehuda Sadeh }
259602adf40SYehuda Sadeh 
2601fe5e993SAlex Elder static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver);
26159c2be1eSYehuda Sadeh 
262602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
263602adf40SYehuda Sadeh {
264f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
265602adf40SYehuda Sadeh 
266602adf40SYehuda Sadeh 	if ((mode & FMODE_WRITE) && rbd_dev->read_only)
267602adf40SYehuda Sadeh 		return -EROFS;
268602adf40SYehuda Sadeh 
269340c7a2bSAlex Elder 	rbd_get_dev(rbd_dev);
270340c7a2bSAlex Elder 	set_device_ro(bdev, rbd_dev->read_only);
271340c7a2bSAlex Elder 
272602adf40SYehuda Sadeh 	return 0;
273602adf40SYehuda Sadeh }
274602adf40SYehuda Sadeh 
275dfc5606dSYehuda Sadeh static int rbd_release(struct gendisk *disk, fmode_t mode)
276dfc5606dSYehuda Sadeh {
277dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
278dfc5606dSYehuda Sadeh 
279dfc5606dSYehuda Sadeh 	rbd_put_dev(rbd_dev);
280dfc5606dSYehuda Sadeh 
281dfc5606dSYehuda Sadeh 	return 0;
282dfc5606dSYehuda Sadeh }
283dfc5606dSYehuda Sadeh 
284602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = {
285602adf40SYehuda Sadeh 	.owner			= THIS_MODULE,
286602adf40SYehuda Sadeh 	.open			= rbd_open,
287dfc5606dSYehuda Sadeh 	.release		= rbd_release,
288602adf40SYehuda Sadeh };
289602adf40SYehuda Sadeh 
290602adf40SYehuda Sadeh /*
291602adf40SYehuda Sadeh  * Initialize an rbd client instance.
29243ae4701SAlex Elder  * We own *ceph_opts.
293602adf40SYehuda Sadeh  */
294f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
295602adf40SYehuda Sadeh {
296602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
297602adf40SYehuda Sadeh 	int ret = -ENOMEM;
298602adf40SYehuda Sadeh 
299602adf40SYehuda Sadeh 	dout("rbd_client_create\n");
300602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
301602adf40SYehuda Sadeh 	if (!rbdc)
302602adf40SYehuda Sadeh 		goto out_opt;
303602adf40SYehuda Sadeh 
304602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
305602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
306602adf40SYehuda Sadeh 
307bc534d86SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
308bc534d86SAlex Elder 
30943ae4701SAlex Elder 	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
310602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
311bc534d86SAlex Elder 		goto out_mutex;
31243ae4701SAlex Elder 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
313602adf40SYehuda Sadeh 
314602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
315602adf40SYehuda Sadeh 	if (ret < 0)
316602adf40SYehuda Sadeh 		goto out_err;
317602adf40SYehuda Sadeh 
318432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
319602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
320432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
321602adf40SYehuda Sadeh 
322bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
323bc534d86SAlex Elder 
324602adf40SYehuda Sadeh 	dout("rbd_client_create created %p\n", rbdc);
325602adf40SYehuda Sadeh 	return rbdc;
326602adf40SYehuda Sadeh 
327602adf40SYehuda Sadeh out_err:
328602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
329bc534d86SAlex Elder out_mutex:
330bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
331602adf40SYehuda Sadeh 	kfree(rbdc);
332602adf40SYehuda Sadeh out_opt:
33343ae4701SAlex Elder 	if (ceph_opts)
33443ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
33528f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
336602adf40SYehuda Sadeh }
337602adf40SYehuda Sadeh 
338602adf40SYehuda Sadeh /*
3391f7ba331SAlex Elder  * Find a ceph client with specific addr and configuration.  If
3401f7ba331SAlex Elder  * found, bump its reference count.
341602adf40SYehuda Sadeh  */
3421f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
343602adf40SYehuda Sadeh {
344602adf40SYehuda Sadeh 	struct rbd_client *client_node;
3451f7ba331SAlex Elder 	bool found = false;
346602adf40SYehuda Sadeh 
34743ae4701SAlex Elder 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
348602adf40SYehuda Sadeh 		return NULL;
349602adf40SYehuda Sadeh 
3501f7ba331SAlex Elder 	spin_lock(&rbd_client_list_lock);
3511f7ba331SAlex Elder 	list_for_each_entry(client_node, &rbd_client_list, node) {
3521f7ba331SAlex Elder 		if (!ceph_compare_options(ceph_opts, client_node->client)) {
3531f7ba331SAlex Elder 			kref_get(&client_node->kref);
3541f7ba331SAlex Elder 			found = true;
3551f7ba331SAlex Elder 			break;
3561f7ba331SAlex Elder 		}
3571f7ba331SAlex Elder 	}
3581f7ba331SAlex Elder 	spin_unlock(&rbd_client_list_lock);
3591f7ba331SAlex Elder 
3601f7ba331SAlex Elder 	return found ? client_node : NULL;
361602adf40SYehuda Sadeh }
362602adf40SYehuda Sadeh 
/*
 * Mount option tokens.  The Opt_last_* entries are range markers:
 * parse_rbd_opts_token() compares a token against them to decide
 * whether it takes an int, a string, or is a Boolean.
 */
enum {
	/* int args go above this marker */
	Opt_last_int,
	/* string args go above this marker */
	Opt_last_string,
	Opt_read_only,
	Opt_read_write,
	/* Boolean args go above this marker */
	Opt_last_bool,
};
37659c2be1eSYehuda Sadeh 
37743ae4701SAlex Elder static match_table_t rbd_opts_tokens = {
37859c2be1eSYehuda Sadeh 	/* int args above */
37959c2be1eSYehuda Sadeh 	/* string args above */
380cc0538b6SAlex Elder 	{Opt_read_only, "read_only"},
381cc0538b6SAlex Elder 	{Opt_read_only, "ro"},		/* Alternate spelling */
382cc0538b6SAlex Elder 	{Opt_read_write, "read_write"},
383cc0538b6SAlex Elder 	{Opt_read_write, "rw"},		/* Alternate spelling */
384cc0538b6SAlex Elder 	/* Boolean args above */
38559c2be1eSYehuda Sadeh 	{-1, NULL}
38659c2be1eSYehuda Sadeh };
38759c2be1eSYehuda Sadeh 
38859c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
38959c2be1eSYehuda Sadeh {
39043ae4701SAlex Elder 	struct rbd_options *rbd_opts = private;
39159c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
39259c2be1eSYehuda Sadeh 	int token, intval, ret;
39359c2be1eSYehuda Sadeh 
39443ae4701SAlex Elder 	token = match_token(c, rbd_opts_tokens, argstr);
39559c2be1eSYehuda Sadeh 	if (token < 0)
39659c2be1eSYehuda Sadeh 		return -EINVAL;
39759c2be1eSYehuda Sadeh 
39859c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
39959c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
40059c2be1eSYehuda Sadeh 		if (ret < 0) {
40159c2be1eSYehuda Sadeh 			pr_err("bad mount option arg (not int) "
40259c2be1eSYehuda Sadeh 			       "at '%s'\n", c);
40359c2be1eSYehuda Sadeh 			return ret;
40459c2be1eSYehuda Sadeh 		}
40559c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
40659c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
40759c2be1eSYehuda Sadeh 		dout("got string token %d val %s\n", token,
40859c2be1eSYehuda Sadeh 		     argstr[0].from);
409cc0538b6SAlex Elder 	} else if (token > Opt_last_string && token < Opt_last_bool) {
410cc0538b6SAlex Elder 		dout("got Boolean token %d\n", token);
41159c2be1eSYehuda Sadeh 	} else {
41259c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
41359c2be1eSYehuda Sadeh 	}
41459c2be1eSYehuda Sadeh 
41559c2be1eSYehuda Sadeh 	switch (token) {
416cc0538b6SAlex Elder 	case Opt_read_only:
417cc0538b6SAlex Elder 		rbd_opts->read_only = true;
418cc0538b6SAlex Elder 		break;
419cc0538b6SAlex Elder 	case Opt_read_write:
420cc0538b6SAlex Elder 		rbd_opts->read_only = false;
421cc0538b6SAlex Elder 		break;
42259c2be1eSYehuda Sadeh 	default:
423aafb230eSAlex Elder 		rbd_assert(false);
424aafb230eSAlex Elder 		break;
42559c2be1eSYehuda Sadeh 	}
42659c2be1eSYehuda Sadeh 	return 0;
42759c2be1eSYehuda Sadeh }
42859c2be1eSYehuda Sadeh 
42959c2be1eSYehuda Sadeh /*
430602adf40SYehuda Sadeh  * Get a ceph client with specific addr and configuration, if one does
431602adf40SYehuda Sadeh  * not exist create it.
432602adf40SYehuda Sadeh  */
433f8c38929SAlex Elder static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
434f8c38929SAlex Elder 				size_t mon_addr_len, char *options)
435602adf40SYehuda Sadeh {
436f8c38929SAlex Elder 	struct rbd_options *rbd_opts = &rbd_dev->rbd_opts;
43743ae4701SAlex Elder 	struct ceph_options *ceph_opts;
438f8c38929SAlex Elder 	struct rbd_client *rbdc;
43959c2be1eSYehuda Sadeh 
440cc0538b6SAlex Elder 	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
441602adf40SYehuda Sadeh 
44243ae4701SAlex Elder 	ceph_opts = ceph_parse_options(options, mon_addr,
4435214ecc4SAlex Elder 					mon_addr + mon_addr_len,
44421079786SAlex Elder 					parse_rbd_opts_token, rbd_opts);
445f8c38929SAlex Elder 	if (IS_ERR(ceph_opts))
446f8c38929SAlex Elder 		return PTR_ERR(ceph_opts);
447602adf40SYehuda Sadeh 
4481f7ba331SAlex Elder 	rbdc = rbd_client_find(ceph_opts);
449602adf40SYehuda Sadeh 	if (rbdc) {
450e6994d3dSAlex Elder 		/* using an existing client */
45143ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
452f8c38929SAlex Elder 	} else {
453f8c38929SAlex Elder 		rbdc = rbd_client_create(ceph_opts);
454d720bcb0SAlex Elder 		if (IS_ERR(rbdc))
455f8c38929SAlex Elder 			return PTR_ERR(rbdc);
456f8c38929SAlex Elder 	}
457f8c38929SAlex Elder 	rbd_dev->rbd_client = rbdc;
458d720bcb0SAlex Elder 
459f8c38929SAlex Elder 	return 0;
460602adf40SYehuda Sadeh }
461602adf40SYehuda Sadeh 
462602adf40SYehuda Sadeh /*
463602adf40SYehuda Sadeh  * Destroy ceph client
464d23a4b3fSAlex Elder  *
465432b8587SAlex Elder  * Caller must hold rbd_client_list_lock.
466602adf40SYehuda Sadeh  */
467602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
468602adf40SYehuda Sadeh {
469602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
470602adf40SYehuda Sadeh 
471602adf40SYehuda Sadeh 	dout("rbd_release_client %p\n", rbdc);
472cd9d9f5dSAlex Elder 	spin_lock(&rbd_client_list_lock);
473602adf40SYehuda Sadeh 	list_del(&rbdc->node);
474cd9d9f5dSAlex Elder 	spin_unlock(&rbd_client_list_lock);
475602adf40SYehuda Sadeh 
476602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
477602adf40SYehuda Sadeh 	kfree(rbdc);
478602adf40SYehuda Sadeh }
479602adf40SYehuda Sadeh 
480602adf40SYehuda Sadeh /*
481602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
482602adf40SYehuda Sadeh  * it.
483602adf40SYehuda Sadeh  */
484602adf40SYehuda Sadeh static void rbd_put_client(struct rbd_device *rbd_dev)
485602adf40SYehuda Sadeh {
486602adf40SYehuda Sadeh 	kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
487602adf40SYehuda Sadeh 	rbd_dev->rbd_client = NULL;
488602adf40SYehuda Sadeh }
489602adf40SYehuda Sadeh 
4901fec7093SYehuda Sadeh /*
4911fec7093SYehuda Sadeh  * Destroy requests collection
4921fec7093SYehuda Sadeh  */
4931fec7093SYehuda Sadeh static void rbd_coll_release(struct kref *kref)
4941fec7093SYehuda Sadeh {
4951fec7093SYehuda Sadeh 	struct rbd_req_coll *coll =
4961fec7093SYehuda Sadeh 		container_of(kref, struct rbd_req_coll, kref);
4971fec7093SYehuda Sadeh 
4981fec7093SYehuda Sadeh 	dout("rbd_coll_release %p\n", coll);
4991fec7093SYehuda Sadeh 	kfree(coll);
5001fec7093SYehuda Sadeh }
501602adf40SYehuda Sadeh 
5028e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
5038e94af8eSAlex Elder {
504103a150fSAlex Elder 	size_t size;
505103a150fSAlex Elder 	u32 snap_count;
506103a150fSAlex Elder 
507103a150fSAlex Elder 	/* The header has to start with the magic rbd header text */
508103a150fSAlex Elder 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
509103a150fSAlex Elder 		return false;
510103a150fSAlex Elder 
511103a150fSAlex Elder 	/*
512103a150fSAlex Elder 	 * The size of a snapshot header has to fit in a size_t, and
513103a150fSAlex Elder 	 * that limits the number of snapshots.
514103a150fSAlex Elder 	 */
515103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
516103a150fSAlex Elder 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
517103a150fSAlex Elder 	if (snap_count > size / sizeof (__le64))
518103a150fSAlex Elder 		return false;
519103a150fSAlex Elder 
520103a150fSAlex Elder 	/*
521103a150fSAlex Elder 	 * Not only that, but the size of the entire the snapshot
522103a150fSAlex Elder 	 * header must also be representable in a size_t.
523103a150fSAlex Elder 	 */
524103a150fSAlex Elder 	size -= snap_count * sizeof (__le64);
525103a150fSAlex Elder 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
526103a150fSAlex Elder 		return false;
527103a150fSAlex Elder 
528103a150fSAlex Elder 	return true;
5298e94af8eSAlex Elder }
5308e94af8eSAlex Elder 
531602adf40SYehuda Sadeh /*
532602adf40SYehuda Sadeh  * Create a new header structure, translate header format from the on-disk
533602adf40SYehuda Sadeh  * header.
534602adf40SYehuda Sadeh  */
535602adf40SYehuda Sadeh static int rbd_header_from_disk(struct rbd_image_header *header,
5364156d998SAlex Elder 				 struct rbd_image_header_ondisk *ondisk)
537602adf40SYehuda Sadeh {
538ccece235SAlex Elder 	u32 snap_count;
53958c17b0eSAlex Elder 	size_t len;
540d2bb24e5SAlex Elder 	size_t size;
541621901d6SAlex Elder 	u32 i;
542602adf40SYehuda Sadeh 
5436a52325fSAlex Elder 	memset(header, 0, sizeof (*header));
5446a52325fSAlex Elder 
545103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
546103a150fSAlex Elder 
54758c17b0eSAlex Elder 	len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
54858c17b0eSAlex Elder 	header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
5496a52325fSAlex Elder 	if (!header->object_prefix)
550602adf40SYehuda Sadeh 		return -ENOMEM;
55158c17b0eSAlex Elder 	memcpy(header->object_prefix, ondisk->object_prefix, len);
55258c17b0eSAlex Elder 	header->object_prefix[len] = '\0';
55300f1f36fSAlex Elder 
554602adf40SYehuda Sadeh 	if (snap_count) {
555f785cc1dSAlex Elder 		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
556f785cc1dSAlex Elder 
557621901d6SAlex Elder 		/* Save a copy of the snapshot names */
558621901d6SAlex Elder 
559f785cc1dSAlex Elder 		if (snap_names_len > (u64) SIZE_MAX)
560f785cc1dSAlex Elder 			return -EIO;
561f785cc1dSAlex Elder 		header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
562602adf40SYehuda Sadeh 		if (!header->snap_names)
5636a52325fSAlex Elder 			goto out_err;
564f785cc1dSAlex Elder 		/*
565f785cc1dSAlex Elder 		 * Note that rbd_dev_v1_header_read() guarantees
566f785cc1dSAlex Elder 		 * the ondisk buffer we're working with has
567f785cc1dSAlex Elder 		 * snap_names_len bytes beyond the end of the
568f785cc1dSAlex Elder 		 * snapshot id array, this memcpy() is safe.
569f785cc1dSAlex Elder 		 */
570f785cc1dSAlex Elder 		memcpy(header->snap_names, &ondisk->snaps[snap_count],
571f785cc1dSAlex Elder 			snap_names_len);
5726a52325fSAlex Elder 
573621901d6SAlex Elder 		/* Record each snapshot's size */
574621901d6SAlex Elder 
575d2bb24e5SAlex Elder 		size = snap_count * sizeof (*header->snap_sizes);
576d2bb24e5SAlex Elder 		header->snap_sizes = kmalloc(size, GFP_KERNEL);
577602adf40SYehuda Sadeh 		if (!header->snap_sizes)
5786a52325fSAlex Elder 			goto out_err;
579621901d6SAlex Elder 		for (i = 0; i < snap_count; i++)
580621901d6SAlex Elder 			header->snap_sizes[i] =
581621901d6SAlex Elder 				le64_to_cpu(ondisk->snaps[i].image_size);
582602adf40SYehuda Sadeh 	} else {
583ccece235SAlex Elder 		WARN_ON(ondisk->snap_names_len);
584602adf40SYehuda Sadeh 		header->snap_names = NULL;
585602adf40SYehuda Sadeh 		header->snap_sizes = NULL;
586602adf40SYehuda Sadeh 	}
587849b4260SAlex Elder 
588602adf40SYehuda Sadeh 	header->image_size = le64_to_cpu(ondisk->image_size);
589602adf40SYehuda Sadeh 	header->obj_order = ondisk->options.order;
590602adf40SYehuda Sadeh 	header->crypt_type = ondisk->options.crypt_type;
591602adf40SYehuda Sadeh 	header->comp_type = ondisk->options.comp_type;
5926a52325fSAlex Elder 	header->total_snaps = snap_count;
5936a52325fSAlex Elder 
594621901d6SAlex Elder 	/* Allocate and fill in the snapshot context */
595621901d6SAlex Elder 
5966a52325fSAlex Elder 	size = sizeof (struct ceph_snap_context);
5976a52325fSAlex Elder 	size += snap_count * sizeof (header->snapc->snaps[0]);
5986a52325fSAlex Elder 	header->snapc = kzalloc(size, GFP_KERNEL);
5996a52325fSAlex Elder 	if (!header->snapc)
6006a52325fSAlex Elder 		goto out_err;
601602adf40SYehuda Sadeh 
602602adf40SYehuda Sadeh 	atomic_set(&header->snapc->nref, 1);
603505cbb9bSAlex Elder 	header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
604602adf40SYehuda Sadeh 	header->snapc->num_snaps = snap_count;
605621901d6SAlex Elder 	for (i = 0; i < snap_count; i++)
606602adf40SYehuda Sadeh 		header->snapc->snaps[i] =
607602adf40SYehuda Sadeh 			le64_to_cpu(ondisk->snaps[i].id);
608602adf40SYehuda Sadeh 
609602adf40SYehuda Sadeh 	return 0;
610602adf40SYehuda Sadeh 
6116a52325fSAlex Elder out_err:
612849b4260SAlex Elder 	kfree(header->snap_sizes);
613ccece235SAlex Elder 	header->snap_sizes = NULL;
614602adf40SYehuda Sadeh 	kfree(header->snap_names);
615ccece235SAlex Elder 	header->snap_names = NULL;
6166a52325fSAlex Elder 	kfree(header->object_prefix);
6176a52325fSAlex Elder 	header->object_prefix = NULL;
618ccece235SAlex Elder 
61900f1f36fSAlex Elder 	return -ENOMEM;
620602adf40SYehuda Sadeh }
621602adf40SYehuda Sadeh 
622602adf40SYehuda Sadeh static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
623602adf40SYehuda Sadeh 			u64 *seq, u64 *size)
624602adf40SYehuda Sadeh {
625602adf40SYehuda Sadeh 	int i;
626602adf40SYehuda Sadeh 	char *p = header->snap_names;
627602adf40SYehuda Sadeh 
62800f1f36fSAlex Elder 	for (i = 0; i < header->total_snaps; i++) {
62900f1f36fSAlex Elder 		if (!strcmp(snap_name, p)) {
63000f1f36fSAlex Elder 
63100f1f36fSAlex Elder 			/* Found it.  Pass back its id and/or size */
63200f1f36fSAlex Elder 
633602adf40SYehuda Sadeh 			if (seq)
634602adf40SYehuda Sadeh 				*seq = header->snapc->snaps[i];
635602adf40SYehuda Sadeh 			if (size)
636602adf40SYehuda Sadeh 				*size = header->snap_sizes[i];
637602adf40SYehuda Sadeh 			return i;
638602adf40SYehuda Sadeh 		}
63900f1f36fSAlex Elder 		p += strlen(p) + 1;	/* Skip ahead to the next name */
64000f1f36fSAlex Elder 	}
64100f1f36fSAlex Elder 	return -ENOENT;
64200f1f36fSAlex Elder }
643602adf40SYehuda Sadeh 
6440ce1a794SAlex Elder static int rbd_header_set_snap(struct rbd_device *rbd_dev, u64 *size)
645602adf40SYehuda Sadeh {
64678dc447dSAlex Elder 	int ret;
647602adf40SYehuda Sadeh 
6480ce1a794SAlex Elder 	down_write(&rbd_dev->header_rwsem);
649602adf40SYehuda Sadeh 
6500ce1a794SAlex Elder 	if (!memcmp(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
651cc9d734cSJosh Durgin 		    sizeof (RBD_SNAP_HEAD_NAME))) {
6520ce1a794SAlex Elder 		rbd_dev->snap_id = CEPH_NOSNAP;
653e88a36ecSJosh Durgin 		rbd_dev->snap_exists = false;
654cc0538b6SAlex Elder 		rbd_dev->read_only = rbd_dev->rbd_opts.read_only;
655602adf40SYehuda Sadeh 		if (size)
65678dc447dSAlex Elder 			*size = rbd_dev->header.image_size;
657602adf40SYehuda Sadeh 	} else {
65878dc447dSAlex Elder 		u64 snap_id = 0;
65978dc447dSAlex Elder 
66078dc447dSAlex Elder 		ret = snap_by_name(&rbd_dev->header, rbd_dev->snap_name,
66178dc447dSAlex Elder 					&snap_id, size);
662602adf40SYehuda Sadeh 		if (ret < 0)
663602adf40SYehuda Sadeh 			goto done;
66478dc447dSAlex Elder 		rbd_dev->snap_id = snap_id;
665e88a36ecSJosh Durgin 		rbd_dev->snap_exists = true;
666cc0538b6SAlex Elder 		rbd_dev->read_only = true;	/* No choice for snapshots */
667602adf40SYehuda Sadeh 	}
668602adf40SYehuda Sadeh 
669602adf40SYehuda Sadeh 	ret = 0;
670602adf40SYehuda Sadeh done:
6710ce1a794SAlex Elder 	up_write(&rbd_dev->header_rwsem);
672602adf40SYehuda Sadeh 	return ret;
673602adf40SYehuda Sadeh }
674602adf40SYehuda Sadeh 
675602adf40SYehuda Sadeh static void rbd_header_free(struct rbd_image_header *header)
676602adf40SYehuda Sadeh {
677849b4260SAlex Elder 	kfree(header->object_prefix);
678d78fd7aeSAlex Elder 	header->object_prefix = NULL;
679602adf40SYehuda Sadeh 	kfree(header->snap_sizes);
680d78fd7aeSAlex Elder 	header->snap_sizes = NULL;
681849b4260SAlex Elder 	kfree(header->snap_names);
682d78fd7aeSAlex Elder 	header->snap_names = NULL;
683d1d25646SJosh Durgin 	ceph_put_snap_context(header->snapc);
684d78fd7aeSAlex Elder 	header->snapc = NULL;
685602adf40SYehuda Sadeh }
686602adf40SYehuda Sadeh 
68765ccfe21SAlex Elder static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
688602adf40SYehuda Sadeh {
68965ccfe21SAlex Elder 	char *name;
69065ccfe21SAlex Elder 	u64 segment;
69165ccfe21SAlex Elder 	int ret;
692602adf40SYehuda Sadeh 
69365ccfe21SAlex Elder 	name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
69465ccfe21SAlex Elder 	if (!name)
69565ccfe21SAlex Elder 		return NULL;
69665ccfe21SAlex Elder 	segment = offset >> rbd_dev->header.obj_order;
69765ccfe21SAlex Elder 	ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
69865ccfe21SAlex Elder 			rbd_dev->header.object_prefix, segment);
69965ccfe21SAlex Elder 	if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
70065ccfe21SAlex Elder 		pr_err("error formatting segment name for #%llu (%d)\n",
70165ccfe21SAlex Elder 			segment, ret);
70265ccfe21SAlex Elder 		kfree(name);
70365ccfe21SAlex Elder 		name = NULL;
70465ccfe21SAlex Elder 	}
705602adf40SYehuda Sadeh 
70665ccfe21SAlex Elder 	return name;
70765ccfe21SAlex Elder }
708602adf40SYehuda Sadeh 
70965ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
71065ccfe21SAlex Elder {
71165ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
712602adf40SYehuda Sadeh 
71365ccfe21SAlex Elder 	return offset & (segment_size - 1);
71465ccfe21SAlex Elder }
71565ccfe21SAlex Elder 
71665ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev,
71765ccfe21SAlex Elder 				u64 offset, u64 length)
71865ccfe21SAlex Elder {
71965ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
72065ccfe21SAlex Elder 
72165ccfe21SAlex Elder 	offset &= segment_size - 1;
72265ccfe21SAlex Elder 
723aafb230eSAlex Elder 	rbd_assert(length <= U64_MAX - offset);
72465ccfe21SAlex Elder 	if (offset + length > segment_size)
72565ccfe21SAlex Elder 		length = segment_size - offset;
72665ccfe21SAlex Elder 
72765ccfe21SAlex Elder 	return length;
728602adf40SYehuda Sadeh }
729602adf40SYehuda Sadeh 
7301fec7093SYehuda Sadeh static int rbd_get_num_segments(struct rbd_image_header *header,
7311fec7093SYehuda Sadeh 				u64 ofs, u64 len)
7321fec7093SYehuda Sadeh {
733df111be6SAlex Elder 	u64 start_seg;
734df111be6SAlex Elder 	u64 end_seg;
735df111be6SAlex Elder 
736df111be6SAlex Elder 	if (!len)
737df111be6SAlex Elder 		return 0;
738df111be6SAlex Elder 	if (len - 1 > U64_MAX - ofs)
739df111be6SAlex Elder 		return -ERANGE;
740df111be6SAlex Elder 
741df111be6SAlex Elder 	start_seg = ofs >> header->obj_order;
742df111be6SAlex Elder 	end_seg = (ofs + len - 1) >> header->obj_order;
743df111be6SAlex Elder 
7441fec7093SYehuda Sadeh 	return end_seg - start_seg + 1;
7451fec7093SYehuda Sadeh }
7461fec7093SYehuda Sadeh 
747602adf40SYehuda Sadeh /*
748029bcbd8SJosh Durgin  * returns the size of an object in the image
749029bcbd8SJosh Durgin  */
750029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header)
751029bcbd8SJosh Durgin {
752029bcbd8SJosh Durgin 	return 1 << header->obj_order;
753029bcbd8SJosh Durgin }
754029bcbd8SJosh Durgin 
755029bcbd8SJosh Durgin /*
756602adf40SYehuda Sadeh  * bio helpers
757602adf40SYehuda Sadeh  */
758602adf40SYehuda Sadeh 
759602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain)
760602adf40SYehuda Sadeh {
761602adf40SYehuda Sadeh 	struct bio *tmp;
762602adf40SYehuda Sadeh 
763602adf40SYehuda Sadeh 	while (chain) {
764602adf40SYehuda Sadeh 		tmp = chain;
765602adf40SYehuda Sadeh 		chain = chain->bi_next;
766602adf40SYehuda Sadeh 		bio_put(tmp);
767602adf40SYehuda Sadeh 	}
768602adf40SYehuda Sadeh }
769602adf40SYehuda Sadeh 
770602adf40SYehuda Sadeh /*
771602adf40SYehuda Sadeh  * zeros a bio chain, starting at specific offset
772602adf40SYehuda Sadeh  */
773602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs)
774602adf40SYehuda Sadeh {
775602adf40SYehuda Sadeh 	struct bio_vec *bv;
776602adf40SYehuda Sadeh 	unsigned long flags;
777602adf40SYehuda Sadeh 	void *buf;
778602adf40SYehuda Sadeh 	int i;
779602adf40SYehuda Sadeh 	int pos = 0;
780602adf40SYehuda Sadeh 
781602adf40SYehuda Sadeh 	while (chain) {
782602adf40SYehuda Sadeh 		bio_for_each_segment(bv, chain, i) {
783602adf40SYehuda Sadeh 			if (pos + bv->bv_len > start_ofs) {
784602adf40SYehuda Sadeh 				int remainder = max(start_ofs - pos, 0);
785602adf40SYehuda Sadeh 				buf = bvec_kmap_irq(bv, &flags);
786602adf40SYehuda Sadeh 				memset(buf + remainder, 0,
787602adf40SYehuda Sadeh 				       bv->bv_len - remainder);
78885b5aaa6SDan Carpenter 				bvec_kunmap_irq(buf, &flags);
789602adf40SYehuda Sadeh 			}
790602adf40SYehuda Sadeh 			pos += bv->bv_len;
791602adf40SYehuda Sadeh 		}
792602adf40SYehuda Sadeh 
793602adf40SYehuda Sadeh 		chain = chain->bi_next;
794602adf40SYehuda Sadeh 	}
795602adf40SYehuda Sadeh }
796602adf40SYehuda Sadeh 
797602adf40SYehuda Sadeh /*
798602adf40SYehuda Sadeh  * bio_chain_clone - clone a chain of bios up to a certain length.
799602adf40SYehuda Sadeh  * might return a bio_pair that will need to be released.
800602adf40SYehuda Sadeh  */
801602adf40SYehuda Sadeh static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
802602adf40SYehuda Sadeh 				   struct bio_pair **bp,
803602adf40SYehuda Sadeh 				   int len, gfp_t gfpmask)
804602adf40SYehuda Sadeh {
805542582fcSAlex Elder 	struct bio *old_chain = *old;
806542582fcSAlex Elder 	struct bio *new_chain = NULL;
807542582fcSAlex Elder 	struct bio *tail;
808602adf40SYehuda Sadeh 	int total = 0;
809602adf40SYehuda Sadeh 
810602adf40SYehuda Sadeh 	if (*bp) {
811602adf40SYehuda Sadeh 		bio_pair_release(*bp);
812602adf40SYehuda Sadeh 		*bp = NULL;
813602adf40SYehuda Sadeh 	}
814602adf40SYehuda Sadeh 
815602adf40SYehuda Sadeh 	while (old_chain && (total < len)) {
816542582fcSAlex Elder 		struct bio *tmp;
817542582fcSAlex Elder 
818602adf40SYehuda Sadeh 		tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
819602adf40SYehuda Sadeh 		if (!tmp)
820602adf40SYehuda Sadeh 			goto err_out;
821542582fcSAlex Elder 		gfpmask &= ~__GFP_WAIT;	/* can't wait after the first */
822602adf40SYehuda Sadeh 
823602adf40SYehuda Sadeh 		if (total + old_chain->bi_size > len) {
824602adf40SYehuda Sadeh 			struct bio_pair *bp;
825602adf40SYehuda Sadeh 
826602adf40SYehuda Sadeh 			/*
827602adf40SYehuda Sadeh 			 * this split can only happen with a single paged bio,
828602adf40SYehuda Sadeh 			 * split_bio will BUG_ON if this is not the case
829602adf40SYehuda Sadeh 			 */
830602adf40SYehuda Sadeh 			dout("bio_chain_clone split! total=%d remaining=%d"
831bd919d45SAlex Elder 			     "bi_size=%u\n",
832bd919d45SAlex Elder 			     total, len - total, old_chain->bi_size);
833602adf40SYehuda Sadeh 
834602adf40SYehuda Sadeh 			/* split the bio. We'll release it either in the next
835602adf40SYehuda Sadeh 			   call, or it will have to be released outside */
836593a9e7bSAlex Elder 			bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
837602adf40SYehuda Sadeh 			if (!bp)
838602adf40SYehuda Sadeh 				goto err_out;
839602adf40SYehuda Sadeh 
840602adf40SYehuda Sadeh 			__bio_clone(tmp, &bp->bio1);
841602adf40SYehuda Sadeh 
842602adf40SYehuda Sadeh 			*next = &bp->bio2;
843602adf40SYehuda Sadeh 		} else {
844602adf40SYehuda Sadeh 			__bio_clone(tmp, old_chain);
845602adf40SYehuda Sadeh 			*next = old_chain->bi_next;
846602adf40SYehuda Sadeh 		}
847602adf40SYehuda Sadeh 
848602adf40SYehuda Sadeh 		tmp->bi_bdev = NULL;
849602adf40SYehuda Sadeh 		tmp->bi_next = NULL;
850542582fcSAlex Elder 		if (new_chain)
851602adf40SYehuda Sadeh 			tail->bi_next = tmp;
852542582fcSAlex Elder 		else
853542582fcSAlex Elder 			new_chain = tmp;
854602adf40SYehuda Sadeh 		tail = tmp;
855602adf40SYehuda Sadeh 		old_chain = old_chain->bi_next;
856602adf40SYehuda Sadeh 
857602adf40SYehuda Sadeh 		total += tmp->bi_size;
858602adf40SYehuda Sadeh 	}
859602adf40SYehuda Sadeh 
860aafb230eSAlex Elder 	rbd_assert(total == len);
861602adf40SYehuda Sadeh 
862602adf40SYehuda Sadeh 	*old = old_chain;
863602adf40SYehuda Sadeh 
864602adf40SYehuda Sadeh 	return new_chain;
865602adf40SYehuda Sadeh 
866602adf40SYehuda Sadeh err_out:
867602adf40SYehuda Sadeh 	dout("bio_chain_clone with err\n");
868602adf40SYehuda Sadeh 	bio_chain_put(new_chain);
869602adf40SYehuda Sadeh 	return NULL;
870602adf40SYehuda Sadeh }
871602adf40SYehuda Sadeh 
872602adf40SYehuda Sadeh /*
873602adf40SYehuda Sadeh  * helpers for osd request op vectors.
874602adf40SYehuda Sadeh  */
87557cfc106SAlex Elder static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
87657cfc106SAlex Elder 					int opcode, u32 payload_len)
877602adf40SYehuda Sadeh {
87857cfc106SAlex Elder 	struct ceph_osd_req_op *ops;
87957cfc106SAlex Elder 
88057cfc106SAlex Elder 	ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
88157cfc106SAlex Elder 	if (!ops)
88257cfc106SAlex Elder 		return NULL;
88357cfc106SAlex Elder 
88457cfc106SAlex Elder 	ops[0].op = opcode;
88557cfc106SAlex Elder 
886602adf40SYehuda Sadeh 	/*
887602adf40SYehuda Sadeh 	 * op extent offset and length will be set later on
888602adf40SYehuda Sadeh 	 * in calc_raw_layout()
889602adf40SYehuda Sadeh 	 */
89057cfc106SAlex Elder 	ops[0].payload_len = payload_len;
89157cfc106SAlex Elder 
89257cfc106SAlex Elder 	return ops;
893602adf40SYehuda Sadeh }
894602adf40SYehuda Sadeh 
/*
 * Free an op vector obtained from rbd_create_rw_ops().
 */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
899602adf40SYehuda Sadeh 
9001fec7093SYehuda Sadeh static void rbd_coll_end_req_index(struct request *rq,
9011fec7093SYehuda Sadeh 				   struct rbd_req_coll *coll,
9021fec7093SYehuda Sadeh 				   int index,
9031fec7093SYehuda Sadeh 				   int ret, u64 len)
9041fec7093SYehuda Sadeh {
9051fec7093SYehuda Sadeh 	struct request_queue *q;
9061fec7093SYehuda Sadeh 	int min, max, i;
9071fec7093SYehuda Sadeh 
908bd919d45SAlex Elder 	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
909bd919d45SAlex Elder 	     coll, index, ret, (unsigned long long) len);
9101fec7093SYehuda Sadeh 
9111fec7093SYehuda Sadeh 	if (!rq)
9121fec7093SYehuda Sadeh 		return;
9131fec7093SYehuda Sadeh 
9141fec7093SYehuda Sadeh 	if (!coll) {
9151fec7093SYehuda Sadeh 		blk_end_request(rq, ret, len);
9161fec7093SYehuda Sadeh 		return;
9171fec7093SYehuda Sadeh 	}
9181fec7093SYehuda Sadeh 
9191fec7093SYehuda Sadeh 	q = rq->q;
9201fec7093SYehuda Sadeh 
9211fec7093SYehuda Sadeh 	spin_lock_irq(q->queue_lock);
9221fec7093SYehuda Sadeh 	coll->status[index].done = 1;
9231fec7093SYehuda Sadeh 	coll->status[index].rc = ret;
9241fec7093SYehuda Sadeh 	coll->status[index].bytes = len;
9251fec7093SYehuda Sadeh 	max = min = coll->num_done;
9261fec7093SYehuda Sadeh 	while (max < coll->total && coll->status[max].done)
9271fec7093SYehuda Sadeh 		max++;
9281fec7093SYehuda Sadeh 
9291fec7093SYehuda Sadeh 	for (i = min; i<max; i++) {
9301fec7093SYehuda Sadeh 		__blk_end_request(rq, coll->status[i].rc,
9311fec7093SYehuda Sadeh 				  coll->status[i].bytes);
9321fec7093SYehuda Sadeh 		coll->num_done++;
9331fec7093SYehuda Sadeh 		kref_put(&coll->kref, rbd_coll_release);
9341fec7093SYehuda Sadeh 	}
9351fec7093SYehuda Sadeh 	spin_unlock_irq(q->queue_lock);
9361fec7093SYehuda Sadeh }
9371fec7093SYehuda Sadeh 
9381fec7093SYehuda Sadeh static void rbd_coll_end_req(struct rbd_request *req,
9391fec7093SYehuda Sadeh 			     int ret, u64 len)
9401fec7093SYehuda Sadeh {
9411fec7093SYehuda Sadeh 	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
9421fec7093SYehuda Sadeh }
9431fec7093SYehuda Sadeh 
944602adf40SYehuda Sadeh /*
945602adf40SYehuda Sadeh  * Send ceph osd request
946602adf40SYehuda Sadeh  */
947602adf40SYehuda Sadeh static int rbd_do_request(struct request *rq,
9480ce1a794SAlex Elder 			  struct rbd_device *rbd_dev,
949602adf40SYehuda Sadeh 			  struct ceph_snap_context *snapc,
950602adf40SYehuda Sadeh 			  u64 snapid,
951aded07eaSAlex Elder 			  const char *object_name, u64 ofs, u64 len,
952602adf40SYehuda Sadeh 			  struct bio *bio,
953602adf40SYehuda Sadeh 			  struct page **pages,
954602adf40SYehuda Sadeh 			  int num_pages,
955602adf40SYehuda Sadeh 			  int flags,
956602adf40SYehuda Sadeh 			  struct ceph_osd_req_op *ops,
9571fec7093SYehuda Sadeh 			  struct rbd_req_coll *coll,
9581fec7093SYehuda Sadeh 			  int coll_index,
959602adf40SYehuda Sadeh 			  void (*rbd_cb)(struct ceph_osd_request *req,
96059c2be1eSYehuda Sadeh 					 struct ceph_msg *msg),
96159c2be1eSYehuda Sadeh 			  struct ceph_osd_request **linger_req,
96259c2be1eSYehuda Sadeh 			  u64 *ver)
963602adf40SYehuda Sadeh {
964602adf40SYehuda Sadeh 	struct ceph_osd_request *req;
965602adf40SYehuda Sadeh 	struct ceph_file_layout *layout;
966602adf40SYehuda Sadeh 	int ret;
967602adf40SYehuda Sadeh 	u64 bno;
968602adf40SYehuda Sadeh 	struct timespec mtime = CURRENT_TIME;
969602adf40SYehuda Sadeh 	struct rbd_request *req_data;
970602adf40SYehuda Sadeh 	struct ceph_osd_request_head *reqhead;
9711dbb4399SAlex Elder 	struct ceph_osd_client *osdc;
972602adf40SYehuda Sadeh 
973602adf40SYehuda Sadeh 	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
9741fec7093SYehuda Sadeh 	if (!req_data) {
9751fec7093SYehuda Sadeh 		if (coll)
9761fec7093SYehuda Sadeh 			rbd_coll_end_req_index(rq, coll, coll_index,
9771fec7093SYehuda Sadeh 					       -ENOMEM, len);
9781fec7093SYehuda Sadeh 		return -ENOMEM;
9791fec7093SYehuda Sadeh 	}
980602adf40SYehuda Sadeh 
9811fec7093SYehuda Sadeh 	if (coll) {
9821fec7093SYehuda Sadeh 		req_data->coll = coll;
9831fec7093SYehuda Sadeh 		req_data->coll_index = coll_index;
9841fec7093SYehuda Sadeh 	}
9851fec7093SYehuda Sadeh 
986bd919d45SAlex Elder 	dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name,
987bd919d45SAlex Elder 		(unsigned long long) ofs, (unsigned long long) len);
988602adf40SYehuda Sadeh 
9890ce1a794SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
9901dbb4399SAlex Elder 	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
9911dbb4399SAlex Elder 					false, GFP_NOIO, pages, bio);
9924ad12621SSage Weil 	if (!req) {
9934ad12621SSage Weil 		ret = -ENOMEM;
994602adf40SYehuda Sadeh 		goto done_pages;
995602adf40SYehuda Sadeh 	}
996602adf40SYehuda Sadeh 
997602adf40SYehuda Sadeh 	req->r_callback = rbd_cb;
998602adf40SYehuda Sadeh 
999602adf40SYehuda Sadeh 	req_data->rq = rq;
1000602adf40SYehuda Sadeh 	req_data->bio = bio;
1001602adf40SYehuda Sadeh 	req_data->pages = pages;
1002602adf40SYehuda Sadeh 	req_data->len = len;
1003602adf40SYehuda Sadeh 
1004602adf40SYehuda Sadeh 	req->r_priv = req_data;
1005602adf40SYehuda Sadeh 
1006602adf40SYehuda Sadeh 	reqhead = req->r_request->front.iov_base;
1007602adf40SYehuda Sadeh 	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
1008602adf40SYehuda Sadeh 
1009aded07eaSAlex Elder 	strncpy(req->r_oid, object_name, sizeof(req->r_oid));
1010602adf40SYehuda Sadeh 	req->r_oid_len = strlen(req->r_oid);
1011602adf40SYehuda Sadeh 
1012602adf40SYehuda Sadeh 	layout = &req->r_file_layout;
1013602adf40SYehuda Sadeh 	memset(layout, 0, sizeof(*layout));
1014602adf40SYehuda Sadeh 	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
1015602adf40SYehuda Sadeh 	layout->fl_stripe_count = cpu_to_le32(1);
1016602adf40SYehuda Sadeh 	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
10170ce1a794SAlex Elder 	layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
10181dbb4399SAlex Elder 	ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
10191dbb4399SAlex Elder 				req, ops);
1020602adf40SYehuda Sadeh 
1021602adf40SYehuda Sadeh 	ceph_osdc_build_request(req, ofs, &len,
1022602adf40SYehuda Sadeh 				ops,
1023602adf40SYehuda Sadeh 				snapc,
1024602adf40SYehuda Sadeh 				&mtime,
1025602adf40SYehuda Sadeh 				req->r_oid, req->r_oid_len);
1026602adf40SYehuda Sadeh 
102759c2be1eSYehuda Sadeh 	if (linger_req) {
10281dbb4399SAlex Elder 		ceph_osdc_set_request_linger(osdc, req);
102959c2be1eSYehuda Sadeh 		*linger_req = req;
103059c2be1eSYehuda Sadeh 	}
103159c2be1eSYehuda Sadeh 
10321dbb4399SAlex Elder 	ret = ceph_osdc_start_request(osdc, req, false);
1033602adf40SYehuda Sadeh 	if (ret < 0)
1034602adf40SYehuda Sadeh 		goto done_err;
1035602adf40SYehuda Sadeh 
1036602adf40SYehuda Sadeh 	if (!rbd_cb) {
10371dbb4399SAlex Elder 		ret = ceph_osdc_wait_request(osdc, req);
103859c2be1eSYehuda Sadeh 		if (ver)
103959c2be1eSYehuda Sadeh 			*ver = le64_to_cpu(req->r_reassert_version.version);
1040bd919d45SAlex Elder 		dout("reassert_ver=%llu\n",
1041bd919d45SAlex Elder 			(unsigned long long)
10421fec7093SYehuda Sadeh 				le64_to_cpu(req->r_reassert_version.version));
1043602adf40SYehuda Sadeh 		ceph_osdc_put_request(req);
1044602adf40SYehuda Sadeh 	}
1045602adf40SYehuda Sadeh 	return ret;
1046602adf40SYehuda Sadeh 
1047602adf40SYehuda Sadeh done_err:
1048602adf40SYehuda Sadeh 	bio_chain_put(req_data->bio);
1049602adf40SYehuda Sadeh 	ceph_osdc_put_request(req);
1050602adf40SYehuda Sadeh done_pages:
10511fec7093SYehuda Sadeh 	rbd_coll_end_req(req_data, ret, len);
1052602adf40SYehuda Sadeh 	kfree(req_data);
1053602adf40SYehuda Sadeh 	return ret;
1054602adf40SYehuda Sadeh }
1055602adf40SYehuda Sadeh 
1056602adf40SYehuda Sadeh /*
1057602adf40SYehuda Sadeh  * Ceph osd op callback
1058602adf40SYehuda Sadeh  */
1059602adf40SYehuda Sadeh static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
1060602adf40SYehuda Sadeh {
1061602adf40SYehuda Sadeh 	struct rbd_request *req_data = req->r_priv;
1062602adf40SYehuda Sadeh 	struct ceph_osd_reply_head *replyhead;
1063602adf40SYehuda Sadeh 	struct ceph_osd_op *op;
1064602adf40SYehuda Sadeh 	__s32 rc;
1065602adf40SYehuda Sadeh 	u64 bytes;
1066602adf40SYehuda Sadeh 	int read_op;
1067602adf40SYehuda Sadeh 
1068602adf40SYehuda Sadeh 	/* parse reply */
1069602adf40SYehuda Sadeh 	replyhead = msg->front.iov_base;
1070602adf40SYehuda Sadeh 	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
1071602adf40SYehuda Sadeh 	op = (void *)(replyhead + 1);
1072602adf40SYehuda Sadeh 	rc = le32_to_cpu(replyhead->result);
1073602adf40SYehuda Sadeh 	bytes = le64_to_cpu(op->extent.length);
1074895cfcc8SDan Carpenter 	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
1075602adf40SYehuda Sadeh 
1076bd919d45SAlex Elder 	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
1077bd919d45SAlex Elder 		(unsigned long long) bytes, read_op, (int) rc);
1078602adf40SYehuda Sadeh 
1079602adf40SYehuda Sadeh 	if (rc == -ENOENT && read_op) {
1080602adf40SYehuda Sadeh 		zero_bio_chain(req_data->bio, 0);
1081602adf40SYehuda Sadeh 		rc = 0;
1082602adf40SYehuda Sadeh 	} else if (rc == 0 && read_op && bytes < req_data->len) {
1083602adf40SYehuda Sadeh 		zero_bio_chain(req_data->bio, bytes);
1084602adf40SYehuda Sadeh 		bytes = req_data->len;
1085602adf40SYehuda Sadeh 	}
1086602adf40SYehuda Sadeh 
10871fec7093SYehuda Sadeh 	rbd_coll_end_req(req_data, rc, bytes);
1088602adf40SYehuda Sadeh 
1089602adf40SYehuda Sadeh 	if (req_data->bio)
1090602adf40SYehuda Sadeh 		bio_chain_put(req_data->bio);
1091602adf40SYehuda Sadeh 
1092602adf40SYehuda Sadeh 	ceph_osdc_put_request(req);
1093602adf40SYehuda Sadeh 	kfree(req_data);
1094602adf40SYehuda Sadeh }
1095602adf40SYehuda Sadeh 
/*
 * Minimal osd request callback: only drops the reference on the
 * request; the reply message is not examined.
 *
 * NOTE(review): req->r_priv is not freed here -- confirm whether
 * requests using this callback carry an rbd_request that needs it.
 */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
110059c2be1eSYehuda Sadeh 
1101602adf40SYehuda Sadeh /*
1102602adf40SYehuda Sadeh  * Do a synchronous ceph osd operation
1103602adf40SYehuda Sadeh  */
11040ce1a794SAlex Elder static int rbd_req_sync_op(struct rbd_device *rbd_dev,
1105602adf40SYehuda Sadeh 			   struct ceph_snap_context *snapc,
1106602adf40SYehuda Sadeh 			   u64 snapid,
1107602adf40SYehuda Sadeh 			   int flags,
1108913d2fdcSAlex Elder 			   struct ceph_osd_req_op *ops,
1109aded07eaSAlex Elder 			   const char *object_name,
1110602adf40SYehuda Sadeh 			   u64 ofs, u64 len,
111159c2be1eSYehuda Sadeh 			   char *buf,
111259c2be1eSYehuda Sadeh 			   struct ceph_osd_request **linger_req,
111359c2be1eSYehuda Sadeh 			   u64 *ver)
1114602adf40SYehuda Sadeh {
1115602adf40SYehuda Sadeh 	int ret;
1116602adf40SYehuda Sadeh 	struct page **pages;
1117602adf40SYehuda Sadeh 	int num_pages;
1118913d2fdcSAlex Elder 
1119aafb230eSAlex Elder 	rbd_assert(ops != NULL);
1120602adf40SYehuda Sadeh 
1121602adf40SYehuda Sadeh 	num_pages = calc_pages_for(ofs , len);
1122602adf40SYehuda Sadeh 	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
1123b8d0638aSDan Carpenter 	if (IS_ERR(pages))
1124b8d0638aSDan Carpenter 		return PTR_ERR(pages);
1125602adf40SYehuda Sadeh 
11260ce1a794SAlex Elder 	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
1127aded07eaSAlex Elder 			  object_name, ofs, len, NULL,
1128602adf40SYehuda Sadeh 			  pages, num_pages,
1129602adf40SYehuda Sadeh 			  flags,
1130602adf40SYehuda Sadeh 			  ops,
11311fec7093SYehuda Sadeh 			  NULL, 0,
113259c2be1eSYehuda Sadeh 			  NULL,
113359c2be1eSYehuda Sadeh 			  linger_req, ver);
1134602adf40SYehuda Sadeh 	if (ret < 0)
1135913d2fdcSAlex Elder 		goto done;
1136602adf40SYehuda Sadeh 
1137602adf40SYehuda Sadeh 	if ((flags & CEPH_OSD_FLAG_READ) && buf)
1138602adf40SYehuda Sadeh 		ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);
1139602adf40SYehuda Sadeh 
1140602adf40SYehuda Sadeh done:
1141602adf40SYehuda Sadeh 	ceph_release_page_vector(pages, num_pages);
1142602adf40SYehuda Sadeh 	return ret;
1143602adf40SYehuda Sadeh }
1144602adf40SYehuda Sadeh 
1145602adf40SYehuda Sadeh /*
1146602adf40SYehuda Sadeh  * Do an asynchronous ceph osd operation
1147602adf40SYehuda Sadeh  */
1148602adf40SYehuda Sadeh static int rbd_do_op(struct request *rq,
1149602adf40SYehuda Sadeh 		     struct rbd_device *rbd_dev,
1150602adf40SYehuda Sadeh 		     struct ceph_snap_context *snapc,
1151602adf40SYehuda Sadeh 		     u64 snapid,
1152d1f57ea6SAlex Elder 		     int opcode, int flags,
1153602adf40SYehuda Sadeh 		     u64 ofs, u64 len,
11541fec7093SYehuda Sadeh 		     struct bio *bio,
11551fec7093SYehuda Sadeh 		     struct rbd_req_coll *coll,
11561fec7093SYehuda Sadeh 		     int coll_index)
1157602adf40SYehuda Sadeh {
1158602adf40SYehuda Sadeh 	char *seg_name;
1159602adf40SYehuda Sadeh 	u64 seg_ofs;
1160602adf40SYehuda Sadeh 	u64 seg_len;
1161602adf40SYehuda Sadeh 	int ret;
1162602adf40SYehuda Sadeh 	struct ceph_osd_req_op *ops;
1163602adf40SYehuda Sadeh 	u32 payload_len;
1164602adf40SYehuda Sadeh 
116565ccfe21SAlex Elder 	seg_name = rbd_segment_name(rbd_dev, ofs);
1166602adf40SYehuda Sadeh 	if (!seg_name)
1167602adf40SYehuda Sadeh 		return -ENOMEM;
116865ccfe21SAlex Elder 	seg_len = rbd_segment_length(rbd_dev, ofs, len);
116965ccfe21SAlex Elder 	seg_ofs = rbd_segment_offset(rbd_dev, ofs);
1170602adf40SYehuda Sadeh 
1171602adf40SYehuda Sadeh 	payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
1172602adf40SYehuda Sadeh 
117357cfc106SAlex Elder 	ret = -ENOMEM;
117457cfc106SAlex Elder 	ops = rbd_create_rw_ops(1, opcode, payload_len);
117557cfc106SAlex Elder 	if (!ops)
1176602adf40SYehuda Sadeh 		goto done;
1177602adf40SYehuda Sadeh 
1178602adf40SYehuda Sadeh 	/* we've taken care of segment sizes earlier when we
1179602adf40SYehuda Sadeh 	   cloned the bios. We should never have a segment
1180602adf40SYehuda Sadeh 	   truncated at this point */
1181aafb230eSAlex Elder 	rbd_assert(seg_len == len);
1182602adf40SYehuda Sadeh 
1183602adf40SYehuda Sadeh 	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1184602adf40SYehuda Sadeh 			     seg_name, seg_ofs, seg_len,
1185602adf40SYehuda Sadeh 			     bio,
1186602adf40SYehuda Sadeh 			     NULL, 0,
1187602adf40SYehuda Sadeh 			     flags,
1188602adf40SYehuda Sadeh 			     ops,
11891fec7093SYehuda Sadeh 			     coll, coll_index,
119059c2be1eSYehuda Sadeh 			     rbd_req_cb, 0, NULL);
119111f77002SSage Weil 
119211f77002SSage Weil 	rbd_destroy_ops(ops);
1193602adf40SYehuda Sadeh done:
1194602adf40SYehuda Sadeh 	kfree(seg_name);
1195602adf40SYehuda Sadeh 	return ret;
1196602adf40SYehuda Sadeh }
1197602adf40SYehuda Sadeh 
1198602adf40SYehuda Sadeh /*
1199602adf40SYehuda Sadeh  * Request async osd write
1200602adf40SYehuda Sadeh  */
1201602adf40SYehuda Sadeh static int rbd_req_write(struct request *rq,
1202602adf40SYehuda Sadeh 			 struct rbd_device *rbd_dev,
1203602adf40SYehuda Sadeh 			 struct ceph_snap_context *snapc,
1204602adf40SYehuda Sadeh 			 u64 ofs, u64 len,
12051fec7093SYehuda Sadeh 			 struct bio *bio,
12061fec7093SYehuda Sadeh 			 struct rbd_req_coll *coll,
12071fec7093SYehuda Sadeh 			 int coll_index)
1208602adf40SYehuda Sadeh {
1209602adf40SYehuda Sadeh 	return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1210602adf40SYehuda Sadeh 			 CEPH_OSD_OP_WRITE,
1211602adf40SYehuda Sadeh 			 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
12121fec7093SYehuda Sadeh 			 ofs, len, bio, coll, coll_index);
1213602adf40SYehuda Sadeh }
1214602adf40SYehuda Sadeh 
1215602adf40SYehuda Sadeh /*
1216602adf40SYehuda Sadeh  * Request async osd read
1217602adf40SYehuda Sadeh  */
1218602adf40SYehuda Sadeh static int rbd_req_read(struct request *rq,
1219602adf40SYehuda Sadeh 			 struct rbd_device *rbd_dev,
1220602adf40SYehuda Sadeh 			 u64 snapid,
1221602adf40SYehuda Sadeh 			 u64 ofs, u64 len,
12221fec7093SYehuda Sadeh 			 struct bio *bio,
12231fec7093SYehuda Sadeh 			 struct rbd_req_coll *coll,
12241fec7093SYehuda Sadeh 			 int coll_index)
1225602adf40SYehuda Sadeh {
1226602adf40SYehuda Sadeh 	return rbd_do_op(rq, rbd_dev, NULL,
1227b06e6a6bSJosh Durgin 			 snapid,
1228602adf40SYehuda Sadeh 			 CEPH_OSD_OP_READ,
1229602adf40SYehuda Sadeh 			 CEPH_OSD_FLAG_READ,
12301fec7093SYehuda Sadeh 			 ofs, len, bio, coll, coll_index);
1231602adf40SYehuda Sadeh }
1232602adf40SYehuda Sadeh 
1233602adf40SYehuda Sadeh /*
1234602adf40SYehuda Sadeh  * Request sync osd read
1235602adf40SYehuda Sadeh  */
12360ce1a794SAlex Elder static int rbd_req_sync_read(struct rbd_device *rbd_dev,
1237602adf40SYehuda Sadeh 			  u64 snapid,
1238aded07eaSAlex Elder 			  const char *object_name,
1239602adf40SYehuda Sadeh 			  u64 ofs, u64 len,
124059c2be1eSYehuda Sadeh 			  char *buf,
124159c2be1eSYehuda Sadeh 			  u64 *ver)
1242602adf40SYehuda Sadeh {
1243913d2fdcSAlex Elder 	struct ceph_osd_req_op *ops;
1244913d2fdcSAlex Elder 	int ret;
1245913d2fdcSAlex Elder 
1246913d2fdcSAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1247913d2fdcSAlex Elder 	if (!ops)
1248913d2fdcSAlex Elder 		return -ENOMEM;
1249913d2fdcSAlex Elder 
1250913d2fdcSAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
1251b06e6a6bSJosh Durgin 			       snapid,
1252602adf40SYehuda Sadeh 			       CEPH_OSD_FLAG_READ,
1253913d2fdcSAlex Elder 			       ops, object_name, ofs, len, buf, NULL, ver);
1254913d2fdcSAlex Elder 	rbd_destroy_ops(ops);
1255913d2fdcSAlex Elder 
1256913d2fdcSAlex Elder 	return ret;
1257602adf40SYehuda Sadeh }
1258602adf40SYehuda Sadeh 
1259602adf40SYehuda Sadeh /*
126059c2be1eSYehuda Sadeh  * Request sync osd watch
126159c2be1eSYehuda Sadeh  */
12620ce1a794SAlex Elder static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
126359c2be1eSYehuda Sadeh 				   u64 ver,
12647f0a24d8SAlex Elder 				   u64 notify_id)
126559c2be1eSYehuda Sadeh {
126659c2be1eSYehuda Sadeh 	struct ceph_osd_req_op *ops;
126711f77002SSage Weil 	int ret;
126811f77002SSage Weil 
126957cfc106SAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
127057cfc106SAlex Elder 	if (!ops)
127157cfc106SAlex Elder 		return -ENOMEM;
127259c2be1eSYehuda Sadeh 
1273a71b891bSJosh Durgin 	ops[0].watch.ver = cpu_to_le64(ver);
127459c2be1eSYehuda Sadeh 	ops[0].watch.cookie = notify_id;
127559c2be1eSYehuda Sadeh 	ops[0].watch.flag = 0;
127659c2be1eSYehuda Sadeh 
12770ce1a794SAlex Elder 	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
12787f0a24d8SAlex Elder 			  rbd_dev->header_name, 0, 0, NULL,
1279ad4f232fSAlex Elder 			  NULL, 0,
128059c2be1eSYehuda Sadeh 			  CEPH_OSD_FLAG_READ,
128159c2be1eSYehuda Sadeh 			  ops,
12821fec7093SYehuda Sadeh 			  NULL, 0,
128359c2be1eSYehuda Sadeh 			  rbd_simple_req_cb, 0, NULL);
128459c2be1eSYehuda Sadeh 
128559c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
128659c2be1eSYehuda Sadeh 	return ret;
128759c2be1eSYehuda Sadeh }
128859c2be1eSYehuda Sadeh 
128959c2be1eSYehuda Sadeh static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
129059c2be1eSYehuda Sadeh {
12910ce1a794SAlex Elder 	struct rbd_device *rbd_dev = (struct rbd_device *)data;
1292a71b891bSJosh Durgin 	u64 hver;
129313143d2dSSage Weil 	int rc;
129413143d2dSSage Weil 
12950ce1a794SAlex Elder 	if (!rbd_dev)
129659c2be1eSYehuda Sadeh 		return;
129759c2be1eSYehuda Sadeh 
1298bd919d45SAlex Elder 	dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1299bd919d45SAlex Elder 		rbd_dev->header_name, (unsigned long long) notify_id,
1300bd919d45SAlex Elder 		(unsigned int) opcode);
13011fe5e993SAlex Elder 	rc = rbd_refresh_header(rbd_dev, &hver);
130213143d2dSSage Weil 	if (rc)
1303f0f8cef5SAlex Elder 		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
13040ce1a794SAlex Elder 			   " update snaps: %d\n", rbd_dev->major, rc);
130559c2be1eSYehuda Sadeh 
13067f0a24d8SAlex Elder 	rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
130759c2be1eSYehuda Sadeh }
130859c2be1eSYehuda Sadeh 
130959c2be1eSYehuda Sadeh /*
131059c2be1eSYehuda Sadeh  * Request sync osd watch
131159c2be1eSYehuda Sadeh  */
13120e6f322dSAlex Elder static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
131359c2be1eSYehuda Sadeh {
131459c2be1eSYehuda Sadeh 	struct ceph_osd_req_op *ops;
13150ce1a794SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
131657cfc106SAlex Elder 	int ret;
131759c2be1eSYehuda Sadeh 
131857cfc106SAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
131957cfc106SAlex Elder 	if (!ops)
132057cfc106SAlex Elder 		return -ENOMEM;
132159c2be1eSYehuda Sadeh 
132259c2be1eSYehuda Sadeh 	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
13230ce1a794SAlex Elder 				     (void *)rbd_dev, &rbd_dev->watch_event);
132459c2be1eSYehuda Sadeh 	if (ret < 0)
132559c2be1eSYehuda Sadeh 		goto fail;
132659c2be1eSYehuda Sadeh 
13270e6f322dSAlex Elder 	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
13280ce1a794SAlex Elder 	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
132959c2be1eSYehuda Sadeh 	ops[0].watch.flag = 1;
133059c2be1eSYehuda Sadeh 
13310ce1a794SAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
133259c2be1eSYehuda Sadeh 			      CEPH_NOSNAP,
133359c2be1eSYehuda Sadeh 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
133459c2be1eSYehuda Sadeh 			      ops,
13350e6f322dSAlex Elder 			      rbd_dev->header_name,
13360e6f322dSAlex Elder 			      0, 0, NULL,
13370ce1a794SAlex Elder 			      &rbd_dev->watch_request, NULL);
133859c2be1eSYehuda Sadeh 
133959c2be1eSYehuda Sadeh 	if (ret < 0)
134059c2be1eSYehuda Sadeh 		goto fail_event;
134159c2be1eSYehuda Sadeh 
134259c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
134359c2be1eSYehuda Sadeh 	return 0;
134459c2be1eSYehuda Sadeh 
134559c2be1eSYehuda Sadeh fail_event:
13460ce1a794SAlex Elder 	ceph_osdc_cancel_event(rbd_dev->watch_event);
13470ce1a794SAlex Elder 	rbd_dev->watch_event = NULL;
134859c2be1eSYehuda Sadeh fail:
134959c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
135059c2be1eSYehuda Sadeh 	return ret;
135159c2be1eSYehuda Sadeh }
135259c2be1eSYehuda Sadeh 
135379e3057cSYehuda Sadeh /*
135479e3057cSYehuda Sadeh  * Request sync osd unwatch
135579e3057cSYehuda Sadeh  */
1356070c633fSAlex Elder static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
135779e3057cSYehuda Sadeh {
135879e3057cSYehuda Sadeh 	struct ceph_osd_req_op *ops;
135957cfc106SAlex Elder 	int ret;
136079e3057cSYehuda Sadeh 
136157cfc106SAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
136257cfc106SAlex Elder 	if (!ops)
136357cfc106SAlex Elder 		return -ENOMEM;
136479e3057cSYehuda Sadeh 
136579e3057cSYehuda Sadeh 	ops[0].watch.ver = 0;
13660ce1a794SAlex Elder 	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
136779e3057cSYehuda Sadeh 	ops[0].watch.flag = 0;
136879e3057cSYehuda Sadeh 
13690ce1a794SAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
137079e3057cSYehuda Sadeh 			      CEPH_NOSNAP,
137179e3057cSYehuda Sadeh 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
137279e3057cSYehuda Sadeh 			      ops,
1373070c633fSAlex Elder 			      rbd_dev->header_name,
1374070c633fSAlex Elder 			      0, 0, NULL, NULL, NULL);
1375070c633fSAlex Elder 
137679e3057cSYehuda Sadeh 
137779e3057cSYehuda Sadeh 	rbd_destroy_ops(ops);
13780ce1a794SAlex Elder 	ceph_osdc_cancel_event(rbd_dev->watch_event);
13790ce1a794SAlex Elder 	rbd_dev->watch_event = NULL;
138079e3057cSYehuda Sadeh 	return ret;
138179e3057cSYehuda Sadeh }
138279e3057cSYehuda Sadeh 
138359c2be1eSYehuda Sadeh struct rbd_notify_info {
13840ce1a794SAlex Elder 	struct rbd_device *rbd_dev;
138559c2be1eSYehuda Sadeh };
138659c2be1eSYehuda Sadeh 
138759c2be1eSYehuda Sadeh static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
138859c2be1eSYehuda Sadeh {
13890ce1a794SAlex Elder 	struct rbd_device *rbd_dev = (struct rbd_device *)data;
13900ce1a794SAlex Elder 	if (!rbd_dev)
139159c2be1eSYehuda Sadeh 		return;
139259c2be1eSYehuda Sadeh 
1393bd919d45SAlex Elder 	dout("rbd_notify_cb %s notify_id=%llu opcode=%u\n",
1394bd919d45SAlex Elder 			rbd_dev->header_name, (unsigned long long) notify_id,
1395bd919d45SAlex Elder 			(unsigned int) opcode);
139659c2be1eSYehuda Sadeh }
139759c2be1eSYehuda Sadeh 
139859c2be1eSYehuda Sadeh /*
139959c2be1eSYehuda Sadeh  * Request sync osd notify
140059c2be1eSYehuda Sadeh  */
14014cb16250SAlex Elder static int rbd_req_sync_notify(struct rbd_device *rbd_dev)
140259c2be1eSYehuda Sadeh {
140359c2be1eSYehuda Sadeh 	struct ceph_osd_req_op *ops;
14040ce1a794SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
140559c2be1eSYehuda Sadeh 	struct ceph_osd_event *event;
140659c2be1eSYehuda Sadeh 	struct rbd_notify_info info;
140759c2be1eSYehuda Sadeh 	int payload_len = sizeof(u32) + sizeof(u32);
140859c2be1eSYehuda Sadeh 	int ret;
140959c2be1eSYehuda Sadeh 
141057cfc106SAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY, payload_len);
141157cfc106SAlex Elder 	if (!ops)
141257cfc106SAlex Elder 		return -ENOMEM;
141359c2be1eSYehuda Sadeh 
14140ce1a794SAlex Elder 	info.rbd_dev = rbd_dev;
141559c2be1eSYehuda Sadeh 
141659c2be1eSYehuda Sadeh 	ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
141759c2be1eSYehuda Sadeh 				     (void *)&info, &event);
141859c2be1eSYehuda Sadeh 	if (ret < 0)
141959c2be1eSYehuda Sadeh 		goto fail;
142059c2be1eSYehuda Sadeh 
142159c2be1eSYehuda Sadeh 	ops[0].watch.ver = 1;
142259c2be1eSYehuda Sadeh 	ops[0].watch.flag = 1;
142359c2be1eSYehuda Sadeh 	ops[0].watch.cookie = event->cookie;
142459c2be1eSYehuda Sadeh 	ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
142559c2be1eSYehuda Sadeh 	ops[0].watch.timeout = 12;
142659c2be1eSYehuda Sadeh 
14270ce1a794SAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
142859c2be1eSYehuda Sadeh 			       CEPH_NOSNAP,
142959c2be1eSYehuda Sadeh 			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
143059c2be1eSYehuda Sadeh 			       ops,
14314cb16250SAlex Elder 			       rbd_dev->header_name,
14324cb16250SAlex Elder 			       0, 0, NULL, NULL, NULL);
143359c2be1eSYehuda Sadeh 	if (ret < 0)
143459c2be1eSYehuda Sadeh 		goto fail_event;
143559c2be1eSYehuda Sadeh 
143659c2be1eSYehuda Sadeh 	ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
143759c2be1eSYehuda Sadeh 	dout("ceph_osdc_wait_event returned %d\n", ret);
143859c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
143959c2be1eSYehuda Sadeh 	return 0;
144059c2be1eSYehuda Sadeh 
144159c2be1eSYehuda Sadeh fail_event:
144259c2be1eSYehuda Sadeh 	ceph_osdc_cancel_event(event);
144359c2be1eSYehuda Sadeh fail:
144459c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
144559c2be1eSYehuda Sadeh 	return ret;
144659c2be1eSYehuda Sadeh }
144759c2be1eSYehuda Sadeh 
144859c2be1eSYehuda Sadeh /*
1449602adf40SYehuda Sadeh  * Request sync osd read
1450602adf40SYehuda Sadeh  */
14510ce1a794SAlex Elder static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
1452aded07eaSAlex Elder 			     const char *object_name,
1453aded07eaSAlex Elder 			     const char *class_name,
1454aded07eaSAlex Elder 			     const char *method_name,
1455602adf40SYehuda Sadeh 			     const char *data,
145659c2be1eSYehuda Sadeh 			     int len,
145759c2be1eSYehuda Sadeh 			     u64 *ver)
1458602adf40SYehuda Sadeh {
1459602adf40SYehuda Sadeh 	struct ceph_osd_req_op *ops;
1460aded07eaSAlex Elder 	int class_name_len = strlen(class_name);
1461aded07eaSAlex Elder 	int method_name_len = strlen(method_name);
146257cfc106SAlex Elder 	int ret;
146357cfc106SAlex Elder 
146457cfc106SAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL,
1465aded07eaSAlex Elder 				    class_name_len + method_name_len + len);
146657cfc106SAlex Elder 	if (!ops)
146757cfc106SAlex Elder 		return -ENOMEM;
1468602adf40SYehuda Sadeh 
1469aded07eaSAlex Elder 	ops[0].cls.class_name = class_name;
1470aded07eaSAlex Elder 	ops[0].cls.class_len = (__u8) class_name_len;
1471aded07eaSAlex Elder 	ops[0].cls.method_name = method_name;
1472aded07eaSAlex Elder 	ops[0].cls.method_len = (__u8) method_name_len;
1473602adf40SYehuda Sadeh 	ops[0].cls.argc = 0;
1474602adf40SYehuda Sadeh 	ops[0].cls.indata = data;
1475602adf40SYehuda Sadeh 	ops[0].cls.indata_len = len;
1476602adf40SYehuda Sadeh 
14770ce1a794SAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
1478602adf40SYehuda Sadeh 			       CEPH_NOSNAP,
1479602adf40SYehuda Sadeh 			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1480602adf40SYehuda Sadeh 			       ops,
1481d1f57ea6SAlex Elder 			       object_name, 0, 0, NULL, NULL, ver);
1482602adf40SYehuda Sadeh 
1483602adf40SYehuda Sadeh 	rbd_destroy_ops(ops);
1484602adf40SYehuda Sadeh 
1485602adf40SYehuda Sadeh 	dout("cls_exec returned %d\n", ret);
1486602adf40SYehuda Sadeh 	return ret;
1487602adf40SYehuda Sadeh }
1488602adf40SYehuda Sadeh 
14891fec7093SYehuda Sadeh static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
14901fec7093SYehuda Sadeh {
14911fec7093SYehuda Sadeh 	struct rbd_req_coll *coll =
14921fec7093SYehuda Sadeh 			kzalloc(sizeof(struct rbd_req_coll) +
14931fec7093SYehuda Sadeh 			        sizeof(struct rbd_req_status) * num_reqs,
14941fec7093SYehuda Sadeh 				GFP_ATOMIC);
14951fec7093SYehuda Sadeh 
14961fec7093SYehuda Sadeh 	if (!coll)
14971fec7093SYehuda Sadeh 		return NULL;
14981fec7093SYehuda Sadeh 	coll->total = num_reqs;
14991fec7093SYehuda Sadeh 	kref_init(&coll->kref);
15001fec7093SYehuda Sadeh 	return coll;
15011fec7093SYehuda Sadeh }
15021fec7093SYehuda Sadeh 
1503602adf40SYehuda Sadeh /*
1504602adf40SYehuda Sadeh  * block device queue callback
1505602adf40SYehuda Sadeh  */
1506602adf40SYehuda Sadeh static void rbd_rq_fn(struct request_queue *q)
1507602adf40SYehuda Sadeh {
1508602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
1509602adf40SYehuda Sadeh 	struct request *rq;
1510602adf40SYehuda Sadeh 	struct bio_pair *bp = NULL;
1511602adf40SYehuda Sadeh 
151200f1f36fSAlex Elder 	while ((rq = blk_fetch_request(q))) {
1513602adf40SYehuda Sadeh 		struct bio *bio;
1514602adf40SYehuda Sadeh 		struct bio *rq_bio, *next_bio = NULL;
1515602adf40SYehuda Sadeh 		bool do_write;
1516bd919d45SAlex Elder 		unsigned int size;
1517bd919d45SAlex Elder 		u64 op_size = 0;
1518602adf40SYehuda Sadeh 		u64 ofs;
15191fec7093SYehuda Sadeh 		int num_segs, cur_seg = 0;
15201fec7093SYehuda Sadeh 		struct rbd_req_coll *coll;
1521d1d25646SJosh Durgin 		struct ceph_snap_context *snapc;
1522602adf40SYehuda Sadeh 
1523602adf40SYehuda Sadeh 		dout("fetched request\n");
1524602adf40SYehuda Sadeh 
1525602adf40SYehuda Sadeh 		/* filter out block requests we don't understand */
1526602adf40SYehuda Sadeh 		if ((rq->cmd_type != REQ_TYPE_FS)) {
1527602adf40SYehuda Sadeh 			__blk_end_request_all(rq, 0);
152800f1f36fSAlex Elder 			continue;
1529602adf40SYehuda Sadeh 		}
1530602adf40SYehuda Sadeh 
1531602adf40SYehuda Sadeh 		/* deduce our operation (read, write) */
1532602adf40SYehuda Sadeh 		do_write = (rq_data_dir(rq) == WRITE);
1533602adf40SYehuda Sadeh 
1534602adf40SYehuda Sadeh 		size = blk_rq_bytes(rq);
1535593a9e7bSAlex Elder 		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
1536602adf40SYehuda Sadeh 		rq_bio = rq->bio;
1537602adf40SYehuda Sadeh 		if (do_write && rbd_dev->read_only) {
1538602adf40SYehuda Sadeh 			__blk_end_request_all(rq, -EROFS);
153900f1f36fSAlex Elder 			continue;
1540602adf40SYehuda Sadeh 		}
1541602adf40SYehuda Sadeh 
1542602adf40SYehuda Sadeh 		spin_unlock_irq(q->queue_lock);
1543602adf40SYehuda Sadeh 
1544e88a36ecSJosh Durgin 		down_read(&rbd_dev->header_rwsem);
1545e88a36ecSJosh Durgin 
1546d1d25646SJosh Durgin 		if (rbd_dev->snap_id != CEPH_NOSNAP && !rbd_dev->snap_exists) {
1547d1d25646SJosh Durgin 			up_read(&rbd_dev->header_rwsem);
1548e88a36ecSJosh Durgin 			dout("request for non-existent snapshot");
1549e88a36ecSJosh Durgin 			spin_lock_irq(q->queue_lock);
1550e88a36ecSJosh Durgin 			__blk_end_request_all(rq, -ENXIO);
1551e88a36ecSJosh Durgin 			continue;
1552e88a36ecSJosh Durgin 		}
1553d1d25646SJosh Durgin 
1554d1d25646SJosh Durgin 		snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1555d1d25646SJosh Durgin 
1556d1d25646SJosh Durgin 		up_read(&rbd_dev->header_rwsem);
1557e88a36ecSJosh Durgin 
1558602adf40SYehuda Sadeh 		dout("%s 0x%x bytes at 0x%llx\n",
1559602adf40SYehuda Sadeh 		     do_write ? "write" : "read",
1560bd919d45SAlex Elder 		     size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
1561602adf40SYehuda Sadeh 
15621fec7093SYehuda Sadeh 		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
1563df111be6SAlex Elder 		if (num_segs <= 0) {
1564df111be6SAlex Elder 			spin_lock_irq(q->queue_lock);
1565df111be6SAlex Elder 			__blk_end_request_all(rq, num_segs);
1566df111be6SAlex Elder 			ceph_put_snap_context(snapc);
1567df111be6SAlex Elder 			continue;
1568df111be6SAlex Elder 		}
15691fec7093SYehuda Sadeh 		coll = rbd_alloc_coll(num_segs);
15701fec7093SYehuda Sadeh 		if (!coll) {
15711fec7093SYehuda Sadeh 			spin_lock_irq(q->queue_lock);
15721fec7093SYehuda Sadeh 			__blk_end_request_all(rq, -ENOMEM);
1573d1d25646SJosh Durgin 			ceph_put_snap_context(snapc);
157400f1f36fSAlex Elder 			continue;
15751fec7093SYehuda Sadeh 		}
15761fec7093SYehuda Sadeh 
1577602adf40SYehuda Sadeh 		do {
1578602adf40SYehuda Sadeh 			/* a bio clone to be passed down to OSD req */
1579bd919d45SAlex Elder 			dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
158065ccfe21SAlex Elder 			op_size = rbd_segment_length(rbd_dev, ofs, size);
15811fec7093SYehuda Sadeh 			kref_get(&coll->kref);
1582602adf40SYehuda Sadeh 			bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
1583602adf40SYehuda Sadeh 					      op_size, GFP_ATOMIC);
1584602adf40SYehuda Sadeh 			if (!bio) {
15851fec7093SYehuda Sadeh 				rbd_coll_end_req_index(rq, coll, cur_seg,
15861fec7093SYehuda Sadeh 						       -ENOMEM, op_size);
15871fec7093SYehuda Sadeh 				goto next_seg;
1588602adf40SYehuda Sadeh 			}
1589602adf40SYehuda Sadeh 
15901fec7093SYehuda Sadeh 
1591602adf40SYehuda Sadeh 			/* init OSD command: write or read */
1592602adf40SYehuda Sadeh 			if (do_write)
1593602adf40SYehuda Sadeh 				rbd_req_write(rq, rbd_dev,
1594d1d25646SJosh Durgin 					      snapc,
1595602adf40SYehuda Sadeh 					      ofs,
15961fec7093SYehuda Sadeh 					      op_size, bio,
15971fec7093SYehuda Sadeh 					      coll, cur_seg);
1598602adf40SYehuda Sadeh 			else
1599602adf40SYehuda Sadeh 				rbd_req_read(rq, rbd_dev,
160077dfe99fSJosh Durgin 					     rbd_dev->snap_id,
1601602adf40SYehuda Sadeh 					     ofs,
16021fec7093SYehuda Sadeh 					     op_size, bio,
16031fec7093SYehuda Sadeh 					     coll, cur_seg);
1604602adf40SYehuda Sadeh 
16051fec7093SYehuda Sadeh next_seg:
1606602adf40SYehuda Sadeh 			size -= op_size;
1607602adf40SYehuda Sadeh 			ofs += op_size;
1608602adf40SYehuda Sadeh 
16091fec7093SYehuda Sadeh 			cur_seg++;
1610602adf40SYehuda Sadeh 			rq_bio = next_bio;
1611602adf40SYehuda Sadeh 		} while (size > 0);
16121fec7093SYehuda Sadeh 		kref_put(&coll->kref, rbd_coll_release);
1613602adf40SYehuda Sadeh 
1614602adf40SYehuda Sadeh 		if (bp)
1615602adf40SYehuda Sadeh 			bio_pair_release(bp);
1616602adf40SYehuda Sadeh 		spin_lock_irq(q->queue_lock);
1617d1d25646SJosh Durgin 
1618d1d25646SJosh Durgin 		ceph_put_snap_context(snapc);
1619602adf40SYehuda Sadeh 	}
1620602adf40SYehuda Sadeh }
1621602adf40SYehuda Sadeh 
1622602adf40SYehuda Sadeh /*
1623602adf40SYehuda Sadeh  * a queue callback. Makes sure that we don't create a bio that spans across
1624602adf40SYehuda Sadeh  * multiple osd objects. One exception would be with a single page bios,
1625602adf40SYehuda Sadeh  * which we handle later at bio_chain_clone
1626602adf40SYehuda Sadeh  */
1627602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1628602adf40SYehuda Sadeh 			  struct bio_vec *bvec)
1629602adf40SYehuda Sadeh {
1630602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
1631593a9e7bSAlex Elder 	unsigned int chunk_sectors;
1632593a9e7bSAlex Elder 	sector_t sector;
1633593a9e7bSAlex Elder 	unsigned int bio_sectors;
1634602adf40SYehuda Sadeh 	int max;
1635602adf40SYehuda Sadeh 
1636593a9e7bSAlex Elder 	chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1637593a9e7bSAlex Elder 	sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
1638593a9e7bSAlex Elder 	bio_sectors = bmd->bi_size >> SECTOR_SHIFT;
1639593a9e7bSAlex Elder 
1640602adf40SYehuda Sadeh 	max =  (chunk_sectors - ((sector & (chunk_sectors - 1))
1641593a9e7bSAlex Elder 				 + bio_sectors)) << SECTOR_SHIFT;
1642602adf40SYehuda Sadeh 	if (max < 0)
1643602adf40SYehuda Sadeh 		max = 0; /* bio_add cannot handle a negative return */
1644602adf40SYehuda Sadeh 	if (max <= bvec->bv_len && bio_sectors == 0)
1645602adf40SYehuda Sadeh 		return bvec->bv_len;
1646602adf40SYehuda Sadeh 	return max;
1647602adf40SYehuda Sadeh }
1648602adf40SYehuda Sadeh 
1649602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
1650602adf40SYehuda Sadeh {
1651602adf40SYehuda Sadeh 	struct gendisk *disk = rbd_dev->disk;
1652602adf40SYehuda Sadeh 
1653602adf40SYehuda Sadeh 	if (!disk)
1654602adf40SYehuda Sadeh 		return;
1655602adf40SYehuda Sadeh 
1656602adf40SYehuda Sadeh 	rbd_header_free(&rbd_dev->header);
1657602adf40SYehuda Sadeh 
1658602adf40SYehuda Sadeh 	if (disk->flags & GENHD_FL_UP)
1659602adf40SYehuda Sadeh 		del_gendisk(disk);
1660602adf40SYehuda Sadeh 	if (disk->queue)
1661602adf40SYehuda Sadeh 		blk_cleanup_queue(disk->queue);
1662602adf40SYehuda Sadeh 	put_disk(disk);
1663602adf40SYehuda Sadeh }
1664602adf40SYehuda Sadeh 
1665602adf40SYehuda Sadeh /*
16664156d998SAlex Elder  * Read the complete header for the given rbd device.
16674156d998SAlex Elder  *
16684156d998SAlex Elder  * Returns a pointer to a dynamically-allocated buffer containing
16694156d998SAlex Elder  * the complete and validated header.  Caller can pass the address
16704156d998SAlex Elder  * of a variable that will be filled in with the version of the
16714156d998SAlex Elder  * header object at the time it was read.
16724156d998SAlex Elder  *
16734156d998SAlex Elder  * Returns a pointer-coded errno if a failure occurs.
16744156d998SAlex Elder  */
16754156d998SAlex Elder static struct rbd_image_header_ondisk *
16764156d998SAlex Elder rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
16774156d998SAlex Elder {
16784156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk = NULL;
16794156d998SAlex Elder 	u32 snap_count = 0;
16804156d998SAlex Elder 	u64 names_size = 0;
16814156d998SAlex Elder 	u32 want_count;
16824156d998SAlex Elder 	int ret;
16834156d998SAlex Elder 
16844156d998SAlex Elder 	/*
16854156d998SAlex Elder 	 * The complete header will include an array of its 64-bit
16864156d998SAlex Elder 	 * snapshot ids, followed by the names of those snapshots as
16874156d998SAlex Elder 	 * a contiguous block of NUL-terminated strings.  Note that
16884156d998SAlex Elder 	 * the number of snapshots could change by the time we read
16894156d998SAlex Elder 	 * it in, in which case we re-read it.
16904156d998SAlex Elder 	 */
16914156d998SAlex Elder 	do {
16924156d998SAlex Elder 		size_t size;
16934156d998SAlex Elder 
16944156d998SAlex Elder 		kfree(ondisk);
16954156d998SAlex Elder 
16964156d998SAlex Elder 		size = sizeof (*ondisk);
16974156d998SAlex Elder 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
16984156d998SAlex Elder 		size += names_size;
16994156d998SAlex Elder 		ondisk = kmalloc(size, GFP_KERNEL);
17004156d998SAlex Elder 		if (!ondisk)
17014156d998SAlex Elder 			return ERR_PTR(-ENOMEM);
17024156d998SAlex Elder 
17034156d998SAlex Elder 		ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
17044156d998SAlex Elder 				       rbd_dev->header_name,
17054156d998SAlex Elder 				       0, size,
17064156d998SAlex Elder 				       (char *) ondisk, version);
17074156d998SAlex Elder 
17084156d998SAlex Elder 		if (ret < 0)
17094156d998SAlex Elder 			goto out_err;
17104156d998SAlex Elder 		if (WARN_ON((size_t) ret < size)) {
17114156d998SAlex Elder 			ret = -ENXIO;
17124156d998SAlex Elder 			pr_warning("short header read for image %s"
17134156d998SAlex Elder 					" (want %zd got %d)\n",
17144156d998SAlex Elder 				rbd_dev->image_name, size, ret);
17154156d998SAlex Elder 			goto out_err;
17164156d998SAlex Elder 		}
17174156d998SAlex Elder 		if (!rbd_dev_ondisk_valid(ondisk)) {
17184156d998SAlex Elder 			ret = -ENXIO;
17194156d998SAlex Elder 			pr_warning("invalid header for image %s\n",
17204156d998SAlex Elder 				rbd_dev->image_name);
17214156d998SAlex Elder 			goto out_err;
17224156d998SAlex Elder 		}
17234156d998SAlex Elder 
17244156d998SAlex Elder 		names_size = le64_to_cpu(ondisk->snap_names_len);
17254156d998SAlex Elder 		want_count = snap_count;
17264156d998SAlex Elder 		snap_count = le32_to_cpu(ondisk->snap_count);
17274156d998SAlex Elder 	} while (snap_count != want_count);
17284156d998SAlex Elder 
17294156d998SAlex Elder 	return ondisk;
17304156d998SAlex Elder 
17314156d998SAlex Elder out_err:
17324156d998SAlex Elder 	kfree(ondisk);
17334156d998SAlex Elder 
17344156d998SAlex Elder 	return ERR_PTR(ret);
17354156d998SAlex Elder }
17364156d998SAlex Elder 
17374156d998SAlex Elder /*
1738602adf40SYehuda Sadeh  * reload the ondisk the header
1739602adf40SYehuda Sadeh  */
1740602adf40SYehuda Sadeh static int rbd_read_header(struct rbd_device *rbd_dev,
1741602adf40SYehuda Sadeh 			   struct rbd_image_header *header)
1742602adf40SYehuda Sadeh {
17434156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk;
17444156d998SAlex Elder 	u64 ver = 0;
17454156d998SAlex Elder 	int ret;
1746602adf40SYehuda Sadeh 
17474156d998SAlex Elder 	ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
17484156d998SAlex Elder 	if (IS_ERR(ondisk))
17494156d998SAlex Elder 		return PTR_ERR(ondisk);
17504156d998SAlex Elder 	ret = rbd_header_from_disk(header, ondisk);
17514156d998SAlex Elder 	if (ret >= 0)
175259c2be1eSYehuda Sadeh 		header->obj_version = ver;
17534156d998SAlex Elder 	kfree(ondisk);
1754602adf40SYehuda Sadeh 
17554156d998SAlex Elder 	return ret;
1756602adf40SYehuda Sadeh }
1757602adf40SYehuda Sadeh 
1758602adf40SYehuda Sadeh /*
1759602adf40SYehuda Sadeh  * create a snapshot
1760602adf40SYehuda Sadeh  */
17610ce1a794SAlex Elder static int rbd_header_add_snap(struct rbd_device *rbd_dev,
1762602adf40SYehuda Sadeh 			       const char *snap_name,
1763602adf40SYehuda Sadeh 			       gfp_t gfp_flags)
1764602adf40SYehuda Sadeh {
1765602adf40SYehuda Sadeh 	int name_len = strlen(snap_name);
1766602adf40SYehuda Sadeh 	u64 new_snapid;
1767602adf40SYehuda Sadeh 	int ret;
1768916d4d67SSage Weil 	void *data, *p, *e;
17691dbb4399SAlex Elder 	struct ceph_mon_client *monc;
1770602adf40SYehuda Sadeh 
1771602adf40SYehuda Sadeh 	/* we should create a snapshot only if we're pointing at the head */
17720ce1a794SAlex Elder 	if (rbd_dev->snap_id != CEPH_NOSNAP)
1773602adf40SYehuda Sadeh 		return -EINVAL;
1774602adf40SYehuda Sadeh 
17750ce1a794SAlex Elder 	monc = &rbd_dev->rbd_client->client->monc;
17760ce1a794SAlex Elder 	ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid);
1777bd919d45SAlex Elder 	dout("created snapid=%llu\n", (unsigned long long) new_snapid);
1778602adf40SYehuda Sadeh 	if (ret < 0)
1779602adf40SYehuda Sadeh 		return ret;
1780602adf40SYehuda Sadeh 
1781602adf40SYehuda Sadeh 	data = kmalloc(name_len + 16, gfp_flags);
1782602adf40SYehuda Sadeh 	if (!data)
1783602adf40SYehuda Sadeh 		return -ENOMEM;
1784602adf40SYehuda Sadeh 
1785916d4d67SSage Weil 	p = data;
1786916d4d67SSage Weil 	e = data + name_len + 16;
1787602adf40SYehuda Sadeh 
1788916d4d67SSage Weil 	ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
1789916d4d67SSage Weil 	ceph_encode_64_safe(&p, e, new_snapid, bad);
1790602adf40SYehuda Sadeh 
17910bed54dcSAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
17920ce1a794SAlex Elder 				"rbd", "snap_add",
1793d67d4be5SAlex Elder 				data, p - data, NULL);
1794602adf40SYehuda Sadeh 
1795916d4d67SSage Weil 	kfree(data);
1796602adf40SYehuda Sadeh 
1797505cbb9bSAlex Elder 	return ret < 0 ? ret : 0;
1798602adf40SYehuda Sadeh bad:
1799602adf40SYehuda Sadeh 	return -ERANGE;
1800602adf40SYehuda Sadeh }
1801602adf40SYehuda Sadeh 
1802dfc5606dSYehuda Sadeh static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1803dfc5606dSYehuda Sadeh {
1804dfc5606dSYehuda Sadeh 	struct rbd_snap *snap;
1805a0593290SAlex Elder 	struct rbd_snap *next;
1806dfc5606dSYehuda Sadeh 
1807a0593290SAlex Elder 	list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
180814e7085dSAlex Elder 		__rbd_remove_snap_dev(snap);
1809dfc5606dSYehuda Sadeh }
1810dfc5606dSYehuda Sadeh 
1811602adf40SYehuda Sadeh /*
1812602adf40SYehuda Sadeh  * only read the first part of the ondisk header, without the snaps info
1813602adf40SYehuda Sadeh  */
1814b813623aSAlex Elder static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
1815602adf40SYehuda Sadeh {
1816602adf40SYehuda Sadeh 	int ret;
1817602adf40SYehuda Sadeh 	struct rbd_image_header h;
1818602adf40SYehuda Sadeh 
1819602adf40SYehuda Sadeh 	ret = rbd_read_header(rbd_dev, &h);
1820602adf40SYehuda Sadeh 	if (ret < 0)
1821602adf40SYehuda Sadeh 		return ret;
1822602adf40SYehuda Sadeh 
1823a51aa0c0SJosh Durgin 	down_write(&rbd_dev->header_rwsem);
1824a51aa0c0SJosh Durgin 
18259db4b3e3SSage Weil 	/* resized? */
1826474ef7ceSJosh Durgin 	if (rbd_dev->snap_id == CEPH_NOSNAP) {
1827474ef7ceSJosh Durgin 		sector_t size = (sector_t) h.image_size / SECTOR_SIZE;
1828474ef7ceSJosh Durgin 
1829474ef7ceSJosh Durgin 		dout("setting size to %llu sectors", (unsigned long long) size);
1830474ef7ceSJosh Durgin 		set_capacity(rbd_dev->disk, size);
1831474ef7ceSJosh Durgin 	}
18329db4b3e3SSage Weil 
1833849b4260SAlex Elder 	/* rbd_dev->header.object_prefix shouldn't change */
1834602adf40SYehuda Sadeh 	kfree(rbd_dev->header.snap_sizes);
1835849b4260SAlex Elder 	kfree(rbd_dev->header.snap_names);
1836d1d25646SJosh Durgin 	/* osd requests may still refer to snapc */
1837d1d25646SJosh Durgin 	ceph_put_snap_context(rbd_dev->header.snapc);
1838602adf40SYehuda Sadeh 
1839b813623aSAlex Elder 	if (hver)
1840b813623aSAlex Elder 		*hver = h.obj_version;
1841a71b891bSJosh Durgin 	rbd_dev->header.obj_version = h.obj_version;
184293a24e08SJosh Durgin 	rbd_dev->header.image_size = h.image_size;
1843602adf40SYehuda Sadeh 	rbd_dev->header.total_snaps = h.total_snaps;
1844602adf40SYehuda Sadeh 	rbd_dev->header.snapc = h.snapc;
1845602adf40SYehuda Sadeh 	rbd_dev->header.snap_names = h.snap_names;
1846602adf40SYehuda Sadeh 	rbd_dev->header.snap_sizes = h.snap_sizes;
1847849b4260SAlex Elder 	/* Free the extra copy of the object prefix */
1848849b4260SAlex Elder 	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1849849b4260SAlex Elder 	kfree(h.object_prefix);
1850849b4260SAlex Elder 
18519fcbb800SAlex Elder 	ret = rbd_dev_snap_devs_update(rbd_dev);
1852dfc5606dSYehuda Sadeh 
1853c666601aSJosh Durgin 	up_write(&rbd_dev->header_rwsem);
1854602adf40SYehuda Sadeh 
1855dfc5606dSYehuda Sadeh 	return ret;
1856602adf40SYehuda Sadeh }
1857602adf40SYehuda Sadeh 
18581fe5e993SAlex Elder static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
18591fe5e993SAlex Elder {
18601fe5e993SAlex Elder 	int ret;
18611fe5e993SAlex Elder 
18621fe5e993SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
18631fe5e993SAlex Elder 	ret = __rbd_refresh_header(rbd_dev, hver);
18641fe5e993SAlex Elder 	mutex_unlock(&ctl_mutex);
18651fe5e993SAlex Elder 
18661fe5e993SAlex Elder 	return ret;
18671fe5e993SAlex Elder }
18681fe5e993SAlex Elder 
1869602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
1870602adf40SYehuda Sadeh {
1871602adf40SYehuda Sadeh 	struct gendisk *disk;
1872602adf40SYehuda Sadeh 	struct request_queue *q;
1873602adf40SYehuda Sadeh 	int rc;
1874593a9e7bSAlex Elder 	u64 segment_size;
1875602adf40SYehuda Sadeh 	u64 total_size = 0;
1876602adf40SYehuda Sadeh 
1877602adf40SYehuda Sadeh 	/* contact OSD, request size info about the object being mapped */
1878602adf40SYehuda Sadeh 	rc = rbd_read_header(rbd_dev, &rbd_dev->header);
1879602adf40SYehuda Sadeh 	if (rc)
1880602adf40SYehuda Sadeh 		return rc;
1881602adf40SYehuda Sadeh 
1882dfc5606dSYehuda Sadeh 	/* no need to lock here, as rbd_dev is not registered yet */
18839fcbb800SAlex Elder 	rc = rbd_dev_snap_devs_update(rbd_dev);
1884dfc5606dSYehuda Sadeh 	if (rc)
1885dfc5606dSYehuda Sadeh 		return rc;
1886dfc5606dSYehuda Sadeh 
1887cc9d734cSJosh Durgin 	rc = rbd_header_set_snap(rbd_dev, &total_size);
1888602adf40SYehuda Sadeh 	if (rc)
1889602adf40SYehuda Sadeh 		return rc;
1890602adf40SYehuda Sadeh 
1891602adf40SYehuda Sadeh 	/* create gendisk info */
1892602adf40SYehuda Sadeh 	rc = -ENOMEM;
1893602adf40SYehuda Sadeh 	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1894602adf40SYehuda Sadeh 	if (!disk)
1895602adf40SYehuda Sadeh 		goto out;
1896602adf40SYehuda Sadeh 
1897f0f8cef5SAlex Elder 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
1898de71a297SAlex Elder 		 rbd_dev->dev_id);
1899602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
1900602adf40SYehuda Sadeh 	disk->first_minor = 0;
1901602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
1902602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
1903602adf40SYehuda Sadeh 
1904602adf40SYehuda Sadeh 	/* init rq */
1905602adf40SYehuda Sadeh 	rc = -ENOMEM;
1906602adf40SYehuda Sadeh 	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1907602adf40SYehuda Sadeh 	if (!q)
1908602adf40SYehuda Sadeh 		goto out_disk;
1909029bcbd8SJosh Durgin 
1910593a9e7bSAlex Elder 	/* We use the default size, but let's be explicit about it. */
1911593a9e7bSAlex Elder 	blk_queue_physical_block_size(q, SECTOR_SIZE);
1912593a9e7bSAlex Elder 
1913029bcbd8SJosh Durgin 	/* set io sizes to object size */
1914593a9e7bSAlex Elder 	segment_size = rbd_obj_bytes(&rbd_dev->header);
1915593a9e7bSAlex Elder 	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
1916593a9e7bSAlex Elder 	blk_queue_max_segment_size(q, segment_size);
1917593a9e7bSAlex Elder 	blk_queue_io_min(q, segment_size);
1918593a9e7bSAlex Elder 	blk_queue_io_opt(q, segment_size);
1919029bcbd8SJosh Durgin 
1920602adf40SYehuda Sadeh 	blk_queue_merge_bvec(q, rbd_merge_bvec);
1921602adf40SYehuda Sadeh 	disk->queue = q;
1922602adf40SYehuda Sadeh 
1923602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
1924602adf40SYehuda Sadeh 
1925602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
1926602adf40SYehuda Sadeh 	rbd_dev->q = q;
1927602adf40SYehuda Sadeh 
1928602adf40SYehuda Sadeh 	/* finally, announce the disk to the world */
1929593a9e7bSAlex Elder 	set_capacity(disk, total_size / SECTOR_SIZE);
1930602adf40SYehuda Sadeh 	add_disk(disk);
1931602adf40SYehuda Sadeh 
1932602adf40SYehuda Sadeh 	pr_info("%s: added with size 0x%llx\n",
1933602adf40SYehuda Sadeh 		disk->disk_name, (unsigned long long)total_size);
1934602adf40SYehuda Sadeh 	return 0;
1935602adf40SYehuda Sadeh 
1936602adf40SYehuda Sadeh out_disk:
1937602adf40SYehuda Sadeh 	put_disk(disk);
1938602adf40SYehuda Sadeh out:
1939602adf40SYehuda Sadeh 	return rc;
1940602adf40SYehuda Sadeh }
1941602adf40SYehuda Sadeh 
1942dfc5606dSYehuda Sadeh /*
1943dfc5606dSYehuda Sadeh   sysfs
1944dfc5606dSYehuda Sadeh */
1945602adf40SYehuda Sadeh 
1946593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
1947593a9e7bSAlex Elder {
1948593a9e7bSAlex Elder 	return container_of(dev, struct rbd_device, dev);
1949593a9e7bSAlex Elder }
1950593a9e7bSAlex Elder 
1951dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
1952dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
1953602adf40SYehuda Sadeh {
1954593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1955a51aa0c0SJosh Durgin 	sector_t size;
1956dfc5606dSYehuda Sadeh 
1957a51aa0c0SJosh Durgin 	down_read(&rbd_dev->header_rwsem);
1958a51aa0c0SJosh Durgin 	size = get_capacity(rbd_dev->disk);
1959a51aa0c0SJosh Durgin 	up_read(&rbd_dev->header_rwsem);
1960a51aa0c0SJosh Durgin 
1961a51aa0c0SJosh Durgin 	return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
1962602adf40SYehuda Sadeh }
1963602adf40SYehuda Sadeh 
1964dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
1965dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
1966602adf40SYehuda Sadeh {
1967593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1968dfc5606dSYehuda Sadeh 
1969dfc5606dSYehuda Sadeh 	return sprintf(buf, "%d\n", rbd_dev->major);
1970dfc5606dSYehuda Sadeh }
1971dfc5606dSYehuda Sadeh 
1972dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
1973dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
1974dfc5606dSYehuda Sadeh {
1975593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1976dfc5606dSYehuda Sadeh 
19771dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
19781dbb4399SAlex Elder 			ceph_client_id(rbd_dev->rbd_client->client));
1979dfc5606dSYehuda Sadeh }
1980dfc5606dSYehuda Sadeh 
1981dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
1982dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
1983dfc5606dSYehuda Sadeh {
1984593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1985dfc5606dSYehuda Sadeh 
1986dfc5606dSYehuda Sadeh 	return sprintf(buf, "%s\n", rbd_dev->pool_name);
1987dfc5606dSYehuda Sadeh }
1988dfc5606dSYehuda Sadeh 
19899bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
19909bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
19919bb2f334SAlex Elder {
19929bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
19939bb2f334SAlex Elder 
19949bb2f334SAlex Elder 	return sprintf(buf, "%d\n", rbd_dev->pool_id);
19959bb2f334SAlex Elder }
19969bb2f334SAlex Elder 
1997dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
1998dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
1999dfc5606dSYehuda Sadeh {
2000593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2001dfc5606dSYehuda Sadeh 
20020bed54dcSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->image_name);
2003dfc5606dSYehuda Sadeh }
2004dfc5606dSYehuda Sadeh 
2005dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
2006dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
2007dfc5606dSYehuda Sadeh 			     char *buf)
2008dfc5606dSYehuda Sadeh {
2009593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2010dfc5606dSYehuda Sadeh 
2011dfc5606dSYehuda Sadeh 	return sprintf(buf, "%s\n", rbd_dev->snap_name);
2012dfc5606dSYehuda Sadeh }
2013dfc5606dSYehuda Sadeh 
2014dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
2015dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
2016dfc5606dSYehuda Sadeh 				 const char *buf,
2017dfc5606dSYehuda Sadeh 				 size_t size)
2018dfc5606dSYehuda Sadeh {
2019593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2020b813623aSAlex Elder 	int ret;
2021602adf40SYehuda Sadeh 
20221fe5e993SAlex Elder 	ret = rbd_refresh_header(rbd_dev, NULL);
2023b813623aSAlex Elder 
2024b813623aSAlex Elder 	return ret < 0 ? ret : size;
2025dfc5606dSYehuda Sadeh }
2026602adf40SYehuda Sadeh 
2027dfc5606dSYehuda Sadeh static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
2028dfc5606dSYehuda Sadeh static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
2029dfc5606dSYehuda Sadeh static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
2030dfc5606dSYehuda Sadeh static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
20319bb2f334SAlex Elder static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
2032dfc5606dSYehuda Sadeh static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
2033dfc5606dSYehuda Sadeh static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
2034dfc5606dSYehuda Sadeh static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
2035dfc5606dSYehuda Sadeh static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);
2036dfc5606dSYehuda Sadeh 
2037dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = {
2038dfc5606dSYehuda Sadeh 	&dev_attr_size.attr,
2039dfc5606dSYehuda Sadeh 	&dev_attr_major.attr,
2040dfc5606dSYehuda Sadeh 	&dev_attr_client_id.attr,
2041dfc5606dSYehuda Sadeh 	&dev_attr_pool.attr,
20429bb2f334SAlex Elder 	&dev_attr_pool_id.attr,
2043dfc5606dSYehuda Sadeh 	&dev_attr_name.attr,
2044dfc5606dSYehuda Sadeh 	&dev_attr_current_snap.attr,
2045dfc5606dSYehuda Sadeh 	&dev_attr_refresh.attr,
2046dfc5606dSYehuda Sadeh 	&dev_attr_create_snap.attr,
2047dfc5606dSYehuda Sadeh 	NULL
2048dfc5606dSYehuda Sadeh };
2049dfc5606dSYehuda Sadeh 
2050dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = {
2051dfc5606dSYehuda Sadeh 	.attrs = rbd_attrs,
2052dfc5606dSYehuda Sadeh };
2053dfc5606dSYehuda Sadeh 
2054dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = {
2055dfc5606dSYehuda Sadeh 	&rbd_attr_group,
2056dfc5606dSYehuda Sadeh 	NULL
2057dfc5606dSYehuda Sadeh };
2058dfc5606dSYehuda Sadeh 
2059dfc5606dSYehuda Sadeh static void rbd_sysfs_dev_release(struct device *dev)
2060dfc5606dSYehuda Sadeh {
2061dfc5606dSYehuda Sadeh }
2062dfc5606dSYehuda Sadeh 
2063dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = {
2064dfc5606dSYehuda Sadeh 	.name		= "rbd",
2065dfc5606dSYehuda Sadeh 	.groups		= rbd_attr_groups,
2066dfc5606dSYehuda Sadeh 	.release	= rbd_sysfs_dev_release,
2067dfc5606dSYehuda Sadeh };
2068dfc5606dSYehuda Sadeh 
2069dfc5606dSYehuda Sadeh 
2070dfc5606dSYehuda Sadeh /*
2071dfc5606dSYehuda Sadeh   sysfs - snapshots
2072dfc5606dSYehuda Sadeh */
2073dfc5606dSYehuda Sadeh 
2074dfc5606dSYehuda Sadeh static ssize_t rbd_snap_size_show(struct device *dev,
2075dfc5606dSYehuda Sadeh 				  struct device_attribute *attr,
2076dfc5606dSYehuda Sadeh 				  char *buf)
2077dfc5606dSYehuda Sadeh {
2078dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2079dfc5606dSYehuda Sadeh 
20803591538fSJosh Durgin 	return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
2081dfc5606dSYehuda Sadeh }
2082dfc5606dSYehuda Sadeh 
2083dfc5606dSYehuda Sadeh static ssize_t rbd_snap_id_show(struct device *dev,
2084dfc5606dSYehuda Sadeh 				struct device_attribute *attr,
2085dfc5606dSYehuda Sadeh 				char *buf)
2086dfc5606dSYehuda Sadeh {
2087dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2088dfc5606dSYehuda Sadeh 
2089593a9e7bSAlex Elder 	return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
2090dfc5606dSYehuda Sadeh }
2091dfc5606dSYehuda Sadeh 
2092dfc5606dSYehuda Sadeh static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
2093dfc5606dSYehuda Sadeh static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
2094dfc5606dSYehuda Sadeh 
2095dfc5606dSYehuda Sadeh static struct attribute *rbd_snap_attrs[] = {
2096dfc5606dSYehuda Sadeh 	&dev_attr_snap_size.attr,
2097dfc5606dSYehuda Sadeh 	&dev_attr_snap_id.attr,
2098dfc5606dSYehuda Sadeh 	NULL,
2099dfc5606dSYehuda Sadeh };
2100dfc5606dSYehuda Sadeh 
2101dfc5606dSYehuda Sadeh static struct attribute_group rbd_snap_attr_group = {
2102dfc5606dSYehuda Sadeh 	.attrs = rbd_snap_attrs,
2103dfc5606dSYehuda Sadeh };
2104dfc5606dSYehuda Sadeh 
2105dfc5606dSYehuda Sadeh static void rbd_snap_dev_release(struct device *dev)
2106dfc5606dSYehuda Sadeh {
2107dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2108dfc5606dSYehuda Sadeh 	kfree(snap->name);
2109dfc5606dSYehuda Sadeh 	kfree(snap);
2110dfc5606dSYehuda Sadeh }
2111dfc5606dSYehuda Sadeh 
2112dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_snap_attr_groups[] = {
2113dfc5606dSYehuda Sadeh 	&rbd_snap_attr_group,
2114dfc5606dSYehuda Sadeh 	NULL
2115dfc5606dSYehuda Sadeh };
2116dfc5606dSYehuda Sadeh 
2117dfc5606dSYehuda Sadeh static struct device_type rbd_snap_device_type = {
2118dfc5606dSYehuda Sadeh 	.groups		= rbd_snap_attr_groups,
2119dfc5606dSYehuda Sadeh 	.release	= rbd_snap_dev_release,
2120dfc5606dSYehuda Sadeh };
2121dfc5606dSYehuda Sadeh 
212214e7085dSAlex Elder static void __rbd_remove_snap_dev(struct rbd_snap *snap)
2123dfc5606dSYehuda Sadeh {
2124dfc5606dSYehuda Sadeh 	list_del(&snap->node);
2125dfc5606dSYehuda Sadeh 	device_unregister(&snap->dev);
2126dfc5606dSYehuda Sadeh }
2127dfc5606dSYehuda Sadeh 
212814e7085dSAlex Elder static int rbd_register_snap_dev(struct rbd_snap *snap,
2129dfc5606dSYehuda Sadeh 				  struct device *parent)
2130dfc5606dSYehuda Sadeh {
2131dfc5606dSYehuda Sadeh 	struct device *dev = &snap->dev;
2132dfc5606dSYehuda Sadeh 	int ret;
2133dfc5606dSYehuda Sadeh 
2134dfc5606dSYehuda Sadeh 	dev->type = &rbd_snap_device_type;
2135dfc5606dSYehuda Sadeh 	dev->parent = parent;
2136dfc5606dSYehuda Sadeh 	dev->release = rbd_snap_dev_release;
2137dfc5606dSYehuda Sadeh 	dev_set_name(dev, "snap_%s", snap->name);
2138dfc5606dSYehuda Sadeh 	ret = device_register(dev);
2139dfc5606dSYehuda Sadeh 
2140dfc5606dSYehuda Sadeh 	return ret;
2141dfc5606dSYehuda Sadeh }
2142dfc5606dSYehuda Sadeh 
21434e891e0aSAlex Elder static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
21444e891e0aSAlex Elder 					      int i, const char *name)
2145dfc5606dSYehuda Sadeh {
21464e891e0aSAlex Elder 	struct rbd_snap *snap;
2147dfc5606dSYehuda Sadeh 	int ret;
21484e891e0aSAlex Elder 
21494e891e0aSAlex Elder 	snap = kzalloc(sizeof (*snap), GFP_KERNEL);
2150dfc5606dSYehuda Sadeh 	if (!snap)
21514e891e0aSAlex Elder 		return ERR_PTR(-ENOMEM);
21524e891e0aSAlex Elder 
21534e891e0aSAlex Elder 	ret = -ENOMEM;
2154dfc5606dSYehuda Sadeh 	snap->name = kstrdup(name, GFP_KERNEL);
21554e891e0aSAlex Elder 	if (!snap->name)
21564e891e0aSAlex Elder 		goto err;
21574e891e0aSAlex Elder 
2158dfc5606dSYehuda Sadeh 	snap->size = rbd_dev->header.snap_sizes[i];
2159dfc5606dSYehuda Sadeh 	snap->id = rbd_dev->header.snapc->snaps[i];
2160dfc5606dSYehuda Sadeh 	if (device_is_registered(&rbd_dev->dev)) {
216114e7085dSAlex Elder 		ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2162dfc5606dSYehuda Sadeh 		if (ret < 0)
2163dfc5606dSYehuda Sadeh 			goto err;
2164dfc5606dSYehuda Sadeh 	}
21654e891e0aSAlex Elder 
21664e891e0aSAlex Elder 	return snap;
21674e891e0aSAlex Elder 
2168dfc5606dSYehuda Sadeh err:
2169dfc5606dSYehuda Sadeh 	kfree(snap->name);
2170dfc5606dSYehuda Sadeh 	kfree(snap);
21714e891e0aSAlex Elder 
21724e891e0aSAlex Elder 	return ERR_PTR(ret);
2173dfc5606dSYehuda Sadeh }
2174dfc5606dSYehuda Sadeh 
2175dfc5606dSYehuda Sadeh /*
217635938150SAlex Elder  * Scan the rbd device's current snapshot list and compare it to the
217735938150SAlex Elder  * newly-received snapshot context.  Remove any existing snapshots
217835938150SAlex Elder  * not present in the new snapshot context.  Add a new snapshot for
217935938150SAlex Elder  * any snaphots in the snapshot context not in the current list.
218035938150SAlex Elder  * And verify there are no changes to snapshots we already know
218135938150SAlex Elder  * about.
218235938150SAlex Elder  *
218335938150SAlex Elder  * Assumes the snapshots in the snapshot context are sorted by
218435938150SAlex Elder  * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
218535938150SAlex Elder  * are also maintained in that order.)
2186dfc5606dSYehuda Sadeh  */
21879fcbb800SAlex Elder static int rbd_dev_snap_devs_update(struct rbd_device *rbd_dev)
2188dfc5606dSYehuda Sadeh {
218935938150SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
219035938150SAlex Elder 	const u32 snap_count = snapc->num_snaps;
219135938150SAlex Elder 	char *snap_name = rbd_dev->header.snap_names;
219235938150SAlex Elder 	struct list_head *head = &rbd_dev->snaps;
219335938150SAlex Elder 	struct list_head *links = head->next;
219435938150SAlex Elder 	u32 index = 0;
2195dfc5606dSYehuda Sadeh 
21969fcbb800SAlex Elder 	dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
219735938150SAlex Elder 	while (index < snap_count || links != head) {
219835938150SAlex Elder 		u64 snap_id;
219935938150SAlex Elder 		struct rbd_snap *snap;
2200dfc5606dSYehuda Sadeh 
220135938150SAlex Elder 		snap_id = index < snap_count ? snapc->snaps[index]
220235938150SAlex Elder 					     : CEPH_NOSNAP;
220335938150SAlex Elder 		snap = links != head ? list_entry(links, struct rbd_snap, node)
220435938150SAlex Elder 				     : NULL;
2205aafb230eSAlex Elder 		rbd_assert(!snap || snap->id != CEPH_NOSNAP);
2206dfc5606dSYehuda Sadeh 
220735938150SAlex Elder 		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
220835938150SAlex Elder 			struct list_head *next = links->next;
2209dfc5606dSYehuda Sadeh 
221035938150SAlex Elder 			/* Existing snapshot not in the new snap context */
2211dfc5606dSYehuda Sadeh 
221235938150SAlex Elder 			if (rbd_dev->snap_id == snap->id)
2213e88a36ecSJosh Durgin 				rbd_dev->snap_exists = false;
221435938150SAlex Elder 			__rbd_remove_snap_dev(snap);
22159fcbb800SAlex Elder 			dout("%ssnap id %llu has been removed\n",
22169fcbb800SAlex Elder 				rbd_dev->snap_id == snap->id ? "mapped " : "",
22179fcbb800SAlex Elder 				(unsigned long long) snap->id);
2218dfc5606dSYehuda Sadeh 
221935938150SAlex Elder 			/* Done with this list entry; advance */
222035938150SAlex Elder 
222135938150SAlex Elder 			links = next;
222235938150SAlex Elder 			continue;
2223dfc5606dSYehuda Sadeh 		}
222435938150SAlex Elder 
22259fcbb800SAlex Elder 		dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
22269fcbb800SAlex Elder 			(unsigned long long) snap_id);
222735938150SAlex Elder 		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
222835938150SAlex Elder 			struct rbd_snap *new_snap;
222935938150SAlex Elder 
223035938150SAlex Elder 			/* We haven't seen this snapshot before */
223135938150SAlex Elder 
223235938150SAlex Elder 			new_snap = __rbd_add_snap_dev(rbd_dev, index,
223335938150SAlex Elder 							snap_name);
22349fcbb800SAlex Elder 			if (IS_ERR(new_snap)) {
22359fcbb800SAlex Elder 				int err = PTR_ERR(new_snap);
22369fcbb800SAlex Elder 
22379fcbb800SAlex Elder 				dout("  failed to add dev, error %d\n", err);
22389fcbb800SAlex Elder 
22399fcbb800SAlex Elder 				return err;
22409fcbb800SAlex Elder 			}
224135938150SAlex Elder 
224235938150SAlex Elder 			/* New goes before existing, or at end of list */
224335938150SAlex Elder 
22449fcbb800SAlex Elder 			dout("  added dev%s\n", snap ? "" : " at end\n");
224535938150SAlex Elder 			if (snap)
224635938150SAlex Elder 				list_add_tail(&new_snap->node, &snap->node);
224735938150SAlex Elder 			else
2248523f3258SAlex Elder 				list_add_tail(&new_snap->node, head);
224935938150SAlex Elder 		} else {
225035938150SAlex Elder 			/* Already have this one */
225135938150SAlex Elder 
22529fcbb800SAlex Elder 			dout("  already present\n");
22539fcbb800SAlex Elder 
2254aafb230eSAlex Elder 			rbd_assert(snap->size ==
2255aafb230eSAlex Elder 					rbd_dev->header.snap_sizes[index]);
2256aafb230eSAlex Elder 			rbd_assert(!strcmp(snap->name, snap_name));
225735938150SAlex Elder 
225835938150SAlex Elder 			/* Done with this list entry; advance */
225935938150SAlex Elder 
226035938150SAlex Elder 			links = links->next;
2261dfc5606dSYehuda Sadeh 		}
226235938150SAlex Elder 
226335938150SAlex Elder 		/* Advance to the next entry in the snapshot context */
226435938150SAlex Elder 
226535938150SAlex Elder 		index++;
226635938150SAlex Elder 		snap_name += strlen(snap_name) + 1;
2267dfc5606dSYehuda Sadeh 	}
22689fcbb800SAlex Elder 	dout("%s: done\n", __func__);
2269dfc5606dSYehuda Sadeh 
2270dfc5606dSYehuda Sadeh 	return 0;
2271dfc5606dSYehuda Sadeh }
2272dfc5606dSYehuda Sadeh 
2273dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2274dfc5606dSYehuda Sadeh {
2275f0f8cef5SAlex Elder 	int ret;
2276dfc5606dSYehuda Sadeh 	struct device *dev;
2277dfc5606dSYehuda Sadeh 	struct rbd_snap *snap;
2278dfc5606dSYehuda Sadeh 
2279dfc5606dSYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2280dfc5606dSYehuda Sadeh 	dev = &rbd_dev->dev;
2281dfc5606dSYehuda Sadeh 
2282dfc5606dSYehuda Sadeh 	dev->bus = &rbd_bus_type;
2283dfc5606dSYehuda Sadeh 	dev->type = &rbd_device_type;
2284dfc5606dSYehuda Sadeh 	dev->parent = &rbd_root_dev;
2285dfc5606dSYehuda Sadeh 	dev->release = rbd_dev_release;
2286de71a297SAlex Elder 	dev_set_name(dev, "%d", rbd_dev->dev_id);
2287dfc5606dSYehuda Sadeh 	ret = device_register(dev);
2288dfc5606dSYehuda Sadeh 	if (ret < 0)
2289f0f8cef5SAlex Elder 		goto out;
2290dfc5606dSYehuda Sadeh 
2291dfc5606dSYehuda Sadeh 	list_for_each_entry(snap, &rbd_dev->snaps, node) {
229214e7085dSAlex Elder 		ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2293dfc5606dSYehuda Sadeh 		if (ret < 0)
2294602adf40SYehuda Sadeh 			break;
2295602adf40SYehuda Sadeh 	}
2296f0f8cef5SAlex Elder out:
2297dfc5606dSYehuda Sadeh 	mutex_unlock(&ctl_mutex);
2298dfc5606dSYehuda Sadeh 	return ret;
2299602adf40SYehuda Sadeh }
2300602adf40SYehuda Sadeh 
2301dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
2302dfc5606dSYehuda Sadeh {
2303dfc5606dSYehuda Sadeh 	device_unregister(&rbd_dev->dev);
2304dfc5606dSYehuda Sadeh }
2305dfc5606dSYehuda Sadeh 
230659c2be1eSYehuda Sadeh static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
230759c2be1eSYehuda Sadeh {
230859c2be1eSYehuda Sadeh 	int ret, rc;
230959c2be1eSYehuda Sadeh 
231059c2be1eSYehuda Sadeh 	do {
23110e6f322dSAlex Elder 		ret = rbd_req_sync_watch(rbd_dev);
231259c2be1eSYehuda Sadeh 		if (ret == -ERANGE) {
23131fe5e993SAlex Elder 			rc = rbd_refresh_header(rbd_dev, NULL);
231459c2be1eSYehuda Sadeh 			if (rc < 0)
231559c2be1eSYehuda Sadeh 				return rc;
231659c2be1eSYehuda Sadeh 		}
231759c2be1eSYehuda Sadeh 	} while (ret == -ERANGE);
231859c2be1eSYehuda Sadeh 
231959c2be1eSYehuda Sadeh 	return ret;
232059c2be1eSYehuda Sadeh }
232159c2be1eSYehuda Sadeh 
2322e2839308SAlex Elder static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
23231ddbe94eSAlex Elder 
23241ddbe94eSAlex Elder /*
2325499afd5bSAlex Elder  * Get a unique rbd identifier for the given new rbd_dev, and add
2326499afd5bSAlex Elder  * the rbd_dev to the global list.  The minimum rbd id is 1.
23271ddbe94eSAlex Elder  */
2328e2839308SAlex Elder static void rbd_dev_id_get(struct rbd_device *rbd_dev)
2329b7f23c36SAlex Elder {
2330e2839308SAlex Elder 	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
2331499afd5bSAlex Elder 
2332499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
2333499afd5bSAlex Elder 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
2334499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
2335e2839308SAlex Elder 	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
2336e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
2337b7f23c36SAlex Elder }
2338b7f23c36SAlex Elder 
23391ddbe94eSAlex Elder /*
2340499afd5bSAlex Elder  * Remove an rbd_dev from the global list, and record that its
2341499afd5bSAlex Elder  * identifier is no longer in use.
23421ddbe94eSAlex Elder  */
2343e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev)
23441ddbe94eSAlex Elder {
2345d184f6bfSAlex Elder 	struct list_head *tmp;
2346de71a297SAlex Elder 	int rbd_id = rbd_dev->dev_id;
2347d184f6bfSAlex Elder 	int max_id;
2348d184f6bfSAlex Elder 
2349aafb230eSAlex Elder 	rbd_assert(rbd_id > 0);
2350499afd5bSAlex Elder 
2351e2839308SAlex Elder 	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
2352e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
2353499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
2354499afd5bSAlex Elder 	list_del_init(&rbd_dev->node);
2355d184f6bfSAlex Elder 
2356d184f6bfSAlex Elder 	/*
2357d184f6bfSAlex Elder 	 * If the id being "put" is not the current maximum, there
2358d184f6bfSAlex Elder 	 * is nothing special we need to do.
2359d184f6bfSAlex Elder 	 */
2360e2839308SAlex Elder 	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
2361d184f6bfSAlex Elder 		spin_unlock(&rbd_dev_list_lock);
2362d184f6bfSAlex Elder 		return;
2363d184f6bfSAlex Elder 	}
2364d184f6bfSAlex Elder 
2365d184f6bfSAlex Elder 	/*
2366d184f6bfSAlex Elder 	 * We need to update the current maximum id.  Search the
2367d184f6bfSAlex Elder 	 * list to find out what it is.  We're more likely to find
2368d184f6bfSAlex Elder 	 * the maximum at the end, so search the list backward.
2369d184f6bfSAlex Elder 	 */
2370d184f6bfSAlex Elder 	max_id = 0;
2371d184f6bfSAlex Elder 	list_for_each_prev(tmp, &rbd_dev_list) {
2372d184f6bfSAlex Elder 		struct rbd_device *rbd_dev;
2373d184f6bfSAlex Elder 
2374d184f6bfSAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
2375d184f6bfSAlex Elder 		if (rbd_id > max_id)
2376d184f6bfSAlex Elder 			max_id = rbd_id;
2377d184f6bfSAlex Elder 	}
2378499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
23791ddbe94eSAlex Elder 
23801ddbe94eSAlex Elder 	/*
2381e2839308SAlex Elder 	 * The max id could have been updated by rbd_dev_id_get(), in
2382d184f6bfSAlex Elder 	 * which case it now accurately reflects the new maximum.
2383d184f6bfSAlex Elder 	 * Be careful not to overwrite the maximum value in that
2384d184f6bfSAlex Elder 	 * case.
23851ddbe94eSAlex Elder 	 */
2386e2839308SAlex Elder 	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
2387e2839308SAlex Elder 	dout("  max dev id has been reset\n");
2388b7f23c36SAlex Elder }
2389b7f23c36SAlex Elder 
/*
 * Advance *buf past any leading white space so that it points at the
 * first non-space character (or at the terminating '\0' if there is
 * none), and return the length of the token (run of non-white-space
 * characters) that starts there.  The string at *buf must be
 * terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the "C"
	 * and "POSIX" locales.
	 */
	static const char whitespace[] = " \f\n\r\t\v";
	const char *start = *buf + strspn(*buf, whitespace);

	*buf = start;			/* Start of token */

	return strcspn(start, whitespace);	/* Token length */
}
2408e28fff26SAlex Elder 
/*
 * Locate the next token in *buf and, provided it fits (with its
 * terminating '\0') in the token buffer of token_size bytes, copy it
 * there.  A copied token is always '\0'-terminated.  The string at
 * *buf must be terminated with '\0' on entry.
 *
 * Returns the length of the token found, not counting the '\0'.
 * The result is 0 when there is no token, and >= token_size when the
 * token was too long to be copied.
 *
 * On return *buf points just past the token, whether or not the
 * token was copied.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);
	const char *start = *buf;

	*buf = start + len;
	if (len >= token_size)
		return len;		/* Too big; nothing copied */

	memcpy(token, start, len);
	token[len] = '\0';

	return len;
}
2438e28fff26SAlex Elder 
2439e28fff26SAlex Elder /*
2440ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
2441ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
2442ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
2443ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
2444ea3352f4SAlex Elder  *
2445ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
2446ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
2447ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
2448ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
2449ea3352f4SAlex Elder  *
2450ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
2451ea3352f4SAlex Elder  * the end of the found token.
2452ea3352f4SAlex Elder  *
2453ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
2454ea3352f4SAlex Elder  */
2455ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
2456ea3352f4SAlex Elder {
2457ea3352f4SAlex Elder 	char *dup;
2458ea3352f4SAlex Elder 	size_t len;
2459ea3352f4SAlex Elder 
2460ea3352f4SAlex Elder 	len = next_token(buf);
2461ea3352f4SAlex Elder 	dup = kmalloc(len + 1, GFP_KERNEL);
2462ea3352f4SAlex Elder 	if (!dup)
2463ea3352f4SAlex Elder 		return NULL;
2464ea3352f4SAlex Elder 
2465ea3352f4SAlex Elder 	memcpy(dup, *buf, len);
2466ea3352f4SAlex Elder 	*(dup + len) = '\0';
2467ea3352f4SAlex Elder 	*buf += len;
2468ea3352f4SAlex Elder 
2469ea3352f4SAlex Elder 	if (lenp)
2470ea3352f4SAlex Elder 		*lenp = len;
2471ea3352f4SAlex Elder 
2472ea3352f4SAlex Elder 	return dup;
2473ea3352f4SAlex Elder }
2474ea3352f4SAlex Elder 
2475ea3352f4SAlex Elder /*
24760bed54dcSAlex Elder  * This fills in the pool_name, image_name, image_name_len, snap_name,
2477a725f65eSAlex Elder  * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based
2478a725f65eSAlex Elder  * on the list of monitor addresses and other options provided via
2479a725f65eSAlex Elder  * /sys/bus/rbd/add.
2480d22f76e7SAlex Elder  *
2481d22f76e7SAlex Elder  * Note: rbd_dev is assumed to have been initially zero-filled.
2482a725f65eSAlex Elder  */
2483a725f65eSAlex Elder static int rbd_add_parse_args(struct rbd_device *rbd_dev,
2484a725f65eSAlex Elder 			      const char *buf,
24857ef3214aSAlex Elder 			      const char **mon_addrs,
24865214ecc4SAlex Elder 			      size_t *mon_addrs_size,
2487e28fff26SAlex Elder 			      char *options,
2488e28fff26SAlex Elder 			     size_t options_size)
2489a725f65eSAlex Elder {
2490e28fff26SAlex Elder 	size_t len;
2491d22f76e7SAlex Elder 	int ret;
2492e28fff26SAlex Elder 
2493e28fff26SAlex Elder 	/* The first four tokens are required */
2494e28fff26SAlex Elder 
24957ef3214aSAlex Elder 	len = next_token(&buf);
24967ef3214aSAlex Elder 	if (!len)
2497a725f65eSAlex Elder 		return -EINVAL;
24985214ecc4SAlex Elder 	*mon_addrs_size = len + 1;
24997ef3214aSAlex Elder 	*mon_addrs = buf;
25007ef3214aSAlex Elder 
25017ef3214aSAlex Elder 	buf += len;
2502a725f65eSAlex Elder 
2503e28fff26SAlex Elder 	len = copy_token(&buf, options, options_size);
2504e28fff26SAlex Elder 	if (!len || len >= options_size)
2505e28fff26SAlex Elder 		return -EINVAL;
2506a725f65eSAlex Elder 
2507bf3e5ae1SAlex Elder 	ret = -ENOMEM;
2508d22f76e7SAlex Elder 	rbd_dev->pool_name = dup_token(&buf, NULL);
2509d22f76e7SAlex Elder 	if (!rbd_dev->pool_name)
2510d22f76e7SAlex Elder 		goto out_err;
2511e28fff26SAlex Elder 
25120bed54dcSAlex Elder 	rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
25130bed54dcSAlex Elder 	if (!rbd_dev->image_name)
2514bf3e5ae1SAlex Elder 		goto out_err;
2515e28fff26SAlex Elder 
2516cb8627c7SAlex Elder 	/* Create the name of the header object */
2517cb8627c7SAlex Elder 
25180bed54dcSAlex Elder 	rbd_dev->header_name = kmalloc(rbd_dev->image_name_len
2519bf3e5ae1SAlex Elder 						+ sizeof (RBD_SUFFIX),
2520bf3e5ae1SAlex Elder 					GFP_KERNEL);
25210bed54dcSAlex Elder 	if (!rbd_dev->header_name)
2522cb8627c7SAlex Elder 		goto out_err;
25230bed54dcSAlex Elder 	sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);
2524a725f65eSAlex Elder 
2525e28fff26SAlex Elder 	/*
2526820a5f3eSAlex Elder 	 * The snapshot name is optional.  If none is is supplied,
2527820a5f3eSAlex Elder 	 * we use the default value.
2528e28fff26SAlex Elder 	 */
2529820a5f3eSAlex Elder 	rbd_dev->snap_name = dup_token(&buf, &len);
2530820a5f3eSAlex Elder 	if (!rbd_dev->snap_name)
2531820a5f3eSAlex Elder 		goto out_err;
2532820a5f3eSAlex Elder 	if (!len) {
2533820a5f3eSAlex Elder 		/* Replace the empty name with the default */
2534820a5f3eSAlex Elder 		kfree(rbd_dev->snap_name);
2535820a5f3eSAlex Elder 		rbd_dev->snap_name
2536820a5f3eSAlex Elder 			= kmalloc(sizeof (RBD_SNAP_HEAD_NAME), GFP_KERNEL);
2537820a5f3eSAlex Elder 		if (!rbd_dev->snap_name)
2538820a5f3eSAlex Elder 			goto out_err;
2539820a5f3eSAlex Elder 
2540e28fff26SAlex Elder 		memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
2541e28fff26SAlex Elder 			sizeof (RBD_SNAP_HEAD_NAME));
2542849b4260SAlex Elder 	}
2543e28fff26SAlex Elder 
2544a725f65eSAlex Elder 	return 0;
2545d22f76e7SAlex Elder 
2546d22f76e7SAlex Elder out_err:
25470bed54dcSAlex Elder 	kfree(rbd_dev->header_name);
2548d78fd7aeSAlex Elder 	rbd_dev->header_name = NULL;
25490bed54dcSAlex Elder 	kfree(rbd_dev->image_name);
2550d78fd7aeSAlex Elder 	rbd_dev->image_name = NULL;
2551d78fd7aeSAlex Elder 	rbd_dev->image_name_len = 0;
2552d22f76e7SAlex Elder 	kfree(rbd_dev->pool_name);
2553d22f76e7SAlex Elder 	rbd_dev->pool_name = NULL;
2554d22f76e7SAlex Elder 
2555d22f76e7SAlex Elder 	return ret;
2556a725f65eSAlex Elder }
2557a725f65eSAlex Elder 
255859c2be1eSYehuda Sadeh static ssize_t rbd_add(struct bus_type *bus,
255959c2be1eSYehuda Sadeh 		       const char *buf,
256059c2be1eSYehuda Sadeh 		       size_t count)
2561602adf40SYehuda Sadeh {
2562cb8627c7SAlex Elder 	char *options;
2563cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
25647ef3214aSAlex Elder 	const char *mon_addrs = NULL;
25657ef3214aSAlex Elder 	size_t mon_addrs_size = 0;
256627cc2594SAlex Elder 	struct ceph_osd_client *osdc;
256727cc2594SAlex Elder 	int rc = -ENOMEM;
2568602adf40SYehuda Sadeh 
2569602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
2570602adf40SYehuda Sadeh 		return -ENODEV;
2571602adf40SYehuda Sadeh 
257227cc2594SAlex Elder 	options = kmalloc(count, GFP_KERNEL);
257327cc2594SAlex Elder 	if (!options)
257427cc2594SAlex Elder 		goto err_nomem;
2575cb8627c7SAlex Elder 	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2576cb8627c7SAlex Elder 	if (!rbd_dev)
2577cb8627c7SAlex Elder 		goto err_nomem;
2578602adf40SYehuda Sadeh 
2579602adf40SYehuda Sadeh 	/* static rbd_device initialization */
2580602adf40SYehuda Sadeh 	spin_lock_init(&rbd_dev->lock);
2581602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbd_dev->node);
2582dfc5606dSYehuda Sadeh 	INIT_LIST_HEAD(&rbd_dev->snaps);
2583c666601aSJosh Durgin 	init_rwsem(&rbd_dev->header_rwsem);
2584602adf40SYehuda Sadeh 
2585d184f6bfSAlex Elder 	/* generate unique id: find highest unique id, add one */
2586e2839308SAlex Elder 	rbd_dev_id_get(rbd_dev);
2587602adf40SYehuda Sadeh 
2588a725f65eSAlex Elder 	/* Fill in the device name, now that we have its id. */
258981a89793SAlex Elder 	BUILD_BUG_ON(DEV_NAME_LEN
259081a89793SAlex Elder 			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
2591de71a297SAlex Elder 	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
2592e124a82fSAlex Elder 
2593a725f65eSAlex Elder 	/* parse add command */
25947ef3214aSAlex Elder 	rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size,
2595e28fff26SAlex Elder 				options, count);
2596a725f65eSAlex Elder 	if (rc)
2597a725f65eSAlex Elder 		goto err_put_id;
2598a725f65eSAlex Elder 
2599f8c38929SAlex Elder 	rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options);
2600f8c38929SAlex Elder 	if (rc < 0)
2601f0f8cef5SAlex Elder 		goto err_put_id;
2602602adf40SYehuda Sadeh 
2603602adf40SYehuda Sadeh 	/* pick the pool */
26041dbb4399SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
2605602adf40SYehuda Sadeh 	rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
2606602adf40SYehuda Sadeh 	if (rc < 0)
2607602adf40SYehuda Sadeh 		goto err_out_client;
26089bb2f334SAlex Elder 	rbd_dev->pool_id = rc;
2609602adf40SYehuda Sadeh 
2610602adf40SYehuda Sadeh 	/* register our block device */
261127cc2594SAlex Elder 	rc = register_blkdev(0, rbd_dev->name);
261227cc2594SAlex Elder 	if (rc < 0)
2613602adf40SYehuda Sadeh 		goto err_out_client;
261427cc2594SAlex Elder 	rbd_dev->major = rc;
2615602adf40SYehuda Sadeh 
2616dfc5606dSYehuda Sadeh 	rc = rbd_bus_add_dev(rbd_dev);
2617dfc5606dSYehuda Sadeh 	if (rc)
2618766fc439SYehuda Sadeh 		goto err_out_blkdev;
2619766fc439SYehuda Sadeh 
262032eec68dSAlex Elder 	/*
262132eec68dSAlex Elder 	 * At this point cleanup in the event of an error is the job
262232eec68dSAlex Elder 	 * of the sysfs code (initiated by rbd_bus_del_dev()).
262332eec68dSAlex Elder 	 *
262432eec68dSAlex Elder 	 * Set up and announce blkdev mapping.
262532eec68dSAlex Elder 	 */
2626602adf40SYehuda Sadeh 	rc = rbd_init_disk(rbd_dev);
2627602adf40SYehuda Sadeh 	if (rc)
2628766fc439SYehuda Sadeh 		goto err_out_bus;
2629602adf40SYehuda Sadeh 
263059c2be1eSYehuda Sadeh 	rc = rbd_init_watch_dev(rbd_dev);
263159c2be1eSYehuda Sadeh 	if (rc)
263259c2be1eSYehuda Sadeh 		goto err_out_bus;
263359c2be1eSYehuda Sadeh 
2634602adf40SYehuda Sadeh 	return count;
2635602adf40SYehuda Sadeh 
2636766fc439SYehuda Sadeh err_out_bus:
2637766fc439SYehuda Sadeh 	/* this will also clean up rest of rbd_dev stuff */
2638766fc439SYehuda Sadeh 
2639766fc439SYehuda Sadeh 	rbd_bus_del_dev(rbd_dev);
2640766fc439SYehuda Sadeh 	kfree(options);
2641766fc439SYehuda Sadeh 	return rc;
2642766fc439SYehuda Sadeh 
2643602adf40SYehuda Sadeh err_out_blkdev:
2644602adf40SYehuda Sadeh 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
2645602adf40SYehuda Sadeh err_out_client:
2646602adf40SYehuda Sadeh 	rbd_put_client(rbd_dev);
2647f0f8cef5SAlex Elder err_put_id:
2648cb8627c7SAlex Elder 	if (rbd_dev->pool_name) {
2649820a5f3eSAlex Elder 		kfree(rbd_dev->snap_name);
26500bed54dcSAlex Elder 		kfree(rbd_dev->header_name);
26510bed54dcSAlex Elder 		kfree(rbd_dev->image_name);
2652d22f76e7SAlex Elder 		kfree(rbd_dev->pool_name);
2653cb8627c7SAlex Elder 	}
2654e2839308SAlex Elder 	rbd_dev_id_put(rbd_dev);
265527cc2594SAlex Elder err_nomem:
265627cc2594SAlex Elder 	kfree(rbd_dev);
2657cb8627c7SAlex Elder 	kfree(options);
265827cc2594SAlex Elder 
2659602adf40SYehuda Sadeh 	dout("Error adding device %s\n", buf);
2660602adf40SYehuda Sadeh 	module_put(THIS_MODULE);
266127cc2594SAlex Elder 
266227cc2594SAlex Elder 	return (ssize_t) rc;
2663602adf40SYehuda Sadeh }
2664602adf40SYehuda Sadeh 
2665de71a297SAlex Elder static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
2666602adf40SYehuda Sadeh {
2667602adf40SYehuda Sadeh 	struct list_head *tmp;
2668602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev;
2669602adf40SYehuda Sadeh 
2670e124a82fSAlex Elder 	spin_lock(&rbd_dev_list_lock);
2671602adf40SYehuda Sadeh 	list_for_each(tmp, &rbd_dev_list) {
2672602adf40SYehuda Sadeh 		rbd_dev = list_entry(tmp, struct rbd_device, node);
2673de71a297SAlex Elder 		if (rbd_dev->dev_id == dev_id) {
2674e124a82fSAlex Elder 			spin_unlock(&rbd_dev_list_lock);
2675602adf40SYehuda Sadeh 			return rbd_dev;
2676602adf40SYehuda Sadeh 		}
2677e124a82fSAlex Elder 	}
2678e124a82fSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
2679602adf40SYehuda Sadeh 	return NULL;
2680602adf40SYehuda Sadeh }
2681602adf40SYehuda Sadeh 
2682dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev)
2683602adf40SYehuda Sadeh {
2684593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2685602adf40SYehuda Sadeh 
26861dbb4399SAlex Elder 	if (rbd_dev->watch_request) {
26871dbb4399SAlex Elder 		struct ceph_client *client = rbd_dev->rbd_client->client;
26881dbb4399SAlex Elder 
26891dbb4399SAlex Elder 		ceph_osdc_unregister_linger_request(&client->osdc,
269059c2be1eSYehuda Sadeh 						    rbd_dev->watch_request);
26911dbb4399SAlex Elder 	}
269259c2be1eSYehuda Sadeh 	if (rbd_dev->watch_event)
2693070c633fSAlex Elder 		rbd_req_sync_unwatch(rbd_dev);
269459c2be1eSYehuda Sadeh 
2695602adf40SYehuda Sadeh 	rbd_put_client(rbd_dev);
2696602adf40SYehuda Sadeh 
2697602adf40SYehuda Sadeh 	/* clean up and free blkdev */
2698602adf40SYehuda Sadeh 	rbd_free_disk(rbd_dev);
2699602adf40SYehuda Sadeh 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
270032eec68dSAlex Elder 
270132eec68dSAlex Elder 	/* done with the id, and with the rbd_dev */
2702820a5f3eSAlex Elder 	kfree(rbd_dev->snap_name);
27030bed54dcSAlex Elder 	kfree(rbd_dev->header_name);
2704d22f76e7SAlex Elder 	kfree(rbd_dev->pool_name);
27050bed54dcSAlex Elder 	kfree(rbd_dev->image_name);
2706e2839308SAlex Elder 	rbd_dev_id_put(rbd_dev);
2707602adf40SYehuda Sadeh 	kfree(rbd_dev);
2708602adf40SYehuda Sadeh 
2709602adf40SYehuda Sadeh 	/* release module ref */
2710602adf40SYehuda Sadeh 	module_put(THIS_MODULE);
2711602adf40SYehuda Sadeh }
2712602adf40SYehuda Sadeh 
2713dfc5606dSYehuda Sadeh static ssize_t rbd_remove(struct bus_type *bus,
2714602adf40SYehuda Sadeh 			  const char *buf,
2715602adf40SYehuda Sadeh 			  size_t count)
2716602adf40SYehuda Sadeh {
2717602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
2718602adf40SYehuda Sadeh 	int target_id, rc;
2719602adf40SYehuda Sadeh 	unsigned long ul;
2720602adf40SYehuda Sadeh 	int ret = count;
2721602adf40SYehuda Sadeh 
2722602adf40SYehuda Sadeh 	rc = strict_strtoul(buf, 10, &ul);
2723602adf40SYehuda Sadeh 	if (rc)
2724602adf40SYehuda Sadeh 		return rc;
2725602adf40SYehuda Sadeh 
2726602adf40SYehuda Sadeh 	/* convert to int; abort if we lost anything in the conversion */
2727602adf40SYehuda Sadeh 	target_id = (int) ul;
2728602adf40SYehuda Sadeh 	if (target_id != ul)
2729602adf40SYehuda Sadeh 		return -EINVAL;
2730602adf40SYehuda Sadeh 
2731602adf40SYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2732602adf40SYehuda Sadeh 
2733602adf40SYehuda Sadeh 	rbd_dev = __rbd_get_dev(target_id);
2734602adf40SYehuda Sadeh 	if (!rbd_dev) {
2735602adf40SYehuda Sadeh 		ret = -ENOENT;
2736602adf40SYehuda Sadeh 		goto done;
2737602adf40SYehuda Sadeh 	}
2738602adf40SYehuda Sadeh 
2739dfc5606dSYehuda Sadeh 	__rbd_remove_all_snaps(rbd_dev);
2740dfc5606dSYehuda Sadeh 	rbd_bus_del_dev(rbd_dev);
2741602adf40SYehuda Sadeh 
2742602adf40SYehuda Sadeh done:
2743602adf40SYehuda Sadeh 	mutex_unlock(&ctl_mutex);
2744aafb230eSAlex Elder 
2745602adf40SYehuda Sadeh 	return ret;
2746602adf40SYehuda Sadeh }
2747602adf40SYehuda Sadeh 
2748dfc5606dSYehuda Sadeh static ssize_t rbd_snap_add(struct device *dev,
2749dfc5606dSYehuda Sadeh 			    struct device_attribute *attr,
2750602adf40SYehuda Sadeh 			    const char *buf,
2751602adf40SYehuda Sadeh 			    size_t count)
2752602adf40SYehuda Sadeh {
2753593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2754dfc5606dSYehuda Sadeh 	int ret;
2755dfc5606dSYehuda Sadeh 	char *name = kmalloc(count + 1, GFP_KERNEL);
2756602adf40SYehuda Sadeh 	if (!name)
2757602adf40SYehuda Sadeh 		return -ENOMEM;
2758602adf40SYehuda Sadeh 
2759dfc5606dSYehuda Sadeh 	snprintf(name, count, "%s", buf);
2760602adf40SYehuda Sadeh 
2761602adf40SYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2762602adf40SYehuda Sadeh 
2763602adf40SYehuda Sadeh 	ret = rbd_header_add_snap(rbd_dev,
2764602adf40SYehuda Sadeh 				  name, GFP_KERNEL);
2765602adf40SYehuda Sadeh 	if (ret < 0)
276659c2be1eSYehuda Sadeh 		goto err_unlock;
2767602adf40SYehuda Sadeh 
2768b813623aSAlex Elder 	ret = __rbd_refresh_header(rbd_dev, NULL);
2769602adf40SYehuda Sadeh 	if (ret < 0)
277059c2be1eSYehuda Sadeh 		goto err_unlock;
277159c2be1eSYehuda Sadeh 
277259c2be1eSYehuda Sadeh 	/* shouldn't hold ctl_mutex when notifying.. notify might
277359c2be1eSYehuda Sadeh 	   trigger a watch callback that would need to get that mutex */
277459c2be1eSYehuda Sadeh 	mutex_unlock(&ctl_mutex);
277559c2be1eSYehuda Sadeh 
277659c2be1eSYehuda Sadeh 	/* make a best effort, don't error if failed */
27774cb16250SAlex Elder 	rbd_req_sync_notify(rbd_dev);
2778602adf40SYehuda Sadeh 
2779602adf40SYehuda Sadeh 	ret = count;
278059c2be1eSYehuda Sadeh 	kfree(name);
278159c2be1eSYehuda Sadeh 	return ret;
278259c2be1eSYehuda Sadeh 
278359c2be1eSYehuda Sadeh err_unlock:
2784602adf40SYehuda Sadeh 	mutex_unlock(&ctl_mutex);
2785602adf40SYehuda Sadeh 	kfree(name);
2786602adf40SYehuda Sadeh 	return ret;
2787602adf40SYehuda Sadeh }
2788602adf40SYehuda Sadeh 
2789602adf40SYehuda Sadeh /*
2790602adf40SYehuda Sadeh  * create control files in sysfs
2791dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
2792602adf40SYehuda Sadeh  */
2793602adf40SYehuda Sadeh static int rbd_sysfs_init(void)
2794602adf40SYehuda Sadeh {
2795dfc5606dSYehuda Sadeh 	int ret;
2796602adf40SYehuda Sadeh 
2797fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
2798dfc5606dSYehuda Sadeh 	if (ret < 0)
2799dfc5606dSYehuda Sadeh 		return ret;
2800602adf40SYehuda Sadeh 
2801fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
2802fed4c143SAlex Elder 	if (ret < 0)
2803fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
2804602adf40SYehuda Sadeh 
2805602adf40SYehuda Sadeh 	return ret;
2806602adf40SYehuda Sadeh }
2807602adf40SYehuda Sadeh 
2808602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void)
2809602adf40SYehuda Sadeh {
2810dfc5606dSYehuda Sadeh 	bus_unregister(&rbd_bus_type);
2811fed4c143SAlex Elder 	device_unregister(&rbd_root_dev);
2812602adf40SYehuda Sadeh }
2813602adf40SYehuda Sadeh 
2814602adf40SYehuda Sadeh int __init rbd_init(void)
2815602adf40SYehuda Sadeh {
2816602adf40SYehuda Sadeh 	int rc;
2817602adf40SYehuda Sadeh 
2818602adf40SYehuda Sadeh 	rc = rbd_sysfs_init();
2819602adf40SYehuda Sadeh 	if (rc)
2820602adf40SYehuda Sadeh 		return rc;
2821f0f8cef5SAlex Elder 	pr_info("loaded " RBD_DRV_NAME_LONG "\n");
2822602adf40SYehuda Sadeh 	return 0;
2823602adf40SYehuda Sadeh }
2824602adf40SYehuda Sadeh 
2825602adf40SYehuda Sadeh void __exit rbd_exit(void)
2826602adf40SYehuda Sadeh {
2827602adf40SYehuda Sadeh 	rbd_sysfs_cleanup();
2828602adf40SYehuda Sadeh }
2829602adf40SYehuda Sadeh 
2830602adf40SYehuda Sadeh module_init(rbd_init);
2831602adf40SYehuda Sadeh module_exit(rbd_exit);
2832602adf40SYehuda Sadeh 
2833602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
2834602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
2835602adf40SYehuda Sadeh MODULE_DESCRIPTION("rados block device");
2836602adf40SYehuda Sadeh 
2837602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */
2838602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
2839602adf40SYehuda Sadeh 
2840602adf40SYehuda Sadeh MODULE_LICENSE("GPL");
2841