xref: /openbmc/linux/drivers/block/rbd.c (revision aded07ea)
1602adf40SYehuda Sadeh /*
2602adf40SYehuda Sadeh    rbd.c -- Export ceph rados objects as a Linux block device
3602adf40SYehuda Sadeh 
4602adf40SYehuda Sadeh 
5602adf40SYehuda Sadeh    based on drivers/block/osdblk.c:
6602adf40SYehuda Sadeh 
7602adf40SYehuda Sadeh    Copyright 2009 Red Hat, Inc.
8602adf40SYehuda Sadeh 
9602adf40SYehuda Sadeh    This program is free software; you can redistribute it and/or modify
10602adf40SYehuda Sadeh    it under the terms of the GNU General Public License as published by
11602adf40SYehuda Sadeh    the Free Software Foundation.
12602adf40SYehuda Sadeh 
13602adf40SYehuda Sadeh    This program is distributed in the hope that it will be useful,
14602adf40SYehuda Sadeh    but WITHOUT ANY WARRANTY; without even the implied warranty of
15602adf40SYehuda Sadeh    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16602adf40SYehuda Sadeh    GNU General Public License for more details.
17602adf40SYehuda Sadeh 
18602adf40SYehuda Sadeh    You should have received a copy of the GNU General Public License
19602adf40SYehuda Sadeh    along with this program; see the file COPYING.  If not, write to
20602adf40SYehuda Sadeh    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21602adf40SYehuda Sadeh 
22602adf40SYehuda Sadeh 
23602adf40SYehuda Sadeh 
24dfc5606dSYehuda Sadeh    For usage instructions, please refer to:
25602adf40SYehuda Sadeh 
26dfc5606dSYehuda Sadeh                  Documentation/ABI/testing/sysfs-bus-rbd
27602adf40SYehuda Sadeh 
28602adf40SYehuda Sadeh  */
29602adf40SYehuda Sadeh 
30602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
31602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
32602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3459c2be1eSYehuda Sadeh #include <linux/parser.h>
35602adf40SYehuda Sadeh 
36602adf40SYehuda Sadeh #include <linux/kernel.h>
37602adf40SYehuda Sadeh #include <linux/device.h>
38602adf40SYehuda Sadeh #include <linux/module.h>
39602adf40SYehuda Sadeh #include <linux/fs.h>
40602adf40SYehuda Sadeh #include <linux/blkdev.h>
41602adf40SYehuda Sadeh 
42602adf40SYehuda Sadeh #include "rbd_types.h"
43602adf40SYehuda Sadeh 
44593a9e7bSAlex Elder /*
45593a9e7bSAlex Elder  * The basic unit of block I/O is a sector.  It is interpreted in a
46593a9e7bSAlex Elder  * number of contexts in Linux (blk, bio, genhd), but the default is
47593a9e7bSAlex Elder  * universally 512 bytes.  These symbols are just slightly more
48593a9e7bSAlex Elder  * meaningful than the bare numbers they represent.
49593a9e7bSAlex Elder  */
50593a9e7bSAlex Elder #define	SECTOR_SHIFT	9
51593a9e7bSAlex Elder #define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)
52593a9e7bSAlex Elder 
/* Short and long driver names. */
#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

/* NOTE(review): presumably size limits for snapshot names and option strings */
#define RBD_MAX_SNAP_NAME_LEN	32
#define RBD_MAX_OPT_LEN		1024

/*
 * Snapshot name that selects the image head rather than a snapshot
 * (see rbd_header_set_snap()).
 */
#define RBD_SNAP_HEAD_NAME	"-"

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)

/* Initial value of rbd_options.notify_timeout when the option is not given */
#define RBD_NOTIFY_TIMEOUT_DEFAULT 10
7359c2be1eSYehuda Sadeh 
/*
 * block device image metadata (in-memory version)
 *
 * Populated from the on-disk header by rbd_header_from_disk() and
 * released by rbd_header_free().
 */
struct rbd_image_header {
	u64 image_size;		/* on-disk image_size, CPU byte order */
	char *object_prefix;	/* NUL-terminated copy of on-disk block_name */
	__u8 obj_order;		/* an object holds (1 << obj_order) bytes */
	__u8 crypt_type;
	__u8 comp_type;
	struct ceph_snap_context *snapc;	/* has room for one id per snapshot */
	size_t snap_names_len;	/* size in bytes of the snap_names buffer */
	u64 snap_seq;
	u32 total_snaps;	/* snapshot count read from the on-disk header */

	char *snap_names;	/* consecutive NUL-terminated names; NULL if none */
	u64 *snap_sizes;	/* image size per snapshot, same order as snapc->snaps[] */

	u64 obj_version;
};
9359c2be1eSYehuda Sadeh 
/*
 * rbd-specific options parsed by parse_rbd_opts_token().
 */
struct rbd_options {
	int	notify_timeout;	/* "notify_timeout=%d"; defaults to RBD_NOTIFY_TIMEOUT_DEFAULT */
};
97602adf40SYehuda Sadeh 
/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;	/* destroyed in rbd_client_release() */
	struct rbd_options	*rbd_opts;	/* freed in rbd_client_release() */
	struct kref		kref;		/* last put calls rbd_client_release() */
	struct list_head	node;		/* entry in rbd_client_list */
};
107602adf40SYehuda Sadeh 
/*
 * a request completion status
 *
 * One of these is stored per request in rbd_req_coll.status[].
 */
struct rbd_req_status {
	int done;	/* NOTE(review): presumably non-zero once the request finished */
	int rc;		/* NOTE(review): presumably the request's result code */
	u64 bytes;	/* NOTE(review): presumably the byte count of the request */
};
1161fec7093SYehuda Sadeh 
1171fec7093SYehuda Sadeh /*
1181fec7093SYehuda Sadeh  * a collection of requests
1191fec7093SYehuda Sadeh  */
1201fec7093SYehuda Sadeh struct rbd_req_coll {
1211fec7093SYehuda Sadeh 	int			total;
1221fec7093SYehuda Sadeh 	int			num_done;
1231fec7093SYehuda Sadeh 	struct kref		kref;
1241fec7093SYehuda Sadeh 	struct rbd_req_status	status[0];
125602adf40SYehuda Sadeh };
126602adf40SYehuda Sadeh 
/*
 * a single io request
 */
struct rbd_request {
	struct request		*rq;		/* blk layer request */
	struct bio		*bio;		/* cloned bio */
	struct page		**pages;	/* list of used pages */
	u64			len;
	int			coll_index;	/* NOTE(review): presumably this request's slot in coll->status[] */
	struct rbd_req_coll	*coll;		/* collection this request belongs to, if any */
};
138f0f8cef5SAlex Elder 
/*
 * a single snapshot of an rbd image, with its own struct device
 */
struct rbd_snap {
	struct	device		dev;
	const char		*name;
	u64			size;
	struct list_head	node;	/* NOTE(review): presumably an entry in rbd_device.snaps */
	u64			id;
};
146dfc5606dSYehuda Sadeh 
/*
 * a single device
 */
struct rbd_device {
	int			id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	struct gendisk		*disk;		/* blkdev's gendisk and rq */
	struct request_queue	*q;

	struct rbd_client	*rbd_client;	/* reference dropped by rbd_put_client() */

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue lock */

	struct rbd_image_header	header;
	char			*image_name;
	size_t			image_name_len;
	char			*header_name;
	char			*pool_name;
	int			pool_id;

	struct ceph_osd_event   *watch_event;
	struct ceph_osd_request *watch_request;

	/* protects updating the header */
	struct rw_semaphore     header_rwsem;
	char                    *snap_name;	/* RBD_SNAP_HEAD_NAME selects the image head */
	u64                     snap_id;	/* current snapshot id */
	int read_only;				/* set to 1 when a snapshot is selected, 0 for the head */

	struct list_head	node;		/* NOTE(review): presumably an entry in rbd_dev_list */

	/* list of snapshots */
	struct list_head	snaps;

	/* sysfs related */
	struct device		dev;		/* refcounted via rbd_get_dev()/rbd_put_dev() */
};
187dfc5606dSYehuda Sadeh 
static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */

static LIST_HEAD(rbd_dev_list);    /* devices */
/* NOTE(review): presumably protects rbd_dev_list -- confirm against users */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);		/* clients */
/* held around every lookup, insert and removal on rbd_client_list */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Forward declarations for functions defined later in this file. */
static int __rbd_init_snaps_header(struct rbd_device *rbd_dev);
static void rbd_dev_release(struct device *dev);
static ssize_t rbd_snap_add(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf,
			    size_t count);
static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
				  struct rbd_snap *snap);

/* Store handlers for the bus "add" and "remove" attributes below. */
static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);
209f0f8cef5SAlex Elder 
/*
 * Bus attributes "add" and "remove".  Both are writable by the owner
 * only (S_IWUSR) and have no show method.
 */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

/* The "rbd" bus, carrying the attributes above. */
static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};
220f0f8cef5SAlex Elder 
/*
 * Release callback for rbd_root_dev.  Intentionally empty: the device
 * is a static object, so there is nothing to free.
 */
static void rbd_root_dev_release(struct device *dev)
{
}

/* Statically allocated device named "rbd". */
static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};
229f0f8cef5SAlex Elder 
230dfc5606dSYehuda Sadeh 
231dfc5606dSYehuda Sadeh static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
232dfc5606dSYehuda Sadeh {
233dfc5606dSYehuda Sadeh 	return get_device(&rbd_dev->dev);
234dfc5606dSYehuda Sadeh }
235dfc5606dSYehuda Sadeh 
236dfc5606dSYehuda Sadeh static void rbd_put_dev(struct rbd_device *rbd_dev)
237dfc5606dSYehuda Sadeh {
238dfc5606dSYehuda Sadeh 	put_device(&rbd_dev->dev);
239dfc5606dSYehuda Sadeh }
240602adf40SYehuda Sadeh 
241263c6ca0SJosh Durgin static int __rbd_refresh_header(struct rbd_device *rbd_dev);
24259c2be1eSYehuda Sadeh 
243602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
244602adf40SYehuda Sadeh {
245f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
246602adf40SYehuda Sadeh 
247dfc5606dSYehuda Sadeh 	rbd_get_dev(rbd_dev);
248dfc5606dSYehuda Sadeh 
249602adf40SYehuda Sadeh 	set_device_ro(bdev, rbd_dev->read_only);
250602adf40SYehuda Sadeh 
251602adf40SYehuda Sadeh 	if ((mode & FMODE_WRITE) && rbd_dev->read_only)
252602adf40SYehuda Sadeh 		return -EROFS;
253602adf40SYehuda Sadeh 
254602adf40SYehuda Sadeh 	return 0;
255602adf40SYehuda Sadeh }
256602adf40SYehuda Sadeh 
257dfc5606dSYehuda Sadeh static int rbd_release(struct gendisk *disk, fmode_t mode)
258dfc5606dSYehuda Sadeh {
259dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
260dfc5606dSYehuda Sadeh 
261dfc5606dSYehuda Sadeh 	rbd_put_dev(rbd_dev);
262dfc5606dSYehuda Sadeh 
263dfc5606dSYehuda Sadeh 	return 0;
264dfc5606dSYehuda Sadeh }
265dfc5606dSYehuda Sadeh 
/*
 * Block device operations: only open and release are provided, which
 * pair up to hold a device reference while the block device is open.
 */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
271602adf40SYehuda Sadeh 
272602adf40SYehuda Sadeh /*
273602adf40SYehuda Sadeh  * Initialize an rbd client instance.
274602adf40SYehuda Sadeh  * We own *opt.
275602adf40SYehuda Sadeh  */
27659c2be1eSYehuda Sadeh static struct rbd_client *rbd_client_create(struct ceph_options *opt,
27759c2be1eSYehuda Sadeh 					    struct rbd_options *rbd_opts)
278602adf40SYehuda Sadeh {
279602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
280602adf40SYehuda Sadeh 	int ret = -ENOMEM;
281602adf40SYehuda Sadeh 
282602adf40SYehuda Sadeh 	dout("rbd_client_create\n");
283602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
284602adf40SYehuda Sadeh 	if (!rbdc)
285602adf40SYehuda Sadeh 		goto out_opt;
286602adf40SYehuda Sadeh 
287602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
288602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
289602adf40SYehuda Sadeh 
290bc534d86SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
291bc534d86SAlex Elder 
2926ab00d46SSage Weil 	rbdc->client = ceph_create_client(opt, rbdc, 0, 0);
293602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
294bc534d86SAlex Elder 		goto out_mutex;
29528f259b7SVasiliy Kulikov 	opt = NULL; /* Now rbdc->client is responsible for opt */
296602adf40SYehuda Sadeh 
297602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
298602adf40SYehuda Sadeh 	if (ret < 0)
299602adf40SYehuda Sadeh 		goto out_err;
300602adf40SYehuda Sadeh 
30159c2be1eSYehuda Sadeh 	rbdc->rbd_opts = rbd_opts;
30259c2be1eSYehuda Sadeh 
303432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
304602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
305432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
306602adf40SYehuda Sadeh 
307bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
308bc534d86SAlex Elder 
309602adf40SYehuda Sadeh 	dout("rbd_client_create created %p\n", rbdc);
310602adf40SYehuda Sadeh 	return rbdc;
311602adf40SYehuda Sadeh 
312602adf40SYehuda Sadeh out_err:
313602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
314bc534d86SAlex Elder out_mutex:
315bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
316602adf40SYehuda Sadeh 	kfree(rbdc);
317602adf40SYehuda Sadeh out_opt:
31828f259b7SVasiliy Kulikov 	if (opt)
319602adf40SYehuda Sadeh 		ceph_destroy_options(opt);
32028f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
321602adf40SYehuda Sadeh }
322602adf40SYehuda Sadeh 
323602adf40SYehuda Sadeh /*
324602adf40SYehuda Sadeh  * Find a ceph client with specific addr and configuration.
325602adf40SYehuda Sadeh  */
326602adf40SYehuda Sadeh static struct rbd_client *__rbd_client_find(struct ceph_options *opt)
327602adf40SYehuda Sadeh {
328602adf40SYehuda Sadeh 	struct rbd_client *client_node;
329602adf40SYehuda Sadeh 
330602adf40SYehuda Sadeh 	if (opt->flags & CEPH_OPT_NOSHARE)
331602adf40SYehuda Sadeh 		return NULL;
332602adf40SYehuda Sadeh 
333602adf40SYehuda Sadeh 	list_for_each_entry(client_node, &rbd_client_list, node)
334602adf40SYehuda Sadeh 		if (ceph_compare_options(opt, client_node->client) == 0)
335602adf40SYehuda Sadeh 			return client_node;
336602adf40SYehuda Sadeh 	return NULL;
337602adf40SYehuda Sadeh }
338602adf40SYehuda Sadeh 
/*
 * mount options
 *
 * The enum is laid out in ranges: tokens below Opt_last_int take an
 * integer argument, tokens between Opt_last_int and Opt_last_string
 * take a string argument.  parse_rbd_opts_token() relies on this
 * ordering, so new tokens must be added in the matching section.
 */
enum {
	Opt_notify_timeout,
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
};

/* Patterns for match_token(); the {-1, NULL} entry ends the table. */
static match_table_t rbdopt_tokens = {
	{Opt_notify_timeout, "notify_timeout=%d"},
	/* int args above */
	/* string args above */
	{-1, NULL}
};
35659c2be1eSYehuda Sadeh 
35759c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
35859c2be1eSYehuda Sadeh {
35959c2be1eSYehuda Sadeh 	struct rbd_options *rbdopt = private;
36059c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
36159c2be1eSYehuda Sadeh 	int token, intval, ret;
36259c2be1eSYehuda Sadeh 
36321079786SAlex Elder 	token = match_token(c, rbdopt_tokens, argstr);
36459c2be1eSYehuda Sadeh 	if (token < 0)
36559c2be1eSYehuda Sadeh 		return -EINVAL;
36659c2be1eSYehuda Sadeh 
36759c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
36859c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
36959c2be1eSYehuda Sadeh 		if (ret < 0) {
37059c2be1eSYehuda Sadeh 			pr_err("bad mount option arg (not int) "
37159c2be1eSYehuda Sadeh 			       "at '%s'\n", c);
37259c2be1eSYehuda Sadeh 			return ret;
37359c2be1eSYehuda Sadeh 		}
37459c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
37559c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
37659c2be1eSYehuda Sadeh 		dout("got string token %d val %s\n", token,
37759c2be1eSYehuda Sadeh 		     argstr[0].from);
37859c2be1eSYehuda Sadeh 	} else {
37959c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
38059c2be1eSYehuda Sadeh 	}
38159c2be1eSYehuda Sadeh 
38259c2be1eSYehuda Sadeh 	switch (token) {
38359c2be1eSYehuda Sadeh 	case Opt_notify_timeout:
38459c2be1eSYehuda Sadeh 		rbdopt->notify_timeout = intval;
38559c2be1eSYehuda Sadeh 		break;
38659c2be1eSYehuda Sadeh 	default:
38759c2be1eSYehuda Sadeh 		BUG_ON(token);
38859c2be1eSYehuda Sadeh 	}
38959c2be1eSYehuda Sadeh 	return 0;
39059c2be1eSYehuda Sadeh }
39159c2be1eSYehuda Sadeh 
39259c2be1eSYehuda Sadeh /*
393602adf40SYehuda Sadeh  * Get a ceph client with specific addr and configuration, if one does
394602adf40SYehuda Sadeh  * not exist create it.
395602adf40SYehuda Sadeh  */
3965214ecc4SAlex Elder static struct rbd_client *rbd_get_client(const char *mon_addr,
3975214ecc4SAlex Elder 					 size_t mon_addr_len,
3985214ecc4SAlex Elder 					 char *options)
399602adf40SYehuda Sadeh {
400602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
401602adf40SYehuda Sadeh 	struct ceph_options *opt;
40259c2be1eSYehuda Sadeh 	struct rbd_options *rbd_opts;
40359c2be1eSYehuda Sadeh 
40459c2be1eSYehuda Sadeh 	rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL);
40559c2be1eSYehuda Sadeh 	if (!rbd_opts)
406d720bcb0SAlex Elder 		return ERR_PTR(-ENOMEM);
40759c2be1eSYehuda Sadeh 
40859c2be1eSYehuda Sadeh 	rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT;
409602adf40SYehuda Sadeh 
410ee57741cSAlex Elder 	opt = ceph_parse_options(options, mon_addr,
4115214ecc4SAlex Elder 				mon_addr + mon_addr_len,
41221079786SAlex Elder 				parse_rbd_opts_token, rbd_opts);
413ee57741cSAlex Elder 	if (IS_ERR(opt)) {
414d720bcb0SAlex Elder 		kfree(rbd_opts);
415d720bcb0SAlex Elder 		return ERR_CAST(opt);
416ee57741cSAlex Elder 	}
417602adf40SYehuda Sadeh 
418432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
419602adf40SYehuda Sadeh 	rbdc = __rbd_client_find(opt);
420602adf40SYehuda Sadeh 	if (rbdc) {
421e6994d3dSAlex Elder 		/* using an existing client */
422e6994d3dSAlex Elder 		kref_get(&rbdc->kref);
423432b8587SAlex Elder 		spin_unlock(&rbd_client_list_lock);
424e6994d3dSAlex Elder 
425602adf40SYehuda Sadeh 		ceph_destroy_options(opt);
42697bb59a0SAlex Elder 		kfree(rbd_opts);
427602adf40SYehuda Sadeh 
428d720bcb0SAlex Elder 		return rbdc;
429602adf40SYehuda Sadeh 	}
430432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
431602adf40SYehuda Sadeh 
43259c2be1eSYehuda Sadeh 	rbdc = rbd_client_create(opt, rbd_opts);
433d97081b0SAlex Elder 
434d720bcb0SAlex Elder 	if (IS_ERR(rbdc))
43559c2be1eSYehuda Sadeh 		kfree(rbd_opts);
436d720bcb0SAlex Elder 
437d720bcb0SAlex Elder 	return rbdc;
438602adf40SYehuda Sadeh }
439602adf40SYehuda Sadeh 
440602adf40SYehuda Sadeh /*
441602adf40SYehuda Sadeh  * Destroy ceph client
442d23a4b3fSAlex Elder  *
443432b8587SAlex Elder  * Caller must hold rbd_client_list_lock.
444602adf40SYehuda Sadeh  */
445602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
446602adf40SYehuda Sadeh {
447602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
448602adf40SYehuda Sadeh 
449602adf40SYehuda Sadeh 	dout("rbd_release_client %p\n", rbdc);
450cd9d9f5dSAlex Elder 	spin_lock(&rbd_client_list_lock);
451602adf40SYehuda Sadeh 	list_del(&rbdc->node);
452cd9d9f5dSAlex Elder 	spin_unlock(&rbd_client_list_lock);
453602adf40SYehuda Sadeh 
454602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
45559c2be1eSYehuda Sadeh 	kfree(rbdc->rbd_opts);
456602adf40SYehuda Sadeh 	kfree(rbdc);
457602adf40SYehuda Sadeh }
458602adf40SYehuda Sadeh 
459602adf40SYehuda Sadeh /*
460602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
461602adf40SYehuda Sadeh  * it.
462602adf40SYehuda Sadeh  */
463602adf40SYehuda Sadeh static void rbd_put_client(struct rbd_device *rbd_dev)
464602adf40SYehuda Sadeh {
465602adf40SYehuda Sadeh 	kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
466602adf40SYehuda Sadeh 	rbd_dev->rbd_client = NULL;
467602adf40SYehuda Sadeh }
468602adf40SYehuda Sadeh 
4691fec7093SYehuda Sadeh /*
4701fec7093SYehuda Sadeh  * Destroy requests collection
4711fec7093SYehuda Sadeh  */
4721fec7093SYehuda Sadeh static void rbd_coll_release(struct kref *kref)
4731fec7093SYehuda Sadeh {
4741fec7093SYehuda Sadeh 	struct rbd_req_coll *coll =
4751fec7093SYehuda Sadeh 		container_of(kref, struct rbd_req_coll, kref);
4761fec7093SYehuda Sadeh 
4771fec7093SYehuda Sadeh 	dout("rbd_coll_release %p\n", coll);
4781fec7093SYehuda Sadeh 	kfree(coll);
4791fec7093SYehuda Sadeh }
480602adf40SYehuda Sadeh 
481602adf40SYehuda Sadeh /*
482602adf40SYehuda Sadeh  * Create a new header structure, translate header format from the on-disk
483602adf40SYehuda Sadeh  * header.
484602adf40SYehuda Sadeh  */
485602adf40SYehuda Sadeh static int rbd_header_from_disk(struct rbd_image_header *header,
486602adf40SYehuda Sadeh 				 struct rbd_image_header_ondisk *ondisk,
48750f7c4c9SXi Wang 				 u32 allocated_snaps,
488602adf40SYehuda Sadeh 				 gfp_t gfp_flags)
489602adf40SYehuda Sadeh {
49050f7c4c9SXi Wang 	u32 i, snap_count;
491602adf40SYehuda Sadeh 
49221079786SAlex Elder 	if (memcmp(ondisk, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT)))
49381e759fbSJosh Durgin 		return -ENXIO;
49481e759fbSJosh Durgin 
49500f1f36fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
49650f7c4c9SXi Wang 	if (snap_count > (UINT_MAX - sizeof(struct ceph_snap_context))
49750f7c4c9SXi Wang 			 / sizeof (*ondisk))
49850f7c4c9SXi Wang 		return -EINVAL;
499602adf40SYehuda Sadeh 	header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
500f9f9a190SYan, Zheng 				snap_count * sizeof(u64),
501602adf40SYehuda Sadeh 				gfp_flags);
502602adf40SYehuda Sadeh 	if (!header->snapc)
503602adf40SYehuda Sadeh 		return -ENOMEM;
50400f1f36fSAlex Elder 
50500f1f36fSAlex Elder 	header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
506602adf40SYehuda Sadeh 	if (snap_count) {
507602adf40SYehuda Sadeh 		header->snap_names = kmalloc(header->snap_names_len,
508f8ad495aSDan Carpenter 					     gfp_flags);
509602adf40SYehuda Sadeh 		if (!header->snap_names)
510602adf40SYehuda Sadeh 			goto err_snapc;
511602adf40SYehuda Sadeh 		header->snap_sizes = kmalloc(snap_count * sizeof(u64),
512f8ad495aSDan Carpenter 					     gfp_flags);
513602adf40SYehuda Sadeh 		if (!header->snap_sizes)
514602adf40SYehuda Sadeh 			goto err_names;
515602adf40SYehuda Sadeh 	} else {
516602adf40SYehuda Sadeh 		header->snap_names = NULL;
517602adf40SYehuda Sadeh 		header->snap_sizes = NULL;
518602adf40SYehuda Sadeh 	}
519849b4260SAlex Elder 
520849b4260SAlex Elder 	header->object_prefix = kmalloc(sizeof (ondisk->block_name) + 1,
521849b4260SAlex Elder 					gfp_flags);
522849b4260SAlex Elder 	if (!header->object_prefix)
523849b4260SAlex Elder 		goto err_sizes;
524849b4260SAlex Elder 
525ca1e49a6SAlex Elder 	memcpy(header->object_prefix, ondisk->block_name,
526602adf40SYehuda Sadeh 	       sizeof(ondisk->block_name));
527849b4260SAlex Elder 	header->object_prefix[sizeof (ondisk->block_name)] = '\0';
528602adf40SYehuda Sadeh 
529602adf40SYehuda Sadeh 	header->image_size = le64_to_cpu(ondisk->image_size);
530602adf40SYehuda Sadeh 	header->obj_order = ondisk->options.order;
531602adf40SYehuda Sadeh 	header->crypt_type = ondisk->options.crypt_type;
532602adf40SYehuda Sadeh 	header->comp_type = ondisk->options.comp_type;
533602adf40SYehuda Sadeh 
534602adf40SYehuda Sadeh 	atomic_set(&header->snapc->nref, 1);
535602adf40SYehuda Sadeh 	header->snap_seq = le64_to_cpu(ondisk->snap_seq);
536602adf40SYehuda Sadeh 	header->snapc->num_snaps = snap_count;
537602adf40SYehuda Sadeh 	header->total_snaps = snap_count;
538602adf40SYehuda Sadeh 
53921079786SAlex Elder 	if (snap_count && allocated_snaps == snap_count) {
540602adf40SYehuda Sadeh 		for (i = 0; i < snap_count; i++) {
541602adf40SYehuda Sadeh 			header->snapc->snaps[i] =
542602adf40SYehuda Sadeh 				le64_to_cpu(ondisk->snaps[i].id);
543602adf40SYehuda Sadeh 			header->snap_sizes[i] =
544602adf40SYehuda Sadeh 				le64_to_cpu(ondisk->snaps[i].image_size);
545602adf40SYehuda Sadeh 		}
546602adf40SYehuda Sadeh 
547602adf40SYehuda Sadeh 		/* copy snapshot names */
548602adf40SYehuda Sadeh 		memcpy(header->snap_names, &ondisk->snaps[i],
549602adf40SYehuda Sadeh 			header->snap_names_len);
550602adf40SYehuda Sadeh 	}
551602adf40SYehuda Sadeh 
552602adf40SYehuda Sadeh 	return 0;
553602adf40SYehuda Sadeh 
554849b4260SAlex Elder err_sizes:
555849b4260SAlex Elder 	kfree(header->snap_sizes);
556602adf40SYehuda Sadeh err_names:
557602adf40SYehuda Sadeh 	kfree(header->snap_names);
558602adf40SYehuda Sadeh err_snapc:
559602adf40SYehuda Sadeh 	kfree(header->snapc);
56000f1f36fSAlex Elder 	return -ENOMEM;
561602adf40SYehuda Sadeh }
562602adf40SYehuda Sadeh 
563602adf40SYehuda Sadeh static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
564602adf40SYehuda Sadeh 			u64 *seq, u64 *size)
565602adf40SYehuda Sadeh {
566602adf40SYehuda Sadeh 	int i;
567602adf40SYehuda Sadeh 	char *p = header->snap_names;
568602adf40SYehuda Sadeh 
56900f1f36fSAlex Elder 	for (i = 0; i < header->total_snaps; i++) {
57000f1f36fSAlex Elder 		if (!strcmp(snap_name, p)) {
57100f1f36fSAlex Elder 
57200f1f36fSAlex Elder 			/* Found it.  Pass back its id and/or size */
57300f1f36fSAlex Elder 
574602adf40SYehuda Sadeh 			if (seq)
575602adf40SYehuda Sadeh 				*seq = header->snapc->snaps[i];
576602adf40SYehuda Sadeh 			if (size)
577602adf40SYehuda Sadeh 				*size = header->snap_sizes[i];
578602adf40SYehuda Sadeh 			return i;
579602adf40SYehuda Sadeh 		}
58000f1f36fSAlex Elder 		p += strlen(p) + 1;	/* Skip ahead to the next name */
58100f1f36fSAlex Elder 	}
58200f1f36fSAlex Elder 	return -ENOENT;
58300f1f36fSAlex Elder }
584602adf40SYehuda Sadeh 
5850ce1a794SAlex Elder static int rbd_header_set_snap(struct rbd_device *rbd_dev, u64 *size)
586602adf40SYehuda Sadeh {
5870ce1a794SAlex Elder 	struct rbd_image_header *header = &rbd_dev->header;
588602adf40SYehuda Sadeh 	struct ceph_snap_context *snapc = header->snapc;
589602adf40SYehuda Sadeh 	int ret = -ENOENT;
590602adf40SYehuda Sadeh 
5910ce1a794SAlex Elder 	down_write(&rbd_dev->header_rwsem);
592602adf40SYehuda Sadeh 
5930ce1a794SAlex Elder 	if (!memcmp(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
594cc9d734cSJosh Durgin 		    sizeof (RBD_SNAP_HEAD_NAME))) {
595602adf40SYehuda Sadeh 		if (header->total_snaps)
596602adf40SYehuda Sadeh 			snapc->seq = header->snap_seq;
597602adf40SYehuda Sadeh 		else
598602adf40SYehuda Sadeh 			snapc->seq = 0;
5990ce1a794SAlex Elder 		rbd_dev->snap_id = CEPH_NOSNAP;
6000ce1a794SAlex Elder 		rbd_dev->read_only = 0;
601602adf40SYehuda Sadeh 		if (size)
602602adf40SYehuda Sadeh 			*size = header->image_size;
603602adf40SYehuda Sadeh 	} else {
6040ce1a794SAlex Elder 		ret = snap_by_name(header, rbd_dev->snap_name,
6050ce1a794SAlex Elder 					&snapc->seq, size);
606602adf40SYehuda Sadeh 		if (ret < 0)
607602adf40SYehuda Sadeh 			goto done;
6080ce1a794SAlex Elder 		rbd_dev->snap_id = snapc->seq;
6090ce1a794SAlex Elder 		rbd_dev->read_only = 1;
610602adf40SYehuda Sadeh 	}
611602adf40SYehuda Sadeh 
612602adf40SYehuda Sadeh 	ret = 0;
613602adf40SYehuda Sadeh done:
6140ce1a794SAlex Elder 	up_write(&rbd_dev->header_rwsem);
615602adf40SYehuda Sadeh 	return ret;
616602adf40SYehuda Sadeh }
617602adf40SYehuda Sadeh 
618602adf40SYehuda Sadeh static void rbd_header_free(struct rbd_image_header *header)
619602adf40SYehuda Sadeh {
620849b4260SAlex Elder 	kfree(header->object_prefix);
621602adf40SYehuda Sadeh 	kfree(header->snap_sizes);
622849b4260SAlex Elder 	kfree(header->snap_names);
623849b4260SAlex Elder 	kfree(header->snapc);
624602adf40SYehuda Sadeh }
625602adf40SYehuda Sadeh 
626602adf40SYehuda Sadeh /*
627602adf40SYehuda Sadeh  * get the actual striped segment name, offset and length
628602adf40SYehuda Sadeh  */
629602adf40SYehuda Sadeh static u64 rbd_get_segment(struct rbd_image_header *header,
630ca1e49a6SAlex Elder 			   const char *object_prefix,
631602adf40SYehuda Sadeh 			   u64 ofs, u64 len,
632602adf40SYehuda Sadeh 			   char *seg_name, u64 *segofs)
633602adf40SYehuda Sadeh {
634602adf40SYehuda Sadeh 	u64 seg = ofs >> header->obj_order;
635602adf40SYehuda Sadeh 
636602adf40SYehuda Sadeh 	if (seg_name)
637602adf40SYehuda Sadeh 		snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
638ca1e49a6SAlex Elder 			 "%s.%012llx", object_prefix, seg);
639602adf40SYehuda Sadeh 
640602adf40SYehuda Sadeh 	ofs = ofs & ((1 << header->obj_order) - 1);
641602adf40SYehuda Sadeh 	len = min_t(u64, len, (1 << header->obj_order) - ofs);
642602adf40SYehuda Sadeh 
643602adf40SYehuda Sadeh 	if (segofs)
644602adf40SYehuda Sadeh 		*segofs = ofs;
645602adf40SYehuda Sadeh 
646602adf40SYehuda Sadeh 	return len;
647602adf40SYehuda Sadeh }
648602adf40SYehuda Sadeh 
6491fec7093SYehuda Sadeh static int rbd_get_num_segments(struct rbd_image_header *header,
6501fec7093SYehuda Sadeh 				u64 ofs, u64 len)
6511fec7093SYehuda Sadeh {
6521fec7093SYehuda Sadeh 	u64 start_seg = ofs >> header->obj_order;
6531fec7093SYehuda Sadeh 	u64 end_seg = (ofs + len - 1) >> header->obj_order;
6541fec7093SYehuda Sadeh 	return end_seg - start_seg + 1;
6551fec7093SYehuda Sadeh }
6561fec7093SYehuda Sadeh 
657602adf40SYehuda Sadeh /*
658029bcbd8SJosh Durgin  * returns the size of an object in the image
659029bcbd8SJosh Durgin  */
660029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header)
661029bcbd8SJosh Durgin {
662029bcbd8SJosh Durgin 	return 1 << header->obj_order;
663029bcbd8SJosh Durgin }
664029bcbd8SJosh Durgin 
665029bcbd8SJosh Durgin /*
666602adf40SYehuda Sadeh  * bio helpers
667602adf40SYehuda Sadeh  */
668602adf40SYehuda Sadeh 
669602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain)
670602adf40SYehuda Sadeh {
671602adf40SYehuda Sadeh 	struct bio *tmp;
672602adf40SYehuda Sadeh 
673602adf40SYehuda Sadeh 	while (chain) {
674602adf40SYehuda Sadeh 		tmp = chain;
675602adf40SYehuda Sadeh 		chain = chain->bi_next;
676602adf40SYehuda Sadeh 		bio_put(tmp);
677602adf40SYehuda Sadeh 	}
678602adf40SYehuda Sadeh }
679602adf40SYehuda Sadeh 
680602adf40SYehuda Sadeh /*
681602adf40SYehuda Sadeh  * zeros a bio chain, starting at specific offset
682602adf40SYehuda Sadeh  */
683602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs)
684602adf40SYehuda Sadeh {
685602adf40SYehuda Sadeh 	struct bio_vec *bv;
686602adf40SYehuda Sadeh 	unsigned long flags;
687602adf40SYehuda Sadeh 	void *buf;
688602adf40SYehuda Sadeh 	int i;
689602adf40SYehuda Sadeh 	int pos = 0;
690602adf40SYehuda Sadeh 
691602adf40SYehuda Sadeh 	while (chain) {
692602adf40SYehuda Sadeh 		bio_for_each_segment(bv, chain, i) {
693602adf40SYehuda Sadeh 			if (pos + bv->bv_len > start_ofs) {
694602adf40SYehuda Sadeh 				int remainder = max(start_ofs - pos, 0);
695602adf40SYehuda Sadeh 				buf = bvec_kmap_irq(bv, &flags);
696602adf40SYehuda Sadeh 				memset(buf + remainder, 0,
697602adf40SYehuda Sadeh 				       bv->bv_len - remainder);
69885b5aaa6SDan Carpenter 				bvec_kunmap_irq(buf, &flags);
699602adf40SYehuda Sadeh 			}
700602adf40SYehuda Sadeh 			pos += bv->bv_len;
701602adf40SYehuda Sadeh 		}
702602adf40SYehuda Sadeh 
703602adf40SYehuda Sadeh 		chain = chain->bi_next;
704602adf40SYehuda Sadeh 	}
705602adf40SYehuda Sadeh }
706602adf40SYehuda Sadeh 
707602adf40SYehuda Sadeh /*
708602adf40SYehuda Sadeh  * bio_chain_clone - clone a chain of bios up to a certain length.
709602adf40SYehuda Sadeh  * might return a bio_pair that will need to be released.
710602adf40SYehuda Sadeh  */
711602adf40SYehuda Sadeh static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
712602adf40SYehuda Sadeh 				   struct bio_pair **bp,
713602adf40SYehuda Sadeh 				   int len, gfp_t gfpmask)
714602adf40SYehuda Sadeh {
715602adf40SYehuda Sadeh 	struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
716602adf40SYehuda Sadeh 	int total = 0;
717602adf40SYehuda Sadeh 
718602adf40SYehuda Sadeh 	if (*bp) {
719602adf40SYehuda Sadeh 		bio_pair_release(*bp);
720602adf40SYehuda Sadeh 		*bp = NULL;
721602adf40SYehuda Sadeh 	}
722602adf40SYehuda Sadeh 
723602adf40SYehuda Sadeh 	while (old_chain && (total < len)) {
724602adf40SYehuda Sadeh 		tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
725602adf40SYehuda Sadeh 		if (!tmp)
726602adf40SYehuda Sadeh 			goto err_out;
727602adf40SYehuda Sadeh 
728602adf40SYehuda Sadeh 		if (total + old_chain->bi_size > len) {
729602adf40SYehuda Sadeh 			struct bio_pair *bp;
730602adf40SYehuda Sadeh 
731602adf40SYehuda Sadeh 			/*
732602adf40SYehuda Sadeh 			 * this split can only happen with a single paged bio,
733602adf40SYehuda Sadeh 			 * split_bio will BUG_ON if this is not the case
734602adf40SYehuda Sadeh 			 */
735602adf40SYehuda Sadeh 			dout("bio_chain_clone split! total=%d remaining=%d"
736602adf40SYehuda Sadeh 			     "bi_size=%d\n",
737602adf40SYehuda Sadeh 			     (int)total, (int)len-total,
738602adf40SYehuda Sadeh 			     (int)old_chain->bi_size);
739602adf40SYehuda Sadeh 
740602adf40SYehuda Sadeh 			/* split the bio. We'll release it either in the next
741602adf40SYehuda Sadeh 			   call, or it will have to be released outside */
742593a9e7bSAlex Elder 			bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
743602adf40SYehuda Sadeh 			if (!bp)
744602adf40SYehuda Sadeh 				goto err_out;
745602adf40SYehuda Sadeh 
746602adf40SYehuda Sadeh 			__bio_clone(tmp, &bp->bio1);
747602adf40SYehuda Sadeh 
748602adf40SYehuda Sadeh 			*next = &bp->bio2;
749602adf40SYehuda Sadeh 		} else {
750602adf40SYehuda Sadeh 			__bio_clone(tmp, old_chain);
751602adf40SYehuda Sadeh 			*next = old_chain->bi_next;
752602adf40SYehuda Sadeh 		}
753602adf40SYehuda Sadeh 
754602adf40SYehuda Sadeh 		tmp->bi_bdev = NULL;
755602adf40SYehuda Sadeh 		gfpmask &= ~__GFP_WAIT;
756602adf40SYehuda Sadeh 		tmp->bi_next = NULL;
757602adf40SYehuda Sadeh 
758602adf40SYehuda Sadeh 		if (!new_chain) {
759602adf40SYehuda Sadeh 			new_chain = tail = tmp;
760602adf40SYehuda Sadeh 		} else {
761602adf40SYehuda Sadeh 			tail->bi_next = tmp;
762602adf40SYehuda Sadeh 			tail = tmp;
763602adf40SYehuda Sadeh 		}
764602adf40SYehuda Sadeh 		old_chain = old_chain->bi_next;
765602adf40SYehuda Sadeh 
766602adf40SYehuda Sadeh 		total += tmp->bi_size;
767602adf40SYehuda Sadeh 	}
768602adf40SYehuda Sadeh 
769602adf40SYehuda Sadeh 	BUG_ON(total < len);
770602adf40SYehuda Sadeh 
771602adf40SYehuda Sadeh 	if (tail)
772602adf40SYehuda Sadeh 		tail->bi_next = NULL;
773602adf40SYehuda Sadeh 
774602adf40SYehuda Sadeh 	*old = old_chain;
775602adf40SYehuda Sadeh 
776602adf40SYehuda Sadeh 	return new_chain;
777602adf40SYehuda Sadeh 
778602adf40SYehuda Sadeh err_out:
779602adf40SYehuda Sadeh 	dout("bio_chain_clone with err\n");
780602adf40SYehuda Sadeh 	bio_chain_put(new_chain);
781602adf40SYehuda Sadeh 	return NULL;
782602adf40SYehuda Sadeh }
783602adf40SYehuda Sadeh 
784602adf40SYehuda Sadeh /*
785602adf40SYehuda Sadeh  * helpers for osd request op vectors.
786602adf40SYehuda Sadeh  */
787602adf40SYehuda Sadeh static int rbd_create_rw_ops(struct ceph_osd_req_op **ops,
788602adf40SYehuda Sadeh 			    int num_ops,
789602adf40SYehuda Sadeh 			    int opcode,
790602adf40SYehuda Sadeh 			    u32 payload_len)
791602adf40SYehuda Sadeh {
792602adf40SYehuda Sadeh 	*ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1),
793602adf40SYehuda Sadeh 		       GFP_NOIO);
794602adf40SYehuda Sadeh 	if (!*ops)
795602adf40SYehuda Sadeh 		return -ENOMEM;
796602adf40SYehuda Sadeh 	(*ops)[0].op = opcode;
797602adf40SYehuda Sadeh 	/*
798602adf40SYehuda Sadeh 	 * op extent offset and length will be set later on
799602adf40SYehuda Sadeh 	 * in calc_raw_layout()
800602adf40SYehuda Sadeh 	 */
801602adf40SYehuda Sadeh 	(*ops)[0].payload_len = payload_len;
802602adf40SYehuda Sadeh 	return 0;
803602adf40SYehuda Sadeh }
804602adf40SYehuda Sadeh 
/*
 * Free an op vector obtained from rbd_create_rw_ops().
 */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
809602adf40SYehuda Sadeh 
8101fec7093SYehuda Sadeh static void rbd_coll_end_req_index(struct request *rq,
8111fec7093SYehuda Sadeh 				   struct rbd_req_coll *coll,
8121fec7093SYehuda Sadeh 				   int index,
8131fec7093SYehuda Sadeh 				   int ret, u64 len)
8141fec7093SYehuda Sadeh {
8151fec7093SYehuda Sadeh 	struct request_queue *q;
8161fec7093SYehuda Sadeh 	int min, max, i;
8171fec7093SYehuda Sadeh 
8181fec7093SYehuda Sadeh 	dout("rbd_coll_end_req_index %p index %d ret %d len %lld\n",
8191fec7093SYehuda Sadeh 	     coll, index, ret, len);
8201fec7093SYehuda Sadeh 
8211fec7093SYehuda Sadeh 	if (!rq)
8221fec7093SYehuda Sadeh 		return;
8231fec7093SYehuda Sadeh 
8241fec7093SYehuda Sadeh 	if (!coll) {
8251fec7093SYehuda Sadeh 		blk_end_request(rq, ret, len);
8261fec7093SYehuda Sadeh 		return;
8271fec7093SYehuda Sadeh 	}
8281fec7093SYehuda Sadeh 
8291fec7093SYehuda Sadeh 	q = rq->q;
8301fec7093SYehuda Sadeh 
8311fec7093SYehuda Sadeh 	spin_lock_irq(q->queue_lock);
8321fec7093SYehuda Sadeh 	coll->status[index].done = 1;
8331fec7093SYehuda Sadeh 	coll->status[index].rc = ret;
8341fec7093SYehuda Sadeh 	coll->status[index].bytes = len;
8351fec7093SYehuda Sadeh 	max = min = coll->num_done;
8361fec7093SYehuda Sadeh 	while (max < coll->total && coll->status[max].done)
8371fec7093SYehuda Sadeh 		max++;
8381fec7093SYehuda Sadeh 
8391fec7093SYehuda Sadeh 	for (i = min; i<max; i++) {
8401fec7093SYehuda Sadeh 		__blk_end_request(rq, coll->status[i].rc,
8411fec7093SYehuda Sadeh 				  coll->status[i].bytes);
8421fec7093SYehuda Sadeh 		coll->num_done++;
8431fec7093SYehuda Sadeh 		kref_put(&coll->kref, rbd_coll_release);
8441fec7093SYehuda Sadeh 	}
8451fec7093SYehuda Sadeh 	spin_unlock_irq(q->queue_lock);
8461fec7093SYehuda Sadeh }
8471fec7093SYehuda Sadeh 
8481fec7093SYehuda Sadeh static void rbd_coll_end_req(struct rbd_request *req,
8491fec7093SYehuda Sadeh 			     int ret, u64 len)
8501fec7093SYehuda Sadeh {
8511fec7093SYehuda Sadeh 	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
8521fec7093SYehuda Sadeh }
8531fec7093SYehuda Sadeh 
854602adf40SYehuda Sadeh /*
855602adf40SYehuda Sadeh  * Send ceph osd request
856602adf40SYehuda Sadeh  */
857602adf40SYehuda Sadeh static int rbd_do_request(struct request *rq,
8580ce1a794SAlex Elder 			  struct rbd_device *rbd_dev,
859602adf40SYehuda Sadeh 			  struct ceph_snap_context *snapc,
860602adf40SYehuda Sadeh 			  u64 snapid,
861aded07eaSAlex Elder 			  const char *object_name, u64 ofs, u64 len,
862602adf40SYehuda Sadeh 			  struct bio *bio,
863602adf40SYehuda Sadeh 			  struct page **pages,
864602adf40SYehuda Sadeh 			  int num_pages,
865602adf40SYehuda Sadeh 			  int flags,
866602adf40SYehuda Sadeh 			  struct ceph_osd_req_op *ops,
867602adf40SYehuda Sadeh 			  int num_reply,
8681fec7093SYehuda Sadeh 			  struct rbd_req_coll *coll,
8691fec7093SYehuda Sadeh 			  int coll_index,
870602adf40SYehuda Sadeh 			  void (*rbd_cb)(struct ceph_osd_request *req,
87159c2be1eSYehuda Sadeh 					 struct ceph_msg *msg),
87259c2be1eSYehuda Sadeh 			  struct ceph_osd_request **linger_req,
87359c2be1eSYehuda Sadeh 			  u64 *ver)
874602adf40SYehuda Sadeh {
875602adf40SYehuda Sadeh 	struct ceph_osd_request *req;
876602adf40SYehuda Sadeh 	struct ceph_file_layout *layout;
877602adf40SYehuda Sadeh 	int ret;
878602adf40SYehuda Sadeh 	u64 bno;
879602adf40SYehuda Sadeh 	struct timespec mtime = CURRENT_TIME;
880602adf40SYehuda Sadeh 	struct rbd_request *req_data;
881602adf40SYehuda Sadeh 	struct ceph_osd_request_head *reqhead;
8821dbb4399SAlex Elder 	struct ceph_osd_client *osdc;
883602adf40SYehuda Sadeh 
884602adf40SYehuda Sadeh 	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
8851fec7093SYehuda Sadeh 	if (!req_data) {
8861fec7093SYehuda Sadeh 		if (coll)
8871fec7093SYehuda Sadeh 			rbd_coll_end_req_index(rq, coll, coll_index,
8881fec7093SYehuda Sadeh 					       -ENOMEM, len);
8891fec7093SYehuda Sadeh 		return -ENOMEM;
8901fec7093SYehuda Sadeh 	}
891602adf40SYehuda Sadeh 
8921fec7093SYehuda Sadeh 	if (coll) {
8931fec7093SYehuda Sadeh 		req_data->coll = coll;
8941fec7093SYehuda Sadeh 		req_data->coll_index = coll_index;
8951fec7093SYehuda Sadeh 	}
8961fec7093SYehuda Sadeh 
897aded07eaSAlex Elder 	dout("rbd_do_request object_name=%s ofs=%lld len=%lld\n",
898aded07eaSAlex Elder 		object_name, len, ofs);
899602adf40SYehuda Sadeh 
9000ce1a794SAlex Elder 	down_read(&rbd_dev->header_rwsem);
901602adf40SYehuda Sadeh 
9020ce1a794SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
9031dbb4399SAlex Elder 	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
9041dbb4399SAlex Elder 					false, GFP_NOIO, pages, bio);
9054ad12621SSage Weil 	if (!req) {
9060ce1a794SAlex Elder 		up_read(&rbd_dev->header_rwsem);
9074ad12621SSage Weil 		ret = -ENOMEM;
908602adf40SYehuda Sadeh 		goto done_pages;
909602adf40SYehuda Sadeh 	}
910602adf40SYehuda Sadeh 
911602adf40SYehuda Sadeh 	req->r_callback = rbd_cb;
912602adf40SYehuda Sadeh 
913602adf40SYehuda Sadeh 	req_data->rq = rq;
914602adf40SYehuda Sadeh 	req_data->bio = bio;
915602adf40SYehuda Sadeh 	req_data->pages = pages;
916602adf40SYehuda Sadeh 	req_data->len = len;
917602adf40SYehuda Sadeh 
918602adf40SYehuda Sadeh 	req->r_priv = req_data;
919602adf40SYehuda Sadeh 
920602adf40SYehuda Sadeh 	reqhead = req->r_request->front.iov_base;
921602adf40SYehuda Sadeh 	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
922602adf40SYehuda Sadeh 
923aded07eaSAlex Elder 	strncpy(req->r_oid, object_name, sizeof(req->r_oid));
924602adf40SYehuda Sadeh 	req->r_oid_len = strlen(req->r_oid);
925602adf40SYehuda Sadeh 
926602adf40SYehuda Sadeh 	layout = &req->r_file_layout;
927602adf40SYehuda Sadeh 	memset(layout, 0, sizeof(*layout));
928602adf40SYehuda Sadeh 	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
929602adf40SYehuda Sadeh 	layout->fl_stripe_count = cpu_to_le32(1);
930602adf40SYehuda Sadeh 	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
9310ce1a794SAlex Elder 	layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
9321dbb4399SAlex Elder 	ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
9331dbb4399SAlex Elder 				req, ops);
934602adf40SYehuda Sadeh 
935602adf40SYehuda Sadeh 	ceph_osdc_build_request(req, ofs, &len,
936602adf40SYehuda Sadeh 				ops,
937602adf40SYehuda Sadeh 				snapc,
938602adf40SYehuda Sadeh 				&mtime,
939602adf40SYehuda Sadeh 				req->r_oid, req->r_oid_len);
9400ce1a794SAlex Elder 	up_read(&rbd_dev->header_rwsem);
941602adf40SYehuda Sadeh 
94259c2be1eSYehuda Sadeh 	if (linger_req) {
9431dbb4399SAlex Elder 		ceph_osdc_set_request_linger(osdc, req);
94459c2be1eSYehuda Sadeh 		*linger_req = req;
94559c2be1eSYehuda Sadeh 	}
94659c2be1eSYehuda Sadeh 
9471dbb4399SAlex Elder 	ret = ceph_osdc_start_request(osdc, req, false);
948602adf40SYehuda Sadeh 	if (ret < 0)
949602adf40SYehuda Sadeh 		goto done_err;
950602adf40SYehuda Sadeh 
951602adf40SYehuda Sadeh 	if (!rbd_cb) {
9521dbb4399SAlex Elder 		ret = ceph_osdc_wait_request(osdc, req);
95359c2be1eSYehuda Sadeh 		if (ver)
95459c2be1eSYehuda Sadeh 			*ver = le64_to_cpu(req->r_reassert_version.version);
9551fec7093SYehuda Sadeh 		dout("reassert_ver=%lld\n",
9561fec7093SYehuda Sadeh 		     le64_to_cpu(req->r_reassert_version.version));
957602adf40SYehuda Sadeh 		ceph_osdc_put_request(req);
958602adf40SYehuda Sadeh 	}
959602adf40SYehuda Sadeh 	return ret;
960602adf40SYehuda Sadeh 
961602adf40SYehuda Sadeh done_err:
962602adf40SYehuda Sadeh 	bio_chain_put(req_data->bio);
963602adf40SYehuda Sadeh 	ceph_osdc_put_request(req);
964602adf40SYehuda Sadeh done_pages:
9651fec7093SYehuda Sadeh 	rbd_coll_end_req(req_data, ret, len);
966602adf40SYehuda Sadeh 	kfree(req_data);
967602adf40SYehuda Sadeh 	return ret;
968602adf40SYehuda Sadeh }
969602adf40SYehuda Sadeh 
970602adf40SYehuda Sadeh /*
971602adf40SYehuda Sadeh  * Ceph osd op callback
972602adf40SYehuda Sadeh  */
973602adf40SYehuda Sadeh static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
974602adf40SYehuda Sadeh {
975602adf40SYehuda Sadeh 	struct rbd_request *req_data = req->r_priv;
976602adf40SYehuda Sadeh 	struct ceph_osd_reply_head *replyhead;
977602adf40SYehuda Sadeh 	struct ceph_osd_op *op;
978602adf40SYehuda Sadeh 	__s32 rc;
979602adf40SYehuda Sadeh 	u64 bytes;
980602adf40SYehuda Sadeh 	int read_op;
981602adf40SYehuda Sadeh 
982602adf40SYehuda Sadeh 	/* parse reply */
983602adf40SYehuda Sadeh 	replyhead = msg->front.iov_base;
984602adf40SYehuda Sadeh 	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
985602adf40SYehuda Sadeh 	op = (void *)(replyhead + 1);
986602adf40SYehuda Sadeh 	rc = le32_to_cpu(replyhead->result);
987602adf40SYehuda Sadeh 	bytes = le64_to_cpu(op->extent.length);
988895cfcc8SDan Carpenter 	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
989602adf40SYehuda Sadeh 
990602adf40SYehuda Sadeh 	dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc);
991602adf40SYehuda Sadeh 
992602adf40SYehuda Sadeh 	if (rc == -ENOENT && read_op) {
993602adf40SYehuda Sadeh 		zero_bio_chain(req_data->bio, 0);
994602adf40SYehuda Sadeh 		rc = 0;
995602adf40SYehuda Sadeh 	} else if (rc == 0 && read_op && bytes < req_data->len) {
996602adf40SYehuda Sadeh 		zero_bio_chain(req_data->bio, bytes);
997602adf40SYehuda Sadeh 		bytes = req_data->len;
998602adf40SYehuda Sadeh 	}
999602adf40SYehuda Sadeh 
10001fec7093SYehuda Sadeh 	rbd_coll_end_req(req_data, rc, bytes);
1001602adf40SYehuda Sadeh 
1002602adf40SYehuda Sadeh 	if (req_data->bio)
1003602adf40SYehuda Sadeh 		bio_chain_put(req_data->bio);
1004602adf40SYehuda Sadeh 
1005602adf40SYehuda Sadeh 	ceph_osdc_put_request(req);
1006602adf40SYehuda Sadeh 	kfree(req_data);
1007602adf40SYehuda Sadeh }
1008602adf40SYehuda Sadeh 
/*
 * Minimal osd completion callback: only drops the reference on the
 * finished request; the reply message is not examined.
 *
 * NOTE(review): req->r_priv is not released here -- confirm whether
 * private data attached by the submitter is freed elsewhere.
 */
static void rbd_simple_req_cb(struct ceph_osd_request *req,
			      struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
101359c2be1eSYehuda Sadeh 
1014602adf40SYehuda Sadeh /*
1015602adf40SYehuda Sadeh  * Do a synchronous ceph osd operation
1016602adf40SYehuda Sadeh  */
10170ce1a794SAlex Elder static int rbd_req_sync_op(struct rbd_device *rbd_dev,
1018602adf40SYehuda Sadeh 			   struct ceph_snap_context *snapc,
1019602adf40SYehuda Sadeh 			   u64 snapid,
1020602adf40SYehuda Sadeh 			   int opcode,
1021602adf40SYehuda Sadeh 			   int flags,
1022602adf40SYehuda Sadeh 			   struct ceph_osd_req_op *orig_ops,
1023602adf40SYehuda Sadeh 			   int num_reply,
1024aded07eaSAlex Elder 			   const char *object_name,
1025602adf40SYehuda Sadeh 			   u64 ofs, u64 len,
102659c2be1eSYehuda Sadeh 			   char *buf,
102759c2be1eSYehuda Sadeh 			   struct ceph_osd_request **linger_req,
102859c2be1eSYehuda Sadeh 			   u64 *ver)
1029602adf40SYehuda Sadeh {
1030602adf40SYehuda Sadeh 	int ret;
1031602adf40SYehuda Sadeh 	struct page **pages;
1032602adf40SYehuda Sadeh 	int num_pages;
1033602adf40SYehuda Sadeh 	struct ceph_osd_req_op *ops = orig_ops;
1034602adf40SYehuda Sadeh 	u32 payload_len;
1035602adf40SYehuda Sadeh 
1036602adf40SYehuda Sadeh 	num_pages = calc_pages_for(ofs , len);
1037602adf40SYehuda Sadeh 	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
1038b8d0638aSDan Carpenter 	if (IS_ERR(pages))
1039b8d0638aSDan Carpenter 		return PTR_ERR(pages);
1040602adf40SYehuda Sadeh 
1041602adf40SYehuda Sadeh 	if (!orig_ops) {
1042602adf40SYehuda Sadeh 		payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0);
1043602adf40SYehuda Sadeh 		ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
1044602adf40SYehuda Sadeh 		if (ret < 0)
1045602adf40SYehuda Sadeh 			goto done;
1046602adf40SYehuda Sadeh 
1047602adf40SYehuda Sadeh 		if ((flags & CEPH_OSD_FLAG_WRITE) && buf) {
1048602adf40SYehuda Sadeh 			ret = ceph_copy_to_page_vector(pages, buf, ofs, len);
1049602adf40SYehuda Sadeh 			if (ret < 0)
1050602adf40SYehuda Sadeh 				goto done_ops;
1051602adf40SYehuda Sadeh 		}
1052602adf40SYehuda Sadeh 	}
1053602adf40SYehuda Sadeh 
10540ce1a794SAlex Elder 	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
1055aded07eaSAlex Elder 			  object_name, ofs, len, NULL,
1056602adf40SYehuda Sadeh 			  pages, num_pages,
1057602adf40SYehuda Sadeh 			  flags,
1058602adf40SYehuda Sadeh 			  ops,
1059602adf40SYehuda Sadeh 			  2,
10601fec7093SYehuda Sadeh 			  NULL, 0,
106159c2be1eSYehuda Sadeh 			  NULL,
106259c2be1eSYehuda Sadeh 			  linger_req, ver);
1063602adf40SYehuda Sadeh 	if (ret < 0)
1064602adf40SYehuda Sadeh 		goto done_ops;
1065602adf40SYehuda Sadeh 
1066602adf40SYehuda Sadeh 	if ((flags & CEPH_OSD_FLAG_READ) && buf)
1067602adf40SYehuda Sadeh 		ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);
1068602adf40SYehuda Sadeh 
1069602adf40SYehuda Sadeh done_ops:
1070602adf40SYehuda Sadeh 	if (!orig_ops)
1071602adf40SYehuda Sadeh 		rbd_destroy_ops(ops);
1072602adf40SYehuda Sadeh done:
1073602adf40SYehuda Sadeh 	ceph_release_page_vector(pages, num_pages);
1074602adf40SYehuda Sadeh 	return ret;
1075602adf40SYehuda Sadeh }
1076602adf40SYehuda Sadeh 
1077602adf40SYehuda Sadeh /*
1078602adf40SYehuda Sadeh  * Do an asynchronous ceph osd operation
1079602adf40SYehuda Sadeh  */
1080602adf40SYehuda Sadeh static int rbd_do_op(struct request *rq,
1081602adf40SYehuda Sadeh 		     struct rbd_device *rbd_dev,
1082602adf40SYehuda Sadeh 		     struct ceph_snap_context *snapc,
1083602adf40SYehuda Sadeh 		     u64 snapid,
1084602adf40SYehuda Sadeh 		     int opcode, int flags, int num_reply,
1085602adf40SYehuda Sadeh 		     u64 ofs, u64 len,
10861fec7093SYehuda Sadeh 		     struct bio *bio,
10871fec7093SYehuda Sadeh 		     struct rbd_req_coll *coll,
10881fec7093SYehuda Sadeh 		     int coll_index)
1089602adf40SYehuda Sadeh {
1090602adf40SYehuda Sadeh 	char *seg_name;
1091602adf40SYehuda Sadeh 	u64 seg_ofs;
1092602adf40SYehuda Sadeh 	u64 seg_len;
1093602adf40SYehuda Sadeh 	int ret;
1094602adf40SYehuda Sadeh 	struct ceph_osd_req_op *ops;
1095602adf40SYehuda Sadeh 	u32 payload_len;
1096602adf40SYehuda Sadeh 
1097602adf40SYehuda Sadeh 	seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
1098602adf40SYehuda Sadeh 	if (!seg_name)
1099602adf40SYehuda Sadeh 		return -ENOMEM;
1100602adf40SYehuda Sadeh 
1101602adf40SYehuda Sadeh 	seg_len = rbd_get_segment(&rbd_dev->header,
1102ca1e49a6SAlex Elder 				  rbd_dev->header.object_prefix,
1103602adf40SYehuda Sadeh 				  ofs, len,
1104602adf40SYehuda Sadeh 				  seg_name, &seg_ofs);
1105602adf40SYehuda Sadeh 
1106602adf40SYehuda Sadeh 	payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
1107602adf40SYehuda Sadeh 
1108602adf40SYehuda Sadeh 	ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
1109602adf40SYehuda Sadeh 	if (ret < 0)
1110602adf40SYehuda Sadeh 		goto done;
1111602adf40SYehuda Sadeh 
1112602adf40SYehuda Sadeh 	/* we've taken care of segment sizes earlier when we
1113602adf40SYehuda Sadeh 	   cloned the bios. We should never have a segment
1114602adf40SYehuda Sadeh 	   truncated at this point */
1115602adf40SYehuda Sadeh 	BUG_ON(seg_len < len);
1116602adf40SYehuda Sadeh 
1117602adf40SYehuda Sadeh 	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1118602adf40SYehuda Sadeh 			     seg_name, seg_ofs, seg_len,
1119602adf40SYehuda Sadeh 			     bio,
1120602adf40SYehuda Sadeh 			     NULL, 0,
1121602adf40SYehuda Sadeh 			     flags,
1122602adf40SYehuda Sadeh 			     ops,
1123602adf40SYehuda Sadeh 			     num_reply,
11241fec7093SYehuda Sadeh 			     coll, coll_index,
112559c2be1eSYehuda Sadeh 			     rbd_req_cb, 0, NULL);
112611f77002SSage Weil 
112711f77002SSage Weil 	rbd_destroy_ops(ops);
1128602adf40SYehuda Sadeh done:
1129602adf40SYehuda Sadeh 	kfree(seg_name);
1130602adf40SYehuda Sadeh 	return ret;
1131602adf40SYehuda Sadeh }
1132602adf40SYehuda Sadeh 
1133602adf40SYehuda Sadeh /*
1134602adf40SYehuda Sadeh  * Request async osd write
1135602adf40SYehuda Sadeh  */
1136602adf40SYehuda Sadeh static int rbd_req_write(struct request *rq,
1137602adf40SYehuda Sadeh 			 struct rbd_device *rbd_dev,
1138602adf40SYehuda Sadeh 			 struct ceph_snap_context *snapc,
1139602adf40SYehuda Sadeh 			 u64 ofs, u64 len,
11401fec7093SYehuda Sadeh 			 struct bio *bio,
11411fec7093SYehuda Sadeh 			 struct rbd_req_coll *coll,
11421fec7093SYehuda Sadeh 			 int coll_index)
1143602adf40SYehuda Sadeh {
1144602adf40SYehuda Sadeh 	return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1145602adf40SYehuda Sadeh 			 CEPH_OSD_OP_WRITE,
1146602adf40SYehuda Sadeh 			 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1147602adf40SYehuda Sadeh 			 2,
11481fec7093SYehuda Sadeh 			 ofs, len, bio, coll, coll_index);
1149602adf40SYehuda Sadeh }
1150602adf40SYehuda Sadeh 
1151602adf40SYehuda Sadeh /*
1152602adf40SYehuda Sadeh  * Request async osd read
1153602adf40SYehuda Sadeh  */
1154602adf40SYehuda Sadeh static int rbd_req_read(struct request *rq,
1155602adf40SYehuda Sadeh 			 struct rbd_device *rbd_dev,
1156602adf40SYehuda Sadeh 			 u64 snapid,
1157602adf40SYehuda Sadeh 			 u64 ofs, u64 len,
11581fec7093SYehuda Sadeh 			 struct bio *bio,
11591fec7093SYehuda Sadeh 			 struct rbd_req_coll *coll,
11601fec7093SYehuda Sadeh 			 int coll_index)
1161602adf40SYehuda Sadeh {
1162602adf40SYehuda Sadeh 	return rbd_do_op(rq, rbd_dev, NULL,
1163b06e6a6bSJosh Durgin 			 snapid,
1164602adf40SYehuda Sadeh 			 CEPH_OSD_OP_READ,
1165602adf40SYehuda Sadeh 			 CEPH_OSD_FLAG_READ,
1166602adf40SYehuda Sadeh 			 2,
11671fec7093SYehuda Sadeh 			 ofs, len, bio, coll, coll_index);
1168602adf40SYehuda Sadeh }
1169602adf40SYehuda Sadeh 
1170602adf40SYehuda Sadeh /*
1171602adf40SYehuda Sadeh  * Request sync osd read
1172602adf40SYehuda Sadeh  */
11730ce1a794SAlex Elder static int rbd_req_sync_read(struct rbd_device *rbd_dev,
1174602adf40SYehuda Sadeh 			  struct ceph_snap_context *snapc,
1175602adf40SYehuda Sadeh 			  u64 snapid,
1176aded07eaSAlex Elder 			  const char *object_name,
1177602adf40SYehuda Sadeh 			  u64 ofs, u64 len,
117859c2be1eSYehuda Sadeh 			  char *buf,
117959c2be1eSYehuda Sadeh 			  u64 *ver)
1180602adf40SYehuda Sadeh {
11810ce1a794SAlex Elder 	return rbd_req_sync_op(rbd_dev, NULL,
1182b06e6a6bSJosh Durgin 			       snapid,
1183602adf40SYehuda Sadeh 			       CEPH_OSD_OP_READ,
1184602adf40SYehuda Sadeh 			       CEPH_OSD_FLAG_READ,
1185602adf40SYehuda Sadeh 			       NULL,
1186aded07eaSAlex Elder 			       1, object_name, ofs, len, buf, NULL, ver);
1187602adf40SYehuda Sadeh }
1188602adf40SYehuda Sadeh 
1189602adf40SYehuda Sadeh /*
119059c2be1eSYehuda Sadeh  * Request sync osd watch
119159c2be1eSYehuda Sadeh  */
11920ce1a794SAlex Elder static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
119359c2be1eSYehuda Sadeh 				   u64 ver,
119459c2be1eSYehuda Sadeh 				   u64 notify_id,
1195aded07eaSAlex Elder 				   const char *object_name)
119659c2be1eSYehuda Sadeh {
119759c2be1eSYehuda Sadeh 	struct ceph_osd_req_op *ops;
119811f77002SSage Weil 	int ret;
119911f77002SSage Weil 
120011f77002SSage Weil 	ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY_ACK, 0);
120159c2be1eSYehuda Sadeh 	if (ret < 0)
120259c2be1eSYehuda Sadeh 		return ret;
120359c2be1eSYehuda Sadeh 
12040ce1a794SAlex Elder 	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
120559c2be1eSYehuda Sadeh 	ops[0].watch.cookie = notify_id;
120659c2be1eSYehuda Sadeh 	ops[0].watch.flag = 0;
120759c2be1eSYehuda Sadeh 
12080ce1a794SAlex Elder 	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
1209aded07eaSAlex Elder 			  object_name, 0, 0, NULL,
1210ad4f232fSAlex Elder 			  NULL, 0,
121159c2be1eSYehuda Sadeh 			  CEPH_OSD_FLAG_READ,
121259c2be1eSYehuda Sadeh 			  ops,
121359c2be1eSYehuda Sadeh 			  1,
12141fec7093SYehuda Sadeh 			  NULL, 0,
121559c2be1eSYehuda Sadeh 			  rbd_simple_req_cb, 0, NULL);
121659c2be1eSYehuda Sadeh 
121759c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
121859c2be1eSYehuda Sadeh 	return ret;
121959c2be1eSYehuda Sadeh }
122059c2be1eSYehuda Sadeh 
122159c2be1eSYehuda Sadeh static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
122259c2be1eSYehuda Sadeh {
12230ce1a794SAlex Elder 	struct rbd_device *rbd_dev = (struct rbd_device *)data;
122413143d2dSSage Weil 	int rc;
122513143d2dSSage Weil 
12260ce1a794SAlex Elder 	if (!rbd_dev)
122759c2be1eSYehuda Sadeh 		return;
122859c2be1eSYehuda Sadeh 
12290bed54dcSAlex Elder 	dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n",
12300bed54dcSAlex Elder 		rbd_dev->header_name, notify_id, (int) opcode);
123159c2be1eSYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
12320ce1a794SAlex Elder 	rc = __rbd_refresh_header(rbd_dev);
123359c2be1eSYehuda Sadeh 	mutex_unlock(&ctl_mutex);
123413143d2dSSage Weil 	if (rc)
1235f0f8cef5SAlex Elder 		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
12360ce1a794SAlex Elder 			   " update snaps: %d\n", rbd_dev->major, rc);
123759c2be1eSYehuda Sadeh 
12380bed54dcSAlex Elder 	rbd_req_sync_notify_ack(rbd_dev, ver, notify_id, rbd_dev->header_name);
123959c2be1eSYehuda Sadeh }
124059c2be1eSYehuda Sadeh 
124159c2be1eSYehuda Sadeh /*
124259c2be1eSYehuda Sadeh  * Request sync osd watch
124359c2be1eSYehuda Sadeh  */
12440ce1a794SAlex Elder static int rbd_req_sync_watch(struct rbd_device *rbd_dev,
1245aded07eaSAlex Elder 			      const char *object_name,
124659c2be1eSYehuda Sadeh 			      u64 ver)
124759c2be1eSYehuda Sadeh {
124859c2be1eSYehuda Sadeh 	struct ceph_osd_req_op *ops;
12490ce1a794SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
125059c2be1eSYehuda Sadeh 
125159c2be1eSYehuda Sadeh 	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
125259c2be1eSYehuda Sadeh 	if (ret < 0)
125359c2be1eSYehuda Sadeh 		return ret;
125459c2be1eSYehuda Sadeh 
125559c2be1eSYehuda Sadeh 	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
12560ce1a794SAlex Elder 				     (void *)rbd_dev, &rbd_dev->watch_event);
125759c2be1eSYehuda Sadeh 	if (ret < 0)
125859c2be1eSYehuda Sadeh 		goto fail;
125959c2be1eSYehuda Sadeh 
126059c2be1eSYehuda Sadeh 	ops[0].watch.ver = cpu_to_le64(ver);
12610ce1a794SAlex Elder 	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
126259c2be1eSYehuda Sadeh 	ops[0].watch.flag = 1;
126359c2be1eSYehuda Sadeh 
12640ce1a794SAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
126559c2be1eSYehuda Sadeh 			      CEPH_NOSNAP,
126659c2be1eSYehuda Sadeh 			      0,
126759c2be1eSYehuda Sadeh 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
126859c2be1eSYehuda Sadeh 			      ops,
1269aded07eaSAlex Elder 			      1, object_name, 0, 0, NULL,
12700ce1a794SAlex Elder 			      &rbd_dev->watch_request, NULL);
127159c2be1eSYehuda Sadeh 
127259c2be1eSYehuda Sadeh 	if (ret < 0)
127359c2be1eSYehuda Sadeh 		goto fail_event;
127459c2be1eSYehuda Sadeh 
127559c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
127659c2be1eSYehuda Sadeh 	return 0;
127759c2be1eSYehuda Sadeh 
127859c2be1eSYehuda Sadeh fail_event:
12790ce1a794SAlex Elder 	ceph_osdc_cancel_event(rbd_dev->watch_event);
12800ce1a794SAlex Elder 	rbd_dev->watch_event = NULL;
128159c2be1eSYehuda Sadeh fail:
128259c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
128359c2be1eSYehuda Sadeh 	return ret;
128459c2be1eSYehuda Sadeh }
128559c2be1eSYehuda Sadeh 
128679e3057cSYehuda Sadeh /*
128779e3057cSYehuda Sadeh  * Request sync osd unwatch
128879e3057cSYehuda Sadeh  */
12890ce1a794SAlex Elder static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev,
1290aded07eaSAlex Elder 				const char *object_name)
129179e3057cSYehuda Sadeh {
129279e3057cSYehuda Sadeh 	struct ceph_osd_req_op *ops;
129379e3057cSYehuda Sadeh 
129479e3057cSYehuda Sadeh 	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
129579e3057cSYehuda Sadeh 	if (ret < 0)
129679e3057cSYehuda Sadeh 		return ret;
129779e3057cSYehuda Sadeh 
129879e3057cSYehuda Sadeh 	ops[0].watch.ver = 0;
12990ce1a794SAlex Elder 	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
130079e3057cSYehuda Sadeh 	ops[0].watch.flag = 0;
130179e3057cSYehuda Sadeh 
13020ce1a794SAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
130379e3057cSYehuda Sadeh 			      CEPH_NOSNAP,
130479e3057cSYehuda Sadeh 			      0,
130579e3057cSYehuda Sadeh 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
130679e3057cSYehuda Sadeh 			      ops,
1307aded07eaSAlex Elder 			      1, object_name, 0, 0, NULL, NULL, NULL);
130879e3057cSYehuda Sadeh 
130979e3057cSYehuda Sadeh 	rbd_destroy_ops(ops);
13100ce1a794SAlex Elder 	ceph_osdc_cancel_event(rbd_dev->watch_event);
13110ce1a794SAlex Elder 	rbd_dev->watch_event = NULL;
131279e3057cSYehuda Sadeh 	return ret;
131379e3057cSYehuda Sadeh }
131479e3057cSYehuda Sadeh 
/*
 * Context for a notify request: wraps a pointer to the rbd device.
 */
struct rbd_notify_info {
	struct rbd_device *rbd_dev;	/* device the notify is sent for */
};
131859c2be1eSYehuda Sadeh 
131959c2be1eSYehuda Sadeh static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
132059c2be1eSYehuda Sadeh {
13210ce1a794SAlex Elder 	struct rbd_device *rbd_dev = (struct rbd_device *)data;
13220ce1a794SAlex Elder 	if (!rbd_dev)
132359c2be1eSYehuda Sadeh 		return;
132459c2be1eSYehuda Sadeh 
13250ce1a794SAlex Elder 	dout("rbd_notify_cb %s notify_id=%lld opcode=%d\n",
13260bed54dcSAlex Elder 				rbd_dev->header_name,
132759c2be1eSYehuda Sadeh 		notify_id, (int)opcode);
132859c2be1eSYehuda Sadeh }
132959c2be1eSYehuda Sadeh 
133059c2be1eSYehuda Sadeh /*
133159c2be1eSYehuda Sadeh  * Request sync osd notify
133259c2be1eSYehuda Sadeh  */
13330ce1a794SAlex Elder static int rbd_req_sync_notify(struct rbd_device *rbd_dev,
1334aded07eaSAlex Elder 		          const char *object_name)
133559c2be1eSYehuda Sadeh {
133659c2be1eSYehuda Sadeh 	struct ceph_osd_req_op *ops;
13370ce1a794SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
133859c2be1eSYehuda Sadeh 	struct ceph_osd_event *event;
133959c2be1eSYehuda Sadeh 	struct rbd_notify_info info;
134059c2be1eSYehuda Sadeh 	int payload_len = sizeof(u32) + sizeof(u32);
134159c2be1eSYehuda Sadeh 	int ret;
134259c2be1eSYehuda Sadeh 
134359c2be1eSYehuda Sadeh 	ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY, payload_len);
134459c2be1eSYehuda Sadeh 	if (ret < 0)
134559c2be1eSYehuda Sadeh 		return ret;
134659c2be1eSYehuda Sadeh 
13470ce1a794SAlex Elder 	info.rbd_dev = rbd_dev;
134859c2be1eSYehuda Sadeh 
134959c2be1eSYehuda Sadeh 	ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
135059c2be1eSYehuda Sadeh 				     (void *)&info, &event);
135159c2be1eSYehuda Sadeh 	if (ret < 0)
135259c2be1eSYehuda Sadeh 		goto fail;
135359c2be1eSYehuda Sadeh 
135459c2be1eSYehuda Sadeh 	ops[0].watch.ver = 1;
135559c2be1eSYehuda Sadeh 	ops[0].watch.flag = 1;
135659c2be1eSYehuda Sadeh 	ops[0].watch.cookie = event->cookie;
135759c2be1eSYehuda Sadeh 	ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
135859c2be1eSYehuda Sadeh 	ops[0].watch.timeout = 12;
135959c2be1eSYehuda Sadeh 
13600ce1a794SAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
136159c2be1eSYehuda Sadeh 			       CEPH_NOSNAP,
136259c2be1eSYehuda Sadeh 			       0,
136359c2be1eSYehuda Sadeh 			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
136459c2be1eSYehuda Sadeh 			       ops,
1365aded07eaSAlex Elder 			       1, object_name, 0, 0, NULL, NULL, NULL);
136659c2be1eSYehuda Sadeh 	if (ret < 0)
136759c2be1eSYehuda Sadeh 		goto fail_event;
136859c2be1eSYehuda Sadeh 
136959c2be1eSYehuda Sadeh 	ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
137059c2be1eSYehuda Sadeh 	dout("ceph_osdc_wait_event returned %d\n", ret);
137159c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
137259c2be1eSYehuda Sadeh 	return 0;
137359c2be1eSYehuda Sadeh 
137459c2be1eSYehuda Sadeh fail_event:
137559c2be1eSYehuda Sadeh 	ceph_osdc_cancel_event(event);
137659c2be1eSYehuda Sadeh fail:
137759c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
137859c2be1eSYehuda Sadeh 	return ret;
137959c2be1eSYehuda Sadeh }
138059c2be1eSYehuda Sadeh 
138159c2be1eSYehuda Sadeh /*
1382602adf40SYehuda Sadeh  * Request sync osd read
1383602adf40SYehuda Sadeh  */
13840ce1a794SAlex Elder static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
1385aded07eaSAlex Elder 			     const char *object_name,
1386aded07eaSAlex Elder 			     const char *class_name,
1387aded07eaSAlex Elder 			     const char *method_name,
1388602adf40SYehuda Sadeh 			     const char *data,
138959c2be1eSYehuda Sadeh 			     int len,
139059c2be1eSYehuda Sadeh 			     u64 *ver)
1391602adf40SYehuda Sadeh {
1392602adf40SYehuda Sadeh 	struct ceph_osd_req_op *ops;
1393aded07eaSAlex Elder 	int class_name_len = strlen(class_name);
1394aded07eaSAlex Elder 	int method_name_len = strlen(method_name);
1395602adf40SYehuda Sadeh 	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL,
1396aded07eaSAlex Elder 				    class_name_len + method_name_len + len);
1397602adf40SYehuda Sadeh 	if (ret < 0)
1398602adf40SYehuda Sadeh 		return ret;
1399602adf40SYehuda Sadeh 
1400aded07eaSAlex Elder 	ops[0].cls.class_name = class_name;
1401aded07eaSAlex Elder 	ops[0].cls.class_len = (__u8) class_name_len;
1402aded07eaSAlex Elder 	ops[0].cls.method_name = method_name;
1403aded07eaSAlex Elder 	ops[0].cls.method_len = (__u8) method_name_len;
1404602adf40SYehuda Sadeh 	ops[0].cls.argc = 0;
1405602adf40SYehuda Sadeh 	ops[0].cls.indata = data;
1406602adf40SYehuda Sadeh 	ops[0].cls.indata_len = len;
1407602adf40SYehuda Sadeh 
14080ce1a794SAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
1409602adf40SYehuda Sadeh 			       CEPH_NOSNAP,
1410602adf40SYehuda Sadeh 			       0,
1411602adf40SYehuda Sadeh 			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1412602adf40SYehuda Sadeh 			       ops,
1413aded07eaSAlex Elder 			       1, object_name, 0, 0, NULL, NULL, ver);
1414602adf40SYehuda Sadeh 
1415602adf40SYehuda Sadeh 	rbd_destroy_ops(ops);
1416602adf40SYehuda Sadeh 
1417602adf40SYehuda Sadeh 	dout("cls_exec returned %d\n", ret);
1418602adf40SYehuda Sadeh 	return ret;
1419602adf40SYehuda Sadeh }
1420602adf40SYehuda Sadeh 
14211fec7093SYehuda Sadeh static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
14221fec7093SYehuda Sadeh {
14231fec7093SYehuda Sadeh 	struct rbd_req_coll *coll =
14241fec7093SYehuda Sadeh 			kzalloc(sizeof(struct rbd_req_coll) +
14251fec7093SYehuda Sadeh 			        sizeof(struct rbd_req_status) * num_reqs,
14261fec7093SYehuda Sadeh 				GFP_ATOMIC);
14271fec7093SYehuda Sadeh 
14281fec7093SYehuda Sadeh 	if (!coll)
14291fec7093SYehuda Sadeh 		return NULL;
14301fec7093SYehuda Sadeh 	coll->total = num_reqs;
14311fec7093SYehuda Sadeh 	kref_init(&coll->kref);
14321fec7093SYehuda Sadeh 	return coll;
14331fec7093SYehuda Sadeh }
14341fec7093SYehuda Sadeh 
1435602adf40SYehuda Sadeh /*
1436602adf40SYehuda Sadeh  * block device queue callback
1437602adf40SYehuda Sadeh  */
1438602adf40SYehuda Sadeh static void rbd_rq_fn(struct request_queue *q)
1439602adf40SYehuda Sadeh {
1440602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
1441602adf40SYehuda Sadeh 	struct request *rq;
1442602adf40SYehuda Sadeh 	struct bio_pair *bp = NULL;
1443602adf40SYehuda Sadeh 
144400f1f36fSAlex Elder 	while ((rq = blk_fetch_request(q))) {
1445602adf40SYehuda Sadeh 		struct bio *bio;
1446602adf40SYehuda Sadeh 		struct bio *rq_bio, *next_bio = NULL;
1447602adf40SYehuda Sadeh 		bool do_write;
1448602adf40SYehuda Sadeh 		int size, op_size = 0;
1449602adf40SYehuda Sadeh 		u64 ofs;
14501fec7093SYehuda Sadeh 		int num_segs, cur_seg = 0;
14511fec7093SYehuda Sadeh 		struct rbd_req_coll *coll;
1452602adf40SYehuda Sadeh 
1453602adf40SYehuda Sadeh 		/* peek at request from block layer */
1454602adf40SYehuda Sadeh 		if (!rq)
1455602adf40SYehuda Sadeh 			break;
1456602adf40SYehuda Sadeh 
1457602adf40SYehuda Sadeh 		dout("fetched request\n");
1458602adf40SYehuda Sadeh 
1459602adf40SYehuda Sadeh 		/* filter out block requests we don't understand */
1460602adf40SYehuda Sadeh 		if ((rq->cmd_type != REQ_TYPE_FS)) {
1461602adf40SYehuda Sadeh 			__blk_end_request_all(rq, 0);
146200f1f36fSAlex Elder 			continue;
1463602adf40SYehuda Sadeh 		}
1464602adf40SYehuda Sadeh 
1465602adf40SYehuda Sadeh 		/* deduce our operation (read, write) */
1466602adf40SYehuda Sadeh 		do_write = (rq_data_dir(rq) == WRITE);
1467602adf40SYehuda Sadeh 
1468602adf40SYehuda Sadeh 		size = blk_rq_bytes(rq);
1469593a9e7bSAlex Elder 		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
1470602adf40SYehuda Sadeh 		rq_bio = rq->bio;
1471602adf40SYehuda Sadeh 		if (do_write && rbd_dev->read_only) {
1472602adf40SYehuda Sadeh 			__blk_end_request_all(rq, -EROFS);
147300f1f36fSAlex Elder 			continue;
1474602adf40SYehuda Sadeh 		}
1475602adf40SYehuda Sadeh 
1476602adf40SYehuda Sadeh 		spin_unlock_irq(q->queue_lock);
1477602adf40SYehuda Sadeh 
1478602adf40SYehuda Sadeh 		dout("%s 0x%x bytes at 0x%llx\n",
1479602adf40SYehuda Sadeh 		     do_write ? "write" : "read",
1480593a9e7bSAlex Elder 		     size, blk_rq_pos(rq) * SECTOR_SIZE);
1481602adf40SYehuda Sadeh 
14821fec7093SYehuda Sadeh 		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
14831fec7093SYehuda Sadeh 		coll = rbd_alloc_coll(num_segs);
14841fec7093SYehuda Sadeh 		if (!coll) {
14851fec7093SYehuda Sadeh 			spin_lock_irq(q->queue_lock);
14861fec7093SYehuda Sadeh 			__blk_end_request_all(rq, -ENOMEM);
148700f1f36fSAlex Elder 			continue;
14881fec7093SYehuda Sadeh 		}
14891fec7093SYehuda Sadeh 
1490602adf40SYehuda Sadeh 		do {
1491602adf40SYehuda Sadeh 			/* a bio clone to be passed down to OSD req */
1492602adf40SYehuda Sadeh 			dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt);
1493602adf40SYehuda Sadeh 			op_size = rbd_get_segment(&rbd_dev->header,
1494ca1e49a6SAlex Elder 						  rbd_dev->header.object_prefix,
1495602adf40SYehuda Sadeh 						  ofs, size,
1496602adf40SYehuda Sadeh 						  NULL, NULL);
14971fec7093SYehuda Sadeh 			kref_get(&coll->kref);
1498602adf40SYehuda Sadeh 			bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
1499602adf40SYehuda Sadeh 					      op_size, GFP_ATOMIC);
1500602adf40SYehuda Sadeh 			if (!bio) {
15011fec7093SYehuda Sadeh 				rbd_coll_end_req_index(rq, coll, cur_seg,
15021fec7093SYehuda Sadeh 						       -ENOMEM, op_size);
15031fec7093SYehuda Sadeh 				goto next_seg;
1504602adf40SYehuda Sadeh 			}
1505602adf40SYehuda Sadeh 
15061fec7093SYehuda Sadeh 
1507602adf40SYehuda Sadeh 			/* init OSD command: write or read */
1508602adf40SYehuda Sadeh 			if (do_write)
1509602adf40SYehuda Sadeh 				rbd_req_write(rq, rbd_dev,
1510602adf40SYehuda Sadeh 					      rbd_dev->header.snapc,
1511602adf40SYehuda Sadeh 					      ofs,
15121fec7093SYehuda Sadeh 					      op_size, bio,
15131fec7093SYehuda Sadeh 					      coll, cur_seg);
1514602adf40SYehuda Sadeh 			else
1515602adf40SYehuda Sadeh 				rbd_req_read(rq, rbd_dev,
151677dfe99fSJosh Durgin 					     rbd_dev->snap_id,
1517602adf40SYehuda Sadeh 					     ofs,
15181fec7093SYehuda Sadeh 					     op_size, bio,
15191fec7093SYehuda Sadeh 					     coll, cur_seg);
1520602adf40SYehuda Sadeh 
15211fec7093SYehuda Sadeh next_seg:
1522602adf40SYehuda Sadeh 			size -= op_size;
1523602adf40SYehuda Sadeh 			ofs += op_size;
1524602adf40SYehuda Sadeh 
15251fec7093SYehuda Sadeh 			cur_seg++;
1526602adf40SYehuda Sadeh 			rq_bio = next_bio;
1527602adf40SYehuda Sadeh 		} while (size > 0);
15281fec7093SYehuda Sadeh 		kref_put(&coll->kref, rbd_coll_release);
1529602adf40SYehuda Sadeh 
1530602adf40SYehuda Sadeh 		if (bp)
1531602adf40SYehuda Sadeh 			bio_pair_release(bp);
1532602adf40SYehuda Sadeh 		spin_lock_irq(q->queue_lock);
1533602adf40SYehuda Sadeh 	}
1534602adf40SYehuda Sadeh }
1535602adf40SYehuda Sadeh 
1536602adf40SYehuda Sadeh /*
1537602adf40SYehuda Sadeh  * a queue callback. Makes sure that we don't create a bio that spans across
1538602adf40SYehuda Sadeh  * multiple osd objects. One exception would be with a single page bios,
1539602adf40SYehuda Sadeh  * which we handle later at bio_chain_clone
1540602adf40SYehuda Sadeh  */
1541602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1542602adf40SYehuda Sadeh 			  struct bio_vec *bvec)
1543602adf40SYehuda Sadeh {
1544602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
1545593a9e7bSAlex Elder 	unsigned int chunk_sectors;
1546593a9e7bSAlex Elder 	sector_t sector;
1547593a9e7bSAlex Elder 	unsigned int bio_sectors;
1548602adf40SYehuda Sadeh 	int max;
1549602adf40SYehuda Sadeh 
1550593a9e7bSAlex Elder 	chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1551593a9e7bSAlex Elder 	sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
1552593a9e7bSAlex Elder 	bio_sectors = bmd->bi_size >> SECTOR_SHIFT;
1553593a9e7bSAlex Elder 
1554602adf40SYehuda Sadeh 	max =  (chunk_sectors - ((sector & (chunk_sectors - 1))
1555593a9e7bSAlex Elder 				 + bio_sectors)) << SECTOR_SHIFT;
1556602adf40SYehuda Sadeh 	if (max < 0)
1557602adf40SYehuda Sadeh 		max = 0; /* bio_add cannot handle a negative return */
1558602adf40SYehuda Sadeh 	if (max <= bvec->bv_len && bio_sectors == 0)
1559602adf40SYehuda Sadeh 		return bvec->bv_len;
1560602adf40SYehuda Sadeh 	return max;
1561602adf40SYehuda Sadeh }
1562602adf40SYehuda Sadeh 
1563602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
1564602adf40SYehuda Sadeh {
1565602adf40SYehuda Sadeh 	struct gendisk *disk = rbd_dev->disk;
1566602adf40SYehuda Sadeh 
1567602adf40SYehuda Sadeh 	if (!disk)
1568602adf40SYehuda Sadeh 		return;
1569602adf40SYehuda Sadeh 
1570602adf40SYehuda Sadeh 	rbd_header_free(&rbd_dev->header);
1571602adf40SYehuda Sadeh 
1572602adf40SYehuda Sadeh 	if (disk->flags & GENHD_FL_UP)
1573602adf40SYehuda Sadeh 		del_gendisk(disk);
1574602adf40SYehuda Sadeh 	if (disk->queue)
1575602adf40SYehuda Sadeh 		blk_cleanup_queue(disk->queue);
1576602adf40SYehuda Sadeh 	put_disk(disk);
1577602adf40SYehuda Sadeh }
1578602adf40SYehuda Sadeh 
1579602adf40SYehuda Sadeh /*
1580602adf40SYehuda Sadeh  * reload the ondisk the header
1581602adf40SYehuda Sadeh  */
1582602adf40SYehuda Sadeh static int rbd_read_header(struct rbd_device *rbd_dev,
1583602adf40SYehuda Sadeh 			   struct rbd_image_header *header)
1584602adf40SYehuda Sadeh {
1585602adf40SYehuda Sadeh 	ssize_t rc;
1586602adf40SYehuda Sadeh 	struct rbd_image_header_ondisk *dh;
158750f7c4c9SXi Wang 	u32 snap_count = 0;
158859c2be1eSYehuda Sadeh 	u64 ver;
158900f1f36fSAlex Elder 	size_t len;
1590602adf40SYehuda Sadeh 
159100f1f36fSAlex Elder 	/*
159200f1f36fSAlex Elder 	 * First reads the fixed-size header to determine the number
159300f1f36fSAlex Elder 	 * of snapshots, then re-reads it, along with all snapshot
159400f1f36fSAlex Elder 	 * records as well as their stored names.
159500f1f36fSAlex Elder 	 */
159600f1f36fSAlex Elder 	len = sizeof (*dh);
1597602adf40SYehuda Sadeh 	while (1) {
1598602adf40SYehuda Sadeh 		dh = kmalloc(len, GFP_KERNEL);
1599602adf40SYehuda Sadeh 		if (!dh)
1600602adf40SYehuda Sadeh 			return -ENOMEM;
1601602adf40SYehuda Sadeh 
1602602adf40SYehuda Sadeh 		rc = rbd_req_sync_read(rbd_dev,
1603602adf40SYehuda Sadeh 				       NULL, CEPH_NOSNAP,
16040bed54dcSAlex Elder 				       rbd_dev->header_name,
1605602adf40SYehuda Sadeh 				       0, len,
160659c2be1eSYehuda Sadeh 				       (char *)dh, &ver);
1607602adf40SYehuda Sadeh 		if (rc < 0)
1608602adf40SYehuda Sadeh 			goto out_dh;
1609602adf40SYehuda Sadeh 
1610602adf40SYehuda Sadeh 		rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL);
161181e759fbSJosh Durgin 		if (rc < 0) {
161200f1f36fSAlex Elder 			if (rc == -ENXIO)
161381e759fbSJosh Durgin 				pr_warning("unrecognized header format"
16140bed54dcSAlex Elder 					   " for image %s\n",
16150bed54dcSAlex Elder 					   rbd_dev->image_name);
1616602adf40SYehuda Sadeh 			goto out_dh;
161781e759fbSJosh Durgin 		}
1618602adf40SYehuda Sadeh 
161900f1f36fSAlex Elder 		if (snap_count == header->total_snaps)
162000f1f36fSAlex Elder 			break;
162100f1f36fSAlex Elder 
1622602adf40SYehuda Sadeh 		snap_count = header->total_snaps;
162300f1f36fSAlex Elder 		len = sizeof (*dh) +
162400f1f36fSAlex Elder 			snap_count * sizeof(struct rbd_image_snap_ondisk) +
162500f1f36fSAlex Elder 			header->snap_names_len;
162600f1f36fSAlex Elder 
1627602adf40SYehuda Sadeh 		rbd_header_free(header);
1628602adf40SYehuda Sadeh 		kfree(dh);
1629602adf40SYehuda Sadeh 	}
163059c2be1eSYehuda Sadeh 	header->obj_version = ver;
1631602adf40SYehuda Sadeh 
1632602adf40SYehuda Sadeh out_dh:
1633602adf40SYehuda Sadeh 	kfree(dh);
1634602adf40SYehuda Sadeh 	return rc;
1635602adf40SYehuda Sadeh }
1636602adf40SYehuda Sadeh 
1637602adf40SYehuda Sadeh /*
1638602adf40SYehuda Sadeh  * create a snapshot
1639602adf40SYehuda Sadeh  */
16400ce1a794SAlex Elder static int rbd_header_add_snap(struct rbd_device *rbd_dev,
1641602adf40SYehuda Sadeh 			       const char *snap_name,
1642602adf40SYehuda Sadeh 			       gfp_t gfp_flags)
1643602adf40SYehuda Sadeh {
1644602adf40SYehuda Sadeh 	int name_len = strlen(snap_name);
1645602adf40SYehuda Sadeh 	u64 new_snapid;
1646602adf40SYehuda Sadeh 	int ret;
1647916d4d67SSage Weil 	void *data, *p, *e;
164859c2be1eSYehuda Sadeh 	u64 ver;
16491dbb4399SAlex Elder 	struct ceph_mon_client *monc;
1650602adf40SYehuda Sadeh 
1651602adf40SYehuda Sadeh 	/* we should create a snapshot only if we're pointing at the head */
16520ce1a794SAlex Elder 	if (rbd_dev->snap_id != CEPH_NOSNAP)
1653602adf40SYehuda Sadeh 		return -EINVAL;
1654602adf40SYehuda Sadeh 
16550ce1a794SAlex Elder 	monc = &rbd_dev->rbd_client->client->monc;
16560ce1a794SAlex Elder 	ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid);
1657602adf40SYehuda Sadeh 	dout("created snapid=%lld\n", new_snapid);
1658602adf40SYehuda Sadeh 	if (ret < 0)
1659602adf40SYehuda Sadeh 		return ret;
1660602adf40SYehuda Sadeh 
1661602adf40SYehuda Sadeh 	data = kmalloc(name_len + 16, gfp_flags);
1662602adf40SYehuda Sadeh 	if (!data)
1663602adf40SYehuda Sadeh 		return -ENOMEM;
1664602adf40SYehuda Sadeh 
1665916d4d67SSage Weil 	p = data;
1666916d4d67SSage Weil 	e = data + name_len + 16;
1667602adf40SYehuda Sadeh 
1668916d4d67SSage Weil 	ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
1669916d4d67SSage Weil 	ceph_encode_64_safe(&p, e, new_snapid, bad);
1670602adf40SYehuda Sadeh 
16710bed54dcSAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
16720ce1a794SAlex Elder 				"rbd", "snap_add",
1673916d4d67SSage Weil 				data, p - data, &ver);
1674602adf40SYehuda Sadeh 
1675916d4d67SSage Weil 	kfree(data);
1676602adf40SYehuda Sadeh 
1677602adf40SYehuda Sadeh 	if (ret < 0)
1678602adf40SYehuda Sadeh 		return ret;
1679602adf40SYehuda Sadeh 
16800ce1a794SAlex Elder 	down_write(&rbd_dev->header_rwsem);
16810ce1a794SAlex Elder 	rbd_dev->header.snapc->seq = new_snapid;
16820ce1a794SAlex Elder 	up_write(&rbd_dev->header_rwsem);
1683602adf40SYehuda Sadeh 
1684602adf40SYehuda Sadeh 	return 0;
1685602adf40SYehuda Sadeh bad:
1686602adf40SYehuda Sadeh 	return -ERANGE;
1687602adf40SYehuda Sadeh }
1688602adf40SYehuda Sadeh 
1689dfc5606dSYehuda Sadeh static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1690dfc5606dSYehuda Sadeh {
1691dfc5606dSYehuda Sadeh 	struct rbd_snap *snap;
1692dfc5606dSYehuda Sadeh 
1693dfc5606dSYehuda Sadeh 	while (!list_empty(&rbd_dev->snaps)) {
1694dfc5606dSYehuda Sadeh 		snap = list_first_entry(&rbd_dev->snaps, struct rbd_snap, node);
1695dfc5606dSYehuda Sadeh 		__rbd_remove_snap_dev(rbd_dev, snap);
1696dfc5606dSYehuda Sadeh 	}
1697dfc5606dSYehuda Sadeh }
1698dfc5606dSYehuda Sadeh 
1699602adf40SYehuda Sadeh /*
1700602adf40SYehuda Sadeh  * only read the first part of the ondisk header, without the snaps info
1701602adf40SYehuda Sadeh  */
1702263c6ca0SJosh Durgin static int __rbd_refresh_header(struct rbd_device *rbd_dev)
1703602adf40SYehuda Sadeh {
1704602adf40SYehuda Sadeh 	int ret;
1705602adf40SYehuda Sadeh 	struct rbd_image_header h;
1706602adf40SYehuda Sadeh 	u64 snap_seq;
170759c2be1eSYehuda Sadeh 	int follow_seq = 0;
1708602adf40SYehuda Sadeh 
1709602adf40SYehuda Sadeh 	ret = rbd_read_header(rbd_dev, &h);
1710602adf40SYehuda Sadeh 	if (ret < 0)
1711602adf40SYehuda Sadeh 		return ret;
1712602adf40SYehuda Sadeh 
17139db4b3e3SSage Weil 	/* resized? */
1714593a9e7bSAlex Elder 	set_capacity(rbd_dev->disk, h.image_size / SECTOR_SIZE);
17159db4b3e3SSage Weil 
1716c666601aSJosh Durgin 	down_write(&rbd_dev->header_rwsem);
1717602adf40SYehuda Sadeh 
1718602adf40SYehuda Sadeh 	snap_seq = rbd_dev->header.snapc->seq;
171959c2be1eSYehuda Sadeh 	if (rbd_dev->header.total_snaps &&
172059c2be1eSYehuda Sadeh 	    rbd_dev->header.snapc->snaps[0] == snap_seq)
172159c2be1eSYehuda Sadeh 		/* pointing at the head, will need to follow that
172259c2be1eSYehuda Sadeh 		   if head moves */
172359c2be1eSYehuda Sadeh 		follow_seq = 1;
1724602adf40SYehuda Sadeh 
1725849b4260SAlex Elder 	/* rbd_dev->header.object_prefix shouldn't change */
1726602adf40SYehuda Sadeh 	kfree(rbd_dev->header.snap_sizes);
1727849b4260SAlex Elder 	kfree(rbd_dev->header.snap_names);
1728849b4260SAlex Elder 	kfree(rbd_dev->header.snapc);
1729602adf40SYehuda Sadeh 
1730602adf40SYehuda Sadeh 	rbd_dev->header.total_snaps = h.total_snaps;
1731602adf40SYehuda Sadeh 	rbd_dev->header.snapc = h.snapc;
1732602adf40SYehuda Sadeh 	rbd_dev->header.snap_names = h.snap_names;
1733dfc5606dSYehuda Sadeh 	rbd_dev->header.snap_names_len = h.snap_names_len;
1734602adf40SYehuda Sadeh 	rbd_dev->header.snap_sizes = h.snap_sizes;
1735849b4260SAlex Elder 	/* Free the extra copy of the object prefix */
1736849b4260SAlex Elder 	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1737849b4260SAlex Elder 	kfree(h.object_prefix);
1738849b4260SAlex Elder 
173959c2be1eSYehuda Sadeh 	if (follow_seq)
174059c2be1eSYehuda Sadeh 		rbd_dev->header.snapc->seq = rbd_dev->header.snapc->snaps[0];
174159c2be1eSYehuda Sadeh 	else
1742602adf40SYehuda Sadeh 		rbd_dev->header.snapc->seq = snap_seq;
1743602adf40SYehuda Sadeh 
1744dfc5606dSYehuda Sadeh 	ret = __rbd_init_snaps_header(rbd_dev);
1745dfc5606dSYehuda Sadeh 
1746c666601aSJosh Durgin 	up_write(&rbd_dev->header_rwsem);
1747602adf40SYehuda Sadeh 
1748dfc5606dSYehuda Sadeh 	return ret;
1749602adf40SYehuda Sadeh }
1750602adf40SYehuda Sadeh 
1751602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
1752602adf40SYehuda Sadeh {
1753602adf40SYehuda Sadeh 	struct gendisk *disk;
1754602adf40SYehuda Sadeh 	struct request_queue *q;
1755602adf40SYehuda Sadeh 	int rc;
1756593a9e7bSAlex Elder 	u64 segment_size;
1757602adf40SYehuda Sadeh 	u64 total_size = 0;
1758602adf40SYehuda Sadeh 
1759602adf40SYehuda Sadeh 	/* contact OSD, request size info about the object being mapped */
1760602adf40SYehuda Sadeh 	rc = rbd_read_header(rbd_dev, &rbd_dev->header);
1761602adf40SYehuda Sadeh 	if (rc)
1762602adf40SYehuda Sadeh 		return rc;
1763602adf40SYehuda Sadeh 
1764dfc5606dSYehuda Sadeh 	/* no need to lock here, as rbd_dev is not registered yet */
1765dfc5606dSYehuda Sadeh 	rc = __rbd_init_snaps_header(rbd_dev);
1766dfc5606dSYehuda Sadeh 	if (rc)
1767dfc5606dSYehuda Sadeh 		return rc;
1768dfc5606dSYehuda Sadeh 
1769cc9d734cSJosh Durgin 	rc = rbd_header_set_snap(rbd_dev, &total_size);
1770602adf40SYehuda Sadeh 	if (rc)
1771602adf40SYehuda Sadeh 		return rc;
1772602adf40SYehuda Sadeh 
1773602adf40SYehuda Sadeh 	/* create gendisk info */
1774602adf40SYehuda Sadeh 	rc = -ENOMEM;
1775602adf40SYehuda Sadeh 	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1776602adf40SYehuda Sadeh 	if (!disk)
1777602adf40SYehuda Sadeh 		goto out;
1778602adf40SYehuda Sadeh 
1779f0f8cef5SAlex Elder 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
1780aedfec59SSage Weil 		 rbd_dev->id);
1781602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
1782602adf40SYehuda Sadeh 	disk->first_minor = 0;
1783602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
1784602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
1785602adf40SYehuda Sadeh 
1786602adf40SYehuda Sadeh 	/* init rq */
1787602adf40SYehuda Sadeh 	rc = -ENOMEM;
1788602adf40SYehuda Sadeh 	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1789602adf40SYehuda Sadeh 	if (!q)
1790602adf40SYehuda Sadeh 		goto out_disk;
1791029bcbd8SJosh Durgin 
1792593a9e7bSAlex Elder 	/* We use the default size, but let's be explicit about it. */
1793593a9e7bSAlex Elder 	blk_queue_physical_block_size(q, SECTOR_SIZE);
1794593a9e7bSAlex Elder 
1795029bcbd8SJosh Durgin 	/* set io sizes to object size */
1796593a9e7bSAlex Elder 	segment_size = rbd_obj_bytes(&rbd_dev->header);
1797593a9e7bSAlex Elder 	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
1798593a9e7bSAlex Elder 	blk_queue_max_segment_size(q, segment_size);
1799593a9e7bSAlex Elder 	blk_queue_io_min(q, segment_size);
1800593a9e7bSAlex Elder 	blk_queue_io_opt(q, segment_size);
1801029bcbd8SJosh Durgin 
1802602adf40SYehuda Sadeh 	blk_queue_merge_bvec(q, rbd_merge_bvec);
1803602adf40SYehuda Sadeh 	disk->queue = q;
1804602adf40SYehuda Sadeh 
1805602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
1806602adf40SYehuda Sadeh 
1807602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
1808602adf40SYehuda Sadeh 	rbd_dev->q = q;
1809602adf40SYehuda Sadeh 
1810602adf40SYehuda Sadeh 	/* finally, announce the disk to the world */
1811593a9e7bSAlex Elder 	set_capacity(disk, total_size / SECTOR_SIZE);
1812602adf40SYehuda Sadeh 	add_disk(disk);
1813602adf40SYehuda Sadeh 
1814602adf40SYehuda Sadeh 	pr_info("%s: added with size 0x%llx\n",
1815602adf40SYehuda Sadeh 		disk->disk_name, (unsigned long long)total_size);
1816602adf40SYehuda Sadeh 	return 0;
1817602adf40SYehuda Sadeh 
1818602adf40SYehuda Sadeh out_disk:
1819602adf40SYehuda Sadeh 	put_disk(disk);
1820602adf40SYehuda Sadeh out:
1821602adf40SYehuda Sadeh 	return rc;
1822602adf40SYehuda Sadeh }
1823602adf40SYehuda Sadeh 
1824dfc5606dSYehuda Sadeh /*
1825dfc5606dSYehuda Sadeh   sysfs
1826dfc5606dSYehuda Sadeh */
1827602adf40SYehuda Sadeh 
1828593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
1829593a9e7bSAlex Elder {
1830593a9e7bSAlex Elder 	return container_of(dev, struct rbd_device, dev);
1831593a9e7bSAlex Elder }
1832593a9e7bSAlex Elder 
1833dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
1834dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
1835602adf40SYehuda Sadeh {
1836593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1837dfc5606dSYehuda Sadeh 
1838dfc5606dSYehuda Sadeh 	return sprintf(buf, "%llu\n", (unsigned long long)rbd_dev->header.image_size);
1839602adf40SYehuda Sadeh }
1840602adf40SYehuda Sadeh 
1841dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
1842dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
1843602adf40SYehuda Sadeh {
1844593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1845dfc5606dSYehuda Sadeh 
1846dfc5606dSYehuda Sadeh 	return sprintf(buf, "%d\n", rbd_dev->major);
1847dfc5606dSYehuda Sadeh }
1848dfc5606dSYehuda Sadeh 
1849dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
1850dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
1851dfc5606dSYehuda Sadeh {
1852593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1853dfc5606dSYehuda Sadeh 
18541dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
18551dbb4399SAlex Elder 			ceph_client_id(rbd_dev->rbd_client->client));
1856dfc5606dSYehuda Sadeh }
1857dfc5606dSYehuda Sadeh 
1858dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
1859dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
1860dfc5606dSYehuda Sadeh {
1861593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1862dfc5606dSYehuda Sadeh 
1863dfc5606dSYehuda Sadeh 	return sprintf(buf, "%s\n", rbd_dev->pool_name);
1864dfc5606dSYehuda Sadeh }
1865dfc5606dSYehuda Sadeh 
18669bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
18679bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
18689bb2f334SAlex Elder {
18699bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
18709bb2f334SAlex Elder 
18719bb2f334SAlex Elder 	return sprintf(buf, "%d\n", rbd_dev->pool_id);
18729bb2f334SAlex Elder }
18739bb2f334SAlex Elder 
1874dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
1875dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
1876dfc5606dSYehuda Sadeh {
1877593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1878dfc5606dSYehuda Sadeh 
18790bed54dcSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->image_name);
1880dfc5606dSYehuda Sadeh }
1881dfc5606dSYehuda Sadeh 
1882dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
1883dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
1884dfc5606dSYehuda Sadeh 			     char *buf)
1885dfc5606dSYehuda Sadeh {
1886593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1887dfc5606dSYehuda Sadeh 
1888dfc5606dSYehuda Sadeh 	return sprintf(buf, "%s\n", rbd_dev->snap_name);
1889dfc5606dSYehuda Sadeh }
1890dfc5606dSYehuda Sadeh 
1891dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
1892dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
1893dfc5606dSYehuda Sadeh 				 const char *buf,
1894dfc5606dSYehuda Sadeh 				 size_t size)
1895dfc5606dSYehuda Sadeh {
1896593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1897dfc5606dSYehuda Sadeh 	int rc;
1898dfc5606dSYehuda Sadeh 	int ret = size;
1899602adf40SYehuda Sadeh 
1900602adf40SYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1901602adf40SYehuda Sadeh 
1902263c6ca0SJosh Durgin 	rc = __rbd_refresh_header(rbd_dev);
1903dfc5606dSYehuda Sadeh 	if (rc < 0)
1904dfc5606dSYehuda Sadeh 		ret = rc;
1905602adf40SYehuda Sadeh 
1906dfc5606dSYehuda Sadeh 	mutex_unlock(&ctl_mutex);
1907dfc5606dSYehuda Sadeh 	return ret;
1908dfc5606dSYehuda Sadeh }
1909602adf40SYehuda Sadeh 
1910dfc5606dSYehuda Sadeh static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
1911dfc5606dSYehuda Sadeh static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
1912dfc5606dSYehuda Sadeh static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
1913dfc5606dSYehuda Sadeh static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
19149bb2f334SAlex Elder static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
1915dfc5606dSYehuda Sadeh static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
1916dfc5606dSYehuda Sadeh static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
1917dfc5606dSYehuda Sadeh static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
1918dfc5606dSYehuda Sadeh static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);
1919dfc5606dSYehuda Sadeh 
1920dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = {
1921dfc5606dSYehuda Sadeh 	&dev_attr_size.attr,
1922dfc5606dSYehuda Sadeh 	&dev_attr_major.attr,
1923dfc5606dSYehuda Sadeh 	&dev_attr_client_id.attr,
1924dfc5606dSYehuda Sadeh 	&dev_attr_pool.attr,
19259bb2f334SAlex Elder 	&dev_attr_pool_id.attr,
1926dfc5606dSYehuda Sadeh 	&dev_attr_name.attr,
1927dfc5606dSYehuda Sadeh 	&dev_attr_current_snap.attr,
1928dfc5606dSYehuda Sadeh 	&dev_attr_refresh.attr,
1929dfc5606dSYehuda Sadeh 	&dev_attr_create_snap.attr,
1930dfc5606dSYehuda Sadeh 	NULL
1931dfc5606dSYehuda Sadeh };
1932dfc5606dSYehuda Sadeh 
1933dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = {
1934dfc5606dSYehuda Sadeh 	.attrs = rbd_attrs,
1935dfc5606dSYehuda Sadeh };
1936dfc5606dSYehuda Sadeh 
1937dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = {
1938dfc5606dSYehuda Sadeh 	&rbd_attr_group,
1939dfc5606dSYehuda Sadeh 	NULL
1940dfc5606dSYehuda Sadeh };
1941dfc5606dSYehuda Sadeh 
/* Release hook for the rbd device type; performs no cleanup here. */
static void rbd_sysfs_dev_release(struct device *dev)
{
}
1945dfc5606dSYehuda Sadeh 
1946dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = {
1947dfc5606dSYehuda Sadeh 	.name		= "rbd",
1948dfc5606dSYehuda Sadeh 	.groups		= rbd_attr_groups,
1949dfc5606dSYehuda Sadeh 	.release	= rbd_sysfs_dev_release,
1950dfc5606dSYehuda Sadeh };
1951dfc5606dSYehuda Sadeh 
1952dfc5606dSYehuda Sadeh 
1953dfc5606dSYehuda Sadeh /*
1954dfc5606dSYehuda Sadeh   sysfs - snapshots
1955dfc5606dSYehuda Sadeh */
1956dfc5606dSYehuda Sadeh 
1957dfc5606dSYehuda Sadeh static ssize_t rbd_snap_size_show(struct device *dev,
1958dfc5606dSYehuda Sadeh 				  struct device_attribute *attr,
1959dfc5606dSYehuda Sadeh 				  char *buf)
1960dfc5606dSYehuda Sadeh {
1961dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1962dfc5606dSYehuda Sadeh 
19633591538fSJosh Durgin 	return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
1964dfc5606dSYehuda Sadeh }
1965dfc5606dSYehuda Sadeh 
1966dfc5606dSYehuda Sadeh static ssize_t rbd_snap_id_show(struct device *dev,
1967dfc5606dSYehuda Sadeh 				struct device_attribute *attr,
1968dfc5606dSYehuda Sadeh 				char *buf)
1969dfc5606dSYehuda Sadeh {
1970dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1971dfc5606dSYehuda Sadeh 
1972593a9e7bSAlex Elder 	return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
1973dfc5606dSYehuda Sadeh }
1974dfc5606dSYehuda Sadeh 
1975dfc5606dSYehuda Sadeh static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
1976dfc5606dSYehuda Sadeh static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
1977dfc5606dSYehuda Sadeh 
1978dfc5606dSYehuda Sadeh static struct attribute *rbd_snap_attrs[] = {
1979dfc5606dSYehuda Sadeh 	&dev_attr_snap_size.attr,
1980dfc5606dSYehuda Sadeh 	&dev_attr_snap_id.attr,
1981dfc5606dSYehuda Sadeh 	NULL,
1982dfc5606dSYehuda Sadeh };
1983dfc5606dSYehuda Sadeh 
1984dfc5606dSYehuda Sadeh static struct attribute_group rbd_snap_attr_group = {
1985dfc5606dSYehuda Sadeh 	.attrs = rbd_snap_attrs,
1986dfc5606dSYehuda Sadeh };
1987dfc5606dSYehuda Sadeh 
1988dfc5606dSYehuda Sadeh static void rbd_snap_dev_release(struct device *dev)
1989dfc5606dSYehuda Sadeh {
1990dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1991dfc5606dSYehuda Sadeh 	kfree(snap->name);
1992dfc5606dSYehuda Sadeh 	kfree(snap);
1993dfc5606dSYehuda Sadeh }
1994dfc5606dSYehuda Sadeh 
1995dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_snap_attr_groups[] = {
1996dfc5606dSYehuda Sadeh 	&rbd_snap_attr_group,
1997dfc5606dSYehuda Sadeh 	NULL
1998dfc5606dSYehuda Sadeh };
1999dfc5606dSYehuda Sadeh 
2000dfc5606dSYehuda Sadeh static struct device_type rbd_snap_device_type = {
2001dfc5606dSYehuda Sadeh 	.groups		= rbd_snap_attr_groups,
2002dfc5606dSYehuda Sadeh 	.release	= rbd_snap_dev_release,
2003dfc5606dSYehuda Sadeh };
2004dfc5606dSYehuda Sadeh 
2005dfc5606dSYehuda Sadeh static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
2006dfc5606dSYehuda Sadeh 				  struct rbd_snap *snap)
2007dfc5606dSYehuda Sadeh {
2008dfc5606dSYehuda Sadeh 	list_del(&snap->node);
2009dfc5606dSYehuda Sadeh 	device_unregister(&snap->dev);
2010dfc5606dSYehuda Sadeh }
2011dfc5606dSYehuda Sadeh 
2012dfc5606dSYehuda Sadeh static int rbd_register_snap_dev(struct rbd_device *rbd_dev,
2013dfc5606dSYehuda Sadeh 				  struct rbd_snap *snap,
2014dfc5606dSYehuda Sadeh 				  struct device *parent)
2015dfc5606dSYehuda Sadeh {
2016dfc5606dSYehuda Sadeh 	struct device *dev = &snap->dev;
2017dfc5606dSYehuda Sadeh 	int ret;
2018dfc5606dSYehuda Sadeh 
2019dfc5606dSYehuda Sadeh 	dev->type = &rbd_snap_device_type;
2020dfc5606dSYehuda Sadeh 	dev->parent = parent;
2021dfc5606dSYehuda Sadeh 	dev->release = rbd_snap_dev_release;
2022dfc5606dSYehuda Sadeh 	dev_set_name(dev, "snap_%s", snap->name);
2023dfc5606dSYehuda Sadeh 	ret = device_register(dev);
2024dfc5606dSYehuda Sadeh 
2025dfc5606dSYehuda Sadeh 	return ret;
2026dfc5606dSYehuda Sadeh }
2027dfc5606dSYehuda Sadeh 
2028dfc5606dSYehuda Sadeh static int __rbd_add_snap_dev(struct rbd_device *rbd_dev,
2029dfc5606dSYehuda Sadeh 			      int i, const char *name,
2030dfc5606dSYehuda Sadeh 			      struct rbd_snap **snapp)
2031dfc5606dSYehuda Sadeh {
2032dfc5606dSYehuda Sadeh 	int ret;
2033dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = kzalloc(sizeof(*snap), GFP_KERNEL);
2034dfc5606dSYehuda Sadeh 	if (!snap)
2035dfc5606dSYehuda Sadeh 		return -ENOMEM;
2036dfc5606dSYehuda Sadeh 	snap->name = kstrdup(name, GFP_KERNEL);
2037dfc5606dSYehuda Sadeh 	snap->size = rbd_dev->header.snap_sizes[i];
2038dfc5606dSYehuda Sadeh 	snap->id = rbd_dev->header.snapc->snaps[i];
2039dfc5606dSYehuda Sadeh 	if (device_is_registered(&rbd_dev->dev)) {
2040dfc5606dSYehuda Sadeh 		ret = rbd_register_snap_dev(rbd_dev, snap,
2041dfc5606dSYehuda Sadeh 					     &rbd_dev->dev);
2042dfc5606dSYehuda Sadeh 		if (ret < 0)
2043dfc5606dSYehuda Sadeh 			goto err;
2044dfc5606dSYehuda Sadeh 	}
2045dfc5606dSYehuda Sadeh 	*snapp = snap;
2046dfc5606dSYehuda Sadeh 	return 0;
2047dfc5606dSYehuda Sadeh err:
2048dfc5606dSYehuda Sadeh 	kfree(snap->name);
2049dfc5606dSYehuda Sadeh 	kfree(snap);
2050dfc5606dSYehuda Sadeh 	return ret;
2051dfc5606dSYehuda Sadeh }
2052dfc5606dSYehuda Sadeh 
/*
 * Step backward through a list of '\0'-terminated strings packed
 * back to back in one buffer.
 *
 * @name points at the current entry (or one past the end of the
 * buffer) and @start at the first entry.  Returns a pointer to the
 * entry preceding @name, or NULL when @name is less than two bytes
 * beyond @start (i.e. there is no room for a previous entry).
 */
const char *rbd_prev_snap_name(const char *name, const char *start)
{
	const char *cursor;

	if (name < start + 2)
		return NULL;

	/* Skip the previous entry's terminator, then scan to its start */
	for (cursor = name - 2; *cursor != '\0'; cursor--) {
		if (cursor == start)
			return start;
	}

	/* cursor sits on the terminator of the entry before the result */
	return cursor + 1;
}
2069dfc5606dSYehuda Sadeh 
2070dfc5606dSYehuda Sadeh /*
2071dfc5606dSYehuda Sadeh  * compare the old list of snapshots that we have to what's in the header
2072dfc5606dSYehuda Sadeh  * and update it accordingly. Note that the header holds the snapshots
2073dfc5606dSYehuda Sadeh  * in a reverse order (from newest to oldest) and we need to go from
2074dfc5606dSYehuda Sadeh  * older to new so that we don't get a duplicate snap name when
2075dfc5606dSYehuda Sadeh  * doing the process (e.g., removed snapshot and recreated a new
2076dfc5606dSYehuda Sadeh  * one with the same name).
 *
 * Returns 0 on success, -EINVAL (after a WARN_ON) if the snapshot name
 * buffer runs out before the snapshot count does, or the negative
 * error returned by __rbd_add_snap_dev().  On an error return the
 * snapshot list is left as modified so far.
2077dfc5606dSYehuda Sadeh  */
2078dfc5606dSYehuda Sadeh static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
2079dfc5606dSYehuda Sadeh {
2080dfc5606dSYehuda Sadeh 	const char *name, *first_name;
	/* i counts the header snapshots not yet processed; entry is i - 1 */
2081dfc5606dSYehuda Sadeh 	int i = rbd_dev->header.total_snaps;
2082dfc5606dSYehuda Sadeh 	struct rbd_snap *snap, *old_snap = NULL;
2083dfc5606dSYehuda Sadeh 	int ret;
2084dfc5606dSYehuda Sadeh 	struct list_head *p, *n;
2085dfc5606dSYehuda Sadeh 
	/*
	 * name starts one past the end of the packed name buffer and is
	 * stepped backward with rbd_prev_snap_name() as i decreases.
	 */
2086dfc5606dSYehuda Sadeh 	first_name = rbd_dev->header.snap_names;
2087dfc5606dSYehuda Sadeh 	name = first_name + rbd_dev->header.snap_names_len;
2088dfc5606dSYehuda Sadeh 
	/* Walk the existing snapshot list from its tail toward its head */
2089dfc5606dSYehuda Sadeh 	list_for_each_prev_safe(p, n, &rbd_dev->snaps) {
2090dfc5606dSYehuda Sadeh 		u64 cur_id;
2091dfc5606dSYehuda Sadeh 
2092dfc5606dSYehuda Sadeh 		old_snap = list_entry(p, struct rbd_snap, node);
2093dfc5606dSYehuda Sadeh 
		/*
		 * cur_id is only assigned when i is non-zero; the "!i ||"
		 * short-circuit below keeps it from being read otherwise.
		 */
2094dfc5606dSYehuda Sadeh 		if (i)
2095dfc5606dSYehuda Sadeh 			cur_id = rbd_dev->header.snapc->snaps[i - 1];
2096dfc5606dSYehuda Sadeh 
2097dfc5606dSYehuda Sadeh 		if (!i || old_snap->id < cur_id) {
2098dfc5606dSYehuda Sadeh 			/* old_snap->id was skipped, thus was removed */
2099dfc5606dSYehuda Sadeh 			__rbd_remove_snap_dev(rbd_dev, old_snap);
2100dfc5606dSYehuda Sadeh 			continue;
2101dfc5606dSYehuda Sadeh 		}
2102dfc5606dSYehuda Sadeh 		if (old_snap->id == cur_id) {
2103dfc5606dSYehuda Sadeh 			/* we have this snapshot already */
2104dfc5606dSYehuda Sadeh 			i--;
2105dfc5606dSYehuda Sadeh 			name = rbd_prev_snap_name(name, first_name);
2106dfc5606dSYehuda Sadeh 			continue;
2107dfc5606dSYehuda Sadeh 		}
		/*
		 * old_snap->id is larger than the current header id: add
		 * header snapshots until one at or beyond old_snap->id.
		 *
		 * NOTE(review): name is only stepped back in the loop
		 * increment, after the body has used it; on the first
		 * pass it may still be the value left by earlier code
		 * (initially one past the end of the buffer) -- confirm
		 * a valid name reaches __rbd_add_snap_dev().
		 */
2108dfc5606dSYehuda Sadeh 		for (; i > 0;
2109dfc5606dSYehuda Sadeh 		     i--, name = rbd_prev_snap_name(name, first_name)) {
2110dfc5606dSYehuda Sadeh 			if (!name) {
2111dfc5606dSYehuda Sadeh 				WARN_ON(1);
2112dfc5606dSYehuda Sadeh 				return -EINVAL;
2113dfc5606dSYehuda Sadeh 			}
			/*
			 * NOTE(review): this reads snaps[i] while every
			 * other access in this function uses i - 1; with
			 * i == total_snaps that looks like one past the
			 * last entry -- confirm the intended index.
			 */
2114dfc5606dSYehuda Sadeh 			cur_id = rbd_dev->header.snapc->snaps[i];
2115dfc5606dSYehuda Sadeh 			/* snapshot removal? handle it above */
2116dfc5606dSYehuda Sadeh 			if (cur_id >= old_snap->id)
2117dfc5606dSYehuda Sadeh 				break;
2118dfc5606dSYehuda Sadeh 			/* a new snapshot */
2119dfc5606dSYehuda Sadeh 			ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
2120dfc5606dSYehuda Sadeh 			if (ret < 0)
2121dfc5606dSYehuda Sadeh 				return ret;
2122dfc5606dSYehuda Sadeh 
2123dfc5606dSYehuda Sadeh 			/* note that we add it backward so using n and not p */
2124dfc5606dSYehuda Sadeh 			list_add(&snap->node, n);
2125dfc5606dSYehuda Sadeh 			p = &snap->node;
2126dfc5606dSYehuda Sadeh 		}
2127dfc5606dSYehuda Sadeh 	}
2128dfc5606dSYehuda Sadeh 	/* we're done going over the old snap list, just add what's left */
2129dfc5606dSYehuda Sadeh 	for (; i > 0; i--) {
2130dfc5606dSYehuda Sadeh 		name = rbd_prev_snap_name(name, first_name);
2131dfc5606dSYehuda Sadeh 		if (!name) {
2132dfc5606dSYehuda Sadeh 			WARN_ON(1);
2133dfc5606dSYehuda Sadeh 			return -EINVAL;
2134dfc5606dSYehuda Sadeh 		}
2135dfc5606dSYehuda Sadeh 		ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
2136dfc5606dSYehuda Sadeh 		if (ret < 0)
2137dfc5606dSYehuda Sadeh 			return ret;
		/* list_add() inserts at the head of the snapshot list */
2138dfc5606dSYehuda Sadeh 		list_add(&snap->node, &rbd_dev->snaps);
2139dfc5606dSYehuda Sadeh 	}
2140dfc5606dSYehuda Sadeh 
2141dfc5606dSYehuda Sadeh 	return 0;
2142dfc5606dSYehuda Sadeh }
2143dfc5606dSYehuda Sadeh 
2144dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2145dfc5606dSYehuda Sadeh {
2146f0f8cef5SAlex Elder 	int ret;
2147dfc5606dSYehuda Sadeh 	struct device *dev;
2148dfc5606dSYehuda Sadeh 	struct rbd_snap *snap;
2149dfc5606dSYehuda Sadeh 
2150dfc5606dSYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2151dfc5606dSYehuda Sadeh 	dev = &rbd_dev->dev;
2152dfc5606dSYehuda Sadeh 
2153dfc5606dSYehuda Sadeh 	dev->bus = &rbd_bus_type;
2154dfc5606dSYehuda Sadeh 	dev->type = &rbd_device_type;
2155dfc5606dSYehuda Sadeh 	dev->parent = &rbd_root_dev;
2156dfc5606dSYehuda Sadeh 	dev->release = rbd_dev_release;
2157dfc5606dSYehuda Sadeh 	dev_set_name(dev, "%d", rbd_dev->id);
2158dfc5606dSYehuda Sadeh 	ret = device_register(dev);
2159dfc5606dSYehuda Sadeh 	if (ret < 0)
2160f0f8cef5SAlex Elder 		goto out;
2161dfc5606dSYehuda Sadeh 
2162dfc5606dSYehuda Sadeh 	list_for_each_entry(snap, &rbd_dev->snaps, node) {
2163dfc5606dSYehuda Sadeh 		ret = rbd_register_snap_dev(rbd_dev, snap,
2164dfc5606dSYehuda Sadeh 					     &rbd_dev->dev);
2165dfc5606dSYehuda Sadeh 		if (ret < 0)
2166602adf40SYehuda Sadeh 			break;
2167602adf40SYehuda Sadeh 	}
2168f0f8cef5SAlex Elder out:
2169dfc5606dSYehuda Sadeh 	mutex_unlock(&ctl_mutex);
2170dfc5606dSYehuda Sadeh 	return ret;
2171602adf40SYehuda Sadeh }
2172602adf40SYehuda Sadeh 
2173dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
2174dfc5606dSYehuda Sadeh {
2175dfc5606dSYehuda Sadeh 	device_unregister(&rbd_dev->dev);
2176dfc5606dSYehuda Sadeh }
2177dfc5606dSYehuda Sadeh 
217859c2be1eSYehuda Sadeh static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
217959c2be1eSYehuda Sadeh {
218059c2be1eSYehuda Sadeh 	int ret, rc;
218159c2be1eSYehuda Sadeh 
218259c2be1eSYehuda Sadeh 	do {
21830bed54dcSAlex Elder 		ret = rbd_req_sync_watch(rbd_dev, rbd_dev->header_name,
218459c2be1eSYehuda Sadeh 					 rbd_dev->header.obj_version);
218559c2be1eSYehuda Sadeh 		if (ret == -ERANGE) {
218659c2be1eSYehuda Sadeh 			mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2187263c6ca0SJosh Durgin 			rc = __rbd_refresh_header(rbd_dev);
218859c2be1eSYehuda Sadeh 			mutex_unlock(&ctl_mutex);
218959c2be1eSYehuda Sadeh 			if (rc < 0)
219059c2be1eSYehuda Sadeh 				return rc;
219159c2be1eSYehuda Sadeh 		}
219259c2be1eSYehuda Sadeh 	} while (ret == -ERANGE);
219359c2be1eSYehuda Sadeh 
219459c2be1eSYehuda Sadeh 	return ret;
219559c2be1eSYehuda Sadeh }
219659c2be1eSYehuda Sadeh 
21971ddbe94eSAlex Elder static atomic64_t rbd_id_max = ATOMIC64_INIT(0);
21981ddbe94eSAlex Elder 
21991ddbe94eSAlex Elder /*
2200499afd5bSAlex Elder  * Get a unique rbd identifier for the given new rbd_dev, and add
2201499afd5bSAlex Elder  * the rbd_dev to the global list.  The minimum rbd id is 1.
22021ddbe94eSAlex Elder  */
2203499afd5bSAlex Elder static void rbd_id_get(struct rbd_device *rbd_dev)
2204b7f23c36SAlex Elder {
2205499afd5bSAlex Elder 	rbd_dev->id = atomic64_inc_return(&rbd_id_max);
2206499afd5bSAlex Elder 
2207499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
2208499afd5bSAlex Elder 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
2209499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
2210b7f23c36SAlex Elder }
2211b7f23c36SAlex Elder 
22121ddbe94eSAlex Elder /*
2213499afd5bSAlex Elder  * Remove an rbd_dev from the global list, and record that its
2214499afd5bSAlex Elder  * identifier is no longer in use.
22151ddbe94eSAlex Elder  */
2216499afd5bSAlex Elder static void rbd_id_put(struct rbd_device *rbd_dev)
22171ddbe94eSAlex Elder {
2218d184f6bfSAlex Elder 	struct list_head *tmp;
2219d184f6bfSAlex Elder 	int rbd_id = rbd_dev->id;
2220d184f6bfSAlex Elder 	int max_id;
2221d184f6bfSAlex Elder 
2222d184f6bfSAlex Elder 	BUG_ON(rbd_id < 1);
2223499afd5bSAlex Elder 
2224499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
2225499afd5bSAlex Elder 	list_del_init(&rbd_dev->node);
2226d184f6bfSAlex Elder 
2227d184f6bfSAlex Elder 	/*
2228d184f6bfSAlex Elder 	 * If the id being "put" is not the current maximum, there
2229d184f6bfSAlex Elder 	 * is nothing special we need to do.
2230d184f6bfSAlex Elder 	 */
2231d184f6bfSAlex Elder 	if (rbd_id != atomic64_read(&rbd_id_max)) {
2232d184f6bfSAlex Elder 		spin_unlock(&rbd_dev_list_lock);
2233d184f6bfSAlex Elder 		return;
2234d184f6bfSAlex Elder 	}
2235d184f6bfSAlex Elder 
2236d184f6bfSAlex Elder 	/*
2237d184f6bfSAlex Elder 	 * We need to update the current maximum id.  Search the
2238d184f6bfSAlex Elder 	 * list to find out what it is.  We're more likely to find
2239d184f6bfSAlex Elder 	 * the maximum at the end, so search the list backward.
2240d184f6bfSAlex Elder 	 */
2241d184f6bfSAlex Elder 	max_id = 0;
2242d184f6bfSAlex Elder 	list_for_each_prev(tmp, &rbd_dev_list) {
2243d184f6bfSAlex Elder 		struct rbd_device *rbd_dev;
2244d184f6bfSAlex Elder 
2245d184f6bfSAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
2246d184f6bfSAlex Elder 		if (rbd_id > max_id)
2247d184f6bfSAlex Elder 			max_id = rbd_id;
2248d184f6bfSAlex Elder 	}
2249499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
22501ddbe94eSAlex Elder 
22511ddbe94eSAlex Elder 	/*
2252d184f6bfSAlex Elder 	 * The max id could have been updated by rbd_id_get(), in
2253d184f6bfSAlex Elder 	 * which case it now accurately reflects the new maximum.
2254d184f6bfSAlex Elder 	 * Be careful not to overwrite the maximum value in that
2255d184f6bfSAlex Elder 	 * case.
22561ddbe94eSAlex Elder 	 */
2257d184f6bfSAlex Elder 	atomic64_cmpxchg(&rbd_id_max, rbd_id, max_id);
2258b7f23c36SAlex Elder }
2259b7f23c36SAlex Elder 
2260a725f65eSAlex Elder /*
2261e28fff26SAlex Elder  * Skips over white space at *buf, and updates *buf to point to the
2262e28fff26SAlex Elder  * first found non-space character (if any). Returns the length of
2263593a9e7bSAlex Elder  * the token (string of non-white space characters) found.  Note
2264593a9e7bSAlex Elder  * that *buf must be terminated with '\0'.
2265e28fff26SAlex Elder  */
2266e28fff26SAlex Elder static inline size_t next_token(const char **buf)
2267e28fff26SAlex Elder {
2268e28fff26SAlex Elder         /*
2269e28fff26SAlex Elder         * These are the characters that produce nonzero for
2270e28fff26SAlex Elder         * isspace() in the "C" and "POSIX" locales.
2271e28fff26SAlex Elder         */
2272e28fff26SAlex Elder         const char *spaces = " \f\n\r\t\v";
2273e28fff26SAlex Elder 
2274e28fff26SAlex Elder         *buf += strspn(*buf, spaces);	/* Find start of token */
2275e28fff26SAlex Elder 
2276e28fff26SAlex Elder 	return strcspn(*buf, spaces);   /* Return token length */
2277e28fff26SAlex Elder }
2278e28fff26SAlex Elder 
2279e28fff26SAlex Elder /*
2280e28fff26SAlex Elder  * Finds the next token in *buf, and if the provided token buffer is
2281e28fff26SAlex Elder  * big enough, copies the found token into it.  The result, if
2282593a9e7bSAlex Elder  * copied, is guaranteed to be terminated with '\0'.  Note that *buf
2283593a9e7bSAlex Elder  * must be terminated with '\0' on entry.
2284e28fff26SAlex Elder  *
2285e28fff26SAlex Elder  * Returns the length of the token found (not including the '\0').
2286e28fff26SAlex Elder  * Return value will be 0 if no token is found, and it will be >=
2287e28fff26SAlex Elder  * token_size if the token would not fit.
2288e28fff26SAlex Elder  *
2289593a9e7bSAlex Elder  * The *buf pointer will be updated to point beyond the end of the
2290e28fff26SAlex Elder  * found token.  Note that this occurs even if the token buffer is
2291e28fff26SAlex Elder  * too small to hold it.
2292e28fff26SAlex Elder  */
2293e28fff26SAlex Elder static inline size_t copy_token(const char **buf,
2294e28fff26SAlex Elder 				char *token,
2295e28fff26SAlex Elder 				size_t token_size)
2296e28fff26SAlex Elder {
2297e28fff26SAlex Elder         size_t len;
2298e28fff26SAlex Elder 
2299e28fff26SAlex Elder 	len = next_token(buf);
2300e28fff26SAlex Elder 	if (len < token_size) {
2301e28fff26SAlex Elder 		memcpy(token, *buf, len);
2302e28fff26SAlex Elder 		*(token + len) = '\0';
2303e28fff26SAlex Elder 	}
2304e28fff26SAlex Elder 	*buf += len;
2305e28fff26SAlex Elder 
2306e28fff26SAlex Elder         return len;
2307e28fff26SAlex Elder }
2308e28fff26SAlex Elder 
2309e28fff26SAlex Elder /*
2310ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
2311ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
2312ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
2313ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
2314ea3352f4SAlex Elder  *
2315ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
2316ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
2317ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
2318ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
2319ea3352f4SAlex Elder  *
2320ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
2321ea3352f4SAlex Elder  * the end of the found token.
2322ea3352f4SAlex Elder  *
2323ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
2324ea3352f4SAlex Elder  */
2325ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
2326ea3352f4SAlex Elder {
2327ea3352f4SAlex Elder 	char *dup;
2328ea3352f4SAlex Elder 	size_t len;
2329ea3352f4SAlex Elder 
2330ea3352f4SAlex Elder 	len = next_token(buf);
2331ea3352f4SAlex Elder 	dup = kmalloc(len + 1, GFP_KERNEL);
2332ea3352f4SAlex Elder 	if (!dup)
2333ea3352f4SAlex Elder 		return NULL;
2334ea3352f4SAlex Elder 
2335ea3352f4SAlex Elder 	memcpy(dup, *buf, len);
2336ea3352f4SAlex Elder 	*(dup + len) = '\0';
2337ea3352f4SAlex Elder 	*buf += len;
2338ea3352f4SAlex Elder 
2339ea3352f4SAlex Elder 	if (lenp)
2340ea3352f4SAlex Elder 		*lenp = len;
2341ea3352f4SAlex Elder 
2342ea3352f4SAlex Elder 	return dup;
2343ea3352f4SAlex Elder }
2344ea3352f4SAlex Elder 
2345ea3352f4SAlex Elder /*
23460bed54dcSAlex Elder  * This fills in the pool_name, image_name, image_name_len, snap_name,
2347a725f65eSAlex Elder  * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based
2348a725f65eSAlex Elder  * on the list of monitor addresses and other options provided via
2349a725f65eSAlex Elder  * /sys/bus/rbd/add.
2350d22f76e7SAlex Elder  *
2351d22f76e7SAlex Elder  * Note: rbd_dev is assumed to have been initially zero-filled.
2352a725f65eSAlex Elder  */
2353a725f65eSAlex Elder static int rbd_add_parse_args(struct rbd_device *rbd_dev,
2354a725f65eSAlex Elder 			      const char *buf,
23557ef3214aSAlex Elder 			      const char **mon_addrs,
23565214ecc4SAlex Elder 			      size_t *mon_addrs_size,
2357e28fff26SAlex Elder 			      char *options,
2358e28fff26SAlex Elder 			     size_t options_size)
2359a725f65eSAlex Elder {
2360e28fff26SAlex Elder 	size_t len;
2361d22f76e7SAlex Elder 	int ret;
2362e28fff26SAlex Elder 
2363e28fff26SAlex Elder 	/* The first four tokens are required */
2364e28fff26SAlex Elder 
23657ef3214aSAlex Elder 	len = next_token(&buf);
23667ef3214aSAlex Elder 	if (!len)
2367a725f65eSAlex Elder 		return -EINVAL;
23685214ecc4SAlex Elder 	*mon_addrs_size = len + 1;
23697ef3214aSAlex Elder 	*mon_addrs = buf;
23707ef3214aSAlex Elder 
23717ef3214aSAlex Elder 	buf += len;
2372a725f65eSAlex Elder 
2373e28fff26SAlex Elder 	len = copy_token(&buf, options, options_size);
2374e28fff26SAlex Elder 	if (!len || len >= options_size)
2375e28fff26SAlex Elder 		return -EINVAL;
2376a725f65eSAlex Elder 
2377bf3e5ae1SAlex Elder 	ret = -ENOMEM;
2378d22f76e7SAlex Elder 	rbd_dev->pool_name = dup_token(&buf, NULL);
2379d22f76e7SAlex Elder 	if (!rbd_dev->pool_name)
2380d22f76e7SAlex Elder 		goto out_err;
2381e28fff26SAlex Elder 
23820bed54dcSAlex Elder 	rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
23830bed54dcSAlex Elder 	if (!rbd_dev->image_name)
2384bf3e5ae1SAlex Elder 		goto out_err;
2385e28fff26SAlex Elder 
2386cb8627c7SAlex Elder 	/* Create the name of the header object */
2387cb8627c7SAlex Elder 
23880bed54dcSAlex Elder 	rbd_dev->header_name = kmalloc(rbd_dev->image_name_len
2389bf3e5ae1SAlex Elder 						+ sizeof (RBD_SUFFIX),
2390bf3e5ae1SAlex Elder 					GFP_KERNEL);
23910bed54dcSAlex Elder 	if (!rbd_dev->header_name)
2392cb8627c7SAlex Elder 		goto out_err;
23930bed54dcSAlex Elder 	sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);
2394a725f65eSAlex Elder 
2395e28fff26SAlex Elder 	/*
2396820a5f3eSAlex Elder 	 * The snapshot name is optional.  If none is is supplied,
2397820a5f3eSAlex Elder 	 * we use the default value.
2398e28fff26SAlex Elder 	 */
2399820a5f3eSAlex Elder 	rbd_dev->snap_name = dup_token(&buf, &len);
2400820a5f3eSAlex Elder 	if (!rbd_dev->snap_name)
2401820a5f3eSAlex Elder 		goto out_err;
2402820a5f3eSAlex Elder 	if (!len) {
2403820a5f3eSAlex Elder 		/* Replace the empty name with the default */
2404820a5f3eSAlex Elder 		kfree(rbd_dev->snap_name);
2405820a5f3eSAlex Elder 		rbd_dev->snap_name
2406820a5f3eSAlex Elder 			= kmalloc(sizeof (RBD_SNAP_HEAD_NAME), GFP_KERNEL);
2407820a5f3eSAlex Elder 		if (!rbd_dev->snap_name)
2408820a5f3eSAlex Elder 			goto out_err;
2409820a5f3eSAlex Elder 
2410e28fff26SAlex Elder 		memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
2411e28fff26SAlex Elder 			sizeof (RBD_SNAP_HEAD_NAME));
2412849b4260SAlex Elder 	}
2413e28fff26SAlex Elder 
2414a725f65eSAlex Elder 	return 0;
2415d22f76e7SAlex Elder 
2416d22f76e7SAlex Elder out_err:
24170bed54dcSAlex Elder 	kfree(rbd_dev->header_name);
24180bed54dcSAlex Elder 	kfree(rbd_dev->image_name);
2419d22f76e7SAlex Elder 	kfree(rbd_dev->pool_name);
2420d22f76e7SAlex Elder 	rbd_dev->pool_name = NULL;
2421d22f76e7SAlex Elder 
2422d22f76e7SAlex Elder 	return ret;
2423a725f65eSAlex Elder }
2424a725f65eSAlex Elder 
242559c2be1eSYehuda Sadeh static ssize_t rbd_add(struct bus_type *bus,
242659c2be1eSYehuda Sadeh 		       const char *buf,
242759c2be1eSYehuda Sadeh 		       size_t count)
2428602adf40SYehuda Sadeh {
2429cb8627c7SAlex Elder 	char *options;
2430cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
24317ef3214aSAlex Elder 	const char *mon_addrs = NULL;
24327ef3214aSAlex Elder 	size_t mon_addrs_size = 0;
243327cc2594SAlex Elder 	struct ceph_osd_client *osdc;
243427cc2594SAlex Elder 	int rc = -ENOMEM;
2435602adf40SYehuda Sadeh 
2436602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
2437602adf40SYehuda Sadeh 		return -ENODEV;
2438602adf40SYehuda Sadeh 
243927cc2594SAlex Elder 	options = kmalloc(count, GFP_KERNEL);
244027cc2594SAlex Elder 	if (!options)
244127cc2594SAlex Elder 		goto err_nomem;
2442cb8627c7SAlex Elder 	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2443cb8627c7SAlex Elder 	if (!rbd_dev)
2444cb8627c7SAlex Elder 		goto err_nomem;
2445602adf40SYehuda Sadeh 
2446602adf40SYehuda Sadeh 	/* static rbd_device initialization */
2447602adf40SYehuda Sadeh 	spin_lock_init(&rbd_dev->lock);
2448602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbd_dev->node);
2449dfc5606dSYehuda Sadeh 	INIT_LIST_HEAD(&rbd_dev->snaps);
2450c666601aSJosh Durgin 	init_rwsem(&rbd_dev->header_rwsem);
2451602adf40SYehuda Sadeh 
2452c666601aSJosh Durgin 	init_rwsem(&rbd_dev->header_rwsem);
24530e805a1dSAlex Elder 
2454d184f6bfSAlex Elder 	/* generate unique id: find highest unique id, add one */
2455499afd5bSAlex Elder 	rbd_id_get(rbd_dev);
2456602adf40SYehuda Sadeh 
2457a725f65eSAlex Elder 	/* Fill in the device name, now that we have its id. */
245881a89793SAlex Elder 	BUILD_BUG_ON(DEV_NAME_LEN
245981a89793SAlex Elder 			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
246081a89793SAlex Elder 	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->id);
2461e124a82fSAlex Elder 
2462a725f65eSAlex Elder 	/* parse add command */
24637ef3214aSAlex Elder 	rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size,
2464e28fff26SAlex Elder 				options, count);
2465a725f65eSAlex Elder 	if (rc)
2466a725f65eSAlex Elder 		goto err_put_id;
2467a725f65eSAlex Elder 
24685214ecc4SAlex Elder 	rbd_dev->rbd_client = rbd_get_client(mon_addrs, mon_addrs_size - 1,
24695214ecc4SAlex Elder 						options);
2470d720bcb0SAlex Elder 	if (IS_ERR(rbd_dev->rbd_client)) {
2471d720bcb0SAlex Elder 		rc = PTR_ERR(rbd_dev->rbd_client);
2472f0f8cef5SAlex Elder 		goto err_put_id;
2473d720bcb0SAlex Elder 	}
2474602adf40SYehuda Sadeh 
2475602adf40SYehuda Sadeh 	/* pick the pool */
24761dbb4399SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
2477602adf40SYehuda Sadeh 	rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
2478602adf40SYehuda Sadeh 	if (rc < 0)
2479602adf40SYehuda Sadeh 		goto err_out_client;
24809bb2f334SAlex Elder 	rbd_dev->pool_id = rc;
2481602adf40SYehuda Sadeh 
2482602adf40SYehuda Sadeh 	/* register our block device */
248327cc2594SAlex Elder 	rc = register_blkdev(0, rbd_dev->name);
248427cc2594SAlex Elder 	if (rc < 0)
2485602adf40SYehuda Sadeh 		goto err_out_client;
248627cc2594SAlex Elder 	rbd_dev->major = rc;
2487602adf40SYehuda Sadeh 
2488dfc5606dSYehuda Sadeh 	rc = rbd_bus_add_dev(rbd_dev);
2489dfc5606dSYehuda Sadeh 	if (rc)
2490766fc439SYehuda Sadeh 		goto err_out_blkdev;
2491766fc439SYehuda Sadeh 
249232eec68dSAlex Elder 	/*
249332eec68dSAlex Elder 	 * At this point cleanup in the event of an error is the job
249432eec68dSAlex Elder 	 * of the sysfs code (initiated by rbd_bus_del_dev()).
249532eec68dSAlex Elder 	 *
249632eec68dSAlex Elder 	 * Set up and announce blkdev mapping.
249732eec68dSAlex Elder 	 */
2498602adf40SYehuda Sadeh 	rc = rbd_init_disk(rbd_dev);
2499602adf40SYehuda Sadeh 	if (rc)
2500766fc439SYehuda Sadeh 		goto err_out_bus;
2501602adf40SYehuda Sadeh 
250259c2be1eSYehuda Sadeh 	rc = rbd_init_watch_dev(rbd_dev);
250359c2be1eSYehuda Sadeh 	if (rc)
250459c2be1eSYehuda Sadeh 		goto err_out_bus;
250559c2be1eSYehuda Sadeh 
2506602adf40SYehuda Sadeh 	return count;
2507602adf40SYehuda Sadeh 
2508766fc439SYehuda Sadeh err_out_bus:
2509766fc439SYehuda Sadeh 	/* this will also clean up rest of rbd_dev stuff */
2510766fc439SYehuda Sadeh 
2511766fc439SYehuda Sadeh 	rbd_bus_del_dev(rbd_dev);
2512766fc439SYehuda Sadeh 	kfree(options);
2513766fc439SYehuda Sadeh 	return rc;
2514766fc439SYehuda Sadeh 
2515602adf40SYehuda Sadeh err_out_blkdev:
2516602adf40SYehuda Sadeh 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
2517602adf40SYehuda Sadeh err_out_client:
2518602adf40SYehuda Sadeh 	rbd_put_client(rbd_dev);
2519f0f8cef5SAlex Elder err_put_id:
2520cb8627c7SAlex Elder 	if (rbd_dev->pool_name) {
2521820a5f3eSAlex Elder 		kfree(rbd_dev->snap_name);
25220bed54dcSAlex Elder 		kfree(rbd_dev->header_name);
25230bed54dcSAlex Elder 		kfree(rbd_dev->image_name);
2524d22f76e7SAlex Elder 		kfree(rbd_dev->pool_name);
2525cb8627c7SAlex Elder 	}
2526499afd5bSAlex Elder 	rbd_id_put(rbd_dev);
252727cc2594SAlex Elder err_nomem:
252827cc2594SAlex Elder 	kfree(rbd_dev);
2529cb8627c7SAlex Elder 	kfree(options);
253027cc2594SAlex Elder 
2531602adf40SYehuda Sadeh 	dout("Error adding device %s\n", buf);
2532602adf40SYehuda Sadeh 	module_put(THIS_MODULE);
253327cc2594SAlex Elder 
253427cc2594SAlex Elder 	return (ssize_t) rc;
2535602adf40SYehuda Sadeh }
2536602adf40SYehuda Sadeh 
2537602adf40SYehuda Sadeh static struct rbd_device *__rbd_get_dev(unsigned long id)
2538602adf40SYehuda Sadeh {
2539602adf40SYehuda Sadeh 	struct list_head *tmp;
2540602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev;
2541602adf40SYehuda Sadeh 
2542e124a82fSAlex Elder 	spin_lock(&rbd_dev_list_lock);
2543602adf40SYehuda Sadeh 	list_for_each(tmp, &rbd_dev_list) {
2544602adf40SYehuda Sadeh 		rbd_dev = list_entry(tmp, struct rbd_device, node);
2545e124a82fSAlex Elder 		if (rbd_dev->id == id) {
2546e124a82fSAlex Elder 			spin_unlock(&rbd_dev_list_lock);
2547602adf40SYehuda Sadeh 			return rbd_dev;
2548602adf40SYehuda Sadeh 		}
2549e124a82fSAlex Elder 	}
2550e124a82fSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
2551602adf40SYehuda Sadeh 	return NULL;
2552602adf40SYehuda Sadeh }
2553602adf40SYehuda Sadeh 
2554dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev)
2555602adf40SYehuda Sadeh {
2556593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2557602adf40SYehuda Sadeh 
25581dbb4399SAlex Elder 	if (rbd_dev->watch_request) {
25591dbb4399SAlex Elder 		struct ceph_client *client = rbd_dev->rbd_client->client;
25601dbb4399SAlex Elder 
25611dbb4399SAlex Elder 		ceph_osdc_unregister_linger_request(&client->osdc,
256259c2be1eSYehuda Sadeh 						    rbd_dev->watch_request);
25631dbb4399SAlex Elder 	}
256459c2be1eSYehuda Sadeh 	if (rbd_dev->watch_event)
25650bed54dcSAlex Elder 		rbd_req_sync_unwatch(rbd_dev, rbd_dev->header_name);
256659c2be1eSYehuda Sadeh 
2567602adf40SYehuda Sadeh 	rbd_put_client(rbd_dev);
2568602adf40SYehuda Sadeh 
2569602adf40SYehuda Sadeh 	/* clean up and free blkdev */
2570602adf40SYehuda Sadeh 	rbd_free_disk(rbd_dev);
2571602adf40SYehuda Sadeh 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
257232eec68dSAlex Elder 
257332eec68dSAlex Elder 	/* done with the id, and with the rbd_dev */
2574820a5f3eSAlex Elder 	kfree(rbd_dev->snap_name);
25750bed54dcSAlex Elder 	kfree(rbd_dev->header_name);
2576d22f76e7SAlex Elder 	kfree(rbd_dev->pool_name);
25770bed54dcSAlex Elder 	kfree(rbd_dev->image_name);
257832eec68dSAlex Elder 	rbd_id_put(rbd_dev);
2579602adf40SYehuda Sadeh 	kfree(rbd_dev);
2580602adf40SYehuda Sadeh 
2581602adf40SYehuda Sadeh 	/* release module ref */
2582602adf40SYehuda Sadeh 	module_put(THIS_MODULE);
2583602adf40SYehuda Sadeh }
2584602adf40SYehuda Sadeh 
2585dfc5606dSYehuda Sadeh static ssize_t rbd_remove(struct bus_type *bus,
2586602adf40SYehuda Sadeh 			  const char *buf,
2587602adf40SYehuda Sadeh 			  size_t count)
2588602adf40SYehuda Sadeh {
2589602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
2590602adf40SYehuda Sadeh 	int target_id, rc;
2591602adf40SYehuda Sadeh 	unsigned long ul;
2592602adf40SYehuda Sadeh 	int ret = count;
2593602adf40SYehuda Sadeh 
2594602adf40SYehuda Sadeh 	rc = strict_strtoul(buf, 10, &ul);
2595602adf40SYehuda Sadeh 	if (rc)
2596602adf40SYehuda Sadeh 		return rc;
2597602adf40SYehuda Sadeh 
2598602adf40SYehuda Sadeh 	/* convert to int; abort if we lost anything in the conversion */
2599602adf40SYehuda Sadeh 	target_id = (int) ul;
2600602adf40SYehuda Sadeh 	if (target_id != ul)
2601602adf40SYehuda Sadeh 		return -EINVAL;
2602602adf40SYehuda Sadeh 
2603602adf40SYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2604602adf40SYehuda Sadeh 
2605602adf40SYehuda Sadeh 	rbd_dev = __rbd_get_dev(target_id);
2606602adf40SYehuda Sadeh 	if (!rbd_dev) {
2607602adf40SYehuda Sadeh 		ret = -ENOENT;
2608602adf40SYehuda Sadeh 		goto done;
2609602adf40SYehuda Sadeh 	}
2610602adf40SYehuda Sadeh 
2611dfc5606dSYehuda Sadeh 	__rbd_remove_all_snaps(rbd_dev);
2612dfc5606dSYehuda Sadeh 	rbd_bus_del_dev(rbd_dev);
2613602adf40SYehuda Sadeh 
2614602adf40SYehuda Sadeh done:
2615602adf40SYehuda Sadeh 	mutex_unlock(&ctl_mutex);
2616602adf40SYehuda Sadeh 	return ret;
2617602adf40SYehuda Sadeh }
2618602adf40SYehuda Sadeh 
2619dfc5606dSYehuda Sadeh static ssize_t rbd_snap_add(struct device *dev,
2620dfc5606dSYehuda Sadeh 			    struct device_attribute *attr,
2621602adf40SYehuda Sadeh 			    const char *buf,
2622602adf40SYehuda Sadeh 			    size_t count)
2623602adf40SYehuda Sadeh {
2624593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2625dfc5606dSYehuda Sadeh 	int ret;
2626dfc5606dSYehuda Sadeh 	char *name = kmalloc(count + 1, GFP_KERNEL);
2627602adf40SYehuda Sadeh 	if (!name)
2628602adf40SYehuda Sadeh 		return -ENOMEM;
2629602adf40SYehuda Sadeh 
2630dfc5606dSYehuda Sadeh 	snprintf(name, count, "%s", buf);
2631602adf40SYehuda Sadeh 
2632602adf40SYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2633602adf40SYehuda Sadeh 
2634602adf40SYehuda Sadeh 	ret = rbd_header_add_snap(rbd_dev,
2635602adf40SYehuda Sadeh 				  name, GFP_KERNEL);
2636602adf40SYehuda Sadeh 	if (ret < 0)
263759c2be1eSYehuda Sadeh 		goto err_unlock;
2638602adf40SYehuda Sadeh 
2639263c6ca0SJosh Durgin 	ret = __rbd_refresh_header(rbd_dev);
2640602adf40SYehuda Sadeh 	if (ret < 0)
264159c2be1eSYehuda Sadeh 		goto err_unlock;
264259c2be1eSYehuda Sadeh 
264359c2be1eSYehuda Sadeh 	/* shouldn't hold ctl_mutex when notifying.. notify might
264459c2be1eSYehuda Sadeh 	   trigger a watch callback that would need to get that mutex */
264559c2be1eSYehuda Sadeh 	mutex_unlock(&ctl_mutex);
264659c2be1eSYehuda Sadeh 
264759c2be1eSYehuda Sadeh 	/* make a best effort, don't error if failed */
26480bed54dcSAlex Elder 	rbd_req_sync_notify(rbd_dev, rbd_dev->header_name);
2649602adf40SYehuda Sadeh 
2650602adf40SYehuda Sadeh 	ret = count;
265159c2be1eSYehuda Sadeh 	kfree(name);
265259c2be1eSYehuda Sadeh 	return ret;
265359c2be1eSYehuda Sadeh 
265459c2be1eSYehuda Sadeh err_unlock:
2655602adf40SYehuda Sadeh 	mutex_unlock(&ctl_mutex);
2656602adf40SYehuda Sadeh 	kfree(name);
2657602adf40SYehuda Sadeh 	return ret;
2658602adf40SYehuda Sadeh }
2659602adf40SYehuda Sadeh 
2660602adf40SYehuda Sadeh /*
2661602adf40SYehuda Sadeh  * create control files in sysfs
2662dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
2663602adf40SYehuda Sadeh  */
2664602adf40SYehuda Sadeh static int rbd_sysfs_init(void)
2665602adf40SYehuda Sadeh {
2666dfc5606dSYehuda Sadeh 	int ret;
2667602adf40SYehuda Sadeh 
2668fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
2669dfc5606dSYehuda Sadeh 	if (ret < 0)
2670dfc5606dSYehuda Sadeh 		return ret;
2671602adf40SYehuda Sadeh 
2672fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
2673fed4c143SAlex Elder 	if (ret < 0)
2674fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
2675602adf40SYehuda Sadeh 
2676602adf40SYehuda Sadeh 	return ret;
2677602adf40SYehuda Sadeh }
2678602adf40SYehuda Sadeh 
2679602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void)
2680602adf40SYehuda Sadeh {
2681dfc5606dSYehuda Sadeh 	bus_unregister(&rbd_bus_type);
2682fed4c143SAlex Elder 	device_unregister(&rbd_root_dev);
2683602adf40SYehuda Sadeh }
2684602adf40SYehuda Sadeh 
2685602adf40SYehuda Sadeh int __init rbd_init(void)
2686602adf40SYehuda Sadeh {
2687602adf40SYehuda Sadeh 	int rc;
2688602adf40SYehuda Sadeh 
2689602adf40SYehuda Sadeh 	rc = rbd_sysfs_init();
2690602adf40SYehuda Sadeh 	if (rc)
2691602adf40SYehuda Sadeh 		return rc;
2692f0f8cef5SAlex Elder 	pr_info("loaded " RBD_DRV_NAME_LONG "\n");
2693602adf40SYehuda Sadeh 	return 0;
2694602adf40SYehuda Sadeh }
2695602adf40SYehuda Sadeh 
2696602adf40SYehuda Sadeh void __exit rbd_exit(void)
2697602adf40SYehuda Sadeh {
2698602adf40SYehuda Sadeh 	rbd_sysfs_cleanup();
2699602adf40SYehuda Sadeh }
2700602adf40SYehuda Sadeh 
2701602adf40SYehuda Sadeh module_init(rbd_init);
2702602adf40SYehuda Sadeh module_exit(rbd_exit);
2703602adf40SYehuda Sadeh 
2704602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
2705602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
2706602adf40SYehuda Sadeh MODULE_DESCRIPTION("rados block device");
2707602adf40SYehuda Sadeh 
2708602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */
2709602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
2710602adf40SYehuda Sadeh 
2711602adf40SYehuda Sadeh MODULE_LICENSE("GPL");
2712