xref: /openbmc/linux/drivers/block/rbd.c (revision e88a36ec)
1602adf40SYehuda Sadeh /*
2602adf40SYehuda Sadeh    rbd.c -- Export ceph rados objects as a Linux block device
3602adf40SYehuda Sadeh 
4602adf40SYehuda Sadeh 
5602adf40SYehuda Sadeh    based on drivers/block/osdblk.c:
6602adf40SYehuda Sadeh 
7602adf40SYehuda Sadeh    Copyright 2009 Red Hat, Inc.
8602adf40SYehuda Sadeh 
9602adf40SYehuda Sadeh    This program is free software; you can redistribute it and/or modify
10602adf40SYehuda Sadeh    it under the terms of the GNU General Public License as published by
11602adf40SYehuda Sadeh    the Free Software Foundation.
12602adf40SYehuda Sadeh 
13602adf40SYehuda Sadeh    This program is distributed in the hope that it will be useful,
14602adf40SYehuda Sadeh    but WITHOUT ANY WARRANTY; without even the implied warranty of
15602adf40SYehuda Sadeh    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16602adf40SYehuda Sadeh    GNU General Public License for more details.
17602adf40SYehuda Sadeh 
18602adf40SYehuda Sadeh    You should have received a copy of the GNU General Public License
19602adf40SYehuda Sadeh    along with this program; see the file COPYING.  If not, write to
20602adf40SYehuda Sadeh    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21602adf40SYehuda Sadeh 
22602adf40SYehuda Sadeh 
23602adf40SYehuda Sadeh 
24dfc5606dSYehuda Sadeh    For usage instructions, please refer to:
25602adf40SYehuda Sadeh 
26dfc5606dSYehuda Sadeh                  Documentation/ABI/testing/sysfs-bus-rbd
27602adf40SYehuda Sadeh 
28602adf40SYehuda Sadeh  */
29602adf40SYehuda Sadeh 
30602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
31602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
32602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3459c2be1eSYehuda Sadeh #include <linux/parser.h>
35602adf40SYehuda Sadeh 
36602adf40SYehuda Sadeh #include <linux/kernel.h>
37602adf40SYehuda Sadeh #include <linux/device.h>
38602adf40SYehuda Sadeh #include <linux/module.h>
39602adf40SYehuda Sadeh #include <linux/fs.h>
40602adf40SYehuda Sadeh #include <linux/blkdev.h>
41602adf40SYehuda Sadeh 
42602adf40SYehuda Sadeh #include "rbd_types.h"
43602adf40SYehuda Sadeh 
/*
 * Block I/O is counted in sectors.  Several Linux layers (blk, bio,
 * genhd) each deal in sectors, and all of them default to 512 bytes.
 * Naming the shift and size reads better than the bare numbers.
 */
#define SECTOR_SHIFT		9
#define SECTOR_SIZE		(1ULL << SECTOR_SHIFT)

#define RBD_DRV_NAME		"rbd"
#define RBD_DRV_NAME_LONG	"rbd (rados block device)"

/* max minors per blkdev */
#define RBD_MINORS_PER_MAJOR	256

#define RBD_MAX_SNAP_NAME_LEN	32
#define RBD_MAX_OPT_LEN		1024

/* Snapshot name that selects the image head rather than a snapshot */
#define RBD_SNAP_HEAD_NAME	"-"

/*
 * Device names have the form "rbd#": RBD_DRV_NAME followed by a unique
 * integer id.  MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN
 * is large enough for every possible device name.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)

#define RBD_NOTIFY_TIMEOUT_DEFAULT 10

7359c2be1eSYehuda Sadeh 
74602adf40SYehuda Sadeh /*
75602adf40SYehuda Sadeh  * block device image metadata (in-memory version)
76602adf40SYehuda Sadeh  */
77602adf40SYehuda Sadeh struct rbd_image_header {
78602adf40SYehuda Sadeh 	u64 image_size;
79849b4260SAlex Elder 	char *object_prefix;
80602adf40SYehuda Sadeh 	__u8 obj_order;
81602adf40SYehuda Sadeh 	__u8 crypt_type;
82602adf40SYehuda Sadeh 	__u8 comp_type;
83602adf40SYehuda Sadeh 	struct ceph_snap_context *snapc;
84602adf40SYehuda Sadeh 	size_t snap_names_len;
85602adf40SYehuda Sadeh 	u64 snap_seq;
86602adf40SYehuda Sadeh 	u32 total_snaps;
87602adf40SYehuda Sadeh 
88602adf40SYehuda Sadeh 	char *snap_names;
89602adf40SYehuda Sadeh 	u64 *snap_sizes;
9059c2be1eSYehuda Sadeh 
9159c2be1eSYehuda Sadeh 	u64 obj_version;
9259c2be1eSYehuda Sadeh };
9359c2be1eSYehuda Sadeh 
/*
 * rbd-specific options parsed from the option string.  notify_timeout
 * starts at RBD_NOTIFY_TIMEOUT_DEFAULT and is overridden by the
 * "notify_timeout=%d" option.
 */
struct rbd_options {
	int notify_timeout;
};
97602adf40SYehuda Sadeh 
98602adf40SYehuda Sadeh /*
99f0f8cef5SAlex Elder  * an instance of the client.  multiple devices may share an rbd client.
100602adf40SYehuda Sadeh  */
101602adf40SYehuda Sadeh struct rbd_client {
102602adf40SYehuda Sadeh 	struct ceph_client	*client;
10359c2be1eSYehuda Sadeh 	struct rbd_options	*rbd_opts;
104602adf40SYehuda Sadeh 	struct kref		kref;
105602adf40SYehuda Sadeh 	struct list_head	node;
106602adf40SYehuda Sadeh };
107602adf40SYehuda Sadeh 
108602adf40SYehuda Sadeh /*
109f0f8cef5SAlex Elder  * a request completion status
110602adf40SYehuda Sadeh  */
1111fec7093SYehuda Sadeh struct rbd_req_status {
1121fec7093SYehuda Sadeh 	int done;
1131fec7093SYehuda Sadeh 	int rc;
1141fec7093SYehuda Sadeh 	u64 bytes;
1151fec7093SYehuda Sadeh };
1161fec7093SYehuda Sadeh 
1171fec7093SYehuda Sadeh /*
1181fec7093SYehuda Sadeh  * a collection of requests
1191fec7093SYehuda Sadeh  */
1201fec7093SYehuda Sadeh struct rbd_req_coll {
1211fec7093SYehuda Sadeh 	int			total;
1221fec7093SYehuda Sadeh 	int			num_done;
1231fec7093SYehuda Sadeh 	struct kref		kref;
1241fec7093SYehuda Sadeh 	struct rbd_req_status	status[0];
125602adf40SYehuda Sadeh };
126602adf40SYehuda Sadeh 
127f0f8cef5SAlex Elder /*
128f0f8cef5SAlex Elder  * a single io request
129f0f8cef5SAlex Elder  */
130f0f8cef5SAlex Elder struct rbd_request {
131f0f8cef5SAlex Elder 	struct request		*rq;		/* blk layer request */
132f0f8cef5SAlex Elder 	struct bio		*bio;		/* cloned bio */
133f0f8cef5SAlex Elder 	struct page		**pages;	/* list of used pages */
134f0f8cef5SAlex Elder 	u64			len;
135f0f8cef5SAlex Elder 	int			coll_index;
136f0f8cef5SAlex Elder 	struct rbd_req_coll	*coll;
137f0f8cef5SAlex Elder };
138f0f8cef5SAlex Elder 
139dfc5606dSYehuda Sadeh struct rbd_snap {
140dfc5606dSYehuda Sadeh 	struct	device		dev;
141dfc5606dSYehuda Sadeh 	const char		*name;
1423591538fSJosh Durgin 	u64			size;
143dfc5606dSYehuda Sadeh 	struct list_head	node;
144dfc5606dSYehuda Sadeh 	u64			id;
145dfc5606dSYehuda Sadeh };
146dfc5606dSYehuda Sadeh 
147602adf40SYehuda Sadeh /*
148602adf40SYehuda Sadeh  * a single device
149602adf40SYehuda Sadeh  */
150602adf40SYehuda Sadeh struct rbd_device {
151602adf40SYehuda Sadeh 	int			id;		/* blkdev unique id */
152602adf40SYehuda Sadeh 
153602adf40SYehuda Sadeh 	int			major;		/* blkdev assigned major */
154602adf40SYehuda Sadeh 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
155602adf40SYehuda Sadeh 	struct request_queue	*q;
156602adf40SYehuda Sadeh 
157602adf40SYehuda Sadeh 	struct rbd_client	*rbd_client;
158602adf40SYehuda Sadeh 
159602adf40SYehuda Sadeh 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
160602adf40SYehuda Sadeh 
161602adf40SYehuda Sadeh 	spinlock_t		lock;		/* queue lock */
162602adf40SYehuda Sadeh 
163602adf40SYehuda Sadeh 	struct rbd_image_header	header;
1640bed54dcSAlex Elder 	char			*image_name;
1650bed54dcSAlex Elder 	size_t			image_name_len;
1660bed54dcSAlex Elder 	char			*header_name;
167d22f76e7SAlex Elder 	char			*pool_name;
1689bb2f334SAlex Elder 	int			pool_id;
169602adf40SYehuda Sadeh 
17059c2be1eSYehuda Sadeh 	struct ceph_osd_event   *watch_event;
17159c2be1eSYehuda Sadeh 	struct ceph_osd_request *watch_request;
17259c2be1eSYehuda Sadeh 
173c666601aSJosh Durgin 	/* protects updating the header */
174c666601aSJosh Durgin 	struct rw_semaphore     header_rwsem;
175e88a36ecSJosh Durgin 	/* name of the snapshot this device reads from */
176820a5f3eSAlex Elder 	char                    *snap_name;
177e88a36ecSJosh Durgin 	/* id of the snapshot this device reads from */
17877dfe99fSJosh Durgin 	u64                     snap_id;	/* current snapshot id */
179e88a36ecSJosh Durgin 	/* whether the snap_id this device reads from still exists */
180e88a36ecSJosh Durgin 	bool                    snap_exists;
181602adf40SYehuda Sadeh 	int                     read_only;
182602adf40SYehuda Sadeh 
183602adf40SYehuda Sadeh 	struct list_head	node;
184dfc5606dSYehuda Sadeh 
185dfc5606dSYehuda Sadeh 	/* list of snapshots */
186dfc5606dSYehuda Sadeh 	struct list_head	snaps;
187dfc5606dSYehuda Sadeh 
188dfc5606dSYehuda Sadeh 	/* sysfs related */
189dfc5606dSYehuda Sadeh 	struct device		dev;
190dfc5606dSYehuda Sadeh };
191dfc5606dSYehuda Sadeh 
192602adf40SYehuda Sadeh static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */
193e124a82fSAlex Elder 
194602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list);    /* devices */
195e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock);
196e124a82fSAlex Elder 
197602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list);		/* clients */
198432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock);
199602adf40SYehuda Sadeh 
200dfc5606dSYehuda Sadeh static int __rbd_init_snaps_header(struct rbd_device *rbd_dev);
201dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev);
202dfc5606dSYehuda Sadeh static ssize_t rbd_snap_add(struct device *dev,
203dfc5606dSYehuda Sadeh 			    struct device_attribute *attr,
204dfc5606dSYehuda Sadeh 			    const char *buf,
205dfc5606dSYehuda Sadeh 			    size_t count);
206dfc5606dSYehuda Sadeh static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
20769932487SJustin P. Mattock 				  struct rbd_snap *snap);
208dfc5606dSYehuda Sadeh 
209f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf,
210f0f8cef5SAlex Elder 		       size_t count);
211f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
212f0f8cef5SAlex Elder 			  size_t count);
213f0f8cef5SAlex Elder 
214f0f8cef5SAlex Elder static struct bus_attribute rbd_bus_attrs[] = {
215f0f8cef5SAlex Elder 	__ATTR(add, S_IWUSR, NULL, rbd_add),
216f0f8cef5SAlex Elder 	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
217f0f8cef5SAlex Elder 	__ATTR_NULL
218f0f8cef5SAlex Elder };
219f0f8cef5SAlex Elder 
220f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = {
221f0f8cef5SAlex Elder 	.name		= "rbd",
222f0f8cef5SAlex Elder 	.bus_attrs	= rbd_bus_attrs,
223f0f8cef5SAlex Elder };
224f0f8cef5SAlex Elder 
/*
 * Release callback for rbd_root_dev.  That device is a static object,
 * so there is nothing to free here.
 */
static void rbd_root_dev_release(struct device *dev)
{
}
228f0f8cef5SAlex Elder 
229f0f8cef5SAlex Elder static struct device rbd_root_dev = {
230f0f8cef5SAlex Elder 	.init_name =    "rbd",
231f0f8cef5SAlex Elder 	.release =      rbd_root_dev_release,
232f0f8cef5SAlex Elder };
233f0f8cef5SAlex Elder 
234dfc5606dSYehuda Sadeh 
235dfc5606dSYehuda Sadeh static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
236dfc5606dSYehuda Sadeh {
237dfc5606dSYehuda Sadeh 	return get_device(&rbd_dev->dev);
238dfc5606dSYehuda Sadeh }
239dfc5606dSYehuda Sadeh 
240dfc5606dSYehuda Sadeh static void rbd_put_dev(struct rbd_device *rbd_dev)
241dfc5606dSYehuda Sadeh {
242dfc5606dSYehuda Sadeh 	put_device(&rbd_dev->dev);
243dfc5606dSYehuda Sadeh }
244602adf40SYehuda Sadeh 
245263c6ca0SJosh Durgin static int __rbd_refresh_header(struct rbd_device *rbd_dev);
24659c2be1eSYehuda Sadeh 
247602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
248602adf40SYehuda Sadeh {
249f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
250602adf40SYehuda Sadeh 
251dfc5606dSYehuda Sadeh 	rbd_get_dev(rbd_dev);
252dfc5606dSYehuda Sadeh 
253602adf40SYehuda Sadeh 	set_device_ro(bdev, rbd_dev->read_only);
254602adf40SYehuda Sadeh 
255602adf40SYehuda Sadeh 	if ((mode & FMODE_WRITE) && rbd_dev->read_only)
256602adf40SYehuda Sadeh 		return -EROFS;
257602adf40SYehuda Sadeh 
258602adf40SYehuda Sadeh 	return 0;
259602adf40SYehuda Sadeh }
260602adf40SYehuda Sadeh 
261dfc5606dSYehuda Sadeh static int rbd_release(struct gendisk *disk, fmode_t mode)
262dfc5606dSYehuda Sadeh {
263dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
264dfc5606dSYehuda Sadeh 
265dfc5606dSYehuda Sadeh 	rbd_put_dev(rbd_dev);
266dfc5606dSYehuda Sadeh 
267dfc5606dSYehuda Sadeh 	return 0;
268dfc5606dSYehuda Sadeh }
269dfc5606dSYehuda Sadeh 
270602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = {
271602adf40SYehuda Sadeh 	.owner			= THIS_MODULE,
272602adf40SYehuda Sadeh 	.open			= rbd_open,
273dfc5606dSYehuda Sadeh 	.release		= rbd_release,
274602adf40SYehuda Sadeh };
275602adf40SYehuda Sadeh 
276602adf40SYehuda Sadeh /*
277602adf40SYehuda Sadeh  * Initialize an rbd client instance.
27843ae4701SAlex Elder  * We own *ceph_opts.
279602adf40SYehuda Sadeh  */
28043ae4701SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts,
28159c2be1eSYehuda Sadeh 					    struct rbd_options *rbd_opts)
282602adf40SYehuda Sadeh {
283602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
284602adf40SYehuda Sadeh 	int ret = -ENOMEM;
285602adf40SYehuda Sadeh 
286602adf40SYehuda Sadeh 	dout("rbd_client_create\n");
287602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
288602adf40SYehuda Sadeh 	if (!rbdc)
289602adf40SYehuda Sadeh 		goto out_opt;
290602adf40SYehuda Sadeh 
291602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
292602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
293602adf40SYehuda Sadeh 
294bc534d86SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
295bc534d86SAlex Elder 
29643ae4701SAlex Elder 	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
297602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
298bc534d86SAlex Elder 		goto out_mutex;
29943ae4701SAlex Elder 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
300602adf40SYehuda Sadeh 
301602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
302602adf40SYehuda Sadeh 	if (ret < 0)
303602adf40SYehuda Sadeh 		goto out_err;
304602adf40SYehuda Sadeh 
30559c2be1eSYehuda Sadeh 	rbdc->rbd_opts = rbd_opts;
30659c2be1eSYehuda Sadeh 
307432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
308602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
309432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
310602adf40SYehuda Sadeh 
311bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
312bc534d86SAlex Elder 
313602adf40SYehuda Sadeh 	dout("rbd_client_create created %p\n", rbdc);
314602adf40SYehuda Sadeh 	return rbdc;
315602adf40SYehuda Sadeh 
316602adf40SYehuda Sadeh out_err:
317602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
318bc534d86SAlex Elder out_mutex:
319bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
320602adf40SYehuda Sadeh 	kfree(rbdc);
321602adf40SYehuda Sadeh out_opt:
32243ae4701SAlex Elder 	if (ceph_opts)
32343ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
32428f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
325602adf40SYehuda Sadeh }
326602adf40SYehuda Sadeh 
327602adf40SYehuda Sadeh /*
328602adf40SYehuda Sadeh  * Find a ceph client with specific addr and configuration.
329602adf40SYehuda Sadeh  */
33043ae4701SAlex Elder static struct rbd_client *__rbd_client_find(struct ceph_options *ceph_opts)
331602adf40SYehuda Sadeh {
332602adf40SYehuda Sadeh 	struct rbd_client *client_node;
333602adf40SYehuda Sadeh 
33443ae4701SAlex Elder 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
335602adf40SYehuda Sadeh 		return NULL;
336602adf40SYehuda Sadeh 
337602adf40SYehuda Sadeh 	list_for_each_entry(client_node, &rbd_client_list, node)
33843ae4701SAlex Elder 		if (!ceph_compare_options(ceph_opts, client_node->client))
339602adf40SYehuda Sadeh 			return client_node;
340602adf40SYehuda Sadeh 	return NULL;
341602adf40SYehuda Sadeh }
342602adf40SYehuda Sadeh 
343602adf40SYehuda Sadeh /*
34459c2be1eSYehuda Sadeh  * mount options
34559c2be1eSYehuda Sadeh  */
34659c2be1eSYehuda Sadeh enum {
34759c2be1eSYehuda Sadeh 	Opt_notify_timeout,
34859c2be1eSYehuda Sadeh 	Opt_last_int,
34959c2be1eSYehuda Sadeh 	/* int args above */
35059c2be1eSYehuda Sadeh 	Opt_last_string,
35159c2be1eSYehuda Sadeh 	/* string args above */
35259c2be1eSYehuda Sadeh };
35359c2be1eSYehuda Sadeh 
35443ae4701SAlex Elder static match_table_t rbd_opts_tokens = {
35559c2be1eSYehuda Sadeh 	{Opt_notify_timeout, "notify_timeout=%d"},
35659c2be1eSYehuda Sadeh 	/* int args above */
35759c2be1eSYehuda Sadeh 	/* string args above */
35859c2be1eSYehuda Sadeh 	{-1, NULL}
35959c2be1eSYehuda Sadeh };
36059c2be1eSYehuda Sadeh 
36159c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
36259c2be1eSYehuda Sadeh {
36343ae4701SAlex Elder 	struct rbd_options *rbd_opts = private;
36459c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
36559c2be1eSYehuda Sadeh 	int token, intval, ret;
36659c2be1eSYehuda Sadeh 
36743ae4701SAlex Elder 	token = match_token(c, rbd_opts_tokens, argstr);
36859c2be1eSYehuda Sadeh 	if (token < 0)
36959c2be1eSYehuda Sadeh 		return -EINVAL;
37059c2be1eSYehuda Sadeh 
37159c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
37259c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
37359c2be1eSYehuda Sadeh 		if (ret < 0) {
37459c2be1eSYehuda Sadeh 			pr_err("bad mount option arg (not int) "
37559c2be1eSYehuda Sadeh 			       "at '%s'\n", c);
37659c2be1eSYehuda Sadeh 			return ret;
37759c2be1eSYehuda Sadeh 		}
37859c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
37959c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
38059c2be1eSYehuda Sadeh 		dout("got string token %d val %s\n", token,
38159c2be1eSYehuda Sadeh 		     argstr[0].from);
38259c2be1eSYehuda Sadeh 	} else {
38359c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
38459c2be1eSYehuda Sadeh 	}
38559c2be1eSYehuda Sadeh 
38659c2be1eSYehuda Sadeh 	switch (token) {
38759c2be1eSYehuda Sadeh 	case Opt_notify_timeout:
38843ae4701SAlex Elder 		rbd_opts->notify_timeout = intval;
38959c2be1eSYehuda Sadeh 		break;
39059c2be1eSYehuda Sadeh 	default:
39159c2be1eSYehuda Sadeh 		BUG_ON(token);
39259c2be1eSYehuda Sadeh 	}
39359c2be1eSYehuda Sadeh 	return 0;
39459c2be1eSYehuda Sadeh }
39559c2be1eSYehuda Sadeh 
39659c2be1eSYehuda Sadeh /*
397602adf40SYehuda Sadeh  * Get a ceph client with specific addr and configuration, if one does
398602adf40SYehuda Sadeh  * not exist create it.
399602adf40SYehuda Sadeh  */
4005214ecc4SAlex Elder static struct rbd_client *rbd_get_client(const char *mon_addr,
4015214ecc4SAlex Elder 					 size_t mon_addr_len,
4025214ecc4SAlex Elder 					 char *options)
403602adf40SYehuda Sadeh {
404602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
40543ae4701SAlex Elder 	struct ceph_options *ceph_opts;
40659c2be1eSYehuda Sadeh 	struct rbd_options *rbd_opts;
40759c2be1eSYehuda Sadeh 
40859c2be1eSYehuda Sadeh 	rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL);
40959c2be1eSYehuda Sadeh 	if (!rbd_opts)
410d720bcb0SAlex Elder 		return ERR_PTR(-ENOMEM);
41159c2be1eSYehuda Sadeh 
41259c2be1eSYehuda Sadeh 	rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT;
413602adf40SYehuda Sadeh 
41443ae4701SAlex Elder 	ceph_opts = ceph_parse_options(options, mon_addr,
4155214ecc4SAlex Elder 					mon_addr + mon_addr_len,
41621079786SAlex Elder 					parse_rbd_opts_token, rbd_opts);
41743ae4701SAlex Elder 	if (IS_ERR(ceph_opts)) {
418d720bcb0SAlex Elder 		kfree(rbd_opts);
41943ae4701SAlex Elder 		return ERR_CAST(ceph_opts);
420ee57741cSAlex Elder 	}
421602adf40SYehuda Sadeh 
422432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
42343ae4701SAlex Elder 	rbdc = __rbd_client_find(ceph_opts);
424602adf40SYehuda Sadeh 	if (rbdc) {
425e6994d3dSAlex Elder 		/* using an existing client */
426e6994d3dSAlex Elder 		kref_get(&rbdc->kref);
427432b8587SAlex Elder 		spin_unlock(&rbd_client_list_lock);
428e6994d3dSAlex Elder 
42943ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
43097bb59a0SAlex Elder 		kfree(rbd_opts);
431602adf40SYehuda Sadeh 
432d720bcb0SAlex Elder 		return rbdc;
433602adf40SYehuda Sadeh 	}
434432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
435602adf40SYehuda Sadeh 
43643ae4701SAlex Elder 	rbdc = rbd_client_create(ceph_opts, rbd_opts);
437d97081b0SAlex Elder 
438d720bcb0SAlex Elder 	if (IS_ERR(rbdc))
43959c2be1eSYehuda Sadeh 		kfree(rbd_opts);
440d720bcb0SAlex Elder 
441d720bcb0SAlex Elder 	return rbdc;
442602adf40SYehuda Sadeh }
443602adf40SYehuda Sadeh 
444602adf40SYehuda Sadeh /*
445602adf40SYehuda Sadeh  * Destroy ceph client
446d23a4b3fSAlex Elder  *
447432b8587SAlex Elder  * Caller must hold rbd_client_list_lock.
448602adf40SYehuda Sadeh  */
449602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
450602adf40SYehuda Sadeh {
451602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
452602adf40SYehuda Sadeh 
453602adf40SYehuda Sadeh 	dout("rbd_release_client %p\n", rbdc);
454cd9d9f5dSAlex Elder 	spin_lock(&rbd_client_list_lock);
455602adf40SYehuda Sadeh 	list_del(&rbdc->node);
456cd9d9f5dSAlex Elder 	spin_unlock(&rbd_client_list_lock);
457602adf40SYehuda Sadeh 
458602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
45959c2be1eSYehuda Sadeh 	kfree(rbdc->rbd_opts);
460602adf40SYehuda Sadeh 	kfree(rbdc);
461602adf40SYehuda Sadeh }
462602adf40SYehuda Sadeh 
463602adf40SYehuda Sadeh /*
464602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
465602adf40SYehuda Sadeh  * it.
466602adf40SYehuda Sadeh  */
467602adf40SYehuda Sadeh static void rbd_put_client(struct rbd_device *rbd_dev)
468602adf40SYehuda Sadeh {
469602adf40SYehuda Sadeh 	kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
470602adf40SYehuda Sadeh 	rbd_dev->rbd_client = NULL;
471602adf40SYehuda Sadeh }
472602adf40SYehuda Sadeh 
4731fec7093SYehuda Sadeh /*
4741fec7093SYehuda Sadeh  * Destroy requests collection
4751fec7093SYehuda Sadeh  */
4761fec7093SYehuda Sadeh static void rbd_coll_release(struct kref *kref)
4771fec7093SYehuda Sadeh {
4781fec7093SYehuda Sadeh 	struct rbd_req_coll *coll =
4791fec7093SYehuda Sadeh 		container_of(kref, struct rbd_req_coll, kref);
4801fec7093SYehuda Sadeh 
4811fec7093SYehuda Sadeh 	dout("rbd_coll_release %p\n", coll);
4821fec7093SYehuda Sadeh 	kfree(coll);
4831fec7093SYehuda Sadeh }
484602adf40SYehuda Sadeh 
485602adf40SYehuda Sadeh /*
486602adf40SYehuda Sadeh  * Create a new header structure, translate header format from the on-disk
487602adf40SYehuda Sadeh  * header.
488602adf40SYehuda Sadeh  */
489602adf40SYehuda Sadeh static int rbd_header_from_disk(struct rbd_image_header *header,
490602adf40SYehuda Sadeh 				 struct rbd_image_header_ondisk *ondisk,
49150f7c4c9SXi Wang 				 u32 allocated_snaps,
492602adf40SYehuda Sadeh 				 gfp_t gfp_flags)
493602adf40SYehuda Sadeh {
49450f7c4c9SXi Wang 	u32 i, snap_count;
495602adf40SYehuda Sadeh 
49621079786SAlex Elder 	if (memcmp(ondisk, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT)))
49781e759fbSJosh Durgin 		return -ENXIO;
49881e759fbSJosh Durgin 
49900f1f36fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
50050f7c4c9SXi Wang 	if (snap_count > (UINT_MAX - sizeof(struct ceph_snap_context))
50150f7c4c9SXi Wang 			 / sizeof (*ondisk))
50250f7c4c9SXi Wang 		return -EINVAL;
503602adf40SYehuda Sadeh 	header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
504f9f9a190SYan, Zheng 				snap_count * sizeof(u64),
505602adf40SYehuda Sadeh 				gfp_flags);
506602adf40SYehuda Sadeh 	if (!header->snapc)
507602adf40SYehuda Sadeh 		return -ENOMEM;
50800f1f36fSAlex Elder 
50900f1f36fSAlex Elder 	header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
510602adf40SYehuda Sadeh 	if (snap_count) {
511602adf40SYehuda Sadeh 		header->snap_names = kmalloc(header->snap_names_len,
512f8ad495aSDan Carpenter 					     gfp_flags);
513602adf40SYehuda Sadeh 		if (!header->snap_names)
514602adf40SYehuda Sadeh 			goto err_snapc;
515602adf40SYehuda Sadeh 		header->snap_sizes = kmalloc(snap_count * sizeof(u64),
516f8ad495aSDan Carpenter 					     gfp_flags);
517602adf40SYehuda Sadeh 		if (!header->snap_sizes)
518602adf40SYehuda Sadeh 			goto err_names;
519602adf40SYehuda Sadeh 	} else {
520602adf40SYehuda Sadeh 		header->snap_names = NULL;
521602adf40SYehuda Sadeh 		header->snap_sizes = NULL;
522602adf40SYehuda Sadeh 	}
523849b4260SAlex Elder 
524849b4260SAlex Elder 	header->object_prefix = kmalloc(sizeof (ondisk->block_name) + 1,
525849b4260SAlex Elder 					gfp_flags);
526849b4260SAlex Elder 	if (!header->object_prefix)
527849b4260SAlex Elder 		goto err_sizes;
528849b4260SAlex Elder 
529ca1e49a6SAlex Elder 	memcpy(header->object_prefix, ondisk->block_name,
530602adf40SYehuda Sadeh 	       sizeof(ondisk->block_name));
531849b4260SAlex Elder 	header->object_prefix[sizeof (ondisk->block_name)] = '\0';
532602adf40SYehuda Sadeh 
533602adf40SYehuda Sadeh 	header->image_size = le64_to_cpu(ondisk->image_size);
534602adf40SYehuda Sadeh 	header->obj_order = ondisk->options.order;
535602adf40SYehuda Sadeh 	header->crypt_type = ondisk->options.crypt_type;
536602adf40SYehuda Sadeh 	header->comp_type = ondisk->options.comp_type;
537602adf40SYehuda Sadeh 
538602adf40SYehuda Sadeh 	atomic_set(&header->snapc->nref, 1);
539602adf40SYehuda Sadeh 	header->snap_seq = le64_to_cpu(ondisk->snap_seq);
540602adf40SYehuda Sadeh 	header->snapc->num_snaps = snap_count;
541602adf40SYehuda Sadeh 	header->total_snaps = snap_count;
542602adf40SYehuda Sadeh 
54321079786SAlex Elder 	if (snap_count && allocated_snaps == snap_count) {
544602adf40SYehuda Sadeh 		for (i = 0; i < snap_count; i++) {
545602adf40SYehuda Sadeh 			header->snapc->snaps[i] =
546602adf40SYehuda Sadeh 				le64_to_cpu(ondisk->snaps[i].id);
547602adf40SYehuda Sadeh 			header->snap_sizes[i] =
548602adf40SYehuda Sadeh 				le64_to_cpu(ondisk->snaps[i].image_size);
549602adf40SYehuda Sadeh 		}
550602adf40SYehuda Sadeh 
551602adf40SYehuda Sadeh 		/* copy snapshot names */
552602adf40SYehuda Sadeh 		memcpy(header->snap_names, &ondisk->snaps[i],
553602adf40SYehuda Sadeh 			header->snap_names_len);
554602adf40SYehuda Sadeh 	}
555602adf40SYehuda Sadeh 
556602adf40SYehuda Sadeh 	return 0;
557602adf40SYehuda Sadeh 
558849b4260SAlex Elder err_sizes:
559849b4260SAlex Elder 	kfree(header->snap_sizes);
560602adf40SYehuda Sadeh err_names:
561602adf40SYehuda Sadeh 	kfree(header->snap_names);
562602adf40SYehuda Sadeh err_snapc:
563602adf40SYehuda Sadeh 	kfree(header->snapc);
56400f1f36fSAlex Elder 	return -ENOMEM;
565602adf40SYehuda Sadeh }
566602adf40SYehuda Sadeh 
567602adf40SYehuda Sadeh static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
568602adf40SYehuda Sadeh 			u64 *seq, u64 *size)
569602adf40SYehuda Sadeh {
570602adf40SYehuda Sadeh 	int i;
571602adf40SYehuda Sadeh 	char *p = header->snap_names;
572602adf40SYehuda Sadeh 
57300f1f36fSAlex Elder 	for (i = 0; i < header->total_snaps; i++) {
57400f1f36fSAlex Elder 		if (!strcmp(snap_name, p)) {
57500f1f36fSAlex Elder 
57600f1f36fSAlex Elder 			/* Found it.  Pass back its id and/or size */
57700f1f36fSAlex Elder 
578602adf40SYehuda Sadeh 			if (seq)
579602adf40SYehuda Sadeh 				*seq = header->snapc->snaps[i];
580602adf40SYehuda Sadeh 			if (size)
581602adf40SYehuda Sadeh 				*size = header->snap_sizes[i];
582602adf40SYehuda Sadeh 			return i;
583602adf40SYehuda Sadeh 		}
58400f1f36fSAlex Elder 		p += strlen(p) + 1;	/* Skip ahead to the next name */
58500f1f36fSAlex Elder 	}
58600f1f36fSAlex Elder 	return -ENOENT;
58700f1f36fSAlex Elder }
588602adf40SYehuda Sadeh 
5890ce1a794SAlex Elder static int rbd_header_set_snap(struct rbd_device *rbd_dev, u64 *size)
590602adf40SYehuda Sadeh {
5910ce1a794SAlex Elder 	struct rbd_image_header *header = &rbd_dev->header;
592602adf40SYehuda Sadeh 	struct ceph_snap_context *snapc = header->snapc;
593602adf40SYehuda Sadeh 	int ret = -ENOENT;
594602adf40SYehuda Sadeh 
5950ce1a794SAlex Elder 	down_write(&rbd_dev->header_rwsem);
596602adf40SYehuda Sadeh 
5970ce1a794SAlex Elder 	if (!memcmp(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
598cc9d734cSJosh Durgin 		    sizeof (RBD_SNAP_HEAD_NAME))) {
599602adf40SYehuda Sadeh 		if (header->total_snaps)
600602adf40SYehuda Sadeh 			snapc->seq = header->snap_seq;
601602adf40SYehuda Sadeh 		else
602602adf40SYehuda Sadeh 			snapc->seq = 0;
6030ce1a794SAlex Elder 		rbd_dev->snap_id = CEPH_NOSNAP;
604e88a36ecSJosh Durgin 		rbd_dev->snap_exists = false;
6050ce1a794SAlex Elder 		rbd_dev->read_only = 0;
606602adf40SYehuda Sadeh 		if (size)
607602adf40SYehuda Sadeh 			*size = header->image_size;
608602adf40SYehuda Sadeh 	} else {
6090ce1a794SAlex Elder 		ret = snap_by_name(header, rbd_dev->snap_name,
6100ce1a794SAlex Elder 					&snapc->seq, size);
611602adf40SYehuda Sadeh 		if (ret < 0)
612602adf40SYehuda Sadeh 			goto done;
6130ce1a794SAlex Elder 		rbd_dev->snap_id = snapc->seq;
614e88a36ecSJosh Durgin 		rbd_dev->snap_exists = true;
6150ce1a794SAlex Elder 		rbd_dev->read_only = 1;
616602adf40SYehuda Sadeh 	}
617602adf40SYehuda Sadeh 
618602adf40SYehuda Sadeh 	ret = 0;
619602adf40SYehuda Sadeh done:
6200ce1a794SAlex Elder 	up_write(&rbd_dev->header_rwsem);
621602adf40SYehuda Sadeh 	return ret;
622602adf40SYehuda Sadeh }
623602adf40SYehuda Sadeh 
624602adf40SYehuda Sadeh static void rbd_header_free(struct rbd_image_header *header)
625602adf40SYehuda Sadeh {
626849b4260SAlex Elder 	kfree(header->object_prefix);
627602adf40SYehuda Sadeh 	kfree(header->snap_sizes);
628849b4260SAlex Elder 	kfree(header->snap_names);
629849b4260SAlex Elder 	kfree(header->snapc);
630602adf40SYehuda Sadeh }
631602adf40SYehuda Sadeh 
632602adf40SYehuda Sadeh /*
633602adf40SYehuda Sadeh  * get the actual striped segment name, offset and length
634602adf40SYehuda Sadeh  */
635602adf40SYehuda Sadeh static u64 rbd_get_segment(struct rbd_image_header *header,
636ca1e49a6SAlex Elder 			   const char *object_prefix,
637602adf40SYehuda Sadeh 			   u64 ofs, u64 len,
638602adf40SYehuda Sadeh 			   char *seg_name, u64 *segofs)
639602adf40SYehuda Sadeh {
640602adf40SYehuda Sadeh 	u64 seg = ofs >> header->obj_order;
641602adf40SYehuda Sadeh 
642602adf40SYehuda Sadeh 	if (seg_name)
643602adf40SYehuda Sadeh 		snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
644ca1e49a6SAlex Elder 			 "%s.%012llx", object_prefix, seg);
645602adf40SYehuda Sadeh 
646602adf40SYehuda Sadeh 	ofs = ofs & ((1 << header->obj_order) - 1);
647602adf40SYehuda Sadeh 	len = min_t(u64, len, (1 << header->obj_order) - ofs);
648602adf40SYehuda Sadeh 
649602adf40SYehuda Sadeh 	if (segofs)
650602adf40SYehuda Sadeh 		*segofs = ofs;
651602adf40SYehuda Sadeh 
652602adf40SYehuda Sadeh 	return len;
653602adf40SYehuda Sadeh }
654602adf40SYehuda Sadeh 
6551fec7093SYehuda Sadeh static int rbd_get_num_segments(struct rbd_image_header *header,
6561fec7093SYehuda Sadeh 				u64 ofs, u64 len)
6571fec7093SYehuda Sadeh {
6581fec7093SYehuda Sadeh 	u64 start_seg = ofs >> header->obj_order;
6591fec7093SYehuda Sadeh 	u64 end_seg = (ofs + len - 1) >> header->obj_order;
6601fec7093SYehuda Sadeh 	return end_seg - start_seg + 1;
6611fec7093SYehuda Sadeh }
6621fec7093SYehuda Sadeh 
663602adf40SYehuda Sadeh /*
664029bcbd8SJosh Durgin  * returns the size of an object in the image
665029bcbd8SJosh Durgin  */
666029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header)
667029bcbd8SJosh Durgin {
668029bcbd8SJosh Durgin 	return 1 << header->obj_order;
669029bcbd8SJosh Durgin }
670029bcbd8SJosh Durgin 
671029bcbd8SJosh Durgin /*
672602adf40SYehuda Sadeh  * bio helpers
673602adf40SYehuda Sadeh  */
674602adf40SYehuda Sadeh 
675602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain)
676602adf40SYehuda Sadeh {
677602adf40SYehuda Sadeh 	struct bio *tmp;
678602adf40SYehuda Sadeh 
679602adf40SYehuda Sadeh 	while (chain) {
680602adf40SYehuda Sadeh 		tmp = chain;
681602adf40SYehuda Sadeh 		chain = chain->bi_next;
682602adf40SYehuda Sadeh 		bio_put(tmp);
683602adf40SYehuda Sadeh 	}
684602adf40SYehuda Sadeh }
685602adf40SYehuda Sadeh 
686602adf40SYehuda Sadeh /*
687602adf40SYehuda Sadeh  * zeros a bio chain, starting at specific offset
688602adf40SYehuda Sadeh  */
689602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs)
690602adf40SYehuda Sadeh {
691602adf40SYehuda Sadeh 	struct bio_vec *bv;
692602adf40SYehuda Sadeh 	unsigned long flags;
693602adf40SYehuda Sadeh 	void *buf;
694602adf40SYehuda Sadeh 	int i;
695602adf40SYehuda Sadeh 	int pos = 0;
696602adf40SYehuda Sadeh 
697602adf40SYehuda Sadeh 	while (chain) {
698602adf40SYehuda Sadeh 		bio_for_each_segment(bv, chain, i) {
699602adf40SYehuda Sadeh 			if (pos + bv->bv_len > start_ofs) {
700602adf40SYehuda Sadeh 				int remainder = max(start_ofs - pos, 0);
701602adf40SYehuda Sadeh 				buf = bvec_kmap_irq(bv, &flags);
702602adf40SYehuda Sadeh 				memset(buf + remainder, 0,
703602adf40SYehuda Sadeh 				       bv->bv_len - remainder);
70485b5aaa6SDan Carpenter 				bvec_kunmap_irq(buf, &flags);
705602adf40SYehuda Sadeh 			}
706602adf40SYehuda Sadeh 			pos += bv->bv_len;
707602adf40SYehuda Sadeh 		}
708602adf40SYehuda Sadeh 
709602adf40SYehuda Sadeh 		chain = chain->bi_next;
710602adf40SYehuda Sadeh 	}
711602adf40SYehuda Sadeh }
712602adf40SYehuda Sadeh 
713602adf40SYehuda Sadeh /*
714602adf40SYehuda Sadeh  * bio_chain_clone - clone a chain of bios up to a certain length.
715602adf40SYehuda Sadeh  * might return a bio_pair that will need to be released.
716602adf40SYehuda Sadeh  */
717602adf40SYehuda Sadeh static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
718602adf40SYehuda Sadeh 				   struct bio_pair **bp,
719602adf40SYehuda Sadeh 				   int len, gfp_t gfpmask)
720602adf40SYehuda Sadeh {
721602adf40SYehuda Sadeh 	struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
722602adf40SYehuda Sadeh 	int total = 0;
723602adf40SYehuda Sadeh 
724602adf40SYehuda Sadeh 	if (*bp) {
725602adf40SYehuda Sadeh 		bio_pair_release(*bp);
726602adf40SYehuda Sadeh 		*bp = NULL;
727602adf40SYehuda Sadeh 	}
728602adf40SYehuda Sadeh 
729602adf40SYehuda Sadeh 	while (old_chain && (total < len)) {
730602adf40SYehuda Sadeh 		tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
731602adf40SYehuda Sadeh 		if (!tmp)
732602adf40SYehuda Sadeh 			goto err_out;
733602adf40SYehuda Sadeh 
734602adf40SYehuda Sadeh 		if (total + old_chain->bi_size > len) {
735602adf40SYehuda Sadeh 			struct bio_pair *bp;
736602adf40SYehuda Sadeh 
737602adf40SYehuda Sadeh 			/*
738602adf40SYehuda Sadeh 			 * this split can only happen with a single paged bio,
739602adf40SYehuda Sadeh 			 * split_bio will BUG_ON if this is not the case
740602adf40SYehuda Sadeh 			 */
741602adf40SYehuda Sadeh 			dout("bio_chain_clone split! total=%d remaining=%d"
742602adf40SYehuda Sadeh 			     "bi_size=%d\n",
743602adf40SYehuda Sadeh 			     (int)total, (int)len-total,
744602adf40SYehuda Sadeh 			     (int)old_chain->bi_size);
745602adf40SYehuda Sadeh 
746602adf40SYehuda Sadeh 			/* split the bio. We'll release it either in the next
747602adf40SYehuda Sadeh 			   call, or it will have to be released outside */
748593a9e7bSAlex Elder 			bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
749602adf40SYehuda Sadeh 			if (!bp)
750602adf40SYehuda Sadeh 				goto err_out;
751602adf40SYehuda Sadeh 
752602adf40SYehuda Sadeh 			__bio_clone(tmp, &bp->bio1);
753602adf40SYehuda Sadeh 
754602adf40SYehuda Sadeh 			*next = &bp->bio2;
755602adf40SYehuda Sadeh 		} else {
756602adf40SYehuda Sadeh 			__bio_clone(tmp, old_chain);
757602adf40SYehuda Sadeh 			*next = old_chain->bi_next;
758602adf40SYehuda Sadeh 		}
759602adf40SYehuda Sadeh 
760602adf40SYehuda Sadeh 		tmp->bi_bdev = NULL;
761602adf40SYehuda Sadeh 		gfpmask &= ~__GFP_WAIT;
762602adf40SYehuda Sadeh 		tmp->bi_next = NULL;
763602adf40SYehuda Sadeh 
764602adf40SYehuda Sadeh 		if (!new_chain) {
765602adf40SYehuda Sadeh 			new_chain = tail = tmp;
766602adf40SYehuda Sadeh 		} else {
767602adf40SYehuda Sadeh 			tail->bi_next = tmp;
768602adf40SYehuda Sadeh 			tail = tmp;
769602adf40SYehuda Sadeh 		}
770602adf40SYehuda Sadeh 		old_chain = old_chain->bi_next;
771602adf40SYehuda Sadeh 
772602adf40SYehuda Sadeh 		total += tmp->bi_size;
773602adf40SYehuda Sadeh 	}
774602adf40SYehuda Sadeh 
775602adf40SYehuda Sadeh 	BUG_ON(total < len);
776602adf40SYehuda Sadeh 
777602adf40SYehuda Sadeh 	if (tail)
778602adf40SYehuda Sadeh 		tail->bi_next = NULL;
779602adf40SYehuda Sadeh 
780602adf40SYehuda Sadeh 	*old = old_chain;
781602adf40SYehuda Sadeh 
782602adf40SYehuda Sadeh 	return new_chain;
783602adf40SYehuda Sadeh 
784602adf40SYehuda Sadeh err_out:
785602adf40SYehuda Sadeh 	dout("bio_chain_clone with err\n");
786602adf40SYehuda Sadeh 	bio_chain_put(new_chain);
787602adf40SYehuda Sadeh 	return NULL;
788602adf40SYehuda Sadeh }
789602adf40SYehuda Sadeh 
790602adf40SYehuda Sadeh /*
791602adf40SYehuda Sadeh  * helpers for osd request op vectors.
792602adf40SYehuda Sadeh  */
793602adf40SYehuda Sadeh static int rbd_create_rw_ops(struct ceph_osd_req_op **ops,
794602adf40SYehuda Sadeh 			    int num_ops,
795602adf40SYehuda Sadeh 			    int opcode,
796602adf40SYehuda Sadeh 			    u32 payload_len)
797602adf40SYehuda Sadeh {
798602adf40SYehuda Sadeh 	*ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1),
799602adf40SYehuda Sadeh 		       GFP_NOIO);
800602adf40SYehuda Sadeh 	if (!*ops)
801602adf40SYehuda Sadeh 		return -ENOMEM;
802602adf40SYehuda Sadeh 	(*ops)[0].op = opcode;
803602adf40SYehuda Sadeh 	/*
804602adf40SYehuda Sadeh 	 * op extent offset and length will be set later on
805602adf40SYehuda Sadeh 	 * in calc_raw_layout()
806602adf40SYehuda Sadeh 	 */
807602adf40SYehuda Sadeh 	(*ops)[0].payload_len = payload_len;
808602adf40SYehuda Sadeh 	return 0;
809602adf40SYehuda Sadeh }
810602adf40SYehuda Sadeh 
/*
 * Free an op vector obtained from rbd_create_rw_ops().
 */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
815602adf40SYehuda Sadeh 
8161fec7093SYehuda Sadeh static void rbd_coll_end_req_index(struct request *rq,
8171fec7093SYehuda Sadeh 				   struct rbd_req_coll *coll,
8181fec7093SYehuda Sadeh 				   int index,
8191fec7093SYehuda Sadeh 				   int ret, u64 len)
8201fec7093SYehuda Sadeh {
8211fec7093SYehuda Sadeh 	struct request_queue *q;
8221fec7093SYehuda Sadeh 	int min, max, i;
8231fec7093SYehuda Sadeh 
8241fec7093SYehuda Sadeh 	dout("rbd_coll_end_req_index %p index %d ret %d len %lld\n",
8251fec7093SYehuda Sadeh 	     coll, index, ret, len);
8261fec7093SYehuda Sadeh 
8271fec7093SYehuda Sadeh 	if (!rq)
8281fec7093SYehuda Sadeh 		return;
8291fec7093SYehuda Sadeh 
8301fec7093SYehuda Sadeh 	if (!coll) {
8311fec7093SYehuda Sadeh 		blk_end_request(rq, ret, len);
8321fec7093SYehuda Sadeh 		return;
8331fec7093SYehuda Sadeh 	}
8341fec7093SYehuda Sadeh 
8351fec7093SYehuda Sadeh 	q = rq->q;
8361fec7093SYehuda Sadeh 
8371fec7093SYehuda Sadeh 	spin_lock_irq(q->queue_lock);
8381fec7093SYehuda Sadeh 	coll->status[index].done = 1;
8391fec7093SYehuda Sadeh 	coll->status[index].rc = ret;
8401fec7093SYehuda Sadeh 	coll->status[index].bytes = len;
8411fec7093SYehuda Sadeh 	max = min = coll->num_done;
8421fec7093SYehuda Sadeh 	while (max < coll->total && coll->status[max].done)
8431fec7093SYehuda Sadeh 		max++;
8441fec7093SYehuda Sadeh 
8451fec7093SYehuda Sadeh 	for (i = min; i<max; i++) {
8461fec7093SYehuda Sadeh 		__blk_end_request(rq, coll->status[i].rc,
8471fec7093SYehuda Sadeh 				  coll->status[i].bytes);
8481fec7093SYehuda Sadeh 		coll->num_done++;
8491fec7093SYehuda Sadeh 		kref_put(&coll->kref, rbd_coll_release);
8501fec7093SYehuda Sadeh 	}
8511fec7093SYehuda Sadeh 	spin_unlock_irq(q->queue_lock);
8521fec7093SYehuda Sadeh }
8531fec7093SYehuda Sadeh 
8541fec7093SYehuda Sadeh static void rbd_coll_end_req(struct rbd_request *req,
8551fec7093SYehuda Sadeh 			     int ret, u64 len)
8561fec7093SYehuda Sadeh {
8571fec7093SYehuda Sadeh 	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
8581fec7093SYehuda Sadeh }
8591fec7093SYehuda Sadeh 
860602adf40SYehuda Sadeh /*
861602adf40SYehuda Sadeh  * Send ceph osd request
862602adf40SYehuda Sadeh  */
863602adf40SYehuda Sadeh static int rbd_do_request(struct request *rq,
8640ce1a794SAlex Elder 			  struct rbd_device *rbd_dev,
865602adf40SYehuda Sadeh 			  struct ceph_snap_context *snapc,
866602adf40SYehuda Sadeh 			  u64 snapid,
867aded07eaSAlex Elder 			  const char *object_name, u64 ofs, u64 len,
868602adf40SYehuda Sadeh 			  struct bio *bio,
869602adf40SYehuda Sadeh 			  struct page **pages,
870602adf40SYehuda Sadeh 			  int num_pages,
871602adf40SYehuda Sadeh 			  int flags,
872602adf40SYehuda Sadeh 			  struct ceph_osd_req_op *ops,
8731fec7093SYehuda Sadeh 			  struct rbd_req_coll *coll,
8741fec7093SYehuda Sadeh 			  int coll_index,
875602adf40SYehuda Sadeh 			  void (*rbd_cb)(struct ceph_osd_request *req,
87659c2be1eSYehuda Sadeh 					 struct ceph_msg *msg),
87759c2be1eSYehuda Sadeh 			  struct ceph_osd_request **linger_req,
87859c2be1eSYehuda Sadeh 			  u64 *ver)
879602adf40SYehuda Sadeh {
880602adf40SYehuda Sadeh 	struct ceph_osd_request *req;
881602adf40SYehuda Sadeh 	struct ceph_file_layout *layout;
882602adf40SYehuda Sadeh 	int ret;
883602adf40SYehuda Sadeh 	u64 bno;
884602adf40SYehuda Sadeh 	struct timespec mtime = CURRENT_TIME;
885602adf40SYehuda Sadeh 	struct rbd_request *req_data;
886602adf40SYehuda Sadeh 	struct ceph_osd_request_head *reqhead;
8871dbb4399SAlex Elder 	struct ceph_osd_client *osdc;
888602adf40SYehuda Sadeh 
889602adf40SYehuda Sadeh 	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
8901fec7093SYehuda Sadeh 	if (!req_data) {
8911fec7093SYehuda Sadeh 		if (coll)
8921fec7093SYehuda Sadeh 			rbd_coll_end_req_index(rq, coll, coll_index,
8931fec7093SYehuda Sadeh 					       -ENOMEM, len);
8941fec7093SYehuda Sadeh 		return -ENOMEM;
8951fec7093SYehuda Sadeh 	}
896602adf40SYehuda Sadeh 
8971fec7093SYehuda Sadeh 	if (coll) {
8981fec7093SYehuda Sadeh 		req_data->coll = coll;
8991fec7093SYehuda Sadeh 		req_data->coll_index = coll_index;
9001fec7093SYehuda Sadeh 	}
9011fec7093SYehuda Sadeh 
902aded07eaSAlex Elder 	dout("rbd_do_request object_name=%s ofs=%lld len=%lld\n",
903aded07eaSAlex Elder 		object_name, len, ofs);
904602adf40SYehuda Sadeh 
9050ce1a794SAlex Elder 	down_read(&rbd_dev->header_rwsem);
906602adf40SYehuda Sadeh 
9070ce1a794SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
9081dbb4399SAlex Elder 	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
9091dbb4399SAlex Elder 					false, GFP_NOIO, pages, bio);
9104ad12621SSage Weil 	if (!req) {
9110ce1a794SAlex Elder 		up_read(&rbd_dev->header_rwsem);
9124ad12621SSage Weil 		ret = -ENOMEM;
913602adf40SYehuda Sadeh 		goto done_pages;
914602adf40SYehuda Sadeh 	}
915602adf40SYehuda Sadeh 
916602adf40SYehuda Sadeh 	req->r_callback = rbd_cb;
917602adf40SYehuda Sadeh 
918602adf40SYehuda Sadeh 	req_data->rq = rq;
919602adf40SYehuda Sadeh 	req_data->bio = bio;
920602adf40SYehuda Sadeh 	req_data->pages = pages;
921602adf40SYehuda Sadeh 	req_data->len = len;
922602adf40SYehuda Sadeh 
923602adf40SYehuda Sadeh 	req->r_priv = req_data;
924602adf40SYehuda Sadeh 
925602adf40SYehuda Sadeh 	reqhead = req->r_request->front.iov_base;
926602adf40SYehuda Sadeh 	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
927602adf40SYehuda Sadeh 
928aded07eaSAlex Elder 	strncpy(req->r_oid, object_name, sizeof(req->r_oid));
929602adf40SYehuda Sadeh 	req->r_oid_len = strlen(req->r_oid);
930602adf40SYehuda Sadeh 
931602adf40SYehuda Sadeh 	layout = &req->r_file_layout;
932602adf40SYehuda Sadeh 	memset(layout, 0, sizeof(*layout));
933602adf40SYehuda Sadeh 	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
934602adf40SYehuda Sadeh 	layout->fl_stripe_count = cpu_to_le32(1);
935602adf40SYehuda Sadeh 	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
9360ce1a794SAlex Elder 	layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
9371dbb4399SAlex Elder 	ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
9381dbb4399SAlex Elder 				req, ops);
939602adf40SYehuda Sadeh 
940602adf40SYehuda Sadeh 	ceph_osdc_build_request(req, ofs, &len,
941602adf40SYehuda Sadeh 				ops,
942602adf40SYehuda Sadeh 				snapc,
943602adf40SYehuda Sadeh 				&mtime,
944602adf40SYehuda Sadeh 				req->r_oid, req->r_oid_len);
9450ce1a794SAlex Elder 	up_read(&rbd_dev->header_rwsem);
946602adf40SYehuda Sadeh 
94759c2be1eSYehuda Sadeh 	if (linger_req) {
9481dbb4399SAlex Elder 		ceph_osdc_set_request_linger(osdc, req);
94959c2be1eSYehuda Sadeh 		*linger_req = req;
95059c2be1eSYehuda Sadeh 	}
95159c2be1eSYehuda Sadeh 
9521dbb4399SAlex Elder 	ret = ceph_osdc_start_request(osdc, req, false);
953602adf40SYehuda Sadeh 	if (ret < 0)
954602adf40SYehuda Sadeh 		goto done_err;
955602adf40SYehuda Sadeh 
956602adf40SYehuda Sadeh 	if (!rbd_cb) {
9571dbb4399SAlex Elder 		ret = ceph_osdc_wait_request(osdc, req);
95859c2be1eSYehuda Sadeh 		if (ver)
95959c2be1eSYehuda Sadeh 			*ver = le64_to_cpu(req->r_reassert_version.version);
9601fec7093SYehuda Sadeh 		dout("reassert_ver=%lld\n",
9611fec7093SYehuda Sadeh 		     le64_to_cpu(req->r_reassert_version.version));
962602adf40SYehuda Sadeh 		ceph_osdc_put_request(req);
963602adf40SYehuda Sadeh 	}
964602adf40SYehuda Sadeh 	return ret;
965602adf40SYehuda Sadeh 
966602adf40SYehuda Sadeh done_err:
967602adf40SYehuda Sadeh 	bio_chain_put(req_data->bio);
968602adf40SYehuda Sadeh 	ceph_osdc_put_request(req);
969602adf40SYehuda Sadeh done_pages:
9701fec7093SYehuda Sadeh 	rbd_coll_end_req(req_data, ret, len);
971602adf40SYehuda Sadeh 	kfree(req_data);
972602adf40SYehuda Sadeh 	return ret;
973602adf40SYehuda Sadeh }
974602adf40SYehuda Sadeh 
975602adf40SYehuda Sadeh /*
976602adf40SYehuda Sadeh  * Ceph osd op callback
977602adf40SYehuda Sadeh  */
978602adf40SYehuda Sadeh static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
979602adf40SYehuda Sadeh {
980602adf40SYehuda Sadeh 	struct rbd_request *req_data = req->r_priv;
981602adf40SYehuda Sadeh 	struct ceph_osd_reply_head *replyhead;
982602adf40SYehuda Sadeh 	struct ceph_osd_op *op;
983602adf40SYehuda Sadeh 	__s32 rc;
984602adf40SYehuda Sadeh 	u64 bytes;
985602adf40SYehuda Sadeh 	int read_op;
986602adf40SYehuda Sadeh 
987602adf40SYehuda Sadeh 	/* parse reply */
988602adf40SYehuda Sadeh 	replyhead = msg->front.iov_base;
989602adf40SYehuda Sadeh 	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
990602adf40SYehuda Sadeh 	op = (void *)(replyhead + 1);
991602adf40SYehuda Sadeh 	rc = le32_to_cpu(replyhead->result);
992602adf40SYehuda Sadeh 	bytes = le64_to_cpu(op->extent.length);
993895cfcc8SDan Carpenter 	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
994602adf40SYehuda Sadeh 
995602adf40SYehuda Sadeh 	dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc);
996602adf40SYehuda Sadeh 
997602adf40SYehuda Sadeh 	if (rc == -ENOENT && read_op) {
998602adf40SYehuda Sadeh 		zero_bio_chain(req_data->bio, 0);
999602adf40SYehuda Sadeh 		rc = 0;
1000602adf40SYehuda Sadeh 	} else if (rc == 0 && read_op && bytes < req_data->len) {
1001602adf40SYehuda Sadeh 		zero_bio_chain(req_data->bio, bytes);
1002602adf40SYehuda Sadeh 		bytes = req_data->len;
1003602adf40SYehuda Sadeh 	}
1004602adf40SYehuda Sadeh 
10051fec7093SYehuda Sadeh 	rbd_coll_end_req(req_data, rc, bytes);
1006602adf40SYehuda Sadeh 
1007602adf40SYehuda Sadeh 	if (req_data->bio)
1008602adf40SYehuda Sadeh 		bio_chain_put(req_data->bio);
1009602adf40SYehuda Sadeh 
1010602adf40SYehuda Sadeh 	ceph_osdc_put_request(req);
1011602adf40SYehuda Sadeh 	kfree(req_data);
1012602adf40SYehuda Sadeh }
1013602adf40SYehuda Sadeh 
101459c2be1eSYehuda Sadeh static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
101559c2be1eSYehuda Sadeh {
101659c2be1eSYehuda Sadeh 	ceph_osdc_put_request(req);
101759c2be1eSYehuda Sadeh }
101859c2be1eSYehuda Sadeh 
1019602adf40SYehuda Sadeh /*
1020602adf40SYehuda Sadeh  * Do a synchronous ceph osd operation
1021602adf40SYehuda Sadeh  */
10220ce1a794SAlex Elder static int rbd_req_sync_op(struct rbd_device *rbd_dev,
1023602adf40SYehuda Sadeh 			   struct ceph_snap_context *snapc,
1024602adf40SYehuda Sadeh 			   u64 snapid,
1025602adf40SYehuda Sadeh 			   int opcode,
1026602adf40SYehuda Sadeh 			   int flags,
1027602adf40SYehuda Sadeh 			   struct ceph_osd_req_op *orig_ops,
1028aded07eaSAlex Elder 			   const char *object_name,
1029602adf40SYehuda Sadeh 			   u64 ofs, u64 len,
103059c2be1eSYehuda Sadeh 			   char *buf,
103159c2be1eSYehuda Sadeh 			   struct ceph_osd_request **linger_req,
103259c2be1eSYehuda Sadeh 			   u64 *ver)
1033602adf40SYehuda Sadeh {
1034602adf40SYehuda Sadeh 	int ret;
1035602adf40SYehuda Sadeh 	struct page **pages;
1036602adf40SYehuda Sadeh 	int num_pages;
1037602adf40SYehuda Sadeh 	struct ceph_osd_req_op *ops = orig_ops;
1038602adf40SYehuda Sadeh 	u32 payload_len;
1039602adf40SYehuda Sadeh 
1040602adf40SYehuda Sadeh 	num_pages = calc_pages_for(ofs , len);
1041602adf40SYehuda Sadeh 	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
1042b8d0638aSDan Carpenter 	if (IS_ERR(pages))
1043b8d0638aSDan Carpenter 		return PTR_ERR(pages);
1044602adf40SYehuda Sadeh 
1045602adf40SYehuda Sadeh 	if (!orig_ops) {
1046602adf40SYehuda Sadeh 		payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0);
1047602adf40SYehuda Sadeh 		ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
1048602adf40SYehuda Sadeh 		if (ret < 0)
1049602adf40SYehuda Sadeh 			goto done;
1050602adf40SYehuda Sadeh 
1051602adf40SYehuda Sadeh 		if ((flags & CEPH_OSD_FLAG_WRITE) && buf) {
1052602adf40SYehuda Sadeh 			ret = ceph_copy_to_page_vector(pages, buf, ofs, len);
1053602adf40SYehuda Sadeh 			if (ret < 0)
1054602adf40SYehuda Sadeh 				goto done_ops;
1055602adf40SYehuda Sadeh 		}
1056602adf40SYehuda Sadeh 	}
1057602adf40SYehuda Sadeh 
10580ce1a794SAlex Elder 	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
1059aded07eaSAlex Elder 			  object_name, ofs, len, NULL,
1060602adf40SYehuda Sadeh 			  pages, num_pages,
1061602adf40SYehuda Sadeh 			  flags,
1062602adf40SYehuda Sadeh 			  ops,
10631fec7093SYehuda Sadeh 			  NULL, 0,
106459c2be1eSYehuda Sadeh 			  NULL,
106559c2be1eSYehuda Sadeh 			  linger_req, ver);
1066602adf40SYehuda Sadeh 	if (ret < 0)
1067602adf40SYehuda Sadeh 		goto done_ops;
1068602adf40SYehuda Sadeh 
1069602adf40SYehuda Sadeh 	if ((flags & CEPH_OSD_FLAG_READ) && buf)
1070602adf40SYehuda Sadeh 		ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);
1071602adf40SYehuda Sadeh 
1072602adf40SYehuda Sadeh done_ops:
1073602adf40SYehuda Sadeh 	if (!orig_ops)
1074602adf40SYehuda Sadeh 		rbd_destroy_ops(ops);
1075602adf40SYehuda Sadeh done:
1076602adf40SYehuda Sadeh 	ceph_release_page_vector(pages, num_pages);
1077602adf40SYehuda Sadeh 	return ret;
1078602adf40SYehuda Sadeh }
1079602adf40SYehuda Sadeh 
1080602adf40SYehuda Sadeh /*
1081602adf40SYehuda Sadeh  * Do an asynchronous ceph osd operation
1082602adf40SYehuda Sadeh  */
1083602adf40SYehuda Sadeh static int rbd_do_op(struct request *rq,
1084602adf40SYehuda Sadeh 		     struct rbd_device *rbd_dev,
1085602adf40SYehuda Sadeh 		     struct ceph_snap_context *snapc,
1086602adf40SYehuda Sadeh 		     u64 snapid,
1087d1f57ea6SAlex Elder 		     int opcode, int flags,
1088602adf40SYehuda Sadeh 		     u64 ofs, u64 len,
10891fec7093SYehuda Sadeh 		     struct bio *bio,
10901fec7093SYehuda Sadeh 		     struct rbd_req_coll *coll,
10911fec7093SYehuda Sadeh 		     int coll_index)
1092602adf40SYehuda Sadeh {
1093602adf40SYehuda Sadeh 	char *seg_name;
1094602adf40SYehuda Sadeh 	u64 seg_ofs;
1095602adf40SYehuda Sadeh 	u64 seg_len;
1096602adf40SYehuda Sadeh 	int ret;
1097602adf40SYehuda Sadeh 	struct ceph_osd_req_op *ops;
1098602adf40SYehuda Sadeh 	u32 payload_len;
1099602adf40SYehuda Sadeh 
1100602adf40SYehuda Sadeh 	seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
1101602adf40SYehuda Sadeh 	if (!seg_name)
1102602adf40SYehuda Sadeh 		return -ENOMEM;
1103602adf40SYehuda Sadeh 
1104602adf40SYehuda Sadeh 	seg_len = rbd_get_segment(&rbd_dev->header,
1105ca1e49a6SAlex Elder 				  rbd_dev->header.object_prefix,
1106602adf40SYehuda Sadeh 				  ofs, len,
1107602adf40SYehuda Sadeh 				  seg_name, &seg_ofs);
1108602adf40SYehuda Sadeh 
1109602adf40SYehuda Sadeh 	payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
1110602adf40SYehuda Sadeh 
1111602adf40SYehuda Sadeh 	ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
1112602adf40SYehuda Sadeh 	if (ret < 0)
1113602adf40SYehuda Sadeh 		goto done;
1114602adf40SYehuda Sadeh 
1115602adf40SYehuda Sadeh 	/* we've taken care of segment sizes earlier when we
1116602adf40SYehuda Sadeh 	   cloned the bios. We should never have a segment
1117602adf40SYehuda Sadeh 	   truncated at this point */
1118602adf40SYehuda Sadeh 	BUG_ON(seg_len < len);
1119602adf40SYehuda Sadeh 
1120602adf40SYehuda Sadeh 	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1121602adf40SYehuda Sadeh 			     seg_name, seg_ofs, seg_len,
1122602adf40SYehuda Sadeh 			     bio,
1123602adf40SYehuda Sadeh 			     NULL, 0,
1124602adf40SYehuda Sadeh 			     flags,
1125602adf40SYehuda Sadeh 			     ops,
11261fec7093SYehuda Sadeh 			     coll, coll_index,
112759c2be1eSYehuda Sadeh 			     rbd_req_cb, 0, NULL);
112811f77002SSage Weil 
112911f77002SSage Weil 	rbd_destroy_ops(ops);
1130602adf40SYehuda Sadeh done:
1131602adf40SYehuda Sadeh 	kfree(seg_name);
1132602adf40SYehuda Sadeh 	return ret;
1133602adf40SYehuda Sadeh }
1134602adf40SYehuda Sadeh 
1135602adf40SYehuda Sadeh /*
1136602adf40SYehuda Sadeh  * Request async osd write
1137602adf40SYehuda Sadeh  */
1138602adf40SYehuda Sadeh static int rbd_req_write(struct request *rq,
1139602adf40SYehuda Sadeh 			 struct rbd_device *rbd_dev,
1140602adf40SYehuda Sadeh 			 struct ceph_snap_context *snapc,
1141602adf40SYehuda Sadeh 			 u64 ofs, u64 len,
11421fec7093SYehuda Sadeh 			 struct bio *bio,
11431fec7093SYehuda Sadeh 			 struct rbd_req_coll *coll,
11441fec7093SYehuda Sadeh 			 int coll_index)
1145602adf40SYehuda Sadeh {
1146602adf40SYehuda Sadeh 	return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1147602adf40SYehuda Sadeh 			 CEPH_OSD_OP_WRITE,
1148602adf40SYehuda Sadeh 			 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
11491fec7093SYehuda Sadeh 			 ofs, len, bio, coll, coll_index);
1150602adf40SYehuda Sadeh }
1151602adf40SYehuda Sadeh 
1152602adf40SYehuda Sadeh /*
1153602adf40SYehuda Sadeh  * Request async osd read
1154602adf40SYehuda Sadeh  */
1155602adf40SYehuda Sadeh static int rbd_req_read(struct request *rq,
1156602adf40SYehuda Sadeh 			 struct rbd_device *rbd_dev,
1157602adf40SYehuda Sadeh 			 u64 snapid,
1158602adf40SYehuda Sadeh 			 u64 ofs, u64 len,
11591fec7093SYehuda Sadeh 			 struct bio *bio,
11601fec7093SYehuda Sadeh 			 struct rbd_req_coll *coll,
11611fec7093SYehuda Sadeh 			 int coll_index)
1162602adf40SYehuda Sadeh {
1163602adf40SYehuda Sadeh 	return rbd_do_op(rq, rbd_dev, NULL,
1164b06e6a6bSJosh Durgin 			 snapid,
1165602adf40SYehuda Sadeh 			 CEPH_OSD_OP_READ,
1166602adf40SYehuda Sadeh 			 CEPH_OSD_FLAG_READ,
11671fec7093SYehuda Sadeh 			 ofs, len, bio, coll, coll_index);
1168602adf40SYehuda Sadeh }
1169602adf40SYehuda Sadeh 
1170602adf40SYehuda Sadeh /*
1171602adf40SYehuda Sadeh  * Request sync osd read
1172602adf40SYehuda Sadeh  */
11730ce1a794SAlex Elder static int rbd_req_sync_read(struct rbd_device *rbd_dev,
1174602adf40SYehuda Sadeh 			  struct ceph_snap_context *snapc,
1175602adf40SYehuda Sadeh 			  u64 snapid,
1176aded07eaSAlex Elder 			  const char *object_name,
1177602adf40SYehuda Sadeh 			  u64 ofs, u64 len,
117859c2be1eSYehuda Sadeh 			  char *buf,
117959c2be1eSYehuda Sadeh 			  u64 *ver)
1180602adf40SYehuda Sadeh {
11810ce1a794SAlex Elder 	return rbd_req_sync_op(rbd_dev, NULL,
1182b06e6a6bSJosh Durgin 			       snapid,
1183602adf40SYehuda Sadeh 			       CEPH_OSD_OP_READ,
1184602adf40SYehuda Sadeh 			       CEPH_OSD_FLAG_READ,
1185602adf40SYehuda Sadeh 			       NULL,
1186d1f57ea6SAlex Elder 			       object_name, ofs, len, buf, NULL, ver);
1187602adf40SYehuda Sadeh }
1188602adf40SYehuda Sadeh 
1189602adf40SYehuda Sadeh /*
119059c2be1eSYehuda Sadeh  * Request sync osd watch
119159c2be1eSYehuda Sadeh  */
11920ce1a794SAlex Elder static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
119359c2be1eSYehuda Sadeh 				   u64 ver,
119459c2be1eSYehuda Sadeh 				   u64 notify_id,
1195aded07eaSAlex Elder 				   const char *object_name)
119659c2be1eSYehuda Sadeh {
119759c2be1eSYehuda Sadeh 	struct ceph_osd_req_op *ops;
119811f77002SSage Weil 	int ret;
119911f77002SSage Weil 
120011f77002SSage Weil 	ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY_ACK, 0);
120159c2be1eSYehuda Sadeh 	if (ret < 0)
120259c2be1eSYehuda Sadeh 		return ret;
120359c2be1eSYehuda Sadeh 
12040ce1a794SAlex Elder 	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
120559c2be1eSYehuda Sadeh 	ops[0].watch.cookie = notify_id;
120659c2be1eSYehuda Sadeh 	ops[0].watch.flag = 0;
120759c2be1eSYehuda Sadeh 
12080ce1a794SAlex Elder 	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
1209aded07eaSAlex Elder 			  object_name, 0, 0, NULL,
1210ad4f232fSAlex Elder 			  NULL, 0,
121159c2be1eSYehuda Sadeh 			  CEPH_OSD_FLAG_READ,
121259c2be1eSYehuda Sadeh 			  ops,
12131fec7093SYehuda Sadeh 			  NULL, 0,
121459c2be1eSYehuda Sadeh 			  rbd_simple_req_cb, 0, NULL);
121559c2be1eSYehuda Sadeh 
121659c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
121759c2be1eSYehuda Sadeh 	return ret;
121859c2be1eSYehuda Sadeh }
121959c2be1eSYehuda Sadeh 
122059c2be1eSYehuda Sadeh static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
122159c2be1eSYehuda Sadeh {
12220ce1a794SAlex Elder 	struct rbd_device *rbd_dev = (struct rbd_device *)data;
122313143d2dSSage Weil 	int rc;
122413143d2dSSage Weil 
12250ce1a794SAlex Elder 	if (!rbd_dev)
122659c2be1eSYehuda Sadeh 		return;
122759c2be1eSYehuda Sadeh 
12280bed54dcSAlex Elder 	dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n",
12290bed54dcSAlex Elder 		rbd_dev->header_name, notify_id, (int) opcode);
123059c2be1eSYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
12310ce1a794SAlex Elder 	rc = __rbd_refresh_header(rbd_dev);
123259c2be1eSYehuda Sadeh 	mutex_unlock(&ctl_mutex);
123313143d2dSSage Weil 	if (rc)
1234f0f8cef5SAlex Elder 		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
12350ce1a794SAlex Elder 			   " update snaps: %d\n", rbd_dev->major, rc);
123659c2be1eSYehuda Sadeh 
12370bed54dcSAlex Elder 	rbd_req_sync_notify_ack(rbd_dev, ver, notify_id, rbd_dev->header_name);
123859c2be1eSYehuda Sadeh }
123959c2be1eSYehuda Sadeh 
124059c2be1eSYehuda Sadeh /*
124159c2be1eSYehuda Sadeh  * Request sync osd watch
124259c2be1eSYehuda Sadeh  */
12430ce1a794SAlex Elder static int rbd_req_sync_watch(struct rbd_device *rbd_dev,
1244aded07eaSAlex Elder 			      const char *object_name,
124559c2be1eSYehuda Sadeh 			      u64 ver)
124659c2be1eSYehuda Sadeh {
124759c2be1eSYehuda Sadeh 	struct ceph_osd_req_op *ops;
12480ce1a794SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
124959c2be1eSYehuda Sadeh 
125059c2be1eSYehuda Sadeh 	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
125159c2be1eSYehuda Sadeh 	if (ret < 0)
125259c2be1eSYehuda Sadeh 		return ret;
125359c2be1eSYehuda Sadeh 
125459c2be1eSYehuda Sadeh 	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
12550ce1a794SAlex Elder 				     (void *)rbd_dev, &rbd_dev->watch_event);
125659c2be1eSYehuda Sadeh 	if (ret < 0)
125759c2be1eSYehuda Sadeh 		goto fail;
125859c2be1eSYehuda Sadeh 
125959c2be1eSYehuda Sadeh 	ops[0].watch.ver = cpu_to_le64(ver);
12600ce1a794SAlex Elder 	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
126159c2be1eSYehuda Sadeh 	ops[0].watch.flag = 1;
126259c2be1eSYehuda Sadeh 
12630ce1a794SAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
126459c2be1eSYehuda Sadeh 			      CEPH_NOSNAP,
126559c2be1eSYehuda Sadeh 			      0,
126659c2be1eSYehuda Sadeh 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
126759c2be1eSYehuda Sadeh 			      ops,
1268d1f57ea6SAlex Elder 			      object_name, 0, 0, NULL,
12690ce1a794SAlex Elder 			      &rbd_dev->watch_request, NULL);
127059c2be1eSYehuda Sadeh 
127159c2be1eSYehuda Sadeh 	if (ret < 0)
127259c2be1eSYehuda Sadeh 		goto fail_event;
127359c2be1eSYehuda Sadeh 
127459c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
127559c2be1eSYehuda Sadeh 	return 0;
127659c2be1eSYehuda Sadeh 
127759c2be1eSYehuda Sadeh fail_event:
12780ce1a794SAlex Elder 	ceph_osdc_cancel_event(rbd_dev->watch_event);
12790ce1a794SAlex Elder 	rbd_dev->watch_event = NULL;
128059c2be1eSYehuda Sadeh fail:
128159c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
128259c2be1eSYehuda Sadeh 	return ret;
128359c2be1eSYehuda Sadeh }
128459c2be1eSYehuda Sadeh 
128579e3057cSYehuda Sadeh /*
128679e3057cSYehuda Sadeh  * Request sync osd unwatch
128779e3057cSYehuda Sadeh  */
12880ce1a794SAlex Elder static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev,
1289aded07eaSAlex Elder 				const char *object_name)
129079e3057cSYehuda Sadeh {
129179e3057cSYehuda Sadeh 	struct ceph_osd_req_op *ops;
129279e3057cSYehuda Sadeh 
129379e3057cSYehuda Sadeh 	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
129479e3057cSYehuda Sadeh 	if (ret < 0)
129579e3057cSYehuda Sadeh 		return ret;
129679e3057cSYehuda Sadeh 
129779e3057cSYehuda Sadeh 	ops[0].watch.ver = 0;
12980ce1a794SAlex Elder 	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
129979e3057cSYehuda Sadeh 	ops[0].watch.flag = 0;
130079e3057cSYehuda Sadeh 
13010ce1a794SAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
130279e3057cSYehuda Sadeh 			      CEPH_NOSNAP,
130379e3057cSYehuda Sadeh 			      0,
130479e3057cSYehuda Sadeh 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
130579e3057cSYehuda Sadeh 			      ops,
1306d1f57ea6SAlex Elder 			      object_name, 0, 0, NULL, NULL, NULL);
130779e3057cSYehuda Sadeh 
130879e3057cSYehuda Sadeh 	rbd_destroy_ops(ops);
13090ce1a794SAlex Elder 	ceph_osdc_cancel_event(rbd_dev->watch_event);
13100ce1a794SAlex Elder 	rbd_dev->watch_event = NULL;
131179e3057cSYehuda Sadeh 	return ret;
131279e3057cSYehuda Sadeh }
131379e3057cSYehuda Sadeh 
131459c2be1eSYehuda Sadeh struct rbd_notify_info {
13150ce1a794SAlex Elder 	struct rbd_device *rbd_dev;
131659c2be1eSYehuda Sadeh };
131759c2be1eSYehuda Sadeh 
131859c2be1eSYehuda Sadeh static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
131959c2be1eSYehuda Sadeh {
13200ce1a794SAlex Elder 	struct rbd_device *rbd_dev = (struct rbd_device *)data;
13210ce1a794SAlex Elder 	if (!rbd_dev)
132259c2be1eSYehuda Sadeh 		return;
132359c2be1eSYehuda Sadeh 
13240ce1a794SAlex Elder 	dout("rbd_notify_cb %s notify_id=%lld opcode=%d\n",
13250bed54dcSAlex Elder 				rbd_dev->header_name,
132659c2be1eSYehuda Sadeh 		notify_id, (int)opcode);
132759c2be1eSYehuda Sadeh }
132859c2be1eSYehuda Sadeh 
132959c2be1eSYehuda Sadeh /*
133059c2be1eSYehuda Sadeh  * Request sync osd notify
133159c2be1eSYehuda Sadeh  */
13320ce1a794SAlex Elder static int rbd_req_sync_notify(struct rbd_device *rbd_dev,
1333aded07eaSAlex Elder 		          const char *object_name)
133459c2be1eSYehuda Sadeh {
133559c2be1eSYehuda Sadeh 	struct ceph_osd_req_op *ops;
13360ce1a794SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
133759c2be1eSYehuda Sadeh 	struct ceph_osd_event *event;
133859c2be1eSYehuda Sadeh 	struct rbd_notify_info info;
133959c2be1eSYehuda Sadeh 	int payload_len = sizeof(u32) + sizeof(u32);
134059c2be1eSYehuda Sadeh 	int ret;
134159c2be1eSYehuda Sadeh 
134259c2be1eSYehuda Sadeh 	ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY, payload_len);
134359c2be1eSYehuda Sadeh 	if (ret < 0)
134459c2be1eSYehuda Sadeh 		return ret;
134559c2be1eSYehuda Sadeh 
13460ce1a794SAlex Elder 	info.rbd_dev = rbd_dev;
134759c2be1eSYehuda Sadeh 
134859c2be1eSYehuda Sadeh 	ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
134959c2be1eSYehuda Sadeh 				     (void *)&info, &event);
135059c2be1eSYehuda Sadeh 	if (ret < 0)
135159c2be1eSYehuda Sadeh 		goto fail;
135259c2be1eSYehuda Sadeh 
135359c2be1eSYehuda Sadeh 	ops[0].watch.ver = 1;
135459c2be1eSYehuda Sadeh 	ops[0].watch.flag = 1;
135559c2be1eSYehuda Sadeh 	ops[0].watch.cookie = event->cookie;
135659c2be1eSYehuda Sadeh 	ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
135759c2be1eSYehuda Sadeh 	ops[0].watch.timeout = 12;
135859c2be1eSYehuda Sadeh 
13590ce1a794SAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
136059c2be1eSYehuda Sadeh 			       CEPH_NOSNAP,
136159c2be1eSYehuda Sadeh 			       0,
136259c2be1eSYehuda Sadeh 			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
136359c2be1eSYehuda Sadeh 			       ops,
1364d1f57ea6SAlex Elder 			       object_name, 0, 0, NULL, NULL, NULL);
136559c2be1eSYehuda Sadeh 	if (ret < 0)
136659c2be1eSYehuda Sadeh 		goto fail_event;
136759c2be1eSYehuda Sadeh 
136859c2be1eSYehuda Sadeh 	ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
136959c2be1eSYehuda Sadeh 	dout("ceph_osdc_wait_event returned %d\n", ret);
137059c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
137159c2be1eSYehuda Sadeh 	return 0;
137259c2be1eSYehuda Sadeh 
137359c2be1eSYehuda Sadeh fail_event:
137459c2be1eSYehuda Sadeh 	ceph_osdc_cancel_event(event);
137559c2be1eSYehuda Sadeh fail:
137659c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
137759c2be1eSYehuda Sadeh 	return ret;
137859c2be1eSYehuda Sadeh }
137959c2be1eSYehuda Sadeh 
138059c2be1eSYehuda Sadeh /*
1381602adf40SYehuda Sadeh  * Request sync osd read
1382602adf40SYehuda Sadeh  */
13830ce1a794SAlex Elder static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
1384aded07eaSAlex Elder 			     const char *object_name,
1385aded07eaSAlex Elder 			     const char *class_name,
1386aded07eaSAlex Elder 			     const char *method_name,
1387602adf40SYehuda Sadeh 			     const char *data,
138859c2be1eSYehuda Sadeh 			     int len,
138959c2be1eSYehuda Sadeh 			     u64 *ver)
1390602adf40SYehuda Sadeh {
1391602adf40SYehuda Sadeh 	struct ceph_osd_req_op *ops;
1392aded07eaSAlex Elder 	int class_name_len = strlen(class_name);
1393aded07eaSAlex Elder 	int method_name_len = strlen(method_name);
1394602adf40SYehuda Sadeh 	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL,
1395aded07eaSAlex Elder 				    class_name_len + method_name_len + len);
1396602adf40SYehuda Sadeh 	if (ret < 0)
1397602adf40SYehuda Sadeh 		return ret;
1398602adf40SYehuda Sadeh 
1399aded07eaSAlex Elder 	ops[0].cls.class_name = class_name;
1400aded07eaSAlex Elder 	ops[0].cls.class_len = (__u8) class_name_len;
1401aded07eaSAlex Elder 	ops[0].cls.method_name = method_name;
1402aded07eaSAlex Elder 	ops[0].cls.method_len = (__u8) method_name_len;
1403602adf40SYehuda Sadeh 	ops[0].cls.argc = 0;
1404602adf40SYehuda Sadeh 	ops[0].cls.indata = data;
1405602adf40SYehuda Sadeh 	ops[0].cls.indata_len = len;
1406602adf40SYehuda Sadeh 
14070ce1a794SAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
1408602adf40SYehuda Sadeh 			       CEPH_NOSNAP,
1409602adf40SYehuda Sadeh 			       0,
1410602adf40SYehuda Sadeh 			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1411602adf40SYehuda Sadeh 			       ops,
1412d1f57ea6SAlex Elder 			       object_name, 0, 0, NULL, NULL, ver);
1413602adf40SYehuda Sadeh 
1414602adf40SYehuda Sadeh 	rbd_destroy_ops(ops);
1415602adf40SYehuda Sadeh 
1416602adf40SYehuda Sadeh 	dout("cls_exec returned %d\n", ret);
1417602adf40SYehuda Sadeh 	return ret;
1418602adf40SYehuda Sadeh }
1419602adf40SYehuda Sadeh 
14201fec7093SYehuda Sadeh static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
14211fec7093SYehuda Sadeh {
14221fec7093SYehuda Sadeh 	struct rbd_req_coll *coll =
14231fec7093SYehuda Sadeh 			kzalloc(sizeof(struct rbd_req_coll) +
14241fec7093SYehuda Sadeh 			        sizeof(struct rbd_req_status) * num_reqs,
14251fec7093SYehuda Sadeh 				GFP_ATOMIC);
14261fec7093SYehuda Sadeh 
14271fec7093SYehuda Sadeh 	if (!coll)
14281fec7093SYehuda Sadeh 		return NULL;
14291fec7093SYehuda Sadeh 	coll->total = num_reqs;
14301fec7093SYehuda Sadeh 	kref_init(&coll->kref);
14311fec7093SYehuda Sadeh 	return coll;
14321fec7093SYehuda Sadeh }
14331fec7093SYehuda Sadeh 
1434602adf40SYehuda Sadeh /*
1435602adf40SYehuda Sadeh  * block device queue callback
1436602adf40SYehuda Sadeh  */
1437602adf40SYehuda Sadeh static void rbd_rq_fn(struct request_queue *q)
1438602adf40SYehuda Sadeh {
1439602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
1440602adf40SYehuda Sadeh 	struct request *rq;
1441602adf40SYehuda Sadeh 	struct bio_pair *bp = NULL;
1442602adf40SYehuda Sadeh 
144300f1f36fSAlex Elder 	while ((rq = blk_fetch_request(q))) {
1444602adf40SYehuda Sadeh 		struct bio *bio;
1445602adf40SYehuda Sadeh 		struct bio *rq_bio, *next_bio = NULL;
1446602adf40SYehuda Sadeh 		bool do_write;
1447602adf40SYehuda Sadeh 		int size, op_size = 0;
1448602adf40SYehuda Sadeh 		u64 ofs;
14491fec7093SYehuda Sadeh 		int num_segs, cur_seg = 0;
14501fec7093SYehuda Sadeh 		struct rbd_req_coll *coll;
1451602adf40SYehuda Sadeh 
1452602adf40SYehuda Sadeh 		/* peek at request from block layer */
1453602adf40SYehuda Sadeh 		if (!rq)
1454602adf40SYehuda Sadeh 			break;
1455602adf40SYehuda Sadeh 
1456602adf40SYehuda Sadeh 		dout("fetched request\n");
1457602adf40SYehuda Sadeh 
1458602adf40SYehuda Sadeh 		/* filter out block requests we don't understand */
1459602adf40SYehuda Sadeh 		if ((rq->cmd_type != REQ_TYPE_FS)) {
1460602adf40SYehuda Sadeh 			__blk_end_request_all(rq, 0);
146100f1f36fSAlex Elder 			continue;
1462602adf40SYehuda Sadeh 		}
1463602adf40SYehuda Sadeh 
1464602adf40SYehuda Sadeh 		/* deduce our operation (read, write) */
1465602adf40SYehuda Sadeh 		do_write = (rq_data_dir(rq) == WRITE);
1466602adf40SYehuda Sadeh 
1467602adf40SYehuda Sadeh 		size = blk_rq_bytes(rq);
1468593a9e7bSAlex Elder 		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
1469602adf40SYehuda Sadeh 		rq_bio = rq->bio;
1470602adf40SYehuda Sadeh 		if (do_write && rbd_dev->read_only) {
1471602adf40SYehuda Sadeh 			__blk_end_request_all(rq, -EROFS);
147200f1f36fSAlex Elder 			continue;
1473602adf40SYehuda Sadeh 		}
1474602adf40SYehuda Sadeh 
1475602adf40SYehuda Sadeh 		spin_unlock_irq(q->queue_lock);
1476602adf40SYehuda Sadeh 
1477e88a36ecSJosh Durgin 		if (rbd_dev->snap_id != CEPH_NOSNAP) {
1478e88a36ecSJosh Durgin 			bool snap_exists;
1479e88a36ecSJosh Durgin 
1480e88a36ecSJosh Durgin 			down_read(&rbd_dev->header_rwsem);
1481e88a36ecSJosh Durgin 			snap_exists = rbd_dev->snap_exists;
1482e88a36ecSJosh Durgin 			up_read(&rbd_dev->header_rwsem);
1483e88a36ecSJosh Durgin 
1484e88a36ecSJosh Durgin 			if (!snap_exists) {
1485e88a36ecSJosh Durgin 				dout("request for non-existent snapshot");
1486e88a36ecSJosh Durgin 				spin_lock_irq(q->queue_lock);
1487e88a36ecSJosh Durgin 				__blk_end_request_all(rq, -ENXIO);
1488e88a36ecSJosh Durgin 				continue;
1489e88a36ecSJosh Durgin 			}
1490e88a36ecSJosh Durgin 		}
1491e88a36ecSJosh Durgin 
1492602adf40SYehuda Sadeh 		dout("%s 0x%x bytes at 0x%llx\n",
1493602adf40SYehuda Sadeh 		     do_write ? "write" : "read",
1494593a9e7bSAlex Elder 		     size, blk_rq_pos(rq) * SECTOR_SIZE);
1495602adf40SYehuda Sadeh 
14961fec7093SYehuda Sadeh 		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
14971fec7093SYehuda Sadeh 		coll = rbd_alloc_coll(num_segs);
14981fec7093SYehuda Sadeh 		if (!coll) {
14991fec7093SYehuda Sadeh 			spin_lock_irq(q->queue_lock);
15001fec7093SYehuda Sadeh 			__blk_end_request_all(rq, -ENOMEM);
150100f1f36fSAlex Elder 			continue;
15021fec7093SYehuda Sadeh 		}
15031fec7093SYehuda Sadeh 
1504602adf40SYehuda Sadeh 		do {
1505602adf40SYehuda Sadeh 			/* a bio clone to be passed down to OSD req */
1506602adf40SYehuda Sadeh 			dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt);
1507602adf40SYehuda Sadeh 			op_size = rbd_get_segment(&rbd_dev->header,
1508ca1e49a6SAlex Elder 						  rbd_dev->header.object_prefix,
1509602adf40SYehuda Sadeh 						  ofs, size,
1510602adf40SYehuda Sadeh 						  NULL, NULL);
15111fec7093SYehuda Sadeh 			kref_get(&coll->kref);
1512602adf40SYehuda Sadeh 			bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
1513602adf40SYehuda Sadeh 					      op_size, GFP_ATOMIC);
1514602adf40SYehuda Sadeh 			if (!bio) {
15151fec7093SYehuda Sadeh 				rbd_coll_end_req_index(rq, coll, cur_seg,
15161fec7093SYehuda Sadeh 						       -ENOMEM, op_size);
15171fec7093SYehuda Sadeh 				goto next_seg;
1518602adf40SYehuda Sadeh 			}
1519602adf40SYehuda Sadeh 
15201fec7093SYehuda Sadeh 
1521602adf40SYehuda Sadeh 			/* init OSD command: write or read */
1522602adf40SYehuda Sadeh 			if (do_write)
1523602adf40SYehuda Sadeh 				rbd_req_write(rq, rbd_dev,
1524602adf40SYehuda Sadeh 					      rbd_dev->header.snapc,
1525602adf40SYehuda Sadeh 					      ofs,
15261fec7093SYehuda Sadeh 					      op_size, bio,
15271fec7093SYehuda Sadeh 					      coll, cur_seg);
1528602adf40SYehuda Sadeh 			else
1529602adf40SYehuda Sadeh 				rbd_req_read(rq, rbd_dev,
153077dfe99fSJosh Durgin 					     rbd_dev->snap_id,
1531602adf40SYehuda Sadeh 					     ofs,
15321fec7093SYehuda Sadeh 					     op_size, bio,
15331fec7093SYehuda Sadeh 					     coll, cur_seg);
1534602adf40SYehuda Sadeh 
15351fec7093SYehuda Sadeh next_seg:
1536602adf40SYehuda Sadeh 			size -= op_size;
1537602adf40SYehuda Sadeh 			ofs += op_size;
1538602adf40SYehuda Sadeh 
15391fec7093SYehuda Sadeh 			cur_seg++;
1540602adf40SYehuda Sadeh 			rq_bio = next_bio;
1541602adf40SYehuda Sadeh 		} while (size > 0);
15421fec7093SYehuda Sadeh 		kref_put(&coll->kref, rbd_coll_release);
1543602adf40SYehuda Sadeh 
1544602adf40SYehuda Sadeh 		if (bp)
1545602adf40SYehuda Sadeh 			bio_pair_release(bp);
1546602adf40SYehuda Sadeh 		spin_lock_irq(q->queue_lock);
1547602adf40SYehuda Sadeh 	}
1548602adf40SYehuda Sadeh }
1549602adf40SYehuda Sadeh 
1550602adf40SYehuda Sadeh /*
1551602adf40SYehuda Sadeh  * a queue callback. Makes sure that we don't create a bio that spans across
1552602adf40SYehuda Sadeh  * multiple osd objects. One exception would be with a single page bios,
1553602adf40SYehuda Sadeh  * which we handle later at bio_chain_clone
1554602adf40SYehuda Sadeh  */
1555602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1556602adf40SYehuda Sadeh 			  struct bio_vec *bvec)
1557602adf40SYehuda Sadeh {
1558602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
1559593a9e7bSAlex Elder 	unsigned int chunk_sectors;
1560593a9e7bSAlex Elder 	sector_t sector;
1561593a9e7bSAlex Elder 	unsigned int bio_sectors;
1562602adf40SYehuda Sadeh 	int max;
1563602adf40SYehuda Sadeh 
1564593a9e7bSAlex Elder 	chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1565593a9e7bSAlex Elder 	sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
1566593a9e7bSAlex Elder 	bio_sectors = bmd->bi_size >> SECTOR_SHIFT;
1567593a9e7bSAlex Elder 
1568602adf40SYehuda Sadeh 	max =  (chunk_sectors - ((sector & (chunk_sectors - 1))
1569593a9e7bSAlex Elder 				 + bio_sectors)) << SECTOR_SHIFT;
1570602adf40SYehuda Sadeh 	if (max < 0)
1571602adf40SYehuda Sadeh 		max = 0; /* bio_add cannot handle a negative return */
1572602adf40SYehuda Sadeh 	if (max <= bvec->bv_len && bio_sectors == 0)
1573602adf40SYehuda Sadeh 		return bvec->bv_len;
1574602adf40SYehuda Sadeh 	return max;
1575602adf40SYehuda Sadeh }
1576602adf40SYehuda Sadeh 
1577602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
1578602adf40SYehuda Sadeh {
1579602adf40SYehuda Sadeh 	struct gendisk *disk = rbd_dev->disk;
1580602adf40SYehuda Sadeh 
1581602adf40SYehuda Sadeh 	if (!disk)
1582602adf40SYehuda Sadeh 		return;
1583602adf40SYehuda Sadeh 
1584602adf40SYehuda Sadeh 	rbd_header_free(&rbd_dev->header);
1585602adf40SYehuda Sadeh 
1586602adf40SYehuda Sadeh 	if (disk->flags & GENHD_FL_UP)
1587602adf40SYehuda Sadeh 		del_gendisk(disk);
1588602adf40SYehuda Sadeh 	if (disk->queue)
1589602adf40SYehuda Sadeh 		blk_cleanup_queue(disk->queue);
1590602adf40SYehuda Sadeh 	put_disk(disk);
1591602adf40SYehuda Sadeh }
1592602adf40SYehuda Sadeh 
1593602adf40SYehuda Sadeh /*
1594602adf40SYehuda Sadeh  * reload the ondisk the header
1595602adf40SYehuda Sadeh  */
1596602adf40SYehuda Sadeh static int rbd_read_header(struct rbd_device *rbd_dev,
1597602adf40SYehuda Sadeh 			   struct rbd_image_header *header)
1598602adf40SYehuda Sadeh {
1599602adf40SYehuda Sadeh 	ssize_t rc;
1600602adf40SYehuda Sadeh 	struct rbd_image_header_ondisk *dh;
160150f7c4c9SXi Wang 	u32 snap_count = 0;
160259c2be1eSYehuda Sadeh 	u64 ver;
160300f1f36fSAlex Elder 	size_t len;
1604602adf40SYehuda Sadeh 
160500f1f36fSAlex Elder 	/*
160600f1f36fSAlex Elder 	 * First reads the fixed-size header to determine the number
160700f1f36fSAlex Elder 	 * of snapshots, then re-reads it, along with all snapshot
160800f1f36fSAlex Elder 	 * records as well as their stored names.
160900f1f36fSAlex Elder 	 */
161000f1f36fSAlex Elder 	len = sizeof (*dh);
1611602adf40SYehuda Sadeh 	while (1) {
1612602adf40SYehuda Sadeh 		dh = kmalloc(len, GFP_KERNEL);
1613602adf40SYehuda Sadeh 		if (!dh)
1614602adf40SYehuda Sadeh 			return -ENOMEM;
1615602adf40SYehuda Sadeh 
1616602adf40SYehuda Sadeh 		rc = rbd_req_sync_read(rbd_dev,
1617602adf40SYehuda Sadeh 				       NULL, CEPH_NOSNAP,
16180bed54dcSAlex Elder 				       rbd_dev->header_name,
1619602adf40SYehuda Sadeh 				       0, len,
162059c2be1eSYehuda Sadeh 				       (char *)dh, &ver);
1621602adf40SYehuda Sadeh 		if (rc < 0)
1622602adf40SYehuda Sadeh 			goto out_dh;
1623602adf40SYehuda Sadeh 
1624602adf40SYehuda Sadeh 		rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL);
162581e759fbSJosh Durgin 		if (rc < 0) {
162600f1f36fSAlex Elder 			if (rc == -ENXIO)
162781e759fbSJosh Durgin 				pr_warning("unrecognized header format"
16280bed54dcSAlex Elder 					   " for image %s\n",
16290bed54dcSAlex Elder 					   rbd_dev->image_name);
1630602adf40SYehuda Sadeh 			goto out_dh;
163181e759fbSJosh Durgin 		}
1632602adf40SYehuda Sadeh 
163300f1f36fSAlex Elder 		if (snap_count == header->total_snaps)
163400f1f36fSAlex Elder 			break;
163500f1f36fSAlex Elder 
1636602adf40SYehuda Sadeh 		snap_count = header->total_snaps;
163700f1f36fSAlex Elder 		len = sizeof (*dh) +
163800f1f36fSAlex Elder 			snap_count * sizeof(struct rbd_image_snap_ondisk) +
163900f1f36fSAlex Elder 			header->snap_names_len;
164000f1f36fSAlex Elder 
1641602adf40SYehuda Sadeh 		rbd_header_free(header);
1642602adf40SYehuda Sadeh 		kfree(dh);
1643602adf40SYehuda Sadeh 	}
164459c2be1eSYehuda Sadeh 	header->obj_version = ver;
1645602adf40SYehuda Sadeh 
1646602adf40SYehuda Sadeh out_dh:
1647602adf40SYehuda Sadeh 	kfree(dh);
1648602adf40SYehuda Sadeh 	return rc;
1649602adf40SYehuda Sadeh }
1650602adf40SYehuda Sadeh 
1651602adf40SYehuda Sadeh /*
1652602adf40SYehuda Sadeh  * create a snapshot
1653602adf40SYehuda Sadeh  */
16540ce1a794SAlex Elder static int rbd_header_add_snap(struct rbd_device *rbd_dev,
1655602adf40SYehuda Sadeh 			       const char *snap_name,
1656602adf40SYehuda Sadeh 			       gfp_t gfp_flags)
1657602adf40SYehuda Sadeh {
1658602adf40SYehuda Sadeh 	int name_len = strlen(snap_name);
1659602adf40SYehuda Sadeh 	u64 new_snapid;
1660602adf40SYehuda Sadeh 	int ret;
1661916d4d67SSage Weil 	void *data, *p, *e;
166259c2be1eSYehuda Sadeh 	u64 ver;
16631dbb4399SAlex Elder 	struct ceph_mon_client *monc;
1664602adf40SYehuda Sadeh 
1665602adf40SYehuda Sadeh 	/* we should create a snapshot only if we're pointing at the head */
16660ce1a794SAlex Elder 	if (rbd_dev->snap_id != CEPH_NOSNAP)
1667602adf40SYehuda Sadeh 		return -EINVAL;
1668602adf40SYehuda Sadeh 
16690ce1a794SAlex Elder 	monc = &rbd_dev->rbd_client->client->monc;
16700ce1a794SAlex Elder 	ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid);
1671602adf40SYehuda Sadeh 	dout("created snapid=%lld\n", new_snapid);
1672602adf40SYehuda Sadeh 	if (ret < 0)
1673602adf40SYehuda Sadeh 		return ret;
1674602adf40SYehuda Sadeh 
1675602adf40SYehuda Sadeh 	data = kmalloc(name_len + 16, gfp_flags);
1676602adf40SYehuda Sadeh 	if (!data)
1677602adf40SYehuda Sadeh 		return -ENOMEM;
1678602adf40SYehuda Sadeh 
1679916d4d67SSage Weil 	p = data;
1680916d4d67SSage Weil 	e = data + name_len + 16;
1681602adf40SYehuda Sadeh 
1682916d4d67SSage Weil 	ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
1683916d4d67SSage Weil 	ceph_encode_64_safe(&p, e, new_snapid, bad);
1684602adf40SYehuda Sadeh 
16850bed54dcSAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
16860ce1a794SAlex Elder 				"rbd", "snap_add",
1687916d4d67SSage Weil 				data, p - data, &ver);
1688602adf40SYehuda Sadeh 
1689916d4d67SSage Weil 	kfree(data);
1690602adf40SYehuda Sadeh 
1691602adf40SYehuda Sadeh 	if (ret < 0)
1692602adf40SYehuda Sadeh 		return ret;
1693602adf40SYehuda Sadeh 
16940ce1a794SAlex Elder 	down_write(&rbd_dev->header_rwsem);
16950ce1a794SAlex Elder 	rbd_dev->header.snapc->seq = new_snapid;
16960ce1a794SAlex Elder 	up_write(&rbd_dev->header_rwsem);
1697602adf40SYehuda Sadeh 
1698602adf40SYehuda Sadeh 	return 0;
1699602adf40SYehuda Sadeh bad:
1700602adf40SYehuda Sadeh 	return -ERANGE;
1701602adf40SYehuda Sadeh }
1702602adf40SYehuda Sadeh 
1703dfc5606dSYehuda Sadeh static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1704dfc5606dSYehuda Sadeh {
1705dfc5606dSYehuda Sadeh 	struct rbd_snap *snap;
1706dfc5606dSYehuda Sadeh 
1707dfc5606dSYehuda Sadeh 	while (!list_empty(&rbd_dev->snaps)) {
1708dfc5606dSYehuda Sadeh 		snap = list_first_entry(&rbd_dev->snaps, struct rbd_snap, node);
1709dfc5606dSYehuda Sadeh 		__rbd_remove_snap_dev(rbd_dev, snap);
1710dfc5606dSYehuda Sadeh 	}
1711dfc5606dSYehuda Sadeh }
1712dfc5606dSYehuda Sadeh 
1713602adf40SYehuda Sadeh /*
1714602adf40SYehuda Sadeh  * only read the first part of the ondisk header, without the snaps info
1715602adf40SYehuda Sadeh  */
1716263c6ca0SJosh Durgin static int __rbd_refresh_header(struct rbd_device *rbd_dev)
1717602adf40SYehuda Sadeh {
1718602adf40SYehuda Sadeh 	int ret;
1719602adf40SYehuda Sadeh 	struct rbd_image_header h;
1720602adf40SYehuda Sadeh 	u64 snap_seq;
172159c2be1eSYehuda Sadeh 	int follow_seq = 0;
1722602adf40SYehuda Sadeh 
1723602adf40SYehuda Sadeh 	ret = rbd_read_header(rbd_dev, &h);
1724602adf40SYehuda Sadeh 	if (ret < 0)
1725602adf40SYehuda Sadeh 		return ret;
1726602adf40SYehuda Sadeh 
17279db4b3e3SSage Weil 	/* resized? */
1728593a9e7bSAlex Elder 	set_capacity(rbd_dev->disk, h.image_size / SECTOR_SIZE);
17299db4b3e3SSage Weil 
1730c666601aSJosh Durgin 	down_write(&rbd_dev->header_rwsem);
1731602adf40SYehuda Sadeh 
1732602adf40SYehuda Sadeh 	snap_seq = rbd_dev->header.snapc->seq;
173359c2be1eSYehuda Sadeh 	if (rbd_dev->header.total_snaps &&
173459c2be1eSYehuda Sadeh 	    rbd_dev->header.snapc->snaps[0] == snap_seq)
173559c2be1eSYehuda Sadeh 		/* pointing at the head, will need to follow that
173659c2be1eSYehuda Sadeh 		   if head moves */
173759c2be1eSYehuda Sadeh 		follow_seq = 1;
1738602adf40SYehuda Sadeh 
1739849b4260SAlex Elder 	/* rbd_dev->header.object_prefix shouldn't change */
1740602adf40SYehuda Sadeh 	kfree(rbd_dev->header.snap_sizes);
1741849b4260SAlex Elder 	kfree(rbd_dev->header.snap_names);
1742849b4260SAlex Elder 	kfree(rbd_dev->header.snapc);
1743602adf40SYehuda Sadeh 
1744602adf40SYehuda Sadeh 	rbd_dev->header.total_snaps = h.total_snaps;
1745602adf40SYehuda Sadeh 	rbd_dev->header.snapc = h.snapc;
1746602adf40SYehuda Sadeh 	rbd_dev->header.snap_names = h.snap_names;
1747dfc5606dSYehuda Sadeh 	rbd_dev->header.snap_names_len = h.snap_names_len;
1748602adf40SYehuda Sadeh 	rbd_dev->header.snap_sizes = h.snap_sizes;
1749849b4260SAlex Elder 	/* Free the extra copy of the object prefix */
1750849b4260SAlex Elder 	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1751849b4260SAlex Elder 	kfree(h.object_prefix);
1752849b4260SAlex Elder 
175359c2be1eSYehuda Sadeh 	if (follow_seq)
175459c2be1eSYehuda Sadeh 		rbd_dev->header.snapc->seq = rbd_dev->header.snapc->snaps[0];
175559c2be1eSYehuda Sadeh 	else
1756602adf40SYehuda Sadeh 		rbd_dev->header.snapc->seq = snap_seq;
1757602adf40SYehuda Sadeh 
1758dfc5606dSYehuda Sadeh 	ret = __rbd_init_snaps_header(rbd_dev);
1759dfc5606dSYehuda Sadeh 
1760c666601aSJosh Durgin 	up_write(&rbd_dev->header_rwsem);
1761602adf40SYehuda Sadeh 
1762dfc5606dSYehuda Sadeh 	return ret;
1763602adf40SYehuda Sadeh }
1764602adf40SYehuda Sadeh 
1765602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
1766602adf40SYehuda Sadeh {
1767602adf40SYehuda Sadeh 	struct gendisk *disk;
1768602adf40SYehuda Sadeh 	struct request_queue *q;
1769602adf40SYehuda Sadeh 	int rc;
1770593a9e7bSAlex Elder 	u64 segment_size;
1771602adf40SYehuda Sadeh 	u64 total_size = 0;
1772602adf40SYehuda Sadeh 
1773602adf40SYehuda Sadeh 	/* contact OSD, request size info about the object being mapped */
1774602adf40SYehuda Sadeh 	rc = rbd_read_header(rbd_dev, &rbd_dev->header);
1775602adf40SYehuda Sadeh 	if (rc)
1776602adf40SYehuda Sadeh 		return rc;
1777602adf40SYehuda Sadeh 
1778dfc5606dSYehuda Sadeh 	/* no need to lock here, as rbd_dev is not registered yet */
1779dfc5606dSYehuda Sadeh 	rc = __rbd_init_snaps_header(rbd_dev);
1780dfc5606dSYehuda Sadeh 	if (rc)
1781dfc5606dSYehuda Sadeh 		return rc;
1782dfc5606dSYehuda Sadeh 
1783cc9d734cSJosh Durgin 	rc = rbd_header_set_snap(rbd_dev, &total_size);
1784602adf40SYehuda Sadeh 	if (rc)
1785602adf40SYehuda Sadeh 		return rc;
1786602adf40SYehuda Sadeh 
1787602adf40SYehuda Sadeh 	/* create gendisk info */
1788602adf40SYehuda Sadeh 	rc = -ENOMEM;
1789602adf40SYehuda Sadeh 	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1790602adf40SYehuda Sadeh 	if (!disk)
1791602adf40SYehuda Sadeh 		goto out;
1792602adf40SYehuda Sadeh 
1793f0f8cef5SAlex Elder 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
1794aedfec59SSage Weil 		 rbd_dev->id);
1795602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
1796602adf40SYehuda Sadeh 	disk->first_minor = 0;
1797602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
1798602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
1799602adf40SYehuda Sadeh 
1800602adf40SYehuda Sadeh 	/* init rq */
1801602adf40SYehuda Sadeh 	rc = -ENOMEM;
1802602adf40SYehuda Sadeh 	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1803602adf40SYehuda Sadeh 	if (!q)
1804602adf40SYehuda Sadeh 		goto out_disk;
1805029bcbd8SJosh Durgin 
1806593a9e7bSAlex Elder 	/* We use the default size, but let's be explicit about it. */
1807593a9e7bSAlex Elder 	blk_queue_physical_block_size(q, SECTOR_SIZE);
1808593a9e7bSAlex Elder 
1809029bcbd8SJosh Durgin 	/* set io sizes to object size */
1810593a9e7bSAlex Elder 	segment_size = rbd_obj_bytes(&rbd_dev->header);
1811593a9e7bSAlex Elder 	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
1812593a9e7bSAlex Elder 	blk_queue_max_segment_size(q, segment_size);
1813593a9e7bSAlex Elder 	blk_queue_io_min(q, segment_size);
1814593a9e7bSAlex Elder 	blk_queue_io_opt(q, segment_size);
1815029bcbd8SJosh Durgin 
1816602adf40SYehuda Sadeh 	blk_queue_merge_bvec(q, rbd_merge_bvec);
1817602adf40SYehuda Sadeh 	disk->queue = q;
1818602adf40SYehuda Sadeh 
1819602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
1820602adf40SYehuda Sadeh 
1821602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
1822602adf40SYehuda Sadeh 	rbd_dev->q = q;
1823602adf40SYehuda Sadeh 
1824602adf40SYehuda Sadeh 	/* finally, announce the disk to the world */
1825593a9e7bSAlex Elder 	set_capacity(disk, total_size / SECTOR_SIZE);
1826602adf40SYehuda Sadeh 	add_disk(disk);
1827602adf40SYehuda Sadeh 
1828602adf40SYehuda Sadeh 	pr_info("%s: added with size 0x%llx\n",
1829602adf40SYehuda Sadeh 		disk->disk_name, (unsigned long long)total_size);
1830602adf40SYehuda Sadeh 	return 0;
1831602adf40SYehuda Sadeh 
1832602adf40SYehuda Sadeh out_disk:
1833602adf40SYehuda Sadeh 	put_disk(disk);
1834602adf40SYehuda Sadeh out:
1835602adf40SYehuda Sadeh 	return rc;
1836602adf40SYehuda Sadeh }
1837602adf40SYehuda Sadeh 
1838dfc5606dSYehuda Sadeh /*
1839dfc5606dSYehuda Sadeh   sysfs
1840dfc5606dSYehuda Sadeh */
1841602adf40SYehuda Sadeh 
1842593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
1843593a9e7bSAlex Elder {
1844593a9e7bSAlex Elder 	return container_of(dev, struct rbd_device, dev);
1845593a9e7bSAlex Elder }
1846593a9e7bSAlex Elder 
1847dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
1848dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
1849602adf40SYehuda Sadeh {
1850593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1851dfc5606dSYehuda Sadeh 
1852dfc5606dSYehuda Sadeh 	return sprintf(buf, "%llu\n", (unsigned long long)rbd_dev->header.image_size);
1853602adf40SYehuda Sadeh }
1854602adf40SYehuda Sadeh 
1855dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
1856dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
1857602adf40SYehuda Sadeh {
1858593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1859dfc5606dSYehuda Sadeh 
1860dfc5606dSYehuda Sadeh 	return sprintf(buf, "%d\n", rbd_dev->major);
1861dfc5606dSYehuda Sadeh }
1862dfc5606dSYehuda Sadeh 
1863dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
1864dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
1865dfc5606dSYehuda Sadeh {
1866593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1867dfc5606dSYehuda Sadeh 
18681dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
18691dbb4399SAlex Elder 			ceph_client_id(rbd_dev->rbd_client->client));
1870dfc5606dSYehuda Sadeh }
1871dfc5606dSYehuda Sadeh 
1872dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
1873dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
1874dfc5606dSYehuda Sadeh {
1875593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1876dfc5606dSYehuda Sadeh 
1877dfc5606dSYehuda Sadeh 	return sprintf(buf, "%s\n", rbd_dev->pool_name);
1878dfc5606dSYehuda Sadeh }
1879dfc5606dSYehuda Sadeh 
18809bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
18819bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
18829bb2f334SAlex Elder {
18839bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
18849bb2f334SAlex Elder 
18859bb2f334SAlex Elder 	return sprintf(buf, "%d\n", rbd_dev->pool_id);
18869bb2f334SAlex Elder }
18879bb2f334SAlex Elder 
1888dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
1889dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
1890dfc5606dSYehuda Sadeh {
1891593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1892dfc5606dSYehuda Sadeh 
18930bed54dcSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->image_name);
1894dfc5606dSYehuda Sadeh }
1895dfc5606dSYehuda Sadeh 
1896dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
1897dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
1898dfc5606dSYehuda Sadeh 			     char *buf)
1899dfc5606dSYehuda Sadeh {
1900593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1901dfc5606dSYehuda Sadeh 
1902dfc5606dSYehuda Sadeh 	return sprintf(buf, "%s\n", rbd_dev->snap_name);
1903dfc5606dSYehuda Sadeh }
1904dfc5606dSYehuda Sadeh 
1905dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
1906dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
1907dfc5606dSYehuda Sadeh 				 const char *buf,
1908dfc5606dSYehuda Sadeh 				 size_t size)
1909dfc5606dSYehuda Sadeh {
1910593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1911dfc5606dSYehuda Sadeh 	int rc;
1912dfc5606dSYehuda Sadeh 	int ret = size;
1913602adf40SYehuda Sadeh 
1914602adf40SYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1915602adf40SYehuda Sadeh 
1916263c6ca0SJosh Durgin 	rc = __rbd_refresh_header(rbd_dev);
1917dfc5606dSYehuda Sadeh 	if (rc < 0)
1918dfc5606dSYehuda Sadeh 		ret = rc;
1919602adf40SYehuda Sadeh 
1920dfc5606dSYehuda Sadeh 	mutex_unlock(&ctl_mutex);
1921dfc5606dSYehuda Sadeh 	return ret;
1922dfc5606dSYehuda Sadeh }
1923602adf40SYehuda Sadeh 
/* Read-only attributes of a mapped image. */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);

/* Write-only control attributes (owner only). */
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);
1933dfc5606dSYehuda Sadeh 
1934dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = {
1935dfc5606dSYehuda Sadeh 	&dev_attr_size.attr,
1936dfc5606dSYehuda Sadeh 	&dev_attr_major.attr,
1937dfc5606dSYehuda Sadeh 	&dev_attr_client_id.attr,
1938dfc5606dSYehuda Sadeh 	&dev_attr_pool.attr,
19399bb2f334SAlex Elder 	&dev_attr_pool_id.attr,
1940dfc5606dSYehuda Sadeh 	&dev_attr_name.attr,
1941dfc5606dSYehuda Sadeh 	&dev_attr_current_snap.attr,
1942dfc5606dSYehuda Sadeh 	&dev_attr_refresh.attr,
1943dfc5606dSYehuda Sadeh 	&dev_attr_create_snap.attr,
1944dfc5606dSYehuda Sadeh 	NULL
1945dfc5606dSYehuda Sadeh };
1946dfc5606dSYehuda Sadeh 
1947dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = {
1948dfc5606dSYehuda Sadeh 	.attrs = rbd_attrs,
1949dfc5606dSYehuda Sadeh };
1950dfc5606dSYehuda Sadeh 
1951dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = {
1952dfc5606dSYehuda Sadeh 	&rbd_attr_group,
1953dfc5606dSYehuda Sadeh 	NULL
1954dfc5606dSYehuda Sadeh };
1955dfc5606dSYehuda Sadeh 
/*
 * Release callback for the image's struct device.  It does nothing here.
 * NOTE(review): presumably the rbd_device is torn down elsewhere; confirm
 * against the code that unregisters the device.
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
}
1959dfc5606dSYehuda Sadeh 
1960dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = {
1961dfc5606dSYehuda Sadeh 	.name		= "rbd",
1962dfc5606dSYehuda Sadeh 	.groups		= rbd_attr_groups,
1963dfc5606dSYehuda Sadeh 	.release	= rbd_sysfs_dev_release,
1964dfc5606dSYehuda Sadeh };
1965dfc5606dSYehuda Sadeh 
1966dfc5606dSYehuda Sadeh 
1967dfc5606dSYehuda Sadeh /*
1968dfc5606dSYehuda Sadeh   sysfs - snapshots
1969dfc5606dSYehuda Sadeh */
1970dfc5606dSYehuda Sadeh 
1971dfc5606dSYehuda Sadeh static ssize_t rbd_snap_size_show(struct device *dev,
1972dfc5606dSYehuda Sadeh 				  struct device_attribute *attr,
1973dfc5606dSYehuda Sadeh 				  char *buf)
1974dfc5606dSYehuda Sadeh {
1975dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1976dfc5606dSYehuda Sadeh 
19773591538fSJosh Durgin 	return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
1978dfc5606dSYehuda Sadeh }
1979dfc5606dSYehuda Sadeh 
1980dfc5606dSYehuda Sadeh static ssize_t rbd_snap_id_show(struct device *dev,
1981dfc5606dSYehuda Sadeh 				struct device_attribute *attr,
1982dfc5606dSYehuda Sadeh 				char *buf)
1983dfc5606dSYehuda Sadeh {
1984dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1985dfc5606dSYehuda Sadeh 
1986593a9e7bSAlex Elder 	return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
1987dfc5606dSYehuda Sadeh }
1988dfc5606dSYehuda Sadeh 
1989dfc5606dSYehuda Sadeh static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
1990dfc5606dSYehuda Sadeh static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
1991dfc5606dSYehuda Sadeh 
1992dfc5606dSYehuda Sadeh static struct attribute *rbd_snap_attrs[] = {
1993dfc5606dSYehuda Sadeh 	&dev_attr_snap_size.attr,
1994dfc5606dSYehuda Sadeh 	&dev_attr_snap_id.attr,
1995dfc5606dSYehuda Sadeh 	NULL,
1996dfc5606dSYehuda Sadeh };
1997dfc5606dSYehuda Sadeh 
1998dfc5606dSYehuda Sadeh static struct attribute_group rbd_snap_attr_group = {
1999dfc5606dSYehuda Sadeh 	.attrs = rbd_snap_attrs,
2000dfc5606dSYehuda Sadeh };
2001dfc5606dSYehuda Sadeh 
2002dfc5606dSYehuda Sadeh static void rbd_snap_dev_release(struct device *dev)
2003dfc5606dSYehuda Sadeh {
2004dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2005dfc5606dSYehuda Sadeh 	kfree(snap->name);
2006dfc5606dSYehuda Sadeh 	kfree(snap);
2007dfc5606dSYehuda Sadeh }
2008dfc5606dSYehuda Sadeh 
2009dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_snap_attr_groups[] = {
2010dfc5606dSYehuda Sadeh 	&rbd_snap_attr_group,
2011dfc5606dSYehuda Sadeh 	NULL
2012dfc5606dSYehuda Sadeh };
2013dfc5606dSYehuda Sadeh 
2014dfc5606dSYehuda Sadeh static struct device_type rbd_snap_device_type = {
2015dfc5606dSYehuda Sadeh 	.groups		= rbd_snap_attr_groups,
2016dfc5606dSYehuda Sadeh 	.release	= rbd_snap_dev_release,
2017dfc5606dSYehuda Sadeh };
2018dfc5606dSYehuda Sadeh 
2019dfc5606dSYehuda Sadeh static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
2020dfc5606dSYehuda Sadeh 				  struct rbd_snap *snap)
2021dfc5606dSYehuda Sadeh {
2022dfc5606dSYehuda Sadeh 	list_del(&snap->node);
2023dfc5606dSYehuda Sadeh 	device_unregister(&snap->dev);
2024dfc5606dSYehuda Sadeh }
2025dfc5606dSYehuda Sadeh 
2026dfc5606dSYehuda Sadeh static int rbd_register_snap_dev(struct rbd_device *rbd_dev,
2027dfc5606dSYehuda Sadeh 				  struct rbd_snap *snap,
2028dfc5606dSYehuda Sadeh 				  struct device *parent)
2029dfc5606dSYehuda Sadeh {
2030dfc5606dSYehuda Sadeh 	struct device *dev = &snap->dev;
2031dfc5606dSYehuda Sadeh 	int ret;
2032dfc5606dSYehuda Sadeh 
2033dfc5606dSYehuda Sadeh 	dev->type = &rbd_snap_device_type;
2034dfc5606dSYehuda Sadeh 	dev->parent = parent;
2035dfc5606dSYehuda Sadeh 	dev->release = rbd_snap_dev_release;
2036dfc5606dSYehuda Sadeh 	dev_set_name(dev, "snap_%s", snap->name);
2037dfc5606dSYehuda Sadeh 	ret = device_register(dev);
2038dfc5606dSYehuda Sadeh 
2039dfc5606dSYehuda Sadeh 	return ret;
2040dfc5606dSYehuda Sadeh }
2041dfc5606dSYehuda Sadeh 
2042dfc5606dSYehuda Sadeh static int __rbd_add_snap_dev(struct rbd_device *rbd_dev,
2043dfc5606dSYehuda Sadeh 			      int i, const char *name,
2044dfc5606dSYehuda Sadeh 			      struct rbd_snap **snapp)
2045dfc5606dSYehuda Sadeh {
2046dfc5606dSYehuda Sadeh 	int ret;
2047dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = kzalloc(sizeof(*snap), GFP_KERNEL);
2048dfc5606dSYehuda Sadeh 	if (!snap)
2049dfc5606dSYehuda Sadeh 		return -ENOMEM;
2050dfc5606dSYehuda Sadeh 	snap->name = kstrdup(name, GFP_KERNEL);
2051dfc5606dSYehuda Sadeh 	snap->size = rbd_dev->header.snap_sizes[i];
2052dfc5606dSYehuda Sadeh 	snap->id = rbd_dev->header.snapc->snaps[i];
2053dfc5606dSYehuda Sadeh 	if (device_is_registered(&rbd_dev->dev)) {
2054dfc5606dSYehuda Sadeh 		ret = rbd_register_snap_dev(rbd_dev, snap,
2055dfc5606dSYehuda Sadeh 					     &rbd_dev->dev);
2056dfc5606dSYehuda Sadeh 		if (ret < 0)
2057dfc5606dSYehuda Sadeh 			goto err;
2058dfc5606dSYehuda Sadeh 	}
2059dfc5606dSYehuda Sadeh 	*snapp = snap;
2060dfc5606dSYehuda Sadeh 	return 0;
2061dfc5606dSYehuda Sadeh err:
2062dfc5606dSYehuda Sadeh 	kfree(snap->name);
2063dfc5606dSYehuda Sadeh 	kfree(snap);
2064dfc5606dSYehuda Sadeh 	return ret;
2065dfc5606dSYehuda Sadeh }
2066dfc5606dSYehuda Sadeh 
/*
 * Step backward by one entry in a list of '\0'-terminated strings
 * stored back to back.
 *
 * @name points at the start of an entry (or one past the end of the
 * list); @start is the beginning of the list.  Returns a pointer to
 * the start of the entry immediately before @name, or NULL when @name
 * is fewer than two bytes past @start.
 */
const char *rbd_prev_snap_name(const char *name, const char *start)
{
	const char *cursor;

	if (name < start + 2)
		return NULL;

	/*
	 * name[-1] is the previous entry's terminator, so begin at
	 * name[-2] and scan back to the terminator before that one.
	 */
	for (cursor = name - 2; *cursor != '\0'; cursor--) {
		if (cursor == start)
			return start;
	}

	return cursor + 1;
}
2083dfc5606dSYehuda Sadeh 
2084dfc5606dSYehuda Sadeh /*
 * Compare the list of snapshots this device already knows about
 * (rbd_dev->snaps) with the snapshots described by rbd_dev->header and
 * update the list to match: entries judged to have disappeared are
 * removed via __rbd_remove_snap_dev(), and entries present only in the
 * header are created via __rbd_add_snap_dev().
 *
 * The header holds the snapshots in reverse order (newest to oldest),
 * so both the list and the header arrays are walked backward, going
 * from older to newer.  That way we don't get a duplicate snap name
 * while processing (e.g., a snapshot was removed and a new one was
 * created with the same name).
 *
 * Returns 0 on success, -EINVAL (after a WARN_ON) if a snapshot name
 * cannot be located in header.snap_names, or the negative error
 * returned by __rbd_add_snap_dev().
 */
2092dfc5606dSYehuda Sadeh static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
2093dfc5606dSYehuda Sadeh {
2094dfc5606dSYehuda Sadeh 	const char *name, *first_name;
	/* Number of header snapshots not yet consumed; snaps[i - 1] is next */
2095dfc5606dSYehuda Sadeh 	int i = rbd_dev->header.total_snaps;
2096dfc5606dSYehuda Sadeh 	struct rbd_snap *snap, *old_snap = NULL;
2097dfc5606dSYehuda Sadeh 	int ret;
2098dfc5606dSYehuda Sadeh 	struct list_head *p, *n;
2099dfc5606dSYehuda Sadeh 
	/*
	 * name starts snap_names_len bytes past the start of the name
	 * buffer; rbd_prev_snap_name() moves it back one name per call.
	 */
2100dfc5606dSYehuda Sadeh 	first_name = rbd_dev->header.snap_names;
2101dfc5606dSYehuda Sadeh 	name = first_name + rbd_dev->header.snap_names_len;
2102dfc5606dSYehuda Sadeh 
	/* Walk the existing snapshot list from its tail toward its head */
2103dfc5606dSYehuda Sadeh 	list_for_each_prev_safe(p, n, &rbd_dev->snaps) {
2104dfc5606dSYehuda Sadeh 		u64 cur_id;
2105dfc5606dSYehuda Sadeh 
2106dfc5606dSYehuda Sadeh 		old_snap = list_entry(p, struct rbd_snap, node);
2107dfc5606dSYehuda Sadeh 
		/*
		 * cur_id stays unset when i == 0; the "!i ||" test below
		 * short-circuits before it would be read.
		 */
2108dfc5606dSYehuda Sadeh 		if (i)
2109dfc5606dSYehuda Sadeh 			cur_id = rbd_dev->header.snapc->snaps[i - 1];
2110dfc5606dSYehuda Sadeh 
2111dfc5606dSYehuda Sadeh 		if (!i || old_snap->id < cur_id) {
2112e88a36ecSJosh Durgin 			/*
2113e88a36ecSJosh Durgin 			 * old_snap->id was skipped, thus was
2114e88a36ecSJosh Durgin 			 * removed.  If this rbd_dev is mapped to
2115e88a36ecSJosh Durgin 			 * the removed snapshot, record that it no
2116e88a36ecSJosh Durgin 			 * longer exists, to prevent further I/O.
2117e88a36ecSJosh Durgin 			 */
2118e88a36ecSJosh Durgin 			if (rbd_dev->snap_id == old_snap->id)
2119e88a36ecSJosh Durgin 				rbd_dev->snap_exists = false;
2120dfc5606dSYehuda Sadeh 			__rbd_remove_snap_dev(rbd_dev, old_snap);
2121dfc5606dSYehuda Sadeh 			continue;
2122dfc5606dSYehuda Sadeh 		}
2123dfc5606dSYehuda Sadeh 		if (old_snap->id == cur_id) {
2124dfc5606dSYehuda Sadeh 			/* we have this snapshot already */
2125dfc5606dSYehuda Sadeh 			i--;
2126dfc5606dSYehuda Sadeh 			name = rbd_prev_snap_name(name, first_name);
2127dfc5606dSYehuda Sadeh 			continue;
2128dfc5606dSYehuda Sadeh 		}
		/*
		 * old_snap->id > cur_id here: the header has entries that
		 * are not in the list before old_snap, so create them.
		 */
2129dfc5606dSYehuda Sadeh 		for (; i > 0;
2130dfc5606dSYehuda Sadeh 		     i--, name = rbd_prev_snap_name(name, first_name)) {
2131dfc5606dSYehuda Sadeh 			if (!name) {
2132dfc5606dSYehuda Sadeh 				WARN_ON(1);
2133dfc5606dSYehuda Sadeh 				return -EINVAL;
2134dfc5606dSYehuda Sadeh 			}
			/*
			 * NOTE(review): this reads snaps[i] while every
			 * other access in this function uses snaps[i - 1],
			 * and i can still equal total_snaps here; looks
			 * like an off-by-one (possibly out-of-bounds) read.
			 * Confirm against the snapc allocation.
			 */
2135dfc5606dSYehuda Sadeh 			cur_id = rbd_dev->header.snapc->snaps[i];
2136dfc5606dSYehuda Sadeh 			/* snapshot removal? handle it above */
2137dfc5606dSYehuda Sadeh 			if (cur_id >= old_snap->id)
2138dfc5606dSYehuda Sadeh 				break;
2139dfc5606dSYehuda Sadeh 			/* a new snapshot */
			/*
			 * NOTE(review): unlike the tail loop below, name is
			 * not stepped back before this call on the first
			 * iteration, so it may not be the name belonging to
			 * index i - 1.  Confirm.
			 */
2140dfc5606dSYehuda Sadeh 			ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
2141dfc5606dSYehuda Sadeh 			if (ret < 0)
2142dfc5606dSYehuda Sadeh 				return ret;
2143dfc5606dSYehuda Sadeh 
2144dfc5606dSYehuda Sadeh 			/* note that we add it backward so using n and not p */
2145dfc5606dSYehuda Sadeh 			list_add(&snap->node, n);
2146dfc5606dSYehuda Sadeh 			p = &snap->node;
2147dfc5606dSYehuda Sadeh 		}
2148dfc5606dSYehuda Sadeh 	}
2149dfc5606dSYehuda Sadeh 	/* we're done going over the old snap list, just add what's left */
2150dfc5606dSYehuda Sadeh 	for (; i > 0; i--) {
2151dfc5606dSYehuda Sadeh 		name = rbd_prev_snap_name(name, first_name);
2152dfc5606dSYehuda Sadeh 		if (!name) {
2153dfc5606dSYehuda Sadeh 			WARN_ON(1);
2154dfc5606dSYehuda Sadeh 			return -EINVAL;
2155dfc5606dSYehuda Sadeh 		}
2156dfc5606dSYehuda Sadeh 		ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
2157dfc5606dSYehuda Sadeh 		if (ret < 0)
2158dfc5606dSYehuda Sadeh 			return ret;
		/* Inserted at the head, so the list keeps header order */
2159dfc5606dSYehuda Sadeh 		list_add(&snap->node, &rbd_dev->snaps);
2160dfc5606dSYehuda Sadeh 	}
2161dfc5606dSYehuda Sadeh 
2162dfc5606dSYehuda Sadeh 	return 0;
2163dfc5606dSYehuda Sadeh }
2164dfc5606dSYehuda Sadeh 
2165dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2166dfc5606dSYehuda Sadeh {
2167f0f8cef5SAlex Elder 	int ret;
2168dfc5606dSYehuda Sadeh 	struct device *dev;
2169dfc5606dSYehuda Sadeh 	struct rbd_snap *snap;
2170dfc5606dSYehuda Sadeh 
2171dfc5606dSYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2172dfc5606dSYehuda Sadeh 	dev = &rbd_dev->dev;
2173dfc5606dSYehuda Sadeh 
2174dfc5606dSYehuda Sadeh 	dev->bus = &rbd_bus_type;
2175dfc5606dSYehuda Sadeh 	dev->type = &rbd_device_type;
2176dfc5606dSYehuda Sadeh 	dev->parent = &rbd_root_dev;
2177dfc5606dSYehuda Sadeh 	dev->release = rbd_dev_release;
2178dfc5606dSYehuda Sadeh 	dev_set_name(dev, "%d", rbd_dev->id);
2179dfc5606dSYehuda Sadeh 	ret = device_register(dev);
2180dfc5606dSYehuda Sadeh 	if (ret < 0)
2181f0f8cef5SAlex Elder 		goto out;
2182dfc5606dSYehuda Sadeh 
2183dfc5606dSYehuda Sadeh 	list_for_each_entry(snap, &rbd_dev->snaps, node) {
2184dfc5606dSYehuda Sadeh 		ret = rbd_register_snap_dev(rbd_dev, snap,
2185dfc5606dSYehuda Sadeh 					     &rbd_dev->dev);
2186dfc5606dSYehuda Sadeh 		if (ret < 0)
2187602adf40SYehuda Sadeh 			break;
2188602adf40SYehuda Sadeh 	}
2189f0f8cef5SAlex Elder out:
2190dfc5606dSYehuda Sadeh 	mutex_unlock(&ctl_mutex);
2191dfc5606dSYehuda Sadeh 	return ret;
2192602adf40SYehuda Sadeh }
2193602adf40SYehuda Sadeh 
2194dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
2195dfc5606dSYehuda Sadeh {
2196dfc5606dSYehuda Sadeh 	device_unregister(&rbd_dev->dev);
2197dfc5606dSYehuda Sadeh }
2198dfc5606dSYehuda Sadeh 
219959c2be1eSYehuda Sadeh static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
220059c2be1eSYehuda Sadeh {
220159c2be1eSYehuda Sadeh 	int ret, rc;
220259c2be1eSYehuda Sadeh 
220359c2be1eSYehuda Sadeh 	do {
22040bed54dcSAlex Elder 		ret = rbd_req_sync_watch(rbd_dev, rbd_dev->header_name,
220559c2be1eSYehuda Sadeh 					 rbd_dev->header.obj_version);
220659c2be1eSYehuda Sadeh 		if (ret == -ERANGE) {
220759c2be1eSYehuda Sadeh 			mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2208263c6ca0SJosh Durgin 			rc = __rbd_refresh_header(rbd_dev);
220959c2be1eSYehuda Sadeh 			mutex_unlock(&ctl_mutex);
221059c2be1eSYehuda Sadeh 			if (rc < 0)
221159c2be1eSYehuda Sadeh 				return rc;
221259c2be1eSYehuda Sadeh 		}
221359c2be1eSYehuda Sadeh 	} while (ret == -ERANGE);
221459c2be1eSYehuda Sadeh 
221559c2be1eSYehuda Sadeh 	return ret;
221659c2be1eSYehuda Sadeh }
221759c2be1eSYehuda Sadeh 
22181ddbe94eSAlex Elder static atomic64_t rbd_id_max = ATOMIC64_INIT(0);
22191ddbe94eSAlex Elder 
22201ddbe94eSAlex Elder /*
2221499afd5bSAlex Elder  * Get a unique rbd identifier for the given new rbd_dev, and add
2222499afd5bSAlex Elder  * the rbd_dev to the global list.  The minimum rbd id is 1.
22231ddbe94eSAlex Elder  */
2224499afd5bSAlex Elder static void rbd_id_get(struct rbd_device *rbd_dev)
2225b7f23c36SAlex Elder {
2226499afd5bSAlex Elder 	rbd_dev->id = atomic64_inc_return(&rbd_id_max);
2227499afd5bSAlex Elder 
2228499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
2229499afd5bSAlex Elder 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
2230499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
2231b7f23c36SAlex Elder }
2232b7f23c36SAlex Elder 
22331ddbe94eSAlex Elder /*
2234499afd5bSAlex Elder  * Remove an rbd_dev from the global list, and record that its
2235499afd5bSAlex Elder  * identifier is no longer in use.
22361ddbe94eSAlex Elder  */
2237499afd5bSAlex Elder static void rbd_id_put(struct rbd_device *rbd_dev)
22381ddbe94eSAlex Elder {
2239d184f6bfSAlex Elder 	struct list_head *tmp;
2240d184f6bfSAlex Elder 	int rbd_id = rbd_dev->id;
2241d184f6bfSAlex Elder 	int max_id;
2242d184f6bfSAlex Elder 
2243d184f6bfSAlex Elder 	BUG_ON(rbd_id < 1);
2244499afd5bSAlex Elder 
2245499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
2246499afd5bSAlex Elder 	list_del_init(&rbd_dev->node);
2247d184f6bfSAlex Elder 
2248d184f6bfSAlex Elder 	/*
2249d184f6bfSAlex Elder 	 * If the id being "put" is not the current maximum, there
2250d184f6bfSAlex Elder 	 * is nothing special we need to do.
2251d184f6bfSAlex Elder 	 */
2252d184f6bfSAlex Elder 	if (rbd_id != atomic64_read(&rbd_id_max)) {
2253d184f6bfSAlex Elder 		spin_unlock(&rbd_dev_list_lock);
2254d184f6bfSAlex Elder 		return;
2255d184f6bfSAlex Elder 	}
2256d184f6bfSAlex Elder 
2257d184f6bfSAlex Elder 	/*
2258d184f6bfSAlex Elder 	 * We need to update the current maximum id.  Search the
2259d184f6bfSAlex Elder 	 * list to find out what it is.  We're more likely to find
2260d184f6bfSAlex Elder 	 * the maximum at the end, so search the list backward.
2261d184f6bfSAlex Elder 	 */
2262d184f6bfSAlex Elder 	max_id = 0;
2263d184f6bfSAlex Elder 	list_for_each_prev(tmp, &rbd_dev_list) {
2264d184f6bfSAlex Elder 		struct rbd_device *rbd_dev;
2265d184f6bfSAlex Elder 
2266d184f6bfSAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
2267d184f6bfSAlex Elder 		if (rbd_id > max_id)
2268d184f6bfSAlex Elder 			max_id = rbd_id;
2269d184f6bfSAlex Elder 	}
2270499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
22711ddbe94eSAlex Elder 
22721ddbe94eSAlex Elder 	/*
2273d184f6bfSAlex Elder 	 * The max id could have been updated by rbd_id_get(), in
2274d184f6bfSAlex Elder 	 * which case it now accurately reflects the new maximum.
2275d184f6bfSAlex Elder 	 * Be careful not to overwrite the maximum value in that
2276d184f6bfSAlex Elder 	 * case.
22771ddbe94eSAlex Elder 	 */
2278d184f6bfSAlex Elder 	atomic64_cmpxchg(&rbd_id_max, rbd_id, max_id);
2279b7f23c36SAlex Elder }
2280b7f23c36SAlex Elder 
/*
 * Advance *buf past any leading white space so that it points at the
 * first non-space character (or at the terminating '\0' when there is
 * none), and return the length of the token that starts there, i.e.
 * the run of non-white-space characters.  *buf must be terminated
 * with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the "C"
	 * and "POSIX" locales.
	 */
	static const char whitespace[] = " \f\n\r\t\v";
	const char *token = *buf;

	token += strspn(token, whitespace);	/* skip to token start */
	*buf = token;

	return strcspn(token, whitespace);	/* token length */
}
2299e28fff26SAlex Elder 
/*
 * Locate the next token in *buf and, when it fits, copy it into the
 * caller's @token buffer of @token_size bytes.  A copied token is
 * always terminated with '\0'.  *buf must be terminated with '\0' on
 * entry.
 *
 * Returns the token's length, excluding the '\0'.  A return of 0
 * means no token was found; a return >= @token_size means the token
 * did not fit and @token was left untouched.
 *
 * In every case *buf is advanced to just past the token found, even
 * when the token was too large to copy.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);
	const char *start = *buf;

	*buf = start + len;
	if (len >= token_size)
		return len;

	memcpy(token, start, len);
	token[len] = '\0';

	return len;
}
2329e28fff26SAlex Elder 
2330e28fff26SAlex Elder /*
2331ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
2332ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
2333ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
2334ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
2335ea3352f4SAlex Elder  *
2336ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
2337ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
2338ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
2339ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
2340ea3352f4SAlex Elder  *
2341ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
2342ea3352f4SAlex Elder  * the end of the found token.
2343ea3352f4SAlex Elder  *
2344ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
2345ea3352f4SAlex Elder  */
2346ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
2347ea3352f4SAlex Elder {
2348ea3352f4SAlex Elder 	char *dup;
2349ea3352f4SAlex Elder 	size_t len;
2350ea3352f4SAlex Elder 
2351ea3352f4SAlex Elder 	len = next_token(buf);
2352ea3352f4SAlex Elder 	dup = kmalloc(len + 1, GFP_KERNEL);
2353ea3352f4SAlex Elder 	if (!dup)
2354ea3352f4SAlex Elder 		return NULL;
2355ea3352f4SAlex Elder 
2356ea3352f4SAlex Elder 	memcpy(dup, *buf, len);
2357ea3352f4SAlex Elder 	*(dup + len) = '\0';
2358ea3352f4SAlex Elder 	*buf += len;
2359ea3352f4SAlex Elder 
2360ea3352f4SAlex Elder 	if (lenp)
2361ea3352f4SAlex Elder 		*lenp = len;
2362ea3352f4SAlex Elder 
2363ea3352f4SAlex Elder 	return dup;
2364ea3352f4SAlex Elder }
2365ea3352f4SAlex Elder 
2366ea3352f4SAlex Elder /*
23670bed54dcSAlex Elder  * This fills in the pool_name, image_name, image_name_len, snap_name,
2368a725f65eSAlex Elder  * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based
2369a725f65eSAlex Elder  * on the list of monitor addresses and other options provided via
2370a725f65eSAlex Elder  * /sys/bus/rbd/add.
2371d22f76e7SAlex Elder  *
2372d22f76e7SAlex Elder  * Note: rbd_dev is assumed to have been initially zero-filled.
2373a725f65eSAlex Elder  */
2374a725f65eSAlex Elder static int rbd_add_parse_args(struct rbd_device *rbd_dev,
2375a725f65eSAlex Elder 			      const char *buf,
23767ef3214aSAlex Elder 			      const char **mon_addrs,
23775214ecc4SAlex Elder 			      size_t *mon_addrs_size,
2378e28fff26SAlex Elder 			      char *options,
2379e28fff26SAlex Elder 			     size_t options_size)
2380a725f65eSAlex Elder {
2381e28fff26SAlex Elder 	size_t len;
2382d22f76e7SAlex Elder 	int ret;
2383e28fff26SAlex Elder 
2384e28fff26SAlex Elder 	/* The first four tokens are required */
2385e28fff26SAlex Elder 
23867ef3214aSAlex Elder 	len = next_token(&buf);
23877ef3214aSAlex Elder 	if (!len)
2388a725f65eSAlex Elder 		return -EINVAL;
23895214ecc4SAlex Elder 	*mon_addrs_size = len + 1;
23907ef3214aSAlex Elder 	*mon_addrs = buf;
23917ef3214aSAlex Elder 
23927ef3214aSAlex Elder 	buf += len;
2393a725f65eSAlex Elder 
2394e28fff26SAlex Elder 	len = copy_token(&buf, options, options_size);
2395e28fff26SAlex Elder 	if (!len || len >= options_size)
2396e28fff26SAlex Elder 		return -EINVAL;
2397a725f65eSAlex Elder 
2398bf3e5ae1SAlex Elder 	ret = -ENOMEM;
2399d22f76e7SAlex Elder 	rbd_dev->pool_name = dup_token(&buf, NULL);
2400d22f76e7SAlex Elder 	if (!rbd_dev->pool_name)
2401d22f76e7SAlex Elder 		goto out_err;
2402e28fff26SAlex Elder 
24030bed54dcSAlex Elder 	rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
24040bed54dcSAlex Elder 	if (!rbd_dev->image_name)
2405bf3e5ae1SAlex Elder 		goto out_err;
2406e28fff26SAlex Elder 
2407cb8627c7SAlex Elder 	/* Create the name of the header object */
2408cb8627c7SAlex Elder 
24090bed54dcSAlex Elder 	rbd_dev->header_name = kmalloc(rbd_dev->image_name_len
2410bf3e5ae1SAlex Elder 						+ sizeof (RBD_SUFFIX),
2411bf3e5ae1SAlex Elder 					GFP_KERNEL);
24120bed54dcSAlex Elder 	if (!rbd_dev->header_name)
2413cb8627c7SAlex Elder 		goto out_err;
24140bed54dcSAlex Elder 	sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);
2415a725f65eSAlex Elder 
2416e28fff26SAlex Elder 	/*
2417820a5f3eSAlex Elder 	 * The snapshot name is optional.  If none is is supplied,
2418820a5f3eSAlex Elder 	 * we use the default value.
2419e28fff26SAlex Elder 	 */
2420820a5f3eSAlex Elder 	rbd_dev->snap_name = dup_token(&buf, &len);
2421820a5f3eSAlex Elder 	if (!rbd_dev->snap_name)
2422820a5f3eSAlex Elder 		goto out_err;
2423820a5f3eSAlex Elder 	if (!len) {
2424820a5f3eSAlex Elder 		/* Replace the empty name with the default */
2425820a5f3eSAlex Elder 		kfree(rbd_dev->snap_name);
2426820a5f3eSAlex Elder 		rbd_dev->snap_name
2427820a5f3eSAlex Elder 			= kmalloc(sizeof (RBD_SNAP_HEAD_NAME), GFP_KERNEL);
2428820a5f3eSAlex Elder 		if (!rbd_dev->snap_name)
2429820a5f3eSAlex Elder 			goto out_err;
2430820a5f3eSAlex Elder 
2431e28fff26SAlex Elder 		memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
2432e28fff26SAlex Elder 			sizeof (RBD_SNAP_HEAD_NAME));
2433849b4260SAlex Elder 	}
2434e28fff26SAlex Elder 
2435a725f65eSAlex Elder 	return 0;
2436d22f76e7SAlex Elder 
2437d22f76e7SAlex Elder out_err:
24380bed54dcSAlex Elder 	kfree(rbd_dev->header_name);
24390bed54dcSAlex Elder 	kfree(rbd_dev->image_name);
2440d22f76e7SAlex Elder 	kfree(rbd_dev->pool_name);
2441d22f76e7SAlex Elder 	rbd_dev->pool_name = NULL;
2442d22f76e7SAlex Elder 
2443d22f76e7SAlex Elder 	return ret;
2444a725f65eSAlex Elder }
2445a725f65eSAlex Elder 
244659c2be1eSYehuda Sadeh static ssize_t rbd_add(struct bus_type *bus,
244759c2be1eSYehuda Sadeh 		       const char *buf,
244859c2be1eSYehuda Sadeh 		       size_t count)
2449602adf40SYehuda Sadeh {
2450cb8627c7SAlex Elder 	char *options;
2451cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
24527ef3214aSAlex Elder 	const char *mon_addrs = NULL;
24537ef3214aSAlex Elder 	size_t mon_addrs_size = 0;
245427cc2594SAlex Elder 	struct ceph_osd_client *osdc;
245527cc2594SAlex Elder 	int rc = -ENOMEM;
2456602adf40SYehuda Sadeh 
2457602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
2458602adf40SYehuda Sadeh 		return -ENODEV;
2459602adf40SYehuda Sadeh 
246027cc2594SAlex Elder 	options = kmalloc(count, GFP_KERNEL);
246127cc2594SAlex Elder 	if (!options)
246227cc2594SAlex Elder 		goto err_nomem;
2463cb8627c7SAlex Elder 	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2464cb8627c7SAlex Elder 	if (!rbd_dev)
2465cb8627c7SAlex Elder 		goto err_nomem;
2466602adf40SYehuda Sadeh 
2467602adf40SYehuda Sadeh 	/* static rbd_device initialization */
2468602adf40SYehuda Sadeh 	spin_lock_init(&rbd_dev->lock);
2469602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbd_dev->node);
2470dfc5606dSYehuda Sadeh 	INIT_LIST_HEAD(&rbd_dev->snaps);
2471c666601aSJosh Durgin 	init_rwsem(&rbd_dev->header_rwsem);
2472602adf40SYehuda Sadeh 
2473c666601aSJosh Durgin 	init_rwsem(&rbd_dev->header_rwsem);
24740e805a1dSAlex Elder 
2475d184f6bfSAlex Elder 	/* generate unique id: find highest unique id, add one */
2476499afd5bSAlex Elder 	rbd_id_get(rbd_dev);
2477602adf40SYehuda Sadeh 
2478a725f65eSAlex Elder 	/* Fill in the device name, now that we have its id. */
247981a89793SAlex Elder 	BUILD_BUG_ON(DEV_NAME_LEN
248081a89793SAlex Elder 			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
248181a89793SAlex Elder 	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->id);
2482e124a82fSAlex Elder 
2483a725f65eSAlex Elder 	/* parse add command */
24847ef3214aSAlex Elder 	rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size,
2485e28fff26SAlex Elder 				options, count);
2486a725f65eSAlex Elder 	if (rc)
2487a725f65eSAlex Elder 		goto err_put_id;
2488a725f65eSAlex Elder 
24895214ecc4SAlex Elder 	rbd_dev->rbd_client = rbd_get_client(mon_addrs, mon_addrs_size - 1,
24905214ecc4SAlex Elder 						options);
2491d720bcb0SAlex Elder 	if (IS_ERR(rbd_dev->rbd_client)) {
2492d720bcb0SAlex Elder 		rc = PTR_ERR(rbd_dev->rbd_client);
2493f0f8cef5SAlex Elder 		goto err_put_id;
2494d720bcb0SAlex Elder 	}
2495602adf40SYehuda Sadeh 
2496602adf40SYehuda Sadeh 	/* pick the pool */
24971dbb4399SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
2498602adf40SYehuda Sadeh 	rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
2499602adf40SYehuda Sadeh 	if (rc < 0)
2500602adf40SYehuda Sadeh 		goto err_out_client;
25019bb2f334SAlex Elder 	rbd_dev->pool_id = rc;
2502602adf40SYehuda Sadeh 
2503602adf40SYehuda Sadeh 	/* register our block device */
250427cc2594SAlex Elder 	rc = register_blkdev(0, rbd_dev->name);
250527cc2594SAlex Elder 	if (rc < 0)
2506602adf40SYehuda Sadeh 		goto err_out_client;
250727cc2594SAlex Elder 	rbd_dev->major = rc;
2508602adf40SYehuda Sadeh 
2509dfc5606dSYehuda Sadeh 	rc = rbd_bus_add_dev(rbd_dev);
2510dfc5606dSYehuda Sadeh 	if (rc)
2511766fc439SYehuda Sadeh 		goto err_out_blkdev;
2512766fc439SYehuda Sadeh 
251332eec68dSAlex Elder 	/*
251432eec68dSAlex Elder 	 * At this point cleanup in the event of an error is the job
251532eec68dSAlex Elder 	 * of the sysfs code (initiated by rbd_bus_del_dev()).
251632eec68dSAlex Elder 	 *
251732eec68dSAlex Elder 	 * Set up and announce blkdev mapping.
251832eec68dSAlex Elder 	 */
2519602adf40SYehuda Sadeh 	rc = rbd_init_disk(rbd_dev);
2520602adf40SYehuda Sadeh 	if (rc)
2521766fc439SYehuda Sadeh 		goto err_out_bus;
2522602adf40SYehuda Sadeh 
252359c2be1eSYehuda Sadeh 	rc = rbd_init_watch_dev(rbd_dev);
252459c2be1eSYehuda Sadeh 	if (rc)
252559c2be1eSYehuda Sadeh 		goto err_out_bus;
252659c2be1eSYehuda Sadeh 
2527602adf40SYehuda Sadeh 	return count;
2528602adf40SYehuda Sadeh 
2529766fc439SYehuda Sadeh err_out_bus:
2530766fc439SYehuda Sadeh 	/* this will also clean up rest of rbd_dev stuff */
2531766fc439SYehuda Sadeh 
2532766fc439SYehuda Sadeh 	rbd_bus_del_dev(rbd_dev);
2533766fc439SYehuda Sadeh 	kfree(options);
2534766fc439SYehuda Sadeh 	return rc;
2535766fc439SYehuda Sadeh 
2536602adf40SYehuda Sadeh err_out_blkdev:
2537602adf40SYehuda Sadeh 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
2538602adf40SYehuda Sadeh err_out_client:
2539602adf40SYehuda Sadeh 	rbd_put_client(rbd_dev);
2540f0f8cef5SAlex Elder err_put_id:
2541cb8627c7SAlex Elder 	if (rbd_dev->pool_name) {
2542820a5f3eSAlex Elder 		kfree(rbd_dev->snap_name);
25430bed54dcSAlex Elder 		kfree(rbd_dev->header_name);
25440bed54dcSAlex Elder 		kfree(rbd_dev->image_name);
2545d22f76e7SAlex Elder 		kfree(rbd_dev->pool_name);
2546cb8627c7SAlex Elder 	}
2547499afd5bSAlex Elder 	rbd_id_put(rbd_dev);
254827cc2594SAlex Elder err_nomem:
254927cc2594SAlex Elder 	kfree(rbd_dev);
2550cb8627c7SAlex Elder 	kfree(options);
255127cc2594SAlex Elder 
2552602adf40SYehuda Sadeh 	dout("Error adding device %s\n", buf);
2553602adf40SYehuda Sadeh 	module_put(THIS_MODULE);
255427cc2594SAlex Elder 
255527cc2594SAlex Elder 	return (ssize_t) rc;
2556602adf40SYehuda Sadeh }
2557602adf40SYehuda Sadeh 
2558602adf40SYehuda Sadeh static struct rbd_device *__rbd_get_dev(unsigned long id)
2559602adf40SYehuda Sadeh {
2560602adf40SYehuda Sadeh 	struct list_head *tmp;
2561602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev;
2562602adf40SYehuda Sadeh 
2563e124a82fSAlex Elder 	spin_lock(&rbd_dev_list_lock);
2564602adf40SYehuda Sadeh 	list_for_each(tmp, &rbd_dev_list) {
2565602adf40SYehuda Sadeh 		rbd_dev = list_entry(tmp, struct rbd_device, node);
2566e124a82fSAlex Elder 		if (rbd_dev->id == id) {
2567e124a82fSAlex Elder 			spin_unlock(&rbd_dev_list_lock);
2568602adf40SYehuda Sadeh 			return rbd_dev;
2569602adf40SYehuda Sadeh 		}
2570e124a82fSAlex Elder 	}
2571e124a82fSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
2572602adf40SYehuda Sadeh 	return NULL;
2573602adf40SYehuda Sadeh }
2574602adf40SYehuda Sadeh 
2575dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev)
2576602adf40SYehuda Sadeh {
2577593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2578602adf40SYehuda Sadeh 
25791dbb4399SAlex Elder 	if (rbd_dev->watch_request) {
25801dbb4399SAlex Elder 		struct ceph_client *client = rbd_dev->rbd_client->client;
25811dbb4399SAlex Elder 
25821dbb4399SAlex Elder 		ceph_osdc_unregister_linger_request(&client->osdc,
258359c2be1eSYehuda Sadeh 						    rbd_dev->watch_request);
25841dbb4399SAlex Elder 	}
258559c2be1eSYehuda Sadeh 	if (rbd_dev->watch_event)
25860bed54dcSAlex Elder 		rbd_req_sync_unwatch(rbd_dev, rbd_dev->header_name);
258759c2be1eSYehuda Sadeh 
2588602adf40SYehuda Sadeh 	rbd_put_client(rbd_dev);
2589602adf40SYehuda Sadeh 
2590602adf40SYehuda Sadeh 	/* clean up and free blkdev */
2591602adf40SYehuda Sadeh 	rbd_free_disk(rbd_dev);
2592602adf40SYehuda Sadeh 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
259332eec68dSAlex Elder 
259432eec68dSAlex Elder 	/* done with the id, and with the rbd_dev */
2595820a5f3eSAlex Elder 	kfree(rbd_dev->snap_name);
25960bed54dcSAlex Elder 	kfree(rbd_dev->header_name);
2597d22f76e7SAlex Elder 	kfree(rbd_dev->pool_name);
25980bed54dcSAlex Elder 	kfree(rbd_dev->image_name);
259932eec68dSAlex Elder 	rbd_id_put(rbd_dev);
2600602adf40SYehuda Sadeh 	kfree(rbd_dev);
2601602adf40SYehuda Sadeh 
2602602adf40SYehuda Sadeh 	/* release module ref */
2603602adf40SYehuda Sadeh 	module_put(THIS_MODULE);
2604602adf40SYehuda Sadeh }
2605602adf40SYehuda Sadeh 
2606dfc5606dSYehuda Sadeh static ssize_t rbd_remove(struct bus_type *bus,
2607602adf40SYehuda Sadeh 			  const char *buf,
2608602adf40SYehuda Sadeh 			  size_t count)
2609602adf40SYehuda Sadeh {
2610602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
2611602adf40SYehuda Sadeh 	int target_id, rc;
2612602adf40SYehuda Sadeh 	unsigned long ul;
2613602adf40SYehuda Sadeh 	int ret = count;
2614602adf40SYehuda Sadeh 
2615602adf40SYehuda Sadeh 	rc = strict_strtoul(buf, 10, &ul);
2616602adf40SYehuda Sadeh 	if (rc)
2617602adf40SYehuda Sadeh 		return rc;
2618602adf40SYehuda Sadeh 
2619602adf40SYehuda Sadeh 	/* convert to int; abort if we lost anything in the conversion */
2620602adf40SYehuda Sadeh 	target_id = (int) ul;
2621602adf40SYehuda Sadeh 	if (target_id != ul)
2622602adf40SYehuda Sadeh 		return -EINVAL;
2623602adf40SYehuda Sadeh 
2624602adf40SYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2625602adf40SYehuda Sadeh 
2626602adf40SYehuda Sadeh 	rbd_dev = __rbd_get_dev(target_id);
2627602adf40SYehuda Sadeh 	if (!rbd_dev) {
2628602adf40SYehuda Sadeh 		ret = -ENOENT;
2629602adf40SYehuda Sadeh 		goto done;
2630602adf40SYehuda Sadeh 	}
2631602adf40SYehuda Sadeh 
2632dfc5606dSYehuda Sadeh 	__rbd_remove_all_snaps(rbd_dev);
2633dfc5606dSYehuda Sadeh 	rbd_bus_del_dev(rbd_dev);
2634602adf40SYehuda Sadeh 
2635602adf40SYehuda Sadeh done:
2636602adf40SYehuda Sadeh 	mutex_unlock(&ctl_mutex);
2637602adf40SYehuda Sadeh 	return ret;
2638602adf40SYehuda Sadeh }
2639602adf40SYehuda Sadeh 
2640dfc5606dSYehuda Sadeh static ssize_t rbd_snap_add(struct device *dev,
2641dfc5606dSYehuda Sadeh 			    struct device_attribute *attr,
2642602adf40SYehuda Sadeh 			    const char *buf,
2643602adf40SYehuda Sadeh 			    size_t count)
2644602adf40SYehuda Sadeh {
2645593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2646dfc5606dSYehuda Sadeh 	int ret;
2647dfc5606dSYehuda Sadeh 	char *name = kmalloc(count + 1, GFP_KERNEL);
2648602adf40SYehuda Sadeh 	if (!name)
2649602adf40SYehuda Sadeh 		return -ENOMEM;
2650602adf40SYehuda Sadeh 
2651dfc5606dSYehuda Sadeh 	snprintf(name, count, "%s", buf);
2652602adf40SYehuda Sadeh 
2653602adf40SYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2654602adf40SYehuda Sadeh 
2655602adf40SYehuda Sadeh 	ret = rbd_header_add_snap(rbd_dev,
2656602adf40SYehuda Sadeh 				  name, GFP_KERNEL);
2657602adf40SYehuda Sadeh 	if (ret < 0)
265859c2be1eSYehuda Sadeh 		goto err_unlock;
2659602adf40SYehuda Sadeh 
2660263c6ca0SJosh Durgin 	ret = __rbd_refresh_header(rbd_dev);
2661602adf40SYehuda Sadeh 	if (ret < 0)
266259c2be1eSYehuda Sadeh 		goto err_unlock;
266359c2be1eSYehuda Sadeh 
266459c2be1eSYehuda Sadeh 	/* shouldn't hold ctl_mutex when notifying.. notify might
266559c2be1eSYehuda Sadeh 	   trigger a watch callback that would need to get that mutex */
266659c2be1eSYehuda Sadeh 	mutex_unlock(&ctl_mutex);
266759c2be1eSYehuda Sadeh 
266859c2be1eSYehuda Sadeh 	/* make a best effort, don't error if failed */
26690bed54dcSAlex Elder 	rbd_req_sync_notify(rbd_dev, rbd_dev->header_name);
2670602adf40SYehuda Sadeh 
2671602adf40SYehuda Sadeh 	ret = count;
267259c2be1eSYehuda Sadeh 	kfree(name);
267359c2be1eSYehuda Sadeh 	return ret;
267459c2be1eSYehuda Sadeh 
267559c2be1eSYehuda Sadeh err_unlock:
2676602adf40SYehuda Sadeh 	mutex_unlock(&ctl_mutex);
2677602adf40SYehuda Sadeh 	kfree(name);
2678602adf40SYehuda Sadeh 	return ret;
2679602adf40SYehuda Sadeh }
2680602adf40SYehuda Sadeh 
2681602adf40SYehuda Sadeh /*
2682602adf40SYehuda Sadeh  * create control files in sysfs
2683dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
2684602adf40SYehuda Sadeh  */
2685602adf40SYehuda Sadeh static int rbd_sysfs_init(void)
2686602adf40SYehuda Sadeh {
2687dfc5606dSYehuda Sadeh 	int ret;
2688602adf40SYehuda Sadeh 
2689fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
2690dfc5606dSYehuda Sadeh 	if (ret < 0)
2691dfc5606dSYehuda Sadeh 		return ret;
2692602adf40SYehuda Sadeh 
2693fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
2694fed4c143SAlex Elder 	if (ret < 0)
2695fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
2696602adf40SYehuda Sadeh 
2697602adf40SYehuda Sadeh 	return ret;
2698602adf40SYehuda Sadeh }
2699602adf40SYehuda Sadeh 
2700602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void)
2701602adf40SYehuda Sadeh {
2702dfc5606dSYehuda Sadeh 	bus_unregister(&rbd_bus_type);
2703fed4c143SAlex Elder 	device_unregister(&rbd_root_dev);
2704602adf40SYehuda Sadeh }
2705602adf40SYehuda Sadeh 
2706602adf40SYehuda Sadeh int __init rbd_init(void)
2707602adf40SYehuda Sadeh {
2708602adf40SYehuda Sadeh 	int rc;
2709602adf40SYehuda Sadeh 
2710602adf40SYehuda Sadeh 	rc = rbd_sysfs_init();
2711602adf40SYehuda Sadeh 	if (rc)
2712602adf40SYehuda Sadeh 		return rc;
2713f0f8cef5SAlex Elder 	pr_info("loaded " RBD_DRV_NAME_LONG "\n");
2714602adf40SYehuda Sadeh 	return 0;
2715602adf40SYehuda Sadeh }
2716602adf40SYehuda Sadeh 
2717602adf40SYehuda Sadeh void __exit rbd_exit(void)
2718602adf40SYehuda Sadeh {
2719602adf40SYehuda Sadeh 	rbd_sysfs_cleanup();
2720602adf40SYehuda Sadeh }
2721602adf40SYehuda Sadeh 
2722602adf40SYehuda Sadeh module_init(rbd_init);
2723602adf40SYehuda Sadeh module_exit(rbd_exit);
2724602adf40SYehuda Sadeh 
2725602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
2726602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
2727602adf40SYehuda Sadeh MODULE_DESCRIPTION("rados block device");
2728602adf40SYehuda Sadeh 
2729602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */
2730602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
2731602adf40SYehuda Sadeh 
2732602adf40SYehuda Sadeh MODULE_LICENSE("GPL");
2733