xref: /openbmc/linux/drivers/block/rbd.c (revision 1fe5e993)
1602adf40SYehuda Sadeh /*
2602adf40SYehuda Sadeh    rbd.c -- Export ceph rados objects as a Linux block device
3602adf40SYehuda Sadeh 
4602adf40SYehuda Sadeh 
5602adf40SYehuda Sadeh    based on drivers/block/osdblk.c:
6602adf40SYehuda Sadeh 
7602adf40SYehuda Sadeh    Copyright 2009 Red Hat, Inc.
8602adf40SYehuda Sadeh 
9602adf40SYehuda Sadeh    This program is free software; you can redistribute it and/or modify
10602adf40SYehuda Sadeh    it under the terms of the GNU General Public License as published by
11602adf40SYehuda Sadeh    the Free Software Foundation.
12602adf40SYehuda Sadeh 
13602adf40SYehuda Sadeh    This program is distributed in the hope that it will be useful,
14602adf40SYehuda Sadeh    but WITHOUT ANY WARRANTY; without even the implied warranty of
15602adf40SYehuda Sadeh    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16602adf40SYehuda Sadeh    GNU General Public License for more details.
17602adf40SYehuda Sadeh 
18602adf40SYehuda Sadeh    You should have received a copy of the GNU General Public License
19602adf40SYehuda Sadeh    along with this program; see the file COPYING.  If not, write to
20602adf40SYehuda Sadeh    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21602adf40SYehuda Sadeh 
22602adf40SYehuda Sadeh 
23602adf40SYehuda Sadeh 
24dfc5606dSYehuda Sadeh    For usage instructions, please refer to:
25602adf40SYehuda Sadeh 
26dfc5606dSYehuda Sadeh                  Documentation/ABI/testing/sysfs-bus-rbd
27602adf40SYehuda Sadeh 
28602adf40SYehuda Sadeh  */
29602adf40SYehuda Sadeh 
30602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
31602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
32602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3459c2be1eSYehuda Sadeh #include <linux/parser.h>
35602adf40SYehuda Sadeh 
36602adf40SYehuda Sadeh #include <linux/kernel.h>
37602adf40SYehuda Sadeh #include <linux/device.h>
38602adf40SYehuda Sadeh #include <linux/module.h>
39602adf40SYehuda Sadeh #include <linux/fs.h>
40602adf40SYehuda Sadeh #include <linux/blkdev.h>
41602adf40SYehuda Sadeh 
42602adf40SYehuda Sadeh #include "rbd_types.h"
43602adf40SYehuda Sadeh 
/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/* Short and long driver names; the short one also prefixes device names */
#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

/*
 * NOTE(review): presumably upper bounds on a snapshot name and on an
 * option string; their users are not in this part of the file.
 */
#define RBD_MAX_SNAP_NAME_LEN	32
#define RBD_MAX_OPT_LEN		1024

/*
 * Snapshot name that selects the image head rather than a snapshot
 * (rbd_header_set_snap() maps it to CEPH_NOSNAP).
 */
#define RBD_SNAP_HEAD_NAME	"-"

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)

/*
 * Initial value of rbd_options.notify_timeout, used unless a
 * "notify_timeout=" option overrides it.
 * NOTE(review): the unit is not visible here -- confirm (seconds?).
 */
#define RBD_NOTIFY_TIMEOUT_DEFAULT 10
7359c2be1eSYehuda Sadeh 
/*
 * block device image metadata (in-memory version)
 *
 * Filled in from the on-disk header by rbd_header_from_disk() and
 * torn down by rbd_header_free().
 */
struct rbd_image_header {
	u64 image_size;		/* from the on-disk image_size field */
	char *object_prefix;	/* NUL-terminated copy of on-disk block_name */
	__u8 obj_order;		/* log2 of the object size (see rbd_obj_bytes()) */
	__u8 crypt_type;	/* copied verbatim from the on-disk options */
	__u8 comp_type;		/* copied verbatim from the on-disk options */
	struct ceph_snap_context *snapc; /* snapshot ids plus seq */
	size_t snap_names_len;	/* size in bytes of the snap_names buffer */
	u32 total_snaps;	/* number of snapshots described below */

	/* snapshot names packed back to back, each NUL-terminated */
	char *snap_names;
	/* per-snapshot image size, same index as snapc->snaps[] */
	u64 *snap_sizes;

	/* NOTE(review): not written in this part of the file -- confirm use */
	u64 obj_version;
};
9259c2be1eSYehuda Sadeh 
/*
 * rbd-specific options parsed by parse_rbd_opts_token().  One instance
 * is attached to each rbd_client.
 */
struct rbd_options {
	/* starts as RBD_NOTIFY_TIMEOUT_DEFAULT; "notify_timeout=" overrides */
	int	notify_timeout;
};
96602adf40SYehuda Sadeh 
/*
 * an instance of the client.  multiple devices may share an rbd client.
 *
 * Reference counted: rbd_get_client() takes a reference, rbd_put_client()
 * drops one, and rbd_client_release() runs when the last one goes away.
 */
struct rbd_client {
	struct ceph_client	*client;	/* from ceph_create_client() */
	struct rbd_options	*rbd_opts;	/* freed in rbd_client_release() */
	struct kref		kref;		/* reference count */
	struct list_head	node;		/* link in rbd_client_list */
};
106602adf40SYehuda Sadeh 
/*
 * a request completion status
 *
 * One of these is stored per request in rbd_req_coll.status[].
 * NOTE(review): the fields are only written outside this part of the
 * file; the per-field notes below are inferred from the names.
 */
struct rbd_req_status {
	int done;	/* presumably non-zero once the request completed */
	int rc;		/* presumably the request's result code */
	u64 bytes;	/* presumably the number of bytes covered */
};
1151fec7093SYehuda Sadeh 
1161fec7093SYehuda Sadeh /*
1171fec7093SYehuda Sadeh  * a collection of requests
1181fec7093SYehuda Sadeh  */
1191fec7093SYehuda Sadeh struct rbd_req_coll {
1201fec7093SYehuda Sadeh 	int			total;
1211fec7093SYehuda Sadeh 	int			num_done;
1221fec7093SYehuda Sadeh 	struct kref		kref;
1231fec7093SYehuda Sadeh 	struct rbd_req_status	status[0];
124602adf40SYehuda Sadeh };
125602adf40SYehuda Sadeh 
/*
 * a single io request
 *
 * NOTE(review): instances are created and consumed outside this part of
 * the file; the notes on len, coll_index and coll are inferred.
 */
struct rbd_request {
	struct request		*rq;		/* blk layer request */
	struct bio		*bio;		/* cloned bio */
	struct page		**pages;	/* list of used pages */
	u64			len;		/* presumably length in bytes */
	int			coll_index;	/* presumably slot in coll->status[] */
	struct rbd_req_coll	*coll;		/* collection this request is part of */
};
137f0f8cef5SAlex Elder 
/*
 * A snapshot of an rbd image, with its own struct device.
 * NOTE(review): populated outside this part of the file; the field
 * notes below are inferred from names and types.
 */
struct rbd_snap {
	struct	device		dev;	/* embedded device */
	const char		*name;	/* snapshot name */
	u64			size;	/* presumably image size at the snapshot */
	struct list_head	node;	/* presumably link in rbd_device.snaps */
	u64			id;	/* snapshot id */
};
145dfc5606dSYehuda Sadeh 
/*
 * a single device
 *
 * The embedded @dev carries the reference count that rbd_get_dev() and
 * rbd_put_dev() manipulate.
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	struct gendisk		*disk;		/* blkdev's gendisk and rq */
	struct request_queue	*q;

	/* shared ceph client; dropped by rbd_put_client() */
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue lock */

	/* in-memory copy of the image header */
	struct rbd_image_header	header;
	/* NOTE(review): image/pool identity, set up outside this excerpt */
	char			*image_name;
	size_t			image_name_len;
	char			*header_name;
	char			*pool_name;
	int			pool_id;

	/* NOTE(review): watch state, managed outside this excerpt */
	struct ceph_osd_event   *watch_event;
	struct ceph_osd_request *watch_request;

	/* protects updating the header */
	struct rw_semaphore     header_rwsem;
	/* name of the snapshot this device reads from */
	char                    *snap_name;
	/* id of the snapshot this device reads from */
	u64                     snap_id;	/* current snapshot id */
	/* whether the snap_id this device reads from still exists */
	bool                    snap_exists;
	/* 1 when mapped to a snapshot, 0 for the head; checked in rbd_open() */
	int                     read_only;

	/* NOTE(review): presumably the link in rbd_dev_list */
	struct list_head	node;

	/* list of snapshots */
	struct list_head	snaps;

	/* sysfs related */
	struct device		dev;
};
190dfc5606dSYehuda Sadeh 
static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */

/* NOTE(review): rbd_dev_list_lock presumably guards rbd_dev_list */
static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

/* rbd_client_list is searched and modified under rbd_client_list_lock */
static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);
198602adf40SYehuda Sadeh 
/* Forward declarations of functions defined later in the file */
static int __rbd_init_snaps_header(struct rbd_device *rbd_dev);
static void rbd_dev_release(struct device *dev);
static ssize_t rbd_snap_add(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf,
			    size_t count);
static void __rbd_remove_snap_dev(struct rbd_snap *snap);

/* Store handlers for the bus-level "add" and "remove" attributes below */
static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);
211f0f8cef5SAlex Elder 
/*
 * Bus attributes "add" and "remove".  Both are write-only for the owner
 * (S_IWUSR, no show method); writes are handled by rbd_add() and
 * rbd_remove() respectively.
 */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

/* The "rbd" bus, carrying the attributes above */
static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};
222f0f8cef5SAlex Elder 
/*
 * Release callback for rbd_root_dev.  Intentionally empty: rbd_root_dev
 * is a static object, so there is nothing to free.
 */
static void rbd_root_dev_release(struct device *dev)
{
}

/* Statically allocated device named "rbd" */
static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};
231f0f8cef5SAlex Elder 
232dfc5606dSYehuda Sadeh 
233dfc5606dSYehuda Sadeh static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
234dfc5606dSYehuda Sadeh {
235dfc5606dSYehuda Sadeh 	return get_device(&rbd_dev->dev);
236dfc5606dSYehuda Sadeh }
237dfc5606dSYehuda Sadeh 
238dfc5606dSYehuda Sadeh static void rbd_put_dev(struct rbd_device *rbd_dev)
239dfc5606dSYehuda Sadeh {
240dfc5606dSYehuda Sadeh 	put_device(&rbd_dev->dev);
241dfc5606dSYehuda Sadeh }
242602adf40SYehuda Sadeh 
/* Forward declaration; the definition appears later in the file */
static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver);
24459c2be1eSYehuda Sadeh 
245602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
246602adf40SYehuda Sadeh {
247f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
248602adf40SYehuda Sadeh 
249dfc5606dSYehuda Sadeh 	rbd_get_dev(rbd_dev);
250dfc5606dSYehuda Sadeh 
251602adf40SYehuda Sadeh 	set_device_ro(bdev, rbd_dev->read_only);
252602adf40SYehuda Sadeh 
253602adf40SYehuda Sadeh 	if ((mode & FMODE_WRITE) && rbd_dev->read_only)
254602adf40SYehuda Sadeh 		return -EROFS;
255602adf40SYehuda Sadeh 
256602adf40SYehuda Sadeh 	return 0;
257602adf40SYehuda Sadeh }
258602adf40SYehuda Sadeh 
259dfc5606dSYehuda Sadeh static int rbd_release(struct gendisk *disk, fmode_t mode)
260dfc5606dSYehuda Sadeh {
261dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
262dfc5606dSYehuda Sadeh 
263dfc5606dSYehuda Sadeh 	rbd_put_dev(rbd_dev);
264dfc5606dSYehuda Sadeh 
265dfc5606dSYehuda Sadeh 	return 0;
266dfc5606dSYehuda Sadeh }
267dfc5606dSYehuda Sadeh 
/* Block device methods: open/release only manage the device reference */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
273602adf40SYehuda Sadeh 
274602adf40SYehuda Sadeh /*
275602adf40SYehuda Sadeh  * Initialize an rbd client instance.
27643ae4701SAlex Elder  * We own *ceph_opts.
277602adf40SYehuda Sadeh  */
27843ae4701SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts,
27959c2be1eSYehuda Sadeh 					    struct rbd_options *rbd_opts)
280602adf40SYehuda Sadeh {
281602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
282602adf40SYehuda Sadeh 	int ret = -ENOMEM;
283602adf40SYehuda Sadeh 
284602adf40SYehuda Sadeh 	dout("rbd_client_create\n");
285602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
286602adf40SYehuda Sadeh 	if (!rbdc)
287602adf40SYehuda Sadeh 		goto out_opt;
288602adf40SYehuda Sadeh 
289602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
290602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
291602adf40SYehuda Sadeh 
292bc534d86SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
293bc534d86SAlex Elder 
29443ae4701SAlex Elder 	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
295602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
296bc534d86SAlex Elder 		goto out_mutex;
29743ae4701SAlex Elder 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
298602adf40SYehuda Sadeh 
299602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
300602adf40SYehuda Sadeh 	if (ret < 0)
301602adf40SYehuda Sadeh 		goto out_err;
302602adf40SYehuda Sadeh 
30359c2be1eSYehuda Sadeh 	rbdc->rbd_opts = rbd_opts;
30459c2be1eSYehuda Sadeh 
305432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
306602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
307432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
308602adf40SYehuda Sadeh 
309bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
310bc534d86SAlex Elder 
311602adf40SYehuda Sadeh 	dout("rbd_client_create created %p\n", rbdc);
312602adf40SYehuda Sadeh 	return rbdc;
313602adf40SYehuda Sadeh 
314602adf40SYehuda Sadeh out_err:
315602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
316bc534d86SAlex Elder out_mutex:
317bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
318602adf40SYehuda Sadeh 	kfree(rbdc);
319602adf40SYehuda Sadeh out_opt:
32043ae4701SAlex Elder 	if (ceph_opts)
32143ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
32228f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
323602adf40SYehuda Sadeh }
324602adf40SYehuda Sadeh 
325602adf40SYehuda Sadeh /*
326602adf40SYehuda Sadeh  * Find a ceph client with specific addr and configuration.
327602adf40SYehuda Sadeh  */
32843ae4701SAlex Elder static struct rbd_client *__rbd_client_find(struct ceph_options *ceph_opts)
329602adf40SYehuda Sadeh {
330602adf40SYehuda Sadeh 	struct rbd_client *client_node;
331602adf40SYehuda Sadeh 
33243ae4701SAlex Elder 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
333602adf40SYehuda Sadeh 		return NULL;
334602adf40SYehuda Sadeh 
335602adf40SYehuda Sadeh 	list_for_each_entry(client_node, &rbd_client_list, node)
33643ae4701SAlex Elder 		if (!ceph_compare_options(ceph_opts, client_node->client))
337602adf40SYehuda Sadeh 			return client_node;
338602adf40SYehuda Sadeh 	return NULL;
339602adf40SYehuda Sadeh }
340602adf40SYehuda Sadeh 
/*
 * mount options
 *
 * Token ids are grouped by argument type: parse_rbd_opts_token() treats
 * ids below Opt_last_int as integer options and ids strictly between
 * Opt_last_int and Opt_last_string as string options.
 */
enum {
	Opt_notify_timeout,
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
};

/*
 * Patterns given to match_token() by parse_rbd_opts_token(); a negative
 * token coming back from that call is rejected with -EINVAL.
 */
static match_table_t rbd_opts_tokens = {
	{Opt_notify_timeout, "notify_timeout=%d"},
	/* int args above */
	/* string args above */
	{-1, NULL}
};
35859c2be1eSYehuda Sadeh 
35959c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
36059c2be1eSYehuda Sadeh {
36143ae4701SAlex Elder 	struct rbd_options *rbd_opts = private;
36259c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
36359c2be1eSYehuda Sadeh 	int token, intval, ret;
36459c2be1eSYehuda Sadeh 
36543ae4701SAlex Elder 	token = match_token(c, rbd_opts_tokens, argstr);
36659c2be1eSYehuda Sadeh 	if (token < 0)
36759c2be1eSYehuda Sadeh 		return -EINVAL;
36859c2be1eSYehuda Sadeh 
36959c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
37059c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
37159c2be1eSYehuda Sadeh 		if (ret < 0) {
37259c2be1eSYehuda Sadeh 			pr_err("bad mount option arg (not int) "
37359c2be1eSYehuda Sadeh 			       "at '%s'\n", c);
37459c2be1eSYehuda Sadeh 			return ret;
37559c2be1eSYehuda Sadeh 		}
37659c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
37759c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
37859c2be1eSYehuda Sadeh 		dout("got string token %d val %s\n", token,
37959c2be1eSYehuda Sadeh 		     argstr[0].from);
38059c2be1eSYehuda Sadeh 	} else {
38159c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
38259c2be1eSYehuda Sadeh 	}
38359c2be1eSYehuda Sadeh 
38459c2be1eSYehuda Sadeh 	switch (token) {
38559c2be1eSYehuda Sadeh 	case Opt_notify_timeout:
38643ae4701SAlex Elder 		rbd_opts->notify_timeout = intval;
38759c2be1eSYehuda Sadeh 		break;
38859c2be1eSYehuda Sadeh 	default:
38959c2be1eSYehuda Sadeh 		BUG_ON(token);
39059c2be1eSYehuda Sadeh 	}
39159c2be1eSYehuda Sadeh 	return 0;
39259c2be1eSYehuda Sadeh }
39359c2be1eSYehuda Sadeh 
39459c2be1eSYehuda Sadeh /*
395602adf40SYehuda Sadeh  * Get a ceph client with specific addr and configuration, if one does
396602adf40SYehuda Sadeh  * not exist create it.
397602adf40SYehuda Sadeh  */
3985214ecc4SAlex Elder static struct rbd_client *rbd_get_client(const char *mon_addr,
3995214ecc4SAlex Elder 					 size_t mon_addr_len,
4005214ecc4SAlex Elder 					 char *options)
401602adf40SYehuda Sadeh {
402602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
40343ae4701SAlex Elder 	struct ceph_options *ceph_opts;
40459c2be1eSYehuda Sadeh 	struct rbd_options *rbd_opts;
40559c2be1eSYehuda Sadeh 
40659c2be1eSYehuda Sadeh 	rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL);
40759c2be1eSYehuda Sadeh 	if (!rbd_opts)
408d720bcb0SAlex Elder 		return ERR_PTR(-ENOMEM);
40959c2be1eSYehuda Sadeh 
41059c2be1eSYehuda Sadeh 	rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT;
411602adf40SYehuda Sadeh 
41243ae4701SAlex Elder 	ceph_opts = ceph_parse_options(options, mon_addr,
4135214ecc4SAlex Elder 					mon_addr + mon_addr_len,
41421079786SAlex Elder 					parse_rbd_opts_token, rbd_opts);
41543ae4701SAlex Elder 	if (IS_ERR(ceph_opts)) {
416d720bcb0SAlex Elder 		kfree(rbd_opts);
41743ae4701SAlex Elder 		return ERR_CAST(ceph_opts);
418ee57741cSAlex Elder 	}
419602adf40SYehuda Sadeh 
420432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
42143ae4701SAlex Elder 	rbdc = __rbd_client_find(ceph_opts);
422602adf40SYehuda Sadeh 	if (rbdc) {
423e6994d3dSAlex Elder 		/* using an existing client */
424e6994d3dSAlex Elder 		kref_get(&rbdc->kref);
425432b8587SAlex Elder 		spin_unlock(&rbd_client_list_lock);
426e6994d3dSAlex Elder 
42743ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
42897bb59a0SAlex Elder 		kfree(rbd_opts);
429602adf40SYehuda Sadeh 
430d720bcb0SAlex Elder 		return rbdc;
431602adf40SYehuda Sadeh 	}
432432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
433602adf40SYehuda Sadeh 
43443ae4701SAlex Elder 	rbdc = rbd_client_create(ceph_opts, rbd_opts);
435d97081b0SAlex Elder 
436d720bcb0SAlex Elder 	if (IS_ERR(rbdc))
43759c2be1eSYehuda Sadeh 		kfree(rbd_opts);
438d720bcb0SAlex Elder 
439d720bcb0SAlex Elder 	return rbdc;
440602adf40SYehuda Sadeh }
441602adf40SYehuda Sadeh 
442602adf40SYehuda Sadeh /*
443602adf40SYehuda Sadeh  * Destroy ceph client
444d23a4b3fSAlex Elder  *
445432b8587SAlex Elder  * Caller must hold rbd_client_list_lock.
446602adf40SYehuda Sadeh  */
447602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
448602adf40SYehuda Sadeh {
449602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
450602adf40SYehuda Sadeh 
451602adf40SYehuda Sadeh 	dout("rbd_release_client %p\n", rbdc);
452cd9d9f5dSAlex Elder 	spin_lock(&rbd_client_list_lock);
453602adf40SYehuda Sadeh 	list_del(&rbdc->node);
454cd9d9f5dSAlex Elder 	spin_unlock(&rbd_client_list_lock);
455602adf40SYehuda Sadeh 
456602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
45759c2be1eSYehuda Sadeh 	kfree(rbdc->rbd_opts);
458602adf40SYehuda Sadeh 	kfree(rbdc);
459602adf40SYehuda Sadeh }
460602adf40SYehuda Sadeh 
461602adf40SYehuda Sadeh /*
462602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
463602adf40SYehuda Sadeh  * it.
464602adf40SYehuda Sadeh  */
465602adf40SYehuda Sadeh static void rbd_put_client(struct rbd_device *rbd_dev)
466602adf40SYehuda Sadeh {
467602adf40SYehuda Sadeh 	kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
468602adf40SYehuda Sadeh 	rbd_dev->rbd_client = NULL;
469602adf40SYehuda Sadeh }
470602adf40SYehuda Sadeh 
4711fec7093SYehuda Sadeh /*
4721fec7093SYehuda Sadeh  * Destroy requests collection
4731fec7093SYehuda Sadeh  */
4741fec7093SYehuda Sadeh static void rbd_coll_release(struct kref *kref)
4751fec7093SYehuda Sadeh {
4761fec7093SYehuda Sadeh 	struct rbd_req_coll *coll =
4771fec7093SYehuda Sadeh 		container_of(kref, struct rbd_req_coll, kref);
4781fec7093SYehuda Sadeh 
4791fec7093SYehuda Sadeh 	dout("rbd_coll_release %p\n", coll);
4801fec7093SYehuda Sadeh 	kfree(coll);
4811fec7093SYehuda Sadeh }
482602adf40SYehuda Sadeh 
4838e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
4848e94af8eSAlex Elder {
4858e94af8eSAlex Elder 	return !memcmp(&ondisk->text,
4868e94af8eSAlex Elder 			RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT));
4878e94af8eSAlex Elder }
4888e94af8eSAlex Elder 
489602adf40SYehuda Sadeh /*
490602adf40SYehuda Sadeh  * Create a new header structure, translate header format from the on-disk
491602adf40SYehuda Sadeh  * header.
492602adf40SYehuda Sadeh  */
493602adf40SYehuda Sadeh static int rbd_header_from_disk(struct rbd_image_header *header,
494602adf40SYehuda Sadeh 				 struct rbd_image_header_ondisk *ondisk,
495ed63f4fdSAlex Elder 				 u32 allocated_snaps)
496602adf40SYehuda Sadeh {
497ccece235SAlex Elder 	u32 snap_count;
498602adf40SYehuda Sadeh 
4998e94af8eSAlex Elder 	if (!rbd_dev_ondisk_valid(ondisk))
50081e759fbSJosh Durgin 		return -ENXIO;
50181e759fbSJosh Durgin 
50200f1f36fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
503ccece235SAlex Elder 	if (snap_count > (SIZE_MAX - sizeof(struct ceph_snap_context))
504ccece235SAlex Elder 				 / sizeof (u64))
50550f7c4c9SXi Wang 		return -EINVAL;
506602adf40SYehuda Sadeh 	header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
507f9f9a190SYan, Zheng 				snap_count * sizeof(u64),
508ed63f4fdSAlex Elder 				GFP_KERNEL);
509602adf40SYehuda Sadeh 	if (!header->snapc)
510602adf40SYehuda Sadeh 		return -ENOMEM;
51100f1f36fSAlex Elder 
512602adf40SYehuda Sadeh 	if (snap_count) {
513ccece235SAlex Elder 		header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
514602adf40SYehuda Sadeh 		header->snap_names = kmalloc(header->snap_names_len,
515ed63f4fdSAlex Elder 					     GFP_KERNEL);
516602adf40SYehuda Sadeh 		if (!header->snap_names)
517602adf40SYehuda Sadeh 			goto err_snapc;
518602adf40SYehuda Sadeh 		header->snap_sizes = kmalloc(snap_count * sizeof(u64),
519ed63f4fdSAlex Elder 					     GFP_KERNEL);
520602adf40SYehuda Sadeh 		if (!header->snap_sizes)
521602adf40SYehuda Sadeh 			goto err_names;
522602adf40SYehuda Sadeh 	} else {
523ccece235SAlex Elder 		WARN_ON(ondisk->snap_names_len);
524ccece235SAlex Elder 		header->snap_names_len = 0;
525602adf40SYehuda Sadeh 		header->snap_names = NULL;
526602adf40SYehuda Sadeh 		header->snap_sizes = NULL;
527602adf40SYehuda Sadeh 	}
528849b4260SAlex Elder 
529849b4260SAlex Elder 	header->object_prefix = kmalloc(sizeof (ondisk->block_name) + 1,
530ed63f4fdSAlex Elder 					GFP_KERNEL);
531849b4260SAlex Elder 	if (!header->object_prefix)
532849b4260SAlex Elder 		goto err_sizes;
533849b4260SAlex Elder 
534ca1e49a6SAlex Elder 	memcpy(header->object_prefix, ondisk->block_name,
535602adf40SYehuda Sadeh 	       sizeof(ondisk->block_name));
536849b4260SAlex Elder 	header->object_prefix[sizeof (ondisk->block_name)] = '\0';
537602adf40SYehuda Sadeh 
538602adf40SYehuda Sadeh 	header->image_size = le64_to_cpu(ondisk->image_size);
539602adf40SYehuda Sadeh 	header->obj_order = ondisk->options.order;
540602adf40SYehuda Sadeh 	header->crypt_type = ondisk->options.crypt_type;
541602adf40SYehuda Sadeh 	header->comp_type = ondisk->options.comp_type;
542602adf40SYehuda Sadeh 
543602adf40SYehuda Sadeh 	atomic_set(&header->snapc->nref, 1);
544505cbb9bSAlex Elder 	header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
545602adf40SYehuda Sadeh 	header->snapc->num_snaps = snap_count;
546602adf40SYehuda Sadeh 	header->total_snaps = snap_count;
547602adf40SYehuda Sadeh 
54821079786SAlex Elder 	if (snap_count && allocated_snaps == snap_count) {
549ccece235SAlex Elder 		int i;
550ccece235SAlex Elder 
551602adf40SYehuda Sadeh 		for (i = 0; i < snap_count; i++) {
552602adf40SYehuda Sadeh 			header->snapc->snaps[i] =
553602adf40SYehuda Sadeh 				le64_to_cpu(ondisk->snaps[i].id);
554602adf40SYehuda Sadeh 			header->snap_sizes[i] =
555602adf40SYehuda Sadeh 				le64_to_cpu(ondisk->snaps[i].image_size);
556602adf40SYehuda Sadeh 		}
557602adf40SYehuda Sadeh 
558602adf40SYehuda Sadeh 		/* copy snapshot names */
559ccece235SAlex Elder 		memcpy(header->snap_names, &ondisk->snaps[snap_count],
560602adf40SYehuda Sadeh 			header->snap_names_len);
561602adf40SYehuda Sadeh 	}
562602adf40SYehuda Sadeh 
563602adf40SYehuda Sadeh 	return 0;
564602adf40SYehuda Sadeh 
565849b4260SAlex Elder err_sizes:
566849b4260SAlex Elder 	kfree(header->snap_sizes);
567ccece235SAlex Elder 	header->snap_sizes = NULL;
568602adf40SYehuda Sadeh err_names:
569602adf40SYehuda Sadeh 	kfree(header->snap_names);
570ccece235SAlex Elder 	header->snap_names = NULL;
571602adf40SYehuda Sadeh err_snapc:
572602adf40SYehuda Sadeh 	kfree(header->snapc);
573ccece235SAlex Elder 	header->snapc = NULL;
574ccece235SAlex Elder 
57500f1f36fSAlex Elder 	return -ENOMEM;
576602adf40SYehuda Sadeh }
577602adf40SYehuda Sadeh 
578602adf40SYehuda Sadeh static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
579602adf40SYehuda Sadeh 			u64 *seq, u64 *size)
580602adf40SYehuda Sadeh {
581602adf40SYehuda Sadeh 	int i;
582602adf40SYehuda Sadeh 	char *p = header->snap_names;
583602adf40SYehuda Sadeh 
58400f1f36fSAlex Elder 	for (i = 0; i < header->total_snaps; i++) {
58500f1f36fSAlex Elder 		if (!strcmp(snap_name, p)) {
58600f1f36fSAlex Elder 
58700f1f36fSAlex Elder 			/* Found it.  Pass back its id and/or size */
58800f1f36fSAlex Elder 
589602adf40SYehuda Sadeh 			if (seq)
590602adf40SYehuda Sadeh 				*seq = header->snapc->snaps[i];
591602adf40SYehuda Sadeh 			if (size)
592602adf40SYehuda Sadeh 				*size = header->snap_sizes[i];
593602adf40SYehuda Sadeh 			return i;
594602adf40SYehuda Sadeh 		}
59500f1f36fSAlex Elder 		p += strlen(p) + 1;	/* Skip ahead to the next name */
59600f1f36fSAlex Elder 	}
59700f1f36fSAlex Elder 	return -ENOENT;
59800f1f36fSAlex Elder }
599602adf40SYehuda Sadeh 
6000ce1a794SAlex Elder static int rbd_header_set_snap(struct rbd_device *rbd_dev, u64 *size)
601602adf40SYehuda Sadeh {
60278dc447dSAlex Elder 	int ret;
603602adf40SYehuda Sadeh 
6040ce1a794SAlex Elder 	down_write(&rbd_dev->header_rwsem);
605602adf40SYehuda Sadeh 
6060ce1a794SAlex Elder 	if (!memcmp(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
607cc9d734cSJosh Durgin 		    sizeof (RBD_SNAP_HEAD_NAME))) {
6080ce1a794SAlex Elder 		rbd_dev->snap_id = CEPH_NOSNAP;
609e88a36ecSJosh Durgin 		rbd_dev->snap_exists = false;
6100ce1a794SAlex Elder 		rbd_dev->read_only = 0;
611602adf40SYehuda Sadeh 		if (size)
61278dc447dSAlex Elder 			*size = rbd_dev->header.image_size;
613602adf40SYehuda Sadeh 	} else {
61478dc447dSAlex Elder 		u64 snap_id = 0;
61578dc447dSAlex Elder 
61678dc447dSAlex Elder 		ret = snap_by_name(&rbd_dev->header, rbd_dev->snap_name,
61778dc447dSAlex Elder 					&snap_id, size);
618602adf40SYehuda Sadeh 		if (ret < 0)
619602adf40SYehuda Sadeh 			goto done;
62078dc447dSAlex Elder 		rbd_dev->snap_id = snap_id;
621e88a36ecSJosh Durgin 		rbd_dev->snap_exists = true;
6220ce1a794SAlex Elder 		rbd_dev->read_only = 1;
623602adf40SYehuda Sadeh 	}
624602adf40SYehuda Sadeh 
625602adf40SYehuda Sadeh 	ret = 0;
626602adf40SYehuda Sadeh done:
6270ce1a794SAlex Elder 	up_write(&rbd_dev->header_rwsem);
628602adf40SYehuda Sadeh 	return ret;
629602adf40SYehuda Sadeh }
630602adf40SYehuda Sadeh 
631602adf40SYehuda Sadeh static void rbd_header_free(struct rbd_image_header *header)
632602adf40SYehuda Sadeh {
633849b4260SAlex Elder 	kfree(header->object_prefix);
634602adf40SYehuda Sadeh 	kfree(header->snap_sizes);
635849b4260SAlex Elder 	kfree(header->snap_names);
636d1d25646SJosh Durgin 	ceph_put_snap_context(header->snapc);
637602adf40SYehuda Sadeh }
638602adf40SYehuda Sadeh 
639602adf40SYehuda Sadeh /*
640602adf40SYehuda Sadeh  * get the actual striped segment name, offset and length
641602adf40SYehuda Sadeh  */
642602adf40SYehuda Sadeh static u64 rbd_get_segment(struct rbd_image_header *header,
643ca1e49a6SAlex Elder 			   const char *object_prefix,
644602adf40SYehuda Sadeh 			   u64 ofs, u64 len,
645602adf40SYehuda Sadeh 			   char *seg_name, u64 *segofs)
646602adf40SYehuda Sadeh {
647602adf40SYehuda Sadeh 	u64 seg = ofs >> header->obj_order;
648602adf40SYehuda Sadeh 
649602adf40SYehuda Sadeh 	if (seg_name)
650602adf40SYehuda Sadeh 		snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
651ca1e49a6SAlex Elder 			 "%s.%012llx", object_prefix, seg);
652602adf40SYehuda Sadeh 
653602adf40SYehuda Sadeh 	ofs = ofs & ((1 << header->obj_order) - 1);
654602adf40SYehuda Sadeh 	len = min_t(u64, len, (1 << header->obj_order) - ofs);
655602adf40SYehuda Sadeh 
656602adf40SYehuda Sadeh 	if (segofs)
657602adf40SYehuda Sadeh 		*segofs = ofs;
658602adf40SYehuda Sadeh 
659602adf40SYehuda Sadeh 	return len;
660602adf40SYehuda Sadeh }
661602adf40SYehuda Sadeh 
6621fec7093SYehuda Sadeh static int rbd_get_num_segments(struct rbd_image_header *header,
6631fec7093SYehuda Sadeh 				u64 ofs, u64 len)
6641fec7093SYehuda Sadeh {
6651fec7093SYehuda Sadeh 	u64 start_seg = ofs >> header->obj_order;
6661fec7093SYehuda Sadeh 	u64 end_seg = (ofs + len - 1) >> header->obj_order;
6671fec7093SYehuda Sadeh 	return end_seg - start_seg + 1;
6681fec7093SYehuda Sadeh }
6691fec7093SYehuda Sadeh 
670602adf40SYehuda Sadeh /*
671029bcbd8SJosh Durgin  * returns the size of an object in the image
672029bcbd8SJosh Durgin  */
673029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header)
674029bcbd8SJosh Durgin {
675029bcbd8SJosh Durgin 	return 1 << header->obj_order;
676029bcbd8SJosh Durgin }
677029bcbd8SJosh Durgin 
678029bcbd8SJosh Durgin /*
679602adf40SYehuda Sadeh  * bio helpers
680602adf40SYehuda Sadeh  */
681602adf40SYehuda Sadeh 
682602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain)
683602adf40SYehuda Sadeh {
684602adf40SYehuda Sadeh 	struct bio *tmp;
685602adf40SYehuda Sadeh 
686602adf40SYehuda Sadeh 	while (chain) {
687602adf40SYehuda Sadeh 		tmp = chain;
688602adf40SYehuda Sadeh 		chain = chain->bi_next;
689602adf40SYehuda Sadeh 		bio_put(tmp);
690602adf40SYehuda Sadeh 	}
691602adf40SYehuda Sadeh }
692602adf40SYehuda Sadeh 
693602adf40SYehuda Sadeh /*
694602adf40SYehuda Sadeh  * zeros a bio chain, starting at specific offset
695602adf40SYehuda Sadeh  */
696602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs)
697602adf40SYehuda Sadeh {
698602adf40SYehuda Sadeh 	struct bio_vec *bv;
699602adf40SYehuda Sadeh 	unsigned long flags;
700602adf40SYehuda Sadeh 	void *buf;
701602adf40SYehuda Sadeh 	int i;
702602adf40SYehuda Sadeh 	int pos = 0;
703602adf40SYehuda Sadeh 
704602adf40SYehuda Sadeh 	while (chain) {
705602adf40SYehuda Sadeh 		bio_for_each_segment(bv, chain, i) {
706602adf40SYehuda Sadeh 			if (pos + bv->bv_len > start_ofs) {
707602adf40SYehuda Sadeh 				int remainder = max(start_ofs - pos, 0);
708602adf40SYehuda Sadeh 				buf = bvec_kmap_irq(bv, &flags);
709602adf40SYehuda Sadeh 				memset(buf + remainder, 0,
710602adf40SYehuda Sadeh 				       bv->bv_len - remainder);
71185b5aaa6SDan Carpenter 				bvec_kunmap_irq(buf, &flags);
712602adf40SYehuda Sadeh 			}
713602adf40SYehuda Sadeh 			pos += bv->bv_len;
714602adf40SYehuda Sadeh 		}
715602adf40SYehuda Sadeh 
716602adf40SYehuda Sadeh 		chain = chain->bi_next;
717602adf40SYehuda Sadeh 	}
718602adf40SYehuda Sadeh }
719602adf40SYehuda Sadeh 
720602adf40SYehuda Sadeh /*
721602adf40SYehuda Sadeh  * bio_chain_clone - clone a chain of bios up to a certain length.
722602adf40SYehuda Sadeh  * might return a bio_pair that will need to be released.
723602adf40SYehuda Sadeh  */
724602adf40SYehuda Sadeh static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
725602adf40SYehuda Sadeh 				   struct bio_pair **bp,
726602adf40SYehuda Sadeh 				   int len, gfp_t gfpmask)
727602adf40SYehuda Sadeh {
728602adf40SYehuda Sadeh 	struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
729602adf40SYehuda Sadeh 	int total = 0;
730602adf40SYehuda Sadeh 
731602adf40SYehuda Sadeh 	if (*bp) {
732602adf40SYehuda Sadeh 		bio_pair_release(*bp);
733602adf40SYehuda Sadeh 		*bp = NULL;
734602adf40SYehuda Sadeh 	}
735602adf40SYehuda Sadeh 
736602adf40SYehuda Sadeh 	while (old_chain && (total < len)) {
737602adf40SYehuda Sadeh 		tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
738602adf40SYehuda Sadeh 		if (!tmp)
739602adf40SYehuda Sadeh 			goto err_out;
740602adf40SYehuda Sadeh 
741602adf40SYehuda Sadeh 		if (total + old_chain->bi_size > len) {
742602adf40SYehuda Sadeh 			struct bio_pair *bp;
743602adf40SYehuda Sadeh 
744602adf40SYehuda Sadeh 			/*
745602adf40SYehuda Sadeh 			 * this split can only happen with a single paged bio,
746602adf40SYehuda Sadeh 			 * split_bio will BUG_ON if this is not the case
747602adf40SYehuda Sadeh 			 */
748602adf40SYehuda Sadeh 			dout("bio_chain_clone split! total=%d remaining=%d"
749bd919d45SAlex Elder 			     "bi_size=%u\n",
750bd919d45SAlex Elder 			     total, len - total, old_chain->bi_size);
751602adf40SYehuda Sadeh 
752602adf40SYehuda Sadeh 			/* split the bio. We'll release it either in the next
753602adf40SYehuda Sadeh 			   call, or it will have to be released outside */
754593a9e7bSAlex Elder 			bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
755602adf40SYehuda Sadeh 			if (!bp)
756602adf40SYehuda Sadeh 				goto err_out;
757602adf40SYehuda Sadeh 
758602adf40SYehuda Sadeh 			__bio_clone(tmp, &bp->bio1);
759602adf40SYehuda Sadeh 
760602adf40SYehuda Sadeh 			*next = &bp->bio2;
761602adf40SYehuda Sadeh 		} else {
762602adf40SYehuda Sadeh 			__bio_clone(tmp, old_chain);
763602adf40SYehuda Sadeh 			*next = old_chain->bi_next;
764602adf40SYehuda Sadeh 		}
765602adf40SYehuda Sadeh 
766602adf40SYehuda Sadeh 		tmp->bi_bdev = NULL;
767602adf40SYehuda Sadeh 		gfpmask &= ~__GFP_WAIT;
768602adf40SYehuda Sadeh 		tmp->bi_next = NULL;
769602adf40SYehuda Sadeh 
770602adf40SYehuda Sadeh 		if (!new_chain) {
771602adf40SYehuda Sadeh 			new_chain = tail = tmp;
772602adf40SYehuda Sadeh 		} else {
773602adf40SYehuda Sadeh 			tail->bi_next = tmp;
774602adf40SYehuda Sadeh 			tail = tmp;
775602adf40SYehuda Sadeh 		}
776602adf40SYehuda Sadeh 		old_chain = old_chain->bi_next;
777602adf40SYehuda Sadeh 
778602adf40SYehuda Sadeh 		total += tmp->bi_size;
779602adf40SYehuda Sadeh 	}
780602adf40SYehuda Sadeh 
781602adf40SYehuda Sadeh 	BUG_ON(total < len);
782602adf40SYehuda Sadeh 
783602adf40SYehuda Sadeh 	if (tail)
784602adf40SYehuda Sadeh 		tail->bi_next = NULL;
785602adf40SYehuda Sadeh 
786602adf40SYehuda Sadeh 	*old = old_chain;
787602adf40SYehuda Sadeh 
788602adf40SYehuda Sadeh 	return new_chain;
789602adf40SYehuda Sadeh 
790602adf40SYehuda Sadeh err_out:
791602adf40SYehuda Sadeh 	dout("bio_chain_clone with err\n");
792602adf40SYehuda Sadeh 	bio_chain_put(new_chain);
793602adf40SYehuda Sadeh 	return NULL;
794602adf40SYehuda Sadeh }
795602adf40SYehuda Sadeh 
796602adf40SYehuda Sadeh /*
797602adf40SYehuda Sadeh  * helpers for osd request op vectors.
798602adf40SYehuda Sadeh  */
79957cfc106SAlex Elder static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
80057cfc106SAlex Elder 					int opcode, u32 payload_len)
801602adf40SYehuda Sadeh {
80257cfc106SAlex Elder 	struct ceph_osd_req_op *ops;
80357cfc106SAlex Elder 
80457cfc106SAlex Elder 	ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
80557cfc106SAlex Elder 	if (!ops)
80657cfc106SAlex Elder 		return NULL;
80757cfc106SAlex Elder 
80857cfc106SAlex Elder 	ops[0].op = opcode;
80957cfc106SAlex Elder 
810602adf40SYehuda Sadeh 	/*
811602adf40SYehuda Sadeh 	 * op extent offset and length will be set later on
812602adf40SYehuda Sadeh 	 * in calc_raw_layout()
813602adf40SYehuda Sadeh 	 */
81457cfc106SAlex Elder 	ops[0].payload_len = payload_len;
81557cfc106SAlex Elder 
81657cfc106SAlex Elder 	return ops;
817602adf40SYehuda Sadeh }
818602adf40SYehuda Sadeh 
/* Free an op vector obtained from rbd_create_rw_ops(). */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
823602adf40SYehuda Sadeh 
8241fec7093SYehuda Sadeh static void rbd_coll_end_req_index(struct request *rq,
8251fec7093SYehuda Sadeh 				   struct rbd_req_coll *coll,
8261fec7093SYehuda Sadeh 				   int index,
8271fec7093SYehuda Sadeh 				   int ret, u64 len)
8281fec7093SYehuda Sadeh {
8291fec7093SYehuda Sadeh 	struct request_queue *q;
8301fec7093SYehuda Sadeh 	int min, max, i;
8311fec7093SYehuda Sadeh 
832bd919d45SAlex Elder 	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
833bd919d45SAlex Elder 	     coll, index, ret, (unsigned long long) len);
8341fec7093SYehuda Sadeh 
8351fec7093SYehuda Sadeh 	if (!rq)
8361fec7093SYehuda Sadeh 		return;
8371fec7093SYehuda Sadeh 
8381fec7093SYehuda Sadeh 	if (!coll) {
8391fec7093SYehuda Sadeh 		blk_end_request(rq, ret, len);
8401fec7093SYehuda Sadeh 		return;
8411fec7093SYehuda Sadeh 	}
8421fec7093SYehuda Sadeh 
8431fec7093SYehuda Sadeh 	q = rq->q;
8441fec7093SYehuda Sadeh 
8451fec7093SYehuda Sadeh 	spin_lock_irq(q->queue_lock);
8461fec7093SYehuda Sadeh 	coll->status[index].done = 1;
8471fec7093SYehuda Sadeh 	coll->status[index].rc = ret;
8481fec7093SYehuda Sadeh 	coll->status[index].bytes = len;
8491fec7093SYehuda Sadeh 	max = min = coll->num_done;
8501fec7093SYehuda Sadeh 	while (max < coll->total && coll->status[max].done)
8511fec7093SYehuda Sadeh 		max++;
8521fec7093SYehuda Sadeh 
8531fec7093SYehuda Sadeh 	for (i = min; i<max; i++) {
8541fec7093SYehuda Sadeh 		__blk_end_request(rq, coll->status[i].rc,
8551fec7093SYehuda Sadeh 				  coll->status[i].bytes);
8561fec7093SYehuda Sadeh 		coll->num_done++;
8571fec7093SYehuda Sadeh 		kref_put(&coll->kref, rbd_coll_release);
8581fec7093SYehuda Sadeh 	}
8591fec7093SYehuda Sadeh 	spin_unlock_irq(q->queue_lock);
8601fec7093SYehuda Sadeh }
8611fec7093SYehuda Sadeh 
8621fec7093SYehuda Sadeh static void rbd_coll_end_req(struct rbd_request *req,
8631fec7093SYehuda Sadeh 			     int ret, u64 len)
8641fec7093SYehuda Sadeh {
8651fec7093SYehuda Sadeh 	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
8661fec7093SYehuda Sadeh }
8671fec7093SYehuda Sadeh 
868602adf40SYehuda Sadeh /*
869602adf40SYehuda Sadeh  * Send ceph osd request
870602adf40SYehuda Sadeh  */
871602adf40SYehuda Sadeh static int rbd_do_request(struct request *rq,
8720ce1a794SAlex Elder 			  struct rbd_device *rbd_dev,
873602adf40SYehuda Sadeh 			  struct ceph_snap_context *snapc,
874602adf40SYehuda Sadeh 			  u64 snapid,
875aded07eaSAlex Elder 			  const char *object_name, u64 ofs, u64 len,
876602adf40SYehuda Sadeh 			  struct bio *bio,
877602adf40SYehuda Sadeh 			  struct page **pages,
878602adf40SYehuda Sadeh 			  int num_pages,
879602adf40SYehuda Sadeh 			  int flags,
880602adf40SYehuda Sadeh 			  struct ceph_osd_req_op *ops,
8811fec7093SYehuda Sadeh 			  struct rbd_req_coll *coll,
8821fec7093SYehuda Sadeh 			  int coll_index,
883602adf40SYehuda Sadeh 			  void (*rbd_cb)(struct ceph_osd_request *req,
88459c2be1eSYehuda Sadeh 					 struct ceph_msg *msg),
88559c2be1eSYehuda Sadeh 			  struct ceph_osd_request **linger_req,
88659c2be1eSYehuda Sadeh 			  u64 *ver)
887602adf40SYehuda Sadeh {
888602adf40SYehuda Sadeh 	struct ceph_osd_request *req;
889602adf40SYehuda Sadeh 	struct ceph_file_layout *layout;
890602adf40SYehuda Sadeh 	int ret;
891602adf40SYehuda Sadeh 	u64 bno;
892602adf40SYehuda Sadeh 	struct timespec mtime = CURRENT_TIME;
893602adf40SYehuda Sadeh 	struct rbd_request *req_data;
894602adf40SYehuda Sadeh 	struct ceph_osd_request_head *reqhead;
8951dbb4399SAlex Elder 	struct ceph_osd_client *osdc;
896602adf40SYehuda Sadeh 
897602adf40SYehuda Sadeh 	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
8981fec7093SYehuda Sadeh 	if (!req_data) {
8991fec7093SYehuda Sadeh 		if (coll)
9001fec7093SYehuda Sadeh 			rbd_coll_end_req_index(rq, coll, coll_index,
9011fec7093SYehuda Sadeh 					       -ENOMEM, len);
9021fec7093SYehuda Sadeh 		return -ENOMEM;
9031fec7093SYehuda Sadeh 	}
904602adf40SYehuda Sadeh 
9051fec7093SYehuda Sadeh 	if (coll) {
9061fec7093SYehuda Sadeh 		req_data->coll = coll;
9071fec7093SYehuda Sadeh 		req_data->coll_index = coll_index;
9081fec7093SYehuda Sadeh 	}
9091fec7093SYehuda Sadeh 
910bd919d45SAlex Elder 	dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name,
911bd919d45SAlex Elder 		(unsigned long long) ofs, (unsigned long long) len);
912602adf40SYehuda Sadeh 
9130ce1a794SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
9141dbb4399SAlex Elder 	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
9151dbb4399SAlex Elder 					false, GFP_NOIO, pages, bio);
9164ad12621SSage Weil 	if (!req) {
9174ad12621SSage Weil 		ret = -ENOMEM;
918602adf40SYehuda Sadeh 		goto done_pages;
919602adf40SYehuda Sadeh 	}
920602adf40SYehuda Sadeh 
921602adf40SYehuda Sadeh 	req->r_callback = rbd_cb;
922602adf40SYehuda Sadeh 
923602adf40SYehuda Sadeh 	req_data->rq = rq;
924602adf40SYehuda Sadeh 	req_data->bio = bio;
925602adf40SYehuda Sadeh 	req_data->pages = pages;
926602adf40SYehuda Sadeh 	req_data->len = len;
927602adf40SYehuda Sadeh 
928602adf40SYehuda Sadeh 	req->r_priv = req_data;
929602adf40SYehuda Sadeh 
930602adf40SYehuda Sadeh 	reqhead = req->r_request->front.iov_base;
931602adf40SYehuda Sadeh 	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
932602adf40SYehuda Sadeh 
933aded07eaSAlex Elder 	strncpy(req->r_oid, object_name, sizeof(req->r_oid));
934602adf40SYehuda Sadeh 	req->r_oid_len = strlen(req->r_oid);
935602adf40SYehuda Sadeh 
936602adf40SYehuda Sadeh 	layout = &req->r_file_layout;
937602adf40SYehuda Sadeh 	memset(layout, 0, sizeof(*layout));
938602adf40SYehuda Sadeh 	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
939602adf40SYehuda Sadeh 	layout->fl_stripe_count = cpu_to_le32(1);
940602adf40SYehuda Sadeh 	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
9410ce1a794SAlex Elder 	layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
9421dbb4399SAlex Elder 	ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
9431dbb4399SAlex Elder 				req, ops);
944602adf40SYehuda Sadeh 
945602adf40SYehuda Sadeh 	ceph_osdc_build_request(req, ofs, &len,
946602adf40SYehuda Sadeh 				ops,
947602adf40SYehuda Sadeh 				snapc,
948602adf40SYehuda Sadeh 				&mtime,
949602adf40SYehuda Sadeh 				req->r_oid, req->r_oid_len);
950602adf40SYehuda Sadeh 
95159c2be1eSYehuda Sadeh 	if (linger_req) {
9521dbb4399SAlex Elder 		ceph_osdc_set_request_linger(osdc, req);
95359c2be1eSYehuda Sadeh 		*linger_req = req;
95459c2be1eSYehuda Sadeh 	}
95559c2be1eSYehuda Sadeh 
9561dbb4399SAlex Elder 	ret = ceph_osdc_start_request(osdc, req, false);
957602adf40SYehuda Sadeh 	if (ret < 0)
958602adf40SYehuda Sadeh 		goto done_err;
959602adf40SYehuda Sadeh 
960602adf40SYehuda Sadeh 	if (!rbd_cb) {
9611dbb4399SAlex Elder 		ret = ceph_osdc_wait_request(osdc, req);
96259c2be1eSYehuda Sadeh 		if (ver)
96359c2be1eSYehuda Sadeh 			*ver = le64_to_cpu(req->r_reassert_version.version);
964bd919d45SAlex Elder 		dout("reassert_ver=%llu\n",
965bd919d45SAlex Elder 			(unsigned long long)
9661fec7093SYehuda Sadeh 				le64_to_cpu(req->r_reassert_version.version));
967602adf40SYehuda Sadeh 		ceph_osdc_put_request(req);
968602adf40SYehuda Sadeh 	}
969602adf40SYehuda Sadeh 	return ret;
970602adf40SYehuda Sadeh 
971602adf40SYehuda Sadeh done_err:
972602adf40SYehuda Sadeh 	bio_chain_put(req_data->bio);
973602adf40SYehuda Sadeh 	ceph_osdc_put_request(req);
974602adf40SYehuda Sadeh done_pages:
9751fec7093SYehuda Sadeh 	rbd_coll_end_req(req_data, ret, len);
976602adf40SYehuda Sadeh 	kfree(req_data);
977602adf40SYehuda Sadeh 	return ret;
978602adf40SYehuda Sadeh }
979602adf40SYehuda Sadeh 
980602adf40SYehuda Sadeh /*
981602adf40SYehuda Sadeh  * Ceph osd op callback
982602adf40SYehuda Sadeh  */
983602adf40SYehuda Sadeh static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
984602adf40SYehuda Sadeh {
985602adf40SYehuda Sadeh 	struct rbd_request *req_data = req->r_priv;
986602adf40SYehuda Sadeh 	struct ceph_osd_reply_head *replyhead;
987602adf40SYehuda Sadeh 	struct ceph_osd_op *op;
988602adf40SYehuda Sadeh 	__s32 rc;
989602adf40SYehuda Sadeh 	u64 bytes;
990602adf40SYehuda Sadeh 	int read_op;
991602adf40SYehuda Sadeh 
992602adf40SYehuda Sadeh 	/* parse reply */
993602adf40SYehuda Sadeh 	replyhead = msg->front.iov_base;
994602adf40SYehuda Sadeh 	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
995602adf40SYehuda Sadeh 	op = (void *)(replyhead + 1);
996602adf40SYehuda Sadeh 	rc = le32_to_cpu(replyhead->result);
997602adf40SYehuda Sadeh 	bytes = le64_to_cpu(op->extent.length);
998895cfcc8SDan Carpenter 	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
999602adf40SYehuda Sadeh 
1000bd919d45SAlex Elder 	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
1001bd919d45SAlex Elder 		(unsigned long long) bytes, read_op, (int) rc);
1002602adf40SYehuda Sadeh 
1003602adf40SYehuda Sadeh 	if (rc == -ENOENT && read_op) {
1004602adf40SYehuda Sadeh 		zero_bio_chain(req_data->bio, 0);
1005602adf40SYehuda Sadeh 		rc = 0;
1006602adf40SYehuda Sadeh 	} else if (rc == 0 && read_op && bytes < req_data->len) {
1007602adf40SYehuda Sadeh 		zero_bio_chain(req_data->bio, bytes);
1008602adf40SYehuda Sadeh 		bytes = req_data->len;
1009602adf40SYehuda Sadeh 	}
1010602adf40SYehuda Sadeh 
10111fec7093SYehuda Sadeh 	rbd_coll_end_req(req_data, rc, bytes);
1012602adf40SYehuda Sadeh 
1013602adf40SYehuda Sadeh 	if (req_data->bio)
1014602adf40SYehuda Sadeh 		bio_chain_put(req_data->bio);
1015602adf40SYehuda Sadeh 
1016602adf40SYehuda Sadeh 	ceph_osdc_put_request(req);
1017602adf40SYehuda Sadeh 	kfree(req_data);
1018602adf40SYehuda Sadeh }
1019602adf40SYehuda Sadeh 
/*
 * Minimal osd request callback: only drops the request reference.
 *
 * NOTE(review): req->r_priv is not freed here -- confirm who releases
 * the bookkeeping attached to requests completed through this callback.
 */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
102459c2be1eSYehuda Sadeh 
1025602adf40SYehuda Sadeh /*
1026602adf40SYehuda Sadeh  * Do a synchronous ceph osd operation
1027602adf40SYehuda Sadeh  */
10280ce1a794SAlex Elder static int rbd_req_sync_op(struct rbd_device *rbd_dev,
1029602adf40SYehuda Sadeh 			   struct ceph_snap_context *snapc,
1030602adf40SYehuda Sadeh 			   u64 snapid,
1031602adf40SYehuda Sadeh 			   int flags,
1032913d2fdcSAlex Elder 			   struct ceph_osd_req_op *ops,
1033aded07eaSAlex Elder 			   const char *object_name,
1034602adf40SYehuda Sadeh 			   u64 ofs, u64 len,
103559c2be1eSYehuda Sadeh 			   char *buf,
103659c2be1eSYehuda Sadeh 			   struct ceph_osd_request **linger_req,
103759c2be1eSYehuda Sadeh 			   u64 *ver)
1038602adf40SYehuda Sadeh {
1039602adf40SYehuda Sadeh 	int ret;
1040602adf40SYehuda Sadeh 	struct page **pages;
1041602adf40SYehuda Sadeh 	int num_pages;
1042913d2fdcSAlex Elder 
1043913d2fdcSAlex Elder 	BUG_ON(ops == NULL);
1044602adf40SYehuda Sadeh 
1045602adf40SYehuda Sadeh 	num_pages = calc_pages_for(ofs , len);
1046602adf40SYehuda Sadeh 	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
1047b8d0638aSDan Carpenter 	if (IS_ERR(pages))
1048b8d0638aSDan Carpenter 		return PTR_ERR(pages);
1049602adf40SYehuda Sadeh 
10500ce1a794SAlex Elder 	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
1051aded07eaSAlex Elder 			  object_name, ofs, len, NULL,
1052602adf40SYehuda Sadeh 			  pages, num_pages,
1053602adf40SYehuda Sadeh 			  flags,
1054602adf40SYehuda Sadeh 			  ops,
10551fec7093SYehuda Sadeh 			  NULL, 0,
105659c2be1eSYehuda Sadeh 			  NULL,
105759c2be1eSYehuda Sadeh 			  linger_req, ver);
1058602adf40SYehuda Sadeh 	if (ret < 0)
1059913d2fdcSAlex Elder 		goto done;
1060602adf40SYehuda Sadeh 
1061602adf40SYehuda Sadeh 	if ((flags & CEPH_OSD_FLAG_READ) && buf)
1062602adf40SYehuda Sadeh 		ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);
1063602adf40SYehuda Sadeh 
1064602adf40SYehuda Sadeh done:
1065602adf40SYehuda Sadeh 	ceph_release_page_vector(pages, num_pages);
1066602adf40SYehuda Sadeh 	return ret;
1067602adf40SYehuda Sadeh }
1068602adf40SYehuda Sadeh 
1069602adf40SYehuda Sadeh /*
1070602adf40SYehuda Sadeh  * Do an asynchronous ceph osd operation
1071602adf40SYehuda Sadeh  */
1072602adf40SYehuda Sadeh static int rbd_do_op(struct request *rq,
1073602adf40SYehuda Sadeh 		     struct rbd_device *rbd_dev,
1074602adf40SYehuda Sadeh 		     struct ceph_snap_context *snapc,
1075602adf40SYehuda Sadeh 		     u64 snapid,
1076d1f57ea6SAlex Elder 		     int opcode, int flags,
1077602adf40SYehuda Sadeh 		     u64 ofs, u64 len,
10781fec7093SYehuda Sadeh 		     struct bio *bio,
10791fec7093SYehuda Sadeh 		     struct rbd_req_coll *coll,
10801fec7093SYehuda Sadeh 		     int coll_index)
1081602adf40SYehuda Sadeh {
1082602adf40SYehuda Sadeh 	char *seg_name;
1083602adf40SYehuda Sadeh 	u64 seg_ofs;
1084602adf40SYehuda Sadeh 	u64 seg_len;
1085602adf40SYehuda Sadeh 	int ret;
1086602adf40SYehuda Sadeh 	struct ceph_osd_req_op *ops;
1087602adf40SYehuda Sadeh 	u32 payload_len;
1088602adf40SYehuda Sadeh 
1089602adf40SYehuda Sadeh 	seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
1090602adf40SYehuda Sadeh 	if (!seg_name)
1091602adf40SYehuda Sadeh 		return -ENOMEM;
1092602adf40SYehuda Sadeh 
1093602adf40SYehuda Sadeh 	seg_len = rbd_get_segment(&rbd_dev->header,
1094ca1e49a6SAlex Elder 				  rbd_dev->header.object_prefix,
1095602adf40SYehuda Sadeh 				  ofs, len,
1096602adf40SYehuda Sadeh 				  seg_name, &seg_ofs);
1097602adf40SYehuda Sadeh 
1098602adf40SYehuda Sadeh 	payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
1099602adf40SYehuda Sadeh 
110057cfc106SAlex Elder 	ret = -ENOMEM;
110157cfc106SAlex Elder 	ops = rbd_create_rw_ops(1, opcode, payload_len);
110257cfc106SAlex Elder 	if (!ops)
1103602adf40SYehuda Sadeh 		goto done;
1104602adf40SYehuda Sadeh 
1105602adf40SYehuda Sadeh 	/* we've taken care of segment sizes earlier when we
1106602adf40SYehuda Sadeh 	   cloned the bios. We should never have a segment
1107602adf40SYehuda Sadeh 	   truncated at this point */
1108602adf40SYehuda Sadeh 	BUG_ON(seg_len < len);
1109602adf40SYehuda Sadeh 
1110602adf40SYehuda Sadeh 	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1111602adf40SYehuda Sadeh 			     seg_name, seg_ofs, seg_len,
1112602adf40SYehuda Sadeh 			     bio,
1113602adf40SYehuda Sadeh 			     NULL, 0,
1114602adf40SYehuda Sadeh 			     flags,
1115602adf40SYehuda Sadeh 			     ops,
11161fec7093SYehuda Sadeh 			     coll, coll_index,
111759c2be1eSYehuda Sadeh 			     rbd_req_cb, 0, NULL);
111811f77002SSage Weil 
111911f77002SSage Weil 	rbd_destroy_ops(ops);
1120602adf40SYehuda Sadeh done:
1121602adf40SYehuda Sadeh 	kfree(seg_name);
1122602adf40SYehuda Sadeh 	return ret;
1123602adf40SYehuda Sadeh }
1124602adf40SYehuda Sadeh 
1125602adf40SYehuda Sadeh /*
1126602adf40SYehuda Sadeh  * Request async osd write
1127602adf40SYehuda Sadeh  */
1128602adf40SYehuda Sadeh static int rbd_req_write(struct request *rq,
1129602adf40SYehuda Sadeh 			 struct rbd_device *rbd_dev,
1130602adf40SYehuda Sadeh 			 struct ceph_snap_context *snapc,
1131602adf40SYehuda Sadeh 			 u64 ofs, u64 len,
11321fec7093SYehuda Sadeh 			 struct bio *bio,
11331fec7093SYehuda Sadeh 			 struct rbd_req_coll *coll,
11341fec7093SYehuda Sadeh 			 int coll_index)
1135602adf40SYehuda Sadeh {
1136602adf40SYehuda Sadeh 	return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1137602adf40SYehuda Sadeh 			 CEPH_OSD_OP_WRITE,
1138602adf40SYehuda Sadeh 			 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
11391fec7093SYehuda Sadeh 			 ofs, len, bio, coll, coll_index);
1140602adf40SYehuda Sadeh }
1141602adf40SYehuda Sadeh 
1142602adf40SYehuda Sadeh /*
1143602adf40SYehuda Sadeh  * Request async osd read
1144602adf40SYehuda Sadeh  */
1145602adf40SYehuda Sadeh static int rbd_req_read(struct request *rq,
1146602adf40SYehuda Sadeh 			 struct rbd_device *rbd_dev,
1147602adf40SYehuda Sadeh 			 u64 snapid,
1148602adf40SYehuda Sadeh 			 u64 ofs, u64 len,
11491fec7093SYehuda Sadeh 			 struct bio *bio,
11501fec7093SYehuda Sadeh 			 struct rbd_req_coll *coll,
11511fec7093SYehuda Sadeh 			 int coll_index)
1152602adf40SYehuda Sadeh {
1153602adf40SYehuda Sadeh 	return rbd_do_op(rq, rbd_dev, NULL,
1154b06e6a6bSJosh Durgin 			 snapid,
1155602adf40SYehuda Sadeh 			 CEPH_OSD_OP_READ,
1156602adf40SYehuda Sadeh 			 CEPH_OSD_FLAG_READ,
11571fec7093SYehuda Sadeh 			 ofs, len, bio, coll, coll_index);
1158602adf40SYehuda Sadeh }
1159602adf40SYehuda Sadeh 
1160602adf40SYehuda Sadeh /*
1161602adf40SYehuda Sadeh  * Request sync osd read
1162602adf40SYehuda Sadeh  */
11630ce1a794SAlex Elder static int rbd_req_sync_read(struct rbd_device *rbd_dev,
1164602adf40SYehuda Sadeh 			  u64 snapid,
1165aded07eaSAlex Elder 			  const char *object_name,
1166602adf40SYehuda Sadeh 			  u64 ofs, u64 len,
116759c2be1eSYehuda Sadeh 			  char *buf,
116859c2be1eSYehuda Sadeh 			  u64 *ver)
1169602adf40SYehuda Sadeh {
1170913d2fdcSAlex Elder 	struct ceph_osd_req_op *ops;
1171913d2fdcSAlex Elder 	int ret;
1172913d2fdcSAlex Elder 
1173913d2fdcSAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1174913d2fdcSAlex Elder 	if (!ops)
1175913d2fdcSAlex Elder 		return -ENOMEM;
1176913d2fdcSAlex Elder 
1177913d2fdcSAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
1178b06e6a6bSJosh Durgin 			       snapid,
1179602adf40SYehuda Sadeh 			       CEPH_OSD_FLAG_READ,
1180913d2fdcSAlex Elder 			       ops, object_name, ofs, len, buf, NULL, ver);
1181913d2fdcSAlex Elder 	rbd_destroy_ops(ops);
1182913d2fdcSAlex Elder 
1183913d2fdcSAlex Elder 	return ret;
1184602adf40SYehuda Sadeh }
1185602adf40SYehuda Sadeh 
1186602adf40SYehuda Sadeh /*
118759c2be1eSYehuda Sadeh  * Request sync osd watch
118859c2be1eSYehuda Sadeh  */
11890ce1a794SAlex Elder static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
119059c2be1eSYehuda Sadeh 				   u64 ver,
11917f0a24d8SAlex Elder 				   u64 notify_id)
119259c2be1eSYehuda Sadeh {
119359c2be1eSYehuda Sadeh 	struct ceph_osd_req_op *ops;
119411f77002SSage Weil 	int ret;
119511f77002SSage Weil 
119657cfc106SAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
119757cfc106SAlex Elder 	if (!ops)
119857cfc106SAlex Elder 		return -ENOMEM;
119959c2be1eSYehuda Sadeh 
1200a71b891bSJosh Durgin 	ops[0].watch.ver = cpu_to_le64(ver);
120159c2be1eSYehuda Sadeh 	ops[0].watch.cookie = notify_id;
120259c2be1eSYehuda Sadeh 	ops[0].watch.flag = 0;
120359c2be1eSYehuda Sadeh 
12040ce1a794SAlex Elder 	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
12057f0a24d8SAlex Elder 			  rbd_dev->header_name, 0, 0, NULL,
1206ad4f232fSAlex Elder 			  NULL, 0,
120759c2be1eSYehuda Sadeh 			  CEPH_OSD_FLAG_READ,
120859c2be1eSYehuda Sadeh 			  ops,
12091fec7093SYehuda Sadeh 			  NULL, 0,
121059c2be1eSYehuda Sadeh 			  rbd_simple_req_cb, 0, NULL);
121159c2be1eSYehuda Sadeh 
121259c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
121359c2be1eSYehuda Sadeh 	return ret;
121459c2be1eSYehuda Sadeh }
121559c2be1eSYehuda Sadeh 
121659c2be1eSYehuda Sadeh static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
121759c2be1eSYehuda Sadeh {
12180ce1a794SAlex Elder 	struct rbd_device *rbd_dev = (struct rbd_device *)data;
1219a71b891bSJosh Durgin 	u64 hver;
122013143d2dSSage Weil 	int rc;
122113143d2dSSage Weil 
12220ce1a794SAlex Elder 	if (!rbd_dev)
122359c2be1eSYehuda Sadeh 		return;
122459c2be1eSYehuda Sadeh 
1225bd919d45SAlex Elder 	dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1226bd919d45SAlex Elder 		rbd_dev->header_name, (unsigned long long) notify_id,
1227bd919d45SAlex Elder 		(unsigned int) opcode);
12281fe5e993SAlex Elder 	rc = rbd_refresh_header(rbd_dev, &hver);
122913143d2dSSage Weil 	if (rc)
1230f0f8cef5SAlex Elder 		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
12310ce1a794SAlex Elder 			   " update snaps: %d\n", rbd_dev->major, rc);
123259c2be1eSYehuda Sadeh 
12337f0a24d8SAlex Elder 	rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
123459c2be1eSYehuda Sadeh }
123559c2be1eSYehuda Sadeh 
123659c2be1eSYehuda Sadeh /*
123759c2be1eSYehuda Sadeh  * Request sync osd watch
123859c2be1eSYehuda Sadeh  */
12390e6f322dSAlex Elder static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
124059c2be1eSYehuda Sadeh {
124159c2be1eSYehuda Sadeh 	struct ceph_osd_req_op *ops;
12420ce1a794SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
124357cfc106SAlex Elder 	int ret;
124459c2be1eSYehuda Sadeh 
124557cfc106SAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
124657cfc106SAlex Elder 	if (!ops)
124757cfc106SAlex Elder 		return -ENOMEM;
124859c2be1eSYehuda Sadeh 
124959c2be1eSYehuda Sadeh 	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
12500ce1a794SAlex Elder 				     (void *)rbd_dev, &rbd_dev->watch_event);
125159c2be1eSYehuda Sadeh 	if (ret < 0)
125259c2be1eSYehuda Sadeh 		goto fail;
125359c2be1eSYehuda Sadeh 
12540e6f322dSAlex Elder 	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
12550ce1a794SAlex Elder 	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
125659c2be1eSYehuda Sadeh 	ops[0].watch.flag = 1;
125759c2be1eSYehuda Sadeh 
12580ce1a794SAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
125959c2be1eSYehuda Sadeh 			      CEPH_NOSNAP,
126059c2be1eSYehuda Sadeh 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
126159c2be1eSYehuda Sadeh 			      ops,
12620e6f322dSAlex Elder 			      rbd_dev->header_name,
12630e6f322dSAlex Elder 			      0, 0, NULL,
12640ce1a794SAlex Elder 			      &rbd_dev->watch_request, NULL);
126559c2be1eSYehuda Sadeh 
126659c2be1eSYehuda Sadeh 	if (ret < 0)
126759c2be1eSYehuda Sadeh 		goto fail_event;
126859c2be1eSYehuda Sadeh 
126959c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
127059c2be1eSYehuda Sadeh 	return 0;
127159c2be1eSYehuda Sadeh 
127259c2be1eSYehuda Sadeh fail_event:
12730ce1a794SAlex Elder 	ceph_osdc_cancel_event(rbd_dev->watch_event);
12740ce1a794SAlex Elder 	rbd_dev->watch_event = NULL;
127559c2be1eSYehuda Sadeh fail:
127659c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
127759c2be1eSYehuda Sadeh 	return ret;
127859c2be1eSYehuda Sadeh }
127959c2be1eSYehuda Sadeh 
128079e3057cSYehuda Sadeh /*
128179e3057cSYehuda Sadeh  * Request sync osd unwatch
128279e3057cSYehuda Sadeh  */
1283070c633fSAlex Elder static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
128479e3057cSYehuda Sadeh {
128579e3057cSYehuda Sadeh 	struct ceph_osd_req_op *ops;
128657cfc106SAlex Elder 	int ret;
128779e3057cSYehuda Sadeh 
128857cfc106SAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
128957cfc106SAlex Elder 	if (!ops)
129057cfc106SAlex Elder 		return -ENOMEM;
129179e3057cSYehuda Sadeh 
129279e3057cSYehuda Sadeh 	ops[0].watch.ver = 0;
12930ce1a794SAlex Elder 	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
129479e3057cSYehuda Sadeh 	ops[0].watch.flag = 0;
129579e3057cSYehuda Sadeh 
12960ce1a794SAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
129779e3057cSYehuda Sadeh 			      CEPH_NOSNAP,
129879e3057cSYehuda Sadeh 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
129979e3057cSYehuda Sadeh 			      ops,
1300070c633fSAlex Elder 			      rbd_dev->header_name,
1301070c633fSAlex Elder 			      0, 0, NULL, NULL, NULL);
1302070c633fSAlex Elder 
130379e3057cSYehuda Sadeh 
130479e3057cSYehuda Sadeh 	rbd_destroy_ops(ops);
13050ce1a794SAlex Elder 	ceph_osdc_cancel_event(rbd_dev->watch_event);
13060ce1a794SAlex Elder 	rbd_dev->watch_event = NULL;
130779e3057cSYehuda Sadeh 	return ret;
130879e3057cSYehuda Sadeh }
130979e3057cSYehuda Sadeh 
/*
 * Context for a notify event: rbd_req_sync_notify() fills one in and
 * passes its address as the opaque data pointer when creating the event.
 */
struct rbd_notify_info {
	struct rbd_device *rbd_dev;	/* device the notify was sent for */
};
131359c2be1eSYehuda Sadeh 
131459c2be1eSYehuda Sadeh static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
131559c2be1eSYehuda Sadeh {
13160ce1a794SAlex Elder 	struct rbd_device *rbd_dev = (struct rbd_device *)data;
13170ce1a794SAlex Elder 	if (!rbd_dev)
131859c2be1eSYehuda Sadeh 		return;
131959c2be1eSYehuda Sadeh 
1320bd919d45SAlex Elder 	dout("rbd_notify_cb %s notify_id=%llu opcode=%u\n",
1321bd919d45SAlex Elder 			rbd_dev->header_name, (unsigned long long) notify_id,
1322bd919d45SAlex Elder 			(unsigned int) opcode);
132359c2be1eSYehuda Sadeh }
132459c2be1eSYehuda Sadeh 
132559c2be1eSYehuda Sadeh /*
132659c2be1eSYehuda Sadeh  * Request sync osd notify
132759c2be1eSYehuda Sadeh  */
13284cb16250SAlex Elder static int rbd_req_sync_notify(struct rbd_device *rbd_dev)
132959c2be1eSYehuda Sadeh {
133059c2be1eSYehuda Sadeh 	struct ceph_osd_req_op *ops;
13310ce1a794SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
133259c2be1eSYehuda Sadeh 	struct ceph_osd_event *event;
133359c2be1eSYehuda Sadeh 	struct rbd_notify_info info;
133459c2be1eSYehuda Sadeh 	int payload_len = sizeof(u32) + sizeof(u32);
133559c2be1eSYehuda Sadeh 	int ret;
133659c2be1eSYehuda Sadeh 
133757cfc106SAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY, payload_len);
133857cfc106SAlex Elder 	if (!ops)
133957cfc106SAlex Elder 		return -ENOMEM;
134059c2be1eSYehuda Sadeh 
13410ce1a794SAlex Elder 	info.rbd_dev = rbd_dev;
134259c2be1eSYehuda Sadeh 
134359c2be1eSYehuda Sadeh 	ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
134459c2be1eSYehuda Sadeh 				     (void *)&info, &event);
134559c2be1eSYehuda Sadeh 	if (ret < 0)
134659c2be1eSYehuda Sadeh 		goto fail;
134759c2be1eSYehuda Sadeh 
134859c2be1eSYehuda Sadeh 	ops[0].watch.ver = 1;
134959c2be1eSYehuda Sadeh 	ops[0].watch.flag = 1;
135059c2be1eSYehuda Sadeh 	ops[0].watch.cookie = event->cookie;
135159c2be1eSYehuda Sadeh 	ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
135259c2be1eSYehuda Sadeh 	ops[0].watch.timeout = 12;
135359c2be1eSYehuda Sadeh 
13540ce1a794SAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
135559c2be1eSYehuda Sadeh 			       CEPH_NOSNAP,
135659c2be1eSYehuda Sadeh 			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
135759c2be1eSYehuda Sadeh 			       ops,
13584cb16250SAlex Elder 			       rbd_dev->header_name,
13594cb16250SAlex Elder 			       0, 0, NULL, NULL, NULL);
136059c2be1eSYehuda Sadeh 	if (ret < 0)
136159c2be1eSYehuda Sadeh 		goto fail_event;
136259c2be1eSYehuda Sadeh 
136359c2be1eSYehuda Sadeh 	ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
136459c2be1eSYehuda Sadeh 	dout("ceph_osdc_wait_event returned %d\n", ret);
136559c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
136659c2be1eSYehuda Sadeh 	return 0;
136759c2be1eSYehuda Sadeh 
136859c2be1eSYehuda Sadeh fail_event:
136959c2be1eSYehuda Sadeh 	ceph_osdc_cancel_event(event);
137059c2be1eSYehuda Sadeh fail:
137159c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
137259c2be1eSYehuda Sadeh 	return ret;
137359c2be1eSYehuda Sadeh }
137459c2be1eSYehuda Sadeh 
137559c2be1eSYehuda Sadeh /*
1376602adf40SYehuda Sadeh  * Request sync osd read
1377602adf40SYehuda Sadeh  */
13780ce1a794SAlex Elder static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
1379aded07eaSAlex Elder 			     const char *object_name,
1380aded07eaSAlex Elder 			     const char *class_name,
1381aded07eaSAlex Elder 			     const char *method_name,
1382602adf40SYehuda Sadeh 			     const char *data,
138359c2be1eSYehuda Sadeh 			     int len,
138459c2be1eSYehuda Sadeh 			     u64 *ver)
1385602adf40SYehuda Sadeh {
1386602adf40SYehuda Sadeh 	struct ceph_osd_req_op *ops;
1387aded07eaSAlex Elder 	int class_name_len = strlen(class_name);
1388aded07eaSAlex Elder 	int method_name_len = strlen(method_name);
138957cfc106SAlex Elder 	int ret;
139057cfc106SAlex Elder 
139157cfc106SAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL,
1392aded07eaSAlex Elder 				    class_name_len + method_name_len + len);
139357cfc106SAlex Elder 	if (!ops)
139457cfc106SAlex Elder 		return -ENOMEM;
1395602adf40SYehuda Sadeh 
1396aded07eaSAlex Elder 	ops[0].cls.class_name = class_name;
1397aded07eaSAlex Elder 	ops[0].cls.class_len = (__u8) class_name_len;
1398aded07eaSAlex Elder 	ops[0].cls.method_name = method_name;
1399aded07eaSAlex Elder 	ops[0].cls.method_len = (__u8) method_name_len;
1400602adf40SYehuda Sadeh 	ops[0].cls.argc = 0;
1401602adf40SYehuda Sadeh 	ops[0].cls.indata = data;
1402602adf40SYehuda Sadeh 	ops[0].cls.indata_len = len;
1403602adf40SYehuda Sadeh 
14040ce1a794SAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
1405602adf40SYehuda Sadeh 			       CEPH_NOSNAP,
1406602adf40SYehuda Sadeh 			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1407602adf40SYehuda Sadeh 			       ops,
1408d1f57ea6SAlex Elder 			       object_name, 0, 0, NULL, NULL, ver);
1409602adf40SYehuda Sadeh 
1410602adf40SYehuda Sadeh 	rbd_destroy_ops(ops);
1411602adf40SYehuda Sadeh 
1412602adf40SYehuda Sadeh 	dout("cls_exec returned %d\n", ret);
1413602adf40SYehuda Sadeh 	return ret;
1414602adf40SYehuda Sadeh }
1415602adf40SYehuda Sadeh 
14161fec7093SYehuda Sadeh static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
14171fec7093SYehuda Sadeh {
14181fec7093SYehuda Sadeh 	struct rbd_req_coll *coll =
14191fec7093SYehuda Sadeh 			kzalloc(sizeof(struct rbd_req_coll) +
14201fec7093SYehuda Sadeh 			        sizeof(struct rbd_req_status) * num_reqs,
14211fec7093SYehuda Sadeh 				GFP_ATOMIC);
14221fec7093SYehuda Sadeh 
14231fec7093SYehuda Sadeh 	if (!coll)
14241fec7093SYehuda Sadeh 		return NULL;
14251fec7093SYehuda Sadeh 	coll->total = num_reqs;
14261fec7093SYehuda Sadeh 	kref_init(&coll->kref);
14271fec7093SYehuda Sadeh 	return coll;
14281fec7093SYehuda Sadeh }
14291fec7093SYehuda Sadeh 
1430602adf40SYehuda Sadeh /*
1431602adf40SYehuda Sadeh  * block device queue callback
1432602adf40SYehuda Sadeh  */
1433602adf40SYehuda Sadeh static void rbd_rq_fn(struct request_queue *q)
1434602adf40SYehuda Sadeh {
1435602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
1436602adf40SYehuda Sadeh 	struct request *rq;
1437602adf40SYehuda Sadeh 	struct bio_pair *bp = NULL;
1438602adf40SYehuda Sadeh 
143900f1f36fSAlex Elder 	while ((rq = blk_fetch_request(q))) {
1440602adf40SYehuda Sadeh 		struct bio *bio;
1441602adf40SYehuda Sadeh 		struct bio *rq_bio, *next_bio = NULL;
1442602adf40SYehuda Sadeh 		bool do_write;
1443bd919d45SAlex Elder 		unsigned int size;
1444bd919d45SAlex Elder 		u64 op_size = 0;
1445602adf40SYehuda Sadeh 		u64 ofs;
14461fec7093SYehuda Sadeh 		int num_segs, cur_seg = 0;
14471fec7093SYehuda Sadeh 		struct rbd_req_coll *coll;
1448d1d25646SJosh Durgin 		struct ceph_snap_context *snapc;
1449602adf40SYehuda Sadeh 
1450602adf40SYehuda Sadeh 		/* peek at request from block layer */
1451602adf40SYehuda Sadeh 		if (!rq)
1452602adf40SYehuda Sadeh 			break;
1453602adf40SYehuda Sadeh 
1454602adf40SYehuda Sadeh 		dout("fetched request\n");
1455602adf40SYehuda Sadeh 
1456602adf40SYehuda Sadeh 		/* filter out block requests we don't understand */
1457602adf40SYehuda Sadeh 		if ((rq->cmd_type != REQ_TYPE_FS)) {
1458602adf40SYehuda Sadeh 			__blk_end_request_all(rq, 0);
145900f1f36fSAlex Elder 			continue;
1460602adf40SYehuda Sadeh 		}
1461602adf40SYehuda Sadeh 
1462602adf40SYehuda Sadeh 		/* deduce our operation (read, write) */
1463602adf40SYehuda Sadeh 		do_write = (rq_data_dir(rq) == WRITE);
1464602adf40SYehuda Sadeh 
1465602adf40SYehuda Sadeh 		size = blk_rq_bytes(rq);
1466593a9e7bSAlex Elder 		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
1467602adf40SYehuda Sadeh 		rq_bio = rq->bio;
1468602adf40SYehuda Sadeh 		if (do_write && rbd_dev->read_only) {
1469602adf40SYehuda Sadeh 			__blk_end_request_all(rq, -EROFS);
147000f1f36fSAlex Elder 			continue;
1471602adf40SYehuda Sadeh 		}
1472602adf40SYehuda Sadeh 
1473602adf40SYehuda Sadeh 		spin_unlock_irq(q->queue_lock);
1474602adf40SYehuda Sadeh 
1475e88a36ecSJosh Durgin 		down_read(&rbd_dev->header_rwsem);
1476e88a36ecSJosh Durgin 
1477d1d25646SJosh Durgin 		if (rbd_dev->snap_id != CEPH_NOSNAP && !rbd_dev->snap_exists) {
1478d1d25646SJosh Durgin 			up_read(&rbd_dev->header_rwsem);
1479e88a36ecSJosh Durgin 			dout("request for non-existent snapshot");
1480e88a36ecSJosh Durgin 			spin_lock_irq(q->queue_lock);
1481e88a36ecSJosh Durgin 			__blk_end_request_all(rq, -ENXIO);
1482e88a36ecSJosh Durgin 			continue;
1483e88a36ecSJosh Durgin 		}
1484d1d25646SJosh Durgin 
1485d1d25646SJosh Durgin 		snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1486d1d25646SJosh Durgin 
1487d1d25646SJosh Durgin 		up_read(&rbd_dev->header_rwsem);
1488e88a36ecSJosh Durgin 
1489602adf40SYehuda Sadeh 		dout("%s 0x%x bytes at 0x%llx\n",
1490602adf40SYehuda Sadeh 		     do_write ? "write" : "read",
1491bd919d45SAlex Elder 		     size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
1492602adf40SYehuda Sadeh 
14931fec7093SYehuda Sadeh 		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
14941fec7093SYehuda Sadeh 		coll = rbd_alloc_coll(num_segs);
14951fec7093SYehuda Sadeh 		if (!coll) {
14961fec7093SYehuda Sadeh 			spin_lock_irq(q->queue_lock);
14971fec7093SYehuda Sadeh 			__blk_end_request_all(rq, -ENOMEM);
1498d1d25646SJosh Durgin 			ceph_put_snap_context(snapc);
149900f1f36fSAlex Elder 			continue;
15001fec7093SYehuda Sadeh 		}
15011fec7093SYehuda Sadeh 
1502602adf40SYehuda Sadeh 		do {
1503602adf40SYehuda Sadeh 			/* a bio clone to be passed down to OSD req */
1504bd919d45SAlex Elder 			dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
1505602adf40SYehuda Sadeh 			op_size = rbd_get_segment(&rbd_dev->header,
1506ca1e49a6SAlex Elder 						  rbd_dev->header.object_prefix,
1507602adf40SYehuda Sadeh 						  ofs, size,
1508602adf40SYehuda Sadeh 						  NULL, NULL);
15091fec7093SYehuda Sadeh 			kref_get(&coll->kref);
1510602adf40SYehuda Sadeh 			bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
1511602adf40SYehuda Sadeh 					      op_size, GFP_ATOMIC);
1512602adf40SYehuda Sadeh 			if (!bio) {
15131fec7093SYehuda Sadeh 				rbd_coll_end_req_index(rq, coll, cur_seg,
15141fec7093SYehuda Sadeh 						       -ENOMEM, op_size);
15151fec7093SYehuda Sadeh 				goto next_seg;
1516602adf40SYehuda Sadeh 			}
1517602adf40SYehuda Sadeh 
15181fec7093SYehuda Sadeh 
1519602adf40SYehuda Sadeh 			/* init OSD command: write or read */
1520602adf40SYehuda Sadeh 			if (do_write)
1521602adf40SYehuda Sadeh 				rbd_req_write(rq, rbd_dev,
1522d1d25646SJosh Durgin 					      snapc,
1523602adf40SYehuda Sadeh 					      ofs,
15241fec7093SYehuda Sadeh 					      op_size, bio,
15251fec7093SYehuda Sadeh 					      coll, cur_seg);
1526602adf40SYehuda Sadeh 			else
1527602adf40SYehuda Sadeh 				rbd_req_read(rq, rbd_dev,
152877dfe99fSJosh Durgin 					     rbd_dev->snap_id,
1529602adf40SYehuda Sadeh 					     ofs,
15301fec7093SYehuda Sadeh 					     op_size, bio,
15311fec7093SYehuda Sadeh 					     coll, cur_seg);
1532602adf40SYehuda Sadeh 
15331fec7093SYehuda Sadeh next_seg:
1534602adf40SYehuda Sadeh 			size -= op_size;
1535602adf40SYehuda Sadeh 			ofs += op_size;
1536602adf40SYehuda Sadeh 
15371fec7093SYehuda Sadeh 			cur_seg++;
1538602adf40SYehuda Sadeh 			rq_bio = next_bio;
1539602adf40SYehuda Sadeh 		} while (size > 0);
15401fec7093SYehuda Sadeh 		kref_put(&coll->kref, rbd_coll_release);
1541602adf40SYehuda Sadeh 
1542602adf40SYehuda Sadeh 		if (bp)
1543602adf40SYehuda Sadeh 			bio_pair_release(bp);
1544602adf40SYehuda Sadeh 		spin_lock_irq(q->queue_lock);
1545d1d25646SJosh Durgin 
1546d1d25646SJosh Durgin 		ceph_put_snap_context(snapc);
1547602adf40SYehuda Sadeh 	}
1548602adf40SYehuda Sadeh }
1549602adf40SYehuda Sadeh 
1550602adf40SYehuda Sadeh /*
1551602adf40SYehuda Sadeh  * a queue callback. Makes sure that we don't create a bio that spans across
1552602adf40SYehuda Sadeh  * multiple osd objects. One exception would be with a single page bios,
1553602adf40SYehuda Sadeh  * which we handle later at bio_chain_clone
1554602adf40SYehuda Sadeh  */
1555602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1556602adf40SYehuda Sadeh 			  struct bio_vec *bvec)
1557602adf40SYehuda Sadeh {
1558602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
1559593a9e7bSAlex Elder 	unsigned int chunk_sectors;
1560593a9e7bSAlex Elder 	sector_t sector;
1561593a9e7bSAlex Elder 	unsigned int bio_sectors;
1562602adf40SYehuda Sadeh 	int max;
1563602adf40SYehuda Sadeh 
1564593a9e7bSAlex Elder 	chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1565593a9e7bSAlex Elder 	sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
1566593a9e7bSAlex Elder 	bio_sectors = bmd->bi_size >> SECTOR_SHIFT;
1567593a9e7bSAlex Elder 
1568602adf40SYehuda Sadeh 	max =  (chunk_sectors - ((sector & (chunk_sectors - 1))
1569593a9e7bSAlex Elder 				 + bio_sectors)) << SECTOR_SHIFT;
1570602adf40SYehuda Sadeh 	if (max < 0)
1571602adf40SYehuda Sadeh 		max = 0; /* bio_add cannot handle a negative return */
1572602adf40SYehuda Sadeh 	if (max <= bvec->bv_len && bio_sectors == 0)
1573602adf40SYehuda Sadeh 		return bvec->bv_len;
1574602adf40SYehuda Sadeh 	return max;
1575602adf40SYehuda Sadeh }
1576602adf40SYehuda Sadeh 
1577602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
1578602adf40SYehuda Sadeh {
1579602adf40SYehuda Sadeh 	struct gendisk *disk = rbd_dev->disk;
1580602adf40SYehuda Sadeh 
1581602adf40SYehuda Sadeh 	if (!disk)
1582602adf40SYehuda Sadeh 		return;
1583602adf40SYehuda Sadeh 
1584602adf40SYehuda Sadeh 	rbd_header_free(&rbd_dev->header);
1585602adf40SYehuda Sadeh 
1586602adf40SYehuda Sadeh 	if (disk->flags & GENHD_FL_UP)
1587602adf40SYehuda Sadeh 		del_gendisk(disk);
1588602adf40SYehuda Sadeh 	if (disk->queue)
1589602adf40SYehuda Sadeh 		blk_cleanup_queue(disk->queue);
1590602adf40SYehuda Sadeh 	put_disk(disk);
1591602adf40SYehuda Sadeh }
1592602adf40SYehuda Sadeh 
1593602adf40SYehuda Sadeh /*
1594602adf40SYehuda Sadeh  * reload the ondisk the header
1595602adf40SYehuda Sadeh  */
1596602adf40SYehuda Sadeh static int rbd_read_header(struct rbd_device *rbd_dev,
1597602adf40SYehuda Sadeh 			   struct rbd_image_header *header)
1598602adf40SYehuda Sadeh {
1599602adf40SYehuda Sadeh 	ssize_t rc;
1600602adf40SYehuda Sadeh 	struct rbd_image_header_ondisk *dh;
160150f7c4c9SXi Wang 	u32 snap_count = 0;
160259c2be1eSYehuda Sadeh 	u64 ver;
160300f1f36fSAlex Elder 	size_t len;
1604602adf40SYehuda Sadeh 
160500f1f36fSAlex Elder 	/*
160600f1f36fSAlex Elder 	 * First reads the fixed-size header to determine the number
160700f1f36fSAlex Elder 	 * of snapshots, then re-reads it, along with all snapshot
160800f1f36fSAlex Elder 	 * records as well as their stored names.
160900f1f36fSAlex Elder 	 */
161000f1f36fSAlex Elder 	len = sizeof (*dh);
1611602adf40SYehuda Sadeh 	while (1) {
1612602adf40SYehuda Sadeh 		dh = kmalloc(len, GFP_KERNEL);
1613602adf40SYehuda Sadeh 		if (!dh)
1614602adf40SYehuda Sadeh 			return -ENOMEM;
1615602adf40SYehuda Sadeh 
1616602adf40SYehuda Sadeh 		rc = rbd_req_sync_read(rbd_dev,
16179a5d690bSAlex Elder 				       CEPH_NOSNAP,
16180bed54dcSAlex Elder 				       rbd_dev->header_name,
1619602adf40SYehuda Sadeh 				       0, len,
162059c2be1eSYehuda Sadeh 				       (char *)dh, &ver);
1621602adf40SYehuda Sadeh 		if (rc < 0)
1622602adf40SYehuda Sadeh 			goto out_dh;
1623602adf40SYehuda Sadeh 
1624ed63f4fdSAlex Elder 		rc = rbd_header_from_disk(header, dh, snap_count);
162581e759fbSJosh Durgin 		if (rc < 0) {
162600f1f36fSAlex Elder 			if (rc == -ENXIO)
162781e759fbSJosh Durgin 				pr_warning("unrecognized header format"
16280bed54dcSAlex Elder 					   " for image %s\n",
16290bed54dcSAlex Elder 					   rbd_dev->image_name);
1630602adf40SYehuda Sadeh 			goto out_dh;
163181e759fbSJosh Durgin 		}
1632602adf40SYehuda Sadeh 
163300f1f36fSAlex Elder 		if (snap_count == header->total_snaps)
163400f1f36fSAlex Elder 			break;
163500f1f36fSAlex Elder 
1636602adf40SYehuda Sadeh 		snap_count = header->total_snaps;
163700f1f36fSAlex Elder 		len = sizeof (*dh) +
163800f1f36fSAlex Elder 			snap_count * sizeof(struct rbd_image_snap_ondisk) +
163900f1f36fSAlex Elder 			header->snap_names_len;
164000f1f36fSAlex Elder 
1641602adf40SYehuda Sadeh 		rbd_header_free(header);
1642602adf40SYehuda Sadeh 		kfree(dh);
1643602adf40SYehuda Sadeh 	}
164459c2be1eSYehuda Sadeh 	header->obj_version = ver;
1645602adf40SYehuda Sadeh 
1646602adf40SYehuda Sadeh out_dh:
1647602adf40SYehuda Sadeh 	kfree(dh);
1648602adf40SYehuda Sadeh 	return rc;
1649602adf40SYehuda Sadeh }
1650602adf40SYehuda Sadeh 
1651602adf40SYehuda Sadeh /*
1652602adf40SYehuda Sadeh  * create a snapshot
1653602adf40SYehuda Sadeh  */
16540ce1a794SAlex Elder static int rbd_header_add_snap(struct rbd_device *rbd_dev,
1655602adf40SYehuda Sadeh 			       const char *snap_name,
1656602adf40SYehuda Sadeh 			       gfp_t gfp_flags)
1657602adf40SYehuda Sadeh {
1658602adf40SYehuda Sadeh 	int name_len = strlen(snap_name);
1659602adf40SYehuda Sadeh 	u64 new_snapid;
1660602adf40SYehuda Sadeh 	int ret;
1661916d4d67SSage Weil 	void *data, *p, *e;
16621dbb4399SAlex Elder 	struct ceph_mon_client *monc;
1663602adf40SYehuda Sadeh 
1664602adf40SYehuda Sadeh 	/* we should create a snapshot only if we're pointing at the head */
16650ce1a794SAlex Elder 	if (rbd_dev->snap_id != CEPH_NOSNAP)
1666602adf40SYehuda Sadeh 		return -EINVAL;
1667602adf40SYehuda Sadeh 
16680ce1a794SAlex Elder 	monc = &rbd_dev->rbd_client->client->monc;
16690ce1a794SAlex Elder 	ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid);
1670bd919d45SAlex Elder 	dout("created snapid=%llu\n", (unsigned long long) new_snapid);
1671602adf40SYehuda Sadeh 	if (ret < 0)
1672602adf40SYehuda Sadeh 		return ret;
1673602adf40SYehuda Sadeh 
1674602adf40SYehuda Sadeh 	data = kmalloc(name_len + 16, gfp_flags);
1675602adf40SYehuda Sadeh 	if (!data)
1676602adf40SYehuda Sadeh 		return -ENOMEM;
1677602adf40SYehuda Sadeh 
1678916d4d67SSage Weil 	p = data;
1679916d4d67SSage Weil 	e = data + name_len + 16;
1680602adf40SYehuda Sadeh 
1681916d4d67SSage Weil 	ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
1682916d4d67SSage Weil 	ceph_encode_64_safe(&p, e, new_snapid, bad);
1683602adf40SYehuda Sadeh 
16840bed54dcSAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
16850ce1a794SAlex Elder 				"rbd", "snap_add",
1686d67d4be5SAlex Elder 				data, p - data, NULL);
1687602adf40SYehuda Sadeh 
1688916d4d67SSage Weil 	kfree(data);
1689602adf40SYehuda Sadeh 
1690505cbb9bSAlex Elder 	return ret < 0 ? ret : 0;
1691602adf40SYehuda Sadeh bad:
1692602adf40SYehuda Sadeh 	return -ERANGE;
1693602adf40SYehuda Sadeh }
1694602adf40SYehuda Sadeh 
1695dfc5606dSYehuda Sadeh static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1696dfc5606dSYehuda Sadeh {
1697dfc5606dSYehuda Sadeh 	struct rbd_snap *snap;
1698a0593290SAlex Elder 	struct rbd_snap *next;
1699dfc5606dSYehuda Sadeh 
1700a0593290SAlex Elder 	list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
170114e7085dSAlex Elder 		__rbd_remove_snap_dev(snap);
1702dfc5606dSYehuda Sadeh }
1703dfc5606dSYehuda Sadeh 
1704602adf40SYehuda Sadeh /*
1705602adf40SYehuda Sadeh  * only read the first part of the ondisk header, without the snaps info
1706602adf40SYehuda Sadeh  */
1707b813623aSAlex Elder static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
1708602adf40SYehuda Sadeh {
1709602adf40SYehuda Sadeh 	int ret;
1710602adf40SYehuda Sadeh 	struct rbd_image_header h;
1711602adf40SYehuda Sadeh 
1712602adf40SYehuda Sadeh 	ret = rbd_read_header(rbd_dev, &h);
1713602adf40SYehuda Sadeh 	if (ret < 0)
1714602adf40SYehuda Sadeh 		return ret;
1715602adf40SYehuda Sadeh 
1716a51aa0c0SJosh Durgin 	down_write(&rbd_dev->header_rwsem);
1717a51aa0c0SJosh Durgin 
17189db4b3e3SSage Weil 	/* resized? */
1719474ef7ceSJosh Durgin 	if (rbd_dev->snap_id == CEPH_NOSNAP) {
1720474ef7ceSJosh Durgin 		sector_t size = (sector_t) h.image_size / SECTOR_SIZE;
1721474ef7ceSJosh Durgin 
1722474ef7ceSJosh Durgin 		dout("setting size to %llu sectors", (unsigned long long) size);
1723474ef7ceSJosh Durgin 		set_capacity(rbd_dev->disk, size);
1724474ef7ceSJosh Durgin 	}
17259db4b3e3SSage Weil 
1726849b4260SAlex Elder 	/* rbd_dev->header.object_prefix shouldn't change */
1727602adf40SYehuda Sadeh 	kfree(rbd_dev->header.snap_sizes);
1728849b4260SAlex Elder 	kfree(rbd_dev->header.snap_names);
1729d1d25646SJosh Durgin 	/* osd requests may still refer to snapc */
1730d1d25646SJosh Durgin 	ceph_put_snap_context(rbd_dev->header.snapc);
1731602adf40SYehuda Sadeh 
1732b813623aSAlex Elder 	if (hver)
1733b813623aSAlex Elder 		*hver = h.obj_version;
1734a71b891bSJosh Durgin 	rbd_dev->header.obj_version = h.obj_version;
173593a24e08SJosh Durgin 	rbd_dev->header.image_size = h.image_size;
1736602adf40SYehuda Sadeh 	rbd_dev->header.total_snaps = h.total_snaps;
1737602adf40SYehuda Sadeh 	rbd_dev->header.snapc = h.snapc;
1738602adf40SYehuda Sadeh 	rbd_dev->header.snap_names = h.snap_names;
1739dfc5606dSYehuda Sadeh 	rbd_dev->header.snap_names_len = h.snap_names_len;
1740602adf40SYehuda Sadeh 	rbd_dev->header.snap_sizes = h.snap_sizes;
1741849b4260SAlex Elder 	/* Free the extra copy of the object prefix */
1742849b4260SAlex Elder 	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1743849b4260SAlex Elder 	kfree(h.object_prefix);
1744849b4260SAlex Elder 
1745dfc5606dSYehuda Sadeh 	ret = __rbd_init_snaps_header(rbd_dev);
1746dfc5606dSYehuda Sadeh 
1747c666601aSJosh Durgin 	up_write(&rbd_dev->header_rwsem);
1748602adf40SYehuda Sadeh 
1749dfc5606dSYehuda Sadeh 	return ret;
1750602adf40SYehuda Sadeh }
1751602adf40SYehuda Sadeh 
17521fe5e993SAlex Elder static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
17531fe5e993SAlex Elder {
17541fe5e993SAlex Elder 	int ret;
17551fe5e993SAlex Elder 
17561fe5e993SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
17571fe5e993SAlex Elder 	ret = __rbd_refresh_header(rbd_dev, hver);
17581fe5e993SAlex Elder 	mutex_unlock(&ctl_mutex);
17591fe5e993SAlex Elder 
17601fe5e993SAlex Elder 	return ret;
17611fe5e993SAlex Elder }
17621fe5e993SAlex Elder 
1763602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
1764602adf40SYehuda Sadeh {
1765602adf40SYehuda Sadeh 	struct gendisk *disk;
1766602adf40SYehuda Sadeh 	struct request_queue *q;
1767602adf40SYehuda Sadeh 	int rc;
1768593a9e7bSAlex Elder 	u64 segment_size;
1769602adf40SYehuda Sadeh 	u64 total_size = 0;
1770602adf40SYehuda Sadeh 
1771602adf40SYehuda Sadeh 	/* contact OSD, request size info about the object being mapped */
1772602adf40SYehuda Sadeh 	rc = rbd_read_header(rbd_dev, &rbd_dev->header);
1773602adf40SYehuda Sadeh 	if (rc)
1774602adf40SYehuda Sadeh 		return rc;
1775602adf40SYehuda Sadeh 
1776dfc5606dSYehuda Sadeh 	/* no need to lock here, as rbd_dev is not registered yet */
1777dfc5606dSYehuda Sadeh 	rc = __rbd_init_snaps_header(rbd_dev);
1778dfc5606dSYehuda Sadeh 	if (rc)
1779dfc5606dSYehuda Sadeh 		return rc;
1780dfc5606dSYehuda Sadeh 
1781cc9d734cSJosh Durgin 	rc = rbd_header_set_snap(rbd_dev, &total_size);
1782602adf40SYehuda Sadeh 	if (rc)
1783602adf40SYehuda Sadeh 		return rc;
1784602adf40SYehuda Sadeh 
1785602adf40SYehuda Sadeh 	/* create gendisk info */
1786602adf40SYehuda Sadeh 	rc = -ENOMEM;
1787602adf40SYehuda Sadeh 	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1788602adf40SYehuda Sadeh 	if (!disk)
1789602adf40SYehuda Sadeh 		goto out;
1790602adf40SYehuda Sadeh 
1791f0f8cef5SAlex Elder 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
1792de71a297SAlex Elder 		 rbd_dev->dev_id);
1793602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
1794602adf40SYehuda Sadeh 	disk->first_minor = 0;
1795602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
1796602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
1797602adf40SYehuda Sadeh 
1798602adf40SYehuda Sadeh 	/* init rq */
1799602adf40SYehuda Sadeh 	rc = -ENOMEM;
1800602adf40SYehuda Sadeh 	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1801602adf40SYehuda Sadeh 	if (!q)
1802602adf40SYehuda Sadeh 		goto out_disk;
1803029bcbd8SJosh Durgin 
1804593a9e7bSAlex Elder 	/* We use the default size, but let's be explicit about it. */
1805593a9e7bSAlex Elder 	blk_queue_physical_block_size(q, SECTOR_SIZE);
1806593a9e7bSAlex Elder 
1807029bcbd8SJosh Durgin 	/* set io sizes to object size */
1808593a9e7bSAlex Elder 	segment_size = rbd_obj_bytes(&rbd_dev->header);
1809593a9e7bSAlex Elder 	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
1810593a9e7bSAlex Elder 	blk_queue_max_segment_size(q, segment_size);
1811593a9e7bSAlex Elder 	blk_queue_io_min(q, segment_size);
1812593a9e7bSAlex Elder 	blk_queue_io_opt(q, segment_size);
1813029bcbd8SJosh Durgin 
1814602adf40SYehuda Sadeh 	blk_queue_merge_bvec(q, rbd_merge_bvec);
1815602adf40SYehuda Sadeh 	disk->queue = q;
1816602adf40SYehuda Sadeh 
1817602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
1818602adf40SYehuda Sadeh 
1819602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
1820602adf40SYehuda Sadeh 	rbd_dev->q = q;
1821602adf40SYehuda Sadeh 
1822602adf40SYehuda Sadeh 	/* finally, announce the disk to the world */
1823593a9e7bSAlex Elder 	set_capacity(disk, total_size / SECTOR_SIZE);
1824602adf40SYehuda Sadeh 	add_disk(disk);
1825602adf40SYehuda Sadeh 
1826602adf40SYehuda Sadeh 	pr_info("%s: added with size 0x%llx\n",
1827602adf40SYehuda Sadeh 		disk->disk_name, (unsigned long long)total_size);
1828602adf40SYehuda Sadeh 	return 0;
1829602adf40SYehuda Sadeh 
1830602adf40SYehuda Sadeh out_disk:
1831602adf40SYehuda Sadeh 	put_disk(disk);
1832602adf40SYehuda Sadeh out:
1833602adf40SYehuda Sadeh 	return rc;
1834602adf40SYehuda Sadeh }
1835602adf40SYehuda Sadeh 
1836dfc5606dSYehuda Sadeh /*
1837dfc5606dSYehuda Sadeh   sysfs
1838dfc5606dSYehuda Sadeh */
1839602adf40SYehuda Sadeh 
1840593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
1841593a9e7bSAlex Elder {
1842593a9e7bSAlex Elder 	return container_of(dev, struct rbd_device, dev);
1843593a9e7bSAlex Elder }
1844593a9e7bSAlex Elder 
1845dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
1846dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
1847602adf40SYehuda Sadeh {
1848593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1849a51aa0c0SJosh Durgin 	sector_t size;
1850dfc5606dSYehuda Sadeh 
1851a51aa0c0SJosh Durgin 	down_read(&rbd_dev->header_rwsem);
1852a51aa0c0SJosh Durgin 	size = get_capacity(rbd_dev->disk);
1853a51aa0c0SJosh Durgin 	up_read(&rbd_dev->header_rwsem);
1854a51aa0c0SJosh Durgin 
1855a51aa0c0SJosh Durgin 	return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
1856602adf40SYehuda Sadeh }
1857602adf40SYehuda Sadeh 
1858dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
1859dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
1860602adf40SYehuda Sadeh {
1861593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1862dfc5606dSYehuda Sadeh 
1863dfc5606dSYehuda Sadeh 	return sprintf(buf, "%d\n", rbd_dev->major);
1864dfc5606dSYehuda Sadeh }
1865dfc5606dSYehuda Sadeh 
1866dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
1867dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
1868dfc5606dSYehuda Sadeh {
1869593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1870dfc5606dSYehuda Sadeh 
18711dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
18721dbb4399SAlex Elder 			ceph_client_id(rbd_dev->rbd_client->client));
1873dfc5606dSYehuda Sadeh }
1874dfc5606dSYehuda Sadeh 
1875dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
1876dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
1877dfc5606dSYehuda Sadeh {
1878593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1879dfc5606dSYehuda Sadeh 
1880dfc5606dSYehuda Sadeh 	return sprintf(buf, "%s\n", rbd_dev->pool_name);
1881dfc5606dSYehuda Sadeh }
1882dfc5606dSYehuda Sadeh 
18839bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
18849bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
18859bb2f334SAlex Elder {
18869bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
18879bb2f334SAlex Elder 
18889bb2f334SAlex Elder 	return sprintf(buf, "%d\n", rbd_dev->pool_id);
18899bb2f334SAlex Elder }
18909bb2f334SAlex Elder 
1891dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
1892dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
1893dfc5606dSYehuda Sadeh {
1894593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1895dfc5606dSYehuda Sadeh 
18960bed54dcSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->image_name);
1897dfc5606dSYehuda Sadeh }
1898dfc5606dSYehuda Sadeh 
1899dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
1900dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
1901dfc5606dSYehuda Sadeh 			     char *buf)
1902dfc5606dSYehuda Sadeh {
1903593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1904dfc5606dSYehuda Sadeh 
1905dfc5606dSYehuda Sadeh 	return sprintf(buf, "%s\n", rbd_dev->snap_name);
1906dfc5606dSYehuda Sadeh }
1907dfc5606dSYehuda Sadeh 
1908dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
1909dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
1910dfc5606dSYehuda Sadeh 				 const char *buf,
1911dfc5606dSYehuda Sadeh 				 size_t size)
1912dfc5606dSYehuda Sadeh {
1913593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1914b813623aSAlex Elder 	int ret;
1915602adf40SYehuda Sadeh 
19161fe5e993SAlex Elder 	ret = rbd_refresh_header(rbd_dev, NULL);
1917b813623aSAlex Elder 
1918b813623aSAlex Elder 	return ret < 0 ? ret : size;
1919dfc5606dSYehuda Sadeh }
1920602adf40SYehuda Sadeh 
/*
 * Attributes of an rbd device.  The S_IRUGO entries are read-only and
 * only have a show handler; "refresh" and "create_snap" are S_IWUSR and
 * only have a store handler.
 */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);

/* NULL-terminated list of the attributes declared above */
static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_refresh.attr,
	&dev_attr_create_snap.attr,
	NULL
};

/* unnamed group wrapping rbd_attrs */
static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

/* NULL-terminated group list referenced by rbd_device_type */
static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};
1952dfc5606dSYehuda Sadeh 
/*
 * Release callback for the rbd device type.  It does nothing.
 * NOTE(review): presumably the rbd_device is freed elsewhere -- confirm.
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
}
1956dfc5606dSYehuda Sadeh 
/*
 * Device type for rbd devices: named "rbd", with the rbd attribute
 * groups and the (empty) release callback.
 */
static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
1962dfc5606dSYehuda Sadeh 
1963dfc5606dSYehuda Sadeh 
1964dfc5606dSYehuda Sadeh /*
1965dfc5606dSYehuda Sadeh   sysfs - snapshots
1966dfc5606dSYehuda Sadeh */
1967dfc5606dSYehuda Sadeh 
1968dfc5606dSYehuda Sadeh static ssize_t rbd_snap_size_show(struct device *dev,
1969dfc5606dSYehuda Sadeh 				  struct device_attribute *attr,
1970dfc5606dSYehuda Sadeh 				  char *buf)
1971dfc5606dSYehuda Sadeh {
1972dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1973dfc5606dSYehuda Sadeh 
19743591538fSJosh Durgin 	return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
1975dfc5606dSYehuda Sadeh }
1976dfc5606dSYehuda Sadeh 
1977dfc5606dSYehuda Sadeh static ssize_t rbd_snap_id_show(struct device *dev,
1978dfc5606dSYehuda Sadeh 				struct device_attribute *attr,
1979dfc5606dSYehuda Sadeh 				char *buf)
1980dfc5606dSYehuda Sadeh {
1981dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1982dfc5606dSYehuda Sadeh 
1983593a9e7bSAlex Elder 	return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
1984dfc5606dSYehuda Sadeh }
1985dfc5606dSYehuda Sadeh 
/* Read-only attributes of a snapshot device */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);

/* NULL-terminated list of the snapshot attributes declared above */
static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	NULL,
};

/* unnamed group wrapping rbd_snap_attrs */
static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};
1998dfc5606dSYehuda Sadeh 
1999dfc5606dSYehuda Sadeh static void rbd_snap_dev_release(struct device *dev)
2000dfc5606dSYehuda Sadeh {
2001dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2002dfc5606dSYehuda Sadeh 	kfree(snap->name);
2003dfc5606dSYehuda Sadeh 	kfree(snap);
2004dfc5606dSYehuda Sadeh }
2005dfc5606dSYehuda Sadeh 
/* NULL-terminated group list referenced by rbd_snap_device_type */
static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

/*
 * Device type for snapshot devices: the snapshot attribute groups plus
 * the release callback that frees the rbd_snap.
 */
static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2015dfc5606dSYehuda Sadeh 
201614e7085dSAlex Elder static void __rbd_remove_snap_dev(struct rbd_snap *snap)
2017dfc5606dSYehuda Sadeh {
2018dfc5606dSYehuda Sadeh 	list_del(&snap->node);
2019dfc5606dSYehuda Sadeh 	device_unregister(&snap->dev);
2020dfc5606dSYehuda Sadeh }
2021dfc5606dSYehuda Sadeh 
202214e7085dSAlex Elder static int rbd_register_snap_dev(struct rbd_snap *snap,
2023dfc5606dSYehuda Sadeh 				  struct device *parent)
2024dfc5606dSYehuda Sadeh {
2025dfc5606dSYehuda Sadeh 	struct device *dev = &snap->dev;
2026dfc5606dSYehuda Sadeh 	int ret;
2027dfc5606dSYehuda Sadeh 
2028dfc5606dSYehuda Sadeh 	dev->type = &rbd_snap_device_type;
2029dfc5606dSYehuda Sadeh 	dev->parent = parent;
2030dfc5606dSYehuda Sadeh 	dev->release = rbd_snap_dev_release;
2031dfc5606dSYehuda Sadeh 	dev_set_name(dev, "snap_%s", snap->name);
2032dfc5606dSYehuda Sadeh 	ret = device_register(dev);
2033dfc5606dSYehuda Sadeh 
2034dfc5606dSYehuda Sadeh 	return ret;
2035dfc5606dSYehuda Sadeh }
2036dfc5606dSYehuda Sadeh 
20374e891e0aSAlex Elder static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
20384e891e0aSAlex Elder 					      int i, const char *name)
2039dfc5606dSYehuda Sadeh {
20404e891e0aSAlex Elder 	struct rbd_snap *snap;
2041dfc5606dSYehuda Sadeh 	int ret;
20424e891e0aSAlex Elder 
20434e891e0aSAlex Elder 	snap = kzalloc(sizeof (*snap), GFP_KERNEL);
2044dfc5606dSYehuda Sadeh 	if (!snap)
20454e891e0aSAlex Elder 		return ERR_PTR(-ENOMEM);
20464e891e0aSAlex Elder 
20474e891e0aSAlex Elder 	ret = -ENOMEM;
2048dfc5606dSYehuda Sadeh 	snap->name = kstrdup(name, GFP_KERNEL);
20494e891e0aSAlex Elder 	if (!snap->name)
20504e891e0aSAlex Elder 		goto err;
20514e891e0aSAlex Elder 
2052dfc5606dSYehuda Sadeh 	snap->size = rbd_dev->header.snap_sizes[i];
2053dfc5606dSYehuda Sadeh 	snap->id = rbd_dev->header.snapc->snaps[i];
2054dfc5606dSYehuda Sadeh 	if (device_is_registered(&rbd_dev->dev)) {
205514e7085dSAlex Elder 		ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2056dfc5606dSYehuda Sadeh 		if (ret < 0)
2057dfc5606dSYehuda Sadeh 			goto err;
2058dfc5606dSYehuda Sadeh 	}
20594e891e0aSAlex Elder 
20604e891e0aSAlex Elder 	return snap;
20614e891e0aSAlex Elder 
2062dfc5606dSYehuda Sadeh err:
2063dfc5606dSYehuda Sadeh 	kfree(snap->name);
2064dfc5606dSYehuda Sadeh 	kfree(snap);
20654e891e0aSAlex Elder 
20664e891e0aSAlex Elder 	return ERR_PTR(ret);
2067dfc5606dSYehuda Sadeh }
2068dfc5606dSYehuda Sadeh 
/*
 * Step backward through a list of '\0'-delimited strings.
 *
 * name points just past the terminator of the string wanted (that
 * is, at the start of the following string, or at the end of the
 * list); start is the beginning of the list.
 *
 * Returns a pointer to the start of the preceding string, or NULL
 * if name is fewer than two bytes past start.
 */
const char *rbd_prev_snap_name(const char *name, const char *start)
{
	const char *cursor;

	if (name < start + 2)
		return NULL;

	/* Skip the terminator just before name, then scan backward */
	for (cursor = name - 2; *cursor != '\0'; cursor--)
		if (cursor == start)
			return start;	/* reached the first string */

	/* cursor sits on the previous string's terminator */
	return cursor + 1;
}
2085dfc5606dSYehuda Sadeh 
2086dfc5606dSYehuda Sadeh /*
2087dfc5606dSYehuda Sadeh  * compare the old list of snapshots that we have to what's in the header
2088dfc5606dSYehuda Sadeh  * and update it accordingly. Note that the header holds the snapshots
2089dfc5606dSYehuda Sadeh  * in a reverse order (from newest to oldest) and we need to go from
2090dfc5606dSYehuda Sadeh  * older to new so that we don't get a duplicate snap name when
2091dfc5606dSYehuda Sadeh  * doing the process (e.g., removed snapshot and recreated a new
2092dfc5606dSYehuda Sadeh  * one with the same name.
2093dfc5606dSYehuda Sadeh  */
/*
 * Implementation notes:
 *
 * - i counts header entries not yet consumed; it starts at
 *   header.total_snaps and header entries are addressed as i - 1.
 * - name starts at the end of the header's name buffer and is moved
 *   backward one name at a time with rbd_prev_snap_name().
 * - The existing rbd_dev->snaps list is walked from tail to head.
 *
 * Returns 0 on success, -EINVAL if the name buffer runs out before
 * the header entries do, or the error from __rbd_add_snap_dev().
 * On an error return, changes already made to the list are left in
 * place.
 */
2094dfc5606dSYehuda Sadeh static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
2095dfc5606dSYehuda Sadeh {
2096dfc5606dSYehuda Sadeh 	const char *name, *first_name;
2097dfc5606dSYehuda Sadeh 	int i = rbd_dev->header.total_snaps;
2098dfc5606dSYehuda Sadeh 	struct rbd_snap *snap, *old_snap = NULL;
2099dfc5606dSYehuda Sadeh 	struct list_head *p, *n;
2100dfc5606dSYehuda Sadeh 
2101dfc5606dSYehuda Sadeh 	first_name = rbd_dev->header.snap_names;
2102dfc5606dSYehuda Sadeh 	name = first_name + rbd_dev->header.snap_names_len;
2103dfc5606dSYehuda Sadeh 
2104dfc5606dSYehuda Sadeh 	list_for_each_prev_safe(p, n, &rbd_dev->snaps) {
2105dfc5606dSYehuda Sadeh 		u64 cur_id;
2106dfc5606dSYehuda Sadeh 
2107dfc5606dSYehuda Sadeh 		old_snap = list_entry(p, struct rbd_snap, node);
2108dfc5606dSYehuda Sadeh 
		/* cur_id stays unset when i == 0; the "!i ||" test below
		   short-circuits before it would be read */
2109dfc5606dSYehuda Sadeh 		if (i)
2110dfc5606dSYehuda Sadeh 			cur_id = rbd_dev->header.snapc->snaps[i - 1];
2111dfc5606dSYehuda Sadeh 
2112dfc5606dSYehuda Sadeh 		if (!i || old_snap->id < cur_id) {
2113e88a36ecSJosh Durgin 			/*
2114e88a36ecSJosh Durgin 			 * old_snap->id was skipped, thus was
2115e88a36ecSJosh Durgin 			 * removed.  If this rbd_dev is mapped to
2116e88a36ecSJosh Durgin 			 * the removed snapshot, record that it no
2117e88a36ecSJosh Durgin 			 * longer exists, to prevent further I/O.
2118e88a36ecSJosh Durgin 			 */
2119e88a36ecSJosh Durgin 			if (rbd_dev->snap_id == old_snap->id)
2120e88a36ecSJosh Durgin 				rbd_dev->snap_exists = false;
212114e7085dSAlex Elder 			__rbd_remove_snap_dev(old_snap);
2122dfc5606dSYehuda Sadeh 			continue;
2123dfc5606dSYehuda Sadeh 		}
2124dfc5606dSYehuda Sadeh 		if (old_snap->id == cur_id) {
2125dfc5606dSYehuda Sadeh 			/* we have this snapshot already */
2126dfc5606dSYehuda Sadeh 			i--;
2127dfc5606dSYehuda Sadeh 			name = rbd_prev_snap_name(name, first_name);
2128dfc5606dSYehuda Sadeh 			continue;
2129dfc5606dSYehuda Sadeh 		}
		/* Remaining case: old_snap->id > cur_id */
2130dfc5606dSYehuda Sadeh 		for (; i > 0;
2131dfc5606dSYehuda Sadeh 		     i--, name = rbd_prev_snap_name(name, first_name)) {
2132dfc5606dSYehuda Sadeh 			if (!name) {
2133dfc5606dSYehuda Sadeh 				WARN_ON(1);
2134dfc5606dSYehuda Sadeh 				return -EINVAL;
2135dfc5606dSYehuda Sadeh 			}
			/* NOTE(review): this reads snaps[i], while the
			   rest of the function uses entry i - 1 (including
			   the __rbd_add_snap_dev() call just below).  With
			   i == total_snaps this looks like a read one past
			   the last entry -- confirm intended index. */
2136dfc5606dSYehuda Sadeh 			cur_id = rbd_dev->header.snapc->snaps[i];
2137dfc5606dSYehuda Sadeh 			/* snapshot removal? handle it above */
2138dfc5606dSYehuda Sadeh 			if (cur_id >= old_snap->id)
2139dfc5606dSYehuda Sadeh 				break;
2140dfc5606dSYehuda Sadeh 			/* a new snapshot */
			/* NOTE(review): name has not been stepped back for
			   entry i - 1 at this point -- verify that the name
			   passed here matches the entry being added. */
21414e891e0aSAlex Elder 			snap = __rbd_add_snap_dev(rbd_dev, i - 1, name);
21424e891e0aSAlex Elder 			if (IS_ERR(snap))
21434e891e0aSAlex Elder 				return PTR_ERR(snap);
2144dfc5606dSYehuda Sadeh 
2145dfc5606dSYehuda Sadeh 			/* note that we add it backward so using n and not p */
2146dfc5606dSYehuda Sadeh 			list_add(&snap->node, n);
2147dfc5606dSYehuda Sadeh 			p = &snap->node;
2148dfc5606dSYehuda Sadeh 		}
2149dfc5606dSYehuda Sadeh 	}
2150dfc5606dSYehuda Sadeh 	/* we're done going over the old snap list, just add what's left */
2151dfc5606dSYehuda Sadeh 	for (; i > 0; i--) {
2152dfc5606dSYehuda Sadeh 		name = rbd_prev_snap_name(name, first_name);
2153dfc5606dSYehuda Sadeh 		if (!name) {
2154dfc5606dSYehuda Sadeh 			WARN_ON(1);
2155dfc5606dSYehuda Sadeh 			return -EINVAL;
2156dfc5606dSYehuda Sadeh 		}
21574e891e0aSAlex Elder 		snap = __rbd_add_snap_dev(rbd_dev, i - 1, name);
21584e891e0aSAlex Elder 		if (IS_ERR(snap))
21594e891e0aSAlex Elder 			return PTR_ERR(snap);
		/* Insert at the head of the list */
2160dfc5606dSYehuda Sadeh 		list_add(&snap->node, &rbd_dev->snaps);
2161dfc5606dSYehuda Sadeh 	}
2162dfc5606dSYehuda Sadeh 
2163dfc5606dSYehuda Sadeh 	return 0;
2164dfc5606dSYehuda Sadeh }
2165dfc5606dSYehuda Sadeh 
2166dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2167dfc5606dSYehuda Sadeh {
2168f0f8cef5SAlex Elder 	int ret;
2169dfc5606dSYehuda Sadeh 	struct device *dev;
2170dfc5606dSYehuda Sadeh 	struct rbd_snap *snap;
2171dfc5606dSYehuda Sadeh 
2172dfc5606dSYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2173dfc5606dSYehuda Sadeh 	dev = &rbd_dev->dev;
2174dfc5606dSYehuda Sadeh 
2175dfc5606dSYehuda Sadeh 	dev->bus = &rbd_bus_type;
2176dfc5606dSYehuda Sadeh 	dev->type = &rbd_device_type;
2177dfc5606dSYehuda Sadeh 	dev->parent = &rbd_root_dev;
2178dfc5606dSYehuda Sadeh 	dev->release = rbd_dev_release;
2179de71a297SAlex Elder 	dev_set_name(dev, "%d", rbd_dev->dev_id);
2180dfc5606dSYehuda Sadeh 	ret = device_register(dev);
2181dfc5606dSYehuda Sadeh 	if (ret < 0)
2182f0f8cef5SAlex Elder 		goto out;
2183dfc5606dSYehuda Sadeh 
2184dfc5606dSYehuda Sadeh 	list_for_each_entry(snap, &rbd_dev->snaps, node) {
218514e7085dSAlex Elder 		ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2186dfc5606dSYehuda Sadeh 		if (ret < 0)
2187602adf40SYehuda Sadeh 			break;
2188602adf40SYehuda Sadeh 	}
2189f0f8cef5SAlex Elder out:
2190dfc5606dSYehuda Sadeh 	mutex_unlock(&ctl_mutex);
2191dfc5606dSYehuda Sadeh 	return ret;
2192602adf40SYehuda Sadeh }
2193602adf40SYehuda Sadeh 
2194dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
2195dfc5606dSYehuda Sadeh {
2196dfc5606dSYehuda Sadeh 	device_unregister(&rbd_dev->dev);
2197dfc5606dSYehuda Sadeh }
2198dfc5606dSYehuda Sadeh 
219959c2be1eSYehuda Sadeh static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
220059c2be1eSYehuda Sadeh {
220159c2be1eSYehuda Sadeh 	int ret, rc;
220259c2be1eSYehuda Sadeh 
220359c2be1eSYehuda Sadeh 	do {
22040e6f322dSAlex Elder 		ret = rbd_req_sync_watch(rbd_dev);
220559c2be1eSYehuda Sadeh 		if (ret == -ERANGE) {
22061fe5e993SAlex Elder 			rc = rbd_refresh_header(rbd_dev, NULL);
220759c2be1eSYehuda Sadeh 			if (rc < 0)
220859c2be1eSYehuda Sadeh 				return rc;
220959c2be1eSYehuda Sadeh 		}
221059c2be1eSYehuda Sadeh 	} while (ret == -ERANGE);
221159c2be1eSYehuda Sadeh 
221259c2be1eSYehuda Sadeh 	return ret;
221359c2be1eSYehuda Sadeh }
221459c2be1eSYehuda Sadeh 
22151ddbe94eSAlex Elder static atomic64_t rbd_id_max = ATOMIC64_INIT(0);
22161ddbe94eSAlex Elder 
22171ddbe94eSAlex Elder /*
2218499afd5bSAlex Elder  * Get a unique rbd identifier for the given new rbd_dev, and add
2219499afd5bSAlex Elder  * the rbd_dev to the global list.  The minimum rbd id is 1.
22201ddbe94eSAlex Elder  */
2221499afd5bSAlex Elder static void rbd_id_get(struct rbd_device *rbd_dev)
2222b7f23c36SAlex Elder {
2223de71a297SAlex Elder 	rbd_dev->dev_id = atomic64_inc_return(&rbd_id_max);
2224499afd5bSAlex Elder 
2225499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
2226499afd5bSAlex Elder 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
2227499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
2228b7f23c36SAlex Elder }
2229b7f23c36SAlex Elder 
22301ddbe94eSAlex Elder /*
2231499afd5bSAlex Elder  * Remove an rbd_dev from the global list, and record that its
2232499afd5bSAlex Elder  * identifier is no longer in use.
22331ddbe94eSAlex Elder  */
2234499afd5bSAlex Elder static void rbd_id_put(struct rbd_device *rbd_dev)
22351ddbe94eSAlex Elder {
2236d184f6bfSAlex Elder 	struct list_head *tmp;
2237de71a297SAlex Elder 	int rbd_id = rbd_dev->dev_id;
2238d184f6bfSAlex Elder 	int max_id;
2239d184f6bfSAlex Elder 
2240d184f6bfSAlex Elder 	BUG_ON(rbd_id < 1);
2241499afd5bSAlex Elder 
2242499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
2243499afd5bSAlex Elder 	list_del_init(&rbd_dev->node);
2244d184f6bfSAlex Elder 
2245d184f6bfSAlex Elder 	/*
2246d184f6bfSAlex Elder 	 * If the id being "put" is not the current maximum, there
2247d184f6bfSAlex Elder 	 * is nothing special we need to do.
2248d184f6bfSAlex Elder 	 */
2249d184f6bfSAlex Elder 	if (rbd_id != atomic64_read(&rbd_id_max)) {
2250d184f6bfSAlex Elder 		spin_unlock(&rbd_dev_list_lock);
2251d184f6bfSAlex Elder 		return;
2252d184f6bfSAlex Elder 	}
2253d184f6bfSAlex Elder 
2254d184f6bfSAlex Elder 	/*
2255d184f6bfSAlex Elder 	 * We need to update the current maximum id.  Search the
2256d184f6bfSAlex Elder 	 * list to find out what it is.  We're more likely to find
2257d184f6bfSAlex Elder 	 * the maximum at the end, so search the list backward.
2258d184f6bfSAlex Elder 	 */
2259d184f6bfSAlex Elder 	max_id = 0;
2260d184f6bfSAlex Elder 	list_for_each_prev(tmp, &rbd_dev_list) {
2261d184f6bfSAlex Elder 		struct rbd_device *rbd_dev;
2262d184f6bfSAlex Elder 
2263d184f6bfSAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
2264d184f6bfSAlex Elder 		if (rbd_id > max_id)
2265d184f6bfSAlex Elder 			max_id = rbd_id;
2266d184f6bfSAlex Elder 	}
2267499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
22681ddbe94eSAlex Elder 
22691ddbe94eSAlex Elder 	/*
2270d184f6bfSAlex Elder 	 * The max id could have been updated by rbd_id_get(), in
2271d184f6bfSAlex Elder 	 * which case it now accurately reflects the new maximum.
2272d184f6bfSAlex Elder 	 * Be careful not to overwrite the maximum value in that
2273d184f6bfSAlex Elder 	 * case.
22741ddbe94eSAlex Elder 	 */
2275d184f6bfSAlex Elder 	atomic64_cmpxchg(&rbd_id_max, rbd_id, max_id);
2276b7f23c36SAlex Elder }
2277b7f23c36SAlex Elder 
/*
 * Advance *buf past any leading white space so that it points at the
 * first non-space character (or at the terminating '\0' if there is
 * none).  Returns the length of the token that starts there, i.e.
 * the number of consecutive non-white-space characters.  *buf must
 * be a '\0'-terminated string.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the "C"
	 * and "POSIX" locales.
	 */
	const char *spaces = " \f\n\r\t\v";
	const char *token_start;

	token_start = *buf + strspn(*buf, spaces);
	*buf = token_start;

	return strcspn(token_start, spaces);
}
2296e28fff26SAlex Elder 
/*
 * Locate the next token in *buf and, when it fits, copy it into the
 * caller's token buffer with a terminating '\0'.  *buf must be
 * '\0'-terminated on entry.
 *
 * Returns the token's length (excluding the '\0'): 0 when there is
 * no token, and a value >= token_size when the token did not fit, in
 * which case the token buffer is left untouched.
 *
 * In every case *buf is advanced to just past the token found.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);
	const char *start = *buf;

	*buf = start + len;
	if (len >= token_size)
		return len;	/* too big; nothing copied */

	memcpy(token, start, len);
	token[len] = '\0';

	return len;
}
2326e28fff26SAlex Elder 
2327e28fff26SAlex Elder /*
2328ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
2329ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
2330ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
2331ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
2332ea3352f4SAlex Elder  *
2333ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
2334ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
2335ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
2336ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
2337ea3352f4SAlex Elder  *
2338ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
2339ea3352f4SAlex Elder  * the end of the found token.
2340ea3352f4SAlex Elder  *
2341ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
2342ea3352f4SAlex Elder  */
2343ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
2344ea3352f4SAlex Elder {
2345ea3352f4SAlex Elder 	char *dup;
2346ea3352f4SAlex Elder 	size_t len;
2347ea3352f4SAlex Elder 
2348ea3352f4SAlex Elder 	len = next_token(buf);
2349ea3352f4SAlex Elder 	dup = kmalloc(len + 1, GFP_KERNEL);
2350ea3352f4SAlex Elder 	if (!dup)
2351ea3352f4SAlex Elder 		return NULL;
2352ea3352f4SAlex Elder 
2353ea3352f4SAlex Elder 	memcpy(dup, *buf, len);
2354ea3352f4SAlex Elder 	*(dup + len) = '\0';
2355ea3352f4SAlex Elder 	*buf += len;
2356ea3352f4SAlex Elder 
2357ea3352f4SAlex Elder 	if (lenp)
2358ea3352f4SAlex Elder 		*lenp = len;
2359ea3352f4SAlex Elder 
2360ea3352f4SAlex Elder 	return dup;
2361ea3352f4SAlex Elder }
2362ea3352f4SAlex Elder 
2363ea3352f4SAlex Elder /*
23640bed54dcSAlex Elder  * This fills in the pool_name, image_name, image_name_len, snap_name,
2365a725f65eSAlex Elder  * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based
2366a725f65eSAlex Elder  * on the list of monitor addresses and other options provided via
2367a725f65eSAlex Elder  * /sys/bus/rbd/add.
2368d22f76e7SAlex Elder  *
2369d22f76e7SAlex Elder  * Note: rbd_dev is assumed to have been initially zero-filled.
2370a725f65eSAlex Elder  */
2371a725f65eSAlex Elder static int rbd_add_parse_args(struct rbd_device *rbd_dev,
2372a725f65eSAlex Elder 			      const char *buf,
23737ef3214aSAlex Elder 			      const char **mon_addrs,
23745214ecc4SAlex Elder 			      size_t *mon_addrs_size,
2375e28fff26SAlex Elder 			      char *options,
2376e28fff26SAlex Elder 			     size_t options_size)
2377a725f65eSAlex Elder {
2378e28fff26SAlex Elder 	size_t len;
2379d22f76e7SAlex Elder 	int ret;
2380e28fff26SAlex Elder 
2381e28fff26SAlex Elder 	/* The first four tokens are required */
2382e28fff26SAlex Elder 
23837ef3214aSAlex Elder 	len = next_token(&buf);
23847ef3214aSAlex Elder 	if (!len)
2385a725f65eSAlex Elder 		return -EINVAL;
23865214ecc4SAlex Elder 	*mon_addrs_size = len + 1;
23877ef3214aSAlex Elder 	*mon_addrs = buf;
23887ef3214aSAlex Elder 
23897ef3214aSAlex Elder 	buf += len;
2390a725f65eSAlex Elder 
2391e28fff26SAlex Elder 	len = copy_token(&buf, options, options_size);
2392e28fff26SAlex Elder 	if (!len || len >= options_size)
2393e28fff26SAlex Elder 		return -EINVAL;
2394a725f65eSAlex Elder 
2395bf3e5ae1SAlex Elder 	ret = -ENOMEM;
2396d22f76e7SAlex Elder 	rbd_dev->pool_name = dup_token(&buf, NULL);
2397d22f76e7SAlex Elder 	if (!rbd_dev->pool_name)
2398d22f76e7SAlex Elder 		goto out_err;
2399e28fff26SAlex Elder 
24000bed54dcSAlex Elder 	rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
24010bed54dcSAlex Elder 	if (!rbd_dev->image_name)
2402bf3e5ae1SAlex Elder 		goto out_err;
2403e28fff26SAlex Elder 
2404cb8627c7SAlex Elder 	/* Create the name of the header object */
2405cb8627c7SAlex Elder 
24060bed54dcSAlex Elder 	rbd_dev->header_name = kmalloc(rbd_dev->image_name_len
2407bf3e5ae1SAlex Elder 						+ sizeof (RBD_SUFFIX),
2408bf3e5ae1SAlex Elder 					GFP_KERNEL);
24090bed54dcSAlex Elder 	if (!rbd_dev->header_name)
2410cb8627c7SAlex Elder 		goto out_err;
24110bed54dcSAlex Elder 	sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);
2412a725f65eSAlex Elder 
2413e28fff26SAlex Elder 	/*
2414820a5f3eSAlex Elder 	 * The snapshot name is optional.  If none is is supplied,
2415820a5f3eSAlex Elder 	 * we use the default value.
2416e28fff26SAlex Elder 	 */
2417820a5f3eSAlex Elder 	rbd_dev->snap_name = dup_token(&buf, &len);
2418820a5f3eSAlex Elder 	if (!rbd_dev->snap_name)
2419820a5f3eSAlex Elder 		goto out_err;
2420820a5f3eSAlex Elder 	if (!len) {
2421820a5f3eSAlex Elder 		/* Replace the empty name with the default */
2422820a5f3eSAlex Elder 		kfree(rbd_dev->snap_name);
2423820a5f3eSAlex Elder 		rbd_dev->snap_name
2424820a5f3eSAlex Elder 			= kmalloc(sizeof (RBD_SNAP_HEAD_NAME), GFP_KERNEL);
2425820a5f3eSAlex Elder 		if (!rbd_dev->snap_name)
2426820a5f3eSAlex Elder 			goto out_err;
2427820a5f3eSAlex Elder 
2428e28fff26SAlex Elder 		memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
2429e28fff26SAlex Elder 			sizeof (RBD_SNAP_HEAD_NAME));
2430849b4260SAlex Elder 	}
2431e28fff26SAlex Elder 
2432a725f65eSAlex Elder 	return 0;
2433d22f76e7SAlex Elder 
2434d22f76e7SAlex Elder out_err:
24350bed54dcSAlex Elder 	kfree(rbd_dev->header_name);
24360bed54dcSAlex Elder 	kfree(rbd_dev->image_name);
2437d22f76e7SAlex Elder 	kfree(rbd_dev->pool_name);
2438d22f76e7SAlex Elder 	rbd_dev->pool_name = NULL;
2439d22f76e7SAlex Elder 
2440d22f76e7SAlex Elder 	return ret;
2441a725f65eSAlex Elder }
2442a725f65eSAlex Elder 
244359c2be1eSYehuda Sadeh static ssize_t rbd_add(struct bus_type *bus,
244459c2be1eSYehuda Sadeh 		       const char *buf,
244559c2be1eSYehuda Sadeh 		       size_t count)
2446602adf40SYehuda Sadeh {
2447cb8627c7SAlex Elder 	char *options;
2448cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
24497ef3214aSAlex Elder 	const char *mon_addrs = NULL;
24507ef3214aSAlex Elder 	size_t mon_addrs_size = 0;
245127cc2594SAlex Elder 	struct ceph_osd_client *osdc;
245227cc2594SAlex Elder 	int rc = -ENOMEM;
2453602adf40SYehuda Sadeh 
2454602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
2455602adf40SYehuda Sadeh 		return -ENODEV;
2456602adf40SYehuda Sadeh 
245727cc2594SAlex Elder 	options = kmalloc(count, GFP_KERNEL);
245827cc2594SAlex Elder 	if (!options)
245927cc2594SAlex Elder 		goto err_nomem;
2460cb8627c7SAlex Elder 	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2461cb8627c7SAlex Elder 	if (!rbd_dev)
2462cb8627c7SAlex Elder 		goto err_nomem;
2463602adf40SYehuda Sadeh 
2464602adf40SYehuda Sadeh 	/* static rbd_device initialization */
2465602adf40SYehuda Sadeh 	spin_lock_init(&rbd_dev->lock);
2466602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbd_dev->node);
2467dfc5606dSYehuda Sadeh 	INIT_LIST_HEAD(&rbd_dev->snaps);
2468c666601aSJosh Durgin 	init_rwsem(&rbd_dev->header_rwsem);
2469602adf40SYehuda Sadeh 
2470d184f6bfSAlex Elder 	/* generate unique id: find highest unique id, add one */
2471499afd5bSAlex Elder 	rbd_id_get(rbd_dev);
2472602adf40SYehuda Sadeh 
2473a725f65eSAlex Elder 	/* Fill in the device name, now that we have its id. */
247481a89793SAlex Elder 	BUILD_BUG_ON(DEV_NAME_LEN
247581a89793SAlex Elder 			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
2476de71a297SAlex Elder 	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
2477e124a82fSAlex Elder 
2478a725f65eSAlex Elder 	/* parse add command */
24797ef3214aSAlex Elder 	rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size,
2480e28fff26SAlex Elder 				options, count);
2481a725f65eSAlex Elder 	if (rc)
2482a725f65eSAlex Elder 		goto err_put_id;
2483a725f65eSAlex Elder 
24845214ecc4SAlex Elder 	rbd_dev->rbd_client = rbd_get_client(mon_addrs, mon_addrs_size - 1,
24855214ecc4SAlex Elder 						options);
2486d720bcb0SAlex Elder 	if (IS_ERR(rbd_dev->rbd_client)) {
2487d720bcb0SAlex Elder 		rc = PTR_ERR(rbd_dev->rbd_client);
2488f0f8cef5SAlex Elder 		goto err_put_id;
2489d720bcb0SAlex Elder 	}
2490602adf40SYehuda Sadeh 
2491602adf40SYehuda Sadeh 	/* pick the pool */
24921dbb4399SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
2493602adf40SYehuda Sadeh 	rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
2494602adf40SYehuda Sadeh 	if (rc < 0)
2495602adf40SYehuda Sadeh 		goto err_out_client;
24969bb2f334SAlex Elder 	rbd_dev->pool_id = rc;
2497602adf40SYehuda Sadeh 
2498602adf40SYehuda Sadeh 	/* register our block device */
249927cc2594SAlex Elder 	rc = register_blkdev(0, rbd_dev->name);
250027cc2594SAlex Elder 	if (rc < 0)
2501602adf40SYehuda Sadeh 		goto err_out_client;
250227cc2594SAlex Elder 	rbd_dev->major = rc;
2503602adf40SYehuda Sadeh 
2504dfc5606dSYehuda Sadeh 	rc = rbd_bus_add_dev(rbd_dev);
2505dfc5606dSYehuda Sadeh 	if (rc)
2506766fc439SYehuda Sadeh 		goto err_out_blkdev;
2507766fc439SYehuda Sadeh 
250832eec68dSAlex Elder 	/*
250932eec68dSAlex Elder 	 * At this point cleanup in the event of an error is the job
251032eec68dSAlex Elder 	 * of the sysfs code (initiated by rbd_bus_del_dev()).
251132eec68dSAlex Elder 	 *
251232eec68dSAlex Elder 	 * Set up and announce blkdev mapping.
251332eec68dSAlex Elder 	 */
2514602adf40SYehuda Sadeh 	rc = rbd_init_disk(rbd_dev);
2515602adf40SYehuda Sadeh 	if (rc)
2516766fc439SYehuda Sadeh 		goto err_out_bus;
2517602adf40SYehuda Sadeh 
251859c2be1eSYehuda Sadeh 	rc = rbd_init_watch_dev(rbd_dev);
251959c2be1eSYehuda Sadeh 	if (rc)
252059c2be1eSYehuda Sadeh 		goto err_out_bus;
252159c2be1eSYehuda Sadeh 
2522602adf40SYehuda Sadeh 	return count;
2523602adf40SYehuda Sadeh 
2524766fc439SYehuda Sadeh err_out_bus:
2525766fc439SYehuda Sadeh 	/* this will also clean up rest of rbd_dev stuff */
2526766fc439SYehuda Sadeh 
2527766fc439SYehuda Sadeh 	rbd_bus_del_dev(rbd_dev);
2528766fc439SYehuda Sadeh 	kfree(options);
2529766fc439SYehuda Sadeh 	return rc;
2530766fc439SYehuda Sadeh 
2531602adf40SYehuda Sadeh err_out_blkdev:
2532602adf40SYehuda Sadeh 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
2533602adf40SYehuda Sadeh err_out_client:
2534602adf40SYehuda Sadeh 	rbd_put_client(rbd_dev);
2535f0f8cef5SAlex Elder err_put_id:
2536cb8627c7SAlex Elder 	if (rbd_dev->pool_name) {
2537820a5f3eSAlex Elder 		kfree(rbd_dev->snap_name);
25380bed54dcSAlex Elder 		kfree(rbd_dev->header_name);
25390bed54dcSAlex Elder 		kfree(rbd_dev->image_name);
2540d22f76e7SAlex Elder 		kfree(rbd_dev->pool_name);
2541cb8627c7SAlex Elder 	}
2542499afd5bSAlex Elder 	rbd_id_put(rbd_dev);
254327cc2594SAlex Elder err_nomem:
254427cc2594SAlex Elder 	kfree(rbd_dev);
2545cb8627c7SAlex Elder 	kfree(options);
254627cc2594SAlex Elder 
2547602adf40SYehuda Sadeh 	dout("Error adding device %s\n", buf);
2548602adf40SYehuda Sadeh 	module_put(THIS_MODULE);
254927cc2594SAlex Elder 
255027cc2594SAlex Elder 	return (ssize_t) rc;
2551602adf40SYehuda Sadeh }
2552602adf40SYehuda Sadeh 
2553de71a297SAlex Elder static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
2554602adf40SYehuda Sadeh {
2555602adf40SYehuda Sadeh 	struct list_head *tmp;
2556602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev;
2557602adf40SYehuda Sadeh 
2558e124a82fSAlex Elder 	spin_lock(&rbd_dev_list_lock);
2559602adf40SYehuda Sadeh 	list_for_each(tmp, &rbd_dev_list) {
2560602adf40SYehuda Sadeh 		rbd_dev = list_entry(tmp, struct rbd_device, node);
2561de71a297SAlex Elder 		if (rbd_dev->dev_id == dev_id) {
2562e124a82fSAlex Elder 			spin_unlock(&rbd_dev_list_lock);
2563602adf40SYehuda Sadeh 			return rbd_dev;
2564602adf40SYehuda Sadeh 		}
2565e124a82fSAlex Elder 	}
2566e124a82fSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
2567602adf40SYehuda Sadeh 	return NULL;
2568602adf40SYehuda Sadeh }
2569602adf40SYehuda Sadeh 
2570dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev)
2571602adf40SYehuda Sadeh {
2572593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2573602adf40SYehuda Sadeh 
25741dbb4399SAlex Elder 	if (rbd_dev->watch_request) {
25751dbb4399SAlex Elder 		struct ceph_client *client = rbd_dev->rbd_client->client;
25761dbb4399SAlex Elder 
25771dbb4399SAlex Elder 		ceph_osdc_unregister_linger_request(&client->osdc,
257859c2be1eSYehuda Sadeh 						    rbd_dev->watch_request);
25791dbb4399SAlex Elder 	}
258059c2be1eSYehuda Sadeh 	if (rbd_dev->watch_event)
2581070c633fSAlex Elder 		rbd_req_sync_unwatch(rbd_dev);
258259c2be1eSYehuda Sadeh 
2583602adf40SYehuda Sadeh 	rbd_put_client(rbd_dev);
2584602adf40SYehuda Sadeh 
2585602adf40SYehuda Sadeh 	/* clean up and free blkdev */
2586602adf40SYehuda Sadeh 	rbd_free_disk(rbd_dev);
2587602adf40SYehuda Sadeh 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
258832eec68dSAlex Elder 
258932eec68dSAlex Elder 	/* done with the id, and with the rbd_dev */
2590820a5f3eSAlex Elder 	kfree(rbd_dev->snap_name);
25910bed54dcSAlex Elder 	kfree(rbd_dev->header_name);
2592d22f76e7SAlex Elder 	kfree(rbd_dev->pool_name);
25930bed54dcSAlex Elder 	kfree(rbd_dev->image_name);
259432eec68dSAlex Elder 	rbd_id_put(rbd_dev);
2595602adf40SYehuda Sadeh 	kfree(rbd_dev);
2596602adf40SYehuda Sadeh 
2597602adf40SYehuda Sadeh 	/* release module ref */
2598602adf40SYehuda Sadeh 	module_put(THIS_MODULE);
2599602adf40SYehuda Sadeh }
2600602adf40SYehuda Sadeh 
2601dfc5606dSYehuda Sadeh static ssize_t rbd_remove(struct bus_type *bus,
2602602adf40SYehuda Sadeh 			  const char *buf,
2603602adf40SYehuda Sadeh 			  size_t count)
2604602adf40SYehuda Sadeh {
2605602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
2606602adf40SYehuda Sadeh 	int target_id, rc;
2607602adf40SYehuda Sadeh 	unsigned long ul;
2608602adf40SYehuda Sadeh 	int ret = count;
2609602adf40SYehuda Sadeh 
2610602adf40SYehuda Sadeh 	rc = strict_strtoul(buf, 10, &ul);
2611602adf40SYehuda Sadeh 	if (rc)
2612602adf40SYehuda Sadeh 		return rc;
2613602adf40SYehuda Sadeh 
2614602adf40SYehuda Sadeh 	/* convert to int; abort if we lost anything in the conversion */
2615602adf40SYehuda Sadeh 	target_id = (int) ul;
2616602adf40SYehuda Sadeh 	if (target_id != ul)
2617602adf40SYehuda Sadeh 		return -EINVAL;
2618602adf40SYehuda Sadeh 
2619602adf40SYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2620602adf40SYehuda Sadeh 
2621602adf40SYehuda Sadeh 	rbd_dev = __rbd_get_dev(target_id);
2622602adf40SYehuda Sadeh 	if (!rbd_dev) {
2623602adf40SYehuda Sadeh 		ret = -ENOENT;
2624602adf40SYehuda Sadeh 		goto done;
2625602adf40SYehuda Sadeh 	}
2626602adf40SYehuda Sadeh 
2627dfc5606dSYehuda Sadeh 	__rbd_remove_all_snaps(rbd_dev);
2628dfc5606dSYehuda Sadeh 	rbd_bus_del_dev(rbd_dev);
2629602adf40SYehuda Sadeh 
2630602adf40SYehuda Sadeh done:
2631602adf40SYehuda Sadeh 	mutex_unlock(&ctl_mutex);
2632602adf40SYehuda Sadeh 	return ret;
2633602adf40SYehuda Sadeh }
2634602adf40SYehuda Sadeh 
2635dfc5606dSYehuda Sadeh static ssize_t rbd_snap_add(struct device *dev,
2636dfc5606dSYehuda Sadeh 			    struct device_attribute *attr,
2637602adf40SYehuda Sadeh 			    const char *buf,
2638602adf40SYehuda Sadeh 			    size_t count)
2639602adf40SYehuda Sadeh {
2640593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2641dfc5606dSYehuda Sadeh 	int ret;
2642dfc5606dSYehuda Sadeh 	char *name = kmalloc(count + 1, GFP_KERNEL);
2643602adf40SYehuda Sadeh 	if (!name)
2644602adf40SYehuda Sadeh 		return -ENOMEM;
2645602adf40SYehuda Sadeh 
2646dfc5606dSYehuda Sadeh 	snprintf(name, count, "%s", buf);
2647602adf40SYehuda Sadeh 
2648602adf40SYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2649602adf40SYehuda Sadeh 
2650602adf40SYehuda Sadeh 	ret = rbd_header_add_snap(rbd_dev,
2651602adf40SYehuda Sadeh 				  name, GFP_KERNEL);
2652602adf40SYehuda Sadeh 	if (ret < 0)
265359c2be1eSYehuda Sadeh 		goto err_unlock;
2654602adf40SYehuda Sadeh 
2655b813623aSAlex Elder 	ret = __rbd_refresh_header(rbd_dev, NULL);
2656602adf40SYehuda Sadeh 	if (ret < 0)
265759c2be1eSYehuda Sadeh 		goto err_unlock;
265859c2be1eSYehuda Sadeh 
265959c2be1eSYehuda Sadeh 	/* shouldn't hold ctl_mutex when notifying.. notify might
266059c2be1eSYehuda Sadeh 	   trigger a watch callback that would need to get that mutex */
266159c2be1eSYehuda Sadeh 	mutex_unlock(&ctl_mutex);
266259c2be1eSYehuda Sadeh 
266359c2be1eSYehuda Sadeh 	/* make a best effort, don't error if failed */
26644cb16250SAlex Elder 	rbd_req_sync_notify(rbd_dev);
2665602adf40SYehuda Sadeh 
2666602adf40SYehuda Sadeh 	ret = count;
266759c2be1eSYehuda Sadeh 	kfree(name);
266859c2be1eSYehuda Sadeh 	return ret;
266959c2be1eSYehuda Sadeh 
267059c2be1eSYehuda Sadeh err_unlock:
2671602adf40SYehuda Sadeh 	mutex_unlock(&ctl_mutex);
2672602adf40SYehuda Sadeh 	kfree(name);
2673602adf40SYehuda Sadeh 	return ret;
2674602adf40SYehuda Sadeh }
2675602adf40SYehuda Sadeh 
2676602adf40SYehuda Sadeh /*
2677602adf40SYehuda Sadeh  * create control files in sysfs
2678dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
2679602adf40SYehuda Sadeh  */
2680602adf40SYehuda Sadeh static int rbd_sysfs_init(void)
2681602adf40SYehuda Sadeh {
2682dfc5606dSYehuda Sadeh 	int ret;
2683602adf40SYehuda Sadeh 
2684fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
2685dfc5606dSYehuda Sadeh 	if (ret < 0)
2686dfc5606dSYehuda Sadeh 		return ret;
2687602adf40SYehuda Sadeh 
2688fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
2689fed4c143SAlex Elder 	if (ret < 0)
2690fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
2691602adf40SYehuda Sadeh 
2692602adf40SYehuda Sadeh 	return ret;
2693602adf40SYehuda Sadeh }
2694602adf40SYehuda Sadeh 
2695602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void)
2696602adf40SYehuda Sadeh {
2697dfc5606dSYehuda Sadeh 	bus_unregister(&rbd_bus_type);
2698fed4c143SAlex Elder 	device_unregister(&rbd_root_dev);
2699602adf40SYehuda Sadeh }
2700602adf40SYehuda Sadeh 
2701602adf40SYehuda Sadeh int __init rbd_init(void)
2702602adf40SYehuda Sadeh {
2703602adf40SYehuda Sadeh 	int rc;
2704602adf40SYehuda Sadeh 
2705602adf40SYehuda Sadeh 	rc = rbd_sysfs_init();
2706602adf40SYehuda Sadeh 	if (rc)
2707602adf40SYehuda Sadeh 		return rc;
2708f0f8cef5SAlex Elder 	pr_info("loaded " RBD_DRV_NAME_LONG "\n");
2709602adf40SYehuda Sadeh 	return 0;
2710602adf40SYehuda Sadeh }
2711602adf40SYehuda Sadeh 
2712602adf40SYehuda Sadeh void __exit rbd_exit(void)
2713602adf40SYehuda Sadeh {
2714602adf40SYehuda Sadeh 	rbd_sysfs_cleanup();
2715602adf40SYehuda Sadeh }
2716602adf40SYehuda Sadeh 
2717602adf40SYehuda Sadeh module_init(rbd_init);
2718602adf40SYehuda Sadeh module_exit(rbd_exit);
2719602adf40SYehuda Sadeh 
2720602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
2721602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
2722602adf40SYehuda Sadeh MODULE_DESCRIPTION("rados block device");
2723602adf40SYehuda Sadeh 
2724602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */
2725602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
2726602adf40SYehuda Sadeh 
2727602adf40SYehuda Sadeh MODULE_LICENSE("GPL");
2728