xref: /openbmc/linux/drivers/block/rbd.c (revision c666601a)
1602adf40SYehuda Sadeh /*
2602adf40SYehuda Sadeh    rbd.c -- Export ceph rados objects as a Linux block device
3602adf40SYehuda Sadeh 
4602adf40SYehuda Sadeh 
5602adf40SYehuda Sadeh    based on drivers/block/osdblk.c:
6602adf40SYehuda Sadeh 
7602adf40SYehuda Sadeh    Copyright 2009 Red Hat, Inc.
8602adf40SYehuda Sadeh 
9602adf40SYehuda Sadeh    This program is free software; you can redistribute it and/or modify
10602adf40SYehuda Sadeh    it under the terms of the GNU General Public License as published by
11602adf40SYehuda Sadeh    the Free Software Foundation.
12602adf40SYehuda Sadeh 
13602adf40SYehuda Sadeh    This program is distributed in the hope that it will be useful,
14602adf40SYehuda Sadeh    but WITHOUT ANY WARRANTY; without even the implied warranty of
15602adf40SYehuda Sadeh    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16602adf40SYehuda Sadeh    GNU General Public License for more details.
17602adf40SYehuda Sadeh 
18602adf40SYehuda Sadeh    You should have received a copy of the GNU General Public License
19602adf40SYehuda Sadeh    along with this program; see the file COPYING.  If not, write to
20602adf40SYehuda Sadeh    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21602adf40SYehuda Sadeh 
22602adf40SYehuda Sadeh 
23602adf40SYehuda Sadeh 
24dfc5606dSYehuda Sadeh    For usage instructions, please refer to:
25602adf40SYehuda Sadeh 
26dfc5606dSYehuda Sadeh                  Documentation/ABI/testing/sysfs-bus-rbd
27602adf40SYehuda Sadeh 
28602adf40SYehuda Sadeh  */
29602adf40SYehuda Sadeh 
30602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
31602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
32602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3459c2be1eSYehuda Sadeh #include <linux/parser.h>
35602adf40SYehuda Sadeh 
36602adf40SYehuda Sadeh #include <linux/kernel.h>
37602adf40SYehuda Sadeh #include <linux/device.h>
38602adf40SYehuda Sadeh #include <linux/module.h>
39602adf40SYehuda Sadeh #include <linux/fs.h>
40602adf40SYehuda Sadeh #include <linux/blkdev.h>
41602adf40SYehuda Sadeh 
42602adf40SYehuda Sadeh #include "rbd_types.h"
43602adf40SYehuda Sadeh 
/*
 * Block I/O is expressed in sectors.  Linux uses the notion of a
 * sector in several layers (blk, bio, genhd) and in all of them the
 * default sector is 512 bytes.  Named constants read better than the
 * raw 9 / 512.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

/* Room for an object name plus the header suffix (and its NUL). */
#define RBD_MAX_MD_NAME_LEN	(RBD_MAX_OBJ_NAME_LEN + sizeof(RBD_SUFFIX))
#define RBD_MAX_POOL_NAME_LEN	64
#define RBD_MAX_SNAP_NAME_LEN	32
#define RBD_MAX_OPT_LEN		1024

/* Snapshot name that selects the image head rather than a snapshot. */
#define RBD_SNAP_HEAD_NAME	"-"

/*
 * Device names have the form "rbd#": RBD_DRV_NAME followed by a
 * unique integer id.  MAX_INT_FORMAT_WIDTH is used in ensuring
 * DEV_NAME_LEN is large enough for every possible device name.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)

/* Default for the "notify_timeout" option. */
#define RBD_NOTIFY_TIMEOUT_DEFAULT 10
7559c2be1eSYehuda Sadeh 
76602adf40SYehuda Sadeh /*
77602adf40SYehuda Sadeh  * block device image metadata (in-memory version)
78602adf40SYehuda Sadeh  */
79602adf40SYehuda Sadeh struct rbd_image_header {
80602adf40SYehuda Sadeh 	u64 image_size;
81602adf40SYehuda Sadeh 	char block_name[32];
82602adf40SYehuda Sadeh 	__u8 obj_order;
83602adf40SYehuda Sadeh 	__u8 crypt_type;
84602adf40SYehuda Sadeh 	__u8 comp_type;
85602adf40SYehuda Sadeh 	struct ceph_snap_context *snapc;
86602adf40SYehuda Sadeh 	size_t snap_names_len;
87602adf40SYehuda Sadeh 	u64 snap_seq;
88602adf40SYehuda Sadeh 	u32 total_snaps;
89602adf40SYehuda Sadeh 
90602adf40SYehuda Sadeh 	char *snap_names;
91602adf40SYehuda Sadeh 	u64 *snap_sizes;
9259c2be1eSYehuda Sadeh 
9359c2be1eSYehuda Sadeh 	u64 obj_version;
9459c2be1eSYehuda Sadeh };
9559c2be1eSYehuda Sadeh 
/* rbd-specific options parsed by parse_rbd_opts_token(). */
struct rbd_options {
	int	notify_timeout;	/* value of "notify_timeout=%d" */
};
99602adf40SYehuda Sadeh 
100602adf40SYehuda Sadeh /*
101f0f8cef5SAlex Elder  * an instance of the client.  multiple devices may share an rbd client.
102602adf40SYehuda Sadeh  */
103602adf40SYehuda Sadeh struct rbd_client {
104602adf40SYehuda Sadeh 	struct ceph_client	*client;
10559c2be1eSYehuda Sadeh 	struct rbd_options	*rbd_opts;
106602adf40SYehuda Sadeh 	struct kref		kref;
107602adf40SYehuda Sadeh 	struct list_head	node;
108602adf40SYehuda Sadeh };
109602adf40SYehuda Sadeh 
110602adf40SYehuda Sadeh /*
111f0f8cef5SAlex Elder  * a request completion status
112602adf40SYehuda Sadeh  */
1131fec7093SYehuda Sadeh struct rbd_req_status {
1141fec7093SYehuda Sadeh 	int done;
1151fec7093SYehuda Sadeh 	int rc;
1161fec7093SYehuda Sadeh 	u64 bytes;
1171fec7093SYehuda Sadeh };
1181fec7093SYehuda Sadeh 
1191fec7093SYehuda Sadeh /*
1201fec7093SYehuda Sadeh  * a collection of requests
1211fec7093SYehuda Sadeh  */
1221fec7093SYehuda Sadeh struct rbd_req_coll {
1231fec7093SYehuda Sadeh 	int			total;
1241fec7093SYehuda Sadeh 	int			num_done;
1251fec7093SYehuda Sadeh 	struct kref		kref;
1261fec7093SYehuda Sadeh 	struct rbd_req_status	status[0];
127602adf40SYehuda Sadeh };
128602adf40SYehuda Sadeh 
129f0f8cef5SAlex Elder /*
130f0f8cef5SAlex Elder  * a single io request
131f0f8cef5SAlex Elder  */
132f0f8cef5SAlex Elder struct rbd_request {
133f0f8cef5SAlex Elder 	struct request		*rq;		/* blk layer request */
134f0f8cef5SAlex Elder 	struct bio		*bio;		/* cloned bio */
135f0f8cef5SAlex Elder 	struct page		**pages;	/* list of used pages */
136f0f8cef5SAlex Elder 	u64			len;
137f0f8cef5SAlex Elder 	int			coll_index;
138f0f8cef5SAlex Elder 	struct rbd_req_coll	*coll;
139f0f8cef5SAlex Elder };
140f0f8cef5SAlex Elder 
141dfc5606dSYehuda Sadeh struct rbd_snap {
142dfc5606dSYehuda Sadeh 	struct	device		dev;
143dfc5606dSYehuda Sadeh 	const char		*name;
144dfc5606dSYehuda Sadeh 	size_t			size;
145dfc5606dSYehuda Sadeh 	struct list_head	node;
146dfc5606dSYehuda Sadeh 	u64			id;
147dfc5606dSYehuda Sadeh };
148dfc5606dSYehuda Sadeh 
149602adf40SYehuda Sadeh /*
150602adf40SYehuda Sadeh  * a single device
151602adf40SYehuda Sadeh  */
152602adf40SYehuda Sadeh struct rbd_device {
153602adf40SYehuda Sadeh 	int			id;		/* blkdev unique id */
154602adf40SYehuda Sadeh 
155602adf40SYehuda Sadeh 	int			major;		/* blkdev assigned major */
156602adf40SYehuda Sadeh 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
157602adf40SYehuda Sadeh 	struct request_queue	*q;
158602adf40SYehuda Sadeh 
159602adf40SYehuda Sadeh 	struct rbd_client	*rbd_client;
160602adf40SYehuda Sadeh 
161602adf40SYehuda Sadeh 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
162602adf40SYehuda Sadeh 
163602adf40SYehuda Sadeh 	spinlock_t		lock;		/* queue lock */
164602adf40SYehuda Sadeh 
165602adf40SYehuda Sadeh 	struct rbd_image_header	header;
166602adf40SYehuda Sadeh 	char			obj[RBD_MAX_OBJ_NAME_LEN]; /* rbd image name */
167602adf40SYehuda Sadeh 	int			obj_len;
168602adf40SYehuda Sadeh 	char			obj_md_name[RBD_MAX_MD_NAME_LEN]; /* hdr nm. */
169602adf40SYehuda Sadeh 	char			pool_name[RBD_MAX_POOL_NAME_LEN];
170602adf40SYehuda Sadeh 	int			poolid;
171602adf40SYehuda Sadeh 
17259c2be1eSYehuda Sadeh 	struct ceph_osd_event   *watch_event;
17359c2be1eSYehuda Sadeh 	struct ceph_osd_request *watch_request;
17459c2be1eSYehuda Sadeh 
175c666601aSJosh Durgin 	/* protects updating the header */
176c666601aSJosh Durgin 	struct rw_semaphore     header_rwsem;
177602adf40SYehuda Sadeh 	char                    snap_name[RBD_MAX_SNAP_NAME_LEN];
178602adf40SYehuda Sadeh 	u32 cur_snap;	/* index+1 of current snapshot within snap context
179602adf40SYehuda Sadeh 			   0 - for the head */
180602adf40SYehuda Sadeh 	int read_only;
181602adf40SYehuda Sadeh 
182602adf40SYehuda Sadeh 	struct list_head	node;
183dfc5606dSYehuda Sadeh 
184dfc5606dSYehuda Sadeh 	/* list of snapshots */
185dfc5606dSYehuda Sadeh 	struct list_head	snaps;
186dfc5606dSYehuda Sadeh 
187dfc5606dSYehuda Sadeh 	/* sysfs related */
188dfc5606dSYehuda Sadeh 	struct device		dev;
189dfc5606dSYehuda Sadeh };
190dfc5606dSYehuda Sadeh 
191602adf40SYehuda Sadeh static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */
192e124a82fSAlex Elder 
193602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list);    /* devices */
194e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock);
195e124a82fSAlex Elder 
196602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list);		/* clients */
197432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock);
198602adf40SYehuda Sadeh 
199dfc5606dSYehuda Sadeh static int __rbd_init_snaps_header(struct rbd_device *rbd_dev);
200dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev);
201dfc5606dSYehuda Sadeh static ssize_t rbd_snap_add(struct device *dev,
202dfc5606dSYehuda Sadeh 			    struct device_attribute *attr,
203dfc5606dSYehuda Sadeh 			    const char *buf,
204dfc5606dSYehuda Sadeh 			    size_t count);
205dfc5606dSYehuda Sadeh static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
20669932487SJustin P. Mattock 				  struct rbd_snap *snap);
207dfc5606dSYehuda Sadeh 
208f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf,
209f0f8cef5SAlex Elder 		       size_t count);
210f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
211f0f8cef5SAlex Elder 			  size_t count);
212f0f8cef5SAlex Elder 
213f0f8cef5SAlex Elder static struct bus_attribute rbd_bus_attrs[] = {
214f0f8cef5SAlex Elder 	__ATTR(add, S_IWUSR, NULL, rbd_add),
215f0f8cef5SAlex Elder 	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
216f0f8cef5SAlex Elder 	__ATTR_NULL
217f0f8cef5SAlex Elder };
218f0f8cef5SAlex Elder 
219f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = {
220f0f8cef5SAlex Elder 	.name		= "rbd",
221f0f8cef5SAlex Elder 	.bus_attrs	= rbd_bus_attrs,
222f0f8cef5SAlex Elder };
223f0f8cef5SAlex Elder 
/*
 * Release callback for rbd_root_dev.  Intentionally does nothing:
 * the root device is a static object, so there is nothing to free.
 */
static void rbd_root_dev_release(struct device *dev)
{
}
227f0f8cef5SAlex Elder 
228f0f8cef5SAlex Elder static struct device rbd_root_dev = {
229f0f8cef5SAlex Elder 	.init_name =    "rbd",
230f0f8cef5SAlex Elder 	.release =      rbd_root_dev_release,
231f0f8cef5SAlex Elder };
232f0f8cef5SAlex Elder 
233dfc5606dSYehuda Sadeh 
234dfc5606dSYehuda Sadeh static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
235dfc5606dSYehuda Sadeh {
236dfc5606dSYehuda Sadeh 	return get_device(&rbd_dev->dev);
237dfc5606dSYehuda Sadeh }
238dfc5606dSYehuda Sadeh 
239dfc5606dSYehuda Sadeh static void rbd_put_dev(struct rbd_device *rbd_dev)
240dfc5606dSYehuda Sadeh {
241dfc5606dSYehuda Sadeh 	put_device(&rbd_dev->dev);
242dfc5606dSYehuda Sadeh }
243602adf40SYehuda Sadeh 
24459c2be1eSYehuda Sadeh static int __rbd_update_snaps(struct rbd_device *rbd_dev);
24559c2be1eSYehuda Sadeh 
246602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
247602adf40SYehuda Sadeh {
248f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
249602adf40SYehuda Sadeh 
250dfc5606dSYehuda Sadeh 	rbd_get_dev(rbd_dev);
251dfc5606dSYehuda Sadeh 
252602adf40SYehuda Sadeh 	set_device_ro(bdev, rbd_dev->read_only);
253602adf40SYehuda Sadeh 
254602adf40SYehuda Sadeh 	if ((mode & FMODE_WRITE) && rbd_dev->read_only)
255602adf40SYehuda Sadeh 		return -EROFS;
256602adf40SYehuda Sadeh 
257602adf40SYehuda Sadeh 	return 0;
258602adf40SYehuda Sadeh }
259602adf40SYehuda Sadeh 
260dfc5606dSYehuda Sadeh static int rbd_release(struct gendisk *disk, fmode_t mode)
261dfc5606dSYehuda Sadeh {
262dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
263dfc5606dSYehuda Sadeh 
264dfc5606dSYehuda Sadeh 	rbd_put_dev(rbd_dev);
265dfc5606dSYehuda Sadeh 
266dfc5606dSYehuda Sadeh 	return 0;
267dfc5606dSYehuda Sadeh }
268dfc5606dSYehuda Sadeh 
269602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = {
270602adf40SYehuda Sadeh 	.owner			= THIS_MODULE,
271602adf40SYehuda Sadeh 	.open			= rbd_open,
272dfc5606dSYehuda Sadeh 	.release		= rbd_release,
273602adf40SYehuda Sadeh };
274602adf40SYehuda Sadeh 
275602adf40SYehuda Sadeh /*
276602adf40SYehuda Sadeh  * Initialize an rbd client instance.
277602adf40SYehuda Sadeh  * We own *opt.
278602adf40SYehuda Sadeh  */
27959c2be1eSYehuda Sadeh static struct rbd_client *rbd_client_create(struct ceph_options *opt,
28059c2be1eSYehuda Sadeh 					    struct rbd_options *rbd_opts)
281602adf40SYehuda Sadeh {
282602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
283602adf40SYehuda Sadeh 	int ret = -ENOMEM;
284602adf40SYehuda Sadeh 
285602adf40SYehuda Sadeh 	dout("rbd_client_create\n");
286602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
287602adf40SYehuda Sadeh 	if (!rbdc)
288602adf40SYehuda Sadeh 		goto out_opt;
289602adf40SYehuda Sadeh 
290602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
291602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
292602adf40SYehuda Sadeh 
293bc534d86SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
294bc534d86SAlex Elder 
2956ab00d46SSage Weil 	rbdc->client = ceph_create_client(opt, rbdc, 0, 0);
296602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
297bc534d86SAlex Elder 		goto out_mutex;
29828f259b7SVasiliy Kulikov 	opt = NULL; /* Now rbdc->client is responsible for opt */
299602adf40SYehuda Sadeh 
300602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
301602adf40SYehuda Sadeh 	if (ret < 0)
302602adf40SYehuda Sadeh 		goto out_err;
303602adf40SYehuda Sadeh 
30459c2be1eSYehuda Sadeh 	rbdc->rbd_opts = rbd_opts;
30559c2be1eSYehuda Sadeh 
306432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
307602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
308432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
309602adf40SYehuda Sadeh 
310bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
311bc534d86SAlex Elder 
312602adf40SYehuda Sadeh 	dout("rbd_client_create created %p\n", rbdc);
313602adf40SYehuda Sadeh 	return rbdc;
314602adf40SYehuda Sadeh 
315602adf40SYehuda Sadeh out_err:
316602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
317bc534d86SAlex Elder out_mutex:
318bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
319602adf40SYehuda Sadeh 	kfree(rbdc);
320602adf40SYehuda Sadeh out_opt:
32128f259b7SVasiliy Kulikov 	if (opt)
322602adf40SYehuda Sadeh 		ceph_destroy_options(opt);
32328f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
324602adf40SYehuda Sadeh }
325602adf40SYehuda Sadeh 
326602adf40SYehuda Sadeh /*
327602adf40SYehuda Sadeh  * Find a ceph client with specific addr and configuration.
328602adf40SYehuda Sadeh  */
329602adf40SYehuda Sadeh static struct rbd_client *__rbd_client_find(struct ceph_options *opt)
330602adf40SYehuda Sadeh {
331602adf40SYehuda Sadeh 	struct rbd_client *client_node;
332602adf40SYehuda Sadeh 
333602adf40SYehuda Sadeh 	if (opt->flags & CEPH_OPT_NOSHARE)
334602adf40SYehuda Sadeh 		return NULL;
335602adf40SYehuda Sadeh 
336602adf40SYehuda Sadeh 	list_for_each_entry(client_node, &rbd_client_list, node)
337602adf40SYehuda Sadeh 		if (ceph_compare_options(opt, client_node->client) == 0)
338602adf40SYehuda Sadeh 			return client_node;
339602adf40SYehuda Sadeh 	return NULL;
340602adf40SYehuda Sadeh }
341602adf40SYehuda Sadeh 
342602adf40SYehuda Sadeh /*
34359c2be1eSYehuda Sadeh  * mount options
34459c2be1eSYehuda Sadeh  */
34559c2be1eSYehuda Sadeh enum {
34659c2be1eSYehuda Sadeh 	Opt_notify_timeout,
34759c2be1eSYehuda Sadeh 	Opt_last_int,
34859c2be1eSYehuda Sadeh 	/* int args above */
34959c2be1eSYehuda Sadeh 	Opt_last_string,
35059c2be1eSYehuda Sadeh 	/* string args above */
35159c2be1eSYehuda Sadeh };
35259c2be1eSYehuda Sadeh 
35359c2be1eSYehuda Sadeh static match_table_t rbdopt_tokens = {
35459c2be1eSYehuda Sadeh 	{Opt_notify_timeout, "notify_timeout=%d"},
35559c2be1eSYehuda Sadeh 	/* int args above */
35659c2be1eSYehuda Sadeh 	/* string args above */
35759c2be1eSYehuda Sadeh 	{-1, NULL}
35859c2be1eSYehuda Sadeh };
35959c2be1eSYehuda Sadeh 
36059c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
36159c2be1eSYehuda Sadeh {
36259c2be1eSYehuda Sadeh 	struct rbd_options *rbdopt = private;
36359c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
36459c2be1eSYehuda Sadeh 	int token, intval, ret;
36559c2be1eSYehuda Sadeh 
36621079786SAlex Elder 	token = match_token(c, rbdopt_tokens, argstr);
36759c2be1eSYehuda Sadeh 	if (token < 0)
36859c2be1eSYehuda Sadeh 		return -EINVAL;
36959c2be1eSYehuda Sadeh 
37059c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
37159c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
37259c2be1eSYehuda Sadeh 		if (ret < 0) {
37359c2be1eSYehuda Sadeh 			pr_err("bad mount option arg (not int) "
37459c2be1eSYehuda Sadeh 			       "at '%s'\n", c);
37559c2be1eSYehuda Sadeh 			return ret;
37659c2be1eSYehuda Sadeh 		}
37759c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
37859c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
37959c2be1eSYehuda Sadeh 		dout("got string token %d val %s\n", token,
38059c2be1eSYehuda Sadeh 		     argstr[0].from);
38159c2be1eSYehuda Sadeh 	} else {
38259c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
38359c2be1eSYehuda Sadeh 	}
38459c2be1eSYehuda Sadeh 
38559c2be1eSYehuda Sadeh 	switch (token) {
38659c2be1eSYehuda Sadeh 	case Opt_notify_timeout:
38759c2be1eSYehuda Sadeh 		rbdopt->notify_timeout = intval;
38859c2be1eSYehuda Sadeh 		break;
38959c2be1eSYehuda Sadeh 	default:
39059c2be1eSYehuda Sadeh 		BUG_ON(token);
39159c2be1eSYehuda Sadeh 	}
39259c2be1eSYehuda Sadeh 	return 0;
39359c2be1eSYehuda Sadeh }
39459c2be1eSYehuda Sadeh 
39559c2be1eSYehuda Sadeh /*
396602adf40SYehuda Sadeh  * Get a ceph client with specific addr and configuration, if one does
397602adf40SYehuda Sadeh  * not exist create it.
398602adf40SYehuda Sadeh  */
3995214ecc4SAlex Elder static struct rbd_client *rbd_get_client(const char *mon_addr,
4005214ecc4SAlex Elder 					 size_t mon_addr_len,
4015214ecc4SAlex Elder 					 char *options)
402602adf40SYehuda Sadeh {
403602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
404602adf40SYehuda Sadeh 	struct ceph_options *opt;
40559c2be1eSYehuda Sadeh 	struct rbd_options *rbd_opts;
40659c2be1eSYehuda Sadeh 
40759c2be1eSYehuda Sadeh 	rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL);
40859c2be1eSYehuda Sadeh 	if (!rbd_opts)
409d720bcb0SAlex Elder 		return ERR_PTR(-ENOMEM);
41059c2be1eSYehuda Sadeh 
41159c2be1eSYehuda Sadeh 	rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT;
412602adf40SYehuda Sadeh 
413ee57741cSAlex Elder 	opt = ceph_parse_options(options, mon_addr,
4145214ecc4SAlex Elder 				mon_addr + mon_addr_len,
41521079786SAlex Elder 				parse_rbd_opts_token, rbd_opts);
416ee57741cSAlex Elder 	if (IS_ERR(opt)) {
417d720bcb0SAlex Elder 		kfree(rbd_opts);
418d720bcb0SAlex Elder 		return ERR_CAST(opt);
419ee57741cSAlex Elder 	}
420602adf40SYehuda Sadeh 
421432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
422602adf40SYehuda Sadeh 	rbdc = __rbd_client_find(opt);
423602adf40SYehuda Sadeh 	if (rbdc) {
424e6994d3dSAlex Elder 		/* using an existing client */
425e6994d3dSAlex Elder 		kref_get(&rbdc->kref);
426432b8587SAlex Elder 		spin_unlock(&rbd_client_list_lock);
427e6994d3dSAlex Elder 
428602adf40SYehuda Sadeh 		ceph_destroy_options(opt);
42997bb59a0SAlex Elder 		kfree(rbd_opts);
430602adf40SYehuda Sadeh 
431d720bcb0SAlex Elder 		return rbdc;
432602adf40SYehuda Sadeh 	}
433432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
434602adf40SYehuda Sadeh 
43559c2be1eSYehuda Sadeh 	rbdc = rbd_client_create(opt, rbd_opts);
436d97081b0SAlex Elder 
437d720bcb0SAlex Elder 	if (IS_ERR(rbdc))
43859c2be1eSYehuda Sadeh 		kfree(rbd_opts);
439d720bcb0SAlex Elder 
440d720bcb0SAlex Elder 	return rbdc;
441602adf40SYehuda Sadeh }
442602adf40SYehuda Sadeh 
443602adf40SYehuda Sadeh /*
444602adf40SYehuda Sadeh  * Destroy ceph client
445d23a4b3fSAlex Elder  *
446432b8587SAlex Elder  * Caller must hold rbd_client_list_lock.
447602adf40SYehuda Sadeh  */
448602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
449602adf40SYehuda Sadeh {
450602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
451602adf40SYehuda Sadeh 
452602adf40SYehuda Sadeh 	dout("rbd_release_client %p\n", rbdc);
453602adf40SYehuda Sadeh 	list_del(&rbdc->node);
454602adf40SYehuda Sadeh 
455602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
45659c2be1eSYehuda Sadeh 	kfree(rbdc->rbd_opts);
457602adf40SYehuda Sadeh 	kfree(rbdc);
458602adf40SYehuda Sadeh }
459602adf40SYehuda Sadeh 
460602adf40SYehuda Sadeh /*
461602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
462602adf40SYehuda Sadeh  * it.
463602adf40SYehuda Sadeh  */
464602adf40SYehuda Sadeh static void rbd_put_client(struct rbd_device *rbd_dev)
465602adf40SYehuda Sadeh {
466432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
467602adf40SYehuda Sadeh 	kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
468432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
469602adf40SYehuda Sadeh 	rbd_dev->rbd_client = NULL;
470602adf40SYehuda Sadeh }
471602adf40SYehuda Sadeh 
4721fec7093SYehuda Sadeh /*
4731fec7093SYehuda Sadeh  * Destroy requests collection
4741fec7093SYehuda Sadeh  */
4751fec7093SYehuda Sadeh static void rbd_coll_release(struct kref *kref)
4761fec7093SYehuda Sadeh {
4771fec7093SYehuda Sadeh 	struct rbd_req_coll *coll =
4781fec7093SYehuda Sadeh 		container_of(kref, struct rbd_req_coll, kref);
4791fec7093SYehuda Sadeh 
4801fec7093SYehuda Sadeh 	dout("rbd_coll_release %p\n", coll);
4811fec7093SYehuda Sadeh 	kfree(coll);
4821fec7093SYehuda Sadeh }
483602adf40SYehuda Sadeh 
484602adf40SYehuda Sadeh /*
485602adf40SYehuda Sadeh  * Create a new header structure, translate header format from the on-disk
486602adf40SYehuda Sadeh  * header.
487602adf40SYehuda Sadeh  */
488602adf40SYehuda Sadeh static int rbd_header_from_disk(struct rbd_image_header *header,
489602adf40SYehuda Sadeh 				 struct rbd_image_header_ondisk *ondisk,
490602adf40SYehuda Sadeh 				 int allocated_snaps,
491602adf40SYehuda Sadeh 				 gfp_t gfp_flags)
492602adf40SYehuda Sadeh {
493602adf40SYehuda Sadeh 	int i;
49400f1f36fSAlex Elder 	u32 snap_count;
495602adf40SYehuda Sadeh 
49621079786SAlex Elder 	if (memcmp(ondisk, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT)))
49781e759fbSJosh Durgin 		return -ENXIO;
49881e759fbSJosh Durgin 
49900f1f36fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
500602adf40SYehuda Sadeh 	header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
50121079786SAlex Elder 				snap_count * sizeof (*ondisk),
502602adf40SYehuda Sadeh 				gfp_flags);
503602adf40SYehuda Sadeh 	if (!header->snapc)
504602adf40SYehuda Sadeh 		return -ENOMEM;
50500f1f36fSAlex Elder 
50600f1f36fSAlex Elder 	header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
507602adf40SYehuda Sadeh 	if (snap_count) {
508602adf40SYehuda Sadeh 		header->snap_names = kmalloc(header->snap_names_len,
509602adf40SYehuda Sadeh 					     GFP_KERNEL);
510602adf40SYehuda Sadeh 		if (!header->snap_names)
511602adf40SYehuda Sadeh 			goto err_snapc;
512602adf40SYehuda Sadeh 		header->snap_sizes = kmalloc(snap_count * sizeof(u64),
513602adf40SYehuda Sadeh 					     GFP_KERNEL);
514602adf40SYehuda Sadeh 		if (!header->snap_sizes)
515602adf40SYehuda Sadeh 			goto err_names;
516602adf40SYehuda Sadeh 	} else {
517602adf40SYehuda Sadeh 		header->snap_names = NULL;
518602adf40SYehuda Sadeh 		header->snap_sizes = NULL;
519602adf40SYehuda Sadeh 	}
520602adf40SYehuda Sadeh 	memcpy(header->block_name, ondisk->block_name,
521602adf40SYehuda Sadeh 	       sizeof(ondisk->block_name));
522602adf40SYehuda Sadeh 
523602adf40SYehuda Sadeh 	header->image_size = le64_to_cpu(ondisk->image_size);
524602adf40SYehuda Sadeh 	header->obj_order = ondisk->options.order;
525602adf40SYehuda Sadeh 	header->crypt_type = ondisk->options.crypt_type;
526602adf40SYehuda Sadeh 	header->comp_type = ondisk->options.comp_type;
527602adf40SYehuda Sadeh 
528602adf40SYehuda Sadeh 	atomic_set(&header->snapc->nref, 1);
529602adf40SYehuda Sadeh 	header->snap_seq = le64_to_cpu(ondisk->snap_seq);
530602adf40SYehuda Sadeh 	header->snapc->num_snaps = snap_count;
531602adf40SYehuda Sadeh 	header->total_snaps = snap_count;
532602adf40SYehuda Sadeh 
53321079786SAlex Elder 	if (snap_count && allocated_snaps == snap_count) {
534602adf40SYehuda Sadeh 		for (i = 0; i < snap_count; i++) {
535602adf40SYehuda Sadeh 			header->snapc->snaps[i] =
536602adf40SYehuda Sadeh 				le64_to_cpu(ondisk->snaps[i].id);
537602adf40SYehuda Sadeh 			header->snap_sizes[i] =
538602adf40SYehuda Sadeh 				le64_to_cpu(ondisk->snaps[i].image_size);
539602adf40SYehuda Sadeh 		}
540602adf40SYehuda Sadeh 
541602adf40SYehuda Sadeh 		/* copy snapshot names */
542602adf40SYehuda Sadeh 		memcpy(header->snap_names, &ondisk->snaps[i],
543602adf40SYehuda Sadeh 			header->snap_names_len);
544602adf40SYehuda Sadeh 	}
545602adf40SYehuda Sadeh 
546602adf40SYehuda Sadeh 	return 0;
547602adf40SYehuda Sadeh 
548602adf40SYehuda Sadeh err_names:
549602adf40SYehuda Sadeh 	kfree(header->snap_names);
550602adf40SYehuda Sadeh err_snapc:
551602adf40SYehuda Sadeh 	kfree(header->snapc);
55200f1f36fSAlex Elder 	return -ENOMEM;
553602adf40SYehuda Sadeh }
554602adf40SYehuda Sadeh 
555602adf40SYehuda Sadeh static int snap_index(struct rbd_image_header *header, int snap_num)
556602adf40SYehuda Sadeh {
557602adf40SYehuda Sadeh 	return header->total_snaps - snap_num;
558602adf40SYehuda Sadeh }
559602adf40SYehuda Sadeh 
560602adf40SYehuda Sadeh static u64 cur_snap_id(struct rbd_device *rbd_dev)
561602adf40SYehuda Sadeh {
562602adf40SYehuda Sadeh 	struct rbd_image_header *header = &rbd_dev->header;
563602adf40SYehuda Sadeh 
564602adf40SYehuda Sadeh 	if (!rbd_dev->cur_snap)
565602adf40SYehuda Sadeh 		return 0;
566602adf40SYehuda Sadeh 
567602adf40SYehuda Sadeh 	return header->snapc->snaps[snap_index(header, rbd_dev->cur_snap)];
568602adf40SYehuda Sadeh }
569602adf40SYehuda Sadeh 
570602adf40SYehuda Sadeh static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
571602adf40SYehuda Sadeh 			u64 *seq, u64 *size)
572602adf40SYehuda Sadeh {
573602adf40SYehuda Sadeh 	int i;
574602adf40SYehuda Sadeh 	char *p = header->snap_names;
575602adf40SYehuda Sadeh 
57600f1f36fSAlex Elder 	for (i = 0; i < header->total_snaps; i++) {
57700f1f36fSAlex Elder 		if (!strcmp(snap_name, p)) {
57800f1f36fSAlex Elder 
57900f1f36fSAlex Elder 			/* Found it.  Pass back its id and/or size */
58000f1f36fSAlex Elder 
581602adf40SYehuda Sadeh 			if (seq)
582602adf40SYehuda Sadeh 				*seq = header->snapc->snaps[i];
583602adf40SYehuda Sadeh 			if (size)
584602adf40SYehuda Sadeh 				*size = header->snap_sizes[i];
585602adf40SYehuda Sadeh 			return i;
586602adf40SYehuda Sadeh 		}
58700f1f36fSAlex Elder 		p += strlen(p) + 1;	/* Skip ahead to the next name */
58800f1f36fSAlex Elder 	}
58900f1f36fSAlex Elder 	return -ENOENT;
59000f1f36fSAlex Elder }
591602adf40SYehuda Sadeh 
592cc9d734cSJosh Durgin static int rbd_header_set_snap(struct rbd_device *dev, u64 *size)
593602adf40SYehuda Sadeh {
594602adf40SYehuda Sadeh 	struct rbd_image_header *header = &dev->header;
595602adf40SYehuda Sadeh 	struct ceph_snap_context *snapc = header->snapc;
596602adf40SYehuda Sadeh 	int ret = -ENOENT;
597602adf40SYehuda Sadeh 
598cc9d734cSJosh Durgin 	BUILD_BUG_ON(sizeof (dev->snap_name) < sizeof (RBD_SNAP_HEAD_NAME));
599cc9d734cSJosh Durgin 
600c666601aSJosh Durgin 	down_write(&dev->header_rwsem);
601602adf40SYehuda Sadeh 
602cc9d734cSJosh Durgin 	if (!memcmp(dev->snap_name, RBD_SNAP_HEAD_NAME,
603cc9d734cSJosh Durgin 		    sizeof (RBD_SNAP_HEAD_NAME))) {
604602adf40SYehuda Sadeh 		if (header->total_snaps)
605602adf40SYehuda Sadeh 			snapc->seq = header->snap_seq;
606602adf40SYehuda Sadeh 		else
607602adf40SYehuda Sadeh 			snapc->seq = 0;
608602adf40SYehuda Sadeh 		dev->cur_snap = 0;
609602adf40SYehuda Sadeh 		dev->read_only = 0;
610602adf40SYehuda Sadeh 		if (size)
611602adf40SYehuda Sadeh 			*size = header->image_size;
612602adf40SYehuda Sadeh 	} else {
613cc9d734cSJosh Durgin 		ret = snap_by_name(header, dev->snap_name, &snapc->seq, size);
614602adf40SYehuda Sadeh 		if (ret < 0)
615602adf40SYehuda Sadeh 			goto done;
616602adf40SYehuda Sadeh 
617602adf40SYehuda Sadeh 		dev->cur_snap = header->total_snaps - ret;
618602adf40SYehuda Sadeh 		dev->read_only = 1;
619602adf40SYehuda Sadeh 	}
620602adf40SYehuda Sadeh 
621602adf40SYehuda Sadeh 	ret = 0;
622602adf40SYehuda Sadeh done:
623c666601aSJosh Durgin 	up_write(&dev->header_rwsem);
624602adf40SYehuda Sadeh 	return ret;
625602adf40SYehuda Sadeh }
626602adf40SYehuda Sadeh 
627602adf40SYehuda Sadeh static void rbd_header_free(struct rbd_image_header *header)
628602adf40SYehuda Sadeh {
629602adf40SYehuda Sadeh 	kfree(header->snapc);
630602adf40SYehuda Sadeh 	kfree(header->snap_names);
631602adf40SYehuda Sadeh 	kfree(header->snap_sizes);
632602adf40SYehuda Sadeh }
633602adf40SYehuda Sadeh 
634602adf40SYehuda Sadeh /*
635602adf40SYehuda Sadeh  * get the actual striped segment name, offset and length
636602adf40SYehuda Sadeh  */
637602adf40SYehuda Sadeh static u64 rbd_get_segment(struct rbd_image_header *header,
638602adf40SYehuda Sadeh 			   const char *block_name,
639602adf40SYehuda Sadeh 			   u64 ofs, u64 len,
640602adf40SYehuda Sadeh 			   char *seg_name, u64 *segofs)
641602adf40SYehuda Sadeh {
642602adf40SYehuda Sadeh 	u64 seg = ofs >> header->obj_order;
643602adf40SYehuda Sadeh 
644602adf40SYehuda Sadeh 	if (seg_name)
645602adf40SYehuda Sadeh 		snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
646602adf40SYehuda Sadeh 			 "%s.%012llx", block_name, seg);
647602adf40SYehuda Sadeh 
648602adf40SYehuda Sadeh 	ofs = ofs & ((1 << header->obj_order) - 1);
649602adf40SYehuda Sadeh 	len = min_t(u64, len, (1 << header->obj_order) - ofs);
650602adf40SYehuda Sadeh 
651602adf40SYehuda Sadeh 	if (segofs)
652602adf40SYehuda Sadeh 		*segofs = ofs;
653602adf40SYehuda Sadeh 
654602adf40SYehuda Sadeh 	return len;
655602adf40SYehuda Sadeh }
656602adf40SYehuda Sadeh 
6571fec7093SYehuda Sadeh static int rbd_get_num_segments(struct rbd_image_header *header,
6581fec7093SYehuda Sadeh 				u64 ofs, u64 len)
6591fec7093SYehuda Sadeh {
6601fec7093SYehuda Sadeh 	u64 start_seg = ofs >> header->obj_order;
6611fec7093SYehuda Sadeh 	u64 end_seg = (ofs + len - 1) >> header->obj_order;
6621fec7093SYehuda Sadeh 	return end_seg - start_seg + 1;
6631fec7093SYehuda Sadeh }
6641fec7093SYehuda Sadeh 
665602adf40SYehuda Sadeh /*
666029bcbd8SJosh Durgin  * returns the size of an object in the image
667029bcbd8SJosh Durgin  */
668029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header)
669029bcbd8SJosh Durgin {
670029bcbd8SJosh Durgin 	return 1 << header->obj_order;
671029bcbd8SJosh Durgin }
672029bcbd8SJosh Durgin 
673029bcbd8SJosh Durgin /*
674602adf40SYehuda Sadeh  * bio helpers
675602adf40SYehuda Sadeh  */
676602adf40SYehuda Sadeh 
677602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain)
678602adf40SYehuda Sadeh {
679602adf40SYehuda Sadeh 	struct bio *tmp;
680602adf40SYehuda Sadeh 
681602adf40SYehuda Sadeh 	while (chain) {
682602adf40SYehuda Sadeh 		tmp = chain;
683602adf40SYehuda Sadeh 		chain = chain->bi_next;
684602adf40SYehuda Sadeh 		bio_put(tmp);
685602adf40SYehuda Sadeh 	}
686602adf40SYehuda Sadeh }
687602adf40SYehuda Sadeh 
688602adf40SYehuda Sadeh /*
689602adf40SYehuda Sadeh  * zeros a bio chain, starting at specific offset
690602adf40SYehuda Sadeh  */
691602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs)
692602adf40SYehuda Sadeh {
693602adf40SYehuda Sadeh 	struct bio_vec *bv;
694602adf40SYehuda Sadeh 	unsigned long flags;
695602adf40SYehuda Sadeh 	void *buf;
696602adf40SYehuda Sadeh 	int i;
697602adf40SYehuda Sadeh 	int pos = 0;
698602adf40SYehuda Sadeh 
699602adf40SYehuda Sadeh 	while (chain) {
700602adf40SYehuda Sadeh 		bio_for_each_segment(bv, chain, i) {
701602adf40SYehuda Sadeh 			if (pos + bv->bv_len > start_ofs) {
702602adf40SYehuda Sadeh 				int remainder = max(start_ofs - pos, 0);
703602adf40SYehuda Sadeh 				buf = bvec_kmap_irq(bv, &flags);
704602adf40SYehuda Sadeh 				memset(buf + remainder, 0,
705602adf40SYehuda Sadeh 				       bv->bv_len - remainder);
70685b5aaa6SDan Carpenter 				bvec_kunmap_irq(buf, &flags);
707602adf40SYehuda Sadeh 			}
708602adf40SYehuda Sadeh 			pos += bv->bv_len;
709602adf40SYehuda Sadeh 		}
710602adf40SYehuda Sadeh 
711602adf40SYehuda Sadeh 		chain = chain->bi_next;
712602adf40SYehuda Sadeh 	}
713602adf40SYehuda Sadeh }
714602adf40SYehuda Sadeh 
715602adf40SYehuda Sadeh /*
716602adf40SYehuda Sadeh  * bio_chain_clone - clone a chain of bios up to a certain length.
717602adf40SYehuda Sadeh  * might return a bio_pair that will need to be released.
718602adf40SYehuda Sadeh  */
719602adf40SYehuda Sadeh static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
720602adf40SYehuda Sadeh 				   struct bio_pair **bp,
721602adf40SYehuda Sadeh 				   int len, gfp_t gfpmask)
722602adf40SYehuda Sadeh {
723602adf40SYehuda Sadeh 	struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
724602adf40SYehuda Sadeh 	int total = 0;
725602adf40SYehuda Sadeh 
726602adf40SYehuda Sadeh 	if (*bp) {
727602adf40SYehuda Sadeh 		bio_pair_release(*bp);
728602adf40SYehuda Sadeh 		*bp = NULL;
729602adf40SYehuda Sadeh 	}
730602adf40SYehuda Sadeh 
731602adf40SYehuda Sadeh 	while (old_chain && (total < len)) {
732602adf40SYehuda Sadeh 		tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
733602adf40SYehuda Sadeh 		if (!tmp)
734602adf40SYehuda Sadeh 			goto err_out;
735602adf40SYehuda Sadeh 
736602adf40SYehuda Sadeh 		if (total + old_chain->bi_size > len) {
737602adf40SYehuda Sadeh 			struct bio_pair *bp;
738602adf40SYehuda Sadeh 
739602adf40SYehuda Sadeh 			/*
740602adf40SYehuda Sadeh 			 * this split can only happen with a single paged bio,
741602adf40SYehuda Sadeh 			 * split_bio will BUG_ON if this is not the case
742602adf40SYehuda Sadeh 			 */
743602adf40SYehuda Sadeh 			dout("bio_chain_clone split! total=%d remaining=%d"
744602adf40SYehuda Sadeh 			     "bi_size=%d\n",
745602adf40SYehuda Sadeh 			     (int)total, (int)len-total,
746602adf40SYehuda Sadeh 			     (int)old_chain->bi_size);
747602adf40SYehuda Sadeh 
748602adf40SYehuda Sadeh 			/* split the bio. We'll release it either in the next
749602adf40SYehuda Sadeh 			   call, or it will have to be released outside */
750593a9e7bSAlex Elder 			bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
751602adf40SYehuda Sadeh 			if (!bp)
752602adf40SYehuda Sadeh 				goto err_out;
753602adf40SYehuda Sadeh 
754602adf40SYehuda Sadeh 			__bio_clone(tmp, &bp->bio1);
755602adf40SYehuda Sadeh 
756602adf40SYehuda Sadeh 			*next = &bp->bio2;
757602adf40SYehuda Sadeh 		} else {
758602adf40SYehuda Sadeh 			__bio_clone(tmp, old_chain);
759602adf40SYehuda Sadeh 			*next = old_chain->bi_next;
760602adf40SYehuda Sadeh 		}
761602adf40SYehuda Sadeh 
762602adf40SYehuda Sadeh 		tmp->bi_bdev = NULL;
763602adf40SYehuda Sadeh 		gfpmask &= ~__GFP_WAIT;
764602adf40SYehuda Sadeh 		tmp->bi_next = NULL;
765602adf40SYehuda Sadeh 
766602adf40SYehuda Sadeh 		if (!new_chain) {
767602adf40SYehuda Sadeh 			new_chain = tail = tmp;
768602adf40SYehuda Sadeh 		} else {
769602adf40SYehuda Sadeh 			tail->bi_next = tmp;
770602adf40SYehuda Sadeh 			tail = tmp;
771602adf40SYehuda Sadeh 		}
772602adf40SYehuda Sadeh 		old_chain = old_chain->bi_next;
773602adf40SYehuda Sadeh 
774602adf40SYehuda Sadeh 		total += tmp->bi_size;
775602adf40SYehuda Sadeh 	}
776602adf40SYehuda Sadeh 
777602adf40SYehuda Sadeh 	BUG_ON(total < len);
778602adf40SYehuda Sadeh 
779602adf40SYehuda Sadeh 	if (tail)
780602adf40SYehuda Sadeh 		tail->bi_next = NULL;
781602adf40SYehuda Sadeh 
782602adf40SYehuda Sadeh 	*old = old_chain;
783602adf40SYehuda Sadeh 
784602adf40SYehuda Sadeh 	return new_chain;
785602adf40SYehuda Sadeh 
786602adf40SYehuda Sadeh err_out:
787602adf40SYehuda Sadeh 	dout("bio_chain_clone with err\n");
788602adf40SYehuda Sadeh 	bio_chain_put(new_chain);
789602adf40SYehuda Sadeh 	return NULL;
790602adf40SYehuda Sadeh }
791602adf40SYehuda Sadeh 
792602adf40SYehuda Sadeh /*
793602adf40SYehuda Sadeh  * helpers for osd request op vectors.
794602adf40SYehuda Sadeh  */
795602adf40SYehuda Sadeh static int rbd_create_rw_ops(struct ceph_osd_req_op **ops,
796602adf40SYehuda Sadeh 			    int num_ops,
797602adf40SYehuda Sadeh 			    int opcode,
798602adf40SYehuda Sadeh 			    u32 payload_len)
799602adf40SYehuda Sadeh {
800602adf40SYehuda Sadeh 	*ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1),
801602adf40SYehuda Sadeh 		       GFP_NOIO);
802602adf40SYehuda Sadeh 	if (!*ops)
803602adf40SYehuda Sadeh 		return -ENOMEM;
804602adf40SYehuda Sadeh 	(*ops)[0].op = opcode;
805602adf40SYehuda Sadeh 	/*
806602adf40SYehuda Sadeh 	 * op extent offset and length will be set later on
807602adf40SYehuda Sadeh 	 * in calc_raw_layout()
808602adf40SYehuda Sadeh 	 */
809602adf40SYehuda Sadeh 	(*ops)[0].payload_len = payload_len;
810602adf40SYehuda Sadeh 	return 0;
811602adf40SYehuda Sadeh }
812602adf40SYehuda Sadeh 
/* Free an op vector obtained from rbd_create_rw_ops(). */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
817602adf40SYehuda Sadeh 
8181fec7093SYehuda Sadeh static void rbd_coll_end_req_index(struct request *rq,
8191fec7093SYehuda Sadeh 				   struct rbd_req_coll *coll,
8201fec7093SYehuda Sadeh 				   int index,
8211fec7093SYehuda Sadeh 				   int ret, u64 len)
8221fec7093SYehuda Sadeh {
8231fec7093SYehuda Sadeh 	struct request_queue *q;
8241fec7093SYehuda Sadeh 	int min, max, i;
8251fec7093SYehuda Sadeh 
8261fec7093SYehuda Sadeh 	dout("rbd_coll_end_req_index %p index %d ret %d len %lld\n",
8271fec7093SYehuda Sadeh 	     coll, index, ret, len);
8281fec7093SYehuda Sadeh 
8291fec7093SYehuda Sadeh 	if (!rq)
8301fec7093SYehuda Sadeh 		return;
8311fec7093SYehuda Sadeh 
8321fec7093SYehuda Sadeh 	if (!coll) {
8331fec7093SYehuda Sadeh 		blk_end_request(rq, ret, len);
8341fec7093SYehuda Sadeh 		return;
8351fec7093SYehuda Sadeh 	}
8361fec7093SYehuda Sadeh 
8371fec7093SYehuda Sadeh 	q = rq->q;
8381fec7093SYehuda Sadeh 
8391fec7093SYehuda Sadeh 	spin_lock_irq(q->queue_lock);
8401fec7093SYehuda Sadeh 	coll->status[index].done = 1;
8411fec7093SYehuda Sadeh 	coll->status[index].rc = ret;
8421fec7093SYehuda Sadeh 	coll->status[index].bytes = len;
8431fec7093SYehuda Sadeh 	max = min = coll->num_done;
8441fec7093SYehuda Sadeh 	while (max < coll->total && coll->status[max].done)
8451fec7093SYehuda Sadeh 		max++;
8461fec7093SYehuda Sadeh 
8471fec7093SYehuda Sadeh 	for (i = min; i<max; i++) {
8481fec7093SYehuda Sadeh 		__blk_end_request(rq, coll->status[i].rc,
8491fec7093SYehuda Sadeh 				  coll->status[i].bytes);
8501fec7093SYehuda Sadeh 		coll->num_done++;
8511fec7093SYehuda Sadeh 		kref_put(&coll->kref, rbd_coll_release);
8521fec7093SYehuda Sadeh 	}
8531fec7093SYehuda Sadeh 	spin_unlock_irq(q->queue_lock);
8541fec7093SYehuda Sadeh }
8551fec7093SYehuda Sadeh 
8561fec7093SYehuda Sadeh static void rbd_coll_end_req(struct rbd_request *req,
8571fec7093SYehuda Sadeh 			     int ret, u64 len)
8581fec7093SYehuda Sadeh {
8591fec7093SYehuda Sadeh 	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
8601fec7093SYehuda Sadeh }
8611fec7093SYehuda Sadeh 
862602adf40SYehuda Sadeh /*
863602adf40SYehuda Sadeh  * Send ceph osd request
864602adf40SYehuda Sadeh  */
865602adf40SYehuda Sadeh static int rbd_do_request(struct request *rq,
866602adf40SYehuda Sadeh 			  struct rbd_device *dev,
867602adf40SYehuda Sadeh 			  struct ceph_snap_context *snapc,
868602adf40SYehuda Sadeh 			  u64 snapid,
869602adf40SYehuda Sadeh 			  const char *obj, u64 ofs, u64 len,
870602adf40SYehuda Sadeh 			  struct bio *bio,
871602adf40SYehuda Sadeh 			  struct page **pages,
872602adf40SYehuda Sadeh 			  int num_pages,
873602adf40SYehuda Sadeh 			  int flags,
874602adf40SYehuda Sadeh 			  struct ceph_osd_req_op *ops,
875602adf40SYehuda Sadeh 			  int num_reply,
8761fec7093SYehuda Sadeh 			  struct rbd_req_coll *coll,
8771fec7093SYehuda Sadeh 			  int coll_index,
878602adf40SYehuda Sadeh 			  void (*rbd_cb)(struct ceph_osd_request *req,
87959c2be1eSYehuda Sadeh 					 struct ceph_msg *msg),
88059c2be1eSYehuda Sadeh 			  struct ceph_osd_request **linger_req,
88159c2be1eSYehuda Sadeh 			  u64 *ver)
882602adf40SYehuda Sadeh {
883602adf40SYehuda Sadeh 	struct ceph_osd_request *req;
884602adf40SYehuda Sadeh 	struct ceph_file_layout *layout;
885602adf40SYehuda Sadeh 	int ret;
886602adf40SYehuda Sadeh 	u64 bno;
887602adf40SYehuda Sadeh 	struct timespec mtime = CURRENT_TIME;
888602adf40SYehuda Sadeh 	struct rbd_request *req_data;
889602adf40SYehuda Sadeh 	struct ceph_osd_request_head *reqhead;
8901dbb4399SAlex Elder 	struct ceph_osd_client *osdc;
891602adf40SYehuda Sadeh 
892602adf40SYehuda Sadeh 	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
8931fec7093SYehuda Sadeh 	if (!req_data) {
8941fec7093SYehuda Sadeh 		if (coll)
8951fec7093SYehuda Sadeh 			rbd_coll_end_req_index(rq, coll, coll_index,
8961fec7093SYehuda Sadeh 					       -ENOMEM, len);
8971fec7093SYehuda Sadeh 		return -ENOMEM;
8981fec7093SYehuda Sadeh 	}
899602adf40SYehuda Sadeh 
9001fec7093SYehuda Sadeh 	if (coll) {
9011fec7093SYehuda Sadeh 		req_data->coll = coll;
9021fec7093SYehuda Sadeh 		req_data->coll_index = coll_index;
9031fec7093SYehuda Sadeh 	}
9041fec7093SYehuda Sadeh 
9051fec7093SYehuda Sadeh 	dout("rbd_do_request obj=%s ofs=%lld len=%lld\n", obj, len, ofs);
906602adf40SYehuda Sadeh 
907c666601aSJosh Durgin 	down_read(&dev->header_rwsem);
908602adf40SYehuda Sadeh 
9091dbb4399SAlex Elder 	osdc = &dev->rbd_client->client->osdc;
9101dbb4399SAlex Elder 	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
9111dbb4399SAlex Elder 					false, GFP_NOIO, pages, bio);
9124ad12621SSage Weil 	if (!req) {
913c666601aSJosh Durgin 		up_read(&dev->header_rwsem);
9144ad12621SSage Weil 		ret = -ENOMEM;
915602adf40SYehuda Sadeh 		goto done_pages;
916602adf40SYehuda Sadeh 	}
917602adf40SYehuda Sadeh 
918602adf40SYehuda Sadeh 	req->r_callback = rbd_cb;
919602adf40SYehuda Sadeh 
920602adf40SYehuda Sadeh 	req_data->rq = rq;
921602adf40SYehuda Sadeh 	req_data->bio = bio;
922602adf40SYehuda Sadeh 	req_data->pages = pages;
923602adf40SYehuda Sadeh 	req_data->len = len;
924602adf40SYehuda Sadeh 
925602adf40SYehuda Sadeh 	req->r_priv = req_data;
926602adf40SYehuda Sadeh 
927602adf40SYehuda Sadeh 	reqhead = req->r_request->front.iov_base;
928602adf40SYehuda Sadeh 	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
929602adf40SYehuda Sadeh 
930602adf40SYehuda Sadeh 	strncpy(req->r_oid, obj, sizeof(req->r_oid));
931602adf40SYehuda Sadeh 	req->r_oid_len = strlen(req->r_oid);
932602adf40SYehuda Sadeh 
933602adf40SYehuda Sadeh 	layout = &req->r_file_layout;
934602adf40SYehuda Sadeh 	memset(layout, 0, sizeof(*layout));
935602adf40SYehuda Sadeh 	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
936602adf40SYehuda Sadeh 	layout->fl_stripe_count = cpu_to_le32(1);
937602adf40SYehuda Sadeh 	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
938602adf40SYehuda Sadeh 	layout->fl_pg_preferred = cpu_to_le32(-1);
939602adf40SYehuda Sadeh 	layout->fl_pg_pool = cpu_to_le32(dev->poolid);
9401dbb4399SAlex Elder 	ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
9411dbb4399SAlex Elder 				req, ops);
942602adf40SYehuda Sadeh 
943602adf40SYehuda Sadeh 	ceph_osdc_build_request(req, ofs, &len,
944602adf40SYehuda Sadeh 				ops,
945602adf40SYehuda Sadeh 				snapc,
946602adf40SYehuda Sadeh 				&mtime,
947602adf40SYehuda Sadeh 				req->r_oid, req->r_oid_len);
948c666601aSJosh Durgin 	up_read(&dev->header_rwsem);
949602adf40SYehuda Sadeh 
95059c2be1eSYehuda Sadeh 	if (linger_req) {
9511dbb4399SAlex Elder 		ceph_osdc_set_request_linger(osdc, req);
95259c2be1eSYehuda Sadeh 		*linger_req = req;
95359c2be1eSYehuda Sadeh 	}
95459c2be1eSYehuda Sadeh 
9551dbb4399SAlex Elder 	ret = ceph_osdc_start_request(osdc, req, false);
956602adf40SYehuda Sadeh 	if (ret < 0)
957602adf40SYehuda Sadeh 		goto done_err;
958602adf40SYehuda Sadeh 
959602adf40SYehuda Sadeh 	if (!rbd_cb) {
9601dbb4399SAlex Elder 		ret = ceph_osdc_wait_request(osdc, req);
96159c2be1eSYehuda Sadeh 		if (ver)
96259c2be1eSYehuda Sadeh 			*ver = le64_to_cpu(req->r_reassert_version.version);
9631fec7093SYehuda Sadeh 		dout("reassert_ver=%lld\n",
9641fec7093SYehuda Sadeh 		     le64_to_cpu(req->r_reassert_version.version));
965602adf40SYehuda Sadeh 		ceph_osdc_put_request(req);
966602adf40SYehuda Sadeh 	}
967602adf40SYehuda Sadeh 	return ret;
968602adf40SYehuda Sadeh 
969602adf40SYehuda Sadeh done_err:
970602adf40SYehuda Sadeh 	bio_chain_put(req_data->bio);
971602adf40SYehuda Sadeh 	ceph_osdc_put_request(req);
972602adf40SYehuda Sadeh done_pages:
9731fec7093SYehuda Sadeh 	rbd_coll_end_req(req_data, ret, len);
974602adf40SYehuda Sadeh 	kfree(req_data);
975602adf40SYehuda Sadeh 	return ret;
976602adf40SYehuda Sadeh }
977602adf40SYehuda Sadeh 
978602adf40SYehuda Sadeh /*
979602adf40SYehuda Sadeh  * Ceph osd op callback
980602adf40SYehuda Sadeh  */
981602adf40SYehuda Sadeh static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
982602adf40SYehuda Sadeh {
983602adf40SYehuda Sadeh 	struct rbd_request *req_data = req->r_priv;
984602adf40SYehuda Sadeh 	struct ceph_osd_reply_head *replyhead;
985602adf40SYehuda Sadeh 	struct ceph_osd_op *op;
986602adf40SYehuda Sadeh 	__s32 rc;
987602adf40SYehuda Sadeh 	u64 bytes;
988602adf40SYehuda Sadeh 	int read_op;
989602adf40SYehuda Sadeh 
990602adf40SYehuda Sadeh 	/* parse reply */
991602adf40SYehuda Sadeh 	replyhead = msg->front.iov_base;
992602adf40SYehuda Sadeh 	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
993602adf40SYehuda Sadeh 	op = (void *)(replyhead + 1);
994602adf40SYehuda Sadeh 	rc = le32_to_cpu(replyhead->result);
995602adf40SYehuda Sadeh 	bytes = le64_to_cpu(op->extent.length);
996602adf40SYehuda Sadeh 	read_op = (le32_to_cpu(op->op) == CEPH_OSD_OP_READ);
997602adf40SYehuda Sadeh 
998602adf40SYehuda Sadeh 	dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc);
999602adf40SYehuda Sadeh 
1000602adf40SYehuda Sadeh 	if (rc == -ENOENT && read_op) {
1001602adf40SYehuda Sadeh 		zero_bio_chain(req_data->bio, 0);
1002602adf40SYehuda Sadeh 		rc = 0;
1003602adf40SYehuda Sadeh 	} else if (rc == 0 && read_op && bytes < req_data->len) {
1004602adf40SYehuda Sadeh 		zero_bio_chain(req_data->bio, bytes);
1005602adf40SYehuda Sadeh 		bytes = req_data->len;
1006602adf40SYehuda Sadeh 	}
1007602adf40SYehuda Sadeh 
10081fec7093SYehuda Sadeh 	rbd_coll_end_req(req_data, rc, bytes);
1009602adf40SYehuda Sadeh 
1010602adf40SYehuda Sadeh 	if (req_data->bio)
1011602adf40SYehuda Sadeh 		bio_chain_put(req_data->bio);
1012602adf40SYehuda Sadeh 
1013602adf40SYehuda Sadeh 	ceph_osdc_put_request(req);
1014602adf40SYehuda Sadeh 	kfree(req_data);
1015602adf40SYehuda Sadeh }
1016602adf40SYehuda Sadeh 
101759c2be1eSYehuda Sadeh static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
101859c2be1eSYehuda Sadeh {
101959c2be1eSYehuda Sadeh 	ceph_osdc_put_request(req);
102059c2be1eSYehuda Sadeh }
102159c2be1eSYehuda Sadeh 
1022602adf40SYehuda Sadeh /*
1023602adf40SYehuda Sadeh  * Do a synchronous ceph osd operation
1024602adf40SYehuda Sadeh  */
1025602adf40SYehuda Sadeh static int rbd_req_sync_op(struct rbd_device *dev,
1026602adf40SYehuda Sadeh 			   struct ceph_snap_context *snapc,
1027602adf40SYehuda Sadeh 			   u64 snapid,
1028602adf40SYehuda Sadeh 			   int opcode,
1029602adf40SYehuda Sadeh 			   int flags,
1030602adf40SYehuda Sadeh 			   struct ceph_osd_req_op *orig_ops,
1031602adf40SYehuda Sadeh 			   int num_reply,
1032602adf40SYehuda Sadeh 			   const char *obj,
1033602adf40SYehuda Sadeh 			   u64 ofs, u64 len,
103459c2be1eSYehuda Sadeh 			   char *buf,
103559c2be1eSYehuda Sadeh 			   struct ceph_osd_request **linger_req,
103659c2be1eSYehuda Sadeh 			   u64 *ver)
1037602adf40SYehuda Sadeh {
1038602adf40SYehuda Sadeh 	int ret;
1039602adf40SYehuda Sadeh 	struct page **pages;
1040602adf40SYehuda Sadeh 	int num_pages;
1041602adf40SYehuda Sadeh 	struct ceph_osd_req_op *ops = orig_ops;
1042602adf40SYehuda Sadeh 	u32 payload_len;
1043602adf40SYehuda Sadeh 
1044602adf40SYehuda Sadeh 	num_pages = calc_pages_for(ofs , len);
1045602adf40SYehuda Sadeh 	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
1046b8d0638aSDan Carpenter 	if (IS_ERR(pages))
1047b8d0638aSDan Carpenter 		return PTR_ERR(pages);
1048602adf40SYehuda Sadeh 
1049602adf40SYehuda Sadeh 	if (!orig_ops) {
1050602adf40SYehuda Sadeh 		payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0);
1051602adf40SYehuda Sadeh 		ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
1052602adf40SYehuda Sadeh 		if (ret < 0)
1053602adf40SYehuda Sadeh 			goto done;
1054602adf40SYehuda Sadeh 
1055602adf40SYehuda Sadeh 		if ((flags & CEPH_OSD_FLAG_WRITE) && buf) {
1056602adf40SYehuda Sadeh 			ret = ceph_copy_to_page_vector(pages, buf, ofs, len);
1057602adf40SYehuda Sadeh 			if (ret < 0)
1058602adf40SYehuda Sadeh 				goto done_ops;
1059602adf40SYehuda Sadeh 		}
1060602adf40SYehuda Sadeh 	}
1061602adf40SYehuda Sadeh 
1062602adf40SYehuda Sadeh 	ret = rbd_do_request(NULL, dev, snapc, snapid,
1063602adf40SYehuda Sadeh 			  obj, ofs, len, NULL,
1064602adf40SYehuda Sadeh 			  pages, num_pages,
1065602adf40SYehuda Sadeh 			  flags,
1066602adf40SYehuda Sadeh 			  ops,
1067602adf40SYehuda Sadeh 			  2,
10681fec7093SYehuda Sadeh 			  NULL, 0,
106959c2be1eSYehuda Sadeh 			  NULL,
107059c2be1eSYehuda Sadeh 			  linger_req, ver);
1071602adf40SYehuda Sadeh 	if (ret < 0)
1072602adf40SYehuda Sadeh 		goto done_ops;
1073602adf40SYehuda Sadeh 
1074602adf40SYehuda Sadeh 	if ((flags & CEPH_OSD_FLAG_READ) && buf)
1075602adf40SYehuda Sadeh 		ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);
1076602adf40SYehuda Sadeh 
1077602adf40SYehuda Sadeh done_ops:
1078602adf40SYehuda Sadeh 	if (!orig_ops)
1079602adf40SYehuda Sadeh 		rbd_destroy_ops(ops);
1080602adf40SYehuda Sadeh done:
1081602adf40SYehuda Sadeh 	ceph_release_page_vector(pages, num_pages);
1082602adf40SYehuda Sadeh 	return ret;
1083602adf40SYehuda Sadeh }
1084602adf40SYehuda Sadeh 
1085602adf40SYehuda Sadeh /*
1086602adf40SYehuda Sadeh  * Do an asynchronous ceph osd operation
1087602adf40SYehuda Sadeh  */
1088602adf40SYehuda Sadeh static int rbd_do_op(struct request *rq,
1089602adf40SYehuda Sadeh 		     struct rbd_device *rbd_dev ,
1090602adf40SYehuda Sadeh 		     struct ceph_snap_context *snapc,
1091602adf40SYehuda Sadeh 		     u64 snapid,
1092602adf40SYehuda Sadeh 		     int opcode, int flags, int num_reply,
1093602adf40SYehuda Sadeh 		     u64 ofs, u64 len,
10941fec7093SYehuda Sadeh 		     struct bio *bio,
10951fec7093SYehuda Sadeh 		     struct rbd_req_coll *coll,
10961fec7093SYehuda Sadeh 		     int coll_index)
1097602adf40SYehuda Sadeh {
1098602adf40SYehuda Sadeh 	char *seg_name;
1099602adf40SYehuda Sadeh 	u64 seg_ofs;
1100602adf40SYehuda Sadeh 	u64 seg_len;
1101602adf40SYehuda Sadeh 	int ret;
1102602adf40SYehuda Sadeh 	struct ceph_osd_req_op *ops;
1103602adf40SYehuda Sadeh 	u32 payload_len;
1104602adf40SYehuda Sadeh 
1105602adf40SYehuda Sadeh 	seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
1106602adf40SYehuda Sadeh 	if (!seg_name)
1107602adf40SYehuda Sadeh 		return -ENOMEM;
1108602adf40SYehuda Sadeh 
1109602adf40SYehuda Sadeh 	seg_len = rbd_get_segment(&rbd_dev->header,
1110602adf40SYehuda Sadeh 				  rbd_dev->header.block_name,
1111602adf40SYehuda Sadeh 				  ofs, len,
1112602adf40SYehuda Sadeh 				  seg_name, &seg_ofs);
1113602adf40SYehuda Sadeh 
1114602adf40SYehuda Sadeh 	payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
1115602adf40SYehuda Sadeh 
1116602adf40SYehuda Sadeh 	ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
1117602adf40SYehuda Sadeh 	if (ret < 0)
1118602adf40SYehuda Sadeh 		goto done;
1119602adf40SYehuda Sadeh 
1120602adf40SYehuda Sadeh 	/* we've taken care of segment sizes earlier when we
1121602adf40SYehuda Sadeh 	   cloned the bios. We should never have a segment
1122602adf40SYehuda Sadeh 	   truncated at this point */
1123602adf40SYehuda Sadeh 	BUG_ON(seg_len < len);
1124602adf40SYehuda Sadeh 
1125602adf40SYehuda Sadeh 	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1126602adf40SYehuda Sadeh 			     seg_name, seg_ofs, seg_len,
1127602adf40SYehuda Sadeh 			     bio,
1128602adf40SYehuda Sadeh 			     NULL, 0,
1129602adf40SYehuda Sadeh 			     flags,
1130602adf40SYehuda Sadeh 			     ops,
1131602adf40SYehuda Sadeh 			     num_reply,
11321fec7093SYehuda Sadeh 			     coll, coll_index,
113359c2be1eSYehuda Sadeh 			     rbd_req_cb, 0, NULL);
113411f77002SSage Weil 
113511f77002SSage Weil 	rbd_destroy_ops(ops);
1136602adf40SYehuda Sadeh done:
1137602adf40SYehuda Sadeh 	kfree(seg_name);
1138602adf40SYehuda Sadeh 	return ret;
1139602adf40SYehuda Sadeh }
1140602adf40SYehuda Sadeh 
1141602adf40SYehuda Sadeh /*
1142602adf40SYehuda Sadeh  * Request async osd write
1143602adf40SYehuda Sadeh  */
1144602adf40SYehuda Sadeh static int rbd_req_write(struct request *rq,
1145602adf40SYehuda Sadeh 			 struct rbd_device *rbd_dev,
1146602adf40SYehuda Sadeh 			 struct ceph_snap_context *snapc,
1147602adf40SYehuda Sadeh 			 u64 ofs, u64 len,
11481fec7093SYehuda Sadeh 			 struct bio *bio,
11491fec7093SYehuda Sadeh 			 struct rbd_req_coll *coll,
11501fec7093SYehuda Sadeh 			 int coll_index)
1151602adf40SYehuda Sadeh {
1152602adf40SYehuda Sadeh 	return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1153602adf40SYehuda Sadeh 			 CEPH_OSD_OP_WRITE,
1154602adf40SYehuda Sadeh 			 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1155602adf40SYehuda Sadeh 			 2,
11561fec7093SYehuda Sadeh 			 ofs, len, bio, coll, coll_index);
1157602adf40SYehuda Sadeh }
1158602adf40SYehuda Sadeh 
1159602adf40SYehuda Sadeh /*
1160602adf40SYehuda Sadeh  * Request async osd read
1161602adf40SYehuda Sadeh  */
1162602adf40SYehuda Sadeh static int rbd_req_read(struct request *rq,
1163602adf40SYehuda Sadeh 			 struct rbd_device *rbd_dev,
1164602adf40SYehuda Sadeh 			 u64 snapid,
1165602adf40SYehuda Sadeh 			 u64 ofs, u64 len,
11661fec7093SYehuda Sadeh 			 struct bio *bio,
11671fec7093SYehuda Sadeh 			 struct rbd_req_coll *coll,
11681fec7093SYehuda Sadeh 			 int coll_index)
1169602adf40SYehuda Sadeh {
1170602adf40SYehuda Sadeh 	return rbd_do_op(rq, rbd_dev, NULL,
1171602adf40SYehuda Sadeh 			 (snapid ? snapid : CEPH_NOSNAP),
1172602adf40SYehuda Sadeh 			 CEPH_OSD_OP_READ,
1173602adf40SYehuda Sadeh 			 CEPH_OSD_FLAG_READ,
1174602adf40SYehuda Sadeh 			 2,
11751fec7093SYehuda Sadeh 			 ofs, len, bio, coll, coll_index);
1176602adf40SYehuda Sadeh }
1177602adf40SYehuda Sadeh 
1178602adf40SYehuda Sadeh /*
1179602adf40SYehuda Sadeh  * Request sync osd read
1180602adf40SYehuda Sadeh  */
1181602adf40SYehuda Sadeh static int rbd_req_sync_read(struct rbd_device *dev,
1182602adf40SYehuda Sadeh 			  struct ceph_snap_context *snapc,
1183602adf40SYehuda Sadeh 			  u64 snapid,
1184602adf40SYehuda Sadeh 			  const char *obj,
1185602adf40SYehuda Sadeh 			  u64 ofs, u64 len,
118659c2be1eSYehuda Sadeh 			  char *buf,
118759c2be1eSYehuda Sadeh 			  u64 *ver)
1188602adf40SYehuda Sadeh {
1189602adf40SYehuda Sadeh 	return rbd_req_sync_op(dev, NULL,
1190602adf40SYehuda Sadeh 			       (snapid ? snapid : CEPH_NOSNAP),
1191602adf40SYehuda Sadeh 			       CEPH_OSD_OP_READ,
1192602adf40SYehuda Sadeh 			       CEPH_OSD_FLAG_READ,
1193602adf40SYehuda Sadeh 			       NULL,
119459c2be1eSYehuda Sadeh 			       1, obj, ofs, len, buf, NULL, ver);
1195602adf40SYehuda Sadeh }
1196602adf40SYehuda Sadeh 
1197602adf40SYehuda Sadeh /*
119859c2be1eSYehuda Sadeh  * Request sync osd watch
119959c2be1eSYehuda Sadeh  */
120059c2be1eSYehuda Sadeh static int rbd_req_sync_notify_ack(struct rbd_device *dev,
120159c2be1eSYehuda Sadeh 				   u64 ver,
120259c2be1eSYehuda Sadeh 				   u64 notify_id,
120359c2be1eSYehuda Sadeh 				   const char *obj)
120459c2be1eSYehuda Sadeh {
120559c2be1eSYehuda Sadeh 	struct ceph_osd_req_op *ops;
120659c2be1eSYehuda Sadeh 	struct page **pages = NULL;
120711f77002SSage Weil 	int ret;
120811f77002SSage Weil 
120911f77002SSage Weil 	ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY_ACK, 0);
121059c2be1eSYehuda Sadeh 	if (ret < 0)
121159c2be1eSYehuda Sadeh 		return ret;
121259c2be1eSYehuda Sadeh 
121359c2be1eSYehuda Sadeh 	ops[0].watch.ver = cpu_to_le64(dev->header.obj_version);
121459c2be1eSYehuda Sadeh 	ops[0].watch.cookie = notify_id;
121559c2be1eSYehuda Sadeh 	ops[0].watch.flag = 0;
121659c2be1eSYehuda Sadeh 
121759c2be1eSYehuda Sadeh 	ret = rbd_do_request(NULL, dev, NULL, CEPH_NOSNAP,
121859c2be1eSYehuda Sadeh 			  obj, 0, 0, NULL,
121959c2be1eSYehuda Sadeh 			  pages, 0,
122059c2be1eSYehuda Sadeh 			  CEPH_OSD_FLAG_READ,
122159c2be1eSYehuda Sadeh 			  ops,
122259c2be1eSYehuda Sadeh 			  1,
12231fec7093SYehuda Sadeh 			  NULL, 0,
122459c2be1eSYehuda Sadeh 			  rbd_simple_req_cb, 0, NULL);
122559c2be1eSYehuda Sadeh 
122659c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
122759c2be1eSYehuda Sadeh 	return ret;
122859c2be1eSYehuda Sadeh }
122959c2be1eSYehuda Sadeh 
123059c2be1eSYehuda Sadeh static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
123159c2be1eSYehuda Sadeh {
123259c2be1eSYehuda Sadeh 	struct rbd_device *dev = (struct rbd_device *)data;
123313143d2dSSage Weil 	int rc;
123413143d2dSSage Weil 
123559c2be1eSYehuda Sadeh 	if (!dev)
123659c2be1eSYehuda Sadeh 		return;
123759c2be1eSYehuda Sadeh 
123859c2be1eSYehuda Sadeh 	dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name,
123959c2be1eSYehuda Sadeh 		notify_id, (int)opcode);
124059c2be1eSYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
124113143d2dSSage Weil 	rc = __rbd_update_snaps(dev);
124259c2be1eSYehuda Sadeh 	mutex_unlock(&ctl_mutex);
124313143d2dSSage Weil 	if (rc)
1244f0f8cef5SAlex Elder 		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
1245f0f8cef5SAlex Elder 			   " update snaps: %d\n", dev->major, rc);
124659c2be1eSYehuda Sadeh 
124759c2be1eSYehuda Sadeh 	rbd_req_sync_notify_ack(dev, ver, notify_id, dev->obj_md_name);
124859c2be1eSYehuda Sadeh }
124959c2be1eSYehuda Sadeh 
125059c2be1eSYehuda Sadeh /*
125159c2be1eSYehuda Sadeh  * Request sync osd watch
125259c2be1eSYehuda Sadeh  */
125359c2be1eSYehuda Sadeh static int rbd_req_sync_watch(struct rbd_device *dev,
125459c2be1eSYehuda Sadeh 			      const char *obj,
125559c2be1eSYehuda Sadeh 			      u64 ver)
125659c2be1eSYehuda Sadeh {
125759c2be1eSYehuda Sadeh 	struct ceph_osd_req_op *ops;
12581dbb4399SAlex Elder 	struct ceph_osd_client *osdc = &dev->rbd_client->client->osdc;
125959c2be1eSYehuda Sadeh 
126059c2be1eSYehuda Sadeh 	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
126159c2be1eSYehuda Sadeh 	if (ret < 0)
126259c2be1eSYehuda Sadeh 		return ret;
126359c2be1eSYehuda Sadeh 
126459c2be1eSYehuda Sadeh 	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
126559c2be1eSYehuda Sadeh 				     (void *)dev, &dev->watch_event);
126659c2be1eSYehuda Sadeh 	if (ret < 0)
126759c2be1eSYehuda Sadeh 		goto fail;
126859c2be1eSYehuda Sadeh 
126959c2be1eSYehuda Sadeh 	ops[0].watch.ver = cpu_to_le64(ver);
127059c2be1eSYehuda Sadeh 	ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie);
127159c2be1eSYehuda Sadeh 	ops[0].watch.flag = 1;
127259c2be1eSYehuda Sadeh 
127359c2be1eSYehuda Sadeh 	ret = rbd_req_sync_op(dev, NULL,
127459c2be1eSYehuda Sadeh 			      CEPH_NOSNAP,
127559c2be1eSYehuda Sadeh 			      0,
127659c2be1eSYehuda Sadeh 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
127759c2be1eSYehuda Sadeh 			      ops,
127859c2be1eSYehuda Sadeh 			      1, obj, 0, 0, NULL,
127959c2be1eSYehuda Sadeh 			      &dev->watch_request, NULL);
128059c2be1eSYehuda Sadeh 
128159c2be1eSYehuda Sadeh 	if (ret < 0)
128259c2be1eSYehuda Sadeh 		goto fail_event;
128359c2be1eSYehuda Sadeh 
128459c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
128559c2be1eSYehuda Sadeh 	return 0;
128659c2be1eSYehuda Sadeh 
128759c2be1eSYehuda Sadeh fail_event:
128859c2be1eSYehuda Sadeh 	ceph_osdc_cancel_event(dev->watch_event);
128959c2be1eSYehuda Sadeh 	dev->watch_event = NULL;
129059c2be1eSYehuda Sadeh fail:
129159c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
129259c2be1eSYehuda Sadeh 	return ret;
129359c2be1eSYehuda Sadeh }
129459c2be1eSYehuda Sadeh 
129579e3057cSYehuda Sadeh /*
129679e3057cSYehuda Sadeh  * Request sync osd unwatch
129779e3057cSYehuda Sadeh  */
129879e3057cSYehuda Sadeh static int rbd_req_sync_unwatch(struct rbd_device *dev,
129979e3057cSYehuda Sadeh 				const char *obj)
130079e3057cSYehuda Sadeh {
130179e3057cSYehuda Sadeh 	struct ceph_osd_req_op *ops;
130279e3057cSYehuda Sadeh 
130379e3057cSYehuda Sadeh 	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
130479e3057cSYehuda Sadeh 	if (ret < 0)
130579e3057cSYehuda Sadeh 		return ret;
130679e3057cSYehuda Sadeh 
130779e3057cSYehuda Sadeh 	ops[0].watch.ver = 0;
130879e3057cSYehuda Sadeh 	ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie);
130979e3057cSYehuda Sadeh 	ops[0].watch.flag = 0;
131079e3057cSYehuda Sadeh 
131179e3057cSYehuda Sadeh 	ret = rbd_req_sync_op(dev, NULL,
131279e3057cSYehuda Sadeh 			      CEPH_NOSNAP,
131379e3057cSYehuda Sadeh 			      0,
131479e3057cSYehuda Sadeh 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
131579e3057cSYehuda Sadeh 			      ops,
131679e3057cSYehuda Sadeh 			      1, obj, 0, 0, NULL, NULL, NULL);
131779e3057cSYehuda Sadeh 
131879e3057cSYehuda Sadeh 	rbd_destroy_ops(ops);
131979e3057cSYehuda Sadeh 	ceph_osdc_cancel_event(dev->watch_event);
132079e3057cSYehuda Sadeh 	dev->watch_event = NULL;
132179e3057cSYehuda Sadeh 	return ret;
132279e3057cSYehuda Sadeh }
132379e3057cSYehuda Sadeh 
132459c2be1eSYehuda Sadeh struct rbd_notify_info {
132559c2be1eSYehuda Sadeh 	struct rbd_device *dev;
132659c2be1eSYehuda Sadeh };
132759c2be1eSYehuda Sadeh 
132859c2be1eSYehuda Sadeh static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
132959c2be1eSYehuda Sadeh {
133059c2be1eSYehuda Sadeh 	struct rbd_device *dev = (struct rbd_device *)data;
133159c2be1eSYehuda Sadeh 	if (!dev)
133259c2be1eSYehuda Sadeh 		return;
133359c2be1eSYehuda Sadeh 
133459c2be1eSYehuda Sadeh 	dout("rbd_notify_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name,
133559c2be1eSYehuda Sadeh 		notify_id, (int)opcode);
133659c2be1eSYehuda Sadeh }
133759c2be1eSYehuda Sadeh 
133859c2be1eSYehuda Sadeh /*
133959c2be1eSYehuda Sadeh  * Request sync osd notify
134059c2be1eSYehuda Sadeh  */
134159c2be1eSYehuda Sadeh static int rbd_req_sync_notify(struct rbd_device *dev,
134259c2be1eSYehuda Sadeh 		          const char *obj)
134359c2be1eSYehuda Sadeh {
134459c2be1eSYehuda Sadeh 	struct ceph_osd_req_op *ops;
13451dbb4399SAlex Elder 	struct ceph_osd_client *osdc = &dev->rbd_client->client->osdc;
134659c2be1eSYehuda Sadeh 	struct ceph_osd_event *event;
134759c2be1eSYehuda Sadeh 	struct rbd_notify_info info;
134859c2be1eSYehuda Sadeh 	int payload_len = sizeof(u32) + sizeof(u32);
134959c2be1eSYehuda Sadeh 	int ret;
135059c2be1eSYehuda Sadeh 
135159c2be1eSYehuda Sadeh 	ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY, payload_len);
135259c2be1eSYehuda Sadeh 	if (ret < 0)
135359c2be1eSYehuda Sadeh 		return ret;
135459c2be1eSYehuda Sadeh 
135559c2be1eSYehuda Sadeh 	info.dev = dev;
135659c2be1eSYehuda Sadeh 
135759c2be1eSYehuda Sadeh 	ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
135859c2be1eSYehuda Sadeh 				     (void *)&info, &event);
135959c2be1eSYehuda Sadeh 	if (ret < 0)
136059c2be1eSYehuda Sadeh 		goto fail;
136159c2be1eSYehuda Sadeh 
136259c2be1eSYehuda Sadeh 	ops[0].watch.ver = 1;
136359c2be1eSYehuda Sadeh 	ops[0].watch.flag = 1;
136459c2be1eSYehuda Sadeh 	ops[0].watch.cookie = event->cookie;
136559c2be1eSYehuda Sadeh 	ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
136659c2be1eSYehuda Sadeh 	ops[0].watch.timeout = 12;
136759c2be1eSYehuda Sadeh 
136859c2be1eSYehuda Sadeh 	ret = rbd_req_sync_op(dev, NULL,
136959c2be1eSYehuda Sadeh 			       CEPH_NOSNAP,
137059c2be1eSYehuda Sadeh 			       0,
137159c2be1eSYehuda Sadeh 			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
137259c2be1eSYehuda Sadeh 			       ops,
137359c2be1eSYehuda Sadeh 			       1, obj, 0, 0, NULL, NULL, NULL);
137459c2be1eSYehuda Sadeh 	if (ret < 0)
137559c2be1eSYehuda Sadeh 		goto fail_event;
137659c2be1eSYehuda Sadeh 
137759c2be1eSYehuda Sadeh 	ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
137859c2be1eSYehuda Sadeh 	dout("ceph_osdc_wait_event returned %d\n", ret);
137959c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
138059c2be1eSYehuda Sadeh 	return 0;
138159c2be1eSYehuda Sadeh 
138259c2be1eSYehuda Sadeh fail_event:
138359c2be1eSYehuda Sadeh 	ceph_osdc_cancel_event(event);
138459c2be1eSYehuda Sadeh fail:
138559c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
138659c2be1eSYehuda Sadeh 	return ret;
138759c2be1eSYehuda Sadeh }
138859c2be1eSYehuda Sadeh 
138959c2be1eSYehuda Sadeh /*
1390602adf40SYehuda Sadeh  * Request sync osd read
1391602adf40SYehuda Sadeh  */
1392602adf40SYehuda Sadeh static int rbd_req_sync_exec(struct rbd_device *dev,
1393602adf40SYehuda Sadeh 			     const char *obj,
1394602adf40SYehuda Sadeh 			     const char *cls,
1395602adf40SYehuda Sadeh 			     const char *method,
1396602adf40SYehuda Sadeh 			     const char *data,
139759c2be1eSYehuda Sadeh 			     int len,
139859c2be1eSYehuda Sadeh 			     u64 *ver)
1399602adf40SYehuda Sadeh {
1400602adf40SYehuda Sadeh 	struct ceph_osd_req_op *ops;
1401602adf40SYehuda Sadeh 	int cls_len = strlen(cls);
1402602adf40SYehuda Sadeh 	int method_len = strlen(method);
1403602adf40SYehuda Sadeh 	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL,
1404602adf40SYehuda Sadeh 				    cls_len + method_len + len);
1405602adf40SYehuda Sadeh 	if (ret < 0)
1406602adf40SYehuda Sadeh 		return ret;
1407602adf40SYehuda Sadeh 
1408602adf40SYehuda Sadeh 	ops[0].cls.class_name = cls;
1409602adf40SYehuda Sadeh 	ops[0].cls.class_len = (__u8)cls_len;
1410602adf40SYehuda Sadeh 	ops[0].cls.method_name = method;
1411602adf40SYehuda Sadeh 	ops[0].cls.method_len = (__u8)method_len;
1412602adf40SYehuda Sadeh 	ops[0].cls.argc = 0;
1413602adf40SYehuda Sadeh 	ops[0].cls.indata = data;
1414602adf40SYehuda Sadeh 	ops[0].cls.indata_len = len;
1415602adf40SYehuda Sadeh 
1416602adf40SYehuda Sadeh 	ret = rbd_req_sync_op(dev, NULL,
1417602adf40SYehuda Sadeh 			       CEPH_NOSNAP,
1418602adf40SYehuda Sadeh 			       0,
1419602adf40SYehuda Sadeh 			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1420602adf40SYehuda Sadeh 			       ops,
142159c2be1eSYehuda Sadeh 			       1, obj, 0, 0, NULL, NULL, ver);
1422602adf40SYehuda Sadeh 
1423602adf40SYehuda Sadeh 	rbd_destroy_ops(ops);
1424602adf40SYehuda Sadeh 
1425602adf40SYehuda Sadeh 	dout("cls_exec returned %d\n", ret);
1426602adf40SYehuda Sadeh 	return ret;
1427602adf40SYehuda Sadeh }
1428602adf40SYehuda Sadeh 
14291fec7093SYehuda Sadeh static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
14301fec7093SYehuda Sadeh {
14311fec7093SYehuda Sadeh 	struct rbd_req_coll *coll =
14321fec7093SYehuda Sadeh 			kzalloc(sizeof(struct rbd_req_coll) +
14331fec7093SYehuda Sadeh 			        sizeof(struct rbd_req_status) * num_reqs,
14341fec7093SYehuda Sadeh 				GFP_ATOMIC);
14351fec7093SYehuda Sadeh 
14361fec7093SYehuda Sadeh 	if (!coll)
14371fec7093SYehuda Sadeh 		return NULL;
14381fec7093SYehuda Sadeh 	coll->total = num_reqs;
14391fec7093SYehuda Sadeh 	kref_init(&coll->kref);
14401fec7093SYehuda Sadeh 	return coll;
14411fec7093SYehuda Sadeh }
14421fec7093SYehuda Sadeh 
1443602adf40SYehuda Sadeh /*
1444602adf40SYehuda Sadeh  * block device queue callback
1445602adf40SYehuda Sadeh  */
1446602adf40SYehuda Sadeh static void rbd_rq_fn(struct request_queue *q)
1447602adf40SYehuda Sadeh {
1448602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
1449602adf40SYehuda Sadeh 	struct request *rq;
1450602adf40SYehuda Sadeh 	struct bio_pair *bp = NULL;
1451602adf40SYehuda Sadeh 
145200f1f36fSAlex Elder 	while ((rq = blk_fetch_request(q))) {
1453602adf40SYehuda Sadeh 		struct bio *bio;
1454602adf40SYehuda Sadeh 		struct bio *rq_bio, *next_bio = NULL;
1455602adf40SYehuda Sadeh 		bool do_write;
1456602adf40SYehuda Sadeh 		int size, op_size = 0;
1457602adf40SYehuda Sadeh 		u64 ofs;
14581fec7093SYehuda Sadeh 		int num_segs, cur_seg = 0;
14591fec7093SYehuda Sadeh 		struct rbd_req_coll *coll;
1460602adf40SYehuda Sadeh 
1461602adf40SYehuda Sadeh 		/* peek at request from block layer */
1462602adf40SYehuda Sadeh 		if (!rq)
1463602adf40SYehuda Sadeh 			break;
1464602adf40SYehuda Sadeh 
1465602adf40SYehuda Sadeh 		dout("fetched request\n");
1466602adf40SYehuda Sadeh 
1467602adf40SYehuda Sadeh 		/* filter out block requests we don't understand */
1468602adf40SYehuda Sadeh 		if ((rq->cmd_type != REQ_TYPE_FS)) {
1469602adf40SYehuda Sadeh 			__blk_end_request_all(rq, 0);
147000f1f36fSAlex Elder 			continue;
1471602adf40SYehuda Sadeh 		}
1472602adf40SYehuda Sadeh 
1473602adf40SYehuda Sadeh 		/* deduce our operation (read, write) */
1474602adf40SYehuda Sadeh 		do_write = (rq_data_dir(rq) == WRITE);
1475602adf40SYehuda Sadeh 
1476602adf40SYehuda Sadeh 		size = blk_rq_bytes(rq);
1477593a9e7bSAlex Elder 		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
1478602adf40SYehuda Sadeh 		rq_bio = rq->bio;
1479602adf40SYehuda Sadeh 		if (do_write && rbd_dev->read_only) {
1480602adf40SYehuda Sadeh 			__blk_end_request_all(rq, -EROFS);
148100f1f36fSAlex Elder 			continue;
1482602adf40SYehuda Sadeh 		}
1483602adf40SYehuda Sadeh 
1484602adf40SYehuda Sadeh 		spin_unlock_irq(q->queue_lock);
1485602adf40SYehuda Sadeh 
1486602adf40SYehuda Sadeh 		dout("%s 0x%x bytes at 0x%llx\n",
1487602adf40SYehuda Sadeh 		     do_write ? "write" : "read",
1488593a9e7bSAlex Elder 		     size, blk_rq_pos(rq) * SECTOR_SIZE);
1489602adf40SYehuda Sadeh 
14901fec7093SYehuda Sadeh 		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
14911fec7093SYehuda Sadeh 		coll = rbd_alloc_coll(num_segs);
14921fec7093SYehuda Sadeh 		if (!coll) {
14931fec7093SYehuda Sadeh 			spin_lock_irq(q->queue_lock);
14941fec7093SYehuda Sadeh 			__blk_end_request_all(rq, -ENOMEM);
149500f1f36fSAlex Elder 			continue;
14961fec7093SYehuda Sadeh 		}
14971fec7093SYehuda Sadeh 
1498602adf40SYehuda Sadeh 		do {
1499602adf40SYehuda Sadeh 			/* a bio clone to be passed down to OSD req */
1500602adf40SYehuda Sadeh 			dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt);
1501602adf40SYehuda Sadeh 			op_size = rbd_get_segment(&rbd_dev->header,
1502602adf40SYehuda Sadeh 						  rbd_dev->header.block_name,
1503602adf40SYehuda Sadeh 						  ofs, size,
1504602adf40SYehuda Sadeh 						  NULL, NULL);
15051fec7093SYehuda Sadeh 			kref_get(&coll->kref);
1506602adf40SYehuda Sadeh 			bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
1507602adf40SYehuda Sadeh 					      op_size, GFP_ATOMIC);
1508602adf40SYehuda Sadeh 			if (!bio) {
15091fec7093SYehuda Sadeh 				rbd_coll_end_req_index(rq, coll, cur_seg,
15101fec7093SYehuda Sadeh 						       -ENOMEM, op_size);
15111fec7093SYehuda Sadeh 				goto next_seg;
1512602adf40SYehuda Sadeh 			}
1513602adf40SYehuda Sadeh 
15141fec7093SYehuda Sadeh 
1515602adf40SYehuda Sadeh 			/* init OSD command: write or read */
1516602adf40SYehuda Sadeh 			if (do_write)
1517602adf40SYehuda Sadeh 				rbd_req_write(rq, rbd_dev,
1518602adf40SYehuda Sadeh 					      rbd_dev->header.snapc,
1519602adf40SYehuda Sadeh 					      ofs,
15201fec7093SYehuda Sadeh 					      op_size, bio,
15211fec7093SYehuda Sadeh 					      coll, cur_seg);
1522602adf40SYehuda Sadeh 			else
1523602adf40SYehuda Sadeh 				rbd_req_read(rq, rbd_dev,
1524602adf40SYehuda Sadeh 					     cur_snap_id(rbd_dev),
1525602adf40SYehuda Sadeh 					     ofs,
15261fec7093SYehuda Sadeh 					     op_size, bio,
15271fec7093SYehuda Sadeh 					     coll, cur_seg);
1528602adf40SYehuda Sadeh 
15291fec7093SYehuda Sadeh next_seg:
1530602adf40SYehuda Sadeh 			size -= op_size;
1531602adf40SYehuda Sadeh 			ofs += op_size;
1532602adf40SYehuda Sadeh 
15331fec7093SYehuda Sadeh 			cur_seg++;
1534602adf40SYehuda Sadeh 			rq_bio = next_bio;
1535602adf40SYehuda Sadeh 		} while (size > 0);
15361fec7093SYehuda Sadeh 		kref_put(&coll->kref, rbd_coll_release);
1537602adf40SYehuda Sadeh 
1538602adf40SYehuda Sadeh 		if (bp)
1539602adf40SYehuda Sadeh 			bio_pair_release(bp);
1540602adf40SYehuda Sadeh 		spin_lock_irq(q->queue_lock);
1541602adf40SYehuda Sadeh 	}
1542602adf40SYehuda Sadeh }
1543602adf40SYehuda Sadeh 
1544602adf40SYehuda Sadeh /*
1545602adf40SYehuda Sadeh  * a queue callback. Makes sure that we don't create a bio that spans across
1546602adf40SYehuda Sadeh  * multiple osd objects. One exception would be with a single page bios,
1547602adf40SYehuda Sadeh  * which we handle later at bio_chain_clone
1548602adf40SYehuda Sadeh  */
1549602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1550602adf40SYehuda Sadeh 			  struct bio_vec *bvec)
1551602adf40SYehuda Sadeh {
1552602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
1553593a9e7bSAlex Elder 	unsigned int chunk_sectors;
1554593a9e7bSAlex Elder 	sector_t sector;
1555593a9e7bSAlex Elder 	unsigned int bio_sectors;
1556602adf40SYehuda Sadeh 	int max;
1557602adf40SYehuda Sadeh 
1558593a9e7bSAlex Elder 	chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1559593a9e7bSAlex Elder 	sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
1560593a9e7bSAlex Elder 	bio_sectors = bmd->bi_size >> SECTOR_SHIFT;
1561593a9e7bSAlex Elder 
1562602adf40SYehuda Sadeh 	max =  (chunk_sectors - ((sector & (chunk_sectors - 1))
1563593a9e7bSAlex Elder 				 + bio_sectors)) << SECTOR_SHIFT;
1564602adf40SYehuda Sadeh 	if (max < 0)
1565602adf40SYehuda Sadeh 		max = 0; /* bio_add cannot handle a negative return */
1566602adf40SYehuda Sadeh 	if (max <= bvec->bv_len && bio_sectors == 0)
1567602adf40SYehuda Sadeh 		return bvec->bv_len;
1568602adf40SYehuda Sadeh 	return max;
1569602adf40SYehuda Sadeh }
1570602adf40SYehuda Sadeh 
1571602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
1572602adf40SYehuda Sadeh {
1573602adf40SYehuda Sadeh 	struct gendisk *disk = rbd_dev->disk;
1574602adf40SYehuda Sadeh 
1575602adf40SYehuda Sadeh 	if (!disk)
1576602adf40SYehuda Sadeh 		return;
1577602adf40SYehuda Sadeh 
1578602adf40SYehuda Sadeh 	rbd_header_free(&rbd_dev->header);
1579602adf40SYehuda Sadeh 
1580602adf40SYehuda Sadeh 	if (disk->flags & GENHD_FL_UP)
1581602adf40SYehuda Sadeh 		del_gendisk(disk);
1582602adf40SYehuda Sadeh 	if (disk->queue)
1583602adf40SYehuda Sadeh 		blk_cleanup_queue(disk->queue);
1584602adf40SYehuda Sadeh 	put_disk(disk);
1585602adf40SYehuda Sadeh }
1586602adf40SYehuda Sadeh 
1587602adf40SYehuda Sadeh /*
1588602adf40SYehuda Sadeh  * reload the ondisk the header
1589602adf40SYehuda Sadeh  */
1590602adf40SYehuda Sadeh static int rbd_read_header(struct rbd_device *rbd_dev,
1591602adf40SYehuda Sadeh 			   struct rbd_image_header *header)
1592602adf40SYehuda Sadeh {
1593602adf40SYehuda Sadeh 	ssize_t rc;
1594602adf40SYehuda Sadeh 	struct rbd_image_header_ondisk *dh;
1595602adf40SYehuda Sadeh 	int snap_count = 0;
159659c2be1eSYehuda Sadeh 	u64 ver;
159700f1f36fSAlex Elder 	size_t len;
1598602adf40SYehuda Sadeh 
159900f1f36fSAlex Elder 	/*
160000f1f36fSAlex Elder 	 * First reads the fixed-size header to determine the number
160100f1f36fSAlex Elder 	 * of snapshots, then re-reads it, along with all snapshot
160200f1f36fSAlex Elder 	 * records as well as their stored names.
160300f1f36fSAlex Elder 	 */
160400f1f36fSAlex Elder 	len = sizeof (*dh);
1605602adf40SYehuda Sadeh 	while (1) {
1606602adf40SYehuda Sadeh 		dh = kmalloc(len, GFP_KERNEL);
1607602adf40SYehuda Sadeh 		if (!dh)
1608602adf40SYehuda Sadeh 			return -ENOMEM;
1609602adf40SYehuda Sadeh 
1610602adf40SYehuda Sadeh 		rc = rbd_req_sync_read(rbd_dev,
1611602adf40SYehuda Sadeh 				       NULL, CEPH_NOSNAP,
1612602adf40SYehuda Sadeh 				       rbd_dev->obj_md_name,
1613602adf40SYehuda Sadeh 				       0, len,
161459c2be1eSYehuda Sadeh 				       (char *)dh, &ver);
1615602adf40SYehuda Sadeh 		if (rc < 0)
1616602adf40SYehuda Sadeh 			goto out_dh;
1617602adf40SYehuda Sadeh 
1618602adf40SYehuda Sadeh 		rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL);
161981e759fbSJosh Durgin 		if (rc < 0) {
162000f1f36fSAlex Elder 			if (rc == -ENXIO)
162181e759fbSJosh Durgin 				pr_warning("unrecognized header format"
162281e759fbSJosh Durgin 					   " for image %s", rbd_dev->obj);
1623602adf40SYehuda Sadeh 			goto out_dh;
162481e759fbSJosh Durgin 		}
1625602adf40SYehuda Sadeh 
162600f1f36fSAlex Elder 		if (snap_count == header->total_snaps)
162700f1f36fSAlex Elder 			break;
162800f1f36fSAlex Elder 
1629602adf40SYehuda Sadeh 		snap_count = header->total_snaps;
163000f1f36fSAlex Elder 		len = sizeof (*dh) +
163100f1f36fSAlex Elder 			snap_count * sizeof(struct rbd_image_snap_ondisk) +
163200f1f36fSAlex Elder 			header->snap_names_len;
163300f1f36fSAlex Elder 
1634602adf40SYehuda Sadeh 		rbd_header_free(header);
1635602adf40SYehuda Sadeh 		kfree(dh);
1636602adf40SYehuda Sadeh 	}
163759c2be1eSYehuda Sadeh 	header->obj_version = ver;
1638602adf40SYehuda Sadeh 
1639602adf40SYehuda Sadeh out_dh:
1640602adf40SYehuda Sadeh 	kfree(dh);
1641602adf40SYehuda Sadeh 	return rc;
1642602adf40SYehuda Sadeh }
1643602adf40SYehuda Sadeh 
1644602adf40SYehuda Sadeh /*
1645602adf40SYehuda Sadeh  * create a snapshot
1646602adf40SYehuda Sadeh  */
1647602adf40SYehuda Sadeh static int rbd_header_add_snap(struct rbd_device *dev,
1648602adf40SYehuda Sadeh 			       const char *snap_name,
1649602adf40SYehuda Sadeh 			       gfp_t gfp_flags)
1650602adf40SYehuda Sadeh {
1651602adf40SYehuda Sadeh 	int name_len = strlen(snap_name);
1652602adf40SYehuda Sadeh 	u64 new_snapid;
1653602adf40SYehuda Sadeh 	int ret;
1654916d4d67SSage Weil 	void *data, *p, *e;
165559c2be1eSYehuda Sadeh 	u64 ver;
16561dbb4399SAlex Elder 	struct ceph_mon_client *monc;
1657602adf40SYehuda Sadeh 
1658602adf40SYehuda Sadeh 	/* we should create a snapshot only if we're pointing at the head */
1659602adf40SYehuda Sadeh 	if (dev->cur_snap)
1660602adf40SYehuda Sadeh 		return -EINVAL;
1661602adf40SYehuda Sadeh 
16621dbb4399SAlex Elder 	monc = &dev->rbd_client->client->monc;
16631dbb4399SAlex Elder 	ret = ceph_monc_create_snapid(monc, dev->poolid, &new_snapid);
1664602adf40SYehuda Sadeh 	dout("created snapid=%lld\n", new_snapid);
1665602adf40SYehuda Sadeh 	if (ret < 0)
1666602adf40SYehuda Sadeh 		return ret;
1667602adf40SYehuda Sadeh 
1668602adf40SYehuda Sadeh 	data = kmalloc(name_len + 16, gfp_flags);
1669602adf40SYehuda Sadeh 	if (!data)
1670602adf40SYehuda Sadeh 		return -ENOMEM;
1671602adf40SYehuda Sadeh 
1672916d4d67SSage Weil 	p = data;
1673916d4d67SSage Weil 	e = data + name_len + 16;
1674602adf40SYehuda Sadeh 
1675916d4d67SSage Weil 	ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
1676916d4d67SSage Weil 	ceph_encode_64_safe(&p, e, new_snapid, bad);
1677602adf40SYehuda Sadeh 
1678602adf40SYehuda Sadeh 	ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add",
1679916d4d67SSage Weil 				data, p - data, &ver);
1680602adf40SYehuda Sadeh 
1681916d4d67SSage Weil 	kfree(data);
1682602adf40SYehuda Sadeh 
1683602adf40SYehuda Sadeh 	if (ret < 0)
1684602adf40SYehuda Sadeh 		return ret;
1685602adf40SYehuda Sadeh 
1686602adf40SYehuda Sadeh 	dev->header.snapc->seq =  new_snapid;
1687602adf40SYehuda Sadeh 
1688602adf40SYehuda Sadeh 	return 0;
1689602adf40SYehuda Sadeh bad:
1690602adf40SYehuda Sadeh 	return -ERANGE;
1691602adf40SYehuda Sadeh }
1692602adf40SYehuda Sadeh 
1693dfc5606dSYehuda Sadeh static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1694dfc5606dSYehuda Sadeh {
1695dfc5606dSYehuda Sadeh 	struct rbd_snap *snap;
1696dfc5606dSYehuda Sadeh 
1697dfc5606dSYehuda Sadeh 	while (!list_empty(&rbd_dev->snaps)) {
1698dfc5606dSYehuda Sadeh 		snap = list_first_entry(&rbd_dev->snaps, struct rbd_snap, node);
1699dfc5606dSYehuda Sadeh 		__rbd_remove_snap_dev(rbd_dev, snap);
1700dfc5606dSYehuda Sadeh 	}
1701dfc5606dSYehuda Sadeh }
1702dfc5606dSYehuda Sadeh 
1703602adf40SYehuda Sadeh /*
1704602adf40SYehuda Sadeh  * only read the first part of the ondisk header, without the snaps info
1705602adf40SYehuda Sadeh  */
1706dfc5606dSYehuda Sadeh static int __rbd_update_snaps(struct rbd_device *rbd_dev)
1707602adf40SYehuda Sadeh {
1708602adf40SYehuda Sadeh 	int ret;
1709602adf40SYehuda Sadeh 	struct rbd_image_header h;
1710602adf40SYehuda Sadeh 	u64 snap_seq;
171159c2be1eSYehuda Sadeh 	int follow_seq = 0;
1712602adf40SYehuda Sadeh 
1713602adf40SYehuda Sadeh 	ret = rbd_read_header(rbd_dev, &h);
1714602adf40SYehuda Sadeh 	if (ret < 0)
1715602adf40SYehuda Sadeh 		return ret;
1716602adf40SYehuda Sadeh 
17179db4b3e3SSage Weil 	/* resized? */
1718593a9e7bSAlex Elder 	set_capacity(rbd_dev->disk, h.image_size / SECTOR_SIZE);
17199db4b3e3SSage Weil 
1720c666601aSJosh Durgin 	down_write(&rbd_dev->header_rwsem);
1721602adf40SYehuda Sadeh 
1722602adf40SYehuda Sadeh 	snap_seq = rbd_dev->header.snapc->seq;
172359c2be1eSYehuda Sadeh 	if (rbd_dev->header.total_snaps &&
172459c2be1eSYehuda Sadeh 	    rbd_dev->header.snapc->snaps[0] == snap_seq)
172559c2be1eSYehuda Sadeh 		/* pointing at the head, will need to follow that
172659c2be1eSYehuda Sadeh 		   if head moves */
172759c2be1eSYehuda Sadeh 		follow_seq = 1;
1728602adf40SYehuda Sadeh 
1729602adf40SYehuda Sadeh 	kfree(rbd_dev->header.snapc);
1730602adf40SYehuda Sadeh 	kfree(rbd_dev->header.snap_names);
1731602adf40SYehuda Sadeh 	kfree(rbd_dev->header.snap_sizes);
1732602adf40SYehuda Sadeh 
1733602adf40SYehuda Sadeh 	rbd_dev->header.total_snaps = h.total_snaps;
1734602adf40SYehuda Sadeh 	rbd_dev->header.snapc = h.snapc;
1735602adf40SYehuda Sadeh 	rbd_dev->header.snap_names = h.snap_names;
1736dfc5606dSYehuda Sadeh 	rbd_dev->header.snap_names_len = h.snap_names_len;
1737602adf40SYehuda Sadeh 	rbd_dev->header.snap_sizes = h.snap_sizes;
173859c2be1eSYehuda Sadeh 	if (follow_seq)
173959c2be1eSYehuda Sadeh 		rbd_dev->header.snapc->seq = rbd_dev->header.snapc->snaps[0];
174059c2be1eSYehuda Sadeh 	else
1741602adf40SYehuda Sadeh 		rbd_dev->header.snapc->seq = snap_seq;
1742602adf40SYehuda Sadeh 
1743dfc5606dSYehuda Sadeh 	ret = __rbd_init_snaps_header(rbd_dev);
1744dfc5606dSYehuda Sadeh 
1745c666601aSJosh Durgin 	up_write(&rbd_dev->header_rwsem);
1746602adf40SYehuda Sadeh 
1747dfc5606dSYehuda Sadeh 	return ret;
1748602adf40SYehuda Sadeh }
1749602adf40SYehuda Sadeh 
1750602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
1751602adf40SYehuda Sadeh {
1752602adf40SYehuda Sadeh 	struct gendisk *disk;
1753602adf40SYehuda Sadeh 	struct request_queue *q;
1754602adf40SYehuda Sadeh 	int rc;
1755593a9e7bSAlex Elder 	u64 segment_size;
1756602adf40SYehuda Sadeh 	u64 total_size = 0;
1757602adf40SYehuda Sadeh 
1758602adf40SYehuda Sadeh 	/* contact OSD, request size info about the object being mapped */
1759602adf40SYehuda Sadeh 	rc = rbd_read_header(rbd_dev, &rbd_dev->header);
1760602adf40SYehuda Sadeh 	if (rc)
1761602adf40SYehuda Sadeh 		return rc;
1762602adf40SYehuda Sadeh 
1763dfc5606dSYehuda Sadeh 	/* no need to lock here, as rbd_dev is not registered yet */
1764dfc5606dSYehuda Sadeh 	rc = __rbd_init_snaps_header(rbd_dev);
1765dfc5606dSYehuda Sadeh 	if (rc)
1766dfc5606dSYehuda Sadeh 		return rc;
1767dfc5606dSYehuda Sadeh 
1768cc9d734cSJosh Durgin 	rc = rbd_header_set_snap(rbd_dev, &total_size);
1769602adf40SYehuda Sadeh 	if (rc)
1770602adf40SYehuda Sadeh 		return rc;
1771602adf40SYehuda Sadeh 
1772602adf40SYehuda Sadeh 	/* create gendisk info */
1773602adf40SYehuda Sadeh 	rc = -ENOMEM;
1774602adf40SYehuda Sadeh 	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1775602adf40SYehuda Sadeh 	if (!disk)
1776602adf40SYehuda Sadeh 		goto out;
1777602adf40SYehuda Sadeh 
1778f0f8cef5SAlex Elder 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
1779aedfec59SSage Weil 		 rbd_dev->id);
1780602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
1781602adf40SYehuda Sadeh 	disk->first_minor = 0;
1782602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
1783602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
1784602adf40SYehuda Sadeh 
1785602adf40SYehuda Sadeh 	/* init rq */
1786602adf40SYehuda Sadeh 	rc = -ENOMEM;
1787602adf40SYehuda Sadeh 	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1788602adf40SYehuda Sadeh 	if (!q)
1789602adf40SYehuda Sadeh 		goto out_disk;
1790029bcbd8SJosh Durgin 
1791593a9e7bSAlex Elder 	/* We use the default size, but let's be explicit about it. */
1792593a9e7bSAlex Elder 	blk_queue_physical_block_size(q, SECTOR_SIZE);
1793593a9e7bSAlex Elder 
1794029bcbd8SJosh Durgin 	/* set io sizes to object size */
1795593a9e7bSAlex Elder 	segment_size = rbd_obj_bytes(&rbd_dev->header);
1796593a9e7bSAlex Elder 	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
1797593a9e7bSAlex Elder 	blk_queue_max_segment_size(q, segment_size);
1798593a9e7bSAlex Elder 	blk_queue_io_min(q, segment_size);
1799593a9e7bSAlex Elder 	blk_queue_io_opt(q, segment_size);
1800029bcbd8SJosh Durgin 
1801602adf40SYehuda Sadeh 	blk_queue_merge_bvec(q, rbd_merge_bvec);
1802602adf40SYehuda Sadeh 	disk->queue = q;
1803602adf40SYehuda Sadeh 
1804602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
1805602adf40SYehuda Sadeh 
1806602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
1807602adf40SYehuda Sadeh 	rbd_dev->q = q;
1808602adf40SYehuda Sadeh 
1809602adf40SYehuda Sadeh 	/* finally, announce the disk to the world */
1810593a9e7bSAlex Elder 	set_capacity(disk, total_size / SECTOR_SIZE);
1811602adf40SYehuda Sadeh 	add_disk(disk);
1812602adf40SYehuda Sadeh 
1813602adf40SYehuda Sadeh 	pr_info("%s: added with size 0x%llx\n",
1814602adf40SYehuda Sadeh 		disk->disk_name, (unsigned long long)total_size);
1815602adf40SYehuda Sadeh 	return 0;
1816602adf40SYehuda Sadeh 
1817602adf40SYehuda Sadeh out_disk:
1818602adf40SYehuda Sadeh 	put_disk(disk);
1819602adf40SYehuda Sadeh out:
1820602adf40SYehuda Sadeh 	return rc;
1821602adf40SYehuda Sadeh }
1822602adf40SYehuda Sadeh 
1823dfc5606dSYehuda Sadeh /*
1824dfc5606dSYehuda Sadeh   sysfs
1825dfc5606dSYehuda Sadeh */
1826602adf40SYehuda Sadeh 
1827593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
1828593a9e7bSAlex Elder {
1829593a9e7bSAlex Elder 	return container_of(dev, struct rbd_device, dev);
1830593a9e7bSAlex Elder }
1831593a9e7bSAlex Elder 
1832dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
1833dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
1834602adf40SYehuda Sadeh {
1835593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1836dfc5606dSYehuda Sadeh 
1837dfc5606dSYehuda Sadeh 	return sprintf(buf, "%llu\n", (unsigned long long)rbd_dev->header.image_size);
1838602adf40SYehuda Sadeh }
1839602adf40SYehuda Sadeh 
1840dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
1841dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
1842602adf40SYehuda Sadeh {
1843593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1844dfc5606dSYehuda Sadeh 
1845dfc5606dSYehuda Sadeh 	return sprintf(buf, "%d\n", rbd_dev->major);
1846dfc5606dSYehuda Sadeh }
1847dfc5606dSYehuda Sadeh 
1848dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
1849dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
1850dfc5606dSYehuda Sadeh {
1851593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1852dfc5606dSYehuda Sadeh 
18531dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
18541dbb4399SAlex Elder 			ceph_client_id(rbd_dev->rbd_client->client));
1855dfc5606dSYehuda Sadeh }
1856dfc5606dSYehuda Sadeh 
1857dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
1858dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
1859dfc5606dSYehuda Sadeh {
1860593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1861dfc5606dSYehuda Sadeh 
1862dfc5606dSYehuda Sadeh 	return sprintf(buf, "%s\n", rbd_dev->pool_name);
1863dfc5606dSYehuda Sadeh }
1864dfc5606dSYehuda Sadeh 
1865dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
1866dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
1867dfc5606dSYehuda Sadeh {
1868593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1869dfc5606dSYehuda Sadeh 
1870dfc5606dSYehuda Sadeh 	return sprintf(buf, "%s\n", rbd_dev->obj);
1871dfc5606dSYehuda Sadeh }
1872dfc5606dSYehuda Sadeh 
1873dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
1874dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
1875dfc5606dSYehuda Sadeh 			     char *buf)
1876dfc5606dSYehuda Sadeh {
1877593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1878dfc5606dSYehuda Sadeh 
1879dfc5606dSYehuda Sadeh 	return sprintf(buf, "%s\n", rbd_dev->snap_name);
1880dfc5606dSYehuda Sadeh }
1881dfc5606dSYehuda Sadeh 
1882dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
1883dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
1884dfc5606dSYehuda Sadeh 				 const char *buf,
1885dfc5606dSYehuda Sadeh 				 size_t size)
1886dfc5606dSYehuda Sadeh {
1887593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1888dfc5606dSYehuda Sadeh 	int rc;
1889dfc5606dSYehuda Sadeh 	int ret = size;
1890602adf40SYehuda Sadeh 
1891602adf40SYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1892602adf40SYehuda Sadeh 
1893dfc5606dSYehuda Sadeh 	rc = __rbd_update_snaps(rbd_dev);
1894dfc5606dSYehuda Sadeh 	if (rc < 0)
1895dfc5606dSYehuda Sadeh 		ret = rc;
1896602adf40SYehuda Sadeh 
1897dfc5606dSYehuda Sadeh 	mutex_unlock(&ctl_mutex);
1898dfc5606dSYehuda Sadeh 	return ret;
1899dfc5606dSYehuda Sadeh }
1900602adf40SYehuda Sadeh 
1901dfc5606dSYehuda Sadeh static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
1902dfc5606dSYehuda Sadeh static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
1903dfc5606dSYehuda Sadeh static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
1904dfc5606dSYehuda Sadeh static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
1905dfc5606dSYehuda Sadeh static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
1906dfc5606dSYehuda Sadeh static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
1907dfc5606dSYehuda Sadeh static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
1908dfc5606dSYehuda Sadeh static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);
1909dfc5606dSYehuda Sadeh 
1910dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = {
1911dfc5606dSYehuda Sadeh 	&dev_attr_size.attr,
1912dfc5606dSYehuda Sadeh 	&dev_attr_major.attr,
1913dfc5606dSYehuda Sadeh 	&dev_attr_client_id.attr,
1914dfc5606dSYehuda Sadeh 	&dev_attr_pool.attr,
1915dfc5606dSYehuda Sadeh 	&dev_attr_name.attr,
1916dfc5606dSYehuda Sadeh 	&dev_attr_current_snap.attr,
1917dfc5606dSYehuda Sadeh 	&dev_attr_refresh.attr,
1918dfc5606dSYehuda Sadeh 	&dev_attr_create_snap.attr,
1919dfc5606dSYehuda Sadeh 	NULL
1920dfc5606dSYehuda Sadeh };
1921dfc5606dSYehuda Sadeh 
1922dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = {
1923dfc5606dSYehuda Sadeh 	.attrs = rbd_attrs,
1924dfc5606dSYehuda Sadeh };
1925dfc5606dSYehuda Sadeh 
1926dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = {
1927dfc5606dSYehuda Sadeh 	&rbd_attr_group,
1928dfc5606dSYehuda Sadeh 	NULL
1929dfc5606dSYehuda Sadeh };
1930dfc5606dSYehuda Sadeh 
/* Release callback for the rbd device type; intentionally does nothing. */
static void rbd_sysfs_dev_release(struct device *dev)
{
	/* nothing to do here */
}
1934dfc5606dSYehuda Sadeh 
1935dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = {
1936dfc5606dSYehuda Sadeh 	.name		= "rbd",
1937dfc5606dSYehuda Sadeh 	.groups		= rbd_attr_groups,
1938dfc5606dSYehuda Sadeh 	.release	= rbd_sysfs_dev_release,
1939dfc5606dSYehuda Sadeh };
1940dfc5606dSYehuda Sadeh 
1941dfc5606dSYehuda Sadeh 
1942dfc5606dSYehuda Sadeh /*
1943dfc5606dSYehuda Sadeh   sysfs - snapshots
1944dfc5606dSYehuda Sadeh */
1945dfc5606dSYehuda Sadeh 
1946dfc5606dSYehuda Sadeh static ssize_t rbd_snap_size_show(struct device *dev,
1947dfc5606dSYehuda Sadeh 				  struct device_attribute *attr,
1948dfc5606dSYehuda Sadeh 				  char *buf)
1949dfc5606dSYehuda Sadeh {
1950dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1951dfc5606dSYehuda Sadeh 
1952593a9e7bSAlex Elder 	return sprintf(buf, "%zd\n", snap->size);
1953dfc5606dSYehuda Sadeh }
1954dfc5606dSYehuda Sadeh 
1955dfc5606dSYehuda Sadeh static ssize_t rbd_snap_id_show(struct device *dev,
1956dfc5606dSYehuda Sadeh 				struct device_attribute *attr,
1957dfc5606dSYehuda Sadeh 				char *buf)
1958dfc5606dSYehuda Sadeh {
1959dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1960dfc5606dSYehuda Sadeh 
1961593a9e7bSAlex Elder 	return sprintf(buf, "%llu\n", (unsigned long long) snap->id);
1962dfc5606dSYehuda Sadeh }
1963dfc5606dSYehuda Sadeh 
1964dfc5606dSYehuda Sadeh static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
1965dfc5606dSYehuda Sadeh static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
1966dfc5606dSYehuda Sadeh 
1967dfc5606dSYehuda Sadeh static struct attribute *rbd_snap_attrs[] = {
1968dfc5606dSYehuda Sadeh 	&dev_attr_snap_size.attr,
1969dfc5606dSYehuda Sadeh 	&dev_attr_snap_id.attr,
1970dfc5606dSYehuda Sadeh 	NULL,
1971dfc5606dSYehuda Sadeh };
1972dfc5606dSYehuda Sadeh 
1973dfc5606dSYehuda Sadeh static struct attribute_group rbd_snap_attr_group = {
1974dfc5606dSYehuda Sadeh 	.attrs = rbd_snap_attrs,
1975dfc5606dSYehuda Sadeh };
1976dfc5606dSYehuda Sadeh 
1977dfc5606dSYehuda Sadeh static void rbd_snap_dev_release(struct device *dev)
1978dfc5606dSYehuda Sadeh {
1979dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1980dfc5606dSYehuda Sadeh 	kfree(snap->name);
1981dfc5606dSYehuda Sadeh 	kfree(snap);
1982dfc5606dSYehuda Sadeh }
1983dfc5606dSYehuda Sadeh 
1984dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_snap_attr_groups[] = {
1985dfc5606dSYehuda Sadeh 	&rbd_snap_attr_group,
1986dfc5606dSYehuda Sadeh 	NULL
1987dfc5606dSYehuda Sadeh };
1988dfc5606dSYehuda Sadeh 
1989dfc5606dSYehuda Sadeh static struct device_type rbd_snap_device_type = {
1990dfc5606dSYehuda Sadeh 	.groups		= rbd_snap_attr_groups,
1991dfc5606dSYehuda Sadeh 	.release	= rbd_snap_dev_release,
1992dfc5606dSYehuda Sadeh };
1993dfc5606dSYehuda Sadeh 
1994dfc5606dSYehuda Sadeh static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
1995dfc5606dSYehuda Sadeh 				  struct rbd_snap *snap)
1996dfc5606dSYehuda Sadeh {
1997dfc5606dSYehuda Sadeh 	list_del(&snap->node);
1998dfc5606dSYehuda Sadeh 	device_unregister(&snap->dev);
1999dfc5606dSYehuda Sadeh }
2000dfc5606dSYehuda Sadeh 
2001dfc5606dSYehuda Sadeh static int rbd_register_snap_dev(struct rbd_device *rbd_dev,
2002dfc5606dSYehuda Sadeh 				  struct rbd_snap *snap,
2003dfc5606dSYehuda Sadeh 				  struct device *parent)
2004dfc5606dSYehuda Sadeh {
2005dfc5606dSYehuda Sadeh 	struct device *dev = &snap->dev;
2006dfc5606dSYehuda Sadeh 	int ret;
2007dfc5606dSYehuda Sadeh 
2008dfc5606dSYehuda Sadeh 	dev->type = &rbd_snap_device_type;
2009dfc5606dSYehuda Sadeh 	dev->parent = parent;
2010dfc5606dSYehuda Sadeh 	dev->release = rbd_snap_dev_release;
2011dfc5606dSYehuda Sadeh 	dev_set_name(dev, "snap_%s", snap->name);
2012dfc5606dSYehuda Sadeh 	ret = device_register(dev);
2013dfc5606dSYehuda Sadeh 
2014dfc5606dSYehuda Sadeh 	return ret;
2015dfc5606dSYehuda Sadeh }
2016dfc5606dSYehuda Sadeh 
2017dfc5606dSYehuda Sadeh static int __rbd_add_snap_dev(struct rbd_device *rbd_dev,
2018dfc5606dSYehuda Sadeh 			      int i, const char *name,
2019dfc5606dSYehuda Sadeh 			      struct rbd_snap **snapp)
2020dfc5606dSYehuda Sadeh {
2021dfc5606dSYehuda Sadeh 	int ret;
2022dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = kzalloc(sizeof(*snap), GFP_KERNEL);
2023dfc5606dSYehuda Sadeh 	if (!snap)
2024dfc5606dSYehuda Sadeh 		return -ENOMEM;
2025dfc5606dSYehuda Sadeh 	snap->name = kstrdup(name, GFP_KERNEL);
2026dfc5606dSYehuda Sadeh 	snap->size = rbd_dev->header.snap_sizes[i];
2027dfc5606dSYehuda Sadeh 	snap->id = rbd_dev->header.snapc->snaps[i];
2028dfc5606dSYehuda Sadeh 	if (device_is_registered(&rbd_dev->dev)) {
2029dfc5606dSYehuda Sadeh 		ret = rbd_register_snap_dev(rbd_dev, snap,
2030dfc5606dSYehuda Sadeh 					     &rbd_dev->dev);
2031dfc5606dSYehuda Sadeh 		if (ret < 0)
2032dfc5606dSYehuda Sadeh 			goto err;
2033dfc5606dSYehuda Sadeh 	}
2034dfc5606dSYehuda Sadeh 	*snapp = snap;
2035dfc5606dSYehuda Sadeh 	return 0;
2036dfc5606dSYehuda Sadeh err:
2037dfc5606dSYehuda Sadeh 	kfree(snap->name);
2038dfc5606dSYehuda Sadeh 	kfree(snap);
2039dfc5606dSYehuda Sadeh 	return ret;
2040dfc5606dSYehuda Sadeh }
2041dfc5606dSYehuda Sadeh 
/*
 * Step backward by one entry in a list of NUL-terminated strings
 * stored back to back.
 *
 * @name:  start of the current entry (or one past the final NUL)
 * @start: start of the first entry in the list
 *
 * Returns the start of the entry preceding @name, or NULL if @name
 * is less than two bytes past @start (no previous entry).
 */
const char *rbd_prev_snap_name(const char *name, const char *start)
{
	const char *cur;

	if (name < start + 2)
		return NULL;

	/* name[-1] is the previous entry's terminator; scan from name[-2] */
	for (cur = name - 2; *cur != '\0'; cur--)
		if (cur == start)
			return start;

	/* cur is on the terminator of the entry before the previous one */
	return cur + 1;
}
2058dfc5606dSYehuda Sadeh 
2059dfc5606dSYehuda Sadeh /*
2060dfc5606dSYehuda Sadeh  * compare the old list of snapshots that we have to what's in the header
2061dfc5606dSYehuda Sadeh  * and update it accordingly. Note that the header holds the snapshots
2062dfc5606dSYehuda Sadeh  * in a reverse order (from newest to oldest) and we need to go from
2063dfc5606dSYehuda Sadeh  * older to new so that we don't get a duplicate snap name when
2064dfc5606dSYehuda Sadeh  * doing the process (e.g., removed snapshot and recreated a new
2065dfc5606dSYehuda Sadeh  * one with the same name.
2066dfc5606dSYehuda Sadeh  */
2067dfc5606dSYehuda Sadeh static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
2068dfc5606dSYehuda Sadeh {
2069dfc5606dSYehuda Sadeh 	const char *name, *first_name;
2070dfc5606dSYehuda Sadeh 	int i = rbd_dev->header.total_snaps;
2071dfc5606dSYehuda Sadeh 	struct rbd_snap *snap, *old_snap = NULL;
2072dfc5606dSYehuda Sadeh 	int ret;
2073dfc5606dSYehuda Sadeh 	struct list_head *p, *n;
2074dfc5606dSYehuda Sadeh 
2075dfc5606dSYehuda Sadeh 	first_name = rbd_dev->header.snap_names;
2076dfc5606dSYehuda Sadeh 	name = first_name + rbd_dev->header.snap_names_len;
2077dfc5606dSYehuda Sadeh 
2078dfc5606dSYehuda Sadeh 	list_for_each_prev_safe(p, n, &rbd_dev->snaps) {
2079dfc5606dSYehuda Sadeh 		u64 cur_id;
2080dfc5606dSYehuda Sadeh 
2081dfc5606dSYehuda Sadeh 		old_snap = list_entry(p, struct rbd_snap, node);
2082dfc5606dSYehuda Sadeh 
2083dfc5606dSYehuda Sadeh 		if (i)
2084dfc5606dSYehuda Sadeh 			cur_id = rbd_dev->header.snapc->snaps[i - 1];
2085dfc5606dSYehuda Sadeh 
2086dfc5606dSYehuda Sadeh 		if (!i || old_snap->id < cur_id) {
2087dfc5606dSYehuda Sadeh 			/* old_snap->id was skipped, thus was removed */
2088dfc5606dSYehuda Sadeh 			__rbd_remove_snap_dev(rbd_dev, old_snap);
2089dfc5606dSYehuda Sadeh 			continue;
2090dfc5606dSYehuda Sadeh 		}
2091dfc5606dSYehuda Sadeh 		if (old_snap->id == cur_id) {
2092dfc5606dSYehuda Sadeh 			/* we have this snapshot already */
2093dfc5606dSYehuda Sadeh 			i--;
2094dfc5606dSYehuda Sadeh 			name = rbd_prev_snap_name(name, first_name);
2095dfc5606dSYehuda Sadeh 			continue;
2096dfc5606dSYehuda Sadeh 		}
2097dfc5606dSYehuda Sadeh 		for (; i > 0;
2098dfc5606dSYehuda Sadeh 		     i--, name = rbd_prev_snap_name(name, first_name)) {
2099dfc5606dSYehuda Sadeh 			if (!name) {
2100dfc5606dSYehuda Sadeh 				WARN_ON(1);
2101dfc5606dSYehuda Sadeh 				return -EINVAL;
2102dfc5606dSYehuda Sadeh 			}
2103dfc5606dSYehuda Sadeh 			cur_id = rbd_dev->header.snapc->snaps[i];
2104dfc5606dSYehuda Sadeh 			/* snapshot removal? handle it above */
2105dfc5606dSYehuda Sadeh 			if (cur_id >= old_snap->id)
2106dfc5606dSYehuda Sadeh 				break;
2107dfc5606dSYehuda Sadeh 			/* a new snapshot */
2108dfc5606dSYehuda Sadeh 			ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
2109dfc5606dSYehuda Sadeh 			if (ret < 0)
2110dfc5606dSYehuda Sadeh 				return ret;
2111dfc5606dSYehuda Sadeh 
2112dfc5606dSYehuda Sadeh 			/* note that we add it backward so using n and not p */
2113dfc5606dSYehuda Sadeh 			list_add(&snap->node, n);
2114dfc5606dSYehuda Sadeh 			p = &snap->node;
2115dfc5606dSYehuda Sadeh 		}
2116dfc5606dSYehuda Sadeh 	}
2117dfc5606dSYehuda Sadeh 	/* we're done going over the old snap list, just add what's left */
2118dfc5606dSYehuda Sadeh 	for (; i > 0; i--) {
2119dfc5606dSYehuda Sadeh 		name = rbd_prev_snap_name(name, first_name);
2120dfc5606dSYehuda Sadeh 		if (!name) {
2121dfc5606dSYehuda Sadeh 			WARN_ON(1);
2122dfc5606dSYehuda Sadeh 			return -EINVAL;
2123dfc5606dSYehuda Sadeh 		}
2124dfc5606dSYehuda Sadeh 		ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
2125dfc5606dSYehuda Sadeh 		if (ret < 0)
2126dfc5606dSYehuda Sadeh 			return ret;
2127dfc5606dSYehuda Sadeh 		list_add(&snap->node, &rbd_dev->snaps);
2128dfc5606dSYehuda Sadeh 	}
2129dfc5606dSYehuda Sadeh 
2130dfc5606dSYehuda Sadeh 	return 0;
2131dfc5606dSYehuda Sadeh }
2132dfc5606dSYehuda Sadeh 
2133dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2134dfc5606dSYehuda Sadeh {
2135f0f8cef5SAlex Elder 	int ret;
2136dfc5606dSYehuda Sadeh 	struct device *dev;
2137dfc5606dSYehuda Sadeh 	struct rbd_snap *snap;
2138dfc5606dSYehuda Sadeh 
2139dfc5606dSYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2140dfc5606dSYehuda Sadeh 	dev = &rbd_dev->dev;
2141dfc5606dSYehuda Sadeh 
2142dfc5606dSYehuda Sadeh 	dev->bus = &rbd_bus_type;
2143dfc5606dSYehuda Sadeh 	dev->type = &rbd_device_type;
2144dfc5606dSYehuda Sadeh 	dev->parent = &rbd_root_dev;
2145dfc5606dSYehuda Sadeh 	dev->release = rbd_dev_release;
2146dfc5606dSYehuda Sadeh 	dev_set_name(dev, "%d", rbd_dev->id);
2147dfc5606dSYehuda Sadeh 	ret = device_register(dev);
2148dfc5606dSYehuda Sadeh 	if (ret < 0)
2149f0f8cef5SAlex Elder 		goto out;
2150dfc5606dSYehuda Sadeh 
2151dfc5606dSYehuda Sadeh 	list_for_each_entry(snap, &rbd_dev->snaps, node) {
2152dfc5606dSYehuda Sadeh 		ret = rbd_register_snap_dev(rbd_dev, snap,
2153dfc5606dSYehuda Sadeh 					     &rbd_dev->dev);
2154dfc5606dSYehuda Sadeh 		if (ret < 0)
2155602adf40SYehuda Sadeh 			break;
2156602adf40SYehuda Sadeh 	}
2157f0f8cef5SAlex Elder out:
2158dfc5606dSYehuda Sadeh 	mutex_unlock(&ctl_mutex);
2159dfc5606dSYehuda Sadeh 	return ret;
2160602adf40SYehuda Sadeh }
2161602adf40SYehuda Sadeh 
2162dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
2163dfc5606dSYehuda Sadeh {
2164dfc5606dSYehuda Sadeh 	device_unregister(&rbd_dev->dev);
2165dfc5606dSYehuda Sadeh }
2166dfc5606dSYehuda Sadeh 
216759c2be1eSYehuda Sadeh static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
216859c2be1eSYehuda Sadeh {
216959c2be1eSYehuda Sadeh 	int ret, rc;
217059c2be1eSYehuda Sadeh 
217159c2be1eSYehuda Sadeh 	do {
217259c2be1eSYehuda Sadeh 		ret = rbd_req_sync_watch(rbd_dev, rbd_dev->obj_md_name,
217359c2be1eSYehuda Sadeh 					 rbd_dev->header.obj_version);
217459c2be1eSYehuda Sadeh 		if (ret == -ERANGE) {
217559c2be1eSYehuda Sadeh 			mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
217659c2be1eSYehuda Sadeh 			rc = __rbd_update_snaps(rbd_dev);
217759c2be1eSYehuda Sadeh 			mutex_unlock(&ctl_mutex);
217859c2be1eSYehuda Sadeh 			if (rc < 0)
217959c2be1eSYehuda Sadeh 				return rc;
218059c2be1eSYehuda Sadeh 		}
218159c2be1eSYehuda Sadeh 	} while (ret == -ERANGE);
218259c2be1eSYehuda Sadeh 
218359c2be1eSYehuda Sadeh 	return ret;
218459c2be1eSYehuda Sadeh }
218559c2be1eSYehuda Sadeh 
21861ddbe94eSAlex Elder static atomic64_t rbd_id_max = ATOMIC64_INIT(0);
21871ddbe94eSAlex Elder 
21881ddbe94eSAlex Elder /*
2189499afd5bSAlex Elder  * Get a unique rbd identifier for the given new rbd_dev, and add
2190499afd5bSAlex Elder  * the rbd_dev to the global list.  The minimum rbd id is 1.
21911ddbe94eSAlex Elder  */
2192499afd5bSAlex Elder static void rbd_id_get(struct rbd_device *rbd_dev)
2193b7f23c36SAlex Elder {
2194499afd5bSAlex Elder 	rbd_dev->id = atomic64_inc_return(&rbd_id_max);
2195499afd5bSAlex Elder 
2196499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
2197499afd5bSAlex Elder 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
2198499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
2199b7f23c36SAlex Elder }
2200b7f23c36SAlex Elder 
22011ddbe94eSAlex Elder /*
2202499afd5bSAlex Elder  * Remove an rbd_dev from the global list, and record that its
2203499afd5bSAlex Elder  * identifier is no longer in use.
22041ddbe94eSAlex Elder  */
2205499afd5bSAlex Elder static void rbd_id_put(struct rbd_device *rbd_dev)
22061ddbe94eSAlex Elder {
2207d184f6bfSAlex Elder 	struct list_head *tmp;
2208d184f6bfSAlex Elder 	int rbd_id = rbd_dev->id;
2209d184f6bfSAlex Elder 	int max_id;
2210d184f6bfSAlex Elder 
2211d184f6bfSAlex Elder 	BUG_ON(rbd_id < 1);
2212499afd5bSAlex Elder 
2213499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
2214499afd5bSAlex Elder 	list_del_init(&rbd_dev->node);
2215d184f6bfSAlex Elder 
2216d184f6bfSAlex Elder 	/*
2217d184f6bfSAlex Elder 	 * If the id being "put" is not the current maximum, there
2218d184f6bfSAlex Elder 	 * is nothing special we need to do.
2219d184f6bfSAlex Elder 	 */
2220d184f6bfSAlex Elder 	if (rbd_id != atomic64_read(&rbd_id_max)) {
2221d184f6bfSAlex Elder 		spin_unlock(&rbd_dev_list_lock);
2222d184f6bfSAlex Elder 		return;
2223d184f6bfSAlex Elder 	}
2224d184f6bfSAlex Elder 
2225d184f6bfSAlex Elder 	/*
2226d184f6bfSAlex Elder 	 * We need to update the current maximum id.  Search the
2227d184f6bfSAlex Elder 	 * list to find out what it is.  We're more likely to find
2228d184f6bfSAlex Elder 	 * the maximum at the end, so search the list backward.
2229d184f6bfSAlex Elder 	 */
2230d184f6bfSAlex Elder 	max_id = 0;
2231d184f6bfSAlex Elder 	list_for_each_prev(tmp, &rbd_dev_list) {
2232d184f6bfSAlex Elder 		struct rbd_device *rbd_dev;
2233d184f6bfSAlex Elder 
2234d184f6bfSAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
2235d184f6bfSAlex Elder 		if (rbd_id > max_id)
2236d184f6bfSAlex Elder 			max_id = rbd_id;
2237d184f6bfSAlex Elder 	}
2238499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
22391ddbe94eSAlex Elder 
22401ddbe94eSAlex Elder 	/*
2241d184f6bfSAlex Elder 	 * The max id could have been updated by rbd_id_get(), in
2242d184f6bfSAlex Elder 	 * which case it now accurately reflects the new maximum.
2243d184f6bfSAlex Elder 	 * Be careful not to overwrite the maximum value in that
2244d184f6bfSAlex Elder 	 * case.
22451ddbe94eSAlex Elder 	 */
2246d184f6bfSAlex Elder 	atomic64_cmpxchg(&rbd_id_max, rbd_id, max_id);
2247b7f23c36SAlex Elder }
2248b7f23c36SAlex Elder 
/*
 * Advance *buf past any leading white space so that it points at the
 * first non-space character (or at the terminating '\0' if there is
 * none), and return the length of the run of non-space characters
 * that starts there.  *buf must be '\0'-terminated.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the "C"
	 * and "POSIX" locales.
	 */
	static const char whitespace[] = " \f\n\r\t\v";
	const char *token = *buf;

	token += strspn(token, whitespace);	/* skip to start of token */
	*buf = token;

	return strcspn(token, whitespace);	/* length of the token */
}
2267e28fff26SAlex Elder 
/*
 * Locate the next token in *buf and, provided it fits, copy it into
 * the caller's token buffer with a terminating '\0'.  *buf must be
 * '\0'-terminated on entry.
 *
 * Returns the length of the token (excluding the '\0').  A return of
 * 0 means no token was found; a return >= token_size means the token
 * did not fit and nothing was copied.
 *
 * In every case *buf is advanced to just past the end of the token
 * that was found.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t token_len = next_token(buf);
	const char *src = *buf;

	*buf = src + token_len;

	if (token_len >= token_size)
		return token_len;	/* too big; leave token untouched */

	memcpy(token, src, token_len);
	token[token_len] = '\0';

	return token_len;
}
2297e28fff26SAlex Elder 
2298e28fff26SAlex Elder /*
2299a725f65eSAlex Elder  * This fills in the pool_name, obj, obj_len, snap_name, obj_len,
2300a725f65eSAlex Elder  * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based
2301a725f65eSAlex Elder  * on the list of monitor addresses and other options provided via
2302a725f65eSAlex Elder  * /sys/bus/rbd/add.
2303a725f65eSAlex Elder  */
2304a725f65eSAlex Elder static int rbd_add_parse_args(struct rbd_device *rbd_dev,
2305a725f65eSAlex Elder 			      const char *buf,
23067ef3214aSAlex Elder 			      const char **mon_addrs,
23075214ecc4SAlex Elder 			      size_t *mon_addrs_size,
2308e28fff26SAlex Elder 			      char *options,
2309e28fff26SAlex Elder 			      size_t options_size)
2310a725f65eSAlex Elder {
2311e28fff26SAlex Elder 	size_t	len;
2312e28fff26SAlex Elder 
2313e28fff26SAlex Elder 	/* The first four tokens are required */
2314e28fff26SAlex Elder 
23157ef3214aSAlex Elder 	len = next_token(&buf);
23167ef3214aSAlex Elder 	if (!len)
2317a725f65eSAlex Elder 		return -EINVAL;
23185214ecc4SAlex Elder 	*mon_addrs_size = len + 1;
23197ef3214aSAlex Elder 	*mon_addrs = buf;
23207ef3214aSAlex Elder 
23217ef3214aSAlex Elder 	buf += len;
2322a725f65eSAlex Elder 
2323e28fff26SAlex Elder 	len = copy_token(&buf, options, options_size);
2324e28fff26SAlex Elder 	if (!len || len >= options_size)
2325e28fff26SAlex Elder 		return -EINVAL;
2326a725f65eSAlex Elder 
2327e28fff26SAlex Elder 	len = copy_token(&buf, rbd_dev->pool_name, sizeof (rbd_dev->pool_name));
2328e28fff26SAlex Elder 	if (!len || len >= sizeof (rbd_dev->pool_name))
2329e28fff26SAlex Elder 		return -EINVAL;
2330e28fff26SAlex Elder 
2331e28fff26SAlex Elder 	len = copy_token(&buf, rbd_dev->obj, sizeof (rbd_dev->obj));
2332e28fff26SAlex Elder 	if (!len || len >= sizeof (rbd_dev->obj))
2333e28fff26SAlex Elder 		return -EINVAL;
2334e28fff26SAlex Elder 
2335e28fff26SAlex Elder 	/* We have the object length in hand, save it. */
2336e28fff26SAlex Elder 
2337e28fff26SAlex Elder 	rbd_dev->obj_len = len;
2338e28fff26SAlex Elder 
233981a89793SAlex Elder 	BUILD_BUG_ON(RBD_MAX_MD_NAME_LEN
234081a89793SAlex Elder 				< RBD_MAX_OBJ_NAME_LEN + sizeof (RBD_SUFFIX));
234181a89793SAlex Elder 	sprintf(rbd_dev->obj_md_name, "%s%s", rbd_dev->obj, RBD_SUFFIX);
2342a725f65eSAlex Elder 
2343e28fff26SAlex Elder 	/*
2344e28fff26SAlex Elder 	 * The snapshot name is optional, but it's an error if it's
2345e28fff26SAlex Elder 	 * too long.  If no snapshot is supplied, fill in the default.
2346e28fff26SAlex Elder 	 */
2347e28fff26SAlex Elder 	len = copy_token(&buf, rbd_dev->snap_name, sizeof (rbd_dev->snap_name));
2348e28fff26SAlex Elder 	if (!len)
2349e28fff26SAlex Elder 		memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
2350e28fff26SAlex Elder 			sizeof (RBD_SNAP_HEAD_NAME));
2351e28fff26SAlex Elder 	else if (len >= sizeof (rbd_dev->snap_name))
2352e28fff26SAlex Elder 		return -EINVAL;
2353e28fff26SAlex Elder 
2354a725f65eSAlex Elder 	return 0;
2355a725f65eSAlex Elder }
2356a725f65eSAlex Elder 
235759c2be1eSYehuda Sadeh static ssize_t rbd_add(struct bus_type *bus,
235859c2be1eSYehuda Sadeh 		       const char *buf,
235959c2be1eSYehuda Sadeh 		       size_t count)
2360602adf40SYehuda Sadeh {
2361602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev;
23627ef3214aSAlex Elder 	const char *mon_addrs = NULL;
23637ef3214aSAlex Elder 	size_t mon_addrs_size = 0;
236427cc2594SAlex Elder 	char *options = NULL;
236527cc2594SAlex Elder 	struct ceph_osd_client *osdc;
236627cc2594SAlex Elder 	int rc = -ENOMEM;
2367602adf40SYehuda Sadeh 
2368602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
2369602adf40SYehuda Sadeh 		return -ENODEV;
2370602adf40SYehuda Sadeh 
2371602adf40SYehuda Sadeh 	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2372602adf40SYehuda Sadeh 	if (!rbd_dev)
237327cc2594SAlex Elder 		goto err_nomem;
237427cc2594SAlex Elder 	options = kmalloc(count, GFP_KERNEL);
237527cc2594SAlex Elder 	if (!options)
237627cc2594SAlex Elder 		goto err_nomem;
2377602adf40SYehuda Sadeh 
2378602adf40SYehuda Sadeh 	/* static rbd_device initialization */
2379602adf40SYehuda Sadeh 	spin_lock_init(&rbd_dev->lock);
2380602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbd_dev->node);
2381dfc5606dSYehuda Sadeh 	INIT_LIST_HEAD(&rbd_dev->snaps);
2382c666601aSJosh Durgin 	init_rwsem(&rbd_dev->header_rwsem);
2383602adf40SYehuda Sadeh 
2384c666601aSJosh Durgin 	init_rwsem(&rbd_dev->header_rwsem);
23850e805a1dSAlex Elder 
2386d184f6bfSAlex Elder 	/* generate unique id: find highest unique id, add one */
2387499afd5bSAlex Elder 	rbd_id_get(rbd_dev);
2388602adf40SYehuda Sadeh 
2389a725f65eSAlex Elder 	/* Fill in the device name, now that we have its id. */
239081a89793SAlex Elder 	BUILD_BUG_ON(DEV_NAME_LEN
239181a89793SAlex Elder 			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
239281a89793SAlex Elder 	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->id);
2393e124a82fSAlex Elder 
2394a725f65eSAlex Elder 	/* parse add command */
23957ef3214aSAlex Elder 	rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size,
2396e28fff26SAlex Elder 				options, count);
2397a725f65eSAlex Elder 	if (rc)
2398a725f65eSAlex Elder 		goto err_put_id;
2399a725f65eSAlex Elder 
24005214ecc4SAlex Elder 	rbd_dev->rbd_client = rbd_get_client(mon_addrs, mon_addrs_size - 1,
24015214ecc4SAlex Elder 						options);
2402d720bcb0SAlex Elder 	if (IS_ERR(rbd_dev->rbd_client)) {
2403d720bcb0SAlex Elder 		rc = PTR_ERR(rbd_dev->rbd_client);
2404f0f8cef5SAlex Elder 		goto err_put_id;
2405d720bcb0SAlex Elder 	}
2406602adf40SYehuda Sadeh 
2407602adf40SYehuda Sadeh 	/* pick the pool */
24081dbb4399SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
2409602adf40SYehuda Sadeh 	rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
2410602adf40SYehuda Sadeh 	if (rc < 0)
2411602adf40SYehuda Sadeh 		goto err_out_client;
2412602adf40SYehuda Sadeh 	rbd_dev->poolid = rc;
2413602adf40SYehuda Sadeh 
2414602adf40SYehuda Sadeh 	/* register our block device */
241527cc2594SAlex Elder 	rc = register_blkdev(0, rbd_dev->name);
241627cc2594SAlex Elder 	if (rc < 0)
2417602adf40SYehuda Sadeh 		goto err_out_client;
241827cc2594SAlex Elder 	rbd_dev->major = rc;
2419602adf40SYehuda Sadeh 
2420dfc5606dSYehuda Sadeh 	rc = rbd_bus_add_dev(rbd_dev);
2421dfc5606dSYehuda Sadeh 	if (rc)
2422766fc439SYehuda Sadeh 		goto err_out_blkdev;
2423766fc439SYehuda Sadeh 
242432eec68dSAlex Elder 	/*
242532eec68dSAlex Elder 	 * At this point cleanup in the event of an error is the job
242632eec68dSAlex Elder 	 * of the sysfs code (initiated by rbd_bus_del_dev()).
242732eec68dSAlex Elder 	 *
242832eec68dSAlex Elder 	 * Set up and announce blkdev mapping.
242932eec68dSAlex Elder 	 */
2430602adf40SYehuda Sadeh 	rc = rbd_init_disk(rbd_dev);
2431602adf40SYehuda Sadeh 	if (rc)
2432766fc439SYehuda Sadeh 		goto err_out_bus;
2433602adf40SYehuda Sadeh 
243459c2be1eSYehuda Sadeh 	rc = rbd_init_watch_dev(rbd_dev);
243559c2be1eSYehuda Sadeh 	if (rc)
243659c2be1eSYehuda Sadeh 		goto err_out_bus;
243759c2be1eSYehuda Sadeh 
2438602adf40SYehuda Sadeh 	return count;
2439602adf40SYehuda Sadeh 
2440766fc439SYehuda Sadeh err_out_bus:
2441766fc439SYehuda Sadeh 	/* this will also clean up rest of rbd_dev stuff */
2442766fc439SYehuda Sadeh 
2443766fc439SYehuda Sadeh 	rbd_bus_del_dev(rbd_dev);
2444766fc439SYehuda Sadeh 	kfree(options);
2445766fc439SYehuda Sadeh 	return rc;
2446766fc439SYehuda Sadeh 
2447602adf40SYehuda Sadeh err_out_blkdev:
2448602adf40SYehuda Sadeh 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
2449602adf40SYehuda Sadeh err_out_client:
2450602adf40SYehuda Sadeh 	rbd_put_client(rbd_dev);
2451f0f8cef5SAlex Elder err_put_id:
2452499afd5bSAlex Elder 	rbd_id_put(rbd_dev);
245327cc2594SAlex Elder err_nomem:
2454602adf40SYehuda Sadeh 	kfree(options);
245527cc2594SAlex Elder 	kfree(rbd_dev);
245627cc2594SAlex Elder 
2457602adf40SYehuda Sadeh 	dout("Error adding device %s\n", buf);
2458602adf40SYehuda Sadeh 	module_put(THIS_MODULE);
245927cc2594SAlex Elder 
246027cc2594SAlex Elder 	return (ssize_t) rc;
2461602adf40SYehuda Sadeh }
2462602adf40SYehuda Sadeh 
2463602adf40SYehuda Sadeh static struct rbd_device *__rbd_get_dev(unsigned long id)
2464602adf40SYehuda Sadeh {
2465602adf40SYehuda Sadeh 	struct list_head *tmp;
2466602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev;
2467602adf40SYehuda Sadeh 
2468e124a82fSAlex Elder 	spin_lock(&rbd_dev_list_lock);
2469602adf40SYehuda Sadeh 	list_for_each(tmp, &rbd_dev_list) {
2470602adf40SYehuda Sadeh 		rbd_dev = list_entry(tmp, struct rbd_device, node);
2471e124a82fSAlex Elder 		if (rbd_dev->id == id) {
2472e124a82fSAlex Elder 			spin_unlock(&rbd_dev_list_lock);
2473602adf40SYehuda Sadeh 			return rbd_dev;
2474602adf40SYehuda Sadeh 		}
2475e124a82fSAlex Elder 	}
2476e124a82fSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
2477602adf40SYehuda Sadeh 	return NULL;
2478602adf40SYehuda Sadeh }
2479602adf40SYehuda Sadeh 
2480dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev)
2481602adf40SYehuda Sadeh {
2482593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2483602adf40SYehuda Sadeh 
24841dbb4399SAlex Elder 	if (rbd_dev->watch_request) {
24851dbb4399SAlex Elder 		struct ceph_client *client = rbd_dev->rbd_client->client;
24861dbb4399SAlex Elder 
24871dbb4399SAlex Elder 		ceph_osdc_unregister_linger_request(&client->osdc,
248859c2be1eSYehuda Sadeh 						    rbd_dev->watch_request);
24891dbb4399SAlex Elder 	}
249059c2be1eSYehuda Sadeh 	if (rbd_dev->watch_event)
249179e3057cSYehuda Sadeh 		rbd_req_sync_unwatch(rbd_dev, rbd_dev->obj_md_name);
249259c2be1eSYehuda Sadeh 
2493602adf40SYehuda Sadeh 	rbd_put_client(rbd_dev);
2494602adf40SYehuda Sadeh 
2495602adf40SYehuda Sadeh 	/* clean up and free blkdev */
2496602adf40SYehuda Sadeh 	rbd_free_disk(rbd_dev);
2497602adf40SYehuda Sadeh 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
249832eec68dSAlex Elder 
249932eec68dSAlex Elder 	/* done with the id, and with the rbd_dev */
250032eec68dSAlex Elder 	rbd_id_put(rbd_dev);
2501602adf40SYehuda Sadeh 	kfree(rbd_dev);
2502602adf40SYehuda Sadeh 
2503602adf40SYehuda Sadeh 	/* release module ref */
2504602adf40SYehuda Sadeh 	module_put(THIS_MODULE);
2505602adf40SYehuda Sadeh }
2506602adf40SYehuda Sadeh 
2507dfc5606dSYehuda Sadeh static ssize_t rbd_remove(struct bus_type *bus,
2508602adf40SYehuda Sadeh 			  const char *buf,
2509602adf40SYehuda Sadeh 			  size_t count)
2510602adf40SYehuda Sadeh {
2511602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
2512602adf40SYehuda Sadeh 	int target_id, rc;
2513602adf40SYehuda Sadeh 	unsigned long ul;
2514602adf40SYehuda Sadeh 	int ret = count;
2515602adf40SYehuda Sadeh 
2516602adf40SYehuda Sadeh 	rc = strict_strtoul(buf, 10, &ul);
2517602adf40SYehuda Sadeh 	if (rc)
2518602adf40SYehuda Sadeh 		return rc;
2519602adf40SYehuda Sadeh 
2520602adf40SYehuda Sadeh 	/* convert to int; abort if we lost anything in the conversion */
2521602adf40SYehuda Sadeh 	target_id = (int) ul;
2522602adf40SYehuda Sadeh 	if (target_id != ul)
2523602adf40SYehuda Sadeh 		return -EINVAL;
2524602adf40SYehuda Sadeh 
2525602adf40SYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2526602adf40SYehuda Sadeh 
2527602adf40SYehuda Sadeh 	rbd_dev = __rbd_get_dev(target_id);
2528602adf40SYehuda Sadeh 	if (!rbd_dev) {
2529602adf40SYehuda Sadeh 		ret = -ENOENT;
2530602adf40SYehuda Sadeh 		goto done;
2531602adf40SYehuda Sadeh 	}
2532602adf40SYehuda Sadeh 
2533dfc5606dSYehuda Sadeh 	__rbd_remove_all_snaps(rbd_dev);
2534dfc5606dSYehuda Sadeh 	rbd_bus_del_dev(rbd_dev);
2535602adf40SYehuda Sadeh 
2536602adf40SYehuda Sadeh done:
2537602adf40SYehuda Sadeh 	mutex_unlock(&ctl_mutex);
2538602adf40SYehuda Sadeh 	return ret;
2539602adf40SYehuda Sadeh }
2540602adf40SYehuda Sadeh 
2541dfc5606dSYehuda Sadeh static ssize_t rbd_snap_add(struct device *dev,
2542dfc5606dSYehuda Sadeh 			    struct device_attribute *attr,
2543602adf40SYehuda Sadeh 			    const char *buf,
2544602adf40SYehuda Sadeh 			    size_t count)
2545602adf40SYehuda Sadeh {
2546593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2547dfc5606dSYehuda Sadeh 	int ret;
2548dfc5606dSYehuda Sadeh 	char *name = kmalloc(count + 1, GFP_KERNEL);
2549602adf40SYehuda Sadeh 	if (!name)
2550602adf40SYehuda Sadeh 		return -ENOMEM;
2551602adf40SYehuda Sadeh 
2552dfc5606dSYehuda Sadeh 	snprintf(name, count, "%s", buf);
2553602adf40SYehuda Sadeh 
2554602adf40SYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2555602adf40SYehuda Sadeh 
2556602adf40SYehuda Sadeh 	ret = rbd_header_add_snap(rbd_dev,
2557602adf40SYehuda Sadeh 				  name, GFP_KERNEL);
2558602adf40SYehuda Sadeh 	if (ret < 0)
255959c2be1eSYehuda Sadeh 		goto err_unlock;
2560602adf40SYehuda Sadeh 
2561dfc5606dSYehuda Sadeh 	ret = __rbd_update_snaps(rbd_dev);
2562602adf40SYehuda Sadeh 	if (ret < 0)
256359c2be1eSYehuda Sadeh 		goto err_unlock;
256459c2be1eSYehuda Sadeh 
256559c2be1eSYehuda Sadeh 	/* shouldn't hold ctl_mutex when notifying.. notify might
256659c2be1eSYehuda Sadeh 	   trigger a watch callback that would need to get that mutex */
256759c2be1eSYehuda Sadeh 	mutex_unlock(&ctl_mutex);
256859c2be1eSYehuda Sadeh 
256959c2be1eSYehuda Sadeh 	/* make a best effort, don't error if failed */
257059c2be1eSYehuda Sadeh 	rbd_req_sync_notify(rbd_dev, rbd_dev->obj_md_name);
2571602adf40SYehuda Sadeh 
2572602adf40SYehuda Sadeh 	ret = count;
257359c2be1eSYehuda Sadeh 	kfree(name);
257459c2be1eSYehuda Sadeh 	return ret;
257559c2be1eSYehuda Sadeh 
257659c2be1eSYehuda Sadeh err_unlock:
2577602adf40SYehuda Sadeh 	mutex_unlock(&ctl_mutex);
2578602adf40SYehuda Sadeh 	kfree(name);
2579602adf40SYehuda Sadeh 	return ret;
2580602adf40SYehuda Sadeh }
2581602adf40SYehuda Sadeh 
2582602adf40SYehuda Sadeh /*
2583602adf40SYehuda Sadeh  * create control files in sysfs
2584dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
2585602adf40SYehuda Sadeh  */
2586602adf40SYehuda Sadeh static int rbd_sysfs_init(void)
2587602adf40SYehuda Sadeh {
2588dfc5606dSYehuda Sadeh 	int ret;
2589602adf40SYehuda Sadeh 
2590fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
2591dfc5606dSYehuda Sadeh 	if (ret < 0)
2592dfc5606dSYehuda Sadeh 		return ret;
2593602adf40SYehuda Sadeh 
2594fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
2595fed4c143SAlex Elder 	if (ret < 0)
2596fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
2597602adf40SYehuda Sadeh 
2598602adf40SYehuda Sadeh 	return ret;
2599602adf40SYehuda Sadeh }
2600602adf40SYehuda Sadeh 
2601602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void)
2602602adf40SYehuda Sadeh {
2603dfc5606dSYehuda Sadeh 	bus_unregister(&rbd_bus_type);
2604fed4c143SAlex Elder 	device_unregister(&rbd_root_dev);
2605602adf40SYehuda Sadeh }
2606602adf40SYehuda Sadeh 
2607602adf40SYehuda Sadeh int __init rbd_init(void)
2608602adf40SYehuda Sadeh {
2609602adf40SYehuda Sadeh 	int rc;
2610602adf40SYehuda Sadeh 
2611602adf40SYehuda Sadeh 	rc = rbd_sysfs_init();
2612602adf40SYehuda Sadeh 	if (rc)
2613602adf40SYehuda Sadeh 		return rc;
2614f0f8cef5SAlex Elder 	pr_info("loaded " RBD_DRV_NAME_LONG "\n");
2615602adf40SYehuda Sadeh 	return 0;
2616602adf40SYehuda Sadeh }
2617602adf40SYehuda Sadeh 
2618602adf40SYehuda Sadeh void __exit rbd_exit(void)
2619602adf40SYehuda Sadeh {
2620602adf40SYehuda Sadeh 	rbd_sysfs_cleanup();
2621602adf40SYehuda Sadeh }
2622602adf40SYehuda Sadeh 
2623602adf40SYehuda Sadeh module_init(rbd_init);
2624602adf40SYehuda Sadeh module_exit(rbd_exit);
2625602adf40SYehuda Sadeh 
2626602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
2627602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
2628602adf40SYehuda Sadeh MODULE_DESCRIPTION("rados block device");
2629602adf40SYehuda Sadeh 
2630602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */
2631602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
2632602adf40SYehuda Sadeh 
2633602adf40SYehuda Sadeh MODULE_LICENSE("GPL");
2634