xref: /openbmc/linux/drivers/block/rbd.c (revision ee57741c)
1602adf40SYehuda Sadeh /*
2602adf40SYehuda Sadeh    rbd.c -- Export ceph rados objects as a Linux block device
3602adf40SYehuda Sadeh 
4602adf40SYehuda Sadeh 
5602adf40SYehuda Sadeh    based on drivers/block/osdblk.c:
6602adf40SYehuda Sadeh 
7602adf40SYehuda Sadeh    Copyright 2009 Red Hat, Inc.
8602adf40SYehuda Sadeh 
9602adf40SYehuda Sadeh    This program is free software; you can redistribute it and/or modify
10602adf40SYehuda Sadeh    it under the terms of the GNU General Public License as published by
11602adf40SYehuda Sadeh    the Free Software Foundation.
12602adf40SYehuda Sadeh 
13602adf40SYehuda Sadeh    This program is distributed in the hope that it will be useful,
14602adf40SYehuda Sadeh    but WITHOUT ANY WARRANTY; without even the implied warranty of
15602adf40SYehuda Sadeh    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16602adf40SYehuda Sadeh    GNU General Public License for more details.
17602adf40SYehuda Sadeh 
18602adf40SYehuda Sadeh    You should have received a copy of the GNU General Public License
19602adf40SYehuda Sadeh    along with this program; see the file COPYING.  If not, write to
20602adf40SYehuda Sadeh    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21602adf40SYehuda Sadeh 
22602adf40SYehuda Sadeh 
23602adf40SYehuda Sadeh 
24dfc5606dSYehuda Sadeh    For usage instructions, please refer to:
25602adf40SYehuda Sadeh 
26dfc5606dSYehuda Sadeh                  Documentation/ABI/testing/sysfs-bus-rbd
27602adf40SYehuda Sadeh 
28602adf40SYehuda Sadeh  */
29602adf40SYehuda Sadeh 
30602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
31602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
32602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3459c2be1eSYehuda Sadeh #include <linux/parser.h>
35602adf40SYehuda Sadeh 
36602adf40SYehuda Sadeh #include <linux/kernel.h>
37602adf40SYehuda Sadeh #include <linux/device.h>
38602adf40SYehuda Sadeh #include <linux/module.h>
39602adf40SYehuda Sadeh #include <linux/fs.h>
40602adf40SYehuda Sadeh #include <linux/blkdev.h>
41602adf40SYehuda Sadeh 
42602adf40SYehuda Sadeh #include "rbd_types.h"
43602adf40SYehuda Sadeh 
/* Short and descriptive driver names. */
#define DRV_NAME "rbd"
#define DRV_NAME_LONG "rbd (rados block device)"

/* max minors per blkdev */
#define RBD_MINORS_PER_MAJOR	256

/*
 * Buffer sizes for the name fields of struct rbd_device.
 * RBD_MAX_OBJ_NAME_LEN and RBD_SUFFIX are not defined in this file
 * (presumably rbd_types.h -- TODO confirm).
 */
#define RBD_MAX_MD_NAME_LEN	(RBD_MAX_OBJ_NAME_LEN + sizeof(RBD_SUFFIX))
#define RBD_MAX_POOL_NAME_LEN	64
#define RBD_MAX_SNAP_NAME_LEN	32
#define RBD_MAX_OPT_LEN		1024

/* Snapshot name that selects the image head rather than a snapshot. */
#define RBD_SNAP_HEAD_NAME	"-"

/* Size of the blkdev name buffer (rbd_device.name). */
#define DEV_NAME_LEN		32

/* Initial value of rbd_options.notify_timeout. */
#define RBD_NOTIFY_TIMEOUT_DEFAULT 10
5959c2be1eSYehuda Sadeh 
60602adf40SYehuda Sadeh /*
61602adf40SYehuda Sadeh  * block device image metadata (in-memory version)
62602adf40SYehuda Sadeh  */
63602adf40SYehuda Sadeh struct rbd_image_header {
64602adf40SYehuda Sadeh 	u64 image_size;
65602adf40SYehuda Sadeh 	char block_name[32];
66602adf40SYehuda Sadeh 	__u8 obj_order;
67602adf40SYehuda Sadeh 	__u8 crypt_type;
68602adf40SYehuda Sadeh 	__u8 comp_type;
69602adf40SYehuda Sadeh 	struct rw_semaphore snap_rwsem;
70602adf40SYehuda Sadeh 	struct ceph_snap_context *snapc;
71602adf40SYehuda Sadeh 	size_t snap_names_len;
72602adf40SYehuda Sadeh 	u64 snap_seq;
73602adf40SYehuda Sadeh 	u32 total_snaps;
74602adf40SYehuda Sadeh 
75602adf40SYehuda Sadeh 	char *snap_names;
76602adf40SYehuda Sadeh 	u64 *snap_sizes;
7759c2be1eSYehuda Sadeh 
7859c2be1eSYehuda Sadeh 	u64 obj_version;
7959c2be1eSYehuda Sadeh };
8059c2be1eSYehuda Sadeh 
/* rbd-specific options parsed by parse_rbd_opts_token(). */
struct rbd_options {
	int notify_timeout;	/* defaults to RBD_NOTIFY_TIMEOUT_DEFAULT */
};
84602adf40SYehuda Sadeh 
85602adf40SYehuda Sadeh /*
86602adf40SYehuda Sadeh  * an instance of the client.  multiple devices may share a client.
87602adf40SYehuda Sadeh  */
88602adf40SYehuda Sadeh struct rbd_client {
89602adf40SYehuda Sadeh 	struct ceph_client	*client;
9059c2be1eSYehuda Sadeh 	struct rbd_options	*rbd_opts;
91602adf40SYehuda Sadeh 	struct kref		kref;
92602adf40SYehuda Sadeh 	struct list_head	node;
93602adf40SYehuda Sadeh };
94602adf40SYehuda Sadeh 
951fec7093SYehuda Sadeh struct rbd_req_coll;
961fec7093SYehuda Sadeh 
97602adf40SYehuda Sadeh /*
98602adf40SYehuda Sadeh  * a single io request
99602adf40SYehuda Sadeh  */
100602adf40SYehuda Sadeh struct rbd_request {
101602adf40SYehuda Sadeh 	struct request		*rq;		/* blk layer request */
102602adf40SYehuda Sadeh 	struct bio		*bio;		/* cloned bio */
103602adf40SYehuda Sadeh 	struct page		**pages;	/* list of used pages */
104602adf40SYehuda Sadeh 	u64			len;
1051fec7093SYehuda Sadeh 	int			coll_index;
1061fec7093SYehuda Sadeh 	struct rbd_req_coll	*coll;
1071fec7093SYehuda Sadeh };
1081fec7093SYehuda Sadeh 
1091fec7093SYehuda Sadeh struct rbd_req_status {
1101fec7093SYehuda Sadeh 	int done;
1111fec7093SYehuda Sadeh 	int rc;
1121fec7093SYehuda Sadeh 	u64 bytes;
1131fec7093SYehuda Sadeh };
1141fec7093SYehuda Sadeh 
1151fec7093SYehuda Sadeh /*
1161fec7093SYehuda Sadeh  * a collection of requests
1171fec7093SYehuda Sadeh  */
1181fec7093SYehuda Sadeh struct rbd_req_coll {
1191fec7093SYehuda Sadeh 	int			total;
1201fec7093SYehuda Sadeh 	int			num_done;
1211fec7093SYehuda Sadeh 	struct kref		kref;
1221fec7093SYehuda Sadeh 	struct rbd_req_status	status[0];
123602adf40SYehuda Sadeh };
124602adf40SYehuda Sadeh 
125dfc5606dSYehuda Sadeh struct rbd_snap {
126dfc5606dSYehuda Sadeh 	struct	device		dev;
127dfc5606dSYehuda Sadeh 	const char		*name;
128dfc5606dSYehuda Sadeh 	size_t			size;
129dfc5606dSYehuda Sadeh 	struct list_head	node;
130dfc5606dSYehuda Sadeh 	u64			id;
131dfc5606dSYehuda Sadeh };
132dfc5606dSYehuda Sadeh 
133602adf40SYehuda Sadeh /*
134602adf40SYehuda Sadeh  * a single device
135602adf40SYehuda Sadeh  */
136602adf40SYehuda Sadeh struct rbd_device {
137602adf40SYehuda Sadeh 	int			id;		/* blkdev unique id */
138602adf40SYehuda Sadeh 
139602adf40SYehuda Sadeh 	int			major;		/* blkdev assigned major */
140602adf40SYehuda Sadeh 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
141602adf40SYehuda Sadeh 	struct request_queue	*q;
142602adf40SYehuda Sadeh 
143602adf40SYehuda Sadeh 	struct ceph_client	*client;
144602adf40SYehuda Sadeh 	struct rbd_client	*rbd_client;
145602adf40SYehuda Sadeh 
146602adf40SYehuda Sadeh 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
147602adf40SYehuda Sadeh 
148602adf40SYehuda Sadeh 	spinlock_t		lock;		/* queue lock */
149602adf40SYehuda Sadeh 
150602adf40SYehuda Sadeh 	struct rbd_image_header	header;
151602adf40SYehuda Sadeh 	char			obj[RBD_MAX_OBJ_NAME_LEN]; /* rbd image name */
152602adf40SYehuda Sadeh 	int			obj_len;
153602adf40SYehuda Sadeh 	char			obj_md_name[RBD_MAX_MD_NAME_LEN]; /* hdr nm. */
154602adf40SYehuda Sadeh 	char			pool_name[RBD_MAX_POOL_NAME_LEN];
155602adf40SYehuda Sadeh 	int			poolid;
156602adf40SYehuda Sadeh 
15759c2be1eSYehuda Sadeh 	struct ceph_osd_event   *watch_event;
15859c2be1eSYehuda Sadeh 	struct ceph_osd_request *watch_request;
15959c2be1eSYehuda Sadeh 
160602adf40SYehuda Sadeh 	char                    snap_name[RBD_MAX_SNAP_NAME_LEN];
161602adf40SYehuda Sadeh 	u32 cur_snap;	/* index+1 of current snapshot within snap context
162602adf40SYehuda Sadeh 			   0 - for the head */
163602adf40SYehuda Sadeh 	int read_only;
164602adf40SYehuda Sadeh 
165602adf40SYehuda Sadeh 	struct list_head	node;
166dfc5606dSYehuda Sadeh 
167dfc5606dSYehuda Sadeh 	/* list of snapshots */
168dfc5606dSYehuda Sadeh 	struct list_head	snaps;
169dfc5606dSYehuda Sadeh 
170dfc5606dSYehuda Sadeh 	/* sysfs related */
171dfc5606dSYehuda Sadeh 	struct device		dev;
172dfc5606dSYehuda Sadeh };
173dfc5606dSYehuda Sadeh 
174dfc5606dSYehuda Sadeh static struct bus_type rbd_bus_type = {
175dfc5606dSYehuda Sadeh 	.name		= "rbd",
176602adf40SYehuda Sadeh };
177602adf40SYehuda Sadeh 
17821079786SAlex Elder static DEFINE_SPINLOCK(node_lock);      /* protects client get/put */
179602adf40SYehuda Sadeh 
180602adf40SYehuda Sadeh static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */
181602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list);    /* devices */
182602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list);      /* clients */
183602adf40SYehuda Sadeh 
184dfc5606dSYehuda Sadeh static int __rbd_init_snaps_header(struct rbd_device *rbd_dev);
185dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev);
186dfc5606dSYehuda Sadeh static ssize_t rbd_snap_add(struct device *dev,
187dfc5606dSYehuda Sadeh 			    struct device_attribute *attr,
188dfc5606dSYehuda Sadeh 			    const char *buf,
189dfc5606dSYehuda Sadeh 			    size_t count);
190dfc5606dSYehuda Sadeh static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
19169932487SJustin P. Mattock 				  struct rbd_snap *snap);
192dfc5606dSYehuda Sadeh 
193dfc5606dSYehuda Sadeh 
194dfc5606dSYehuda Sadeh static struct rbd_device *dev_to_rbd(struct device *dev)
195dfc5606dSYehuda Sadeh {
196dfc5606dSYehuda Sadeh 	return container_of(dev, struct rbd_device, dev);
197dfc5606dSYehuda Sadeh }
198dfc5606dSYehuda Sadeh 
199dfc5606dSYehuda Sadeh static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
200dfc5606dSYehuda Sadeh {
201dfc5606dSYehuda Sadeh 	return get_device(&rbd_dev->dev);
202dfc5606dSYehuda Sadeh }
203dfc5606dSYehuda Sadeh 
204dfc5606dSYehuda Sadeh static void rbd_put_dev(struct rbd_device *rbd_dev)
205dfc5606dSYehuda Sadeh {
206dfc5606dSYehuda Sadeh 	put_device(&rbd_dev->dev);
207dfc5606dSYehuda Sadeh }
208602adf40SYehuda Sadeh 
20959c2be1eSYehuda Sadeh static int __rbd_update_snaps(struct rbd_device *rbd_dev);
21059c2be1eSYehuda Sadeh 
211602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
212602adf40SYehuda Sadeh {
213602adf40SYehuda Sadeh 	struct gendisk *disk = bdev->bd_disk;
214602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
215602adf40SYehuda Sadeh 
216dfc5606dSYehuda Sadeh 	rbd_get_dev(rbd_dev);
217dfc5606dSYehuda Sadeh 
218602adf40SYehuda Sadeh 	set_device_ro(bdev, rbd_dev->read_only);
219602adf40SYehuda Sadeh 
220602adf40SYehuda Sadeh 	if ((mode & FMODE_WRITE) && rbd_dev->read_only)
221602adf40SYehuda Sadeh 		return -EROFS;
222602adf40SYehuda Sadeh 
223602adf40SYehuda Sadeh 	return 0;
224602adf40SYehuda Sadeh }
225602adf40SYehuda Sadeh 
226dfc5606dSYehuda Sadeh static int rbd_release(struct gendisk *disk, fmode_t mode)
227dfc5606dSYehuda Sadeh {
228dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
229dfc5606dSYehuda Sadeh 
230dfc5606dSYehuda Sadeh 	rbd_put_dev(rbd_dev);
231dfc5606dSYehuda Sadeh 
232dfc5606dSYehuda Sadeh 	return 0;
233dfc5606dSYehuda Sadeh }
234dfc5606dSYehuda Sadeh 
235602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = {
236602adf40SYehuda Sadeh 	.owner			= THIS_MODULE,
237602adf40SYehuda Sadeh 	.open			= rbd_open,
238dfc5606dSYehuda Sadeh 	.release		= rbd_release,
239602adf40SYehuda Sadeh };
240602adf40SYehuda Sadeh 
241602adf40SYehuda Sadeh /*
242602adf40SYehuda Sadeh  * Initialize an rbd client instance.
243602adf40SYehuda Sadeh  * We own *opt.
244602adf40SYehuda Sadeh  */
24559c2be1eSYehuda Sadeh static struct rbd_client *rbd_client_create(struct ceph_options *opt,
24659c2be1eSYehuda Sadeh 					    struct rbd_options *rbd_opts)
247602adf40SYehuda Sadeh {
248602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
249602adf40SYehuda Sadeh 	int ret = -ENOMEM;
250602adf40SYehuda Sadeh 
251602adf40SYehuda Sadeh 	dout("rbd_client_create\n");
252602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
253602adf40SYehuda Sadeh 	if (!rbdc)
254602adf40SYehuda Sadeh 		goto out_opt;
255602adf40SYehuda Sadeh 
256602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
257602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
258602adf40SYehuda Sadeh 
2596ab00d46SSage Weil 	rbdc->client = ceph_create_client(opt, rbdc, 0, 0);
260602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
261602adf40SYehuda Sadeh 		goto out_rbdc;
26228f259b7SVasiliy Kulikov 	opt = NULL; /* Now rbdc->client is responsible for opt */
263602adf40SYehuda Sadeh 
264602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
265602adf40SYehuda Sadeh 	if (ret < 0)
266602adf40SYehuda Sadeh 		goto out_err;
267602adf40SYehuda Sadeh 
26859c2be1eSYehuda Sadeh 	rbdc->rbd_opts = rbd_opts;
26959c2be1eSYehuda Sadeh 
270602adf40SYehuda Sadeh 	spin_lock(&node_lock);
271602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
272602adf40SYehuda Sadeh 	spin_unlock(&node_lock);
273602adf40SYehuda Sadeh 
274602adf40SYehuda Sadeh 	dout("rbd_client_create created %p\n", rbdc);
275602adf40SYehuda Sadeh 	return rbdc;
276602adf40SYehuda Sadeh 
277602adf40SYehuda Sadeh out_err:
278602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
279602adf40SYehuda Sadeh out_rbdc:
280602adf40SYehuda Sadeh 	kfree(rbdc);
281602adf40SYehuda Sadeh out_opt:
28228f259b7SVasiliy Kulikov 	if (opt)
283602adf40SYehuda Sadeh 		ceph_destroy_options(opt);
28428f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
285602adf40SYehuda Sadeh }
286602adf40SYehuda Sadeh 
287602adf40SYehuda Sadeh /*
288602adf40SYehuda Sadeh  * Find a ceph client with specific addr and configuration.
289602adf40SYehuda Sadeh  */
290602adf40SYehuda Sadeh static struct rbd_client *__rbd_client_find(struct ceph_options *opt)
291602adf40SYehuda Sadeh {
292602adf40SYehuda Sadeh 	struct rbd_client *client_node;
293602adf40SYehuda Sadeh 
294602adf40SYehuda Sadeh 	if (opt->flags & CEPH_OPT_NOSHARE)
295602adf40SYehuda Sadeh 		return NULL;
296602adf40SYehuda Sadeh 
297602adf40SYehuda Sadeh 	list_for_each_entry(client_node, &rbd_client_list, node)
298602adf40SYehuda Sadeh 		if (ceph_compare_options(opt, client_node->client) == 0)
299602adf40SYehuda Sadeh 			return client_node;
300602adf40SYehuda Sadeh 	return NULL;
301602adf40SYehuda Sadeh }
302602adf40SYehuda Sadeh 
/*
 * mount options
 *
 * Tokens are grouped by argument type; Opt_last_int and Opt_last_string
 * are markers that close the integer and the string group respectively.
 */
enum {
	/* int args */
	Opt_notify_timeout,
	Opt_last_int,
	/* string args (none so far) */
	Opt_last_string,
};
31359c2be1eSYehuda Sadeh 
31459c2be1eSYehuda Sadeh static match_table_t rbdopt_tokens = {
31559c2be1eSYehuda Sadeh 	{Opt_notify_timeout, "notify_timeout=%d"},
31659c2be1eSYehuda Sadeh 	/* int args above */
31759c2be1eSYehuda Sadeh 	/* string args above */
31859c2be1eSYehuda Sadeh 	{-1, NULL}
31959c2be1eSYehuda Sadeh };
32059c2be1eSYehuda Sadeh 
32159c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
32259c2be1eSYehuda Sadeh {
32359c2be1eSYehuda Sadeh 	struct rbd_options *rbdopt = private;
32459c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
32559c2be1eSYehuda Sadeh 	int token, intval, ret;
32659c2be1eSYehuda Sadeh 
32721079786SAlex Elder 	token = match_token(c, rbdopt_tokens, argstr);
32859c2be1eSYehuda Sadeh 	if (token < 0)
32959c2be1eSYehuda Sadeh 		return -EINVAL;
33059c2be1eSYehuda Sadeh 
33159c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
33259c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
33359c2be1eSYehuda Sadeh 		if (ret < 0) {
33459c2be1eSYehuda Sadeh 			pr_err("bad mount option arg (not int) "
33559c2be1eSYehuda Sadeh 			       "at '%s'\n", c);
33659c2be1eSYehuda Sadeh 			return ret;
33759c2be1eSYehuda Sadeh 		}
33859c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
33959c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
34059c2be1eSYehuda Sadeh 		dout("got string token %d val %s\n", token,
34159c2be1eSYehuda Sadeh 		     argstr[0].from);
34259c2be1eSYehuda Sadeh 	} else {
34359c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
34459c2be1eSYehuda Sadeh 	}
34559c2be1eSYehuda Sadeh 
34659c2be1eSYehuda Sadeh 	switch (token) {
34759c2be1eSYehuda Sadeh 	case Opt_notify_timeout:
34859c2be1eSYehuda Sadeh 		rbdopt->notify_timeout = intval;
34959c2be1eSYehuda Sadeh 		break;
35059c2be1eSYehuda Sadeh 	default:
35159c2be1eSYehuda Sadeh 		BUG_ON(token);
35259c2be1eSYehuda Sadeh 	}
35359c2be1eSYehuda Sadeh 	return 0;
35459c2be1eSYehuda Sadeh }
35559c2be1eSYehuda Sadeh 
35659c2be1eSYehuda Sadeh /*
357602adf40SYehuda Sadeh  * Get a ceph client with specific addr and configuration, if one does
358602adf40SYehuda Sadeh  * not exist create it.
359602adf40SYehuda Sadeh  */
360602adf40SYehuda Sadeh static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
361602adf40SYehuda Sadeh 			  char *options)
362602adf40SYehuda Sadeh {
363602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
364602adf40SYehuda Sadeh 	struct ceph_options *opt;
365602adf40SYehuda Sadeh 	int ret;
36659c2be1eSYehuda Sadeh 	struct rbd_options *rbd_opts;
36759c2be1eSYehuda Sadeh 
36859c2be1eSYehuda Sadeh 	rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL);
36959c2be1eSYehuda Sadeh 	if (!rbd_opts)
37059c2be1eSYehuda Sadeh 		return -ENOMEM;
37159c2be1eSYehuda Sadeh 
37259c2be1eSYehuda Sadeh 	rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT;
373602adf40SYehuda Sadeh 
374*ee57741cSAlex Elder 	opt = ceph_parse_options(options, mon_addr,
37521079786SAlex Elder 				mon_addr + strlen(mon_addr),
37621079786SAlex Elder 				parse_rbd_opts_token, rbd_opts);
377*ee57741cSAlex Elder 	if (IS_ERR(opt)) {
378*ee57741cSAlex Elder 		ret = PTR_ERR(opt);
37959c2be1eSYehuda Sadeh 		goto done_err;
380*ee57741cSAlex Elder 	}
381602adf40SYehuda Sadeh 
382602adf40SYehuda Sadeh 	spin_lock(&node_lock);
383602adf40SYehuda Sadeh 	rbdc = __rbd_client_find(opt);
384602adf40SYehuda Sadeh 	if (rbdc) {
385602adf40SYehuda Sadeh 		ceph_destroy_options(opt);
38697bb59a0SAlex Elder 		kfree(rbd_opts);
387602adf40SYehuda Sadeh 
388602adf40SYehuda Sadeh 		/* using an existing client */
389602adf40SYehuda Sadeh 		kref_get(&rbdc->kref);
390602adf40SYehuda Sadeh 		rbd_dev->rbd_client = rbdc;
391602adf40SYehuda Sadeh 		rbd_dev->client = rbdc->client;
392602adf40SYehuda Sadeh 		spin_unlock(&node_lock);
393602adf40SYehuda Sadeh 		return 0;
394602adf40SYehuda Sadeh 	}
395602adf40SYehuda Sadeh 	spin_unlock(&node_lock);
396602adf40SYehuda Sadeh 
39759c2be1eSYehuda Sadeh 	rbdc = rbd_client_create(opt, rbd_opts);
39859c2be1eSYehuda Sadeh 	if (IS_ERR(rbdc)) {
39959c2be1eSYehuda Sadeh 		ret = PTR_ERR(rbdc);
40059c2be1eSYehuda Sadeh 		goto done_err;
40159c2be1eSYehuda Sadeh 	}
402602adf40SYehuda Sadeh 
403602adf40SYehuda Sadeh 	rbd_dev->rbd_client = rbdc;
404602adf40SYehuda Sadeh 	rbd_dev->client = rbdc->client;
405602adf40SYehuda Sadeh 	return 0;
40659c2be1eSYehuda Sadeh done_err:
40759c2be1eSYehuda Sadeh 	kfree(rbd_opts);
40859c2be1eSYehuda Sadeh 	return ret;
409602adf40SYehuda Sadeh }
410602adf40SYehuda Sadeh 
411602adf40SYehuda Sadeh /*
412602adf40SYehuda Sadeh  * Destroy ceph client
413d23a4b3fSAlex Elder  *
414d23a4b3fSAlex Elder  * Caller must hold node_lock.
415602adf40SYehuda Sadeh  */
416602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
417602adf40SYehuda Sadeh {
418602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
419602adf40SYehuda Sadeh 
420602adf40SYehuda Sadeh 	dout("rbd_release_client %p\n", rbdc);
421602adf40SYehuda Sadeh 	list_del(&rbdc->node);
422602adf40SYehuda Sadeh 
423602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
42459c2be1eSYehuda Sadeh 	kfree(rbdc->rbd_opts);
425602adf40SYehuda Sadeh 	kfree(rbdc);
426602adf40SYehuda Sadeh }
427602adf40SYehuda Sadeh 
428602adf40SYehuda Sadeh /*
429602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
430602adf40SYehuda Sadeh  * it.
431602adf40SYehuda Sadeh  */
432602adf40SYehuda Sadeh static void rbd_put_client(struct rbd_device *rbd_dev)
433602adf40SYehuda Sadeh {
434d23a4b3fSAlex Elder 	spin_lock(&node_lock);
435602adf40SYehuda Sadeh 	kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
436d23a4b3fSAlex Elder 	spin_unlock(&node_lock);
437602adf40SYehuda Sadeh 	rbd_dev->rbd_client = NULL;
438602adf40SYehuda Sadeh 	rbd_dev->client = NULL;
439602adf40SYehuda Sadeh }
440602adf40SYehuda Sadeh 
4411fec7093SYehuda Sadeh /*
4421fec7093SYehuda Sadeh  * Destroy requests collection
4431fec7093SYehuda Sadeh  */
4441fec7093SYehuda Sadeh static void rbd_coll_release(struct kref *kref)
4451fec7093SYehuda Sadeh {
4461fec7093SYehuda Sadeh 	struct rbd_req_coll *coll =
4471fec7093SYehuda Sadeh 		container_of(kref, struct rbd_req_coll, kref);
4481fec7093SYehuda Sadeh 
4491fec7093SYehuda Sadeh 	dout("rbd_coll_release %p\n", coll);
4501fec7093SYehuda Sadeh 	kfree(coll);
4511fec7093SYehuda Sadeh }
452602adf40SYehuda Sadeh 
453602adf40SYehuda Sadeh /*
454602adf40SYehuda Sadeh  * Create a new header structure, translate header format from the on-disk
455602adf40SYehuda Sadeh  * header.
456602adf40SYehuda Sadeh  */
457602adf40SYehuda Sadeh static int rbd_header_from_disk(struct rbd_image_header *header,
458602adf40SYehuda Sadeh 				 struct rbd_image_header_ondisk *ondisk,
459602adf40SYehuda Sadeh 				 int allocated_snaps,
460602adf40SYehuda Sadeh 				 gfp_t gfp_flags)
461602adf40SYehuda Sadeh {
462602adf40SYehuda Sadeh 	int i;
463602adf40SYehuda Sadeh 	u32 snap_count = le32_to_cpu(ondisk->snap_count);
464602adf40SYehuda Sadeh 	int ret = -ENOMEM;
465602adf40SYehuda Sadeh 
46621079786SAlex Elder 	if (memcmp(ondisk, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT)))
46781e759fbSJosh Durgin 		return -ENXIO;
46881e759fbSJosh Durgin 
469602adf40SYehuda Sadeh 	init_rwsem(&header->snap_rwsem);
470602adf40SYehuda Sadeh 	header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
471602adf40SYehuda Sadeh 	header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
47221079786SAlex Elder 				snap_count * sizeof (*ondisk),
473602adf40SYehuda Sadeh 				gfp_flags);
474602adf40SYehuda Sadeh 	if (!header->snapc)
475602adf40SYehuda Sadeh 		return -ENOMEM;
476602adf40SYehuda Sadeh 	if (snap_count) {
477602adf40SYehuda Sadeh 		header->snap_names = kmalloc(header->snap_names_len,
478602adf40SYehuda Sadeh 					     GFP_KERNEL);
479602adf40SYehuda Sadeh 		if (!header->snap_names)
480602adf40SYehuda Sadeh 			goto err_snapc;
481602adf40SYehuda Sadeh 		header->snap_sizes = kmalloc(snap_count * sizeof(u64),
482602adf40SYehuda Sadeh 					     GFP_KERNEL);
483602adf40SYehuda Sadeh 		if (!header->snap_sizes)
484602adf40SYehuda Sadeh 			goto err_names;
485602adf40SYehuda Sadeh 	} else {
486602adf40SYehuda Sadeh 		header->snap_names = NULL;
487602adf40SYehuda Sadeh 		header->snap_sizes = NULL;
488602adf40SYehuda Sadeh 	}
489602adf40SYehuda Sadeh 	memcpy(header->block_name, ondisk->block_name,
490602adf40SYehuda Sadeh 	       sizeof(ondisk->block_name));
491602adf40SYehuda Sadeh 
492602adf40SYehuda Sadeh 	header->image_size = le64_to_cpu(ondisk->image_size);
493602adf40SYehuda Sadeh 	header->obj_order = ondisk->options.order;
494602adf40SYehuda Sadeh 	header->crypt_type = ondisk->options.crypt_type;
495602adf40SYehuda Sadeh 	header->comp_type = ondisk->options.comp_type;
496602adf40SYehuda Sadeh 
497602adf40SYehuda Sadeh 	atomic_set(&header->snapc->nref, 1);
498602adf40SYehuda Sadeh 	header->snap_seq = le64_to_cpu(ondisk->snap_seq);
499602adf40SYehuda Sadeh 	header->snapc->num_snaps = snap_count;
500602adf40SYehuda Sadeh 	header->total_snaps = snap_count;
501602adf40SYehuda Sadeh 
50221079786SAlex Elder 	if (snap_count && allocated_snaps == snap_count) {
503602adf40SYehuda Sadeh 		for (i = 0; i < snap_count; i++) {
504602adf40SYehuda Sadeh 			header->snapc->snaps[i] =
505602adf40SYehuda Sadeh 				le64_to_cpu(ondisk->snaps[i].id);
506602adf40SYehuda Sadeh 			header->snap_sizes[i] =
507602adf40SYehuda Sadeh 				le64_to_cpu(ondisk->snaps[i].image_size);
508602adf40SYehuda Sadeh 		}
509602adf40SYehuda Sadeh 
510602adf40SYehuda Sadeh 		/* copy snapshot names */
511602adf40SYehuda Sadeh 		memcpy(header->snap_names, &ondisk->snaps[i],
512602adf40SYehuda Sadeh 			header->snap_names_len);
513602adf40SYehuda Sadeh 	}
514602adf40SYehuda Sadeh 
515602adf40SYehuda Sadeh 	return 0;
516602adf40SYehuda Sadeh 
517602adf40SYehuda Sadeh err_names:
518602adf40SYehuda Sadeh 	kfree(header->snap_names);
519602adf40SYehuda Sadeh err_snapc:
520602adf40SYehuda Sadeh 	kfree(header->snapc);
521602adf40SYehuda Sadeh 	return ret;
522602adf40SYehuda Sadeh }
523602adf40SYehuda Sadeh 
524602adf40SYehuda Sadeh static int snap_index(struct rbd_image_header *header, int snap_num)
525602adf40SYehuda Sadeh {
526602adf40SYehuda Sadeh 	return header->total_snaps - snap_num;
527602adf40SYehuda Sadeh }
528602adf40SYehuda Sadeh 
529602adf40SYehuda Sadeh static u64 cur_snap_id(struct rbd_device *rbd_dev)
530602adf40SYehuda Sadeh {
531602adf40SYehuda Sadeh 	struct rbd_image_header *header = &rbd_dev->header;
532602adf40SYehuda Sadeh 
533602adf40SYehuda Sadeh 	if (!rbd_dev->cur_snap)
534602adf40SYehuda Sadeh 		return 0;
535602adf40SYehuda Sadeh 
536602adf40SYehuda Sadeh 	return header->snapc->snaps[snap_index(header, rbd_dev->cur_snap)];
537602adf40SYehuda Sadeh }
538602adf40SYehuda Sadeh 
539602adf40SYehuda Sadeh static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
540602adf40SYehuda Sadeh 			u64 *seq, u64 *size)
541602adf40SYehuda Sadeh {
542602adf40SYehuda Sadeh 	int i;
543602adf40SYehuda Sadeh 	char *p = header->snap_names;
544602adf40SYehuda Sadeh 
545602adf40SYehuda Sadeh 	for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) {
546602adf40SYehuda Sadeh 		if (strcmp(snap_name, p) == 0)
547602adf40SYehuda Sadeh 			break;
548602adf40SYehuda Sadeh 	}
549602adf40SYehuda Sadeh 	if (i == header->total_snaps)
550602adf40SYehuda Sadeh 		return -ENOENT;
551602adf40SYehuda Sadeh 	if (seq)
552602adf40SYehuda Sadeh 		*seq = header->snapc->snaps[i];
553602adf40SYehuda Sadeh 
554602adf40SYehuda Sadeh 	if (size)
555602adf40SYehuda Sadeh 		*size = header->snap_sizes[i];
556602adf40SYehuda Sadeh 
557602adf40SYehuda Sadeh 	return i;
558602adf40SYehuda Sadeh }
559602adf40SYehuda Sadeh 
560602adf40SYehuda Sadeh static int rbd_header_set_snap(struct rbd_device *dev,
561602adf40SYehuda Sadeh 			       const char *snap_name,
562602adf40SYehuda Sadeh 			       u64 *size)
563602adf40SYehuda Sadeh {
564602adf40SYehuda Sadeh 	struct rbd_image_header *header = &dev->header;
565602adf40SYehuda Sadeh 	struct ceph_snap_context *snapc = header->snapc;
566602adf40SYehuda Sadeh 	int ret = -ENOENT;
567602adf40SYehuda Sadeh 
568602adf40SYehuda Sadeh 	down_write(&header->snap_rwsem);
569602adf40SYehuda Sadeh 
570602adf40SYehuda Sadeh 	if (!snap_name ||
571602adf40SYehuda Sadeh 	    !*snap_name ||
572602adf40SYehuda Sadeh 	    strcmp(snap_name, "-") == 0 ||
573602adf40SYehuda Sadeh 	    strcmp(snap_name, RBD_SNAP_HEAD_NAME) == 0) {
574602adf40SYehuda Sadeh 		if (header->total_snaps)
575602adf40SYehuda Sadeh 			snapc->seq = header->snap_seq;
576602adf40SYehuda Sadeh 		else
577602adf40SYehuda Sadeh 			snapc->seq = 0;
578602adf40SYehuda Sadeh 		dev->cur_snap = 0;
579602adf40SYehuda Sadeh 		dev->read_only = 0;
580602adf40SYehuda Sadeh 		if (size)
581602adf40SYehuda Sadeh 			*size = header->image_size;
582602adf40SYehuda Sadeh 	} else {
583602adf40SYehuda Sadeh 		ret = snap_by_name(header, snap_name, &snapc->seq, size);
584602adf40SYehuda Sadeh 		if (ret < 0)
585602adf40SYehuda Sadeh 			goto done;
586602adf40SYehuda Sadeh 
587602adf40SYehuda Sadeh 		dev->cur_snap = header->total_snaps - ret;
588602adf40SYehuda Sadeh 		dev->read_only = 1;
589602adf40SYehuda Sadeh 	}
590602adf40SYehuda Sadeh 
591602adf40SYehuda Sadeh 	ret = 0;
592602adf40SYehuda Sadeh done:
593602adf40SYehuda Sadeh 	up_write(&header->snap_rwsem);
594602adf40SYehuda Sadeh 	return ret;
595602adf40SYehuda Sadeh }
596602adf40SYehuda Sadeh 
597602adf40SYehuda Sadeh static void rbd_header_free(struct rbd_image_header *header)
598602adf40SYehuda Sadeh {
599602adf40SYehuda Sadeh 	kfree(header->snapc);
600602adf40SYehuda Sadeh 	kfree(header->snap_names);
601602adf40SYehuda Sadeh 	kfree(header->snap_sizes);
602602adf40SYehuda Sadeh }
603602adf40SYehuda Sadeh 
604602adf40SYehuda Sadeh /*
605602adf40SYehuda Sadeh  * get the actual striped segment name, offset and length
606602adf40SYehuda Sadeh  */
607602adf40SYehuda Sadeh static u64 rbd_get_segment(struct rbd_image_header *header,
608602adf40SYehuda Sadeh 			   const char *block_name,
609602adf40SYehuda Sadeh 			   u64 ofs, u64 len,
610602adf40SYehuda Sadeh 			   char *seg_name, u64 *segofs)
611602adf40SYehuda Sadeh {
612602adf40SYehuda Sadeh 	u64 seg = ofs >> header->obj_order;
613602adf40SYehuda Sadeh 
614602adf40SYehuda Sadeh 	if (seg_name)
615602adf40SYehuda Sadeh 		snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
616602adf40SYehuda Sadeh 			 "%s.%012llx", block_name, seg);
617602adf40SYehuda Sadeh 
618602adf40SYehuda Sadeh 	ofs = ofs & ((1 << header->obj_order) - 1);
619602adf40SYehuda Sadeh 	len = min_t(u64, len, (1 << header->obj_order) - ofs);
620602adf40SYehuda Sadeh 
621602adf40SYehuda Sadeh 	if (segofs)
622602adf40SYehuda Sadeh 		*segofs = ofs;
623602adf40SYehuda Sadeh 
624602adf40SYehuda Sadeh 	return len;
625602adf40SYehuda Sadeh }
626602adf40SYehuda Sadeh 
6271fec7093SYehuda Sadeh static int rbd_get_num_segments(struct rbd_image_header *header,
6281fec7093SYehuda Sadeh 				u64 ofs, u64 len)
6291fec7093SYehuda Sadeh {
6301fec7093SYehuda Sadeh 	u64 start_seg = ofs >> header->obj_order;
6311fec7093SYehuda Sadeh 	u64 end_seg = (ofs + len - 1) >> header->obj_order;
6321fec7093SYehuda Sadeh 	return end_seg - start_seg + 1;
6331fec7093SYehuda Sadeh }
6341fec7093SYehuda Sadeh 
635602adf40SYehuda Sadeh /*
636029bcbd8SJosh Durgin  * returns the size of an object in the image
637029bcbd8SJosh Durgin  */
638029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header)
639029bcbd8SJosh Durgin {
640029bcbd8SJosh Durgin 	return 1 << header->obj_order;
641029bcbd8SJosh Durgin }
642029bcbd8SJosh Durgin 
643029bcbd8SJosh Durgin /*
644602adf40SYehuda Sadeh  * bio helpers
645602adf40SYehuda Sadeh  */
646602adf40SYehuda Sadeh 
647602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain)
648602adf40SYehuda Sadeh {
649602adf40SYehuda Sadeh 	struct bio *tmp;
650602adf40SYehuda Sadeh 
651602adf40SYehuda Sadeh 	while (chain) {
652602adf40SYehuda Sadeh 		tmp = chain;
653602adf40SYehuda Sadeh 		chain = chain->bi_next;
654602adf40SYehuda Sadeh 		bio_put(tmp);
655602adf40SYehuda Sadeh 	}
656602adf40SYehuda Sadeh }
657602adf40SYehuda Sadeh 
658602adf40SYehuda Sadeh /*
659602adf40SYehuda Sadeh  * zeros a bio chain, starting at specific offset
660602adf40SYehuda Sadeh  */
661602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs)
662602adf40SYehuda Sadeh {
663602adf40SYehuda Sadeh 	struct bio_vec *bv;
664602adf40SYehuda Sadeh 	unsigned long flags;
665602adf40SYehuda Sadeh 	void *buf;
666602adf40SYehuda Sadeh 	int i;
667602adf40SYehuda Sadeh 	int pos = 0;
668602adf40SYehuda Sadeh 
669602adf40SYehuda Sadeh 	while (chain) {
670602adf40SYehuda Sadeh 		bio_for_each_segment(bv, chain, i) {
671602adf40SYehuda Sadeh 			if (pos + bv->bv_len > start_ofs) {
672602adf40SYehuda Sadeh 				int remainder = max(start_ofs - pos, 0);
673602adf40SYehuda Sadeh 				buf = bvec_kmap_irq(bv, &flags);
674602adf40SYehuda Sadeh 				memset(buf + remainder, 0,
675602adf40SYehuda Sadeh 				       bv->bv_len - remainder);
67685b5aaa6SDan Carpenter 				bvec_kunmap_irq(buf, &flags);
677602adf40SYehuda Sadeh 			}
678602adf40SYehuda Sadeh 			pos += bv->bv_len;
679602adf40SYehuda Sadeh 		}
680602adf40SYehuda Sadeh 
681602adf40SYehuda Sadeh 		chain = chain->bi_next;
682602adf40SYehuda Sadeh 	}
683602adf40SYehuda Sadeh }
684602adf40SYehuda Sadeh 
685602adf40SYehuda Sadeh /*
686602adf40SYehuda Sadeh  * bio_chain_clone - clone a chain of bios up to a certain length.
687602adf40SYehuda Sadeh  * might return a bio_pair that will need to be released.
688602adf40SYehuda Sadeh  */
689602adf40SYehuda Sadeh static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
690602adf40SYehuda Sadeh 				   struct bio_pair **bp,
691602adf40SYehuda Sadeh 				   int len, gfp_t gfpmask)
692602adf40SYehuda Sadeh {
693602adf40SYehuda Sadeh 	struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
694602adf40SYehuda Sadeh 	int total = 0;
695602adf40SYehuda Sadeh 
696602adf40SYehuda Sadeh 	if (*bp) {
697602adf40SYehuda Sadeh 		bio_pair_release(*bp);
698602adf40SYehuda Sadeh 		*bp = NULL;
699602adf40SYehuda Sadeh 	}
700602adf40SYehuda Sadeh 
701602adf40SYehuda Sadeh 	while (old_chain && (total < len)) {
702602adf40SYehuda Sadeh 		tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
703602adf40SYehuda Sadeh 		if (!tmp)
704602adf40SYehuda Sadeh 			goto err_out;
705602adf40SYehuda Sadeh 
706602adf40SYehuda Sadeh 		if (total + old_chain->bi_size > len) {
707602adf40SYehuda Sadeh 			struct bio_pair *bp;
708602adf40SYehuda Sadeh 
709602adf40SYehuda Sadeh 			/*
710602adf40SYehuda Sadeh 			 * this split can only happen with a single paged bio,
711602adf40SYehuda Sadeh 			 * split_bio will BUG_ON if this is not the case
712602adf40SYehuda Sadeh 			 */
713602adf40SYehuda Sadeh 			dout("bio_chain_clone split! total=%d remaining=%d"
714602adf40SYehuda Sadeh 			     "bi_size=%d\n",
715602adf40SYehuda Sadeh 			     (int)total, (int)len-total,
716602adf40SYehuda Sadeh 			     (int)old_chain->bi_size);
717602adf40SYehuda Sadeh 
718602adf40SYehuda Sadeh 			/* split the bio. We'll release it either in the next
719602adf40SYehuda Sadeh 			   call, or it will have to be released outside */
720602adf40SYehuda Sadeh 			bp = bio_split(old_chain, (len - total) / 512ULL);
721602adf40SYehuda Sadeh 			if (!bp)
722602adf40SYehuda Sadeh 				goto err_out;
723602adf40SYehuda Sadeh 
724602adf40SYehuda Sadeh 			__bio_clone(tmp, &bp->bio1);
725602adf40SYehuda Sadeh 
726602adf40SYehuda Sadeh 			*next = &bp->bio2;
727602adf40SYehuda Sadeh 		} else {
728602adf40SYehuda Sadeh 			__bio_clone(tmp, old_chain);
729602adf40SYehuda Sadeh 			*next = old_chain->bi_next;
730602adf40SYehuda Sadeh 		}
731602adf40SYehuda Sadeh 
732602adf40SYehuda Sadeh 		tmp->bi_bdev = NULL;
733602adf40SYehuda Sadeh 		gfpmask &= ~__GFP_WAIT;
734602adf40SYehuda Sadeh 		tmp->bi_next = NULL;
735602adf40SYehuda Sadeh 
736602adf40SYehuda Sadeh 		if (!new_chain) {
737602adf40SYehuda Sadeh 			new_chain = tail = tmp;
738602adf40SYehuda Sadeh 		} else {
739602adf40SYehuda Sadeh 			tail->bi_next = tmp;
740602adf40SYehuda Sadeh 			tail = tmp;
741602adf40SYehuda Sadeh 		}
742602adf40SYehuda Sadeh 		old_chain = old_chain->bi_next;
743602adf40SYehuda Sadeh 
744602adf40SYehuda Sadeh 		total += tmp->bi_size;
745602adf40SYehuda Sadeh 	}
746602adf40SYehuda Sadeh 
747602adf40SYehuda Sadeh 	BUG_ON(total < len);
748602adf40SYehuda Sadeh 
749602adf40SYehuda Sadeh 	if (tail)
750602adf40SYehuda Sadeh 		tail->bi_next = NULL;
751602adf40SYehuda Sadeh 
752602adf40SYehuda Sadeh 	*old = old_chain;
753602adf40SYehuda Sadeh 
754602adf40SYehuda Sadeh 	return new_chain;
755602adf40SYehuda Sadeh 
756602adf40SYehuda Sadeh err_out:
757602adf40SYehuda Sadeh 	dout("bio_chain_clone with err\n");
758602adf40SYehuda Sadeh 	bio_chain_put(new_chain);
759602adf40SYehuda Sadeh 	return NULL;
760602adf40SYehuda Sadeh }
761602adf40SYehuda Sadeh 
762602adf40SYehuda Sadeh /*
763602adf40SYehuda Sadeh  * helpers for osd request op vectors.
764602adf40SYehuda Sadeh  */
765602adf40SYehuda Sadeh static int rbd_create_rw_ops(struct ceph_osd_req_op **ops,
766602adf40SYehuda Sadeh 			    int num_ops,
767602adf40SYehuda Sadeh 			    int opcode,
768602adf40SYehuda Sadeh 			    u32 payload_len)
769602adf40SYehuda Sadeh {
770602adf40SYehuda Sadeh 	*ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1),
771602adf40SYehuda Sadeh 		       GFP_NOIO);
772602adf40SYehuda Sadeh 	if (!*ops)
773602adf40SYehuda Sadeh 		return -ENOMEM;
774602adf40SYehuda Sadeh 	(*ops)[0].op = opcode;
775602adf40SYehuda Sadeh 	/*
776602adf40SYehuda Sadeh 	 * op extent offset and length will be set later on
777602adf40SYehuda Sadeh 	 * in calc_raw_layout()
778602adf40SYehuda Sadeh 	 */
779602adf40SYehuda Sadeh 	(*ops)[0].payload_len = payload_len;
780602adf40SYehuda Sadeh 	return 0;
781602adf40SYehuda Sadeh }
782602adf40SYehuda Sadeh 
/*
 * Free an op vector; a NULL vector is accepted.
 */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
787602adf40SYehuda Sadeh 
7881fec7093SYehuda Sadeh static void rbd_coll_end_req_index(struct request *rq,
7891fec7093SYehuda Sadeh 				   struct rbd_req_coll *coll,
7901fec7093SYehuda Sadeh 				   int index,
7911fec7093SYehuda Sadeh 				   int ret, u64 len)
7921fec7093SYehuda Sadeh {
7931fec7093SYehuda Sadeh 	struct request_queue *q;
7941fec7093SYehuda Sadeh 	int min, max, i;
7951fec7093SYehuda Sadeh 
7961fec7093SYehuda Sadeh 	dout("rbd_coll_end_req_index %p index %d ret %d len %lld\n",
7971fec7093SYehuda Sadeh 	     coll, index, ret, len);
7981fec7093SYehuda Sadeh 
7991fec7093SYehuda Sadeh 	if (!rq)
8001fec7093SYehuda Sadeh 		return;
8011fec7093SYehuda Sadeh 
8021fec7093SYehuda Sadeh 	if (!coll) {
8031fec7093SYehuda Sadeh 		blk_end_request(rq, ret, len);
8041fec7093SYehuda Sadeh 		return;
8051fec7093SYehuda Sadeh 	}
8061fec7093SYehuda Sadeh 
8071fec7093SYehuda Sadeh 	q = rq->q;
8081fec7093SYehuda Sadeh 
8091fec7093SYehuda Sadeh 	spin_lock_irq(q->queue_lock);
8101fec7093SYehuda Sadeh 	coll->status[index].done = 1;
8111fec7093SYehuda Sadeh 	coll->status[index].rc = ret;
8121fec7093SYehuda Sadeh 	coll->status[index].bytes = len;
8131fec7093SYehuda Sadeh 	max = min = coll->num_done;
8141fec7093SYehuda Sadeh 	while (max < coll->total && coll->status[max].done)
8151fec7093SYehuda Sadeh 		max++;
8161fec7093SYehuda Sadeh 
8171fec7093SYehuda Sadeh 	for (i = min; i<max; i++) {
8181fec7093SYehuda Sadeh 		__blk_end_request(rq, coll->status[i].rc,
8191fec7093SYehuda Sadeh 				  coll->status[i].bytes);
8201fec7093SYehuda Sadeh 		coll->num_done++;
8211fec7093SYehuda Sadeh 		kref_put(&coll->kref, rbd_coll_release);
8221fec7093SYehuda Sadeh 	}
8231fec7093SYehuda Sadeh 	spin_unlock_irq(q->queue_lock);
8241fec7093SYehuda Sadeh }
8251fec7093SYehuda Sadeh 
8261fec7093SYehuda Sadeh static void rbd_coll_end_req(struct rbd_request *req,
8271fec7093SYehuda Sadeh 			     int ret, u64 len)
8281fec7093SYehuda Sadeh {
8291fec7093SYehuda Sadeh 	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
8301fec7093SYehuda Sadeh }
8311fec7093SYehuda Sadeh 
832602adf40SYehuda Sadeh /*
833602adf40SYehuda Sadeh  * Send ceph osd request
834602adf40SYehuda Sadeh  */
835602adf40SYehuda Sadeh static int rbd_do_request(struct request *rq,
836602adf40SYehuda Sadeh 			  struct rbd_device *dev,
837602adf40SYehuda Sadeh 			  struct ceph_snap_context *snapc,
838602adf40SYehuda Sadeh 			  u64 snapid,
839602adf40SYehuda Sadeh 			  const char *obj, u64 ofs, u64 len,
840602adf40SYehuda Sadeh 			  struct bio *bio,
841602adf40SYehuda Sadeh 			  struct page **pages,
842602adf40SYehuda Sadeh 			  int num_pages,
843602adf40SYehuda Sadeh 			  int flags,
844602adf40SYehuda Sadeh 			  struct ceph_osd_req_op *ops,
845602adf40SYehuda Sadeh 			  int num_reply,
8461fec7093SYehuda Sadeh 			  struct rbd_req_coll *coll,
8471fec7093SYehuda Sadeh 			  int coll_index,
848602adf40SYehuda Sadeh 			  void (*rbd_cb)(struct ceph_osd_request *req,
84959c2be1eSYehuda Sadeh 					 struct ceph_msg *msg),
85059c2be1eSYehuda Sadeh 			  struct ceph_osd_request **linger_req,
85159c2be1eSYehuda Sadeh 			  u64 *ver)
852602adf40SYehuda Sadeh {
853602adf40SYehuda Sadeh 	struct ceph_osd_request *req;
854602adf40SYehuda Sadeh 	struct ceph_file_layout *layout;
855602adf40SYehuda Sadeh 	int ret;
856602adf40SYehuda Sadeh 	u64 bno;
857602adf40SYehuda Sadeh 	struct timespec mtime = CURRENT_TIME;
858602adf40SYehuda Sadeh 	struct rbd_request *req_data;
859602adf40SYehuda Sadeh 	struct ceph_osd_request_head *reqhead;
860602adf40SYehuda Sadeh 	struct rbd_image_header *header = &dev->header;
861602adf40SYehuda Sadeh 
862602adf40SYehuda Sadeh 	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
8631fec7093SYehuda Sadeh 	if (!req_data) {
8641fec7093SYehuda Sadeh 		if (coll)
8651fec7093SYehuda Sadeh 			rbd_coll_end_req_index(rq, coll, coll_index,
8661fec7093SYehuda Sadeh 					       -ENOMEM, len);
8671fec7093SYehuda Sadeh 		return -ENOMEM;
8681fec7093SYehuda Sadeh 	}
869602adf40SYehuda Sadeh 
8701fec7093SYehuda Sadeh 	if (coll) {
8711fec7093SYehuda Sadeh 		req_data->coll = coll;
8721fec7093SYehuda Sadeh 		req_data->coll_index = coll_index;
8731fec7093SYehuda Sadeh 	}
8741fec7093SYehuda Sadeh 
8751fec7093SYehuda Sadeh 	dout("rbd_do_request obj=%s ofs=%lld len=%lld\n", obj, len, ofs);
876602adf40SYehuda Sadeh 
877602adf40SYehuda Sadeh 	down_read(&header->snap_rwsem);
878602adf40SYehuda Sadeh 
879602adf40SYehuda Sadeh 	req = ceph_osdc_alloc_request(&dev->client->osdc, flags,
880602adf40SYehuda Sadeh 				      snapc,
881602adf40SYehuda Sadeh 				      ops,
882602adf40SYehuda Sadeh 				      false,
883602adf40SYehuda Sadeh 				      GFP_NOIO, pages, bio);
8844ad12621SSage Weil 	if (!req) {
885602adf40SYehuda Sadeh 		up_read(&header->snap_rwsem);
8864ad12621SSage Weil 		ret = -ENOMEM;
887602adf40SYehuda Sadeh 		goto done_pages;
888602adf40SYehuda Sadeh 	}
889602adf40SYehuda Sadeh 
890602adf40SYehuda Sadeh 	req->r_callback = rbd_cb;
891602adf40SYehuda Sadeh 
892602adf40SYehuda Sadeh 	req_data->rq = rq;
893602adf40SYehuda Sadeh 	req_data->bio = bio;
894602adf40SYehuda Sadeh 	req_data->pages = pages;
895602adf40SYehuda Sadeh 	req_data->len = len;
896602adf40SYehuda Sadeh 
897602adf40SYehuda Sadeh 	req->r_priv = req_data;
898602adf40SYehuda Sadeh 
899602adf40SYehuda Sadeh 	reqhead = req->r_request->front.iov_base;
900602adf40SYehuda Sadeh 	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
901602adf40SYehuda Sadeh 
902602adf40SYehuda Sadeh 	strncpy(req->r_oid, obj, sizeof(req->r_oid));
903602adf40SYehuda Sadeh 	req->r_oid_len = strlen(req->r_oid);
904602adf40SYehuda Sadeh 
905602adf40SYehuda Sadeh 	layout = &req->r_file_layout;
906602adf40SYehuda Sadeh 	memset(layout, 0, sizeof(*layout));
907602adf40SYehuda Sadeh 	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
908602adf40SYehuda Sadeh 	layout->fl_stripe_count = cpu_to_le32(1);
909602adf40SYehuda Sadeh 	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
910602adf40SYehuda Sadeh 	layout->fl_pg_preferred = cpu_to_le32(-1);
911602adf40SYehuda Sadeh 	layout->fl_pg_pool = cpu_to_le32(dev->poolid);
912602adf40SYehuda Sadeh 	ceph_calc_raw_layout(&dev->client->osdc, layout, snapid,
913602adf40SYehuda Sadeh 			     ofs, &len, &bno, req, ops);
914602adf40SYehuda Sadeh 
915602adf40SYehuda Sadeh 	ceph_osdc_build_request(req, ofs, &len,
916602adf40SYehuda Sadeh 				ops,
917602adf40SYehuda Sadeh 				snapc,
918602adf40SYehuda Sadeh 				&mtime,
919602adf40SYehuda Sadeh 				req->r_oid, req->r_oid_len);
920602adf40SYehuda Sadeh 	up_read(&header->snap_rwsem);
921602adf40SYehuda Sadeh 
92259c2be1eSYehuda Sadeh 	if (linger_req) {
92359c2be1eSYehuda Sadeh 		ceph_osdc_set_request_linger(&dev->client->osdc, req);
92459c2be1eSYehuda Sadeh 		*linger_req = req;
92559c2be1eSYehuda Sadeh 	}
92659c2be1eSYehuda Sadeh 
927602adf40SYehuda Sadeh 	ret = ceph_osdc_start_request(&dev->client->osdc, req, false);
928602adf40SYehuda Sadeh 	if (ret < 0)
929602adf40SYehuda Sadeh 		goto done_err;
930602adf40SYehuda Sadeh 
931602adf40SYehuda Sadeh 	if (!rbd_cb) {
932602adf40SYehuda Sadeh 		ret = ceph_osdc_wait_request(&dev->client->osdc, req);
93359c2be1eSYehuda Sadeh 		if (ver)
93459c2be1eSYehuda Sadeh 			*ver = le64_to_cpu(req->r_reassert_version.version);
9351fec7093SYehuda Sadeh 		dout("reassert_ver=%lld\n",
9361fec7093SYehuda Sadeh 		     le64_to_cpu(req->r_reassert_version.version));
937602adf40SYehuda Sadeh 		ceph_osdc_put_request(req);
938602adf40SYehuda Sadeh 	}
939602adf40SYehuda Sadeh 	return ret;
940602adf40SYehuda Sadeh 
941602adf40SYehuda Sadeh done_err:
942602adf40SYehuda Sadeh 	bio_chain_put(req_data->bio);
943602adf40SYehuda Sadeh 	ceph_osdc_put_request(req);
944602adf40SYehuda Sadeh done_pages:
9451fec7093SYehuda Sadeh 	rbd_coll_end_req(req_data, ret, len);
946602adf40SYehuda Sadeh 	kfree(req_data);
947602adf40SYehuda Sadeh 	return ret;
948602adf40SYehuda Sadeh }
949602adf40SYehuda Sadeh 
950602adf40SYehuda Sadeh /*
951602adf40SYehuda Sadeh  * Ceph osd op callback
952602adf40SYehuda Sadeh  */
953602adf40SYehuda Sadeh static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
954602adf40SYehuda Sadeh {
955602adf40SYehuda Sadeh 	struct rbd_request *req_data = req->r_priv;
956602adf40SYehuda Sadeh 	struct ceph_osd_reply_head *replyhead;
957602adf40SYehuda Sadeh 	struct ceph_osd_op *op;
958602adf40SYehuda Sadeh 	__s32 rc;
959602adf40SYehuda Sadeh 	u64 bytes;
960602adf40SYehuda Sadeh 	int read_op;
961602adf40SYehuda Sadeh 
962602adf40SYehuda Sadeh 	/* parse reply */
963602adf40SYehuda Sadeh 	replyhead = msg->front.iov_base;
964602adf40SYehuda Sadeh 	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
965602adf40SYehuda Sadeh 	op = (void *)(replyhead + 1);
966602adf40SYehuda Sadeh 	rc = le32_to_cpu(replyhead->result);
967602adf40SYehuda Sadeh 	bytes = le64_to_cpu(op->extent.length);
968602adf40SYehuda Sadeh 	read_op = (le32_to_cpu(op->op) == CEPH_OSD_OP_READ);
969602adf40SYehuda Sadeh 
970602adf40SYehuda Sadeh 	dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc);
971602adf40SYehuda Sadeh 
972602adf40SYehuda Sadeh 	if (rc == -ENOENT && read_op) {
973602adf40SYehuda Sadeh 		zero_bio_chain(req_data->bio, 0);
974602adf40SYehuda Sadeh 		rc = 0;
975602adf40SYehuda Sadeh 	} else if (rc == 0 && read_op && bytes < req_data->len) {
976602adf40SYehuda Sadeh 		zero_bio_chain(req_data->bio, bytes);
977602adf40SYehuda Sadeh 		bytes = req_data->len;
978602adf40SYehuda Sadeh 	}
979602adf40SYehuda Sadeh 
9801fec7093SYehuda Sadeh 	rbd_coll_end_req(req_data, rc, bytes);
981602adf40SYehuda Sadeh 
982602adf40SYehuda Sadeh 	if (req_data->bio)
983602adf40SYehuda Sadeh 		bio_chain_put(req_data->bio);
984602adf40SYehuda Sadeh 
985602adf40SYehuda Sadeh 	ceph_osdc_put_request(req);
986602adf40SYehuda Sadeh 	kfree(req_data);
987602adf40SYehuda Sadeh }
988602adf40SYehuda Sadeh 
/*
 * Minimal osd op callback: ignores the reply and only drops the
 * reference on the request.
 */
static void rbd_simple_req_cb(struct ceph_osd_request *req,
			      struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
99359c2be1eSYehuda Sadeh 
994602adf40SYehuda Sadeh /*
995602adf40SYehuda Sadeh  * Do a synchronous ceph osd operation
996602adf40SYehuda Sadeh  */
997602adf40SYehuda Sadeh static int rbd_req_sync_op(struct rbd_device *dev,
998602adf40SYehuda Sadeh 			   struct ceph_snap_context *snapc,
999602adf40SYehuda Sadeh 			   u64 snapid,
1000602adf40SYehuda Sadeh 			   int opcode,
1001602adf40SYehuda Sadeh 			   int flags,
1002602adf40SYehuda Sadeh 			   struct ceph_osd_req_op *orig_ops,
1003602adf40SYehuda Sadeh 			   int num_reply,
1004602adf40SYehuda Sadeh 			   const char *obj,
1005602adf40SYehuda Sadeh 			   u64 ofs, u64 len,
100659c2be1eSYehuda Sadeh 			   char *buf,
100759c2be1eSYehuda Sadeh 			   struct ceph_osd_request **linger_req,
100859c2be1eSYehuda Sadeh 			   u64 *ver)
1009602adf40SYehuda Sadeh {
1010602adf40SYehuda Sadeh 	int ret;
1011602adf40SYehuda Sadeh 	struct page **pages;
1012602adf40SYehuda Sadeh 	int num_pages;
1013602adf40SYehuda Sadeh 	struct ceph_osd_req_op *ops = orig_ops;
1014602adf40SYehuda Sadeh 	u32 payload_len;
1015602adf40SYehuda Sadeh 
1016602adf40SYehuda Sadeh 	num_pages = calc_pages_for(ofs , len);
1017602adf40SYehuda Sadeh 	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
1018b8d0638aSDan Carpenter 	if (IS_ERR(pages))
1019b8d0638aSDan Carpenter 		return PTR_ERR(pages);
1020602adf40SYehuda Sadeh 
1021602adf40SYehuda Sadeh 	if (!orig_ops) {
1022602adf40SYehuda Sadeh 		payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0);
1023602adf40SYehuda Sadeh 		ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
1024602adf40SYehuda Sadeh 		if (ret < 0)
1025602adf40SYehuda Sadeh 			goto done;
1026602adf40SYehuda Sadeh 
1027602adf40SYehuda Sadeh 		if ((flags & CEPH_OSD_FLAG_WRITE) && buf) {
1028602adf40SYehuda Sadeh 			ret = ceph_copy_to_page_vector(pages, buf, ofs, len);
1029602adf40SYehuda Sadeh 			if (ret < 0)
1030602adf40SYehuda Sadeh 				goto done_ops;
1031602adf40SYehuda Sadeh 		}
1032602adf40SYehuda Sadeh 	}
1033602adf40SYehuda Sadeh 
1034602adf40SYehuda Sadeh 	ret = rbd_do_request(NULL, dev, snapc, snapid,
1035602adf40SYehuda Sadeh 			  obj, ofs, len, NULL,
1036602adf40SYehuda Sadeh 			  pages, num_pages,
1037602adf40SYehuda Sadeh 			  flags,
1038602adf40SYehuda Sadeh 			  ops,
1039602adf40SYehuda Sadeh 			  2,
10401fec7093SYehuda Sadeh 			  NULL, 0,
104159c2be1eSYehuda Sadeh 			  NULL,
104259c2be1eSYehuda Sadeh 			  linger_req, ver);
1043602adf40SYehuda Sadeh 	if (ret < 0)
1044602adf40SYehuda Sadeh 		goto done_ops;
1045602adf40SYehuda Sadeh 
1046602adf40SYehuda Sadeh 	if ((flags & CEPH_OSD_FLAG_READ) && buf)
1047602adf40SYehuda Sadeh 		ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);
1048602adf40SYehuda Sadeh 
1049602adf40SYehuda Sadeh done_ops:
1050602adf40SYehuda Sadeh 	if (!orig_ops)
1051602adf40SYehuda Sadeh 		rbd_destroy_ops(ops);
1052602adf40SYehuda Sadeh done:
1053602adf40SYehuda Sadeh 	ceph_release_page_vector(pages, num_pages);
1054602adf40SYehuda Sadeh 	return ret;
1055602adf40SYehuda Sadeh }
1056602adf40SYehuda Sadeh 
1057602adf40SYehuda Sadeh /*
1058602adf40SYehuda Sadeh  * Do an asynchronous ceph osd operation
1059602adf40SYehuda Sadeh  */
1060602adf40SYehuda Sadeh static int rbd_do_op(struct request *rq,
1061602adf40SYehuda Sadeh 		     struct rbd_device *rbd_dev ,
1062602adf40SYehuda Sadeh 		     struct ceph_snap_context *snapc,
1063602adf40SYehuda Sadeh 		     u64 snapid,
1064602adf40SYehuda Sadeh 		     int opcode, int flags, int num_reply,
1065602adf40SYehuda Sadeh 		     u64 ofs, u64 len,
10661fec7093SYehuda Sadeh 		     struct bio *bio,
10671fec7093SYehuda Sadeh 		     struct rbd_req_coll *coll,
10681fec7093SYehuda Sadeh 		     int coll_index)
1069602adf40SYehuda Sadeh {
1070602adf40SYehuda Sadeh 	char *seg_name;
1071602adf40SYehuda Sadeh 	u64 seg_ofs;
1072602adf40SYehuda Sadeh 	u64 seg_len;
1073602adf40SYehuda Sadeh 	int ret;
1074602adf40SYehuda Sadeh 	struct ceph_osd_req_op *ops;
1075602adf40SYehuda Sadeh 	u32 payload_len;
1076602adf40SYehuda Sadeh 
1077602adf40SYehuda Sadeh 	seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
1078602adf40SYehuda Sadeh 	if (!seg_name)
1079602adf40SYehuda Sadeh 		return -ENOMEM;
1080602adf40SYehuda Sadeh 
1081602adf40SYehuda Sadeh 	seg_len = rbd_get_segment(&rbd_dev->header,
1082602adf40SYehuda Sadeh 				  rbd_dev->header.block_name,
1083602adf40SYehuda Sadeh 				  ofs, len,
1084602adf40SYehuda Sadeh 				  seg_name, &seg_ofs);
1085602adf40SYehuda Sadeh 
1086602adf40SYehuda Sadeh 	payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
1087602adf40SYehuda Sadeh 
1088602adf40SYehuda Sadeh 	ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
1089602adf40SYehuda Sadeh 	if (ret < 0)
1090602adf40SYehuda Sadeh 		goto done;
1091602adf40SYehuda Sadeh 
1092602adf40SYehuda Sadeh 	/* we've taken care of segment sizes earlier when we
1093602adf40SYehuda Sadeh 	   cloned the bios. We should never have a segment
1094602adf40SYehuda Sadeh 	   truncated at this point */
1095602adf40SYehuda Sadeh 	BUG_ON(seg_len < len);
1096602adf40SYehuda Sadeh 
1097602adf40SYehuda Sadeh 	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1098602adf40SYehuda Sadeh 			     seg_name, seg_ofs, seg_len,
1099602adf40SYehuda Sadeh 			     bio,
1100602adf40SYehuda Sadeh 			     NULL, 0,
1101602adf40SYehuda Sadeh 			     flags,
1102602adf40SYehuda Sadeh 			     ops,
1103602adf40SYehuda Sadeh 			     num_reply,
11041fec7093SYehuda Sadeh 			     coll, coll_index,
110559c2be1eSYehuda Sadeh 			     rbd_req_cb, 0, NULL);
110611f77002SSage Weil 
110711f77002SSage Weil 	rbd_destroy_ops(ops);
1108602adf40SYehuda Sadeh done:
1109602adf40SYehuda Sadeh 	kfree(seg_name);
1110602adf40SYehuda Sadeh 	return ret;
1111602adf40SYehuda Sadeh }
1112602adf40SYehuda Sadeh 
1113602adf40SYehuda Sadeh /*
1114602adf40SYehuda Sadeh  * Request async osd write
1115602adf40SYehuda Sadeh  */
1116602adf40SYehuda Sadeh static int rbd_req_write(struct request *rq,
1117602adf40SYehuda Sadeh 			 struct rbd_device *rbd_dev,
1118602adf40SYehuda Sadeh 			 struct ceph_snap_context *snapc,
1119602adf40SYehuda Sadeh 			 u64 ofs, u64 len,
11201fec7093SYehuda Sadeh 			 struct bio *bio,
11211fec7093SYehuda Sadeh 			 struct rbd_req_coll *coll,
11221fec7093SYehuda Sadeh 			 int coll_index)
1123602adf40SYehuda Sadeh {
1124602adf40SYehuda Sadeh 	return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1125602adf40SYehuda Sadeh 			 CEPH_OSD_OP_WRITE,
1126602adf40SYehuda Sadeh 			 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1127602adf40SYehuda Sadeh 			 2,
11281fec7093SYehuda Sadeh 			 ofs, len, bio, coll, coll_index);
1129602adf40SYehuda Sadeh }
1130602adf40SYehuda Sadeh 
1131602adf40SYehuda Sadeh /*
1132602adf40SYehuda Sadeh  * Request async osd read
1133602adf40SYehuda Sadeh  */
1134602adf40SYehuda Sadeh static int rbd_req_read(struct request *rq,
1135602adf40SYehuda Sadeh 			 struct rbd_device *rbd_dev,
1136602adf40SYehuda Sadeh 			 u64 snapid,
1137602adf40SYehuda Sadeh 			 u64 ofs, u64 len,
11381fec7093SYehuda Sadeh 			 struct bio *bio,
11391fec7093SYehuda Sadeh 			 struct rbd_req_coll *coll,
11401fec7093SYehuda Sadeh 			 int coll_index)
1141602adf40SYehuda Sadeh {
1142602adf40SYehuda Sadeh 	return rbd_do_op(rq, rbd_dev, NULL,
1143602adf40SYehuda Sadeh 			 (snapid ? snapid : CEPH_NOSNAP),
1144602adf40SYehuda Sadeh 			 CEPH_OSD_OP_READ,
1145602adf40SYehuda Sadeh 			 CEPH_OSD_FLAG_READ,
1146602adf40SYehuda Sadeh 			 2,
11471fec7093SYehuda Sadeh 			 ofs, len, bio, coll, coll_index);
1148602adf40SYehuda Sadeh }
1149602adf40SYehuda Sadeh 
1150602adf40SYehuda Sadeh /*
1151602adf40SYehuda Sadeh  * Request sync osd read
1152602adf40SYehuda Sadeh  */
1153602adf40SYehuda Sadeh static int rbd_req_sync_read(struct rbd_device *dev,
1154602adf40SYehuda Sadeh 			  struct ceph_snap_context *snapc,
1155602adf40SYehuda Sadeh 			  u64 snapid,
1156602adf40SYehuda Sadeh 			  const char *obj,
1157602adf40SYehuda Sadeh 			  u64 ofs, u64 len,
115859c2be1eSYehuda Sadeh 			  char *buf,
115959c2be1eSYehuda Sadeh 			  u64 *ver)
1160602adf40SYehuda Sadeh {
1161602adf40SYehuda Sadeh 	return rbd_req_sync_op(dev, NULL,
1162602adf40SYehuda Sadeh 			       (snapid ? snapid : CEPH_NOSNAP),
1163602adf40SYehuda Sadeh 			       CEPH_OSD_OP_READ,
1164602adf40SYehuda Sadeh 			       CEPH_OSD_FLAG_READ,
1165602adf40SYehuda Sadeh 			       NULL,
116659c2be1eSYehuda Sadeh 			       1, obj, ofs, len, buf, NULL, ver);
1167602adf40SYehuda Sadeh }
1168602adf40SYehuda Sadeh 
1169602adf40SYehuda Sadeh /*
117059c2be1eSYehuda Sadeh  * Request sync osd watch
117159c2be1eSYehuda Sadeh  */
117259c2be1eSYehuda Sadeh static int rbd_req_sync_notify_ack(struct rbd_device *dev,
117359c2be1eSYehuda Sadeh 				   u64 ver,
117459c2be1eSYehuda Sadeh 				   u64 notify_id,
117559c2be1eSYehuda Sadeh 				   const char *obj)
117659c2be1eSYehuda Sadeh {
117759c2be1eSYehuda Sadeh 	struct ceph_osd_req_op *ops;
117859c2be1eSYehuda Sadeh 	struct page **pages = NULL;
117911f77002SSage Weil 	int ret;
118011f77002SSage Weil 
118111f77002SSage Weil 	ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY_ACK, 0);
118259c2be1eSYehuda Sadeh 	if (ret < 0)
118359c2be1eSYehuda Sadeh 		return ret;
118459c2be1eSYehuda Sadeh 
118559c2be1eSYehuda Sadeh 	ops[0].watch.ver = cpu_to_le64(dev->header.obj_version);
118659c2be1eSYehuda Sadeh 	ops[0].watch.cookie = notify_id;
118759c2be1eSYehuda Sadeh 	ops[0].watch.flag = 0;
118859c2be1eSYehuda Sadeh 
118959c2be1eSYehuda Sadeh 	ret = rbd_do_request(NULL, dev, NULL, CEPH_NOSNAP,
119059c2be1eSYehuda Sadeh 			  obj, 0, 0, NULL,
119159c2be1eSYehuda Sadeh 			  pages, 0,
119259c2be1eSYehuda Sadeh 			  CEPH_OSD_FLAG_READ,
119359c2be1eSYehuda Sadeh 			  ops,
119459c2be1eSYehuda Sadeh 			  1,
11951fec7093SYehuda Sadeh 			  NULL, 0,
119659c2be1eSYehuda Sadeh 			  rbd_simple_req_cb, 0, NULL);
119759c2be1eSYehuda Sadeh 
119859c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
119959c2be1eSYehuda Sadeh 	return ret;
120059c2be1eSYehuda Sadeh }
120159c2be1eSYehuda Sadeh 
120259c2be1eSYehuda Sadeh static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
120359c2be1eSYehuda Sadeh {
120459c2be1eSYehuda Sadeh 	struct rbd_device *dev = (struct rbd_device *)data;
120513143d2dSSage Weil 	int rc;
120613143d2dSSage Weil 
120759c2be1eSYehuda Sadeh 	if (!dev)
120859c2be1eSYehuda Sadeh 		return;
120959c2be1eSYehuda Sadeh 
121059c2be1eSYehuda Sadeh 	dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name,
121159c2be1eSYehuda Sadeh 		notify_id, (int)opcode);
121259c2be1eSYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
121313143d2dSSage Weil 	rc = __rbd_update_snaps(dev);
121459c2be1eSYehuda Sadeh 	mutex_unlock(&ctl_mutex);
121513143d2dSSage Weil 	if (rc)
121613143d2dSSage Weil 		pr_warning(DRV_NAME "%d got notification but failed to update"
121713143d2dSSage Weil 			   " snaps: %d\n", dev->major, rc);
121859c2be1eSYehuda Sadeh 
121959c2be1eSYehuda Sadeh 	rbd_req_sync_notify_ack(dev, ver, notify_id, dev->obj_md_name);
122059c2be1eSYehuda Sadeh }
122159c2be1eSYehuda Sadeh 
122259c2be1eSYehuda Sadeh /*
122359c2be1eSYehuda Sadeh  * Request sync osd watch
122459c2be1eSYehuda Sadeh  */
122559c2be1eSYehuda Sadeh static int rbd_req_sync_watch(struct rbd_device *dev,
122659c2be1eSYehuda Sadeh 			      const char *obj,
122759c2be1eSYehuda Sadeh 			      u64 ver)
122859c2be1eSYehuda Sadeh {
122959c2be1eSYehuda Sadeh 	struct ceph_osd_req_op *ops;
123059c2be1eSYehuda Sadeh 	struct ceph_osd_client *osdc = &dev->client->osdc;
123159c2be1eSYehuda Sadeh 
123259c2be1eSYehuda Sadeh 	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
123359c2be1eSYehuda Sadeh 	if (ret < 0)
123459c2be1eSYehuda Sadeh 		return ret;
123559c2be1eSYehuda Sadeh 
123659c2be1eSYehuda Sadeh 	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
123759c2be1eSYehuda Sadeh 				     (void *)dev, &dev->watch_event);
123859c2be1eSYehuda Sadeh 	if (ret < 0)
123959c2be1eSYehuda Sadeh 		goto fail;
124059c2be1eSYehuda Sadeh 
124159c2be1eSYehuda Sadeh 	ops[0].watch.ver = cpu_to_le64(ver);
124259c2be1eSYehuda Sadeh 	ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie);
124359c2be1eSYehuda Sadeh 	ops[0].watch.flag = 1;
124459c2be1eSYehuda Sadeh 
124559c2be1eSYehuda Sadeh 	ret = rbd_req_sync_op(dev, NULL,
124659c2be1eSYehuda Sadeh 			      CEPH_NOSNAP,
124759c2be1eSYehuda Sadeh 			      0,
124859c2be1eSYehuda Sadeh 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
124959c2be1eSYehuda Sadeh 			      ops,
125059c2be1eSYehuda Sadeh 			      1, obj, 0, 0, NULL,
125159c2be1eSYehuda Sadeh 			      &dev->watch_request, NULL);
125259c2be1eSYehuda Sadeh 
125359c2be1eSYehuda Sadeh 	if (ret < 0)
125459c2be1eSYehuda Sadeh 		goto fail_event;
125559c2be1eSYehuda Sadeh 
125659c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
125759c2be1eSYehuda Sadeh 	return 0;
125859c2be1eSYehuda Sadeh 
125959c2be1eSYehuda Sadeh fail_event:
126059c2be1eSYehuda Sadeh 	ceph_osdc_cancel_event(dev->watch_event);
126159c2be1eSYehuda Sadeh 	dev->watch_event = NULL;
126259c2be1eSYehuda Sadeh fail:
126359c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
126459c2be1eSYehuda Sadeh 	return ret;
126559c2be1eSYehuda Sadeh }
126659c2be1eSYehuda Sadeh 
126779e3057cSYehuda Sadeh /*
126879e3057cSYehuda Sadeh  * Request sync osd unwatch
126979e3057cSYehuda Sadeh  */
127079e3057cSYehuda Sadeh static int rbd_req_sync_unwatch(struct rbd_device *dev,
127179e3057cSYehuda Sadeh 				const char *obj)
127279e3057cSYehuda Sadeh {
127379e3057cSYehuda Sadeh 	struct ceph_osd_req_op *ops;
127479e3057cSYehuda Sadeh 
127579e3057cSYehuda Sadeh 	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
127679e3057cSYehuda Sadeh 	if (ret < 0)
127779e3057cSYehuda Sadeh 		return ret;
127879e3057cSYehuda Sadeh 
127979e3057cSYehuda Sadeh 	ops[0].watch.ver = 0;
128079e3057cSYehuda Sadeh 	ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie);
128179e3057cSYehuda Sadeh 	ops[0].watch.flag = 0;
128279e3057cSYehuda Sadeh 
128379e3057cSYehuda Sadeh 	ret = rbd_req_sync_op(dev, NULL,
128479e3057cSYehuda Sadeh 			      CEPH_NOSNAP,
128579e3057cSYehuda Sadeh 			      0,
128679e3057cSYehuda Sadeh 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
128779e3057cSYehuda Sadeh 			      ops,
128879e3057cSYehuda Sadeh 			      1, obj, 0, 0, NULL, NULL, NULL);
128979e3057cSYehuda Sadeh 
129079e3057cSYehuda Sadeh 	rbd_destroy_ops(ops);
129179e3057cSYehuda Sadeh 	ceph_osdc_cancel_event(dev->watch_event);
129279e3057cSYehuda Sadeh 	dev->watch_event = NULL;
129379e3057cSYehuda Sadeh 	return ret;
129479e3057cSYehuda Sadeh }
129579e3057cSYehuda Sadeh 
/*
 * Context registered as the event data for a notify request; see
 * rbd_req_sync_notify(), which fills it in and passes its address to
 * ceph_osdc_create_event().
 */
struct rbd_notify_info {
	struct rbd_device *dev;		/* device the notify was sent for */
};
129959c2be1eSYehuda Sadeh 
130059c2be1eSYehuda Sadeh static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
130159c2be1eSYehuda Sadeh {
130259c2be1eSYehuda Sadeh 	struct rbd_device *dev = (struct rbd_device *)data;
130359c2be1eSYehuda Sadeh 	if (!dev)
130459c2be1eSYehuda Sadeh 		return;
130559c2be1eSYehuda Sadeh 
130659c2be1eSYehuda Sadeh 	dout("rbd_notify_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name,
130759c2be1eSYehuda Sadeh 		notify_id, (int)opcode);
130859c2be1eSYehuda Sadeh }
130959c2be1eSYehuda Sadeh 
131059c2be1eSYehuda Sadeh /*
131159c2be1eSYehuda Sadeh  * Request sync osd notify
131259c2be1eSYehuda Sadeh  */
131359c2be1eSYehuda Sadeh static int rbd_req_sync_notify(struct rbd_device *dev,
131459c2be1eSYehuda Sadeh 		          const char *obj)
131559c2be1eSYehuda Sadeh {
131659c2be1eSYehuda Sadeh 	struct ceph_osd_req_op *ops;
131759c2be1eSYehuda Sadeh 	struct ceph_osd_client *osdc = &dev->client->osdc;
131859c2be1eSYehuda Sadeh 	struct ceph_osd_event *event;
131959c2be1eSYehuda Sadeh 	struct rbd_notify_info info;
132059c2be1eSYehuda Sadeh 	int payload_len = sizeof(u32) + sizeof(u32);
132159c2be1eSYehuda Sadeh 	int ret;
132259c2be1eSYehuda Sadeh 
132359c2be1eSYehuda Sadeh 	ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY, payload_len);
132459c2be1eSYehuda Sadeh 	if (ret < 0)
132559c2be1eSYehuda Sadeh 		return ret;
132659c2be1eSYehuda Sadeh 
132759c2be1eSYehuda Sadeh 	info.dev = dev;
132859c2be1eSYehuda Sadeh 
132959c2be1eSYehuda Sadeh 	ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
133059c2be1eSYehuda Sadeh 				     (void *)&info, &event);
133159c2be1eSYehuda Sadeh 	if (ret < 0)
133259c2be1eSYehuda Sadeh 		goto fail;
133359c2be1eSYehuda Sadeh 
133459c2be1eSYehuda Sadeh 	ops[0].watch.ver = 1;
133559c2be1eSYehuda Sadeh 	ops[0].watch.flag = 1;
133659c2be1eSYehuda Sadeh 	ops[0].watch.cookie = event->cookie;
133759c2be1eSYehuda Sadeh 	ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
133859c2be1eSYehuda Sadeh 	ops[0].watch.timeout = 12;
133959c2be1eSYehuda Sadeh 
134059c2be1eSYehuda Sadeh 	ret = rbd_req_sync_op(dev, NULL,
134159c2be1eSYehuda Sadeh 			       CEPH_NOSNAP,
134259c2be1eSYehuda Sadeh 			       0,
134359c2be1eSYehuda Sadeh 			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
134459c2be1eSYehuda Sadeh 			       ops,
134559c2be1eSYehuda Sadeh 			       1, obj, 0, 0, NULL, NULL, NULL);
134659c2be1eSYehuda Sadeh 	if (ret < 0)
134759c2be1eSYehuda Sadeh 		goto fail_event;
134859c2be1eSYehuda Sadeh 
134959c2be1eSYehuda Sadeh 	ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
135059c2be1eSYehuda Sadeh 	dout("ceph_osdc_wait_event returned %d\n", ret);
135159c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
135259c2be1eSYehuda Sadeh 	return 0;
135359c2be1eSYehuda Sadeh 
135459c2be1eSYehuda Sadeh fail_event:
135559c2be1eSYehuda Sadeh 	ceph_osdc_cancel_event(event);
135659c2be1eSYehuda Sadeh fail:
135759c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
135859c2be1eSYehuda Sadeh 	return ret;
135959c2be1eSYehuda Sadeh }
136059c2be1eSYehuda Sadeh 
136159c2be1eSYehuda Sadeh /*
1362602adf40SYehuda Sadeh  * Request sync osd read
1363602adf40SYehuda Sadeh  */
1364602adf40SYehuda Sadeh static int rbd_req_sync_exec(struct rbd_device *dev,
1365602adf40SYehuda Sadeh 			     const char *obj,
1366602adf40SYehuda Sadeh 			     const char *cls,
1367602adf40SYehuda Sadeh 			     const char *method,
1368602adf40SYehuda Sadeh 			     const char *data,
136959c2be1eSYehuda Sadeh 			     int len,
137059c2be1eSYehuda Sadeh 			     u64 *ver)
1371602adf40SYehuda Sadeh {
1372602adf40SYehuda Sadeh 	struct ceph_osd_req_op *ops;
1373602adf40SYehuda Sadeh 	int cls_len = strlen(cls);
1374602adf40SYehuda Sadeh 	int method_len = strlen(method);
1375602adf40SYehuda Sadeh 	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL,
1376602adf40SYehuda Sadeh 				    cls_len + method_len + len);
1377602adf40SYehuda Sadeh 	if (ret < 0)
1378602adf40SYehuda Sadeh 		return ret;
1379602adf40SYehuda Sadeh 
1380602adf40SYehuda Sadeh 	ops[0].cls.class_name = cls;
1381602adf40SYehuda Sadeh 	ops[0].cls.class_len = (__u8)cls_len;
1382602adf40SYehuda Sadeh 	ops[0].cls.method_name = method;
1383602adf40SYehuda Sadeh 	ops[0].cls.method_len = (__u8)method_len;
1384602adf40SYehuda Sadeh 	ops[0].cls.argc = 0;
1385602adf40SYehuda Sadeh 	ops[0].cls.indata = data;
1386602adf40SYehuda Sadeh 	ops[0].cls.indata_len = len;
1387602adf40SYehuda Sadeh 
1388602adf40SYehuda Sadeh 	ret = rbd_req_sync_op(dev, NULL,
1389602adf40SYehuda Sadeh 			       CEPH_NOSNAP,
1390602adf40SYehuda Sadeh 			       0,
1391602adf40SYehuda Sadeh 			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1392602adf40SYehuda Sadeh 			       ops,
139359c2be1eSYehuda Sadeh 			       1, obj, 0, 0, NULL, NULL, ver);
1394602adf40SYehuda Sadeh 
1395602adf40SYehuda Sadeh 	rbd_destroy_ops(ops);
1396602adf40SYehuda Sadeh 
1397602adf40SYehuda Sadeh 	dout("cls_exec returned %d\n", ret);
1398602adf40SYehuda Sadeh 	return ret;
1399602adf40SYehuda Sadeh }
1400602adf40SYehuda Sadeh 
14011fec7093SYehuda Sadeh static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
14021fec7093SYehuda Sadeh {
14031fec7093SYehuda Sadeh 	struct rbd_req_coll *coll =
14041fec7093SYehuda Sadeh 			kzalloc(sizeof(struct rbd_req_coll) +
14051fec7093SYehuda Sadeh 			        sizeof(struct rbd_req_status) * num_reqs,
14061fec7093SYehuda Sadeh 				GFP_ATOMIC);
14071fec7093SYehuda Sadeh 
14081fec7093SYehuda Sadeh 	if (!coll)
14091fec7093SYehuda Sadeh 		return NULL;
14101fec7093SYehuda Sadeh 	coll->total = num_reqs;
14111fec7093SYehuda Sadeh 	kref_init(&coll->kref);
14121fec7093SYehuda Sadeh 	return coll;
14131fec7093SYehuda Sadeh }
14141fec7093SYehuda Sadeh 
/*
 * block device queue callback
 *
 * Drains the request queue: each fetched request is split into per-object
 * segments and every segment is submitted as a separate OSD read or write.
 *
 * The body releases q->queue_lock with spin_unlock_irq() before doing the
 * per-segment work and re-takes it before fetching the next request, so
 * this is presumably entered with q->queue_lock held (confirm against the
 * block layer contract for request_fn callbacks).
 */
static void rbd_rq_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	struct request *rq;
	/*
	 * NOTE(review): bp lives across loop iterations and is not reset to
	 * NULL after bio_pair_release() below; whether that is safe depends
	 * on how bio_chain_clone() treats a non-NULL *bp -- confirm.
	 */
	struct bio_pair *bp = NULL;

	rq = blk_fetch_request(q);

	while (1) {
		struct bio *bio;
		struct bio *rq_bio, *next_bio = NULL;
		bool do_write;
		int size, op_size = 0;
		u64 ofs;
		int num_segs, cur_seg = 0;
		struct rbd_req_coll *coll;

		/* peek at request from block layer */
		if (!rq)
			break;

		dout("fetched request\n");

		/* filter out block requests we don't understand */
		if ((rq->cmd_type != REQ_TYPE_FS)) {
			/* completed with status 0, i.e. not reported as an error */
			__blk_end_request_all(rq, 0);
			goto next;
		}

		/* deduce our operation (read, write) */
		do_write = (rq_data_dir(rq) == WRITE);

		size = blk_rq_bytes(rq);
		/* byte offset: request position is in 512-byte sectors */
		ofs = blk_rq_pos(rq) * 512ULL;
		rq_bio = rq->bio;
		/* refuse writes to a read-only mapping */
		if (do_write && rbd_dev->read_only) {
			__blk_end_request_all(rq, -EROFS);
			goto next;
		}

		/* the queue lock is dropped while segments are submitted */
		spin_unlock_irq(q->queue_lock);

		dout("%s 0x%x bytes at 0x%llx\n",
		     do_write ? "write" : "read",
		     size, blk_rq_pos(rq) * 512ULL);

		/* one status slot per segment the request covers */
		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
		coll = rbd_alloc_coll(num_segs);
		if (!coll) {
			/* re-take the lock before completing the request */
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENOMEM);
			goto next;
		}

		do {
			/* a bio clone to be passed down to OSD req */
			dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt);
			/* length of the next segment starting at ofs */
			op_size = rbd_get_segment(&rbd_dev->header,
						  rbd_dev->header.block_name,
						  ofs, size,
						  NULL, NULL);
			/* each segment holds its own reference on the collection */
			kref_get(&coll->kref);
			bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
					      op_size, GFP_ATOMIC);
			if (!bio) {
				/* record -ENOMEM for this segment and move on */
				rbd_coll_end_req_index(rq, coll, cur_seg,
						       -ENOMEM, op_size);
				goto next_seg;
			}


			/* init OSD command: write or read */
			if (do_write)
				rbd_req_write(rq, rbd_dev,
					      rbd_dev->header.snapc,
					      ofs,
					      op_size, bio,
					      coll, cur_seg);
			else
				rbd_req_read(rq, rbd_dev,
					     cur_snap_id(rbd_dev),
					     ofs,
					     op_size, bio,
					     coll, cur_seg);

next_seg:
			/* advance past the segment just handled */
			size -= op_size;
			ofs += op_size;

			cur_seg++;
			rq_bio = next_bio;
		} while (size > 0);
		/* drop the reference taken by kref_init() in rbd_alloc_coll() */
		kref_put(&coll->kref, rbd_coll_release);

		if (bp)
			bio_pair_release(bp);
		spin_lock_irq(q->queue_lock);
next:
		rq = blk_fetch_request(q);
	}
}
1519602adf40SYehuda Sadeh 
1520602adf40SYehuda Sadeh /*
1521602adf40SYehuda Sadeh  * a queue callback. Makes sure that we don't create a bio that spans across
1522602adf40SYehuda Sadeh  * multiple osd objects. One exception would be with a single page bios,
1523602adf40SYehuda Sadeh  * which we handle later at bio_chain_clone
1524602adf40SYehuda Sadeh  */
1525602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1526602adf40SYehuda Sadeh 			  struct bio_vec *bvec)
1527602adf40SYehuda Sadeh {
1528602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
1529602adf40SYehuda Sadeh 	unsigned int chunk_sectors = 1 << (rbd_dev->header.obj_order - 9);
1530602adf40SYehuda Sadeh 	sector_t sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
1531602adf40SYehuda Sadeh 	unsigned int bio_sectors = bmd->bi_size >> 9;
1532602adf40SYehuda Sadeh 	int max;
1533602adf40SYehuda Sadeh 
1534602adf40SYehuda Sadeh 	max =  (chunk_sectors - ((sector & (chunk_sectors - 1))
1535602adf40SYehuda Sadeh 				 + bio_sectors)) << 9;
1536602adf40SYehuda Sadeh 	if (max < 0)
1537602adf40SYehuda Sadeh 		max = 0; /* bio_add cannot handle a negative return */
1538602adf40SYehuda Sadeh 	if (max <= bvec->bv_len && bio_sectors == 0)
1539602adf40SYehuda Sadeh 		return bvec->bv_len;
1540602adf40SYehuda Sadeh 	return max;
1541602adf40SYehuda Sadeh }
1542602adf40SYehuda Sadeh 
1543602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
1544602adf40SYehuda Sadeh {
1545602adf40SYehuda Sadeh 	struct gendisk *disk = rbd_dev->disk;
1546602adf40SYehuda Sadeh 
1547602adf40SYehuda Sadeh 	if (!disk)
1548602adf40SYehuda Sadeh 		return;
1549602adf40SYehuda Sadeh 
1550602adf40SYehuda Sadeh 	rbd_header_free(&rbd_dev->header);
1551602adf40SYehuda Sadeh 
1552602adf40SYehuda Sadeh 	if (disk->flags & GENHD_FL_UP)
1553602adf40SYehuda Sadeh 		del_gendisk(disk);
1554602adf40SYehuda Sadeh 	if (disk->queue)
1555602adf40SYehuda Sadeh 		blk_cleanup_queue(disk->queue);
1556602adf40SYehuda Sadeh 	put_disk(disk);
1557602adf40SYehuda Sadeh }
1558602adf40SYehuda Sadeh 
1559602adf40SYehuda Sadeh /*
1560602adf40SYehuda Sadeh  * reload the ondisk the header
1561602adf40SYehuda Sadeh  */
1562602adf40SYehuda Sadeh static int rbd_read_header(struct rbd_device *rbd_dev,
1563602adf40SYehuda Sadeh 			   struct rbd_image_header *header)
1564602adf40SYehuda Sadeh {
1565602adf40SYehuda Sadeh 	ssize_t rc;
1566602adf40SYehuda Sadeh 	struct rbd_image_header_ondisk *dh;
1567602adf40SYehuda Sadeh 	int snap_count = 0;
1568602adf40SYehuda Sadeh 	u64 snap_names_len = 0;
156959c2be1eSYehuda Sadeh 	u64 ver;
1570602adf40SYehuda Sadeh 
1571602adf40SYehuda Sadeh 	while (1) {
1572602adf40SYehuda Sadeh 		int len = sizeof(*dh) +
1573602adf40SYehuda Sadeh 			  snap_count * sizeof(struct rbd_image_snap_ondisk) +
1574602adf40SYehuda Sadeh 			  snap_names_len;
1575602adf40SYehuda Sadeh 
1576602adf40SYehuda Sadeh 		rc = -ENOMEM;
1577602adf40SYehuda Sadeh 		dh = kmalloc(len, GFP_KERNEL);
1578602adf40SYehuda Sadeh 		if (!dh)
1579602adf40SYehuda Sadeh 			return -ENOMEM;
1580602adf40SYehuda Sadeh 
1581602adf40SYehuda Sadeh 		rc = rbd_req_sync_read(rbd_dev,
1582602adf40SYehuda Sadeh 				       NULL, CEPH_NOSNAP,
1583602adf40SYehuda Sadeh 				       rbd_dev->obj_md_name,
1584602adf40SYehuda Sadeh 				       0, len,
158559c2be1eSYehuda Sadeh 				       (char *)dh, &ver);
1586602adf40SYehuda Sadeh 		if (rc < 0)
1587602adf40SYehuda Sadeh 			goto out_dh;
1588602adf40SYehuda Sadeh 
1589602adf40SYehuda Sadeh 		rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL);
159081e759fbSJosh Durgin 		if (rc < 0) {
159181e759fbSJosh Durgin 			if (rc == -ENXIO) {
159281e759fbSJosh Durgin 				pr_warning("unrecognized header format"
159381e759fbSJosh Durgin 					   " for image %s", rbd_dev->obj);
159481e759fbSJosh Durgin 			}
1595602adf40SYehuda Sadeh 			goto out_dh;
159681e759fbSJosh Durgin 		}
1597602adf40SYehuda Sadeh 
1598602adf40SYehuda Sadeh 		if (snap_count != header->total_snaps) {
1599602adf40SYehuda Sadeh 			snap_count = header->total_snaps;
1600602adf40SYehuda Sadeh 			snap_names_len = header->snap_names_len;
1601602adf40SYehuda Sadeh 			rbd_header_free(header);
1602602adf40SYehuda Sadeh 			kfree(dh);
1603602adf40SYehuda Sadeh 			continue;
1604602adf40SYehuda Sadeh 		}
1605602adf40SYehuda Sadeh 		break;
1606602adf40SYehuda Sadeh 	}
160759c2be1eSYehuda Sadeh 	header->obj_version = ver;
1608602adf40SYehuda Sadeh 
1609602adf40SYehuda Sadeh out_dh:
1610602adf40SYehuda Sadeh 	kfree(dh);
1611602adf40SYehuda Sadeh 	return rc;
1612602adf40SYehuda Sadeh }
1613602adf40SYehuda Sadeh 
1614602adf40SYehuda Sadeh /*
1615602adf40SYehuda Sadeh  * create a snapshot
1616602adf40SYehuda Sadeh  */
1617602adf40SYehuda Sadeh static int rbd_header_add_snap(struct rbd_device *dev,
1618602adf40SYehuda Sadeh 			       const char *snap_name,
1619602adf40SYehuda Sadeh 			       gfp_t gfp_flags)
1620602adf40SYehuda Sadeh {
1621602adf40SYehuda Sadeh 	int name_len = strlen(snap_name);
1622602adf40SYehuda Sadeh 	u64 new_snapid;
1623602adf40SYehuda Sadeh 	int ret;
1624916d4d67SSage Weil 	void *data, *p, *e;
162559c2be1eSYehuda Sadeh 	u64 ver;
1626602adf40SYehuda Sadeh 
1627602adf40SYehuda Sadeh 	/* we should create a snapshot only if we're pointing at the head */
1628602adf40SYehuda Sadeh 	if (dev->cur_snap)
1629602adf40SYehuda Sadeh 		return -EINVAL;
1630602adf40SYehuda Sadeh 
1631602adf40SYehuda Sadeh 	ret = ceph_monc_create_snapid(&dev->client->monc, dev->poolid,
1632602adf40SYehuda Sadeh 				      &new_snapid);
1633602adf40SYehuda Sadeh 	dout("created snapid=%lld\n", new_snapid);
1634602adf40SYehuda Sadeh 	if (ret < 0)
1635602adf40SYehuda Sadeh 		return ret;
1636602adf40SYehuda Sadeh 
1637602adf40SYehuda Sadeh 	data = kmalloc(name_len + 16, gfp_flags);
1638602adf40SYehuda Sadeh 	if (!data)
1639602adf40SYehuda Sadeh 		return -ENOMEM;
1640602adf40SYehuda Sadeh 
1641916d4d67SSage Weil 	p = data;
1642916d4d67SSage Weil 	e = data + name_len + 16;
1643602adf40SYehuda Sadeh 
1644916d4d67SSage Weil 	ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
1645916d4d67SSage Weil 	ceph_encode_64_safe(&p, e, new_snapid, bad);
1646602adf40SYehuda Sadeh 
1647602adf40SYehuda Sadeh 	ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add",
1648916d4d67SSage Weil 				data, p - data, &ver);
1649602adf40SYehuda Sadeh 
1650916d4d67SSage Weil 	kfree(data);
1651602adf40SYehuda Sadeh 
1652602adf40SYehuda Sadeh 	if (ret < 0)
1653602adf40SYehuda Sadeh 		return ret;
1654602adf40SYehuda Sadeh 
1655602adf40SYehuda Sadeh 	dev->header.snapc->seq =  new_snapid;
1656602adf40SYehuda Sadeh 
1657602adf40SYehuda Sadeh 	return 0;
1658602adf40SYehuda Sadeh bad:
1659602adf40SYehuda Sadeh 	return -ERANGE;
1660602adf40SYehuda Sadeh }
1661602adf40SYehuda Sadeh 
1662dfc5606dSYehuda Sadeh static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1663dfc5606dSYehuda Sadeh {
1664dfc5606dSYehuda Sadeh 	struct rbd_snap *snap;
1665dfc5606dSYehuda Sadeh 
1666dfc5606dSYehuda Sadeh 	while (!list_empty(&rbd_dev->snaps)) {
1667dfc5606dSYehuda Sadeh 		snap = list_first_entry(&rbd_dev->snaps, struct rbd_snap, node);
1668dfc5606dSYehuda Sadeh 		__rbd_remove_snap_dev(rbd_dev, snap);
1669dfc5606dSYehuda Sadeh 	}
1670dfc5606dSYehuda Sadeh }
1671dfc5606dSYehuda Sadeh 
1672602adf40SYehuda Sadeh /*
1673602adf40SYehuda Sadeh  * only read the first part of the ondisk header, without the snaps info
1674602adf40SYehuda Sadeh  */
1675dfc5606dSYehuda Sadeh static int __rbd_update_snaps(struct rbd_device *rbd_dev)
1676602adf40SYehuda Sadeh {
1677602adf40SYehuda Sadeh 	int ret;
1678602adf40SYehuda Sadeh 	struct rbd_image_header h;
1679602adf40SYehuda Sadeh 	u64 snap_seq;
168059c2be1eSYehuda Sadeh 	int follow_seq = 0;
1681602adf40SYehuda Sadeh 
1682602adf40SYehuda Sadeh 	ret = rbd_read_header(rbd_dev, &h);
1683602adf40SYehuda Sadeh 	if (ret < 0)
1684602adf40SYehuda Sadeh 		return ret;
1685602adf40SYehuda Sadeh 
16869db4b3e3SSage Weil 	/* resized? */
16879db4b3e3SSage Weil 	set_capacity(rbd_dev->disk, h.image_size / 512ULL);
16889db4b3e3SSage Weil 
1689602adf40SYehuda Sadeh 	down_write(&rbd_dev->header.snap_rwsem);
1690602adf40SYehuda Sadeh 
1691602adf40SYehuda Sadeh 	snap_seq = rbd_dev->header.snapc->seq;
169259c2be1eSYehuda Sadeh 	if (rbd_dev->header.total_snaps &&
169359c2be1eSYehuda Sadeh 	    rbd_dev->header.snapc->snaps[0] == snap_seq)
169459c2be1eSYehuda Sadeh 		/* pointing at the head, will need to follow that
169559c2be1eSYehuda Sadeh 		   if head moves */
169659c2be1eSYehuda Sadeh 		follow_seq = 1;
1697602adf40SYehuda Sadeh 
1698602adf40SYehuda Sadeh 	kfree(rbd_dev->header.snapc);
1699602adf40SYehuda Sadeh 	kfree(rbd_dev->header.snap_names);
1700602adf40SYehuda Sadeh 	kfree(rbd_dev->header.snap_sizes);
1701602adf40SYehuda Sadeh 
1702602adf40SYehuda Sadeh 	rbd_dev->header.total_snaps = h.total_snaps;
1703602adf40SYehuda Sadeh 	rbd_dev->header.snapc = h.snapc;
1704602adf40SYehuda Sadeh 	rbd_dev->header.snap_names = h.snap_names;
1705dfc5606dSYehuda Sadeh 	rbd_dev->header.snap_names_len = h.snap_names_len;
1706602adf40SYehuda Sadeh 	rbd_dev->header.snap_sizes = h.snap_sizes;
170759c2be1eSYehuda Sadeh 	if (follow_seq)
170859c2be1eSYehuda Sadeh 		rbd_dev->header.snapc->seq = rbd_dev->header.snapc->snaps[0];
170959c2be1eSYehuda Sadeh 	else
1710602adf40SYehuda Sadeh 		rbd_dev->header.snapc->seq = snap_seq;
1711602adf40SYehuda Sadeh 
1712dfc5606dSYehuda Sadeh 	ret = __rbd_init_snaps_header(rbd_dev);
1713dfc5606dSYehuda Sadeh 
1714602adf40SYehuda Sadeh 	up_write(&rbd_dev->header.snap_rwsem);
1715602adf40SYehuda Sadeh 
1716dfc5606dSYehuda Sadeh 	return ret;
1717602adf40SYehuda Sadeh }
1718602adf40SYehuda Sadeh 
1719602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
1720602adf40SYehuda Sadeh {
1721602adf40SYehuda Sadeh 	struct gendisk *disk;
1722602adf40SYehuda Sadeh 	struct request_queue *q;
1723602adf40SYehuda Sadeh 	int rc;
1724602adf40SYehuda Sadeh 	u64 total_size = 0;
1725602adf40SYehuda Sadeh 
1726602adf40SYehuda Sadeh 	/* contact OSD, request size info about the object being mapped */
1727602adf40SYehuda Sadeh 	rc = rbd_read_header(rbd_dev, &rbd_dev->header);
1728602adf40SYehuda Sadeh 	if (rc)
1729602adf40SYehuda Sadeh 		return rc;
1730602adf40SYehuda Sadeh 
1731dfc5606dSYehuda Sadeh 	/* no need to lock here, as rbd_dev is not registered yet */
1732dfc5606dSYehuda Sadeh 	rc = __rbd_init_snaps_header(rbd_dev);
1733dfc5606dSYehuda Sadeh 	if (rc)
1734dfc5606dSYehuda Sadeh 		return rc;
1735dfc5606dSYehuda Sadeh 
1736602adf40SYehuda Sadeh 	rc = rbd_header_set_snap(rbd_dev, rbd_dev->snap_name, &total_size);
1737602adf40SYehuda Sadeh 	if (rc)
1738602adf40SYehuda Sadeh 		return rc;
1739602adf40SYehuda Sadeh 
1740602adf40SYehuda Sadeh 	/* create gendisk info */
1741602adf40SYehuda Sadeh 	rc = -ENOMEM;
1742602adf40SYehuda Sadeh 	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1743602adf40SYehuda Sadeh 	if (!disk)
1744602adf40SYehuda Sadeh 		goto out;
1745602adf40SYehuda Sadeh 
1746aedfec59SSage Weil 	snprintf(disk->disk_name, sizeof(disk->disk_name), DRV_NAME "%d",
1747aedfec59SSage Weil 		 rbd_dev->id);
1748602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
1749602adf40SYehuda Sadeh 	disk->first_minor = 0;
1750602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
1751602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
1752602adf40SYehuda Sadeh 
1753602adf40SYehuda Sadeh 	/* init rq */
1754602adf40SYehuda Sadeh 	rc = -ENOMEM;
1755602adf40SYehuda Sadeh 	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1756602adf40SYehuda Sadeh 	if (!q)
1757602adf40SYehuda Sadeh 		goto out_disk;
1758029bcbd8SJosh Durgin 
1759029bcbd8SJosh Durgin 	/* set io sizes to object size */
1760029bcbd8SJosh Durgin 	blk_queue_max_hw_sectors(q, rbd_obj_bytes(&rbd_dev->header) / 512ULL);
1761029bcbd8SJosh Durgin 	blk_queue_max_segment_size(q, rbd_obj_bytes(&rbd_dev->header));
1762029bcbd8SJosh Durgin 	blk_queue_io_min(q, rbd_obj_bytes(&rbd_dev->header));
1763029bcbd8SJosh Durgin 	blk_queue_io_opt(q, rbd_obj_bytes(&rbd_dev->header));
1764029bcbd8SJosh Durgin 
1765602adf40SYehuda Sadeh 	blk_queue_merge_bvec(q, rbd_merge_bvec);
1766602adf40SYehuda Sadeh 	disk->queue = q;
1767602adf40SYehuda Sadeh 
1768602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
1769602adf40SYehuda Sadeh 
1770602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
1771602adf40SYehuda Sadeh 	rbd_dev->q = q;
1772602adf40SYehuda Sadeh 
1773602adf40SYehuda Sadeh 	/* finally, announce the disk to the world */
1774602adf40SYehuda Sadeh 	set_capacity(disk, total_size / 512ULL);
1775602adf40SYehuda Sadeh 	add_disk(disk);
1776602adf40SYehuda Sadeh 
1777602adf40SYehuda Sadeh 	pr_info("%s: added with size 0x%llx\n",
1778602adf40SYehuda Sadeh 		disk->disk_name, (unsigned long long)total_size);
1779602adf40SYehuda Sadeh 	return 0;
1780602adf40SYehuda Sadeh 
1781602adf40SYehuda Sadeh out_disk:
1782602adf40SYehuda Sadeh 	put_disk(disk);
1783602adf40SYehuda Sadeh out:
1784602adf40SYehuda Sadeh 	return rc;
1785602adf40SYehuda Sadeh }
1786602adf40SYehuda Sadeh 
1787dfc5606dSYehuda Sadeh /*
1788dfc5606dSYehuda Sadeh   sysfs
1789dfc5606dSYehuda Sadeh */
1790602adf40SYehuda Sadeh 
1791dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
1792dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
1793602adf40SYehuda Sadeh {
1794dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = dev_to_rbd(dev);
1795dfc5606dSYehuda Sadeh 
1796dfc5606dSYehuda Sadeh 	return sprintf(buf, "%llu\n", (unsigned long long)rbd_dev->header.image_size);
1797602adf40SYehuda Sadeh }
1798602adf40SYehuda Sadeh 
1799dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
1800dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
1801602adf40SYehuda Sadeh {
1802dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = dev_to_rbd(dev);
1803dfc5606dSYehuda Sadeh 
1804dfc5606dSYehuda Sadeh 	return sprintf(buf, "%d\n", rbd_dev->major);
1805dfc5606dSYehuda Sadeh }
1806dfc5606dSYehuda Sadeh 
1807dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
1808dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
1809dfc5606dSYehuda Sadeh {
1810dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = dev_to_rbd(dev);
1811dfc5606dSYehuda Sadeh 
1812dfc5606dSYehuda Sadeh 	return sprintf(buf, "client%lld\n", ceph_client_id(rbd_dev->client));
1813dfc5606dSYehuda Sadeh }
1814dfc5606dSYehuda Sadeh 
1815dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
1816dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
1817dfc5606dSYehuda Sadeh {
1818dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = dev_to_rbd(dev);
1819dfc5606dSYehuda Sadeh 
1820dfc5606dSYehuda Sadeh 	return sprintf(buf, "%s\n", rbd_dev->pool_name);
1821dfc5606dSYehuda Sadeh }
1822dfc5606dSYehuda Sadeh 
1823dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
1824dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
1825dfc5606dSYehuda Sadeh {
1826dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = dev_to_rbd(dev);
1827dfc5606dSYehuda Sadeh 
1828dfc5606dSYehuda Sadeh 	return sprintf(buf, "%s\n", rbd_dev->obj);
1829dfc5606dSYehuda Sadeh }
1830dfc5606dSYehuda Sadeh 
1831dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
1832dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
1833dfc5606dSYehuda Sadeh 			     char *buf)
1834dfc5606dSYehuda Sadeh {
1835dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = dev_to_rbd(dev);
1836dfc5606dSYehuda Sadeh 
1837dfc5606dSYehuda Sadeh 	return sprintf(buf, "%s\n", rbd_dev->snap_name);
1838dfc5606dSYehuda Sadeh }
1839dfc5606dSYehuda Sadeh 
1840dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
1841dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
1842dfc5606dSYehuda Sadeh 				 const char *buf,
1843dfc5606dSYehuda Sadeh 				 size_t size)
1844dfc5606dSYehuda Sadeh {
1845dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = dev_to_rbd(dev);
1846dfc5606dSYehuda Sadeh 	int rc;
1847dfc5606dSYehuda Sadeh 	int ret = size;
1848602adf40SYehuda Sadeh 
1849602adf40SYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1850602adf40SYehuda Sadeh 
1851dfc5606dSYehuda Sadeh 	rc = __rbd_update_snaps(rbd_dev);
1852dfc5606dSYehuda Sadeh 	if (rc < 0)
1853dfc5606dSYehuda Sadeh 		ret = rc;
1854602adf40SYehuda Sadeh 
1855dfc5606dSYehuda Sadeh 	mutex_unlock(&ctl_mutex);
1856dfc5606dSYehuda Sadeh 	return ret;
1857dfc5606dSYehuda Sadeh }
1858602adf40SYehuda Sadeh 
/*
 * Per-device sysfs attributes.  The S_IRUGO entries are read-only and have
 * only a show handler; "refresh" and "create_snap" are owner-writable
 * (S_IWUSR) and have only a store handler.
 */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);

/* NULL-terminated list of the attributes declared above */
static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_name.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_refresh.attr,
	&dev_attr_create_snap.attr,
	NULL
};

/* unnamed group wrapping rbd_attrs */
static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

/* NULL-terminated group list referenced by rbd_device_type */
static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};
1888dfc5606dSYehuda Sadeh 
/*
 * Release callback for rbd_device_type.  Intentionally empty here: nothing
 * is freed from this callback.
 * NOTE(review): the rbd_device is presumably torn down elsewhere in the
 * file -- confirm its lifetime covers the embedded struct device.
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
}
1892dfc5606dSYehuda Sadeh 
/* Device type for rbd devices: supplies the attribute groups and release. */
static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
1898dfc5606dSYehuda Sadeh 
1899dfc5606dSYehuda Sadeh 
1900dfc5606dSYehuda Sadeh /*
1901dfc5606dSYehuda Sadeh   sysfs - snapshots
1902dfc5606dSYehuda Sadeh */
1903dfc5606dSYehuda Sadeh 
1904dfc5606dSYehuda Sadeh static ssize_t rbd_snap_size_show(struct device *dev,
1905dfc5606dSYehuda Sadeh 				  struct device_attribute *attr,
1906dfc5606dSYehuda Sadeh 				  char *buf)
1907dfc5606dSYehuda Sadeh {
1908dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1909dfc5606dSYehuda Sadeh 
1910dfc5606dSYehuda Sadeh 	return sprintf(buf, "%lld\n", (long long)snap->size);
1911dfc5606dSYehuda Sadeh }
1912dfc5606dSYehuda Sadeh 
1913dfc5606dSYehuda Sadeh static ssize_t rbd_snap_id_show(struct device *dev,
1914dfc5606dSYehuda Sadeh 				struct device_attribute *attr,
1915dfc5606dSYehuda Sadeh 				char *buf)
1916dfc5606dSYehuda Sadeh {
1917dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1918dfc5606dSYehuda Sadeh 
1919dfc5606dSYehuda Sadeh 	return sprintf(buf, "%lld\n", (long long)snap->id);
1920dfc5606dSYehuda Sadeh }
1921dfc5606dSYehuda Sadeh 
/* Per-snapshot sysfs attributes, both read-only. */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);

/* NULL-terminated list of the snapshot attributes declared above */
static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	NULL,
};

/* unnamed group wrapping rbd_snap_attrs */
static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};
1934dfc5606dSYehuda Sadeh 
/*
 * Release callback for a snapshot device: frees the snapshot's name and
 * then the rbd_snap structure that embeds @dev.
 */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	/* name first: it is reached through snap */
	kfree(snap->name);
	kfree(snap);
}
1941dfc5606dSYehuda Sadeh 
/* NULL-terminated group list referenced by rbd_snap_device_type */
static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

/* Device type for snapshot devices: attribute groups plus release. */
static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
1951dfc5606dSYehuda Sadeh 
1952dfc5606dSYehuda Sadeh static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
1953dfc5606dSYehuda Sadeh 				  struct rbd_snap *snap)
1954dfc5606dSYehuda Sadeh {
1955dfc5606dSYehuda Sadeh 	list_del(&snap->node);
1956dfc5606dSYehuda Sadeh 	device_unregister(&snap->dev);
1957dfc5606dSYehuda Sadeh }
1958dfc5606dSYehuda Sadeh 
1959dfc5606dSYehuda Sadeh static int rbd_register_snap_dev(struct rbd_device *rbd_dev,
1960dfc5606dSYehuda Sadeh 				  struct rbd_snap *snap,
1961dfc5606dSYehuda Sadeh 				  struct device *parent)
1962dfc5606dSYehuda Sadeh {
1963dfc5606dSYehuda Sadeh 	struct device *dev = &snap->dev;
1964dfc5606dSYehuda Sadeh 	int ret;
1965dfc5606dSYehuda Sadeh 
1966dfc5606dSYehuda Sadeh 	dev->type = &rbd_snap_device_type;
1967dfc5606dSYehuda Sadeh 	dev->parent = parent;
1968dfc5606dSYehuda Sadeh 	dev->release = rbd_snap_dev_release;
1969dfc5606dSYehuda Sadeh 	dev_set_name(dev, "snap_%s", snap->name);
1970dfc5606dSYehuda Sadeh 	ret = device_register(dev);
1971dfc5606dSYehuda Sadeh 
1972dfc5606dSYehuda Sadeh 	return ret;
1973dfc5606dSYehuda Sadeh }
1974dfc5606dSYehuda Sadeh 
1975dfc5606dSYehuda Sadeh static int __rbd_add_snap_dev(struct rbd_device *rbd_dev,
1976dfc5606dSYehuda Sadeh 			      int i, const char *name,
1977dfc5606dSYehuda Sadeh 			      struct rbd_snap **snapp)
1978dfc5606dSYehuda Sadeh {
1979dfc5606dSYehuda Sadeh 	int ret;
1980dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = kzalloc(sizeof(*snap), GFP_KERNEL);
1981dfc5606dSYehuda Sadeh 	if (!snap)
1982dfc5606dSYehuda Sadeh 		return -ENOMEM;
1983dfc5606dSYehuda Sadeh 	snap->name = kstrdup(name, GFP_KERNEL);
1984dfc5606dSYehuda Sadeh 	snap->size = rbd_dev->header.snap_sizes[i];
1985dfc5606dSYehuda Sadeh 	snap->id = rbd_dev->header.snapc->snaps[i];
1986dfc5606dSYehuda Sadeh 	if (device_is_registered(&rbd_dev->dev)) {
1987dfc5606dSYehuda Sadeh 		ret = rbd_register_snap_dev(rbd_dev, snap,
1988dfc5606dSYehuda Sadeh 					     &rbd_dev->dev);
1989dfc5606dSYehuda Sadeh 		if (ret < 0)
1990dfc5606dSYehuda Sadeh 			goto err;
1991dfc5606dSYehuda Sadeh 	}
1992dfc5606dSYehuda Sadeh 	*snapp = snap;
1993dfc5606dSYehuda Sadeh 	return 0;
1994dfc5606dSYehuda Sadeh err:
1995dfc5606dSYehuda Sadeh 	kfree(snap->name);
1996dfc5606dSYehuda Sadeh 	kfree(snap);
1997dfc5606dSYehuda Sadeh 	return ret;
1998dfc5606dSYehuda Sadeh }
1999dfc5606dSYehuda Sadeh 
/*
 * Find the name that precedes @name in a list of NUL-terminated strings
 * packed back to back and beginning at @start.
 *
 * @name points at the start of an entry (or one past the final NUL of the
 * list).  Returns a pointer to the start of the previous entry, or NULL
 * when @name is less than two bytes past @start, i.e. there is no previous
 * entry.
 */
const char *rbd_prev_snap_name(const char *name, const char *start)
{
	const char *cur;

	if (name < start + 2)
		return NULL;

	/*
	 * name[-1] is the previous entry's terminator, so start scanning at
	 * name[-2] and walk backwards to the terminator before it.
	 */
	for (cur = name - 2; *cur != '\0'; cur--) {
		if (cur == start)
			return start;
	}

	return cur + 1;
}
2016dfc5606dSYehuda Sadeh 
/*
 * Compare the old list of snapshots that we have to what's in the header
 * and update it accordingly. Note that the header holds the snapshots
 * in reverse order (from newest to oldest) and we need to go from
 * older to newer so that we don't get a duplicate snap name while
 * doing the process (e.g., a snapshot was removed and a new one was
 * created with the same name).
 *
 * Returns 0 on success, -EINVAL if a snapshot name could not be located
 * in the header's name buffer, or the negative error returned by
 * __rbd_add_snap_dev().
 */
static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
{
	const char *name, *first_name;
	/* i counts the header snapshots that have not been handled yet */
	int i = rbd_dev->header.total_snaps;
	struct rbd_snap *snap, *old_snap = NULL;
	int ret;
	struct list_head *p, *n;

	/* name starts one past the end of the packed name buffer */
	first_name = rbd_dev->header.snap_names;
	name = first_name + rbd_dev->header.snap_names_len;

	/* walk the existing snapshot list from its tail towards its head */
	list_for_each_prev_safe(p, n, &rbd_dev->snaps) {
		u64 cur_id;

		old_snap = list_entry(p, struct rbd_snap, node);

		/*
		 * cur_id is only assigned while header entries remain; the
		 * "!i ||" test below keeps it from being read otherwise.
		 */
		if (i)
			cur_id = rbd_dev->header.snapc->snaps[i - 1];

		if (!i || old_snap->id < cur_id) {
			/* old_snap->id was skipped, thus was removed */
			__rbd_remove_snap_dev(rbd_dev, old_snap);
			continue;
		}
		if (old_snap->id == cur_id) {
			/* we have this snapshot already */
			i--;
			name = rbd_prev_snap_name(name, first_name);
			continue;
		}
		for (; i > 0;
		     i--, name = rbd_prev_snap_name(name, first_name)) {
			if (!name) {
				WARN_ON(1);
				return -EINVAL;
			}
			/*
			 * NOTE(review): this reads snaps[i] while every other
			 * lookup in this function uses snaps[i - 1]; i may
			 * still equal total_snaps here -- confirm the index
			 * is intended.
			 */
			cur_id = rbd_dev->header.snapc->snaps[i];
			/* snapshot removal? handle it above */
			if (cur_id >= old_snap->id)
				break;
			/* a new snapshot */
			ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
			if (ret < 0)
				return ret;

			/* note that we add it backward so using n and not p */
			list_add(&snap->node, n);
			p = &snap->node;
		}
	}
	/* we're done going over the old snap list, just add what's left */
	for (; i > 0; i--) {
		name = rbd_prev_snap_name(name, first_name);
		if (!name) {
			WARN_ON(1);
			return -EINVAL;
		}
		ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
		if (ret < 0)
			return ret;
		list_add(&snap->node, &rbd_dev->snaps);
	}

	return 0;
}
2090dfc5606dSYehuda Sadeh 
2091dfc5606dSYehuda Sadeh 
/*
 * Release callback for rbd_root_dev.  The body is empty: rbd_root_dev is
 * a statically defined object, so there is no allocation to free here.
 */
static void rbd_root_dev_release(struct device *dev)
{
}
2095dfc5606dSYehuda Sadeh 
/*
 * Root device named "rbd".  rbd_bus_add_dev() sets it as the parent of
 * every rbd device it registers.
 */
static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};
2100dfc5606dSYehuda Sadeh 
2101dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2102dfc5606dSYehuda Sadeh {
2103dfc5606dSYehuda Sadeh 	int ret = -ENOMEM;
2104dfc5606dSYehuda Sadeh 	struct device *dev;
2105dfc5606dSYehuda Sadeh 	struct rbd_snap *snap;
2106dfc5606dSYehuda Sadeh 
2107dfc5606dSYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2108dfc5606dSYehuda Sadeh 	dev = &rbd_dev->dev;
2109dfc5606dSYehuda Sadeh 
2110dfc5606dSYehuda Sadeh 	dev->bus = &rbd_bus_type;
2111dfc5606dSYehuda Sadeh 	dev->type = &rbd_device_type;
2112dfc5606dSYehuda Sadeh 	dev->parent = &rbd_root_dev;
2113dfc5606dSYehuda Sadeh 	dev->release = rbd_dev_release;
2114dfc5606dSYehuda Sadeh 	dev_set_name(dev, "%d", rbd_dev->id);
2115dfc5606dSYehuda Sadeh 	ret = device_register(dev);
2116dfc5606dSYehuda Sadeh 	if (ret < 0)
2117dfc5606dSYehuda Sadeh 		goto done_free;
2118dfc5606dSYehuda Sadeh 
2119dfc5606dSYehuda Sadeh 	list_for_each_entry(snap, &rbd_dev->snaps, node) {
2120dfc5606dSYehuda Sadeh 		ret = rbd_register_snap_dev(rbd_dev, snap,
2121dfc5606dSYehuda Sadeh 					     &rbd_dev->dev);
2122dfc5606dSYehuda Sadeh 		if (ret < 0)
2123602adf40SYehuda Sadeh 			break;
2124602adf40SYehuda Sadeh 	}
2125602adf40SYehuda Sadeh 
2126602adf40SYehuda Sadeh 	mutex_unlock(&ctl_mutex);
2127dfc5606dSYehuda Sadeh 	return 0;
2128dfc5606dSYehuda Sadeh done_free:
2129dfc5606dSYehuda Sadeh 	mutex_unlock(&ctl_mutex);
2130dfc5606dSYehuda Sadeh 	return ret;
2131602adf40SYehuda Sadeh }
2132602adf40SYehuda Sadeh 
/*
 * Unregister @rbd_dev's embedded struct device.  The device's release
 * callback is set to rbd_dev_release() by rbd_bus_add_dev().
 */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2137dfc5606dSYehuda Sadeh 
213859c2be1eSYehuda Sadeh static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
213959c2be1eSYehuda Sadeh {
214059c2be1eSYehuda Sadeh 	int ret, rc;
214159c2be1eSYehuda Sadeh 
214259c2be1eSYehuda Sadeh 	do {
214359c2be1eSYehuda Sadeh 		ret = rbd_req_sync_watch(rbd_dev, rbd_dev->obj_md_name,
214459c2be1eSYehuda Sadeh 					 rbd_dev->header.obj_version);
214559c2be1eSYehuda Sadeh 		if (ret == -ERANGE) {
214659c2be1eSYehuda Sadeh 			mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
214759c2be1eSYehuda Sadeh 			rc = __rbd_update_snaps(rbd_dev);
214859c2be1eSYehuda Sadeh 			mutex_unlock(&ctl_mutex);
214959c2be1eSYehuda Sadeh 			if (rc < 0)
215059c2be1eSYehuda Sadeh 				return rc;
215159c2be1eSYehuda Sadeh 		}
215259c2be1eSYehuda Sadeh 	} while (ret == -ERANGE);
215359c2be1eSYehuda Sadeh 
215459c2be1eSYehuda Sadeh 	return ret;
215559c2be1eSYehuda Sadeh }
215659c2be1eSYehuda Sadeh 
215759c2be1eSYehuda Sadeh static ssize_t rbd_add(struct bus_type *bus,
215859c2be1eSYehuda Sadeh 		       const char *buf,
215959c2be1eSYehuda Sadeh 		       size_t count)
2160602adf40SYehuda Sadeh {
2161602adf40SYehuda Sadeh 	struct ceph_osd_client *osdc;
2162602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev;
2163602adf40SYehuda Sadeh 	ssize_t rc = -ENOMEM;
2164602adf40SYehuda Sadeh 	int irc, new_id = 0;
2165602adf40SYehuda Sadeh 	struct list_head *tmp;
2166602adf40SYehuda Sadeh 	char *mon_dev_name;
2167602adf40SYehuda Sadeh 	char *options;
2168602adf40SYehuda Sadeh 
2169602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
2170602adf40SYehuda Sadeh 		return -ENODEV;
2171602adf40SYehuda Sadeh 
2172602adf40SYehuda Sadeh 	mon_dev_name = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL);
2173602adf40SYehuda Sadeh 	if (!mon_dev_name)
2174602adf40SYehuda Sadeh 		goto err_out_mod;
2175602adf40SYehuda Sadeh 
2176602adf40SYehuda Sadeh 	options = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL);
2177602adf40SYehuda Sadeh 	if (!options)
2178602adf40SYehuda Sadeh 		goto err_mon_dev;
2179602adf40SYehuda Sadeh 
2180602adf40SYehuda Sadeh 	/* new rbd_device object */
2181602adf40SYehuda Sadeh 	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2182602adf40SYehuda Sadeh 	if (!rbd_dev)
2183602adf40SYehuda Sadeh 		goto err_out_opt;
2184602adf40SYehuda Sadeh 
2185602adf40SYehuda Sadeh 	/* static rbd_device initialization */
2186602adf40SYehuda Sadeh 	spin_lock_init(&rbd_dev->lock);
2187602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbd_dev->node);
2188dfc5606dSYehuda Sadeh 	INIT_LIST_HEAD(&rbd_dev->snaps);
2189602adf40SYehuda Sadeh 
21900e805a1dSAlex Elder 	init_rwsem(&rbd_dev->header.snap_rwsem);
21910e805a1dSAlex Elder 
2192602adf40SYehuda Sadeh 	/* generate unique id: find highest unique id, add one */
2193602adf40SYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2194602adf40SYehuda Sadeh 
2195602adf40SYehuda Sadeh 	list_for_each(tmp, &rbd_dev_list) {
2196602adf40SYehuda Sadeh 		struct rbd_device *rbd_dev;
2197602adf40SYehuda Sadeh 
2198602adf40SYehuda Sadeh 		rbd_dev = list_entry(tmp, struct rbd_device, node);
2199602adf40SYehuda Sadeh 		if (rbd_dev->id >= new_id)
2200602adf40SYehuda Sadeh 			new_id = rbd_dev->id + 1;
2201602adf40SYehuda Sadeh 	}
2202602adf40SYehuda Sadeh 
2203602adf40SYehuda Sadeh 	rbd_dev->id = new_id;
2204602adf40SYehuda Sadeh 
2205602adf40SYehuda Sadeh 	/* add to global list */
2206602adf40SYehuda Sadeh 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
2207602adf40SYehuda Sadeh 
2208602adf40SYehuda Sadeh 	/* parse add command */
2209602adf40SYehuda Sadeh 	if (sscanf(buf, "%" __stringify(RBD_MAX_OPT_LEN) "s "
2210602adf40SYehuda Sadeh 		   "%" __stringify(RBD_MAX_OPT_LEN) "s "
2211602adf40SYehuda Sadeh 		   "%" __stringify(RBD_MAX_POOL_NAME_LEN) "s "
2212602adf40SYehuda Sadeh 		   "%" __stringify(RBD_MAX_OBJ_NAME_LEN) "s"
2213602adf40SYehuda Sadeh 		   "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
2214602adf40SYehuda Sadeh 		   mon_dev_name, options, rbd_dev->pool_name,
2215602adf40SYehuda Sadeh 		   rbd_dev->obj, rbd_dev->snap_name) < 4) {
2216602adf40SYehuda Sadeh 		rc = -EINVAL;
2217602adf40SYehuda Sadeh 		goto err_out_slot;
2218602adf40SYehuda Sadeh 	}
2219602adf40SYehuda Sadeh 
2220602adf40SYehuda Sadeh 	if (rbd_dev->snap_name[0] == 0)
2221602adf40SYehuda Sadeh 		rbd_dev->snap_name[0] = '-';
2222602adf40SYehuda Sadeh 
2223602adf40SYehuda Sadeh 	rbd_dev->obj_len = strlen(rbd_dev->obj);
2224602adf40SYehuda Sadeh 	snprintf(rbd_dev->obj_md_name, sizeof(rbd_dev->obj_md_name), "%s%s",
2225602adf40SYehuda Sadeh 		 rbd_dev->obj, RBD_SUFFIX);
2226602adf40SYehuda Sadeh 
2227602adf40SYehuda Sadeh 	/* initialize rest of new object */
2228602adf40SYehuda Sadeh 	snprintf(rbd_dev->name, DEV_NAME_LEN, DRV_NAME "%d", rbd_dev->id);
2229602adf40SYehuda Sadeh 	rc = rbd_get_client(rbd_dev, mon_dev_name, options);
2230602adf40SYehuda Sadeh 	if (rc < 0)
2231602adf40SYehuda Sadeh 		goto err_out_slot;
2232602adf40SYehuda Sadeh 
2233602adf40SYehuda Sadeh 	mutex_unlock(&ctl_mutex);
2234602adf40SYehuda Sadeh 
2235602adf40SYehuda Sadeh 	/* pick the pool */
2236602adf40SYehuda Sadeh 	osdc = &rbd_dev->client->osdc;
2237602adf40SYehuda Sadeh 	rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
2238602adf40SYehuda Sadeh 	if (rc < 0)
2239602adf40SYehuda Sadeh 		goto err_out_client;
2240602adf40SYehuda Sadeh 	rbd_dev->poolid = rc;
2241602adf40SYehuda Sadeh 
2242602adf40SYehuda Sadeh 	/* register our block device */
2243602adf40SYehuda Sadeh 	irc = register_blkdev(0, rbd_dev->name);
2244602adf40SYehuda Sadeh 	if (irc < 0) {
2245602adf40SYehuda Sadeh 		rc = irc;
2246602adf40SYehuda Sadeh 		goto err_out_client;
2247602adf40SYehuda Sadeh 	}
2248602adf40SYehuda Sadeh 	rbd_dev->major = irc;
2249602adf40SYehuda Sadeh 
2250dfc5606dSYehuda Sadeh 	rc = rbd_bus_add_dev(rbd_dev);
2251dfc5606dSYehuda Sadeh 	if (rc)
2252766fc439SYehuda Sadeh 		goto err_out_blkdev;
2253766fc439SYehuda Sadeh 
2254602adf40SYehuda Sadeh 	/* set up and announce blkdev mapping */
2255602adf40SYehuda Sadeh 	rc = rbd_init_disk(rbd_dev);
2256602adf40SYehuda Sadeh 	if (rc)
2257766fc439SYehuda Sadeh 		goto err_out_bus;
2258602adf40SYehuda Sadeh 
225959c2be1eSYehuda Sadeh 	rc = rbd_init_watch_dev(rbd_dev);
226059c2be1eSYehuda Sadeh 	if (rc)
226159c2be1eSYehuda Sadeh 		goto err_out_bus;
226259c2be1eSYehuda Sadeh 
2263602adf40SYehuda Sadeh 	return count;
2264602adf40SYehuda Sadeh 
2265766fc439SYehuda Sadeh err_out_bus:
2266766fc439SYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2267766fc439SYehuda Sadeh 	list_del_init(&rbd_dev->node);
2268766fc439SYehuda Sadeh 	mutex_unlock(&ctl_mutex);
2269766fc439SYehuda Sadeh 
2270766fc439SYehuda Sadeh 	/* this will also clean up rest of rbd_dev stuff */
2271766fc439SYehuda Sadeh 
2272766fc439SYehuda Sadeh 	rbd_bus_del_dev(rbd_dev);
2273766fc439SYehuda Sadeh 	kfree(options);
2274766fc439SYehuda Sadeh 	kfree(mon_dev_name);
2275766fc439SYehuda Sadeh 	return rc;
2276766fc439SYehuda Sadeh 
2277602adf40SYehuda Sadeh err_out_blkdev:
2278602adf40SYehuda Sadeh 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
2279602adf40SYehuda Sadeh err_out_client:
2280602adf40SYehuda Sadeh 	rbd_put_client(rbd_dev);
2281602adf40SYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2282602adf40SYehuda Sadeh err_out_slot:
2283602adf40SYehuda Sadeh 	list_del_init(&rbd_dev->node);
2284602adf40SYehuda Sadeh 	mutex_unlock(&ctl_mutex);
2285602adf40SYehuda Sadeh 
2286602adf40SYehuda Sadeh 	kfree(rbd_dev);
2287602adf40SYehuda Sadeh err_out_opt:
2288602adf40SYehuda Sadeh 	kfree(options);
2289602adf40SYehuda Sadeh err_mon_dev:
2290602adf40SYehuda Sadeh 	kfree(mon_dev_name);
2291602adf40SYehuda Sadeh err_out_mod:
2292602adf40SYehuda Sadeh 	dout("Error adding device %s\n", buf);
2293602adf40SYehuda Sadeh 	module_put(THIS_MODULE);
2294602adf40SYehuda Sadeh 	return rc;
2295602adf40SYehuda Sadeh }
2296602adf40SYehuda Sadeh 
2297602adf40SYehuda Sadeh static struct rbd_device *__rbd_get_dev(unsigned long id)
2298602adf40SYehuda Sadeh {
2299602adf40SYehuda Sadeh 	struct list_head *tmp;
2300602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev;
2301602adf40SYehuda Sadeh 
2302602adf40SYehuda Sadeh 	list_for_each(tmp, &rbd_dev_list) {
2303602adf40SYehuda Sadeh 		rbd_dev = list_entry(tmp, struct rbd_device, node);
2304602adf40SYehuda Sadeh 		if (rbd_dev->id == id)
2305602adf40SYehuda Sadeh 			return rbd_dev;
2306602adf40SYehuda Sadeh 	}
2307602adf40SYehuda Sadeh 	return NULL;
2308602adf40SYehuda Sadeh }
2309602adf40SYehuda Sadeh 
/*
 * Release callback for an rbd device (installed as dev->release by
 * rbd_bus_add_dev()).  Tears down the watch state, drops the client,
 * frees the disk and block major, frees the rbd_device itself and
 * finally drops a module reference (rbd_add() takes one with
 * try_module_get()).
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev =
			container_of(dev, struct rbd_device, dev);

	/* the watch pieces are only torn down if they were set up */
	if (rbd_dev->watch_request)
		ceph_osdc_unregister_linger_request(&rbd_dev->client->osdc,
						    rbd_dev->watch_request);
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev, rbd_dev->obj_md_name);

	/* the watch teardown above uses the client, so drop it afterwards */
	rbd_put_client(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
	kfree(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
2331602adf40SYehuda Sadeh 
2332dfc5606dSYehuda Sadeh static ssize_t rbd_remove(struct bus_type *bus,
2333602adf40SYehuda Sadeh 			  const char *buf,
2334602adf40SYehuda Sadeh 			  size_t count)
2335602adf40SYehuda Sadeh {
2336602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
2337602adf40SYehuda Sadeh 	int target_id, rc;
2338602adf40SYehuda Sadeh 	unsigned long ul;
2339602adf40SYehuda Sadeh 	int ret = count;
2340602adf40SYehuda Sadeh 
2341602adf40SYehuda Sadeh 	rc = strict_strtoul(buf, 10, &ul);
2342602adf40SYehuda Sadeh 	if (rc)
2343602adf40SYehuda Sadeh 		return rc;
2344602adf40SYehuda Sadeh 
2345602adf40SYehuda Sadeh 	/* convert to int; abort if we lost anything in the conversion */
2346602adf40SYehuda Sadeh 	target_id = (int) ul;
2347602adf40SYehuda Sadeh 	if (target_id != ul)
2348602adf40SYehuda Sadeh 		return -EINVAL;
2349602adf40SYehuda Sadeh 
2350602adf40SYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2351602adf40SYehuda Sadeh 
2352602adf40SYehuda Sadeh 	rbd_dev = __rbd_get_dev(target_id);
2353602adf40SYehuda Sadeh 	if (!rbd_dev) {
2354602adf40SYehuda Sadeh 		ret = -ENOENT;
2355602adf40SYehuda Sadeh 		goto done;
2356602adf40SYehuda Sadeh 	}
2357602adf40SYehuda Sadeh 
2358dfc5606dSYehuda Sadeh 	list_del_init(&rbd_dev->node);
2359dfc5606dSYehuda Sadeh 
2360dfc5606dSYehuda Sadeh 	__rbd_remove_all_snaps(rbd_dev);
2361dfc5606dSYehuda Sadeh 	rbd_bus_del_dev(rbd_dev);
2362602adf40SYehuda Sadeh 
2363602adf40SYehuda Sadeh done:
2364602adf40SYehuda Sadeh 	mutex_unlock(&ctl_mutex);
2365602adf40SYehuda Sadeh 	return ret;
2366602adf40SYehuda Sadeh }
2367602adf40SYehuda Sadeh 
2368dfc5606dSYehuda Sadeh static ssize_t rbd_snap_add(struct device *dev,
2369dfc5606dSYehuda Sadeh 			    struct device_attribute *attr,
2370602adf40SYehuda Sadeh 			    const char *buf,
2371602adf40SYehuda Sadeh 			    size_t count)
2372602adf40SYehuda Sadeh {
2373dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = dev_to_rbd(dev);
2374dfc5606dSYehuda Sadeh 	int ret;
2375dfc5606dSYehuda Sadeh 	char *name = kmalloc(count + 1, GFP_KERNEL);
2376602adf40SYehuda Sadeh 	if (!name)
2377602adf40SYehuda Sadeh 		return -ENOMEM;
2378602adf40SYehuda Sadeh 
2379dfc5606dSYehuda Sadeh 	snprintf(name, count, "%s", buf);
2380602adf40SYehuda Sadeh 
2381602adf40SYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2382602adf40SYehuda Sadeh 
2383602adf40SYehuda Sadeh 	ret = rbd_header_add_snap(rbd_dev,
2384602adf40SYehuda Sadeh 				  name, GFP_KERNEL);
2385602adf40SYehuda Sadeh 	if (ret < 0)
238659c2be1eSYehuda Sadeh 		goto err_unlock;
2387602adf40SYehuda Sadeh 
2388dfc5606dSYehuda Sadeh 	ret = __rbd_update_snaps(rbd_dev);
2389602adf40SYehuda Sadeh 	if (ret < 0)
239059c2be1eSYehuda Sadeh 		goto err_unlock;
239159c2be1eSYehuda Sadeh 
239259c2be1eSYehuda Sadeh 	/* shouldn't hold ctl_mutex when notifying.. notify might
239359c2be1eSYehuda Sadeh 	   trigger a watch callback that would need to get that mutex */
239459c2be1eSYehuda Sadeh 	mutex_unlock(&ctl_mutex);
239559c2be1eSYehuda Sadeh 
239659c2be1eSYehuda Sadeh 	/* make a best effort, don't error if failed */
239759c2be1eSYehuda Sadeh 	rbd_req_sync_notify(rbd_dev, rbd_dev->obj_md_name);
2398602adf40SYehuda Sadeh 
2399602adf40SYehuda Sadeh 	ret = count;
240059c2be1eSYehuda Sadeh 	kfree(name);
240159c2be1eSYehuda Sadeh 	return ret;
240259c2be1eSYehuda Sadeh 
240359c2be1eSYehuda Sadeh err_unlock:
2404602adf40SYehuda Sadeh 	mutex_unlock(&ctl_mutex);
2405602adf40SYehuda Sadeh 	kfree(name);
2406602adf40SYehuda Sadeh 	return ret;
2407602adf40SYehuda Sadeh }
2408602adf40SYehuda Sadeh 
/*
 * Bus-level control attributes: write-only (S_IWUSR, no show method)
 * "add" and "remove" files handled by rbd_add() and rbd_remove().
 */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};
2414602adf40SYehuda Sadeh 
2415602adf40SYehuda Sadeh /*
2416602adf40SYehuda Sadeh  * create control files in sysfs
2417dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
2418602adf40SYehuda Sadeh  */
2419602adf40SYehuda Sadeh static int rbd_sysfs_init(void)
2420602adf40SYehuda Sadeh {
2421dfc5606dSYehuda Sadeh 	int ret;
2422602adf40SYehuda Sadeh 
2423dfc5606dSYehuda Sadeh 	rbd_bus_type.bus_attrs = rbd_bus_attrs;
2424602adf40SYehuda Sadeh 
2425dfc5606dSYehuda Sadeh 	ret = bus_register(&rbd_bus_type);
2426dfc5606dSYehuda Sadeh 	if (ret < 0)
2427dfc5606dSYehuda Sadeh 		return ret;
2428602adf40SYehuda Sadeh 
2429dfc5606dSYehuda Sadeh 	ret = device_register(&rbd_root_dev);
2430602adf40SYehuda Sadeh 
2431602adf40SYehuda Sadeh 	return ret;
2432602adf40SYehuda Sadeh }
2433602adf40SYehuda Sadeh 
/*
 * Undo rbd_sysfs_init(): unregister the root device and the bus, in the
 * reverse of the order in which they were registered.
 */
static void rbd_sysfs_cleanup(void)
{
	device_unregister(&rbd_root_dev);
	bus_unregister(&rbd_bus_type);
}
2439602adf40SYehuda Sadeh 
2440602adf40SYehuda Sadeh int __init rbd_init(void)
2441602adf40SYehuda Sadeh {
2442602adf40SYehuda Sadeh 	int rc;
2443602adf40SYehuda Sadeh 
2444602adf40SYehuda Sadeh 	rc = rbd_sysfs_init();
2445602adf40SYehuda Sadeh 	if (rc)
2446602adf40SYehuda Sadeh 		return rc;
2447602adf40SYehuda Sadeh 	pr_info("loaded " DRV_NAME_LONG "\n");
2448602adf40SYehuda Sadeh 	return 0;
2449602adf40SYehuda Sadeh }
2450602adf40SYehuda Sadeh 
/* Module exit point: tear down the sysfs control interface. */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
2455602adf40SYehuda Sadeh 
2456602adf40SYehuda Sadeh module_init(rbd_init);
2457602adf40SYehuda Sadeh module_exit(rbd_exit);
2458602adf40SYehuda Sadeh 
2459602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
2460602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
2461602adf40SYehuda Sadeh MODULE_DESCRIPTION("rados block device");
2462602adf40SYehuda Sadeh 
2463602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */
2464602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
2465602adf40SYehuda Sadeh 
2466602adf40SYehuda Sadeh MODULE_LICENSE("GPL");
2467