xref: /openbmc/linux/drivers/block/rbd.c (revision af77f26c)
1602adf40SYehuda Sadeh /*
2602adf40SYehuda Sadeh    rbd.c -- Export ceph rados objects as a Linux block device
3602adf40SYehuda Sadeh 
4602adf40SYehuda Sadeh 
5602adf40SYehuda Sadeh    based on drivers/block/osdblk.c:
6602adf40SYehuda Sadeh 
7602adf40SYehuda Sadeh    Copyright 2009 Red Hat, Inc.
8602adf40SYehuda Sadeh 
9602adf40SYehuda Sadeh    This program is free software; you can redistribute it and/or modify
10602adf40SYehuda Sadeh    it under the terms of the GNU General Public License as published by
11602adf40SYehuda Sadeh    the Free Software Foundation.
12602adf40SYehuda Sadeh 
13602adf40SYehuda Sadeh    This program is distributed in the hope that it will be useful,
14602adf40SYehuda Sadeh    but WITHOUT ANY WARRANTY; without even the implied warranty of
15602adf40SYehuda Sadeh    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16602adf40SYehuda Sadeh    GNU General Public License for more details.
17602adf40SYehuda Sadeh 
18602adf40SYehuda Sadeh    You should have received a copy of the GNU General Public License
19602adf40SYehuda Sadeh    along with this program; see the file COPYING.  If not, write to
20602adf40SYehuda Sadeh    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21602adf40SYehuda Sadeh 
22602adf40SYehuda Sadeh 
23602adf40SYehuda Sadeh 
24dfc5606dSYehuda Sadeh    For usage instructions, please refer to:
25602adf40SYehuda Sadeh 
26dfc5606dSYehuda Sadeh                  Documentation/ABI/testing/sysfs-bus-rbd
27602adf40SYehuda Sadeh 
28602adf40SYehuda Sadeh  */
29602adf40SYehuda Sadeh 
30602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
31602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
32602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3459c2be1eSYehuda Sadeh #include <linux/parser.h>
35602adf40SYehuda Sadeh 
36602adf40SYehuda Sadeh #include <linux/kernel.h>
37602adf40SYehuda Sadeh #include <linux/device.h>
38602adf40SYehuda Sadeh #include <linux/module.h>
39602adf40SYehuda Sadeh #include <linux/fs.h>
40602adf40SYehuda Sadeh #include <linux/blkdev.h>
41602adf40SYehuda Sadeh 
42602adf40SYehuda Sadeh #include "rbd_types.h"
43602adf40SYehuda Sadeh 
#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * Block I/O in Linux (blk, bio, genhd) is expressed in sectors, and
 * the default sector is 512 bytes everywhere.  Naming the shift and
 * the size is a little clearer than sprinkling bare numbers around.
 */
#define SECTOR_SHIFT	9
#define SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/* Type limits; it might be useful to have these defined elsewhere too */
#define U32_MAX	((u32) (~0U))
#define U64_MAX	((u64) (~0ULL))

/* Short and long forms of the driver name */
#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

/*
 * Snapshot device names carry a fixed prefix, so a snapshot name can
 * only use what is left of NAME_MAX once the prefix (without its
 * terminating NUL) has been accounted for.
 */
#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */
#define RBD_MAX_OPT_LEN		1024

#define RBD_SNAP_HEAD_NAME	"-"

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

/* Feature bits */

#define RBD_FEATURE_LAYERING      1

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_ALL          (0)

/*
 * A device is named "rbd#": RBD_DRV_NAME followed by a unique integer
 * identifier.  MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN
 * is big enough to hold all possible device names.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)

#define RBD_READ_ONLY_DEFAULT		false
9859c2be1eSYehuda Sadeh 
99602adf40SYehuda Sadeh /*
100602adf40SYehuda Sadeh  * block device image metadata (in-memory version)
101602adf40SYehuda Sadeh  */
102602adf40SYehuda Sadeh struct rbd_image_header {
103f84344f3SAlex Elder 	/* These four fields never change for a given rbd image */
104849b4260SAlex Elder 	char *object_prefix;
10534b13184SAlex Elder 	u64 features;
106602adf40SYehuda Sadeh 	__u8 obj_order;
107602adf40SYehuda Sadeh 	__u8 crypt_type;
108602adf40SYehuda Sadeh 	__u8 comp_type;
109602adf40SYehuda Sadeh 
110f84344f3SAlex Elder 	/* The remaining fields need to be updated occasionally */
111f84344f3SAlex Elder 	u64 image_size;
112f84344f3SAlex Elder 	struct ceph_snap_context *snapc;
113602adf40SYehuda Sadeh 	char *snap_names;
114602adf40SYehuda Sadeh 	u64 *snap_sizes;
11559c2be1eSYehuda Sadeh 
11659c2be1eSYehuda Sadeh 	u64 obj_version;
11759c2be1eSYehuda Sadeh };
11859c2be1eSYehuda Sadeh 
1190d7dbfceSAlex Elder /*
1200d7dbfceSAlex Elder  * An rbd image specification.
1210d7dbfceSAlex Elder  *
1220d7dbfceSAlex Elder  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
123c66c6e0cSAlex Elder  * identify an image.  Each rbd_dev structure includes a pointer to
124c66c6e0cSAlex Elder  * an rbd_spec structure that encapsulates this identity.
125c66c6e0cSAlex Elder  *
126c66c6e0cSAlex Elder  * Each of the id's in an rbd_spec has an associated name.  For a
127c66c6e0cSAlex Elder  * user-mapped image, the names are supplied and the id's associated
128c66c6e0cSAlex Elder  * with them are looked up.  For a layered image, a parent image is
129c66c6e0cSAlex Elder  * defined by the tuple, and the names are looked up.
130c66c6e0cSAlex Elder  *
131c66c6e0cSAlex Elder  * An rbd_dev structure contains a parent_spec pointer which is
132c66c6e0cSAlex Elder  * non-null if the image it represents is a child in a layered
133c66c6e0cSAlex Elder  * image.  This pointer will refer to the rbd_spec structure used
134c66c6e0cSAlex Elder  * by the parent rbd_dev for its own identity (i.e., the structure
135c66c6e0cSAlex Elder  * is shared between the parent and child).
136c66c6e0cSAlex Elder  *
137c66c6e0cSAlex Elder  * Since these structures are populated once, during the discovery
138c66c6e0cSAlex Elder  * phase of image construction, they are effectively immutable so
139c66c6e0cSAlex Elder  * we make no effort to synchronize access to them.
140c66c6e0cSAlex Elder  *
141c66c6e0cSAlex Elder  * Note that code herein does not assume the image name is known (it
142c66c6e0cSAlex Elder  * could be a null pointer).
1430d7dbfceSAlex Elder  */
1440d7dbfceSAlex Elder struct rbd_spec {
1450d7dbfceSAlex Elder 	u64		pool_id;
1460d7dbfceSAlex Elder 	char		*pool_name;
1470d7dbfceSAlex Elder 
1480d7dbfceSAlex Elder 	char		*image_id;
1490d7dbfceSAlex Elder 	char		*image_name;
1500d7dbfceSAlex Elder 
1510d7dbfceSAlex Elder 	u64		snap_id;
1520d7dbfceSAlex Elder 	char		*snap_name;
1530d7dbfceSAlex Elder 
1540d7dbfceSAlex Elder 	struct kref	kref;
1550d7dbfceSAlex Elder };
1560d7dbfceSAlex Elder 
15759c2be1eSYehuda Sadeh struct rbd_options {
158cc0538b6SAlex Elder 	bool	read_only;
159602adf40SYehuda Sadeh };
160602adf40SYehuda Sadeh 
161602adf40SYehuda Sadeh /*
162f0f8cef5SAlex Elder  * an instance of the client.  multiple devices may share an rbd client.
163602adf40SYehuda Sadeh  */
164602adf40SYehuda Sadeh struct rbd_client {
165602adf40SYehuda Sadeh 	struct ceph_client	*client;
166602adf40SYehuda Sadeh 	struct kref		kref;
167602adf40SYehuda Sadeh 	struct list_head	node;
168602adf40SYehuda Sadeh };
169602adf40SYehuda Sadeh 
170602adf40SYehuda Sadeh /*
171f0f8cef5SAlex Elder  * a request completion status
172602adf40SYehuda Sadeh  */
1731fec7093SYehuda Sadeh struct rbd_req_status {
1741fec7093SYehuda Sadeh 	int done;
1758986cb37SAlex Elder 	s32 rc;
1761fec7093SYehuda Sadeh 	u64 bytes;
1771fec7093SYehuda Sadeh };
1781fec7093SYehuda Sadeh 
1791fec7093SYehuda Sadeh /*
1801fec7093SYehuda Sadeh  * a collection of requests
1811fec7093SYehuda Sadeh  */
1821fec7093SYehuda Sadeh struct rbd_req_coll {
1831fec7093SYehuda Sadeh 	int			total;
1841fec7093SYehuda Sadeh 	int			num_done;
1851fec7093SYehuda Sadeh 	struct kref		kref;
1861fec7093SYehuda Sadeh 	struct rbd_req_status	status[0];
187602adf40SYehuda Sadeh };
188602adf40SYehuda Sadeh 
189f0f8cef5SAlex Elder /*
190f0f8cef5SAlex Elder  * a single io request
191f0f8cef5SAlex Elder  */
192f0f8cef5SAlex Elder struct rbd_request {
193f0f8cef5SAlex Elder 	struct request		*rq;		/* blk layer request */
194f0f8cef5SAlex Elder 	struct bio		*bio;		/* cloned bio */
195f0f8cef5SAlex Elder 	struct page		**pages;	/* list of used pages */
196f0f8cef5SAlex Elder 	u64			len;
197f0f8cef5SAlex Elder 	int			coll_index;
198f0f8cef5SAlex Elder 	struct rbd_req_coll	*coll;
199f0f8cef5SAlex Elder };
200f0f8cef5SAlex Elder 
201dfc5606dSYehuda Sadeh struct rbd_snap {
202dfc5606dSYehuda Sadeh 	struct	device		dev;
203dfc5606dSYehuda Sadeh 	const char		*name;
2043591538fSJosh Durgin 	u64			size;
205dfc5606dSYehuda Sadeh 	struct list_head	node;
206dfc5606dSYehuda Sadeh 	u64			id;
20734b13184SAlex Elder 	u64			features;
208dfc5606dSYehuda Sadeh };
209dfc5606dSYehuda Sadeh 
210f84344f3SAlex Elder struct rbd_mapping {
21199c1f08fSAlex Elder 	u64                     size;
21234b13184SAlex Elder 	u64                     features;
213f84344f3SAlex Elder 	bool			read_only;
214f84344f3SAlex Elder };
215f84344f3SAlex Elder 
216602adf40SYehuda Sadeh /*
217602adf40SYehuda Sadeh  * a single device
218602adf40SYehuda Sadeh  */
219602adf40SYehuda Sadeh struct rbd_device {
220de71a297SAlex Elder 	int			dev_id;		/* blkdev unique id */
221602adf40SYehuda Sadeh 
222602adf40SYehuda Sadeh 	int			major;		/* blkdev assigned major */
223602adf40SYehuda Sadeh 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
224602adf40SYehuda Sadeh 
225a30b71b9SAlex Elder 	u32			image_format;	/* Either 1 or 2 */
226602adf40SYehuda Sadeh 	struct rbd_client	*rbd_client;
227602adf40SYehuda Sadeh 
228602adf40SYehuda Sadeh 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
229602adf40SYehuda Sadeh 
230602adf40SYehuda Sadeh 	spinlock_t		lock;		/* queue lock */
231602adf40SYehuda Sadeh 
232602adf40SYehuda Sadeh 	struct rbd_image_header	header;
233d78b650aSAlex Elder 	atomic_t		exists;
2340d7dbfceSAlex Elder 	struct rbd_spec		*spec;
235602adf40SYehuda Sadeh 
2360d7dbfceSAlex Elder 	char			*header_name;
237971f839aSAlex Elder 
23859c2be1eSYehuda Sadeh 	struct ceph_osd_event   *watch_event;
23959c2be1eSYehuda Sadeh 	struct ceph_osd_request *watch_request;
24059c2be1eSYehuda Sadeh 
24186b00e0dSAlex Elder 	struct rbd_spec		*parent_spec;
24286b00e0dSAlex Elder 	u64			parent_overlap;
24386b00e0dSAlex Elder 
244c666601aSJosh Durgin 	/* protects updating the header */
245c666601aSJosh Durgin 	struct rw_semaphore     header_rwsem;
246f84344f3SAlex Elder 
247f84344f3SAlex Elder 	struct rbd_mapping	mapping;
248602adf40SYehuda Sadeh 
249602adf40SYehuda Sadeh 	struct list_head	node;
250dfc5606dSYehuda Sadeh 
251dfc5606dSYehuda Sadeh 	/* list of snapshots */
252dfc5606dSYehuda Sadeh 	struct list_head	snaps;
253dfc5606dSYehuda Sadeh 
254dfc5606dSYehuda Sadeh 	/* sysfs related */
255dfc5606dSYehuda Sadeh 	struct device		dev;
25642382b70SAlex Elder 	unsigned long		open_count;
257dfc5606dSYehuda Sadeh };
258dfc5606dSYehuda Sadeh 
/* Serialize open/close/setup/teardown */
static DEFINE_MUTEX(ctl_mutex);

/* All rbd devices */
static LIST_HEAD(rbd_dev_list);
static DEFINE_SPINLOCK(rbd_dev_list_lock);

/* All rbd clients; the list is walked and modified under the spinlock */
static LIST_HEAD(rbd_client_list);
static DEFINE_SPINLOCK(rbd_client_list_lock);
266602adf40SYehuda Sadeh 
267304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
268304f6808SAlex Elder static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
269304f6808SAlex Elder 
270dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev);
27141f38c2bSAlex Elder static void rbd_remove_snap_dev(struct rbd_snap *snap);
272dfc5606dSYehuda Sadeh 
273f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf,
274f0f8cef5SAlex Elder 		       size_t count);
275f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
276f0f8cef5SAlex Elder 			  size_t count);
277f0f8cef5SAlex Elder 
278f0f8cef5SAlex Elder static struct bus_attribute rbd_bus_attrs[] = {
279f0f8cef5SAlex Elder 	__ATTR(add, S_IWUSR, NULL, rbd_add),
280f0f8cef5SAlex Elder 	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
281f0f8cef5SAlex Elder 	__ATTR_NULL
282f0f8cef5SAlex Elder };
283f0f8cef5SAlex Elder 
284f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = {
285f0f8cef5SAlex Elder 	.name		= "rbd",
286f0f8cef5SAlex Elder 	.bus_attrs	= rbd_bus_attrs,
287f0f8cef5SAlex Elder };
288f0f8cef5SAlex Elder 
289f0f8cef5SAlex Elder static void rbd_root_dev_release(struct device *dev)
290f0f8cef5SAlex Elder {
291f0f8cef5SAlex Elder }
292f0f8cef5SAlex Elder 
293f0f8cef5SAlex Elder static struct device rbd_root_dev = {
294f0f8cef5SAlex Elder 	.init_name =    "rbd",
295f0f8cef5SAlex Elder 	.release =      rbd_root_dev_release,
296f0f8cef5SAlex Elder };
297f0f8cef5SAlex Elder 
29806ecc6cbSAlex Elder static __printf(2, 3)
29906ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
30006ecc6cbSAlex Elder {
30106ecc6cbSAlex Elder 	struct va_format vaf;
30206ecc6cbSAlex Elder 	va_list args;
30306ecc6cbSAlex Elder 
30406ecc6cbSAlex Elder 	va_start(args, fmt);
30506ecc6cbSAlex Elder 	vaf.fmt = fmt;
30606ecc6cbSAlex Elder 	vaf.va = &args;
30706ecc6cbSAlex Elder 
30806ecc6cbSAlex Elder 	if (!rbd_dev)
30906ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
31006ecc6cbSAlex Elder 	else if (rbd_dev->disk)
31106ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %s: %pV\n",
31206ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
31306ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_name)
31406ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: image %s: %pV\n",
31506ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
31606ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_id)
31706ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: id %s: %pV\n",
31806ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
31906ecc6cbSAlex Elder 	else	/* punt */
32006ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
32106ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev, &vaf);
32206ecc6cbSAlex Elder 	va_end(args);
32306ecc6cbSAlex Elder }
32406ecc6cbSAlex Elder 
/*
 * rbd_assert(expr) - driver-local assertion.
 *
 * With RBD_DEBUG defined, a false expr logs the function name, line
 * number and the text of the expression at KERN_ERR and then calls
 * BUG().  Without RBD_DEBUG the macro expands to a no-op and expr is
 * not evaluated, so expr must not have side effects.
 *
 * The debug form is wrapped in do { } while (0) so that it behaves as
 * a single statement; a bare "if { }" expansion would mis-bind (or
 * fail to parse) when used in the body of an unbraced if/else.
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
337dfc5606dSYehuda Sadeh 
338117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
339117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
34059c2be1eSYehuda Sadeh 
341602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
342602adf40SYehuda Sadeh {
343f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
344602adf40SYehuda Sadeh 
345f84344f3SAlex Elder 	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
346602adf40SYehuda Sadeh 		return -EROFS;
347602adf40SYehuda Sadeh 
34842382b70SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
349c3e946ceSAlex Elder 	(void) get_device(&rbd_dev->dev);
350f84344f3SAlex Elder 	set_device_ro(bdev, rbd_dev->mapping.read_only);
35142382b70SAlex Elder 	rbd_dev->open_count++;
35242382b70SAlex Elder 	mutex_unlock(&ctl_mutex);
353340c7a2bSAlex Elder 
354602adf40SYehuda Sadeh 	return 0;
355602adf40SYehuda Sadeh }
356602adf40SYehuda Sadeh 
357dfc5606dSYehuda Sadeh static int rbd_release(struct gendisk *disk, fmode_t mode)
358dfc5606dSYehuda Sadeh {
359dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
360dfc5606dSYehuda Sadeh 
36142382b70SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
36242382b70SAlex Elder 	rbd_assert(rbd_dev->open_count > 0);
36342382b70SAlex Elder 	rbd_dev->open_count--;
364c3e946ceSAlex Elder 	put_device(&rbd_dev->dev);
36542382b70SAlex Elder 	mutex_unlock(&ctl_mutex);
366dfc5606dSYehuda Sadeh 
367dfc5606dSYehuda Sadeh 	return 0;
368dfc5606dSYehuda Sadeh }
369dfc5606dSYehuda Sadeh 
370602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = {
371602adf40SYehuda Sadeh 	.owner			= THIS_MODULE,
372602adf40SYehuda Sadeh 	.open			= rbd_open,
373dfc5606dSYehuda Sadeh 	.release		= rbd_release,
374602adf40SYehuda Sadeh };
375602adf40SYehuda Sadeh 
376602adf40SYehuda Sadeh /*
377602adf40SYehuda Sadeh  * Initialize an rbd client instance.
37843ae4701SAlex Elder  * We own *ceph_opts.
379602adf40SYehuda Sadeh  */
380f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
381602adf40SYehuda Sadeh {
382602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
383602adf40SYehuda Sadeh 	int ret = -ENOMEM;
384602adf40SYehuda Sadeh 
385602adf40SYehuda Sadeh 	dout("rbd_client_create\n");
386602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
387602adf40SYehuda Sadeh 	if (!rbdc)
388602adf40SYehuda Sadeh 		goto out_opt;
389602adf40SYehuda Sadeh 
390602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
391602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
392602adf40SYehuda Sadeh 
393bc534d86SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
394bc534d86SAlex Elder 
39543ae4701SAlex Elder 	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
396602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
397bc534d86SAlex Elder 		goto out_mutex;
39843ae4701SAlex Elder 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
399602adf40SYehuda Sadeh 
400602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
401602adf40SYehuda Sadeh 	if (ret < 0)
402602adf40SYehuda Sadeh 		goto out_err;
403602adf40SYehuda Sadeh 
404432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
405602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
406432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
407602adf40SYehuda Sadeh 
408bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
409bc534d86SAlex Elder 
410602adf40SYehuda Sadeh 	dout("rbd_client_create created %p\n", rbdc);
411602adf40SYehuda Sadeh 	return rbdc;
412602adf40SYehuda Sadeh 
413602adf40SYehuda Sadeh out_err:
414602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
415bc534d86SAlex Elder out_mutex:
416bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
417602adf40SYehuda Sadeh 	kfree(rbdc);
418602adf40SYehuda Sadeh out_opt:
41943ae4701SAlex Elder 	if (ceph_opts)
42043ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
42128f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
422602adf40SYehuda Sadeh }
423602adf40SYehuda Sadeh 
424602adf40SYehuda Sadeh /*
4251f7ba331SAlex Elder  * Find a ceph client with specific addr and configuration.  If
4261f7ba331SAlex Elder  * found, bump its reference count.
427602adf40SYehuda Sadeh  */
4281f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
429602adf40SYehuda Sadeh {
430602adf40SYehuda Sadeh 	struct rbd_client *client_node;
4311f7ba331SAlex Elder 	bool found = false;
432602adf40SYehuda Sadeh 
43343ae4701SAlex Elder 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
434602adf40SYehuda Sadeh 		return NULL;
435602adf40SYehuda Sadeh 
4361f7ba331SAlex Elder 	spin_lock(&rbd_client_list_lock);
4371f7ba331SAlex Elder 	list_for_each_entry(client_node, &rbd_client_list, node) {
4381f7ba331SAlex Elder 		if (!ceph_compare_options(ceph_opts, client_node->client)) {
4391f7ba331SAlex Elder 			kref_get(&client_node->kref);
4401f7ba331SAlex Elder 			found = true;
4411f7ba331SAlex Elder 			break;
4421f7ba331SAlex Elder 		}
4431f7ba331SAlex Elder 	}
4441f7ba331SAlex Elder 	spin_unlock(&rbd_client_list_lock);
4451f7ba331SAlex Elder 
4461f7ba331SAlex Elder 	return found ? client_node : NULL;
447602adf40SYehuda Sadeh }
448602adf40SYehuda Sadeh 
/*
 * Option tokens.  The Opt_last_* values are not options themselves;
 * they are boundary markers that parse_rbd_opts_token() compares
 * against to tell int, string and Boolean options apart, so the
 * order of the enumerators matters.
 */
enum {
	/* int args go above this marker */
	Opt_last_int,
	/* string args go above this marker */
	Opt_last_string,
	Opt_read_only,
	Opt_read_write,
	/* Boolean args go above this marker */
	Opt_last_bool,
};
46259c2be1eSYehuda Sadeh 
46343ae4701SAlex Elder static match_table_t rbd_opts_tokens = {
46459c2be1eSYehuda Sadeh 	/* int args above */
46559c2be1eSYehuda Sadeh 	/* string args above */
466be466c1cSAlex Elder 	{Opt_read_only, "read_only"},
467cc0538b6SAlex Elder 	{Opt_read_only, "ro"},		/* Alternate spelling */
468cc0538b6SAlex Elder 	{Opt_read_write, "read_write"},
469cc0538b6SAlex Elder 	{Opt_read_write, "rw"},		/* Alternate spelling */
470cc0538b6SAlex Elder 	/* Boolean args above */
47159c2be1eSYehuda Sadeh 	{-1, NULL}
47259c2be1eSYehuda Sadeh };
47359c2be1eSYehuda Sadeh 
47459c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
47559c2be1eSYehuda Sadeh {
47643ae4701SAlex Elder 	struct rbd_options *rbd_opts = private;
47759c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
47859c2be1eSYehuda Sadeh 	int token, intval, ret;
47959c2be1eSYehuda Sadeh 
48043ae4701SAlex Elder 	token = match_token(c, rbd_opts_tokens, argstr);
48159c2be1eSYehuda Sadeh 	if (token < 0)
48259c2be1eSYehuda Sadeh 		return -EINVAL;
48359c2be1eSYehuda Sadeh 
48459c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
48559c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
48659c2be1eSYehuda Sadeh 		if (ret < 0) {
48759c2be1eSYehuda Sadeh 			pr_err("bad mount option arg (not int) "
48859c2be1eSYehuda Sadeh 			       "at '%s'\n", c);
48959c2be1eSYehuda Sadeh 			return ret;
49059c2be1eSYehuda Sadeh 		}
49159c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
49259c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
49359c2be1eSYehuda Sadeh 		dout("got string token %d val %s\n", token,
49459c2be1eSYehuda Sadeh 		     argstr[0].from);
495cc0538b6SAlex Elder 	} else if (token > Opt_last_string && token < Opt_last_bool) {
496cc0538b6SAlex Elder 		dout("got Boolean token %d\n", token);
49759c2be1eSYehuda Sadeh 	} else {
49859c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
49959c2be1eSYehuda Sadeh 	}
50059c2be1eSYehuda Sadeh 
50159c2be1eSYehuda Sadeh 	switch (token) {
502cc0538b6SAlex Elder 	case Opt_read_only:
503cc0538b6SAlex Elder 		rbd_opts->read_only = true;
504cc0538b6SAlex Elder 		break;
505cc0538b6SAlex Elder 	case Opt_read_write:
506cc0538b6SAlex Elder 		rbd_opts->read_only = false;
507cc0538b6SAlex Elder 		break;
50859c2be1eSYehuda Sadeh 	default:
509aafb230eSAlex Elder 		rbd_assert(false);
510aafb230eSAlex Elder 		break;
51159c2be1eSYehuda Sadeh 	}
51259c2be1eSYehuda Sadeh 	return 0;
51359c2be1eSYehuda Sadeh }
51459c2be1eSYehuda Sadeh 
/*
 * Return an rbd client matching the given options: an existing one
 * if rbd_client_find() locates it, otherwise a newly created one.
 * The options are consumed either way: destroyed here when an
 * existing client is reused, handed to rbd_client_create() otherwise.
 * The result may be an ERR_PTR() from rbd_client_create().
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc = rbd_client_find(ceph_opts);

	if (!rbdc)
		return rbd_client_create(ceph_opts);

	/* using an existing client */
	ceph_destroy_options(ceph_opts);

	return rbdc;
}
531602adf40SYehuda Sadeh 
532602adf40SYehuda Sadeh /*
533602adf40SYehuda Sadeh  * Destroy ceph client
534d23a4b3fSAlex Elder  *
535432b8587SAlex Elder  * Caller must hold rbd_client_list_lock.
536602adf40SYehuda Sadeh  */
537602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
538602adf40SYehuda Sadeh {
539602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
540602adf40SYehuda Sadeh 
541602adf40SYehuda Sadeh 	dout("rbd_release_client %p\n", rbdc);
542cd9d9f5dSAlex Elder 	spin_lock(&rbd_client_list_lock);
543602adf40SYehuda Sadeh 	list_del(&rbdc->node);
544cd9d9f5dSAlex Elder 	spin_unlock(&rbd_client_list_lock);
545602adf40SYehuda Sadeh 
546602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
547602adf40SYehuda Sadeh 	kfree(rbdc);
548602adf40SYehuda Sadeh }
549602adf40SYehuda Sadeh 
550602adf40SYehuda Sadeh /*
551602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
552602adf40SYehuda Sadeh  * it.
553602adf40SYehuda Sadeh  */
5549d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc)
555602adf40SYehuda Sadeh {
556c53d5893SAlex Elder 	if (rbdc)
5579d3997fdSAlex Elder 		kref_put(&rbdc->kref, rbd_client_release);
558602adf40SYehuda Sadeh }
559602adf40SYehuda Sadeh 
5601fec7093SYehuda Sadeh /*
5611fec7093SYehuda Sadeh  * Destroy requests collection
5621fec7093SYehuda Sadeh  */
5631fec7093SYehuda Sadeh static void rbd_coll_release(struct kref *kref)
5641fec7093SYehuda Sadeh {
5651fec7093SYehuda Sadeh 	struct rbd_req_coll *coll =
5661fec7093SYehuda Sadeh 		container_of(kref, struct rbd_req_coll, kref);
5671fec7093SYehuda Sadeh 
5681fec7093SYehuda Sadeh 	dout("rbd_coll_release %p\n", coll);
5691fec7093SYehuda Sadeh 	kfree(coll);
5701fec7093SYehuda Sadeh }
571602adf40SYehuda Sadeh 
572a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format)
573a30b71b9SAlex Elder {
574a30b71b9SAlex Elder 	return image_format == 1 || image_format == 2;
575a30b71b9SAlex Elder }
576a30b71b9SAlex Elder 
5778e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
5788e94af8eSAlex Elder {
579103a150fSAlex Elder 	size_t size;
580103a150fSAlex Elder 	u32 snap_count;
581103a150fSAlex Elder 
582103a150fSAlex Elder 	/* The header has to start with the magic rbd header text */
583103a150fSAlex Elder 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
584103a150fSAlex Elder 		return false;
585103a150fSAlex Elder 
586db2388b6SAlex Elder 	/* The bio layer requires at least sector-sized I/O */
587db2388b6SAlex Elder 
588db2388b6SAlex Elder 	if (ondisk->options.order < SECTOR_SHIFT)
589db2388b6SAlex Elder 		return false;
590db2388b6SAlex Elder 
591db2388b6SAlex Elder 	/* If we use u64 in a few spots we may be able to loosen this */
592db2388b6SAlex Elder 
593db2388b6SAlex Elder 	if (ondisk->options.order > 8 * sizeof (int) - 1)
594db2388b6SAlex Elder 		return false;
595db2388b6SAlex Elder 
596103a150fSAlex Elder 	/*
597103a150fSAlex Elder 	 * The size of a snapshot header has to fit in a size_t, and
598103a150fSAlex Elder 	 * that limits the number of snapshots.
599103a150fSAlex Elder 	 */
600103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
601103a150fSAlex Elder 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
602103a150fSAlex Elder 	if (snap_count > size / sizeof (__le64))
603103a150fSAlex Elder 		return false;
604103a150fSAlex Elder 
605103a150fSAlex Elder 	/*
606103a150fSAlex Elder 	 * Not only that, but the size of the entire the snapshot
607103a150fSAlex Elder 	 * header must also be representable in a size_t.
608103a150fSAlex Elder 	 */
609103a150fSAlex Elder 	size -= snap_count * sizeof (__le64);
610103a150fSAlex Elder 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
611103a150fSAlex Elder 		return false;
612103a150fSAlex Elder 
613103a150fSAlex Elder 	return true;
6148e94af8eSAlex Elder }
6158e94af8eSAlex Elder 
616602adf40SYehuda Sadeh /*
617602adf40SYehuda Sadeh  * Create a new header structure, translate header format from the on-disk
618602adf40SYehuda Sadeh  * header.
619602adf40SYehuda Sadeh  */
620602adf40SYehuda Sadeh static int rbd_header_from_disk(struct rbd_image_header *header,
6214156d998SAlex Elder 				 struct rbd_image_header_ondisk *ondisk)
622602adf40SYehuda Sadeh {
623ccece235SAlex Elder 	u32 snap_count;
62458c17b0eSAlex Elder 	size_t len;
625d2bb24e5SAlex Elder 	size_t size;
626621901d6SAlex Elder 	u32 i;
627602adf40SYehuda Sadeh 
6286a52325fSAlex Elder 	memset(header, 0, sizeof (*header));
6296a52325fSAlex Elder 
630103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
631103a150fSAlex Elder 
63258c17b0eSAlex Elder 	len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
63358c17b0eSAlex Elder 	header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
6346a52325fSAlex Elder 	if (!header->object_prefix)
635602adf40SYehuda Sadeh 		return -ENOMEM;
63658c17b0eSAlex Elder 	memcpy(header->object_prefix, ondisk->object_prefix, len);
63758c17b0eSAlex Elder 	header->object_prefix[len] = '\0';
63800f1f36fSAlex Elder 
639602adf40SYehuda Sadeh 	if (snap_count) {
640f785cc1dSAlex Elder 		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
641f785cc1dSAlex Elder 
642621901d6SAlex Elder 		/* Save a copy of the snapshot names */
643621901d6SAlex Elder 
644f785cc1dSAlex Elder 		if (snap_names_len > (u64) SIZE_MAX)
645f785cc1dSAlex Elder 			return -EIO;
646f785cc1dSAlex Elder 		header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
647602adf40SYehuda Sadeh 		if (!header->snap_names)
6486a52325fSAlex Elder 			goto out_err;
649f785cc1dSAlex Elder 		/*
650f785cc1dSAlex Elder 		 * Note that rbd_dev_v1_header_read() guarantees
651f785cc1dSAlex Elder 		 * the ondisk buffer we're working with has
652f785cc1dSAlex Elder 		 * snap_names_len bytes beyond the end of the
653f785cc1dSAlex Elder 		 * snapshot id array, this memcpy() is safe.
654f785cc1dSAlex Elder 		 */
655f785cc1dSAlex Elder 		memcpy(header->snap_names, &ondisk->snaps[snap_count],
656f785cc1dSAlex Elder 			snap_names_len);
6576a52325fSAlex Elder 
658621901d6SAlex Elder 		/* Record each snapshot's size */
659621901d6SAlex Elder 
660d2bb24e5SAlex Elder 		size = snap_count * sizeof (*header->snap_sizes);
661d2bb24e5SAlex Elder 		header->snap_sizes = kmalloc(size, GFP_KERNEL);
662602adf40SYehuda Sadeh 		if (!header->snap_sizes)
6636a52325fSAlex Elder 			goto out_err;
664621901d6SAlex Elder 		for (i = 0; i < snap_count; i++)
665621901d6SAlex Elder 			header->snap_sizes[i] =
666621901d6SAlex Elder 				le64_to_cpu(ondisk->snaps[i].image_size);
667602adf40SYehuda Sadeh 	} else {
668ccece235SAlex Elder 		WARN_ON(ondisk->snap_names_len);
669602adf40SYehuda Sadeh 		header->snap_names = NULL;
670602adf40SYehuda Sadeh 		header->snap_sizes = NULL;
671602adf40SYehuda Sadeh 	}
672849b4260SAlex Elder 
67334b13184SAlex Elder 	header->features = 0;	/* No features support in v1 images */
674602adf40SYehuda Sadeh 	header->obj_order = ondisk->options.order;
675602adf40SYehuda Sadeh 	header->crypt_type = ondisk->options.crypt_type;
676602adf40SYehuda Sadeh 	header->comp_type = ondisk->options.comp_type;
6776a52325fSAlex Elder 
678621901d6SAlex Elder 	/* Allocate and fill in the snapshot context */
679621901d6SAlex Elder 
680f84344f3SAlex Elder 	header->image_size = le64_to_cpu(ondisk->image_size);
6816a52325fSAlex Elder 	size = sizeof (struct ceph_snap_context);
6826a52325fSAlex Elder 	size += snap_count * sizeof (header->snapc->snaps[0]);
6836a52325fSAlex Elder 	header->snapc = kzalloc(size, GFP_KERNEL);
6846a52325fSAlex Elder 	if (!header->snapc)
6856a52325fSAlex Elder 		goto out_err;
686602adf40SYehuda Sadeh 
687602adf40SYehuda Sadeh 	atomic_set(&header->snapc->nref, 1);
688505cbb9bSAlex Elder 	header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
689602adf40SYehuda Sadeh 	header->snapc->num_snaps = snap_count;
690621901d6SAlex Elder 	for (i = 0; i < snap_count; i++)
691602adf40SYehuda Sadeh 		header->snapc->snaps[i] =
692602adf40SYehuda Sadeh 			le64_to_cpu(ondisk->snaps[i].id);
693602adf40SYehuda Sadeh 
694602adf40SYehuda Sadeh 	return 0;
695602adf40SYehuda Sadeh 
6966a52325fSAlex Elder out_err:
697849b4260SAlex Elder 	kfree(header->snap_sizes);
698ccece235SAlex Elder 	header->snap_sizes = NULL;
699602adf40SYehuda Sadeh 	kfree(header->snap_names);
700ccece235SAlex Elder 	header->snap_names = NULL;
7016a52325fSAlex Elder 	kfree(header->object_prefix);
7026a52325fSAlex Elder 	header->object_prefix = NULL;
703ccece235SAlex Elder 
70400f1f36fSAlex Elder 	return -ENOMEM;
705602adf40SYehuda Sadeh }
706602adf40SYehuda Sadeh 
7079e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
7089e15b77dSAlex Elder {
7099e15b77dSAlex Elder 	struct rbd_snap *snap;
7109e15b77dSAlex Elder 
7119e15b77dSAlex Elder 	if (snap_id == CEPH_NOSNAP)
7129e15b77dSAlex Elder 		return RBD_SNAP_HEAD_NAME;
7139e15b77dSAlex Elder 
7149e15b77dSAlex Elder 	list_for_each_entry(snap, &rbd_dev->snaps, node)
7159e15b77dSAlex Elder 		if (snap_id == snap->id)
7169e15b77dSAlex Elder 			return snap->name;
7179e15b77dSAlex Elder 
7189e15b77dSAlex Elder 	return NULL;
7199e15b77dSAlex Elder }
7209e15b77dSAlex Elder 
7218836b995SAlex Elder static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
722602adf40SYehuda Sadeh {
723602adf40SYehuda Sadeh 
724e86924a8SAlex Elder 	struct rbd_snap *snap;
72500f1f36fSAlex Elder 
726e86924a8SAlex Elder 	list_for_each_entry(snap, &rbd_dev->snaps, node) {
727e86924a8SAlex Elder 		if (!strcmp(snap_name, snap->name)) {
7280d7dbfceSAlex Elder 			rbd_dev->spec->snap_id = snap->id;
729e86924a8SAlex Elder 			rbd_dev->mapping.size = snap->size;
73034b13184SAlex Elder 			rbd_dev->mapping.features = snap->features;
73100f1f36fSAlex Elder 
732e86924a8SAlex Elder 			return 0;
733602adf40SYehuda Sadeh 		}
73400f1f36fSAlex Elder 	}
735e86924a8SAlex Elder 
73600f1f36fSAlex Elder 	return -ENOENT;
73700f1f36fSAlex Elder }
738602adf40SYehuda Sadeh 
739819d52bfSAlex Elder static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
740602adf40SYehuda Sadeh {
74178dc447dSAlex Elder 	int ret;
742602adf40SYehuda Sadeh 
7430d7dbfceSAlex Elder 	if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
744cc9d734cSJosh Durgin 		    sizeof (RBD_SNAP_HEAD_NAME))) {
7450d7dbfceSAlex Elder 		rbd_dev->spec->snap_id = CEPH_NOSNAP;
74699c1f08fSAlex Elder 		rbd_dev->mapping.size = rbd_dev->header.image_size;
74734b13184SAlex Elder 		rbd_dev->mapping.features = rbd_dev->header.features;
748e86924a8SAlex Elder 		ret = 0;
749602adf40SYehuda Sadeh 	} else {
7500d7dbfceSAlex Elder 		ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
751602adf40SYehuda Sadeh 		if (ret < 0)
752602adf40SYehuda Sadeh 			goto done;
753f84344f3SAlex Elder 		rbd_dev->mapping.read_only = true;
754602adf40SYehuda Sadeh 	}
755d78b650aSAlex Elder 	atomic_set(&rbd_dev->exists, 1);
756602adf40SYehuda Sadeh done:
757602adf40SYehuda Sadeh 	return ret;
758602adf40SYehuda Sadeh }
759602adf40SYehuda Sadeh 
760602adf40SYehuda Sadeh static void rbd_header_free(struct rbd_image_header *header)
761602adf40SYehuda Sadeh {
762849b4260SAlex Elder 	kfree(header->object_prefix);
763d78fd7aeSAlex Elder 	header->object_prefix = NULL;
764602adf40SYehuda Sadeh 	kfree(header->snap_sizes);
765d78fd7aeSAlex Elder 	header->snap_sizes = NULL;
766849b4260SAlex Elder 	kfree(header->snap_names);
767d78fd7aeSAlex Elder 	header->snap_names = NULL;
768d1d25646SJosh Durgin 	ceph_put_snap_context(header->snapc);
769d78fd7aeSAlex Elder 	header->snapc = NULL;
770602adf40SYehuda Sadeh }
771602adf40SYehuda Sadeh 
77265ccfe21SAlex Elder static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
773602adf40SYehuda Sadeh {
77465ccfe21SAlex Elder 	char *name;
77565ccfe21SAlex Elder 	u64 segment;
77665ccfe21SAlex Elder 	int ret;
777602adf40SYehuda Sadeh 
7782fd82b9eSAlex Elder 	name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
77965ccfe21SAlex Elder 	if (!name)
78065ccfe21SAlex Elder 		return NULL;
78165ccfe21SAlex Elder 	segment = offset >> rbd_dev->header.obj_order;
7822fd82b9eSAlex Elder 	ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
78365ccfe21SAlex Elder 			rbd_dev->header.object_prefix, segment);
7842fd82b9eSAlex Elder 	if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
78565ccfe21SAlex Elder 		pr_err("error formatting segment name for #%llu (%d)\n",
78665ccfe21SAlex Elder 			segment, ret);
78765ccfe21SAlex Elder 		kfree(name);
78865ccfe21SAlex Elder 		name = NULL;
78965ccfe21SAlex Elder 	}
790602adf40SYehuda Sadeh 
79165ccfe21SAlex Elder 	return name;
79265ccfe21SAlex Elder }
793602adf40SYehuda Sadeh 
79465ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
79565ccfe21SAlex Elder {
79665ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
797602adf40SYehuda Sadeh 
79865ccfe21SAlex Elder 	return offset & (segment_size - 1);
79965ccfe21SAlex Elder }
80065ccfe21SAlex Elder 
80165ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev,
80265ccfe21SAlex Elder 				u64 offset, u64 length)
80365ccfe21SAlex Elder {
80465ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
80565ccfe21SAlex Elder 
80665ccfe21SAlex Elder 	offset &= segment_size - 1;
80765ccfe21SAlex Elder 
808aafb230eSAlex Elder 	rbd_assert(length <= U64_MAX - offset);
80965ccfe21SAlex Elder 	if (offset + length > segment_size)
81065ccfe21SAlex Elder 		length = segment_size - offset;
81165ccfe21SAlex Elder 
81265ccfe21SAlex Elder 	return length;
813602adf40SYehuda Sadeh }
814602adf40SYehuda Sadeh 
8151fec7093SYehuda Sadeh static int rbd_get_num_segments(struct rbd_image_header *header,
8161fec7093SYehuda Sadeh 				u64 ofs, u64 len)
8171fec7093SYehuda Sadeh {
818df111be6SAlex Elder 	u64 start_seg;
819df111be6SAlex Elder 	u64 end_seg;
820df111be6SAlex Elder 
821df111be6SAlex Elder 	if (!len)
822df111be6SAlex Elder 		return 0;
823df111be6SAlex Elder 	if (len - 1 > U64_MAX - ofs)
824df111be6SAlex Elder 		return -ERANGE;
825df111be6SAlex Elder 
826df111be6SAlex Elder 	start_seg = ofs >> header->obj_order;
827df111be6SAlex Elder 	end_seg = (ofs + len - 1) >> header->obj_order;
828df111be6SAlex Elder 
8291fec7093SYehuda Sadeh 	return end_seg - start_seg + 1;
8301fec7093SYehuda Sadeh }
8311fec7093SYehuda Sadeh 
832602adf40SYehuda Sadeh /*
833029bcbd8SJosh Durgin  * returns the size of an object in the image
834029bcbd8SJosh Durgin  */
835029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header)
836029bcbd8SJosh Durgin {
837029bcbd8SJosh Durgin 	return 1 << header->obj_order;
838029bcbd8SJosh Durgin }
839029bcbd8SJosh Durgin 
840029bcbd8SJosh Durgin /*
841602adf40SYehuda Sadeh  * bio helpers
842602adf40SYehuda Sadeh  */
843602adf40SYehuda Sadeh 
844602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain)
845602adf40SYehuda Sadeh {
846602adf40SYehuda Sadeh 	struct bio *tmp;
847602adf40SYehuda Sadeh 
848602adf40SYehuda Sadeh 	while (chain) {
849602adf40SYehuda Sadeh 		tmp = chain;
850602adf40SYehuda Sadeh 		chain = chain->bi_next;
851602adf40SYehuda Sadeh 		bio_put(tmp);
852602adf40SYehuda Sadeh 	}
853602adf40SYehuda Sadeh }
854602adf40SYehuda Sadeh 
855602adf40SYehuda Sadeh /*
856602adf40SYehuda Sadeh  * zeros a bio chain, starting at specific offset
857602adf40SYehuda Sadeh  */
858602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs)
859602adf40SYehuda Sadeh {
860602adf40SYehuda Sadeh 	struct bio_vec *bv;
861602adf40SYehuda Sadeh 	unsigned long flags;
862602adf40SYehuda Sadeh 	void *buf;
863602adf40SYehuda Sadeh 	int i;
864602adf40SYehuda Sadeh 	int pos = 0;
865602adf40SYehuda Sadeh 
866602adf40SYehuda Sadeh 	while (chain) {
867602adf40SYehuda Sadeh 		bio_for_each_segment(bv, chain, i) {
868602adf40SYehuda Sadeh 			if (pos + bv->bv_len > start_ofs) {
869602adf40SYehuda Sadeh 				int remainder = max(start_ofs - pos, 0);
870602adf40SYehuda Sadeh 				buf = bvec_kmap_irq(bv, &flags);
871602adf40SYehuda Sadeh 				memset(buf + remainder, 0,
872602adf40SYehuda Sadeh 				       bv->bv_len - remainder);
87385b5aaa6SDan Carpenter 				bvec_kunmap_irq(buf, &flags);
874602adf40SYehuda Sadeh 			}
875602adf40SYehuda Sadeh 			pos += bv->bv_len;
876602adf40SYehuda Sadeh 		}
877602adf40SYehuda Sadeh 
878602adf40SYehuda Sadeh 		chain = chain->bi_next;
879602adf40SYehuda Sadeh 	}
880602adf40SYehuda Sadeh }
881602adf40SYehuda Sadeh 
882602adf40SYehuda Sadeh /*
883f7760dadSAlex Elder  * Clone a portion of a bio, starting at the given byte offset
884f7760dadSAlex Elder  * and continuing for the number of bytes indicated.
885602adf40SYehuda Sadeh  */
886f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src,
887f7760dadSAlex Elder 					unsigned int offset,
888f7760dadSAlex Elder 					unsigned int len,
889f7760dadSAlex Elder 					gfp_t gfpmask)
890602adf40SYehuda Sadeh {
891f7760dadSAlex Elder 	struct bio_vec *bv;
892f7760dadSAlex Elder 	unsigned int resid;
893f7760dadSAlex Elder 	unsigned short idx;
894f7760dadSAlex Elder 	unsigned int voff;
895f7760dadSAlex Elder 	unsigned short end_idx;
896f7760dadSAlex Elder 	unsigned short vcnt;
897f7760dadSAlex Elder 	struct bio *bio;
898602adf40SYehuda Sadeh 
899f7760dadSAlex Elder 	/* Handle the easy case for the caller */
900f7760dadSAlex Elder 
901f7760dadSAlex Elder 	if (!offset && len == bio_src->bi_size)
902f7760dadSAlex Elder 		return bio_clone(bio_src, gfpmask);
903f7760dadSAlex Elder 
904f7760dadSAlex Elder 	if (WARN_ON_ONCE(!len))
905f7760dadSAlex Elder 		return NULL;
906f7760dadSAlex Elder 	if (WARN_ON_ONCE(len > bio_src->bi_size))
907f7760dadSAlex Elder 		return NULL;
908f7760dadSAlex Elder 	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
909f7760dadSAlex Elder 		return NULL;
910f7760dadSAlex Elder 
911f7760dadSAlex Elder 	/* Find first affected segment... */
912f7760dadSAlex Elder 
913f7760dadSAlex Elder 	resid = offset;
914f7760dadSAlex Elder 	__bio_for_each_segment(bv, bio_src, idx, 0) {
915f7760dadSAlex Elder 		if (resid < bv->bv_len)
916f7760dadSAlex Elder 			break;
917f7760dadSAlex Elder 		resid -= bv->bv_len;
918602adf40SYehuda Sadeh 	}
919f7760dadSAlex Elder 	voff = resid;
920602adf40SYehuda Sadeh 
921f7760dadSAlex Elder 	/* ...and the last affected segment */
922542582fcSAlex Elder 
923f7760dadSAlex Elder 	resid += len;
924f7760dadSAlex Elder 	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
925f7760dadSAlex Elder 		if (resid <= bv->bv_len)
926f7760dadSAlex Elder 			break;
927f7760dadSAlex Elder 		resid -= bv->bv_len;
928f7760dadSAlex Elder 	}
929f7760dadSAlex Elder 	vcnt = end_idx - idx + 1;
930602adf40SYehuda Sadeh 
931f7760dadSAlex Elder 	/* Build the clone */
932f7760dadSAlex Elder 
933f7760dadSAlex Elder 	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
934f7760dadSAlex Elder 	if (!bio)
935f7760dadSAlex Elder 		return NULL;	/* ENOMEM */
936f7760dadSAlex Elder 
937f7760dadSAlex Elder 	bio->bi_bdev = bio_src->bi_bdev;
938f7760dadSAlex Elder 	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
939f7760dadSAlex Elder 	bio->bi_rw = bio_src->bi_rw;
940f7760dadSAlex Elder 	bio->bi_flags |= 1 << BIO_CLONED;
941602adf40SYehuda Sadeh 
942602adf40SYehuda Sadeh 	/*
943f7760dadSAlex Elder 	 * Copy over our part of the bio_vec, then update the first
944f7760dadSAlex Elder 	 * and last (or only) entries.
945602adf40SYehuda Sadeh 	 */
946f7760dadSAlex Elder 	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
947f7760dadSAlex Elder 			vcnt * sizeof (struct bio_vec));
948f7760dadSAlex Elder 	bio->bi_io_vec[0].bv_offset += voff;
949f7760dadSAlex Elder 	if (vcnt > 1) {
950f7760dadSAlex Elder 		bio->bi_io_vec[0].bv_len -= voff;
951f7760dadSAlex Elder 		bio->bi_io_vec[vcnt - 1].bv_len = resid;
952602adf40SYehuda Sadeh 	} else {
953f7760dadSAlex Elder 		bio->bi_io_vec[0].bv_len = len;
954602adf40SYehuda Sadeh 	}
955602adf40SYehuda Sadeh 
956f7760dadSAlex Elder 	bio->bi_vcnt = vcnt;
957f7760dadSAlex Elder 	bio->bi_size = len;
958f7760dadSAlex Elder 	bio->bi_idx = 0;
959602adf40SYehuda Sadeh 
960f7760dadSAlex Elder 	return bio;
961602adf40SYehuda Sadeh }
962602adf40SYehuda Sadeh 
963f7760dadSAlex Elder /*
964f7760dadSAlex Elder  * Clone a portion of a bio chain, starting at the given byte offset
965f7760dadSAlex Elder  * into the first bio in the source chain and continuing for the
966f7760dadSAlex Elder  * number of bytes indicated.  The result is another bio chain of
967f7760dadSAlex Elder  * exactly the given length, or a null pointer on error.
968f7760dadSAlex Elder  *
969f7760dadSAlex Elder  * The bio_src and offset parameters are both in-out.  On entry they
970f7760dadSAlex Elder  * refer to the first source bio and the offset into that bio where
971f7760dadSAlex Elder  * the start of data to be cloned is located.
972f7760dadSAlex Elder  *
973f7760dadSAlex Elder  * On return, bio_src is updated to refer to the bio in the source
974f7760dadSAlex Elder  * chain that contains first un-cloned byte, and *offset will
975f7760dadSAlex Elder  * contain the offset of that byte within that bio.
976f7760dadSAlex Elder  */
977f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src,
978f7760dadSAlex Elder 					unsigned int *offset,
979f7760dadSAlex Elder 					unsigned int len,
980f7760dadSAlex Elder 					gfp_t gfpmask)
981f7760dadSAlex Elder {
982f7760dadSAlex Elder 	struct bio *bi = *bio_src;
983f7760dadSAlex Elder 	unsigned int off = *offset;
984f7760dadSAlex Elder 	struct bio *chain = NULL;
985f7760dadSAlex Elder 	struct bio **end;
986602adf40SYehuda Sadeh 
987f7760dadSAlex Elder 	/* Build up a chain of clone bios up to the limit */
988602adf40SYehuda Sadeh 
989f7760dadSAlex Elder 	if (!bi || off >= bi->bi_size || !len)
990f7760dadSAlex Elder 		return NULL;		/* Nothing to clone */
991602adf40SYehuda Sadeh 
992f7760dadSAlex Elder 	end = &chain;
993f7760dadSAlex Elder 	while (len) {
994f7760dadSAlex Elder 		unsigned int bi_size;
995f7760dadSAlex Elder 		struct bio *bio;
996f7760dadSAlex Elder 
997f5400b7aSAlex Elder 		if (!bi) {
998f5400b7aSAlex Elder 			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
999f7760dadSAlex Elder 			goto out_err;	/* EINVAL; ran out of bio's */
1000f5400b7aSAlex Elder 		}
1001f7760dadSAlex Elder 		bi_size = min_t(unsigned int, bi->bi_size - off, len);
1002f7760dadSAlex Elder 		bio = bio_clone_range(bi, off, bi_size, gfpmask);
1003f7760dadSAlex Elder 		if (!bio)
1004f7760dadSAlex Elder 			goto out_err;	/* ENOMEM */
1005f7760dadSAlex Elder 
1006f7760dadSAlex Elder 		*end = bio;
1007f7760dadSAlex Elder 		end = &bio->bi_next;
1008f7760dadSAlex Elder 
1009f7760dadSAlex Elder 		off += bi_size;
1010f7760dadSAlex Elder 		if (off == bi->bi_size) {
1011f7760dadSAlex Elder 			bi = bi->bi_next;
1012f7760dadSAlex Elder 			off = 0;
1013f7760dadSAlex Elder 		}
1014f7760dadSAlex Elder 		len -= bi_size;
1015f7760dadSAlex Elder 	}
1016f7760dadSAlex Elder 	*bio_src = bi;
1017f7760dadSAlex Elder 	*offset = off;
1018f7760dadSAlex Elder 
1019f7760dadSAlex Elder 	return chain;
1020f7760dadSAlex Elder out_err:
1021f7760dadSAlex Elder 	bio_chain_put(chain);
1022f7760dadSAlex Elder 
1023602adf40SYehuda Sadeh 	return NULL;
1024602adf40SYehuda Sadeh }
1025602adf40SYehuda Sadeh 
1026602adf40SYehuda Sadeh /*
1027602adf40SYehuda Sadeh  * helpers for osd request op vectors.
1028602adf40SYehuda Sadeh  */
102957cfc106SAlex Elder static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
103057cfc106SAlex Elder 					int opcode, u32 payload_len)
1031602adf40SYehuda Sadeh {
103257cfc106SAlex Elder 	struct ceph_osd_req_op *ops;
103357cfc106SAlex Elder 
103457cfc106SAlex Elder 	ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
103557cfc106SAlex Elder 	if (!ops)
103657cfc106SAlex Elder 		return NULL;
103757cfc106SAlex Elder 
103857cfc106SAlex Elder 	ops[0].op = opcode;
103957cfc106SAlex Elder 
1040602adf40SYehuda Sadeh 	/*
1041602adf40SYehuda Sadeh 	 * op extent offset and length will be set later on
1042602adf40SYehuda Sadeh 	 * in calc_raw_layout()
1043602adf40SYehuda Sadeh 	 */
104457cfc106SAlex Elder 	ops[0].payload_len = payload_len;
104557cfc106SAlex Elder 
104657cfc106SAlex Elder 	return ops;
1047602adf40SYehuda Sadeh }
1048602adf40SYehuda Sadeh 
/*
 * Free an op vector obtained from rbd_create_rw_ops().
 */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
1053602adf40SYehuda Sadeh 
10541fec7093SYehuda Sadeh static void rbd_coll_end_req_index(struct request *rq,
10551fec7093SYehuda Sadeh 				   struct rbd_req_coll *coll,
10561fec7093SYehuda Sadeh 				   int index,
10578986cb37SAlex Elder 				   s32 ret, u64 len)
10581fec7093SYehuda Sadeh {
10591fec7093SYehuda Sadeh 	struct request_queue *q;
10601fec7093SYehuda Sadeh 	int min, max, i;
10611fec7093SYehuda Sadeh 
1062bd919d45SAlex Elder 	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
10638986cb37SAlex Elder 	     coll, index, (int)ret, (unsigned long long)len);
10641fec7093SYehuda Sadeh 
10651fec7093SYehuda Sadeh 	if (!rq)
10661fec7093SYehuda Sadeh 		return;
10671fec7093SYehuda Sadeh 
10681fec7093SYehuda Sadeh 	if (!coll) {
10691fec7093SYehuda Sadeh 		blk_end_request(rq, ret, len);
10701fec7093SYehuda Sadeh 		return;
10711fec7093SYehuda Sadeh 	}
10721fec7093SYehuda Sadeh 
10731fec7093SYehuda Sadeh 	q = rq->q;
10741fec7093SYehuda Sadeh 
10751fec7093SYehuda Sadeh 	spin_lock_irq(q->queue_lock);
10761fec7093SYehuda Sadeh 	coll->status[index].done = 1;
10771fec7093SYehuda Sadeh 	coll->status[index].rc = ret;
10781fec7093SYehuda Sadeh 	coll->status[index].bytes = len;
10791fec7093SYehuda Sadeh 	max = min = coll->num_done;
10801fec7093SYehuda Sadeh 	while (max < coll->total && coll->status[max].done)
10811fec7093SYehuda Sadeh 		max++;
10821fec7093SYehuda Sadeh 
10831fec7093SYehuda Sadeh 	for (i = min; i<max; i++) {
10848986cb37SAlex Elder 		__blk_end_request(rq, (int)coll->status[i].rc,
10851fec7093SYehuda Sadeh 				  coll->status[i].bytes);
10861fec7093SYehuda Sadeh 		coll->num_done++;
10871fec7093SYehuda Sadeh 		kref_put(&coll->kref, rbd_coll_release);
10881fec7093SYehuda Sadeh 	}
10891fec7093SYehuda Sadeh 	spin_unlock_irq(q->queue_lock);
10901fec7093SYehuda Sadeh }
10911fec7093SYehuda Sadeh 
1092725afc97SAlex Elder static void rbd_coll_end_req(struct rbd_request *rbd_req,
10938986cb37SAlex Elder 			     s32 ret, u64 len)
10941fec7093SYehuda Sadeh {
1095725afc97SAlex Elder 	rbd_coll_end_req_index(rbd_req->rq,
1096725afc97SAlex Elder 				rbd_req->coll, rbd_req->coll_index,
1097725afc97SAlex Elder 				ret, len);
10981fec7093SYehuda Sadeh }
10991fec7093SYehuda Sadeh 
11000ec8ce87SAlex Elder static void rbd_layout_init(struct ceph_file_layout *layout, u64 pool_id)
11010ec8ce87SAlex Elder {
11020ec8ce87SAlex Elder 	memset(layout, 0, sizeof (*layout));
11030ec8ce87SAlex Elder 	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
11040ec8ce87SAlex Elder 	layout->fl_stripe_count = cpu_to_le32(1);
11050ec8ce87SAlex Elder 	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
11060ec8ce87SAlex Elder 	rbd_assert(pool_id <= (u64) U32_MAX);
11070ec8ce87SAlex Elder 	layout->fl_pg_pool = cpu_to_le32((u32) pool_id);
11080ec8ce87SAlex Elder }
11090ec8ce87SAlex Elder 
1110602adf40SYehuda Sadeh /*
1111602adf40SYehuda Sadeh  * Send ceph osd request
1112602adf40SYehuda Sadeh  */
1113602adf40SYehuda Sadeh static int rbd_do_request(struct request *rq,
11140ce1a794SAlex Elder 			  struct rbd_device *rbd_dev,
1115602adf40SYehuda Sadeh 			  struct ceph_snap_context *snapc,
1116602adf40SYehuda Sadeh 			  u64 snapid,
1117aded07eaSAlex Elder 			  const char *object_name, u64 ofs, u64 len,
1118602adf40SYehuda Sadeh 			  struct bio *bio,
1119602adf40SYehuda Sadeh 			  struct page **pages,
1120602adf40SYehuda Sadeh 			  int num_pages,
1121602adf40SYehuda Sadeh 			  int flags,
1122602adf40SYehuda Sadeh 			  struct ceph_osd_req_op *ops,
11231fec7093SYehuda Sadeh 			  struct rbd_req_coll *coll,
11241fec7093SYehuda Sadeh 			  int coll_index,
11255f29ddd4SAlex Elder 			  void (*rbd_cb)(struct ceph_osd_request *,
11265f29ddd4SAlex Elder 					 struct ceph_msg *),
112759c2be1eSYehuda Sadeh 			  struct ceph_osd_request **linger_req,
112859c2be1eSYehuda Sadeh 			  u64 *ver)
1129602adf40SYehuda Sadeh {
11305f29ddd4SAlex Elder 	struct ceph_osd_request *osd_req;
1131602adf40SYehuda Sadeh 	int ret;
1132602adf40SYehuda Sadeh 	u64 bno;
1133602adf40SYehuda Sadeh 	struct timespec mtime = CURRENT_TIME;
1134725afc97SAlex Elder 	struct rbd_request *rbd_req;
1135602adf40SYehuda Sadeh 	struct ceph_osd_request_head *reqhead;
11361dbb4399SAlex Elder 	struct ceph_osd_client *osdc;
1137602adf40SYehuda Sadeh 
1138725afc97SAlex Elder 	rbd_req = kzalloc(sizeof(*rbd_req), GFP_NOIO);
1139cd323ac0SAlex Elder 	if (!rbd_req)
11401fec7093SYehuda Sadeh 		return -ENOMEM;
1141602adf40SYehuda Sadeh 
11421fec7093SYehuda Sadeh 	if (coll) {
1143725afc97SAlex Elder 		rbd_req->coll = coll;
1144725afc97SAlex Elder 		rbd_req->coll_index = coll_index;
11451fec7093SYehuda Sadeh 	}
11461fec7093SYehuda Sadeh 
1147f7760dadSAlex Elder 	dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
1148f7760dadSAlex Elder 		object_name, (unsigned long long) ofs,
1149f7760dadSAlex Elder 		(unsigned long long) len, coll, coll_index);
1150602adf40SYehuda Sadeh 
11510ce1a794SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
11525f29ddd4SAlex Elder 	osd_req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
11531dbb4399SAlex Elder 					false, GFP_NOIO, pages, bio);
11545f29ddd4SAlex Elder 	if (!osd_req) {
11554ad12621SSage Weil 		ret = -ENOMEM;
1156602adf40SYehuda Sadeh 		goto done_pages;
1157602adf40SYehuda Sadeh 	}
1158602adf40SYehuda Sadeh 
11595f29ddd4SAlex Elder 	osd_req->r_callback = rbd_cb;
1160602adf40SYehuda Sadeh 
1161725afc97SAlex Elder 	rbd_req->rq = rq;
1162725afc97SAlex Elder 	rbd_req->bio = bio;
1163725afc97SAlex Elder 	rbd_req->pages = pages;
1164725afc97SAlex Elder 	rbd_req->len = len;
1165602adf40SYehuda Sadeh 
11665f29ddd4SAlex Elder 	osd_req->r_priv = rbd_req;
1167602adf40SYehuda Sadeh 
11685f29ddd4SAlex Elder 	reqhead = osd_req->r_request->front.iov_base;
1169602adf40SYehuda Sadeh 	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
1170602adf40SYehuda Sadeh 
11715f29ddd4SAlex Elder 	strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid));
11725f29ddd4SAlex Elder 	osd_req->r_oid_len = strlen(osd_req->r_oid);
1173602adf40SYehuda Sadeh 
11740ec8ce87SAlex Elder 	rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id);
11750ec8ce87SAlex Elder 	ret = ceph_calc_raw_layout(osdc, &osd_req->r_file_layout,
11760ec8ce87SAlex Elder 				snapid, ofs, &len, &bno, osd_req, ops);
11776cae3717SSage Weil 	rbd_assert(ret == 0);
1178602adf40SYehuda Sadeh 
1179af77f26cSAlex Elder 	ceph_osdc_build_request(osd_req, ofs, &len, ops, snapc, &mtime);
1180602adf40SYehuda Sadeh 
118159c2be1eSYehuda Sadeh 	if (linger_req) {
11825f29ddd4SAlex Elder 		ceph_osdc_set_request_linger(osdc, osd_req);
11835f29ddd4SAlex Elder 		*linger_req = osd_req;
118459c2be1eSYehuda Sadeh 	}
118559c2be1eSYehuda Sadeh 
11865f29ddd4SAlex Elder 	ret = ceph_osdc_start_request(osdc, osd_req, false);
1187602adf40SYehuda Sadeh 	if (ret < 0)
1188602adf40SYehuda Sadeh 		goto done_err;
1189602adf40SYehuda Sadeh 
1190602adf40SYehuda Sadeh 	if (!rbd_cb) {
11915f29ddd4SAlex Elder 		u64 version;
11925f29ddd4SAlex Elder 
11935f29ddd4SAlex Elder 		ret = ceph_osdc_wait_request(osdc, osd_req);
11945f29ddd4SAlex Elder 		version = le64_to_cpu(osd_req->r_reassert_version.version);
119559c2be1eSYehuda Sadeh 		if (ver)
11965f29ddd4SAlex Elder 			*ver = version;
11975f29ddd4SAlex Elder 		dout("reassert_ver=%llu\n", (unsigned long long) version);
11985f29ddd4SAlex Elder 		ceph_osdc_put_request(osd_req);
1199602adf40SYehuda Sadeh 	}
1200602adf40SYehuda Sadeh 	return ret;
1201602adf40SYehuda Sadeh 
1202602adf40SYehuda Sadeh done_err:
1203725afc97SAlex Elder 	bio_chain_put(rbd_req->bio);
12045f29ddd4SAlex Elder 	ceph_osdc_put_request(osd_req);
1205602adf40SYehuda Sadeh done_pages:
1206725afc97SAlex Elder 	kfree(rbd_req);
1207602adf40SYehuda Sadeh 	return ret;
1208602adf40SYehuda Sadeh }
1209602adf40SYehuda Sadeh 
1210602adf40SYehuda Sadeh /*
1211602adf40SYehuda Sadeh  * Ceph osd op callback
1212602adf40SYehuda Sadeh  */
12135f29ddd4SAlex Elder static void rbd_req_cb(struct ceph_osd_request *osd_req, struct ceph_msg *msg)
1214602adf40SYehuda Sadeh {
12155f29ddd4SAlex Elder 	struct rbd_request *rbd_req = osd_req->r_priv;
1216602adf40SYehuda Sadeh 	struct ceph_osd_reply_head *replyhead;
1217602adf40SYehuda Sadeh 	struct ceph_osd_op *op;
12188986cb37SAlex Elder 	s32 rc;
1219602adf40SYehuda Sadeh 	u64 bytes;
1220602adf40SYehuda Sadeh 	int read_op;
1221602adf40SYehuda Sadeh 
1222602adf40SYehuda Sadeh 	/* parse reply */
1223602adf40SYehuda Sadeh 	replyhead = msg->front.iov_base;
1224602adf40SYehuda Sadeh 	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
1225602adf40SYehuda Sadeh 	op = (void *)(replyhead + 1);
12268986cb37SAlex Elder 	rc = (s32)le32_to_cpu(replyhead->result);
1227602adf40SYehuda Sadeh 	bytes = le64_to_cpu(op->extent.length);
1228895cfcc8SDan Carpenter 	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
1229602adf40SYehuda Sadeh 
1230bd919d45SAlex Elder 	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
1231bd919d45SAlex Elder 		(unsigned long long) bytes, read_op, (int) rc);
1232602adf40SYehuda Sadeh 
12338986cb37SAlex Elder 	if (rc == (s32)-ENOENT && read_op) {
1234725afc97SAlex Elder 		zero_bio_chain(rbd_req->bio, 0);
1235602adf40SYehuda Sadeh 		rc = 0;
1236725afc97SAlex Elder 	} else if (rc == 0 && read_op && bytes < rbd_req->len) {
1237725afc97SAlex Elder 		zero_bio_chain(rbd_req->bio, bytes);
1238725afc97SAlex Elder 		bytes = rbd_req->len;
1239602adf40SYehuda Sadeh 	}
1240602adf40SYehuda Sadeh 
1241725afc97SAlex Elder 	rbd_coll_end_req(rbd_req, rc, bytes);
1242602adf40SYehuda Sadeh 
1243725afc97SAlex Elder 	if (rbd_req->bio)
1244725afc97SAlex Elder 		bio_chain_put(rbd_req->bio);
1245602adf40SYehuda Sadeh 
12465f29ddd4SAlex Elder 	ceph_osdc_put_request(osd_req);
1247725afc97SAlex Elder 	kfree(rbd_req);
1248602adf40SYehuda Sadeh }
1249602adf40SYehuda Sadeh 
/*
 * Minimal osd request callback: ignores the reply message and just
 * drops a reference on the osd request.
 *
 * NOTE(review): osd_req->r_priv is not released here -- confirm who
 * owns it for requests completed through this callback.
 */
static void rbd_simple_req_cb(struct ceph_osd_request *osd_req,
				struct ceph_msg *msg)
{
	ceph_osdc_put_request(osd_req);
}
125559c2be1eSYehuda Sadeh 
1256602adf40SYehuda Sadeh /*
1257602adf40SYehuda Sadeh  * Do a synchronous ceph osd operation
1258602adf40SYehuda Sadeh  */
12590ce1a794SAlex Elder static int rbd_req_sync_op(struct rbd_device *rbd_dev,
1260602adf40SYehuda Sadeh 			   struct ceph_snap_context *snapc,
1261602adf40SYehuda Sadeh 			   u64 snapid,
1262602adf40SYehuda Sadeh 			   int flags,
1263913d2fdcSAlex Elder 			   struct ceph_osd_req_op *ops,
1264aded07eaSAlex Elder 			   const char *object_name,
1265f8d4de6eSAlex Elder 			   u64 ofs, u64 inbound_size,
1266f8d4de6eSAlex Elder 			   char *inbound,
126759c2be1eSYehuda Sadeh 			   struct ceph_osd_request **linger_req,
126859c2be1eSYehuda Sadeh 			   u64 *ver)
1269602adf40SYehuda Sadeh {
1270602adf40SYehuda Sadeh 	int ret;
1271602adf40SYehuda Sadeh 	struct page **pages;
1272602adf40SYehuda Sadeh 	int num_pages;
1273913d2fdcSAlex Elder 
1274aafb230eSAlex Elder 	rbd_assert(ops != NULL);
1275602adf40SYehuda Sadeh 
1276f8d4de6eSAlex Elder 	num_pages = calc_pages_for(ofs, inbound_size);
1277602adf40SYehuda Sadeh 	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
1278b8d0638aSDan Carpenter 	if (IS_ERR(pages))
1279b8d0638aSDan Carpenter 		return PTR_ERR(pages);
1280602adf40SYehuda Sadeh 
12810ce1a794SAlex Elder 	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
1282f8d4de6eSAlex Elder 			  object_name, ofs, inbound_size, NULL,
1283602adf40SYehuda Sadeh 			  pages, num_pages,
1284602adf40SYehuda Sadeh 			  flags,
1285602adf40SYehuda Sadeh 			  ops,
12861fec7093SYehuda Sadeh 			  NULL, 0,
128759c2be1eSYehuda Sadeh 			  NULL,
128859c2be1eSYehuda Sadeh 			  linger_req, ver);
1289602adf40SYehuda Sadeh 	if (ret < 0)
1290913d2fdcSAlex Elder 		goto done;
1291602adf40SYehuda Sadeh 
1292f8d4de6eSAlex Elder 	if ((flags & CEPH_OSD_FLAG_READ) && inbound)
1293f8d4de6eSAlex Elder 		ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);
1294602adf40SYehuda Sadeh 
1295602adf40SYehuda Sadeh done:
1296602adf40SYehuda Sadeh 	ceph_release_page_vector(pages, num_pages);
1297602adf40SYehuda Sadeh 	return ret;
1298602adf40SYehuda Sadeh }
1299602adf40SYehuda Sadeh 
1300602adf40SYehuda Sadeh /*
1301602adf40SYehuda Sadeh  * Do an asynchronous ceph osd operation
1302602adf40SYehuda Sadeh  */
1303602adf40SYehuda Sadeh static int rbd_do_op(struct request *rq,
1304602adf40SYehuda Sadeh 		     struct rbd_device *rbd_dev,
1305602adf40SYehuda Sadeh 		     struct ceph_snap_context *snapc,
1306602adf40SYehuda Sadeh 		     u64 ofs, u64 len,
13071fec7093SYehuda Sadeh 		     struct bio *bio,
13081fec7093SYehuda Sadeh 		     struct rbd_req_coll *coll,
13091fec7093SYehuda Sadeh 		     int coll_index)
1310602adf40SYehuda Sadeh {
1311602adf40SYehuda Sadeh 	char *seg_name;
1312602adf40SYehuda Sadeh 	u64 seg_ofs;
1313602adf40SYehuda Sadeh 	u64 seg_len;
1314602adf40SYehuda Sadeh 	int ret;
1315602adf40SYehuda Sadeh 	struct ceph_osd_req_op *ops;
1316602adf40SYehuda Sadeh 	u32 payload_len;
1317ff2e4bb5SAlex Elder 	int opcode;
1318ff2e4bb5SAlex Elder 	int flags;
13194634246dSAlex Elder 	u64 snapid;
1320602adf40SYehuda Sadeh 
132165ccfe21SAlex Elder 	seg_name = rbd_segment_name(rbd_dev, ofs);
1322602adf40SYehuda Sadeh 	if (!seg_name)
1323602adf40SYehuda Sadeh 		return -ENOMEM;
132465ccfe21SAlex Elder 	seg_len = rbd_segment_length(rbd_dev, ofs, len);
132565ccfe21SAlex Elder 	seg_ofs = rbd_segment_offset(rbd_dev, ofs);
1326602adf40SYehuda Sadeh 
1327ff2e4bb5SAlex Elder 	if (rq_data_dir(rq) == WRITE) {
1328ff2e4bb5SAlex Elder 		opcode = CEPH_OSD_OP_WRITE;
1329ff2e4bb5SAlex Elder 		flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
13304634246dSAlex Elder 		snapid = CEPH_NOSNAP;
1331ff2e4bb5SAlex Elder 		payload_len = seg_len;
1332ff2e4bb5SAlex Elder 	} else {
1333ff2e4bb5SAlex Elder 		opcode = CEPH_OSD_OP_READ;
1334ff2e4bb5SAlex Elder 		flags = CEPH_OSD_FLAG_READ;
1335a7b4c65fSAlex Elder 		rbd_assert(!snapc);
13360d7dbfceSAlex Elder 		snapid = rbd_dev->spec->snap_id;
1337ff2e4bb5SAlex Elder 		payload_len = 0;
1338ff2e4bb5SAlex Elder 	}
1339602adf40SYehuda Sadeh 
134057cfc106SAlex Elder 	ret = -ENOMEM;
134157cfc106SAlex Elder 	ops = rbd_create_rw_ops(1, opcode, payload_len);
134257cfc106SAlex Elder 	if (!ops)
1343602adf40SYehuda Sadeh 		goto done;
1344602adf40SYehuda Sadeh 
1345602adf40SYehuda Sadeh 	/* we've taken care of segment sizes earlier when we
1346602adf40SYehuda Sadeh 	   cloned the bios. We should never have a segment
1347602adf40SYehuda Sadeh 	   truncated at this point */
1348aafb230eSAlex Elder 	rbd_assert(seg_len == len);
1349602adf40SYehuda Sadeh 
1350602adf40SYehuda Sadeh 	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1351602adf40SYehuda Sadeh 			     seg_name, seg_ofs, seg_len,
1352602adf40SYehuda Sadeh 			     bio,
1353602adf40SYehuda Sadeh 			     NULL, 0,
1354602adf40SYehuda Sadeh 			     flags,
1355602adf40SYehuda Sadeh 			     ops,
13561fec7093SYehuda Sadeh 			     coll, coll_index,
135759c2be1eSYehuda Sadeh 			     rbd_req_cb, 0, NULL);
1358cd323ac0SAlex Elder 	if (ret < 0)
1359cd323ac0SAlex Elder 		rbd_coll_end_req_index(rq, coll, coll_index,
1360cd323ac0SAlex Elder 					(s32)ret, seg_len);
136111f77002SSage Weil 	rbd_destroy_ops(ops);
1362602adf40SYehuda Sadeh done:
1363602adf40SYehuda Sadeh 	kfree(seg_name);
1364602adf40SYehuda Sadeh 	return ret;
1365602adf40SYehuda Sadeh }
1366602adf40SYehuda Sadeh 
1367602adf40SYehuda Sadeh /*
1368602adf40SYehuda Sadeh  * Request sync osd read
1369602adf40SYehuda Sadeh  */
13700ce1a794SAlex Elder static int rbd_req_sync_read(struct rbd_device *rbd_dev,
1371602adf40SYehuda Sadeh 			  u64 snapid,
1372aded07eaSAlex Elder 			  const char *object_name,
1373602adf40SYehuda Sadeh 			  u64 ofs, u64 len,
137459c2be1eSYehuda Sadeh 			  char *buf,
137559c2be1eSYehuda Sadeh 			  u64 *ver)
1376602adf40SYehuda Sadeh {
1377913d2fdcSAlex Elder 	struct ceph_osd_req_op *ops;
1378913d2fdcSAlex Elder 	int ret;
1379913d2fdcSAlex Elder 
1380913d2fdcSAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1381913d2fdcSAlex Elder 	if (!ops)
1382913d2fdcSAlex Elder 		return -ENOMEM;
1383913d2fdcSAlex Elder 
1384913d2fdcSAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
1385b06e6a6bSJosh Durgin 			       snapid,
1386602adf40SYehuda Sadeh 			       CEPH_OSD_FLAG_READ,
1387913d2fdcSAlex Elder 			       ops, object_name, ofs, len, buf, NULL, ver);
1388913d2fdcSAlex Elder 	rbd_destroy_ops(ops);
1389913d2fdcSAlex Elder 
1390913d2fdcSAlex Elder 	return ret;
1391602adf40SYehuda Sadeh }
1392602adf40SYehuda Sadeh 
1393602adf40SYehuda Sadeh /*
139459c2be1eSYehuda Sadeh  * Request sync osd watch
139559c2be1eSYehuda Sadeh  */
13960ce1a794SAlex Elder static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
139759c2be1eSYehuda Sadeh 				   u64 ver,
13987f0a24d8SAlex Elder 				   u64 notify_id)
139959c2be1eSYehuda Sadeh {
140059c2be1eSYehuda Sadeh 	struct ceph_osd_req_op *ops;
140111f77002SSage Weil 	int ret;
140211f77002SSage Weil 
140357cfc106SAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
140457cfc106SAlex Elder 	if (!ops)
140557cfc106SAlex Elder 		return -ENOMEM;
140659c2be1eSYehuda Sadeh 
1407a71b891bSJosh Durgin 	ops[0].watch.ver = cpu_to_le64(ver);
140859c2be1eSYehuda Sadeh 	ops[0].watch.cookie = notify_id;
140959c2be1eSYehuda Sadeh 	ops[0].watch.flag = 0;
141059c2be1eSYehuda Sadeh 
14110ce1a794SAlex Elder 	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
14127f0a24d8SAlex Elder 			  rbd_dev->header_name, 0, 0, NULL,
1413ad4f232fSAlex Elder 			  NULL, 0,
141459c2be1eSYehuda Sadeh 			  CEPH_OSD_FLAG_READ,
141559c2be1eSYehuda Sadeh 			  ops,
14161fec7093SYehuda Sadeh 			  NULL, 0,
141759c2be1eSYehuda Sadeh 			  rbd_simple_req_cb, 0, NULL);
141859c2be1eSYehuda Sadeh 
141959c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
142059c2be1eSYehuda Sadeh 	return ret;
142159c2be1eSYehuda Sadeh }
142259c2be1eSYehuda Sadeh 
142359c2be1eSYehuda Sadeh static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
142459c2be1eSYehuda Sadeh {
14250ce1a794SAlex Elder 	struct rbd_device *rbd_dev = (struct rbd_device *)data;
1426a71b891bSJosh Durgin 	u64 hver;
142713143d2dSSage Weil 	int rc;
142813143d2dSSage Weil 
14290ce1a794SAlex Elder 	if (!rbd_dev)
143059c2be1eSYehuda Sadeh 		return;
143159c2be1eSYehuda Sadeh 
1432bd919d45SAlex Elder 	dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1433bd919d45SAlex Elder 		rbd_dev->header_name, (unsigned long long) notify_id,
1434bd919d45SAlex Elder 		(unsigned int) opcode);
1435117973fbSAlex Elder 	rc = rbd_dev_refresh(rbd_dev, &hver);
143613143d2dSSage Weil 	if (rc)
143706ecc6cbSAlex Elder 		rbd_warn(rbd_dev, "got notification but failed to "
143806ecc6cbSAlex Elder 			   " update snaps: %d\n", rc);
143959c2be1eSYehuda Sadeh 
14407f0a24d8SAlex Elder 	rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
144159c2be1eSYehuda Sadeh }
144259c2be1eSYehuda Sadeh 
144359c2be1eSYehuda Sadeh /*
144459c2be1eSYehuda Sadeh  * Request sync osd watch
144559c2be1eSYehuda Sadeh  */
14460e6f322dSAlex Elder static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
144759c2be1eSYehuda Sadeh {
144859c2be1eSYehuda Sadeh 	struct ceph_osd_req_op *ops;
14490ce1a794SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
145057cfc106SAlex Elder 	int ret;
145159c2be1eSYehuda Sadeh 
145257cfc106SAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
145357cfc106SAlex Elder 	if (!ops)
145457cfc106SAlex Elder 		return -ENOMEM;
145559c2be1eSYehuda Sadeh 
145659c2be1eSYehuda Sadeh 	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
14570ce1a794SAlex Elder 				     (void *)rbd_dev, &rbd_dev->watch_event);
145859c2be1eSYehuda Sadeh 	if (ret < 0)
145959c2be1eSYehuda Sadeh 		goto fail;
146059c2be1eSYehuda Sadeh 
14610e6f322dSAlex Elder 	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
14620ce1a794SAlex Elder 	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
146359c2be1eSYehuda Sadeh 	ops[0].watch.flag = 1;
146459c2be1eSYehuda Sadeh 
14650ce1a794SAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
146659c2be1eSYehuda Sadeh 			      CEPH_NOSNAP,
146759c2be1eSYehuda Sadeh 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
146859c2be1eSYehuda Sadeh 			      ops,
14690e6f322dSAlex Elder 			      rbd_dev->header_name,
14700e6f322dSAlex Elder 			      0, 0, NULL,
14710ce1a794SAlex Elder 			      &rbd_dev->watch_request, NULL);
147259c2be1eSYehuda Sadeh 
147359c2be1eSYehuda Sadeh 	if (ret < 0)
147459c2be1eSYehuda Sadeh 		goto fail_event;
147559c2be1eSYehuda Sadeh 
147659c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
147759c2be1eSYehuda Sadeh 	return 0;
147859c2be1eSYehuda Sadeh 
147959c2be1eSYehuda Sadeh fail_event:
14800ce1a794SAlex Elder 	ceph_osdc_cancel_event(rbd_dev->watch_event);
14810ce1a794SAlex Elder 	rbd_dev->watch_event = NULL;
148259c2be1eSYehuda Sadeh fail:
148359c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
148459c2be1eSYehuda Sadeh 	return ret;
148559c2be1eSYehuda Sadeh }
148659c2be1eSYehuda Sadeh 
148779e3057cSYehuda Sadeh /*
148879e3057cSYehuda Sadeh  * Request sync osd unwatch
148979e3057cSYehuda Sadeh  */
1490070c633fSAlex Elder static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
149179e3057cSYehuda Sadeh {
149279e3057cSYehuda Sadeh 	struct ceph_osd_req_op *ops;
149357cfc106SAlex Elder 	int ret;
149479e3057cSYehuda Sadeh 
149557cfc106SAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
149657cfc106SAlex Elder 	if (!ops)
149757cfc106SAlex Elder 		return -ENOMEM;
149879e3057cSYehuda Sadeh 
149979e3057cSYehuda Sadeh 	ops[0].watch.ver = 0;
15000ce1a794SAlex Elder 	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
150179e3057cSYehuda Sadeh 	ops[0].watch.flag = 0;
150279e3057cSYehuda Sadeh 
15030ce1a794SAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
150479e3057cSYehuda Sadeh 			      CEPH_NOSNAP,
150579e3057cSYehuda Sadeh 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
150679e3057cSYehuda Sadeh 			      ops,
1507070c633fSAlex Elder 			      rbd_dev->header_name,
1508070c633fSAlex Elder 			      0, 0, NULL, NULL, NULL);
1509070c633fSAlex Elder 
151079e3057cSYehuda Sadeh 
151179e3057cSYehuda Sadeh 	rbd_destroy_ops(ops);
15120ce1a794SAlex Elder 	ceph_osdc_cancel_event(rbd_dev->watch_event);
15130ce1a794SAlex Elder 	rbd_dev->watch_event = NULL;
151479e3057cSYehuda Sadeh 	return ret;
151579e3057cSYehuda Sadeh }
151679e3057cSYehuda Sadeh 
151759c2be1eSYehuda Sadeh /*
15183cb4a687SAlex Elder  * Synchronous osd object method call
1519602adf40SYehuda Sadeh  */
15200ce1a794SAlex Elder static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
1521aded07eaSAlex Elder 			     const char *object_name,
1522aded07eaSAlex Elder 			     const char *class_name,
1523aded07eaSAlex Elder 			     const char *method_name,
15243cb4a687SAlex Elder 			     const char *outbound,
15253cb4a687SAlex Elder 			     size_t outbound_size,
1526f8d4de6eSAlex Elder 			     char *inbound,
1527f8d4de6eSAlex Elder 			     size_t inbound_size,
15283cb4a687SAlex Elder 			     int flags,
152959c2be1eSYehuda Sadeh 			     u64 *ver)
1530602adf40SYehuda Sadeh {
1531602adf40SYehuda Sadeh 	struct ceph_osd_req_op *ops;
1532aded07eaSAlex Elder 	int class_name_len = strlen(class_name);
1533aded07eaSAlex Elder 	int method_name_len = strlen(method_name);
15343cb4a687SAlex Elder 	int payload_size;
153557cfc106SAlex Elder 	int ret;
153657cfc106SAlex Elder 
15373cb4a687SAlex Elder 	/*
15383cb4a687SAlex Elder 	 * Any input parameters required by the method we're calling
15393cb4a687SAlex Elder 	 * will be sent along with the class and method names as
15403cb4a687SAlex Elder 	 * part of the message payload.  That data and its size are
15413cb4a687SAlex Elder 	 * supplied via the indata and indata_len fields (named from
15423cb4a687SAlex Elder 	 * the perspective of the server side) in the OSD request
15433cb4a687SAlex Elder 	 * operation.
15443cb4a687SAlex Elder 	 */
15453cb4a687SAlex Elder 	payload_size = class_name_len + method_name_len + outbound_size;
15463cb4a687SAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
154757cfc106SAlex Elder 	if (!ops)
154857cfc106SAlex Elder 		return -ENOMEM;
1549602adf40SYehuda Sadeh 
1550aded07eaSAlex Elder 	ops[0].cls.class_name = class_name;
1551aded07eaSAlex Elder 	ops[0].cls.class_len = (__u8) class_name_len;
1552aded07eaSAlex Elder 	ops[0].cls.method_name = method_name;
1553aded07eaSAlex Elder 	ops[0].cls.method_len = (__u8) method_name_len;
1554602adf40SYehuda Sadeh 	ops[0].cls.argc = 0;
15553cb4a687SAlex Elder 	ops[0].cls.indata = outbound;
15563cb4a687SAlex Elder 	ops[0].cls.indata_len = outbound_size;
1557602adf40SYehuda Sadeh 
15580ce1a794SAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
1559602adf40SYehuda Sadeh 			       CEPH_NOSNAP,
15603cb4a687SAlex Elder 			       flags, ops,
1561f8d4de6eSAlex Elder 			       object_name, 0, inbound_size, inbound,
1562f8d4de6eSAlex Elder 			       NULL, ver);
1563602adf40SYehuda Sadeh 
1564602adf40SYehuda Sadeh 	rbd_destroy_ops(ops);
1565602adf40SYehuda Sadeh 
1566602adf40SYehuda Sadeh 	dout("cls_exec returned %d\n", ret);
1567602adf40SYehuda Sadeh 	return ret;
1568602adf40SYehuda Sadeh }
1569602adf40SYehuda Sadeh 
15701fec7093SYehuda Sadeh static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
15711fec7093SYehuda Sadeh {
15721fec7093SYehuda Sadeh 	struct rbd_req_coll *coll =
15731fec7093SYehuda Sadeh 			kzalloc(sizeof(struct rbd_req_coll) +
15741fec7093SYehuda Sadeh 			        sizeof(struct rbd_req_status) * num_reqs,
15751fec7093SYehuda Sadeh 				GFP_ATOMIC);
15761fec7093SYehuda Sadeh 
15771fec7093SYehuda Sadeh 	if (!coll)
15781fec7093SYehuda Sadeh 		return NULL;
15791fec7093SYehuda Sadeh 	coll->total = num_reqs;
15801fec7093SYehuda Sadeh 	kref_init(&coll->kref);
15811fec7093SYehuda Sadeh 	return coll;
15821fec7093SYehuda Sadeh }
15831fec7093SYehuda Sadeh 
15848295cda7SAlex Elder static int rbd_dev_do_request(struct request *rq,
15858295cda7SAlex Elder 				struct rbd_device *rbd_dev,
15868295cda7SAlex Elder 				struct ceph_snap_context *snapc,
15878295cda7SAlex Elder 				u64 ofs, unsigned int size,
15888295cda7SAlex Elder 				struct bio *bio_chain)
15898295cda7SAlex Elder {
15908295cda7SAlex Elder 	int num_segs;
15918295cda7SAlex Elder 	struct rbd_req_coll *coll;
15928295cda7SAlex Elder 	unsigned int bio_offset;
15938295cda7SAlex Elder 	int cur_seg = 0;
15948295cda7SAlex Elder 
15958295cda7SAlex Elder 	dout("%s 0x%x bytes at 0x%llx\n",
15968295cda7SAlex Elder 		rq_data_dir(rq) == WRITE ? "write" : "read",
15978295cda7SAlex Elder 		size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
15988295cda7SAlex Elder 
15998295cda7SAlex Elder 	num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
16008295cda7SAlex Elder 	if (num_segs <= 0)
16018295cda7SAlex Elder 		return num_segs;
16028295cda7SAlex Elder 
16038295cda7SAlex Elder 	coll = rbd_alloc_coll(num_segs);
16048295cda7SAlex Elder 	if (!coll)
16058295cda7SAlex Elder 		return -ENOMEM;
16068295cda7SAlex Elder 
16078295cda7SAlex Elder 	bio_offset = 0;
16088295cda7SAlex Elder 	do {
16098295cda7SAlex Elder 		u64 limit = rbd_segment_length(rbd_dev, ofs, size);
16108295cda7SAlex Elder 		unsigned int clone_size;
16118295cda7SAlex Elder 		struct bio *bio_clone;
16128295cda7SAlex Elder 
16138295cda7SAlex Elder 		BUG_ON(limit > (u64)UINT_MAX);
16148295cda7SAlex Elder 		clone_size = (unsigned int)limit;
16158295cda7SAlex Elder 		dout("bio_chain->bi_vcnt=%hu\n", bio_chain->bi_vcnt);
16168295cda7SAlex Elder 
16178295cda7SAlex Elder 		kref_get(&coll->kref);
16188295cda7SAlex Elder 
16198295cda7SAlex Elder 		/* Pass a cloned bio chain via an osd request */
16208295cda7SAlex Elder 
16218295cda7SAlex Elder 		bio_clone = bio_chain_clone_range(&bio_chain,
16228295cda7SAlex Elder 					&bio_offset, clone_size,
16238295cda7SAlex Elder 					GFP_ATOMIC);
16248295cda7SAlex Elder 		if (bio_clone)
16258295cda7SAlex Elder 			(void)rbd_do_op(rq, rbd_dev, snapc,
16268295cda7SAlex Elder 					ofs, clone_size,
16278295cda7SAlex Elder 					bio_clone, coll, cur_seg);
16288295cda7SAlex Elder 		else
16298295cda7SAlex Elder 			rbd_coll_end_req_index(rq, coll, cur_seg,
16308295cda7SAlex Elder 						(s32)-ENOMEM,
16318295cda7SAlex Elder 						clone_size);
16328295cda7SAlex Elder 		size -= clone_size;
16338295cda7SAlex Elder 		ofs += clone_size;
16348295cda7SAlex Elder 
16358295cda7SAlex Elder 		cur_seg++;
16368295cda7SAlex Elder 	} while (size > 0);
16378295cda7SAlex Elder 	kref_put(&coll->kref, rbd_coll_release);
16388295cda7SAlex Elder 
16398295cda7SAlex Elder 	return 0;
16408295cda7SAlex Elder }
16418295cda7SAlex Elder 
1642602adf40SYehuda Sadeh /*
1643602adf40SYehuda Sadeh  * block device queue callback
1644602adf40SYehuda Sadeh  */
1645602adf40SYehuda Sadeh static void rbd_rq_fn(struct request_queue *q)
1646602adf40SYehuda Sadeh {
1647602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
1648b395e8b5SAlex Elder 	bool read_only = rbd_dev->mapping.read_only;
1649602adf40SYehuda Sadeh 	struct request *rq;
1650602adf40SYehuda Sadeh 
165100f1f36fSAlex Elder 	while ((rq = blk_fetch_request(q))) {
1652b395e8b5SAlex Elder 		struct ceph_snap_context *snapc = NULL;
1653b395e8b5SAlex Elder 		unsigned int size = 0;
16548295cda7SAlex Elder 		int result;
1655602adf40SYehuda Sadeh 
1656602adf40SYehuda Sadeh 		dout("fetched request\n");
1657602adf40SYehuda Sadeh 
1658b395e8b5SAlex Elder 		/* Filter out block requests we don't understand */
1659b395e8b5SAlex Elder 
1660602adf40SYehuda Sadeh 		if ((rq->cmd_type != REQ_TYPE_FS)) {
1661602adf40SYehuda Sadeh 			__blk_end_request_all(rq, 0);
166200f1f36fSAlex Elder 			continue;
1663602adf40SYehuda Sadeh 		}
1664602adf40SYehuda Sadeh 		spin_unlock_irq(q->queue_lock);
1665602adf40SYehuda Sadeh 
1666a7b4c65fSAlex Elder 		/* Write requests need a reference to the snapshot context */
1667e88a36ecSJosh Durgin 
1668a7b4c65fSAlex Elder 		if (rq_data_dir(rq) == WRITE) {
1669b395e8b5SAlex Elder 			result = -EROFS;
1670a7b4c65fSAlex Elder 			if (read_only) /* Can't write to a read-only device */
1671b395e8b5SAlex Elder 				goto out_end_request;
1672b395e8b5SAlex Elder 
1673a7b4c65fSAlex Elder 			/*
1674a7b4c65fSAlex Elder 			 * Note that each osd request will take its
1675a7b4c65fSAlex Elder 			 * own reference to the snapshot context
1676a7b4c65fSAlex Elder 			 * supplied.  The reference we take here
1677a7b4c65fSAlex Elder 			 * just guarantees the one we provide stays
1678a7b4c65fSAlex Elder 			 * valid.
1679a7b4c65fSAlex Elder 			 */
1680b395e8b5SAlex Elder 			down_read(&rbd_dev->header_rwsem);
1681b395e8b5SAlex Elder 			snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1682d1d25646SJosh Durgin 			up_read(&rbd_dev->header_rwsem);
1683a7b4c65fSAlex Elder 			rbd_assert(snapc != NULL);
1684a7b4c65fSAlex Elder 		} else if (!atomic_read(&rbd_dev->exists)) {
1685b395e8b5SAlex Elder 			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
1686e88a36ecSJosh Durgin 			dout("request for non-existent snapshot");
1687b395e8b5SAlex Elder 			result = -ENXIO;
1688b395e8b5SAlex Elder 			goto out_end_request;
1689e88a36ecSJosh Durgin 		}
1690d1d25646SJosh Durgin 
1691f7760dadSAlex Elder 		size = blk_rq_bytes(rq);
1692b395e8b5SAlex Elder 		result = rbd_dev_do_request(rq, rbd_dev, snapc,
1693b395e8b5SAlex Elder 				blk_rq_pos(rq) * SECTOR_SIZE,
1694b395e8b5SAlex Elder 				size, rq->bio);
1695b395e8b5SAlex Elder out_end_request:
1696a7b4c65fSAlex Elder 		if (snapc)
1697df111be6SAlex Elder 			ceph_put_snap_context(snapc);
16981fec7093SYehuda Sadeh 		spin_lock_irq(q->queue_lock);
16998295cda7SAlex Elder 		if (!size || result < 0)
17008295cda7SAlex Elder 			__blk_end_request_all(rq, result);
1701602adf40SYehuda Sadeh 	}
1702602adf40SYehuda Sadeh }
1703602adf40SYehuda Sadeh 
1704602adf40SYehuda Sadeh /*
1705602adf40SYehuda Sadeh  * a queue callback. Makes sure that we don't create a bio that spans across
1706602adf40SYehuda Sadeh  * multiple osd objects. One exception would be with a single page bios,
1707f7760dadSAlex Elder  * which we handle later at bio_chain_clone_range()
1708602adf40SYehuda Sadeh  */
1709602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1710602adf40SYehuda Sadeh 			  struct bio_vec *bvec)
1711602adf40SYehuda Sadeh {
1712602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
1713e5cfeed2SAlex Elder 	sector_t sector_offset;
1714e5cfeed2SAlex Elder 	sector_t sectors_per_obj;
1715e5cfeed2SAlex Elder 	sector_t obj_sector_offset;
1716e5cfeed2SAlex Elder 	int ret;
1717602adf40SYehuda Sadeh 
1718e5cfeed2SAlex Elder 	/*
1719e5cfeed2SAlex Elder 	 * Find how far into its rbd object the partition-relative
1720e5cfeed2SAlex Elder 	 * bio start sector is to offset relative to the enclosing
1721e5cfeed2SAlex Elder 	 * device.
1722e5cfeed2SAlex Elder 	 */
1723e5cfeed2SAlex Elder 	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
1724e5cfeed2SAlex Elder 	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1725e5cfeed2SAlex Elder 	obj_sector_offset = sector_offset & (sectors_per_obj - 1);
1726593a9e7bSAlex Elder 
1727e5cfeed2SAlex Elder 	/*
1728e5cfeed2SAlex Elder 	 * Compute the number of bytes from that offset to the end
1729e5cfeed2SAlex Elder 	 * of the object.  Account for what's already used by the bio.
1730e5cfeed2SAlex Elder 	 */
1731e5cfeed2SAlex Elder 	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
1732e5cfeed2SAlex Elder 	if (ret > bmd->bi_size)
1733e5cfeed2SAlex Elder 		ret -= bmd->bi_size;
1734e5cfeed2SAlex Elder 	else
1735e5cfeed2SAlex Elder 		ret = 0;
1736e5cfeed2SAlex Elder 
1737e5cfeed2SAlex Elder 	/*
1738e5cfeed2SAlex Elder 	 * Don't send back more than was asked for.  And if the bio
1739e5cfeed2SAlex Elder 	 * was empty, let the whole thing through because:  "Note
1740e5cfeed2SAlex Elder 	 * that a block device *must* allow a single page to be
1741e5cfeed2SAlex Elder 	 * added to an empty bio."
1742e5cfeed2SAlex Elder 	 */
1743e5cfeed2SAlex Elder 	rbd_assert(bvec->bv_len <= PAGE_SIZE);
1744e5cfeed2SAlex Elder 	if (ret > (int) bvec->bv_len || !bmd->bi_size)
1745e5cfeed2SAlex Elder 		ret = (int) bvec->bv_len;
1746e5cfeed2SAlex Elder 
1747e5cfeed2SAlex Elder 	return ret;
1748602adf40SYehuda Sadeh }
1749602adf40SYehuda Sadeh 
1750602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
1751602adf40SYehuda Sadeh {
1752602adf40SYehuda Sadeh 	struct gendisk *disk = rbd_dev->disk;
1753602adf40SYehuda Sadeh 
1754602adf40SYehuda Sadeh 	if (!disk)
1755602adf40SYehuda Sadeh 		return;
1756602adf40SYehuda Sadeh 
1757602adf40SYehuda Sadeh 	if (disk->flags & GENHD_FL_UP)
1758602adf40SYehuda Sadeh 		del_gendisk(disk);
1759602adf40SYehuda Sadeh 	if (disk->queue)
1760602adf40SYehuda Sadeh 		blk_cleanup_queue(disk->queue);
1761602adf40SYehuda Sadeh 	put_disk(disk);
1762602adf40SYehuda Sadeh }
1763602adf40SYehuda Sadeh 
1764602adf40SYehuda Sadeh /*
17654156d998SAlex Elder  * Read the complete header for the given rbd device.
17664156d998SAlex Elder  *
17674156d998SAlex Elder  * Returns a pointer to a dynamically-allocated buffer containing
17684156d998SAlex Elder  * the complete and validated header.  Caller can pass the address
17694156d998SAlex Elder  * of a variable that will be filled in with the version of the
17704156d998SAlex Elder  * header object at the time it was read.
17714156d998SAlex Elder  *
17724156d998SAlex Elder  * Returns a pointer-coded errno if a failure occurs.
17734156d998SAlex Elder  */
17744156d998SAlex Elder static struct rbd_image_header_ondisk *
17754156d998SAlex Elder rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
17764156d998SAlex Elder {
17774156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk = NULL;
17784156d998SAlex Elder 	u32 snap_count = 0;
17794156d998SAlex Elder 	u64 names_size = 0;
17804156d998SAlex Elder 	u32 want_count;
17814156d998SAlex Elder 	int ret;
17824156d998SAlex Elder 
17834156d998SAlex Elder 	/*
17844156d998SAlex Elder 	 * The complete header will include an array of its 64-bit
17854156d998SAlex Elder 	 * snapshot ids, followed by the names of those snapshots as
17864156d998SAlex Elder 	 * a contiguous block of NUL-terminated strings.  Note that
17874156d998SAlex Elder 	 * the number of snapshots could change by the time we read
17884156d998SAlex Elder 	 * it in, in which case we re-read it.
17894156d998SAlex Elder 	 */
17904156d998SAlex Elder 	do {
17914156d998SAlex Elder 		size_t size;
17924156d998SAlex Elder 
17934156d998SAlex Elder 		kfree(ondisk);
17944156d998SAlex Elder 
17954156d998SAlex Elder 		size = sizeof (*ondisk);
17964156d998SAlex Elder 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
17974156d998SAlex Elder 		size += names_size;
17984156d998SAlex Elder 		ondisk = kmalloc(size, GFP_KERNEL);
17994156d998SAlex Elder 		if (!ondisk)
18004156d998SAlex Elder 			return ERR_PTR(-ENOMEM);
18014156d998SAlex Elder 
18024156d998SAlex Elder 		ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
18034156d998SAlex Elder 				       rbd_dev->header_name,
18044156d998SAlex Elder 				       0, size,
18054156d998SAlex Elder 				       (char *) ondisk, version);
18064156d998SAlex Elder 
18074156d998SAlex Elder 		if (ret < 0)
18084156d998SAlex Elder 			goto out_err;
18094156d998SAlex Elder 		if (WARN_ON((size_t) ret < size)) {
18104156d998SAlex Elder 			ret = -ENXIO;
181106ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
181206ecc6cbSAlex Elder 				size, ret);
18134156d998SAlex Elder 			goto out_err;
18144156d998SAlex Elder 		}
18154156d998SAlex Elder 		if (!rbd_dev_ondisk_valid(ondisk)) {
18164156d998SAlex Elder 			ret = -ENXIO;
181706ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "invalid header");
18184156d998SAlex Elder 			goto out_err;
18194156d998SAlex Elder 		}
18204156d998SAlex Elder 
18214156d998SAlex Elder 		names_size = le64_to_cpu(ondisk->snap_names_len);
18224156d998SAlex Elder 		want_count = snap_count;
18234156d998SAlex Elder 		snap_count = le32_to_cpu(ondisk->snap_count);
18244156d998SAlex Elder 	} while (snap_count != want_count);
18254156d998SAlex Elder 
18264156d998SAlex Elder 	return ondisk;
18274156d998SAlex Elder 
18284156d998SAlex Elder out_err:
18294156d998SAlex Elder 	kfree(ondisk);
18304156d998SAlex Elder 
18314156d998SAlex Elder 	return ERR_PTR(ret);
18324156d998SAlex Elder }
18334156d998SAlex Elder 
18344156d998SAlex Elder /*
1835602adf40SYehuda Sadeh  * reload the ondisk the header
1836602adf40SYehuda Sadeh  */
1837602adf40SYehuda Sadeh static int rbd_read_header(struct rbd_device *rbd_dev,
1838602adf40SYehuda Sadeh 			   struct rbd_image_header *header)
1839602adf40SYehuda Sadeh {
18404156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk;
18414156d998SAlex Elder 	u64 ver = 0;
18424156d998SAlex Elder 	int ret;
1843602adf40SYehuda Sadeh 
18444156d998SAlex Elder 	ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
18454156d998SAlex Elder 	if (IS_ERR(ondisk))
18464156d998SAlex Elder 		return PTR_ERR(ondisk);
18474156d998SAlex Elder 	ret = rbd_header_from_disk(header, ondisk);
18484156d998SAlex Elder 	if (ret >= 0)
184959c2be1eSYehuda Sadeh 		header->obj_version = ver;
18504156d998SAlex Elder 	kfree(ondisk);
1851602adf40SYehuda Sadeh 
18524156d998SAlex Elder 	return ret;
1853602adf40SYehuda Sadeh }
1854602adf40SYehuda Sadeh 
185541f38c2bSAlex Elder static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1856dfc5606dSYehuda Sadeh {
1857dfc5606dSYehuda Sadeh 	struct rbd_snap *snap;
1858a0593290SAlex Elder 	struct rbd_snap *next;
1859dfc5606dSYehuda Sadeh 
1860a0593290SAlex Elder 	list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
186141f38c2bSAlex Elder 		rbd_remove_snap_dev(snap);
1862dfc5606dSYehuda Sadeh }
1863dfc5606dSYehuda Sadeh 
18649478554aSAlex Elder static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
18659478554aSAlex Elder {
18669478554aSAlex Elder 	sector_t size;
18679478554aSAlex Elder 
18680d7dbfceSAlex Elder 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
18699478554aSAlex Elder 		return;
18709478554aSAlex Elder 
18719478554aSAlex Elder 	size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
18729478554aSAlex Elder 	dout("setting size to %llu sectors", (unsigned long long) size);
18739478554aSAlex Elder 	rbd_dev->mapping.size = (u64) size;
18749478554aSAlex Elder 	set_capacity(rbd_dev->disk, size);
18759478554aSAlex Elder }
18769478554aSAlex Elder 
1877602adf40SYehuda Sadeh /*
1878602adf40SYehuda Sadeh  * only read the first part of the ondisk header, without the snaps info
1879602adf40SYehuda Sadeh  */
1880117973fbSAlex Elder static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
1881602adf40SYehuda Sadeh {
1882602adf40SYehuda Sadeh 	int ret;
1883602adf40SYehuda Sadeh 	struct rbd_image_header h;
1884602adf40SYehuda Sadeh 
1885602adf40SYehuda Sadeh 	ret = rbd_read_header(rbd_dev, &h);
1886602adf40SYehuda Sadeh 	if (ret < 0)
1887602adf40SYehuda Sadeh 		return ret;
1888602adf40SYehuda Sadeh 
1889a51aa0c0SJosh Durgin 	down_write(&rbd_dev->header_rwsem);
1890a51aa0c0SJosh Durgin 
18919478554aSAlex Elder 	/* Update image size, and check for resize of mapped image */
18929478554aSAlex Elder 	rbd_dev->header.image_size = h.image_size;
18939478554aSAlex Elder 	rbd_update_mapping_size(rbd_dev);
18949db4b3e3SSage Weil 
1895849b4260SAlex Elder 	/* rbd_dev->header.object_prefix shouldn't change */
1896602adf40SYehuda Sadeh 	kfree(rbd_dev->header.snap_sizes);
1897849b4260SAlex Elder 	kfree(rbd_dev->header.snap_names);
1898d1d25646SJosh Durgin 	/* osd requests may still refer to snapc */
1899d1d25646SJosh Durgin 	ceph_put_snap_context(rbd_dev->header.snapc);
1900602adf40SYehuda Sadeh 
1901b813623aSAlex Elder 	if (hver)
1902b813623aSAlex Elder 		*hver = h.obj_version;
1903a71b891bSJosh Durgin 	rbd_dev->header.obj_version = h.obj_version;
190493a24e08SJosh Durgin 	rbd_dev->header.image_size = h.image_size;
1905602adf40SYehuda Sadeh 	rbd_dev->header.snapc = h.snapc;
1906602adf40SYehuda Sadeh 	rbd_dev->header.snap_names = h.snap_names;
1907602adf40SYehuda Sadeh 	rbd_dev->header.snap_sizes = h.snap_sizes;
1908849b4260SAlex Elder 	/* Free the extra copy of the object prefix */
1909849b4260SAlex Elder 	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1910849b4260SAlex Elder 	kfree(h.object_prefix);
1911849b4260SAlex Elder 
1912304f6808SAlex Elder 	ret = rbd_dev_snaps_update(rbd_dev);
1913304f6808SAlex Elder 	if (!ret)
1914304f6808SAlex Elder 		ret = rbd_dev_snaps_register(rbd_dev);
1915dfc5606dSYehuda Sadeh 
1916c666601aSJosh Durgin 	up_write(&rbd_dev->header_rwsem);
1917602adf40SYehuda Sadeh 
1918dfc5606dSYehuda Sadeh 	return ret;
1919602adf40SYehuda Sadeh }
1920602adf40SYehuda Sadeh 
1921117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
19221fe5e993SAlex Elder {
19231fe5e993SAlex Elder 	int ret;
19241fe5e993SAlex Elder 
1925117973fbSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
19261fe5e993SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1927117973fbSAlex Elder 	if (rbd_dev->image_format == 1)
1928117973fbSAlex Elder 		ret = rbd_dev_v1_refresh(rbd_dev, hver);
1929117973fbSAlex Elder 	else
1930117973fbSAlex Elder 		ret = rbd_dev_v2_refresh(rbd_dev, hver);
19311fe5e993SAlex Elder 	mutex_unlock(&ctl_mutex);
19321fe5e993SAlex Elder 
19331fe5e993SAlex Elder 	return ret;
19341fe5e993SAlex Elder }
19351fe5e993SAlex Elder 
1936602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
1937602adf40SYehuda Sadeh {
1938602adf40SYehuda Sadeh 	struct gendisk *disk;
1939602adf40SYehuda Sadeh 	struct request_queue *q;
1940593a9e7bSAlex Elder 	u64 segment_size;
1941602adf40SYehuda Sadeh 
1942602adf40SYehuda Sadeh 	/* create gendisk info */
1943602adf40SYehuda Sadeh 	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1944602adf40SYehuda Sadeh 	if (!disk)
19451fcdb8aaSAlex Elder 		return -ENOMEM;
1946602adf40SYehuda Sadeh 
1947f0f8cef5SAlex Elder 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
1948de71a297SAlex Elder 		 rbd_dev->dev_id);
1949602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
1950602adf40SYehuda Sadeh 	disk->first_minor = 0;
1951602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
1952602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
1953602adf40SYehuda Sadeh 
1954602adf40SYehuda Sadeh 	/* init rq */
1955602adf40SYehuda Sadeh 	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1956602adf40SYehuda Sadeh 	if (!q)
1957602adf40SYehuda Sadeh 		goto out_disk;
1958029bcbd8SJosh Durgin 
1959593a9e7bSAlex Elder 	/* We use the default size, but let's be explicit about it. */
1960593a9e7bSAlex Elder 	blk_queue_physical_block_size(q, SECTOR_SIZE);
1961593a9e7bSAlex Elder 
1962029bcbd8SJosh Durgin 	/* set io sizes to object size */
1963593a9e7bSAlex Elder 	segment_size = rbd_obj_bytes(&rbd_dev->header);
1964593a9e7bSAlex Elder 	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
1965593a9e7bSAlex Elder 	blk_queue_max_segment_size(q, segment_size);
1966593a9e7bSAlex Elder 	blk_queue_io_min(q, segment_size);
1967593a9e7bSAlex Elder 	blk_queue_io_opt(q, segment_size);
1968029bcbd8SJosh Durgin 
1969602adf40SYehuda Sadeh 	blk_queue_merge_bvec(q, rbd_merge_bvec);
1970602adf40SYehuda Sadeh 	disk->queue = q;
1971602adf40SYehuda Sadeh 
1972602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
1973602adf40SYehuda Sadeh 
1974602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
1975602adf40SYehuda Sadeh 
197612f02944SAlex Elder 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
197712f02944SAlex Elder 
1978602adf40SYehuda Sadeh 	return 0;
1979602adf40SYehuda Sadeh out_disk:
1980602adf40SYehuda Sadeh 	put_disk(disk);
19811fcdb8aaSAlex Elder 
19821fcdb8aaSAlex Elder 	return -ENOMEM;
1983602adf40SYehuda Sadeh }
1984602adf40SYehuda Sadeh 
1985dfc5606dSYehuda Sadeh /*
1986dfc5606dSYehuda Sadeh   sysfs
1987dfc5606dSYehuda Sadeh */
1988602adf40SYehuda Sadeh 
1989593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
1990593a9e7bSAlex Elder {
1991593a9e7bSAlex Elder 	return container_of(dev, struct rbd_device, dev);
1992593a9e7bSAlex Elder }
1993593a9e7bSAlex Elder 
1994dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
1995dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
1996602adf40SYehuda Sadeh {
1997593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1998a51aa0c0SJosh Durgin 	sector_t size;
1999dfc5606dSYehuda Sadeh 
2000a51aa0c0SJosh Durgin 	down_read(&rbd_dev->header_rwsem);
2001a51aa0c0SJosh Durgin 	size = get_capacity(rbd_dev->disk);
2002a51aa0c0SJosh Durgin 	up_read(&rbd_dev->header_rwsem);
2003a51aa0c0SJosh Durgin 
2004a51aa0c0SJosh Durgin 	return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
2005602adf40SYehuda Sadeh }
2006602adf40SYehuda Sadeh 
200734b13184SAlex Elder /*
200834b13184SAlex Elder  * Note this shows the features for whatever's mapped, which is not
200934b13184SAlex Elder  * necessarily the base image.
201034b13184SAlex Elder  */
201134b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev,
201234b13184SAlex Elder 			     struct device_attribute *attr, char *buf)
201334b13184SAlex Elder {
201434b13184SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
201534b13184SAlex Elder 
201634b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
201734b13184SAlex Elder 			(unsigned long long) rbd_dev->mapping.features);
201834b13184SAlex Elder }
201934b13184SAlex Elder 
2020dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
2021dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
2022602adf40SYehuda Sadeh {
2023593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2024dfc5606dSYehuda Sadeh 
2025dfc5606dSYehuda Sadeh 	return sprintf(buf, "%d\n", rbd_dev->major);
2026dfc5606dSYehuda Sadeh }
2027dfc5606dSYehuda Sadeh 
2028dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
2029dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
2030dfc5606dSYehuda Sadeh {
2031593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2032dfc5606dSYehuda Sadeh 
20331dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
20341dbb4399SAlex Elder 			ceph_client_id(rbd_dev->rbd_client->client));
2035dfc5606dSYehuda Sadeh }
2036dfc5606dSYehuda Sadeh 
2037dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
2038dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
2039dfc5606dSYehuda Sadeh {
2040593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2041dfc5606dSYehuda Sadeh 
20420d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
2043dfc5606dSYehuda Sadeh }
2044dfc5606dSYehuda Sadeh 
20459bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
20469bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
20479bb2f334SAlex Elder {
20489bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
20499bb2f334SAlex Elder 
20500d7dbfceSAlex Elder 	return sprintf(buf, "%llu\n",
20510d7dbfceSAlex Elder 		(unsigned long long) rbd_dev->spec->pool_id);
20529bb2f334SAlex Elder }
20539bb2f334SAlex Elder 
2054dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
2055dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
2056dfc5606dSYehuda Sadeh {
2057593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2058dfc5606dSYehuda Sadeh 
2059a92ffdf8SAlex Elder 	if (rbd_dev->spec->image_name)
20600d7dbfceSAlex Elder 		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
2061a92ffdf8SAlex Elder 
2062a92ffdf8SAlex Elder 	return sprintf(buf, "(unknown)\n");
2063dfc5606dSYehuda Sadeh }
2064dfc5606dSYehuda Sadeh 
2065589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev,
2066589d30e0SAlex Elder 			     struct device_attribute *attr, char *buf)
2067589d30e0SAlex Elder {
2068589d30e0SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2069589d30e0SAlex Elder 
20700d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
2071589d30e0SAlex Elder }
2072589d30e0SAlex Elder 
207334b13184SAlex Elder /*
207434b13184SAlex Elder  * Shows the name of the currently-mapped snapshot (or
207534b13184SAlex Elder  * RBD_SNAP_HEAD_NAME for the base image).
207634b13184SAlex Elder  */
2077dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
2078dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
2079dfc5606dSYehuda Sadeh 			     char *buf)
2080dfc5606dSYehuda Sadeh {
2081593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2082dfc5606dSYehuda Sadeh 
20830d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
2084dfc5606dSYehuda Sadeh }
2085dfc5606dSYehuda Sadeh 
208686b00e0dSAlex Elder /*
208786b00e0dSAlex Elder  * For an rbd v2 image, shows the pool id, image id, and snapshot id
208886b00e0dSAlex Elder  * for the parent image.  If there is no parent, simply shows
208986b00e0dSAlex Elder  * "(no parent image)".
209086b00e0dSAlex Elder  */
209186b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev,
209286b00e0dSAlex Elder 			     struct device_attribute *attr,
209386b00e0dSAlex Elder 			     char *buf)
209486b00e0dSAlex Elder {
209586b00e0dSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
209686b00e0dSAlex Elder 	struct rbd_spec *spec = rbd_dev->parent_spec;
209786b00e0dSAlex Elder 	int count;
209886b00e0dSAlex Elder 	char *bufp = buf;
209986b00e0dSAlex Elder 
210086b00e0dSAlex Elder 	if (!spec)
210186b00e0dSAlex Elder 		return sprintf(buf, "(no parent image)\n");
210286b00e0dSAlex Elder 
210386b00e0dSAlex Elder 	count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
210486b00e0dSAlex Elder 			(unsigned long long) spec->pool_id, spec->pool_name);
210586b00e0dSAlex Elder 	if (count < 0)
210686b00e0dSAlex Elder 		return count;
210786b00e0dSAlex Elder 	bufp += count;
210886b00e0dSAlex Elder 
210986b00e0dSAlex Elder 	count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
211086b00e0dSAlex Elder 			spec->image_name ? spec->image_name : "(unknown)");
211186b00e0dSAlex Elder 	if (count < 0)
211286b00e0dSAlex Elder 		return count;
211386b00e0dSAlex Elder 	bufp += count;
211486b00e0dSAlex Elder 
211586b00e0dSAlex Elder 	count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
211686b00e0dSAlex Elder 			(unsigned long long) spec->snap_id, spec->snap_name);
211786b00e0dSAlex Elder 	if (count < 0)
211886b00e0dSAlex Elder 		return count;
211986b00e0dSAlex Elder 	bufp += count;
212086b00e0dSAlex Elder 
212186b00e0dSAlex Elder 	count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
212286b00e0dSAlex Elder 	if (count < 0)
212386b00e0dSAlex Elder 		return count;
212486b00e0dSAlex Elder 	bufp += count;
212586b00e0dSAlex Elder 
212686b00e0dSAlex Elder 	return (ssize_t) (bufp - buf);
212786b00e0dSAlex Elder }
212886b00e0dSAlex Elder 
2129dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
2130dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
2131dfc5606dSYehuda Sadeh 				 const char *buf,
2132dfc5606dSYehuda Sadeh 				 size_t size)
2133dfc5606dSYehuda Sadeh {
2134593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2135b813623aSAlex Elder 	int ret;
2136602adf40SYehuda Sadeh 
2137117973fbSAlex Elder 	ret = rbd_dev_refresh(rbd_dev, NULL);
2138b813623aSAlex Elder 
2139b813623aSAlex Elder 	return ret < 0 ? ret : size;
2140dfc5606dSYehuda Sadeh }
2141602adf40SYehuda Sadeh 
2142dfc5606dSYehuda Sadeh static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
214334b13184SAlex Elder static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
2144dfc5606dSYehuda Sadeh static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
2145dfc5606dSYehuda Sadeh static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
2146dfc5606dSYehuda Sadeh static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
21479bb2f334SAlex Elder static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
2148dfc5606dSYehuda Sadeh static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
2149589d30e0SAlex Elder static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
2150dfc5606dSYehuda Sadeh static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
2151dfc5606dSYehuda Sadeh static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
215286b00e0dSAlex Elder static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
2153dfc5606dSYehuda Sadeh 
2154dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = {
2155dfc5606dSYehuda Sadeh 	&dev_attr_size.attr,
215634b13184SAlex Elder 	&dev_attr_features.attr,
2157dfc5606dSYehuda Sadeh 	&dev_attr_major.attr,
2158dfc5606dSYehuda Sadeh 	&dev_attr_client_id.attr,
2159dfc5606dSYehuda Sadeh 	&dev_attr_pool.attr,
21609bb2f334SAlex Elder 	&dev_attr_pool_id.attr,
2161dfc5606dSYehuda Sadeh 	&dev_attr_name.attr,
2162589d30e0SAlex Elder 	&dev_attr_image_id.attr,
2163dfc5606dSYehuda Sadeh 	&dev_attr_current_snap.attr,
216486b00e0dSAlex Elder 	&dev_attr_parent.attr,
2165dfc5606dSYehuda Sadeh 	&dev_attr_refresh.attr,
2166dfc5606dSYehuda Sadeh 	NULL
2167dfc5606dSYehuda Sadeh };
2168dfc5606dSYehuda Sadeh 
2169dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = {
2170dfc5606dSYehuda Sadeh 	.attrs = rbd_attrs,
2171dfc5606dSYehuda Sadeh };
2172dfc5606dSYehuda Sadeh 
2173dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = {
2174dfc5606dSYehuda Sadeh 	&rbd_attr_group,
2175dfc5606dSYehuda Sadeh 	NULL
2176dfc5606dSYehuda Sadeh };
2177dfc5606dSYehuda Sadeh 
/*
 * Release callback for rbd_device_type.  Intentionally a no-op: nothing
 * is freed here.
 * NOTE(review): presumably the containing rbd_device is torn down
 * through rbd_dev_destroy() instead -- confirm against the callers.
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
	/* nothing to do */
}
2181dfc5606dSYehuda Sadeh 
2182dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = {
2183dfc5606dSYehuda Sadeh 	.name		= "rbd",
2184dfc5606dSYehuda Sadeh 	.groups		= rbd_attr_groups,
2185dfc5606dSYehuda Sadeh 	.release	= rbd_sysfs_dev_release,
2186dfc5606dSYehuda Sadeh };
2187dfc5606dSYehuda Sadeh 
2188dfc5606dSYehuda Sadeh 
2189dfc5606dSYehuda Sadeh /*
2190dfc5606dSYehuda Sadeh   sysfs - snapshots
2191dfc5606dSYehuda Sadeh */
2192dfc5606dSYehuda Sadeh 
2193dfc5606dSYehuda Sadeh static ssize_t rbd_snap_size_show(struct device *dev,
2194dfc5606dSYehuda Sadeh 				  struct device_attribute *attr,
2195dfc5606dSYehuda Sadeh 				  char *buf)
2196dfc5606dSYehuda Sadeh {
2197dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2198dfc5606dSYehuda Sadeh 
21993591538fSJosh Durgin 	return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
2200dfc5606dSYehuda Sadeh }
2201dfc5606dSYehuda Sadeh 
2202dfc5606dSYehuda Sadeh static ssize_t rbd_snap_id_show(struct device *dev,
2203dfc5606dSYehuda Sadeh 				struct device_attribute *attr,
2204dfc5606dSYehuda Sadeh 				char *buf)
2205dfc5606dSYehuda Sadeh {
2206dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2207dfc5606dSYehuda Sadeh 
2208593a9e7bSAlex Elder 	return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
2209dfc5606dSYehuda Sadeh }
2210dfc5606dSYehuda Sadeh 
221134b13184SAlex Elder static ssize_t rbd_snap_features_show(struct device *dev,
221234b13184SAlex Elder 				struct device_attribute *attr,
221334b13184SAlex Elder 				char *buf)
221434b13184SAlex Elder {
221534b13184SAlex Elder 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
221634b13184SAlex Elder 
221734b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
221834b13184SAlex Elder 			(unsigned long long) snap->features);
221934b13184SAlex Elder }
222034b13184SAlex Elder 
2221dfc5606dSYehuda Sadeh static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
2222dfc5606dSYehuda Sadeh static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
222334b13184SAlex Elder static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
2224dfc5606dSYehuda Sadeh 
2225dfc5606dSYehuda Sadeh static struct attribute *rbd_snap_attrs[] = {
2226dfc5606dSYehuda Sadeh 	&dev_attr_snap_size.attr,
2227dfc5606dSYehuda Sadeh 	&dev_attr_snap_id.attr,
222834b13184SAlex Elder 	&dev_attr_snap_features.attr,
2229dfc5606dSYehuda Sadeh 	NULL,
2230dfc5606dSYehuda Sadeh };
2231dfc5606dSYehuda Sadeh 
2232dfc5606dSYehuda Sadeh static struct attribute_group rbd_snap_attr_group = {
2233dfc5606dSYehuda Sadeh 	.attrs = rbd_snap_attrs,
2234dfc5606dSYehuda Sadeh };
2235dfc5606dSYehuda Sadeh 
2236dfc5606dSYehuda Sadeh static void rbd_snap_dev_release(struct device *dev)
2237dfc5606dSYehuda Sadeh {
2238dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2239dfc5606dSYehuda Sadeh 	kfree(snap->name);
2240dfc5606dSYehuda Sadeh 	kfree(snap);
2241dfc5606dSYehuda Sadeh }
2242dfc5606dSYehuda Sadeh 
2243dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_snap_attr_groups[] = {
2244dfc5606dSYehuda Sadeh 	&rbd_snap_attr_group,
2245dfc5606dSYehuda Sadeh 	NULL
2246dfc5606dSYehuda Sadeh };
2247dfc5606dSYehuda Sadeh 
2248dfc5606dSYehuda Sadeh static struct device_type rbd_snap_device_type = {
2249dfc5606dSYehuda Sadeh 	.groups		= rbd_snap_attr_groups,
2250dfc5606dSYehuda Sadeh 	.release	= rbd_snap_dev_release,
2251dfc5606dSYehuda Sadeh };
2252dfc5606dSYehuda Sadeh 
22538b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
22548b8fb99cSAlex Elder {
22558b8fb99cSAlex Elder 	kref_get(&spec->kref);
22568b8fb99cSAlex Elder 
22578b8fb99cSAlex Elder 	return spec;
22588b8fb99cSAlex Elder }
22598b8fb99cSAlex Elder 
22608b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref);
22618b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec)
22628b8fb99cSAlex Elder {
22638b8fb99cSAlex Elder 	if (spec)
22648b8fb99cSAlex Elder 		kref_put(&spec->kref, rbd_spec_free);
22658b8fb99cSAlex Elder }
22668b8fb99cSAlex Elder 
22678b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void)
22688b8fb99cSAlex Elder {
22698b8fb99cSAlex Elder 	struct rbd_spec *spec;
22708b8fb99cSAlex Elder 
22718b8fb99cSAlex Elder 	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
22728b8fb99cSAlex Elder 	if (!spec)
22738b8fb99cSAlex Elder 		return NULL;
22748b8fb99cSAlex Elder 	kref_init(&spec->kref);
22758b8fb99cSAlex Elder 
22768b8fb99cSAlex Elder 	rbd_spec_put(rbd_spec_get(spec));	/* TEMPORARY */
22778b8fb99cSAlex Elder 
22788b8fb99cSAlex Elder 	return spec;
22798b8fb99cSAlex Elder }
22808b8fb99cSAlex Elder 
22818b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref)
22828b8fb99cSAlex Elder {
22838b8fb99cSAlex Elder 	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
22848b8fb99cSAlex Elder 
22858b8fb99cSAlex Elder 	kfree(spec->pool_name);
22868b8fb99cSAlex Elder 	kfree(spec->image_id);
22878b8fb99cSAlex Elder 	kfree(spec->image_name);
22888b8fb99cSAlex Elder 	kfree(spec->snap_name);
22898b8fb99cSAlex Elder 	kfree(spec);
22908b8fb99cSAlex Elder }
22918b8fb99cSAlex Elder 
2292c53d5893SAlex Elder struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
2293c53d5893SAlex Elder 				struct rbd_spec *spec)
2294c53d5893SAlex Elder {
2295c53d5893SAlex Elder 	struct rbd_device *rbd_dev;
2296c53d5893SAlex Elder 
2297c53d5893SAlex Elder 	rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
2298c53d5893SAlex Elder 	if (!rbd_dev)
2299c53d5893SAlex Elder 		return NULL;
2300c53d5893SAlex Elder 
2301c53d5893SAlex Elder 	spin_lock_init(&rbd_dev->lock);
2302d78b650aSAlex Elder 	atomic_set(&rbd_dev->exists, 0);
2303c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->node);
2304c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->snaps);
2305c53d5893SAlex Elder 	init_rwsem(&rbd_dev->header_rwsem);
2306c53d5893SAlex Elder 
2307c53d5893SAlex Elder 	rbd_dev->spec = spec;
2308c53d5893SAlex Elder 	rbd_dev->rbd_client = rbdc;
2309c53d5893SAlex Elder 
2310c53d5893SAlex Elder 	return rbd_dev;
2311c53d5893SAlex Elder }
2312c53d5893SAlex Elder 
2313c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev)
2314c53d5893SAlex Elder {
231586b00e0dSAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
2316c53d5893SAlex Elder 	kfree(rbd_dev->header_name);
2317c53d5893SAlex Elder 	rbd_put_client(rbd_dev->rbd_client);
2318c53d5893SAlex Elder 	rbd_spec_put(rbd_dev->spec);
2319c53d5893SAlex Elder 	kfree(rbd_dev);
2320c53d5893SAlex Elder }
2321c53d5893SAlex Elder 
2322304f6808SAlex Elder static bool rbd_snap_registered(struct rbd_snap *snap)
2323304f6808SAlex Elder {
2324304f6808SAlex Elder 	bool ret = snap->dev.type == &rbd_snap_device_type;
2325304f6808SAlex Elder 	bool reg = device_is_registered(&snap->dev);
2326304f6808SAlex Elder 
2327304f6808SAlex Elder 	rbd_assert(!ret ^ reg);
2328304f6808SAlex Elder 
2329304f6808SAlex Elder 	return ret;
2330304f6808SAlex Elder }
2331304f6808SAlex Elder 
233241f38c2bSAlex Elder static void rbd_remove_snap_dev(struct rbd_snap *snap)
2333dfc5606dSYehuda Sadeh {
2334dfc5606dSYehuda Sadeh 	list_del(&snap->node);
2335304f6808SAlex Elder 	if (device_is_registered(&snap->dev))
2336dfc5606dSYehuda Sadeh 		device_unregister(&snap->dev);
2337dfc5606dSYehuda Sadeh }
2338dfc5606dSYehuda Sadeh 
233914e7085dSAlex Elder static int rbd_register_snap_dev(struct rbd_snap *snap,
2340dfc5606dSYehuda Sadeh 				  struct device *parent)
2341dfc5606dSYehuda Sadeh {
2342dfc5606dSYehuda Sadeh 	struct device *dev = &snap->dev;
2343dfc5606dSYehuda Sadeh 	int ret;
2344dfc5606dSYehuda Sadeh 
2345dfc5606dSYehuda Sadeh 	dev->type = &rbd_snap_device_type;
2346dfc5606dSYehuda Sadeh 	dev->parent = parent;
2347dfc5606dSYehuda Sadeh 	dev->release = rbd_snap_dev_release;
2348d4b125e9SAlex Elder 	dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
2349304f6808SAlex Elder 	dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2350304f6808SAlex Elder 
2351dfc5606dSYehuda Sadeh 	ret = device_register(dev);
2352dfc5606dSYehuda Sadeh 
2353dfc5606dSYehuda Sadeh 	return ret;
2354dfc5606dSYehuda Sadeh }
2355dfc5606dSYehuda Sadeh 
23564e891e0aSAlex Elder static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
2357c8d18425SAlex Elder 						const char *snap_name,
235834b13184SAlex Elder 						u64 snap_id, u64 snap_size,
235934b13184SAlex Elder 						u64 snap_features)
2360dfc5606dSYehuda Sadeh {
23614e891e0aSAlex Elder 	struct rbd_snap *snap;
2362dfc5606dSYehuda Sadeh 	int ret;
23634e891e0aSAlex Elder 
23644e891e0aSAlex Elder 	snap = kzalloc(sizeof (*snap), GFP_KERNEL);
2365dfc5606dSYehuda Sadeh 	if (!snap)
23664e891e0aSAlex Elder 		return ERR_PTR(-ENOMEM);
23674e891e0aSAlex Elder 
23684e891e0aSAlex Elder 	ret = -ENOMEM;
2369c8d18425SAlex Elder 	snap->name = kstrdup(snap_name, GFP_KERNEL);
23704e891e0aSAlex Elder 	if (!snap->name)
23714e891e0aSAlex Elder 		goto err;
23724e891e0aSAlex Elder 
2373c8d18425SAlex Elder 	snap->id = snap_id;
2374c8d18425SAlex Elder 	snap->size = snap_size;
237534b13184SAlex Elder 	snap->features = snap_features;
23764e891e0aSAlex Elder 
23774e891e0aSAlex Elder 	return snap;
23784e891e0aSAlex Elder 
2379dfc5606dSYehuda Sadeh err:
2380dfc5606dSYehuda Sadeh 	kfree(snap->name);
2381dfc5606dSYehuda Sadeh 	kfree(snap);
23824e891e0aSAlex Elder 
23834e891e0aSAlex Elder 	return ERR_PTR(ret);
2384dfc5606dSYehuda Sadeh }
2385dfc5606dSYehuda Sadeh 
2386cd892126SAlex Elder static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2387cd892126SAlex Elder 		u64 *snap_size, u64 *snap_features)
2388cd892126SAlex Elder {
2389cd892126SAlex Elder 	char *snap_name;
2390cd892126SAlex Elder 
2391cd892126SAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2392cd892126SAlex Elder 
2393cd892126SAlex Elder 	*snap_size = rbd_dev->header.snap_sizes[which];
2394cd892126SAlex Elder 	*snap_features = 0;	/* No features for v1 */
2395cd892126SAlex Elder 
2396cd892126SAlex Elder 	/* Skip over names until we find the one we are looking for */
2397cd892126SAlex Elder 
2398cd892126SAlex Elder 	snap_name = rbd_dev->header.snap_names;
2399cd892126SAlex Elder 	while (which--)
2400cd892126SAlex Elder 		snap_name += strlen(snap_name) + 1;
2401cd892126SAlex Elder 
2402cd892126SAlex Elder 	return snap_name;
2403cd892126SAlex Elder }
2404cd892126SAlex Elder 
2405dfc5606dSYehuda Sadeh /*
24069d475de5SAlex Elder  * Get the size and object order for an image snapshot, or if
24079d475de5SAlex Elder  * snap_id is CEPH_NOSNAP, gets this information for the base
24089d475de5SAlex Elder  * image.
24099d475de5SAlex Elder  */
24109d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
24119d475de5SAlex Elder 				u8 *order, u64 *snap_size)
24129d475de5SAlex Elder {
24139d475de5SAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
24149d475de5SAlex Elder 	int ret;
24159d475de5SAlex Elder 	struct {
24169d475de5SAlex Elder 		u8 order;
24179d475de5SAlex Elder 		__le64 size;
24189d475de5SAlex Elder 	} __attribute__ ((packed)) size_buf = { 0 };
24199d475de5SAlex Elder 
24209d475de5SAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
24219d475de5SAlex Elder 				"rbd", "get_size",
24229d475de5SAlex Elder 				(char *) &snapid, sizeof (snapid),
24239d475de5SAlex Elder 				(char *) &size_buf, sizeof (size_buf),
24249d475de5SAlex Elder 				CEPH_OSD_FLAG_READ, NULL);
24259d475de5SAlex Elder 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
24269d475de5SAlex Elder 	if (ret < 0)
24279d475de5SAlex Elder 		return ret;
24289d475de5SAlex Elder 
24299d475de5SAlex Elder 	*order = size_buf.order;
24309d475de5SAlex Elder 	*snap_size = le64_to_cpu(size_buf.size);
24319d475de5SAlex Elder 
24329d475de5SAlex Elder 	dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
24339d475de5SAlex Elder 		(unsigned long long) snap_id, (unsigned int) *order,
24349d475de5SAlex Elder 		(unsigned long long) *snap_size);
24359d475de5SAlex Elder 
24369d475de5SAlex Elder 	return 0;
24379d475de5SAlex Elder }
24389d475de5SAlex Elder 
24399d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
24409d475de5SAlex Elder {
24419d475de5SAlex Elder 	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
24429d475de5SAlex Elder 					&rbd_dev->header.obj_order,
24439d475de5SAlex Elder 					&rbd_dev->header.image_size);
24449d475de5SAlex Elder }
24459d475de5SAlex Elder 
24461e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
24471e130199SAlex Elder {
24481e130199SAlex Elder 	void *reply_buf;
24491e130199SAlex Elder 	int ret;
24501e130199SAlex Elder 	void *p;
24511e130199SAlex Elder 
24521e130199SAlex Elder 	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
24531e130199SAlex Elder 	if (!reply_buf)
24541e130199SAlex Elder 		return -ENOMEM;
24551e130199SAlex Elder 
24561e130199SAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
24571e130199SAlex Elder 				"rbd", "get_object_prefix",
24581e130199SAlex Elder 				NULL, 0,
24591e130199SAlex Elder 				reply_buf, RBD_OBJ_PREFIX_LEN_MAX,
24601e130199SAlex Elder 				CEPH_OSD_FLAG_READ, NULL);
24611e130199SAlex Elder 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
24621e130199SAlex Elder 	if (ret < 0)
24631e130199SAlex Elder 		goto out;
2464a0ea3a40SAlex Elder 	ret = 0;    /* rbd_req_sync_exec() can return positive */
24651e130199SAlex Elder 
24661e130199SAlex Elder 	p = reply_buf;
24671e130199SAlex Elder 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
24681e130199SAlex Elder 						p + RBD_OBJ_PREFIX_LEN_MAX,
24691e130199SAlex Elder 						NULL, GFP_NOIO);
24701e130199SAlex Elder 
24711e130199SAlex Elder 	if (IS_ERR(rbd_dev->header.object_prefix)) {
24721e130199SAlex Elder 		ret = PTR_ERR(rbd_dev->header.object_prefix);
24731e130199SAlex Elder 		rbd_dev->header.object_prefix = NULL;
24741e130199SAlex Elder 	} else {
24751e130199SAlex Elder 		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
24761e130199SAlex Elder 	}
24771e130199SAlex Elder 
24781e130199SAlex Elder out:
24791e130199SAlex Elder 	kfree(reply_buf);
24801e130199SAlex Elder 
24811e130199SAlex Elder 	return ret;
24821e130199SAlex Elder }
24831e130199SAlex Elder 
2484b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
2485b1b5402aSAlex Elder 		u64 *snap_features)
2486b1b5402aSAlex Elder {
2487b1b5402aSAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
2488b1b5402aSAlex Elder 	struct {
2489b1b5402aSAlex Elder 		__le64 features;
2490b1b5402aSAlex Elder 		__le64 incompat;
2491b1b5402aSAlex Elder 	} features_buf = { 0 };
2492d889140cSAlex Elder 	u64 incompat;
2493b1b5402aSAlex Elder 	int ret;
2494b1b5402aSAlex Elder 
2495b1b5402aSAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2496b1b5402aSAlex Elder 				"rbd", "get_features",
2497b1b5402aSAlex Elder 				(char *) &snapid, sizeof (snapid),
2498b1b5402aSAlex Elder 				(char *) &features_buf, sizeof (features_buf),
2499b1b5402aSAlex Elder 				CEPH_OSD_FLAG_READ, NULL);
2500b1b5402aSAlex Elder 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2501b1b5402aSAlex Elder 	if (ret < 0)
2502b1b5402aSAlex Elder 		return ret;
2503d889140cSAlex Elder 
2504d889140cSAlex Elder 	incompat = le64_to_cpu(features_buf.incompat);
2505d889140cSAlex Elder 	if (incompat & ~RBD_FEATURES_ALL)
2506b8f5c6edSAlex Elder 		return -ENXIO;
2507d889140cSAlex Elder 
2508b1b5402aSAlex Elder 	*snap_features = le64_to_cpu(features_buf.features);
2509b1b5402aSAlex Elder 
2510b1b5402aSAlex Elder 	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
2511b1b5402aSAlex Elder 		(unsigned long long) snap_id,
2512b1b5402aSAlex Elder 		(unsigned long long) *snap_features,
2513b1b5402aSAlex Elder 		(unsigned long long) le64_to_cpu(features_buf.incompat));
2514b1b5402aSAlex Elder 
2515b1b5402aSAlex Elder 	return 0;
2516b1b5402aSAlex Elder }
2517b1b5402aSAlex Elder 
2518b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
2519b1b5402aSAlex Elder {
2520b1b5402aSAlex Elder 	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
2521b1b5402aSAlex Elder 						&rbd_dev->header.features);
2522b1b5402aSAlex Elder }
2523b1b5402aSAlex Elder 
252486b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
252586b00e0dSAlex Elder {
252686b00e0dSAlex Elder 	struct rbd_spec *parent_spec;
252786b00e0dSAlex Elder 	size_t size;
252886b00e0dSAlex Elder 	void *reply_buf = NULL;
252986b00e0dSAlex Elder 	__le64 snapid;
253086b00e0dSAlex Elder 	void *p;
253186b00e0dSAlex Elder 	void *end;
253286b00e0dSAlex Elder 	char *image_id;
253386b00e0dSAlex Elder 	u64 overlap;
253486b00e0dSAlex Elder 	int ret;
253586b00e0dSAlex Elder 
253686b00e0dSAlex Elder 	parent_spec = rbd_spec_alloc();
253786b00e0dSAlex Elder 	if (!parent_spec)
253886b00e0dSAlex Elder 		return -ENOMEM;
253986b00e0dSAlex Elder 
254086b00e0dSAlex Elder 	size = sizeof (__le64) +				/* pool_id */
254186b00e0dSAlex Elder 		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
254286b00e0dSAlex Elder 		sizeof (__le64) +				/* snap_id */
254386b00e0dSAlex Elder 		sizeof (__le64);				/* overlap */
254486b00e0dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
254586b00e0dSAlex Elder 	if (!reply_buf) {
254686b00e0dSAlex Elder 		ret = -ENOMEM;
254786b00e0dSAlex Elder 		goto out_err;
254886b00e0dSAlex Elder 	}
254986b00e0dSAlex Elder 
255086b00e0dSAlex Elder 	snapid = cpu_to_le64(CEPH_NOSNAP);
255186b00e0dSAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
255286b00e0dSAlex Elder 				"rbd", "get_parent",
255386b00e0dSAlex Elder 				(char *) &snapid, sizeof (snapid),
255486b00e0dSAlex Elder 				(char *) reply_buf, size,
255586b00e0dSAlex Elder 				CEPH_OSD_FLAG_READ, NULL);
255686b00e0dSAlex Elder 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
255786b00e0dSAlex Elder 	if (ret < 0)
255886b00e0dSAlex Elder 		goto out_err;
255986b00e0dSAlex Elder 
256086b00e0dSAlex Elder 	ret = -ERANGE;
256186b00e0dSAlex Elder 	p = reply_buf;
256286b00e0dSAlex Elder 	end = (char *) reply_buf + size;
256386b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
256486b00e0dSAlex Elder 	if (parent_spec->pool_id == CEPH_NOPOOL)
256586b00e0dSAlex Elder 		goto out;	/* No parent?  No problem. */
256686b00e0dSAlex Elder 
2567979ed480SAlex Elder 	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
256886b00e0dSAlex Elder 	if (IS_ERR(image_id)) {
256986b00e0dSAlex Elder 		ret = PTR_ERR(image_id);
257086b00e0dSAlex Elder 		goto out_err;
257186b00e0dSAlex Elder 	}
257286b00e0dSAlex Elder 	parent_spec->image_id = image_id;
257386b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
257486b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, overlap, out_err);
257586b00e0dSAlex Elder 
257686b00e0dSAlex Elder 	rbd_dev->parent_overlap = overlap;
257786b00e0dSAlex Elder 	rbd_dev->parent_spec = parent_spec;
257886b00e0dSAlex Elder 	parent_spec = NULL;	/* rbd_dev now owns this */
257986b00e0dSAlex Elder out:
258086b00e0dSAlex Elder 	ret = 0;
258186b00e0dSAlex Elder out_err:
258286b00e0dSAlex Elder 	kfree(reply_buf);
258386b00e0dSAlex Elder 	rbd_spec_put(parent_spec);
258486b00e0dSAlex Elder 
258586b00e0dSAlex Elder 	return ret;
258686b00e0dSAlex Elder }
258786b00e0dSAlex Elder 
25889e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
25899e15b77dSAlex Elder {
25909e15b77dSAlex Elder 	size_t image_id_size;
25919e15b77dSAlex Elder 	char *image_id;
25929e15b77dSAlex Elder 	void *p;
25939e15b77dSAlex Elder 	void *end;
25949e15b77dSAlex Elder 	size_t size;
25959e15b77dSAlex Elder 	void *reply_buf = NULL;
25969e15b77dSAlex Elder 	size_t len = 0;
25979e15b77dSAlex Elder 	char *image_name = NULL;
25989e15b77dSAlex Elder 	int ret;
25999e15b77dSAlex Elder 
26009e15b77dSAlex Elder 	rbd_assert(!rbd_dev->spec->image_name);
26019e15b77dSAlex Elder 
260269e7a02fSAlex Elder 	len = strlen(rbd_dev->spec->image_id);
260369e7a02fSAlex Elder 	image_id_size = sizeof (__le32) + len;
26049e15b77dSAlex Elder 	image_id = kmalloc(image_id_size, GFP_KERNEL);
26059e15b77dSAlex Elder 	if (!image_id)
26069e15b77dSAlex Elder 		return NULL;
26079e15b77dSAlex Elder 
26089e15b77dSAlex Elder 	p = image_id;
26099e15b77dSAlex Elder 	end = (char *) image_id + image_id_size;
261069e7a02fSAlex Elder 	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);
26119e15b77dSAlex Elder 
26129e15b77dSAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
26139e15b77dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
26149e15b77dSAlex Elder 	if (!reply_buf)
26159e15b77dSAlex Elder 		goto out;
26169e15b77dSAlex Elder 
26179e15b77dSAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, RBD_DIRECTORY,
26189e15b77dSAlex Elder 				"rbd", "dir_get_name",
26199e15b77dSAlex Elder 				image_id, image_id_size,
26209e15b77dSAlex Elder 				(char *) reply_buf, size,
26219e15b77dSAlex Elder 				CEPH_OSD_FLAG_READ, NULL);
26229e15b77dSAlex Elder 	if (ret < 0)
26239e15b77dSAlex Elder 		goto out;
26249e15b77dSAlex Elder 	p = reply_buf;
26259e15b77dSAlex Elder 	end = (char *) reply_buf + size;
26269e15b77dSAlex Elder 	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
26279e15b77dSAlex Elder 	if (IS_ERR(image_name))
26289e15b77dSAlex Elder 		image_name = NULL;
26299e15b77dSAlex Elder 	else
26309e15b77dSAlex Elder 		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
26319e15b77dSAlex Elder out:
26329e15b77dSAlex Elder 	kfree(reply_buf);
26339e15b77dSAlex Elder 	kfree(image_id);
26349e15b77dSAlex Elder 
26359e15b77dSAlex Elder 	return image_name;
26369e15b77dSAlex Elder }
26379e15b77dSAlex Elder 
26389e15b77dSAlex Elder /*
26399e15b77dSAlex Elder  * When a parent image gets probed, we only have the pool, image,
26409e15b77dSAlex Elder  * and snapshot ids but not the names of any of them.  This call
26419e15b77dSAlex Elder  * is made later to fill in those names.  It has to be done after
26429e15b77dSAlex Elder  * rbd_dev_snaps_update() has completed because some of the
26439e15b77dSAlex Elder  * information (in particular, snapshot name) is not available
26449e15b77dSAlex Elder  * until then.
26459e15b77dSAlex Elder  */
26469e15b77dSAlex Elder static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
26479e15b77dSAlex Elder {
26489e15b77dSAlex Elder 	struct ceph_osd_client *osdc;
26499e15b77dSAlex Elder 	const char *name;
26509e15b77dSAlex Elder 	void *reply_buf = NULL;
26519e15b77dSAlex Elder 	int ret;
26529e15b77dSAlex Elder 
26539e15b77dSAlex Elder 	if (rbd_dev->spec->pool_name)
26549e15b77dSAlex Elder 		return 0;	/* Already have the names */
26559e15b77dSAlex Elder 
26569e15b77dSAlex Elder 	/* Look up the pool name */
26579e15b77dSAlex Elder 
26589e15b77dSAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
26599e15b77dSAlex Elder 	name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
2660935dc89fSAlex Elder 	if (!name) {
2661935dc89fSAlex Elder 		rbd_warn(rbd_dev, "there is no pool with id %llu",
2662935dc89fSAlex Elder 			rbd_dev->spec->pool_id);	/* Really a BUG() */
2663935dc89fSAlex Elder 		return -EIO;
2664935dc89fSAlex Elder 	}
26659e15b77dSAlex Elder 
26669e15b77dSAlex Elder 	rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
26679e15b77dSAlex Elder 	if (!rbd_dev->spec->pool_name)
26689e15b77dSAlex Elder 		return -ENOMEM;
26699e15b77dSAlex Elder 
26709e15b77dSAlex Elder 	/* Fetch the image name; tolerate failure here */
26719e15b77dSAlex Elder 
26729e15b77dSAlex Elder 	name = rbd_dev_image_name(rbd_dev);
267369e7a02fSAlex Elder 	if (name)
26749e15b77dSAlex Elder 		rbd_dev->spec->image_name = (char *) name;
267569e7a02fSAlex Elder 	else
267606ecc6cbSAlex Elder 		rbd_warn(rbd_dev, "unable to get image name");
26779e15b77dSAlex Elder 
26789e15b77dSAlex Elder 	/* Look up the snapshot name. */
26799e15b77dSAlex Elder 
26809e15b77dSAlex Elder 	name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
26819e15b77dSAlex Elder 	if (!name) {
2682935dc89fSAlex Elder 		rbd_warn(rbd_dev, "no snapshot with id %llu",
2683935dc89fSAlex Elder 			rbd_dev->spec->snap_id);	/* Really a BUG() */
26849e15b77dSAlex Elder 		ret = -EIO;
26859e15b77dSAlex Elder 		goto out_err;
26869e15b77dSAlex Elder 	}
26879e15b77dSAlex Elder 	rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
26889e15b77dSAlex Elder 	if(!rbd_dev->spec->snap_name)
26899e15b77dSAlex Elder 		goto out_err;
26909e15b77dSAlex Elder 
26919e15b77dSAlex Elder 	return 0;
26929e15b77dSAlex Elder out_err:
26939e15b77dSAlex Elder 	kfree(reply_buf);
26949e15b77dSAlex Elder 	kfree(rbd_dev->spec->pool_name);
26959e15b77dSAlex Elder 	rbd_dev->spec->pool_name = NULL;
26969e15b77dSAlex Elder 
26979e15b77dSAlex Elder 	return ret;
26989e15b77dSAlex Elder }
26999e15b77dSAlex Elder 
27006e14b1a6SAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
270135d489f9SAlex Elder {
270235d489f9SAlex Elder 	size_t size;
270335d489f9SAlex Elder 	int ret;
270435d489f9SAlex Elder 	void *reply_buf;
270535d489f9SAlex Elder 	void *p;
270635d489f9SAlex Elder 	void *end;
270735d489f9SAlex Elder 	u64 seq;
270835d489f9SAlex Elder 	u32 snap_count;
270935d489f9SAlex Elder 	struct ceph_snap_context *snapc;
271035d489f9SAlex Elder 	u32 i;
271135d489f9SAlex Elder 
271235d489f9SAlex Elder 	/*
271335d489f9SAlex Elder 	 * We'll need room for the seq value (maximum snapshot id),
271435d489f9SAlex Elder 	 * snapshot count, and array of that many snapshot ids.
271535d489f9SAlex Elder 	 * For now we have a fixed upper limit on the number we're
271635d489f9SAlex Elder 	 * prepared to receive.
271735d489f9SAlex Elder 	 */
271835d489f9SAlex Elder 	size = sizeof (__le64) + sizeof (__le32) +
271935d489f9SAlex Elder 			RBD_MAX_SNAP_COUNT * sizeof (__le64);
272035d489f9SAlex Elder 	reply_buf = kzalloc(size, GFP_KERNEL);
272135d489f9SAlex Elder 	if (!reply_buf)
272235d489f9SAlex Elder 		return -ENOMEM;
272335d489f9SAlex Elder 
272435d489f9SAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
272535d489f9SAlex Elder 				"rbd", "get_snapcontext",
272635d489f9SAlex Elder 				NULL, 0,
272735d489f9SAlex Elder 				reply_buf, size,
27286e14b1a6SAlex Elder 				CEPH_OSD_FLAG_READ, ver);
272935d489f9SAlex Elder 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
273035d489f9SAlex Elder 	if (ret < 0)
273135d489f9SAlex Elder 		goto out;
273235d489f9SAlex Elder 
273335d489f9SAlex Elder 	ret = -ERANGE;
273435d489f9SAlex Elder 	p = reply_buf;
273535d489f9SAlex Elder 	end = (char *) reply_buf + size;
273635d489f9SAlex Elder 	ceph_decode_64_safe(&p, end, seq, out);
273735d489f9SAlex Elder 	ceph_decode_32_safe(&p, end, snap_count, out);
273835d489f9SAlex Elder 
273935d489f9SAlex Elder 	/*
274035d489f9SAlex Elder 	 * Make sure the reported number of snapshot ids wouldn't go
274135d489f9SAlex Elder 	 * beyond the end of our buffer.  But before checking that,
274235d489f9SAlex Elder 	 * make sure the computed size of the snapshot context we
274335d489f9SAlex Elder 	 * allocate is representable in a size_t.
274435d489f9SAlex Elder 	 */
274535d489f9SAlex Elder 	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
274635d489f9SAlex Elder 				 / sizeof (u64)) {
274735d489f9SAlex Elder 		ret = -EINVAL;
274835d489f9SAlex Elder 		goto out;
274935d489f9SAlex Elder 	}
275035d489f9SAlex Elder 	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
275135d489f9SAlex Elder 		goto out;
275235d489f9SAlex Elder 
275335d489f9SAlex Elder 	size = sizeof (struct ceph_snap_context) +
275435d489f9SAlex Elder 				snap_count * sizeof (snapc->snaps[0]);
275535d489f9SAlex Elder 	snapc = kmalloc(size, GFP_KERNEL);
275635d489f9SAlex Elder 	if (!snapc) {
275735d489f9SAlex Elder 		ret = -ENOMEM;
275835d489f9SAlex Elder 		goto out;
275935d489f9SAlex Elder 	}
276035d489f9SAlex Elder 
276135d489f9SAlex Elder 	atomic_set(&snapc->nref, 1);
276235d489f9SAlex Elder 	snapc->seq = seq;
276335d489f9SAlex Elder 	snapc->num_snaps = snap_count;
276435d489f9SAlex Elder 	for (i = 0; i < snap_count; i++)
276535d489f9SAlex Elder 		snapc->snaps[i] = ceph_decode_64(&p);
276635d489f9SAlex Elder 
276735d489f9SAlex Elder 	rbd_dev->header.snapc = snapc;
276835d489f9SAlex Elder 
276935d489f9SAlex Elder 	dout("  snap context seq = %llu, snap_count = %u\n",
277035d489f9SAlex Elder 		(unsigned long long) seq, (unsigned int) snap_count);
277135d489f9SAlex Elder 
277235d489f9SAlex Elder out:
277335d489f9SAlex Elder 	kfree(reply_buf);
277435d489f9SAlex Elder 
277535d489f9SAlex Elder 	return 0;
277635d489f9SAlex Elder }
277735d489f9SAlex Elder 
2778b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
2779b8b1e2dbSAlex Elder {
2780b8b1e2dbSAlex Elder 	size_t size;
2781b8b1e2dbSAlex Elder 	void *reply_buf;
2782b8b1e2dbSAlex Elder 	__le64 snap_id;
2783b8b1e2dbSAlex Elder 	int ret;
2784b8b1e2dbSAlex Elder 	void *p;
2785b8b1e2dbSAlex Elder 	void *end;
2786b8b1e2dbSAlex Elder 	char *snap_name;
2787b8b1e2dbSAlex Elder 
2788b8b1e2dbSAlex Elder 	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
2789b8b1e2dbSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
2790b8b1e2dbSAlex Elder 	if (!reply_buf)
2791b8b1e2dbSAlex Elder 		return ERR_PTR(-ENOMEM);
2792b8b1e2dbSAlex Elder 
2793b8b1e2dbSAlex Elder 	snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
2794b8b1e2dbSAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2795b8b1e2dbSAlex Elder 				"rbd", "get_snapshot_name",
2796b8b1e2dbSAlex Elder 				(char *) &snap_id, sizeof (snap_id),
2797b8b1e2dbSAlex Elder 				reply_buf, size,
2798b8b1e2dbSAlex Elder 				CEPH_OSD_FLAG_READ, NULL);
2799b8b1e2dbSAlex Elder 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2800b8b1e2dbSAlex Elder 	if (ret < 0)
2801b8b1e2dbSAlex Elder 		goto out;
2802b8b1e2dbSAlex Elder 
2803b8b1e2dbSAlex Elder 	p = reply_buf;
2804b8b1e2dbSAlex Elder 	end = (char *) reply_buf + size;
2805e5c35534SAlex Elder 	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
2806b8b1e2dbSAlex Elder 	if (IS_ERR(snap_name)) {
2807b8b1e2dbSAlex Elder 		ret = PTR_ERR(snap_name);
2808b8b1e2dbSAlex Elder 		goto out;
2809b8b1e2dbSAlex Elder 	} else {
2810b8b1e2dbSAlex Elder 		dout("  snap_id 0x%016llx snap_name = %s\n",
2811b8b1e2dbSAlex Elder 			(unsigned long long) le64_to_cpu(snap_id), snap_name);
2812b8b1e2dbSAlex Elder 	}
2813b8b1e2dbSAlex Elder 	kfree(reply_buf);
2814b8b1e2dbSAlex Elder 
2815b8b1e2dbSAlex Elder 	return snap_name;
2816b8b1e2dbSAlex Elder out:
2817b8b1e2dbSAlex Elder 	kfree(reply_buf);
2818b8b1e2dbSAlex Elder 
2819b8b1e2dbSAlex Elder 	return ERR_PTR(ret);
2820b8b1e2dbSAlex Elder }
2821b8b1e2dbSAlex Elder 
2822b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
2823b8b1e2dbSAlex Elder 		u64 *snap_size, u64 *snap_features)
2824b8b1e2dbSAlex Elder {
2825b8b1e2dbSAlex Elder 	__le64 snap_id;
2826b8b1e2dbSAlex Elder 	u8 order;
2827b8b1e2dbSAlex Elder 	int ret;
2828b8b1e2dbSAlex Elder 
2829b8b1e2dbSAlex Elder 	snap_id = rbd_dev->header.snapc->snaps[which];
2830b8b1e2dbSAlex Elder 	ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
2831b8b1e2dbSAlex Elder 	if (ret)
2832b8b1e2dbSAlex Elder 		return ERR_PTR(ret);
2833b8b1e2dbSAlex Elder 	ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
2834b8b1e2dbSAlex Elder 	if (ret)
2835b8b1e2dbSAlex Elder 		return ERR_PTR(ret);
2836b8b1e2dbSAlex Elder 
2837b8b1e2dbSAlex Elder 	return rbd_dev_v2_snap_name(rbd_dev, which);
2838b8b1e2dbSAlex Elder }
2839b8b1e2dbSAlex Elder 
2840b8b1e2dbSAlex Elder static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
2841b8b1e2dbSAlex Elder 		u64 *snap_size, u64 *snap_features)
2842b8b1e2dbSAlex Elder {
2843b8b1e2dbSAlex Elder 	if (rbd_dev->image_format == 1)
2844b8b1e2dbSAlex Elder 		return rbd_dev_v1_snap_info(rbd_dev, which,
2845b8b1e2dbSAlex Elder 					snap_size, snap_features);
2846b8b1e2dbSAlex Elder 	if (rbd_dev->image_format == 2)
2847b8b1e2dbSAlex Elder 		return rbd_dev_v2_snap_info(rbd_dev, which,
2848b8b1e2dbSAlex Elder 					snap_size, snap_features);
2849b8b1e2dbSAlex Elder 	return ERR_PTR(-EINVAL);
2850b8b1e2dbSAlex Elder }
2851b8b1e2dbSAlex Elder 
2852117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
2853117973fbSAlex Elder {
2854117973fbSAlex Elder 	int ret;
2855117973fbSAlex Elder 	__u8 obj_order;
2856117973fbSAlex Elder 
2857117973fbSAlex Elder 	down_write(&rbd_dev->header_rwsem);
2858117973fbSAlex Elder 
2859117973fbSAlex Elder 	/* Grab old order first, to see if it changes */
2860117973fbSAlex Elder 
2861117973fbSAlex Elder 	obj_order = rbd_dev->header.obj_order,
2862117973fbSAlex Elder 	ret = rbd_dev_v2_image_size(rbd_dev);
2863117973fbSAlex Elder 	if (ret)
2864117973fbSAlex Elder 		goto out;
2865117973fbSAlex Elder 	if (rbd_dev->header.obj_order != obj_order) {
2866117973fbSAlex Elder 		ret = -EIO;
2867117973fbSAlex Elder 		goto out;
2868117973fbSAlex Elder 	}
2869117973fbSAlex Elder 	rbd_update_mapping_size(rbd_dev);
2870117973fbSAlex Elder 
2871117973fbSAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev, hver);
2872117973fbSAlex Elder 	dout("rbd_dev_v2_snap_context returned %d\n", ret);
2873117973fbSAlex Elder 	if (ret)
2874117973fbSAlex Elder 		goto out;
2875117973fbSAlex Elder 	ret = rbd_dev_snaps_update(rbd_dev);
2876117973fbSAlex Elder 	dout("rbd_dev_snaps_update returned %d\n", ret);
2877117973fbSAlex Elder 	if (ret)
2878117973fbSAlex Elder 		goto out;
2879117973fbSAlex Elder 	ret = rbd_dev_snaps_register(rbd_dev);
2880117973fbSAlex Elder 	dout("rbd_dev_snaps_register returned %d\n", ret);
2881117973fbSAlex Elder out:
2882117973fbSAlex Elder 	up_write(&rbd_dev->header_rwsem);
2883117973fbSAlex Elder 
2884117973fbSAlex Elder 	return ret;
2885117973fbSAlex Elder }
2886117973fbSAlex Elder 
28879d475de5SAlex Elder /*
288835938150SAlex Elder  * Scan the rbd device's current snapshot list and compare it to the
288935938150SAlex Elder  * newly-received snapshot context.  Remove any existing snapshots
289035938150SAlex Elder  * not present in the new snapshot context.  Add a new snapshot for
289135938150SAlex Elder  * any snaphots in the snapshot context not in the current list.
289235938150SAlex Elder  * And verify there are no changes to snapshots we already know
289335938150SAlex Elder  * about.
289435938150SAlex Elder  *
289535938150SAlex Elder  * Assumes the snapshots in the snapshot context are sorted by
289635938150SAlex Elder  * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
289735938150SAlex Elder  * are also maintained in that order.)
2898dfc5606dSYehuda Sadeh  */
2899304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
2900dfc5606dSYehuda Sadeh {
290135938150SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
290235938150SAlex Elder 	const u32 snap_count = snapc->num_snaps;
290335938150SAlex Elder 	struct list_head *head = &rbd_dev->snaps;
290435938150SAlex Elder 	struct list_head *links = head->next;
290535938150SAlex Elder 	u32 index = 0;
2906dfc5606dSYehuda Sadeh 
29079fcbb800SAlex Elder 	dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
290835938150SAlex Elder 	while (index < snap_count || links != head) {
290935938150SAlex Elder 		u64 snap_id;
291035938150SAlex Elder 		struct rbd_snap *snap;
2911cd892126SAlex Elder 		char *snap_name;
2912cd892126SAlex Elder 		u64 snap_size = 0;
2913cd892126SAlex Elder 		u64 snap_features = 0;
2914dfc5606dSYehuda Sadeh 
291535938150SAlex Elder 		snap_id = index < snap_count ? snapc->snaps[index]
291635938150SAlex Elder 					     : CEPH_NOSNAP;
291735938150SAlex Elder 		snap = links != head ? list_entry(links, struct rbd_snap, node)
291835938150SAlex Elder 				     : NULL;
2919aafb230eSAlex Elder 		rbd_assert(!snap || snap->id != CEPH_NOSNAP);
2920dfc5606dSYehuda Sadeh 
292135938150SAlex Elder 		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
292235938150SAlex Elder 			struct list_head *next = links->next;
2923dfc5606dSYehuda Sadeh 
292435938150SAlex Elder 			/* Existing snapshot not in the new snap context */
2925dfc5606dSYehuda Sadeh 
29260d7dbfceSAlex Elder 			if (rbd_dev->spec->snap_id == snap->id)
2927d78b650aSAlex Elder 				atomic_set(&rbd_dev->exists, 0);
292841f38c2bSAlex Elder 			rbd_remove_snap_dev(snap);
29299fcbb800SAlex Elder 			dout("%ssnap id %llu has been removed\n",
29300d7dbfceSAlex Elder 				rbd_dev->spec->snap_id == snap->id ?
29310d7dbfceSAlex Elder 							"mapped " : "",
29329fcbb800SAlex Elder 				(unsigned long long) snap->id);
2933dfc5606dSYehuda Sadeh 
293435938150SAlex Elder 			/* Done with this list entry; advance */
293535938150SAlex Elder 
293635938150SAlex Elder 			links = next;
293735938150SAlex Elder 			continue;
2938dfc5606dSYehuda Sadeh 		}
293935938150SAlex Elder 
2940b8b1e2dbSAlex Elder 		snap_name = rbd_dev_snap_info(rbd_dev, index,
2941cd892126SAlex Elder 					&snap_size, &snap_features);
2942cd892126SAlex Elder 		if (IS_ERR(snap_name))
2943cd892126SAlex Elder 			return PTR_ERR(snap_name);
2944cd892126SAlex Elder 
29459fcbb800SAlex Elder 		dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
29469fcbb800SAlex Elder 			(unsigned long long) snap_id);
294735938150SAlex Elder 		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
294835938150SAlex Elder 			struct rbd_snap *new_snap;
294935938150SAlex Elder 
295035938150SAlex Elder 			/* We haven't seen this snapshot before */
295135938150SAlex Elder 
2952c8d18425SAlex Elder 			new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
2953cd892126SAlex Elder 					snap_id, snap_size, snap_features);
29549fcbb800SAlex Elder 			if (IS_ERR(new_snap)) {
29559fcbb800SAlex Elder 				int err = PTR_ERR(new_snap);
29569fcbb800SAlex Elder 
29579fcbb800SAlex Elder 				dout("  failed to add dev, error %d\n", err);
29589fcbb800SAlex Elder 
29599fcbb800SAlex Elder 				return err;
29609fcbb800SAlex Elder 			}
296135938150SAlex Elder 
296235938150SAlex Elder 			/* New goes before existing, or at end of list */
296335938150SAlex Elder 
29649fcbb800SAlex Elder 			dout("  added dev%s\n", snap ? "" : " at end\n");
296535938150SAlex Elder 			if (snap)
296635938150SAlex Elder 				list_add_tail(&new_snap->node, &snap->node);
296735938150SAlex Elder 			else
2968523f3258SAlex Elder 				list_add_tail(&new_snap->node, head);
296935938150SAlex Elder 		} else {
297035938150SAlex Elder 			/* Already have this one */
297135938150SAlex Elder 
29729fcbb800SAlex Elder 			dout("  already present\n");
29739fcbb800SAlex Elder 
2974cd892126SAlex Elder 			rbd_assert(snap->size == snap_size);
2975aafb230eSAlex Elder 			rbd_assert(!strcmp(snap->name, snap_name));
2976cd892126SAlex Elder 			rbd_assert(snap->features == snap_features);
297735938150SAlex Elder 
297835938150SAlex Elder 			/* Done with this list entry; advance */
297935938150SAlex Elder 
298035938150SAlex Elder 			links = links->next;
2981dfc5606dSYehuda Sadeh 		}
298235938150SAlex Elder 
298335938150SAlex Elder 		/* Advance to the next entry in the snapshot context */
298435938150SAlex Elder 
298535938150SAlex Elder 		index++;
2986dfc5606dSYehuda Sadeh 	}
29879fcbb800SAlex Elder 	dout("%s: done\n", __func__);
2988dfc5606dSYehuda Sadeh 
2989dfc5606dSYehuda Sadeh 	return 0;
2990dfc5606dSYehuda Sadeh }
2991dfc5606dSYehuda Sadeh 
2992304f6808SAlex Elder /*
2993304f6808SAlex Elder  * Scan the list of snapshots and register the devices for any that
2994304f6808SAlex Elder  * have not already been registered.
2995304f6808SAlex Elder  */
2996304f6808SAlex Elder static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
2997304f6808SAlex Elder {
2998304f6808SAlex Elder 	struct rbd_snap *snap;
2999304f6808SAlex Elder 	int ret = 0;
3000304f6808SAlex Elder 
3001304f6808SAlex Elder 	dout("%s called\n", __func__);
300286ff77bbSAlex Elder 	if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
300386ff77bbSAlex Elder 		return -EIO;
3004304f6808SAlex Elder 
3005304f6808SAlex Elder 	list_for_each_entry(snap, &rbd_dev->snaps, node) {
3006304f6808SAlex Elder 		if (!rbd_snap_registered(snap)) {
3007304f6808SAlex Elder 			ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
3008304f6808SAlex Elder 			if (ret < 0)
3009304f6808SAlex Elder 				break;
3010304f6808SAlex Elder 		}
3011304f6808SAlex Elder 	}
3012304f6808SAlex Elder 	dout("%s: returning %d\n", __func__, ret);
3013304f6808SAlex Elder 
3014304f6808SAlex Elder 	return ret;
3015304f6808SAlex Elder }
3016304f6808SAlex Elder 
3017dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
3018dfc5606dSYehuda Sadeh {
3019dfc5606dSYehuda Sadeh 	struct device *dev;
3020cd789ab9SAlex Elder 	int ret;
3021dfc5606dSYehuda Sadeh 
3022dfc5606dSYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3023dfc5606dSYehuda Sadeh 
3024cd789ab9SAlex Elder 	dev = &rbd_dev->dev;
3025dfc5606dSYehuda Sadeh 	dev->bus = &rbd_bus_type;
3026dfc5606dSYehuda Sadeh 	dev->type = &rbd_device_type;
3027dfc5606dSYehuda Sadeh 	dev->parent = &rbd_root_dev;
3028dfc5606dSYehuda Sadeh 	dev->release = rbd_dev_release;
3029de71a297SAlex Elder 	dev_set_name(dev, "%d", rbd_dev->dev_id);
3030dfc5606dSYehuda Sadeh 	ret = device_register(dev);
3031dfc5606dSYehuda Sadeh 
3032dfc5606dSYehuda Sadeh 	mutex_unlock(&ctl_mutex);
3033cd789ab9SAlex Elder 
3034dfc5606dSYehuda Sadeh 	return ret;
3035602adf40SYehuda Sadeh }
3036602adf40SYehuda Sadeh 
3037dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
3038dfc5606dSYehuda Sadeh {
3039dfc5606dSYehuda Sadeh 	device_unregister(&rbd_dev->dev);
3040dfc5606dSYehuda Sadeh }
3041dfc5606dSYehuda Sadeh 
304259c2be1eSYehuda Sadeh static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
304359c2be1eSYehuda Sadeh {
304459c2be1eSYehuda Sadeh 	int ret, rc;
304559c2be1eSYehuda Sadeh 
304659c2be1eSYehuda Sadeh 	do {
30470e6f322dSAlex Elder 		ret = rbd_req_sync_watch(rbd_dev);
304859c2be1eSYehuda Sadeh 		if (ret == -ERANGE) {
3049117973fbSAlex Elder 			rc = rbd_dev_refresh(rbd_dev, NULL);
305059c2be1eSYehuda Sadeh 			if (rc < 0)
305159c2be1eSYehuda Sadeh 				return rc;
305259c2be1eSYehuda Sadeh 		}
305359c2be1eSYehuda Sadeh 	} while (ret == -ERANGE);
305459c2be1eSYehuda Sadeh 
305559c2be1eSYehuda Sadeh 	return ret;
305659c2be1eSYehuda Sadeh }
305759c2be1eSYehuda Sadeh 
3058e2839308SAlex Elder static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
30591ddbe94eSAlex Elder 
30601ddbe94eSAlex Elder /*
3061499afd5bSAlex Elder  * Get a unique rbd identifier for the given new rbd_dev, and add
3062499afd5bSAlex Elder  * the rbd_dev to the global list.  The minimum rbd id is 1.
30631ddbe94eSAlex Elder  */
3064e2839308SAlex Elder static void rbd_dev_id_get(struct rbd_device *rbd_dev)
3065b7f23c36SAlex Elder {
3066e2839308SAlex Elder 	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
3067499afd5bSAlex Elder 
3068499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
3069499afd5bSAlex Elder 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
3070499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
3071e2839308SAlex Elder 	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
3072e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
3073b7f23c36SAlex Elder }
3074b7f23c36SAlex Elder 
30751ddbe94eSAlex Elder /*
3076499afd5bSAlex Elder  * Remove an rbd_dev from the global list, and record that its
3077499afd5bSAlex Elder  * identifier is no longer in use.
30781ddbe94eSAlex Elder  */
3079e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev)
30801ddbe94eSAlex Elder {
3081d184f6bfSAlex Elder 	struct list_head *tmp;
3082de71a297SAlex Elder 	int rbd_id = rbd_dev->dev_id;
3083d184f6bfSAlex Elder 	int max_id;
3084d184f6bfSAlex Elder 
3085aafb230eSAlex Elder 	rbd_assert(rbd_id > 0);
3086499afd5bSAlex Elder 
3087e2839308SAlex Elder 	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
3088e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
3089499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
3090499afd5bSAlex Elder 	list_del_init(&rbd_dev->node);
3091d184f6bfSAlex Elder 
3092d184f6bfSAlex Elder 	/*
3093d184f6bfSAlex Elder 	 * If the id being "put" is not the current maximum, there
3094d184f6bfSAlex Elder 	 * is nothing special we need to do.
3095d184f6bfSAlex Elder 	 */
3096e2839308SAlex Elder 	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
3097d184f6bfSAlex Elder 		spin_unlock(&rbd_dev_list_lock);
3098d184f6bfSAlex Elder 		return;
3099d184f6bfSAlex Elder 	}
3100d184f6bfSAlex Elder 
3101d184f6bfSAlex Elder 	/*
3102d184f6bfSAlex Elder 	 * We need to update the current maximum id.  Search the
3103d184f6bfSAlex Elder 	 * list to find out what it is.  We're more likely to find
3104d184f6bfSAlex Elder 	 * the maximum at the end, so search the list backward.
3105d184f6bfSAlex Elder 	 */
3106d184f6bfSAlex Elder 	max_id = 0;
3107d184f6bfSAlex Elder 	list_for_each_prev(tmp, &rbd_dev_list) {
3108d184f6bfSAlex Elder 		struct rbd_device *rbd_dev;
3109d184f6bfSAlex Elder 
3110d184f6bfSAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
3111b213e0b1SAlex Elder 		if (rbd_dev->dev_id > max_id)
3112b213e0b1SAlex Elder 			max_id = rbd_dev->dev_id;
3113d184f6bfSAlex Elder 	}
3114499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
31151ddbe94eSAlex Elder 
31161ddbe94eSAlex Elder 	/*
3117e2839308SAlex Elder 	 * The max id could have been updated by rbd_dev_id_get(), in
3118d184f6bfSAlex Elder 	 * which case it now accurately reflects the new maximum.
3119d184f6bfSAlex Elder 	 * Be careful not to overwrite the maximum value in that
3120d184f6bfSAlex Elder 	 * case.
31211ddbe94eSAlex Elder 	 */
3122e2839308SAlex Elder 	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
3123e2839308SAlex Elder 	dout("  max dev id has been reset\n");
3124b7f23c36SAlex Elder }
3125b7f23c36SAlex Elder 
/*
 * Advance *buf past any leading white space so that it points at
 * the first non-space character (or at the terminating '\0' if
 * there is none), and return the length of the token -- the run of
 * non-white-space characters -- that starts there.  The string at
 * *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the "C"
	 * and "POSIX" locales.
	 */
	static const char whitespace[] = " \f\n\r\t\v";
	const char *start = *buf + strspn(*buf, whitespace);

	*buf = start;			/* Start of token */

	return strcspn(start, whitespace);	/* Token length */
}
3144e28fff26SAlex Elder 
/*
 * Locate the next token in *buf and, provided it fits together with
 * a terminating '\0' in the token_size bytes at token, copy it
 * there and terminate it.  The string at *buf must be terminated
 * with '\0' on entry.
 *
 * Returns the length of the token found, not counting the '\0'.
 * A return of 0 means no token was found; a return >= token_size
 * means the token did not fit and nothing was copied.
 *
 * *buf is moved to just past the token in every case, including
 * when the token was too long to copy.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	const size_t token_len = next_token(buf);
	const char *start = *buf;

	*buf = start + token_len;
	if (token_len >= token_size)
		return token_len;	/* Too big; leave token alone */

	memcpy(token, start, token_len);
	token[token_len] = '\0';

	return token_len;
}
3174e28fff26SAlex Elder 
3175e28fff26SAlex Elder /*
3176ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
3177ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
3178ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
3179ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
3180ea3352f4SAlex Elder  *
3181ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
3182ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
3183ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
3184ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
3185ea3352f4SAlex Elder  *
3186ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
3187ea3352f4SAlex Elder  * the end of the found token.
3188ea3352f4SAlex Elder  *
3189ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
3190ea3352f4SAlex Elder  */
3191ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
3192ea3352f4SAlex Elder {
3193ea3352f4SAlex Elder 	char *dup;
3194ea3352f4SAlex Elder 	size_t len;
3195ea3352f4SAlex Elder 
3196ea3352f4SAlex Elder 	len = next_token(buf);
31974caf35f9SAlex Elder 	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
3198ea3352f4SAlex Elder 	if (!dup)
3199ea3352f4SAlex Elder 		return NULL;
3200ea3352f4SAlex Elder 	*(dup + len) = '\0';
3201ea3352f4SAlex Elder 	*buf += len;
3202ea3352f4SAlex Elder 
3203ea3352f4SAlex Elder 	if (lenp)
3204ea3352f4SAlex Elder 		*lenp = len;
3205ea3352f4SAlex Elder 
3206ea3352f4SAlex Elder 	return dup;
3207ea3352f4SAlex Elder }
3208ea3352f4SAlex Elder 
3209ea3352f4SAlex Elder /*
3210859c31dfSAlex Elder  * Parse the options provided for an "rbd add" (i.e., rbd image
3211859c31dfSAlex Elder  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
3212859c31dfSAlex Elder  * and the data written is passed here via a NUL-terminated buffer.
3213859c31dfSAlex Elder  * Returns 0 if successful or an error code otherwise.
3214d22f76e7SAlex Elder  *
3215859c31dfSAlex Elder  * The information extracted from these options is recorded in
3216859c31dfSAlex Elder  * the other parameters which return dynamically-allocated
3217859c31dfSAlex Elder  * structures:
3218859c31dfSAlex Elder  *  ceph_opts
3219859c31dfSAlex Elder  *      The address of a pointer that will refer to a ceph options
3220859c31dfSAlex Elder  *      structure.  Caller must release the returned pointer using
3221859c31dfSAlex Elder  *      ceph_destroy_options() when it is no longer needed.
3222859c31dfSAlex Elder  *  rbd_opts
3223859c31dfSAlex Elder  *	Address of an rbd options pointer.  Fully initialized by
3224859c31dfSAlex Elder  *	this function; caller must release with kfree().
3225859c31dfSAlex Elder  *  spec
3226859c31dfSAlex Elder  *	Address of an rbd image specification pointer.  Fully
3227859c31dfSAlex Elder  *	initialized by this function based on parsed options.
3228859c31dfSAlex Elder  *	Caller must release with rbd_spec_put().
3229859c31dfSAlex Elder  *
3230859c31dfSAlex Elder  * The options passed take this form:
3231859c31dfSAlex Elder  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
3232859c31dfSAlex Elder  * where:
3233859c31dfSAlex Elder  *  <mon_addrs>
3234859c31dfSAlex Elder  *      A comma-separated list of one or more monitor addresses.
3235859c31dfSAlex Elder  *      A monitor address is an ip address, optionally followed
3236859c31dfSAlex Elder  *      by a port number (separated by a colon).
3237859c31dfSAlex Elder  *        I.e.:  ip1[:port1][,ip2[:port2]...]
3238859c31dfSAlex Elder  *  <options>
3239859c31dfSAlex Elder  *      A comma-separated list of ceph and/or rbd options.
3240859c31dfSAlex Elder  *  <pool_name>
3241859c31dfSAlex Elder  *      The name of the rados pool containing the rbd image.
3242859c31dfSAlex Elder  *  <image_name>
3243859c31dfSAlex Elder  *      The name of the image in that pool to map.
3244859c31dfSAlex Elder  *  <snap_id>
3245859c31dfSAlex Elder  *      An optional snapshot id.  If provided, the mapping will
3246859c31dfSAlex Elder  *      present data from the image at the time that snapshot was
3247859c31dfSAlex Elder  *      created.  The image head is used if no snapshot id is
3248859c31dfSAlex Elder  *      provided.  Snapshot mappings are always read-only.
3249a725f65eSAlex Elder  */
3250859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf,
3251dc79b113SAlex Elder 				struct ceph_options **ceph_opts,
3252859c31dfSAlex Elder 				struct rbd_options **opts,
3253859c31dfSAlex Elder 				struct rbd_spec **rbd_spec)
3254a725f65eSAlex Elder {
3255e28fff26SAlex Elder 	size_t len;
3256859c31dfSAlex Elder 	char *options;
32570ddebc0cSAlex Elder 	const char *mon_addrs;
32580ddebc0cSAlex Elder 	size_t mon_addrs_size;
3259859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
32604e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
3261859c31dfSAlex Elder 	struct ceph_options *copts;
3262dc79b113SAlex Elder 	int ret;
3263e28fff26SAlex Elder 
3264e28fff26SAlex Elder 	/* The first four tokens are required */
3265e28fff26SAlex Elder 
32667ef3214aSAlex Elder 	len = next_token(&buf);
32674fb5d671SAlex Elder 	if (!len) {
32684fb5d671SAlex Elder 		rbd_warn(NULL, "no monitor address(es) provided");
32694fb5d671SAlex Elder 		return -EINVAL;
32704fb5d671SAlex Elder 	}
32710ddebc0cSAlex Elder 	mon_addrs = buf;
3272f28e565aSAlex Elder 	mon_addrs_size = len + 1;
32737ef3214aSAlex Elder 	buf += len;
3274a725f65eSAlex Elder 
3275dc79b113SAlex Elder 	ret = -EINVAL;
3276f28e565aSAlex Elder 	options = dup_token(&buf, NULL);
3277f28e565aSAlex Elder 	if (!options)
3278dc79b113SAlex Elder 		return -ENOMEM;
32794fb5d671SAlex Elder 	if (!*options) {
32804fb5d671SAlex Elder 		rbd_warn(NULL, "no options provided");
32814fb5d671SAlex Elder 		goto out_err;
32824fb5d671SAlex Elder 	}
3283a725f65eSAlex Elder 
3284859c31dfSAlex Elder 	spec = rbd_spec_alloc();
3285859c31dfSAlex Elder 	if (!spec)
3286f28e565aSAlex Elder 		goto out_mem;
3287859c31dfSAlex Elder 
3288859c31dfSAlex Elder 	spec->pool_name = dup_token(&buf, NULL);
3289859c31dfSAlex Elder 	if (!spec->pool_name)
3290859c31dfSAlex Elder 		goto out_mem;
32914fb5d671SAlex Elder 	if (!*spec->pool_name) {
32924fb5d671SAlex Elder 		rbd_warn(NULL, "no pool name provided");
32934fb5d671SAlex Elder 		goto out_err;
32944fb5d671SAlex Elder 	}
3295e28fff26SAlex Elder 
329669e7a02fSAlex Elder 	spec->image_name = dup_token(&buf, NULL);
3297859c31dfSAlex Elder 	if (!spec->image_name)
3298f28e565aSAlex Elder 		goto out_mem;
32994fb5d671SAlex Elder 	if (!*spec->image_name) {
33004fb5d671SAlex Elder 		rbd_warn(NULL, "no image name provided");
33014fb5d671SAlex Elder 		goto out_err;
33024fb5d671SAlex Elder 	}
3303e28fff26SAlex Elder 
3304f28e565aSAlex Elder 	/*
3305f28e565aSAlex Elder 	 * Snapshot name is optional; default is to use "-"
3306f28e565aSAlex Elder 	 * (indicating the head/no snapshot).
3307f28e565aSAlex Elder 	 */
33083feeb894SAlex Elder 	len = next_token(&buf);
3309820a5f3eSAlex Elder 	if (!len) {
33103feeb894SAlex Elder 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
33113feeb894SAlex Elder 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
3312f28e565aSAlex Elder 	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
3313dc79b113SAlex Elder 		ret = -ENAMETOOLONG;
3314f28e565aSAlex Elder 		goto out_err;
3315849b4260SAlex Elder 	}
33164caf35f9SAlex Elder 	spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
3317859c31dfSAlex Elder 	if (!spec->snap_name)
3318f28e565aSAlex Elder 		goto out_mem;
3319859c31dfSAlex Elder 	*(spec->snap_name + len) = '\0';
3320e5c35534SAlex Elder 
33210ddebc0cSAlex Elder 	/* Initialize all rbd options to the defaults */
3322e28fff26SAlex Elder 
33234e9afebaSAlex Elder 	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
33244e9afebaSAlex Elder 	if (!rbd_opts)
33254e9afebaSAlex Elder 		goto out_mem;
33264e9afebaSAlex Elder 
33274e9afebaSAlex Elder 	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
3328d22f76e7SAlex Elder 
3329859c31dfSAlex Elder 	copts = ceph_parse_options(options, mon_addrs,
33300ddebc0cSAlex Elder 					mon_addrs + mon_addrs_size - 1,
33314e9afebaSAlex Elder 					parse_rbd_opts_token, rbd_opts);
3332859c31dfSAlex Elder 	if (IS_ERR(copts)) {
3333859c31dfSAlex Elder 		ret = PTR_ERR(copts);
3334dc79b113SAlex Elder 		goto out_err;
3335dc79b113SAlex Elder 	}
3336859c31dfSAlex Elder 	kfree(options);
3337859c31dfSAlex Elder 
3338859c31dfSAlex Elder 	*ceph_opts = copts;
33394e9afebaSAlex Elder 	*opts = rbd_opts;
3340859c31dfSAlex Elder 	*rbd_spec = spec;
33410ddebc0cSAlex Elder 
3342dc79b113SAlex Elder 	return 0;
3343f28e565aSAlex Elder out_mem:
3344dc79b113SAlex Elder 	ret = -ENOMEM;
3345d22f76e7SAlex Elder out_err:
3346859c31dfSAlex Elder 	kfree(rbd_opts);
3347859c31dfSAlex Elder 	rbd_spec_put(spec);
3348f28e565aSAlex Elder 	kfree(options);
3349d22f76e7SAlex Elder 
3350dc79b113SAlex Elder 	return ret;
3351a725f65eSAlex Elder }
3352a725f65eSAlex Elder 
3353589d30e0SAlex Elder /*
3354589d30e0SAlex Elder  * An rbd format 2 image has a unique identifier, distinct from the
3355589d30e0SAlex Elder  * name given to it by the user.  Internally, that identifier is
3356589d30e0SAlex Elder  * what's used to specify the names of objects related to the image.
3357589d30e0SAlex Elder  *
3358589d30e0SAlex Elder  * A special "rbd id" object is used to map an rbd image name to its
3359589d30e0SAlex Elder  * id.  If that object doesn't exist, then there is no v2 rbd image
3360589d30e0SAlex Elder  * with the supplied name.
3361589d30e0SAlex Elder  *
3362589d30e0SAlex Elder  * This function will record the given rbd_dev's image_id field if
3363589d30e0SAlex Elder  * it can be determined, and in that case will return 0.  If any
3364589d30e0SAlex Elder  * errors occur a negative errno will be returned and the rbd_dev's
3365589d30e0SAlex Elder  * image_id field will be unchanged (and should be NULL).
3366589d30e0SAlex Elder  */
3367589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev)
3368589d30e0SAlex Elder {
3369589d30e0SAlex Elder 	int ret;
3370589d30e0SAlex Elder 	size_t size;
3371589d30e0SAlex Elder 	char *object_name;
3372589d30e0SAlex Elder 	void *response;
3373589d30e0SAlex Elder 	void *p;
3374589d30e0SAlex Elder 
3375589d30e0SAlex Elder 	/*
33762c0d0a10SAlex Elder 	 * When probing a parent image, the image id is already
33772c0d0a10SAlex Elder 	 * known (and the image name likely is not).  There's no
33782c0d0a10SAlex Elder 	 * need to fetch the image id again in this case.
33792c0d0a10SAlex Elder 	 */
33802c0d0a10SAlex Elder 	if (rbd_dev->spec->image_id)
33812c0d0a10SAlex Elder 		return 0;
33822c0d0a10SAlex Elder 
33832c0d0a10SAlex Elder 	/*
3384589d30e0SAlex Elder 	 * First, see if the format 2 image id file exists, and if
3385589d30e0SAlex Elder 	 * so, get the image's persistent id from it.
3386589d30e0SAlex Elder 	 */
338769e7a02fSAlex Elder 	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
3388589d30e0SAlex Elder 	object_name = kmalloc(size, GFP_NOIO);
3389589d30e0SAlex Elder 	if (!object_name)
3390589d30e0SAlex Elder 		return -ENOMEM;
33910d7dbfceSAlex Elder 	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
3392589d30e0SAlex Elder 	dout("rbd id object name is %s\n", object_name);
3393589d30e0SAlex Elder 
3394589d30e0SAlex Elder 	/* Response will be an encoded string, which includes a length */
3395589d30e0SAlex Elder 
3396589d30e0SAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
3397589d30e0SAlex Elder 	response = kzalloc(size, GFP_NOIO);
3398589d30e0SAlex Elder 	if (!response) {
3399589d30e0SAlex Elder 		ret = -ENOMEM;
3400589d30e0SAlex Elder 		goto out;
3401589d30e0SAlex Elder 	}
3402589d30e0SAlex Elder 
3403589d30e0SAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, object_name,
3404589d30e0SAlex Elder 				"rbd", "get_id",
3405589d30e0SAlex Elder 				NULL, 0,
3406589d30e0SAlex Elder 				response, RBD_IMAGE_ID_LEN_MAX,
3407589d30e0SAlex Elder 				CEPH_OSD_FLAG_READ, NULL);
3408589d30e0SAlex Elder 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
3409589d30e0SAlex Elder 	if (ret < 0)
3410589d30e0SAlex Elder 		goto out;
3411a0ea3a40SAlex Elder 	ret = 0;    /* rbd_req_sync_exec() can return positive */
3412589d30e0SAlex Elder 
3413589d30e0SAlex Elder 	p = response;
34140d7dbfceSAlex Elder 	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
3415589d30e0SAlex Elder 						p + RBD_IMAGE_ID_LEN_MAX,
3416979ed480SAlex Elder 						NULL, GFP_NOIO);
34170d7dbfceSAlex Elder 	if (IS_ERR(rbd_dev->spec->image_id)) {
34180d7dbfceSAlex Elder 		ret = PTR_ERR(rbd_dev->spec->image_id);
34190d7dbfceSAlex Elder 		rbd_dev->spec->image_id = NULL;
3420589d30e0SAlex Elder 	} else {
34210d7dbfceSAlex Elder 		dout("image_id is %s\n", rbd_dev->spec->image_id);
3422589d30e0SAlex Elder 	}
3423589d30e0SAlex Elder out:
3424589d30e0SAlex Elder 	kfree(response);
3425589d30e0SAlex Elder 	kfree(object_name);
3426589d30e0SAlex Elder 
3427589d30e0SAlex Elder 	return ret;
3428589d30e0SAlex Elder }
3429589d30e0SAlex Elder 
3430a30b71b9SAlex Elder static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
3431a30b71b9SAlex Elder {
3432a30b71b9SAlex Elder 	int ret;
3433a30b71b9SAlex Elder 	size_t size;
3434a30b71b9SAlex Elder 
3435a30b71b9SAlex Elder 	/* Version 1 images have no id; empty string is used */
3436a30b71b9SAlex Elder 
34370d7dbfceSAlex Elder 	rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
34380d7dbfceSAlex Elder 	if (!rbd_dev->spec->image_id)
3439a30b71b9SAlex Elder 		return -ENOMEM;
3440a30b71b9SAlex Elder 
3441a30b71b9SAlex Elder 	/* Record the header object name for this rbd image. */
3442a30b71b9SAlex Elder 
344369e7a02fSAlex Elder 	size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
3444a30b71b9SAlex Elder 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3445a30b71b9SAlex Elder 	if (!rbd_dev->header_name) {
3446a30b71b9SAlex Elder 		ret = -ENOMEM;
3447a30b71b9SAlex Elder 		goto out_err;
3448a30b71b9SAlex Elder 	}
34490d7dbfceSAlex Elder 	sprintf(rbd_dev->header_name, "%s%s",
34500d7dbfceSAlex Elder 		rbd_dev->spec->image_name, RBD_SUFFIX);
3451a30b71b9SAlex Elder 
3452a30b71b9SAlex Elder 	/* Populate rbd image metadata */
3453a30b71b9SAlex Elder 
3454a30b71b9SAlex Elder 	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
3455a30b71b9SAlex Elder 	if (ret < 0)
3456a30b71b9SAlex Elder 		goto out_err;
345786b00e0dSAlex Elder 
345886b00e0dSAlex Elder 	/* Version 1 images have no parent (no layering) */
345986b00e0dSAlex Elder 
346086b00e0dSAlex Elder 	rbd_dev->parent_spec = NULL;
346186b00e0dSAlex Elder 	rbd_dev->parent_overlap = 0;
346286b00e0dSAlex Elder 
3463a30b71b9SAlex Elder 	rbd_dev->image_format = 1;
3464a30b71b9SAlex Elder 
3465a30b71b9SAlex Elder 	dout("discovered version 1 image, header name is %s\n",
3466a30b71b9SAlex Elder 		rbd_dev->header_name);
3467a30b71b9SAlex Elder 
3468a30b71b9SAlex Elder 	return 0;
3469a30b71b9SAlex Elder 
3470a30b71b9SAlex Elder out_err:
3471a30b71b9SAlex Elder 	kfree(rbd_dev->header_name);
3472a30b71b9SAlex Elder 	rbd_dev->header_name = NULL;
34730d7dbfceSAlex Elder 	kfree(rbd_dev->spec->image_id);
34740d7dbfceSAlex Elder 	rbd_dev->spec->image_id = NULL;
3475a30b71b9SAlex Elder 
3476a30b71b9SAlex Elder 	return ret;
3477a30b71b9SAlex Elder }
3478a30b71b9SAlex Elder 
3479a30b71b9SAlex Elder static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
3480a30b71b9SAlex Elder {
3481a30b71b9SAlex Elder 	size_t size;
34829d475de5SAlex Elder 	int ret;
34836e14b1a6SAlex Elder 	u64 ver = 0;
3484a30b71b9SAlex Elder 
3485a30b71b9SAlex Elder 	/*
3486a30b71b9SAlex Elder 	 * Image id was filled in by the caller.  Record the header
3487a30b71b9SAlex Elder 	 * object name for this rbd image.
3488a30b71b9SAlex Elder 	 */
3489979ed480SAlex Elder 	size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
3490a30b71b9SAlex Elder 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3491a30b71b9SAlex Elder 	if (!rbd_dev->header_name)
3492a30b71b9SAlex Elder 		return -ENOMEM;
3493a30b71b9SAlex Elder 	sprintf(rbd_dev->header_name, "%s%s",
34940d7dbfceSAlex Elder 			RBD_HEADER_PREFIX, rbd_dev->spec->image_id);
34959d475de5SAlex Elder 
34969d475de5SAlex Elder 	/* Get the size and object order for the image */
34979d475de5SAlex Elder 
34989d475de5SAlex Elder 	ret = rbd_dev_v2_image_size(rbd_dev);
34999d475de5SAlex Elder 	if (ret < 0)
35009d475de5SAlex Elder 		goto out_err;
35011e130199SAlex Elder 
35021e130199SAlex Elder 	/* Get the object prefix (a.k.a. block_name) for the image */
35031e130199SAlex Elder 
35041e130199SAlex Elder 	ret = rbd_dev_v2_object_prefix(rbd_dev);
35051e130199SAlex Elder 	if (ret < 0)
35061e130199SAlex Elder 		goto out_err;
3507b1b5402aSAlex Elder 
3508d889140cSAlex Elder 	/* Get the and check features for the image */
3509b1b5402aSAlex Elder 
3510b1b5402aSAlex Elder 	ret = rbd_dev_v2_features(rbd_dev);
3511b1b5402aSAlex Elder 	if (ret < 0)
3512b1b5402aSAlex Elder 		goto out_err;
351335d489f9SAlex Elder 
351486b00e0dSAlex Elder 	/* If the image supports layering, get the parent info */
351586b00e0dSAlex Elder 
351686b00e0dSAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
351786b00e0dSAlex Elder 		ret = rbd_dev_v2_parent_info(rbd_dev);
351886b00e0dSAlex Elder 		if (ret < 0)
351986b00e0dSAlex Elder 			goto out_err;
352086b00e0dSAlex Elder 	}
352186b00e0dSAlex Elder 
35226e14b1a6SAlex Elder 	/* crypto and compression type aren't (yet) supported for v2 images */
352335d489f9SAlex Elder 
35246e14b1a6SAlex Elder 	rbd_dev->header.crypt_type = 0;
35256e14b1a6SAlex Elder 	rbd_dev->header.comp_type = 0;
35266e14b1a6SAlex Elder 
35276e14b1a6SAlex Elder 	/* Get the snapshot context, plus the header version */
35286e14b1a6SAlex Elder 
35296e14b1a6SAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
353035d489f9SAlex Elder 	if (ret)
353135d489f9SAlex Elder 		goto out_err;
35326e14b1a6SAlex Elder 	rbd_dev->header.obj_version = ver;
35336e14b1a6SAlex Elder 
3534a30b71b9SAlex Elder 	rbd_dev->image_format = 2;
3535a30b71b9SAlex Elder 
3536a30b71b9SAlex Elder 	dout("discovered version 2 image, header name is %s\n",
3537a30b71b9SAlex Elder 		rbd_dev->header_name);
3538a30b71b9SAlex Elder 
353935152979SAlex Elder 	return 0;
35409d475de5SAlex Elder out_err:
354186b00e0dSAlex Elder 	rbd_dev->parent_overlap = 0;
354286b00e0dSAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
354386b00e0dSAlex Elder 	rbd_dev->parent_spec = NULL;
35449d475de5SAlex Elder 	kfree(rbd_dev->header_name);
35459d475de5SAlex Elder 	rbd_dev->header_name = NULL;
35461e130199SAlex Elder 	kfree(rbd_dev->header.object_prefix);
35471e130199SAlex Elder 	rbd_dev->header.object_prefix = NULL;
35489d475de5SAlex Elder 
35499d475de5SAlex Elder 	return ret;
3550a30b71b9SAlex Elder }
3551a30b71b9SAlex Elder 
355283a06263SAlex Elder static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
355383a06263SAlex Elder {
355483a06263SAlex Elder 	int ret;
355583a06263SAlex Elder 
355683a06263SAlex Elder 	/* no need to lock here, as rbd_dev is not registered yet */
355783a06263SAlex Elder 	ret = rbd_dev_snaps_update(rbd_dev);
355883a06263SAlex Elder 	if (ret)
355983a06263SAlex Elder 		return ret;
356083a06263SAlex Elder 
35619e15b77dSAlex Elder 	ret = rbd_dev_probe_update_spec(rbd_dev);
35629e15b77dSAlex Elder 	if (ret)
35639e15b77dSAlex Elder 		goto err_out_snaps;
35649e15b77dSAlex Elder 
356583a06263SAlex Elder 	ret = rbd_dev_set_mapping(rbd_dev);
356683a06263SAlex Elder 	if (ret)
356783a06263SAlex Elder 		goto err_out_snaps;
356883a06263SAlex Elder 
356983a06263SAlex Elder 	/* generate unique id: find highest unique id, add one */
357083a06263SAlex Elder 	rbd_dev_id_get(rbd_dev);
357183a06263SAlex Elder 
357283a06263SAlex Elder 	/* Fill in the device name, now that we have its id. */
357383a06263SAlex Elder 	BUILD_BUG_ON(DEV_NAME_LEN
357483a06263SAlex Elder 			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
357583a06263SAlex Elder 	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
357683a06263SAlex Elder 
357783a06263SAlex Elder 	/* Get our block major device number. */
357883a06263SAlex Elder 
357983a06263SAlex Elder 	ret = register_blkdev(0, rbd_dev->name);
358083a06263SAlex Elder 	if (ret < 0)
358183a06263SAlex Elder 		goto err_out_id;
358283a06263SAlex Elder 	rbd_dev->major = ret;
358383a06263SAlex Elder 
358483a06263SAlex Elder 	/* Set up the blkdev mapping. */
358583a06263SAlex Elder 
358683a06263SAlex Elder 	ret = rbd_init_disk(rbd_dev);
358783a06263SAlex Elder 	if (ret)
358883a06263SAlex Elder 		goto err_out_blkdev;
358983a06263SAlex Elder 
359083a06263SAlex Elder 	ret = rbd_bus_add_dev(rbd_dev);
359183a06263SAlex Elder 	if (ret)
359283a06263SAlex Elder 		goto err_out_disk;
359383a06263SAlex Elder 
359483a06263SAlex Elder 	/*
359583a06263SAlex Elder 	 * At this point cleanup in the event of an error is the job
359683a06263SAlex Elder 	 * of the sysfs code (initiated by rbd_bus_del_dev()).
359783a06263SAlex Elder 	 */
359883a06263SAlex Elder 	down_write(&rbd_dev->header_rwsem);
359983a06263SAlex Elder 	ret = rbd_dev_snaps_register(rbd_dev);
360083a06263SAlex Elder 	up_write(&rbd_dev->header_rwsem);
360183a06263SAlex Elder 	if (ret)
360283a06263SAlex Elder 		goto err_out_bus;
360383a06263SAlex Elder 
360483a06263SAlex Elder 	ret = rbd_init_watch_dev(rbd_dev);
360583a06263SAlex Elder 	if (ret)
360683a06263SAlex Elder 		goto err_out_bus;
360783a06263SAlex Elder 
360883a06263SAlex Elder 	/* Everything's ready.  Announce the disk to the world. */
360983a06263SAlex Elder 
361083a06263SAlex Elder 	add_disk(rbd_dev->disk);
361183a06263SAlex Elder 
361283a06263SAlex Elder 	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
361383a06263SAlex Elder 		(unsigned long long) rbd_dev->mapping.size);
361483a06263SAlex Elder 
361583a06263SAlex Elder 	return ret;
361683a06263SAlex Elder err_out_bus:
361783a06263SAlex Elder 	/* this will also clean up rest of rbd_dev stuff */
361883a06263SAlex Elder 
361983a06263SAlex Elder 	rbd_bus_del_dev(rbd_dev);
362083a06263SAlex Elder 
362183a06263SAlex Elder 	return ret;
362283a06263SAlex Elder err_out_disk:
362383a06263SAlex Elder 	rbd_free_disk(rbd_dev);
362483a06263SAlex Elder err_out_blkdev:
362583a06263SAlex Elder 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
362683a06263SAlex Elder err_out_id:
362783a06263SAlex Elder 	rbd_dev_id_put(rbd_dev);
362883a06263SAlex Elder err_out_snaps:
362983a06263SAlex Elder 	rbd_remove_all_snaps(rbd_dev);
363083a06263SAlex Elder 
363183a06263SAlex Elder 	return ret;
363283a06263SAlex Elder }
363383a06263SAlex Elder 
3634a30b71b9SAlex Elder /*
3635a30b71b9SAlex Elder  * Probe for the existence of the header object for the given rbd
3636a30b71b9SAlex Elder  * device.  For format 2 images this includes determining the image
3637a30b71b9SAlex Elder  * id.
3638a30b71b9SAlex Elder  */
3639a30b71b9SAlex Elder static int rbd_dev_probe(struct rbd_device *rbd_dev)
3640a30b71b9SAlex Elder {
3641a30b71b9SAlex Elder 	int ret;
3642a30b71b9SAlex Elder 
3643a30b71b9SAlex Elder 	/*
3644a30b71b9SAlex Elder 	 * Get the id from the image id object.  If it's not a
3645a30b71b9SAlex Elder 	 * format 2 image, we'll get ENOENT back, and we'll assume
3646a30b71b9SAlex Elder 	 * it's a format 1 image.
3647a30b71b9SAlex Elder 	 */
3648a30b71b9SAlex Elder 	ret = rbd_dev_image_id(rbd_dev);
3649a30b71b9SAlex Elder 	if (ret)
3650a30b71b9SAlex Elder 		ret = rbd_dev_v1_probe(rbd_dev);
3651a30b71b9SAlex Elder 	else
3652a30b71b9SAlex Elder 		ret = rbd_dev_v2_probe(rbd_dev);
365383a06263SAlex Elder 	if (ret) {
3654a30b71b9SAlex Elder 		dout("probe failed, returning %d\n", ret);
3655a30b71b9SAlex Elder 
3656a30b71b9SAlex Elder 		return ret;
3657a30b71b9SAlex Elder 	}
3658a30b71b9SAlex Elder 
365983a06263SAlex Elder 	ret = rbd_dev_probe_finish(rbd_dev);
366083a06263SAlex Elder 	if (ret)
366183a06263SAlex Elder 		rbd_header_free(&rbd_dev->header);
366283a06263SAlex Elder 
366383a06263SAlex Elder 	return ret;
366483a06263SAlex Elder }
366583a06263SAlex Elder 
366659c2be1eSYehuda Sadeh static ssize_t rbd_add(struct bus_type *bus,
366759c2be1eSYehuda Sadeh 		       const char *buf,
366859c2be1eSYehuda Sadeh 		       size_t count)
3669602adf40SYehuda Sadeh {
3670cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
3671dc79b113SAlex Elder 	struct ceph_options *ceph_opts = NULL;
36724e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
3673859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
36749d3997fdSAlex Elder 	struct rbd_client *rbdc;
367527cc2594SAlex Elder 	struct ceph_osd_client *osdc;
367627cc2594SAlex Elder 	int rc = -ENOMEM;
3677602adf40SYehuda Sadeh 
3678602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
3679602adf40SYehuda Sadeh 		return -ENODEV;
3680602adf40SYehuda Sadeh 
3681a725f65eSAlex Elder 	/* parse add command */
3682859c31dfSAlex Elder 	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
3683dc79b113SAlex Elder 	if (rc < 0)
3684bd4ba655SAlex Elder 		goto err_out_module;
3685a725f65eSAlex Elder 
36869d3997fdSAlex Elder 	rbdc = rbd_get_client(ceph_opts);
36879d3997fdSAlex Elder 	if (IS_ERR(rbdc)) {
36889d3997fdSAlex Elder 		rc = PTR_ERR(rbdc);
36890ddebc0cSAlex Elder 		goto err_out_args;
36909d3997fdSAlex Elder 	}
3691c53d5893SAlex Elder 	ceph_opts = NULL;	/* rbd_dev client now owns this */
3692602adf40SYehuda Sadeh 
3693602adf40SYehuda Sadeh 	/* pick the pool */
36949d3997fdSAlex Elder 	osdc = &rbdc->client->osdc;
3695859c31dfSAlex Elder 	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
3696602adf40SYehuda Sadeh 	if (rc < 0)
3697602adf40SYehuda Sadeh 		goto err_out_client;
3698859c31dfSAlex Elder 	spec->pool_id = (u64) rc;
3699859c31dfSAlex Elder 
3700c53d5893SAlex Elder 	rbd_dev = rbd_dev_create(rbdc, spec);
3701bd4ba655SAlex Elder 	if (!rbd_dev)
3702bd4ba655SAlex Elder 		goto err_out_client;
3703c53d5893SAlex Elder 	rbdc = NULL;		/* rbd_dev now owns this */
3704c53d5893SAlex Elder 	spec = NULL;		/* rbd_dev now owns this */
3705602adf40SYehuda Sadeh 
3706bd4ba655SAlex Elder 	rbd_dev->mapping.read_only = rbd_opts->read_only;
3707c53d5893SAlex Elder 	kfree(rbd_opts);
3708c53d5893SAlex Elder 	rbd_opts = NULL;	/* done with this */
3709bd4ba655SAlex Elder 
3710a30b71b9SAlex Elder 	rc = rbd_dev_probe(rbd_dev);
3711a30b71b9SAlex Elder 	if (rc < 0)
3712c53d5893SAlex Elder 		goto err_out_rbd_dev;
371305fd6f6fSAlex Elder 
3714602adf40SYehuda Sadeh 	return count;
3715c53d5893SAlex Elder err_out_rbd_dev:
3716c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
3717bd4ba655SAlex Elder err_out_client:
37189d3997fdSAlex Elder 	rbd_put_client(rbdc);
37190ddebc0cSAlex Elder err_out_args:
372078cea76eSAlex Elder 	if (ceph_opts)
372178cea76eSAlex Elder 		ceph_destroy_options(ceph_opts);
37224e9afebaSAlex Elder 	kfree(rbd_opts);
3723859c31dfSAlex Elder 	rbd_spec_put(spec);
3724bd4ba655SAlex Elder err_out_module:
3725bd4ba655SAlex Elder 	module_put(THIS_MODULE);
372627cc2594SAlex Elder 
3727602adf40SYehuda Sadeh 	dout("Error adding device %s\n", buf);
372827cc2594SAlex Elder 
372927cc2594SAlex Elder 	return (ssize_t) rc;
3730602adf40SYehuda Sadeh }
3731602adf40SYehuda Sadeh 
3732de71a297SAlex Elder static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
3733602adf40SYehuda Sadeh {
3734602adf40SYehuda Sadeh 	struct list_head *tmp;
3735602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev;
3736602adf40SYehuda Sadeh 
3737e124a82fSAlex Elder 	spin_lock(&rbd_dev_list_lock);
3738602adf40SYehuda Sadeh 	list_for_each(tmp, &rbd_dev_list) {
3739602adf40SYehuda Sadeh 		rbd_dev = list_entry(tmp, struct rbd_device, node);
3740de71a297SAlex Elder 		if (rbd_dev->dev_id == dev_id) {
3741e124a82fSAlex Elder 			spin_unlock(&rbd_dev_list_lock);
3742602adf40SYehuda Sadeh 			return rbd_dev;
3743602adf40SYehuda Sadeh 		}
3744e124a82fSAlex Elder 	}
3745e124a82fSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
3746602adf40SYehuda Sadeh 	return NULL;
3747602adf40SYehuda Sadeh }
3748602adf40SYehuda Sadeh 
3749dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev)
3750602adf40SYehuda Sadeh {
3751593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3752602adf40SYehuda Sadeh 
37531dbb4399SAlex Elder 	if (rbd_dev->watch_request) {
37541dbb4399SAlex Elder 		struct ceph_client *client = rbd_dev->rbd_client->client;
37551dbb4399SAlex Elder 
37561dbb4399SAlex Elder 		ceph_osdc_unregister_linger_request(&client->osdc,
375759c2be1eSYehuda Sadeh 						    rbd_dev->watch_request);
37581dbb4399SAlex Elder 	}
375959c2be1eSYehuda Sadeh 	if (rbd_dev->watch_event)
3760070c633fSAlex Elder 		rbd_req_sync_unwatch(rbd_dev);
376159c2be1eSYehuda Sadeh 
3762602adf40SYehuda Sadeh 
3763602adf40SYehuda Sadeh 	/* clean up and free blkdev */
3764602adf40SYehuda Sadeh 	rbd_free_disk(rbd_dev);
3765602adf40SYehuda Sadeh 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
376632eec68dSAlex Elder 
37672ac4e75dSAlex Elder 	/* release allocated disk header fields */
37682ac4e75dSAlex Elder 	rbd_header_free(&rbd_dev->header);
37692ac4e75dSAlex Elder 
377032eec68dSAlex Elder 	/* done with the id, and with the rbd_dev */
3771e2839308SAlex Elder 	rbd_dev_id_put(rbd_dev);
3772c53d5893SAlex Elder 	rbd_assert(rbd_dev->rbd_client != NULL);
3773c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
3774602adf40SYehuda Sadeh 
3775602adf40SYehuda Sadeh 	/* release module ref */
3776602adf40SYehuda Sadeh 	module_put(THIS_MODULE);
3777602adf40SYehuda Sadeh }
3778602adf40SYehuda Sadeh 
3779dfc5606dSYehuda Sadeh static ssize_t rbd_remove(struct bus_type *bus,
3780602adf40SYehuda Sadeh 			  const char *buf,
3781602adf40SYehuda Sadeh 			  size_t count)
3782602adf40SYehuda Sadeh {
3783602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
3784602adf40SYehuda Sadeh 	int target_id, rc;
3785602adf40SYehuda Sadeh 	unsigned long ul;
3786602adf40SYehuda Sadeh 	int ret = count;
3787602adf40SYehuda Sadeh 
3788602adf40SYehuda Sadeh 	rc = strict_strtoul(buf, 10, &ul);
3789602adf40SYehuda Sadeh 	if (rc)
3790602adf40SYehuda Sadeh 		return rc;
3791602adf40SYehuda Sadeh 
3792602adf40SYehuda Sadeh 	/* convert to int; abort if we lost anything in the conversion */
3793602adf40SYehuda Sadeh 	target_id = (int) ul;
3794602adf40SYehuda Sadeh 	if (target_id != ul)
3795602adf40SYehuda Sadeh 		return -EINVAL;
3796602adf40SYehuda Sadeh 
3797602adf40SYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3798602adf40SYehuda Sadeh 
3799602adf40SYehuda Sadeh 	rbd_dev = __rbd_get_dev(target_id);
3800602adf40SYehuda Sadeh 	if (!rbd_dev) {
3801602adf40SYehuda Sadeh 		ret = -ENOENT;
3802602adf40SYehuda Sadeh 		goto done;
3803602adf40SYehuda Sadeh 	}
3804602adf40SYehuda Sadeh 
380542382b70SAlex Elder 	if (rbd_dev->open_count) {
380642382b70SAlex Elder 		ret = -EBUSY;
380742382b70SAlex Elder 		goto done;
380842382b70SAlex Elder 	}
380942382b70SAlex Elder 
381041f38c2bSAlex Elder 	rbd_remove_all_snaps(rbd_dev);
3811dfc5606dSYehuda Sadeh 	rbd_bus_del_dev(rbd_dev);
3812602adf40SYehuda Sadeh 
3813602adf40SYehuda Sadeh done:
3814602adf40SYehuda Sadeh 	mutex_unlock(&ctl_mutex);
3815aafb230eSAlex Elder 
3816602adf40SYehuda Sadeh 	return ret;
3817602adf40SYehuda Sadeh }
3818602adf40SYehuda Sadeh 
3819602adf40SYehuda Sadeh /*
3820602adf40SYehuda Sadeh  * create control files in sysfs
3821dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
3822602adf40SYehuda Sadeh  */
3823602adf40SYehuda Sadeh static int rbd_sysfs_init(void)
3824602adf40SYehuda Sadeh {
3825dfc5606dSYehuda Sadeh 	int ret;
3826602adf40SYehuda Sadeh 
3827fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
3828dfc5606dSYehuda Sadeh 	if (ret < 0)
3829dfc5606dSYehuda Sadeh 		return ret;
3830602adf40SYehuda Sadeh 
3831fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
3832fed4c143SAlex Elder 	if (ret < 0)
3833fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
3834602adf40SYehuda Sadeh 
3835602adf40SYehuda Sadeh 	return ret;
3836602adf40SYehuda Sadeh }
3837602adf40SYehuda Sadeh 
3838602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void)
3839602adf40SYehuda Sadeh {
3840dfc5606dSYehuda Sadeh 	bus_unregister(&rbd_bus_type);
3841fed4c143SAlex Elder 	device_unregister(&rbd_root_dev);
3842602adf40SYehuda Sadeh }
3843602adf40SYehuda Sadeh 
3844602adf40SYehuda Sadeh int __init rbd_init(void)
3845602adf40SYehuda Sadeh {
3846602adf40SYehuda Sadeh 	int rc;
3847602adf40SYehuda Sadeh 
3848602adf40SYehuda Sadeh 	rc = rbd_sysfs_init();
3849602adf40SYehuda Sadeh 	if (rc)
3850602adf40SYehuda Sadeh 		return rc;
3851f0f8cef5SAlex Elder 	pr_info("loaded " RBD_DRV_NAME_LONG "\n");
3852602adf40SYehuda Sadeh 	return 0;
3853602adf40SYehuda Sadeh }
3854602adf40SYehuda Sadeh 
3855602adf40SYehuda Sadeh void __exit rbd_exit(void)
3856602adf40SYehuda Sadeh {
3857602adf40SYehuda Sadeh 	rbd_sysfs_cleanup();
3858602adf40SYehuda Sadeh }
3859602adf40SYehuda Sadeh 
/* Hook rbd_init()/rbd_exit() up as the module's init and exit routines. */
3860602adf40SYehuda Sadeh module_init(rbd_init);
3861602adf40SYehuda Sadeh module_exit(rbd_exit);
3862602adf40SYehuda Sadeh 
/* Module metadata: authors, description and license. */
3863602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
3864602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
3865602adf40SYehuda Sadeh MODULE_DESCRIPTION("rados block device");
3866602adf40SYehuda Sadeh 
3867602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */
3868602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
3869602adf40SYehuda Sadeh 
3870602adf40SYehuda Sadeh MODULE_LICENSE("GPL");
3871