xref: /openbmc/linux/drivers/block/rbd.c (revision a7b4c65f)
1602adf40SYehuda Sadeh /*
2602adf40SYehuda Sadeh    rbd.c -- Export ceph rados objects as a Linux block device
3602adf40SYehuda Sadeh 
4602adf40SYehuda Sadeh 
5602adf40SYehuda Sadeh    based on drivers/block/osdblk.c:
6602adf40SYehuda Sadeh 
7602adf40SYehuda Sadeh    Copyright 2009 Red Hat, Inc.
8602adf40SYehuda Sadeh 
9602adf40SYehuda Sadeh    This program is free software; you can redistribute it and/or modify
10602adf40SYehuda Sadeh    it under the terms of the GNU General Public License as published by
11602adf40SYehuda Sadeh    the Free Software Foundation.
12602adf40SYehuda Sadeh 
13602adf40SYehuda Sadeh    This program is distributed in the hope that it will be useful,
14602adf40SYehuda Sadeh    but WITHOUT ANY WARRANTY; without even the implied warranty of
15602adf40SYehuda Sadeh    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16602adf40SYehuda Sadeh    GNU General Public License for more details.
17602adf40SYehuda Sadeh 
18602adf40SYehuda Sadeh    You should have received a copy of the GNU General Public License
19602adf40SYehuda Sadeh    along with this program; see the file COPYING.  If not, write to
20602adf40SYehuda Sadeh    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21602adf40SYehuda Sadeh 
22602adf40SYehuda Sadeh 
23602adf40SYehuda Sadeh 
24dfc5606dSYehuda Sadeh    For usage instructions, please refer to:
25602adf40SYehuda Sadeh 
26dfc5606dSYehuda Sadeh                  Documentation/ABI/testing/sysfs-bus-rbd
27602adf40SYehuda Sadeh 
28602adf40SYehuda Sadeh  */
29602adf40SYehuda Sadeh 
30602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
31602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
32602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3459c2be1eSYehuda Sadeh #include <linux/parser.h>
35602adf40SYehuda Sadeh 
36602adf40SYehuda Sadeh #include <linux/kernel.h>
37602adf40SYehuda Sadeh #include <linux/device.h>
38602adf40SYehuda Sadeh #include <linux/module.h>
39602adf40SYehuda Sadeh #include <linux/fs.h>
40602adf40SYehuda Sadeh #include <linux/blkdev.h>
41602adf40SYehuda Sadeh 
42602adf40SYehuda Sadeh #include "rbd_types.h"
43602adf40SYehuda Sadeh 
#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * Block I/O is expressed in sectors.  Linux interprets "sector" in a
 * number of contexts (blk, bio, genhd), but the default size is
 * universally 512 bytes.  These names are only slightly more
 * meaningful than the bare numbers they stand for.
 */
#define SECTOR_SHIFT	9
#define SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/* Largest u64 value.  It might be useful to have this defined elsewhere too */
#define U64_MAX		((u64) (~0ULL))

/* Short and long driver names */
#define RBD_DRV_NAME		"rbd"
#define RBD_DRV_NAME_LONG	"rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256	/* max minors per blkdev */

/*
 * Snapshot names are limited to NAME_MAX less the length of the
 * "snap_" prefix (its terminating NUL is not counted).
 */
#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN \
		(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */
#define RBD_MAX_OPT_LEN		1024

#define RBD_SNAP_HEAD_NAME	"-"

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

/* Feature bits */
#define RBD_FEATURE_LAYERING	1

/* Features supported by this (client software) implementation. */
#define RBD_FEATURES_ALL	(0)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)

/* Mappings are writable unless an option says otherwise */
#define RBD_READ_ONLY_DEFAULT	false

98602adf40SYehuda Sadeh /*
99602adf40SYehuda Sadeh  * block device image metadata (in-memory version)
100602adf40SYehuda Sadeh  */
101602adf40SYehuda Sadeh struct rbd_image_header {
102f84344f3SAlex Elder 	/* These four fields never change for a given rbd image */
103849b4260SAlex Elder 	char *object_prefix;
10434b13184SAlex Elder 	u64 features;
105602adf40SYehuda Sadeh 	__u8 obj_order;
106602adf40SYehuda Sadeh 	__u8 crypt_type;
107602adf40SYehuda Sadeh 	__u8 comp_type;
108602adf40SYehuda Sadeh 
109f84344f3SAlex Elder 	/* The remaining fields need to be updated occasionally */
110f84344f3SAlex Elder 	u64 image_size;
111f84344f3SAlex Elder 	struct ceph_snap_context *snapc;
112602adf40SYehuda Sadeh 	char *snap_names;
113602adf40SYehuda Sadeh 	u64 *snap_sizes;
11459c2be1eSYehuda Sadeh 
11559c2be1eSYehuda Sadeh 	u64 obj_version;
11659c2be1eSYehuda Sadeh };
11759c2be1eSYehuda Sadeh 
1180d7dbfceSAlex Elder /*
1190d7dbfceSAlex Elder  * An rbd image specification.
1200d7dbfceSAlex Elder  *
1210d7dbfceSAlex Elder  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
122c66c6e0cSAlex Elder  * identify an image.  Each rbd_dev structure includes a pointer to
123c66c6e0cSAlex Elder  * an rbd_spec structure that encapsulates this identity.
124c66c6e0cSAlex Elder  *
125c66c6e0cSAlex Elder  * Each of the id's in an rbd_spec has an associated name.  For a
126c66c6e0cSAlex Elder  * user-mapped image, the names are supplied and the id's associated
127c66c6e0cSAlex Elder  * with them are looked up.  For a layered image, a parent image is
128c66c6e0cSAlex Elder  * defined by the tuple, and the names are looked up.
129c66c6e0cSAlex Elder  *
130c66c6e0cSAlex Elder  * An rbd_dev structure contains a parent_spec pointer which is
131c66c6e0cSAlex Elder  * non-null if the image it represents is a child in a layered
132c66c6e0cSAlex Elder  * image.  This pointer will refer to the rbd_spec structure used
133c66c6e0cSAlex Elder  * by the parent rbd_dev for its own identity (i.e., the structure
134c66c6e0cSAlex Elder  * is shared between the parent and child).
135c66c6e0cSAlex Elder  *
136c66c6e0cSAlex Elder  * Since these structures are populated once, during the discovery
137c66c6e0cSAlex Elder  * phase of image construction, they are effectively immutable so
138c66c6e0cSAlex Elder  * we make no effort to synchronize access to them.
139c66c6e0cSAlex Elder  *
140c66c6e0cSAlex Elder  * Note that code herein does not assume the image name is known (it
141c66c6e0cSAlex Elder  * could be a null pointer).
1420d7dbfceSAlex Elder  */
1430d7dbfceSAlex Elder struct rbd_spec {
1440d7dbfceSAlex Elder 	u64		pool_id;
1450d7dbfceSAlex Elder 	char		*pool_name;
1460d7dbfceSAlex Elder 
1470d7dbfceSAlex Elder 	char		*image_id;
1480d7dbfceSAlex Elder 	char		*image_name;
1490d7dbfceSAlex Elder 
1500d7dbfceSAlex Elder 	u64		snap_id;
1510d7dbfceSAlex Elder 	char		*snap_name;
1520d7dbfceSAlex Elder 
1530d7dbfceSAlex Elder 	struct kref	kref;
1540d7dbfceSAlex Elder };
1550d7dbfceSAlex Elder 
15659c2be1eSYehuda Sadeh struct rbd_options {
157cc0538b6SAlex Elder 	bool	read_only;
158602adf40SYehuda Sadeh };
159602adf40SYehuda Sadeh 
160602adf40SYehuda Sadeh /*
161f0f8cef5SAlex Elder  * an instance of the client.  multiple devices may share an rbd client.
162602adf40SYehuda Sadeh  */
163602adf40SYehuda Sadeh struct rbd_client {
164602adf40SYehuda Sadeh 	struct ceph_client	*client;
165602adf40SYehuda Sadeh 	struct kref		kref;
166602adf40SYehuda Sadeh 	struct list_head	node;
167602adf40SYehuda Sadeh };
168602adf40SYehuda Sadeh 
169602adf40SYehuda Sadeh /*
170f0f8cef5SAlex Elder  * a request completion status
171602adf40SYehuda Sadeh  */
1721fec7093SYehuda Sadeh struct rbd_req_status {
1731fec7093SYehuda Sadeh 	int done;
1748986cb37SAlex Elder 	s32 rc;
1751fec7093SYehuda Sadeh 	u64 bytes;
1761fec7093SYehuda Sadeh };
1771fec7093SYehuda Sadeh 
1781fec7093SYehuda Sadeh /*
1791fec7093SYehuda Sadeh  * a collection of requests
1801fec7093SYehuda Sadeh  */
1811fec7093SYehuda Sadeh struct rbd_req_coll {
1821fec7093SYehuda Sadeh 	int			total;
1831fec7093SYehuda Sadeh 	int			num_done;
1841fec7093SYehuda Sadeh 	struct kref		kref;
1851fec7093SYehuda Sadeh 	struct rbd_req_status	status[0];
186602adf40SYehuda Sadeh };
187602adf40SYehuda Sadeh 
188f0f8cef5SAlex Elder /*
189f0f8cef5SAlex Elder  * a single io request
190f0f8cef5SAlex Elder  */
191f0f8cef5SAlex Elder struct rbd_request {
192f0f8cef5SAlex Elder 	struct request		*rq;		/* blk layer request */
193f0f8cef5SAlex Elder 	struct bio		*bio;		/* cloned bio */
194f0f8cef5SAlex Elder 	struct page		**pages;	/* list of used pages */
195f0f8cef5SAlex Elder 	u64			len;
196f0f8cef5SAlex Elder 	int			coll_index;
197f0f8cef5SAlex Elder 	struct rbd_req_coll	*coll;
198f0f8cef5SAlex Elder };
199f0f8cef5SAlex Elder 
200dfc5606dSYehuda Sadeh struct rbd_snap {
201dfc5606dSYehuda Sadeh 	struct	device		dev;
202dfc5606dSYehuda Sadeh 	const char		*name;
2033591538fSJosh Durgin 	u64			size;
204dfc5606dSYehuda Sadeh 	struct list_head	node;
205dfc5606dSYehuda Sadeh 	u64			id;
20634b13184SAlex Elder 	u64			features;
207dfc5606dSYehuda Sadeh };
208dfc5606dSYehuda Sadeh 
209f84344f3SAlex Elder struct rbd_mapping {
21099c1f08fSAlex Elder 	u64                     size;
21134b13184SAlex Elder 	u64                     features;
212f84344f3SAlex Elder 	bool			read_only;
213f84344f3SAlex Elder };
214f84344f3SAlex Elder 
215602adf40SYehuda Sadeh /*
216602adf40SYehuda Sadeh  * a single device
217602adf40SYehuda Sadeh  */
218602adf40SYehuda Sadeh struct rbd_device {
219de71a297SAlex Elder 	int			dev_id;		/* blkdev unique id */
220602adf40SYehuda Sadeh 
221602adf40SYehuda Sadeh 	int			major;		/* blkdev assigned major */
222602adf40SYehuda Sadeh 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
223602adf40SYehuda Sadeh 
224a30b71b9SAlex Elder 	u32			image_format;	/* Either 1 or 2 */
225602adf40SYehuda Sadeh 	struct rbd_client	*rbd_client;
226602adf40SYehuda Sadeh 
227602adf40SYehuda Sadeh 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
228602adf40SYehuda Sadeh 
229602adf40SYehuda Sadeh 	spinlock_t		lock;		/* queue lock */
230602adf40SYehuda Sadeh 
231602adf40SYehuda Sadeh 	struct rbd_image_header	header;
232d78b650aSAlex Elder 	atomic_t		exists;
2330d7dbfceSAlex Elder 	struct rbd_spec		*spec;
234602adf40SYehuda Sadeh 
2350d7dbfceSAlex Elder 	char			*header_name;
236971f839aSAlex Elder 
23759c2be1eSYehuda Sadeh 	struct ceph_osd_event   *watch_event;
23859c2be1eSYehuda Sadeh 	struct ceph_osd_request *watch_request;
23959c2be1eSYehuda Sadeh 
24086b00e0dSAlex Elder 	struct rbd_spec		*parent_spec;
24186b00e0dSAlex Elder 	u64			parent_overlap;
24286b00e0dSAlex Elder 
243c666601aSJosh Durgin 	/* protects updating the header */
244c666601aSJosh Durgin 	struct rw_semaphore     header_rwsem;
245f84344f3SAlex Elder 
246f84344f3SAlex Elder 	struct rbd_mapping	mapping;
247602adf40SYehuda Sadeh 
248602adf40SYehuda Sadeh 	struct list_head	node;
249dfc5606dSYehuda Sadeh 
250dfc5606dSYehuda Sadeh 	/* list of snapshots */
251dfc5606dSYehuda Sadeh 	struct list_head	snaps;
252dfc5606dSYehuda Sadeh 
253dfc5606dSYehuda Sadeh 	/* sysfs related */
254dfc5606dSYehuda Sadeh 	struct device		dev;
25542382b70SAlex Elder 	unsigned long		open_count;
256dfc5606dSYehuda Sadeh };
257dfc5606dSYehuda Sadeh 
/* Serialize open/close/setup/teardown */
static DEFINE_MUTEX(ctl_mutex);

/* All rbd devices; by its name, rbd_dev_list_lock is the lock for this list */
static LIST_HEAD(rbd_dev_list);
static DEFINE_SPINLOCK(rbd_dev_list_lock);

/* All rbd clients; list updates are made under rbd_client_list_lock */
static LIST_HEAD(rbd_client_list);
static DEFINE_SPINLOCK(rbd_client_list_lock);
265602adf40SYehuda Sadeh 
266304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
267304f6808SAlex Elder static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
268304f6808SAlex Elder 
269dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev);
27041f38c2bSAlex Elder static void rbd_remove_snap_dev(struct rbd_snap *snap);
271dfc5606dSYehuda Sadeh 
272f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf,
273f0f8cef5SAlex Elder 		       size_t count);
274f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
275f0f8cef5SAlex Elder 			  size_t count);
276f0f8cef5SAlex Elder 
277f0f8cef5SAlex Elder static struct bus_attribute rbd_bus_attrs[] = {
278f0f8cef5SAlex Elder 	__ATTR(add, S_IWUSR, NULL, rbd_add),
279f0f8cef5SAlex Elder 	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
280f0f8cef5SAlex Elder 	__ATTR_NULL
281f0f8cef5SAlex Elder };
282f0f8cef5SAlex Elder 
283f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = {
284f0f8cef5SAlex Elder 	.name		= "rbd",
285f0f8cef5SAlex Elder 	.bus_attrs	= rbd_bus_attrs,
286f0f8cef5SAlex Elder };
287f0f8cef5SAlex Elder 
288f0f8cef5SAlex Elder static void rbd_root_dev_release(struct device *dev)
289f0f8cef5SAlex Elder {
290f0f8cef5SAlex Elder }
291f0f8cef5SAlex Elder 
292f0f8cef5SAlex Elder static struct device rbd_root_dev = {
293f0f8cef5SAlex Elder 	.init_name =    "rbd",
294f0f8cef5SAlex Elder 	.release =      rbd_root_dev_release,
295f0f8cef5SAlex Elder };
296f0f8cef5SAlex Elder 
29706ecc6cbSAlex Elder static __printf(2, 3)
29806ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
29906ecc6cbSAlex Elder {
30006ecc6cbSAlex Elder 	struct va_format vaf;
30106ecc6cbSAlex Elder 	va_list args;
30206ecc6cbSAlex Elder 
30306ecc6cbSAlex Elder 	va_start(args, fmt);
30406ecc6cbSAlex Elder 	vaf.fmt = fmt;
30506ecc6cbSAlex Elder 	vaf.va = &args;
30606ecc6cbSAlex Elder 
30706ecc6cbSAlex Elder 	if (!rbd_dev)
30806ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
30906ecc6cbSAlex Elder 	else if (rbd_dev->disk)
31006ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %s: %pV\n",
31106ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
31206ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_name)
31306ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: image %s: %pV\n",
31406ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
31506ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_id)
31606ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: id %s: %pV\n",
31706ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
31806ecc6cbSAlex Elder 	else	/* punt */
31906ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
32006ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev, &vaf);
32106ecc6cbSAlex Elder 	va_end(args);
32206ecc6cbSAlex Elder }
32306ecc6cbSAlex Elder 
/*
 * rbd_assert(expr) - when RBD_DEBUG is defined, report a failed
 * assertion (function, line and expression text) and BUG(); otherwise
 * it expands to a no-op and expr is not evaluated.
 *
 * The debug variant is wrapped in do { } while (0) so that it acts as
 * a single statement; a bare "if { }" expansion would capture the
 * "else" of an enclosing unbraced if/else.
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
		do {							\
			if (unlikely(!(expr))) {			\
				printk(KERN_ERR "\nAssertion failure in %s() " \
							"at line %d:\n\n" \
						"\trbd_assert(%s);\n\n", \
						__func__, __LINE__, #expr); \
				BUG();					\
			}						\
		} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
336dfc5606dSYehuda Sadeh 
337117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
338117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
33959c2be1eSYehuda Sadeh 
340602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
341602adf40SYehuda Sadeh {
342f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
343602adf40SYehuda Sadeh 
344f84344f3SAlex Elder 	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
345602adf40SYehuda Sadeh 		return -EROFS;
346602adf40SYehuda Sadeh 
34742382b70SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
348c3e946ceSAlex Elder 	(void) get_device(&rbd_dev->dev);
349f84344f3SAlex Elder 	set_device_ro(bdev, rbd_dev->mapping.read_only);
35042382b70SAlex Elder 	rbd_dev->open_count++;
35142382b70SAlex Elder 	mutex_unlock(&ctl_mutex);
352340c7a2bSAlex Elder 
353602adf40SYehuda Sadeh 	return 0;
354602adf40SYehuda Sadeh }
355602adf40SYehuda Sadeh 
356dfc5606dSYehuda Sadeh static int rbd_release(struct gendisk *disk, fmode_t mode)
357dfc5606dSYehuda Sadeh {
358dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
359dfc5606dSYehuda Sadeh 
36042382b70SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
36142382b70SAlex Elder 	rbd_assert(rbd_dev->open_count > 0);
36242382b70SAlex Elder 	rbd_dev->open_count--;
363c3e946ceSAlex Elder 	put_device(&rbd_dev->dev);
36442382b70SAlex Elder 	mutex_unlock(&ctl_mutex);
365dfc5606dSYehuda Sadeh 
366dfc5606dSYehuda Sadeh 	return 0;
367dfc5606dSYehuda Sadeh }
368dfc5606dSYehuda Sadeh 
369602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = {
370602adf40SYehuda Sadeh 	.owner			= THIS_MODULE,
371602adf40SYehuda Sadeh 	.open			= rbd_open,
372dfc5606dSYehuda Sadeh 	.release		= rbd_release,
373602adf40SYehuda Sadeh };
374602adf40SYehuda Sadeh 
375602adf40SYehuda Sadeh /*
376602adf40SYehuda Sadeh  * Initialize an rbd client instance.
37743ae4701SAlex Elder  * We own *ceph_opts.
378602adf40SYehuda Sadeh  */
379f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
380602adf40SYehuda Sadeh {
381602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
382602adf40SYehuda Sadeh 	int ret = -ENOMEM;
383602adf40SYehuda Sadeh 
384602adf40SYehuda Sadeh 	dout("rbd_client_create\n");
385602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
386602adf40SYehuda Sadeh 	if (!rbdc)
387602adf40SYehuda Sadeh 		goto out_opt;
388602adf40SYehuda Sadeh 
389602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
390602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
391602adf40SYehuda Sadeh 
392bc534d86SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
393bc534d86SAlex Elder 
39443ae4701SAlex Elder 	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
395602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
396bc534d86SAlex Elder 		goto out_mutex;
39743ae4701SAlex Elder 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
398602adf40SYehuda Sadeh 
399602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
400602adf40SYehuda Sadeh 	if (ret < 0)
401602adf40SYehuda Sadeh 		goto out_err;
402602adf40SYehuda Sadeh 
403432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
404602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
405432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
406602adf40SYehuda Sadeh 
407bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
408bc534d86SAlex Elder 
409602adf40SYehuda Sadeh 	dout("rbd_client_create created %p\n", rbdc);
410602adf40SYehuda Sadeh 	return rbdc;
411602adf40SYehuda Sadeh 
412602adf40SYehuda Sadeh out_err:
413602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
414bc534d86SAlex Elder out_mutex:
415bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
416602adf40SYehuda Sadeh 	kfree(rbdc);
417602adf40SYehuda Sadeh out_opt:
41843ae4701SAlex Elder 	if (ceph_opts)
41943ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
42028f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
421602adf40SYehuda Sadeh }
422602adf40SYehuda Sadeh 
423602adf40SYehuda Sadeh /*
4241f7ba331SAlex Elder  * Find a ceph client with specific addr and configuration.  If
4251f7ba331SAlex Elder  * found, bump its reference count.
426602adf40SYehuda Sadeh  */
4271f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
428602adf40SYehuda Sadeh {
429602adf40SYehuda Sadeh 	struct rbd_client *client_node;
4301f7ba331SAlex Elder 	bool found = false;
431602adf40SYehuda Sadeh 
43243ae4701SAlex Elder 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
433602adf40SYehuda Sadeh 		return NULL;
434602adf40SYehuda Sadeh 
4351f7ba331SAlex Elder 	spin_lock(&rbd_client_list_lock);
4361f7ba331SAlex Elder 	list_for_each_entry(client_node, &rbd_client_list, node) {
4371f7ba331SAlex Elder 		if (!ceph_compare_options(ceph_opts, client_node->client)) {
4381f7ba331SAlex Elder 			kref_get(&client_node->kref);
4391f7ba331SAlex Elder 			found = true;
4401f7ba331SAlex Elder 			break;
4411f7ba331SAlex Elder 		}
4421f7ba331SAlex Elder 	}
4431f7ba331SAlex Elder 	spin_unlock(&rbd_client_list_lock);
4441f7ba331SAlex Elder 
4451f7ba331SAlex Elder 	return found ? client_node : NULL;
446602adf40SYehuda Sadeh }
447602adf40SYehuda Sadeh 
/*
 * Option tokens, grouped by the kind of argument they take.  Each
 * Opt_last_* marker separates one group from the next, and
 * parse_rbd_opts_token() classifies a token by comparing it with
 * these markers.  New tokens belong in the slot for their group.
 */
enum {
	/* (tokens taking an int argument go here) */
	Opt_last_int,

	/* (tokens taking a string argument go here) */
	Opt_last_string,

	/* Boolean tokens */
	Opt_read_only,
	Opt_read_write,
	Opt_last_bool,
};
46159c2be1eSYehuda Sadeh 
46243ae4701SAlex Elder static match_table_t rbd_opts_tokens = {
46359c2be1eSYehuda Sadeh 	/* int args above */
46459c2be1eSYehuda Sadeh 	/* string args above */
465be466c1cSAlex Elder 	{Opt_read_only, "read_only"},
466cc0538b6SAlex Elder 	{Opt_read_only, "ro"},		/* Alternate spelling */
467cc0538b6SAlex Elder 	{Opt_read_write, "read_write"},
468cc0538b6SAlex Elder 	{Opt_read_write, "rw"},		/* Alternate spelling */
469cc0538b6SAlex Elder 	/* Boolean args above */
47059c2be1eSYehuda Sadeh 	{-1, NULL}
47159c2be1eSYehuda Sadeh };
47259c2be1eSYehuda Sadeh 
47359c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
47459c2be1eSYehuda Sadeh {
47543ae4701SAlex Elder 	struct rbd_options *rbd_opts = private;
47659c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
47759c2be1eSYehuda Sadeh 	int token, intval, ret;
47859c2be1eSYehuda Sadeh 
47943ae4701SAlex Elder 	token = match_token(c, rbd_opts_tokens, argstr);
48059c2be1eSYehuda Sadeh 	if (token < 0)
48159c2be1eSYehuda Sadeh 		return -EINVAL;
48259c2be1eSYehuda Sadeh 
48359c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
48459c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
48559c2be1eSYehuda Sadeh 		if (ret < 0) {
48659c2be1eSYehuda Sadeh 			pr_err("bad mount option arg (not int) "
48759c2be1eSYehuda Sadeh 			       "at '%s'\n", c);
48859c2be1eSYehuda Sadeh 			return ret;
48959c2be1eSYehuda Sadeh 		}
49059c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
49159c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
49259c2be1eSYehuda Sadeh 		dout("got string token %d val %s\n", token,
49359c2be1eSYehuda Sadeh 		     argstr[0].from);
494cc0538b6SAlex Elder 	} else if (token > Opt_last_string && token < Opt_last_bool) {
495cc0538b6SAlex Elder 		dout("got Boolean token %d\n", token);
49659c2be1eSYehuda Sadeh 	} else {
49759c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
49859c2be1eSYehuda Sadeh 	}
49959c2be1eSYehuda Sadeh 
50059c2be1eSYehuda Sadeh 	switch (token) {
501cc0538b6SAlex Elder 	case Opt_read_only:
502cc0538b6SAlex Elder 		rbd_opts->read_only = true;
503cc0538b6SAlex Elder 		break;
504cc0538b6SAlex Elder 	case Opt_read_write:
505cc0538b6SAlex Elder 		rbd_opts->read_only = false;
506cc0538b6SAlex Elder 		break;
50759c2be1eSYehuda Sadeh 	default:
508aafb230eSAlex Elder 		rbd_assert(false);
509aafb230eSAlex Elder 		break;
51059c2be1eSYehuda Sadeh 	}
51159c2be1eSYehuda Sadeh 	return 0;
51259c2be1eSYehuda Sadeh }
51359c2be1eSYehuda Sadeh 
/*
 * Get an rbd client for the given ceph options: reuse a matching
 * existing client if there is one, otherwise create a new one.
 *
 * The options are consumed either way: they are destroyed here when
 * an existing client is reused, and handed to rbd_client_create()
 * otherwise.  The result is whatever rbd_client_find() or
 * rbd_client_create() returned.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc = rbd_client_find(ceph_opts);

	if (!rbdc)
		return rbd_client_create(ceph_opts);

	/* Using an existing client, so these options are not needed */
	ceph_destroy_options(ceph_opts);

	return rbdc;
}
530602adf40SYehuda Sadeh 
531602adf40SYehuda Sadeh /*
532602adf40SYehuda Sadeh  * Destroy ceph client
533d23a4b3fSAlex Elder  *
534432b8587SAlex Elder  * Caller must hold rbd_client_list_lock.
535602adf40SYehuda Sadeh  */
536602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
537602adf40SYehuda Sadeh {
538602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
539602adf40SYehuda Sadeh 
540602adf40SYehuda Sadeh 	dout("rbd_release_client %p\n", rbdc);
541cd9d9f5dSAlex Elder 	spin_lock(&rbd_client_list_lock);
542602adf40SYehuda Sadeh 	list_del(&rbdc->node);
543cd9d9f5dSAlex Elder 	spin_unlock(&rbd_client_list_lock);
544602adf40SYehuda Sadeh 
545602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
546602adf40SYehuda Sadeh 	kfree(rbdc);
547602adf40SYehuda Sadeh }
548602adf40SYehuda Sadeh 
549602adf40SYehuda Sadeh /*
550602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
551602adf40SYehuda Sadeh  * it.
552602adf40SYehuda Sadeh  */
5539d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc)
554602adf40SYehuda Sadeh {
555c53d5893SAlex Elder 	if (rbdc)
5569d3997fdSAlex Elder 		kref_put(&rbdc->kref, rbd_client_release);
557602adf40SYehuda Sadeh }
558602adf40SYehuda Sadeh 
5591fec7093SYehuda Sadeh /*
5601fec7093SYehuda Sadeh  * Destroy requests collection
5611fec7093SYehuda Sadeh  */
5621fec7093SYehuda Sadeh static void rbd_coll_release(struct kref *kref)
5631fec7093SYehuda Sadeh {
5641fec7093SYehuda Sadeh 	struct rbd_req_coll *coll =
5651fec7093SYehuda Sadeh 		container_of(kref, struct rbd_req_coll, kref);
5661fec7093SYehuda Sadeh 
5671fec7093SYehuda Sadeh 	dout("rbd_coll_release %p\n", coll);
5681fec7093SYehuda Sadeh 	kfree(coll);
5691fec7093SYehuda Sadeh }
570602adf40SYehuda Sadeh 
571a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format)
572a30b71b9SAlex Elder {
573a30b71b9SAlex Elder 	return image_format == 1 || image_format == 2;
574a30b71b9SAlex Elder }
575a30b71b9SAlex Elder 
5768e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
5778e94af8eSAlex Elder {
578103a150fSAlex Elder 	size_t size;
579103a150fSAlex Elder 	u32 snap_count;
580103a150fSAlex Elder 
581103a150fSAlex Elder 	/* The header has to start with the magic rbd header text */
582103a150fSAlex Elder 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
583103a150fSAlex Elder 		return false;
584103a150fSAlex Elder 
585db2388b6SAlex Elder 	/* The bio layer requires at least sector-sized I/O */
586db2388b6SAlex Elder 
587db2388b6SAlex Elder 	if (ondisk->options.order < SECTOR_SHIFT)
588db2388b6SAlex Elder 		return false;
589db2388b6SAlex Elder 
590db2388b6SAlex Elder 	/* If we use u64 in a few spots we may be able to loosen this */
591db2388b6SAlex Elder 
592db2388b6SAlex Elder 	if (ondisk->options.order > 8 * sizeof (int) - 1)
593db2388b6SAlex Elder 		return false;
594db2388b6SAlex Elder 
595103a150fSAlex Elder 	/*
596103a150fSAlex Elder 	 * The size of a snapshot header has to fit in a size_t, and
597103a150fSAlex Elder 	 * that limits the number of snapshots.
598103a150fSAlex Elder 	 */
599103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
600103a150fSAlex Elder 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
601103a150fSAlex Elder 	if (snap_count > size / sizeof (__le64))
602103a150fSAlex Elder 		return false;
603103a150fSAlex Elder 
604103a150fSAlex Elder 	/*
605103a150fSAlex Elder 	 * Not only that, but the size of the entire the snapshot
606103a150fSAlex Elder 	 * header must also be representable in a size_t.
607103a150fSAlex Elder 	 */
608103a150fSAlex Elder 	size -= snap_count * sizeof (__le64);
609103a150fSAlex Elder 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
610103a150fSAlex Elder 		return false;
611103a150fSAlex Elder 
612103a150fSAlex Elder 	return true;
6138e94af8eSAlex Elder }
6148e94af8eSAlex Elder 
615602adf40SYehuda Sadeh /*
616602adf40SYehuda Sadeh  * Create a new header structure, translate header format from the on-disk
617602adf40SYehuda Sadeh  * header.
618602adf40SYehuda Sadeh  */
619602adf40SYehuda Sadeh static int rbd_header_from_disk(struct rbd_image_header *header,
6204156d998SAlex Elder 				 struct rbd_image_header_ondisk *ondisk)
621602adf40SYehuda Sadeh {
622ccece235SAlex Elder 	u32 snap_count;
62358c17b0eSAlex Elder 	size_t len;
624d2bb24e5SAlex Elder 	size_t size;
625621901d6SAlex Elder 	u32 i;
626602adf40SYehuda Sadeh 
6276a52325fSAlex Elder 	memset(header, 0, sizeof (*header));
6286a52325fSAlex Elder 
629103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
630103a150fSAlex Elder 
63158c17b0eSAlex Elder 	len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
63258c17b0eSAlex Elder 	header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
6336a52325fSAlex Elder 	if (!header->object_prefix)
634602adf40SYehuda Sadeh 		return -ENOMEM;
63558c17b0eSAlex Elder 	memcpy(header->object_prefix, ondisk->object_prefix, len);
63658c17b0eSAlex Elder 	header->object_prefix[len] = '\0';
63700f1f36fSAlex Elder 
638602adf40SYehuda Sadeh 	if (snap_count) {
639f785cc1dSAlex Elder 		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
640f785cc1dSAlex Elder 
641621901d6SAlex Elder 		/* Save a copy of the snapshot names */
642621901d6SAlex Elder 
643f785cc1dSAlex Elder 		if (snap_names_len > (u64) SIZE_MAX)
644f785cc1dSAlex Elder 			return -EIO;
645f785cc1dSAlex Elder 		header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
646602adf40SYehuda Sadeh 		if (!header->snap_names)
6476a52325fSAlex Elder 			goto out_err;
648f785cc1dSAlex Elder 		/*
649f785cc1dSAlex Elder 		 * Note that rbd_dev_v1_header_read() guarantees
650f785cc1dSAlex Elder 		 * the ondisk buffer we're working with has
651f785cc1dSAlex Elder 		 * snap_names_len bytes beyond the end of the
652f785cc1dSAlex Elder 		 * snapshot id array, this memcpy() is safe.
653f785cc1dSAlex Elder 		 */
654f785cc1dSAlex Elder 		memcpy(header->snap_names, &ondisk->snaps[snap_count],
655f785cc1dSAlex Elder 			snap_names_len);
6566a52325fSAlex Elder 
657621901d6SAlex Elder 		/* Record each snapshot's size */
658621901d6SAlex Elder 
659d2bb24e5SAlex Elder 		size = snap_count * sizeof (*header->snap_sizes);
660d2bb24e5SAlex Elder 		header->snap_sizes = kmalloc(size, GFP_KERNEL);
661602adf40SYehuda Sadeh 		if (!header->snap_sizes)
6626a52325fSAlex Elder 			goto out_err;
663621901d6SAlex Elder 		for (i = 0; i < snap_count; i++)
664621901d6SAlex Elder 			header->snap_sizes[i] =
665621901d6SAlex Elder 				le64_to_cpu(ondisk->snaps[i].image_size);
666602adf40SYehuda Sadeh 	} else {
667ccece235SAlex Elder 		WARN_ON(ondisk->snap_names_len);
668602adf40SYehuda Sadeh 		header->snap_names = NULL;
669602adf40SYehuda Sadeh 		header->snap_sizes = NULL;
670602adf40SYehuda Sadeh 	}
671849b4260SAlex Elder 
67234b13184SAlex Elder 	header->features = 0;	/* No features support in v1 images */
673602adf40SYehuda Sadeh 	header->obj_order = ondisk->options.order;
674602adf40SYehuda Sadeh 	header->crypt_type = ondisk->options.crypt_type;
675602adf40SYehuda Sadeh 	header->comp_type = ondisk->options.comp_type;
6766a52325fSAlex Elder 
677621901d6SAlex Elder 	/* Allocate and fill in the snapshot context */
678621901d6SAlex Elder 
679f84344f3SAlex Elder 	header->image_size = le64_to_cpu(ondisk->image_size);
6806a52325fSAlex Elder 	size = sizeof (struct ceph_snap_context);
6816a52325fSAlex Elder 	size += snap_count * sizeof (header->snapc->snaps[0]);
6826a52325fSAlex Elder 	header->snapc = kzalloc(size, GFP_KERNEL);
6836a52325fSAlex Elder 	if (!header->snapc)
6846a52325fSAlex Elder 		goto out_err;
685602adf40SYehuda Sadeh 
686602adf40SYehuda Sadeh 	atomic_set(&header->snapc->nref, 1);
687505cbb9bSAlex Elder 	header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
688602adf40SYehuda Sadeh 	header->snapc->num_snaps = snap_count;
689621901d6SAlex Elder 	for (i = 0; i < snap_count; i++)
690602adf40SYehuda Sadeh 		header->snapc->snaps[i] =
691602adf40SYehuda Sadeh 			le64_to_cpu(ondisk->snaps[i].id);
692602adf40SYehuda Sadeh 
693602adf40SYehuda Sadeh 	return 0;
694602adf40SYehuda Sadeh 
6956a52325fSAlex Elder out_err:
696849b4260SAlex Elder 	kfree(header->snap_sizes);
697ccece235SAlex Elder 	header->snap_sizes = NULL;
698602adf40SYehuda Sadeh 	kfree(header->snap_names);
699ccece235SAlex Elder 	header->snap_names = NULL;
7006a52325fSAlex Elder 	kfree(header->object_prefix);
7016a52325fSAlex Elder 	header->object_prefix = NULL;
702ccece235SAlex Elder 
70300f1f36fSAlex Elder 	return -ENOMEM;
704602adf40SYehuda Sadeh }
705602adf40SYehuda Sadeh 
7069e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
7079e15b77dSAlex Elder {
7089e15b77dSAlex Elder 	struct rbd_snap *snap;
7099e15b77dSAlex Elder 
7109e15b77dSAlex Elder 	if (snap_id == CEPH_NOSNAP)
7119e15b77dSAlex Elder 		return RBD_SNAP_HEAD_NAME;
7129e15b77dSAlex Elder 
7139e15b77dSAlex Elder 	list_for_each_entry(snap, &rbd_dev->snaps, node)
7149e15b77dSAlex Elder 		if (snap_id == snap->id)
7159e15b77dSAlex Elder 			return snap->name;
7169e15b77dSAlex Elder 
7179e15b77dSAlex Elder 	return NULL;
7189e15b77dSAlex Elder }
7199e15b77dSAlex Elder 
7208836b995SAlex Elder static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
721602adf40SYehuda Sadeh {
722602adf40SYehuda Sadeh 
723e86924a8SAlex Elder 	struct rbd_snap *snap;
72400f1f36fSAlex Elder 
725e86924a8SAlex Elder 	list_for_each_entry(snap, &rbd_dev->snaps, node) {
726e86924a8SAlex Elder 		if (!strcmp(snap_name, snap->name)) {
7270d7dbfceSAlex Elder 			rbd_dev->spec->snap_id = snap->id;
728e86924a8SAlex Elder 			rbd_dev->mapping.size = snap->size;
72934b13184SAlex Elder 			rbd_dev->mapping.features = snap->features;
73000f1f36fSAlex Elder 
731e86924a8SAlex Elder 			return 0;
732602adf40SYehuda Sadeh 		}
73300f1f36fSAlex Elder 	}
734e86924a8SAlex Elder 
73500f1f36fSAlex Elder 	return -ENOENT;
73600f1f36fSAlex Elder }
737602adf40SYehuda Sadeh 
738819d52bfSAlex Elder static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
739602adf40SYehuda Sadeh {
74078dc447dSAlex Elder 	int ret;
741602adf40SYehuda Sadeh 
7420d7dbfceSAlex Elder 	if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
743cc9d734cSJosh Durgin 		    sizeof (RBD_SNAP_HEAD_NAME))) {
7440d7dbfceSAlex Elder 		rbd_dev->spec->snap_id = CEPH_NOSNAP;
74599c1f08fSAlex Elder 		rbd_dev->mapping.size = rbd_dev->header.image_size;
74634b13184SAlex Elder 		rbd_dev->mapping.features = rbd_dev->header.features;
747e86924a8SAlex Elder 		ret = 0;
748602adf40SYehuda Sadeh 	} else {
7490d7dbfceSAlex Elder 		ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
750602adf40SYehuda Sadeh 		if (ret < 0)
751602adf40SYehuda Sadeh 			goto done;
752f84344f3SAlex Elder 		rbd_dev->mapping.read_only = true;
753602adf40SYehuda Sadeh 	}
754d78b650aSAlex Elder 	atomic_set(&rbd_dev->exists, 1);
755602adf40SYehuda Sadeh done:
756602adf40SYehuda Sadeh 	return ret;
757602adf40SYehuda Sadeh }
758602adf40SYehuda Sadeh 
759602adf40SYehuda Sadeh static void rbd_header_free(struct rbd_image_header *header)
760602adf40SYehuda Sadeh {
761849b4260SAlex Elder 	kfree(header->object_prefix);
762d78fd7aeSAlex Elder 	header->object_prefix = NULL;
763602adf40SYehuda Sadeh 	kfree(header->snap_sizes);
764d78fd7aeSAlex Elder 	header->snap_sizes = NULL;
765849b4260SAlex Elder 	kfree(header->snap_names);
766d78fd7aeSAlex Elder 	header->snap_names = NULL;
767d1d25646SJosh Durgin 	ceph_put_snap_context(header->snapc);
768d78fd7aeSAlex Elder 	header->snapc = NULL;
769602adf40SYehuda Sadeh }
770602adf40SYehuda Sadeh 
77165ccfe21SAlex Elder static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
772602adf40SYehuda Sadeh {
77365ccfe21SAlex Elder 	char *name;
77465ccfe21SAlex Elder 	u64 segment;
77565ccfe21SAlex Elder 	int ret;
776602adf40SYehuda Sadeh 
7772fd82b9eSAlex Elder 	name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
77865ccfe21SAlex Elder 	if (!name)
77965ccfe21SAlex Elder 		return NULL;
78065ccfe21SAlex Elder 	segment = offset >> rbd_dev->header.obj_order;
7812fd82b9eSAlex Elder 	ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
78265ccfe21SAlex Elder 			rbd_dev->header.object_prefix, segment);
7832fd82b9eSAlex Elder 	if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
78465ccfe21SAlex Elder 		pr_err("error formatting segment name for #%llu (%d)\n",
78565ccfe21SAlex Elder 			segment, ret);
78665ccfe21SAlex Elder 		kfree(name);
78765ccfe21SAlex Elder 		name = NULL;
78865ccfe21SAlex Elder 	}
789602adf40SYehuda Sadeh 
79065ccfe21SAlex Elder 	return name;
79165ccfe21SAlex Elder }
792602adf40SYehuda Sadeh 
79365ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
79465ccfe21SAlex Elder {
79565ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
796602adf40SYehuda Sadeh 
79765ccfe21SAlex Elder 	return offset & (segment_size - 1);
79865ccfe21SAlex Elder }
79965ccfe21SAlex Elder 
80065ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev,
80165ccfe21SAlex Elder 				u64 offset, u64 length)
80265ccfe21SAlex Elder {
80365ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
80465ccfe21SAlex Elder 
80565ccfe21SAlex Elder 	offset &= segment_size - 1;
80665ccfe21SAlex Elder 
807aafb230eSAlex Elder 	rbd_assert(length <= U64_MAX - offset);
80865ccfe21SAlex Elder 	if (offset + length > segment_size)
80965ccfe21SAlex Elder 		length = segment_size - offset;
81065ccfe21SAlex Elder 
81165ccfe21SAlex Elder 	return length;
812602adf40SYehuda Sadeh }
813602adf40SYehuda Sadeh 
8141fec7093SYehuda Sadeh static int rbd_get_num_segments(struct rbd_image_header *header,
8151fec7093SYehuda Sadeh 				u64 ofs, u64 len)
8161fec7093SYehuda Sadeh {
817df111be6SAlex Elder 	u64 start_seg;
818df111be6SAlex Elder 	u64 end_seg;
819df111be6SAlex Elder 
820df111be6SAlex Elder 	if (!len)
821df111be6SAlex Elder 		return 0;
822df111be6SAlex Elder 	if (len - 1 > U64_MAX - ofs)
823df111be6SAlex Elder 		return -ERANGE;
824df111be6SAlex Elder 
825df111be6SAlex Elder 	start_seg = ofs >> header->obj_order;
826df111be6SAlex Elder 	end_seg = (ofs + len - 1) >> header->obj_order;
827df111be6SAlex Elder 
8281fec7093SYehuda Sadeh 	return end_seg - start_seg + 1;
8291fec7093SYehuda Sadeh }
8301fec7093SYehuda Sadeh 
831602adf40SYehuda Sadeh /*
832029bcbd8SJosh Durgin  * returns the size of an object in the image
833029bcbd8SJosh Durgin  */
834029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header)
835029bcbd8SJosh Durgin {
836029bcbd8SJosh Durgin 	return 1 << header->obj_order;
837029bcbd8SJosh Durgin }
838029bcbd8SJosh Durgin 
839029bcbd8SJosh Durgin /*
840602adf40SYehuda Sadeh  * bio helpers
841602adf40SYehuda Sadeh  */
842602adf40SYehuda Sadeh 
843602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain)
844602adf40SYehuda Sadeh {
845602adf40SYehuda Sadeh 	struct bio *tmp;
846602adf40SYehuda Sadeh 
847602adf40SYehuda Sadeh 	while (chain) {
848602adf40SYehuda Sadeh 		tmp = chain;
849602adf40SYehuda Sadeh 		chain = chain->bi_next;
850602adf40SYehuda Sadeh 		bio_put(tmp);
851602adf40SYehuda Sadeh 	}
852602adf40SYehuda Sadeh }
853602adf40SYehuda Sadeh 
854602adf40SYehuda Sadeh /*
855602adf40SYehuda Sadeh  * zeros a bio chain, starting at specific offset
856602adf40SYehuda Sadeh  */
857602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs)
858602adf40SYehuda Sadeh {
859602adf40SYehuda Sadeh 	struct bio_vec *bv;
860602adf40SYehuda Sadeh 	unsigned long flags;
861602adf40SYehuda Sadeh 	void *buf;
862602adf40SYehuda Sadeh 	int i;
863602adf40SYehuda Sadeh 	int pos = 0;
864602adf40SYehuda Sadeh 
865602adf40SYehuda Sadeh 	while (chain) {
866602adf40SYehuda Sadeh 		bio_for_each_segment(bv, chain, i) {
867602adf40SYehuda Sadeh 			if (pos + bv->bv_len > start_ofs) {
868602adf40SYehuda Sadeh 				int remainder = max(start_ofs - pos, 0);
869602adf40SYehuda Sadeh 				buf = bvec_kmap_irq(bv, &flags);
870602adf40SYehuda Sadeh 				memset(buf + remainder, 0,
871602adf40SYehuda Sadeh 				       bv->bv_len - remainder);
87285b5aaa6SDan Carpenter 				bvec_kunmap_irq(buf, &flags);
873602adf40SYehuda Sadeh 			}
874602adf40SYehuda Sadeh 			pos += bv->bv_len;
875602adf40SYehuda Sadeh 		}
876602adf40SYehuda Sadeh 
877602adf40SYehuda Sadeh 		chain = chain->bi_next;
878602adf40SYehuda Sadeh 	}
879602adf40SYehuda Sadeh }
880602adf40SYehuda Sadeh 
881602adf40SYehuda Sadeh /*
882f7760dadSAlex Elder  * Clone a portion of a bio, starting at the given byte offset
883f7760dadSAlex Elder  * and continuing for the number of bytes indicated.
884602adf40SYehuda Sadeh  */
885f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src,
886f7760dadSAlex Elder 					unsigned int offset,
887f7760dadSAlex Elder 					unsigned int len,
888f7760dadSAlex Elder 					gfp_t gfpmask)
889602adf40SYehuda Sadeh {
890f7760dadSAlex Elder 	struct bio_vec *bv;
891f7760dadSAlex Elder 	unsigned int resid;
892f7760dadSAlex Elder 	unsigned short idx;
893f7760dadSAlex Elder 	unsigned int voff;
894f7760dadSAlex Elder 	unsigned short end_idx;
895f7760dadSAlex Elder 	unsigned short vcnt;
896f7760dadSAlex Elder 	struct bio *bio;
897602adf40SYehuda Sadeh 
898f7760dadSAlex Elder 	/* Handle the easy case for the caller */
899f7760dadSAlex Elder 
900f7760dadSAlex Elder 	if (!offset && len == bio_src->bi_size)
901f7760dadSAlex Elder 		return bio_clone(bio_src, gfpmask);
902f7760dadSAlex Elder 
903f7760dadSAlex Elder 	if (WARN_ON_ONCE(!len))
904f7760dadSAlex Elder 		return NULL;
905f7760dadSAlex Elder 	if (WARN_ON_ONCE(len > bio_src->bi_size))
906f7760dadSAlex Elder 		return NULL;
907f7760dadSAlex Elder 	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
908f7760dadSAlex Elder 		return NULL;
909f7760dadSAlex Elder 
910f7760dadSAlex Elder 	/* Find first affected segment... */
911f7760dadSAlex Elder 
912f7760dadSAlex Elder 	resid = offset;
913f7760dadSAlex Elder 	__bio_for_each_segment(bv, bio_src, idx, 0) {
914f7760dadSAlex Elder 		if (resid < bv->bv_len)
915f7760dadSAlex Elder 			break;
916f7760dadSAlex Elder 		resid -= bv->bv_len;
917602adf40SYehuda Sadeh 	}
918f7760dadSAlex Elder 	voff = resid;
919602adf40SYehuda Sadeh 
920f7760dadSAlex Elder 	/* ...and the last affected segment */
921542582fcSAlex Elder 
922f7760dadSAlex Elder 	resid += len;
923f7760dadSAlex Elder 	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
924f7760dadSAlex Elder 		if (resid <= bv->bv_len)
925f7760dadSAlex Elder 			break;
926f7760dadSAlex Elder 		resid -= bv->bv_len;
927f7760dadSAlex Elder 	}
928f7760dadSAlex Elder 	vcnt = end_idx - idx + 1;
929602adf40SYehuda Sadeh 
930f7760dadSAlex Elder 	/* Build the clone */
931f7760dadSAlex Elder 
932f7760dadSAlex Elder 	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
933f7760dadSAlex Elder 	if (!bio)
934f7760dadSAlex Elder 		return NULL;	/* ENOMEM */
935f7760dadSAlex Elder 
936f7760dadSAlex Elder 	bio->bi_bdev = bio_src->bi_bdev;
937f7760dadSAlex Elder 	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
938f7760dadSAlex Elder 	bio->bi_rw = bio_src->bi_rw;
939f7760dadSAlex Elder 	bio->bi_flags |= 1 << BIO_CLONED;
940602adf40SYehuda Sadeh 
941602adf40SYehuda Sadeh 	/*
942f7760dadSAlex Elder 	 * Copy over our part of the bio_vec, then update the first
943f7760dadSAlex Elder 	 * and last (or only) entries.
944602adf40SYehuda Sadeh 	 */
945f7760dadSAlex Elder 	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
946f7760dadSAlex Elder 			vcnt * sizeof (struct bio_vec));
947f7760dadSAlex Elder 	bio->bi_io_vec[0].bv_offset += voff;
948f7760dadSAlex Elder 	if (vcnt > 1) {
949f7760dadSAlex Elder 		bio->bi_io_vec[0].bv_len -= voff;
950f7760dadSAlex Elder 		bio->bi_io_vec[vcnt - 1].bv_len = resid;
951602adf40SYehuda Sadeh 	} else {
952f7760dadSAlex Elder 		bio->bi_io_vec[0].bv_len = len;
953602adf40SYehuda Sadeh 	}
954602adf40SYehuda Sadeh 
955f7760dadSAlex Elder 	bio->bi_vcnt = vcnt;
956f7760dadSAlex Elder 	bio->bi_size = len;
957f7760dadSAlex Elder 	bio->bi_idx = 0;
958602adf40SYehuda Sadeh 
959f7760dadSAlex Elder 	return bio;
960602adf40SYehuda Sadeh }
961602adf40SYehuda Sadeh 
962f7760dadSAlex Elder /*
963f7760dadSAlex Elder  * Clone a portion of a bio chain, starting at the given byte offset
964f7760dadSAlex Elder  * into the first bio in the source chain and continuing for the
965f7760dadSAlex Elder  * number of bytes indicated.  The result is another bio chain of
966f7760dadSAlex Elder  * exactly the given length, or a null pointer on error.
967f7760dadSAlex Elder  *
968f7760dadSAlex Elder  * The bio_src and offset parameters are both in-out.  On entry they
969f7760dadSAlex Elder  * refer to the first source bio and the offset into that bio where
970f7760dadSAlex Elder  * the start of data to be cloned is located.
971f7760dadSAlex Elder  *
972f7760dadSAlex Elder  * On return, bio_src is updated to refer to the bio in the source
973f7760dadSAlex Elder  * chain that contains first un-cloned byte, and *offset will
974f7760dadSAlex Elder  * contain the offset of that byte within that bio.
975f7760dadSAlex Elder  */
976f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src,
977f7760dadSAlex Elder 					unsigned int *offset,
978f7760dadSAlex Elder 					unsigned int len,
979f7760dadSAlex Elder 					gfp_t gfpmask)
980f7760dadSAlex Elder {
981f7760dadSAlex Elder 	struct bio *bi = *bio_src;
982f7760dadSAlex Elder 	unsigned int off = *offset;
983f7760dadSAlex Elder 	struct bio *chain = NULL;
984f7760dadSAlex Elder 	struct bio **end;
985602adf40SYehuda Sadeh 
986f7760dadSAlex Elder 	/* Build up a chain of clone bios up to the limit */
987602adf40SYehuda Sadeh 
988f7760dadSAlex Elder 	if (!bi || off >= bi->bi_size || !len)
989f7760dadSAlex Elder 		return NULL;		/* Nothing to clone */
990602adf40SYehuda Sadeh 
991f7760dadSAlex Elder 	end = &chain;
992f7760dadSAlex Elder 	while (len) {
993f7760dadSAlex Elder 		unsigned int bi_size;
994f7760dadSAlex Elder 		struct bio *bio;
995f7760dadSAlex Elder 
996f5400b7aSAlex Elder 		if (!bi) {
997f5400b7aSAlex Elder 			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
998f7760dadSAlex Elder 			goto out_err;	/* EINVAL; ran out of bio's */
999f5400b7aSAlex Elder 		}
1000f7760dadSAlex Elder 		bi_size = min_t(unsigned int, bi->bi_size - off, len);
1001f7760dadSAlex Elder 		bio = bio_clone_range(bi, off, bi_size, gfpmask);
1002f7760dadSAlex Elder 		if (!bio)
1003f7760dadSAlex Elder 			goto out_err;	/* ENOMEM */
1004f7760dadSAlex Elder 
1005f7760dadSAlex Elder 		*end = bio;
1006f7760dadSAlex Elder 		end = &bio->bi_next;
1007f7760dadSAlex Elder 
1008f7760dadSAlex Elder 		off += bi_size;
1009f7760dadSAlex Elder 		if (off == bi->bi_size) {
1010f7760dadSAlex Elder 			bi = bi->bi_next;
1011f7760dadSAlex Elder 			off = 0;
1012f7760dadSAlex Elder 		}
1013f7760dadSAlex Elder 		len -= bi_size;
1014f7760dadSAlex Elder 	}
1015f7760dadSAlex Elder 	*bio_src = bi;
1016f7760dadSAlex Elder 	*offset = off;
1017f7760dadSAlex Elder 
1018f7760dadSAlex Elder 	return chain;
1019f7760dadSAlex Elder out_err:
1020f7760dadSAlex Elder 	bio_chain_put(chain);
1021f7760dadSAlex Elder 
1022602adf40SYehuda Sadeh 	return NULL;
1023602adf40SYehuda Sadeh }
1024602adf40SYehuda Sadeh 
1025602adf40SYehuda Sadeh /*
1026602adf40SYehuda Sadeh  * helpers for osd request op vectors.
1027602adf40SYehuda Sadeh  */
102857cfc106SAlex Elder static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
102957cfc106SAlex Elder 					int opcode, u32 payload_len)
1030602adf40SYehuda Sadeh {
103157cfc106SAlex Elder 	struct ceph_osd_req_op *ops;
103257cfc106SAlex Elder 
103357cfc106SAlex Elder 	ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
103457cfc106SAlex Elder 	if (!ops)
103557cfc106SAlex Elder 		return NULL;
103657cfc106SAlex Elder 
103757cfc106SAlex Elder 	ops[0].op = opcode;
103857cfc106SAlex Elder 
1039602adf40SYehuda Sadeh 	/*
1040602adf40SYehuda Sadeh 	 * op extent offset and length will be set later on
1041602adf40SYehuda Sadeh 	 * in calc_raw_layout()
1042602adf40SYehuda Sadeh 	 */
104357cfc106SAlex Elder 	ops[0].payload_len = payload_len;
104457cfc106SAlex Elder 
104557cfc106SAlex Elder 	return ops;
1046602adf40SYehuda Sadeh }
1047602adf40SYehuda Sadeh 
/*
 * Free an op vector obtained from rbd_create_rw_ops().
 */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
1052602adf40SYehuda Sadeh 
10531fec7093SYehuda Sadeh static void rbd_coll_end_req_index(struct request *rq,
10541fec7093SYehuda Sadeh 				   struct rbd_req_coll *coll,
10551fec7093SYehuda Sadeh 				   int index,
10568986cb37SAlex Elder 				   s32 ret, u64 len)
10571fec7093SYehuda Sadeh {
10581fec7093SYehuda Sadeh 	struct request_queue *q;
10591fec7093SYehuda Sadeh 	int min, max, i;
10601fec7093SYehuda Sadeh 
1061bd919d45SAlex Elder 	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
10628986cb37SAlex Elder 	     coll, index, (int)ret, (unsigned long long)len);
10631fec7093SYehuda Sadeh 
10641fec7093SYehuda Sadeh 	if (!rq)
10651fec7093SYehuda Sadeh 		return;
10661fec7093SYehuda Sadeh 
10671fec7093SYehuda Sadeh 	if (!coll) {
10681fec7093SYehuda Sadeh 		blk_end_request(rq, ret, len);
10691fec7093SYehuda Sadeh 		return;
10701fec7093SYehuda Sadeh 	}
10711fec7093SYehuda Sadeh 
10721fec7093SYehuda Sadeh 	q = rq->q;
10731fec7093SYehuda Sadeh 
10741fec7093SYehuda Sadeh 	spin_lock_irq(q->queue_lock);
10751fec7093SYehuda Sadeh 	coll->status[index].done = 1;
10761fec7093SYehuda Sadeh 	coll->status[index].rc = ret;
10771fec7093SYehuda Sadeh 	coll->status[index].bytes = len;
10781fec7093SYehuda Sadeh 	max = min = coll->num_done;
10791fec7093SYehuda Sadeh 	while (max < coll->total && coll->status[max].done)
10801fec7093SYehuda Sadeh 		max++;
10811fec7093SYehuda Sadeh 
10821fec7093SYehuda Sadeh 	for (i = min; i<max; i++) {
10838986cb37SAlex Elder 		__blk_end_request(rq, (int)coll->status[i].rc,
10841fec7093SYehuda Sadeh 				  coll->status[i].bytes);
10851fec7093SYehuda Sadeh 		coll->num_done++;
10861fec7093SYehuda Sadeh 		kref_put(&coll->kref, rbd_coll_release);
10871fec7093SYehuda Sadeh 	}
10881fec7093SYehuda Sadeh 	spin_unlock_irq(q->queue_lock);
10891fec7093SYehuda Sadeh }
10901fec7093SYehuda Sadeh 
1091725afc97SAlex Elder static void rbd_coll_end_req(struct rbd_request *rbd_req,
10928986cb37SAlex Elder 			     s32 ret, u64 len)
10931fec7093SYehuda Sadeh {
1094725afc97SAlex Elder 	rbd_coll_end_req_index(rbd_req->rq,
1095725afc97SAlex Elder 				rbd_req->coll, rbd_req->coll_index,
1096725afc97SAlex Elder 				ret, len);
10971fec7093SYehuda Sadeh }
10981fec7093SYehuda Sadeh 
1099602adf40SYehuda Sadeh /*
1100602adf40SYehuda Sadeh  * Send ceph osd request
1101602adf40SYehuda Sadeh  */
1102602adf40SYehuda Sadeh static int rbd_do_request(struct request *rq,
11030ce1a794SAlex Elder 			  struct rbd_device *rbd_dev,
1104602adf40SYehuda Sadeh 			  struct ceph_snap_context *snapc,
1105602adf40SYehuda Sadeh 			  u64 snapid,
1106aded07eaSAlex Elder 			  const char *object_name, u64 ofs, u64 len,
1107602adf40SYehuda Sadeh 			  struct bio *bio,
1108602adf40SYehuda Sadeh 			  struct page **pages,
1109602adf40SYehuda Sadeh 			  int num_pages,
1110602adf40SYehuda Sadeh 			  int flags,
1111602adf40SYehuda Sadeh 			  struct ceph_osd_req_op *ops,
11121fec7093SYehuda Sadeh 			  struct rbd_req_coll *coll,
11131fec7093SYehuda Sadeh 			  int coll_index,
11145f29ddd4SAlex Elder 			  void (*rbd_cb)(struct ceph_osd_request *,
11155f29ddd4SAlex Elder 					 struct ceph_msg *),
111659c2be1eSYehuda Sadeh 			  struct ceph_osd_request **linger_req,
111759c2be1eSYehuda Sadeh 			  u64 *ver)
1118602adf40SYehuda Sadeh {
11195f29ddd4SAlex Elder 	struct ceph_osd_request *osd_req;
1120602adf40SYehuda Sadeh 	struct ceph_file_layout *layout;
1121602adf40SYehuda Sadeh 	int ret;
1122602adf40SYehuda Sadeh 	u64 bno;
1123602adf40SYehuda Sadeh 	struct timespec mtime = CURRENT_TIME;
1124725afc97SAlex Elder 	struct rbd_request *rbd_req;
1125602adf40SYehuda Sadeh 	struct ceph_osd_request_head *reqhead;
11261dbb4399SAlex Elder 	struct ceph_osd_client *osdc;
1127602adf40SYehuda Sadeh 
1128725afc97SAlex Elder 	rbd_req = kzalloc(sizeof(*rbd_req), GFP_NOIO);
1129cd323ac0SAlex Elder 	if (!rbd_req)
11301fec7093SYehuda Sadeh 		return -ENOMEM;
1131602adf40SYehuda Sadeh 
11321fec7093SYehuda Sadeh 	if (coll) {
1133725afc97SAlex Elder 		rbd_req->coll = coll;
1134725afc97SAlex Elder 		rbd_req->coll_index = coll_index;
11351fec7093SYehuda Sadeh 	}
11361fec7093SYehuda Sadeh 
1137f7760dadSAlex Elder 	dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
1138f7760dadSAlex Elder 		object_name, (unsigned long long) ofs,
1139f7760dadSAlex Elder 		(unsigned long long) len, coll, coll_index);
1140602adf40SYehuda Sadeh 
11410ce1a794SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
11425f29ddd4SAlex Elder 	osd_req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
11431dbb4399SAlex Elder 					false, GFP_NOIO, pages, bio);
11445f29ddd4SAlex Elder 	if (!osd_req) {
11454ad12621SSage Weil 		ret = -ENOMEM;
1146602adf40SYehuda Sadeh 		goto done_pages;
1147602adf40SYehuda Sadeh 	}
1148602adf40SYehuda Sadeh 
11495f29ddd4SAlex Elder 	osd_req->r_callback = rbd_cb;
1150602adf40SYehuda Sadeh 
1151725afc97SAlex Elder 	rbd_req->rq = rq;
1152725afc97SAlex Elder 	rbd_req->bio = bio;
1153725afc97SAlex Elder 	rbd_req->pages = pages;
1154725afc97SAlex Elder 	rbd_req->len = len;
1155602adf40SYehuda Sadeh 
11565f29ddd4SAlex Elder 	osd_req->r_priv = rbd_req;
1157602adf40SYehuda Sadeh 
11585f29ddd4SAlex Elder 	reqhead = osd_req->r_request->front.iov_base;
1159602adf40SYehuda Sadeh 	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
1160602adf40SYehuda Sadeh 
11615f29ddd4SAlex Elder 	strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid));
11625f29ddd4SAlex Elder 	osd_req->r_oid_len = strlen(osd_req->r_oid);
1163602adf40SYehuda Sadeh 
11645f29ddd4SAlex Elder 	layout = &osd_req->r_file_layout;
1165602adf40SYehuda Sadeh 	memset(layout, 0, sizeof(*layout));
1166602adf40SYehuda Sadeh 	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
1167602adf40SYehuda Sadeh 	layout->fl_stripe_count = cpu_to_le32(1);
1168602adf40SYehuda Sadeh 	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
11690d7dbfceSAlex Elder 	layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->spec->pool_id);
11706cae3717SSage Weil 	ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
11715f29ddd4SAlex Elder 				   osd_req, ops);
11726cae3717SSage Weil 	rbd_assert(ret == 0);
1173602adf40SYehuda Sadeh 
11745f29ddd4SAlex Elder 	ceph_osdc_build_request(osd_req, ofs, &len,
1175602adf40SYehuda Sadeh 				ops,
1176602adf40SYehuda Sadeh 				snapc,
1177602adf40SYehuda Sadeh 				&mtime,
11785f29ddd4SAlex Elder 				osd_req->r_oid, osd_req->r_oid_len);
1179602adf40SYehuda Sadeh 
118059c2be1eSYehuda Sadeh 	if (linger_req) {
11815f29ddd4SAlex Elder 		ceph_osdc_set_request_linger(osdc, osd_req);
11825f29ddd4SAlex Elder 		*linger_req = osd_req;
118359c2be1eSYehuda Sadeh 	}
118459c2be1eSYehuda Sadeh 
11855f29ddd4SAlex Elder 	ret = ceph_osdc_start_request(osdc, osd_req, false);
1186602adf40SYehuda Sadeh 	if (ret < 0)
1187602adf40SYehuda Sadeh 		goto done_err;
1188602adf40SYehuda Sadeh 
1189602adf40SYehuda Sadeh 	if (!rbd_cb) {
11905f29ddd4SAlex Elder 		u64 version;
11915f29ddd4SAlex Elder 
11925f29ddd4SAlex Elder 		ret = ceph_osdc_wait_request(osdc, osd_req);
11935f29ddd4SAlex Elder 		version = le64_to_cpu(osd_req->r_reassert_version.version);
119459c2be1eSYehuda Sadeh 		if (ver)
11955f29ddd4SAlex Elder 			*ver = version;
11965f29ddd4SAlex Elder 		dout("reassert_ver=%llu\n", (unsigned long long) version);
11975f29ddd4SAlex Elder 		ceph_osdc_put_request(osd_req);
1198602adf40SYehuda Sadeh 	}
1199602adf40SYehuda Sadeh 	return ret;
1200602adf40SYehuda Sadeh 
1201602adf40SYehuda Sadeh done_err:
1202725afc97SAlex Elder 	bio_chain_put(rbd_req->bio);
12035f29ddd4SAlex Elder 	ceph_osdc_put_request(osd_req);
1204602adf40SYehuda Sadeh done_pages:
1205725afc97SAlex Elder 	kfree(rbd_req);
1206602adf40SYehuda Sadeh 	return ret;
1207602adf40SYehuda Sadeh }
1208602adf40SYehuda Sadeh 
1209602adf40SYehuda Sadeh /*
1210602adf40SYehuda Sadeh  * Ceph osd op callback
1211602adf40SYehuda Sadeh  */
12125f29ddd4SAlex Elder static void rbd_req_cb(struct ceph_osd_request *osd_req, struct ceph_msg *msg)
1213602adf40SYehuda Sadeh {
12145f29ddd4SAlex Elder 	struct rbd_request *rbd_req = osd_req->r_priv;
1215602adf40SYehuda Sadeh 	struct ceph_osd_reply_head *replyhead;
1216602adf40SYehuda Sadeh 	struct ceph_osd_op *op;
12178986cb37SAlex Elder 	s32 rc;
1218602adf40SYehuda Sadeh 	u64 bytes;
1219602adf40SYehuda Sadeh 	int read_op;
1220602adf40SYehuda Sadeh 
1221602adf40SYehuda Sadeh 	/* parse reply */
1222602adf40SYehuda Sadeh 	replyhead = msg->front.iov_base;
1223602adf40SYehuda Sadeh 	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
1224602adf40SYehuda Sadeh 	op = (void *)(replyhead + 1);
12258986cb37SAlex Elder 	rc = (s32)le32_to_cpu(replyhead->result);
1226602adf40SYehuda Sadeh 	bytes = le64_to_cpu(op->extent.length);
1227895cfcc8SDan Carpenter 	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
1228602adf40SYehuda Sadeh 
1229bd919d45SAlex Elder 	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
1230bd919d45SAlex Elder 		(unsigned long long) bytes, read_op, (int) rc);
1231602adf40SYehuda Sadeh 
12328986cb37SAlex Elder 	if (rc == (s32)-ENOENT && read_op) {
1233725afc97SAlex Elder 		zero_bio_chain(rbd_req->bio, 0);
1234602adf40SYehuda Sadeh 		rc = 0;
1235725afc97SAlex Elder 	} else if (rc == 0 && read_op && bytes < rbd_req->len) {
1236725afc97SAlex Elder 		zero_bio_chain(rbd_req->bio, bytes);
1237725afc97SAlex Elder 		bytes = rbd_req->len;
1238602adf40SYehuda Sadeh 	}
1239602adf40SYehuda Sadeh 
1240725afc97SAlex Elder 	rbd_coll_end_req(rbd_req, rc, bytes);
1241602adf40SYehuda Sadeh 
1242725afc97SAlex Elder 	if (rbd_req->bio)
1243725afc97SAlex Elder 		bio_chain_put(rbd_req->bio);
1244602adf40SYehuda Sadeh 
12455f29ddd4SAlex Elder 	ceph_osdc_put_request(osd_req);
1246725afc97SAlex Elder 	kfree(rbd_req);
1247602adf40SYehuda Sadeh }
1248602adf40SYehuda Sadeh 
/*
 * Minimal osd request callback: the reply message is ignored and the
 * reference on the osd request is simply dropped.
 */
static void rbd_simple_req_cb(struct ceph_osd_request *osd_req,
				struct ceph_msg *msg)
{
	ceph_osdc_put_request(osd_req);
}
125459c2be1eSYehuda Sadeh 
1255602adf40SYehuda Sadeh /*
1256602adf40SYehuda Sadeh  * Do a synchronous ceph osd operation
1257602adf40SYehuda Sadeh  */
12580ce1a794SAlex Elder static int rbd_req_sync_op(struct rbd_device *rbd_dev,
1259602adf40SYehuda Sadeh 			   struct ceph_snap_context *snapc,
1260602adf40SYehuda Sadeh 			   u64 snapid,
1261602adf40SYehuda Sadeh 			   int flags,
1262913d2fdcSAlex Elder 			   struct ceph_osd_req_op *ops,
1263aded07eaSAlex Elder 			   const char *object_name,
1264f8d4de6eSAlex Elder 			   u64 ofs, u64 inbound_size,
1265f8d4de6eSAlex Elder 			   char *inbound,
126659c2be1eSYehuda Sadeh 			   struct ceph_osd_request **linger_req,
126759c2be1eSYehuda Sadeh 			   u64 *ver)
1268602adf40SYehuda Sadeh {
1269602adf40SYehuda Sadeh 	int ret;
1270602adf40SYehuda Sadeh 	struct page **pages;
1271602adf40SYehuda Sadeh 	int num_pages;
1272913d2fdcSAlex Elder 
1273aafb230eSAlex Elder 	rbd_assert(ops != NULL);
1274602adf40SYehuda Sadeh 
1275f8d4de6eSAlex Elder 	num_pages = calc_pages_for(ofs, inbound_size);
1276602adf40SYehuda Sadeh 	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
1277b8d0638aSDan Carpenter 	if (IS_ERR(pages))
1278b8d0638aSDan Carpenter 		return PTR_ERR(pages);
1279602adf40SYehuda Sadeh 
12800ce1a794SAlex Elder 	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
1281f8d4de6eSAlex Elder 			  object_name, ofs, inbound_size, NULL,
1282602adf40SYehuda Sadeh 			  pages, num_pages,
1283602adf40SYehuda Sadeh 			  flags,
1284602adf40SYehuda Sadeh 			  ops,
12851fec7093SYehuda Sadeh 			  NULL, 0,
128659c2be1eSYehuda Sadeh 			  NULL,
128759c2be1eSYehuda Sadeh 			  linger_req, ver);
1288602adf40SYehuda Sadeh 	if (ret < 0)
1289913d2fdcSAlex Elder 		goto done;
1290602adf40SYehuda Sadeh 
1291f8d4de6eSAlex Elder 	if ((flags & CEPH_OSD_FLAG_READ) && inbound)
1292f8d4de6eSAlex Elder 		ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);
1293602adf40SYehuda Sadeh 
1294602adf40SYehuda Sadeh done:
1295602adf40SYehuda Sadeh 	ceph_release_page_vector(pages, num_pages);
1296602adf40SYehuda Sadeh 	return ret;
1297602adf40SYehuda Sadeh }
1298602adf40SYehuda Sadeh 
1299602adf40SYehuda Sadeh /*
1300602adf40SYehuda Sadeh  * Do an asynchronous ceph osd operation
1301602adf40SYehuda Sadeh  */
1302602adf40SYehuda Sadeh static int rbd_do_op(struct request *rq,
1303602adf40SYehuda Sadeh 		     struct rbd_device *rbd_dev,
1304602adf40SYehuda Sadeh 		     struct ceph_snap_context *snapc,
1305602adf40SYehuda Sadeh 		     u64 ofs, u64 len,
13061fec7093SYehuda Sadeh 		     struct bio *bio,
13071fec7093SYehuda Sadeh 		     struct rbd_req_coll *coll,
13081fec7093SYehuda Sadeh 		     int coll_index)
1309602adf40SYehuda Sadeh {
1310602adf40SYehuda Sadeh 	char *seg_name;
1311602adf40SYehuda Sadeh 	u64 seg_ofs;
1312602adf40SYehuda Sadeh 	u64 seg_len;
1313602adf40SYehuda Sadeh 	int ret;
1314602adf40SYehuda Sadeh 	struct ceph_osd_req_op *ops;
1315602adf40SYehuda Sadeh 	u32 payload_len;
1316ff2e4bb5SAlex Elder 	int opcode;
1317ff2e4bb5SAlex Elder 	int flags;
13184634246dSAlex Elder 	u64 snapid;
1319602adf40SYehuda Sadeh 
132065ccfe21SAlex Elder 	seg_name = rbd_segment_name(rbd_dev, ofs);
1321602adf40SYehuda Sadeh 	if (!seg_name)
1322602adf40SYehuda Sadeh 		return -ENOMEM;
132365ccfe21SAlex Elder 	seg_len = rbd_segment_length(rbd_dev, ofs, len);
132465ccfe21SAlex Elder 	seg_ofs = rbd_segment_offset(rbd_dev, ofs);
1325602adf40SYehuda Sadeh 
1326ff2e4bb5SAlex Elder 	if (rq_data_dir(rq) == WRITE) {
1327ff2e4bb5SAlex Elder 		opcode = CEPH_OSD_OP_WRITE;
1328ff2e4bb5SAlex Elder 		flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
13294634246dSAlex Elder 		snapid = CEPH_NOSNAP;
1330ff2e4bb5SAlex Elder 		payload_len = seg_len;
1331ff2e4bb5SAlex Elder 	} else {
1332ff2e4bb5SAlex Elder 		opcode = CEPH_OSD_OP_READ;
1333ff2e4bb5SAlex Elder 		flags = CEPH_OSD_FLAG_READ;
1334a7b4c65fSAlex Elder 		rbd_assert(!snapc);
13350d7dbfceSAlex Elder 		snapid = rbd_dev->spec->snap_id;
1336ff2e4bb5SAlex Elder 		payload_len = 0;
1337ff2e4bb5SAlex Elder 	}
1338602adf40SYehuda Sadeh 
133957cfc106SAlex Elder 	ret = -ENOMEM;
134057cfc106SAlex Elder 	ops = rbd_create_rw_ops(1, opcode, payload_len);
134157cfc106SAlex Elder 	if (!ops)
1342602adf40SYehuda Sadeh 		goto done;
1343602adf40SYehuda Sadeh 
1344602adf40SYehuda Sadeh 	/* we've taken care of segment sizes earlier when we
1345602adf40SYehuda Sadeh 	   cloned the bios. We should never have a segment
1346602adf40SYehuda Sadeh 	   truncated at this point */
1347aafb230eSAlex Elder 	rbd_assert(seg_len == len);
1348602adf40SYehuda Sadeh 
1349602adf40SYehuda Sadeh 	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1350602adf40SYehuda Sadeh 			     seg_name, seg_ofs, seg_len,
1351602adf40SYehuda Sadeh 			     bio,
1352602adf40SYehuda Sadeh 			     NULL, 0,
1353602adf40SYehuda Sadeh 			     flags,
1354602adf40SYehuda Sadeh 			     ops,
13551fec7093SYehuda Sadeh 			     coll, coll_index,
135659c2be1eSYehuda Sadeh 			     rbd_req_cb, 0, NULL);
1357cd323ac0SAlex Elder 	if (ret < 0)
1358cd323ac0SAlex Elder 		rbd_coll_end_req_index(rq, coll, coll_index,
1359cd323ac0SAlex Elder 					(s32)ret, seg_len);
136011f77002SSage Weil 	rbd_destroy_ops(ops);
1361602adf40SYehuda Sadeh done:
1362602adf40SYehuda Sadeh 	kfree(seg_name);
1363602adf40SYehuda Sadeh 	return ret;
1364602adf40SYehuda Sadeh }
1365602adf40SYehuda Sadeh 
1366602adf40SYehuda Sadeh /*
1367602adf40SYehuda Sadeh  * Request sync osd read
1368602adf40SYehuda Sadeh  */
13690ce1a794SAlex Elder static int rbd_req_sync_read(struct rbd_device *rbd_dev,
1370602adf40SYehuda Sadeh 			  u64 snapid,
1371aded07eaSAlex Elder 			  const char *object_name,
1372602adf40SYehuda Sadeh 			  u64 ofs, u64 len,
137359c2be1eSYehuda Sadeh 			  char *buf,
137459c2be1eSYehuda Sadeh 			  u64 *ver)
1375602adf40SYehuda Sadeh {
1376913d2fdcSAlex Elder 	struct ceph_osd_req_op *ops;
1377913d2fdcSAlex Elder 	int ret;
1378913d2fdcSAlex Elder 
1379913d2fdcSAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1380913d2fdcSAlex Elder 	if (!ops)
1381913d2fdcSAlex Elder 		return -ENOMEM;
1382913d2fdcSAlex Elder 
1383913d2fdcSAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
1384b06e6a6bSJosh Durgin 			       snapid,
1385602adf40SYehuda Sadeh 			       CEPH_OSD_FLAG_READ,
1386913d2fdcSAlex Elder 			       ops, object_name, ofs, len, buf, NULL, ver);
1387913d2fdcSAlex Elder 	rbd_destroy_ops(ops);
1388913d2fdcSAlex Elder 
1389913d2fdcSAlex Elder 	return ret;
1390602adf40SYehuda Sadeh }
1391602adf40SYehuda Sadeh 
1392602adf40SYehuda Sadeh /*
139359c2be1eSYehuda Sadeh  * Request sync osd watch
139459c2be1eSYehuda Sadeh  */
13950ce1a794SAlex Elder static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
139659c2be1eSYehuda Sadeh 				   u64 ver,
13977f0a24d8SAlex Elder 				   u64 notify_id)
139859c2be1eSYehuda Sadeh {
139959c2be1eSYehuda Sadeh 	struct ceph_osd_req_op *ops;
140011f77002SSage Weil 	int ret;
140111f77002SSage Weil 
140257cfc106SAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
140357cfc106SAlex Elder 	if (!ops)
140457cfc106SAlex Elder 		return -ENOMEM;
140559c2be1eSYehuda Sadeh 
1406a71b891bSJosh Durgin 	ops[0].watch.ver = cpu_to_le64(ver);
140759c2be1eSYehuda Sadeh 	ops[0].watch.cookie = notify_id;
140859c2be1eSYehuda Sadeh 	ops[0].watch.flag = 0;
140959c2be1eSYehuda Sadeh 
14100ce1a794SAlex Elder 	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
14117f0a24d8SAlex Elder 			  rbd_dev->header_name, 0, 0, NULL,
1412ad4f232fSAlex Elder 			  NULL, 0,
141359c2be1eSYehuda Sadeh 			  CEPH_OSD_FLAG_READ,
141459c2be1eSYehuda Sadeh 			  ops,
14151fec7093SYehuda Sadeh 			  NULL, 0,
141659c2be1eSYehuda Sadeh 			  rbd_simple_req_cb, 0, NULL);
141759c2be1eSYehuda Sadeh 
141859c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
141959c2be1eSYehuda Sadeh 	return ret;
142059c2be1eSYehuda Sadeh }
142159c2be1eSYehuda Sadeh 
142259c2be1eSYehuda Sadeh static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
142359c2be1eSYehuda Sadeh {
14240ce1a794SAlex Elder 	struct rbd_device *rbd_dev = (struct rbd_device *)data;
1425a71b891bSJosh Durgin 	u64 hver;
142613143d2dSSage Weil 	int rc;
142713143d2dSSage Weil 
14280ce1a794SAlex Elder 	if (!rbd_dev)
142959c2be1eSYehuda Sadeh 		return;
143059c2be1eSYehuda Sadeh 
1431bd919d45SAlex Elder 	dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1432bd919d45SAlex Elder 		rbd_dev->header_name, (unsigned long long) notify_id,
1433bd919d45SAlex Elder 		(unsigned int) opcode);
1434117973fbSAlex Elder 	rc = rbd_dev_refresh(rbd_dev, &hver);
143513143d2dSSage Weil 	if (rc)
143606ecc6cbSAlex Elder 		rbd_warn(rbd_dev, "got notification but failed to "
143706ecc6cbSAlex Elder 			   " update snaps: %d\n", rc);
143859c2be1eSYehuda Sadeh 
14397f0a24d8SAlex Elder 	rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
144059c2be1eSYehuda Sadeh }
144159c2be1eSYehuda Sadeh 
144259c2be1eSYehuda Sadeh /*
144359c2be1eSYehuda Sadeh  * Request sync osd watch
144459c2be1eSYehuda Sadeh  */
14450e6f322dSAlex Elder static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
144659c2be1eSYehuda Sadeh {
144759c2be1eSYehuda Sadeh 	struct ceph_osd_req_op *ops;
14480ce1a794SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
144957cfc106SAlex Elder 	int ret;
145059c2be1eSYehuda Sadeh 
145157cfc106SAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
145257cfc106SAlex Elder 	if (!ops)
145357cfc106SAlex Elder 		return -ENOMEM;
145459c2be1eSYehuda Sadeh 
145559c2be1eSYehuda Sadeh 	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
14560ce1a794SAlex Elder 				     (void *)rbd_dev, &rbd_dev->watch_event);
145759c2be1eSYehuda Sadeh 	if (ret < 0)
145859c2be1eSYehuda Sadeh 		goto fail;
145959c2be1eSYehuda Sadeh 
14600e6f322dSAlex Elder 	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
14610ce1a794SAlex Elder 	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
146259c2be1eSYehuda Sadeh 	ops[0].watch.flag = 1;
146359c2be1eSYehuda Sadeh 
14640ce1a794SAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
146559c2be1eSYehuda Sadeh 			      CEPH_NOSNAP,
146659c2be1eSYehuda Sadeh 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
146759c2be1eSYehuda Sadeh 			      ops,
14680e6f322dSAlex Elder 			      rbd_dev->header_name,
14690e6f322dSAlex Elder 			      0, 0, NULL,
14700ce1a794SAlex Elder 			      &rbd_dev->watch_request, NULL);
147159c2be1eSYehuda Sadeh 
147259c2be1eSYehuda Sadeh 	if (ret < 0)
147359c2be1eSYehuda Sadeh 		goto fail_event;
147459c2be1eSYehuda Sadeh 
147559c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
147659c2be1eSYehuda Sadeh 	return 0;
147759c2be1eSYehuda Sadeh 
147859c2be1eSYehuda Sadeh fail_event:
14790ce1a794SAlex Elder 	ceph_osdc_cancel_event(rbd_dev->watch_event);
14800ce1a794SAlex Elder 	rbd_dev->watch_event = NULL;
148159c2be1eSYehuda Sadeh fail:
148259c2be1eSYehuda Sadeh 	rbd_destroy_ops(ops);
148359c2be1eSYehuda Sadeh 	return ret;
148459c2be1eSYehuda Sadeh }
148559c2be1eSYehuda Sadeh 
148679e3057cSYehuda Sadeh /*
148779e3057cSYehuda Sadeh  * Request sync osd unwatch
148879e3057cSYehuda Sadeh  */
1489070c633fSAlex Elder static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
149079e3057cSYehuda Sadeh {
149179e3057cSYehuda Sadeh 	struct ceph_osd_req_op *ops;
149257cfc106SAlex Elder 	int ret;
149379e3057cSYehuda Sadeh 
149457cfc106SAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
149557cfc106SAlex Elder 	if (!ops)
149657cfc106SAlex Elder 		return -ENOMEM;
149779e3057cSYehuda Sadeh 
149879e3057cSYehuda Sadeh 	ops[0].watch.ver = 0;
14990ce1a794SAlex Elder 	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
150079e3057cSYehuda Sadeh 	ops[0].watch.flag = 0;
150179e3057cSYehuda Sadeh 
15020ce1a794SAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
150379e3057cSYehuda Sadeh 			      CEPH_NOSNAP,
150479e3057cSYehuda Sadeh 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
150579e3057cSYehuda Sadeh 			      ops,
1506070c633fSAlex Elder 			      rbd_dev->header_name,
1507070c633fSAlex Elder 			      0, 0, NULL, NULL, NULL);
1508070c633fSAlex Elder 
150979e3057cSYehuda Sadeh 
151079e3057cSYehuda Sadeh 	rbd_destroy_ops(ops);
15110ce1a794SAlex Elder 	ceph_osdc_cancel_event(rbd_dev->watch_event);
15120ce1a794SAlex Elder 	rbd_dev->watch_event = NULL;
151379e3057cSYehuda Sadeh 	return ret;
151479e3057cSYehuda Sadeh }
151579e3057cSYehuda Sadeh 
151659c2be1eSYehuda Sadeh /*
15173cb4a687SAlex Elder  * Synchronous osd object method call
1518602adf40SYehuda Sadeh  */
15190ce1a794SAlex Elder static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
1520aded07eaSAlex Elder 			     const char *object_name,
1521aded07eaSAlex Elder 			     const char *class_name,
1522aded07eaSAlex Elder 			     const char *method_name,
15233cb4a687SAlex Elder 			     const char *outbound,
15243cb4a687SAlex Elder 			     size_t outbound_size,
1525f8d4de6eSAlex Elder 			     char *inbound,
1526f8d4de6eSAlex Elder 			     size_t inbound_size,
15273cb4a687SAlex Elder 			     int flags,
152859c2be1eSYehuda Sadeh 			     u64 *ver)
1529602adf40SYehuda Sadeh {
1530602adf40SYehuda Sadeh 	struct ceph_osd_req_op *ops;
1531aded07eaSAlex Elder 	int class_name_len = strlen(class_name);
1532aded07eaSAlex Elder 	int method_name_len = strlen(method_name);
15333cb4a687SAlex Elder 	int payload_size;
153457cfc106SAlex Elder 	int ret;
153557cfc106SAlex Elder 
15363cb4a687SAlex Elder 	/*
15373cb4a687SAlex Elder 	 * Any input parameters required by the method we're calling
15383cb4a687SAlex Elder 	 * will be sent along with the class and method names as
15393cb4a687SAlex Elder 	 * part of the message payload.  That data and its size are
15403cb4a687SAlex Elder 	 * supplied via the indata and indata_len fields (named from
15413cb4a687SAlex Elder 	 * the perspective of the server side) in the OSD request
15423cb4a687SAlex Elder 	 * operation.
15433cb4a687SAlex Elder 	 */
15443cb4a687SAlex Elder 	payload_size = class_name_len + method_name_len + outbound_size;
15453cb4a687SAlex Elder 	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
154657cfc106SAlex Elder 	if (!ops)
154757cfc106SAlex Elder 		return -ENOMEM;
1548602adf40SYehuda Sadeh 
1549aded07eaSAlex Elder 	ops[0].cls.class_name = class_name;
1550aded07eaSAlex Elder 	ops[0].cls.class_len = (__u8) class_name_len;
1551aded07eaSAlex Elder 	ops[0].cls.method_name = method_name;
1552aded07eaSAlex Elder 	ops[0].cls.method_len = (__u8) method_name_len;
1553602adf40SYehuda Sadeh 	ops[0].cls.argc = 0;
15543cb4a687SAlex Elder 	ops[0].cls.indata = outbound;
15553cb4a687SAlex Elder 	ops[0].cls.indata_len = outbound_size;
1556602adf40SYehuda Sadeh 
15570ce1a794SAlex Elder 	ret = rbd_req_sync_op(rbd_dev, NULL,
1558602adf40SYehuda Sadeh 			       CEPH_NOSNAP,
15593cb4a687SAlex Elder 			       flags, ops,
1560f8d4de6eSAlex Elder 			       object_name, 0, inbound_size, inbound,
1561f8d4de6eSAlex Elder 			       NULL, ver);
1562602adf40SYehuda Sadeh 
1563602adf40SYehuda Sadeh 	rbd_destroy_ops(ops);
1564602adf40SYehuda Sadeh 
1565602adf40SYehuda Sadeh 	dout("cls_exec returned %d\n", ret);
1566602adf40SYehuda Sadeh 	return ret;
1567602adf40SYehuda Sadeh }
1568602adf40SYehuda Sadeh 
15691fec7093SYehuda Sadeh static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
15701fec7093SYehuda Sadeh {
15711fec7093SYehuda Sadeh 	struct rbd_req_coll *coll =
15721fec7093SYehuda Sadeh 			kzalloc(sizeof(struct rbd_req_coll) +
15731fec7093SYehuda Sadeh 			        sizeof(struct rbd_req_status) * num_reqs,
15741fec7093SYehuda Sadeh 				GFP_ATOMIC);
15751fec7093SYehuda Sadeh 
15761fec7093SYehuda Sadeh 	if (!coll)
15771fec7093SYehuda Sadeh 		return NULL;
15781fec7093SYehuda Sadeh 	coll->total = num_reqs;
15791fec7093SYehuda Sadeh 	kref_init(&coll->kref);
15801fec7093SYehuda Sadeh 	return coll;
15811fec7093SYehuda Sadeh }
15821fec7093SYehuda Sadeh 
15838295cda7SAlex Elder static int rbd_dev_do_request(struct request *rq,
15848295cda7SAlex Elder 				struct rbd_device *rbd_dev,
15858295cda7SAlex Elder 				struct ceph_snap_context *snapc,
15868295cda7SAlex Elder 				u64 ofs, unsigned int size,
15878295cda7SAlex Elder 				struct bio *bio_chain)
15888295cda7SAlex Elder {
15898295cda7SAlex Elder 	int num_segs;
15908295cda7SAlex Elder 	struct rbd_req_coll *coll;
15918295cda7SAlex Elder 	unsigned int bio_offset;
15928295cda7SAlex Elder 	int cur_seg = 0;
15938295cda7SAlex Elder 
15948295cda7SAlex Elder 	dout("%s 0x%x bytes at 0x%llx\n",
15958295cda7SAlex Elder 		rq_data_dir(rq) == WRITE ? "write" : "read",
15968295cda7SAlex Elder 		size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
15978295cda7SAlex Elder 
15988295cda7SAlex Elder 	num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
15998295cda7SAlex Elder 	if (num_segs <= 0)
16008295cda7SAlex Elder 		return num_segs;
16018295cda7SAlex Elder 
16028295cda7SAlex Elder 	coll = rbd_alloc_coll(num_segs);
16038295cda7SAlex Elder 	if (!coll)
16048295cda7SAlex Elder 		return -ENOMEM;
16058295cda7SAlex Elder 
16068295cda7SAlex Elder 	bio_offset = 0;
16078295cda7SAlex Elder 	do {
16088295cda7SAlex Elder 		u64 limit = rbd_segment_length(rbd_dev, ofs, size);
16098295cda7SAlex Elder 		unsigned int clone_size;
16108295cda7SAlex Elder 		struct bio *bio_clone;
16118295cda7SAlex Elder 
16128295cda7SAlex Elder 		BUG_ON(limit > (u64)UINT_MAX);
16138295cda7SAlex Elder 		clone_size = (unsigned int)limit;
16148295cda7SAlex Elder 		dout("bio_chain->bi_vcnt=%hu\n", bio_chain->bi_vcnt);
16158295cda7SAlex Elder 
16168295cda7SAlex Elder 		kref_get(&coll->kref);
16178295cda7SAlex Elder 
16188295cda7SAlex Elder 		/* Pass a cloned bio chain via an osd request */
16198295cda7SAlex Elder 
16208295cda7SAlex Elder 		bio_clone = bio_chain_clone_range(&bio_chain,
16218295cda7SAlex Elder 					&bio_offset, clone_size,
16228295cda7SAlex Elder 					GFP_ATOMIC);
16238295cda7SAlex Elder 		if (bio_clone)
16248295cda7SAlex Elder 			(void)rbd_do_op(rq, rbd_dev, snapc,
16258295cda7SAlex Elder 					ofs, clone_size,
16268295cda7SAlex Elder 					bio_clone, coll, cur_seg);
16278295cda7SAlex Elder 		else
16288295cda7SAlex Elder 			rbd_coll_end_req_index(rq, coll, cur_seg,
16298295cda7SAlex Elder 						(s32)-ENOMEM,
16308295cda7SAlex Elder 						clone_size);
16318295cda7SAlex Elder 		size -= clone_size;
16328295cda7SAlex Elder 		ofs += clone_size;
16338295cda7SAlex Elder 
16348295cda7SAlex Elder 		cur_seg++;
16358295cda7SAlex Elder 	} while (size > 0);
16368295cda7SAlex Elder 	kref_put(&coll->kref, rbd_coll_release);
16378295cda7SAlex Elder 
16388295cda7SAlex Elder 	return 0;
16398295cda7SAlex Elder }
16408295cda7SAlex Elder 
1641602adf40SYehuda Sadeh /*
1642602adf40SYehuda Sadeh  * block device queue callback
1643602adf40SYehuda Sadeh  */
1644602adf40SYehuda Sadeh static void rbd_rq_fn(struct request_queue *q)
1645602adf40SYehuda Sadeh {
1646602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
1647b395e8b5SAlex Elder 	bool read_only = rbd_dev->mapping.read_only;
1648602adf40SYehuda Sadeh 	struct request *rq;
1649602adf40SYehuda Sadeh 
165000f1f36fSAlex Elder 	while ((rq = blk_fetch_request(q))) {
1651b395e8b5SAlex Elder 		struct ceph_snap_context *snapc = NULL;
1652b395e8b5SAlex Elder 		unsigned int size = 0;
16538295cda7SAlex Elder 		int result;
1654602adf40SYehuda Sadeh 
1655602adf40SYehuda Sadeh 		dout("fetched request\n");
1656602adf40SYehuda Sadeh 
1657b395e8b5SAlex Elder 		/* Filter out block requests we don't understand */
1658b395e8b5SAlex Elder 
1659602adf40SYehuda Sadeh 		if ((rq->cmd_type != REQ_TYPE_FS)) {
1660602adf40SYehuda Sadeh 			__blk_end_request_all(rq, 0);
166100f1f36fSAlex Elder 			continue;
1662602adf40SYehuda Sadeh 		}
1663602adf40SYehuda Sadeh 		spin_unlock_irq(q->queue_lock);
1664602adf40SYehuda Sadeh 
1665a7b4c65fSAlex Elder 		/* Write requests need a reference to the snapshot context */
1666e88a36ecSJosh Durgin 
1667a7b4c65fSAlex Elder 		if (rq_data_dir(rq) == WRITE) {
1668b395e8b5SAlex Elder 			result = -EROFS;
1669a7b4c65fSAlex Elder 			if (read_only) /* Can't write to a read-only device */
1670b395e8b5SAlex Elder 				goto out_end_request;
1671b395e8b5SAlex Elder 
1672a7b4c65fSAlex Elder 			/*
1673a7b4c65fSAlex Elder 			 * Note that each osd request will take its
1674a7b4c65fSAlex Elder 			 * own reference to the snapshot context
1675a7b4c65fSAlex Elder 			 * supplied.  The reference we take here
1676a7b4c65fSAlex Elder 			 * just guarantees the one we provide stays
1677a7b4c65fSAlex Elder 			 * valid.
1678a7b4c65fSAlex Elder 			 */
1679b395e8b5SAlex Elder 			down_read(&rbd_dev->header_rwsem);
1680b395e8b5SAlex Elder 			snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1681d1d25646SJosh Durgin 			up_read(&rbd_dev->header_rwsem);
1682a7b4c65fSAlex Elder 			rbd_assert(snapc != NULL);
1683a7b4c65fSAlex Elder 		} else if (!atomic_read(&rbd_dev->exists)) {
1684b395e8b5SAlex Elder 			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
1685e88a36ecSJosh Durgin 			dout("request for non-existent snapshot");
1686b395e8b5SAlex Elder 			result = -ENXIO;
1687b395e8b5SAlex Elder 			goto out_end_request;
1688e88a36ecSJosh Durgin 		}
1689d1d25646SJosh Durgin 
1690f7760dadSAlex Elder 		size = blk_rq_bytes(rq);
1691b395e8b5SAlex Elder 		result = rbd_dev_do_request(rq, rbd_dev, snapc,
1692b395e8b5SAlex Elder 				blk_rq_pos(rq) * SECTOR_SIZE,
1693b395e8b5SAlex Elder 				size, rq->bio);
1694b395e8b5SAlex Elder out_end_request:
1695a7b4c65fSAlex Elder 		if (snapc)
1696df111be6SAlex Elder 			ceph_put_snap_context(snapc);
16971fec7093SYehuda Sadeh 		spin_lock_irq(q->queue_lock);
16988295cda7SAlex Elder 		if (!size || result < 0)
16998295cda7SAlex Elder 			__blk_end_request_all(rq, result);
1700602adf40SYehuda Sadeh 	}
1701602adf40SYehuda Sadeh }
1702602adf40SYehuda Sadeh 
1703602adf40SYehuda Sadeh /*
1704602adf40SYehuda Sadeh  * a queue callback. Makes sure that we don't create a bio that spans across
1705602adf40SYehuda Sadeh  * multiple osd objects. One exception would be with a single page bios,
1706f7760dadSAlex Elder  * which we handle later at bio_chain_clone_range()
1707602adf40SYehuda Sadeh  */
1708602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1709602adf40SYehuda Sadeh 			  struct bio_vec *bvec)
1710602adf40SYehuda Sadeh {
1711602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
1712e5cfeed2SAlex Elder 	sector_t sector_offset;
1713e5cfeed2SAlex Elder 	sector_t sectors_per_obj;
1714e5cfeed2SAlex Elder 	sector_t obj_sector_offset;
1715e5cfeed2SAlex Elder 	int ret;
1716602adf40SYehuda Sadeh 
1717e5cfeed2SAlex Elder 	/*
1718e5cfeed2SAlex Elder 	 * Find how far into its rbd object the partition-relative
1719e5cfeed2SAlex Elder 	 * bio start sector is to offset relative to the enclosing
1720e5cfeed2SAlex Elder 	 * device.
1721e5cfeed2SAlex Elder 	 */
1722e5cfeed2SAlex Elder 	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
1723e5cfeed2SAlex Elder 	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1724e5cfeed2SAlex Elder 	obj_sector_offset = sector_offset & (sectors_per_obj - 1);
1725593a9e7bSAlex Elder 
1726e5cfeed2SAlex Elder 	/*
1727e5cfeed2SAlex Elder 	 * Compute the number of bytes from that offset to the end
1728e5cfeed2SAlex Elder 	 * of the object.  Account for what's already used by the bio.
1729e5cfeed2SAlex Elder 	 */
1730e5cfeed2SAlex Elder 	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
1731e5cfeed2SAlex Elder 	if (ret > bmd->bi_size)
1732e5cfeed2SAlex Elder 		ret -= bmd->bi_size;
1733e5cfeed2SAlex Elder 	else
1734e5cfeed2SAlex Elder 		ret = 0;
1735e5cfeed2SAlex Elder 
1736e5cfeed2SAlex Elder 	/*
1737e5cfeed2SAlex Elder 	 * Don't send back more than was asked for.  And if the bio
1738e5cfeed2SAlex Elder 	 * was empty, let the whole thing through because:  "Note
1739e5cfeed2SAlex Elder 	 * that a block device *must* allow a single page to be
1740e5cfeed2SAlex Elder 	 * added to an empty bio."
1741e5cfeed2SAlex Elder 	 */
1742e5cfeed2SAlex Elder 	rbd_assert(bvec->bv_len <= PAGE_SIZE);
1743e5cfeed2SAlex Elder 	if (ret > (int) bvec->bv_len || !bmd->bi_size)
1744e5cfeed2SAlex Elder 		ret = (int) bvec->bv_len;
1745e5cfeed2SAlex Elder 
1746e5cfeed2SAlex Elder 	return ret;
1747602adf40SYehuda Sadeh }
1748602adf40SYehuda Sadeh 
1749602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
1750602adf40SYehuda Sadeh {
1751602adf40SYehuda Sadeh 	struct gendisk *disk = rbd_dev->disk;
1752602adf40SYehuda Sadeh 
1753602adf40SYehuda Sadeh 	if (!disk)
1754602adf40SYehuda Sadeh 		return;
1755602adf40SYehuda Sadeh 
1756602adf40SYehuda Sadeh 	if (disk->flags & GENHD_FL_UP)
1757602adf40SYehuda Sadeh 		del_gendisk(disk);
1758602adf40SYehuda Sadeh 	if (disk->queue)
1759602adf40SYehuda Sadeh 		blk_cleanup_queue(disk->queue);
1760602adf40SYehuda Sadeh 	put_disk(disk);
1761602adf40SYehuda Sadeh }
1762602adf40SYehuda Sadeh 
1763602adf40SYehuda Sadeh /*
17644156d998SAlex Elder  * Read the complete header for the given rbd device.
17654156d998SAlex Elder  *
17664156d998SAlex Elder  * Returns a pointer to a dynamically-allocated buffer containing
17674156d998SAlex Elder  * the complete and validated header.  Caller can pass the address
17684156d998SAlex Elder  * of a variable that will be filled in with the version of the
17694156d998SAlex Elder  * header object at the time it was read.
17704156d998SAlex Elder  *
17714156d998SAlex Elder  * Returns a pointer-coded errno if a failure occurs.
17724156d998SAlex Elder  */
17734156d998SAlex Elder static struct rbd_image_header_ondisk *
17744156d998SAlex Elder rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
17754156d998SAlex Elder {
17764156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk = NULL;
17774156d998SAlex Elder 	u32 snap_count = 0;
17784156d998SAlex Elder 	u64 names_size = 0;
17794156d998SAlex Elder 	u32 want_count;
17804156d998SAlex Elder 	int ret;
17814156d998SAlex Elder 
17824156d998SAlex Elder 	/*
17834156d998SAlex Elder 	 * The complete header will include an array of its 64-bit
17844156d998SAlex Elder 	 * snapshot ids, followed by the names of those snapshots as
17854156d998SAlex Elder 	 * a contiguous block of NUL-terminated strings.  Note that
17864156d998SAlex Elder 	 * the number of snapshots could change by the time we read
17874156d998SAlex Elder 	 * it in, in which case we re-read it.
17884156d998SAlex Elder 	 */
17894156d998SAlex Elder 	do {
17904156d998SAlex Elder 		size_t size;
17914156d998SAlex Elder 
17924156d998SAlex Elder 		kfree(ondisk);
17934156d998SAlex Elder 
17944156d998SAlex Elder 		size = sizeof (*ondisk);
17954156d998SAlex Elder 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
17964156d998SAlex Elder 		size += names_size;
17974156d998SAlex Elder 		ondisk = kmalloc(size, GFP_KERNEL);
17984156d998SAlex Elder 		if (!ondisk)
17994156d998SAlex Elder 			return ERR_PTR(-ENOMEM);
18004156d998SAlex Elder 
18014156d998SAlex Elder 		ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
18024156d998SAlex Elder 				       rbd_dev->header_name,
18034156d998SAlex Elder 				       0, size,
18044156d998SAlex Elder 				       (char *) ondisk, version);
18054156d998SAlex Elder 
18064156d998SAlex Elder 		if (ret < 0)
18074156d998SAlex Elder 			goto out_err;
18084156d998SAlex Elder 		if (WARN_ON((size_t) ret < size)) {
18094156d998SAlex Elder 			ret = -ENXIO;
181006ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
181106ecc6cbSAlex Elder 				size, ret);
18124156d998SAlex Elder 			goto out_err;
18134156d998SAlex Elder 		}
18144156d998SAlex Elder 		if (!rbd_dev_ondisk_valid(ondisk)) {
18154156d998SAlex Elder 			ret = -ENXIO;
181606ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "invalid header");
18174156d998SAlex Elder 			goto out_err;
18184156d998SAlex Elder 		}
18194156d998SAlex Elder 
18204156d998SAlex Elder 		names_size = le64_to_cpu(ondisk->snap_names_len);
18214156d998SAlex Elder 		want_count = snap_count;
18224156d998SAlex Elder 		snap_count = le32_to_cpu(ondisk->snap_count);
18234156d998SAlex Elder 	} while (snap_count != want_count);
18244156d998SAlex Elder 
18254156d998SAlex Elder 	return ondisk;
18264156d998SAlex Elder 
18274156d998SAlex Elder out_err:
18284156d998SAlex Elder 	kfree(ondisk);
18294156d998SAlex Elder 
18304156d998SAlex Elder 	return ERR_PTR(ret);
18314156d998SAlex Elder }
18324156d998SAlex Elder 
18334156d998SAlex Elder /*
1834602adf40SYehuda Sadeh  * reload the ondisk the header
1835602adf40SYehuda Sadeh  */
1836602adf40SYehuda Sadeh static int rbd_read_header(struct rbd_device *rbd_dev,
1837602adf40SYehuda Sadeh 			   struct rbd_image_header *header)
1838602adf40SYehuda Sadeh {
18394156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk;
18404156d998SAlex Elder 	u64 ver = 0;
18414156d998SAlex Elder 	int ret;
1842602adf40SYehuda Sadeh 
18434156d998SAlex Elder 	ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
18444156d998SAlex Elder 	if (IS_ERR(ondisk))
18454156d998SAlex Elder 		return PTR_ERR(ondisk);
18464156d998SAlex Elder 	ret = rbd_header_from_disk(header, ondisk);
18474156d998SAlex Elder 	if (ret >= 0)
184859c2be1eSYehuda Sadeh 		header->obj_version = ver;
18494156d998SAlex Elder 	kfree(ondisk);
1850602adf40SYehuda Sadeh 
18514156d998SAlex Elder 	return ret;
1852602adf40SYehuda Sadeh }
1853602adf40SYehuda Sadeh 
185441f38c2bSAlex Elder static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1855dfc5606dSYehuda Sadeh {
1856dfc5606dSYehuda Sadeh 	struct rbd_snap *snap;
1857a0593290SAlex Elder 	struct rbd_snap *next;
1858dfc5606dSYehuda Sadeh 
1859a0593290SAlex Elder 	list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
186041f38c2bSAlex Elder 		rbd_remove_snap_dev(snap);
1861dfc5606dSYehuda Sadeh }
1862dfc5606dSYehuda Sadeh 
18639478554aSAlex Elder static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
18649478554aSAlex Elder {
18659478554aSAlex Elder 	sector_t size;
18669478554aSAlex Elder 
18670d7dbfceSAlex Elder 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
18689478554aSAlex Elder 		return;
18699478554aSAlex Elder 
18709478554aSAlex Elder 	size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
18719478554aSAlex Elder 	dout("setting size to %llu sectors", (unsigned long long) size);
18729478554aSAlex Elder 	rbd_dev->mapping.size = (u64) size;
18739478554aSAlex Elder 	set_capacity(rbd_dev->disk, size);
18749478554aSAlex Elder }
18759478554aSAlex Elder 
1876602adf40SYehuda Sadeh /*
1877602adf40SYehuda Sadeh  * only read the first part of the ondisk header, without the snaps info
1878602adf40SYehuda Sadeh  */
1879117973fbSAlex Elder static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
1880602adf40SYehuda Sadeh {
1881602adf40SYehuda Sadeh 	int ret;
1882602adf40SYehuda Sadeh 	struct rbd_image_header h;
1883602adf40SYehuda Sadeh 
1884602adf40SYehuda Sadeh 	ret = rbd_read_header(rbd_dev, &h);
1885602adf40SYehuda Sadeh 	if (ret < 0)
1886602adf40SYehuda Sadeh 		return ret;
1887602adf40SYehuda Sadeh 
1888a51aa0c0SJosh Durgin 	down_write(&rbd_dev->header_rwsem);
1889a51aa0c0SJosh Durgin 
18909478554aSAlex Elder 	/* Update image size, and check for resize of mapped image */
18919478554aSAlex Elder 	rbd_dev->header.image_size = h.image_size;
18929478554aSAlex Elder 	rbd_update_mapping_size(rbd_dev);
18939db4b3e3SSage Weil 
1894849b4260SAlex Elder 	/* rbd_dev->header.object_prefix shouldn't change */
1895602adf40SYehuda Sadeh 	kfree(rbd_dev->header.snap_sizes);
1896849b4260SAlex Elder 	kfree(rbd_dev->header.snap_names);
1897d1d25646SJosh Durgin 	/* osd requests may still refer to snapc */
1898d1d25646SJosh Durgin 	ceph_put_snap_context(rbd_dev->header.snapc);
1899602adf40SYehuda Sadeh 
1900b813623aSAlex Elder 	if (hver)
1901b813623aSAlex Elder 		*hver = h.obj_version;
1902a71b891bSJosh Durgin 	rbd_dev->header.obj_version = h.obj_version;
190393a24e08SJosh Durgin 	rbd_dev->header.image_size = h.image_size;
1904602adf40SYehuda Sadeh 	rbd_dev->header.snapc = h.snapc;
1905602adf40SYehuda Sadeh 	rbd_dev->header.snap_names = h.snap_names;
1906602adf40SYehuda Sadeh 	rbd_dev->header.snap_sizes = h.snap_sizes;
1907849b4260SAlex Elder 	/* Free the extra copy of the object prefix */
1908849b4260SAlex Elder 	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1909849b4260SAlex Elder 	kfree(h.object_prefix);
1910849b4260SAlex Elder 
1911304f6808SAlex Elder 	ret = rbd_dev_snaps_update(rbd_dev);
1912304f6808SAlex Elder 	if (!ret)
1913304f6808SAlex Elder 		ret = rbd_dev_snaps_register(rbd_dev);
1914dfc5606dSYehuda Sadeh 
1915c666601aSJosh Durgin 	up_write(&rbd_dev->header_rwsem);
1916602adf40SYehuda Sadeh 
1917dfc5606dSYehuda Sadeh 	return ret;
1918602adf40SYehuda Sadeh }
1919602adf40SYehuda Sadeh 
1920117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
19211fe5e993SAlex Elder {
19221fe5e993SAlex Elder 	int ret;
19231fe5e993SAlex Elder 
1924117973fbSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
19251fe5e993SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1926117973fbSAlex Elder 	if (rbd_dev->image_format == 1)
1927117973fbSAlex Elder 		ret = rbd_dev_v1_refresh(rbd_dev, hver);
1928117973fbSAlex Elder 	else
1929117973fbSAlex Elder 		ret = rbd_dev_v2_refresh(rbd_dev, hver);
19301fe5e993SAlex Elder 	mutex_unlock(&ctl_mutex);
19311fe5e993SAlex Elder 
19321fe5e993SAlex Elder 	return ret;
19331fe5e993SAlex Elder }
19341fe5e993SAlex Elder 
1935602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
1936602adf40SYehuda Sadeh {
1937602adf40SYehuda Sadeh 	struct gendisk *disk;
1938602adf40SYehuda Sadeh 	struct request_queue *q;
1939593a9e7bSAlex Elder 	u64 segment_size;
1940602adf40SYehuda Sadeh 
1941602adf40SYehuda Sadeh 	/* create gendisk info */
1942602adf40SYehuda Sadeh 	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1943602adf40SYehuda Sadeh 	if (!disk)
19441fcdb8aaSAlex Elder 		return -ENOMEM;
1945602adf40SYehuda Sadeh 
1946f0f8cef5SAlex Elder 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
1947de71a297SAlex Elder 		 rbd_dev->dev_id);
1948602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
1949602adf40SYehuda Sadeh 	disk->first_minor = 0;
1950602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
1951602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
1952602adf40SYehuda Sadeh 
1953602adf40SYehuda Sadeh 	/* init rq */
1954602adf40SYehuda Sadeh 	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1955602adf40SYehuda Sadeh 	if (!q)
1956602adf40SYehuda Sadeh 		goto out_disk;
1957029bcbd8SJosh Durgin 
1958593a9e7bSAlex Elder 	/* We use the default size, but let's be explicit about it. */
1959593a9e7bSAlex Elder 	blk_queue_physical_block_size(q, SECTOR_SIZE);
1960593a9e7bSAlex Elder 
1961029bcbd8SJosh Durgin 	/* set io sizes to object size */
1962593a9e7bSAlex Elder 	segment_size = rbd_obj_bytes(&rbd_dev->header);
1963593a9e7bSAlex Elder 	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
1964593a9e7bSAlex Elder 	blk_queue_max_segment_size(q, segment_size);
1965593a9e7bSAlex Elder 	blk_queue_io_min(q, segment_size);
1966593a9e7bSAlex Elder 	blk_queue_io_opt(q, segment_size);
1967029bcbd8SJosh Durgin 
1968602adf40SYehuda Sadeh 	blk_queue_merge_bvec(q, rbd_merge_bvec);
1969602adf40SYehuda Sadeh 	disk->queue = q;
1970602adf40SYehuda Sadeh 
1971602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
1972602adf40SYehuda Sadeh 
1973602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
1974602adf40SYehuda Sadeh 
197512f02944SAlex Elder 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
197612f02944SAlex Elder 
1977602adf40SYehuda Sadeh 	return 0;
1978602adf40SYehuda Sadeh out_disk:
1979602adf40SYehuda Sadeh 	put_disk(disk);
19801fcdb8aaSAlex Elder 
19811fcdb8aaSAlex Elder 	return -ENOMEM;
1982602adf40SYehuda Sadeh }
1983602adf40SYehuda Sadeh 
1984dfc5606dSYehuda Sadeh /*
1985dfc5606dSYehuda Sadeh   sysfs
1986dfc5606dSYehuda Sadeh */
1987602adf40SYehuda Sadeh 
1988593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
1989593a9e7bSAlex Elder {
1990593a9e7bSAlex Elder 	return container_of(dev, struct rbd_device, dev);
1991593a9e7bSAlex Elder }
1992593a9e7bSAlex Elder 
1993dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
1994dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
1995602adf40SYehuda Sadeh {
1996593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1997a51aa0c0SJosh Durgin 	sector_t size;
1998dfc5606dSYehuda Sadeh 
1999a51aa0c0SJosh Durgin 	down_read(&rbd_dev->header_rwsem);
2000a51aa0c0SJosh Durgin 	size = get_capacity(rbd_dev->disk);
2001a51aa0c0SJosh Durgin 	up_read(&rbd_dev->header_rwsem);
2002a51aa0c0SJosh Durgin 
2003a51aa0c0SJosh Durgin 	return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
2004602adf40SYehuda Sadeh }
2005602adf40SYehuda Sadeh 
200634b13184SAlex Elder /*
200734b13184SAlex Elder  * Note this shows the features for whatever's mapped, which is not
200834b13184SAlex Elder  * necessarily the base image.
200934b13184SAlex Elder  */
201034b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev,
201134b13184SAlex Elder 			     struct device_attribute *attr, char *buf)
201234b13184SAlex Elder {
201334b13184SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
201434b13184SAlex Elder 
201534b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
201634b13184SAlex Elder 			(unsigned long long) rbd_dev->mapping.features);
201734b13184SAlex Elder }
201834b13184SAlex Elder 
2019dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
2020dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
2021602adf40SYehuda Sadeh {
2022593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2023dfc5606dSYehuda Sadeh 
2024dfc5606dSYehuda Sadeh 	return sprintf(buf, "%d\n", rbd_dev->major);
2025dfc5606dSYehuda Sadeh }
2026dfc5606dSYehuda Sadeh 
2027dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
2028dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
2029dfc5606dSYehuda Sadeh {
2030593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2031dfc5606dSYehuda Sadeh 
20321dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
20331dbb4399SAlex Elder 			ceph_client_id(rbd_dev->rbd_client->client));
2034dfc5606dSYehuda Sadeh }
2035dfc5606dSYehuda Sadeh 
2036dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
2037dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
2038dfc5606dSYehuda Sadeh {
2039593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2040dfc5606dSYehuda Sadeh 
20410d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
2042dfc5606dSYehuda Sadeh }
2043dfc5606dSYehuda Sadeh 
20449bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
20459bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
20469bb2f334SAlex Elder {
20479bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
20489bb2f334SAlex Elder 
20490d7dbfceSAlex Elder 	return sprintf(buf, "%llu\n",
20500d7dbfceSAlex Elder 		(unsigned long long) rbd_dev->spec->pool_id);
20519bb2f334SAlex Elder }
20529bb2f334SAlex Elder 
2053dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
2054dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
2055dfc5606dSYehuda Sadeh {
2056593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2057dfc5606dSYehuda Sadeh 
2058a92ffdf8SAlex Elder 	if (rbd_dev->spec->image_name)
20590d7dbfceSAlex Elder 		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
2060a92ffdf8SAlex Elder 
2061a92ffdf8SAlex Elder 	return sprintf(buf, "(unknown)\n");
2062dfc5606dSYehuda Sadeh }
2063dfc5606dSYehuda Sadeh 
2064589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev,
2065589d30e0SAlex Elder 			     struct device_attribute *attr, char *buf)
2066589d30e0SAlex Elder {
2067589d30e0SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2068589d30e0SAlex Elder 
20690d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
2070589d30e0SAlex Elder }
2071589d30e0SAlex Elder 
207234b13184SAlex Elder /*
207334b13184SAlex Elder  * Shows the name of the currently-mapped snapshot (or
207434b13184SAlex Elder  * RBD_SNAP_HEAD_NAME for the base image).
207534b13184SAlex Elder  */
2076dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
2077dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
2078dfc5606dSYehuda Sadeh 			     char *buf)
2079dfc5606dSYehuda Sadeh {
2080593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2081dfc5606dSYehuda Sadeh 
20820d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
2083dfc5606dSYehuda Sadeh }
2084dfc5606dSYehuda Sadeh 
208586b00e0dSAlex Elder /*
208686b00e0dSAlex Elder  * For an rbd v2 image, shows the pool id, image id, and snapshot id
208786b00e0dSAlex Elder  * for the parent image.  If there is no parent, simply shows
208886b00e0dSAlex Elder  * "(no parent image)".
208986b00e0dSAlex Elder  */
209086b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev,
209186b00e0dSAlex Elder 			     struct device_attribute *attr,
209286b00e0dSAlex Elder 			     char *buf)
209386b00e0dSAlex Elder {
209486b00e0dSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
209586b00e0dSAlex Elder 	struct rbd_spec *spec = rbd_dev->parent_spec;
209686b00e0dSAlex Elder 	int count;
209786b00e0dSAlex Elder 	char *bufp = buf;
209886b00e0dSAlex Elder 
209986b00e0dSAlex Elder 	if (!spec)
210086b00e0dSAlex Elder 		return sprintf(buf, "(no parent image)\n");
210186b00e0dSAlex Elder 
210286b00e0dSAlex Elder 	count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
210386b00e0dSAlex Elder 			(unsigned long long) spec->pool_id, spec->pool_name);
210486b00e0dSAlex Elder 	if (count < 0)
210586b00e0dSAlex Elder 		return count;
210686b00e0dSAlex Elder 	bufp += count;
210786b00e0dSAlex Elder 
210886b00e0dSAlex Elder 	count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
210986b00e0dSAlex Elder 			spec->image_name ? spec->image_name : "(unknown)");
211086b00e0dSAlex Elder 	if (count < 0)
211186b00e0dSAlex Elder 		return count;
211286b00e0dSAlex Elder 	bufp += count;
211386b00e0dSAlex Elder 
211486b00e0dSAlex Elder 	count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
211586b00e0dSAlex Elder 			(unsigned long long) spec->snap_id, spec->snap_name);
211686b00e0dSAlex Elder 	if (count < 0)
211786b00e0dSAlex Elder 		return count;
211886b00e0dSAlex Elder 	bufp += count;
211986b00e0dSAlex Elder 
212086b00e0dSAlex Elder 	count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
212186b00e0dSAlex Elder 	if (count < 0)
212286b00e0dSAlex Elder 		return count;
212386b00e0dSAlex Elder 	bufp += count;
212486b00e0dSAlex Elder 
212586b00e0dSAlex Elder 	return (ssize_t) (bufp - buf);
212686b00e0dSAlex Elder }
212786b00e0dSAlex Elder 
2128dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
2129dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
2130dfc5606dSYehuda Sadeh 				 const char *buf,
2131dfc5606dSYehuda Sadeh 				 size_t size)
2132dfc5606dSYehuda Sadeh {
2133593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2134b813623aSAlex Elder 	int ret;
2135602adf40SYehuda Sadeh 
2136117973fbSAlex Elder 	ret = rbd_dev_refresh(rbd_dev, NULL);
2137b813623aSAlex Elder 
2138b813623aSAlex Elder 	return ret < 0 ? ret : size;
2139dfc5606dSYehuda Sadeh }
2140602adf40SYehuda Sadeh 
/*
 * Attributes of an rbd device.  All of them are read-only (S_IRUGO
 * with only a show handler) except "refresh", which is write-only
 * for the owner (S_IWUSR with only a store handler).
 */
2141dfc5606dSYehuda Sadeh static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
214234b13184SAlex Elder static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
2143dfc5606dSYehuda Sadeh static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
2144dfc5606dSYehuda Sadeh static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
2145dfc5606dSYehuda Sadeh static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
21469bb2f334SAlex Elder static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
2147dfc5606dSYehuda Sadeh static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
2148589d30e0SAlex Elder static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
2149dfc5606dSYehuda Sadeh static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
2150dfc5606dSYehuda Sadeh static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
215186b00e0dSAlex Elder static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
2152dfc5606dSYehuda Sadeh 
/* NULL-terminated list of the rbd device attributes, used by rbd_attr_group. */
2153dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = {
2154dfc5606dSYehuda Sadeh 	&dev_attr_size.attr,
215534b13184SAlex Elder 	&dev_attr_features.attr,
2156dfc5606dSYehuda Sadeh 	&dev_attr_major.attr,
2157dfc5606dSYehuda Sadeh 	&dev_attr_client_id.attr,
2158dfc5606dSYehuda Sadeh 	&dev_attr_pool.attr,
21599bb2f334SAlex Elder 	&dev_attr_pool_id.attr,
2160dfc5606dSYehuda Sadeh 	&dev_attr_name.attr,
2161589d30e0SAlex Elder 	&dev_attr_image_id.attr,
2162dfc5606dSYehuda Sadeh 	&dev_attr_current_snap.attr,
216386b00e0dSAlex Elder 	&dev_attr_parent.attr,
2164dfc5606dSYehuda Sadeh 	&dev_attr_refresh.attr,
2165dfc5606dSYehuda Sadeh 	NULL
2166dfc5606dSYehuda Sadeh };
2167dfc5606dSYehuda Sadeh 
/* Unnamed attribute group wrapping rbd_attrs. */
2168dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = {
2169dfc5606dSYehuda Sadeh 	.attrs = rbd_attrs,
2170dfc5606dSYehuda Sadeh };
2171dfc5606dSYehuda Sadeh 
/* NULL-terminated list of attribute groups referenced by rbd_device_type. */
2172dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = {
2173dfc5606dSYehuda Sadeh 	&rbd_attr_group,
2174dfc5606dSYehuda Sadeh 	NULL
2175dfc5606dSYehuda Sadeh };
2176dfc5606dSYehuda Sadeh 
/*
 * Release callback installed in rbd_device_type.  It does nothing.
 * NOTE(review): presumably the rbd_device is freed separately (see
 * rbd_dev_destroy()) rather than from this callback -- confirm.
 */
2177dfc5606dSYehuda Sadeh static void rbd_sysfs_dev_release(struct device *dev)
2178dfc5606dSYehuda Sadeh {
2179dfc5606dSYehuda Sadeh }
2180dfc5606dSYehuda Sadeh 
/*
 * Device type for rbd devices: named "rbd", carries the attribute
 * groups above and the (empty) release callback.
 */
2181dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = {
2182dfc5606dSYehuda Sadeh 	.name		= "rbd",
2183dfc5606dSYehuda Sadeh 	.groups		= rbd_attr_groups,
2184dfc5606dSYehuda Sadeh 	.release	= rbd_sysfs_dev_release,
2185dfc5606dSYehuda Sadeh };
2186dfc5606dSYehuda Sadeh 
2187dfc5606dSYehuda Sadeh 
2188dfc5606dSYehuda Sadeh /*
2189dfc5606dSYehuda Sadeh   sysfs - snapshots
2190dfc5606dSYehuda Sadeh */
2191dfc5606dSYehuda Sadeh 
2192dfc5606dSYehuda Sadeh static ssize_t rbd_snap_size_show(struct device *dev,
2193dfc5606dSYehuda Sadeh 				  struct device_attribute *attr,
2194dfc5606dSYehuda Sadeh 				  char *buf)
2195dfc5606dSYehuda Sadeh {
2196dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2197dfc5606dSYehuda Sadeh 
21983591538fSJosh Durgin 	return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
2199dfc5606dSYehuda Sadeh }
2200dfc5606dSYehuda Sadeh 
2201dfc5606dSYehuda Sadeh static ssize_t rbd_snap_id_show(struct device *dev,
2202dfc5606dSYehuda Sadeh 				struct device_attribute *attr,
2203dfc5606dSYehuda Sadeh 				char *buf)
2204dfc5606dSYehuda Sadeh {
2205dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2206dfc5606dSYehuda Sadeh 
2207593a9e7bSAlex Elder 	return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
2208dfc5606dSYehuda Sadeh }
2209dfc5606dSYehuda Sadeh 
221034b13184SAlex Elder static ssize_t rbd_snap_features_show(struct device *dev,
221134b13184SAlex Elder 				struct device_attribute *attr,
221234b13184SAlex Elder 				char *buf)
221334b13184SAlex Elder {
221434b13184SAlex Elder 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
221534b13184SAlex Elder 
221634b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
221734b13184SAlex Elder 			(unsigned long long) snap->features);
221834b13184SAlex Elder }
221934b13184SAlex Elder 
/* Read-only (S_IRUGO, show handler only) attributes of a snapshot device. */
2220dfc5606dSYehuda Sadeh static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
2221dfc5606dSYehuda Sadeh static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
222234b13184SAlex Elder static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
2223dfc5606dSYehuda Sadeh 
/* NULL-terminated list of the snapshot attributes, used by rbd_snap_attr_group. */
2224dfc5606dSYehuda Sadeh static struct attribute *rbd_snap_attrs[] = {
2225dfc5606dSYehuda Sadeh 	&dev_attr_snap_size.attr,
2226dfc5606dSYehuda Sadeh 	&dev_attr_snap_id.attr,
222734b13184SAlex Elder 	&dev_attr_snap_features.attr,
2228dfc5606dSYehuda Sadeh 	NULL,
2229dfc5606dSYehuda Sadeh };
2230dfc5606dSYehuda Sadeh 
/* Unnamed attribute group wrapping rbd_snap_attrs. */
2231dfc5606dSYehuda Sadeh static struct attribute_group rbd_snap_attr_group = {
2232dfc5606dSYehuda Sadeh 	.attrs = rbd_snap_attrs,
2233dfc5606dSYehuda Sadeh };
2234dfc5606dSYehuda Sadeh 
2235dfc5606dSYehuda Sadeh static void rbd_snap_dev_release(struct device *dev)
2236dfc5606dSYehuda Sadeh {
2237dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2238dfc5606dSYehuda Sadeh 	kfree(snap->name);
2239dfc5606dSYehuda Sadeh 	kfree(snap);
2240dfc5606dSYehuda Sadeh }
2241dfc5606dSYehuda Sadeh 
/* NULL-terminated list of attribute groups referenced by rbd_snap_device_type. */
2242dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_snap_attr_groups[] = {
2243dfc5606dSYehuda Sadeh 	&rbd_snap_attr_group,
2244dfc5606dSYehuda Sadeh 	NULL
2245dfc5606dSYehuda Sadeh };
2246dfc5606dSYehuda Sadeh 
/*
 * Device type for snapshot devices: supplies the snapshot attribute
 * groups and a release callback that frees the rbd_snap.
 */
2247dfc5606dSYehuda Sadeh static struct device_type rbd_snap_device_type = {
2248dfc5606dSYehuda Sadeh 	.groups		= rbd_snap_attr_groups,
2249dfc5606dSYehuda Sadeh 	.release	= rbd_snap_dev_release,
2250dfc5606dSYehuda Sadeh };
2251dfc5606dSYehuda Sadeh 
22528b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
22538b8fb99cSAlex Elder {
22548b8fb99cSAlex Elder 	kref_get(&spec->kref);
22558b8fb99cSAlex Elder 
22568b8fb99cSAlex Elder 	return spec;
22578b8fb99cSAlex Elder }
22588b8fb99cSAlex Elder 
22598b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref);
22608b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec)
22618b8fb99cSAlex Elder {
22628b8fb99cSAlex Elder 	if (spec)
22638b8fb99cSAlex Elder 		kref_put(&spec->kref, rbd_spec_free);
22648b8fb99cSAlex Elder }
22658b8fb99cSAlex Elder 
22668b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void)
22678b8fb99cSAlex Elder {
22688b8fb99cSAlex Elder 	struct rbd_spec *spec;
22698b8fb99cSAlex Elder 
22708b8fb99cSAlex Elder 	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
22718b8fb99cSAlex Elder 	if (!spec)
22728b8fb99cSAlex Elder 		return NULL;
22738b8fb99cSAlex Elder 	kref_init(&spec->kref);
22748b8fb99cSAlex Elder 
22758b8fb99cSAlex Elder 	rbd_spec_put(rbd_spec_get(spec));	/* TEMPORARY */
22768b8fb99cSAlex Elder 
22778b8fb99cSAlex Elder 	return spec;
22788b8fb99cSAlex Elder }
22798b8fb99cSAlex Elder 
22808b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref)
22818b8fb99cSAlex Elder {
22828b8fb99cSAlex Elder 	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
22838b8fb99cSAlex Elder 
22848b8fb99cSAlex Elder 	kfree(spec->pool_name);
22858b8fb99cSAlex Elder 	kfree(spec->image_id);
22868b8fb99cSAlex Elder 	kfree(spec->image_name);
22878b8fb99cSAlex Elder 	kfree(spec->snap_name);
22888b8fb99cSAlex Elder 	kfree(spec);
22898b8fb99cSAlex Elder }
22908b8fb99cSAlex Elder 
2291c53d5893SAlex Elder struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
2292c53d5893SAlex Elder 				struct rbd_spec *spec)
2293c53d5893SAlex Elder {
2294c53d5893SAlex Elder 	struct rbd_device *rbd_dev;
2295c53d5893SAlex Elder 
2296c53d5893SAlex Elder 	rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
2297c53d5893SAlex Elder 	if (!rbd_dev)
2298c53d5893SAlex Elder 		return NULL;
2299c53d5893SAlex Elder 
2300c53d5893SAlex Elder 	spin_lock_init(&rbd_dev->lock);
2301d78b650aSAlex Elder 	atomic_set(&rbd_dev->exists, 0);
2302c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->node);
2303c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->snaps);
2304c53d5893SAlex Elder 	init_rwsem(&rbd_dev->header_rwsem);
2305c53d5893SAlex Elder 
2306c53d5893SAlex Elder 	rbd_dev->spec = spec;
2307c53d5893SAlex Elder 	rbd_dev->rbd_client = rbdc;
2308c53d5893SAlex Elder 
2309c53d5893SAlex Elder 	return rbd_dev;
2310c53d5893SAlex Elder }
2311c53d5893SAlex Elder 
2312c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev)
2313c53d5893SAlex Elder {
231486b00e0dSAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
2315c53d5893SAlex Elder 	kfree(rbd_dev->header_name);
2316c53d5893SAlex Elder 	rbd_put_client(rbd_dev->rbd_client);
2317c53d5893SAlex Elder 	rbd_spec_put(rbd_dev->spec);
2318c53d5893SAlex Elder 	kfree(rbd_dev);
2319c53d5893SAlex Elder }
2320c53d5893SAlex Elder 
2321304f6808SAlex Elder static bool rbd_snap_registered(struct rbd_snap *snap)
2322304f6808SAlex Elder {
2323304f6808SAlex Elder 	bool ret = snap->dev.type == &rbd_snap_device_type;
2324304f6808SAlex Elder 	bool reg = device_is_registered(&snap->dev);
2325304f6808SAlex Elder 
2326304f6808SAlex Elder 	rbd_assert(!ret ^ reg);
2327304f6808SAlex Elder 
2328304f6808SAlex Elder 	return ret;
2329304f6808SAlex Elder }
2330304f6808SAlex Elder 
233141f38c2bSAlex Elder static void rbd_remove_snap_dev(struct rbd_snap *snap)
2332dfc5606dSYehuda Sadeh {
2333dfc5606dSYehuda Sadeh 	list_del(&snap->node);
2334304f6808SAlex Elder 	if (device_is_registered(&snap->dev))
2335dfc5606dSYehuda Sadeh 		device_unregister(&snap->dev);
2336dfc5606dSYehuda Sadeh }
2337dfc5606dSYehuda Sadeh 
233814e7085dSAlex Elder static int rbd_register_snap_dev(struct rbd_snap *snap,
2339dfc5606dSYehuda Sadeh 				  struct device *parent)
2340dfc5606dSYehuda Sadeh {
2341dfc5606dSYehuda Sadeh 	struct device *dev = &snap->dev;
2342dfc5606dSYehuda Sadeh 	int ret;
2343dfc5606dSYehuda Sadeh 
2344dfc5606dSYehuda Sadeh 	dev->type = &rbd_snap_device_type;
2345dfc5606dSYehuda Sadeh 	dev->parent = parent;
2346dfc5606dSYehuda Sadeh 	dev->release = rbd_snap_dev_release;
2347d4b125e9SAlex Elder 	dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
2348304f6808SAlex Elder 	dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2349304f6808SAlex Elder 
2350dfc5606dSYehuda Sadeh 	ret = device_register(dev);
2351dfc5606dSYehuda Sadeh 
2352dfc5606dSYehuda Sadeh 	return ret;
2353dfc5606dSYehuda Sadeh }
2354dfc5606dSYehuda Sadeh 
23554e891e0aSAlex Elder static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
2356c8d18425SAlex Elder 						const char *snap_name,
235734b13184SAlex Elder 						u64 snap_id, u64 snap_size,
235834b13184SAlex Elder 						u64 snap_features)
2359dfc5606dSYehuda Sadeh {
23604e891e0aSAlex Elder 	struct rbd_snap *snap;
2361dfc5606dSYehuda Sadeh 	int ret;
23624e891e0aSAlex Elder 
23634e891e0aSAlex Elder 	snap = kzalloc(sizeof (*snap), GFP_KERNEL);
2364dfc5606dSYehuda Sadeh 	if (!snap)
23654e891e0aSAlex Elder 		return ERR_PTR(-ENOMEM);
23664e891e0aSAlex Elder 
23674e891e0aSAlex Elder 	ret = -ENOMEM;
2368c8d18425SAlex Elder 	snap->name = kstrdup(snap_name, GFP_KERNEL);
23694e891e0aSAlex Elder 	if (!snap->name)
23704e891e0aSAlex Elder 		goto err;
23714e891e0aSAlex Elder 
2372c8d18425SAlex Elder 	snap->id = snap_id;
2373c8d18425SAlex Elder 	snap->size = snap_size;
237434b13184SAlex Elder 	snap->features = snap_features;
23754e891e0aSAlex Elder 
23764e891e0aSAlex Elder 	return snap;
23774e891e0aSAlex Elder 
2378dfc5606dSYehuda Sadeh err:
2379dfc5606dSYehuda Sadeh 	kfree(snap->name);
2380dfc5606dSYehuda Sadeh 	kfree(snap);
23814e891e0aSAlex Elder 
23824e891e0aSAlex Elder 	return ERR_PTR(ret);
2383dfc5606dSYehuda Sadeh }
2384dfc5606dSYehuda Sadeh 
2385cd892126SAlex Elder static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2386cd892126SAlex Elder 		u64 *snap_size, u64 *snap_features)
2387cd892126SAlex Elder {
2388cd892126SAlex Elder 	char *snap_name;
2389cd892126SAlex Elder 
2390cd892126SAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2391cd892126SAlex Elder 
2392cd892126SAlex Elder 	*snap_size = rbd_dev->header.snap_sizes[which];
2393cd892126SAlex Elder 	*snap_features = 0;	/* No features for v1 */
2394cd892126SAlex Elder 
2395cd892126SAlex Elder 	/* Skip over names until we find the one we are looking for */
2396cd892126SAlex Elder 
2397cd892126SAlex Elder 	snap_name = rbd_dev->header.snap_names;
2398cd892126SAlex Elder 	while (which--)
2399cd892126SAlex Elder 		snap_name += strlen(snap_name) + 1;
2400cd892126SAlex Elder 
2401cd892126SAlex Elder 	return snap_name;
2402cd892126SAlex Elder }
2403cd892126SAlex Elder 
2404dfc5606dSYehuda Sadeh /*
24059d475de5SAlex Elder  * Get the size and object order for an image snapshot, or if
24069d475de5SAlex Elder  * snap_id is CEPH_NOSNAP, gets this information for the base
24079d475de5SAlex Elder  * image.
24089d475de5SAlex Elder  */
24099d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
24109d475de5SAlex Elder 				u8 *order, u64 *snap_size)
24119d475de5SAlex Elder {
24129d475de5SAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
24139d475de5SAlex Elder 	int ret;
24149d475de5SAlex Elder 	struct {
24159d475de5SAlex Elder 		u8 order;
24169d475de5SAlex Elder 		__le64 size;
24179d475de5SAlex Elder 	} __attribute__ ((packed)) size_buf = { 0 };
24189d475de5SAlex Elder 
24199d475de5SAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
24209d475de5SAlex Elder 				"rbd", "get_size",
24219d475de5SAlex Elder 				(char *) &snapid, sizeof (snapid),
24229d475de5SAlex Elder 				(char *) &size_buf, sizeof (size_buf),
24239d475de5SAlex Elder 				CEPH_OSD_FLAG_READ, NULL);
24249d475de5SAlex Elder 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
24259d475de5SAlex Elder 	if (ret < 0)
24269d475de5SAlex Elder 		return ret;
24279d475de5SAlex Elder 
24289d475de5SAlex Elder 	*order = size_buf.order;
24299d475de5SAlex Elder 	*snap_size = le64_to_cpu(size_buf.size);
24309d475de5SAlex Elder 
24319d475de5SAlex Elder 	dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
24329d475de5SAlex Elder 		(unsigned long long) snap_id, (unsigned int) *order,
24339d475de5SAlex Elder 		(unsigned long long) *snap_size);
24349d475de5SAlex Elder 
24359d475de5SAlex Elder 	return 0;
24369d475de5SAlex Elder }
24379d475de5SAlex Elder 
24389d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
24399d475de5SAlex Elder {
24409d475de5SAlex Elder 	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
24419d475de5SAlex Elder 					&rbd_dev->header.obj_order,
24429d475de5SAlex Elder 					&rbd_dev->header.image_size);
24439d475de5SAlex Elder }
24449d475de5SAlex Elder 
24451e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
24461e130199SAlex Elder {
24471e130199SAlex Elder 	void *reply_buf;
24481e130199SAlex Elder 	int ret;
24491e130199SAlex Elder 	void *p;
24501e130199SAlex Elder 
24511e130199SAlex Elder 	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
24521e130199SAlex Elder 	if (!reply_buf)
24531e130199SAlex Elder 		return -ENOMEM;
24541e130199SAlex Elder 
24551e130199SAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
24561e130199SAlex Elder 				"rbd", "get_object_prefix",
24571e130199SAlex Elder 				NULL, 0,
24581e130199SAlex Elder 				reply_buf, RBD_OBJ_PREFIX_LEN_MAX,
24591e130199SAlex Elder 				CEPH_OSD_FLAG_READ, NULL);
24601e130199SAlex Elder 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
24611e130199SAlex Elder 	if (ret < 0)
24621e130199SAlex Elder 		goto out;
2463a0ea3a40SAlex Elder 	ret = 0;    /* rbd_req_sync_exec() can return positive */
24641e130199SAlex Elder 
24651e130199SAlex Elder 	p = reply_buf;
24661e130199SAlex Elder 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
24671e130199SAlex Elder 						p + RBD_OBJ_PREFIX_LEN_MAX,
24681e130199SAlex Elder 						NULL, GFP_NOIO);
24691e130199SAlex Elder 
24701e130199SAlex Elder 	if (IS_ERR(rbd_dev->header.object_prefix)) {
24711e130199SAlex Elder 		ret = PTR_ERR(rbd_dev->header.object_prefix);
24721e130199SAlex Elder 		rbd_dev->header.object_prefix = NULL;
24731e130199SAlex Elder 	} else {
24741e130199SAlex Elder 		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
24751e130199SAlex Elder 	}
24761e130199SAlex Elder 
24771e130199SAlex Elder out:
24781e130199SAlex Elder 	kfree(reply_buf);
24791e130199SAlex Elder 
24801e130199SAlex Elder 	return ret;
24811e130199SAlex Elder }
24821e130199SAlex Elder 
2483b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
2484b1b5402aSAlex Elder 		u64 *snap_features)
2485b1b5402aSAlex Elder {
2486b1b5402aSAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
2487b1b5402aSAlex Elder 	struct {
2488b1b5402aSAlex Elder 		__le64 features;
2489b1b5402aSAlex Elder 		__le64 incompat;
2490b1b5402aSAlex Elder 	} features_buf = { 0 };
2491d889140cSAlex Elder 	u64 incompat;
2492b1b5402aSAlex Elder 	int ret;
2493b1b5402aSAlex Elder 
2494b1b5402aSAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2495b1b5402aSAlex Elder 				"rbd", "get_features",
2496b1b5402aSAlex Elder 				(char *) &snapid, sizeof (snapid),
2497b1b5402aSAlex Elder 				(char *) &features_buf, sizeof (features_buf),
2498b1b5402aSAlex Elder 				CEPH_OSD_FLAG_READ, NULL);
2499b1b5402aSAlex Elder 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2500b1b5402aSAlex Elder 	if (ret < 0)
2501b1b5402aSAlex Elder 		return ret;
2502d889140cSAlex Elder 
2503d889140cSAlex Elder 	incompat = le64_to_cpu(features_buf.incompat);
2504d889140cSAlex Elder 	if (incompat & ~RBD_FEATURES_ALL)
2505b8f5c6edSAlex Elder 		return -ENXIO;
2506d889140cSAlex Elder 
2507b1b5402aSAlex Elder 	*snap_features = le64_to_cpu(features_buf.features);
2508b1b5402aSAlex Elder 
2509b1b5402aSAlex Elder 	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
2510b1b5402aSAlex Elder 		(unsigned long long) snap_id,
2511b1b5402aSAlex Elder 		(unsigned long long) *snap_features,
2512b1b5402aSAlex Elder 		(unsigned long long) le64_to_cpu(features_buf.incompat));
2513b1b5402aSAlex Elder 
2514b1b5402aSAlex Elder 	return 0;
2515b1b5402aSAlex Elder }
2516b1b5402aSAlex Elder 
2517b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
2518b1b5402aSAlex Elder {
2519b1b5402aSAlex Elder 	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
2520b1b5402aSAlex Elder 						&rbd_dev->header.features);
2521b1b5402aSAlex Elder }
2522b1b5402aSAlex Elder 
252386b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
252486b00e0dSAlex Elder {
252586b00e0dSAlex Elder 	struct rbd_spec *parent_spec;
252686b00e0dSAlex Elder 	size_t size;
252786b00e0dSAlex Elder 	void *reply_buf = NULL;
252886b00e0dSAlex Elder 	__le64 snapid;
252986b00e0dSAlex Elder 	void *p;
253086b00e0dSAlex Elder 	void *end;
253186b00e0dSAlex Elder 	char *image_id;
253286b00e0dSAlex Elder 	u64 overlap;
253386b00e0dSAlex Elder 	int ret;
253486b00e0dSAlex Elder 
253586b00e0dSAlex Elder 	parent_spec = rbd_spec_alloc();
253686b00e0dSAlex Elder 	if (!parent_spec)
253786b00e0dSAlex Elder 		return -ENOMEM;
253886b00e0dSAlex Elder 
253986b00e0dSAlex Elder 	size = sizeof (__le64) +				/* pool_id */
254086b00e0dSAlex Elder 		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
254186b00e0dSAlex Elder 		sizeof (__le64) +				/* snap_id */
254286b00e0dSAlex Elder 		sizeof (__le64);				/* overlap */
254386b00e0dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
254486b00e0dSAlex Elder 	if (!reply_buf) {
254586b00e0dSAlex Elder 		ret = -ENOMEM;
254686b00e0dSAlex Elder 		goto out_err;
254786b00e0dSAlex Elder 	}
254886b00e0dSAlex Elder 
254986b00e0dSAlex Elder 	snapid = cpu_to_le64(CEPH_NOSNAP);
255086b00e0dSAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
255186b00e0dSAlex Elder 				"rbd", "get_parent",
255286b00e0dSAlex Elder 				(char *) &snapid, sizeof (snapid),
255386b00e0dSAlex Elder 				(char *) reply_buf, size,
255486b00e0dSAlex Elder 				CEPH_OSD_FLAG_READ, NULL);
255586b00e0dSAlex Elder 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
255686b00e0dSAlex Elder 	if (ret < 0)
255786b00e0dSAlex Elder 		goto out_err;
255886b00e0dSAlex Elder 
255986b00e0dSAlex Elder 	ret = -ERANGE;
256086b00e0dSAlex Elder 	p = reply_buf;
256186b00e0dSAlex Elder 	end = (char *) reply_buf + size;
256286b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
256386b00e0dSAlex Elder 	if (parent_spec->pool_id == CEPH_NOPOOL)
256486b00e0dSAlex Elder 		goto out;	/* No parent?  No problem. */
256586b00e0dSAlex Elder 
2566979ed480SAlex Elder 	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
256786b00e0dSAlex Elder 	if (IS_ERR(image_id)) {
256886b00e0dSAlex Elder 		ret = PTR_ERR(image_id);
256986b00e0dSAlex Elder 		goto out_err;
257086b00e0dSAlex Elder 	}
257186b00e0dSAlex Elder 	parent_spec->image_id = image_id;
257286b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
257386b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, overlap, out_err);
257486b00e0dSAlex Elder 
257586b00e0dSAlex Elder 	rbd_dev->parent_overlap = overlap;
257686b00e0dSAlex Elder 	rbd_dev->parent_spec = parent_spec;
257786b00e0dSAlex Elder 	parent_spec = NULL;	/* rbd_dev now owns this */
257886b00e0dSAlex Elder out:
257986b00e0dSAlex Elder 	ret = 0;
258086b00e0dSAlex Elder out_err:
258186b00e0dSAlex Elder 	kfree(reply_buf);
258286b00e0dSAlex Elder 	rbd_spec_put(parent_spec);
258386b00e0dSAlex Elder 
258486b00e0dSAlex Elder 	return ret;
258586b00e0dSAlex Elder }
258686b00e0dSAlex Elder 
25879e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
25889e15b77dSAlex Elder {
25899e15b77dSAlex Elder 	size_t image_id_size;
25909e15b77dSAlex Elder 	char *image_id;
25919e15b77dSAlex Elder 	void *p;
25929e15b77dSAlex Elder 	void *end;
25939e15b77dSAlex Elder 	size_t size;
25949e15b77dSAlex Elder 	void *reply_buf = NULL;
25959e15b77dSAlex Elder 	size_t len = 0;
25969e15b77dSAlex Elder 	char *image_name = NULL;
25979e15b77dSAlex Elder 	int ret;
25989e15b77dSAlex Elder 
25999e15b77dSAlex Elder 	rbd_assert(!rbd_dev->spec->image_name);
26009e15b77dSAlex Elder 
260169e7a02fSAlex Elder 	len = strlen(rbd_dev->spec->image_id);
260269e7a02fSAlex Elder 	image_id_size = sizeof (__le32) + len;
26039e15b77dSAlex Elder 	image_id = kmalloc(image_id_size, GFP_KERNEL);
26049e15b77dSAlex Elder 	if (!image_id)
26059e15b77dSAlex Elder 		return NULL;
26069e15b77dSAlex Elder 
26079e15b77dSAlex Elder 	p = image_id;
26089e15b77dSAlex Elder 	end = (char *) image_id + image_id_size;
260969e7a02fSAlex Elder 	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);
26109e15b77dSAlex Elder 
26119e15b77dSAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
26129e15b77dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
26139e15b77dSAlex Elder 	if (!reply_buf)
26149e15b77dSAlex Elder 		goto out;
26159e15b77dSAlex Elder 
26169e15b77dSAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, RBD_DIRECTORY,
26179e15b77dSAlex Elder 				"rbd", "dir_get_name",
26189e15b77dSAlex Elder 				image_id, image_id_size,
26199e15b77dSAlex Elder 				(char *) reply_buf, size,
26209e15b77dSAlex Elder 				CEPH_OSD_FLAG_READ, NULL);
26219e15b77dSAlex Elder 	if (ret < 0)
26229e15b77dSAlex Elder 		goto out;
26239e15b77dSAlex Elder 	p = reply_buf;
26249e15b77dSAlex Elder 	end = (char *) reply_buf + size;
26259e15b77dSAlex Elder 	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
26269e15b77dSAlex Elder 	if (IS_ERR(image_name))
26279e15b77dSAlex Elder 		image_name = NULL;
26289e15b77dSAlex Elder 	else
26299e15b77dSAlex Elder 		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
26309e15b77dSAlex Elder out:
26319e15b77dSAlex Elder 	kfree(reply_buf);
26329e15b77dSAlex Elder 	kfree(image_id);
26339e15b77dSAlex Elder 
26349e15b77dSAlex Elder 	return image_name;
26359e15b77dSAlex Elder }
26369e15b77dSAlex Elder 
26379e15b77dSAlex Elder /*
26389e15b77dSAlex Elder  * When a parent image gets probed, we only have the pool, image,
26399e15b77dSAlex Elder  * and snapshot ids but not the names of any of them.  This call
26409e15b77dSAlex Elder  * is made later to fill in those names.  It has to be done after
26419e15b77dSAlex Elder  * rbd_dev_snaps_update() has completed because some of the
26429e15b77dSAlex Elder  * information (in particular, snapshot name) is not available
26439e15b77dSAlex Elder  * until then.
26449e15b77dSAlex Elder  */
26459e15b77dSAlex Elder static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
26469e15b77dSAlex Elder {
26479e15b77dSAlex Elder 	struct ceph_osd_client *osdc;
26489e15b77dSAlex Elder 	const char *name;
26499e15b77dSAlex Elder 	void *reply_buf = NULL;
26509e15b77dSAlex Elder 	int ret;
26519e15b77dSAlex Elder 
26529e15b77dSAlex Elder 	if (rbd_dev->spec->pool_name)
26539e15b77dSAlex Elder 		return 0;	/* Already have the names */
26549e15b77dSAlex Elder 
26559e15b77dSAlex Elder 	/* Look up the pool name */
26569e15b77dSAlex Elder 
26579e15b77dSAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
26589e15b77dSAlex Elder 	name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
2659935dc89fSAlex Elder 	if (!name) {
2660935dc89fSAlex Elder 		rbd_warn(rbd_dev, "there is no pool with id %llu",
2661935dc89fSAlex Elder 			rbd_dev->spec->pool_id);	/* Really a BUG() */
2662935dc89fSAlex Elder 		return -EIO;
2663935dc89fSAlex Elder 	}
26649e15b77dSAlex Elder 
26659e15b77dSAlex Elder 	rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
26669e15b77dSAlex Elder 	if (!rbd_dev->spec->pool_name)
26679e15b77dSAlex Elder 		return -ENOMEM;
26689e15b77dSAlex Elder 
26699e15b77dSAlex Elder 	/* Fetch the image name; tolerate failure here */
26709e15b77dSAlex Elder 
26719e15b77dSAlex Elder 	name = rbd_dev_image_name(rbd_dev);
267269e7a02fSAlex Elder 	if (name)
26739e15b77dSAlex Elder 		rbd_dev->spec->image_name = (char *) name;
267469e7a02fSAlex Elder 	else
267506ecc6cbSAlex Elder 		rbd_warn(rbd_dev, "unable to get image name");
26769e15b77dSAlex Elder 
26779e15b77dSAlex Elder 	/* Look up the snapshot name. */
26789e15b77dSAlex Elder 
26799e15b77dSAlex Elder 	name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
26809e15b77dSAlex Elder 	if (!name) {
2681935dc89fSAlex Elder 		rbd_warn(rbd_dev, "no snapshot with id %llu",
2682935dc89fSAlex Elder 			rbd_dev->spec->snap_id);	/* Really a BUG() */
26839e15b77dSAlex Elder 		ret = -EIO;
26849e15b77dSAlex Elder 		goto out_err;
26859e15b77dSAlex Elder 	}
26869e15b77dSAlex Elder 	rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
26879e15b77dSAlex Elder 	if(!rbd_dev->spec->snap_name)
26889e15b77dSAlex Elder 		goto out_err;
26899e15b77dSAlex Elder 
26909e15b77dSAlex Elder 	return 0;
26919e15b77dSAlex Elder out_err:
26929e15b77dSAlex Elder 	kfree(reply_buf);
26939e15b77dSAlex Elder 	kfree(rbd_dev->spec->pool_name);
26949e15b77dSAlex Elder 	rbd_dev->spec->pool_name = NULL;
26959e15b77dSAlex Elder 
26969e15b77dSAlex Elder 	return ret;
26979e15b77dSAlex Elder }
26989e15b77dSAlex Elder 
26996e14b1a6SAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
270035d489f9SAlex Elder {
270135d489f9SAlex Elder 	size_t size;
270235d489f9SAlex Elder 	int ret;
270335d489f9SAlex Elder 	void *reply_buf;
270435d489f9SAlex Elder 	void *p;
270535d489f9SAlex Elder 	void *end;
270635d489f9SAlex Elder 	u64 seq;
270735d489f9SAlex Elder 	u32 snap_count;
270835d489f9SAlex Elder 	struct ceph_snap_context *snapc;
270935d489f9SAlex Elder 	u32 i;
271035d489f9SAlex Elder 
271135d489f9SAlex Elder 	/*
271235d489f9SAlex Elder 	 * We'll need room for the seq value (maximum snapshot id),
271335d489f9SAlex Elder 	 * snapshot count, and array of that many snapshot ids.
271435d489f9SAlex Elder 	 * For now we have a fixed upper limit on the number we're
271535d489f9SAlex Elder 	 * prepared to receive.
271635d489f9SAlex Elder 	 */
271735d489f9SAlex Elder 	size = sizeof (__le64) + sizeof (__le32) +
271835d489f9SAlex Elder 			RBD_MAX_SNAP_COUNT * sizeof (__le64);
271935d489f9SAlex Elder 	reply_buf = kzalloc(size, GFP_KERNEL);
272035d489f9SAlex Elder 	if (!reply_buf)
272135d489f9SAlex Elder 		return -ENOMEM;
272235d489f9SAlex Elder 
272335d489f9SAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
272435d489f9SAlex Elder 				"rbd", "get_snapcontext",
272535d489f9SAlex Elder 				NULL, 0,
272635d489f9SAlex Elder 				reply_buf, size,
27276e14b1a6SAlex Elder 				CEPH_OSD_FLAG_READ, ver);
272835d489f9SAlex Elder 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
272935d489f9SAlex Elder 	if (ret < 0)
273035d489f9SAlex Elder 		goto out;
273135d489f9SAlex Elder 
273235d489f9SAlex Elder 	ret = -ERANGE;
273335d489f9SAlex Elder 	p = reply_buf;
273435d489f9SAlex Elder 	end = (char *) reply_buf + size;
273535d489f9SAlex Elder 	ceph_decode_64_safe(&p, end, seq, out);
273635d489f9SAlex Elder 	ceph_decode_32_safe(&p, end, snap_count, out);
273735d489f9SAlex Elder 
273835d489f9SAlex Elder 	/*
273935d489f9SAlex Elder 	 * Make sure the reported number of snapshot ids wouldn't go
274035d489f9SAlex Elder 	 * beyond the end of our buffer.  But before checking that,
274135d489f9SAlex Elder 	 * make sure the computed size of the snapshot context we
274235d489f9SAlex Elder 	 * allocate is representable in a size_t.
274335d489f9SAlex Elder 	 */
274435d489f9SAlex Elder 	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
274535d489f9SAlex Elder 				 / sizeof (u64)) {
274635d489f9SAlex Elder 		ret = -EINVAL;
274735d489f9SAlex Elder 		goto out;
274835d489f9SAlex Elder 	}
274935d489f9SAlex Elder 	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
275035d489f9SAlex Elder 		goto out;
275135d489f9SAlex Elder 
275235d489f9SAlex Elder 	size = sizeof (struct ceph_snap_context) +
275335d489f9SAlex Elder 				snap_count * sizeof (snapc->snaps[0]);
275435d489f9SAlex Elder 	snapc = kmalloc(size, GFP_KERNEL);
275535d489f9SAlex Elder 	if (!snapc) {
275635d489f9SAlex Elder 		ret = -ENOMEM;
275735d489f9SAlex Elder 		goto out;
275835d489f9SAlex Elder 	}
275935d489f9SAlex Elder 
276035d489f9SAlex Elder 	atomic_set(&snapc->nref, 1);
276135d489f9SAlex Elder 	snapc->seq = seq;
276235d489f9SAlex Elder 	snapc->num_snaps = snap_count;
276335d489f9SAlex Elder 	for (i = 0; i < snap_count; i++)
276435d489f9SAlex Elder 		snapc->snaps[i] = ceph_decode_64(&p);
276535d489f9SAlex Elder 
276635d489f9SAlex Elder 	rbd_dev->header.snapc = snapc;
276735d489f9SAlex Elder 
276835d489f9SAlex Elder 	dout("  snap context seq = %llu, snap_count = %u\n",
276935d489f9SAlex Elder 		(unsigned long long) seq, (unsigned int) snap_count);
277035d489f9SAlex Elder 
277135d489f9SAlex Elder out:
277235d489f9SAlex Elder 	kfree(reply_buf);
277335d489f9SAlex Elder 
277435d489f9SAlex Elder 	return 0;
277535d489f9SAlex Elder }
277635d489f9SAlex Elder 
2777b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
2778b8b1e2dbSAlex Elder {
2779b8b1e2dbSAlex Elder 	size_t size;
2780b8b1e2dbSAlex Elder 	void *reply_buf;
2781b8b1e2dbSAlex Elder 	__le64 snap_id;
2782b8b1e2dbSAlex Elder 	int ret;
2783b8b1e2dbSAlex Elder 	void *p;
2784b8b1e2dbSAlex Elder 	void *end;
2785b8b1e2dbSAlex Elder 	char *snap_name;
2786b8b1e2dbSAlex Elder 
2787b8b1e2dbSAlex Elder 	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
2788b8b1e2dbSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
2789b8b1e2dbSAlex Elder 	if (!reply_buf)
2790b8b1e2dbSAlex Elder 		return ERR_PTR(-ENOMEM);
2791b8b1e2dbSAlex Elder 
2792b8b1e2dbSAlex Elder 	snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
2793b8b1e2dbSAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2794b8b1e2dbSAlex Elder 				"rbd", "get_snapshot_name",
2795b8b1e2dbSAlex Elder 				(char *) &snap_id, sizeof (snap_id),
2796b8b1e2dbSAlex Elder 				reply_buf, size,
2797b8b1e2dbSAlex Elder 				CEPH_OSD_FLAG_READ, NULL);
2798b8b1e2dbSAlex Elder 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2799b8b1e2dbSAlex Elder 	if (ret < 0)
2800b8b1e2dbSAlex Elder 		goto out;
2801b8b1e2dbSAlex Elder 
2802b8b1e2dbSAlex Elder 	p = reply_buf;
2803b8b1e2dbSAlex Elder 	end = (char *) reply_buf + size;
2804e5c35534SAlex Elder 	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
2805b8b1e2dbSAlex Elder 	if (IS_ERR(snap_name)) {
2806b8b1e2dbSAlex Elder 		ret = PTR_ERR(snap_name);
2807b8b1e2dbSAlex Elder 		goto out;
2808b8b1e2dbSAlex Elder 	} else {
2809b8b1e2dbSAlex Elder 		dout("  snap_id 0x%016llx snap_name = %s\n",
2810b8b1e2dbSAlex Elder 			(unsigned long long) le64_to_cpu(snap_id), snap_name);
2811b8b1e2dbSAlex Elder 	}
2812b8b1e2dbSAlex Elder 	kfree(reply_buf);
2813b8b1e2dbSAlex Elder 
2814b8b1e2dbSAlex Elder 	return snap_name;
2815b8b1e2dbSAlex Elder out:
2816b8b1e2dbSAlex Elder 	kfree(reply_buf);
2817b8b1e2dbSAlex Elder 
2818b8b1e2dbSAlex Elder 	return ERR_PTR(ret);
2819b8b1e2dbSAlex Elder }
2820b8b1e2dbSAlex Elder 
2821b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
2822b8b1e2dbSAlex Elder 		u64 *snap_size, u64 *snap_features)
2823b8b1e2dbSAlex Elder {
2824b8b1e2dbSAlex Elder 	__le64 snap_id;
2825b8b1e2dbSAlex Elder 	u8 order;
2826b8b1e2dbSAlex Elder 	int ret;
2827b8b1e2dbSAlex Elder 
2828b8b1e2dbSAlex Elder 	snap_id = rbd_dev->header.snapc->snaps[which];
2829b8b1e2dbSAlex Elder 	ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
2830b8b1e2dbSAlex Elder 	if (ret)
2831b8b1e2dbSAlex Elder 		return ERR_PTR(ret);
2832b8b1e2dbSAlex Elder 	ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
2833b8b1e2dbSAlex Elder 	if (ret)
2834b8b1e2dbSAlex Elder 		return ERR_PTR(ret);
2835b8b1e2dbSAlex Elder 
2836b8b1e2dbSAlex Elder 	return rbd_dev_v2_snap_name(rbd_dev, which);
2837b8b1e2dbSAlex Elder }
2838b8b1e2dbSAlex Elder 
2839b8b1e2dbSAlex Elder static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
2840b8b1e2dbSAlex Elder 		u64 *snap_size, u64 *snap_features)
2841b8b1e2dbSAlex Elder {
2842b8b1e2dbSAlex Elder 	if (rbd_dev->image_format == 1)
2843b8b1e2dbSAlex Elder 		return rbd_dev_v1_snap_info(rbd_dev, which,
2844b8b1e2dbSAlex Elder 					snap_size, snap_features);
2845b8b1e2dbSAlex Elder 	if (rbd_dev->image_format == 2)
2846b8b1e2dbSAlex Elder 		return rbd_dev_v2_snap_info(rbd_dev, which,
2847b8b1e2dbSAlex Elder 					snap_size, snap_features);
2848b8b1e2dbSAlex Elder 	return ERR_PTR(-EINVAL);
2849b8b1e2dbSAlex Elder }
2850b8b1e2dbSAlex Elder 
2851117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
2852117973fbSAlex Elder {
2853117973fbSAlex Elder 	int ret;
2854117973fbSAlex Elder 	__u8 obj_order;
2855117973fbSAlex Elder 
2856117973fbSAlex Elder 	down_write(&rbd_dev->header_rwsem);
2857117973fbSAlex Elder 
2858117973fbSAlex Elder 	/* Grab old order first, to see if it changes */
2859117973fbSAlex Elder 
2860117973fbSAlex Elder 	obj_order = rbd_dev->header.obj_order,
2861117973fbSAlex Elder 	ret = rbd_dev_v2_image_size(rbd_dev);
2862117973fbSAlex Elder 	if (ret)
2863117973fbSAlex Elder 		goto out;
2864117973fbSAlex Elder 	if (rbd_dev->header.obj_order != obj_order) {
2865117973fbSAlex Elder 		ret = -EIO;
2866117973fbSAlex Elder 		goto out;
2867117973fbSAlex Elder 	}
2868117973fbSAlex Elder 	rbd_update_mapping_size(rbd_dev);
2869117973fbSAlex Elder 
2870117973fbSAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev, hver);
2871117973fbSAlex Elder 	dout("rbd_dev_v2_snap_context returned %d\n", ret);
2872117973fbSAlex Elder 	if (ret)
2873117973fbSAlex Elder 		goto out;
2874117973fbSAlex Elder 	ret = rbd_dev_snaps_update(rbd_dev);
2875117973fbSAlex Elder 	dout("rbd_dev_snaps_update returned %d\n", ret);
2876117973fbSAlex Elder 	if (ret)
2877117973fbSAlex Elder 		goto out;
2878117973fbSAlex Elder 	ret = rbd_dev_snaps_register(rbd_dev);
2879117973fbSAlex Elder 	dout("rbd_dev_snaps_register returned %d\n", ret);
2880117973fbSAlex Elder out:
2881117973fbSAlex Elder 	up_write(&rbd_dev->header_rwsem);
2882117973fbSAlex Elder 
2883117973fbSAlex Elder 	return ret;
2884117973fbSAlex Elder }
2885117973fbSAlex Elder 
28869d475de5SAlex Elder /*
288735938150SAlex Elder  * Scan the rbd device's current snapshot list and compare it to the
288835938150SAlex Elder  * newly-received snapshot context.  Remove any existing snapshots
288935938150SAlex Elder  * not present in the new snapshot context.  Add a new snapshot for
289035938150SAlex Elder  * any snaphots in the snapshot context not in the current list.
289135938150SAlex Elder  * And verify there are no changes to snapshots we already know
289235938150SAlex Elder  * about.
289335938150SAlex Elder  *
289435938150SAlex Elder  * Assumes the snapshots in the snapshot context are sorted by
289535938150SAlex Elder  * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
289635938150SAlex Elder  * are also maintained in that order.)
2897dfc5606dSYehuda Sadeh  */
2898304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
2899dfc5606dSYehuda Sadeh {
290035938150SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
290135938150SAlex Elder 	const u32 snap_count = snapc->num_snaps;
290235938150SAlex Elder 	struct list_head *head = &rbd_dev->snaps;
290335938150SAlex Elder 	struct list_head *links = head->next;
290435938150SAlex Elder 	u32 index = 0;
2905dfc5606dSYehuda Sadeh 
29069fcbb800SAlex Elder 	dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
290735938150SAlex Elder 	while (index < snap_count || links != head) {
290835938150SAlex Elder 		u64 snap_id;
290935938150SAlex Elder 		struct rbd_snap *snap;
2910cd892126SAlex Elder 		char *snap_name;
2911cd892126SAlex Elder 		u64 snap_size = 0;
2912cd892126SAlex Elder 		u64 snap_features = 0;
2913dfc5606dSYehuda Sadeh 
291435938150SAlex Elder 		snap_id = index < snap_count ? snapc->snaps[index]
291535938150SAlex Elder 					     : CEPH_NOSNAP;
291635938150SAlex Elder 		snap = links != head ? list_entry(links, struct rbd_snap, node)
291735938150SAlex Elder 				     : NULL;
2918aafb230eSAlex Elder 		rbd_assert(!snap || snap->id != CEPH_NOSNAP);
2919dfc5606dSYehuda Sadeh 
292035938150SAlex Elder 		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
292135938150SAlex Elder 			struct list_head *next = links->next;
2922dfc5606dSYehuda Sadeh 
292335938150SAlex Elder 			/* Existing snapshot not in the new snap context */
2924dfc5606dSYehuda Sadeh 
29250d7dbfceSAlex Elder 			if (rbd_dev->spec->snap_id == snap->id)
2926d78b650aSAlex Elder 				atomic_set(&rbd_dev->exists, 0);
292741f38c2bSAlex Elder 			rbd_remove_snap_dev(snap);
29289fcbb800SAlex Elder 			dout("%ssnap id %llu has been removed\n",
29290d7dbfceSAlex Elder 				rbd_dev->spec->snap_id == snap->id ?
29300d7dbfceSAlex Elder 							"mapped " : "",
29319fcbb800SAlex Elder 				(unsigned long long) snap->id);
2932dfc5606dSYehuda Sadeh 
293335938150SAlex Elder 			/* Done with this list entry; advance */
293435938150SAlex Elder 
293535938150SAlex Elder 			links = next;
293635938150SAlex Elder 			continue;
2937dfc5606dSYehuda Sadeh 		}
293835938150SAlex Elder 
2939b8b1e2dbSAlex Elder 		snap_name = rbd_dev_snap_info(rbd_dev, index,
2940cd892126SAlex Elder 					&snap_size, &snap_features);
2941cd892126SAlex Elder 		if (IS_ERR(snap_name))
2942cd892126SAlex Elder 			return PTR_ERR(snap_name);
2943cd892126SAlex Elder 
29449fcbb800SAlex Elder 		dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
29459fcbb800SAlex Elder 			(unsigned long long) snap_id);
294635938150SAlex Elder 		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
294735938150SAlex Elder 			struct rbd_snap *new_snap;
294835938150SAlex Elder 
294935938150SAlex Elder 			/* We haven't seen this snapshot before */
295035938150SAlex Elder 
2951c8d18425SAlex Elder 			new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
2952cd892126SAlex Elder 					snap_id, snap_size, snap_features);
29539fcbb800SAlex Elder 			if (IS_ERR(new_snap)) {
29549fcbb800SAlex Elder 				int err = PTR_ERR(new_snap);
29559fcbb800SAlex Elder 
29569fcbb800SAlex Elder 				dout("  failed to add dev, error %d\n", err);
29579fcbb800SAlex Elder 
29589fcbb800SAlex Elder 				return err;
29599fcbb800SAlex Elder 			}
296035938150SAlex Elder 
296135938150SAlex Elder 			/* New goes before existing, or at end of list */
296235938150SAlex Elder 
29639fcbb800SAlex Elder 			dout("  added dev%s\n", snap ? "" : " at end\n");
296435938150SAlex Elder 			if (snap)
296535938150SAlex Elder 				list_add_tail(&new_snap->node, &snap->node);
296635938150SAlex Elder 			else
2967523f3258SAlex Elder 				list_add_tail(&new_snap->node, head);
296835938150SAlex Elder 		} else {
296935938150SAlex Elder 			/* Already have this one */
297035938150SAlex Elder 
29719fcbb800SAlex Elder 			dout("  already present\n");
29729fcbb800SAlex Elder 
2973cd892126SAlex Elder 			rbd_assert(snap->size == snap_size);
2974aafb230eSAlex Elder 			rbd_assert(!strcmp(snap->name, snap_name));
2975cd892126SAlex Elder 			rbd_assert(snap->features == snap_features);
297635938150SAlex Elder 
297735938150SAlex Elder 			/* Done with this list entry; advance */
297835938150SAlex Elder 
297935938150SAlex Elder 			links = links->next;
2980dfc5606dSYehuda Sadeh 		}
298135938150SAlex Elder 
298235938150SAlex Elder 		/* Advance to the next entry in the snapshot context */
298335938150SAlex Elder 
298435938150SAlex Elder 		index++;
2985dfc5606dSYehuda Sadeh 	}
29869fcbb800SAlex Elder 	dout("%s: done\n", __func__);
2987dfc5606dSYehuda Sadeh 
2988dfc5606dSYehuda Sadeh 	return 0;
2989dfc5606dSYehuda Sadeh }
2990dfc5606dSYehuda Sadeh 
2991304f6808SAlex Elder /*
2992304f6808SAlex Elder  * Scan the list of snapshots and register the devices for any that
2993304f6808SAlex Elder  * have not already been registered.
2994304f6808SAlex Elder  */
2995304f6808SAlex Elder static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
2996304f6808SAlex Elder {
2997304f6808SAlex Elder 	struct rbd_snap *snap;
2998304f6808SAlex Elder 	int ret = 0;
2999304f6808SAlex Elder 
3000304f6808SAlex Elder 	dout("%s called\n", __func__);
300186ff77bbSAlex Elder 	if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
300286ff77bbSAlex Elder 		return -EIO;
3003304f6808SAlex Elder 
3004304f6808SAlex Elder 	list_for_each_entry(snap, &rbd_dev->snaps, node) {
3005304f6808SAlex Elder 		if (!rbd_snap_registered(snap)) {
3006304f6808SAlex Elder 			ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
3007304f6808SAlex Elder 			if (ret < 0)
3008304f6808SAlex Elder 				break;
3009304f6808SAlex Elder 		}
3010304f6808SAlex Elder 	}
3011304f6808SAlex Elder 	dout("%s: returning %d\n", __func__, ret);
3012304f6808SAlex Elder 
3013304f6808SAlex Elder 	return ret;
3014304f6808SAlex Elder }
3015304f6808SAlex Elder 
3016dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
3017dfc5606dSYehuda Sadeh {
3018dfc5606dSYehuda Sadeh 	struct device *dev;
3019cd789ab9SAlex Elder 	int ret;
3020dfc5606dSYehuda Sadeh 
3021dfc5606dSYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3022dfc5606dSYehuda Sadeh 
3023cd789ab9SAlex Elder 	dev = &rbd_dev->dev;
3024dfc5606dSYehuda Sadeh 	dev->bus = &rbd_bus_type;
3025dfc5606dSYehuda Sadeh 	dev->type = &rbd_device_type;
3026dfc5606dSYehuda Sadeh 	dev->parent = &rbd_root_dev;
3027dfc5606dSYehuda Sadeh 	dev->release = rbd_dev_release;
3028de71a297SAlex Elder 	dev_set_name(dev, "%d", rbd_dev->dev_id);
3029dfc5606dSYehuda Sadeh 	ret = device_register(dev);
3030dfc5606dSYehuda Sadeh 
3031dfc5606dSYehuda Sadeh 	mutex_unlock(&ctl_mutex);
3032cd789ab9SAlex Elder 
3033dfc5606dSYehuda Sadeh 	return ret;
3034602adf40SYehuda Sadeh }
3035602adf40SYehuda Sadeh 
3036dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
3037dfc5606dSYehuda Sadeh {
3038dfc5606dSYehuda Sadeh 	device_unregister(&rbd_dev->dev);
3039dfc5606dSYehuda Sadeh }
3040dfc5606dSYehuda Sadeh 
304159c2be1eSYehuda Sadeh static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
304259c2be1eSYehuda Sadeh {
304359c2be1eSYehuda Sadeh 	int ret, rc;
304459c2be1eSYehuda Sadeh 
304559c2be1eSYehuda Sadeh 	do {
30460e6f322dSAlex Elder 		ret = rbd_req_sync_watch(rbd_dev);
304759c2be1eSYehuda Sadeh 		if (ret == -ERANGE) {
3048117973fbSAlex Elder 			rc = rbd_dev_refresh(rbd_dev, NULL);
304959c2be1eSYehuda Sadeh 			if (rc < 0)
305059c2be1eSYehuda Sadeh 				return rc;
305159c2be1eSYehuda Sadeh 		}
305259c2be1eSYehuda Sadeh 	} while (ret == -ERANGE);
305359c2be1eSYehuda Sadeh 
305459c2be1eSYehuda Sadeh 	return ret;
305559c2be1eSYehuda Sadeh }
305659c2be1eSYehuda Sadeh 
3057e2839308SAlex Elder static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
30581ddbe94eSAlex Elder 
30591ddbe94eSAlex Elder /*
3060499afd5bSAlex Elder  * Get a unique rbd identifier for the given new rbd_dev, and add
3061499afd5bSAlex Elder  * the rbd_dev to the global list.  The minimum rbd id is 1.
30621ddbe94eSAlex Elder  */
3063e2839308SAlex Elder static void rbd_dev_id_get(struct rbd_device *rbd_dev)
3064b7f23c36SAlex Elder {
3065e2839308SAlex Elder 	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
3066499afd5bSAlex Elder 
3067499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
3068499afd5bSAlex Elder 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
3069499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
3070e2839308SAlex Elder 	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
3071e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
3072b7f23c36SAlex Elder }
3073b7f23c36SAlex Elder 
30741ddbe94eSAlex Elder /*
3075499afd5bSAlex Elder  * Remove an rbd_dev from the global list, and record that its
3076499afd5bSAlex Elder  * identifier is no longer in use.
30771ddbe94eSAlex Elder  */
3078e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev)
30791ddbe94eSAlex Elder {
3080d184f6bfSAlex Elder 	struct list_head *tmp;
3081de71a297SAlex Elder 	int rbd_id = rbd_dev->dev_id;
3082d184f6bfSAlex Elder 	int max_id;
3083d184f6bfSAlex Elder 
3084aafb230eSAlex Elder 	rbd_assert(rbd_id > 0);
3085499afd5bSAlex Elder 
3086e2839308SAlex Elder 	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
3087e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
3088499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
3089499afd5bSAlex Elder 	list_del_init(&rbd_dev->node);
3090d184f6bfSAlex Elder 
3091d184f6bfSAlex Elder 	/*
3092d184f6bfSAlex Elder 	 * If the id being "put" is not the current maximum, there
3093d184f6bfSAlex Elder 	 * is nothing special we need to do.
3094d184f6bfSAlex Elder 	 */
3095e2839308SAlex Elder 	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
3096d184f6bfSAlex Elder 		spin_unlock(&rbd_dev_list_lock);
3097d184f6bfSAlex Elder 		return;
3098d184f6bfSAlex Elder 	}
3099d184f6bfSAlex Elder 
3100d184f6bfSAlex Elder 	/*
3101d184f6bfSAlex Elder 	 * We need to update the current maximum id.  Search the
3102d184f6bfSAlex Elder 	 * list to find out what it is.  We're more likely to find
3103d184f6bfSAlex Elder 	 * the maximum at the end, so search the list backward.
3104d184f6bfSAlex Elder 	 */
3105d184f6bfSAlex Elder 	max_id = 0;
3106d184f6bfSAlex Elder 	list_for_each_prev(tmp, &rbd_dev_list) {
3107d184f6bfSAlex Elder 		struct rbd_device *rbd_dev;
3108d184f6bfSAlex Elder 
3109d184f6bfSAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
3110b213e0b1SAlex Elder 		if (rbd_dev->dev_id > max_id)
3111b213e0b1SAlex Elder 			max_id = rbd_dev->dev_id;
3112d184f6bfSAlex Elder 	}
3113499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
31141ddbe94eSAlex Elder 
31151ddbe94eSAlex Elder 	/*
3116e2839308SAlex Elder 	 * The max id could have been updated by rbd_dev_id_get(), in
3117d184f6bfSAlex Elder 	 * which case it now accurately reflects the new maximum.
3118d184f6bfSAlex Elder 	 * Be careful not to overwrite the maximum value in that
3119d184f6bfSAlex Elder 	 * case.
31201ddbe94eSAlex Elder 	 */
3121e2839308SAlex Elder 	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
3122e2839308SAlex Elder 	dout("  max dev id has been reset\n");
3123b7f23c36SAlex Elder }
3124b7f23c36SAlex Elder 
/*
 * Advance *buf past any leading white space so it points at the
 * first non-space character (or at the terminating '\0' if there is
 * none), and return the length of the token -- the run of non-white
 * space characters -- that starts there.  The string at *buf must
 * be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the "C"
	 * and "POSIX" locales.
	 */
	static const char whitespace[] = " \f\n\r\t\v";
	const char *start = *buf + strspn(*buf, whitespace);

	*buf = start;			/* Start of token */

	return strcspn(start, whitespace);	/* Token length */
}
3143e28fff26SAlex Elder 
/*
 * Locate the next token in *buf and, provided it fits, copy it into
 * the caller's token buffer followed by a terminating '\0'.  The
 * string at *buf must be terminated with '\0' on entry.
 *
 * Returns the token's length (the '\0' is not counted).  A return of
 * 0 means no token was found; a return >= token_size means the token
 * did not fit and nothing was copied.
 *
 * In every case *buf ends up pointing just past the token found,
 * including when the token was too large to copy.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	const size_t token_len = next_token(buf);
	const char *start = *buf;

	*buf = start + token_len;
	if (token_len >= token_size)
		return token_len;	/* Won't fit; leave token alone */

	memcpy(token, start, token_len);
	token[token_len] = '\0';

	return token_len;
}
3173e28fff26SAlex Elder 
3174e28fff26SAlex Elder /*
3175ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
3176ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
3177ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
3178ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
3179ea3352f4SAlex Elder  *
3180ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
3181ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
3182ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
3183ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
3184ea3352f4SAlex Elder  *
3185ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
3186ea3352f4SAlex Elder  * the end of the found token.
3187ea3352f4SAlex Elder  *
3188ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
3189ea3352f4SAlex Elder  */
3190ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
3191ea3352f4SAlex Elder {
3192ea3352f4SAlex Elder 	char *dup;
3193ea3352f4SAlex Elder 	size_t len;
3194ea3352f4SAlex Elder 
3195ea3352f4SAlex Elder 	len = next_token(buf);
31964caf35f9SAlex Elder 	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
3197ea3352f4SAlex Elder 	if (!dup)
3198ea3352f4SAlex Elder 		return NULL;
3199ea3352f4SAlex Elder 	*(dup + len) = '\0';
3200ea3352f4SAlex Elder 	*buf += len;
3201ea3352f4SAlex Elder 
3202ea3352f4SAlex Elder 	if (lenp)
3203ea3352f4SAlex Elder 		*lenp = len;
3204ea3352f4SAlex Elder 
3205ea3352f4SAlex Elder 	return dup;
3206ea3352f4SAlex Elder }
3207ea3352f4SAlex Elder 
3208ea3352f4SAlex Elder /*
3209859c31dfSAlex Elder  * Parse the options provided for an "rbd add" (i.e., rbd image
3210859c31dfSAlex Elder  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
3211859c31dfSAlex Elder  * and the data written is passed here via a NUL-terminated buffer.
3212859c31dfSAlex Elder  * Returns 0 if successful or an error code otherwise.
3213d22f76e7SAlex Elder  *
3214859c31dfSAlex Elder  * The information extracted from these options is recorded in
3215859c31dfSAlex Elder  * the other parameters which return dynamically-allocated
3216859c31dfSAlex Elder  * structures:
3217859c31dfSAlex Elder  *  ceph_opts
3218859c31dfSAlex Elder  *      The address of a pointer that will refer to a ceph options
3219859c31dfSAlex Elder  *      structure.  Caller must release the returned pointer using
3220859c31dfSAlex Elder  *      ceph_destroy_options() when it is no longer needed.
3221859c31dfSAlex Elder  *  rbd_opts
3222859c31dfSAlex Elder  *	Address of an rbd options pointer.  Fully initialized by
3223859c31dfSAlex Elder  *	this function; caller must release with kfree().
3224859c31dfSAlex Elder  *  spec
3225859c31dfSAlex Elder  *	Address of an rbd image specification pointer.  Fully
3226859c31dfSAlex Elder  *	initialized by this function based on parsed options.
3227859c31dfSAlex Elder  *	Caller must release with rbd_spec_put().
3228859c31dfSAlex Elder  *
3229859c31dfSAlex Elder  * The options passed take this form:
3230859c31dfSAlex Elder  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
3231859c31dfSAlex Elder  * where:
3232859c31dfSAlex Elder  *  <mon_addrs>
3233859c31dfSAlex Elder  *      A comma-separated list of one or more monitor addresses.
3234859c31dfSAlex Elder  *      A monitor address is an ip address, optionally followed
3235859c31dfSAlex Elder  *      by a port number (separated by a colon).
3236859c31dfSAlex Elder  *        I.e.:  ip1[:port1][,ip2[:port2]...]
3237859c31dfSAlex Elder  *  <options>
3238859c31dfSAlex Elder  *      A comma-separated list of ceph and/or rbd options.
3239859c31dfSAlex Elder  *  <pool_name>
3240859c31dfSAlex Elder  *      The name of the rados pool containing the rbd image.
3241859c31dfSAlex Elder  *  <image_name>
3242859c31dfSAlex Elder  *      The name of the image in that pool to map.
3243859c31dfSAlex Elder  *  <snap_id>
3244859c31dfSAlex Elder  *      An optional snapshot id.  If provided, the mapping will
3245859c31dfSAlex Elder  *      present data from the image at the time that snapshot was
3246859c31dfSAlex Elder  *      created.  The image head is used if no snapshot id is
3247859c31dfSAlex Elder  *      provided.  Snapshot mappings are always read-only.
3248a725f65eSAlex Elder  */
3249859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf,
3250dc79b113SAlex Elder 				struct ceph_options **ceph_opts,
3251859c31dfSAlex Elder 				struct rbd_options **opts,
3252859c31dfSAlex Elder 				struct rbd_spec **rbd_spec)
3253a725f65eSAlex Elder {
3254e28fff26SAlex Elder 	size_t len;
3255859c31dfSAlex Elder 	char *options;
32560ddebc0cSAlex Elder 	const char *mon_addrs;
32570ddebc0cSAlex Elder 	size_t mon_addrs_size;
3258859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
32594e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
3260859c31dfSAlex Elder 	struct ceph_options *copts;
3261dc79b113SAlex Elder 	int ret;
3262e28fff26SAlex Elder 
3263e28fff26SAlex Elder 	/* The first four tokens are required */
3264e28fff26SAlex Elder 
32657ef3214aSAlex Elder 	len = next_token(&buf);
32664fb5d671SAlex Elder 	if (!len) {
32674fb5d671SAlex Elder 		rbd_warn(NULL, "no monitor address(es) provided");
32684fb5d671SAlex Elder 		return -EINVAL;
32694fb5d671SAlex Elder 	}
32700ddebc0cSAlex Elder 	mon_addrs = buf;
3271f28e565aSAlex Elder 	mon_addrs_size = len + 1;
32727ef3214aSAlex Elder 	buf += len;
3273a725f65eSAlex Elder 
3274dc79b113SAlex Elder 	ret = -EINVAL;
3275f28e565aSAlex Elder 	options = dup_token(&buf, NULL);
3276f28e565aSAlex Elder 	if (!options)
3277dc79b113SAlex Elder 		return -ENOMEM;
32784fb5d671SAlex Elder 	if (!*options) {
32794fb5d671SAlex Elder 		rbd_warn(NULL, "no options provided");
32804fb5d671SAlex Elder 		goto out_err;
32814fb5d671SAlex Elder 	}
3282a725f65eSAlex Elder 
3283859c31dfSAlex Elder 	spec = rbd_spec_alloc();
3284859c31dfSAlex Elder 	if (!spec)
3285f28e565aSAlex Elder 		goto out_mem;
3286859c31dfSAlex Elder 
3287859c31dfSAlex Elder 	spec->pool_name = dup_token(&buf, NULL);
3288859c31dfSAlex Elder 	if (!spec->pool_name)
3289859c31dfSAlex Elder 		goto out_mem;
32904fb5d671SAlex Elder 	if (!*spec->pool_name) {
32914fb5d671SAlex Elder 		rbd_warn(NULL, "no pool name provided");
32924fb5d671SAlex Elder 		goto out_err;
32934fb5d671SAlex Elder 	}
3294e28fff26SAlex Elder 
329569e7a02fSAlex Elder 	spec->image_name = dup_token(&buf, NULL);
3296859c31dfSAlex Elder 	if (!spec->image_name)
3297f28e565aSAlex Elder 		goto out_mem;
32984fb5d671SAlex Elder 	if (!*spec->image_name) {
32994fb5d671SAlex Elder 		rbd_warn(NULL, "no image name provided");
33004fb5d671SAlex Elder 		goto out_err;
33014fb5d671SAlex Elder 	}
3302e28fff26SAlex Elder 
3303f28e565aSAlex Elder 	/*
3304f28e565aSAlex Elder 	 * Snapshot name is optional; default is to use "-"
3305f28e565aSAlex Elder 	 * (indicating the head/no snapshot).
3306f28e565aSAlex Elder 	 */
33073feeb894SAlex Elder 	len = next_token(&buf);
3308820a5f3eSAlex Elder 	if (!len) {
33093feeb894SAlex Elder 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
33103feeb894SAlex Elder 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
3311f28e565aSAlex Elder 	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
3312dc79b113SAlex Elder 		ret = -ENAMETOOLONG;
3313f28e565aSAlex Elder 		goto out_err;
3314849b4260SAlex Elder 	}
33154caf35f9SAlex Elder 	spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
3316859c31dfSAlex Elder 	if (!spec->snap_name)
3317f28e565aSAlex Elder 		goto out_mem;
3318859c31dfSAlex Elder 	*(spec->snap_name + len) = '\0';
3319e5c35534SAlex Elder 
33200ddebc0cSAlex Elder 	/* Initialize all rbd options to the defaults */
3321e28fff26SAlex Elder 
33224e9afebaSAlex Elder 	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
33234e9afebaSAlex Elder 	if (!rbd_opts)
33244e9afebaSAlex Elder 		goto out_mem;
33254e9afebaSAlex Elder 
33264e9afebaSAlex Elder 	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
3327d22f76e7SAlex Elder 
3328859c31dfSAlex Elder 	copts = ceph_parse_options(options, mon_addrs,
33290ddebc0cSAlex Elder 					mon_addrs + mon_addrs_size - 1,
33304e9afebaSAlex Elder 					parse_rbd_opts_token, rbd_opts);
3331859c31dfSAlex Elder 	if (IS_ERR(copts)) {
3332859c31dfSAlex Elder 		ret = PTR_ERR(copts);
3333dc79b113SAlex Elder 		goto out_err;
3334dc79b113SAlex Elder 	}
3335859c31dfSAlex Elder 	kfree(options);
3336859c31dfSAlex Elder 
3337859c31dfSAlex Elder 	*ceph_opts = copts;
33384e9afebaSAlex Elder 	*opts = rbd_opts;
3339859c31dfSAlex Elder 	*rbd_spec = spec;
33400ddebc0cSAlex Elder 
3341dc79b113SAlex Elder 	return 0;
3342f28e565aSAlex Elder out_mem:
3343dc79b113SAlex Elder 	ret = -ENOMEM;
3344d22f76e7SAlex Elder out_err:
3345859c31dfSAlex Elder 	kfree(rbd_opts);
3346859c31dfSAlex Elder 	rbd_spec_put(spec);
3347f28e565aSAlex Elder 	kfree(options);
3348d22f76e7SAlex Elder 
3349dc79b113SAlex Elder 	return ret;
3350a725f65eSAlex Elder }
3351a725f65eSAlex Elder 
3352589d30e0SAlex Elder /*
3353589d30e0SAlex Elder  * An rbd format 2 image has a unique identifier, distinct from the
3354589d30e0SAlex Elder  * name given to it by the user.  Internally, that identifier is
3355589d30e0SAlex Elder  * what's used to specify the names of objects related to the image.
3356589d30e0SAlex Elder  *
3357589d30e0SAlex Elder  * A special "rbd id" object is used to map an rbd image name to its
3358589d30e0SAlex Elder  * id.  If that object doesn't exist, then there is no v2 rbd image
3359589d30e0SAlex Elder  * with the supplied name.
3360589d30e0SAlex Elder  *
3361589d30e0SAlex Elder  * This function will record the given rbd_dev's image_id field if
3362589d30e0SAlex Elder  * it can be determined, and in that case will return 0.  If any
3363589d30e0SAlex Elder  * errors occur a negative errno will be returned and the rbd_dev's
3364589d30e0SAlex Elder  * image_id field will be unchanged (and should be NULL).
3365589d30e0SAlex Elder  */
3366589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev)
3367589d30e0SAlex Elder {
3368589d30e0SAlex Elder 	int ret;
3369589d30e0SAlex Elder 	size_t size;
3370589d30e0SAlex Elder 	char *object_name;
3371589d30e0SAlex Elder 	void *response;
3372589d30e0SAlex Elder 	void *p;
3373589d30e0SAlex Elder 
3374589d30e0SAlex Elder 	/*
33752c0d0a10SAlex Elder 	 * When probing a parent image, the image id is already
33762c0d0a10SAlex Elder 	 * known (and the image name likely is not).  There's no
33772c0d0a10SAlex Elder 	 * need to fetch the image id again in this case.
33782c0d0a10SAlex Elder 	 */
33792c0d0a10SAlex Elder 	if (rbd_dev->spec->image_id)
33802c0d0a10SAlex Elder 		return 0;
33812c0d0a10SAlex Elder 
33822c0d0a10SAlex Elder 	/*
3383589d30e0SAlex Elder 	 * First, see if the format 2 image id file exists, and if
3384589d30e0SAlex Elder 	 * so, get the image's persistent id from it.
3385589d30e0SAlex Elder 	 */
338669e7a02fSAlex Elder 	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
3387589d30e0SAlex Elder 	object_name = kmalloc(size, GFP_NOIO);
3388589d30e0SAlex Elder 	if (!object_name)
3389589d30e0SAlex Elder 		return -ENOMEM;
33900d7dbfceSAlex Elder 	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
3391589d30e0SAlex Elder 	dout("rbd id object name is %s\n", object_name);
3392589d30e0SAlex Elder 
3393589d30e0SAlex Elder 	/* Response will be an encoded string, which includes a length */
3394589d30e0SAlex Elder 
3395589d30e0SAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
3396589d30e0SAlex Elder 	response = kzalloc(size, GFP_NOIO);
3397589d30e0SAlex Elder 	if (!response) {
3398589d30e0SAlex Elder 		ret = -ENOMEM;
3399589d30e0SAlex Elder 		goto out;
3400589d30e0SAlex Elder 	}
3401589d30e0SAlex Elder 
3402589d30e0SAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, object_name,
3403589d30e0SAlex Elder 				"rbd", "get_id",
3404589d30e0SAlex Elder 				NULL, 0,
3405589d30e0SAlex Elder 				response, RBD_IMAGE_ID_LEN_MAX,
3406589d30e0SAlex Elder 				CEPH_OSD_FLAG_READ, NULL);
3407589d30e0SAlex Elder 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
3408589d30e0SAlex Elder 	if (ret < 0)
3409589d30e0SAlex Elder 		goto out;
3410a0ea3a40SAlex Elder 	ret = 0;    /* rbd_req_sync_exec() can return positive */
3411589d30e0SAlex Elder 
3412589d30e0SAlex Elder 	p = response;
34130d7dbfceSAlex Elder 	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
3414589d30e0SAlex Elder 						p + RBD_IMAGE_ID_LEN_MAX,
3415979ed480SAlex Elder 						NULL, GFP_NOIO);
34160d7dbfceSAlex Elder 	if (IS_ERR(rbd_dev->spec->image_id)) {
34170d7dbfceSAlex Elder 		ret = PTR_ERR(rbd_dev->spec->image_id);
34180d7dbfceSAlex Elder 		rbd_dev->spec->image_id = NULL;
3419589d30e0SAlex Elder 	} else {
34200d7dbfceSAlex Elder 		dout("image_id is %s\n", rbd_dev->spec->image_id);
3421589d30e0SAlex Elder 	}
3422589d30e0SAlex Elder out:
3423589d30e0SAlex Elder 	kfree(response);
3424589d30e0SAlex Elder 	kfree(object_name);
3425589d30e0SAlex Elder 
3426589d30e0SAlex Elder 	return ret;
3427589d30e0SAlex Elder }
3428589d30e0SAlex Elder 
3429a30b71b9SAlex Elder static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
3430a30b71b9SAlex Elder {
3431a30b71b9SAlex Elder 	int ret;
3432a30b71b9SAlex Elder 	size_t size;
3433a30b71b9SAlex Elder 
3434a30b71b9SAlex Elder 	/* Version 1 images have no id; empty string is used */
3435a30b71b9SAlex Elder 
34360d7dbfceSAlex Elder 	rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
34370d7dbfceSAlex Elder 	if (!rbd_dev->spec->image_id)
3438a30b71b9SAlex Elder 		return -ENOMEM;
3439a30b71b9SAlex Elder 
3440a30b71b9SAlex Elder 	/* Record the header object name for this rbd image. */
3441a30b71b9SAlex Elder 
344269e7a02fSAlex Elder 	size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
3443a30b71b9SAlex Elder 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3444a30b71b9SAlex Elder 	if (!rbd_dev->header_name) {
3445a30b71b9SAlex Elder 		ret = -ENOMEM;
3446a30b71b9SAlex Elder 		goto out_err;
3447a30b71b9SAlex Elder 	}
34480d7dbfceSAlex Elder 	sprintf(rbd_dev->header_name, "%s%s",
34490d7dbfceSAlex Elder 		rbd_dev->spec->image_name, RBD_SUFFIX);
3450a30b71b9SAlex Elder 
3451a30b71b9SAlex Elder 	/* Populate rbd image metadata */
3452a30b71b9SAlex Elder 
3453a30b71b9SAlex Elder 	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
3454a30b71b9SAlex Elder 	if (ret < 0)
3455a30b71b9SAlex Elder 		goto out_err;
345686b00e0dSAlex Elder 
345786b00e0dSAlex Elder 	/* Version 1 images have no parent (no layering) */
345886b00e0dSAlex Elder 
345986b00e0dSAlex Elder 	rbd_dev->parent_spec = NULL;
346086b00e0dSAlex Elder 	rbd_dev->parent_overlap = 0;
346186b00e0dSAlex Elder 
3462a30b71b9SAlex Elder 	rbd_dev->image_format = 1;
3463a30b71b9SAlex Elder 
3464a30b71b9SAlex Elder 	dout("discovered version 1 image, header name is %s\n",
3465a30b71b9SAlex Elder 		rbd_dev->header_name);
3466a30b71b9SAlex Elder 
3467a30b71b9SAlex Elder 	return 0;
3468a30b71b9SAlex Elder 
3469a30b71b9SAlex Elder out_err:
3470a30b71b9SAlex Elder 	kfree(rbd_dev->header_name);
3471a30b71b9SAlex Elder 	rbd_dev->header_name = NULL;
34720d7dbfceSAlex Elder 	kfree(rbd_dev->spec->image_id);
34730d7dbfceSAlex Elder 	rbd_dev->spec->image_id = NULL;
3474a30b71b9SAlex Elder 
3475a30b71b9SAlex Elder 	return ret;
3476a30b71b9SAlex Elder }
3477a30b71b9SAlex Elder 
3478a30b71b9SAlex Elder static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
3479a30b71b9SAlex Elder {
3480a30b71b9SAlex Elder 	size_t size;
34819d475de5SAlex Elder 	int ret;
34826e14b1a6SAlex Elder 	u64 ver = 0;
3483a30b71b9SAlex Elder 
3484a30b71b9SAlex Elder 	/*
3485a30b71b9SAlex Elder 	 * Image id was filled in by the caller.  Record the header
3486a30b71b9SAlex Elder 	 * object name for this rbd image.
3487a30b71b9SAlex Elder 	 */
3488979ed480SAlex Elder 	size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
3489a30b71b9SAlex Elder 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3490a30b71b9SAlex Elder 	if (!rbd_dev->header_name)
3491a30b71b9SAlex Elder 		return -ENOMEM;
3492a30b71b9SAlex Elder 	sprintf(rbd_dev->header_name, "%s%s",
34930d7dbfceSAlex Elder 			RBD_HEADER_PREFIX, rbd_dev->spec->image_id);
34949d475de5SAlex Elder 
34959d475de5SAlex Elder 	/* Get the size and object order for the image */
34969d475de5SAlex Elder 
34979d475de5SAlex Elder 	ret = rbd_dev_v2_image_size(rbd_dev);
34989d475de5SAlex Elder 	if (ret < 0)
34999d475de5SAlex Elder 		goto out_err;
35001e130199SAlex Elder 
35011e130199SAlex Elder 	/* Get the object prefix (a.k.a. block_name) for the image */
35021e130199SAlex Elder 
35031e130199SAlex Elder 	ret = rbd_dev_v2_object_prefix(rbd_dev);
35041e130199SAlex Elder 	if (ret < 0)
35051e130199SAlex Elder 		goto out_err;
3506b1b5402aSAlex Elder 
3507d889140cSAlex Elder 	/* Get the and check features for the image */
3508b1b5402aSAlex Elder 
3509b1b5402aSAlex Elder 	ret = rbd_dev_v2_features(rbd_dev);
3510b1b5402aSAlex Elder 	if (ret < 0)
3511b1b5402aSAlex Elder 		goto out_err;
351235d489f9SAlex Elder 
351386b00e0dSAlex Elder 	/* If the image supports layering, get the parent info */
351486b00e0dSAlex Elder 
351586b00e0dSAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
351686b00e0dSAlex Elder 		ret = rbd_dev_v2_parent_info(rbd_dev);
351786b00e0dSAlex Elder 		if (ret < 0)
351886b00e0dSAlex Elder 			goto out_err;
351986b00e0dSAlex Elder 	}
352086b00e0dSAlex Elder 
35216e14b1a6SAlex Elder 	/* crypto and compression type aren't (yet) supported for v2 images */
352235d489f9SAlex Elder 
35236e14b1a6SAlex Elder 	rbd_dev->header.crypt_type = 0;
35246e14b1a6SAlex Elder 	rbd_dev->header.comp_type = 0;
35256e14b1a6SAlex Elder 
35266e14b1a6SAlex Elder 	/* Get the snapshot context, plus the header version */
35276e14b1a6SAlex Elder 
35286e14b1a6SAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
352935d489f9SAlex Elder 	if (ret)
353035d489f9SAlex Elder 		goto out_err;
35316e14b1a6SAlex Elder 	rbd_dev->header.obj_version = ver;
35326e14b1a6SAlex Elder 
3533a30b71b9SAlex Elder 	rbd_dev->image_format = 2;
3534a30b71b9SAlex Elder 
3535a30b71b9SAlex Elder 	dout("discovered version 2 image, header name is %s\n",
3536a30b71b9SAlex Elder 		rbd_dev->header_name);
3537a30b71b9SAlex Elder 
353835152979SAlex Elder 	return 0;
35399d475de5SAlex Elder out_err:
354086b00e0dSAlex Elder 	rbd_dev->parent_overlap = 0;
354186b00e0dSAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
354286b00e0dSAlex Elder 	rbd_dev->parent_spec = NULL;
35439d475de5SAlex Elder 	kfree(rbd_dev->header_name);
35449d475de5SAlex Elder 	rbd_dev->header_name = NULL;
35451e130199SAlex Elder 	kfree(rbd_dev->header.object_prefix);
35461e130199SAlex Elder 	rbd_dev->header.object_prefix = NULL;
35479d475de5SAlex Elder 
35489d475de5SAlex Elder 	return ret;
3549a30b71b9SAlex Elder }
3550a30b71b9SAlex Elder 
355183a06263SAlex Elder static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
355283a06263SAlex Elder {
355383a06263SAlex Elder 	int ret;
355483a06263SAlex Elder 
355583a06263SAlex Elder 	/* no need to lock here, as rbd_dev is not registered yet */
355683a06263SAlex Elder 	ret = rbd_dev_snaps_update(rbd_dev);
355783a06263SAlex Elder 	if (ret)
355883a06263SAlex Elder 		return ret;
355983a06263SAlex Elder 
35609e15b77dSAlex Elder 	ret = rbd_dev_probe_update_spec(rbd_dev);
35619e15b77dSAlex Elder 	if (ret)
35629e15b77dSAlex Elder 		goto err_out_snaps;
35639e15b77dSAlex Elder 
356483a06263SAlex Elder 	ret = rbd_dev_set_mapping(rbd_dev);
356583a06263SAlex Elder 	if (ret)
356683a06263SAlex Elder 		goto err_out_snaps;
356783a06263SAlex Elder 
356883a06263SAlex Elder 	/* generate unique id: find highest unique id, add one */
356983a06263SAlex Elder 	rbd_dev_id_get(rbd_dev);
357083a06263SAlex Elder 
357183a06263SAlex Elder 	/* Fill in the device name, now that we have its id. */
357283a06263SAlex Elder 	BUILD_BUG_ON(DEV_NAME_LEN
357383a06263SAlex Elder 			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
357483a06263SAlex Elder 	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
357583a06263SAlex Elder 
357683a06263SAlex Elder 	/* Get our block major device number. */
357783a06263SAlex Elder 
357883a06263SAlex Elder 	ret = register_blkdev(0, rbd_dev->name);
357983a06263SAlex Elder 	if (ret < 0)
358083a06263SAlex Elder 		goto err_out_id;
358183a06263SAlex Elder 	rbd_dev->major = ret;
358283a06263SAlex Elder 
358383a06263SAlex Elder 	/* Set up the blkdev mapping. */
358483a06263SAlex Elder 
358583a06263SAlex Elder 	ret = rbd_init_disk(rbd_dev);
358683a06263SAlex Elder 	if (ret)
358783a06263SAlex Elder 		goto err_out_blkdev;
358883a06263SAlex Elder 
358983a06263SAlex Elder 	ret = rbd_bus_add_dev(rbd_dev);
359083a06263SAlex Elder 	if (ret)
359183a06263SAlex Elder 		goto err_out_disk;
359283a06263SAlex Elder 
359383a06263SAlex Elder 	/*
359483a06263SAlex Elder 	 * At this point cleanup in the event of an error is the job
359583a06263SAlex Elder 	 * of the sysfs code (initiated by rbd_bus_del_dev()).
359683a06263SAlex Elder 	 */
359783a06263SAlex Elder 	down_write(&rbd_dev->header_rwsem);
359883a06263SAlex Elder 	ret = rbd_dev_snaps_register(rbd_dev);
359983a06263SAlex Elder 	up_write(&rbd_dev->header_rwsem);
360083a06263SAlex Elder 	if (ret)
360183a06263SAlex Elder 		goto err_out_bus;
360283a06263SAlex Elder 
360383a06263SAlex Elder 	ret = rbd_init_watch_dev(rbd_dev);
360483a06263SAlex Elder 	if (ret)
360583a06263SAlex Elder 		goto err_out_bus;
360683a06263SAlex Elder 
360783a06263SAlex Elder 	/* Everything's ready.  Announce the disk to the world. */
360883a06263SAlex Elder 
360983a06263SAlex Elder 	add_disk(rbd_dev->disk);
361083a06263SAlex Elder 
361183a06263SAlex Elder 	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
361283a06263SAlex Elder 		(unsigned long long) rbd_dev->mapping.size);
361383a06263SAlex Elder 
361483a06263SAlex Elder 	return ret;
361583a06263SAlex Elder err_out_bus:
361683a06263SAlex Elder 	/* this will also clean up rest of rbd_dev stuff */
361783a06263SAlex Elder 
361883a06263SAlex Elder 	rbd_bus_del_dev(rbd_dev);
361983a06263SAlex Elder 
362083a06263SAlex Elder 	return ret;
362183a06263SAlex Elder err_out_disk:
362283a06263SAlex Elder 	rbd_free_disk(rbd_dev);
362383a06263SAlex Elder err_out_blkdev:
362483a06263SAlex Elder 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
362583a06263SAlex Elder err_out_id:
362683a06263SAlex Elder 	rbd_dev_id_put(rbd_dev);
362783a06263SAlex Elder err_out_snaps:
362883a06263SAlex Elder 	rbd_remove_all_snaps(rbd_dev);
362983a06263SAlex Elder 
363083a06263SAlex Elder 	return ret;
363183a06263SAlex Elder }
363283a06263SAlex Elder 
3633a30b71b9SAlex Elder /*
3634a30b71b9SAlex Elder  * Probe for the existence of the header object for the given rbd
3635a30b71b9SAlex Elder  * device.  For format 2 images this includes determining the image
3636a30b71b9SAlex Elder  * id.
3637a30b71b9SAlex Elder  */
3638a30b71b9SAlex Elder static int rbd_dev_probe(struct rbd_device *rbd_dev)
3639a30b71b9SAlex Elder {
3640a30b71b9SAlex Elder 	int ret;
3641a30b71b9SAlex Elder 
3642a30b71b9SAlex Elder 	/*
3643a30b71b9SAlex Elder 	 * Get the id from the image id object.  If it's not a
3644a30b71b9SAlex Elder 	 * format 2 image, we'll get ENOENT back, and we'll assume
3645a30b71b9SAlex Elder 	 * it's a format 1 image.
3646a30b71b9SAlex Elder 	 */
3647a30b71b9SAlex Elder 	ret = rbd_dev_image_id(rbd_dev);
3648a30b71b9SAlex Elder 	if (ret)
3649a30b71b9SAlex Elder 		ret = rbd_dev_v1_probe(rbd_dev);
3650a30b71b9SAlex Elder 	else
3651a30b71b9SAlex Elder 		ret = rbd_dev_v2_probe(rbd_dev);
365283a06263SAlex Elder 	if (ret) {
3653a30b71b9SAlex Elder 		dout("probe failed, returning %d\n", ret);
3654a30b71b9SAlex Elder 
3655a30b71b9SAlex Elder 		return ret;
3656a30b71b9SAlex Elder 	}
3657a30b71b9SAlex Elder 
365883a06263SAlex Elder 	ret = rbd_dev_probe_finish(rbd_dev);
365983a06263SAlex Elder 	if (ret)
366083a06263SAlex Elder 		rbd_header_free(&rbd_dev->header);
366183a06263SAlex Elder 
366283a06263SAlex Elder 	return ret;
366383a06263SAlex Elder }
366483a06263SAlex Elder 
366559c2be1eSYehuda Sadeh static ssize_t rbd_add(struct bus_type *bus,
366659c2be1eSYehuda Sadeh 		       const char *buf,
366759c2be1eSYehuda Sadeh 		       size_t count)
3668602adf40SYehuda Sadeh {
3669cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
3670dc79b113SAlex Elder 	struct ceph_options *ceph_opts = NULL;
36714e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
3672859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
36739d3997fdSAlex Elder 	struct rbd_client *rbdc;
367427cc2594SAlex Elder 	struct ceph_osd_client *osdc;
367527cc2594SAlex Elder 	int rc = -ENOMEM;
3676602adf40SYehuda Sadeh 
3677602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
3678602adf40SYehuda Sadeh 		return -ENODEV;
3679602adf40SYehuda Sadeh 
3680a725f65eSAlex Elder 	/* parse add command */
3681859c31dfSAlex Elder 	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
3682dc79b113SAlex Elder 	if (rc < 0)
3683bd4ba655SAlex Elder 		goto err_out_module;
3684a725f65eSAlex Elder 
36859d3997fdSAlex Elder 	rbdc = rbd_get_client(ceph_opts);
36869d3997fdSAlex Elder 	if (IS_ERR(rbdc)) {
36879d3997fdSAlex Elder 		rc = PTR_ERR(rbdc);
36880ddebc0cSAlex Elder 		goto err_out_args;
36899d3997fdSAlex Elder 	}
3690c53d5893SAlex Elder 	ceph_opts = NULL;	/* rbd_dev client now owns this */
3691602adf40SYehuda Sadeh 
3692602adf40SYehuda Sadeh 	/* pick the pool */
36939d3997fdSAlex Elder 	osdc = &rbdc->client->osdc;
3694859c31dfSAlex Elder 	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
3695602adf40SYehuda Sadeh 	if (rc < 0)
3696602adf40SYehuda Sadeh 		goto err_out_client;
3697859c31dfSAlex Elder 	spec->pool_id = (u64) rc;
3698859c31dfSAlex Elder 
3699c53d5893SAlex Elder 	rbd_dev = rbd_dev_create(rbdc, spec);
3700bd4ba655SAlex Elder 	if (!rbd_dev)
3701bd4ba655SAlex Elder 		goto err_out_client;
3702c53d5893SAlex Elder 	rbdc = NULL;		/* rbd_dev now owns this */
3703c53d5893SAlex Elder 	spec = NULL;		/* rbd_dev now owns this */
3704602adf40SYehuda Sadeh 
3705bd4ba655SAlex Elder 	rbd_dev->mapping.read_only = rbd_opts->read_only;
3706c53d5893SAlex Elder 	kfree(rbd_opts);
3707c53d5893SAlex Elder 	rbd_opts = NULL;	/* done with this */
3708bd4ba655SAlex Elder 
3709a30b71b9SAlex Elder 	rc = rbd_dev_probe(rbd_dev);
3710a30b71b9SAlex Elder 	if (rc < 0)
3711c53d5893SAlex Elder 		goto err_out_rbd_dev;
371205fd6f6fSAlex Elder 
3713602adf40SYehuda Sadeh 	return count;
3714c53d5893SAlex Elder err_out_rbd_dev:
3715c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
3716bd4ba655SAlex Elder err_out_client:
37179d3997fdSAlex Elder 	rbd_put_client(rbdc);
37180ddebc0cSAlex Elder err_out_args:
371978cea76eSAlex Elder 	if (ceph_opts)
372078cea76eSAlex Elder 		ceph_destroy_options(ceph_opts);
37214e9afebaSAlex Elder 	kfree(rbd_opts);
3722859c31dfSAlex Elder 	rbd_spec_put(spec);
3723bd4ba655SAlex Elder err_out_module:
3724bd4ba655SAlex Elder 	module_put(THIS_MODULE);
372527cc2594SAlex Elder 
3726602adf40SYehuda Sadeh 	dout("Error adding device %s\n", buf);
372727cc2594SAlex Elder 
372827cc2594SAlex Elder 	return (ssize_t) rc;
3729602adf40SYehuda Sadeh }
3730602adf40SYehuda Sadeh 
3731de71a297SAlex Elder static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
3732602adf40SYehuda Sadeh {
3733602adf40SYehuda Sadeh 	struct list_head *tmp;
3734602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev;
3735602adf40SYehuda Sadeh 
3736e124a82fSAlex Elder 	spin_lock(&rbd_dev_list_lock);
3737602adf40SYehuda Sadeh 	list_for_each(tmp, &rbd_dev_list) {
3738602adf40SYehuda Sadeh 		rbd_dev = list_entry(tmp, struct rbd_device, node);
3739de71a297SAlex Elder 		if (rbd_dev->dev_id == dev_id) {
3740e124a82fSAlex Elder 			spin_unlock(&rbd_dev_list_lock);
3741602adf40SYehuda Sadeh 			return rbd_dev;
3742602adf40SYehuda Sadeh 		}
3743e124a82fSAlex Elder 	}
3744e124a82fSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
3745602adf40SYehuda Sadeh 	return NULL;
3746602adf40SYehuda Sadeh }
3747602adf40SYehuda Sadeh 
3748dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev)
3749602adf40SYehuda Sadeh {
3750593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3751602adf40SYehuda Sadeh 
37521dbb4399SAlex Elder 	if (rbd_dev->watch_request) {
37531dbb4399SAlex Elder 		struct ceph_client *client = rbd_dev->rbd_client->client;
37541dbb4399SAlex Elder 
37551dbb4399SAlex Elder 		ceph_osdc_unregister_linger_request(&client->osdc,
375659c2be1eSYehuda Sadeh 						    rbd_dev->watch_request);
37571dbb4399SAlex Elder 	}
375859c2be1eSYehuda Sadeh 	if (rbd_dev->watch_event)
3759070c633fSAlex Elder 		rbd_req_sync_unwatch(rbd_dev);
376059c2be1eSYehuda Sadeh 
3761602adf40SYehuda Sadeh 
3762602adf40SYehuda Sadeh 	/* clean up and free blkdev */
3763602adf40SYehuda Sadeh 	rbd_free_disk(rbd_dev);
3764602adf40SYehuda Sadeh 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
376532eec68dSAlex Elder 
37662ac4e75dSAlex Elder 	/* release allocated disk header fields */
37672ac4e75dSAlex Elder 	rbd_header_free(&rbd_dev->header);
37682ac4e75dSAlex Elder 
376932eec68dSAlex Elder 	/* done with the id, and with the rbd_dev */
3770e2839308SAlex Elder 	rbd_dev_id_put(rbd_dev);
3771c53d5893SAlex Elder 	rbd_assert(rbd_dev->rbd_client != NULL);
3772c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
3773602adf40SYehuda Sadeh 
3774602adf40SYehuda Sadeh 	/* release module ref */
3775602adf40SYehuda Sadeh 	module_put(THIS_MODULE);
3776602adf40SYehuda Sadeh }
3777602adf40SYehuda Sadeh 
3778dfc5606dSYehuda Sadeh static ssize_t rbd_remove(struct bus_type *bus,
3779602adf40SYehuda Sadeh 			  const char *buf,
3780602adf40SYehuda Sadeh 			  size_t count)
3781602adf40SYehuda Sadeh {
3782602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
3783602adf40SYehuda Sadeh 	int target_id, rc;
3784602adf40SYehuda Sadeh 	unsigned long ul;
3785602adf40SYehuda Sadeh 	int ret = count;
3786602adf40SYehuda Sadeh 
3787602adf40SYehuda Sadeh 	rc = strict_strtoul(buf, 10, &ul);
3788602adf40SYehuda Sadeh 	if (rc)
3789602adf40SYehuda Sadeh 		return rc;
3790602adf40SYehuda Sadeh 
3791602adf40SYehuda Sadeh 	/* convert to int; abort if we lost anything in the conversion */
3792602adf40SYehuda Sadeh 	target_id = (int) ul;
3793602adf40SYehuda Sadeh 	if (target_id != ul)
3794602adf40SYehuda Sadeh 		return -EINVAL;
3795602adf40SYehuda Sadeh 
3796602adf40SYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3797602adf40SYehuda Sadeh 
3798602adf40SYehuda Sadeh 	rbd_dev = __rbd_get_dev(target_id);
3799602adf40SYehuda Sadeh 	if (!rbd_dev) {
3800602adf40SYehuda Sadeh 		ret = -ENOENT;
3801602adf40SYehuda Sadeh 		goto done;
3802602adf40SYehuda Sadeh 	}
3803602adf40SYehuda Sadeh 
380442382b70SAlex Elder 	if (rbd_dev->open_count) {
380542382b70SAlex Elder 		ret = -EBUSY;
380642382b70SAlex Elder 		goto done;
380742382b70SAlex Elder 	}
380842382b70SAlex Elder 
380941f38c2bSAlex Elder 	rbd_remove_all_snaps(rbd_dev);
3810dfc5606dSYehuda Sadeh 	rbd_bus_del_dev(rbd_dev);
3811602adf40SYehuda Sadeh 
3812602adf40SYehuda Sadeh done:
3813602adf40SYehuda Sadeh 	mutex_unlock(&ctl_mutex);
3814aafb230eSAlex Elder 
3815602adf40SYehuda Sadeh 	return ret;
3816602adf40SYehuda Sadeh }
3817602adf40SYehuda Sadeh 
3818602adf40SYehuda Sadeh /*
3819602adf40SYehuda Sadeh  * create control files in sysfs
3820dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
3821602adf40SYehuda Sadeh  */
3822602adf40SYehuda Sadeh static int rbd_sysfs_init(void)
3823602adf40SYehuda Sadeh {
3824dfc5606dSYehuda Sadeh 	int ret;
3825602adf40SYehuda Sadeh 
3826fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
3827dfc5606dSYehuda Sadeh 	if (ret < 0)
3828dfc5606dSYehuda Sadeh 		return ret;
3829602adf40SYehuda Sadeh 
3830fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
3831fed4c143SAlex Elder 	if (ret < 0)
3832fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
3833602adf40SYehuda Sadeh 
3834602adf40SYehuda Sadeh 	return ret;
3835602adf40SYehuda Sadeh }
3836602adf40SYehuda Sadeh 
3837602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void)
3838602adf40SYehuda Sadeh {
3839dfc5606dSYehuda Sadeh 	bus_unregister(&rbd_bus_type);
3840fed4c143SAlex Elder 	device_unregister(&rbd_root_dev);
3841602adf40SYehuda Sadeh }
3842602adf40SYehuda Sadeh 
3843602adf40SYehuda Sadeh int __init rbd_init(void)
3844602adf40SYehuda Sadeh {
3845602adf40SYehuda Sadeh 	int rc;
3846602adf40SYehuda Sadeh 
3847602adf40SYehuda Sadeh 	rc = rbd_sysfs_init();
3848602adf40SYehuda Sadeh 	if (rc)
3849602adf40SYehuda Sadeh 		return rc;
3850f0f8cef5SAlex Elder 	pr_info("loaded " RBD_DRV_NAME_LONG "\n");
3851602adf40SYehuda Sadeh 	return 0;
3852602adf40SYehuda Sadeh }
3853602adf40SYehuda Sadeh 
3854602adf40SYehuda Sadeh void __exit rbd_exit(void)
3855602adf40SYehuda Sadeh {
3856602adf40SYehuda Sadeh 	rbd_sysfs_cleanup();
3857602adf40SYehuda Sadeh }
3858602adf40SYehuda Sadeh 
3859602adf40SYehuda Sadeh module_init(rbd_init);
3860602adf40SYehuda Sadeh module_exit(rbd_exit);
3861602adf40SYehuda Sadeh 
3862602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
3863602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
3864602adf40SYehuda Sadeh MODULE_DESCRIPTION("rados block device");
3865602adf40SYehuda Sadeh 
3866602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */
3867602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
3868602adf40SYehuda Sadeh 
3869602adf40SYehuda Sadeh MODULE_LICENSE("GPL");
3870