xref: /openbmc/linux/drivers/block/rbd.c (revision 5efea49a)
1602adf40SYehuda Sadeh /*
2602adf40SYehuda Sadeh    rbd.c -- Export ceph rados objects as a Linux block device
3602adf40SYehuda Sadeh 
4602adf40SYehuda Sadeh 
5602adf40SYehuda Sadeh    based on drivers/block/osdblk.c:
6602adf40SYehuda Sadeh 
7602adf40SYehuda Sadeh    Copyright 2009 Red Hat, Inc.
8602adf40SYehuda Sadeh 
9602adf40SYehuda Sadeh    This program is free software; you can redistribute it and/or modify
10602adf40SYehuda Sadeh    it under the terms of the GNU General Public License as published by
11602adf40SYehuda Sadeh    the Free Software Foundation.
12602adf40SYehuda Sadeh 
13602adf40SYehuda Sadeh    This program is distributed in the hope that it will be useful,
14602adf40SYehuda Sadeh    but WITHOUT ANY WARRANTY; without even the implied warranty of
15602adf40SYehuda Sadeh    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16602adf40SYehuda Sadeh    GNU General Public License for more details.
17602adf40SYehuda Sadeh 
18602adf40SYehuda Sadeh    You should have received a copy of the GNU General Public License
19602adf40SYehuda Sadeh    along with this program; see the file COPYING.  If not, write to
20602adf40SYehuda Sadeh    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21602adf40SYehuda Sadeh 
22602adf40SYehuda Sadeh 
23602adf40SYehuda Sadeh 
24dfc5606dSYehuda Sadeh    For usage instructions, please refer to:
25602adf40SYehuda Sadeh 
26dfc5606dSYehuda Sadeh                  Documentation/ABI/testing/sysfs-bus-rbd
27602adf40SYehuda Sadeh 
28602adf40SYehuda Sadeh  */
29602adf40SYehuda Sadeh 
30602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
31602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
32602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3459c2be1eSYehuda Sadeh #include <linux/parser.h>
35602adf40SYehuda Sadeh 
36602adf40SYehuda Sadeh #include <linux/kernel.h>
37602adf40SYehuda Sadeh #include <linux/device.h>
38602adf40SYehuda Sadeh #include <linux/module.h>
39602adf40SYehuda Sadeh #include <linux/fs.h>
40602adf40SYehuda Sadeh #include <linux/blkdev.h>
41602adf40SYehuda Sadeh 
42602adf40SYehuda Sadeh #include "rbd_types.h"
43602adf40SYehuda Sadeh 
#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * Block I/O is measured in sectors.  Linux interprets the sector in
 * several contexts (blk, bio, genhd), but its default size is always
 * 512 bytes.  Named constants read a little better than the raw
 * numbers they stand for.
 */
#define SECTOR_SHIFT	9
#define SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/* Type maxima; these might be better off defined somewhere shared */

#define U8_MAX		((u8) (~0U))
#define U16_MAX		((u16) (~0U))
#define U32_MAX		((u32) (~0U))
#define U64_MAX		((u64) (~0ULL))

#define RBD_DRV_NAME		"rbd"
#define RBD_DRV_NAME_LONG	"rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

/* Room left for a snapshot name once the device name prefix is added */
#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */
#define RBD_MAX_OPT_LEN		1024

#define RBD_SNAP_HEAD_NAME	"-"

/* Sized so an image name sent by an OSD fits within a single page */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

/* Feature bits */

#define RBD_FEATURE_LAYERING      1

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_ALL          (0)

/*
 * Device names have the form "rbd#": RBD_DRV_NAME followed by a
 * unique integer identifier.  MAX_INT_FORMAT_WIDTH is used in
 * ensuring DEV_NAME_LEN is large enough for every possible name.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)

#define RBD_READ_ONLY_DEFAULT		false
10059c2be1eSYehuda Sadeh 
101602adf40SYehuda Sadeh /*
102602adf40SYehuda Sadeh  * block device image metadata (in-memory version)
103602adf40SYehuda Sadeh  */
104602adf40SYehuda Sadeh struct rbd_image_header {
105f84344f3SAlex Elder 	/* These four fields never change for a given rbd image */
106849b4260SAlex Elder 	char *object_prefix;
10734b13184SAlex Elder 	u64 features;
108602adf40SYehuda Sadeh 	__u8 obj_order;
109602adf40SYehuda Sadeh 	__u8 crypt_type;
110602adf40SYehuda Sadeh 	__u8 comp_type;
111602adf40SYehuda Sadeh 
112f84344f3SAlex Elder 	/* The remaining fields need to be updated occasionally */
113f84344f3SAlex Elder 	u64 image_size;
114f84344f3SAlex Elder 	struct ceph_snap_context *snapc;
115602adf40SYehuda Sadeh 	char *snap_names;
116602adf40SYehuda Sadeh 	u64 *snap_sizes;
11759c2be1eSYehuda Sadeh 
11859c2be1eSYehuda Sadeh 	u64 obj_version;
11959c2be1eSYehuda Sadeh };
12059c2be1eSYehuda Sadeh 
1210d7dbfceSAlex Elder /*
1220d7dbfceSAlex Elder  * An rbd image specification.
1230d7dbfceSAlex Elder  *
1240d7dbfceSAlex Elder  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
125c66c6e0cSAlex Elder  * identify an image.  Each rbd_dev structure includes a pointer to
126c66c6e0cSAlex Elder  * an rbd_spec structure that encapsulates this identity.
127c66c6e0cSAlex Elder  *
128c66c6e0cSAlex Elder  * Each of the id's in an rbd_spec has an associated name.  For a
129c66c6e0cSAlex Elder  * user-mapped image, the names are supplied and the id's associated
130c66c6e0cSAlex Elder  * with them are looked up.  For a layered image, a parent image is
131c66c6e0cSAlex Elder  * defined by the tuple, and the names are looked up.
132c66c6e0cSAlex Elder  *
133c66c6e0cSAlex Elder  * An rbd_dev structure contains a parent_spec pointer which is
134c66c6e0cSAlex Elder  * non-null if the image it represents is a child in a layered
135c66c6e0cSAlex Elder  * image.  This pointer will refer to the rbd_spec structure used
136c66c6e0cSAlex Elder  * by the parent rbd_dev for its own identity (i.e., the structure
137c66c6e0cSAlex Elder  * is shared between the parent and child).
138c66c6e0cSAlex Elder  *
139c66c6e0cSAlex Elder  * Since these structures are populated once, during the discovery
140c66c6e0cSAlex Elder  * phase of image construction, they are effectively immutable so
141c66c6e0cSAlex Elder  * we make no effort to synchronize access to them.
142c66c6e0cSAlex Elder  *
143c66c6e0cSAlex Elder  * Note that code herein does not assume the image name is known (it
144c66c6e0cSAlex Elder  * could be a null pointer).
1450d7dbfceSAlex Elder  */
1460d7dbfceSAlex Elder struct rbd_spec {
1470d7dbfceSAlex Elder 	u64		pool_id;
1480d7dbfceSAlex Elder 	char		*pool_name;
1490d7dbfceSAlex Elder 
1500d7dbfceSAlex Elder 	char		*image_id;
1510d7dbfceSAlex Elder 	char		*image_name;
1520d7dbfceSAlex Elder 
1530d7dbfceSAlex Elder 	u64		snap_id;
1540d7dbfceSAlex Elder 	char		*snap_name;
1550d7dbfceSAlex Elder 
1560d7dbfceSAlex Elder 	struct kref	kref;
1570d7dbfceSAlex Elder };
1580d7dbfceSAlex Elder 
/* Per-mapping options set through the rbd option tokens */
struct rbd_options {
	bool	read_only;	/* true for "read_only"/"ro" */
};
162602adf40SYehuda Sadeh 
163602adf40SYehuda Sadeh /*
164f0f8cef5SAlex Elder  * an instance of the client.  multiple devices may share an rbd client.
165602adf40SYehuda Sadeh  */
166602adf40SYehuda Sadeh struct rbd_client {
167602adf40SYehuda Sadeh 	struct ceph_client	*client;
168602adf40SYehuda Sadeh 	struct kref		kref;
169602adf40SYehuda Sadeh 	struct list_head	node;
170602adf40SYehuda Sadeh };
171602adf40SYehuda Sadeh 
172602adf40SYehuda Sadeh /*
173f0f8cef5SAlex Elder  * a request completion status
174602adf40SYehuda Sadeh  */
1751fec7093SYehuda Sadeh struct rbd_req_status {
1761fec7093SYehuda Sadeh 	int done;
1778986cb37SAlex Elder 	s32 rc;
1781fec7093SYehuda Sadeh 	u64 bytes;
1791fec7093SYehuda Sadeh };
1801fec7093SYehuda Sadeh 
1811fec7093SYehuda Sadeh /*
1821fec7093SYehuda Sadeh  * a collection of requests
1831fec7093SYehuda Sadeh  */
1841fec7093SYehuda Sadeh struct rbd_req_coll {
1851fec7093SYehuda Sadeh 	int			total;
1861fec7093SYehuda Sadeh 	int			num_done;
1871fec7093SYehuda Sadeh 	struct kref		kref;
1881fec7093SYehuda Sadeh 	struct rbd_req_status	status[0];
189602adf40SYehuda Sadeh };
190602adf40SYehuda Sadeh 
191f0f8cef5SAlex Elder /*
192f0f8cef5SAlex Elder  * a single io request
193f0f8cef5SAlex Elder  */
194f0f8cef5SAlex Elder struct rbd_request {
195f0f8cef5SAlex Elder 	struct request		*rq;		/* blk layer request */
196f0f8cef5SAlex Elder 	struct bio		*bio;		/* cloned bio */
197f0f8cef5SAlex Elder 	struct page		**pages;	/* list of used pages */
198f0f8cef5SAlex Elder 	u64			len;
199f0f8cef5SAlex Elder 	int			coll_index;
200f0f8cef5SAlex Elder 	struct rbd_req_coll	*coll;
201f0f8cef5SAlex Elder };
202f0f8cef5SAlex Elder 
203dfc5606dSYehuda Sadeh struct rbd_snap {
204dfc5606dSYehuda Sadeh 	struct	device		dev;
205dfc5606dSYehuda Sadeh 	const char		*name;
2063591538fSJosh Durgin 	u64			size;
207dfc5606dSYehuda Sadeh 	struct list_head	node;
208dfc5606dSYehuda Sadeh 	u64			id;
20934b13184SAlex Elder 	u64			features;
210dfc5606dSYehuda Sadeh };
211dfc5606dSYehuda Sadeh 
212f84344f3SAlex Elder struct rbd_mapping {
21399c1f08fSAlex Elder 	u64                     size;
21434b13184SAlex Elder 	u64                     features;
215f84344f3SAlex Elder 	bool			read_only;
216f84344f3SAlex Elder };
217f84344f3SAlex Elder 
218602adf40SYehuda Sadeh /*
219602adf40SYehuda Sadeh  * a single device
220602adf40SYehuda Sadeh  */
221602adf40SYehuda Sadeh struct rbd_device {
222de71a297SAlex Elder 	int			dev_id;		/* blkdev unique id */
223602adf40SYehuda Sadeh 
224602adf40SYehuda Sadeh 	int			major;		/* blkdev assigned major */
225602adf40SYehuda Sadeh 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
226602adf40SYehuda Sadeh 
227a30b71b9SAlex Elder 	u32			image_format;	/* Either 1 or 2 */
228602adf40SYehuda Sadeh 	struct rbd_client	*rbd_client;
229602adf40SYehuda Sadeh 
230602adf40SYehuda Sadeh 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
231602adf40SYehuda Sadeh 
232602adf40SYehuda Sadeh 	spinlock_t		lock;		/* queue lock */
233602adf40SYehuda Sadeh 
234602adf40SYehuda Sadeh 	struct rbd_image_header	header;
235d78b650aSAlex Elder 	atomic_t		exists;
2360d7dbfceSAlex Elder 	struct rbd_spec		*spec;
237602adf40SYehuda Sadeh 
2380d7dbfceSAlex Elder 	char			*header_name;
239971f839aSAlex Elder 
2400903e875SAlex Elder 	struct ceph_file_layout	layout;
2410903e875SAlex Elder 
24259c2be1eSYehuda Sadeh 	struct ceph_osd_event   *watch_event;
24359c2be1eSYehuda Sadeh 	struct ceph_osd_request *watch_request;
24459c2be1eSYehuda Sadeh 
24586b00e0dSAlex Elder 	struct rbd_spec		*parent_spec;
24686b00e0dSAlex Elder 	u64			parent_overlap;
24786b00e0dSAlex Elder 
248c666601aSJosh Durgin 	/* protects updating the header */
249c666601aSJosh Durgin 	struct rw_semaphore     header_rwsem;
250f84344f3SAlex Elder 
251f84344f3SAlex Elder 	struct rbd_mapping	mapping;
252602adf40SYehuda Sadeh 
253602adf40SYehuda Sadeh 	struct list_head	node;
254dfc5606dSYehuda Sadeh 
255dfc5606dSYehuda Sadeh 	/* list of snapshots */
256dfc5606dSYehuda Sadeh 	struct list_head	snaps;
257dfc5606dSYehuda Sadeh 
258dfc5606dSYehuda Sadeh 	/* sysfs related */
259dfc5606dSYehuda Sadeh 	struct device		dev;
26042382b70SAlex Elder 	unsigned long		open_count;
261dfc5606dSYehuda Sadeh };
262dfc5606dSYehuda Sadeh 
263602adf40SYehuda Sadeh static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */
264e124a82fSAlex Elder 
265602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list);    /* devices */
266e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock);
267e124a82fSAlex Elder 
268602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list);		/* clients */
269432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock);
270602adf40SYehuda Sadeh 
271304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
272304f6808SAlex Elder static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
273304f6808SAlex Elder 
274dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev);
27541f38c2bSAlex Elder static void rbd_remove_snap_dev(struct rbd_snap *snap);
276dfc5606dSYehuda Sadeh 
277f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf,
278f0f8cef5SAlex Elder 		       size_t count);
279f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
280f0f8cef5SAlex Elder 			  size_t count);
281f0f8cef5SAlex Elder 
282f0f8cef5SAlex Elder static struct bus_attribute rbd_bus_attrs[] = {
283f0f8cef5SAlex Elder 	__ATTR(add, S_IWUSR, NULL, rbd_add),
284f0f8cef5SAlex Elder 	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
285f0f8cef5SAlex Elder 	__ATTR_NULL
286f0f8cef5SAlex Elder };
287f0f8cef5SAlex Elder 
288f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = {
289f0f8cef5SAlex Elder 	.name		= "rbd",
290f0f8cef5SAlex Elder 	.bus_attrs	= rbd_bus_attrs,
291f0f8cef5SAlex Elder };
292f0f8cef5SAlex Elder 
/*
 * Release method for rbd_root_dev.  That device is a static object,
 * so there is nothing to free here.
 */
static void rbd_root_dev_release(struct device *dev)
{
	/* Intentionally empty */
}
296f0f8cef5SAlex Elder 
297f0f8cef5SAlex Elder static struct device rbd_root_dev = {
298f0f8cef5SAlex Elder 	.init_name =    "rbd",
299f0f8cef5SAlex Elder 	.release =      rbd_root_dev_release,
300f0f8cef5SAlex Elder };
301f0f8cef5SAlex Elder 
30206ecc6cbSAlex Elder static __printf(2, 3)
30306ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
30406ecc6cbSAlex Elder {
30506ecc6cbSAlex Elder 	struct va_format vaf;
30606ecc6cbSAlex Elder 	va_list args;
30706ecc6cbSAlex Elder 
30806ecc6cbSAlex Elder 	va_start(args, fmt);
30906ecc6cbSAlex Elder 	vaf.fmt = fmt;
31006ecc6cbSAlex Elder 	vaf.va = &args;
31106ecc6cbSAlex Elder 
31206ecc6cbSAlex Elder 	if (!rbd_dev)
31306ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
31406ecc6cbSAlex Elder 	else if (rbd_dev->disk)
31506ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %s: %pV\n",
31606ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
31706ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_name)
31806ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: image %s: %pV\n",
31906ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
32006ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_id)
32106ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: id %s: %pV\n",
32206ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
32306ecc6cbSAlex Elder 	else	/* punt */
32406ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
32506ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev, &vaf);
32606ecc6cbSAlex Elder 	va_end(args);
32706ecc6cbSAlex Elder }
32806ecc6cbSAlex Elder 
/*
 * rbd_assert(expr) - with RBD_DEBUG defined, report the failed
 * expression along with its function and line, then BUG().  Without
 * RBD_DEBUG it expands to nothing and expr is not evaluated.
 *
 * The debug form is wrapped in do { } while (0) so that it acts as a
 * single statement: it is safe as the body of an unbraced if/else,
 * and it always needs its trailing semicolon, the same as the
 * non-debug form.
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
341dfc5606dSYehuda Sadeh 
342117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
343117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
34459c2be1eSYehuda Sadeh 
345602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
346602adf40SYehuda Sadeh {
347f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
348602adf40SYehuda Sadeh 
349f84344f3SAlex Elder 	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
350602adf40SYehuda Sadeh 		return -EROFS;
351602adf40SYehuda Sadeh 
35242382b70SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
353c3e946ceSAlex Elder 	(void) get_device(&rbd_dev->dev);
354f84344f3SAlex Elder 	set_device_ro(bdev, rbd_dev->mapping.read_only);
35542382b70SAlex Elder 	rbd_dev->open_count++;
35642382b70SAlex Elder 	mutex_unlock(&ctl_mutex);
357340c7a2bSAlex Elder 
358602adf40SYehuda Sadeh 	return 0;
359602adf40SYehuda Sadeh }
360602adf40SYehuda Sadeh 
361dfc5606dSYehuda Sadeh static int rbd_release(struct gendisk *disk, fmode_t mode)
362dfc5606dSYehuda Sadeh {
363dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
364dfc5606dSYehuda Sadeh 
36542382b70SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
36642382b70SAlex Elder 	rbd_assert(rbd_dev->open_count > 0);
36742382b70SAlex Elder 	rbd_dev->open_count--;
368c3e946ceSAlex Elder 	put_device(&rbd_dev->dev);
36942382b70SAlex Elder 	mutex_unlock(&ctl_mutex);
370dfc5606dSYehuda Sadeh 
371dfc5606dSYehuda Sadeh 	return 0;
372dfc5606dSYehuda Sadeh }
373dfc5606dSYehuda Sadeh 
374602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = {
375602adf40SYehuda Sadeh 	.owner			= THIS_MODULE,
376602adf40SYehuda Sadeh 	.open			= rbd_open,
377dfc5606dSYehuda Sadeh 	.release		= rbd_release,
378602adf40SYehuda Sadeh };
379602adf40SYehuda Sadeh 
380602adf40SYehuda Sadeh /*
381602adf40SYehuda Sadeh  * Initialize an rbd client instance.
38243ae4701SAlex Elder  * We own *ceph_opts.
383602adf40SYehuda Sadeh  */
384f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
385602adf40SYehuda Sadeh {
386602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
387602adf40SYehuda Sadeh 	int ret = -ENOMEM;
388602adf40SYehuda Sadeh 
389602adf40SYehuda Sadeh 	dout("rbd_client_create\n");
390602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
391602adf40SYehuda Sadeh 	if (!rbdc)
392602adf40SYehuda Sadeh 		goto out_opt;
393602adf40SYehuda Sadeh 
394602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
395602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
396602adf40SYehuda Sadeh 
397bc534d86SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
398bc534d86SAlex Elder 
39943ae4701SAlex Elder 	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
400602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
401bc534d86SAlex Elder 		goto out_mutex;
40243ae4701SAlex Elder 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
403602adf40SYehuda Sadeh 
404602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
405602adf40SYehuda Sadeh 	if (ret < 0)
406602adf40SYehuda Sadeh 		goto out_err;
407602adf40SYehuda Sadeh 
408432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
409602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
410432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
411602adf40SYehuda Sadeh 
412bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
413bc534d86SAlex Elder 
414602adf40SYehuda Sadeh 	dout("rbd_client_create created %p\n", rbdc);
415602adf40SYehuda Sadeh 	return rbdc;
416602adf40SYehuda Sadeh 
417602adf40SYehuda Sadeh out_err:
418602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
419bc534d86SAlex Elder out_mutex:
420bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
421602adf40SYehuda Sadeh 	kfree(rbdc);
422602adf40SYehuda Sadeh out_opt:
42343ae4701SAlex Elder 	if (ceph_opts)
42443ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
42528f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
426602adf40SYehuda Sadeh }
427602adf40SYehuda Sadeh 
428602adf40SYehuda Sadeh /*
4291f7ba331SAlex Elder  * Find a ceph client with specific addr and configuration.  If
4301f7ba331SAlex Elder  * found, bump its reference count.
431602adf40SYehuda Sadeh  */
4321f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
433602adf40SYehuda Sadeh {
434602adf40SYehuda Sadeh 	struct rbd_client *client_node;
4351f7ba331SAlex Elder 	bool found = false;
436602adf40SYehuda Sadeh 
43743ae4701SAlex Elder 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
438602adf40SYehuda Sadeh 		return NULL;
439602adf40SYehuda Sadeh 
4401f7ba331SAlex Elder 	spin_lock(&rbd_client_list_lock);
4411f7ba331SAlex Elder 	list_for_each_entry(client_node, &rbd_client_list, node) {
4421f7ba331SAlex Elder 		if (!ceph_compare_options(ceph_opts, client_node->client)) {
4431f7ba331SAlex Elder 			kref_get(&client_node->kref);
4441f7ba331SAlex Elder 			found = true;
4451f7ba331SAlex Elder 			break;
4461f7ba331SAlex Elder 		}
4471f7ba331SAlex Elder 	}
4481f7ba331SAlex Elder 	spin_unlock(&rbd_client_list_lock);
4491f7ba331SAlex Elder 
4501f7ba331SAlex Elder 	return found ? client_node : NULL;
451602adf40SYehuda Sadeh }
452602adf40SYehuda Sadeh 
/*
 * Mount option tokens.  The Opt_last_* markers split the values into
 * int, string and Boolean ranges, which parse_rbd_opts_token() uses
 * to classify a token.
 */
enum {
	/* (no int options) */
	Opt_last_int,

	/* (no string options) */
	Opt_last_string,

	/* Boolean options */
	Opt_read_only,
	Opt_read_write,
	Opt_last_bool,
};
46659c2be1eSYehuda Sadeh 
46743ae4701SAlex Elder static match_table_t rbd_opts_tokens = {
46859c2be1eSYehuda Sadeh 	/* int args above */
46959c2be1eSYehuda Sadeh 	/* string args above */
470be466c1cSAlex Elder 	{Opt_read_only, "read_only"},
471cc0538b6SAlex Elder 	{Opt_read_only, "ro"},		/* Alternate spelling */
472cc0538b6SAlex Elder 	{Opt_read_write, "read_write"},
473cc0538b6SAlex Elder 	{Opt_read_write, "rw"},		/* Alternate spelling */
474cc0538b6SAlex Elder 	/* Boolean args above */
47559c2be1eSYehuda Sadeh 	{-1, NULL}
47659c2be1eSYehuda Sadeh };
47759c2be1eSYehuda Sadeh 
47859c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
47959c2be1eSYehuda Sadeh {
48043ae4701SAlex Elder 	struct rbd_options *rbd_opts = private;
48159c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
48259c2be1eSYehuda Sadeh 	int token, intval, ret;
48359c2be1eSYehuda Sadeh 
48443ae4701SAlex Elder 	token = match_token(c, rbd_opts_tokens, argstr);
48559c2be1eSYehuda Sadeh 	if (token < 0)
48659c2be1eSYehuda Sadeh 		return -EINVAL;
48759c2be1eSYehuda Sadeh 
48859c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
48959c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
49059c2be1eSYehuda Sadeh 		if (ret < 0) {
49159c2be1eSYehuda Sadeh 			pr_err("bad mount option arg (not int) "
49259c2be1eSYehuda Sadeh 			       "at '%s'\n", c);
49359c2be1eSYehuda Sadeh 			return ret;
49459c2be1eSYehuda Sadeh 		}
49559c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
49659c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
49759c2be1eSYehuda Sadeh 		dout("got string token %d val %s\n", token,
49859c2be1eSYehuda Sadeh 		     argstr[0].from);
499cc0538b6SAlex Elder 	} else if (token > Opt_last_string && token < Opt_last_bool) {
500cc0538b6SAlex Elder 		dout("got Boolean token %d\n", token);
50159c2be1eSYehuda Sadeh 	} else {
50259c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
50359c2be1eSYehuda Sadeh 	}
50459c2be1eSYehuda Sadeh 
50559c2be1eSYehuda Sadeh 	switch (token) {
506cc0538b6SAlex Elder 	case Opt_read_only:
507cc0538b6SAlex Elder 		rbd_opts->read_only = true;
508cc0538b6SAlex Elder 		break;
509cc0538b6SAlex Elder 	case Opt_read_write:
510cc0538b6SAlex Elder 		rbd_opts->read_only = false;
511cc0538b6SAlex Elder 		break;
51259c2be1eSYehuda Sadeh 	default:
513aafb230eSAlex Elder 		rbd_assert(false);
514aafb230eSAlex Elder 		break;
51559c2be1eSYehuda Sadeh 	}
51659c2be1eSYehuda Sadeh 	return 0;
51759c2be1eSYehuda Sadeh }
51859c2be1eSYehuda Sadeh 
/*
 * Return an rbd client for the given options: an existing one when
 * rbd_client_find() locates a match, otherwise a newly created one.
 * Either way the caller no longer owns ceph_opts afterwards.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *existing = rbd_client_find(ceph_opts);

	if (!existing)
		return rbd_client_create(ceph_opts);

	/* Sharing an existing client, so these options are not needed */
	ceph_destroy_options(ceph_opts);

	return existing;
}
535602adf40SYehuda Sadeh 
536602adf40SYehuda Sadeh /*
537602adf40SYehuda Sadeh  * Destroy ceph client
538d23a4b3fSAlex Elder  *
539432b8587SAlex Elder  * Caller must hold rbd_client_list_lock.
540602adf40SYehuda Sadeh  */
541602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
542602adf40SYehuda Sadeh {
543602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
544602adf40SYehuda Sadeh 
545602adf40SYehuda Sadeh 	dout("rbd_release_client %p\n", rbdc);
546cd9d9f5dSAlex Elder 	spin_lock(&rbd_client_list_lock);
547602adf40SYehuda Sadeh 	list_del(&rbdc->node);
548cd9d9f5dSAlex Elder 	spin_unlock(&rbd_client_list_lock);
549602adf40SYehuda Sadeh 
550602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
551602adf40SYehuda Sadeh 	kfree(rbdc);
552602adf40SYehuda Sadeh }
553602adf40SYehuda Sadeh 
554602adf40SYehuda Sadeh /*
555602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
556602adf40SYehuda Sadeh  * it.
557602adf40SYehuda Sadeh  */
5589d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc)
559602adf40SYehuda Sadeh {
560c53d5893SAlex Elder 	if (rbdc)
5619d3997fdSAlex Elder 		kref_put(&rbdc->kref, rbd_client_release);
562602adf40SYehuda Sadeh }
563602adf40SYehuda Sadeh 
5641fec7093SYehuda Sadeh /*
5651fec7093SYehuda Sadeh  * Destroy requests collection
5661fec7093SYehuda Sadeh  */
5671fec7093SYehuda Sadeh static void rbd_coll_release(struct kref *kref)
5681fec7093SYehuda Sadeh {
5691fec7093SYehuda Sadeh 	struct rbd_req_coll *coll =
5701fec7093SYehuda Sadeh 		container_of(kref, struct rbd_req_coll, kref);
5711fec7093SYehuda Sadeh 
5721fec7093SYehuda Sadeh 	dout("rbd_coll_release %p\n", coll);
5731fec7093SYehuda Sadeh 	kfree(coll);
5741fec7093SYehuda Sadeh }
575602adf40SYehuda Sadeh 
576a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format)
577a30b71b9SAlex Elder {
578a30b71b9SAlex Elder 	return image_format == 1 || image_format == 2;
579a30b71b9SAlex Elder }
580a30b71b9SAlex Elder 
5818e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
5828e94af8eSAlex Elder {
583103a150fSAlex Elder 	size_t size;
584103a150fSAlex Elder 	u32 snap_count;
585103a150fSAlex Elder 
586103a150fSAlex Elder 	/* The header has to start with the magic rbd header text */
587103a150fSAlex Elder 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
588103a150fSAlex Elder 		return false;
589103a150fSAlex Elder 
590db2388b6SAlex Elder 	/* The bio layer requires at least sector-sized I/O */
591db2388b6SAlex Elder 
592db2388b6SAlex Elder 	if (ondisk->options.order < SECTOR_SHIFT)
593db2388b6SAlex Elder 		return false;
594db2388b6SAlex Elder 
595db2388b6SAlex Elder 	/* If we use u64 in a few spots we may be able to loosen this */
596db2388b6SAlex Elder 
597db2388b6SAlex Elder 	if (ondisk->options.order > 8 * sizeof (int) - 1)
598db2388b6SAlex Elder 		return false;
599db2388b6SAlex Elder 
600103a150fSAlex Elder 	/*
601103a150fSAlex Elder 	 * The size of a snapshot header has to fit in a size_t, and
602103a150fSAlex Elder 	 * that limits the number of snapshots.
603103a150fSAlex Elder 	 */
604103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
605103a150fSAlex Elder 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
606103a150fSAlex Elder 	if (snap_count > size / sizeof (__le64))
607103a150fSAlex Elder 		return false;
608103a150fSAlex Elder 
609103a150fSAlex Elder 	/*
610103a150fSAlex Elder 	 * Not only that, but the size of the entire the snapshot
611103a150fSAlex Elder 	 * header must also be representable in a size_t.
612103a150fSAlex Elder 	 */
613103a150fSAlex Elder 	size -= snap_count * sizeof (__le64);
614103a150fSAlex Elder 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
615103a150fSAlex Elder 		return false;
616103a150fSAlex Elder 
617103a150fSAlex Elder 	return true;
6188e94af8eSAlex Elder }
6198e94af8eSAlex Elder 
620602adf40SYehuda Sadeh /*
621602adf40SYehuda Sadeh  * Create a new header structure, translate header format from the on-disk
622602adf40SYehuda Sadeh  * header.
623602adf40SYehuda Sadeh  */
624602adf40SYehuda Sadeh static int rbd_header_from_disk(struct rbd_image_header *header,
6254156d998SAlex Elder 				 struct rbd_image_header_ondisk *ondisk)
626602adf40SYehuda Sadeh {
627ccece235SAlex Elder 	u32 snap_count;
62858c17b0eSAlex Elder 	size_t len;
629d2bb24e5SAlex Elder 	size_t size;
630621901d6SAlex Elder 	u32 i;
631602adf40SYehuda Sadeh 
6326a52325fSAlex Elder 	memset(header, 0, sizeof (*header));
6336a52325fSAlex Elder 
634103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
635103a150fSAlex Elder 
63658c17b0eSAlex Elder 	len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
63758c17b0eSAlex Elder 	header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
6386a52325fSAlex Elder 	if (!header->object_prefix)
639602adf40SYehuda Sadeh 		return -ENOMEM;
64058c17b0eSAlex Elder 	memcpy(header->object_prefix, ondisk->object_prefix, len);
64158c17b0eSAlex Elder 	header->object_prefix[len] = '\0';
64200f1f36fSAlex Elder 
643602adf40SYehuda Sadeh 	if (snap_count) {
644f785cc1dSAlex Elder 		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
645f785cc1dSAlex Elder 
646621901d6SAlex Elder 		/* Save a copy of the snapshot names */
647621901d6SAlex Elder 
648f785cc1dSAlex Elder 		if (snap_names_len > (u64) SIZE_MAX)
649f785cc1dSAlex Elder 			return -EIO;
650f785cc1dSAlex Elder 		header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
651602adf40SYehuda Sadeh 		if (!header->snap_names)
6526a52325fSAlex Elder 			goto out_err;
653f785cc1dSAlex Elder 		/*
654f785cc1dSAlex Elder 		 * Note that rbd_dev_v1_header_read() guarantees
655f785cc1dSAlex Elder 		 * the ondisk buffer we're working with has
656f785cc1dSAlex Elder 		 * snap_names_len bytes beyond the end of the
657f785cc1dSAlex Elder 		 * snapshot id array, this memcpy() is safe.
658f785cc1dSAlex Elder 		 */
659f785cc1dSAlex Elder 		memcpy(header->snap_names, &ondisk->snaps[snap_count],
660f785cc1dSAlex Elder 			snap_names_len);
6616a52325fSAlex Elder 
662621901d6SAlex Elder 		/* Record each snapshot's size */
663621901d6SAlex Elder 
664d2bb24e5SAlex Elder 		size = snap_count * sizeof (*header->snap_sizes);
665d2bb24e5SAlex Elder 		header->snap_sizes = kmalloc(size, GFP_KERNEL);
666602adf40SYehuda Sadeh 		if (!header->snap_sizes)
6676a52325fSAlex Elder 			goto out_err;
668621901d6SAlex Elder 		for (i = 0; i < snap_count; i++)
669621901d6SAlex Elder 			header->snap_sizes[i] =
670621901d6SAlex Elder 				le64_to_cpu(ondisk->snaps[i].image_size);
671602adf40SYehuda Sadeh 	} else {
672ccece235SAlex Elder 		WARN_ON(ondisk->snap_names_len);
673602adf40SYehuda Sadeh 		header->snap_names = NULL;
674602adf40SYehuda Sadeh 		header->snap_sizes = NULL;
675602adf40SYehuda Sadeh 	}
676849b4260SAlex Elder 
67734b13184SAlex Elder 	header->features = 0;	/* No features support in v1 images */
678602adf40SYehuda Sadeh 	header->obj_order = ondisk->options.order;
679602adf40SYehuda Sadeh 	header->crypt_type = ondisk->options.crypt_type;
680602adf40SYehuda Sadeh 	header->comp_type = ondisk->options.comp_type;
6816a52325fSAlex Elder 
682621901d6SAlex Elder 	/* Allocate and fill in the snapshot context */
683621901d6SAlex Elder 
684f84344f3SAlex Elder 	header->image_size = le64_to_cpu(ondisk->image_size);
6856a52325fSAlex Elder 	size = sizeof (struct ceph_snap_context);
6866a52325fSAlex Elder 	size += snap_count * sizeof (header->snapc->snaps[0]);
6876a52325fSAlex Elder 	header->snapc = kzalloc(size, GFP_KERNEL);
6886a52325fSAlex Elder 	if (!header->snapc)
6896a52325fSAlex Elder 		goto out_err;
690602adf40SYehuda Sadeh 
691602adf40SYehuda Sadeh 	atomic_set(&header->snapc->nref, 1);
692505cbb9bSAlex Elder 	header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
693602adf40SYehuda Sadeh 	header->snapc->num_snaps = snap_count;
694621901d6SAlex Elder 	for (i = 0; i < snap_count; i++)
695602adf40SYehuda Sadeh 		header->snapc->snaps[i] =
696602adf40SYehuda Sadeh 			le64_to_cpu(ondisk->snaps[i].id);
697602adf40SYehuda Sadeh 
698602adf40SYehuda Sadeh 	return 0;
699602adf40SYehuda Sadeh 
7006a52325fSAlex Elder out_err:
701849b4260SAlex Elder 	kfree(header->snap_sizes);
702ccece235SAlex Elder 	header->snap_sizes = NULL;
703602adf40SYehuda Sadeh 	kfree(header->snap_names);
704ccece235SAlex Elder 	header->snap_names = NULL;
7056a52325fSAlex Elder 	kfree(header->object_prefix);
7066a52325fSAlex Elder 	header->object_prefix = NULL;
707ccece235SAlex Elder 
70800f1f36fSAlex Elder 	return -ENOMEM;
709602adf40SYehuda Sadeh }
710602adf40SYehuda Sadeh 
7119e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
7129e15b77dSAlex Elder {
7139e15b77dSAlex Elder 	struct rbd_snap *snap;
7149e15b77dSAlex Elder 
7159e15b77dSAlex Elder 	if (snap_id == CEPH_NOSNAP)
7169e15b77dSAlex Elder 		return RBD_SNAP_HEAD_NAME;
7179e15b77dSAlex Elder 
7189e15b77dSAlex Elder 	list_for_each_entry(snap, &rbd_dev->snaps, node)
7199e15b77dSAlex Elder 		if (snap_id == snap->id)
7209e15b77dSAlex Elder 			return snap->name;
7219e15b77dSAlex Elder 
7229e15b77dSAlex Elder 	return NULL;
7239e15b77dSAlex Elder }
7249e15b77dSAlex Elder 
7258836b995SAlex Elder static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
726602adf40SYehuda Sadeh {
727602adf40SYehuda Sadeh 
728e86924a8SAlex Elder 	struct rbd_snap *snap;
72900f1f36fSAlex Elder 
730e86924a8SAlex Elder 	list_for_each_entry(snap, &rbd_dev->snaps, node) {
731e86924a8SAlex Elder 		if (!strcmp(snap_name, snap->name)) {
7320d7dbfceSAlex Elder 			rbd_dev->spec->snap_id = snap->id;
733e86924a8SAlex Elder 			rbd_dev->mapping.size = snap->size;
73434b13184SAlex Elder 			rbd_dev->mapping.features = snap->features;
73500f1f36fSAlex Elder 
736e86924a8SAlex Elder 			return 0;
737602adf40SYehuda Sadeh 		}
73800f1f36fSAlex Elder 	}
739e86924a8SAlex Elder 
74000f1f36fSAlex Elder 	return -ENOENT;
74100f1f36fSAlex Elder }
742602adf40SYehuda Sadeh 
743819d52bfSAlex Elder static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
744602adf40SYehuda Sadeh {
74578dc447dSAlex Elder 	int ret;
746602adf40SYehuda Sadeh 
7470d7dbfceSAlex Elder 	if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
748cc9d734cSJosh Durgin 		    sizeof (RBD_SNAP_HEAD_NAME))) {
7490d7dbfceSAlex Elder 		rbd_dev->spec->snap_id = CEPH_NOSNAP;
75099c1f08fSAlex Elder 		rbd_dev->mapping.size = rbd_dev->header.image_size;
75134b13184SAlex Elder 		rbd_dev->mapping.features = rbd_dev->header.features;
752e86924a8SAlex Elder 		ret = 0;
753602adf40SYehuda Sadeh 	} else {
7540d7dbfceSAlex Elder 		ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
755602adf40SYehuda Sadeh 		if (ret < 0)
756602adf40SYehuda Sadeh 			goto done;
757f84344f3SAlex Elder 		rbd_dev->mapping.read_only = true;
758602adf40SYehuda Sadeh 	}
759d78b650aSAlex Elder 	atomic_set(&rbd_dev->exists, 1);
760602adf40SYehuda Sadeh done:
761602adf40SYehuda Sadeh 	return ret;
762602adf40SYehuda Sadeh }
763602adf40SYehuda Sadeh 
764602adf40SYehuda Sadeh static void rbd_header_free(struct rbd_image_header *header)
765602adf40SYehuda Sadeh {
766849b4260SAlex Elder 	kfree(header->object_prefix);
767d78fd7aeSAlex Elder 	header->object_prefix = NULL;
768602adf40SYehuda Sadeh 	kfree(header->snap_sizes);
769d78fd7aeSAlex Elder 	header->snap_sizes = NULL;
770849b4260SAlex Elder 	kfree(header->snap_names);
771d78fd7aeSAlex Elder 	header->snap_names = NULL;
772d1d25646SJosh Durgin 	ceph_put_snap_context(header->snapc);
773d78fd7aeSAlex Elder 	header->snapc = NULL;
774602adf40SYehuda Sadeh }
775602adf40SYehuda Sadeh 
77665ccfe21SAlex Elder static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
777602adf40SYehuda Sadeh {
77865ccfe21SAlex Elder 	char *name;
77965ccfe21SAlex Elder 	u64 segment;
78065ccfe21SAlex Elder 	int ret;
781602adf40SYehuda Sadeh 
7822fd82b9eSAlex Elder 	name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
78365ccfe21SAlex Elder 	if (!name)
78465ccfe21SAlex Elder 		return NULL;
78565ccfe21SAlex Elder 	segment = offset >> rbd_dev->header.obj_order;
7862fd82b9eSAlex Elder 	ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
78765ccfe21SAlex Elder 			rbd_dev->header.object_prefix, segment);
7882fd82b9eSAlex Elder 	if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
78965ccfe21SAlex Elder 		pr_err("error formatting segment name for #%llu (%d)\n",
79065ccfe21SAlex Elder 			segment, ret);
79165ccfe21SAlex Elder 		kfree(name);
79265ccfe21SAlex Elder 		name = NULL;
79365ccfe21SAlex Elder 	}
794602adf40SYehuda Sadeh 
79565ccfe21SAlex Elder 	return name;
79665ccfe21SAlex Elder }
797602adf40SYehuda Sadeh 
79865ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
79965ccfe21SAlex Elder {
80065ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
801602adf40SYehuda Sadeh 
80265ccfe21SAlex Elder 	return offset & (segment_size - 1);
80365ccfe21SAlex Elder }
80465ccfe21SAlex Elder 
80565ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev,
80665ccfe21SAlex Elder 				u64 offset, u64 length)
80765ccfe21SAlex Elder {
80865ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
80965ccfe21SAlex Elder 
81065ccfe21SAlex Elder 	offset &= segment_size - 1;
81165ccfe21SAlex Elder 
812aafb230eSAlex Elder 	rbd_assert(length <= U64_MAX - offset);
81365ccfe21SAlex Elder 	if (offset + length > segment_size)
81465ccfe21SAlex Elder 		length = segment_size - offset;
81565ccfe21SAlex Elder 
81665ccfe21SAlex Elder 	return length;
817602adf40SYehuda Sadeh }
818602adf40SYehuda Sadeh 
8191fec7093SYehuda Sadeh static int rbd_get_num_segments(struct rbd_image_header *header,
8201fec7093SYehuda Sadeh 				u64 ofs, u64 len)
8211fec7093SYehuda Sadeh {
822df111be6SAlex Elder 	u64 start_seg;
823df111be6SAlex Elder 	u64 end_seg;
824df111be6SAlex Elder 
825df111be6SAlex Elder 	if (!len)
826df111be6SAlex Elder 		return 0;
827df111be6SAlex Elder 	if (len - 1 > U64_MAX - ofs)
828df111be6SAlex Elder 		return -ERANGE;
829df111be6SAlex Elder 
830df111be6SAlex Elder 	start_seg = ofs >> header->obj_order;
831df111be6SAlex Elder 	end_seg = (ofs + len - 1) >> header->obj_order;
832df111be6SAlex Elder 
8331fec7093SYehuda Sadeh 	return end_seg - start_seg + 1;
8341fec7093SYehuda Sadeh }
8351fec7093SYehuda Sadeh 
836602adf40SYehuda Sadeh /*
837029bcbd8SJosh Durgin  * returns the size of an object in the image
838029bcbd8SJosh Durgin  */
839029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header)
840029bcbd8SJosh Durgin {
841029bcbd8SJosh Durgin 	return 1 << header->obj_order;
842029bcbd8SJosh Durgin }
843029bcbd8SJosh Durgin 
844029bcbd8SJosh Durgin /*
845602adf40SYehuda Sadeh  * bio helpers
846602adf40SYehuda Sadeh  */
847602adf40SYehuda Sadeh 
848602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain)
849602adf40SYehuda Sadeh {
850602adf40SYehuda Sadeh 	struct bio *tmp;
851602adf40SYehuda Sadeh 
852602adf40SYehuda Sadeh 	while (chain) {
853602adf40SYehuda Sadeh 		tmp = chain;
854602adf40SYehuda Sadeh 		chain = chain->bi_next;
855602adf40SYehuda Sadeh 		bio_put(tmp);
856602adf40SYehuda Sadeh 	}
857602adf40SYehuda Sadeh }
858602adf40SYehuda Sadeh 
859602adf40SYehuda Sadeh /*
860602adf40SYehuda Sadeh  * zeros a bio chain, starting at specific offset
861602adf40SYehuda Sadeh  */
862602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs)
863602adf40SYehuda Sadeh {
864602adf40SYehuda Sadeh 	struct bio_vec *bv;
865602adf40SYehuda Sadeh 	unsigned long flags;
866602adf40SYehuda Sadeh 	void *buf;
867602adf40SYehuda Sadeh 	int i;
868602adf40SYehuda Sadeh 	int pos = 0;
869602adf40SYehuda Sadeh 
870602adf40SYehuda Sadeh 	while (chain) {
871602adf40SYehuda Sadeh 		bio_for_each_segment(bv, chain, i) {
872602adf40SYehuda Sadeh 			if (pos + bv->bv_len > start_ofs) {
873602adf40SYehuda Sadeh 				int remainder = max(start_ofs - pos, 0);
874602adf40SYehuda Sadeh 				buf = bvec_kmap_irq(bv, &flags);
875602adf40SYehuda Sadeh 				memset(buf + remainder, 0,
876602adf40SYehuda Sadeh 				       bv->bv_len - remainder);
87785b5aaa6SDan Carpenter 				bvec_kunmap_irq(buf, &flags);
878602adf40SYehuda Sadeh 			}
879602adf40SYehuda Sadeh 			pos += bv->bv_len;
880602adf40SYehuda Sadeh 		}
881602adf40SYehuda Sadeh 
882602adf40SYehuda Sadeh 		chain = chain->bi_next;
883602adf40SYehuda Sadeh 	}
884602adf40SYehuda Sadeh }
885602adf40SYehuda Sadeh 
886602adf40SYehuda Sadeh /*
887f7760dadSAlex Elder  * Clone a portion of a bio, starting at the given byte offset
888f7760dadSAlex Elder  * and continuing for the number of bytes indicated.
889602adf40SYehuda Sadeh  */
890f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src,
891f7760dadSAlex Elder 					unsigned int offset,
892f7760dadSAlex Elder 					unsigned int len,
893f7760dadSAlex Elder 					gfp_t gfpmask)
894602adf40SYehuda Sadeh {
895f7760dadSAlex Elder 	struct bio_vec *bv;
896f7760dadSAlex Elder 	unsigned int resid;
897f7760dadSAlex Elder 	unsigned short idx;
898f7760dadSAlex Elder 	unsigned int voff;
899f7760dadSAlex Elder 	unsigned short end_idx;
900f7760dadSAlex Elder 	unsigned short vcnt;
901f7760dadSAlex Elder 	struct bio *bio;
902602adf40SYehuda Sadeh 
903f7760dadSAlex Elder 	/* Handle the easy case for the caller */
904f7760dadSAlex Elder 
905f7760dadSAlex Elder 	if (!offset && len == bio_src->bi_size)
906f7760dadSAlex Elder 		return bio_clone(bio_src, gfpmask);
907f7760dadSAlex Elder 
908f7760dadSAlex Elder 	if (WARN_ON_ONCE(!len))
909f7760dadSAlex Elder 		return NULL;
910f7760dadSAlex Elder 	if (WARN_ON_ONCE(len > bio_src->bi_size))
911f7760dadSAlex Elder 		return NULL;
912f7760dadSAlex Elder 	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
913f7760dadSAlex Elder 		return NULL;
914f7760dadSAlex Elder 
915f7760dadSAlex Elder 	/* Find first affected segment... */
916f7760dadSAlex Elder 
917f7760dadSAlex Elder 	resid = offset;
918f7760dadSAlex Elder 	__bio_for_each_segment(bv, bio_src, idx, 0) {
919f7760dadSAlex Elder 		if (resid < bv->bv_len)
920f7760dadSAlex Elder 			break;
921f7760dadSAlex Elder 		resid -= bv->bv_len;
922602adf40SYehuda Sadeh 	}
923f7760dadSAlex Elder 	voff = resid;
924602adf40SYehuda Sadeh 
925f7760dadSAlex Elder 	/* ...and the last affected segment */
926542582fcSAlex Elder 
927f7760dadSAlex Elder 	resid += len;
928f7760dadSAlex Elder 	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
929f7760dadSAlex Elder 		if (resid <= bv->bv_len)
930f7760dadSAlex Elder 			break;
931f7760dadSAlex Elder 		resid -= bv->bv_len;
932f7760dadSAlex Elder 	}
933f7760dadSAlex Elder 	vcnt = end_idx - idx + 1;
934602adf40SYehuda Sadeh 
935f7760dadSAlex Elder 	/* Build the clone */
936f7760dadSAlex Elder 
937f7760dadSAlex Elder 	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
938f7760dadSAlex Elder 	if (!bio)
939f7760dadSAlex Elder 		return NULL;	/* ENOMEM */
940f7760dadSAlex Elder 
941f7760dadSAlex Elder 	bio->bi_bdev = bio_src->bi_bdev;
942f7760dadSAlex Elder 	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
943f7760dadSAlex Elder 	bio->bi_rw = bio_src->bi_rw;
944f7760dadSAlex Elder 	bio->bi_flags |= 1 << BIO_CLONED;
945602adf40SYehuda Sadeh 
946602adf40SYehuda Sadeh 	/*
947f7760dadSAlex Elder 	 * Copy over our part of the bio_vec, then update the first
948f7760dadSAlex Elder 	 * and last (or only) entries.
949602adf40SYehuda Sadeh 	 */
950f7760dadSAlex Elder 	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
951f7760dadSAlex Elder 			vcnt * sizeof (struct bio_vec));
952f7760dadSAlex Elder 	bio->bi_io_vec[0].bv_offset += voff;
953f7760dadSAlex Elder 	if (vcnt > 1) {
954f7760dadSAlex Elder 		bio->bi_io_vec[0].bv_len -= voff;
955f7760dadSAlex Elder 		bio->bi_io_vec[vcnt - 1].bv_len = resid;
956602adf40SYehuda Sadeh 	} else {
957f7760dadSAlex Elder 		bio->bi_io_vec[0].bv_len = len;
958602adf40SYehuda Sadeh 	}
959602adf40SYehuda Sadeh 
960f7760dadSAlex Elder 	bio->bi_vcnt = vcnt;
961f7760dadSAlex Elder 	bio->bi_size = len;
962f7760dadSAlex Elder 	bio->bi_idx = 0;
963602adf40SYehuda Sadeh 
964f7760dadSAlex Elder 	return bio;
965602adf40SYehuda Sadeh }
966602adf40SYehuda Sadeh 
967f7760dadSAlex Elder /*
968f7760dadSAlex Elder  * Clone a portion of a bio chain, starting at the given byte offset
969f7760dadSAlex Elder  * into the first bio in the source chain and continuing for the
970f7760dadSAlex Elder  * number of bytes indicated.  The result is another bio chain of
971f7760dadSAlex Elder  * exactly the given length, or a null pointer on error.
972f7760dadSAlex Elder  *
973f7760dadSAlex Elder  * The bio_src and offset parameters are both in-out.  On entry they
974f7760dadSAlex Elder  * refer to the first source bio and the offset into that bio where
975f7760dadSAlex Elder  * the start of data to be cloned is located.
976f7760dadSAlex Elder  *
977f7760dadSAlex Elder  * On return, bio_src is updated to refer to the bio in the source
978f7760dadSAlex Elder  * chain that contains first un-cloned byte, and *offset will
979f7760dadSAlex Elder  * contain the offset of that byte within that bio.
980f7760dadSAlex Elder  */
981f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src,
982f7760dadSAlex Elder 					unsigned int *offset,
983f7760dadSAlex Elder 					unsigned int len,
984f7760dadSAlex Elder 					gfp_t gfpmask)
985f7760dadSAlex Elder {
986f7760dadSAlex Elder 	struct bio *bi = *bio_src;
987f7760dadSAlex Elder 	unsigned int off = *offset;
988f7760dadSAlex Elder 	struct bio *chain = NULL;
989f7760dadSAlex Elder 	struct bio **end;
990602adf40SYehuda Sadeh 
991f7760dadSAlex Elder 	/* Build up a chain of clone bios up to the limit */
992602adf40SYehuda Sadeh 
993f7760dadSAlex Elder 	if (!bi || off >= bi->bi_size || !len)
994f7760dadSAlex Elder 		return NULL;		/* Nothing to clone */
995602adf40SYehuda Sadeh 
996f7760dadSAlex Elder 	end = &chain;
997f7760dadSAlex Elder 	while (len) {
998f7760dadSAlex Elder 		unsigned int bi_size;
999f7760dadSAlex Elder 		struct bio *bio;
1000f7760dadSAlex Elder 
1001f5400b7aSAlex Elder 		if (!bi) {
1002f5400b7aSAlex Elder 			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
1003f7760dadSAlex Elder 			goto out_err;	/* EINVAL; ran out of bio's */
1004f5400b7aSAlex Elder 		}
1005f7760dadSAlex Elder 		bi_size = min_t(unsigned int, bi->bi_size - off, len);
1006f7760dadSAlex Elder 		bio = bio_clone_range(bi, off, bi_size, gfpmask);
1007f7760dadSAlex Elder 		if (!bio)
1008f7760dadSAlex Elder 			goto out_err;	/* ENOMEM */
1009f7760dadSAlex Elder 
1010f7760dadSAlex Elder 		*end = bio;
1011f7760dadSAlex Elder 		end = &bio->bi_next;
1012f7760dadSAlex Elder 
1013f7760dadSAlex Elder 		off += bi_size;
1014f7760dadSAlex Elder 		if (off == bi->bi_size) {
1015f7760dadSAlex Elder 			bi = bi->bi_next;
1016f7760dadSAlex Elder 			off = 0;
1017f7760dadSAlex Elder 		}
1018f7760dadSAlex Elder 		len -= bi_size;
1019f7760dadSAlex Elder 	}
1020f7760dadSAlex Elder 	*bio_src = bi;
1021f7760dadSAlex Elder 	*offset = off;
1022f7760dadSAlex Elder 
1023f7760dadSAlex Elder 	return chain;
1024f7760dadSAlex Elder out_err:
1025f7760dadSAlex Elder 	bio_chain_put(chain);
1026f7760dadSAlex Elder 
1027602adf40SYehuda Sadeh 	return NULL;
1028602adf40SYehuda Sadeh }
1029602adf40SYehuda Sadeh 
10308d23bf29SAlex Elder struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
10318d23bf29SAlex Elder {
10328d23bf29SAlex Elder 	struct ceph_osd_req_op *op;
10338d23bf29SAlex Elder 	va_list args;
10342647ba38SAlex Elder 	size_t size;
10358d23bf29SAlex Elder 
10368d23bf29SAlex Elder 	op = kzalloc(sizeof (*op), GFP_NOIO);
10378d23bf29SAlex Elder 	if (!op)
10388d23bf29SAlex Elder 		return NULL;
10398d23bf29SAlex Elder 	op->op = opcode;
10408d23bf29SAlex Elder 	va_start(args, opcode);
10418d23bf29SAlex Elder 	switch (opcode) {
10428d23bf29SAlex Elder 	case CEPH_OSD_OP_READ:
10438d23bf29SAlex Elder 	case CEPH_OSD_OP_WRITE:
10448d23bf29SAlex Elder 		/* rbd_osd_req_op_create(READ, offset, length) */
10458d23bf29SAlex Elder 		/* rbd_osd_req_op_create(WRITE, offset, length) */
10468d23bf29SAlex Elder 		op->extent.offset = va_arg(args, u64);
10478d23bf29SAlex Elder 		op->extent.length = va_arg(args, u64);
10488d23bf29SAlex Elder 		if (opcode == CEPH_OSD_OP_WRITE)
10498d23bf29SAlex Elder 			op->payload_len = op->extent.length;
10508d23bf29SAlex Elder 		break;
10512647ba38SAlex Elder 	case CEPH_OSD_OP_CALL:
10522647ba38SAlex Elder 		/* rbd_osd_req_op_create(CALL, class, method, data, datalen) */
10532647ba38SAlex Elder 		op->cls.class_name = va_arg(args, char *);
10542647ba38SAlex Elder 		size = strlen(op->cls.class_name);
10552647ba38SAlex Elder 		rbd_assert(size <= (size_t) U8_MAX);
10562647ba38SAlex Elder 		op->cls.class_len = size;
10572647ba38SAlex Elder 		op->payload_len = size;
10582647ba38SAlex Elder 
10592647ba38SAlex Elder 		op->cls.method_name = va_arg(args, char *);
10602647ba38SAlex Elder 		size = strlen(op->cls.method_name);
10612647ba38SAlex Elder 		rbd_assert(size <= (size_t) U8_MAX);
10622647ba38SAlex Elder 		op->cls.method_len = size;
10632647ba38SAlex Elder 		op->payload_len += size;
10642647ba38SAlex Elder 
10652647ba38SAlex Elder 		op->cls.argc = 0;
10662647ba38SAlex Elder 		op->cls.indata = va_arg(args, void *);
10672647ba38SAlex Elder 		size = va_arg(args, size_t);
10682647ba38SAlex Elder 		rbd_assert(size <= (size_t) U32_MAX);
10692647ba38SAlex Elder 		op->cls.indata_len = (u32) size;
10702647ba38SAlex Elder 		op->payload_len += size;
10712647ba38SAlex Elder 		break;
10725efea49aSAlex Elder 	case CEPH_OSD_OP_NOTIFY_ACK:
10735efea49aSAlex Elder 	case CEPH_OSD_OP_WATCH:
10745efea49aSAlex Elder 		/* rbd_osd_req_op_create(NOTIFY_ACK, cookie, version) */
10755efea49aSAlex Elder 		/* rbd_osd_req_op_create(WATCH, cookie, version, flag) */
10765efea49aSAlex Elder 		op->watch.cookie = va_arg(args, u64);
10775efea49aSAlex Elder 		op->watch.ver = va_arg(args, u64);
10785efea49aSAlex Elder 		op->watch.ver = cpu_to_le64(op->watch.ver);
10795efea49aSAlex Elder 		if (opcode == CEPH_OSD_OP_WATCH && va_arg(args, int))
10805efea49aSAlex Elder 			op->watch.flag = (u8) 1;
10815efea49aSAlex Elder 		break;
10828d23bf29SAlex Elder 	default:
10838d23bf29SAlex Elder 		rbd_warn(NULL, "unsupported opcode %hu\n", opcode);
10848d23bf29SAlex Elder 		kfree(op);
10858d23bf29SAlex Elder 		op = NULL;
10868d23bf29SAlex Elder 		break;
10878d23bf29SAlex Elder 	}
10888d23bf29SAlex Elder 	va_end(args);
10898d23bf29SAlex Elder 
10908d23bf29SAlex Elder 	return op;
10918d23bf29SAlex Elder }
10928d23bf29SAlex Elder 
/*
 * Free an op obtained from rbd_osd_req_op_create().  Only the op
 * structure itself is freed.  A NULL op is accepted.
 */
static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op)
{
	kfree(op);
}
10978d23bf29SAlex Elder 
10981fec7093SYehuda Sadeh static void rbd_coll_end_req_index(struct request *rq,
10991fec7093SYehuda Sadeh 				   struct rbd_req_coll *coll,
11001fec7093SYehuda Sadeh 				   int index,
11018986cb37SAlex Elder 				   s32 ret, u64 len)
11021fec7093SYehuda Sadeh {
11031fec7093SYehuda Sadeh 	struct request_queue *q;
11041fec7093SYehuda Sadeh 	int min, max, i;
11051fec7093SYehuda Sadeh 
1106bd919d45SAlex Elder 	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
11078986cb37SAlex Elder 	     coll, index, (int)ret, (unsigned long long)len);
11081fec7093SYehuda Sadeh 
11091fec7093SYehuda Sadeh 	if (!rq)
11101fec7093SYehuda Sadeh 		return;
11111fec7093SYehuda Sadeh 
11121fec7093SYehuda Sadeh 	if (!coll) {
11131fec7093SYehuda Sadeh 		blk_end_request(rq, ret, len);
11141fec7093SYehuda Sadeh 		return;
11151fec7093SYehuda Sadeh 	}
11161fec7093SYehuda Sadeh 
11171fec7093SYehuda Sadeh 	q = rq->q;
11181fec7093SYehuda Sadeh 
11191fec7093SYehuda Sadeh 	spin_lock_irq(q->queue_lock);
11201fec7093SYehuda Sadeh 	coll->status[index].done = 1;
11211fec7093SYehuda Sadeh 	coll->status[index].rc = ret;
11221fec7093SYehuda Sadeh 	coll->status[index].bytes = len;
11231fec7093SYehuda Sadeh 	max = min = coll->num_done;
11241fec7093SYehuda Sadeh 	while (max < coll->total && coll->status[max].done)
11251fec7093SYehuda Sadeh 		max++;
11261fec7093SYehuda Sadeh 
11271fec7093SYehuda Sadeh 	for (i = min; i<max; i++) {
11288986cb37SAlex Elder 		__blk_end_request(rq, (int)coll->status[i].rc,
11291fec7093SYehuda Sadeh 				  coll->status[i].bytes);
11301fec7093SYehuda Sadeh 		coll->num_done++;
11311fec7093SYehuda Sadeh 		kref_put(&coll->kref, rbd_coll_release);
11321fec7093SYehuda Sadeh 	}
11331fec7093SYehuda Sadeh 	spin_unlock_irq(q->queue_lock);
11341fec7093SYehuda Sadeh }
11351fec7093SYehuda Sadeh 
1136725afc97SAlex Elder static void rbd_coll_end_req(struct rbd_request *rbd_req,
11378986cb37SAlex Elder 			     s32 ret, u64 len)
11381fec7093SYehuda Sadeh {
1139725afc97SAlex Elder 	rbd_coll_end_req_index(rbd_req->rq,
1140725afc97SAlex Elder 				rbd_req->coll, rbd_req->coll_index,
1141725afc97SAlex Elder 				ret, len);
11421fec7093SYehuda Sadeh }
11431fec7093SYehuda Sadeh 
1144602adf40SYehuda Sadeh /*
1145602adf40SYehuda Sadeh  * Send ceph osd request
1146602adf40SYehuda Sadeh  */
1147602adf40SYehuda Sadeh static int rbd_do_request(struct request *rq,
11480ce1a794SAlex Elder 			  struct rbd_device *rbd_dev,
1149602adf40SYehuda Sadeh 			  struct ceph_snap_context *snapc,
1150602adf40SYehuda Sadeh 			  u64 snapid,
1151aded07eaSAlex Elder 			  const char *object_name, u64 ofs, u64 len,
1152602adf40SYehuda Sadeh 			  struct bio *bio,
1153602adf40SYehuda Sadeh 			  struct page **pages,
1154602adf40SYehuda Sadeh 			  int num_pages,
1155602adf40SYehuda Sadeh 			  int flags,
115630573d68SAlex Elder 			  struct ceph_osd_req_op *op,
11571fec7093SYehuda Sadeh 			  struct rbd_req_coll *coll,
11581fec7093SYehuda Sadeh 			  int coll_index,
11595f29ddd4SAlex Elder 			  void (*rbd_cb)(struct ceph_osd_request *,
11605f29ddd4SAlex Elder 					 struct ceph_msg *),
116159c2be1eSYehuda Sadeh 			  struct ceph_osd_request **linger_req,
116259c2be1eSYehuda Sadeh 			  u64 *ver)
1163602adf40SYehuda Sadeh {
11641dbb4399SAlex Elder 	struct ceph_osd_client *osdc;
11652e53c6c3SAlex Elder 	struct ceph_osd_request *osd_req;
11662e53c6c3SAlex Elder 	struct rbd_request *rbd_req = NULL;
11672e53c6c3SAlex Elder 	struct timespec mtime = CURRENT_TIME;
11682e53c6c3SAlex Elder 	int ret;
11691fec7093SYehuda Sadeh 
1170f7760dadSAlex Elder 	dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
1171f7760dadSAlex Elder 		object_name, (unsigned long long) ofs,
1172f7760dadSAlex Elder 		(unsigned long long) len, coll, coll_index);
1173602adf40SYehuda Sadeh 
11740ce1a794SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
117530573d68SAlex Elder 	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_NOIO);
11762e53c6c3SAlex Elder 	if (!osd_req)
11772e53c6c3SAlex Elder 		return -ENOMEM;
1178602adf40SYehuda Sadeh 
1179d178a9e7SAlex Elder 	osd_req->r_flags = flags;
118054a54007SAlex Elder 	osd_req->r_pages = pages;
118154a54007SAlex Elder 	if (bio) {
118254a54007SAlex Elder 		osd_req->r_bio = bio;
118354a54007SAlex Elder 		bio_get(osd_req->r_bio);
118454a54007SAlex Elder 	}
11852e53c6c3SAlex Elder 
118618216657SAlex Elder 	if (coll) {
11872e53c6c3SAlex Elder 		ret = -ENOMEM;
11882e53c6c3SAlex Elder 		rbd_req = kmalloc(sizeof(*rbd_req), GFP_NOIO);
11892e53c6c3SAlex Elder 		if (!rbd_req)
11902e53c6c3SAlex Elder 			goto done_osd_req;
1191602adf40SYehuda Sadeh 
1192725afc97SAlex Elder 		rbd_req->rq = rq;
1193725afc97SAlex Elder 		rbd_req->bio = bio;
1194725afc97SAlex Elder 		rbd_req->pages = pages;
1195725afc97SAlex Elder 		rbd_req->len = len;
11962e53c6c3SAlex Elder 		rbd_req->coll = coll;
119718216657SAlex Elder 		rbd_req->coll_index = coll_index;
11982e53c6c3SAlex Elder 	}
1199602adf40SYehuda Sadeh 
12002e53c6c3SAlex Elder 	osd_req->r_callback = rbd_cb;
12015f29ddd4SAlex Elder 	osd_req->r_priv = rbd_req;
1202602adf40SYehuda Sadeh 
12035f29ddd4SAlex Elder 	strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid));
12045f29ddd4SAlex Elder 	osd_req->r_oid_len = strlen(osd_req->r_oid);
1205602adf40SYehuda Sadeh 
12060903e875SAlex Elder 	osd_req->r_file_layout = rbd_dev->layout;	/* struct */
1207e01e7927SAlex Elder 	osd_req->r_num_pages = calc_pages_for(ofs, len);
1208e01e7927SAlex Elder 	osd_req->r_page_alignment = ofs & ~PAGE_MASK;
1209602adf40SYehuda Sadeh 
121030573d68SAlex Elder 	ceph_osdc_build_request(osd_req, ofs, len, 1, op,
1211ae7ca4a3SAlex Elder 				snapc, snapid, &mtime);
1212602adf40SYehuda Sadeh 
121359c2be1eSYehuda Sadeh 	if (linger_req) {
12145f29ddd4SAlex Elder 		ceph_osdc_set_request_linger(osdc, osd_req);
12155f29ddd4SAlex Elder 		*linger_req = osd_req;
121659c2be1eSYehuda Sadeh 	}
121759c2be1eSYehuda Sadeh 
12185f29ddd4SAlex Elder 	ret = ceph_osdc_start_request(osdc, osd_req, false);
1219602adf40SYehuda Sadeh 	if (ret < 0)
1220602adf40SYehuda Sadeh 		goto done_err;
1221602adf40SYehuda Sadeh 
1222602adf40SYehuda Sadeh 	if (!rbd_cb) {
12235f29ddd4SAlex Elder 		u64 version;
12245f29ddd4SAlex Elder 
12255f29ddd4SAlex Elder 		ret = ceph_osdc_wait_request(osdc, osd_req);
12265f29ddd4SAlex Elder 		version = le64_to_cpu(osd_req->r_reassert_version.version);
122759c2be1eSYehuda Sadeh 		if (ver)
12285f29ddd4SAlex Elder 			*ver = version;
12295f29ddd4SAlex Elder 		dout("reassert_ver=%llu\n", (unsigned long long) version);
12305f29ddd4SAlex Elder 		ceph_osdc_put_request(osd_req);
1231602adf40SYehuda Sadeh 	}
1232602adf40SYehuda Sadeh 	return ret;
1233602adf40SYehuda Sadeh 
1234602adf40SYehuda Sadeh done_err:
12352e53c6c3SAlex Elder 	if (bio)
12362e53c6c3SAlex Elder 		bio_chain_put(osd_req->r_bio);
1237725afc97SAlex Elder 	kfree(rbd_req);
12382e53c6c3SAlex Elder done_osd_req:
12392e53c6c3SAlex Elder 	ceph_osdc_put_request(osd_req);
12402e53c6c3SAlex Elder 
1241602adf40SYehuda Sadeh 	return ret;
1242602adf40SYehuda Sadeh }
1243602adf40SYehuda Sadeh 
1244602adf40SYehuda Sadeh /*
1245602adf40SYehuda Sadeh  * Ceph osd op callback
1246602adf40SYehuda Sadeh  */
12475f29ddd4SAlex Elder static void rbd_req_cb(struct ceph_osd_request *osd_req, struct ceph_msg *msg)
1248602adf40SYehuda Sadeh {
12495f29ddd4SAlex Elder 	struct rbd_request *rbd_req = osd_req->r_priv;
1250602adf40SYehuda Sadeh 	struct ceph_osd_reply_head *replyhead;
1251602adf40SYehuda Sadeh 	struct ceph_osd_op *op;
12528986cb37SAlex Elder 	s32 rc;
1253602adf40SYehuda Sadeh 	u64 bytes;
1254602adf40SYehuda Sadeh 	int read_op;
1255602adf40SYehuda Sadeh 
1256602adf40SYehuda Sadeh 	/* parse reply */
1257602adf40SYehuda Sadeh 	replyhead = msg->front.iov_base;
1258602adf40SYehuda Sadeh 	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
1259602adf40SYehuda Sadeh 	op = (void *)(replyhead + 1);
12608986cb37SAlex Elder 	rc = (s32)le32_to_cpu(replyhead->result);
1261602adf40SYehuda Sadeh 	bytes = le64_to_cpu(op->extent.length);
1262895cfcc8SDan Carpenter 	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
1263602adf40SYehuda Sadeh 
1264bd919d45SAlex Elder 	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
1265bd919d45SAlex Elder 		(unsigned long long) bytes, read_op, (int) rc);
1266602adf40SYehuda Sadeh 
12678986cb37SAlex Elder 	if (rc == (s32)-ENOENT && read_op) {
1268725afc97SAlex Elder 		zero_bio_chain(rbd_req->bio, 0);
1269602adf40SYehuda Sadeh 		rc = 0;
1270725afc97SAlex Elder 	} else if (rc == 0 && read_op && bytes < rbd_req->len) {
1271725afc97SAlex Elder 		zero_bio_chain(rbd_req->bio, bytes);
1272725afc97SAlex Elder 		bytes = rbd_req->len;
1273602adf40SYehuda Sadeh 	}
1274602adf40SYehuda Sadeh 
1275725afc97SAlex Elder 	rbd_coll_end_req(rbd_req, rc, bytes);
1276602adf40SYehuda Sadeh 
1277725afc97SAlex Elder 	if (rbd_req->bio)
1278725afc97SAlex Elder 		bio_chain_put(rbd_req->bio);
1279602adf40SYehuda Sadeh 
12805f29ddd4SAlex Elder 	ceph_osdc_put_request(osd_req);
1281725afc97SAlex Elder 	kfree(rbd_req);
1282602adf40SYehuda Sadeh }
1283602adf40SYehuda Sadeh 
/*
 * Minimal osd request callback: the reply message is ignored and
 * the reference on the osd request is simply dropped.
 */
static void rbd_simple_req_cb(struct ceph_osd_request *osd_req,
				struct ceph_msg *msg)
{
	ceph_osdc_put_request(osd_req);
}
128959c2be1eSYehuda Sadeh 
1290602adf40SYehuda Sadeh /*
1291602adf40SYehuda Sadeh  * Do a synchronous ceph osd operation
1292602adf40SYehuda Sadeh  */
12930ce1a794SAlex Elder static int rbd_req_sync_op(struct rbd_device *rbd_dev,
1294602adf40SYehuda Sadeh 			   int flags,
129530573d68SAlex Elder 			   struct ceph_osd_req_op *op,
1296aded07eaSAlex Elder 			   const char *object_name,
1297f8d4de6eSAlex Elder 			   u64 ofs, u64 inbound_size,
1298f8d4de6eSAlex Elder 			   char *inbound,
129959c2be1eSYehuda Sadeh 			   struct ceph_osd_request **linger_req,
130059c2be1eSYehuda Sadeh 			   u64 *ver)
1301602adf40SYehuda Sadeh {
1302602adf40SYehuda Sadeh 	int ret;
1303602adf40SYehuda Sadeh 	struct page **pages;
1304602adf40SYehuda Sadeh 	int num_pages;
1305913d2fdcSAlex Elder 
130630573d68SAlex Elder 	rbd_assert(op != NULL);
1307602adf40SYehuda Sadeh 
1308f8d4de6eSAlex Elder 	num_pages = calc_pages_for(ofs, inbound_size);
1309602adf40SYehuda Sadeh 	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
1310b8d0638aSDan Carpenter 	if (IS_ERR(pages))
1311b8d0638aSDan Carpenter 		return PTR_ERR(pages);
1312602adf40SYehuda Sadeh 
131325704ac9SAlex Elder 	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
1314f8d4de6eSAlex Elder 			  object_name, ofs, inbound_size, NULL,
1315602adf40SYehuda Sadeh 			  pages, num_pages,
1316602adf40SYehuda Sadeh 			  flags,
131730573d68SAlex Elder 			  op,
13181fec7093SYehuda Sadeh 			  NULL, 0,
131959c2be1eSYehuda Sadeh 			  NULL,
132059c2be1eSYehuda Sadeh 			  linger_req, ver);
1321602adf40SYehuda Sadeh 	if (ret < 0)
1322913d2fdcSAlex Elder 		goto done;
1323602adf40SYehuda Sadeh 
1324f8d4de6eSAlex Elder 	if ((flags & CEPH_OSD_FLAG_READ) && inbound)
1325f8d4de6eSAlex Elder 		ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);
1326602adf40SYehuda Sadeh 
1327602adf40SYehuda Sadeh done:
1328602adf40SYehuda Sadeh 	ceph_release_page_vector(pages, num_pages);
1329602adf40SYehuda Sadeh 	return ret;
1330602adf40SYehuda Sadeh }
1331602adf40SYehuda Sadeh 
1332602adf40SYehuda Sadeh /*
1333602adf40SYehuda Sadeh  * Do an asynchronous ceph osd operation
1334602adf40SYehuda Sadeh  */
1335602adf40SYehuda Sadeh static int rbd_do_op(struct request *rq,
1336602adf40SYehuda Sadeh 		     struct rbd_device *rbd_dev,
1337602adf40SYehuda Sadeh 		     struct ceph_snap_context *snapc,
1338602adf40SYehuda Sadeh 		     u64 ofs, u64 len,
13391fec7093SYehuda Sadeh 		     struct bio *bio,
13401fec7093SYehuda Sadeh 		     struct rbd_req_coll *coll,
13411fec7093SYehuda Sadeh 		     int coll_index)
1342602adf40SYehuda Sadeh {
1343602adf40SYehuda Sadeh 	char *seg_name;
1344602adf40SYehuda Sadeh 	u64 seg_ofs;
1345602adf40SYehuda Sadeh 	u64 seg_len;
1346602adf40SYehuda Sadeh 	int ret;
1347139b4318SAlex Elder 	struct ceph_osd_req_op *op;
1348ff2e4bb5SAlex Elder 	int opcode;
1349ff2e4bb5SAlex Elder 	int flags;
13504634246dSAlex Elder 	u64 snapid;
1351602adf40SYehuda Sadeh 
135265ccfe21SAlex Elder 	seg_name = rbd_segment_name(rbd_dev, ofs);
1353602adf40SYehuda Sadeh 	if (!seg_name)
1354602adf40SYehuda Sadeh 		return -ENOMEM;
135565ccfe21SAlex Elder 	seg_len = rbd_segment_length(rbd_dev, ofs, len);
135665ccfe21SAlex Elder 	seg_ofs = rbd_segment_offset(rbd_dev, ofs);
1357602adf40SYehuda Sadeh 
1358ff2e4bb5SAlex Elder 	if (rq_data_dir(rq) == WRITE) {
1359ff2e4bb5SAlex Elder 		opcode = CEPH_OSD_OP_WRITE;
1360ff2e4bb5SAlex Elder 		flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
13614634246dSAlex Elder 		snapid = CEPH_NOSNAP;
1362ff2e4bb5SAlex Elder 	} else {
1363ff2e4bb5SAlex Elder 		opcode = CEPH_OSD_OP_READ;
1364ff2e4bb5SAlex Elder 		flags = CEPH_OSD_FLAG_READ;
1365a7b4c65fSAlex Elder 		rbd_assert(!snapc);
13660d7dbfceSAlex Elder 		snapid = rbd_dev->spec->snap_id;
1367ff2e4bb5SAlex Elder 	}
1368602adf40SYehuda Sadeh 
136957cfc106SAlex Elder 	ret = -ENOMEM;
13708d23bf29SAlex Elder 	op = rbd_osd_req_op_create(opcode, seg_ofs, seg_len);
1371139b4318SAlex Elder 	if (!op)
1372602adf40SYehuda Sadeh 		goto done;
1373602adf40SYehuda Sadeh 
1374602adf40SYehuda Sadeh 	/* we've taken care of segment sizes earlier when we
1375602adf40SYehuda Sadeh 	   cloned the bios. We should never have a segment
1376602adf40SYehuda Sadeh 	   truncated at this point */
1377aafb230eSAlex Elder 	rbd_assert(seg_len == len);
1378602adf40SYehuda Sadeh 
1379602adf40SYehuda Sadeh 	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1380602adf40SYehuda Sadeh 			     seg_name, seg_ofs, seg_len,
1381602adf40SYehuda Sadeh 			     bio,
1382602adf40SYehuda Sadeh 			     NULL, 0,
1383602adf40SYehuda Sadeh 			     flags,
138430573d68SAlex Elder 			     op,
13851fec7093SYehuda Sadeh 			     coll, coll_index,
138659c2be1eSYehuda Sadeh 			     rbd_req_cb, 0, NULL);
1387cd323ac0SAlex Elder 	if (ret < 0)
1388cd323ac0SAlex Elder 		rbd_coll_end_req_index(rq, coll, coll_index,
1389cd323ac0SAlex Elder 					(s32)ret, seg_len);
13908d23bf29SAlex Elder 	rbd_osd_req_op_destroy(op);
1391602adf40SYehuda Sadeh done:
1392602adf40SYehuda Sadeh 	kfree(seg_name);
1393602adf40SYehuda Sadeh 	return ret;
1394602adf40SYehuda Sadeh }
1395602adf40SYehuda Sadeh 
1396602adf40SYehuda Sadeh /*
1397602adf40SYehuda Sadeh  * Request sync osd read
1398602adf40SYehuda Sadeh  */
13990ce1a794SAlex Elder static int rbd_req_sync_read(struct rbd_device *rbd_dev,
1400aded07eaSAlex Elder 			  const char *object_name,
1401602adf40SYehuda Sadeh 			  u64 ofs, u64 len,
140259c2be1eSYehuda Sadeh 			  char *buf,
140359c2be1eSYehuda Sadeh 			  u64 *ver)
1404602adf40SYehuda Sadeh {
1405139b4318SAlex Elder 	struct ceph_osd_req_op *op;
1406913d2fdcSAlex Elder 	int ret;
1407913d2fdcSAlex Elder 
14088d23bf29SAlex Elder 	op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, ofs, len);
1409139b4318SAlex Elder 	if (!op)
1410913d2fdcSAlex Elder 		return -ENOMEM;
1411913d2fdcSAlex Elder 
141225704ac9SAlex Elder 	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ,
141330573d68SAlex Elder 			       op, object_name, ofs, len, buf, NULL, ver);
14148d23bf29SAlex Elder 	rbd_osd_req_op_destroy(op);
1415913d2fdcSAlex Elder 
1416913d2fdcSAlex Elder 	return ret;
1417602adf40SYehuda Sadeh }
1418602adf40SYehuda Sadeh 
1419602adf40SYehuda Sadeh /*
142059c2be1eSYehuda Sadeh  * Request sync osd watch
142159c2be1eSYehuda Sadeh  */
14220ce1a794SAlex Elder static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
142359c2be1eSYehuda Sadeh 				   u64 ver,
14247f0a24d8SAlex Elder 				   u64 notify_id)
142559c2be1eSYehuda Sadeh {
1426139b4318SAlex Elder 	struct ceph_osd_req_op *op;
142711f77002SSage Weil 	int ret;
142811f77002SSage Weil 
14295efea49aSAlex Elder 	op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver);
1430139b4318SAlex Elder 	if (!op)
143157cfc106SAlex Elder 		return -ENOMEM;
143259c2be1eSYehuda Sadeh 
14330ce1a794SAlex Elder 	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
14347f0a24d8SAlex Elder 			  rbd_dev->header_name, 0, 0, NULL,
1435ad4f232fSAlex Elder 			  NULL, 0,
143659c2be1eSYehuda Sadeh 			  CEPH_OSD_FLAG_READ,
143730573d68SAlex Elder 			  op,
14381fec7093SYehuda Sadeh 			  NULL, 0,
143959c2be1eSYehuda Sadeh 			  rbd_simple_req_cb, 0, NULL);
144059c2be1eSYehuda Sadeh 
14415efea49aSAlex Elder 	rbd_osd_req_op_destroy(op);
14425efea49aSAlex Elder 
144359c2be1eSYehuda Sadeh 	return ret;
144459c2be1eSYehuda Sadeh }
144559c2be1eSYehuda Sadeh 
144659c2be1eSYehuda Sadeh static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
144759c2be1eSYehuda Sadeh {
14480ce1a794SAlex Elder 	struct rbd_device *rbd_dev = (struct rbd_device *)data;
1449a71b891bSJosh Durgin 	u64 hver;
145013143d2dSSage Weil 	int rc;
145113143d2dSSage Weil 
14520ce1a794SAlex Elder 	if (!rbd_dev)
145359c2be1eSYehuda Sadeh 		return;
145459c2be1eSYehuda Sadeh 
1455bd919d45SAlex Elder 	dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1456bd919d45SAlex Elder 		rbd_dev->header_name, (unsigned long long) notify_id,
1457bd919d45SAlex Elder 		(unsigned int) opcode);
1458117973fbSAlex Elder 	rc = rbd_dev_refresh(rbd_dev, &hver);
145913143d2dSSage Weil 	if (rc)
146006ecc6cbSAlex Elder 		rbd_warn(rbd_dev, "got notification but failed to "
146106ecc6cbSAlex Elder 			   " update snaps: %d\n", rc);
146259c2be1eSYehuda Sadeh 
14637f0a24d8SAlex Elder 	rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
146459c2be1eSYehuda Sadeh }
146559c2be1eSYehuda Sadeh 
146659c2be1eSYehuda Sadeh /*
1467907703d0SAlex Elder  * Request sync osd watch/unwatch.  The value of "start" determines
1468907703d0SAlex Elder  * whether a watch request is being initiated or torn down.
146959c2be1eSYehuda Sadeh  */
1470907703d0SAlex Elder static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start)
147159c2be1eSYehuda Sadeh {
1472907703d0SAlex Elder 	struct ceph_osd_request **linger_req = NULL;
14735efea49aSAlex Elder 	struct ceph_osd_req_op *op;
14745efea49aSAlex Elder 	int ret = 0;
147559c2be1eSYehuda Sadeh 
1476907703d0SAlex Elder 	if (start) {
1477907703d0SAlex Elder 		struct ceph_osd_client *osdc;
1478907703d0SAlex Elder 
1479907703d0SAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
1480907703d0SAlex Elder 		ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, rbd_dev,
1481907703d0SAlex Elder 						&rbd_dev->watch_event);
148259c2be1eSYehuda Sadeh 		if (ret < 0)
14835efea49aSAlex Elder 			return ret;
1484907703d0SAlex Elder 		linger_req = &rbd_dev->watch_request;
14855efea49aSAlex Elder 	} else {
14865efea49aSAlex Elder 		rbd_assert(rbd_dev->watch_request != NULL);
148759c2be1eSYehuda Sadeh 	}
148859c2be1eSYehuda Sadeh 
14895efea49aSAlex Elder 	op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH,
14905efea49aSAlex Elder 				rbd_dev->watch_event->cookie,
14915efea49aSAlex Elder 				rbd_dev->header.obj_version, start);
14925efea49aSAlex Elder 	if (op)
149325704ac9SAlex Elder 		ret = rbd_req_sync_op(rbd_dev,
149479e3057cSYehuda Sadeh 			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1495907703d0SAlex Elder 			      op, rbd_dev->header_name,
1496907703d0SAlex Elder 			      0, 0, NULL, linger_req, NULL);
1497070c633fSAlex Elder 
14985efea49aSAlex Elder 	/* Cancel the event if we're tearing down, or on error */
14995efea49aSAlex Elder 
15005efea49aSAlex Elder 	if (!start || !op || ret < 0) {
15010ce1a794SAlex Elder 		ceph_osdc_cancel_event(rbd_dev->watch_event);
15020ce1a794SAlex Elder 		rbd_dev->watch_event = NULL;
1503907703d0SAlex Elder 	}
15045efea49aSAlex Elder 	rbd_osd_req_op_destroy(op);
1505907703d0SAlex Elder 
150679e3057cSYehuda Sadeh 	return ret;
150779e3057cSYehuda Sadeh }
150879e3057cSYehuda Sadeh 
150959c2be1eSYehuda Sadeh /*
15103cb4a687SAlex Elder  * Synchronous osd object method call
1511602adf40SYehuda Sadeh  */
15120ce1a794SAlex Elder static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
1513aded07eaSAlex Elder 			     const char *object_name,
1514aded07eaSAlex Elder 			     const char *class_name,
1515aded07eaSAlex Elder 			     const char *method_name,
15163cb4a687SAlex Elder 			     const char *outbound,
15173cb4a687SAlex Elder 			     size_t outbound_size,
1518f8d4de6eSAlex Elder 			     char *inbound,
1519f8d4de6eSAlex Elder 			     size_t inbound_size,
152059c2be1eSYehuda Sadeh 			     u64 *ver)
1521602adf40SYehuda Sadeh {
1522139b4318SAlex Elder 	struct ceph_osd_req_op *op;
152357cfc106SAlex Elder 	int ret;
152457cfc106SAlex Elder 
15253cb4a687SAlex Elder 	/*
15263cb4a687SAlex Elder 	 * Any input parameters required by the method we're calling
15273cb4a687SAlex Elder 	 * will be sent along with the class and method names as
15283cb4a687SAlex Elder 	 * part of the message payload.  That data and its size are
15293cb4a687SAlex Elder 	 * supplied via the indata and indata_len fields (named from
15303cb4a687SAlex Elder 	 * the perspective of the server side) in the OSD request
15313cb4a687SAlex Elder 	 * operation.
15323cb4a687SAlex Elder 	 */
15332647ba38SAlex Elder 	op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name,
15342647ba38SAlex Elder 					method_name, outbound, outbound_size);
1535139b4318SAlex Elder 	if (!op)
153657cfc106SAlex Elder 		return -ENOMEM;
1537602adf40SYehuda Sadeh 
153830573d68SAlex Elder 	ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, op,
1539f8d4de6eSAlex Elder 			       object_name, 0, inbound_size, inbound,
1540f8d4de6eSAlex Elder 			       NULL, ver);
1541602adf40SYehuda Sadeh 
15422647ba38SAlex Elder 	rbd_osd_req_op_destroy(op);
1543602adf40SYehuda Sadeh 
1544602adf40SYehuda Sadeh 	dout("cls_exec returned %d\n", ret);
1545602adf40SYehuda Sadeh 	return ret;
1546602adf40SYehuda Sadeh }
1547602adf40SYehuda Sadeh 
15481fec7093SYehuda Sadeh static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
15491fec7093SYehuda Sadeh {
15501fec7093SYehuda Sadeh 	struct rbd_req_coll *coll =
15511fec7093SYehuda Sadeh 			kzalloc(sizeof(struct rbd_req_coll) +
15521fec7093SYehuda Sadeh 			        sizeof(struct rbd_req_status) * num_reqs,
15531fec7093SYehuda Sadeh 				GFP_ATOMIC);
15541fec7093SYehuda Sadeh 
15551fec7093SYehuda Sadeh 	if (!coll)
15561fec7093SYehuda Sadeh 		return NULL;
15571fec7093SYehuda Sadeh 	coll->total = num_reqs;
15581fec7093SYehuda Sadeh 	kref_init(&coll->kref);
15591fec7093SYehuda Sadeh 	return coll;
15601fec7093SYehuda Sadeh }
15611fec7093SYehuda Sadeh 
15628295cda7SAlex Elder static int rbd_dev_do_request(struct request *rq,
15638295cda7SAlex Elder 				struct rbd_device *rbd_dev,
15648295cda7SAlex Elder 				struct ceph_snap_context *snapc,
15658295cda7SAlex Elder 				u64 ofs, unsigned int size,
15668295cda7SAlex Elder 				struct bio *bio_chain)
15678295cda7SAlex Elder {
15688295cda7SAlex Elder 	int num_segs;
15698295cda7SAlex Elder 	struct rbd_req_coll *coll;
15708295cda7SAlex Elder 	unsigned int bio_offset;
15718295cda7SAlex Elder 	int cur_seg = 0;
15728295cda7SAlex Elder 
15738295cda7SAlex Elder 	dout("%s 0x%x bytes at 0x%llx\n",
15748295cda7SAlex Elder 		rq_data_dir(rq) == WRITE ? "write" : "read",
15758295cda7SAlex Elder 		size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
15768295cda7SAlex Elder 
15778295cda7SAlex Elder 	num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
15788295cda7SAlex Elder 	if (num_segs <= 0)
15798295cda7SAlex Elder 		return num_segs;
15808295cda7SAlex Elder 
15818295cda7SAlex Elder 	coll = rbd_alloc_coll(num_segs);
15828295cda7SAlex Elder 	if (!coll)
15838295cda7SAlex Elder 		return -ENOMEM;
15848295cda7SAlex Elder 
15858295cda7SAlex Elder 	bio_offset = 0;
15868295cda7SAlex Elder 	do {
15878295cda7SAlex Elder 		u64 limit = rbd_segment_length(rbd_dev, ofs, size);
15888295cda7SAlex Elder 		unsigned int clone_size;
15898295cda7SAlex Elder 		struct bio *bio_clone;
15908295cda7SAlex Elder 
15918295cda7SAlex Elder 		BUG_ON(limit > (u64)UINT_MAX);
15928295cda7SAlex Elder 		clone_size = (unsigned int)limit;
15938295cda7SAlex Elder 		dout("bio_chain->bi_vcnt=%hu\n", bio_chain->bi_vcnt);
15948295cda7SAlex Elder 
15958295cda7SAlex Elder 		kref_get(&coll->kref);
15968295cda7SAlex Elder 
15978295cda7SAlex Elder 		/* Pass a cloned bio chain via an osd request */
15988295cda7SAlex Elder 
15998295cda7SAlex Elder 		bio_clone = bio_chain_clone_range(&bio_chain,
16008295cda7SAlex Elder 					&bio_offset, clone_size,
16018295cda7SAlex Elder 					GFP_ATOMIC);
16028295cda7SAlex Elder 		if (bio_clone)
16038295cda7SAlex Elder 			(void)rbd_do_op(rq, rbd_dev, snapc,
16048295cda7SAlex Elder 					ofs, clone_size,
16058295cda7SAlex Elder 					bio_clone, coll, cur_seg);
16068295cda7SAlex Elder 		else
16078295cda7SAlex Elder 			rbd_coll_end_req_index(rq, coll, cur_seg,
16088295cda7SAlex Elder 						(s32)-ENOMEM,
16098295cda7SAlex Elder 						clone_size);
16108295cda7SAlex Elder 		size -= clone_size;
16118295cda7SAlex Elder 		ofs += clone_size;
16128295cda7SAlex Elder 
16138295cda7SAlex Elder 		cur_seg++;
16148295cda7SAlex Elder 	} while (size > 0);
16158295cda7SAlex Elder 	kref_put(&coll->kref, rbd_coll_release);
16168295cda7SAlex Elder 
16178295cda7SAlex Elder 	return 0;
16188295cda7SAlex Elder }
16198295cda7SAlex Elder 
1620602adf40SYehuda Sadeh /*
1621602adf40SYehuda Sadeh  * block device queue callback
1622602adf40SYehuda Sadeh  */
1623602adf40SYehuda Sadeh static void rbd_rq_fn(struct request_queue *q)
1624602adf40SYehuda Sadeh {
1625602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
1626b395e8b5SAlex Elder 	bool read_only = rbd_dev->mapping.read_only;
1627602adf40SYehuda Sadeh 	struct request *rq;
1628602adf40SYehuda Sadeh 
162900f1f36fSAlex Elder 	while ((rq = blk_fetch_request(q))) {
1630b395e8b5SAlex Elder 		struct ceph_snap_context *snapc = NULL;
1631b395e8b5SAlex Elder 		unsigned int size = 0;
16328295cda7SAlex Elder 		int result;
1633602adf40SYehuda Sadeh 
1634602adf40SYehuda Sadeh 		dout("fetched request\n");
1635602adf40SYehuda Sadeh 
1636b395e8b5SAlex Elder 		/* Filter out block requests we don't understand */
1637b395e8b5SAlex Elder 
1638602adf40SYehuda Sadeh 		if ((rq->cmd_type != REQ_TYPE_FS)) {
1639602adf40SYehuda Sadeh 			__blk_end_request_all(rq, 0);
164000f1f36fSAlex Elder 			continue;
1641602adf40SYehuda Sadeh 		}
1642602adf40SYehuda Sadeh 		spin_unlock_irq(q->queue_lock);
1643602adf40SYehuda Sadeh 
1644a7b4c65fSAlex Elder 		/* Write requests need a reference to the snapshot context */
1645e88a36ecSJosh Durgin 
1646a7b4c65fSAlex Elder 		if (rq_data_dir(rq) == WRITE) {
1647b395e8b5SAlex Elder 			result = -EROFS;
1648a7b4c65fSAlex Elder 			if (read_only) /* Can't write to a read-only device */
1649b395e8b5SAlex Elder 				goto out_end_request;
1650b395e8b5SAlex Elder 
1651a7b4c65fSAlex Elder 			/*
1652a7b4c65fSAlex Elder 			 * Note that each osd request will take its
1653a7b4c65fSAlex Elder 			 * own reference to the snapshot context
1654a7b4c65fSAlex Elder 			 * supplied.  The reference we take here
1655a7b4c65fSAlex Elder 			 * just guarantees the one we provide stays
1656a7b4c65fSAlex Elder 			 * valid.
1657a7b4c65fSAlex Elder 			 */
1658b395e8b5SAlex Elder 			down_read(&rbd_dev->header_rwsem);
1659b395e8b5SAlex Elder 			snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1660d1d25646SJosh Durgin 			up_read(&rbd_dev->header_rwsem);
1661a7b4c65fSAlex Elder 			rbd_assert(snapc != NULL);
1662a7b4c65fSAlex Elder 		} else if (!atomic_read(&rbd_dev->exists)) {
1663b395e8b5SAlex Elder 			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
1664e88a36ecSJosh Durgin 			dout("request for non-existent snapshot");
1665b395e8b5SAlex Elder 			result = -ENXIO;
1666b395e8b5SAlex Elder 			goto out_end_request;
1667e88a36ecSJosh Durgin 		}
1668d1d25646SJosh Durgin 
1669f7760dadSAlex Elder 		size = blk_rq_bytes(rq);
1670b395e8b5SAlex Elder 		result = rbd_dev_do_request(rq, rbd_dev, snapc,
1671b395e8b5SAlex Elder 				blk_rq_pos(rq) * SECTOR_SIZE,
1672b395e8b5SAlex Elder 				size, rq->bio);
1673b395e8b5SAlex Elder out_end_request:
1674a7b4c65fSAlex Elder 		if (snapc)
1675df111be6SAlex Elder 			ceph_put_snap_context(snapc);
16761fec7093SYehuda Sadeh 		spin_lock_irq(q->queue_lock);
16778295cda7SAlex Elder 		if (!size || result < 0)
16788295cda7SAlex Elder 			__blk_end_request_all(rq, result);
1679602adf40SYehuda Sadeh 	}
1680602adf40SYehuda Sadeh }
1681602adf40SYehuda Sadeh 
1682602adf40SYehuda Sadeh /*
1683602adf40SYehuda Sadeh  * a queue callback. Makes sure that we don't create a bio that spans across
1684602adf40SYehuda Sadeh  * multiple osd objects. One exception would be with a single page bios,
1685f7760dadSAlex Elder  * which we handle later at bio_chain_clone_range()
1686602adf40SYehuda Sadeh  */
1687602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1688602adf40SYehuda Sadeh 			  struct bio_vec *bvec)
1689602adf40SYehuda Sadeh {
1690602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
1691e5cfeed2SAlex Elder 	sector_t sector_offset;
1692e5cfeed2SAlex Elder 	sector_t sectors_per_obj;
1693e5cfeed2SAlex Elder 	sector_t obj_sector_offset;
1694e5cfeed2SAlex Elder 	int ret;
1695602adf40SYehuda Sadeh 
1696e5cfeed2SAlex Elder 	/*
1697e5cfeed2SAlex Elder 	 * Find how far into its rbd object the partition-relative
1698e5cfeed2SAlex Elder 	 * bio start sector is to offset relative to the enclosing
1699e5cfeed2SAlex Elder 	 * device.
1700e5cfeed2SAlex Elder 	 */
1701e5cfeed2SAlex Elder 	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
1702e5cfeed2SAlex Elder 	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1703e5cfeed2SAlex Elder 	obj_sector_offset = sector_offset & (sectors_per_obj - 1);
1704593a9e7bSAlex Elder 
1705e5cfeed2SAlex Elder 	/*
1706e5cfeed2SAlex Elder 	 * Compute the number of bytes from that offset to the end
1707e5cfeed2SAlex Elder 	 * of the object.  Account for what's already used by the bio.
1708e5cfeed2SAlex Elder 	 */
1709e5cfeed2SAlex Elder 	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
1710e5cfeed2SAlex Elder 	if (ret > bmd->bi_size)
1711e5cfeed2SAlex Elder 		ret -= bmd->bi_size;
1712e5cfeed2SAlex Elder 	else
1713e5cfeed2SAlex Elder 		ret = 0;
1714e5cfeed2SAlex Elder 
1715e5cfeed2SAlex Elder 	/*
1716e5cfeed2SAlex Elder 	 * Don't send back more than was asked for.  And if the bio
1717e5cfeed2SAlex Elder 	 * was empty, let the whole thing through because:  "Note
1718e5cfeed2SAlex Elder 	 * that a block device *must* allow a single page to be
1719e5cfeed2SAlex Elder 	 * added to an empty bio."
1720e5cfeed2SAlex Elder 	 */
1721e5cfeed2SAlex Elder 	rbd_assert(bvec->bv_len <= PAGE_SIZE);
1722e5cfeed2SAlex Elder 	if (ret > (int) bvec->bv_len || !bmd->bi_size)
1723e5cfeed2SAlex Elder 		ret = (int) bvec->bv_len;
1724e5cfeed2SAlex Elder 
1725e5cfeed2SAlex Elder 	return ret;
1726602adf40SYehuda Sadeh }
1727602adf40SYehuda Sadeh 
1728602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
1729602adf40SYehuda Sadeh {
1730602adf40SYehuda Sadeh 	struct gendisk *disk = rbd_dev->disk;
1731602adf40SYehuda Sadeh 
1732602adf40SYehuda Sadeh 	if (!disk)
1733602adf40SYehuda Sadeh 		return;
1734602adf40SYehuda Sadeh 
1735602adf40SYehuda Sadeh 	if (disk->flags & GENHD_FL_UP)
1736602adf40SYehuda Sadeh 		del_gendisk(disk);
1737602adf40SYehuda Sadeh 	if (disk->queue)
1738602adf40SYehuda Sadeh 		blk_cleanup_queue(disk->queue);
1739602adf40SYehuda Sadeh 	put_disk(disk);
1740602adf40SYehuda Sadeh }
1741602adf40SYehuda Sadeh 
1742602adf40SYehuda Sadeh /*
17434156d998SAlex Elder  * Read the complete header for the given rbd device.
17444156d998SAlex Elder  *
17454156d998SAlex Elder  * Returns a pointer to a dynamically-allocated buffer containing
17464156d998SAlex Elder  * the complete and validated header.  Caller can pass the address
17474156d998SAlex Elder  * of a variable that will be filled in with the version of the
17484156d998SAlex Elder  * header object at the time it was read.
17494156d998SAlex Elder  *
17504156d998SAlex Elder  * Returns a pointer-coded errno if a failure occurs.
17514156d998SAlex Elder  */
17524156d998SAlex Elder static struct rbd_image_header_ondisk *
17534156d998SAlex Elder rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
17544156d998SAlex Elder {
17554156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk = NULL;
17564156d998SAlex Elder 	u32 snap_count = 0;
17574156d998SAlex Elder 	u64 names_size = 0;
17584156d998SAlex Elder 	u32 want_count;
17594156d998SAlex Elder 	int ret;
17604156d998SAlex Elder 
17614156d998SAlex Elder 	/*
17624156d998SAlex Elder 	 * The complete header will include an array of its 64-bit
17634156d998SAlex Elder 	 * snapshot ids, followed by the names of those snapshots as
17644156d998SAlex Elder 	 * a contiguous block of NUL-terminated strings.  Note that
17654156d998SAlex Elder 	 * the number of snapshots could change by the time we read
17664156d998SAlex Elder 	 * it in, in which case we re-read it.
17674156d998SAlex Elder 	 */
17684156d998SAlex Elder 	do {
17694156d998SAlex Elder 		size_t size;
17704156d998SAlex Elder 
17714156d998SAlex Elder 		kfree(ondisk);
17724156d998SAlex Elder 
17734156d998SAlex Elder 		size = sizeof (*ondisk);
17744156d998SAlex Elder 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
17754156d998SAlex Elder 		size += names_size;
17764156d998SAlex Elder 		ondisk = kmalloc(size, GFP_KERNEL);
17774156d998SAlex Elder 		if (!ondisk)
17784156d998SAlex Elder 			return ERR_PTR(-ENOMEM);
17794156d998SAlex Elder 
17804775618dSAlex Elder 		ret = rbd_req_sync_read(rbd_dev, rbd_dev->header_name,
17814156d998SAlex Elder 				       0, size,
17824156d998SAlex Elder 				       (char *) ondisk, version);
17834156d998SAlex Elder 
17844156d998SAlex Elder 		if (ret < 0)
17854156d998SAlex Elder 			goto out_err;
17864156d998SAlex Elder 		if (WARN_ON((size_t) ret < size)) {
17874156d998SAlex Elder 			ret = -ENXIO;
178806ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
178906ecc6cbSAlex Elder 				size, ret);
17904156d998SAlex Elder 			goto out_err;
17914156d998SAlex Elder 		}
17924156d998SAlex Elder 		if (!rbd_dev_ondisk_valid(ondisk)) {
17934156d998SAlex Elder 			ret = -ENXIO;
179406ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "invalid header");
17954156d998SAlex Elder 			goto out_err;
17964156d998SAlex Elder 		}
17974156d998SAlex Elder 
17984156d998SAlex Elder 		names_size = le64_to_cpu(ondisk->snap_names_len);
17994156d998SAlex Elder 		want_count = snap_count;
18004156d998SAlex Elder 		snap_count = le32_to_cpu(ondisk->snap_count);
18014156d998SAlex Elder 	} while (snap_count != want_count);
18024156d998SAlex Elder 
18034156d998SAlex Elder 	return ondisk;
18044156d998SAlex Elder 
18054156d998SAlex Elder out_err:
18064156d998SAlex Elder 	kfree(ondisk);
18074156d998SAlex Elder 
18084156d998SAlex Elder 	return ERR_PTR(ret);
18094156d998SAlex Elder }
18104156d998SAlex Elder 
18114156d998SAlex Elder /*
1812602adf40SYehuda Sadeh  * reload the ondisk the header
1813602adf40SYehuda Sadeh  */
1814602adf40SYehuda Sadeh static int rbd_read_header(struct rbd_device *rbd_dev,
1815602adf40SYehuda Sadeh 			   struct rbd_image_header *header)
1816602adf40SYehuda Sadeh {
18174156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk;
18184156d998SAlex Elder 	u64 ver = 0;
18194156d998SAlex Elder 	int ret;
1820602adf40SYehuda Sadeh 
18214156d998SAlex Elder 	ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
18224156d998SAlex Elder 	if (IS_ERR(ondisk))
18234156d998SAlex Elder 		return PTR_ERR(ondisk);
18244156d998SAlex Elder 	ret = rbd_header_from_disk(header, ondisk);
18254156d998SAlex Elder 	if (ret >= 0)
182659c2be1eSYehuda Sadeh 		header->obj_version = ver;
18274156d998SAlex Elder 	kfree(ondisk);
1828602adf40SYehuda Sadeh 
18294156d998SAlex Elder 	return ret;
1830602adf40SYehuda Sadeh }
1831602adf40SYehuda Sadeh 
183241f38c2bSAlex Elder static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1833dfc5606dSYehuda Sadeh {
1834dfc5606dSYehuda Sadeh 	struct rbd_snap *snap;
1835a0593290SAlex Elder 	struct rbd_snap *next;
1836dfc5606dSYehuda Sadeh 
1837a0593290SAlex Elder 	list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
183841f38c2bSAlex Elder 		rbd_remove_snap_dev(snap);
1839dfc5606dSYehuda Sadeh }
1840dfc5606dSYehuda Sadeh 
18419478554aSAlex Elder static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
18429478554aSAlex Elder {
18439478554aSAlex Elder 	sector_t size;
18449478554aSAlex Elder 
18450d7dbfceSAlex Elder 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
18469478554aSAlex Elder 		return;
18479478554aSAlex Elder 
18489478554aSAlex Elder 	size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
18499478554aSAlex Elder 	dout("setting size to %llu sectors", (unsigned long long) size);
18509478554aSAlex Elder 	rbd_dev->mapping.size = (u64) size;
18519478554aSAlex Elder 	set_capacity(rbd_dev->disk, size);
18529478554aSAlex Elder }
18539478554aSAlex Elder 
1854602adf40SYehuda Sadeh /*
1855602adf40SYehuda Sadeh  * only read the first part of the ondisk header, without the snaps info
1856602adf40SYehuda Sadeh  */
1857117973fbSAlex Elder static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
1858602adf40SYehuda Sadeh {
1859602adf40SYehuda Sadeh 	int ret;
1860602adf40SYehuda Sadeh 	struct rbd_image_header h;
1861602adf40SYehuda Sadeh 
1862602adf40SYehuda Sadeh 	ret = rbd_read_header(rbd_dev, &h);
1863602adf40SYehuda Sadeh 	if (ret < 0)
1864602adf40SYehuda Sadeh 		return ret;
1865602adf40SYehuda Sadeh 
1866a51aa0c0SJosh Durgin 	down_write(&rbd_dev->header_rwsem);
1867a51aa0c0SJosh Durgin 
18689478554aSAlex Elder 	/* Update image size, and check for resize of mapped image */
18699478554aSAlex Elder 	rbd_dev->header.image_size = h.image_size;
18709478554aSAlex Elder 	rbd_update_mapping_size(rbd_dev);
18719db4b3e3SSage Weil 
1872849b4260SAlex Elder 	/* rbd_dev->header.object_prefix shouldn't change */
1873602adf40SYehuda Sadeh 	kfree(rbd_dev->header.snap_sizes);
1874849b4260SAlex Elder 	kfree(rbd_dev->header.snap_names);
1875d1d25646SJosh Durgin 	/* osd requests may still refer to snapc */
1876d1d25646SJosh Durgin 	ceph_put_snap_context(rbd_dev->header.snapc);
1877602adf40SYehuda Sadeh 
1878b813623aSAlex Elder 	if (hver)
1879b813623aSAlex Elder 		*hver = h.obj_version;
1880a71b891bSJosh Durgin 	rbd_dev->header.obj_version = h.obj_version;
188193a24e08SJosh Durgin 	rbd_dev->header.image_size = h.image_size;
1882602adf40SYehuda Sadeh 	rbd_dev->header.snapc = h.snapc;
1883602adf40SYehuda Sadeh 	rbd_dev->header.snap_names = h.snap_names;
1884602adf40SYehuda Sadeh 	rbd_dev->header.snap_sizes = h.snap_sizes;
1885849b4260SAlex Elder 	/* Free the extra copy of the object prefix */
1886849b4260SAlex Elder 	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1887849b4260SAlex Elder 	kfree(h.object_prefix);
1888849b4260SAlex Elder 
1889304f6808SAlex Elder 	ret = rbd_dev_snaps_update(rbd_dev);
1890304f6808SAlex Elder 	if (!ret)
1891304f6808SAlex Elder 		ret = rbd_dev_snaps_register(rbd_dev);
1892dfc5606dSYehuda Sadeh 
1893c666601aSJosh Durgin 	up_write(&rbd_dev->header_rwsem);
1894602adf40SYehuda Sadeh 
1895dfc5606dSYehuda Sadeh 	return ret;
1896602adf40SYehuda Sadeh }
1897602adf40SYehuda Sadeh 
1898117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
18991fe5e993SAlex Elder {
19001fe5e993SAlex Elder 	int ret;
19011fe5e993SAlex Elder 
1902117973fbSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
19031fe5e993SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1904117973fbSAlex Elder 	if (rbd_dev->image_format == 1)
1905117973fbSAlex Elder 		ret = rbd_dev_v1_refresh(rbd_dev, hver);
1906117973fbSAlex Elder 	else
1907117973fbSAlex Elder 		ret = rbd_dev_v2_refresh(rbd_dev, hver);
19081fe5e993SAlex Elder 	mutex_unlock(&ctl_mutex);
19091fe5e993SAlex Elder 
19101fe5e993SAlex Elder 	return ret;
19111fe5e993SAlex Elder }
19121fe5e993SAlex Elder 
1913602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
1914602adf40SYehuda Sadeh {
1915602adf40SYehuda Sadeh 	struct gendisk *disk;
1916602adf40SYehuda Sadeh 	struct request_queue *q;
1917593a9e7bSAlex Elder 	u64 segment_size;
1918602adf40SYehuda Sadeh 
1919602adf40SYehuda Sadeh 	/* create gendisk info */
1920602adf40SYehuda Sadeh 	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1921602adf40SYehuda Sadeh 	if (!disk)
19221fcdb8aaSAlex Elder 		return -ENOMEM;
1923602adf40SYehuda Sadeh 
1924f0f8cef5SAlex Elder 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
1925de71a297SAlex Elder 		 rbd_dev->dev_id);
1926602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
1927602adf40SYehuda Sadeh 	disk->first_minor = 0;
1928602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
1929602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
1930602adf40SYehuda Sadeh 
1931602adf40SYehuda Sadeh 	/* init rq */
1932602adf40SYehuda Sadeh 	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1933602adf40SYehuda Sadeh 	if (!q)
1934602adf40SYehuda Sadeh 		goto out_disk;
1935029bcbd8SJosh Durgin 
1936593a9e7bSAlex Elder 	/* We use the default size, but let's be explicit about it. */
1937593a9e7bSAlex Elder 	blk_queue_physical_block_size(q, SECTOR_SIZE);
1938593a9e7bSAlex Elder 
1939029bcbd8SJosh Durgin 	/* set io sizes to object size */
1940593a9e7bSAlex Elder 	segment_size = rbd_obj_bytes(&rbd_dev->header);
1941593a9e7bSAlex Elder 	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
1942593a9e7bSAlex Elder 	blk_queue_max_segment_size(q, segment_size);
1943593a9e7bSAlex Elder 	blk_queue_io_min(q, segment_size);
1944593a9e7bSAlex Elder 	blk_queue_io_opt(q, segment_size);
1945029bcbd8SJosh Durgin 
1946602adf40SYehuda Sadeh 	blk_queue_merge_bvec(q, rbd_merge_bvec);
1947602adf40SYehuda Sadeh 	disk->queue = q;
1948602adf40SYehuda Sadeh 
1949602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
1950602adf40SYehuda Sadeh 
1951602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
1952602adf40SYehuda Sadeh 
195312f02944SAlex Elder 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
195412f02944SAlex Elder 
1955602adf40SYehuda Sadeh 	return 0;
1956602adf40SYehuda Sadeh out_disk:
1957602adf40SYehuda Sadeh 	put_disk(disk);
19581fcdb8aaSAlex Elder 
19591fcdb8aaSAlex Elder 	return -ENOMEM;
1960602adf40SYehuda Sadeh }
1961602adf40SYehuda Sadeh 
1962dfc5606dSYehuda Sadeh /*
1963dfc5606dSYehuda Sadeh   sysfs
1964dfc5606dSYehuda Sadeh */
1965602adf40SYehuda Sadeh 
1966593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
1967593a9e7bSAlex Elder {
1968593a9e7bSAlex Elder 	return container_of(dev, struct rbd_device, dev);
1969593a9e7bSAlex Elder }
1970593a9e7bSAlex Elder 
1971dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
1972dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
1973602adf40SYehuda Sadeh {
1974593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1975a51aa0c0SJosh Durgin 	sector_t size;
1976dfc5606dSYehuda Sadeh 
1977a51aa0c0SJosh Durgin 	down_read(&rbd_dev->header_rwsem);
1978a51aa0c0SJosh Durgin 	size = get_capacity(rbd_dev->disk);
1979a51aa0c0SJosh Durgin 	up_read(&rbd_dev->header_rwsem);
1980a51aa0c0SJosh Durgin 
1981a51aa0c0SJosh Durgin 	return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
1982602adf40SYehuda Sadeh }
1983602adf40SYehuda Sadeh 
198434b13184SAlex Elder /*
198534b13184SAlex Elder  * Note this shows the features for whatever's mapped, which is not
198634b13184SAlex Elder  * necessarily the base image.
198734b13184SAlex Elder  */
198834b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev,
198934b13184SAlex Elder 			     struct device_attribute *attr, char *buf)
199034b13184SAlex Elder {
199134b13184SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
199234b13184SAlex Elder 
199334b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
199434b13184SAlex Elder 			(unsigned long long) rbd_dev->mapping.features);
199534b13184SAlex Elder }
199634b13184SAlex Elder 
1997dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
1998dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
1999602adf40SYehuda Sadeh {
2000593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2001dfc5606dSYehuda Sadeh 
2002dfc5606dSYehuda Sadeh 	return sprintf(buf, "%d\n", rbd_dev->major);
2003dfc5606dSYehuda Sadeh }
2004dfc5606dSYehuda Sadeh 
2005dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
2006dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
2007dfc5606dSYehuda Sadeh {
2008593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2009dfc5606dSYehuda Sadeh 
20101dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
20111dbb4399SAlex Elder 			ceph_client_id(rbd_dev->rbd_client->client));
2012dfc5606dSYehuda Sadeh }
2013dfc5606dSYehuda Sadeh 
2014dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
2015dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
2016dfc5606dSYehuda Sadeh {
2017593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2018dfc5606dSYehuda Sadeh 
20190d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
2020dfc5606dSYehuda Sadeh }
2021dfc5606dSYehuda Sadeh 
20229bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
20239bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
20249bb2f334SAlex Elder {
20259bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
20269bb2f334SAlex Elder 
20270d7dbfceSAlex Elder 	return sprintf(buf, "%llu\n",
20280d7dbfceSAlex Elder 		(unsigned long long) rbd_dev->spec->pool_id);
20299bb2f334SAlex Elder }
20309bb2f334SAlex Elder 
2031dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
2032dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
2033dfc5606dSYehuda Sadeh {
2034593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2035dfc5606dSYehuda Sadeh 
2036a92ffdf8SAlex Elder 	if (rbd_dev->spec->image_name)
20370d7dbfceSAlex Elder 		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
2038a92ffdf8SAlex Elder 
2039a92ffdf8SAlex Elder 	return sprintf(buf, "(unknown)\n");
2040dfc5606dSYehuda Sadeh }
2041dfc5606dSYehuda Sadeh 
2042589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev,
2043589d30e0SAlex Elder 			     struct device_attribute *attr, char *buf)
2044589d30e0SAlex Elder {
2045589d30e0SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2046589d30e0SAlex Elder 
20470d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
2048589d30e0SAlex Elder }
2049589d30e0SAlex Elder 
205034b13184SAlex Elder /*
205134b13184SAlex Elder  * Shows the name of the currently-mapped snapshot (or
205234b13184SAlex Elder  * RBD_SNAP_HEAD_NAME for the base image).
205334b13184SAlex Elder  */
2054dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
2055dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
2056dfc5606dSYehuda Sadeh 			     char *buf)
2057dfc5606dSYehuda Sadeh {
2058593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2059dfc5606dSYehuda Sadeh 
20600d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
2061dfc5606dSYehuda Sadeh }
2062dfc5606dSYehuda Sadeh 
206386b00e0dSAlex Elder /*
206486b00e0dSAlex Elder  * For an rbd v2 image, shows the pool id, image id, and snapshot id
206586b00e0dSAlex Elder  * for the parent image.  If there is no parent, simply shows
206686b00e0dSAlex Elder  * "(no parent image)".
206786b00e0dSAlex Elder  */
206886b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev,
206986b00e0dSAlex Elder 			     struct device_attribute *attr,
207086b00e0dSAlex Elder 			     char *buf)
207186b00e0dSAlex Elder {
207286b00e0dSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
207386b00e0dSAlex Elder 	struct rbd_spec *spec = rbd_dev->parent_spec;
207486b00e0dSAlex Elder 	int count;
207586b00e0dSAlex Elder 	char *bufp = buf;
207686b00e0dSAlex Elder 
207786b00e0dSAlex Elder 	if (!spec)
207886b00e0dSAlex Elder 		return sprintf(buf, "(no parent image)\n");
207986b00e0dSAlex Elder 
208086b00e0dSAlex Elder 	count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
208186b00e0dSAlex Elder 			(unsigned long long) spec->pool_id, spec->pool_name);
208286b00e0dSAlex Elder 	if (count < 0)
208386b00e0dSAlex Elder 		return count;
208486b00e0dSAlex Elder 	bufp += count;
208586b00e0dSAlex Elder 
208686b00e0dSAlex Elder 	count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
208786b00e0dSAlex Elder 			spec->image_name ? spec->image_name : "(unknown)");
208886b00e0dSAlex Elder 	if (count < 0)
208986b00e0dSAlex Elder 		return count;
209086b00e0dSAlex Elder 	bufp += count;
209186b00e0dSAlex Elder 
209286b00e0dSAlex Elder 	count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
209386b00e0dSAlex Elder 			(unsigned long long) spec->snap_id, spec->snap_name);
209486b00e0dSAlex Elder 	if (count < 0)
209586b00e0dSAlex Elder 		return count;
209686b00e0dSAlex Elder 	bufp += count;
209786b00e0dSAlex Elder 
209886b00e0dSAlex Elder 	count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
209986b00e0dSAlex Elder 	if (count < 0)
210086b00e0dSAlex Elder 		return count;
210186b00e0dSAlex Elder 	bufp += count;
210286b00e0dSAlex Elder 
210386b00e0dSAlex Elder 	return (ssize_t) (bufp - buf);
210486b00e0dSAlex Elder }
210586b00e0dSAlex Elder 
2106dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
2107dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
2108dfc5606dSYehuda Sadeh 				 const char *buf,
2109dfc5606dSYehuda Sadeh 				 size_t size)
2110dfc5606dSYehuda Sadeh {
2111593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2112b813623aSAlex Elder 	int ret;
2113602adf40SYehuda Sadeh 
2114117973fbSAlex Elder 	ret = rbd_dev_refresh(rbd_dev, NULL);
2115b813623aSAlex Elder 
2116b813623aSAlex Elder 	return ret < 0 ? ret : size;
2117dfc5606dSYehuda Sadeh }
2118602adf40SYehuda Sadeh 
/* Read-only attributes (S_IRUGO) describing the mapped image */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);

/* Write-only attribute (S_IWUSR): writing to it refreshes the image */
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
2130dfc5606dSYehuda Sadeh 
2131dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = {
2132dfc5606dSYehuda Sadeh 	&dev_attr_size.attr,
213334b13184SAlex Elder 	&dev_attr_features.attr,
2134dfc5606dSYehuda Sadeh 	&dev_attr_major.attr,
2135dfc5606dSYehuda Sadeh 	&dev_attr_client_id.attr,
2136dfc5606dSYehuda Sadeh 	&dev_attr_pool.attr,
21379bb2f334SAlex Elder 	&dev_attr_pool_id.attr,
2138dfc5606dSYehuda Sadeh 	&dev_attr_name.attr,
2139589d30e0SAlex Elder 	&dev_attr_image_id.attr,
2140dfc5606dSYehuda Sadeh 	&dev_attr_current_snap.attr,
214186b00e0dSAlex Elder 	&dev_attr_parent.attr,
2142dfc5606dSYehuda Sadeh 	&dev_attr_refresh.attr,
2143dfc5606dSYehuda Sadeh 	NULL
2144dfc5606dSYehuda Sadeh };
2145dfc5606dSYehuda Sadeh 
2146dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = {
2147dfc5606dSYehuda Sadeh 	.attrs = rbd_attrs,
2148dfc5606dSYehuda Sadeh };
2149dfc5606dSYehuda Sadeh 
2150dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = {
2151dfc5606dSYehuda Sadeh 	&rbd_attr_group,
2152dfc5606dSYehuda Sadeh 	NULL
2153dfc5606dSYehuda Sadeh };
2154dfc5606dSYehuda Sadeh 
/*
 * Release callback for rbd_device_type.  It does nothing: no memory
 * is freed here.
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
	/* nothing to do */
}
2158dfc5606dSYehuda Sadeh 
2159dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = {
2160dfc5606dSYehuda Sadeh 	.name		= "rbd",
2161dfc5606dSYehuda Sadeh 	.groups		= rbd_attr_groups,
2162dfc5606dSYehuda Sadeh 	.release	= rbd_sysfs_dev_release,
2163dfc5606dSYehuda Sadeh };
2164dfc5606dSYehuda Sadeh 
2165dfc5606dSYehuda Sadeh 
2166dfc5606dSYehuda Sadeh /*
2167dfc5606dSYehuda Sadeh   sysfs - snapshots
2168dfc5606dSYehuda Sadeh */
2169dfc5606dSYehuda Sadeh 
2170dfc5606dSYehuda Sadeh static ssize_t rbd_snap_size_show(struct device *dev,
2171dfc5606dSYehuda Sadeh 				  struct device_attribute *attr,
2172dfc5606dSYehuda Sadeh 				  char *buf)
2173dfc5606dSYehuda Sadeh {
2174dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2175dfc5606dSYehuda Sadeh 
21763591538fSJosh Durgin 	return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
2177dfc5606dSYehuda Sadeh }
2178dfc5606dSYehuda Sadeh 
2179dfc5606dSYehuda Sadeh static ssize_t rbd_snap_id_show(struct device *dev,
2180dfc5606dSYehuda Sadeh 				struct device_attribute *attr,
2181dfc5606dSYehuda Sadeh 				char *buf)
2182dfc5606dSYehuda Sadeh {
2183dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2184dfc5606dSYehuda Sadeh 
2185593a9e7bSAlex Elder 	return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
2186dfc5606dSYehuda Sadeh }
2187dfc5606dSYehuda Sadeh 
218834b13184SAlex Elder static ssize_t rbd_snap_features_show(struct device *dev,
218934b13184SAlex Elder 				struct device_attribute *attr,
219034b13184SAlex Elder 				char *buf)
219134b13184SAlex Elder {
219234b13184SAlex Elder 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
219334b13184SAlex Elder 
219434b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
219534b13184SAlex Elder 			(unsigned long long) snap->features);
219634b13184SAlex Elder }
219734b13184SAlex Elder 
/* Read-only (S_IRUGO) per-snapshot attributes */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
2201dfc5606dSYehuda Sadeh 
2202dfc5606dSYehuda Sadeh static struct attribute *rbd_snap_attrs[] = {
2203dfc5606dSYehuda Sadeh 	&dev_attr_snap_size.attr,
2204dfc5606dSYehuda Sadeh 	&dev_attr_snap_id.attr,
220534b13184SAlex Elder 	&dev_attr_snap_features.attr,
2206dfc5606dSYehuda Sadeh 	NULL,
2207dfc5606dSYehuda Sadeh };
2208dfc5606dSYehuda Sadeh 
2209dfc5606dSYehuda Sadeh static struct attribute_group rbd_snap_attr_group = {
2210dfc5606dSYehuda Sadeh 	.attrs = rbd_snap_attrs,
2211dfc5606dSYehuda Sadeh };
2212dfc5606dSYehuda Sadeh 
2213dfc5606dSYehuda Sadeh static void rbd_snap_dev_release(struct device *dev)
2214dfc5606dSYehuda Sadeh {
2215dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2216dfc5606dSYehuda Sadeh 	kfree(snap->name);
2217dfc5606dSYehuda Sadeh 	kfree(snap);
2218dfc5606dSYehuda Sadeh }
2219dfc5606dSYehuda Sadeh 
2220dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_snap_attr_groups[] = {
2221dfc5606dSYehuda Sadeh 	&rbd_snap_attr_group,
2222dfc5606dSYehuda Sadeh 	NULL
2223dfc5606dSYehuda Sadeh };
2224dfc5606dSYehuda Sadeh 
2225dfc5606dSYehuda Sadeh static struct device_type rbd_snap_device_type = {
2226dfc5606dSYehuda Sadeh 	.groups		= rbd_snap_attr_groups,
2227dfc5606dSYehuda Sadeh 	.release	= rbd_snap_dev_release,
2228dfc5606dSYehuda Sadeh };
2229dfc5606dSYehuda Sadeh 
22308b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
22318b8fb99cSAlex Elder {
22328b8fb99cSAlex Elder 	kref_get(&spec->kref);
22338b8fb99cSAlex Elder 
22348b8fb99cSAlex Elder 	return spec;
22358b8fb99cSAlex Elder }
22368b8fb99cSAlex Elder 
22378b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref);
22388b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec)
22398b8fb99cSAlex Elder {
22408b8fb99cSAlex Elder 	if (spec)
22418b8fb99cSAlex Elder 		kref_put(&spec->kref, rbd_spec_free);
22428b8fb99cSAlex Elder }
22438b8fb99cSAlex Elder 
22448b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void)
22458b8fb99cSAlex Elder {
22468b8fb99cSAlex Elder 	struct rbd_spec *spec;
22478b8fb99cSAlex Elder 
22488b8fb99cSAlex Elder 	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
22498b8fb99cSAlex Elder 	if (!spec)
22508b8fb99cSAlex Elder 		return NULL;
22518b8fb99cSAlex Elder 	kref_init(&spec->kref);
22528b8fb99cSAlex Elder 
22538b8fb99cSAlex Elder 	rbd_spec_put(rbd_spec_get(spec));	/* TEMPORARY */
22548b8fb99cSAlex Elder 
22558b8fb99cSAlex Elder 	return spec;
22568b8fb99cSAlex Elder }
22578b8fb99cSAlex Elder 
22588b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref)
22598b8fb99cSAlex Elder {
22608b8fb99cSAlex Elder 	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
22618b8fb99cSAlex Elder 
22628b8fb99cSAlex Elder 	kfree(spec->pool_name);
22638b8fb99cSAlex Elder 	kfree(spec->image_id);
22648b8fb99cSAlex Elder 	kfree(spec->image_name);
22658b8fb99cSAlex Elder 	kfree(spec->snap_name);
22668b8fb99cSAlex Elder 	kfree(spec);
22678b8fb99cSAlex Elder }
22688b8fb99cSAlex Elder 
2269c53d5893SAlex Elder struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
2270c53d5893SAlex Elder 				struct rbd_spec *spec)
2271c53d5893SAlex Elder {
2272c53d5893SAlex Elder 	struct rbd_device *rbd_dev;
2273c53d5893SAlex Elder 
2274c53d5893SAlex Elder 	rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
2275c53d5893SAlex Elder 	if (!rbd_dev)
2276c53d5893SAlex Elder 		return NULL;
2277c53d5893SAlex Elder 
2278c53d5893SAlex Elder 	spin_lock_init(&rbd_dev->lock);
2279d78b650aSAlex Elder 	atomic_set(&rbd_dev->exists, 0);
2280c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->node);
2281c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->snaps);
2282c53d5893SAlex Elder 	init_rwsem(&rbd_dev->header_rwsem);
2283c53d5893SAlex Elder 
2284c53d5893SAlex Elder 	rbd_dev->spec = spec;
2285c53d5893SAlex Elder 	rbd_dev->rbd_client = rbdc;
2286c53d5893SAlex Elder 
22870903e875SAlex Elder 	/* Initialize the layout used for all rbd requests */
22880903e875SAlex Elder 
22890903e875SAlex Elder 	rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
22900903e875SAlex Elder 	rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
22910903e875SAlex Elder 	rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
22920903e875SAlex Elder 	rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
22930903e875SAlex Elder 
2294c53d5893SAlex Elder 	return rbd_dev;
2295c53d5893SAlex Elder }
2296c53d5893SAlex Elder 
2297c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev)
2298c53d5893SAlex Elder {
229986b00e0dSAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
2300c53d5893SAlex Elder 	kfree(rbd_dev->header_name);
2301c53d5893SAlex Elder 	rbd_put_client(rbd_dev->rbd_client);
2302c53d5893SAlex Elder 	rbd_spec_put(rbd_dev->spec);
2303c53d5893SAlex Elder 	kfree(rbd_dev);
2304c53d5893SAlex Elder }
2305c53d5893SAlex Elder 
2306304f6808SAlex Elder static bool rbd_snap_registered(struct rbd_snap *snap)
2307304f6808SAlex Elder {
2308304f6808SAlex Elder 	bool ret = snap->dev.type == &rbd_snap_device_type;
2309304f6808SAlex Elder 	bool reg = device_is_registered(&snap->dev);
2310304f6808SAlex Elder 
2311304f6808SAlex Elder 	rbd_assert(!ret ^ reg);
2312304f6808SAlex Elder 
2313304f6808SAlex Elder 	return ret;
2314304f6808SAlex Elder }
2315304f6808SAlex Elder 
231641f38c2bSAlex Elder static void rbd_remove_snap_dev(struct rbd_snap *snap)
2317dfc5606dSYehuda Sadeh {
2318dfc5606dSYehuda Sadeh 	list_del(&snap->node);
2319304f6808SAlex Elder 	if (device_is_registered(&snap->dev))
2320dfc5606dSYehuda Sadeh 		device_unregister(&snap->dev);
2321dfc5606dSYehuda Sadeh }
2322dfc5606dSYehuda Sadeh 
232314e7085dSAlex Elder static int rbd_register_snap_dev(struct rbd_snap *snap,
2324dfc5606dSYehuda Sadeh 				  struct device *parent)
2325dfc5606dSYehuda Sadeh {
2326dfc5606dSYehuda Sadeh 	struct device *dev = &snap->dev;
2327dfc5606dSYehuda Sadeh 	int ret;
2328dfc5606dSYehuda Sadeh 
2329dfc5606dSYehuda Sadeh 	dev->type = &rbd_snap_device_type;
2330dfc5606dSYehuda Sadeh 	dev->parent = parent;
2331dfc5606dSYehuda Sadeh 	dev->release = rbd_snap_dev_release;
2332d4b125e9SAlex Elder 	dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
2333304f6808SAlex Elder 	dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2334304f6808SAlex Elder 
2335dfc5606dSYehuda Sadeh 	ret = device_register(dev);
2336dfc5606dSYehuda Sadeh 
2337dfc5606dSYehuda Sadeh 	return ret;
2338dfc5606dSYehuda Sadeh }
2339dfc5606dSYehuda Sadeh 
23404e891e0aSAlex Elder static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
2341c8d18425SAlex Elder 						const char *snap_name,
234234b13184SAlex Elder 						u64 snap_id, u64 snap_size,
234334b13184SAlex Elder 						u64 snap_features)
2344dfc5606dSYehuda Sadeh {
23454e891e0aSAlex Elder 	struct rbd_snap *snap;
2346dfc5606dSYehuda Sadeh 	int ret;
23474e891e0aSAlex Elder 
23484e891e0aSAlex Elder 	snap = kzalloc(sizeof (*snap), GFP_KERNEL);
2349dfc5606dSYehuda Sadeh 	if (!snap)
23504e891e0aSAlex Elder 		return ERR_PTR(-ENOMEM);
23514e891e0aSAlex Elder 
23524e891e0aSAlex Elder 	ret = -ENOMEM;
2353c8d18425SAlex Elder 	snap->name = kstrdup(snap_name, GFP_KERNEL);
23544e891e0aSAlex Elder 	if (!snap->name)
23554e891e0aSAlex Elder 		goto err;
23564e891e0aSAlex Elder 
2357c8d18425SAlex Elder 	snap->id = snap_id;
2358c8d18425SAlex Elder 	snap->size = snap_size;
235934b13184SAlex Elder 	snap->features = snap_features;
23604e891e0aSAlex Elder 
23614e891e0aSAlex Elder 	return snap;
23624e891e0aSAlex Elder 
2363dfc5606dSYehuda Sadeh err:
2364dfc5606dSYehuda Sadeh 	kfree(snap->name);
2365dfc5606dSYehuda Sadeh 	kfree(snap);
23664e891e0aSAlex Elder 
23674e891e0aSAlex Elder 	return ERR_PTR(ret);
2368dfc5606dSYehuda Sadeh }
2369dfc5606dSYehuda Sadeh 
2370cd892126SAlex Elder static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2371cd892126SAlex Elder 		u64 *snap_size, u64 *snap_features)
2372cd892126SAlex Elder {
2373cd892126SAlex Elder 	char *snap_name;
2374cd892126SAlex Elder 
2375cd892126SAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2376cd892126SAlex Elder 
2377cd892126SAlex Elder 	*snap_size = rbd_dev->header.snap_sizes[which];
2378cd892126SAlex Elder 	*snap_features = 0;	/* No features for v1 */
2379cd892126SAlex Elder 
2380cd892126SAlex Elder 	/* Skip over names until we find the one we are looking for */
2381cd892126SAlex Elder 
2382cd892126SAlex Elder 	snap_name = rbd_dev->header.snap_names;
2383cd892126SAlex Elder 	while (which--)
2384cd892126SAlex Elder 		snap_name += strlen(snap_name) + 1;
2385cd892126SAlex Elder 
2386cd892126SAlex Elder 	return snap_name;
2387cd892126SAlex Elder }
2388cd892126SAlex Elder 
2389dfc5606dSYehuda Sadeh /*
23909d475de5SAlex Elder  * Get the size and object order for an image snapshot, or if
23919d475de5SAlex Elder  * snap_id is CEPH_NOSNAP, gets this information for the base
23929d475de5SAlex Elder  * image.
23939d475de5SAlex Elder  */
23949d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
23959d475de5SAlex Elder 				u8 *order, u64 *snap_size)
23969d475de5SAlex Elder {
23979d475de5SAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
23989d475de5SAlex Elder 	int ret;
23999d475de5SAlex Elder 	struct {
24009d475de5SAlex Elder 		u8 order;
24019d475de5SAlex Elder 		__le64 size;
24029d475de5SAlex Elder 	} __attribute__ ((packed)) size_buf = { 0 };
24039d475de5SAlex Elder 
24049d475de5SAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
24059d475de5SAlex Elder 				"rbd", "get_size",
24069d475de5SAlex Elder 				(char *) &snapid, sizeof (snapid),
240707b2391fSAlex Elder 				(char *) &size_buf, sizeof (size_buf), NULL);
24089d475de5SAlex Elder 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
24099d475de5SAlex Elder 	if (ret < 0)
24109d475de5SAlex Elder 		return ret;
24119d475de5SAlex Elder 
24129d475de5SAlex Elder 	*order = size_buf.order;
24139d475de5SAlex Elder 	*snap_size = le64_to_cpu(size_buf.size);
24149d475de5SAlex Elder 
24159d475de5SAlex Elder 	dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
24169d475de5SAlex Elder 		(unsigned long long) snap_id, (unsigned int) *order,
24179d475de5SAlex Elder 		(unsigned long long) *snap_size);
24189d475de5SAlex Elder 
24199d475de5SAlex Elder 	return 0;
24209d475de5SAlex Elder }
24219d475de5SAlex Elder 
24229d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
24239d475de5SAlex Elder {
24249d475de5SAlex Elder 	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
24259d475de5SAlex Elder 					&rbd_dev->header.obj_order,
24269d475de5SAlex Elder 					&rbd_dev->header.image_size);
24279d475de5SAlex Elder }
24289d475de5SAlex Elder 
24291e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
24301e130199SAlex Elder {
24311e130199SAlex Elder 	void *reply_buf;
24321e130199SAlex Elder 	int ret;
24331e130199SAlex Elder 	void *p;
24341e130199SAlex Elder 
24351e130199SAlex Elder 	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
24361e130199SAlex Elder 	if (!reply_buf)
24371e130199SAlex Elder 		return -ENOMEM;
24381e130199SAlex Elder 
24391e130199SAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
24401e130199SAlex Elder 				"rbd", "get_object_prefix",
24411e130199SAlex Elder 				NULL, 0,
244207b2391fSAlex Elder 				reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
24431e130199SAlex Elder 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
24441e130199SAlex Elder 	if (ret < 0)
24451e130199SAlex Elder 		goto out;
2446a0ea3a40SAlex Elder 	ret = 0;    /* rbd_req_sync_exec() can return positive */
24471e130199SAlex Elder 
24481e130199SAlex Elder 	p = reply_buf;
24491e130199SAlex Elder 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
24501e130199SAlex Elder 						p + RBD_OBJ_PREFIX_LEN_MAX,
24511e130199SAlex Elder 						NULL, GFP_NOIO);
24521e130199SAlex Elder 
24531e130199SAlex Elder 	if (IS_ERR(rbd_dev->header.object_prefix)) {
24541e130199SAlex Elder 		ret = PTR_ERR(rbd_dev->header.object_prefix);
24551e130199SAlex Elder 		rbd_dev->header.object_prefix = NULL;
24561e130199SAlex Elder 	} else {
24571e130199SAlex Elder 		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
24581e130199SAlex Elder 	}
24591e130199SAlex Elder 
24601e130199SAlex Elder out:
24611e130199SAlex Elder 	kfree(reply_buf);
24621e130199SAlex Elder 
24631e130199SAlex Elder 	return ret;
24641e130199SAlex Elder }
24651e130199SAlex Elder 
2466b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
2467b1b5402aSAlex Elder 		u64 *snap_features)
2468b1b5402aSAlex Elder {
2469b1b5402aSAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
2470b1b5402aSAlex Elder 	struct {
2471b1b5402aSAlex Elder 		__le64 features;
2472b1b5402aSAlex Elder 		__le64 incompat;
2473b1b5402aSAlex Elder 	} features_buf = { 0 };
2474d889140cSAlex Elder 	u64 incompat;
2475b1b5402aSAlex Elder 	int ret;
2476b1b5402aSAlex Elder 
2477b1b5402aSAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2478b1b5402aSAlex Elder 				"rbd", "get_features",
2479b1b5402aSAlex Elder 				(char *) &snapid, sizeof (snapid),
2480b1b5402aSAlex Elder 				(char *) &features_buf, sizeof (features_buf),
248107b2391fSAlex Elder 				NULL);
2482b1b5402aSAlex Elder 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2483b1b5402aSAlex Elder 	if (ret < 0)
2484b1b5402aSAlex Elder 		return ret;
2485d889140cSAlex Elder 
2486d889140cSAlex Elder 	incompat = le64_to_cpu(features_buf.incompat);
2487d889140cSAlex Elder 	if (incompat & ~RBD_FEATURES_ALL)
2488b8f5c6edSAlex Elder 		return -ENXIO;
2489d889140cSAlex Elder 
2490b1b5402aSAlex Elder 	*snap_features = le64_to_cpu(features_buf.features);
2491b1b5402aSAlex Elder 
2492b1b5402aSAlex Elder 	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
2493b1b5402aSAlex Elder 		(unsigned long long) snap_id,
2494b1b5402aSAlex Elder 		(unsigned long long) *snap_features,
2495b1b5402aSAlex Elder 		(unsigned long long) le64_to_cpu(features_buf.incompat));
2496b1b5402aSAlex Elder 
2497b1b5402aSAlex Elder 	return 0;
2498b1b5402aSAlex Elder }
2499b1b5402aSAlex Elder 
2500b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
2501b1b5402aSAlex Elder {
2502b1b5402aSAlex Elder 	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
2503b1b5402aSAlex Elder 						&rbd_dev->header.features);
2504b1b5402aSAlex Elder }
2505b1b5402aSAlex Elder 
250686b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
250786b00e0dSAlex Elder {
250886b00e0dSAlex Elder 	struct rbd_spec *parent_spec;
250986b00e0dSAlex Elder 	size_t size;
251086b00e0dSAlex Elder 	void *reply_buf = NULL;
251186b00e0dSAlex Elder 	__le64 snapid;
251286b00e0dSAlex Elder 	void *p;
251386b00e0dSAlex Elder 	void *end;
251486b00e0dSAlex Elder 	char *image_id;
251586b00e0dSAlex Elder 	u64 overlap;
251686b00e0dSAlex Elder 	int ret;
251786b00e0dSAlex Elder 
251886b00e0dSAlex Elder 	parent_spec = rbd_spec_alloc();
251986b00e0dSAlex Elder 	if (!parent_spec)
252086b00e0dSAlex Elder 		return -ENOMEM;
252186b00e0dSAlex Elder 
252286b00e0dSAlex Elder 	size = sizeof (__le64) +				/* pool_id */
252386b00e0dSAlex Elder 		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
252486b00e0dSAlex Elder 		sizeof (__le64) +				/* snap_id */
252586b00e0dSAlex Elder 		sizeof (__le64);				/* overlap */
252686b00e0dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
252786b00e0dSAlex Elder 	if (!reply_buf) {
252886b00e0dSAlex Elder 		ret = -ENOMEM;
252986b00e0dSAlex Elder 		goto out_err;
253086b00e0dSAlex Elder 	}
253186b00e0dSAlex Elder 
253286b00e0dSAlex Elder 	snapid = cpu_to_le64(CEPH_NOSNAP);
253386b00e0dSAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
253486b00e0dSAlex Elder 				"rbd", "get_parent",
253586b00e0dSAlex Elder 				(char *) &snapid, sizeof (snapid),
253607b2391fSAlex Elder 				(char *) reply_buf, size, NULL);
253786b00e0dSAlex Elder 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
253886b00e0dSAlex Elder 	if (ret < 0)
253986b00e0dSAlex Elder 		goto out_err;
254086b00e0dSAlex Elder 
254186b00e0dSAlex Elder 	ret = -ERANGE;
254286b00e0dSAlex Elder 	p = reply_buf;
254386b00e0dSAlex Elder 	end = (char *) reply_buf + size;
254486b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
254586b00e0dSAlex Elder 	if (parent_spec->pool_id == CEPH_NOPOOL)
254686b00e0dSAlex Elder 		goto out;	/* No parent?  No problem. */
254786b00e0dSAlex Elder 
25480903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
25490903e875SAlex Elder 
25500903e875SAlex Elder 	ret = -EIO;
25510903e875SAlex Elder 	if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
25520903e875SAlex Elder 		goto out;
25530903e875SAlex Elder 
2554979ed480SAlex Elder 	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
255586b00e0dSAlex Elder 	if (IS_ERR(image_id)) {
255686b00e0dSAlex Elder 		ret = PTR_ERR(image_id);
255786b00e0dSAlex Elder 		goto out_err;
255886b00e0dSAlex Elder 	}
255986b00e0dSAlex Elder 	parent_spec->image_id = image_id;
256086b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
256186b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, overlap, out_err);
256286b00e0dSAlex Elder 
256386b00e0dSAlex Elder 	rbd_dev->parent_overlap = overlap;
256486b00e0dSAlex Elder 	rbd_dev->parent_spec = parent_spec;
256586b00e0dSAlex Elder 	parent_spec = NULL;	/* rbd_dev now owns this */
256686b00e0dSAlex Elder out:
256786b00e0dSAlex Elder 	ret = 0;
256886b00e0dSAlex Elder out_err:
256986b00e0dSAlex Elder 	kfree(reply_buf);
257086b00e0dSAlex Elder 	rbd_spec_put(parent_spec);
257186b00e0dSAlex Elder 
257286b00e0dSAlex Elder 	return ret;
257386b00e0dSAlex Elder }
257486b00e0dSAlex Elder 
25759e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
25769e15b77dSAlex Elder {
25779e15b77dSAlex Elder 	size_t image_id_size;
25789e15b77dSAlex Elder 	char *image_id;
25799e15b77dSAlex Elder 	void *p;
25809e15b77dSAlex Elder 	void *end;
25819e15b77dSAlex Elder 	size_t size;
25829e15b77dSAlex Elder 	void *reply_buf = NULL;
25839e15b77dSAlex Elder 	size_t len = 0;
25849e15b77dSAlex Elder 	char *image_name = NULL;
25859e15b77dSAlex Elder 	int ret;
25869e15b77dSAlex Elder 
25879e15b77dSAlex Elder 	rbd_assert(!rbd_dev->spec->image_name);
25889e15b77dSAlex Elder 
258969e7a02fSAlex Elder 	len = strlen(rbd_dev->spec->image_id);
259069e7a02fSAlex Elder 	image_id_size = sizeof (__le32) + len;
25919e15b77dSAlex Elder 	image_id = kmalloc(image_id_size, GFP_KERNEL);
25929e15b77dSAlex Elder 	if (!image_id)
25939e15b77dSAlex Elder 		return NULL;
25949e15b77dSAlex Elder 
25959e15b77dSAlex Elder 	p = image_id;
25969e15b77dSAlex Elder 	end = (char *) image_id + image_id_size;
259769e7a02fSAlex Elder 	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);
25989e15b77dSAlex Elder 
25999e15b77dSAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
26009e15b77dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
26019e15b77dSAlex Elder 	if (!reply_buf)
26029e15b77dSAlex Elder 		goto out;
26039e15b77dSAlex Elder 
26049e15b77dSAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, RBD_DIRECTORY,
26059e15b77dSAlex Elder 				"rbd", "dir_get_name",
26069e15b77dSAlex Elder 				image_id, image_id_size,
260707b2391fSAlex Elder 				(char *) reply_buf, size, NULL);
26089e15b77dSAlex Elder 	if (ret < 0)
26099e15b77dSAlex Elder 		goto out;
26109e15b77dSAlex Elder 	p = reply_buf;
26119e15b77dSAlex Elder 	end = (char *) reply_buf + size;
26129e15b77dSAlex Elder 	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
26139e15b77dSAlex Elder 	if (IS_ERR(image_name))
26149e15b77dSAlex Elder 		image_name = NULL;
26159e15b77dSAlex Elder 	else
26169e15b77dSAlex Elder 		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
26179e15b77dSAlex Elder out:
26189e15b77dSAlex Elder 	kfree(reply_buf);
26199e15b77dSAlex Elder 	kfree(image_id);
26209e15b77dSAlex Elder 
26219e15b77dSAlex Elder 	return image_name;
26229e15b77dSAlex Elder }
26239e15b77dSAlex Elder 
26249e15b77dSAlex Elder /*
26259e15b77dSAlex Elder  * When a parent image gets probed, we only have the pool, image,
26269e15b77dSAlex Elder  * and snapshot ids but not the names of any of them.  This call
26279e15b77dSAlex Elder  * is made later to fill in those names.  It has to be done after
26289e15b77dSAlex Elder  * rbd_dev_snaps_update() has completed because some of the
26299e15b77dSAlex Elder  * information (in particular, snapshot name) is not available
26309e15b77dSAlex Elder  * until then.
26319e15b77dSAlex Elder  */
26329e15b77dSAlex Elder static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
26339e15b77dSAlex Elder {
26349e15b77dSAlex Elder 	struct ceph_osd_client *osdc;
26359e15b77dSAlex Elder 	const char *name;
26369e15b77dSAlex Elder 	void *reply_buf = NULL;
26379e15b77dSAlex Elder 	int ret;
26389e15b77dSAlex Elder 
26399e15b77dSAlex Elder 	if (rbd_dev->spec->pool_name)
26409e15b77dSAlex Elder 		return 0;	/* Already have the names */
26419e15b77dSAlex Elder 
26429e15b77dSAlex Elder 	/* Look up the pool name */
26439e15b77dSAlex Elder 
26449e15b77dSAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
26459e15b77dSAlex Elder 	name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
2646935dc89fSAlex Elder 	if (!name) {
2647935dc89fSAlex Elder 		rbd_warn(rbd_dev, "there is no pool with id %llu",
2648935dc89fSAlex Elder 			rbd_dev->spec->pool_id);	/* Really a BUG() */
2649935dc89fSAlex Elder 		return -EIO;
2650935dc89fSAlex Elder 	}
26519e15b77dSAlex Elder 
26529e15b77dSAlex Elder 	rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
26539e15b77dSAlex Elder 	if (!rbd_dev->spec->pool_name)
26549e15b77dSAlex Elder 		return -ENOMEM;
26559e15b77dSAlex Elder 
26569e15b77dSAlex Elder 	/* Fetch the image name; tolerate failure here */
26579e15b77dSAlex Elder 
26589e15b77dSAlex Elder 	name = rbd_dev_image_name(rbd_dev);
265969e7a02fSAlex Elder 	if (name)
26609e15b77dSAlex Elder 		rbd_dev->spec->image_name = (char *) name;
266169e7a02fSAlex Elder 	else
266206ecc6cbSAlex Elder 		rbd_warn(rbd_dev, "unable to get image name");
26639e15b77dSAlex Elder 
26649e15b77dSAlex Elder 	/* Look up the snapshot name. */
26659e15b77dSAlex Elder 
26669e15b77dSAlex Elder 	name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
26679e15b77dSAlex Elder 	if (!name) {
2668935dc89fSAlex Elder 		rbd_warn(rbd_dev, "no snapshot with id %llu",
2669935dc89fSAlex Elder 			rbd_dev->spec->snap_id);	/* Really a BUG() */
26709e15b77dSAlex Elder 		ret = -EIO;
26719e15b77dSAlex Elder 		goto out_err;
26729e15b77dSAlex Elder 	}
26739e15b77dSAlex Elder 	rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
26749e15b77dSAlex Elder 	if(!rbd_dev->spec->snap_name)
26759e15b77dSAlex Elder 		goto out_err;
26769e15b77dSAlex Elder 
26779e15b77dSAlex Elder 	return 0;
26789e15b77dSAlex Elder out_err:
26799e15b77dSAlex Elder 	kfree(reply_buf);
26809e15b77dSAlex Elder 	kfree(rbd_dev->spec->pool_name);
26819e15b77dSAlex Elder 	rbd_dev->spec->pool_name = NULL;
26829e15b77dSAlex Elder 
26839e15b77dSAlex Elder 	return ret;
26849e15b77dSAlex Elder }
26859e15b77dSAlex Elder 
26866e14b1a6SAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
268735d489f9SAlex Elder {
268835d489f9SAlex Elder 	size_t size;
268935d489f9SAlex Elder 	int ret;
269035d489f9SAlex Elder 	void *reply_buf;
269135d489f9SAlex Elder 	void *p;
269235d489f9SAlex Elder 	void *end;
269335d489f9SAlex Elder 	u64 seq;
269435d489f9SAlex Elder 	u32 snap_count;
269535d489f9SAlex Elder 	struct ceph_snap_context *snapc;
269635d489f9SAlex Elder 	u32 i;
269735d489f9SAlex Elder 
269835d489f9SAlex Elder 	/*
269935d489f9SAlex Elder 	 * We'll need room for the seq value (maximum snapshot id),
270035d489f9SAlex Elder 	 * snapshot count, and array of that many snapshot ids.
270135d489f9SAlex Elder 	 * For now we have a fixed upper limit on the number we're
270235d489f9SAlex Elder 	 * prepared to receive.
270335d489f9SAlex Elder 	 */
270435d489f9SAlex Elder 	size = sizeof (__le64) + sizeof (__le32) +
270535d489f9SAlex Elder 			RBD_MAX_SNAP_COUNT * sizeof (__le64);
270635d489f9SAlex Elder 	reply_buf = kzalloc(size, GFP_KERNEL);
270735d489f9SAlex Elder 	if (!reply_buf)
270835d489f9SAlex Elder 		return -ENOMEM;
270935d489f9SAlex Elder 
271035d489f9SAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
271135d489f9SAlex Elder 				"rbd", "get_snapcontext",
271235d489f9SAlex Elder 				NULL, 0,
271307b2391fSAlex Elder 				reply_buf, size, ver);
271435d489f9SAlex Elder 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
271535d489f9SAlex Elder 	if (ret < 0)
271635d489f9SAlex Elder 		goto out;
271735d489f9SAlex Elder 
271835d489f9SAlex Elder 	ret = -ERANGE;
271935d489f9SAlex Elder 	p = reply_buf;
272035d489f9SAlex Elder 	end = (char *) reply_buf + size;
272135d489f9SAlex Elder 	ceph_decode_64_safe(&p, end, seq, out);
272235d489f9SAlex Elder 	ceph_decode_32_safe(&p, end, snap_count, out);
272335d489f9SAlex Elder 
272435d489f9SAlex Elder 	/*
272535d489f9SAlex Elder 	 * Make sure the reported number of snapshot ids wouldn't go
272635d489f9SAlex Elder 	 * beyond the end of our buffer.  But before checking that,
272735d489f9SAlex Elder 	 * make sure the computed size of the snapshot context we
272835d489f9SAlex Elder 	 * allocate is representable in a size_t.
272935d489f9SAlex Elder 	 */
273035d489f9SAlex Elder 	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
273135d489f9SAlex Elder 				 / sizeof (u64)) {
273235d489f9SAlex Elder 		ret = -EINVAL;
273335d489f9SAlex Elder 		goto out;
273435d489f9SAlex Elder 	}
273535d489f9SAlex Elder 	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
273635d489f9SAlex Elder 		goto out;
273735d489f9SAlex Elder 
273835d489f9SAlex Elder 	size = sizeof (struct ceph_snap_context) +
273935d489f9SAlex Elder 				snap_count * sizeof (snapc->snaps[0]);
274035d489f9SAlex Elder 	snapc = kmalloc(size, GFP_KERNEL);
274135d489f9SAlex Elder 	if (!snapc) {
274235d489f9SAlex Elder 		ret = -ENOMEM;
274335d489f9SAlex Elder 		goto out;
274435d489f9SAlex Elder 	}
274535d489f9SAlex Elder 
274635d489f9SAlex Elder 	atomic_set(&snapc->nref, 1);
274735d489f9SAlex Elder 	snapc->seq = seq;
274835d489f9SAlex Elder 	snapc->num_snaps = snap_count;
274935d489f9SAlex Elder 	for (i = 0; i < snap_count; i++)
275035d489f9SAlex Elder 		snapc->snaps[i] = ceph_decode_64(&p);
275135d489f9SAlex Elder 
275235d489f9SAlex Elder 	rbd_dev->header.snapc = snapc;
275335d489f9SAlex Elder 
275435d489f9SAlex Elder 	dout("  snap context seq = %llu, snap_count = %u\n",
275535d489f9SAlex Elder 		(unsigned long long) seq, (unsigned int) snap_count);
275635d489f9SAlex Elder 
275735d489f9SAlex Elder out:
275835d489f9SAlex Elder 	kfree(reply_buf);
275935d489f9SAlex Elder 
276035d489f9SAlex Elder 	return 0;
276135d489f9SAlex Elder }
276235d489f9SAlex Elder 
2763b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
2764b8b1e2dbSAlex Elder {
2765b8b1e2dbSAlex Elder 	size_t size;
2766b8b1e2dbSAlex Elder 	void *reply_buf;
2767b8b1e2dbSAlex Elder 	__le64 snap_id;
2768b8b1e2dbSAlex Elder 	int ret;
2769b8b1e2dbSAlex Elder 	void *p;
2770b8b1e2dbSAlex Elder 	void *end;
2771b8b1e2dbSAlex Elder 	char *snap_name;
2772b8b1e2dbSAlex Elder 
2773b8b1e2dbSAlex Elder 	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
2774b8b1e2dbSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
2775b8b1e2dbSAlex Elder 	if (!reply_buf)
2776b8b1e2dbSAlex Elder 		return ERR_PTR(-ENOMEM);
2777b8b1e2dbSAlex Elder 
2778b8b1e2dbSAlex Elder 	snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
2779b8b1e2dbSAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2780b8b1e2dbSAlex Elder 				"rbd", "get_snapshot_name",
2781b8b1e2dbSAlex Elder 				(char *) &snap_id, sizeof (snap_id),
278207b2391fSAlex Elder 				reply_buf, size, NULL);
2783b8b1e2dbSAlex Elder 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2784b8b1e2dbSAlex Elder 	if (ret < 0)
2785b8b1e2dbSAlex Elder 		goto out;
2786b8b1e2dbSAlex Elder 
2787b8b1e2dbSAlex Elder 	p = reply_buf;
2788b8b1e2dbSAlex Elder 	end = (char *) reply_buf + size;
2789e5c35534SAlex Elder 	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
2790b8b1e2dbSAlex Elder 	if (IS_ERR(snap_name)) {
2791b8b1e2dbSAlex Elder 		ret = PTR_ERR(snap_name);
2792b8b1e2dbSAlex Elder 		goto out;
2793b8b1e2dbSAlex Elder 	} else {
2794b8b1e2dbSAlex Elder 		dout("  snap_id 0x%016llx snap_name = %s\n",
2795b8b1e2dbSAlex Elder 			(unsigned long long) le64_to_cpu(snap_id), snap_name);
2796b8b1e2dbSAlex Elder 	}
2797b8b1e2dbSAlex Elder 	kfree(reply_buf);
2798b8b1e2dbSAlex Elder 
2799b8b1e2dbSAlex Elder 	return snap_name;
2800b8b1e2dbSAlex Elder out:
2801b8b1e2dbSAlex Elder 	kfree(reply_buf);
2802b8b1e2dbSAlex Elder 
2803b8b1e2dbSAlex Elder 	return ERR_PTR(ret);
2804b8b1e2dbSAlex Elder }
2805b8b1e2dbSAlex Elder 
2806b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
2807b8b1e2dbSAlex Elder 		u64 *snap_size, u64 *snap_features)
2808b8b1e2dbSAlex Elder {
2809b8b1e2dbSAlex Elder 	__le64 snap_id;
2810b8b1e2dbSAlex Elder 	u8 order;
2811b8b1e2dbSAlex Elder 	int ret;
2812b8b1e2dbSAlex Elder 
2813b8b1e2dbSAlex Elder 	snap_id = rbd_dev->header.snapc->snaps[which];
2814b8b1e2dbSAlex Elder 	ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
2815b8b1e2dbSAlex Elder 	if (ret)
2816b8b1e2dbSAlex Elder 		return ERR_PTR(ret);
2817b8b1e2dbSAlex Elder 	ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
2818b8b1e2dbSAlex Elder 	if (ret)
2819b8b1e2dbSAlex Elder 		return ERR_PTR(ret);
2820b8b1e2dbSAlex Elder 
2821b8b1e2dbSAlex Elder 	return rbd_dev_v2_snap_name(rbd_dev, which);
2822b8b1e2dbSAlex Elder }
2823b8b1e2dbSAlex Elder 
2824b8b1e2dbSAlex Elder static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
2825b8b1e2dbSAlex Elder 		u64 *snap_size, u64 *snap_features)
2826b8b1e2dbSAlex Elder {
2827b8b1e2dbSAlex Elder 	if (rbd_dev->image_format == 1)
2828b8b1e2dbSAlex Elder 		return rbd_dev_v1_snap_info(rbd_dev, which,
2829b8b1e2dbSAlex Elder 					snap_size, snap_features);
2830b8b1e2dbSAlex Elder 	if (rbd_dev->image_format == 2)
2831b8b1e2dbSAlex Elder 		return rbd_dev_v2_snap_info(rbd_dev, which,
2832b8b1e2dbSAlex Elder 					snap_size, snap_features);
2833b8b1e2dbSAlex Elder 	return ERR_PTR(-EINVAL);
2834b8b1e2dbSAlex Elder }
2835b8b1e2dbSAlex Elder 
2836117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
2837117973fbSAlex Elder {
2838117973fbSAlex Elder 	int ret;
2839117973fbSAlex Elder 	__u8 obj_order;
2840117973fbSAlex Elder 
2841117973fbSAlex Elder 	down_write(&rbd_dev->header_rwsem);
2842117973fbSAlex Elder 
2843117973fbSAlex Elder 	/* Grab old order first, to see if it changes */
2844117973fbSAlex Elder 
2845117973fbSAlex Elder 	obj_order = rbd_dev->header.obj_order,
2846117973fbSAlex Elder 	ret = rbd_dev_v2_image_size(rbd_dev);
2847117973fbSAlex Elder 	if (ret)
2848117973fbSAlex Elder 		goto out;
2849117973fbSAlex Elder 	if (rbd_dev->header.obj_order != obj_order) {
2850117973fbSAlex Elder 		ret = -EIO;
2851117973fbSAlex Elder 		goto out;
2852117973fbSAlex Elder 	}
2853117973fbSAlex Elder 	rbd_update_mapping_size(rbd_dev);
2854117973fbSAlex Elder 
2855117973fbSAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev, hver);
2856117973fbSAlex Elder 	dout("rbd_dev_v2_snap_context returned %d\n", ret);
2857117973fbSAlex Elder 	if (ret)
2858117973fbSAlex Elder 		goto out;
2859117973fbSAlex Elder 	ret = rbd_dev_snaps_update(rbd_dev);
2860117973fbSAlex Elder 	dout("rbd_dev_snaps_update returned %d\n", ret);
2861117973fbSAlex Elder 	if (ret)
2862117973fbSAlex Elder 		goto out;
2863117973fbSAlex Elder 	ret = rbd_dev_snaps_register(rbd_dev);
2864117973fbSAlex Elder 	dout("rbd_dev_snaps_register returned %d\n", ret);
2865117973fbSAlex Elder out:
2866117973fbSAlex Elder 	up_write(&rbd_dev->header_rwsem);
2867117973fbSAlex Elder 
2868117973fbSAlex Elder 	return ret;
2869117973fbSAlex Elder }
2870117973fbSAlex Elder 
28719d475de5SAlex Elder /*
287235938150SAlex Elder  * Scan the rbd device's current snapshot list and compare it to the
287335938150SAlex Elder  * newly-received snapshot context.  Remove any existing snapshots
287435938150SAlex Elder  * not present in the new snapshot context.  Add a new snapshot for
287535938150SAlex Elder  * any snaphots in the snapshot context not in the current list.
287635938150SAlex Elder  * And verify there are no changes to snapshots we already know
287735938150SAlex Elder  * about.
287835938150SAlex Elder  *
287935938150SAlex Elder  * Assumes the snapshots in the snapshot context are sorted by
288035938150SAlex Elder  * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
288135938150SAlex Elder  * are also maintained in that order.)
2882dfc5606dSYehuda Sadeh  */
2883304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
2884dfc5606dSYehuda Sadeh {
288535938150SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
288635938150SAlex Elder 	const u32 snap_count = snapc->num_snaps;
288735938150SAlex Elder 	struct list_head *head = &rbd_dev->snaps;
288835938150SAlex Elder 	struct list_head *links = head->next;
288935938150SAlex Elder 	u32 index = 0;
2890dfc5606dSYehuda Sadeh 
28919fcbb800SAlex Elder 	dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
289235938150SAlex Elder 	while (index < snap_count || links != head) {
289335938150SAlex Elder 		u64 snap_id;
289435938150SAlex Elder 		struct rbd_snap *snap;
2895cd892126SAlex Elder 		char *snap_name;
2896cd892126SAlex Elder 		u64 snap_size = 0;
2897cd892126SAlex Elder 		u64 snap_features = 0;
2898dfc5606dSYehuda Sadeh 
289935938150SAlex Elder 		snap_id = index < snap_count ? snapc->snaps[index]
290035938150SAlex Elder 					     : CEPH_NOSNAP;
290135938150SAlex Elder 		snap = links != head ? list_entry(links, struct rbd_snap, node)
290235938150SAlex Elder 				     : NULL;
2903aafb230eSAlex Elder 		rbd_assert(!snap || snap->id != CEPH_NOSNAP);
2904dfc5606dSYehuda Sadeh 
290535938150SAlex Elder 		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
290635938150SAlex Elder 			struct list_head *next = links->next;
2907dfc5606dSYehuda Sadeh 
290835938150SAlex Elder 			/* Existing snapshot not in the new snap context */
2909dfc5606dSYehuda Sadeh 
29100d7dbfceSAlex Elder 			if (rbd_dev->spec->snap_id == snap->id)
2911d78b650aSAlex Elder 				atomic_set(&rbd_dev->exists, 0);
291241f38c2bSAlex Elder 			rbd_remove_snap_dev(snap);
29139fcbb800SAlex Elder 			dout("%ssnap id %llu has been removed\n",
29140d7dbfceSAlex Elder 				rbd_dev->spec->snap_id == snap->id ?
29150d7dbfceSAlex Elder 							"mapped " : "",
29169fcbb800SAlex Elder 				(unsigned long long) snap->id);
2917dfc5606dSYehuda Sadeh 
291835938150SAlex Elder 			/* Done with this list entry; advance */
291935938150SAlex Elder 
292035938150SAlex Elder 			links = next;
292135938150SAlex Elder 			continue;
2922dfc5606dSYehuda Sadeh 		}
292335938150SAlex Elder 
2924b8b1e2dbSAlex Elder 		snap_name = rbd_dev_snap_info(rbd_dev, index,
2925cd892126SAlex Elder 					&snap_size, &snap_features);
2926cd892126SAlex Elder 		if (IS_ERR(snap_name))
2927cd892126SAlex Elder 			return PTR_ERR(snap_name);
2928cd892126SAlex Elder 
29299fcbb800SAlex Elder 		dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
29309fcbb800SAlex Elder 			(unsigned long long) snap_id);
293135938150SAlex Elder 		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
293235938150SAlex Elder 			struct rbd_snap *new_snap;
293335938150SAlex Elder 
293435938150SAlex Elder 			/* We haven't seen this snapshot before */
293535938150SAlex Elder 
2936c8d18425SAlex Elder 			new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
2937cd892126SAlex Elder 					snap_id, snap_size, snap_features);
29389fcbb800SAlex Elder 			if (IS_ERR(new_snap)) {
29399fcbb800SAlex Elder 				int err = PTR_ERR(new_snap);
29409fcbb800SAlex Elder 
29419fcbb800SAlex Elder 				dout("  failed to add dev, error %d\n", err);
29429fcbb800SAlex Elder 
29439fcbb800SAlex Elder 				return err;
29449fcbb800SAlex Elder 			}
294535938150SAlex Elder 
294635938150SAlex Elder 			/* New goes before existing, or at end of list */
294735938150SAlex Elder 
29489fcbb800SAlex Elder 			dout("  added dev%s\n", snap ? "" : " at end\n");
294935938150SAlex Elder 			if (snap)
295035938150SAlex Elder 				list_add_tail(&new_snap->node, &snap->node);
295135938150SAlex Elder 			else
2952523f3258SAlex Elder 				list_add_tail(&new_snap->node, head);
295335938150SAlex Elder 		} else {
295435938150SAlex Elder 			/* Already have this one */
295535938150SAlex Elder 
29569fcbb800SAlex Elder 			dout("  already present\n");
29579fcbb800SAlex Elder 
2958cd892126SAlex Elder 			rbd_assert(snap->size == snap_size);
2959aafb230eSAlex Elder 			rbd_assert(!strcmp(snap->name, snap_name));
2960cd892126SAlex Elder 			rbd_assert(snap->features == snap_features);
296135938150SAlex Elder 
296235938150SAlex Elder 			/* Done with this list entry; advance */
296335938150SAlex Elder 
296435938150SAlex Elder 			links = links->next;
2965dfc5606dSYehuda Sadeh 		}
296635938150SAlex Elder 
296735938150SAlex Elder 		/* Advance to the next entry in the snapshot context */
296835938150SAlex Elder 
296935938150SAlex Elder 		index++;
2970dfc5606dSYehuda Sadeh 	}
29719fcbb800SAlex Elder 	dout("%s: done\n", __func__);
2972dfc5606dSYehuda Sadeh 
2973dfc5606dSYehuda Sadeh 	return 0;
2974dfc5606dSYehuda Sadeh }
2975dfc5606dSYehuda Sadeh 
2976304f6808SAlex Elder /*
2977304f6808SAlex Elder  * Scan the list of snapshots and register the devices for any that
2978304f6808SAlex Elder  * have not already been registered.
2979304f6808SAlex Elder  */
2980304f6808SAlex Elder static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
2981304f6808SAlex Elder {
2982304f6808SAlex Elder 	struct rbd_snap *snap;
2983304f6808SAlex Elder 	int ret = 0;
2984304f6808SAlex Elder 
2985304f6808SAlex Elder 	dout("%s called\n", __func__);
298686ff77bbSAlex Elder 	if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
298786ff77bbSAlex Elder 		return -EIO;
2988304f6808SAlex Elder 
2989304f6808SAlex Elder 	list_for_each_entry(snap, &rbd_dev->snaps, node) {
2990304f6808SAlex Elder 		if (!rbd_snap_registered(snap)) {
2991304f6808SAlex Elder 			ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2992304f6808SAlex Elder 			if (ret < 0)
2993304f6808SAlex Elder 				break;
2994304f6808SAlex Elder 		}
2995304f6808SAlex Elder 	}
2996304f6808SAlex Elder 	dout("%s: returning %d\n", __func__, ret);
2997304f6808SAlex Elder 
2998304f6808SAlex Elder 	return ret;
2999304f6808SAlex Elder }
3000304f6808SAlex Elder 
3001dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
3002dfc5606dSYehuda Sadeh {
3003dfc5606dSYehuda Sadeh 	struct device *dev;
3004cd789ab9SAlex Elder 	int ret;
3005dfc5606dSYehuda Sadeh 
3006dfc5606dSYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3007dfc5606dSYehuda Sadeh 
3008cd789ab9SAlex Elder 	dev = &rbd_dev->dev;
3009dfc5606dSYehuda Sadeh 	dev->bus = &rbd_bus_type;
3010dfc5606dSYehuda Sadeh 	dev->type = &rbd_device_type;
3011dfc5606dSYehuda Sadeh 	dev->parent = &rbd_root_dev;
3012dfc5606dSYehuda Sadeh 	dev->release = rbd_dev_release;
3013de71a297SAlex Elder 	dev_set_name(dev, "%d", rbd_dev->dev_id);
3014dfc5606dSYehuda Sadeh 	ret = device_register(dev);
3015dfc5606dSYehuda Sadeh 
3016dfc5606dSYehuda Sadeh 	mutex_unlock(&ctl_mutex);
3017cd789ab9SAlex Elder 
3018dfc5606dSYehuda Sadeh 	return ret;
3019602adf40SYehuda Sadeh }
3020602adf40SYehuda Sadeh 
3021dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
3022dfc5606dSYehuda Sadeh {
3023dfc5606dSYehuda Sadeh 	device_unregister(&rbd_dev->dev);
3024dfc5606dSYehuda Sadeh }
3025dfc5606dSYehuda Sadeh 
302659c2be1eSYehuda Sadeh static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
302759c2be1eSYehuda Sadeh {
302859c2be1eSYehuda Sadeh 	int ret, rc;
302959c2be1eSYehuda Sadeh 
303059c2be1eSYehuda Sadeh 	do {
3031907703d0SAlex Elder 		ret = rbd_req_sync_watch(rbd_dev, 1);
303259c2be1eSYehuda Sadeh 		if (ret == -ERANGE) {
3033117973fbSAlex Elder 			rc = rbd_dev_refresh(rbd_dev, NULL);
303459c2be1eSYehuda Sadeh 			if (rc < 0)
303559c2be1eSYehuda Sadeh 				return rc;
303659c2be1eSYehuda Sadeh 		}
303759c2be1eSYehuda Sadeh 	} while (ret == -ERANGE);
303859c2be1eSYehuda Sadeh 
303959c2be1eSYehuda Sadeh 	return ret;
304059c2be1eSYehuda Sadeh }
304159c2be1eSYehuda Sadeh 
3042e2839308SAlex Elder static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
30431ddbe94eSAlex Elder 
30441ddbe94eSAlex Elder /*
3045499afd5bSAlex Elder  * Get a unique rbd identifier for the given new rbd_dev, and add
3046499afd5bSAlex Elder  * the rbd_dev to the global list.  The minimum rbd id is 1.
30471ddbe94eSAlex Elder  */
3048e2839308SAlex Elder static void rbd_dev_id_get(struct rbd_device *rbd_dev)
3049b7f23c36SAlex Elder {
3050e2839308SAlex Elder 	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
3051499afd5bSAlex Elder 
3052499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
3053499afd5bSAlex Elder 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
3054499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
3055e2839308SAlex Elder 	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
3056e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
3057b7f23c36SAlex Elder }
3058b7f23c36SAlex Elder 
30591ddbe94eSAlex Elder /*
3060499afd5bSAlex Elder  * Remove an rbd_dev from the global list, and record that its
3061499afd5bSAlex Elder  * identifier is no longer in use.
30621ddbe94eSAlex Elder  */
3063e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev)
30641ddbe94eSAlex Elder {
3065d184f6bfSAlex Elder 	struct list_head *tmp;
3066de71a297SAlex Elder 	int rbd_id = rbd_dev->dev_id;
3067d184f6bfSAlex Elder 	int max_id;
3068d184f6bfSAlex Elder 
3069aafb230eSAlex Elder 	rbd_assert(rbd_id > 0);
3070499afd5bSAlex Elder 
3071e2839308SAlex Elder 	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
3072e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
3073499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
3074499afd5bSAlex Elder 	list_del_init(&rbd_dev->node);
3075d184f6bfSAlex Elder 
3076d184f6bfSAlex Elder 	/*
3077d184f6bfSAlex Elder 	 * If the id being "put" is not the current maximum, there
3078d184f6bfSAlex Elder 	 * is nothing special we need to do.
3079d184f6bfSAlex Elder 	 */
3080e2839308SAlex Elder 	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
3081d184f6bfSAlex Elder 		spin_unlock(&rbd_dev_list_lock);
3082d184f6bfSAlex Elder 		return;
3083d184f6bfSAlex Elder 	}
3084d184f6bfSAlex Elder 
3085d184f6bfSAlex Elder 	/*
3086d184f6bfSAlex Elder 	 * We need to update the current maximum id.  Search the
3087d184f6bfSAlex Elder 	 * list to find out what it is.  We're more likely to find
3088d184f6bfSAlex Elder 	 * the maximum at the end, so search the list backward.
3089d184f6bfSAlex Elder 	 */
3090d184f6bfSAlex Elder 	max_id = 0;
3091d184f6bfSAlex Elder 	list_for_each_prev(tmp, &rbd_dev_list) {
3092d184f6bfSAlex Elder 		struct rbd_device *rbd_dev;
3093d184f6bfSAlex Elder 
3094d184f6bfSAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
3095b213e0b1SAlex Elder 		if (rbd_dev->dev_id > max_id)
3096b213e0b1SAlex Elder 			max_id = rbd_dev->dev_id;
3097d184f6bfSAlex Elder 	}
3098499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
30991ddbe94eSAlex Elder 
31001ddbe94eSAlex Elder 	/*
3101e2839308SAlex Elder 	 * The max id could have been updated by rbd_dev_id_get(), in
3102d184f6bfSAlex Elder 	 * which case it now accurately reflects the new maximum.
3103d184f6bfSAlex Elder 	 * Be careful not to overwrite the maximum value in that
3104d184f6bfSAlex Elder 	 * case.
31051ddbe94eSAlex Elder 	 */
3106e2839308SAlex Elder 	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
3107e2839308SAlex Elder 	dout("  max dev id has been reset\n");
3108b7f23c36SAlex Elder }
3109b7f23c36SAlex Elder 
/*
 * Advances *buf past any leading white space so that it points at
 * the first non-space character (or at the terminating '\0' if there
 * is none).  Returns the length of the token (the run of non-white
 * space characters) that starts there; 0 means no token was found.
 * Note that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * These are the characters that produce nonzero for
	 * isspace() in the "C" and "POSIX" locales.
	 */
	const char *whitespace = " \f\n\r\t\v";
	const char *start = *buf;

	start += strspn(start, whitespace);	/* Skip leading white space */
	*buf = start;

	return strcspn(start, whitespace);	/* Length of the token */
}
3128e28fff26SAlex Elder 
/*
 * Locates the next token in *buf and, provided it fits (with its
 * terminating '\0') in the token buffer, copies it there.  A copied
 * token is always terminated with '\0'.  Note that *buf must be
 * terminated with '\0' on entry.
 *
 * Returns the length of the token found (not including the '\0').
 * The return value is 0 if no token is found, and is >= token_size
 * if the token did not fit, in which case nothing is copied.
 *
 * In every case *buf is left pointing just past the token that was
 * found, even when the token buffer was too small to receive it.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	const size_t token_len = next_token(buf);

	if (token_len < token_size) {
		memcpy(token, *buf, token_len);
		token[token_len] = '\0';
	}
	*buf += token_len;

	return token_len;
}
3158e28fff26SAlex Elder 
3159e28fff26SAlex Elder /*
3160ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
3161ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
3162ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
3163ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
3164ea3352f4SAlex Elder  *
3165ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
3166ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
3167ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
3168ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
3169ea3352f4SAlex Elder  *
3170ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
3171ea3352f4SAlex Elder  * the end of the found token.
3172ea3352f4SAlex Elder  *
3173ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
3174ea3352f4SAlex Elder  */
3175ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
3176ea3352f4SAlex Elder {
3177ea3352f4SAlex Elder 	char *dup;
3178ea3352f4SAlex Elder 	size_t len;
3179ea3352f4SAlex Elder 
3180ea3352f4SAlex Elder 	len = next_token(buf);
31814caf35f9SAlex Elder 	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
3182ea3352f4SAlex Elder 	if (!dup)
3183ea3352f4SAlex Elder 		return NULL;
3184ea3352f4SAlex Elder 	*(dup + len) = '\0';
3185ea3352f4SAlex Elder 	*buf += len;
3186ea3352f4SAlex Elder 
3187ea3352f4SAlex Elder 	if (lenp)
3188ea3352f4SAlex Elder 		*lenp = len;
3189ea3352f4SAlex Elder 
3190ea3352f4SAlex Elder 	return dup;
3191ea3352f4SAlex Elder }
3192ea3352f4SAlex Elder 
3193ea3352f4SAlex Elder /*
3194859c31dfSAlex Elder  * Parse the options provided for an "rbd add" (i.e., rbd image
3195859c31dfSAlex Elder  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
3196859c31dfSAlex Elder  * and the data written is passed here via a NUL-terminated buffer.
3197859c31dfSAlex Elder  * Returns 0 if successful or an error code otherwise.
3198d22f76e7SAlex Elder  *
3199859c31dfSAlex Elder  * The information extracted from these options is recorded in
3200859c31dfSAlex Elder  * the other parameters which return dynamically-allocated
3201859c31dfSAlex Elder  * structures:
3202859c31dfSAlex Elder  *  ceph_opts
3203859c31dfSAlex Elder  *      The address of a pointer that will refer to a ceph options
3204859c31dfSAlex Elder  *      structure.  Caller must release the returned pointer using
3205859c31dfSAlex Elder  *      ceph_destroy_options() when it is no longer needed.
 *  opts
 *	Address of an rbd options pointer.  Fully initialized by
 *	this function; caller must release with kfree().
 *  rbd_spec
 *	Address of an rbd image specification pointer.  Fully
 *	initialized by this function based on parsed options.
 *	Caller must release with rbd_spec_put().
3213859c31dfSAlex Elder  *
 * The options passed take this form:
 *  <mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
 * where:
 *  <mon_addrs>
 *      A comma-separated list of one or more monitor addresses.
 *      A monitor address is an ip address, optionally followed
 *      by a port number (separated by a colon).
 *        I.e.:  ip1[:port1][,ip2[:port2]...]
 *  <options>
 *      A comma-separated list of ceph and/or rbd options.
 *  <pool_name>
 *      The name of the rados pool containing the rbd image.
 *  <image_name>
 *      The name of the image in that pool to map.
 *  <snap_name>
 *      An optional snapshot name.  If provided, the mapping will
 *      present data from the image at the time that snapshot was
 *      created.  The image head is used if no snapshot name is
 *      provided.  Snapshot mappings are always read-only.
3233a725f65eSAlex Elder  */
3234859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf,
3235dc79b113SAlex Elder 				struct ceph_options **ceph_opts,
3236859c31dfSAlex Elder 				struct rbd_options **opts,
3237859c31dfSAlex Elder 				struct rbd_spec **rbd_spec)
3238a725f65eSAlex Elder {
3239e28fff26SAlex Elder 	size_t len;
3240859c31dfSAlex Elder 	char *options;
32410ddebc0cSAlex Elder 	const char *mon_addrs;
32420ddebc0cSAlex Elder 	size_t mon_addrs_size;
3243859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
32444e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
3245859c31dfSAlex Elder 	struct ceph_options *copts;
3246dc79b113SAlex Elder 	int ret;
3247e28fff26SAlex Elder 
3248e28fff26SAlex Elder 	/* The first four tokens are required */
3249e28fff26SAlex Elder 
32507ef3214aSAlex Elder 	len = next_token(&buf);
32514fb5d671SAlex Elder 	if (!len) {
32524fb5d671SAlex Elder 		rbd_warn(NULL, "no monitor address(es) provided");
32534fb5d671SAlex Elder 		return -EINVAL;
32544fb5d671SAlex Elder 	}
32550ddebc0cSAlex Elder 	mon_addrs = buf;
3256f28e565aSAlex Elder 	mon_addrs_size = len + 1;
32577ef3214aSAlex Elder 	buf += len;
3258a725f65eSAlex Elder 
3259dc79b113SAlex Elder 	ret = -EINVAL;
3260f28e565aSAlex Elder 	options = dup_token(&buf, NULL);
3261f28e565aSAlex Elder 	if (!options)
3262dc79b113SAlex Elder 		return -ENOMEM;
32634fb5d671SAlex Elder 	if (!*options) {
32644fb5d671SAlex Elder 		rbd_warn(NULL, "no options provided");
32654fb5d671SAlex Elder 		goto out_err;
32664fb5d671SAlex Elder 	}
3267a725f65eSAlex Elder 
3268859c31dfSAlex Elder 	spec = rbd_spec_alloc();
3269859c31dfSAlex Elder 	if (!spec)
3270f28e565aSAlex Elder 		goto out_mem;
3271859c31dfSAlex Elder 
3272859c31dfSAlex Elder 	spec->pool_name = dup_token(&buf, NULL);
3273859c31dfSAlex Elder 	if (!spec->pool_name)
3274859c31dfSAlex Elder 		goto out_mem;
32754fb5d671SAlex Elder 	if (!*spec->pool_name) {
32764fb5d671SAlex Elder 		rbd_warn(NULL, "no pool name provided");
32774fb5d671SAlex Elder 		goto out_err;
32784fb5d671SAlex Elder 	}
3279e28fff26SAlex Elder 
328069e7a02fSAlex Elder 	spec->image_name = dup_token(&buf, NULL);
3281859c31dfSAlex Elder 	if (!spec->image_name)
3282f28e565aSAlex Elder 		goto out_mem;
32834fb5d671SAlex Elder 	if (!*spec->image_name) {
32844fb5d671SAlex Elder 		rbd_warn(NULL, "no image name provided");
32854fb5d671SAlex Elder 		goto out_err;
32864fb5d671SAlex Elder 	}
3287e28fff26SAlex Elder 
3288f28e565aSAlex Elder 	/*
3289f28e565aSAlex Elder 	 * Snapshot name is optional; default is to use "-"
3290f28e565aSAlex Elder 	 * (indicating the head/no snapshot).
3291f28e565aSAlex Elder 	 */
32923feeb894SAlex Elder 	len = next_token(&buf);
3293820a5f3eSAlex Elder 	if (!len) {
32943feeb894SAlex Elder 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
32953feeb894SAlex Elder 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
3296f28e565aSAlex Elder 	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
3297dc79b113SAlex Elder 		ret = -ENAMETOOLONG;
3298f28e565aSAlex Elder 		goto out_err;
3299849b4260SAlex Elder 	}
33004caf35f9SAlex Elder 	spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
3301859c31dfSAlex Elder 	if (!spec->snap_name)
3302f28e565aSAlex Elder 		goto out_mem;
3303859c31dfSAlex Elder 	*(spec->snap_name + len) = '\0';
3304e5c35534SAlex Elder 
33050ddebc0cSAlex Elder 	/* Initialize all rbd options to the defaults */
3306e28fff26SAlex Elder 
33074e9afebaSAlex Elder 	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
33084e9afebaSAlex Elder 	if (!rbd_opts)
33094e9afebaSAlex Elder 		goto out_mem;
33104e9afebaSAlex Elder 
33114e9afebaSAlex Elder 	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
3312d22f76e7SAlex Elder 
3313859c31dfSAlex Elder 	copts = ceph_parse_options(options, mon_addrs,
33140ddebc0cSAlex Elder 					mon_addrs + mon_addrs_size - 1,
33154e9afebaSAlex Elder 					parse_rbd_opts_token, rbd_opts);
3316859c31dfSAlex Elder 	if (IS_ERR(copts)) {
3317859c31dfSAlex Elder 		ret = PTR_ERR(copts);
3318dc79b113SAlex Elder 		goto out_err;
3319dc79b113SAlex Elder 	}
3320859c31dfSAlex Elder 	kfree(options);
3321859c31dfSAlex Elder 
3322859c31dfSAlex Elder 	*ceph_opts = copts;
33234e9afebaSAlex Elder 	*opts = rbd_opts;
3324859c31dfSAlex Elder 	*rbd_spec = spec;
33250ddebc0cSAlex Elder 
3326dc79b113SAlex Elder 	return 0;
3327f28e565aSAlex Elder out_mem:
3328dc79b113SAlex Elder 	ret = -ENOMEM;
3329d22f76e7SAlex Elder out_err:
3330859c31dfSAlex Elder 	kfree(rbd_opts);
3331859c31dfSAlex Elder 	rbd_spec_put(spec);
3332f28e565aSAlex Elder 	kfree(options);
3333d22f76e7SAlex Elder 
3334dc79b113SAlex Elder 	return ret;
3335a725f65eSAlex Elder }
3336a725f65eSAlex Elder 
3337589d30e0SAlex Elder /*
3338589d30e0SAlex Elder  * An rbd format 2 image has a unique identifier, distinct from the
3339589d30e0SAlex Elder  * name given to it by the user.  Internally, that identifier is
3340589d30e0SAlex Elder  * what's used to specify the names of objects related to the image.
3341589d30e0SAlex Elder  *
3342589d30e0SAlex Elder  * A special "rbd id" object is used to map an rbd image name to its
3343589d30e0SAlex Elder  * id.  If that object doesn't exist, then there is no v2 rbd image
3344589d30e0SAlex Elder  * with the supplied name.
3345589d30e0SAlex Elder  *
3346589d30e0SAlex Elder  * This function will record the given rbd_dev's image_id field if
3347589d30e0SAlex Elder  * it can be determined, and in that case will return 0.  If any
3348589d30e0SAlex Elder  * errors occur a negative errno will be returned and the rbd_dev's
3349589d30e0SAlex Elder  * image_id field will be unchanged (and should be NULL).
3350589d30e0SAlex Elder  */
3351589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev)
3352589d30e0SAlex Elder {
3353589d30e0SAlex Elder 	int ret;
3354589d30e0SAlex Elder 	size_t size;
3355589d30e0SAlex Elder 	char *object_name;
3356589d30e0SAlex Elder 	void *response;
3357589d30e0SAlex Elder 	void *p;
3358589d30e0SAlex Elder 
3359589d30e0SAlex Elder 	/*
33602c0d0a10SAlex Elder 	 * When probing a parent image, the image id is already
33612c0d0a10SAlex Elder 	 * known (and the image name likely is not).  There's no
33622c0d0a10SAlex Elder 	 * need to fetch the image id again in this case.
33632c0d0a10SAlex Elder 	 */
33642c0d0a10SAlex Elder 	if (rbd_dev->spec->image_id)
33652c0d0a10SAlex Elder 		return 0;
33662c0d0a10SAlex Elder 
33672c0d0a10SAlex Elder 	/*
3368589d30e0SAlex Elder 	 * First, see if the format 2 image id file exists, and if
3369589d30e0SAlex Elder 	 * so, get the image's persistent id from it.
3370589d30e0SAlex Elder 	 */
337169e7a02fSAlex Elder 	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
3372589d30e0SAlex Elder 	object_name = kmalloc(size, GFP_NOIO);
3373589d30e0SAlex Elder 	if (!object_name)
3374589d30e0SAlex Elder 		return -ENOMEM;
33750d7dbfceSAlex Elder 	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
3376589d30e0SAlex Elder 	dout("rbd id object name is %s\n", object_name);
3377589d30e0SAlex Elder 
3378589d30e0SAlex Elder 	/* Response will be an encoded string, which includes a length */
3379589d30e0SAlex Elder 
3380589d30e0SAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
3381589d30e0SAlex Elder 	response = kzalloc(size, GFP_NOIO);
3382589d30e0SAlex Elder 	if (!response) {
3383589d30e0SAlex Elder 		ret = -ENOMEM;
3384589d30e0SAlex Elder 		goto out;
3385589d30e0SAlex Elder 	}
3386589d30e0SAlex Elder 
3387589d30e0SAlex Elder 	ret = rbd_req_sync_exec(rbd_dev, object_name,
3388589d30e0SAlex Elder 				"rbd", "get_id",
3389589d30e0SAlex Elder 				NULL, 0,
339007b2391fSAlex Elder 				response, RBD_IMAGE_ID_LEN_MAX, NULL);
3391589d30e0SAlex Elder 	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
3392589d30e0SAlex Elder 	if (ret < 0)
3393589d30e0SAlex Elder 		goto out;
3394a0ea3a40SAlex Elder 	ret = 0;    /* rbd_req_sync_exec() can return positive */
3395589d30e0SAlex Elder 
3396589d30e0SAlex Elder 	p = response;
33970d7dbfceSAlex Elder 	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
3398589d30e0SAlex Elder 						p + RBD_IMAGE_ID_LEN_MAX,
3399979ed480SAlex Elder 						NULL, GFP_NOIO);
34000d7dbfceSAlex Elder 	if (IS_ERR(rbd_dev->spec->image_id)) {
34010d7dbfceSAlex Elder 		ret = PTR_ERR(rbd_dev->spec->image_id);
34020d7dbfceSAlex Elder 		rbd_dev->spec->image_id = NULL;
3403589d30e0SAlex Elder 	} else {
34040d7dbfceSAlex Elder 		dout("image_id is %s\n", rbd_dev->spec->image_id);
3405589d30e0SAlex Elder 	}
3406589d30e0SAlex Elder out:
3407589d30e0SAlex Elder 	kfree(response);
3408589d30e0SAlex Elder 	kfree(object_name);
3409589d30e0SAlex Elder 
3410589d30e0SAlex Elder 	return ret;
3411589d30e0SAlex Elder }
3412589d30e0SAlex Elder 
3413a30b71b9SAlex Elder static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
3414a30b71b9SAlex Elder {
3415a30b71b9SAlex Elder 	int ret;
3416a30b71b9SAlex Elder 	size_t size;
3417a30b71b9SAlex Elder 
3418a30b71b9SAlex Elder 	/* Version 1 images have no id; empty string is used */
3419a30b71b9SAlex Elder 
34200d7dbfceSAlex Elder 	rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
34210d7dbfceSAlex Elder 	if (!rbd_dev->spec->image_id)
3422a30b71b9SAlex Elder 		return -ENOMEM;
3423a30b71b9SAlex Elder 
3424a30b71b9SAlex Elder 	/* Record the header object name for this rbd image. */
3425a30b71b9SAlex Elder 
342669e7a02fSAlex Elder 	size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
3427a30b71b9SAlex Elder 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3428a30b71b9SAlex Elder 	if (!rbd_dev->header_name) {
3429a30b71b9SAlex Elder 		ret = -ENOMEM;
3430a30b71b9SAlex Elder 		goto out_err;
3431a30b71b9SAlex Elder 	}
34320d7dbfceSAlex Elder 	sprintf(rbd_dev->header_name, "%s%s",
34330d7dbfceSAlex Elder 		rbd_dev->spec->image_name, RBD_SUFFIX);
3434a30b71b9SAlex Elder 
3435a30b71b9SAlex Elder 	/* Populate rbd image metadata */
3436a30b71b9SAlex Elder 
3437a30b71b9SAlex Elder 	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
3438a30b71b9SAlex Elder 	if (ret < 0)
3439a30b71b9SAlex Elder 		goto out_err;
344086b00e0dSAlex Elder 
344186b00e0dSAlex Elder 	/* Version 1 images have no parent (no layering) */
344286b00e0dSAlex Elder 
344386b00e0dSAlex Elder 	rbd_dev->parent_spec = NULL;
344486b00e0dSAlex Elder 	rbd_dev->parent_overlap = 0;
344586b00e0dSAlex Elder 
3446a30b71b9SAlex Elder 	rbd_dev->image_format = 1;
3447a30b71b9SAlex Elder 
3448a30b71b9SAlex Elder 	dout("discovered version 1 image, header name is %s\n",
3449a30b71b9SAlex Elder 		rbd_dev->header_name);
3450a30b71b9SAlex Elder 
3451a30b71b9SAlex Elder 	return 0;
3452a30b71b9SAlex Elder 
3453a30b71b9SAlex Elder out_err:
3454a30b71b9SAlex Elder 	kfree(rbd_dev->header_name);
3455a30b71b9SAlex Elder 	rbd_dev->header_name = NULL;
34560d7dbfceSAlex Elder 	kfree(rbd_dev->spec->image_id);
34570d7dbfceSAlex Elder 	rbd_dev->spec->image_id = NULL;
3458a30b71b9SAlex Elder 
3459a30b71b9SAlex Elder 	return ret;
3460a30b71b9SAlex Elder }
3461a30b71b9SAlex Elder 
3462a30b71b9SAlex Elder static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
3463a30b71b9SAlex Elder {
3464a30b71b9SAlex Elder 	size_t size;
34659d475de5SAlex Elder 	int ret;
34666e14b1a6SAlex Elder 	u64 ver = 0;
3467a30b71b9SAlex Elder 
3468a30b71b9SAlex Elder 	/*
3469a30b71b9SAlex Elder 	 * Image id was filled in by the caller.  Record the header
3470a30b71b9SAlex Elder 	 * object name for this rbd image.
3471a30b71b9SAlex Elder 	 */
3472979ed480SAlex Elder 	size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
3473a30b71b9SAlex Elder 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3474a30b71b9SAlex Elder 	if (!rbd_dev->header_name)
3475a30b71b9SAlex Elder 		return -ENOMEM;
3476a30b71b9SAlex Elder 	sprintf(rbd_dev->header_name, "%s%s",
34770d7dbfceSAlex Elder 			RBD_HEADER_PREFIX, rbd_dev->spec->image_id);
34789d475de5SAlex Elder 
34799d475de5SAlex Elder 	/* Get the size and object order for the image */
34809d475de5SAlex Elder 
34819d475de5SAlex Elder 	ret = rbd_dev_v2_image_size(rbd_dev);
34829d475de5SAlex Elder 	if (ret < 0)
34839d475de5SAlex Elder 		goto out_err;
34841e130199SAlex Elder 
34851e130199SAlex Elder 	/* Get the object prefix (a.k.a. block_name) for the image */
34861e130199SAlex Elder 
34871e130199SAlex Elder 	ret = rbd_dev_v2_object_prefix(rbd_dev);
34881e130199SAlex Elder 	if (ret < 0)
34891e130199SAlex Elder 		goto out_err;
3490b1b5402aSAlex Elder 
3491d889140cSAlex Elder 	/* Get the and check features for the image */
3492b1b5402aSAlex Elder 
3493b1b5402aSAlex Elder 	ret = rbd_dev_v2_features(rbd_dev);
3494b1b5402aSAlex Elder 	if (ret < 0)
3495b1b5402aSAlex Elder 		goto out_err;
349635d489f9SAlex Elder 
349786b00e0dSAlex Elder 	/* If the image supports layering, get the parent info */
349886b00e0dSAlex Elder 
349986b00e0dSAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
350086b00e0dSAlex Elder 		ret = rbd_dev_v2_parent_info(rbd_dev);
350186b00e0dSAlex Elder 		if (ret < 0)
350286b00e0dSAlex Elder 			goto out_err;
350386b00e0dSAlex Elder 	}
350486b00e0dSAlex Elder 
35056e14b1a6SAlex Elder 	/* crypto and compression type aren't (yet) supported for v2 images */
350635d489f9SAlex Elder 
35076e14b1a6SAlex Elder 	rbd_dev->header.crypt_type = 0;
35086e14b1a6SAlex Elder 	rbd_dev->header.comp_type = 0;
35096e14b1a6SAlex Elder 
35106e14b1a6SAlex Elder 	/* Get the snapshot context, plus the header version */
35116e14b1a6SAlex Elder 
35126e14b1a6SAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
351335d489f9SAlex Elder 	if (ret)
351435d489f9SAlex Elder 		goto out_err;
35156e14b1a6SAlex Elder 	rbd_dev->header.obj_version = ver;
35166e14b1a6SAlex Elder 
3517a30b71b9SAlex Elder 	rbd_dev->image_format = 2;
3518a30b71b9SAlex Elder 
3519a30b71b9SAlex Elder 	dout("discovered version 2 image, header name is %s\n",
3520a30b71b9SAlex Elder 		rbd_dev->header_name);
3521a30b71b9SAlex Elder 
352235152979SAlex Elder 	return 0;
35239d475de5SAlex Elder out_err:
352486b00e0dSAlex Elder 	rbd_dev->parent_overlap = 0;
352586b00e0dSAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
352686b00e0dSAlex Elder 	rbd_dev->parent_spec = NULL;
35279d475de5SAlex Elder 	kfree(rbd_dev->header_name);
35289d475de5SAlex Elder 	rbd_dev->header_name = NULL;
35291e130199SAlex Elder 	kfree(rbd_dev->header.object_prefix);
35301e130199SAlex Elder 	rbd_dev->header.object_prefix = NULL;
35319d475de5SAlex Elder 
35329d475de5SAlex Elder 	return ret;
3533a30b71b9SAlex Elder }
3534a30b71b9SAlex Elder 
353583a06263SAlex Elder static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
353683a06263SAlex Elder {
353783a06263SAlex Elder 	int ret;
353883a06263SAlex Elder 
353983a06263SAlex Elder 	/* no need to lock here, as rbd_dev is not registered yet */
354083a06263SAlex Elder 	ret = rbd_dev_snaps_update(rbd_dev);
354183a06263SAlex Elder 	if (ret)
354283a06263SAlex Elder 		return ret;
354383a06263SAlex Elder 
35449e15b77dSAlex Elder 	ret = rbd_dev_probe_update_spec(rbd_dev);
35459e15b77dSAlex Elder 	if (ret)
35469e15b77dSAlex Elder 		goto err_out_snaps;
35479e15b77dSAlex Elder 
354883a06263SAlex Elder 	ret = rbd_dev_set_mapping(rbd_dev);
354983a06263SAlex Elder 	if (ret)
355083a06263SAlex Elder 		goto err_out_snaps;
355183a06263SAlex Elder 
355283a06263SAlex Elder 	/* generate unique id: find highest unique id, add one */
355383a06263SAlex Elder 	rbd_dev_id_get(rbd_dev);
355483a06263SAlex Elder 
355583a06263SAlex Elder 	/* Fill in the device name, now that we have its id. */
355683a06263SAlex Elder 	BUILD_BUG_ON(DEV_NAME_LEN
355783a06263SAlex Elder 			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
355883a06263SAlex Elder 	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
355983a06263SAlex Elder 
356083a06263SAlex Elder 	/* Get our block major device number. */
356183a06263SAlex Elder 
356283a06263SAlex Elder 	ret = register_blkdev(0, rbd_dev->name);
356383a06263SAlex Elder 	if (ret < 0)
356483a06263SAlex Elder 		goto err_out_id;
356583a06263SAlex Elder 	rbd_dev->major = ret;
356683a06263SAlex Elder 
356783a06263SAlex Elder 	/* Set up the blkdev mapping. */
356883a06263SAlex Elder 
356983a06263SAlex Elder 	ret = rbd_init_disk(rbd_dev);
357083a06263SAlex Elder 	if (ret)
357183a06263SAlex Elder 		goto err_out_blkdev;
357283a06263SAlex Elder 
357383a06263SAlex Elder 	ret = rbd_bus_add_dev(rbd_dev);
357483a06263SAlex Elder 	if (ret)
357583a06263SAlex Elder 		goto err_out_disk;
357683a06263SAlex Elder 
357783a06263SAlex Elder 	/*
357883a06263SAlex Elder 	 * At this point cleanup in the event of an error is the job
357983a06263SAlex Elder 	 * of the sysfs code (initiated by rbd_bus_del_dev()).
358083a06263SAlex Elder 	 */
358183a06263SAlex Elder 	down_write(&rbd_dev->header_rwsem);
358283a06263SAlex Elder 	ret = rbd_dev_snaps_register(rbd_dev);
358383a06263SAlex Elder 	up_write(&rbd_dev->header_rwsem);
358483a06263SAlex Elder 	if (ret)
358583a06263SAlex Elder 		goto err_out_bus;
358683a06263SAlex Elder 
358783a06263SAlex Elder 	ret = rbd_init_watch_dev(rbd_dev);
358883a06263SAlex Elder 	if (ret)
358983a06263SAlex Elder 		goto err_out_bus;
359083a06263SAlex Elder 
359183a06263SAlex Elder 	/* Everything's ready.  Announce the disk to the world. */
359283a06263SAlex Elder 
359383a06263SAlex Elder 	add_disk(rbd_dev->disk);
359483a06263SAlex Elder 
359583a06263SAlex Elder 	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
359683a06263SAlex Elder 		(unsigned long long) rbd_dev->mapping.size);
359783a06263SAlex Elder 
359883a06263SAlex Elder 	return ret;
359983a06263SAlex Elder err_out_bus:
360083a06263SAlex Elder 	/* this will also clean up rest of rbd_dev stuff */
360183a06263SAlex Elder 
360283a06263SAlex Elder 	rbd_bus_del_dev(rbd_dev);
360383a06263SAlex Elder 
360483a06263SAlex Elder 	return ret;
360583a06263SAlex Elder err_out_disk:
360683a06263SAlex Elder 	rbd_free_disk(rbd_dev);
360783a06263SAlex Elder err_out_blkdev:
360883a06263SAlex Elder 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
360983a06263SAlex Elder err_out_id:
361083a06263SAlex Elder 	rbd_dev_id_put(rbd_dev);
361183a06263SAlex Elder err_out_snaps:
361283a06263SAlex Elder 	rbd_remove_all_snaps(rbd_dev);
361383a06263SAlex Elder 
361483a06263SAlex Elder 	return ret;
361583a06263SAlex Elder }
361683a06263SAlex Elder 
3617a30b71b9SAlex Elder /*
3618a30b71b9SAlex Elder  * Probe for the existence of the header object for the given rbd
3619a30b71b9SAlex Elder  * device.  For format 2 images this includes determining the image
3620a30b71b9SAlex Elder  * id.
3621a30b71b9SAlex Elder  */
3622a30b71b9SAlex Elder static int rbd_dev_probe(struct rbd_device *rbd_dev)
3623a30b71b9SAlex Elder {
3624a30b71b9SAlex Elder 	int ret;
3625a30b71b9SAlex Elder 
3626a30b71b9SAlex Elder 	/*
3627a30b71b9SAlex Elder 	 * Get the id from the image id object.  If it's not a
3628a30b71b9SAlex Elder 	 * format 2 image, we'll get ENOENT back, and we'll assume
3629a30b71b9SAlex Elder 	 * it's a format 1 image.
3630a30b71b9SAlex Elder 	 */
3631a30b71b9SAlex Elder 	ret = rbd_dev_image_id(rbd_dev);
3632a30b71b9SAlex Elder 	if (ret)
3633a30b71b9SAlex Elder 		ret = rbd_dev_v1_probe(rbd_dev);
3634a30b71b9SAlex Elder 	else
3635a30b71b9SAlex Elder 		ret = rbd_dev_v2_probe(rbd_dev);
363683a06263SAlex Elder 	if (ret) {
3637a30b71b9SAlex Elder 		dout("probe failed, returning %d\n", ret);
3638a30b71b9SAlex Elder 
3639a30b71b9SAlex Elder 		return ret;
3640a30b71b9SAlex Elder 	}
3641a30b71b9SAlex Elder 
364283a06263SAlex Elder 	ret = rbd_dev_probe_finish(rbd_dev);
364383a06263SAlex Elder 	if (ret)
364483a06263SAlex Elder 		rbd_header_free(&rbd_dev->header);
364583a06263SAlex Elder 
364683a06263SAlex Elder 	return ret;
364783a06263SAlex Elder }
364883a06263SAlex Elder 
364959c2be1eSYehuda Sadeh static ssize_t rbd_add(struct bus_type *bus,
365059c2be1eSYehuda Sadeh 		       const char *buf,
365159c2be1eSYehuda Sadeh 		       size_t count)
3652602adf40SYehuda Sadeh {
3653cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
3654dc79b113SAlex Elder 	struct ceph_options *ceph_opts = NULL;
36554e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
3656859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
36579d3997fdSAlex Elder 	struct rbd_client *rbdc;
365827cc2594SAlex Elder 	struct ceph_osd_client *osdc;
365927cc2594SAlex Elder 	int rc = -ENOMEM;
3660602adf40SYehuda Sadeh 
3661602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
3662602adf40SYehuda Sadeh 		return -ENODEV;
3663602adf40SYehuda Sadeh 
3664a725f65eSAlex Elder 	/* parse add command */
3665859c31dfSAlex Elder 	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
3666dc79b113SAlex Elder 	if (rc < 0)
3667bd4ba655SAlex Elder 		goto err_out_module;
3668a725f65eSAlex Elder 
36699d3997fdSAlex Elder 	rbdc = rbd_get_client(ceph_opts);
36709d3997fdSAlex Elder 	if (IS_ERR(rbdc)) {
36719d3997fdSAlex Elder 		rc = PTR_ERR(rbdc);
36720ddebc0cSAlex Elder 		goto err_out_args;
36739d3997fdSAlex Elder 	}
3674c53d5893SAlex Elder 	ceph_opts = NULL;	/* rbd_dev client now owns this */
3675602adf40SYehuda Sadeh 
3676602adf40SYehuda Sadeh 	/* pick the pool */
36779d3997fdSAlex Elder 	osdc = &rbdc->client->osdc;
3678859c31dfSAlex Elder 	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
3679602adf40SYehuda Sadeh 	if (rc < 0)
3680602adf40SYehuda Sadeh 		goto err_out_client;
3681859c31dfSAlex Elder 	spec->pool_id = (u64) rc;
3682859c31dfSAlex Elder 
36830903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
36840903e875SAlex Elder 
36850903e875SAlex Elder 	if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
36860903e875SAlex Elder 		rc = -EIO;
36870903e875SAlex Elder 		goto err_out_client;
36880903e875SAlex Elder 	}
36890903e875SAlex Elder 
3690c53d5893SAlex Elder 	rbd_dev = rbd_dev_create(rbdc, spec);
3691bd4ba655SAlex Elder 	if (!rbd_dev)
3692bd4ba655SAlex Elder 		goto err_out_client;
3693c53d5893SAlex Elder 	rbdc = NULL;		/* rbd_dev now owns this */
3694c53d5893SAlex Elder 	spec = NULL;		/* rbd_dev now owns this */
3695602adf40SYehuda Sadeh 
3696bd4ba655SAlex Elder 	rbd_dev->mapping.read_only = rbd_opts->read_only;
3697c53d5893SAlex Elder 	kfree(rbd_opts);
3698c53d5893SAlex Elder 	rbd_opts = NULL;	/* done with this */
3699bd4ba655SAlex Elder 
3700a30b71b9SAlex Elder 	rc = rbd_dev_probe(rbd_dev);
3701a30b71b9SAlex Elder 	if (rc < 0)
3702c53d5893SAlex Elder 		goto err_out_rbd_dev;
370305fd6f6fSAlex Elder 
3704602adf40SYehuda Sadeh 	return count;
3705c53d5893SAlex Elder err_out_rbd_dev:
3706c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
3707bd4ba655SAlex Elder err_out_client:
37089d3997fdSAlex Elder 	rbd_put_client(rbdc);
37090ddebc0cSAlex Elder err_out_args:
371078cea76eSAlex Elder 	if (ceph_opts)
371178cea76eSAlex Elder 		ceph_destroy_options(ceph_opts);
37124e9afebaSAlex Elder 	kfree(rbd_opts);
3713859c31dfSAlex Elder 	rbd_spec_put(spec);
3714bd4ba655SAlex Elder err_out_module:
3715bd4ba655SAlex Elder 	module_put(THIS_MODULE);
371627cc2594SAlex Elder 
3717602adf40SYehuda Sadeh 	dout("Error adding device %s\n", buf);
371827cc2594SAlex Elder 
371927cc2594SAlex Elder 	return (ssize_t) rc;
3720602adf40SYehuda Sadeh }
3721602adf40SYehuda Sadeh 
3722de71a297SAlex Elder static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
3723602adf40SYehuda Sadeh {
3724602adf40SYehuda Sadeh 	struct list_head *tmp;
3725602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev;
3726602adf40SYehuda Sadeh 
3727e124a82fSAlex Elder 	spin_lock(&rbd_dev_list_lock);
3728602adf40SYehuda Sadeh 	list_for_each(tmp, &rbd_dev_list) {
3729602adf40SYehuda Sadeh 		rbd_dev = list_entry(tmp, struct rbd_device, node);
3730de71a297SAlex Elder 		if (rbd_dev->dev_id == dev_id) {
3731e124a82fSAlex Elder 			spin_unlock(&rbd_dev_list_lock);
3732602adf40SYehuda Sadeh 			return rbd_dev;
3733602adf40SYehuda Sadeh 		}
3734e124a82fSAlex Elder 	}
3735e124a82fSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
3736602adf40SYehuda Sadeh 	return NULL;
3737602adf40SYehuda Sadeh }
3738602adf40SYehuda Sadeh 
3739dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev)
3740602adf40SYehuda Sadeh {
3741593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3742602adf40SYehuda Sadeh 
37431dbb4399SAlex Elder 	if (rbd_dev->watch_request) {
37441dbb4399SAlex Elder 		struct ceph_client *client = rbd_dev->rbd_client->client;
37451dbb4399SAlex Elder 
37461dbb4399SAlex Elder 		ceph_osdc_unregister_linger_request(&client->osdc,
374759c2be1eSYehuda Sadeh 						    rbd_dev->watch_request);
37481dbb4399SAlex Elder 	}
374959c2be1eSYehuda Sadeh 	if (rbd_dev->watch_event)
3750907703d0SAlex Elder 		rbd_req_sync_watch(rbd_dev, 0);
3751602adf40SYehuda Sadeh 
3752602adf40SYehuda Sadeh 	/* clean up and free blkdev */
3753602adf40SYehuda Sadeh 	rbd_free_disk(rbd_dev);
3754602adf40SYehuda Sadeh 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
375532eec68dSAlex Elder 
37562ac4e75dSAlex Elder 	/* release allocated disk header fields */
37572ac4e75dSAlex Elder 	rbd_header_free(&rbd_dev->header);
37582ac4e75dSAlex Elder 
375932eec68dSAlex Elder 	/* done with the id, and with the rbd_dev */
3760e2839308SAlex Elder 	rbd_dev_id_put(rbd_dev);
3761c53d5893SAlex Elder 	rbd_assert(rbd_dev->rbd_client != NULL);
3762c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
3763602adf40SYehuda Sadeh 
3764602adf40SYehuda Sadeh 	/* release module ref */
3765602adf40SYehuda Sadeh 	module_put(THIS_MODULE);
3766602adf40SYehuda Sadeh }
3767602adf40SYehuda Sadeh 
3768dfc5606dSYehuda Sadeh static ssize_t rbd_remove(struct bus_type *bus,
3769602adf40SYehuda Sadeh 			  const char *buf,
3770602adf40SYehuda Sadeh 			  size_t count)
3771602adf40SYehuda Sadeh {
3772602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
3773602adf40SYehuda Sadeh 	int target_id, rc;
3774602adf40SYehuda Sadeh 	unsigned long ul;
3775602adf40SYehuda Sadeh 	int ret = count;
3776602adf40SYehuda Sadeh 
3777602adf40SYehuda Sadeh 	rc = strict_strtoul(buf, 10, &ul);
3778602adf40SYehuda Sadeh 	if (rc)
3779602adf40SYehuda Sadeh 		return rc;
3780602adf40SYehuda Sadeh 
3781602adf40SYehuda Sadeh 	/* convert to int; abort if we lost anything in the conversion */
3782602adf40SYehuda Sadeh 	target_id = (int) ul;
3783602adf40SYehuda Sadeh 	if (target_id != ul)
3784602adf40SYehuda Sadeh 		return -EINVAL;
3785602adf40SYehuda Sadeh 
3786602adf40SYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3787602adf40SYehuda Sadeh 
3788602adf40SYehuda Sadeh 	rbd_dev = __rbd_get_dev(target_id);
3789602adf40SYehuda Sadeh 	if (!rbd_dev) {
3790602adf40SYehuda Sadeh 		ret = -ENOENT;
3791602adf40SYehuda Sadeh 		goto done;
3792602adf40SYehuda Sadeh 	}
3793602adf40SYehuda Sadeh 
379442382b70SAlex Elder 	if (rbd_dev->open_count) {
379542382b70SAlex Elder 		ret = -EBUSY;
379642382b70SAlex Elder 		goto done;
379742382b70SAlex Elder 	}
379842382b70SAlex Elder 
379941f38c2bSAlex Elder 	rbd_remove_all_snaps(rbd_dev);
3800dfc5606dSYehuda Sadeh 	rbd_bus_del_dev(rbd_dev);
3801602adf40SYehuda Sadeh 
3802602adf40SYehuda Sadeh done:
3803602adf40SYehuda Sadeh 	mutex_unlock(&ctl_mutex);
3804aafb230eSAlex Elder 
3805602adf40SYehuda Sadeh 	return ret;
3806602adf40SYehuda Sadeh }
3807602adf40SYehuda Sadeh 
3808602adf40SYehuda Sadeh /*
3809602adf40SYehuda Sadeh  * create control files in sysfs
3810dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
3811602adf40SYehuda Sadeh  */
3812602adf40SYehuda Sadeh static int rbd_sysfs_init(void)
3813602adf40SYehuda Sadeh {
3814dfc5606dSYehuda Sadeh 	int ret;
3815602adf40SYehuda Sadeh 
3816fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
3817dfc5606dSYehuda Sadeh 	if (ret < 0)
3818dfc5606dSYehuda Sadeh 		return ret;
3819602adf40SYehuda Sadeh 
3820fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
3821fed4c143SAlex Elder 	if (ret < 0)
3822fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
3823602adf40SYehuda Sadeh 
3824602adf40SYehuda Sadeh 	return ret;
3825602adf40SYehuda Sadeh }
3826602adf40SYehuda Sadeh 
3827602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void)
3828602adf40SYehuda Sadeh {
3829dfc5606dSYehuda Sadeh 	bus_unregister(&rbd_bus_type);
3830fed4c143SAlex Elder 	device_unregister(&rbd_root_dev);
3831602adf40SYehuda Sadeh }
3832602adf40SYehuda Sadeh 
3833602adf40SYehuda Sadeh int __init rbd_init(void)
3834602adf40SYehuda Sadeh {
3835602adf40SYehuda Sadeh 	int rc;
3836602adf40SYehuda Sadeh 
3837602adf40SYehuda Sadeh 	rc = rbd_sysfs_init();
3838602adf40SYehuda Sadeh 	if (rc)
3839602adf40SYehuda Sadeh 		return rc;
3840f0f8cef5SAlex Elder 	pr_info("loaded " RBD_DRV_NAME_LONG "\n");
3841602adf40SYehuda Sadeh 	return 0;
3842602adf40SYehuda Sadeh }
3843602adf40SYehuda Sadeh 
3844602adf40SYehuda Sadeh void __exit rbd_exit(void)
3845602adf40SYehuda Sadeh {
3846602adf40SYehuda Sadeh 	rbd_sysfs_cleanup();
3847602adf40SYehuda Sadeh }
3848602adf40SYehuda Sadeh 
3849602adf40SYehuda Sadeh module_init(rbd_init);
3850602adf40SYehuda Sadeh module_exit(rbd_exit);
3851602adf40SYehuda Sadeh 
3852602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
3853602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
3854602adf40SYehuda Sadeh MODULE_DESCRIPTION("rados block device");
3855602adf40SYehuda Sadeh 
3856602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */
3857602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
3858602adf40SYehuda Sadeh 
3859602adf40SYehuda Sadeh MODULE_LICENSE("GPL");
3860