xref: /openbmc/linux/drivers/block/rbd.c (revision b454e36d)
1602adf40SYehuda Sadeh /*
2602adf40SYehuda Sadeh    rbd.c -- Export ceph rados objects as a Linux block device
3602adf40SYehuda Sadeh 
4602adf40SYehuda Sadeh 
5602adf40SYehuda Sadeh    based on drivers/block/osdblk.c:
6602adf40SYehuda Sadeh 
7602adf40SYehuda Sadeh    Copyright 2009 Red Hat, Inc.
8602adf40SYehuda Sadeh 
9602adf40SYehuda Sadeh    This program is free software; you can redistribute it and/or modify
10602adf40SYehuda Sadeh    it under the terms of the GNU General Public License as published by
11602adf40SYehuda Sadeh    the Free Software Foundation.
12602adf40SYehuda Sadeh 
13602adf40SYehuda Sadeh    This program is distributed in the hope that it will be useful,
14602adf40SYehuda Sadeh    but WITHOUT ANY WARRANTY; without even the implied warranty of
15602adf40SYehuda Sadeh    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16602adf40SYehuda Sadeh    GNU General Public License for more details.
17602adf40SYehuda Sadeh 
18602adf40SYehuda Sadeh    You should have received a copy of the GNU General Public License
19602adf40SYehuda Sadeh    along with this program; see the file COPYING.  If not, write to
20602adf40SYehuda Sadeh    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21602adf40SYehuda Sadeh 
22602adf40SYehuda Sadeh 
23602adf40SYehuda Sadeh 
24dfc5606dSYehuda Sadeh    For usage instructions, please refer to:
25602adf40SYehuda Sadeh 
26dfc5606dSYehuda Sadeh                  Documentation/ABI/testing/sysfs-bus-rbd
27602adf40SYehuda Sadeh 
28602adf40SYehuda Sadeh  */
29602adf40SYehuda Sadeh 
30602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
31602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
32602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3459c2be1eSYehuda Sadeh #include <linux/parser.h>
35602adf40SYehuda Sadeh 
36602adf40SYehuda Sadeh #include <linux/kernel.h>
37602adf40SYehuda Sadeh #include <linux/device.h>
38602adf40SYehuda Sadeh #include <linux/module.h>
39602adf40SYehuda Sadeh #include <linux/fs.h>
40602adf40SYehuda Sadeh #include <linux/blkdev.h>
41602adf40SYehuda Sadeh 
42602adf40SYehuda Sadeh #include "rbd_types.h"
43602adf40SYehuda Sadeh 
#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/* Short and long (descriptive) forms of the driver name */
#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

/*
 * The snapshot name limit is NAME_MAX less the length of
 * RBD_SNAP_DEV_NAME_PREFIX (the "- 1" discounts the terminating
 * NUL that sizeof counts), so that prefix plus name together
 * still fit within NAME_MAX characters.
 */
#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

/* NOTE(review): presumably the name standing for "no snapshot" (the image head) -- confirm against users */
#define RBD_SNAP_HEAD_NAME	"-"

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

/* Feature bits */

#define RBD_FEATURE_LAYERING	(1<<0)
#define RBD_FEATURE_STRIPINGV2	(1<<1)
#define RBD_FEATURES_ALL \
	    (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)

/*
 * Features supported by this (client software) implementation.
 * The mask is currently zero, i.e. none of the feature bits
 * defined above is included.
 */

#define RBD_FEATURES_SUPPORTED	(0)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)
93602adf40SYehuda Sadeh 
/*
 * block device image metadata (in-memory version)
 *
 * The first group of fields is fixed for the life of an image; the
 * second group holds state that has to be refreshed from time to
 * time.
 */
struct rbd_image_header {
	/* These five fields never change for a given rbd image */
	char *object_prefix;
	u64 features;
	__u8 obj_order;
	__u8 crypt_type;
	__u8 comp_type;

	/* The remaining fields need to be updated occasionally */
	u64 image_size;
	struct ceph_snap_context *snapc;
	char *snap_names;	/* NOTE(review): layout of the name buffer is not visible here */
	u64 *snap_sizes;	/* NOTE(review): presumably one size per snapshot in snapc -- confirm */

	u64 obj_version;
};
11359c2be1eSYehuda Sadeh 
/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the id's in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the id's associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
	/* Pool identity: id and its associated name */
	u64		pool_id;
	char		*pool_name;

	/* Image identity: id and its associated name (name may be null) */
	char		*image_id;
	char		*image_name;

	/* Snapshot identity: id and its associated name */
	u64		snap_id;
	char		*snap_name;

	/* Reference count; a spec can be shared by a parent and its child */
	struct kref	kref;
};
1510d7dbfceSAlex Elder 
/*
 * an instance of the client.  multiple devices may share an rbd client.
 *
 * Instances are reference counted and kept on the global
 * rbd_client_list so that a matching one can be found and reused.
 */
struct rbd_client {
	struct ceph_client	*client;	/* underlying ceph client */
	struct kref		kref;		/* dropped via rbd_put_client() */
	struct list_head	node;		/* entry in rbd_client_list */
};
160602adf40SYehuda Sadeh 
/* Callback type taking the image request it is invoked for */
struct rbd_img_request;
typedef void (*rbd_img_callback_t)(struct rbd_img_request *);

/*
 * Sentinel for rbd_obj_request->which, used by object requests that
 * do not occupy a position in an image request's object list.
 */
#define	BAD_WHICH	U32_MAX		/* Good which or bad which, which? */

/* Callback type taking the object request it is invoked for */
struct rbd_obj_request;
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);
168bf0d5f50SAlex Elder 
/*
 * Kind of data carried by an object request: none, a bio list, or
 * an array of pages.
 * NOTE(review): presumably selects which member of the data union in
 * struct rbd_obj_request is valid -- confirm against users.
 */
enum obj_request_type {
	OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
};
172bf0d5f50SAlex Elder 
/*
 * Per-object-request flags.
 * NOTE(review): presumably bit positions within
 * rbd_obj_request->flags -- confirm against the accessors.
 */
enum obj_req_flags {
	OBJ_REQ_DONE,		/* completion flag: not done = 0, done = 1 */
	OBJ_REQ_IMG_DATA,	/* object usage: standalone = 0, image = 1 */
	OBJ_REQ_KNOWN,		/* EXISTS flag valid: no = 0, yes = 1 */
	OBJ_REQ_EXISTS,		/* target exists: no = 0, yes = 1 */
};
179926f9b3fSAlex Elder 
/*
 * A request against a single named object, covering "length" bytes
 * starting at byte "offset" within that object.  The structure is
 * reference counted (kref).
 */
struct rbd_obj_request {
	const char		*object_name;
	u64			offset;		/* object start byte */
	u64			length;		/* bytes from offset */
	unsigned long		flags;		/* NOTE(review): presumably enum obj_req_flags bits */

	/*
	 * An object request associated with an image will have its
	 * img_data flag set; a standalone object request will not.
	 *
	 * A standalone object request will have which == BAD_WHICH
	 * and a null obj_request pointer.
	 *
	 * An object request initiated in support of a layered image
	 * object (to check for its existence before a write) will
	 * have which == BAD_WHICH and a non-null obj_request pointer.
	 *
	 * Finally, an object request for rbd image data will have
	 * which != BAD_WHICH, and will have a non-null img_request
	 * pointer.  The value of which will be in the range
	 * 0..(img_request->obj_request_count-1).
	 */
	union {
		struct rbd_obj_request	*obj_request;	/* STAT op */
		struct {
			struct rbd_img_request	*img_request;
			u64			img_offset;
			/* links for img_request->obj_requests list */
			struct list_head	links;
		};
	};
	u32			which;		/* posn image request list */

	/* Data description; which union member applies depends on type */
	enum obj_request_type	type;
	union {
		struct bio	*bio_list;
		struct {
			struct page	**pages;
			u32		page_count;
		};
	};

	struct ceph_osd_request	*osd_req;

	u64			xferred;	/* bytes transferred */
	u64			version;
	int			result;

	rbd_obj_callback_t	callback;
	struct completion	completion;

	struct kref		kref;
};
233bf0d5f50SAlex Elder 
/*
 * Per-image-request flags.
 * NOTE(review): presumably bit positions within
 * rbd_img_request->flags -- confirm against the accessors.
 */
enum img_req_flags {
	IMG_REQ_WRITE,		/* I/O direction: read = 0, write = 1 */
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
};
2390c425248SAlex Elder 
/*
 * A request against an rbd image, covering "length" bytes starting
 * at image byte "offset".  It owns a list of object requests
 * (obj_requests, obj_request_count entries) and is reference
 * counted (kref).
 */
struct rbd_img_request {
	struct rbd_device	*rbd_dev;
	u64			offset;	/* starting image byte offset */
	u64			length;	/* byte count from offset */
	unsigned long		flags;	/* NOTE(review): presumably enum img_req_flags bits */
	/* Only one of these applies, depending on I/O direction */
	union {
		u64			snap_id;	/* for reads */
		struct ceph_snap_context *snapc;	/* for writes */
	};
	/* Only one of these applies, depending on who initiated the request */
	union {
		struct request		*rq;		/* block request */
		struct rbd_obj_request	*obj_request;	/* obj req initiator */
	};
	spinlock_t		completion_lock;/* protects next_completion */
	u32			next_completion;
	rbd_img_callback_t	callback;
	u64			xferred;/* aggregate bytes transferred */
	int			result;	/* first nonzero obj_request result */

	u32			obj_request_count;
	struct list_head	obj_requests;	/* rbd_obj_request structs */

	struct kref		kref;
};
264bf0d5f50SAlex Elder 
/*
 * Iterators over the object requests linked on an image request's
 * obj_requests list (through rbd_obj_request->links):
 *
 *   for_each_obj_request()       walks the whole list, front to back;
 *   for_each_obj_request_from()  continues forward from the current oreq;
 *   for_each_obj_request_safe()  walks the list in REVERSE order and,
 *                                using "n" as lookahead, tolerates
 *                                removal of the current entry.
 */
#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq) \
	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
271bf0d5f50SAlex Elder 
/*
 * One snapshot of an image.  Each snapshot embeds its own struct
 * device and is linked onto a list through "node".
 * NOTE(review): presumably the list is rbd_device->snaps -- confirm.
 */
struct rbd_snap {
	struct	device		dev;
	const char		*name;
	u64			size;
	struct list_head	node;
	u64			id;
	u64			features;
};
280dfc5606dSYehuda Sadeh 
/*
 * Properties of what is currently mapped by a device.  read_only is
 * consulted by rbd_open() to refuse write opens and to set the block
 * device's read-only state.
 */
struct rbd_mapping {
	u64                     size;
	u64                     features;
	bool			read_only;
};
286f84344f3SAlex Elder 
/*
 * a single device
 *
 * Holds the block device identity (dev_id, major, disk, name), the
 * identity and metadata of the image (spec, header), layering
 * information (parent_spec, parent_overlap, parent), and sysfs and
 * bookkeeping state.
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue, flags, open_count */

	struct rbd_image_header	header;
	unsigned long		flags;		/* possibly lock protected */
	struct rbd_spec		*spec;		/* identity of the image */

	char			*header_name;

	struct ceph_file_layout	layout;

	struct ceph_osd_event   *watch_event;
	struct rbd_obj_request	*watch_request;

	/* Layering: parent_spec is non-null if this image is a child */
	struct rbd_spec		*parent_spec;
	u64			parent_overlap;
	struct rbd_device	*parent;

	/* protects updating the header */
	struct rw_semaphore     header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;	/* NOTE(review): presumably entry in rbd_dev_list -- confirm */

	/* list of snapshots */
	struct list_head	snaps;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by lock */
};
332dfc5606dSYehuda Sadeh 
/*
 * Flag bits for rbd_dev->flags.  If atomicity is required,
 * rbd_dev->lock is used to protect access.
 *
 * Currently, only the "removing" flag (which is coupled with the
 * "open_count" field) requires atomic access.
 *
 * The enumerators are bit numbers (they are passed to test_bit()),
 * not masks.
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
};
3446d292906SAlex Elder 
/*
 * Driver-wide state: a control mutex, plus the lists of devices and
 * of clients.  rbd_client_list is manipulated under
 * rbd_client_list_lock.
 * NOTE(review): rbd_dev_list_lock presumably guards rbd_dev_list in
 * the same way -- confirm.
 */
static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);
352602adf40SYehuda Sadeh 
353304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
354304f6808SAlex Elder static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
355304f6808SAlex Elder 
356dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev);
35741f38c2bSAlex Elder static void rbd_remove_snap_dev(struct rbd_snap *snap);
358dfc5606dSYehuda Sadeh 
359f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf,
360f0f8cef5SAlex Elder 		       size_t count);
361f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
362f0f8cef5SAlex Elder 			  size_t count);
3632f82ee54SAlex Elder static int rbd_dev_probe(struct rbd_device *rbd_dev);
364f0f8cef5SAlex Elder 
/*
 * Bus attributes "add" and "remove".  Both have S_IWUSR permission
 * and no show routine (write-only); writes are handled by rbd_add()
 * and rbd_remove() respectively.
 */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};
370f0f8cef5SAlex Elder 
/* The "rbd" bus, carrying the add/remove attributes defined above it */
static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};
375f0f8cef5SAlex Elder 
/*
 * Release routine for rbd_root_dev.  Intentionally empty: the root
 * device is a statically defined object, so there is nothing to free.
 */
static void rbd_root_dev_release(struct device *dev)
{
}
379f0f8cef5SAlex Elder 
/* Statically defined root device named "rbd"; its release is a no-op */
static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};
384f0f8cef5SAlex Elder 
38506ecc6cbSAlex Elder static __printf(2, 3)
38606ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
38706ecc6cbSAlex Elder {
38806ecc6cbSAlex Elder 	struct va_format vaf;
38906ecc6cbSAlex Elder 	va_list args;
39006ecc6cbSAlex Elder 
39106ecc6cbSAlex Elder 	va_start(args, fmt);
39206ecc6cbSAlex Elder 	vaf.fmt = fmt;
39306ecc6cbSAlex Elder 	vaf.va = &args;
39406ecc6cbSAlex Elder 
39506ecc6cbSAlex Elder 	if (!rbd_dev)
39606ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
39706ecc6cbSAlex Elder 	else if (rbd_dev->disk)
39806ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %s: %pV\n",
39906ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
40006ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_name)
40106ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: image %s: %pV\n",
40206ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
40306ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_id)
40406ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: id %s: %pV\n",
40506ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
40606ecc6cbSAlex Elder 	else	/* punt */
40706ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
40806ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev, &vaf);
40906ecc6cbSAlex Elder 	va_end(args);
41006ecc6cbSAlex Elder }
41106ecc6cbSAlex Elder 
/*
 * rbd_assert(expr) - when RBD_DEBUG is defined, verify that expr is
 * true; if it is not, report the enclosing function, the line
 * number and the text of the expression, then BUG().  When
 * RBD_DEBUG is not defined the macro expands to a no-op and expr is
 * not evaluated.
 *
 * The checking form is wrapped in do { } while (0) so the macro
 * behaves as a single statement: it can be used as the unbraced body
 * of an if/else without capturing a following "else" or producing a
 * stray empty statement.
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
424dfc5606dSYehuda Sadeh 
4258b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
426b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
4278b3e1a56SAlex Elder 
428117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
429117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
43059c2be1eSYehuda Sadeh 
431602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
432602adf40SYehuda Sadeh {
433f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
434b82d167bSAlex Elder 	bool removing = false;
435602adf40SYehuda Sadeh 
436f84344f3SAlex Elder 	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
437602adf40SYehuda Sadeh 		return -EROFS;
438602adf40SYehuda Sadeh 
439a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
440b82d167bSAlex Elder 	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
441b82d167bSAlex Elder 		removing = true;
442b82d167bSAlex Elder 	else
443b82d167bSAlex Elder 		rbd_dev->open_count++;
444a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
445b82d167bSAlex Elder 	if (removing)
446b82d167bSAlex Elder 		return -ENOENT;
447b82d167bSAlex Elder 
44842382b70SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
449c3e946ceSAlex Elder 	(void) get_device(&rbd_dev->dev);
450f84344f3SAlex Elder 	set_device_ro(bdev, rbd_dev->mapping.read_only);
45142382b70SAlex Elder 	mutex_unlock(&ctl_mutex);
452340c7a2bSAlex Elder 
453602adf40SYehuda Sadeh 	return 0;
454602adf40SYehuda Sadeh }
455602adf40SYehuda Sadeh 
456dfc5606dSYehuda Sadeh static int rbd_release(struct gendisk *disk, fmode_t mode)
457dfc5606dSYehuda Sadeh {
458dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
459b82d167bSAlex Elder 	unsigned long open_count_before;
460b82d167bSAlex Elder 
461a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
462b82d167bSAlex Elder 	open_count_before = rbd_dev->open_count--;
463a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
464b82d167bSAlex Elder 	rbd_assert(open_count_before > 0);
465dfc5606dSYehuda Sadeh 
46642382b70SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
467c3e946ceSAlex Elder 	put_device(&rbd_dev->dev);
46842382b70SAlex Elder 	mutex_unlock(&ctl_mutex);
469dfc5606dSYehuda Sadeh 
470dfc5606dSYehuda Sadeh 	return 0;
471dfc5606dSYehuda Sadeh }
472dfc5606dSYehuda Sadeh 
/* Block device operations; only open and release are provided */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
478602adf40SYehuda Sadeh 
479602adf40SYehuda Sadeh /*
480602adf40SYehuda Sadeh  * Initialize an rbd client instance.
48143ae4701SAlex Elder  * We own *ceph_opts.
482602adf40SYehuda Sadeh  */
483f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
484602adf40SYehuda Sadeh {
485602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
486602adf40SYehuda Sadeh 	int ret = -ENOMEM;
487602adf40SYehuda Sadeh 
48837206ee5SAlex Elder 	dout("%s:\n", __func__);
489602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
490602adf40SYehuda Sadeh 	if (!rbdc)
491602adf40SYehuda Sadeh 		goto out_opt;
492602adf40SYehuda Sadeh 
493602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
494602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
495602adf40SYehuda Sadeh 
496bc534d86SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
497bc534d86SAlex Elder 
49843ae4701SAlex Elder 	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
499602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
500bc534d86SAlex Elder 		goto out_mutex;
50143ae4701SAlex Elder 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
502602adf40SYehuda Sadeh 
503602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
504602adf40SYehuda Sadeh 	if (ret < 0)
505602adf40SYehuda Sadeh 		goto out_err;
506602adf40SYehuda Sadeh 
507432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
508602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
509432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
510602adf40SYehuda Sadeh 
511bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
51237206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
513bc534d86SAlex Elder 
514602adf40SYehuda Sadeh 	return rbdc;
515602adf40SYehuda Sadeh 
516602adf40SYehuda Sadeh out_err:
517602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
518bc534d86SAlex Elder out_mutex:
519bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
520602adf40SYehuda Sadeh 	kfree(rbdc);
521602adf40SYehuda Sadeh out_opt:
52243ae4701SAlex Elder 	if (ceph_opts)
52343ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
52437206ee5SAlex Elder 	dout("%s: error %d\n", __func__, ret);
52537206ee5SAlex Elder 
52628f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
527602adf40SYehuda Sadeh }
528602adf40SYehuda Sadeh 
/*
 * Take an additional reference on an rbd client and return the
 * same pointer, for the caller's convenience.
 */
static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
{
	kref_get(&rbdc->kref);

	return rbdc;
}
5352f82ee54SAlex Elder 
536602adf40SYehuda Sadeh /*
5371f7ba331SAlex Elder  * Find a ceph client with specific addr and configuration.  If
5381f7ba331SAlex Elder  * found, bump its reference count.
539602adf40SYehuda Sadeh  */
5401f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
541602adf40SYehuda Sadeh {
542602adf40SYehuda Sadeh 	struct rbd_client *client_node;
5431f7ba331SAlex Elder 	bool found = false;
544602adf40SYehuda Sadeh 
54543ae4701SAlex Elder 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
546602adf40SYehuda Sadeh 		return NULL;
547602adf40SYehuda Sadeh 
5481f7ba331SAlex Elder 	spin_lock(&rbd_client_list_lock);
5491f7ba331SAlex Elder 	list_for_each_entry(client_node, &rbd_client_list, node) {
5501f7ba331SAlex Elder 		if (!ceph_compare_options(ceph_opts, client_node->client)) {
5512f82ee54SAlex Elder 			__rbd_get_client(client_node);
5522f82ee54SAlex Elder 
5531f7ba331SAlex Elder 			found = true;
5541f7ba331SAlex Elder 			break;
5551f7ba331SAlex Elder 		}
5561f7ba331SAlex Elder 	}
5571f7ba331SAlex Elder 	spin_unlock(&rbd_client_list_lock);
5581f7ba331SAlex Elder 
5591f7ba331SAlex Elder 	return found ? client_node : NULL;
560602adf40SYehuda Sadeh }
561602adf40SYehuda Sadeh 
/*
 * mount options
 *
 * Token values are grouped by argument kind, and the Opt_last_*
 * enumerators mark the group boundaries; parse_rbd_opts_token()
 * classifies a token by comparing it against them.  There are
 * currently no int or string options, only the two Boolean ones.
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};
57559c2be1eSYehuda Sadeh 
/*
 * Option strings and the tokens they map to, for match_token().
 * Each Boolean option has a long and a short spelling.  The table
 * ends with a {-1, NULL} entry.
 */
static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};
58659c2be1eSYehuda Sadeh 
/* rbd-specific options, filled in by parse_rbd_opts_token() */
struct rbd_options {
	bool	read_only;	/* set by "read_only"/"ro", cleared by "read_write"/"rw" */
};

/* Default for rbd_options.read_only: mappings are writable */
#define RBD_READ_ONLY_DEFAULT	false
59298571b5aSAlex Elder 
59359c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
59459c2be1eSYehuda Sadeh {
59543ae4701SAlex Elder 	struct rbd_options *rbd_opts = private;
59659c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
59759c2be1eSYehuda Sadeh 	int token, intval, ret;
59859c2be1eSYehuda Sadeh 
59943ae4701SAlex Elder 	token = match_token(c, rbd_opts_tokens, argstr);
60059c2be1eSYehuda Sadeh 	if (token < 0)
60159c2be1eSYehuda Sadeh 		return -EINVAL;
60259c2be1eSYehuda Sadeh 
60359c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
60459c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
60559c2be1eSYehuda Sadeh 		if (ret < 0) {
60659c2be1eSYehuda Sadeh 			pr_err("bad mount option arg (not int) "
60759c2be1eSYehuda Sadeh 			       "at '%s'\n", c);
60859c2be1eSYehuda Sadeh 			return ret;
60959c2be1eSYehuda Sadeh 		}
61059c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
61159c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
61259c2be1eSYehuda Sadeh 		dout("got string token %d val %s\n", token,
61359c2be1eSYehuda Sadeh 		     argstr[0].from);
614cc0538b6SAlex Elder 	} else if (token > Opt_last_string && token < Opt_last_bool) {
615cc0538b6SAlex Elder 		dout("got Boolean token %d\n", token);
61659c2be1eSYehuda Sadeh 	} else {
61759c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
61859c2be1eSYehuda Sadeh 	}
61959c2be1eSYehuda Sadeh 
62059c2be1eSYehuda Sadeh 	switch (token) {
621cc0538b6SAlex Elder 	case Opt_read_only:
622cc0538b6SAlex Elder 		rbd_opts->read_only = true;
623cc0538b6SAlex Elder 		break;
624cc0538b6SAlex Elder 	case Opt_read_write:
625cc0538b6SAlex Elder 		rbd_opts->read_only = false;
626cc0538b6SAlex Elder 		break;
62759c2be1eSYehuda Sadeh 	default:
628aafb230eSAlex Elder 		rbd_assert(false);
629aafb230eSAlex Elder 		break;
63059c2be1eSYehuda Sadeh 	}
63159c2be1eSYehuda Sadeh 	return 0;
63259c2be1eSYehuda Sadeh }
63359c2be1eSYehuda Sadeh 
/*
 * Return an rbd client matching the given ceph options: an existing
 * one (with an added reference) if rbd_client_find() locates it,
 * otherwise a newly created one.  On creation failure the result is
 * whatever rbd_client_create() returns (a pointer-coded errno).
 *
 * The options are consumed in every case: they are destroyed here
 * when an existing client is reused, and handed over to
 * rbd_client_create() otherwise.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc = rbd_client_find(ceph_opts);

	if (!rbdc)
		return rbd_client_create(ceph_opts);

	/* Reusing an existing client; our copy of the options is unneeded */
	ceph_destroy_options(ceph_opts);

	return rbdc;
}
650602adf40SYehuda Sadeh 
651602adf40SYehuda Sadeh /*
652602adf40SYehuda Sadeh  * Destroy ceph client
653d23a4b3fSAlex Elder  *
654432b8587SAlex Elder  * Caller must hold rbd_client_list_lock.
655602adf40SYehuda Sadeh  */
656602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
657602adf40SYehuda Sadeh {
658602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
659602adf40SYehuda Sadeh 
66037206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
661cd9d9f5dSAlex Elder 	spin_lock(&rbd_client_list_lock);
662602adf40SYehuda Sadeh 	list_del(&rbdc->node);
663cd9d9f5dSAlex Elder 	spin_unlock(&rbd_client_list_lock);
664602adf40SYehuda Sadeh 
665602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
666602adf40SYehuda Sadeh 	kfree(rbdc);
667602adf40SYehuda Sadeh }
668602adf40SYehuda Sadeh 
669602adf40SYehuda Sadeh /*
670602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
671602adf40SYehuda Sadeh  * it.
672602adf40SYehuda Sadeh  */
6739d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc)
674602adf40SYehuda Sadeh {
675c53d5893SAlex Elder 	if (rbdc)
6769d3997fdSAlex Elder 		kref_put(&rbdc->kref, rbd_client_release);
677602adf40SYehuda Sadeh }
678602adf40SYehuda Sadeh 
679a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format)
680a30b71b9SAlex Elder {
681a30b71b9SAlex Elder 	return image_format == 1 || image_format == 2;
682a30b71b9SAlex Elder }
683a30b71b9SAlex Elder 
6848e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
6858e94af8eSAlex Elder {
686103a150fSAlex Elder 	size_t size;
687103a150fSAlex Elder 	u32 snap_count;
688103a150fSAlex Elder 
689103a150fSAlex Elder 	/* The header has to start with the magic rbd header text */
690103a150fSAlex Elder 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
691103a150fSAlex Elder 		return false;
692103a150fSAlex Elder 
693db2388b6SAlex Elder 	/* The bio layer requires at least sector-sized I/O */
694db2388b6SAlex Elder 
695db2388b6SAlex Elder 	if (ondisk->options.order < SECTOR_SHIFT)
696db2388b6SAlex Elder 		return false;
697db2388b6SAlex Elder 
698db2388b6SAlex Elder 	/* If we use u64 in a few spots we may be able to loosen this */
699db2388b6SAlex Elder 
700db2388b6SAlex Elder 	if (ondisk->options.order > 8 * sizeof (int) - 1)
701db2388b6SAlex Elder 		return false;
702db2388b6SAlex Elder 
703103a150fSAlex Elder 	/*
704103a150fSAlex Elder 	 * The size of a snapshot header has to fit in a size_t, and
705103a150fSAlex Elder 	 * that limits the number of snapshots.
706103a150fSAlex Elder 	 */
707103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
708103a150fSAlex Elder 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
709103a150fSAlex Elder 	if (snap_count > size / sizeof (__le64))
710103a150fSAlex Elder 		return false;
711103a150fSAlex Elder 
712103a150fSAlex Elder 	/*
713103a150fSAlex Elder 	 * Not only that, but the size of the entire the snapshot
714103a150fSAlex Elder 	 * header must also be representable in a size_t.
715103a150fSAlex Elder 	 */
716103a150fSAlex Elder 	size -= snap_count * sizeof (__le64);
717103a150fSAlex Elder 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
718103a150fSAlex Elder 		return false;
719103a150fSAlex Elder 
720103a150fSAlex Elder 	return true;
7218e94af8eSAlex Elder }
7228e94af8eSAlex Elder 
723602adf40SYehuda Sadeh /*
724602adf40SYehuda Sadeh  * Create a new header structure, translate header format from the on-disk
725602adf40SYehuda Sadeh  * header.
726602adf40SYehuda Sadeh  */
727602adf40SYehuda Sadeh static int rbd_header_from_disk(struct rbd_image_header *header,
7284156d998SAlex Elder 				 struct rbd_image_header_ondisk *ondisk)
729602adf40SYehuda Sadeh {
730ccece235SAlex Elder 	u32 snap_count;
73158c17b0eSAlex Elder 	size_t len;
732d2bb24e5SAlex Elder 	size_t size;
733621901d6SAlex Elder 	u32 i;
734602adf40SYehuda Sadeh 
7356a52325fSAlex Elder 	memset(header, 0, sizeof (*header));
7366a52325fSAlex Elder 
737103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
738103a150fSAlex Elder 
73958c17b0eSAlex Elder 	len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
74058c17b0eSAlex Elder 	header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
7416a52325fSAlex Elder 	if (!header->object_prefix)
742602adf40SYehuda Sadeh 		return -ENOMEM;
74358c17b0eSAlex Elder 	memcpy(header->object_prefix, ondisk->object_prefix, len);
74458c17b0eSAlex Elder 	header->object_prefix[len] = '\0';
74500f1f36fSAlex Elder 
746602adf40SYehuda Sadeh 	if (snap_count) {
747f785cc1dSAlex Elder 		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
748f785cc1dSAlex Elder 
749621901d6SAlex Elder 		/* Save a copy of the snapshot names */
750621901d6SAlex Elder 
751f785cc1dSAlex Elder 		if (snap_names_len > (u64) SIZE_MAX)
752f785cc1dSAlex Elder 			return -EIO;
753f785cc1dSAlex Elder 		header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
754602adf40SYehuda Sadeh 		if (!header->snap_names)
7556a52325fSAlex Elder 			goto out_err;
756f785cc1dSAlex Elder 		/*
757f785cc1dSAlex Elder 		 * Note that rbd_dev_v1_header_read() guarantees
758f785cc1dSAlex Elder 		 * the ondisk buffer we're working with has
759f785cc1dSAlex Elder 		 * snap_names_len bytes beyond the end of the
760f785cc1dSAlex Elder 		 * snapshot id array, this memcpy() is safe.
761f785cc1dSAlex Elder 		 */
762f785cc1dSAlex Elder 		memcpy(header->snap_names, &ondisk->snaps[snap_count],
763f785cc1dSAlex Elder 			snap_names_len);
7646a52325fSAlex Elder 
765621901d6SAlex Elder 		/* Record each snapshot's size */
766621901d6SAlex Elder 
767d2bb24e5SAlex Elder 		size = snap_count * sizeof (*header->snap_sizes);
768d2bb24e5SAlex Elder 		header->snap_sizes = kmalloc(size, GFP_KERNEL);
769602adf40SYehuda Sadeh 		if (!header->snap_sizes)
7706a52325fSAlex Elder 			goto out_err;
771621901d6SAlex Elder 		for (i = 0; i < snap_count; i++)
772621901d6SAlex Elder 			header->snap_sizes[i] =
773621901d6SAlex Elder 				le64_to_cpu(ondisk->snaps[i].image_size);
774602adf40SYehuda Sadeh 	} else {
775ccece235SAlex Elder 		WARN_ON(ondisk->snap_names_len);
776602adf40SYehuda Sadeh 		header->snap_names = NULL;
777602adf40SYehuda Sadeh 		header->snap_sizes = NULL;
778602adf40SYehuda Sadeh 	}
779849b4260SAlex Elder 
78034b13184SAlex Elder 	header->features = 0;	/* No features support in v1 images */
781602adf40SYehuda Sadeh 	header->obj_order = ondisk->options.order;
782602adf40SYehuda Sadeh 	header->crypt_type = ondisk->options.crypt_type;
783602adf40SYehuda Sadeh 	header->comp_type = ondisk->options.comp_type;
7846a52325fSAlex Elder 
785621901d6SAlex Elder 	/* Allocate and fill in the snapshot context */
786621901d6SAlex Elder 
787f84344f3SAlex Elder 	header->image_size = le64_to_cpu(ondisk->image_size);
7886a52325fSAlex Elder 	size = sizeof (struct ceph_snap_context);
7896a52325fSAlex Elder 	size += snap_count * sizeof (header->snapc->snaps[0]);
7906a52325fSAlex Elder 	header->snapc = kzalloc(size, GFP_KERNEL);
7916a52325fSAlex Elder 	if (!header->snapc)
7926a52325fSAlex Elder 		goto out_err;
793602adf40SYehuda Sadeh 
794602adf40SYehuda Sadeh 	atomic_set(&header->snapc->nref, 1);
795505cbb9bSAlex Elder 	header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
796602adf40SYehuda Sadeh 	header->snapc->num_snaps = snap_count;
797621901d6SAlex Elder 	for (i = 0; i < snap_count; i++)
798602adf40SYehuda Sadeh 		header->snapc->snaps[i] =
799602adf40SYehuda Sadeh 			le64_to_cpu(ondisk->snaps[i].id);
800602adf40SYehuda Sadeh 
801602adf40SYehuda Sadeh 	return 0;
802602adf40SYehuda Sadeh 
8036a52325fSAlex Elder out_err:
804849b4260SAlex Elder 	kfree(header->snap_sizes);
805ccece235SAlex Elder 	header->snap_sizes = NULL;
806602adf40SYehuda Sadeh 	kfree(header->snap_names);
807ccece235SAlex Elder 	header->snap_names = NULL;
8086a52325fSAlex Elder 	kfree(header->object_prefix);
8096a52325fSAlex Elder 	header->object_prefix = NULL;
810ccece235SAlex Elder 
81100f1f36fSAlex Elder 	return -ENOMEM;
812602adf40SYehuda Sadeh }
813602adf40SYehuda Sadeh 
8149e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
8159e15b77dSAlex Elder {
8169e15b77dSAlex Elder 	struct rbd_snap *snap;
8179e15b77dSAlex Elder 
8189e15b77dSAlex Elder 	if (snap_id == CEPH_NOSNAP)
8199e15b77dSAlex Elder 		return RBD_SNAP_HEAD_NAME;
8209e15b77dSAlex Elder 
8219e15b77dSAlex Elder 	list_for_each_entry(snap, &rbd_dev->snaps, node)
8229e15b77dSAlex Elder 		if (snap_id == snap->id)
8239e15b77dSAlex Elder 			return snap->name;
8249e15b77dSAlex Elder 
8259e15b77dSAlex Elder 	return NULL;
8269e15b77dSAlex Elder }
8279e15b77dSAlex Elder 
8288836b995SAlex Elder static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
829602adf40SYehuda Sadeh {
830602adf40SYehuda Sadeh 
831e86924a8SAlex Elder 	struct rbd_snap *snap;
83200f1f36fSAlex Elder 
833e86924a8SAlex Elder 	list_for_each_entry(snap, &rbd_dev->snaps, node) {
834e86924a8SAlex Elder 		if (!strcmp(snap_name, snap->name)) {
8350d7dbfceSAlex Elder 			rbd_dev->spec->snap_id = snap->id;
836e86924a8SAlex Elder 			rbd_dev->mapping.size = snap->size;
83734b13184SAlex Elder 			rbd_dev->mapping.features = snap->features;
83800f1f36fSAlex Elder 
839e86924a8SAlex Elder 			return 0;
840602adf40SYehuda Sadeh 		}
84100f1f36fSAlex Elder 	}
842e86924a8SAlex Elder 
84300f1f36fSAlex Elder 	return -ENOENT;
84400f1f36fSAlex Elder }
845602adf40SYehuda Sadeh 
846819d52bfSAlex Elder static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
847602adf40SYehuda Sadeh {
84878dc447dSAlex Elder 	int ret;
849602adf40SYehuda Sadeh 
8500d7dbfceSAlex Elder 	if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
851cc9d734cSJosh Durgin 		    sizeof (RBD_SNAP_HEAD_NAME))) {
8520d7dbfceSAlex Elder 		rbd_dev->spec->snap_id = CEPH_NOSNAP;
85399c1f08fSAlex Elder 		rbd_dev->mapping.size = rbd_dev->header.image_size;
85434b13184SAlex Elder 		rbd_dev->mapping.features = rbd_dev->header.features;
855e86924a8SAlex Elder 		ret = 0;
856602adf40SYehuda Sadeh 	} else {
8570d7dbfceSAlex Elder 		ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
858602adf40SYehuda Sadeh 		if (ret < 0)
859602adf40SYehuda Sadeh 			goto done;
860f84344f3SAlex Elder 		rbd_dev->mapping.read_only = true;
861602adf40SYehuda Sadeh 	}
8626d292906SAlex Elder 	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
8636d292906SAlex Elder 
864602adf40SYehuda Sadeh done:
865602adf40SYehuda Sadeh 	return ret;
866602adf40SYehuda Sadeh }
867602adf40SYehuda Sadeh 
868602adf40SYehuda Sadeh static void rbd_header_free(struct rbd_image_header *header)
869602adf40SYehuda Sadeh {
870849b4260SAlex Elder 	kfree(header->object_prefix);
871d78fd7aeSAlex Elder 	header->object_prefix = NULL;
872602adf40SYehuda Sadeh 	kfree(header->snap_sizes);
873d78fd7aeSAlex Elder 	header->snap_sizes = NULL;
874849b4260SAlex Elder 	kfree(header->snap_names);
875d78fd7aeSAlex Elder 	header->snap_names = NULL;
876d1d25646SJosh Durgin 	ceph_put_snap_context(header->snapc);
877d78fd7aeSAlex Elder 	header->snapc = NULL;
878602adf40SYehuda Sadeh }
879602adf40SYehuda Sadeh 
88098571b5aSAlex Elder static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
881602adf40SYehuda Sadeh {
88265ccfe21SAlex Elder 	char *name;
88365ccfe21SAlex Elder 	u64 segment;
88465ccfe21SAlex Elder 	int ret;
885602adf40SYehuda Sadeh 
8862fd82b9eSAlex Elder 	name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
88765ccfe21SAlex Elder 	if (!name)
88865ccfe21SAlex Elder 		return NULL;
88965ccfe21SAlex Elder 	segment = offset >> rbd_dev->header.obj_order;
8902fd82b9eSAlex Elder 	ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
89165ccfe21SAlex Elder 			rbd_dev->header.object_prefix, segment);
8922fd82b9eSAlex Elder 	if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
89365ccfe21SAlex Elder 		pr_err("error formatting segment name for #%llu (%d)\n",
89465ccfe21SAlex Elder 			segment, ret);
89565ccfe21SAlex Elder 		kfree(name);
89665ccfe21SAlex Elder 		name = NULL;
89765ccfe21SAlex Elder 	}
898602adf40SYehuda Sadeh 
89965ccfe21SAlex Elder 	return name;
90065ccfe21SAlex Elder }
901602adf40SYehuda Sadeh 
90265ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
90365ccfe21SAlex Elder {
90465ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
905602adf40SYehuda Sadeh 
90665ccfe21SAlex Elder 	return offset & (segment_size - 1);
90765ccfe21SAlex Elder }
90865ccfe21SAlex Elder 
90965ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev,
91065ccfe21SAlex Elder 				u64 offset, u64 length)
91165ccfe21SAlex Elder {
91265ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
91365ccfe21SAlex Elder 
91465ccfe21SAlex Elder 	offset &= segment_size - 1;
91565ccfe21SAlex Elder 
916aafb230eSAlex Elder 	rbd_assert(length <= U64_MAX - offset);
91765ccfe21SAlex Elder 	if (offset + length > segment_size)
91865ccfe21SAlex Elder 		length = segment_size - offset;
91965ccfe21SAlex Elder 
92065ccfe21SAlex Elder 	return length;
921602adf40SYehuda Sadeh }
922602adf40SYehuda Sadeh 
923602adf40SYehuda Sadeh /*
924029bcbd8SJosh Durgin  * returns the size of an object in the image
925029bcbd8SJosh Durgin  */
926029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header)
927029bcbd8SJosh Durgin {
928029bcbd8SJosh Durgin 	return 1 << header->obj_order;
929029bcbd8SJosh Durgin }
930029bcbd8SJosh Durgin 
931029bcbd8SJosh Durgin /*
932602adf40SYehuda Sadeh  * bio helpers
933602adf40SYehuda Sadeh  */
934602adf40SYehuda Sadeh 
935602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain)
936602adf40SYehuda Sadeh {
937602adf40SYehuda Sadeh 	struct bio *tmp;
938602adf40SYehuda Sadeh 
939602adf40SYehuda Sadeh 	while (chain) {
940602adf40SYehuda Sadeh 		tmp = chain;
941602adf40SYehuda Sadeh 		chain = chain->bi_next;
942602adf40SYehuda Sadeh 		bio_put(tmp);
943602adf40SYehuda Sadeh 	}
944602adf40SYehuda Sadeh }
945602adf40SYehuda Sadeh 
946602adf40SYehuda Sadeh /*
947602adf40SYehuda Sadeh  * zeros a bio chain, starting at specific offset
948602adf40SYehuda Sadeh  */
949602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs)
950602adf40SYehuda Sadeh {
951602adf40SYehuda Sadeh 	struct bio_vec *bv;
952602adf40SYehuda Sadeh 	unsigned long flags;
953602adf40SYehuda Sadeh 	void *buf;
954602adf40SYehuda Sadeh 	int i;
955602adf40SYehuda Sadeh 	int pos = 0;
956602adf40SYehuda Sadeh 
957602adf40SYehuda Sadeh 	while (chain) {
958602adf40SYehuda Sadeh 		bio_for_each_segment(bv, chain, i) {
959602adf40SYehuda Sadeh 			if (pos + bv->bv_len > start_ofs) {
960602adf40SYehuda Sadeh 				int remainder = max(start_ofs - pos, 0);
961602adf40SYehuda Sadeh 				buf = bvec_kmap_irq(bv, &flags);
962602adf40SYehuda Sadeh 				memset(buf + remainder, 0,
963602adf40SYehuda Sadeh 				       bv->bv_len - remainder);
96485b5aaa6SDan Carpenter 				bvec_kunmap_irq(buf, &flags);
965602adf40SYehuda Sadeh 			}
966602adf40SYehuda Sadeh 			pos += bv->bv_len;
967602adf40SYehuda Sadeh 		}
968602adf40SYehuda Sadeh 
969602adf40SYehuda Sadeh 		chain = chain->bi_next;
970602adf40SYehuda Sadeh 	}
971602adf40SYehuda Sadeh }
972602adf40SYehuda Sadeh 
973602adf40SYehuda Sadeh /*
974f7760dadSAlex Elder  * Clone a portion of a bio, starting at the given byte offset
975f7760dadSAlex Elder  * and continuing for the number of bytes indicated.
976602adf40SYehuda Sadeh  */
977f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src,
978f7760dadSAlex Elder 					unsigned int offset,
979f7760dadSAlex Elder 					unsigned int len,
980f7760dadSAlex Elder 					gfp_t gfpmask)
981602adf40SYehuda Sadeh {
982f7760dadSAlex Elder 	struct bio_vec *bv;
983f7760dadSAlex Elder 	unsigned int resid;
984f7760dadSAlex Elder 	unsigned short idx;
985f7760dadSAlex Elder 	unsigned int voff;
986f7760dadSAlex Elder 	unsigned short end_idx;
987f7760dadSAlex Elder 	unsigned short vcnt;
988f7760dadSAlex Elder 	struct bio *bio;
989602adf40SYehuda Sadeh 
990f7760dadSAlex Elder 	/* Handle the easy case for the caller */
991f7760dadSAlex Elder 
992f7760dadSAlex Elder 	if (!offset && len == bio_src->bi_size)
993f7760dadSAlex Elder 		return bio_clone(bio_src, gfpmask);
994f7760dadSAlex Elder 
995f7760dadSAlex Elder 	if (WARN_ON_ONCE(!len))
996f7760dadSAlex Elder 		return NULL;
997f7760dadSAlex Elder 	if (WARN_ON_ONCE(len > bio_src->bi_size))
998f7760dadSAlex Elder 		return NULL;
999f7760dadSAlex Elder 	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
1000f7760dadSAlex Elder 		return NULL;
1001f7760dadSAlex Elder 
1002f7760dadSAlex Elder 	/* Find first affected segment... */
1003f7760dadSAlex Elder 
1004f7760dadSAlex Elder 	resid = offset;
1005f7760dadSAlex Elder 	__bio_for_each_segment(bv, bio_src, idx, 0) {
1006f7760dadSAlex Elder 		if (resid < bv->bv_len)
1007f7760dadSAlex Elder 			break;
1008f7760dadSAlex Elder 		resid -= bv->bv_len;
1009602adf40SYehuda Sadeh 	}
1010f7760dadSAlex Elder 	voff = resid;
1011602adf40SYehuda Sadeh 
1012f7760dadSAlex Elder 	/* ...and the last affected segment */
1013542582fcSAlex Elder 
1014f7760dadSAlex Elder 	resid += len;
1015f7760dadSAlex Elder 	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
1016f7760dadSAlex Elder 		if (resid <= bv->bv_len)
1017f7760dadSAlex Elder 			break;
1018f7760dadSAlex Elder 		resid -= bv->bv_len;
1019f7760dadSAlex Elder 	}
1020f7760dadSAlex Elder 	vcnt = end_idx - idx + 1;
1021602adf40SYehuda Sadeh 
1022f7760dadSAlex Elder 	/* Build the clone */
1023f7760dadSAlex Elder 
1024f7760dadSAlex Elder 	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
1025f7760dadSAlex Elder 	if (!bio)
1026f7760dadSAlex Elder 		return NULL;	/* ENOMEM */
1027f7760dadSAlex Elder 
1028f7760dadSAlex Elder 	bio->bi_bdev = bio_src->bi_bdev;
1029f7760dadSAlex Elder 	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
1030f7760dadSAlex Elder 	bio->bi_rw = bio_src->bi_rw;
1031f7760dadSAlex Elder 	bio->bi_flags |= 1 << BIO_CLONED;
1032602adf40SYehuda Sadeh 
1033602adf40SYehuda Sadeh 	/*
1034f7760dadSAlex Elder 	 * Copy over our part of the bio_vec, then update the first
1035f7760dadSAlex Elder 	 * and last (or only) entries.
1036602adf40SYehuda Sadeh 	 */
1037f7760dadSAlex Elder 	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
1038f7760dadSAlex Elder 			vcnt * sizeof (struct bio_vec));
1039f7760dadSAlex Elder 	bio->bi_io_vec[0].bv_offset += voff;
1040f7760dadSAlex Elder 	if (vcnt > 1) {
1041f7760dadSAlex Elder 		bio->bi_io_vec[0].bv_len -= voff;
1042f7760dadSAlex Elder 		bio->bi_io_vec[vcnt - 1].bv_len = resid;
1043602adf40SYehuda Sadeh 	} else {
1044f7760dadSAlex Elder 		bio->bi_io_vec[0].bv_len = len;
1045602adf40SYehuda Sadeh 	}
1046602adf40SYehuda Sadeh 
1047f7760dadSAlex Elder 	bio->bi_vcnt = vcnt;
1048f7760dadSAlex Elder 	bio->bi_size = len;
1049f7760dadSAlex Elder 	bio->bi_idx = 0;
1050602adf40SYehuda Sadeh 
1051f7760dadSAlex Elder 	return bio;
1052602adf40SYehuda Sadeh }
1053602adf40SYehuda Sadeh 
1054f7760dadSAlex Elder /*
1055f7760dadSAlex Elder  * Clone a portion of a bio chain, starting at the given byte offset
1056f7760dadSAlex Elder  * into the first bio in the source chain and continuing for the
1057f7760dadSAlex Elder  * number of bytes indicated.  The result is another bio chain of
1058f7760dadSAlex Elder  * exactly the given length, or a null pointer on error.
1059f7760dadSAlex Elder  *
1060f7760dadSAlex Elder  * The bio_src and offset parameters are both in-out.  On entry they
1061f7760dadSAlex Elder  * refer to the first source bio and the offset into that bio where
1062f7760dadSAlex Elder  * the start of data to be cloned is located.
1063f7760dadSAlex Elder  *
1064f7760dadSAlex Elder  * On return, bio_src is updated to refer to the bio in the source
1065f7760dadSAlex Elder  * chain that contains first un-cloned byte, and *offset will
1066f7760dadSAlex Elder  * contain the offset of that byte within that bio.
1067f7760dadSAlex Elder  */
1068f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src,
1069f7760dadSAlex Elder 					unsigned int *offset,
1070f7760dadSAlex Elder 					unsigned int len,
1071f7760dadSAlex Elder 					gfp_t gfpmask)
1072f7760dadSAlex Elder {
1073f7760dadSAlex Elder 	struct bio *bi = *bio_src;
1074f7760dadSAlex Elder 	unsigned int off = *offset;
1075f7760dadSAlex Elder 	struct bio *chain = NULL;
1076f7760dadSAlex Elder 	struct bio **end;
1077602adf40SYehuda Sadeh 
1078f7760dadSAlex Elder 	/* Build up a chain of clone bios up to the limit */
1079602adf40SYehuda Sadeh 
1080f7760dadSAlex Elder 	if (!bi || off >= bi->bi_size || !len)
1081f7760dadSAlex Elder 		return NULL;		/* Nothing to clone */
1082602adf40SYehuda Sadeh 
1083f7760dadSAlex Elder 	end = &chain;
1084f7760dadSAlex Elder 	while (len) {
1085f7760dadSAlex Elder 		unsigned int bi_size;
1086f7760dadSAlex Elder 		struct bio *bio;
1087f7760dadSAlex Elder 
1088f5400b7aSAlex Elder 		if (!bi) {
1089f5400b7aSAlex Elder 			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
1090f7760dadSAlex Elder 			goto out_err;	/* EINVAL; ran out of bio's */
1091f5400b7aSAlex Elder 		}
1092f7760dadSAlex Elder 		bi_size = min_t(unsigned int, bi->bi_size - off, len);
1093f7760dadSAlex Elder 		bio = bio_clone_range(bi, off, bi_size, gfpmask);
1094f7760dadSAlex Elder 		if (!bio)
1095f7760dadSAlex Elder 			goto out_err;	/* ENOMEM */
1096f7760dadSAlex Elder 
1097f7760dadSAlex Elder 		*end = bio;
1098f7760dadSAlex Elder 		end = &bio->bi_next;
1099f7760dadSAlex Elder 
1100f7760dadSAlex Elder 		off += bi_size;
1101f7760dadSAlex Elder 		if (off == bi->bi_size) {
1102f7760dadSAlex Elder 			bi = bi->bi_next;
1103f7760dadSAlex Elder 			off = 0;
1104f7760dadSAlex Elder 		}
1105f7760dadSAlex Elder 		len -= bi_size;
1106f7760dadSAlex Elder 	}
1107f7760dadSAlex Elder 	*bio_src = bi;
1108f7760dadSAlex Elder 	*offset = off;
1109f7760dadSAlex Elder 
1110f7760dadSAlex Elder 	return chain;
1111f7760dadSAlex Elder out_err:
1112f7760dadSAlex Elder 	bio_chain_put(chain);
1113f7760dadSAlex Elder 
1114602adf40SYehuda Sadeh 	return NULL;
1115602adf40SYehuda Sadeh }
1116602adf40SYehuda Sadeh 
1117926f9b3fSAlex Elder /*
1118926f9b3fSAlex Elder  * The default/initial value for all object request flags is 0.  For
1119926f9b3fSAlex Elder  * each flag, once its value is set to 1 it is never reset to 0
1120926f9b3fSAlex Elder  * again.
1121926f9b3fSAlex Elder  */
11226365d33aSAlex Elder static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
11236365d33aSAlex Elder {
11246365d33aSAlex Elder 	if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
11256365d33aSAlex Elder 		struct rbd_device *rbd_dev;
11266365d33aSAlex Elder 
112757acbaa7SAlex Elder 		rbd_dev = obj_request->img_request->rbd_dev;
11286365d33aSAlex Elder 		rbd_warn(rbd_dev, "obj_request %p already marked img_data\n",
11296365d33aSAlex Elder 			obj_request);
11306365d33aSAlex Elder 	}
11316365d33aSAlex Elder }
11326365d33aSAlex Elder 
11336365d33aSAlex Elder static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
11346365d33aSAlex Elder {
11356365d33aSAlex Elder 	smp_mb();
11366365d33aSAlex Elder 	return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
11376365d33aSAlex Elder }
11386365d33aSAlex Elder 
113957acbaa7SAlex Elder static void obj_request_done_set(struct rbd_obj_request *obj_request)
114057acbaa7SAlex Elder {
114157acbaa7SAlex Elder 	if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
114257acbaa7SAlex Elder 		struct rbd_device *rbd_dev = NULL;
114357acbaa7SAlex Elder 
114457acbaa7SAlex Elder 		if (obj_request_img_data_test(obj_request))
114557acbaa7SAlex Elder 			rbd_dev = obj_request->img_request->rbd_dev;
114657acbaa7SAlex Elder 		rbd_warn(rbd_dev, "obj_request %p already marked done\n",
114757acbaa7SAlex Elder 			obj_request);
114857acbaa7SAlex Elder 	}
114957acbaa7SAlex Elder }
115057acbaa7SAlex Elder 
115157acbaa7SAlex Elder static bool obj_request_done_test(struct rbd_obj_request *obj_request)
115257acbaa7SAlex Elder {
115357acbaa7SAlex Elder 	smp_mb();
115457acbaa7SAlex Elder 	return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
115557acbaa7SAlex Elder }
115657acbaa7SAlex Elder 
11575679c59fSAlex Elder /*
11585679c59fSAlex Elder  * This sets the KNOWN flag after (possibly) setting the EXISTS
11595679c59fSAlex Elder  * flag.  The latter is set based on the "exists" value provided.
11605679c59fSAlex Elder  *
11615679c59fSAlex Elder  * Note that for our purposes once an object exists it never goes
11625679c59fSAlex Elder  * away again.  It's possible that the response from two existence
11635679c59fSAlex Elder  * checks are separated by the creation of the target object, and
11645679c59fSAlex Elder  * the first ("doesn't exist") response arrives *after* the second
11655679c59fSAlex Elder  * ("does exist").  In that case we ignore the second one.
11665679c59fSAlex Elder  */
11675679c59fSAlex Elder static void obj_request_existence_set(struct rbd_obj_request *obj_request,
11685679c59fSAlex Elder 				bool exists)
11695679c59fSAlex Elder {
11705679c59fSAlex Elder 	if (exists)
11715679c59fSAlex Elder 		set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
11725679c59fSAlex Elder 	set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
11735679c59fSAlex Elder 	smp_mb();
11745679c59fSAlex Elder }
11755679c59fSAlex Elder 
11765679c59fSAlex Elder static bool obj_request_known_test(struct rbd_obj_request *obj_request)
11775679c59fSAlex Elder {
11785679c59fSAlex Elder 	smp_mb();
11795679c59fSAlex Elder 	return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
11805679c59fSAlex Elder }
11815679c59fSAlex Elder 
11825679c59fSAlex Elder static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
11835679c59fSAlex Elder {
11845679c59fSAlex Elder 	smp_mb();
11855679c59fSAlex Elder 	return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
11865679c59fSAlex Elder }
11875679c59fSAlex Elder 
1188bf0d5f50SAlex Elder static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1189bf0d5f50SAlex Elder {
119037206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
119137206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1192bf0d5f50SAlex Elder 	kref_get(&obj_request->kref);
1193bf0d5f50SAlex Elder }
1194bf0d5f50SAlex Elder 
1195bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref);
1196bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1197bf0d5f50SAlex Elder {
1198bf0d5f50SAlex Elder 	rbd_assert(obj_request != NULL);
119937206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
120037206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1201bf0d5f50SAlex Elder 	kref_put(&obj_request->kref, rbd_obj_request_destroy);
1202bf0d5f50SAlex Elder }
1203bf0d5f50SAlex Elder 
1204bf0d5f50SAlex Elder static void rbd_img_request_get(struct rbd_img_request *img_request)
1205bf0d5f50SAlex Elder {
120637206ee5SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
120737206ee5SAlex Elder 		atomic_read(&img_request->kref.refcount));
1208bf0d5f50SAlex Elder 	kref_get(&img_request->kref);
1209bf0d5f50SAlex Elder }
1210bf0d5f50SAlex Elder 
1211bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref);
1212bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request)
1213bf0d5f50SAlex Elder {
1214bf0d5f50SAlex Elder 	rbd_assert(img_request != NULL);
121537206ee5SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
121637206ee5SAlex Elder 		atomic_read(&img_request->kref.refcount));
1217bf0d5f50SAlex Elder 	kref_put(&img_request->kref, rbd_img_request_destroy);
1218bf0d5f50SAlex Elder }
1219bf0d5f50SAlex Elder 
1220bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1221bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1222bf0d5f50SAlex Elder {
122325dcf954SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
122425dcf954SAlex Elder 
1225b155e86cSAlex Elder 	/* Image request now owns object's original reference */
1226bf0d5f50SAlex Elder 	obj_request->img_request = img_request;
122725dcf954SAlex Elder 	obj_request->which = img_request->obj_request_count;
12286365d33aSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
12296365d33aSAlex Elder 	obj_request_img_data_set(obj_request);
1230bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
123125dcf954SAlex Elder 	img_request->obj_request_count++;
123225dcf954SAlex Elder 	list_add_tail(&obj_request->links, &img_request->obj_requests);
123337206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
123437206ee5SAlex Elder 		obj_request->which);
1235bf0d5f50SAlex Elder }
1236bf0d5f50SAlex Elder 
1237bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1238bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1239bf0d5f50SAlex Elder {
1240bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
124125dcf954SAlex Elder 
124237206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
124337206ee5SAlex Elder 		obj_request->which);
1244bf0d5f50SAlex Elder 	list_del(&obj_request->links);
124525dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
124625dcf954SAlex Elder 	img_request->obj_request_count--;
124725dcf954SAlex Elder 	rbd_assert(obj_request->which == img_request->obj_request_count);
124825dcf954SAlex Elder 	obj_request->which = BAD_WHICH;
12496365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
1250bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == img_request);
1251bf0d5f50SAlex Elder 	obj_request->img_request = NULL;
125225dcf954SAlex Elder 	obj_request->callback = NULL;
1253bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
1254bf0d5f50SAlex Elder }
1255bf0d5f50SAlex Elder 
1256bf0d5f50SAlex Elder static bool obj_request_type_valid(enum obj_request_type type)
1257bf0d5f50SAlex Elder {
1258bf0d5f50SAlex Elder 	switch (type) {
12599969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
1260bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1261788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1262bf0d5f50SAlex Elder 		return true;
1263bf0d5f50SAlex Elder 	default:
1264bf0d5f50SAlex Elder 		return false;
1265bf0d5f50SAlex Elder 	}
1266bf0d5f50SAlex Elder }
1267bf0d5f50SAlex Elder 
1268bf0d5f50SAlex Elder static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1269bf0d5f50SAlex Elder 				struct rbd_obj_request *obj_request)
1270bf0d5f50SAlex Elder {
127137206ee5SAlex Elder 	dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);
127237206ee5SAlex Elder 
1273bf0d5f50SAlex Elder 	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1274bf0d5f50SAlex Elder }
1275bf0d5f50SAlex Elder 
1276bf0d5f50SAlex Elder static void rbd_img_request_complete(struct rbd_img_request *img_request)
1277bf0d5f50SAlex Elder {
127855f27e09SAlex Elder 
127937206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
128055f27e09SAlex Elder 
128155f27e09SAlex Elder 	/*
128255f27e09SAlex Elder 	 * If no error occurred, compute the aggregate transfer
128355f27e09SAlex Elder 	 * count for the image request.  We could instead use
128455f27e09SAlex Elder 	 * atomic64_cmpxchg() to update it as each object request
128555f27e09SAlex Elder 	 * completes; not clear which way is better off hand.
128655f27e09SAlex Elder 	 */
128755f27e09SAlex Elder 	if (!img_request->result) {
128855f27e09SAlex Elder 		struct rbd_obj_request *obj_request;
128955f27e09SAlex Elder 		u64 xferred = 0;
129055f27e09SAlex Elder 
129155f27e09SAlex Elder 		for_each_obj_request(img_request, obj_request)
129255f27e09SAlex Elder 			xferred += obj_request->xferred;
129355f27e09SAlex Elder 		img_request->xferred = xferred;
129455f27e09SAlex Elder 	}
129555f27e09SAlex Elder 
1296bf0d5f50SAlex Elder 	if (img_request->callback)
1297bf0d5f50SAlex Elder 		img_request->callback(img_request);
1298bf0d5f50SAlex Elder 	else
1299bf0d5f50SAlex Elder 		rbd_img_request_put(img_request);
1300bf0d5f50SAlex Elder }
1301bf0d5f50SAlex Elder 
1302788e2df3SAlex Elder /* Caller is responsible for rbd_obj_request_destroy(obj_request) */
1303788e2df3SAlex Elder 
1304788e2df3SAlex Elder static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
1305788e2df3SAlex Elder {
130637206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
130737206ee5SAlex Elder 
1308788e2df3SAlex Elder 	return wait_for_completion_interruptible(&obj_request->completion);
1309788e2df3SAlex Elder }
1310788e2df3SAlex Elder 
13110c425248SAlex Elder /*
13120c425248SAlex Elder  * The default/initial value for all image request flags is 0.  Each
13130c425248SAlex Elder  * is conditionally set to 1 at image request initialization time
13140c425248SAlex Elder  * and currently never change thereafter.
13150c425248SAlex Elder  */
13160c425248SAlex Elder static void img_request_write_set(struct rbd_img_request *img_request)
13170c425248SAlex Elder {
13180c425248SAlex Elder 	set_bit(IMG_REQ_WRITE, &img_request->flags);
13190c425248SAlex Elder 	smp_mb();
13200c425248SAlex Elder }
13210c425248SAlex Elder 
13220c425248SAlex Elder static bool img_request_write_test(struct rbd_img_request *img_request)
13230c425248SAlex Elder {
13240c425248SAlex Elder 	smp_mb();
13250c425248SAlex Elder 	return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
13260c425248SAlex Elder }
13270c425248SAlex Elder 
13289849e986SAlex Elder static void img_request_child_set(struct rbd_img_request *img_request)
13299849e986SAlex Elder {
13309849e986SAlex Elder 	set_bit(IMG_REQ_CHILD, &img_request->flags);
13319849e986SAlex Elder 	smp_mb();
13329849e986SAlex Elder }
13339849e986SAlex Elder 
13349849e986SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request)
13359849e986SAlex Elder {
13369849e986SAlex Elder 	smp_mb();
13379849e986SAlex Elder 	return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
13389849e986SAlex Elder }
13399849e986SAlex Elder 
1340d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request)
1341d0b2e944SAlex Elder {
1342d0b2e944SAlex Elder 	set_bit(IMG_REQ_LAYERED, &img_request->flags);
1343d0b2e944SAlex Elder 	smp_mb();
1344d0b2e944SAlex Elder }
1345d0b2e944SAlex Elder 
1346d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request)
1347d0b2e944SAlex Elder {
1348d0b2e944SAlex Elder 	smp_mb();
1349d0b2e944SAlex Elder 	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1350d0b2e944SAlex Elder }
1351d0b2e944SAlex Elder 
13526e2a4505SAlex Elder static void
13536e2a4505SAlex Elder rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
13546e2a4505SAlex Elder {
13556e2a4505SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
13566e2a4505SAlex Elder 		obj_request, obj_request->img_request, obj_request->result,
13576e2a4505SAlex Elder 		obj_request->xferred, obj_request->length);
13586e2a4505SAlex Elder 	/*
13596e2a4505SAlex Elder 	 * ENOENT means a hole in the image.  We zero-fill the
13606e2a4505SAlex Elder 	 * entire length of the request.  A short read also implies
13616e2a4505SAlex Elder 	 * zero-fill to the end of the request.  Either way we
13626e2a4505SAlex Elder 	 * update the xferred count to indicate the whole request
13636e2a4505SAlex Elder 	 * was satisfied.
13646e2a4505SAlex Elder 	 */
13656e2a4505SAlex Elder 	BUG_ON(obj_request->type != OBJ_REQUEST_BIO);
13666e2a4505SAlex Elder 	if (obj_request->result == -ENOENT) {
13676e2a4505SAlex Elder 		zero_bio_chain(obj_request->bio_list, 0);
13686e2a4505SAlex Elder 		obj_request->result = 0;
13696e2a4505SAlex Elder 		obj_request->xferred = obj_request->length;
13706e2a4505SAlex Elder 	} else if (obj_request->xferred < obj_request->length &&
13716e2a4505SAlex Elder 			!obj_request->result) {
13726e2a4505SAlex Elder 		zero_bio_chain(obj_request->bio_list, obj_request->xferred);
13736e2a4505SAlex Elder 		obj_request->xferred = obj_request->length;
13746e2a4505SAlex Elder 	}
13756e2a4505SAlex Elder 	obj_request_done_set(obj_request);
13766e2a4505SAlex Elder }
13776e2a4505SAlex Elder 
1378bf0d5f50SAlex Elder static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1379bf0d5f50SAlex Elder {
138037206ee5SAlex Elder 	dout("%s: obj %p cb %p\n", __func__, obj_request,
138137206ee5SAlex Elder 		obj_request->callback);
1382bf0d5f50SAlex Elder 	if (obj_request->callback)
1383bf0d5f50SAlex Elder 		obj_request->callback(obj_request);
1384788e2df3SAlex Elder 	else
1385788e2df3SAlex Elder 		complete_all(&obj_request->completion);
1386bf0d5f50SAlex Elder }
1387bf0d5f50SAlex Elder 
/*
 * Completion handling for osd ops that need no post-processing:
 * the object request is simply marked done.
 */
static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
139339bf2c5dSAlex Elder 
1394c47f9371SAlex Elder static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
1395bf0d5f50SAlex Elder {
139657acbaa7SAlex Elder 	struct rbd_img_request *img_request = NULL;
139757acbaa7SAlex Elder 	bool layered = false;
139857acbaa7SAlex Elder 
139957acbaa7SAlex Elder 	if (obj_request_img_data_test(obj_request)) {
140057acbaa7SAlex Elder 		img_request = obj_request->img_request;
140157acbaa7SAlex Elder 		layered = img_request && img_request_layered_test(img_request);
140257acbaa7SAlex Elder 	} else {
140357acbaa7SAlex Elder 		img_request = NULL;
140457acbaa7SAlex Elder 		layered = false;
140557acbaa7SAlex Elder 	}
14068b3e1a56SAlex Elder 
14078b3e1a56SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
14088b3e1a56SAlex Elder 		obj_request, img_request, obj_request->result,
14098b3e1a56SAlex Elder 		obj_request->xferred, obj_request->length);
14108b3e1a56SAlex Elder 	if (layered && obj_request->result == -ENOENT)
14118b3e1a56SAlex Elder 		rbd_img_parent_read(obj_request);
14128b3e1a56SAlex Elder 	else if (img_request)
14136e2a4505SAlex Elder 		rbd_img_obj_request_read_callback(obj_request);
14146e2a4505SAlex Elder 	else
141507741308SAlex Elder 		obj_request_done_set(obj_request);
1416bf0d5f50SAlex Elder }
1417bf0d5f50SAlex Elder 
1418c47f9371SAlex Elder static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
1419bf0d5f50SAlex Elder {
14201b83bef2SSage Weil 	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
14211b83bef2SSage Weil 		obj_request->result, obj_request->length);
14221b83bef2SSage Weil 	/*
14238b3e1a56SAlex Elder 	 * There is no such thing as a successful short write.  Set
14248b3e1a56SAlex Elder 	 * it to our originally-requested length.
14251b83bef2SSage Weil 	 */
14261b83bef2SSage Weil 	obj_request->xferred = obj_request->length;
142707741308SAlex Elder 	obj_request_done_set(obj_request);
1428bf0d5f50SAlex Elder }
1429bf0d5f50SAlex Elder 
/*
 * Completion handling for a STAT op.  A plain stat call needs no
 * further work, so the object request is just marked done.  More
 * will happen here when the stat is part of a write sequence for a
 * layered image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
1439fbfab539SAlex Elder 
1440bf0d5f50SAlex Elder static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1441bf0d5f50SAlex Elder 				struct ceph_msg *msg)
1442bf0d5f50SAlex Elder {
1443bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = osd_req->r_priv;
1444bf0d5f50SAlex Elder 	u16 opcode;
1445bf0d5f50SAlex Elder 
144637206ee5SAlex Elder 	dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
1447bf0d5f50SAlex Elder 	rbd_assert(osd_req == obj_request->osd_req);
144857acbaa7SAlex Elder 	if (obj_request_img_data_test(obj_request)) {
144957acbaa7SAlex Elder 		rbd_assert(obj_request->img_request);
145057acbaa7SAlex Elder 		rbd_assert(obj_request->which != BAD_WHICH);
145157acbaa7SAlex Elder 	} else {
145257acbaa7SAlex Elder 		rbd_assert(obj_request->which == BAD_WHICH);
145357acbaa7SAlex Elder 	}
1454bf0d5f50SAlex Elder 
14551b83bef2SSage Weil 	if (osd_req->r_result < 0)
14561b83bef2SSage Weil 		obj_request->result = osd_req->r_result;
1457bf0d5f50SAlex Elder 	obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);
1458bf0d5f50SAlex Elder 
14591b83bef2SSage Weil 	WARN_ON(osd_req->r_num_ops != 1);	/* For now */
1460bf0d5f50SAlex Elder 
1461c47f9371SAlex Elder 	/*
1462c47f9371SAlex Elder 	 * We support a 64-bit length, but ultimately it has to be
1463c47f9371SAlex Elder 	 * passed to blk_end_request(), which takes an unsigned int.
1464c47f9371SAlex Elder 	 */
14651b83bef2SSage Weil 	obj_request->xferred = osd_req->r_reply_op_len[0];
1466c47f9371SAlex Elder 	rbd_assert(obj_request->xferred < (u64)UINT_MAX);
146779528734SAlex Elder 	opcode = osd_req->r_ops[0].op;
1468bf0d5f50SAlex Elder 	switch (opcode) {
1469bf0d5f50SAlex Elder 	case CEPH_OSD_OP_READ:
1470c47f9371SAlex Elder 		rbd_osd_read_callback(obj_request);
1471bf0d5f50SAlex Elder 		break;
1472bf0d5f50SAlex Elder 	case CEPH_OSD_OP_WRITE:
1473c47f9371SAlex Elder 		rbd_osd_write_callback(obj_request);
1474bf0d5f50SAlex Elder 		break;
1475fbfab539SAlex Elder 	case CEPH_OSD_OP_STAT:
1476c47f9371SAlex Elder 		rbd_osd_stat_callback(obj_request);
1477fbfab539SAlex Elder 		break;
147836be9a76SAlex Elder 	case CEPH_OSD_OP_CALL:
1479b8d70035SAlex Elder 	case CEPH_OSD_OP_NOTIFY_ACK:
14809969ebc5SAlex Elder 	case CEPH_OSD_OP_WATCH:
1481c47f9371SAlex Elder 		rbd_osd_trivial_callback(obj_request);
14829969ebc5SAlex Elder 		break;
1483bf0d5f50SAlex Elder 	default:
1484bf0d5f50SAlex Elder 		rbd_warn(NULL, "%s: unsupported op %hu\n",
1485bf0d5f50SAlex Elder 			obj_request->object_name, (unsigned short) opcode);
1486bf0d5f50SAlex Elder 		break;
1487bf0d5f50SAlex Elder 	}
1488bf0d5f50SAlex Elder 
148907741308SAlex Elder 	if (obj_request_done_test(obj_request))
1490bf0d5f50SAlex Elder 		rbd_obj_request_complete(obj_request);
1491bf0d5f50SAlex Elder }
1492bf0d5f50SAlex Elder 
14939d4df01fSAlex Elder static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
1494430c28c3SAlex Elder {
1495430c28c3SAlex Elder 	struct rbd_img_request *img_request = obj_request->img_request;
14968c042b0dSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
14979d4df01fSAlex Elder 	u64 snap_id;
1498430c28c3SAlex Elder 
14998c042b0dSAlex Elder 	rbd_assert(osd_req != NULL);
1500430c28c3SAlex Elder 
15019d4df01fSAlex Elder 	snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
15028c042b0dSAlex Elder 	ceph_osdc_build_request(osd_req, obj_request->offset,
15039d4df01fSAlex Elder 			NULL, snap_id, NULL);
15049d4df01fSAlex Elder }
15059d4df01fSAlex Elder 
15069d4df01fSAlex Elder static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
15079d4df01fSAlex Elder {
15089d4df01fSAlex Elder 	struct rbd_img_request *img_request = obj_request->img_request;
15099d4df01fSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
15109d4df01fSAlex Elder 	struct ceph_snap_context *snapc;
15119d4df01fSAlex Elder 	struct timespec mtime = CURRENT_TIME;
15129d4df01fSAlex Elder 
15139d4df01fSAlex Elder 	rbd_assert(osd_req != NULL);
15149d4df01fSAlex Elder 
15159d4df01fSAlex Elder 	snapc = img_request ? img_request->snapc : NULL;
15169d4df01fSAlex Elder 	ceph_osdc_build_request(osd_req, obj_request->offset,
15179d4df01fSAlex Elder 			snapc, CEPH_NOSNAP, &mtime);
1518430c28c3SAlex Elder }
1519430c28c3SAlex Elder 
1520bf0d5f50SAlex Elder static struct ceph_osd_request *rbd_osd_req_create(
1521bf0d5f50SAlex Elder 					struct rbd_device *rbd_dev,
1522bf0d5f50SAlex Elder 					bool write_request,
1523430c28c3SAlex Elder 					struct rbd_obj_request *obj_request)
1524bf0d5f50SAlex Elder {
1525bf0d5f50SAlex Elder 	struct ceph_snap_context *snapc = NULL;
1526bf0d5f50SAlex Elder 	struct ceph_osd_client *osdc;
1527bf0d5f50SAlex Elder 	struct ceph_osd_request *osd_req;
1528bf0d5f50SAlex Elder 
15296365d33aSAlex Elder 	if (obj_request_img_data_test(obj_request)) {
15306365d33aSAlex Elder 		struct rbd_img_request *img_request = obj_request->img_request;
15316365d33aSAlex Elder 
15320c425248SAlex Elder 		rbd_assert(write_request ==
15330c425248SAlex Elder 				img_request_write_test(img_request));
15340c425248SAlex Elder 		if (write_request)
1535bf0d5f50SAlex Elder 			snapc = img_request->snapc;
1536bf0d5f50SAlex Elder 	}
1537bf0d5f50SAlex Elder 
1538bf0d5f50SAlex Elder 	/* Allocate and initialize the request, for the single op */
1539bf0d5f50SAlex Elder 
1540bf0d5f50SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
1541bf0d5f50SAlex Elder 	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
1542bf0d5f50SAlex Elder 	if (!osd_req)
1543bf0d5f50SAlex Elder 		return NULL;	/* ENOMEM */
1544bf0d5f50SAlex Elder 
1545430c28c3SAlex Elder 	if (write_request)
1546bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1547430c28c3SAlex Elder 	else
1548bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_READ;
1549bf0d5f50SAlex Elder 
1550bf0d5f50SAlex Elder 	osd_req->r_callback = rbd_osd_req_callback;
1551bf0d5f50SAlex Elder 	osd_req->r_priv = obj_request;
1552bf0d5f50SAlex Elder 
1553bf0d5f50SAlex Elder 	osd_req->r_oid_len = strlen(obj_request->object_name);
1554bf0d5f50SAlex Elder 	rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1555bf0d5f50SAlex Elder 	memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1556bf0d5f50SAlex Elder 
1557bf0d5f50SAlex Elder 	osd_req->r_file_layout = rbd_dev->layout;	/* struct */
1558bf0d5f50SAlex Elder 
1559bf0d5f50SAlex Elder 	return osd_req;
1560bf0d5f50SAlex Elder }
1561bf0d5f50SAlex Elder 
/* Drop this driver's reference to @osd_req. */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
1566bf0d5f50SAlex Elder 
1567bf0d5f50SAlex Elder /* object_name is assumed to be a non-null pointer and NUL-terminated */
1568bf0d5f50SAlex Elder 
1569bf0d5f50SAlex Elder static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1570bf0d5f50SAlex Elder 						u64 offset, u64 length,
1571bf0d5f50SAlex Elder 						enum obj_request_type type)
1572bf0d5f50SAlex Elder {
1573bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1574bf0d5f50SAlex Elder 	size_t size;
1575bf0d5f50SAlex Elder 	char *name;
1576bf0d5f50SAlex Elder 
1577bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(type));
1578bf0d5f50SAlex Elder 
1579bf0d5f50SAlex Elder 	size = strlen(object_name) + 1;
1580bf0d5f50SAlex Elder 	obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
1581bf0d5f50SAlex Elder 	if (!obj_request)
1582bf0d5f50SAlex Elder 		return NULL;
1583bf0d5f50SAlex Elder 
1584bf0d5f50SAlex Elder 	name = (char *)(obj_request + 1);
1585bf0d5f50SAlex Elder 	obj_request->object_name = memcpy(name, object_name, size);
1586bf0d5f50SAlex Elder 	obj_request->offset = offset;
1587bf0d5f50SAlex Elder 	obj_request->length = length;
1588926f9b3fSAlex Elder 	obj_request->flags = 0;
1589bf0d5f50SAlex Elder 	obj_request->which = BAD_WHICH;
1590bf0d5f50SAlex Elder 	obj_request->type = type;
1591bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&obj_request->links);
1592788e2df3SAlex Elder 	init_completion(&obj_request->completion);
1593bf0d5f50SAlex Elder 	kref_init(&obj_request->kref);
1594bf0d5f50SAlex Elder 
159537206ee5SAlex Elder 	dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
159637206ee5SAlex Elder 		offset, length, (int)type, obj_request);
159737206ee5SAlex Elder 
1598bf0d5f50SAlex Elder 	return obj_request;
1599bf0d5f50SAlex Elder }
1600bf0d5f50SAlex Elder 
1601bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref)
1602bf0d5f50SAlex Elder {
1603bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1604bf0d5f50SAlex Elder 
1605bf0d5f50SAlex Elder 	obj_request = container_of(kref, struct rbd_obj_request, kref);
1606bf0d5f50SAlex Elder 
160737206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
160837206ee5SAlex Elder 
1609bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
1610bf0d5f50SAlex Elder 	rbd_assert(obj_request->which == BAD_WHICH);
1611bf0d5f50SAlex Elder 
1612bf0d5f50SAlex Elder 	if (obj_request->osd_req)
1613bf0d5f50SAlex Elder 		rbd_osd_req_destroy(obj_request->osd_req);
1614bf0d5f50SAlex Elder 
1615bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
1616bf0d5f50SAlex Elder 	switch (obj_request->type) {
16179969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
16189969ebc5SAlex Elder 		break;		/* Nothing to do */
1619bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1620bf0d5f50SAlex Elder 		if (obj_request->bio_list)
1621bf0d5f50SAlex Elder 			bio_chain_put(obj_request->bio_list);
1622bf0d5f50SAlex Elder 		break;
1623788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1624788e2df3SAlex Elder 		if (obj_request->pages)
1625788e2df3SAlex Elder 			ceph_release_page_vector(obj_request->pages,
1626788e2df3SAlex Elder 						obj_request->page_count);
1627788e2df3SAlex Elder 		break;
1628bf0d5f50SAlex Elder 	}
1629bf0d5f50SAlex Elder 
1630bf0d5f50SAlex Elder 	kfree(obj_request);
1631bf0d5f50SAlex Elder }
1632bf0d5f50SAlex Elder 
1633bf0d5f50SAlex Elder /*
1634bf0d5f50SAlex Elder  * Caller is responsible for filling in the list of object requests
1635bf0d5f50SAlex Elder  * that comprises the image request, and the Linux request pointer
1636bf0d5f50SAlex Elder  * (if there is one).
1637bf0d5f50SAlex Elder  */
1638cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create(
1639cc344fa1SAlex Elder 					struct rbd_device *rbd_dev,
1640bf0d5f50SAlex Elder 					u64 offset, u64 length,
16419849e986SAlex Elder 					bool write_request,
16429849e986SAlex Elder 					bool child_request)
1643bf0d5f50SAlex Elder {
1644bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
1645bf0d5f50SAlex Elder 	struct ceph_snap_context *snapc = NULL;
1646bf0d5f50SAlex Elder 
1647bf0d5f50SAlex Elder 	img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
1648bf0d5f50SAlex Elder 	if (!img_request)
1649bf0d5f50SAlex Elder 		return NULL;
1650bf0d5f50SAlex Elder 
1651bf0d5f50SAlex Elder 	if (write_request) {
1652bf0d5f50SAlex Elder 		down_read(&rbd_dev->header_rwsem);
1653bf0d5f50SAlex Elder 		snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1654bf0d5f50SAlex Elder 		up_read(&rbd_dev->header_rwsem);
1655bf0d5f50SAlex Elder 		if (WARN_ON(!snapc)) {
1656bf0d5f50SAlex Elder 			kfree(img_request);
1657bf0d5f50SAlex Elder 			return NULL;	/* Shouldn't happen */
1658bf0d5f50SAlex Elder 		}
16590c425248SAlex Elder 
1660bf0d5f50SAlex Elder 	}
1661bf0d5f50SAlex Elder 
1662bf0d5f50SAlex Elder 	img_request->rq = NULL;
1663bf0d5f50SAlex Elder 	img_request->rbd_dev = rbd_dev;
1664bf0d5f50SAlex Elder 	img_request->offset = offset;
1665bf0d5f50SAlex Elder 	img_request->length = length;
16660c425248SAlex Elder 	img_request->flags = 0;
16670c425248SAlex Elder 	if (write_request) {
16680c425248SAlex Elder 		img_request_write_set(img_request);
1669bf0d5f50SAlex Elder 		img_request->snapc = snapc;
16700c425248SAlex Elder 	} else {
1671bf0d5f50SAlex Elder 		img_request->snap_id = rbd_dev->spec->snap_id;
16720c425248SAlex Elder 	}
16739849e986SAlex Elder 	if (child_request)
16749849e986SAlex Elder 		img_request_child_set(img_request);
1675d0b2e944SAlex Elder 	if (rbd_dev->parent_spec)
1676d0b2e944SAlex Elder 		img_request_layered_set(img_request);
1677bf0d5f50SAlex Elder 	spin_lock_init(&img_request->completion_lock);
1678bf0d5f50SAlex Elder 	img_request->next_completion = 0;
1679bf0d5f50SAlex Elder 	img_request->callback = NULL;
1680a5a337d4SAlex Elder 	img_request->result = 0;
1681bf0d5f50SAlex Elder 	img_request->obj_request_count = 0;
1682bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&img_request->obj_requests);
1683bf0d5f50SAlex Elder 	kref_init(&img_request->kref);
1684bf0d5f50SAlex Elder 
1685bf0d5f50SAlex Elder 	rbd_img_request_get(img_request);	/* Avoid a warning */
1686bf0d5f50SAlex Elder 	rbd_img_request_put(img_request);	/* TEMPORARY */
1687bf0d5f50SAlex Elder 
168837206ee5SAlex Elder 	dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
168937206ee5SAlex Elder 		write_request ? "write" : "read", offset, length,
169037206ee5SAlex Elder 		img_request);
169137206ee5SAlex Elder 
1692bf0d5f50SAlex Elder 	return img_request;
1693bf0d5f50SAlex Elder }
1694bf0d5f50SAlex Elder 
1695bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref)
1696bf0d5f50SAlex Elder {
1697bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
1698bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1699bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
1700bf0d5f50SAlex Elder 
1701bf0d5f50SAlex Elder 	img_request = container_of(kref, struct rbd_img_request, kref);
1702bf0d5f50SAlex Elder 
170337206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
170437206ee5SAlex Elder 
1705bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1706bf0d5f50SAlex Elder 		rbd_img_obj_request_del(img_request, obj_request);
170725dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count == 0);
1708bf0d5f50SAlex Elder 
17090c425248SAlex Elder 	if (img_request_write_test(img_request))
1710bf0d5f50SAlex Elder 		ceph_put_snap_context(img_request->snapc);
1711bf0d5f50SAlex Elder 
17128b3e1a56SAlex Elder 	if (img_request_child_test(img_request))
17138b3e1a56SAlex Elder 		rbd_obj_request_put(img_request->obj_request);
17148b3e1a56SAlex Elder 
1715bf0d5f50SAlex Elder 	kfree(img_request);
1716bf0d5f50SAlex Elder }
1717bf0d5f50SAlex Elder 
17181217857fSAlex Elder static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
17191217857fSAlex Elder {
17206365d33aSAlex Elder 	struct rbd_img_request *img_request;
17211217857fSAlex Elder 	unsigned int xferred;
17221217857fSAlex Elder 	int result;
17238b3e1a56SAlex Elder 	bool more;
17241217857fSAlex Elder 
17256365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
17266365d33aSAlex Elder 	img_request = obj_request->img_request;
17276365d33aSAlex Elder 
17281217857fSAlex Elder 	rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
17291217857fSAlex Elder 	xferred = (unsigned int)obj_request->xferred;
17301217857fSAlex Elder 	result = obj_request->result;
17311217857fSAlex Elder 	if (result) {
17321217857fSAlex Elder 		struct rbd_device *rbd_dev = img_request->rbd_dev;
17331217857fSAlex Elder 
17341217857fSAlex Elder 		rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n",
17351217857fSAlex Elder 			img_request_write_test(img_request) ? "write" : "read",
17361217857fSAlex Elder 			obj_request->length, obj_request->img_offset,
17371217857fSAlex Elder 			obj_request->offset);
17381217857fSAlex Elder 		rbd_warn(rbd_dev, "  result %d xferred %x\n",
17391217857fSAlex Elder 			result, xferred);
17401217857fSAlex Elder 		if (!img_request->result)
17411217857fSAlex Elder 			img_request->result = result;
17421217857fSAlex Elder 	}
17431217857fSAlex Elder 
17448b3e1a56SAlex Elder 	if (img_request_child_test(img_request)) {
17458b3e1a56SAlex Elder 		rbd_assert(img_request->obj_request != NULL);
17468b3e1a56SAlex Elder 		more = obj_request->which < img_request->obj_request_count - 1;
17478b3e1a56SAlex Elder 	} else {
17488b3e1a56SAlex Elder 		rbd_assert(img_request->rq != NULL);
17498b3e1a56SAlex Elder 		more = blk_end_request(img_request->rq, result, xferred);
17508b3e1a56SAlex Elder 	}
17518b3e1a56SAlex Elder 
17528b3e1a56SAlex Elder 	return more;
17531217857fSAlex Elder }
17541217857fSAlex Elder 
17552169238dSAlex Elder static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
17562169238dSAlex Elder {
17572169238dSAlex Elder 	struct rbd_img_request *img_request;
17582169238dSAlex Elder 	u32 which = obj_request->which;
17592169238dSAlex Elder 	bool more = true;
17602169238dSAlex Elder 
17616365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
17622169238dSAlex Elder 	img_request = obj_request->img_request;
17632169238dSAlex Elder 
17642169238dSAlex Elder 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
17652169238dSAlex Elder 	rbd_assert(img_request != NULL);
17662169238dSAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
17672169238dSAlex Elder 	rbd_assert(which != BAD_WHICH);
17682169238dSAlex Elder 	rbd_assert(which < img_request->obj_request_count);
17692169238dSAlex Elder 	rbd_assert(which >= img_request->next_completion);
17702169238dSAlex Elder 
17712169238dSAlex Elder 	spin_lock_irq(&img_request->completion_lock);
17722169238dSAlex Elder 	if (which != img_request->next_completion)
17732169238dSAlex Elder 		goto out;
17742169238dSAlex Elder 
17752169238dSAlex Elder 	for_each_obj_request_from(img_request, obj_request) {
17762169238dSAlex Elder 		rbd_assert(more);
17772169238dSAlex Elder 		rbd_assert(which < img_request->obj_request_count);
17782169238dSAlex Elder 
17792169238dSAlex Elder 		if (!obj_request_done_test(obj_request))
17802169238dSAlex Elder 			break;
17811217857fSAlex Elder 		more = rbd_img_obj_end_request(obj_request);
17822169238dSAlex Elder 		which++;
17832169238dSAlex Elder 	}
17842169238dSAlex Elder 
17852169238dSAlex Elder 	rbd_assert(more ^ (which == img_request->obj_request_count));
17862169238dSAlex Elder 	img_request->next_completion = which;
17872169238dSAlex Elder out:
17882169238dSAlex Elder 	spin_unlock_irq(&img_request->completion_lock);
17892169238dSAlex Elder 
17902169238dSAlex Elder 	if (!more)
17912169238dSAlex Elder 		rbd_img_request_complete(img_request);
17922169238dSAlex Elder }
17932169238dSAlex Elder 
1794bf0d5f50SAlex Elder static int rbd_img_request_fill_bio(struct rbd_img_request *img_request,
1795bf0d5f50SAlex Elder 					struct bio *bio_list)
1796bf0d5f50SAlex Elder {
1797bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = img_request->rbd_dev;
1798bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = NULL;
1799bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
18000c425248SAlex Elder 	bool write_request = img_request_write_test(img_request);
1801bf0d5f50SAlex Elder 	unsigned int bio_offset;
18027da22d29SAlex Elder 	u64 img_offset;
1803bf0d5f50SAlex Elder 	u64 resid;
1804bf0d5f50SAlex Elder 	u16 opcode;
1805bf0d5f50SAlex Elder 
180637206ee5SAlex Elder 	dout("%s: img %p bio %p\n", __func__, img_request, bio_list);
180737206ee5SAlex Elder 
1808430c28c3SAlex Elder 	opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
1809bf0d5f50SAlex Elder 	bio_offset = 0;
18107da22d29SAlex Elder 	img_offset = img_request->offset;
18117da22d29SAlex Elder 	rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT);
1812bf0d5f50SAlex Elder 	resid = img_request->length;
18134dda41d3SAlex Elder 	rbd_assert(resid > 0);
1814bf0d5f50SAlex Elder 	while (resid) {
18152fa12320SAlex Elder 		struct ceph_osd_request *osd_req;
1816bf0d5f50SAlex Elder 		const char *object_name;
1817bf0d5f50SAlex Elder 		unsigned int clone_size;
1818bf0d5f50SAlex Elder 		u64 offset;
1819bf0d5f50SAlex Elder 		u64 length;
1820bf0d5f50SAlex Elder 
18217da22d29SAlex Elder 		object_name = rbd_segment_name(rbd_dev, img_offset);
1822bf0d5f50SAlex Elder 		if (!object_name)
1823bf0d5f50SAlex Elder 			goto out_unwind;
18247da22d29SAlex Elder 		offset = rbd_segment_offset(rbd_dev, img_offset);
18257da22d29SAlex Elder 		length = rbd_segment_length(rbd_dev, img_offset, resid);
1826bf0d5f50SAlex Elder 		obj_request = rbd_obj_request_create(object_name,
1827bf0d5f50SAlex Elder 						offset, length,
1828bf0d5f50SAlex Elder 						OBJ_REQUEST_BIO);
1829bf0d5f50SAlex Elder 		kfree(object_name);	/* object request has its own copy */
1830bf0d5f50SAlex Elder 		if (!obj_request)
1831bf0d5f50SAlex Elder 			goto out_unwind;
1832bf0d5f50SAlex Elder 
1833bf0d5f50SAlex Elder 		rbd_assert(length <= (u64) UINT_MAX);
1834bf0d5f50SAlex Elder 		clone_size = (unsigned int) length;
1835bf0d5f50SAlex Elder 		obj_request->bio_list = bio_chain_clone_range(&bio_list,
1836bf0d5f50SAlex Elder 						&bio_offset, clone_size,
1837bf0d5f50SAlex Elder 						GFP_ATOMIC);
1838bf0d5f50SAlex Elder 		if (!obj_request->bio_list)
1839bf0d5f50SAlex Elder 			goto out_partial;
1840bf0d5f50SAlex Elder 
18412fa12320SAlex Elder 		osd_req = rbd_osd_req_create(rbd_dev, write_request,
18422fa12320SAlex Elder 						obj_request);
18432fa12320SAlex Elder 		if (!osd_req)
1844bf0d5f50SAlex Elder 			goto out_partial;
18452fa12320SAlex Elder 		obj_request->osd_req = osd_req;
18462169238dSAlex Elder 		obj_request->callback = rbd_img_obj_callback;
1847430c28c3SAlex Elder 
18482fa12320SAlex Elder 		osd_req_op_extent_init(osd_req, 0, opcode, offset, length,
18492fa12320SAlex Elder 						0, 0);
1850406e2c9fSAlex Elder 		osd_req_op_extent_osd_data_bio(osd_req, 0,
1851a4ce40a9SAlex Elder 				obj_request->bio_list, obj_request->length);
18529d4df01fSAlex Elder 
18539d4df01fSAlex Elder 		if (write_request)
18549d4df01fSAlex Elder 			rbd_osd_req_format_write(obj_request);
18559d4df01fSAlex Elder 		else
18569d4df01fSAlex Elder 			rbd_osd_req_format_read(obj_request);
1857430c28c3SAlex Elder 
18587da22d29SAlex Elder 		obj_request->img_offset = img_offset;
1859bf0d5f50SAlex Elder 		rbd_img_obj_request_add(img_request, obj_request);
1860bf0d5f50SAlex Elder 
18617da22d29SAlex Elder 		img_offset += length;
1862bf0d5f50SAlex Elder 		resid -= length;
1863bf0d5f50SAlex Elder 	}
1864bf0d5f50SAlex Elder 
1865bf0d5f50SAlex Elder 	return 0;
1866bf0d5f50SAlex Elder 
1867bf0d5f50SAlex Elder out_partial:
1868bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
1869bf0d5f50SAlex Elder out_unwind:
1870bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1871bf0d5f50SAlex Elder 		rbd_obj_request_put(obj_request);
1872bf0d5f50SAlex Elder 
1873bf0d5f50SAlex Elder 	return -ENOMEM;
1874bf0d5f50SAlex Elder }
1875bf0d5f50SAlex Elder 
1876c5b5ef6cSAlex Elder static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
1877c5b5ef6cSAlex Elder {
1878c5b5ef6cSAlex Elder 	struct rbd_obj_request *orig_request;
1879c5b5ef6cSAlex Elder 	int result;
1880c5b5ef6cSAlex Elder 
1881c5b5ef6cSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
1882c5b5ef6cSAlex Elder 
1883c5b5ef6cSAlex Elder 	/*
1884c5b5ef6cSAlex Elder 	 * All we need from the object request is the original
1885c5b5ef6cSAlex Elder 	 * request and the result of the STAT op.  Grab those, then
1886c5b5ef6cSAlex Elder 	 * we're done with the request.
1887c5b5ef6cSAlex Elder 	 */
1888c5b5ef6cSAlex Elder 	orig_request = obj_request->obj_request;
1889c5b5ef6cSAlex Elder 	obj_request->obj_request = NULL;
1890c5b5ef6cSAlex Elder 	rbd_assert(orig_request);
1891c5b5ef6cSAlex Elder 	rbd_assert(orig_request->img_request);
1892c5b5ef6cSAlex Elder 
1893c5b5ef6cSAlex Elder 	result = obj_request->result;
1894c5b5ef6cSAlex Elder 	obj_request->result = 0;
1895c5b5ef6cSAlex Elder 
1896c5b5ef6cSAlex Elder 	dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
1897c5b5ef6cSAlex Elder 		obj_request, orig_request, result,
1898c5b5ef6cSAlex Elder 		obj_request->xferred, obj_request->length);
1899c5b5ef6cSAlex Elder 	rbd_obj_request_put(obj_request);
1900c5b5ef6cSAlex Elder 
1901c5b5ef6cSAlex Elder 	rbd_assert(orig_request);
1902c5b5ef6cSAlex Elder 	rbd_assert(orig_request->img_request);
1903c5b5ef6cSAlex Elder 
1904c5b5ef6cSAlex Elder 	/*
1905c5b5ef6cSAlex Elder 	 * Our only purpose here is to determine whether the object
1906c5b5ef6cSAlex Elder 	 * exists, and we don't want to treat the non-existence as
1907c5b5ef6cSAlex Elder 	 * an error.  If something else comes back, transfer the
1908c5b5ef6cSAlex Elder 	 * error to the original request and complete it now.
1909c5b5ef6cSAlex Elder 	 */
1910c5b5ef6cSAlex Elder 	if (!result) {
1911c5b5ef6cSAlex Elder 		obj_request_existence_set(orig_request, true);
1912c5b5ef6cSAlex Elder 	} else if (result == -ENOENT) {
1913c5b5ef6cSAlex Elder 		obj_request_existence_set(orig_request, false);
1914c5b5ef6cSAlex Elder 	} else if (result) {
1915c5b5ef6cSAlex Elder 		orig_request->result = result;
1916c5b5ef6cSAlex Elder 		goto out_err;
1917c5b5ef6cSAlex Elder 	}
1918c5b5ef6cSAlex Elder 
1919c5b5ef6cSAlex Elder 	/*
1920c5b5ef6cSAlex Elder 	 * Resubmit the original request now that we have recorded
1921c5b5ef6cSAlex Elder 	 * whether the target object exists.
1922c5b5ef6cSAlex Elder 	 */
1923b454e36dSAlex Elder 	orig_request->result = rbd_img_obj_request_submit(orig_request);
1924c5b5ef6cSAlex Elder out_err:
1925c5b5ef6cSAlex Elder 	if (orig_request->result)
1926c5b5ef6cSAlex Elder 		rbd_obj_request_complete(orig_request);
1927c5b5ef6cSAlex Elder 	rbd_obj_request_put(orig_request);
1928c5b5ef6cSAlex Elder }
1929c5b5ef6cSAlex Elder 
1930c5b5ef6cSAlex Elder static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
1931c5b5ef6cSAlex Elder {
1932c5b5ef6cSAlex Elder 	struct rbd_obj_request *stat_request;
1933c5b5ef6cSAlex Elder 	struct rbd_device *rbd_dev;
1934c5b5ef6cSAlex Elder 	struct ceph_osd_client *osdc;
1935c5b5ef6cSAlex Elder 	struct page **pages = NULL;
1936c5b5ef6cSAlex Elder 	u32 page_count;
1937c5b5ef6cSAlex Elder 	size_t size;
1938c5b5ef6cSAlex Elder 	int ret;
1939c5b5ef6cSAlex Elder 
1940c5b5ef6cSAlex Elder 	/*
1941c5b5ef6cSAlex Elder 	 * The response data for a STAT call consists of:
1942c5b5ef6cSAlex Elder 	 *     le64 length;
1943c5b5ef6cSAlex Elder 	 *     struct {
1944c5b5ef6cSAlex Elder 	 *         le32 tv_sec;
1945c5b5ef6cSAlex Elder 	 *         le32 tv_nsec;
1946c5b5ef6cSAlex Elder 	 *     } mtime;
1947c5b5ef6cSAlex Elder 	 */
1948c5b5ef6cSAlex Elder 	size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
1949c5b5ef6cSAlex Elder 	page_count = (u32)calc_pages_for(0, size);
1950c5b5ef6cSAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
1951c5b5ef6cSAlex Elder 	if (IS_ERR(pages))
1952c5b5ef6cSAlex Elder 		return PTR_ERR(pages);
1953c5b5ef6cSAlex Elder 
1954c5b5ef6cSAlex Elder 	ret = -ENOMEM;
1955c5b5ef6cSAlex Elder 	stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
1956c5b5ef6cSAlex Elder 							OBJ_REQUEST_PAGES);
1957c5b5ef6cSAlex Elder 	if (!stat_request)
1958c5b5ef6cSAlex Elder 		goto out;
1959c5b5ef6cSAlex Elder 
1960c5b5ef6cSAlex Elder 	rbd_obj_request_get(obj_request);
1961c5b5ef6cSAlex Elder 	stat_request->obj_request = obj_request;
1962c5b5ef6cSAlex Elder 	stat_request->pages = pages;
1963c5b5ef6cSAlex Elder 	stat_request->page_count = page_count;
1964c5b5ef6cSAlex Elder 
1965c5b5ef6cSAlex Elder 	rbd_assert(obj_request->img_request);
1966c5b5ef6cSAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
1967c5b5ef6cSAlex Elder 	stat_request->osd_req = rbd_osd_req_create(rbd_dev, false,
1968c5b5ef6cSAlex Elder 						stat_request);
1969c5b5ef6cSAlex Elder 	if (!stat_request->osd_req)
1970c5b5ef6cSAlex Elder 		goto out;
1971c5b5ef6cSAlex Elder 	stat_request->callback = rbd_img_obj_exists_callback;
1972c5b5ef6cSAlex Elder 
1973c5b5ef6cSAlex Elder 	osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT);
1974c5b5ef6cSAlex Elder 	osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
1975c5b5ef6cSAlex Elder 					false, false);
19769d4df01fSAlex Elder 	rbd_osd_req_format_read(stat_request);
1977c5b5ef6cSAlex Elder 
1978c5b5ef6cSAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
1979c5b5ef6cSAlex Elder 	ret = rbd_obj_request_submit(osdc, stat_request);
1980c5b5ef6cSAlex Elder out:
1981c5b5ef6cSAlex Elder 	if (ret)
1982c5b5ef6cSAlex Elder 		rbd_obj_request_put(obj_request);
1983c5b5ef6cSAlex Elder 
1984c5b5ef6cSAlex Elder 	return ret;
1985c5b5ef6cSAlex Elder }
1986c5b5ef6cSAlex Elder 
1987b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
1988b454e36dSAlex Elder {
1989b454e36dSAlex Elder 	struct rbd_img_request *img_request;
1990b454e36dSAlex Elder 
1991b454e36dSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
1992b454e36dSAlex Elder 
1993b454e36dSAlex Elder 	img_request = obj_request->img_request;
1994b454e36dSAlex Elder 	rbd_assert(img_request);
1995b454e36dSAlex Elder 
1996b454e36dSAlex Elder 	/* (At the moment we don't care whether it exists or not...) */
1997b454e36dSAlex Elder 	(void) obj_request_exists_test;
1998b454e36dSAlex Elder 
1999b454e36dSAlex Elder 	/*
2000b454e36dSAlex Elder 	 * Only layered writes need special handling.  If it's not a
2001b454e36dSAlex Elder 	 * layered write, or it is a layered write but we know the
2002b454e36dSAlex Elder 	 * target object exists, it's no different from any other
2003b454e36dSAlex Elder 	 * object request.
2004b454e36dSAlex Elder 	 */
2005b454e36dSAlex Elder 	if (!img_request_write_test(img_request) ||
2006b454e36dSAlex Elder 		!img_request_layered_test(img_request) ||
2007b454e36dSAlex Elder 		obj_request_known_test(obj_request)) {
2008b454e36dSAlex Elder 
2009b454e36dSAlex Elder 		struct rbd_device *rbd_dev;
2010b454e36dSAlex Elder 		struct ceph_osd_client *osdc;
2011b454e36dSAlex Elder 
2012b454e36dSAlex Elder 		rbd_dev = obj_request->img_request->rbd_dev;
2013b454e36dSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
2014b454e36dSAlex Elder 
2015b454e36dSAlex Elder 		return rbd_obj_request_submit(osdc, obj_request);
2016b454e36dSAlex Elder 	}
2017b454e36dSAlex Elder 
2018b454e36dSAlex Elder 	/*
2019b454e36dSAlex Elder 	 * It's a layered write and we don't know whether the target
2020b454e36dSAlex Elder 	 * exists.  Issue existence check; once that completes the
2021b454e36dSAlex Elder 	 * original request will be submitted again.
2022b454e36dSAlex Elder 	 */
2023b454e36dSAlex Elder 
2024b454e36dSAlex Elder 	return rbd_img_obj_exists_submit(obj_request);
2025b454e36dSAlex Elder }
2026b454e36dSAlex Elder 
2027bf0d5f50SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request)
2028bf0d5f50SAlex Elder {
2029bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
203046faeed4SAlex Elder 	struct rbd_obj_request *next_obj_request;
2031bf0d5f50SAlex Elder 
203237206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
203346faeed4SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
2034bf0d5f50SAlex Elder 		int ret;
2035bf0d5f50SAlex Elder 
2036b454e36dSAlex Elder 		ret = rbd_img_obj_request_submit(obj_request);
2037bf0d5f50SAlex Elder 		if (ret)
2038bf0d5f50SAlex Elder 			return ret;
2039bf0d5f50SAlex Elder 	}
2040bf0d5f50SAlex Elder 
2041bf0d5f50SAlex Elder 	return 0;
2042bf0d5f50SAlex Elder }
2043bf0d5f50SAlex Elder 
20448b3e1a56SAlex Elder static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
20458b3e1a56SAlex Elder {
20468b3e1a56SAlex Elder 	struct rbd_obj_request *obj_request;
20478b3e1a56SAlex Elder 
20488b3e1a56SAlex Elder 	rbd_assert(img_request_child_test(img_request));
20498b3e1a56SAlex Elder 
20508b3e1a56SAlex Elder 	obj_request = img_request->obj_request;
20518b3e1a56SAlex Elder 	rbd_assert(obj_request != NULL);
20528b3e1a56SAlex Elder 	obj_request->result = img_request->result;
20538b3e1a56SAlex Elder 	obj_request->xferred = img_request->xferred;
20548b3e1a56SAlex Elder 
20558b3e1a56SAlex Elder 	rbd_img_obj_request_read_callback(obj_request);
20568b3e1a56SAlex Elder 	rbd_obj_request_complete(obj_request);
20578b3e1a56SAlex Elder }
20588b3e1a56SAlex Elder 
20598b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
20608b3e1a56SAlex Elder {
20618b3e1a56SAlex Elder 	struct rbd_device *rbd_dev;
20628b3e1a56SAlex Elder 	struct rbd_img_request *img_request;
20638b3e1a56SAlex Elder 	int result;
20648b3e1a56SAlex Elder 
20658b3e1a56SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
20668b3e1a56SAlex Elder 	rbd_assert(obj_request->img_request != NULL);
20678b3e1a56SAlex Elder 	rbd_assert(obj_request->result == (s32) -ENOENT);
20688b3e1a56SAlex Elder 	rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
20698b3e1a56SAlex Elder 
20708b3e1a56SAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
20718b3e1a56SAlex Elder 	rbd_assert(rbd_dev->parent != NULL);
20728b3e1a56SAlex Elder 	/* rbd_read_finish(obj_request, obj_request->length); */
20738b3e1a56SAlex Elder 	img_request = rbd_img_request_create(rbd_dev->parent,
20748b3e1a56SAlex Elder 						obj_request->img_offset,
20758b3e1a56SAlex Elder 						obj_request->length,
20768b3e1a56SAlex Elder 						false, true);
20778b3e1a56SAlex Elder 	result = -ENOMEM;
20788b3e1a56SAlex Elder 	if (!img_request)
20798b3e1a56SAlex Elder 		goto out_err;
20808b3e1a56SAlex Elder 
20818b3e1a56SAlex Elder 	rbd_obj_request_get(obj_request);
20828b3e1a56SAlex Elder 	img_request->obj_request = obj_request;
20838b3e1a56SAlex Elder 
20848b3e1a56SAlex Elder 	result = rbd_img_request_fill_bio(img_request, obj_request->bio_list);
20858b3e1a56SAlex Elder 	if (result)
20868b3e1a56SAlex Elder 		goto out_err;
20878b3e1a56SAlex Elder 
20888b3e1a56SAlex Elder 	img_request->callback = rbd_img_parent_read_callback;
20898b3e1a56SAlex Elder 	result = rbd_img_request_submit(img_request);
20908b3e1a56SAlex Elder 	if (result)
20918b3e1a56SAlex Elder 		goto out_err;
20928b3e1a56SAlex Elder 
20938b3e1a56SAlex Elder 	return;
20948b3e1a56SAlex Elder out_err:
20958b3e1a56SAlex Elder 	if (img_request)
20968b3e1a56SAlex Elder 		rbd_img_request_put(img_request);
20978b3e1a56SAlex Elder 	obj_request->result = result;
20988b3e1a56SAlex Elder 	obj_request->xferred = 0;
20998b3e1a56SAlex Elder 	obj_request_done_set(obj_request);
21008b3e1a56SAlex Elder }
21018b3e1a56SAlex Elder 
2102cf81b60eSAlex Elder static int rbd_obj_notify_ack(struct rbd_device *rbd_dev,
2103b8d70035SAlex Elder 				   u64 ver, u64 notify_id)
2104b8d70035SAlex Elder {
2105b8d70035SAlex Elder 	struct rbd_obj_request *obj_request;
21062169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2107b8d70035SAlex Elder 	int ret;
2108b8d70035SAlex Elder 
2109b8d70035SAlex Elder 	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
2110b8d70035SAlex Elder 							OBJ_REQUEST_NODATA);
2111b8d70035SAlex Elder 	if (!obj_request)
2112b8d70035SAlex Elder 		return -ENOMEM;
2113b8d70035SAlex Elder 
2114b8d70035SAlex Elder 	ret = -ENOMEM;
2115430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
2116b8d70035SAlex Elder 	if (!obj_request->osd_req)
2117b8d70035SAlex Elder 		goto out;
21182169238dSAlex Elder 	obj_request->callback = rbd_obj_request_put;
2119b8d70035SAlex Elder 
2120c99d2d4aSAlex Elder 	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
2121c99d2d4aSAlex Elder 					notify_id, ver, 0);
21229d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
2123430c28c3SAlex Elder 
2124b8d70035SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
2125b8d70035SAlex Elder out:
2126cf81b60eSAlex Elder 	if (ret)
2127b8d70035SAlex Elder 		rbd_obj_request_put(obj_request);
2128b8d70035SAlex Elder 
2129b8d70035SAlex Elder 	return ret;
2130b8d70035SAlex Elder }
2131b8d70035SAlex Elder 
2132b8d70035SAlex Elder static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
2133b8d70035SAlex Elder {
2134b8d70035SAlex Elder 	struct rbd_device *rbd_dev = (struct rbd_device *)data;
2135b8d70035SAlex Elder 	u64 hver;
2136b8d70035SAlex Elder 	int rc;
2137b8d70035SAlex Elder 
2138b8d70035SAlex Elder 	if (!rbd_dev)
2139b8d70035SAlex Elder 		return;
2140b8d70035SAlex Elder 
214137206ee5SAlex Elder 	dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
2142b8d70035SAlex Elder 		rbd_dev->header_name, (unsigned long long) notify_id,
2143b8d70035SAlex Elder 		(unsigned int) opcode);
2144b8d70035SAlex Elder 	rc = rbd_dev_refresh(rbd_dev, &hver);
2145b8d70035SAlex Elder 	if (rc)
2146b8d70035SAlex Elder 		rbd_warn(rbd_dev, "got notification but failed to "
2147b8d70035SAlex Elder 			   " update snaps: %d\n", rc);
2148b8d70035SAlex Elder 
2149cf81b60eSAlex Elder 	rbd_obj_notify_ack(rbd_dev, hver, notify_id);
2150b8d70035SAlex Elder }
2151b8d70035SAlex Elder 
21529969ebc5SAlex Elder /*
21539969ebc5SAlex Elder  * Request sync osd watch/unwatch.  The value of "start" determines
21549969ebc5SAlex Elder  * whether a watch request is being initiated or torn down.
21559969ebc5SAlex Elder  */
21569969ebc5SAlex Elder static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
21579969ebc5SAlex Elder {
21589969ebc5SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
21599969ebc5SAlex Elder 	struct rbd_obj_request *obj_request;
21609969ebc5SAlex Elder 	int ret;
21619969ebc5SAlex Elder 
21629969ebc5SAlex Elder 	rbd_assert(start ^ !!rbd_dev->watch_event);
21639969ebc5SAlex Elder 	rbd_assert(start ^ !!rbd_dev->watch_request);
21649969ebc5SAlex Elder 
21659969ebc5SAlex Elder 	if (start) {
21663c663bbdSAlex Elder 		ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
21679969ebc5SAlex Elder 						&rbd_dev->watch_event);
21689969ebc5SAlex Elder 		if (ret < 0)
21699969ebc5SAlex Elder 			return ret;
21708eb87565SAlex Elder 		rbd_assert(rbd_dev->watch_event != NULL);
21719969ebc5SAlex Elder 	}
21729969ebc5SAlex Elder 
21739969ebc5SAlex Elder 	ret = -ENOMEM;
21749969ebc5SAlex Elder 	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
21759969ebc5SAlex Elder 							OBJ_REQUEST_NODATA);
21769969ebc5SAlex Elder 	if (!obj_request)
21779969ebc5SAlex Elder 		goto out_cancel;
21789969ebc5SAlex Elder 
2179430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request);
2180430c28c3SAlex Elder 	if (!obj_request->osd_req)
2181430c28c3SAlex Elder 		goto out_cancel;
2182430c28c3SAlex Elder 
21838eb87565SAlex Elder 	if (start)
2184975241afSAlex Elder 		ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
21858eb87565SAlex Elder 	else
21866977c3f9SAlex Elder 		ceph_osdc_unregister_linger_request(osdc,
2187975241afSAlex Elder 					rbd_dev->watch_request->osd_req);
21882169238dSAlex Elder 
21892169238dSAlex Elder 	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
21902169238dSAlex Elder 				rbd_dev->watch_event->cookie,
21912169238dSAlex Elder 				rbd_dev->header.obj_version, start);
21929d4df01fSAlex Elder 	rbd_osd_req_format_write(obj_request);
21932169238dSAlex Elder 
21949969ebc5SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
21959969ebc5SAlex Elder 	if (ret)
21969969ebc5SAlex Elder 		goto out_cancel;
21979969ebc5SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
21989969ebc5SAlex Elder 	if (ret)
21999969ebc5SAlex Elder 		goto out_cancel;
22009969ebc5SAlex Elder 	ret = obj_request->result;
22019969ebc5SAlex Elder 	if (ret)
22029969ebc5SAlex Elder 		goto out_cancel;
22039969ebc5SAlex Elder 
22048eb87565SAlex Elder 	/*
22058eb87565SAlex Elder 	 * A watch request is set to linger, so the underlying osd
22068eb87565SAlex Elder 	 * request won't go away until we unregister it.  We retain
22078eb87565SAlex Elder 	 * a pointer to the object request during that time (in
22088eb87565SAlex Elder 	 * rbd_dev->watch_request), so we'll keep a reference to
22098eb87565SAlex Elder 	 * it.  We'll drop that reference (below) after we've
22108eb87565SAlex Elder 	 * unregistered it.
22118eb87565SAlex Elder 	 */
22128eb87565SAlex Elder 	if (start) {
22138eb87565SAlex Elder 		rbd_dev->watch_request = obj_request;
22148eb87565SAlex Elder 
22158eb87565SAlex Elder 		return 0;
22168eb87565SAlex Elder 	}
22178eb87565SAlex Elder 
22188eb87565SAlex Elder 	/* We have successfully torn down the watch request */
22198eb87565SAlex Elder 
22208eb87565SAlex Elder 	rbd_obj_request_put(rbd_dev->watch_request);
22218eb87565SAlex Elder 	rbd_dev->watch_request = NULL;
22229969ebc5SAlex Elder out_cancel:
22239969ebc5SAlex Elder 	/* Cancel the event if we're tearing down, or on error */
22249969ebc5SAlex Elder 	ceph_osdc_cancel_event(rbd_dev->watch_event);
22259969ebc5SAlex Elder 	rbd_dev->watch_event = NULL;
22269969ebc5SAlex Elder 	if (obj_request)
22279969ebc5SAlex Elder 		rbd_obj_request_put(obj_request);
22289969ebc5SAlex Elder 
22299969ebc5SAlex Elder 	return ret;
22309969ebc5SAlex Elder }
22319969ebc5SAlex Elder 
223236be9a76SAlex Elder /*
223336be9a76SAlex Elder  * Synchronous osd object method call
223436be9a76SAlex Elder  */
223536be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
223636be9a76SAlex Elder 			     const char *object_name,
223736be9a76SAlex Elder 			     const char *class_name,
223836be9a76SAlex Elder 			     const char *method_name,
223936be9a76SAlex Elder 			     const char *outbound,
224036be9a76SAlex Elder 			     size_t outbound_size,
224136be9a76SAlex Elder 			     char *inbound,
224236be9a76SAlex Elder 			     size_t inbound_size,
224336be9a76SAlex Elder 			     u64 *version)
224436be9a76SAlex Elder {
22452169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
224636be9a76SAlex Elder 	struct rbd_obj_request *obj_request;
224736be9a76SAlex Elder 	struct page **pages;
224836be9a76SAlex Elder 	u32 page_count;
224936be9a76SAlex Elder 	int ret;
225036be9a76SAlex Elder 
225136be9a76SAlex Elder 	/*
22526010a451SAlex Elder 	 * Method calls are ultimately read operations.  The result
22536010a451SAlex Elder 	 * should placed into the inbound buffer provided.  They
22546010a451SAlex Elder 	 * also supply outbound data--parameters for the object
22556010a451SAlex Elder 	 * method.  Currently if this is present it will be a
22566010a451SAlex Elder 	 * snapshot id.
225736be9a76SAlex Elder 	 */
225836be9a76SAlex Elder 	page_count = (u32) calc_pages_for(0, inbound_size);
225936be9a76SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
226036be9a76SAlex Elder 	if (IS_ERR(pages))
226136be9a76SAlex Elder 		return PTR_ERR(pages);
226236be9a76SAlex Elder 
226336be9a76SAlex Elder 	ret = -ENOMEM;
22646010a451SAlex Elder 	obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
226536be9a76SAlex Elder 							OBJ_REQUEST_PAGES);
226636be9a76SAlex Elder 	if (!obj_request)
226736be9a76SAlex Elder 		goto out;
226836be9a76SAlex Elder 
226936be9a76SAlex Elder 	obj_request->pages = pages;
227036be9a76SAlex Elder 	obj_request->page_count = page_count;
227136be9a76SAlex Elder 
2272430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
227336be9a76SAlex Elder 	if (!obj_request->osd_req)
227436be9a76SAlex Elder 		goto out;
227536be9a76SAlex Elder 
2276c99d2d4aSAlex Elder 	osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
227704017e29SAlex Elder 					class_name, method_name);
227804017e29SAlex Elder 	if (outbound_size) {
227904017e29SAlex Elder 		struct ceph_pagelist *pagelist;
228004017e29SAlex Elder 
228104017e29SAlex Elder 		pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
228204017e29SAlex Elder 		if (!pagelist)
228304017e29SAlex Elder 			goto out;
228404017e29SAlex Elder 
228504017e29SAlex Elder 		ceph_pagelist_init(pagelist);
228604017e29SAlex Elder 		ceph_pagelist_append(pagelist, outbound, outbound_size);
228704017e29SAlex Elder 		osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
228804017e29SAlex Elder 						pagelist);
228904017e29SAlex Elder 	}
2290a4ce40a9SAlex Elder 	osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
2291a4ce40a9SAlex Elder 					obj_request->pages, inbound_size,
229244cd188dSAlex Elder 					0, false, false);
22939d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
2294430c28c3SAlex Elder 
229536be9a76SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
229636be9a76SAlex Elder 	if (ret)
229736be9a76SAlex Elder 		goto out;
229836be9a76SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
229936be9a76SAlex Elder 	if (ret)
230036be9a76SAlex Elder 		goto out;
230136be9a76SAlex Elder 
230236be9a76SAlex Elder 	ret = obj_request->result;
230336be9a76SAlex Elder 	if (ret < 0)
230436be9a76SAlex Elder 		goto out;
230523ed6e13SAlex Elder 	ret = 0;
2306903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
230736be9a76SAlex Elder 	if (version)
230836be9a76SAlex Elder 		*version = obj_request->version;
230936be9a76SAlex Elder out:
231036be9a76SAlex Elder 	if (obj_request)
231136be9a76SAlex Elder 		rbd_obj_request_put(obj_request);
231236be9a76SAlex Elder 	else
231336be9a76SAlex Elder 		ceph_release_page_vector(pages, page_count);
231436be9a76SAlex Elder 
231536be9a76SAlex Elder 	return ret;
231636be9a76SAlex Elder }
231736be9a76SAlex Elder 
2318bf0d5f50SAlex Elder static void rbd_request_fn(struct request_queue *q)
2319cc344fa1SAlex Elder 		__releases(q->queue_lock) __acquires(q->queue_lock)
2320bf0d5f50SAlex Elder {
2321bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = q->queuedata;
2322bf0d5f50SAlex Elder 	bool read_only = rbd_dev->mapping.read_only;
2323bf0d5f50SAlex Elder 	struct request *rq;
2324bf0d5f50SAlex Elder 	int result;
2325bf0d5f50SAlex Elder 
2326bf0d5f50SAlex Elder 	while ((rq = blk_fetch_request(q))) {
2327bf0d5f50SAlex Elder 		bool write_request = rq_data_dir(rq) == WRITE;
2328bf0d5f50SAlex Elder 		struct rbd_img_request *img_request;
2329bf0d5f50SAlex Elder 		u64 offset;
2330bf0d5f50SAlex Elder 		u64 length;
2331bf0d5f50SAlex Elder 
2332bf0d5f50SAlex Elder 		/* Ignore any non-FS requests that filter through. */
2333bf0d5f50SAlex Elder 
2334bf0d5f50SAlex Elder 		if (rq->cmd_type != REQ_TYPE_FS) {
23354dda41d3SAlex Elder 			dout("%s: non-fs request type %d\n", __func__,
23364dda41d3SAlex Elder 				(int) rq->cmd_type);
23374dda41d3SAlex Elder 			__blk_end_request_all(rq, 0);
23384dda41d3SAlex Elder 			continue;
23394dda41d3SAlex Elder 		}
23404dda41d3SAlex Elder 
23414dda41d3SAlex Elder 		/* Ignore/skip any zero-length requests */
23424dda41d3SAlex Elder 
23434dda41d3SAlex Elder 		offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
23444dda41d3SAlex Elder 		length = (u64) blk_rq_bytes(rq);
23454dda41d3SAlex Elder 
23464dda41d3SAlex Elder 		if (!length) {
23474dda41d3SAlex Elder 			dout("%s: zero-length request\n", __func__);
2348bf0d5f50SAlex Elder 			__blk_end_request_all(rq, 0);
2349bf0d5f50SAlex Elder 			continue;
2350bf0d5f50SAlex Elder 		}
2351bf0d5f50SAlex Elder 
2352bf0d5f50SAlex Elder 		spin_unlock_irq(q->queue_lock);
2353bf0d5f50SAlex Elder 
2354bf0d5f50SAlex Elder 		/* Disallow writes to a read-only device */
2355bf0d5f50SAlex Elder 
2356bf0d5f50SAlex Elder 		if (write_request) {
2357bf0d5f50SAlex Elder 			result = -EROFS;
2358bf0d5f50SAlex Elder 			if (read_only)
2359bf0d5f50SAlex Elder 				goto end_request;
2360bf0d5f50SAlex Elder 			rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
2361bf0d5f50SAlex Elder 		}
2362bf0d5f50SAlex Elder 
23636d292906SAlex Elder 		/*
23646d292906SAlex Elder 		 * Quit early if the mapped snapshot no longer
23656d292906SAlex Elder 		 * exists.  It's still possible the snapshot will
23666d292906SAlex Elder 		 * have disappeared by the time our request arrives
23676d292906SAlex Elder 		 * at the osd, but there's no sense in sending it if
23686d292906SAlex Elder 		 * we already know.
23696d292906SAlex Elder 		 */
23706d292906SAlex Elder 		if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
2371bf0d5f50SAlex Elder 			dout("request for non-existent snapshot");
2372bf0d5f50SAlex Elder 			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
2373bf0d5f50SAlex Elder 			result = -ENXIO;
2374bf0d5f50SAlex Elder 			goto end_request;
2375bf0d5f50SAlex Elder 		}
2376bf0d5f50SAlex Elder 
2377bf0d5f50SAlex Elder 		result = -EINVAL;
2378bf0d5f50SAlex Elder 		if (WARN_ON(offset && length > U64_MAX - offset + 1))
2379bf0d5f50SAlex Elder 			goto end_request;	/* Shouldn't happen */
2380bf0d5f50SAlex Elder 
2381bf0d5f50SAlex Elder 		result = -ENOMEM;
2382bf0d5f50SAlex Elder 		img_request = rbd_img_request_create(rbd_dev, offset, length,
23839849e986SAlex Elder 							write_request, false);
2384bf0d5f50SAlex Elder 		if (!img_request)
2385bf0d5f50SAlex Elder 			goto end_request;
2386bf0d5f50SAlex Elder 
2387bf0d5f50SAlex Elder 		img_request->rq = rq;
2388bf0d5f50SAlex Elder 
2389bf0d5f50SAlex Elder 		result = rbd_img_request_fill_bio(img_request, rq->bio);
2390bf0d5f50SAlex Elder 		if (!result)
2391bf0d5f50SAlex Elder 			result = rbd_img_request_submit(img_request);
2392bf0d5f50SAlex Elder 		if (result)
2393bf0d5f50SAlex Elder 			rbd_img_request_put(img_request);
2394bf0d5f50SAlex Elder end_request:
2395bf0d5f50SAlex Elder 		spin_lock_irq(q->queue_lock);
2396bf0d5f50SAlex Elder 		if (result < 0) {
23977da22d29SAlex Elder 			rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
23987da22d29SAlex Elder 				write_request ? "write" : "read",
23997da22d29SAlex Elder 				length, offset, result);
24007da22d29SAlex Elder 
2401bf0d5f50SAlex Elder 			__blk_end_request_all(rq, result);
2402bf0d5f50SAlex Elder 		}
2403bf0d5f50SAlex Elder 	}
2404bf0d5f50SAlex Elder }
2405bf0d5f50SAlex Elder 
2406602adf40SYehuda Sadeh /*
2407602adf40SYehuda Sadeh  * a queue callback. Makes sure that we don't create a bio that spans across
2408602adf40SYehuda Sadeh  * multiple osd objects. One exception would be with a single page bios,
2409f7760dadSAlex Elder  * which we handle later at bio_chain_clone_range()
2410602adf40SYehuda Sadeh  */
2411602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
2412602adf40SYehuda Sadeh 			  struct bio_vec *bvec)
2413602adf40SYehuda Sadeh {
2414602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
2415e5cfeed2SAlex Elder 	sector_t sector_offset;
2416e5cfeed2SAlex Elder 	sector_t sectors_per_obj;
2417e5cfeed2SAlex Elder 	sector_t obj_sector_offset;
2418e5cfeed2SAlex Elder 	int ret;
2419602adf40SYehuda Sadeh 
2420e5cfeed2SAlex Elder 	/*
2421e5cfeed2SAlex Elder 	 * Find how far into its rbd object the partition-relative
2422e5cfeed2SAlex Elder 	 * bio start sector is to offset relative to the enclosing
2423e5cfeed2SAlex Elder 	 * device.
2424e5cfeed2SAlex Elder 	 */
2425e5cfeed2SAlex Elder 	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
2426e5cfeed2SAlex Elder 	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
2427e5cfeed2SAlex Elder 	obj_sector_offset = sector_offset & (sectors_per_obj - 1);
2428593a9e7bSAlex Elder 
2429e5cfeed2SAlex Elder 	/*
2430e5cfeed2SAlex Elder 	 * Compute the number of bytes from that offset to the end
2431e5cfeed2SAlex Elder 	 * of the object.  Account for what's already used by the bio.
2432e5cfeed2SAlex Elder 	 */
2433e5cfeed2SAlex Elder 	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
2434e5cfeed2SAlex Elder 	if (ret > bmd->bi_size)
2435e5cfeed2SAlex Elder 		ret -= bmd->bi_size;
2436e5cfeed2SAlex Elder 	else
2437e5cfeed2SAlex Elder 		ret = 0;
2438e5cfeed2SAlex Elder 
2439e5cfeed2SAlex Elder 	/*
2440e5cfeed2SAlex Elder 	 * Don't send back more than was asked for.  And if the bio
2441e5cfeed2SAlex Elder 	 * was empty, let the whole thing through because:  "Note
2442e5cfeed2SAlex Elder 	 * that a block device *must* allow a single page to be
2443e5cfeed2SAlex Elder 	 * added to an empty bio."
2444e5cfeed2SAlex Elder 	 */
2445e5cfeed2SAlex Elder 	rbd_assert(bvec->bv_len <= PAGE_SIZE);
2446e5cfeed2SAlex Elder 	if (ret > (int) bvec->bv_len || !bmd->bi_size)
2447e5cfeed2SAlex Elder 		ret = (int) bvec->bv_len;
2448e5cfeed2SAlex Elder 
2449e5cfeed2SAlex Elder 	return ret;
2450602adf40SYehuda Sadeh }
2451602adf40SYehuda Sadeh 
2452602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
2453602adf40SYehuda Sadeh {
2454602adf40SYehuda Sadeh 	struct gendisk *disk = rbd_dev->disk;
2455602adf40SYehuda Sadeh 
2456602adf40SYehuda Sadeh 	if (!disk)
2457602adf40SYehuda Sadeh 		return;
2458602adf40SYehuda Sadeh 
2459602adf40SYehuda Sadeh 	if (disk->flags & GENHD_FL_UP)
2460602adf40SYehuda Sadeh 		del_gendisk(disk);
2461602adf40SYehuda Sadeh 	if (disk->queue)
2462602adf40SYehuda Sadeh 		blk_cleanup_queue(disk->queue);
2463602adf40SYehuda Sadeh 	put_disk(disk);
2464602adf40SYehuda Sadeh }
2465602adf40SYehuda Sadeh 
2466788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
2467788e2df3SAlex Elder 				const char *object_name,
2468788e2df3SAlex Elder 				u64 offset, u64 length,
2469788e2df3SAlex Elder 				char *buf, u64 *version)
2470788e2df3SAlex Elder 
2471788e2df3SAlex Elder {
24722169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2473788e2df3SAlex Elder 	struct rbd_obj_request *obj_request;
2474788e2df3SAlex Elder 	struct page **pages = NULL;
2475788e2df3SAlex Elder 	u32 page_count;
24761ceae7efSAlex Elder 	size_t size;
2477788e2df3SAlex Elder 	int ret;
2478788e2df3SAlex Elder 
2479788e2df3SAlex Elder 	page_count = (u32) calc_pages_for(offset, length);
2480788e2df3SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2481788e2df3SAlex Elder 	if (IS_ERR(pages))
2482788e2df3SAlex Elder 		ret = PTR_ERR(pages);
2483788e2df3SAlex Elder 
2484788e2df3SAlex Elder 	ret = -ENOMEM;
2485788e2df3SAlex Elder 	obj_request = rbd_obj_request_create(object_name, offset, length,
2486788e2df3SAlex Elder 							OBJ_REQUEST_PAGES);
2487788e2df3SAlex Elder 	if (!obj_request)
2488788e2df3SAlex Elder 		goto out;
2489788e2df3SAlex Elder 
2490788e2df3SAlex Elder 	obj_request->pages = pages;
2491788e2df3SAlex Elder 	obj_request->page_count = page_count;
2492788e2df3SAlex Elder 
2493430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
2494788e2df3SAlex Elder 	if (!obj_request->osd_req)
2495788e2df3SAlex Elder 		goto out;
2496788e2df3SAlex Elder 
2497c99d2d4aSAlex Elder 	osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
2498c99d2d4aSAlex Elder 					offset, length, 0, 0);
2499406e2c9fSAlex Elder 	osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
2500a4ce40a9SAlex Elder 					obj_request->pages,
250144cd188dSAlex Elder 					obj_request->length,
250244cd188dSAlex Elder 					obj_request->offset & ~PAGE_MASK,
250344cd188dSAlex Elder 					false, false);
25049d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
2505430c28c3SAlex Elder 
2506788e2df3SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
2507788e2df3SAlex Elder 	if (ret)
2508788e2df3SAlex Elder 		goto out;
2509788e2df3SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
2510788e2df3SAlex Elder 	if (ret)
2511788e2df3SAlex Elder 		goto out;
2512788e2df3SAlex Elder 
2513788e2df3SAlex Elder 	ret = obj_request->result;
2514788e2df3SAlex Elder 	if (ret < 0)
2515788e2df3SAlex Elder 		goto out;
25161ceae7efSAlex Elder 
25171ceae7efSAlex Elder 	rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
25181ceae7efSAlex Elder 	size = (size_t) obj_request->xferred;
2519903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, buf, 0, size);
252023ed6e13SAlex Elder 	rbd_assert(size <= (size_t) INT_MAX);
252123ed6e13SAlex Elder 	ret = (int) size;
2522788e2df3SAlex Elder 	if (version)
2523788e2df3SAlex Elder 		*version = obj_request->version;
2524788e2df3SAlex Elder out:
2525788e2df3SAlex Elder 	if (obj_request)
2526788e2df3SAlex Elder 		rbd_obj_request_put(obj_request);
2527788e2df3SAlex Elder 	else
2528788e2df3SAlex Elder 		ceph_release_page_vector(pages, page_count);
2529788e2df3SAlex Elder 
2530788e2df3SAlex Elder 	return ret;
2531788e2df3SAlex Elder }
2532788e2df3SAlex Elder 
2533602adf40SYehuda Sadeh /*
25344156d998SAlex Elder  * Read the complete header for the given rbd device.
25354156d998SAlex Elder  *
25364156d998SAlex Elder  * Returns a pointer to a dynamically-allocated buffer containing
25374156d998SAlex Elder  * the complete and validated header.  Caller can pass the address
25384156d998SAlex Elder  * of a variable that will be filled in with the version of the
25394156d998SAlex Elder  * header object at the time it was read.
25404156d998SAlex Elder  *
25414156d998SAlex Elder  * Returns a pointer-coded errno if a failure occurs.
25424156d998SAlex Elder  */
25434156d998SAlex Elder static struct rbd_image_header_ondisk *
25444156d998SAlex Elder rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
25454156d998SAlex Elder {
25464156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk = NULL;
25474156d998SAlex Elder 	u32 snap_count = 0;
25484156d998SAlex Elder 	u64 names_size = 0;
25494156d998SAlex Elder 	u32 want_count;
25504156d998SAlex Elder 	int ret;
25514156d998SAlex Elder 
25524156d998SAlex Elder 	/*
25534156d998SAlex Elder 	 * The complete header will include an array of its 64-bit
25544156d998SAlex Elder 	 * snapshot ids, followed by the names of those snapshots as
25554156d998SAlex Elder 	 * a contiguous block of NUL-terminated strings.  Note that
25564156d998SAlex Elder 	 * the number of snapshots could change by the time we read
25574156d998SAlex Elder 	 * it in, in which case we re-read it.
25584156d998SAlex Elder 	 */
25594156d998SAlex Elder 	do {
25604156d998SAlex Elder 		size_t size;
25614156d998SAlex Elder 
25624156d998SAlex Elder 		kfree(ondisk);
25634156d998SAlex Elder 
25644156d998SAlex Elder 		size = sizeof (*ondisk);
25654156d998SAlex Elder 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
25664156d998SAlex Elder 		size += names_size;
25674156d998SAlex Elder 		ondisk = kmalloc(size, GFP_KERNEL);
25684156d998SAlex Elder 		if (!ondisk)
25694156d998SAlex Elder 			return ERR_PTR(-ENOMEM);
25704156d998SAlex Elder 
2571788e2df3SAlex Elder 		ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
25724156d998SAlex Elder 				       0, size,
25734156d998SAlex Elder 				       (char *) ondisk, version);
25744156d998SAlex Elder 		if (ret < 0)
25754156d998SAlex Elder 			goto out_err;
25764156d998SAlex Elder 		if (WARN_ON((size_t) ret < size)) {
25774156d998SAlex Elder 			ret = -ENXIO;
257806ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
257906ecc6cbSAlex Elder 				size, ret);
25804156d998SAlex Elder 			goto out_err;
25814156d998SAlex Elder 		}
25824156d998SAlex Elder 		if (!rbd_dev_ondisk_valid(ondisk)) {
25834156d998SAlex Elder 			ret = -ENXIO;
258406ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "invalid header");
25854156d998SAlex Elder 			goto out_err;
25864156d998SAlex Elder 		}
25874156d998SAlex Elder 
25884156d998SAlex Elder 		names_size = le64_to_cpu(ondisk->snap_names_len);
25894156d998SAlex Elder 		want_count = snap_count;
25904156d998SAlex Elder 		snap_count = le32_to_cpu(ondisk->snap_count);
25914156d998SAlex Elder 	} while (snap_count != want_count);
25924156d998SAlex Elder 
25934156d998SAlex Elder 	return ondisk;
25944156d998SAlex Elder 
25954156d998SAlex Elder out_err:
25964156d998SAlex Elder 	kfree(ondisk);
25974156d998SAlex Elder 
25984156d998SAlex Elder 	return ERR_PTR(ret);
25994156d998SAlex Elder }
26004156d998SAlex Elder 
26014156d998SAlex Elder /*
2602602adf40SYehuda Sadeh  * reload the ondisk the header
2603602adf40SYehuda Sadeh  */
2604602adf40SYehuda Sadeh static int rbd_read_header(struct rbd_device *rbd_dev,
2605602adf40SYehuda Sadeh 			   struct rbd_image_header *header)
2606602adf40SYehuda Sadeh {
26074156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk;
26084156d998SAlex Elder 	u64 ver = 0;
26094156d998SAlex Elder 	int ret;
2610602adf40SYehuda Sadeh 
26114156d998SAlex Elder 	ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
26124156d998SAlex Elder 	if (IS_ERR(ondisk))
26134156d998SAlex Elder 		return PTR_ERR(ondisk);
26144156d998SAlex Elder 	ret = rbd_header_from_disk(header, ondisk);
26154156d998SAlex Elder 	if (ret >= 0)
261659c2be1eSYehuda Sadeh 		header->obj_version = ver;
26174156d998SAlex Elder 	kfree(ondisk);
2618602adf40SYehuda Sadeh 
26194156d998SAlex Elder 	return ret;
2620602adf40SYehuda Sadeh }
2621602adf40SYehuda Sadeh 
262241f38c2bSAlex Elder static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
2623dfc5606dSYehuda Sadeh {
2624dfc5606dSYehuda Sadeh 	struct rbd_snap *snap;
2625a0593290SAlex Elder 	struct rbd_snap *next;
2626dfc5606dSYehuda Sadeh 
2627a0593290SAlex Elder 	list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
262841f38c2bSAlex Elder 		rbd_remove_snap_dev(snap);
2629dfc5606dSYehuda Sadeh }
2630dfc5606dSYehuda Sadeh 
26319478554aSAlex Elder static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
26329478554aSAlex Elder {
26339478554aSAlex Elder 	sector_t size;
26349478554aSAlex Elder 
26350d7dbfceSAlex Elder 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
26369478554aSAlex Elder 		return;
26379478554aSAlex Elder 
26389478554aSAlex Elder 	size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
26399478554aSAlex Elder 	dout("setting size to %llu sectors", (unsigned long long) size);
26409478554aSAlex Elder 	rbd_dev->mapping.size = (u64) size;
26419478554aSAlex Elder 	set_capacity(rbd_dev->disk, size);
26429478554aSAlex Elder }
26439478554aSAlex Elder 
2644602adf40SYehuda Sadeh /*
2645602adf40SYehuda Sadeh  * only read the first part of the ondisk header, without the snaps info
2646602adf40SYehuda Sadeh  */
2647117973fbSAlex Elder static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
2648602adf40SYehuda Sadeh {
2649602adf40SYehuda Sadeh 	int ret;
2650602adf40SYehuda Sadeh 	struct rbd_image_header h;
2651602adf40SYehuda Sadeh 
2652602adf40SYehuda Sadeh 	ret = rbd_read_header(rbd_dev, &h);
2653602adf40SYehuda Sadeh 	if (ret < 0)
2654602adf40SYehuda Sadeh 		return ret;
2655602adf40SYehuda Sadeh 
2656a51aa0c0SJosh Durgin 	down_write(&rbd_dev->header_rwsem);
2657a51aa0c0SJosh Durgin 
26589478554aSAlex Elder 	/* Update image size, and check for resize of mapped image */
26599478554aSAlex Elder 	rbd_dev->header.image_size = h.image_size;
26609478554aSAlex Elder 	rbd_update_mapping_size(rbd_dev);
26619db4b3e3SSage Weil 
2662849b4260SAlex Elder 	/* rbd_dev->header.object_prefix shouldn't change */
2663602adf40SYehuda Sadeh 	kfree(rbd_dev->header.snap_sizes);
2664849b4260SAlex Elder 	kfree(rbd_dev->header.snap_names);
2665d1d25646SJosh Durgin 	/* osd requests may still refer to snapc */
2666d1d25646SJosh Durgin 	ceph_put_snap_context(rbd_dev->header.snapc);
2667602adf40SYehuda Sadeh 
2668b813623aSAlex Elder 	if (hver)
2669b813623aSAlex Elder 		*hver = h.obj_version;
2670a71b891bSJosh Durgin 	rbd_dev->header.obj_version = h.obj_version;
267193a24e08SJosh Durgin 	rbd_dev->header.image_size = h.image_size;
2672602adf40SYehuda Sadeh 	rbd_dev->header.snapc = h.snapc;
2673602adf40SYehuda Sadeh 	rbd_dev->header.snap_names = h.snap_names;
2674602adf40SYehuda Sadeh 	rbd_dev->header.snap_sizes = h.snap_sizes;
2675849b4260SAlex Elder 	/* Free the extra copy of the object prefix */
2676849b4260SAlex Elder 	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
2677849b4260SAlex Elder 	kfree(h.object_prefix);
2678849b4260SAlex Elder 
2679304f6808SAlex Elder 	ret = rbd_dev_snaps_update(rbd_dev);
2680304f6808SAlex Elder 	if (!ret)
2681304f6808SAlex Elder 		ret = rbd_dev_snaps_register(rbd_dev);
2682dfc5606dSYehuda Sadeh 
2683c666601aSJosh Durgin 	up_write(&rbd_dev->header_rwsem);
2684602adf40SYehuda Sadeh 
2685dfc5606dSYehuda Sadeh 	return ret;
2686602adf40SYehuda Sadeh }
2687602adf40SYehuda Sadeh 
2688117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
26891fe5e993SAlex Elder {
26901fe5e993SAlex Elder 	int ret;
26911fe5e993SAlex Elder 
2692117973fbSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
26931fe5e993SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2694117973fbSAlex Elder 	if (rbd_dev->image_format == 1)
2695117973fbSAlex Elder 		ret = rbd_dev_v1_refresh(rbd_dev, hver);
2696117973fbSAlex Elder 	else
2697117973fbSAlex Elder 		ret = rbd_dev_v2_refresh(rbd_dev, hver);
26981fe5e993SAlex Elder 	mutex_unlock(&ctl_mutex);
26991fe5e993SAlex Elder 
27001fe5e993SAlex Elder 	return ret;
27011fe5e993SAlex Elder }
27021fe5e993SAlex Elder 
2703602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
2704602adf40SYehuda Sadeh {
2705602adf40SYehuda Sadeh 	struct gendisk *disk;
2706602adf40SYehuda Sadeh 	struct request_queue *q;
2707593a9e7bSAlex Elder 	u64 segment_size;
2708602adf40SYehuda Sadeh 
2709602adf40SYehuda Sadeh 	/* create gendisk info */
2710602adf40SYehuda Sadeh 	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
2711602adf40SYehuda Sadeh 	if (!disk)
27121fcdb8aaSAlex Elder 		return -ENOMEM;
2713602adf40SYehuda Sadeh 
2714f0f8cef5SAlex Elder 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
2715de71a297SAlex Elder 		 rbd_dev->dev_id);
2716602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
2717602adf40SYehuda Sadeh 	disk->first_minor = 0;
2718602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
2719602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
2720602adf40SYehuda Sadeh 
2721bf0d5f50SAlex Elder 	q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
2722602adf40SYehuda Sadeh 	if (!q)
2723602adf40SYehuda Sadeh 		goto out_disk;
2724029bcbd8SJosh Durgin 
2725593a9e7bSAlex Elder 	/* We use the default size, but let's be explicit about it. */
2726593a9e7bSAlex Elder 	blk_queue_physical_block_size(q, SECTOR_SIZE);
2727593a9e7bSAlex Elder 
2728029bcbd8SJosh Durgin 	/* set io sizes to object size */
2729593a9e7bSAlex Elder 	segment_size = rbd_obj_bytes(&rbd_dev->header);
2730593a9e7bSAlex Elder 	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
2731593a9e7bSAlex Elder 	blk_queue_max_segment_size(q, segment_size);
2732593a9e7bSAlex Elder 	blk_queue_io_min(q, segment_size);
2733593a9e7bSAlex Elder 	blk_queue_io_opt(q, segment_size);
2734029bcbd8SJosh Durgin 
2735602adf40SYehuda Sadeh 	blk_queue_merge_bvec(q, rbd_merge_bvec);
2736602adf40SYehuda Sadeh 	disk->queue = q;
2737602adf40SYehuda Sadeh 
2738602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
2739602adf40SYehuda Sadeh 
2740602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
2741602adf40SYehuda Sadeh 
274212f02944SAlex Elder 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
274312f02944SAlex Elder 
2744602adf40SYehuda Sadeh 	return 0;
2745602adf40SYehuda Sadeh out_disk:
2746602adf40SYehuda Sadeh 	put_disk(disk);
27471fcdb8aaSAlex Elder 
27481fcdb8aaSAlex Elder 	return -ENOMEM;
2749602adf40SYehuda Sadeh }
2750602adf40SYehuda Sadeh 
2751dfc5606dSYehuda Sadeh /*
2752dfc5606dSYehuda Sadeh   sysfs
2753dfc5606dSYehuda Sadeh */
2754602adf40SYehuda Sadeh 
2755593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
2756593a9e7bSAlex Elder {
2757593a9e7bSAlex Elder 	return container_of(dev, struct rbd_device, dev);
2758593a9e7bSAlex Elder }
2759593a9e7bSAlex Elder 
2760dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
2761dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
2762602adf40SYehuda Sadeh {
2763593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2764a51aa0c0SJosh Durgin 	sector_t size;
2765dfc5606dSYehuda Sadeh 
2766a51aa0c0SJosh Durgin 	down_read(&rbd_dev->header_rwsem);
2767a51aa0c0SJosh Durgin 	size = get_capacity(rbd_dev->disk);
2768a51aa0c0SJosh Durgin 	up_read(&rbd_dev->header_rwsem);
2769a51aa0c0SJosh Durgin 
2770a51aa0c0SJosh Durgin 	return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
2771602adf40SYehuda Sadeh }
2772602adf40SYehuda Sadeh 
277334b13184SAlex Elder /*
277434b13184SAlex Elder  * Note this shows the features for whatever's mapped, which is not
277534b13184SAlex Elder  * necessarily the base image.
277634b13184SAlex Elder  */
277734b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev,
277834b13184SAlex Elder 			     struct device_attribute *attr, char *buf)
277934b13184SAlex Elder {
278034b13184SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
278134b13184SAlex Elder 
278234b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
278334b13184SAlex Elder 			(unsigned long long) rbd_dev->mapping.features);
278434b13184SAlex Elder }
278534b13184SAlex Elder 
2786dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
2787dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
2788602adf40SYehuda Sadeh {
2789593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2790dfc5606dSYehuda Sadeh 
2791dfc5606dSYehuda Sadeh 	return sprintf(buf, "%d\n", rbd_dev->major);
2792dfc5606dSYehuda Sadeh }
2793dfc5606dSYehuda Sadeh 
2794dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
2795dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
2796dfc5606dSYehuda Sadeh {
2797593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2798dfc5606dSYehuda Sadeh 
27991dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
28001dbb4399SAlex Elder 			ceph_client_id(rbd_dev->rbd_client->client));
2801dfc5606dSYehuda Sadeh }
2802dfc5606dSYehuda Sadeh 
2803dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
2804dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
2805dfc5606dSYehuda Sadeh {
2806593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2807dfc5606dSYehuda Sadeh 
28080d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
2809dfc5606dSYehuda Sadeh }
2810dfc5606dSYehuda Sadeh 
28119bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
28129bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
28139bb2f334SAlex Elder {
28149bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
28159bb2f334SAlex Elder 
28160d7dbfceSAlex Elder 	return sprintf(buf, "%llu\n",
28170d7dbfceSAlex Elder 		(unsigned long long) rbd_dev->spec->pool_id);
28189bb2f334SAlex Elder }
28199bb2f334SAlex Elder 
2820dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
2821dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
2822dfc5606dSYehuda Sadeh {
2823593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2824dfc5606dSYehuda Sadeh 
2825a92ffdf8SAlex Elder 	if (rbd_dev->spec->image_name)
28260d7dbfceSAlex Elder 		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
2827a92ffdf8SAlex Elder 
2828a92ffdf8SAlex Elder 	return sprintf(buf, "(unknown)\n");
2829dfc5606dSYehuda Sadeh }
2830dfc5606dSYehuda Sadeh 
2831589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev,
2832589d30e0SAlex Elder 			     struct device_attribute *attr, char *buf)
2833589d30e0SAlex Elder {
2834589d30e0SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2835589d30e0SAlex Elder 
28360d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
2837589d30e0SAlex Elder }
2838589d30e0SAlex Elder 
283934b13184SAlex Elder /*
284034b13184SAlex Elder  * Shows the name of the currently-mapped snapshot (or
284134b13184SAlex Elder  * RBD_SNAP_HEAD_NAME for the base image).
284234b13184SAlex Elder  */
2843dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
2844dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
2845dfc5606dSYehuda Sadeh 			     char *buf)
2846dfc5606dSYehuda Sadeh {
2847593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2848dfc5606dSYehuda Sadeh 
28490d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
2850dfc5606dSYehuda Sadeh }
2851dfc5606dSYehuda Sadeh 
285286b00e0dSAlex Elder /*
285386b00e0dSAlex Elder  * For an rbd v2 image, shows the pool id, image id, and snapshot id
285486b00e0dSAlex Elder  * for the parent image.  If there is no parent, simply shows
285586b00e0dSAlex Elder  * "(no parent image)".
285686b00e0dSAlex Elder  */
285786b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev,
285886b00e0dSAlex Elder 			     struct device_attribute *attr,
285986b00e0dSAlex Elder 			     char *buf)
286086b00e0dSAlex Elder {
286186b00e0dSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
286286b00e0dSAlex Elder 	struct rbd_spec *spec = rbd_dev->parent_spec;
286386b00e0dSAlex Elder 	int count;
286486b00e0dSAlex Elder 	char *bufp = buf;
286586b00e0dSAlex Elder 
286686b00e0dSAlex Elder 	if (!spec)
286786b00e0dSAlex Elder 		return sprintf(buf, "(no parent image)\n");
286886b00e0dSAlex Elder 
286986b00e0dSAlex Elder 	count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
287086b00e0dSAlex Elder 			(unsigned long long) spec->pool_id, spec->pool_name);
287186b00e0dSAlex Elder 	if (count < 0)
287286b00e0dSAlex Elder 		return count;
287386b00e0dSAlex Elder 	bufp += count;
287486b00e0dSAlex Elder 
287586b00e0dSAlex Elder 	count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
287686b00e0dSAlex Elder 			spec->image_name ? spec->image_name : "(unknown)");
287786b00e0dSAlex Elder 	if (count < 0)
287886b00e0dSAlex Elder 		return count;
287986b00e0dSAlex Elder 	bufp += count;
288086b00e0dSAlex Elder 
288186b00e0dSAlex Elder 	count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
288286b00e0dSAlex Elder 			(unsigned long long) spec->snap_id, spec->snap_name);
288386b00e0dSAlex Elder 	if (count < 0)
288486b00e0dSAlex Elder 		return count;
288586b00e0dSAlex Elder 	bufp += count;
288686b00e0dSAlex Elder 
288786b00e0dSAlex Elder 	count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
288886b00e0dSAlex Elder 	if (count < 0)
288986b00e0dSAlex Elder 		return count;
289086b00e0dSAlex Elder 	bufp += count;
289186b00e0dSAlex Elder 
289286b00e0dSAlex Elder 	return (ssize_t) (bufp - buf);
289386b00e0dSAlex Elder }
289486b00e0dSAlex Elder 
2895dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
2896dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
2897dfc5606dSYehuda Sadeh 				 const char *buf,
2898dfc5606dSYehuda Sadeh 				 size_t size)
2899dfc5606dSYehuda Sadeh {
2900593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2901b813623aSAlex Elder 	int ret;
2902602adf40SYehuda Sadeh 
2903117973fbSAlex Elder 	ret = rbd_dev_refresh(rbd_dev, NULL);
2904b813623aSAlex Elder 
2905b813623aSAlex Elder 	return ret < 0 ? ret : size;
2906dfc5606dSYehuda Sadeh }
2907602adf40SYehuda Sadeh 
/*
 * Per-device sysfs attributes.  All are world-readable and read-only
 * except "refresh", which is write-only for the owner.
 */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);

/* NULL-terminated list of the attributes declared above */
static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_parent.attr,
	&dev_attr_refresh.attr,
	NULL
};

/* Unnamed group, so the attributes appear directly in the device dir */
static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

/* NULL-terminated group list referenced by rbd_device_type */
static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};
2943dfc5606dSYehuda Sadeh 
/*
 * Release callback for rbd_device_type.  Intentionally empty: nothing
 * is freed here.
 * NOTE(review): presumably the rbd_device is torn down elsewhere
 * (e.g. rbd_dev_destroy()) -- confirm against the callers.
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
}
2947dfc5606dSYehuda Sadeh 
/* Device type for rbd devices: names the type and attaches its attributes */
static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
2953dfc5606dSYehuda Sadeh 
2954dfc5606dSYehuda Sadeh 
2955dfc5606dSYehuda Sadeh /*
2956dfc5606dSYehuda Sadeh   sysfs - snapshots
2957dfc5606dSYehuda Sadeh */
2958dfc5606dSYehuda Sadeh 
2959dfc5606dSYehuda Sadeh static ssize_t rbd_snap_size_show(struct device *dev,
2960dfc5606dSYehuda Sadeh 				  struct device_attribute *attr,
2961dfc5606dSYehuda Sadeh 				  char *buf)
2962dfc5606dSYehuda Sadeh {
2963dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2964dfc5606dSYehuda Sadeh 
29653591538fSJosh Durgin 	return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
2966dfc5606dSYehuda Sadeh }
2967dfc5606dSYehuda Sadeh 
2968dfc5606dSYehuda Sadeh static ssize_t rbd_snap_id_show(struct device *dev,
2969dfc5606dSYehuda Sadeh 				struct device_attribute *attr,
2970dfc5606dSYehuda Sadeh 				char *buf)
2971dfc5606dSYehuda Sadeh {
2972dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2973dfc5606dSYehuda Sadeh 
2974593a9e7bSAlex Elder 	return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
2975dfc5606dSYehuda Sadeh }
2976dfc5606dSYehuda Sadeh 
297734b13184SAlex Elder static ssize_t rbd_snap_features_show(struct device *dev,
297834b13184SAlex Elder 				struct device_attribute *attr,
297934b13184SAlex Elder 				char *buf)
298034b13184SAlex Elder {
298134b13184SAlex Elder 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
298234b13184SAlex Elder 
298334b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
298434b13184SAlex Elder 			(unsigned long long) snap->features);
298534b13184SAlex Elder }
298634b13184SAlex Elder 
/* Per-snapshot sysfs attributes; all world-readable and read-only */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);

/* NULL-terminated list of the snapshot attributes declared above */
static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_snap_features.attr,
	NULL,
};

/* Unnamed group holding the snapshot attributes */
static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};
3001dfc5606dSYehuda Sadeh 
3002dfc5606dSYehuda Sadeh static void rbd_snap_dev_release(struct device *dev)
3003dfc5606dSYehuda Sadeh {
3004dfc5606dSYehuda Sadeh 	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
3005dfc5606dSYehuda Sadeh 	kfree(snap->name);
3006dfc5606dSYehuda Sadeh 	kfree(snap);
3007dfc5606dSYehuda Sadeh }
3008dfc5606dSYehuda Sadeh 
/* NULL-terminated group list referenced by rbd_snap_device_type */
static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

/*
 * Device type for snapshot devices.  rbd_snap_registered() compares
 * a snapshot's dev.type against this object's address.
 */
static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
3018dfc5606dSYehuda Sadeh 
30198b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
30208b8fb99cSAlex Elder {
30218b8fb99cSAlex Elder 	kref_get(&spec->kref);
30228b8fb99cSAlex Elder 
30238b8fb99cSAlex Elder 	return spec;
30248b8fb99cSAlex Elder }
30258b8fb99cSAlex Elder 
30268b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref);
30278b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec)
30288b8fb99cSAlex Elder {
30298b8fb99cSAlex Elder 	if (spec)
30308b8fb99cSAlex Elder 		kref_put(&spec->kref, rbd_spec_free);
30318b8fb99cSAlex Elder }
30328b8fb99cSAlex Elder 
30338b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void)
30348b8fb99cSAlex Elder {
30358b8fb99cSAlex Elder 	struct rbd_spec *spec;
30368b8fb99cSAlex Elder 
30378b8fb99cSAlex Elder 	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
30388b8fb99cSAlex Elder 	if (!spec)
30398b8fb99cSAlex Elder 		return NULL;
30408b8fb99cSAlex Elder 	kref_init(&spec->kref);
30418b8fb99cSAlex Elder 
30428b8fb99cSAlex Elder 	return spec;
30438b8fb99cSAlex Elder }
30448b8fb99cSAlex Elder 
30458b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref)
30468b8fb99cSAlex Elder {
30478b8fb99cSAlex Elder 	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
30488b8fb99cSAlex Elder 
30498b8fb99cSAlex Elder 	kfree(spec->pool_name);
30508b8fb99cSAlex Elder 	kfree(spec->image_id);
30518b8fb99cSAlex Elder 	kfree(spec->image_name);
30528b8fb99cSAlex Elder 	kfree(spec->snap_name);
30538b8fb99cSAlex Elder 	kfree(spec);
30548b8fb99cSAlex Elder }
30558b8fb99cSAlex Elder 
3056cc344fa1SAlex Elder static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
3057c53d5893SAlex Elder 				struct rbd_spec *spec)
3058c53d5893SAlex Elder {
3059c53d5893SAlex Elder 	struct rbd_device *rbd_dev;
3060c53d5893SAlex Elder 
3061c53d5893SAlex Elder 	rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
3062c53d5893SAlex Elder 	if (!rbd_dev)
3063c53d5893SAlex Elder 		return NULL;
3064c53d5893SAlex Elder 
3065c53d5893SAlex Elder 	spin_lock_init(&rbd_dev->lock);
30666d292906SAlex Elder 	rbd_dev->flags = 0;
3067c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->node);
3068c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->snaps);
3069c53d5893SAlex Elder 	init_rwsem(&rbd_dev->header_rwsem);
3070c53d5893SAlex Elder 
3071c53d5893SAlex Elder 	rbd_dev->spec = spec;
3072c53d5893SAlex Elder 	rbd_dev->rbd_client = rbdc;
3073c53d5893SAlex Elder 
30740903e875SAlex Elder 	/* Initialize the layout used for all rbd requests */
30750903e875SAlex Elder 
30760903e875SAlex Elder 	rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
30770903e875SAlex Elder 	rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
30780903e875SAlex Elder 	rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
30790903e875SAlex Elder 	rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
30800903e875SAlex Elder 
3081c53d5893SAlex Elder 	return rbd_dev;
3082c53d5893SAlex Elder }
3083c53d5893SAlex Elder 
3084c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev)
3085c53d5893SAlex Elder {
308686b00e0dSAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
3087c53d5893SAlex Elder 	kfree(rbd_dev->header_name);
3088c53d5893SAlex Elder 	rbd_put_client(rbd_dev->rbd_client);
3089c53d5893SAlex Elder 	rbd_spec_put(rbd_dev->spec);
3090c53d5893SAlex Elder 	kfree(rbd_dev);
3091c53d5893SAlex Elder }
3092c53d5893SAlex Elder 
/*
 * Report whether a snapshot's device has been registered.  The answer
 * is derived from the device type, which rbd_register_snap_dev() sets
 * just before registering the device.
 */
static bool rbd_snap_registered(struct rbd_snap *snap)
{
	bool ret = snap->dev.type == &rbd_snap_device_type;
	bool reg = device_is_registered(&snap->dev);

	/*
	 * (!ret) ^ reg is true exactly when ret and reg agree, so this
	 * asserts that the type check matches the driver core's view.
	 */
	rbd_assert(!ret ^ reg);

	return ret;
}
3102304f6808SAlex Elder 
310341f38c2bSAlex Elder static void rbd_remove_snap_dev(struct rbd_snap *snap)
3104dfc5606dSYehuda Sadeh {
3105dfc5606dSYehuda Sadeh 	list_del(&snap->node);
3106304f6808SAlex Elder 	if (device_is_registered(&snap->dev))
3107dfc5606dSYehuda Sadeh 		device_unregister(&snap->dev);
3108dfc5606dSYehuda Sadeh }
3109dfc5606dSYehuda Sadeh 
311014e7085dSAlex Elder static int rbd_register_snap_dev(struct rbd_snap *snap,
3111dfc5606dSYehuda Sadeh 				  struct device *parent)
3112dfc5606dSYehuda Sadeh {
3113dfc5606dSYehuda Sadeh 	struct device *dev = &snap->dev;
3114dfc5606dSYehuda Sadeh 	int ret;
3115dfc5606dSYehuda Sadeh 
3116dfc5606dSYehuda Sadeh 	dev->type = &rbd_snap_device_type;
3117dfc5606dSYehuda Sadeh 	dev->parent = parent;
3118dfc5606dSYehuda Sadeh 	dev->release = rbd_snap_dev_release;
3119d4b125e9SAlex Elder 	dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
3120304f6808SAlex Elder 	dout("%s: registering device for snapshot %s\n", __func__, snap->name);
3121304f6808SAlex Elder 
3122dfc5606dSYehuda Sadeh 	ret = device_register(dev);
3123dfc5606dSYehuda Sadeh 
3124dfc5606dSYehuda Sadeh 	return ret;
3125dfc5606dSYehuda Sadeh }
3126dfc5606dSYehuda Sadeh 
31274e891e0aSAlex Elder static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
3128c8d18425SAlex Elder 						const char *snap_name,
312934b13184SAlex Elder 						u64 snap_id, u64 snap_size,
313034b13184SAlex Elder 						u64 snap_features)
3131dfc5606dSYehuda Sadeh {
31324e891e0aSAlex Elder 	struct rbd_snap *snap;
3133dfc5606dSYehuda Sadeh 	int ret;
31344e891e0aSAlex Elder 
31354e891e0aSAlex Elder 	snap = kzalloc(sizeof (*snap), GFP_KERNEL);
3136dfc5606dSYehuda Sadeh 	if (!snap)
31374e891e0aSAlex Elder 		return ERR_PTR(-ENOMEM);
31384e891e0aSAlex Elder 
31394e891e0aSAlex Elder 	ret = -ENOMEM;
3140c8d18425SAlex Elder 	snap->name = kstrdup(snap_name, GFP_KERNEL);
31414e891e0aSAlex Elder 	if (!snap->name)
31424e891e0aSAlex Elder 		goto err;
31434e891e0aSAlex Elder 
3144c8d18425SAlex Elder 	snap->id = snap_id;
3145c8d18425SAlex Elder 	snap->size = snap_size;
314634b13184SAlex Elder 	snap->features = snap_features;
31474e891e0aSAlex Elder 
31484e891e0aSAlex Elder 	return snap;
31494e891e0aSAlex Elder 
3150dfc5606dSYehuda Sadeh err:
3151dfc5606dSYehuda Sadeh 	kfree(snap->name);
3152dfc5606dSYehuda Sadeh 	kfree(snap);
31534e891e0aSAlex Elder 
31544e891e0aSAlex Elder 	return ERR_PTR(ret);
3155dfc5606dSYehuda Sadeh }
3156dfc5606dSYehuda Sadeh 
3157cd892126SAlex Elder static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
3158cd892126SAlex Elder 		u64 *snap_size, u64 *snap_features)
3159cd892126SAlex Elder {
3160cd892126SAlex Elder 	char *snap_name;
3161cd892126SAlex Elder 
3162cd892126SAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
3163cd892126SAlex Elder 
3164cd892126SAlex Elder 	*snap_size = rbd_dev->header.snap_sizes[which];
3165cd892126SAlex Elder 	*snap_features = 0;	/* No features for v1 */
3166cd892126SAlex Elder 
3167cd892126SAlex Elder 	/* Skip over names until we find the one we are looking for */
3168cd892126SAlex Elder 
3169cd892126SAlex Elder 	snap_name = rbd_dev->header.snap_names;
3170cd892126SAlex Elder 	while (which--)
3171cd892126SAlex Elder 		snap_name += strlen(snap_name) + 1;
3172cd892126SAlex Elder 
3173cd892126SAlex Elder 	return snap_name;
3174cd892126SAlex Elder }
3175cd892126SAlex Elder 
3176dfc5606dSYehuda Sadeh /*
31779d475de5SAlex Elder  * Get the size and object order for an image snapshot, or if
31789d475de5SAlex Elder  * snap_id is CEPH_NOSNAP, gets this information for the base
31799d475de5SAlex Elder  * image.
31809d475de5SAlex Elder  */
31819d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
31829d475de5SAlex Elder 				u8 *order, u64 *snap_size)
31839d475de5SAlex Elder {
31849d475de5SAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
31859d475de5SAlex Elder 	int ret;
31869d475de5SAlex Elder 	struct {
31879d475de5SAlex Elder 		u8 order;
31889d475de5SAlex Elder 		__le64 size;
31899d475de5SAlex Elder 	} __attribute__ ((packed)) size_buf = { 0 };
31909d475de5SAlex Elder 
319136be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
31929d475de5SAlex Elder 				"rbd", "get_size",
31939d475de5SAlex Elder 				(char *) &snapid, sizeof (snapid),
319407b2391fSAlex Elder 				(char *) &size_buf, sizeof (size_buf), NULL);
319536be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
31969d475de5SAlex Elder 	if (ret < 0)
31979d475de5SAlex Elder 		return ret;
31989d475de5SAlex Elder 
31999d475de5SAlex Elder 	*order = size_buf.order;
32009d475de5SAlex Elder 	*snap_size = le64_to_cpu(size_buf.size);
32019d475de5SAlex Elder 
32029d475de5SAlex Elder 	dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
32039d475de5SAlex Elder 		(unsigned long long) snap_id, (unsigned int) *order,
32049d475de5SAlex Elder 		(unsigned long long) *snap_size);
32059d475de5SAlex Elder 
32069d475de5SAlex Elder 	return 0;
32079d475de5SAlex Elder }
32089d475de5SAlex Elder 
32099d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
32109d475de5SAlex Elder {
32119d475de5SAlex Elder 	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
32129d475de5SAlex Elder 					&rbd_dev->header.obj_order,
32139d475de5SAlex Elder 					&rbd_dev->header.image_size);
32149d475de5SAlex Elder }
32159d475de5SAlex Elder 
32161e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
32171e130199SAlex Elder {
32181e130199SAlex Elder 	void *reply_buf;
32191e130199SAlex Elder 	int ret;
32201e130199SAlex Elder 	void *p;
32211e130199SAlex Elder 
32221e130199SAlex Elder 	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
32231e130199SAlex Elder 	if (!reply_buf)
32241e130199SAlex Elder 		return -ENOMEM;
32251e130199SAlex Elder 
322636be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
32271e130199SAlex Elder 				"rbd", "get_object_prefix",
32281e130199SAlex Elder 				NULL, 0,
322907b2391fSAlex Elder 				reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
323036be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
32311e130199SAlex Elder 	if (ret < 0)
32321e130199SAlex Elder 		goto out;
32331e130199SAlex Elder 
32341e130199SAlex Elder 	p = reply_buf;
32351e130199SAlex Elder 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
32361e130199SAlex Elder 						p + RBD_OBJ_PREFIX_LEN_MAX,
32371e130199SAlex Elder 						NULL, GFP_NOIO);
32381e130199SAlex Elder 
32391e130199SAlex Elder 	if (IS_ERR(rbd_dev->header.object_prefix)) {
32401e130199SAlex Elder 		ret = PTR_ERR(rbd_dev->header.object_prefix);
32411e130199SAlex Elder 		rbd_dev->header.object_prefix = NULL;
32421e130199SAlex Elder 	} else {
32431e130199SAlex Elder 		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
32441e130199SAlex Elder 	}
32451e130199SAlex Elder 
32461e130199SAlex Elder out:
32471e130199SAlex Elder 	kfree(reply_buf);
32481e130199SAlex Elder 
32491e130199SAlex Elder 	return ret;
32501e130199SAlex Elder }
32511e130199SAlex Elder 
3252b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
3253b1b5402aSAlex Elder 		u64 *snap_features)
3254b1b5402aSAlex Elder {
3255b1b5402aSAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
3256b1b5402aSAlex Elder 	struct {
3257b1b5402aSAlex Elder 		__le64 features;
3258b1b5402aSAlex Elder 		__le64 incompat;
3259b1b5402aSAlex Elder 	} features_buf = { 0 };
3260d889140cSAlex Elder 	u64 incompat;
3261b1b5402aSAlex Elder 	int ret;
3262b1b5402aSAlex Elder 
326336be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3264b1b5402aSAlex Elder 				"rbd", "get_features",
3265b1b5402aSAlex Elder 				(char *) &snapid, sizeof (snapid),
3266b1b5402aSAlex Elder 				(char *) &features_buf, sizeof (features_buf),
326707b2391fSAlex Elder 				NULL);
326836be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3269b1b5402aSAlex Elder 	if (ret < 0)
3270b1b5402aSAlex Elder 		return ret;
3271d889140cSAlex Elder 
3272d889140cSAlex Elder 	incompat = le64_to_cpu(features_buf.incompat);
32735cbf6f12SAlex Elder 	if (incompat & ~RBD_FEATURES_SUPPORTED)
3274b8f5c6edSAlex Elder 		return -ENXIO;
3275d889140cSAlex Elder 
3276b1b5402aSAlex Elder 	*snap_features = le64_to_cpu(features_buf.features);
3277b1b5402aSAlex Elder 
3278b1b5402aSAlex Elder 	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
3279b1b5402aSAlex Elder 		(unsigned long long) snap_id,
3280b1b5402aSAlex Elder 		(unsigned long long) *snap_features,
3281b1b5402aSAlex Elder 		(unsigned long long) le64_to_cpu(features_buf.incompat));
3282b1b5402aSAlex Elder 
3283b1b5402aSAlex Elder 	return 0;
3284b1b5402aSAlex Elder }
3285b1b5402aSAlex Elder 
3286b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
3287b1b5402aSAlex Elder {
3288b1b5402aSAlex Elder 	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
3289b1b5402aSAlex Elder 						&rbd_dev->header.features);
3290b1b5402aSAlex Elder }
3291b1b5402aSAlex Elder 
329286b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
329386b00e0dSAlex Elder {
329486b00e0dSAlex Elder 	struct rbd_spec *parent_spec;
329586b00e0dSAlex Elder 	size_t size;
329686b00e0dSAlex Elder 	void *reply_buf = NULL;
329786b00e0dSAlex Elder 	__le64 snapid;
329886b00e0dSAlex Elder 	void *p;
329986b00e0dSAlex Elder 	void *end;
330086b00e0dSAlex Elder 	char *image_id;
330186b00e0dSAlex Elder 	u64 overlap;
330286b00e0dSAlex Elder 	int ret;
330386b00e0dSAlex Elder 
330486b00e0dSAlex Elder 	parent_spec = rbd_spec_alloc();
330586b00e0dSAlex Elder 	if (!parent_spec)
330686b00e0dSAlex Elder 		return -ENOMEM;
330786b00e0dSAlex Elder 
330886b00e0dSAlex Elder 	size = sizeof (__le64) +				/* pool_id */
330986b00e0dSAlex Elder 		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
331086b00e0dSAlex Elder 		sizeof (__le64) +				/* snap_id */
331186b00e0dSAlex Elder 		sizeof (__le64);				/* overlap */
331286b00e0dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
331386b00e0dSAlex Elder 	if (!reply_buf) {
331486b00e0dSAlex Elder 		ret = -ENOMEM;
331586b00e0dSAlex Elder 		goto out_err;
331686b00e0dSAlex Elder 	}
331786b00e0dSAlex Elder 
331886b00e0dSAlex Elder 	snapid = cpu_to_le64(CEPH_NOSNAP);
331936be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
332086b00e0dSAlex Elder 				"rbd", "get_parent",
332186b00e0dSAlex Elder 				(char *) &snapid, sizeof (snapid),
332207b2391fSAlex Elder 				(char *) reply_buf, size, NULL);
332336be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
332486b00e0dSAlex Elder 	if (ret < 0)
332586b00e0dSAlex Elder 		goto out_err;
332686b00e0dSAlex Elder 
332786b00e0dSAlex Elder 	ret = -ERANGE;
332886b00e0dSAlex Elder 	p = reply_buf;
332986b00e0dSAlex Elder 	end = (char *) reply_buf + size;
333086b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
333186b00e0dSAlex Elder 	if (parent_spec->pool_id == CEPH_NOPOOL)
333286b00e0dSAlex Elder 		goto out;	/* No parent?  No problem. */
333386b00e0dSAlex Elder 
33340903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
33350903e875SAlex Elder 
33360903e875SAlex Elder 	ret = -EIO;
33370903e875SAlex Elder 	if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
33380903e875SAlex Elder 		goto out;
33390903e875SAlex Elder 
3340979ed480SAlex Elder 	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
334186b00e0dSAlex Elder 	if (IS_ERR(image_id)) {
334286b00e0dSAlex Elder 		ret = PTR_ERR(image_id);
334386b00e0dSAlex Elder 		goto out_err;
334486b00e0dSAlex Elder 	}
334586b00e0dSAlex Elder 	parent_spec->image_id = image_id;
334686b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
334786b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, overlap, out_err);
334886b00e0dSAlex Elder 
334986b00e0dSAlex Elder 	rbd_dev->parent_overlap = overlap;
335086b00e0dSAlex Elder 	rbd_dev->parent_spec = parent_spec;
335186b00e0dSAlex Elder 	parent_spec = NULL;	/* rbd_dev now owns this */
335286b00e0dSAlex Elder out:
335386b00e0dSAlex Elder 	ret = 0;
335486b00e0dSAlex Elder out_err:
335586b00e0dSAlex Elder 	kfree(reply_buf);
335686b00e0dSAlex Elder 	rbd_spec_put(parent_spec);
335786b00e0dSAlex Elder 
335886b00e0dSAlex Elder 	return ret;
335986b00e0dSAlex Elder }
336086b00e0dSAlex Elder 
33619e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
33629e15b77dSAlex Elder {
33639e15b77dSAlex Elder 	size_t image_id_size;
33649e15b77dSAlex Elder 	char *image_id;
33659e15b77dSAlex Elder 	void *p;
33669e15b77dSAlex Elder 	void *end;
33679e15b77dSAlex Elder 	size_t size;
33689e15b77dSAlex Elder 	void *reply_buf = NULL;
33699e15b77dSAlex Elder 	size_t len = 0;
33709e15b77dSAlex Elder 	char *image_name = NULL;
33719e15b77dSAlex Elder 	int ret;
33729e15b77dSAlex Elder 
33739e15b77dSAlex Elder 	rbd_assert(!rbd_dev->spec->image_name);
33749e15b77dSAlex Elder 
337569e7a02fSAlex Elder 	len = strlen(rbd_dev->spec->image_id);
337669e7a02fSAlex Elder 	image_id_size = sizeof (__le32) + len;
33779e15b77dSAlex Elder 	image_id = kmalloc(image_id_size, GFP_KERNEL);
33789e15b77dSAlex Elder 	if (!image_id)
33799e15b77dSAlex Elder 		return NULL;
33809e15b77dSAlex Elder 
33819e15b77dSAlex Elder 	p = image_id;
33829e15b77dSAlex Elder 	end = (char *) image_id + image_id_size;
338369e7a02fSAlex Elder 	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);
33849e15b77dSAlex Elder 
33859e15b77dSAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
33869e15b77dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
33879e15b77dSAlex Elder 	if (!reply_buf)
33889e15b77dSAlex Elder 		goto out;
33899e15b77dSAlex Elder 
339036be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
33919e15b77dSAlex Elder 				"rbd", "dir_get_name",
33929e15b77dSAlex Elder 				image_id, image_id_size,
339307b2391fSAlex Elder 				(char *) reply_buf, size, NULL);
33949e15b77dSAlex Elder 	if (ret < 0)
33959e15b77dSAlex Elder 		goto out;
33969e15b77dSAlex Elder 	p = reply_buf;
33979e15b77dSAlex Elder 	end = (char *) reply_buf + size;
33989e15b77dSAlex Elder 	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
33999e15b77dSAlex Elder 	if (IS_ERR(image_name))
34009e15b77dSAlex Elder 		image_name = NULL;
34019e15b77dSAlex Elder 	else
34029e15b77dSAlex Elder 		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
34039e15b77dSAlex Elder out:
34049e15b77dSAlex Elder 	kfree(reply_buf);
34059e15b77dSAlex Elder 	kfree(image_id);
34069e15b77dSAlex Elder 
34079e15b77dSAlex Elder 	return image_name;
34089e15b77dSAlex Elder }
34099e15b77dSAlex Elder 
34109e15b77dSAlex Elder /*
34119e15b77dSAlex Elder  * When a parent image gets probed, we only have the pool, image,
34129e15b77dSAlex Elder  * and snapshot ids but not the names of any of them.  This call
34139e15b77dSAlex Elder  * is made later to fill in those names.  It has to be done after
34149e15b77dSAlex Elder  * rbd_dev_snaps_update() has completed because some of the
34159e15b77dSAlex Elder  * information (in particular, snapshot name) is not available
34169e15b77dSAlex Elder  * until then.
34179e15b77dSAlex Elder  */
34189e15b77dSAlex Elder static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
34199e15b77dSAlex Elder {
34209e15b77dSAlex Elder 	struct ceph_osd_client *osdc;
34219e15b77dSAlex Elder 	const char *name;
34229e15b77dSAlex Elder 	void *reply_buf = NULL;
34239e15b77dSAlex Elder 	int ret;
34249e15b77dSAlex Elder 
34259e15b77dSAlex Elder 	if (rbd_dev->spec->pool_name)
34269e15b77dSAlex Elder 		return 0;	/* Already have the names */
34279e15b77dSAlex Elder 
34289e15b77dSAlex Elder 	/* Look up the pool name */
34299e15b77dSAlex Elder 
34309e15b77dSAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
34319e15b77dSAlex Elder 	name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
3432935dc89fSAlex Elder 	if (!name) {
3433935dc89fSAlex Elder 		rbd_warn(rbd_dev, "there is no pool with id %llu",
3434935dc89fSAlex Elder 			rbd_dev->spec->pool_id);	/* Really a BUG() */
3435935dc89fSAlex Elder 		return -EIO;
3436935dc89fSAlex Elder 	}
34379e15b77dSAlex Elder 
34389e15b77dSAlex Elder 	rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
34399e15b77dSAlex Elder 	if (!rbd_dev->spec->pool_name)
34409e15b77dSAlex Elder 		return -ENOMEM;
34419e15b77dSAlex Elder 
34429e15b77dSAlex Elder 	/* Fetch the image name; tolerate failure here */
34439e15b77dSAlex Elder 
34449e15b77dSAlex Elder 	name = rbd_dev_image_name(rbd_dev);
344569e7a02fSAlex Elder 	if (name)
34469e15b77dSAlex Elder 		rbd_dev->spec->image_name = (char *) name;
344769e7a02fSAlex Elder 	else
344806ecc6cbSAlex Elder 		rbd_warn(rbd_dev, "unable to get image name");
34499e15b77dSAlex Elder 
34509e15b77dSAlex Elder 	/* Look up the snapshot name. */
34519e15b77dSAlex Elder 
34529e15b77dSAlex Elder 	name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
34539e15b77dSAlex Elder 	if (!name) {
3454935dc89fSAlex Elder 		rbd_warn(rbd_dev, "no snapshot with id %llu",
3455935dc89fSAlex Elder 			rbd_dev->spec->snap_id);	/* Really a BUG() */
34569e15b77dSAlex Elder 		ret = -EIO;
34579e15b77dSAlex Elder 		goto out_err;
34589e15b77dSAlex Elder 	}
34599e15b77dSAlex Elder 	rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
34609e15b77dSAlex Elder 	if(!rbd_dev->spec->snap_name)
34619e15b77dSAlex Elder 		goto out_err;
34629e15b77dSAlex Elder 
34639e15b77dSAlex Elder 	return 0;
34649e15b77dSAlex Elder out_err:
34659e15b77dSAlex Elder 	kfree(reply_buf);
34669e15b77dSAlex Elder 	kfree(rbd_dev->spec->pool_name);
34679e15b77dSAlex Elder 	rbd_dev->spec->pool_name = NULL;
34689e15b77dSAlex Elder 
34699e15b77dSAlex Elder 	return ret;
34709e15b77dSAlex Elder }
34719e15b77dSAlex Elder 
34726e14b1a6SAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
347335d489f9SAlex Elder {
347435d489f9SAlex Elder 	size_t size;
347535d489f9SAlex Elder 	int ret;
347635d489f9SAlex Elder 	void *reply_buf;
347735d489f9SAlex Elder 	void *p;
347835d489f9SAlex Elder 	void *end;
347935d489f9SAlex Elder 	u64 seq;
348035d489f9SAlex Elder 	u32 snap_count;
348135d489f9SAlex Elder 	struct ceph_snap_context *snapc;
348235d489f9SAlex Elder 	u32 i;
348335d489f9SAlex Elder 
348435d489f9SAlex Elder 	/*
348535d489f9SAlex Elder 	 * We'll need room for the seq value (maximum snapshot id),
348635d489f9SAlex Elder 	 * snapshot count, and array of that many snapshot ids.
348735d489f9SAlex Elder 	 * For now we have a fixed upper limit on the number we're
348835d489f9SAlex Elder 	 * prepared to receive.
348935d489f9SAlex Elder 	 */
349035d489f9SAlex Elder 	size = sizeof (__le64) + sizeof (__le32) +
349135d489f9SAlex Elder 			RBD_MAX_SNAP_COUNT * sizeof (__le64);
349235d489f9SAlex Elder 	reply_buf = kzalloc(size, GFP_KERNEL);
349335d489f9SAlex Elder 	if (!reply_buf)
349435d489f9SAlex Elder 		return -ENOMEM;
349535d489f9SAlex Elder 
349636be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
349735d489f9SAlex Elder 				"rbd", "get_snapcontext",
349835d489f9SAlex Elder 				NULL, 0,
349907b2391fSAlex Elder 				reply_buf, size, ver);
350036be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
350135d489f9SAlex Elder 	if (ret < 0)
350235d489f9SAlex Elder 		goto out;
350335d489f9SAlex Elder 
350435d489f9SAlex Elder 	ret = -ERANGE;
350535d489f9SAlex Elder 	p = reply_buf;
350635d489f9SAlex Elder 	end = (char *) reply_buf + size;
350735d489f9SAlex Elder 	ceph_decode_64_safe(&p, end, seq, out);
350835d489f9SAlex Elder 	ceph_decode_32_safe(&p, end, snap_count, out);
350935d489f9SAlex Elder 
351035d489f9SAlex Elder 	/*
351135d489f9SAlex Elder 	 * Make sure the reported number of snapshot ids wouldn't go
351235d489f9SAlex Elder 	 * beyond the end of our buffer.  But before checking that,
351335d489f9SAlex Elder 	 * make sure the computed size of the snapshot context we
351435d489f9SAlex Elder 	 * allocate is representable in a size_t.
351535d489f9SAlex Elder 	 */
351635d489f9SAlex Elder 	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
351735d489f9SAlex Elder 				 / sizeof (u64)) {
351835d489f9SAlex Elder 		ret = -EINVAL;
351935d489f9SAlex Elder 		goto out;
352035d489f9SAlex Elder 	}
352135d489f9SAlex Elder 	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
352235d489f9SAlex Elder 		goto out;
352335d489f9SAlex Elder 
352435d489f9SAlex Elder 	size = sizeof (struct ceph_snap_context) +
352535d489f9SAlex Elder 				snap_count * sizeof (snapc->snaps[0]);
352635d489f9SAlex Elder 	snapc = kmalloc(size, GFP_KERNEL);
352735d489f9SAlex Elder 	if (!snapc) {
352835d489f9SAlex Elder 		ret = -ENOMEM;
352935d489f9SAlex Elder 		goto out;
353035d489f9SAlex Elder 	}
353135d489f9SAlex Elder 
353235d489f9SAlex Elder 	atomic_set(&snapc->nref, 1);
353335d489f9SAlex Elder 	snapc->seq = seq;
353435d489f9SAlex Elder 	snapc->num_snaps = snap_count;
353535d489f9SAlex Elder 	for (i = 0; i < snap_count; i++)
353635d489f9SAlex Elder 		snapc->snaps[i] = ceph_decode_64(&p);
353735d489f9SAlex Elder 
353835d489f9SAlex Elder 	rbd_dev->header.snapc = snapc;
353935d489f9SAlex Elder 
354035d489f9SAlex Elder 	dout("  snap context seq = %llu, snap_count = %u\n",
354135d489f9SAlex Elder 		(unsigned long long) seq, (unsigned int) snap_count);
354235d489f9SAlex Elder 
354335d489f9SAlex Elder out:
354435d489f9SAlex Elder 	kfree(reply_buf);
354535d489f9SAlex Elder 
354635d489f9SAlex Elder 	return 0;
354735d489f9SAlex Elder }
354835d489f9SAlex Elder 
3549b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
3550b8b1e2dbSAlex Elder {
3551b8b1e2dbSAlex Elder 	size_t size;
3552b8b1e2dbSAlex Elder 	void *reply_buf;
3553b8b1e2dbSAlex Elder 	__le64 snap_id;
3554b8b1e2dbSAlex Elder 	int ret;
3555b8b1e2dbSAlex Elder 	void *p;
3556b8b1e2dbSAlex Elder 	void *end;
3557b8b1e2dbSAlex Elder 	char *snap_name;
3558b8b1e2dbSAlex Elder 
3559b8b1e2dbSAlex Elder 	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
3560b8b1e2dbSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
3561b8b1e2dbSAlex Elder 	if (!reply_buf)
3562b8b1e2dbSAlex Elder 		return ERR_PTR(-ENOMEM);
3563b8b1e2dbSAlex Elder 
3564b8b1e2dbSAlex Elder 	snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
356536be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3566b8b1e2dbSAlex Elder 				"rbd", "get_snapshot_name",
3567b8b1e2dbSAlex Elder 				(char *) &snap_id, sizeof (snap_id),
356807b2391fSAlex Elder 				reply_buf, size, NULL);
356936be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3570b8b1e2dbSAlex Elder 	if (ret < 0)
3571b8b1e2dbSAlex Elder 		goto out;
3572b8b1e2dbSAlex Elder 
3573b8b1e2dbSAlex Elder 	p = reply_buf;
3574b8b1e2dbSAlex Elder 	end = (char *) reply_buf + size;
3575e5c35534SAlex Elder 	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
3576b8b1e2dbSAlex Elder 	if (IS_ERR(snap_name)) {
3577b8b1e2dbSAlex Elder 		ret = PTR_ERR(snap_name);
3578b8b1e2dbSAlex Elder 		goto out;
3579b8b1e2dbSAlex Elder 	} else {
3580b8b1e2dbSAlex Elder 		dout("  snap_id 0x%016llx snap_name = %s\n",
3581b8b1e2dbSAlex Elder 			(unsigned long long) le64_to_cpu(snap_id), snap_name);
3582b8b1e2dbSAlex Elder 	}
3583b8b1e2dbSAlex Elder 	kfree(reply_buf);
3584b8b1e2dbSAlex Elder 
3585b8b1e2dbSAlex Elder 	return snap_name;
3586b8b1e2dbSAlex Elder out:
3587b8b1e2dbSAlex Elder 	kfree(reply_buf);
3588b8b1e2dbSAlex Elder 
3589b8b1e2dbSAlex Elder 	return ERR_PTR(ret);
3590b8b1e2dbSAlex Elder }
3591b8b1e2dbSAlex Elder 
3592b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
3593b8b1e2dbSAlex Elder 		u64 *snap_size, u64 *snap_features)
3594b8b1e2dbSAlex Elder {
3595e0b49868SAlex Elder 	u64 snap_id;
3596b8b1e2dbSAlex Elder 	u8 order;
3597b8b1e2dbSAlex Elder 	int ret;
3598b8b1e2dbSAlex Elder 
3599b8b1e2dbSAlex Elder 	snap_id = rbd_dev->header.snapc->snaps[which];
3600b8b1e2dbSAlex Elder 	ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
3601b8b1e2dbSAlex Elder 	if (ret)
3602b8b1e2dbSAlex Elder 		return ERR_PTR(ret);
3603b8b1e2dbSAlex Elder 	ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
3604b8b1e2dbSAlex Elder 	if (ret)
3605b8b1e2dbSAlex Elder 		return ERR_PTR(ret);
3606b8b1e2dbSAlex Elder 
3607b8b1e2dbSAlex Elder 	return rbd_dev_v2_snap_name(rbd_dev, which);
3608b8b1e2dbSAlex Elder }
3609b8b1e2dbSAlex Elder 
3610b8b1e2dbSAlex Elder static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
3611b8b1e2dbSAlex Elder 		u64 *snap_size, u64 *snap_features)
3612b8b1e2dbSAlex Elder {
3613b8b1e2dbSAlex Elder 	if (rbd_dev->image_format == 1)
3614b8b1e2dbSAlex Elder 		return rbd_dev_v1_snap_info(rbd_dev, which,
3615b8b1e2dbSAlex Elder 					snap_size, snap_features);
3616b8b1e2dbSAlex Elder 	if (rbd_dev->image_format == 2)
3617b8b1e2dbSAlex Elder 		return rbd_dev_v2_snap_info(rbd_dev, which,
3618b8b1e2dbSAlex Elder 					snap_size, snap_features);
3619b8b1e2dbSAlex Elder 	return ERR_PTR(-EINVAL);
3620b8b1e2dbSAlex Elder }
3621b8b1e2dbSAlex Elder 
3622117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
3623117973fbSAlex Elder {
3624117973fbSAlex Elder 	int ret;
3625117973fbSAlex Elder 	__u8 obj_order;
3626117973fbSAlex Elder 
3627117973fbSAlex Elder 	down_write(&rbd_dev->header_rwsem);
3628117973fbSAlex Elder 
3629117973fbSAlex Elder 	/* Grab old order first, to see if it changes */
3630117973fbSAlex Elder 
3631117973fbSAlex Elder 	obj_order = rbd_dev->header.obj_order,
3632117973fbSAlex Elder 	ret = rbd_dev_v2_image_size(rbd_dev);
3633117973fbSAlex Elder 	if (ret)
3634117973fbSAlex Elder 		goto out;
3635117973fbSAlex Elder 	if (rbd_dev->header.obj_order != obj_order) {
3636117973fbSAlex Elder 		ret = -EIO;
3637117973fbSAlex Elder 		goto out;
3638117973fbSAlex Elder 	}
3639117973fbSAlex Elder 	rbd_update_mapping_size(rbd_dev);
3640117973fbSAlex Elder 
3641117973fbSAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev, hver);
3642117973fbSAlex Elder 	dout("rbd_dev_v2_snap_context returned %d\n", ret);
3643117973fbSAlex Elder 	if (ret)
3644117973fbSAlex Elder 		goto out;
3645117973fbSAlex Elder 	ret = rbd_dev_snaps_update(rbd_dev);
3646117973fbSAlex Elder 	dout("rbd_dev_snaps_update returned %d\n", ret);
3647117973fbSAlex Elder 	if (ret)
3648117973fbSAlex Elder 		goto out;
3649117973fbSAlex Elder 	ret = rbd_dev_snaps_register(rbd_dev);
3650117973fbSAlex Elder 	dout("rbd_dev_snaps_register returned %d\n", ret);
3651117973fbSAlex Elder out:
3652117973fbSAlex Elder 	up_write(&rbd_dev->header_rwsem);
3653117973fbSAlex Elder 
3654117973fbSAlex Elder 	return ret;
3655117973fbSAlex Elder }
3656117973fbSAlex Elder 
36579d475de5SAlex Elder /*
365835938150SAlex Elder  * Scan the rbd device's current snapshot list and compare it to the
365935938150SAlex Elder  * newly-received snapshot context.  Remove any existing snapshots
366035938150SAlex Elder  * not present in the new snapshot context.  Add a new snapshot for
366135938150SAlex Elder  * any snaphots in the snapshot context not in the current list.
366235938150SAlex Elder  * And verify there are no changes to snapshots we already know
366335938150SAlex Elder  * about.
366435938150SAlex Elder  *
366535938150SAlex Elder  * Assumes the snapshots in the snapshot context are sorted by
366635938150SAlex Elder  * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
366735938150SAlex Elder  * are also maintained in that order.)
3668dfc5606dSYehuda Sadeh  */
3669304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
3670dfc5606dSYehuda Sadeh {
367135938150SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
367235938150SAlex Elder 	const u32 snap_count = snapc->num_snaps;
367335938150SAlex Elder 	struct list_head *head = &rbd_dev->snaps;
367435938150SAlex Elder 	struct list_head *links = head->next;
367535938150SAlex Elder 	u32 index = 0;
3676dfc5606dSYehuda Sadeh 
36779fcbb800SAlex Elder 	dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
367835938150SAlex Elder 	while (index < snap_count || links != head) {
367935938150SAlex Elder 		u64 snap_id;
368035938150SAlex Elder 		struct rbd_snap *snap;
3681cd892126SAlex Elder 		char *snap_name;
3682cd892126SAlex Elder 		u64 snap_size = 0;
3683cd892126SAlex Elder 		u64 snap_features = 0;
3684dfc5606dSYehuda Sadeh 
368535938150SAlex Elder 		snap_id = index < snap_count ? snapc->snaps[index]
368635938150SAlex Elder 					     : CEPH_NOSNAP;
368735938150SAlex Elder 		snap = links != head ? list_entry(links, struct rbd_snap, node)
368835938150SAlex Elder 				     : NULL;
3689aafb230eSAlex Elder 		rbd_assert(!snap || snap->id != CEPH_NOSNAP);
3690dfc5606dSYehuda Sadeh 
369135938150SAlex Elder 		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
369235938150SAlex Elder 			struct list_head *next = links->next;
3693dfc5606dSYehuda Sadeh 
36946d292906SAlex Elder 			/*
36956d292906SAlex Elder 			 * A previously-existing snapshot is not in
36966d292906SAlex Elder 			 * the new snap context.
36976d292906SAlex Elder 			 *
36986d292906SAlex Elder 			 * If the now missing snapshot is the one the
36996d292906SAlex Elder 			 * image is mapped to, clear its exists flag
37006d292906SAlex Elder 			 * so we can avoid sending any more requests
37016d292906SAlex Elder 			 * to it.
37026d292906SAlex Elder 			 */
37030d7dbfceSAlex Elder 			if (rbd_dev->spec->snap_id == snap->id)
37046d292906SAlex Elder 				clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
370541f38c2bSAlex Elder 			rbd_remove_snap_dev(snap);
37069fcbb800SAlex Elder 			dout("%ssnap id %llu has been removed\n",
37070d7dbfceSAlex Elder 				rbd_dev->spec->snap_id == snap->id ?
37080d7dbfceSAlex Elder 							"mapped " : "",
37099fcbb800SAlex Elder 				(unsigned long long) snap->id);
3710dfc5606dSYehuda Sadeh 
371135938150SAlex Elder 			/* Done with this list entry; advance */
371235938150SAlex Elder 
371335938150SAlex Elder 			links = next;
371435938150SAlex Elder 			continue;
3715dfc5606dSYehuda Sadeh 		}
371635938150SAlex Elder 
3717b8b1e2dbSAlex Elder 		snap_name = rbd_dev_snap_info(rbd_dev, index,
3718cd892126SAlex Elder 					&snap_size, &snap_features);
3719cd892126SAlex Elder 		if (IS_ERR(snap_name))
3720cd892126SAlex Elder 			return PTR_ERR(snap_name);
3721cd892126SAlex Elder 
37229fcbb800SAlex Elder 		dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
37239fcbb800SAlex Elder 			(unsigned long long) snap_id);
372435938150SAlex Elder 		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
372535938150SAlex Elder 			struct rbd_snap *new_snap;
372635938150SAlex Elder 
372735938150SAlex Elder 			/* We haven't seen this snapshot before */
372835938150SAlex Elder 
3729c8d18425SAlex Elder 			new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
3730cd892126SAlex Elder 					snap_id, snap_size, snap_features);
37319fcbb800SAlex Elder 			if (IS_ERR(new_snap)) {
37329fcbb800SAlex Elder 				int err = PTR_ERR(new_snap);
37339fcbb800SAlex Elder 
37349fcbb800SAlex Elder 				dout("  failed to add dev, error %d\n", err);
37359fcbb800SAlex Elder 
37369fcbb800SAlex Elder 				return err;
37379fcbb800SAlex Elder 			}
373835938150SAlex Elder 
373935938150SAlex Elder 			/* New goes before existing, or at end of list */
374035938150SAlex Elder 
37419fcbb800SAlex Elder 			dout("  added dev%s\n", snap ? "" : " at end\n");
374235938150SAlex Elder 			if (snap)
374335938150SAlex Elder 				list_add_tail(&new_snap->node, &snap->node);
374435938150SAlex Elder 			else
3745523f3258SAlex Elder 				list_add_tail(&new_snap->node, head);
374635938150SAlex Elder 		} else {
374735938150SAlex Elder 			/* Already have this one */
374835938150SAlex Elder 
37499fcbb800SAlex Elder 			dout("  already present\n");
37509fcbb800SAlex Elder 
3751cd892126SAlex Elder 			rbd_assert(snap->size == snap_size);
3752aafb230eSAlex Elder 			rbd_assert(!strcmp(snap->name, snap_name));
3753cd892126SAlex Elder 			rbd_assert(snap->features == snap_features);
375435938150SAlex Elder 
375535938150SAlex Elder 			/* Done with this list entry; advance */
375635938150SAlex Elder 
375735938150SAlex Elder 			links = links->next;
3758dfc5606dSYehuda Sadeh 		}
375935938150SAlex Elder 
376035938150SAlex Elder 		/* Advance to the next entry in the snapshot context */
376135938150SAlex Elder 
376235938150SAlex Elder 		index++;
3763dfc5606dSYehuda Sadeh 	}
37649fcbb800SAlex Elder 	dout("%s: done\n", __func__);
3765dfc5606dSYehuda Sadeh 
3766dfc5606dSYehuda Sadeh 	return 0;
3767dfc5606dSYehuda Sadeh }
3768dfc5606dSYehuda Sadeh 
3769304f6808SAlex Elder /*
3770304f6808SAlex Elder  * Scan the list of snapshots and register the devices for any that
3771304f6808SAlex Elder  * have not already been registered.
3772304f6808SAlex Elder  */
3773304f6808SAlex Elder static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
3774304f6808SAlex Elder {
3775304f6808SAlex Elder 	struct rbd_snap *snap;
3776304f6808SAlex Elder 	int ret = 0;
3777304f6808SAlex Elder 
377837206ee5SAlex Elder 	dout("%s:\n", __func__);
377986ff77bbSAlex Elder 	if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
378086ff77bbSAlex Elder 		return -EIO;
3781304f6808SAlex Elder 
3782304f6808SAlex Elder 	list_for_each_entry(snap, &rbd_dev->snaps, node) {
3783304f6808SAlex Elder 		if (!rbd_snap_registered(snap)) {
3784304f6808SAlex Elder 			ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
3785304f6808SAlex Elder 			if (ret < 0)
3786304f6808SAlex Elder 				break;
3787304f6808SAlex Elder 		}
3788304f6808SAlex Elder 	}
3789304f6808SAlex Elder 	dout("%s: returning %d\n", __func__, ret);
3790304f6808SAlex Elder 
3791304f6808SAlex Elder 	return ret;
3792304f6808SAlex Elder }
3793304f6808SAlex Elder 
3794dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
3795dfc5606dSYehuda Sadeh {
3796dfc5606dSYehuda Sadeh 	struct device *dev;
3797cd789ab9SAlex Elder 	int ret;
3798dfc5606dSYehuda Sadeh 
3799dfc5606dSYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3800dfc5606dSYehuda Sadeh 
3801cd789ab9SAlex Elder 	dev = &rbd_dev->dev;
3802dfc5606dSYehuda Sadeh 	dev->bus = &rbd_bus_type;
3803dfc5606dSYehuda Sadeh 	dev->type = &rbd_device_type;
3804dfc5606dSYehuda Sadeh 	dev->parent = &rbd_root_dev;
3805dfc5606dSYehuda Sadeh 	dev->release = rbd_dev_release;
3806de71a297SAlex Elder 	dev_set_name(dev, "%d", rbd_dev->dev_id);
3807dfc5606dSYehuda Sadeh 	ret = device_register(dev);
3808dfc5606dSYehuda Sadeh 
3809dfc5606dSYehuda Sadeh 	mutex_unlock(&ctl_mutex);
3810cd789ab9SAlex Elder 
3811dfc5606dSYehuda Sadeh 	return ret;
3812602adf40SYehuda Sadeh }
3813602adf40SYehuda Sadeh 
3814dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
3815dfc5606dSYehuda Sadeh {
3816dfc5606dSYehuda Sadeh 	device_unregister(&rbd_dev->dev);
3817dfc5606dSYehuda Sadeh }
3818dfc5606dSYehuda Sadeh 
3819e2839308SAlex Elder static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
38201ddbe94eSAlex Elder 
38211ddbe94eSAlex Elder /*
3822499afd5bSAlex Elder  * Get a unique rbd identifier for the given new rbd_dev, and add
3823499afd5bSAlex Elder  * the rbd_dev to the global list.  The minimum rbd id is 1.
38241ddbe94eSAlex Elder  */
3825e2839308SAlex Elder static void rbd_dev_id_get(struct rbd_device *rbd_dev)
3826b7f23c36SAlex Elder {
3827e2839308SAlex Elder 	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
3828499afd5bSAlex Elder 
3829499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
3830499afd5bSAlex Elder 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
3831499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
3832e2839308SAlex Elder 	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
3833e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
3834b7f23c36SAlex Elder }
3835b7f23c36SAlex Elder 
38361ddbe94eSAlex Elder /*
3837499afd5bSAlex Elder  * Remove an rbd_dev from the global list, and record that its
3838499afd5bSAlex Elder  * identifier is no longer in use.
38391ddbe94eSAlex Elder  */
3840e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev)
38411ddbe94eSAlex Elder {
3842d184f6bfSAlex Elder 	struct list_head *tmp;
3843de71a297SAlex Elder 	int rbd_id = rbd_dev->dev_id;
3844d184f6bfSAlex Elder 	int max_id;
3845d184f6bfSAlex Elder 
3846aafb230eSAlex Elder 	rbd_assert(rbd_id > 0);
3847499afd5bSAlex Elder 
3848e2839308SAlex Elder 	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
3849e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
3850499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
3851499afd5bSAlex Elder 	list_del_init(&rbd_dev->node);
3852d184f6bfSAlex Elder 
3853d184f6bfSAlex Elder 	/*
3854d184f6bfSAlex Elder 	 * If the id being "put" is not the current maximum, there
3855d184f6bfSAlex Elder 	 * is nothing special we need to do.
3856d184f6bfSAlex Elder 	 */
3857e2839308SAlex Elder 	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
3858d184f6bfSAlex Elder 		spin_unlock(&rbd_dev_list_lock);
3859d184f6bfSAlex Elder 		return;
3860d184f6bfSAlex Elder 	}
3861d184f6bfSAlex Elder 
3862d184f6bfSAlex Elder 	/*
3863d184f6bfSAlex Elder 	 * We need to update the current maximum id.  Search the
3864d184f6bfSAlex Elder 	 * list to find out what it is.  We're more likely to find
3865d184f6bfSAlex Elder 	 * the maximum at the end, so search the list backward.
3866d184f6bfSAlex Elder 	 */
3867d184f6bfSAlex Elder 	max_id = 0;
3868d184f6bfSAlex Elder 	list_for_each_prev(tmp, &rbd_dev_list) {
3869d184f6bfSAlex Elder 		struct rbd_device *rbd_dev;
3870d184f6bfSAlex Elder 
3871d184f6bfSAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
3872b213e0b1SAlex Elder 		if (rbd_dev->dev_id > max_id)
3873b213e0b1SAlex Elder 			max_id = rbd_dev->dev_id;
3874d184f6bfSAlex Elder 	}
3875499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
38761ddbe94eSAlex Elder 
38771ddbe94eSAlex Elder 	/*
3878e2839308SAlex Elder 	 * The max id could have been updated by rbd_dev_id_get(), in
3879d184f6bfSAlex Elder 	 * which case it now accurately reflects the new maximum.
3880d184f6bfSAlex Elder 	 * Be careful not to overwrite the maximum value in that
3881d184f6bfSAlex Elder 	 * case.
38821ddbe94eSAlex Elder 	 */
3883e2839308SAlex Elder 	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
3884e2839308SAlex Elder 	dout("  max dev id has been reset\n");
3885b7f23c36SAlex Elder }
3886b7f23c36SAlex Elder 
/*
 * Advance *buf past any leading white space so that it points at the
 * first non-space character (or at the terminating '\0' if there is
 * none), and return the length of the token -- the run of non-space
 * characters -- that starts there.  *buf must be '\0'-terminated.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * Exactly the characters for which isspace() is nonzero in
	 * the "C" and "POSIX" locales.
	 */
	const char *const whitespace = " \f\n\r\t\v";
	const char *start = *buf;

	start += strspn(start, whitespace);	/* Skip to token start */
	*buf = start;

	return strcspn(start, whitespace);	/* Token length */
}
3905e28fff26SAlex Elder 
/*
 * Locate the next token in *buf and, provided it fits, copy it into
 * the caller's token buffer followed by a terminating '\0'.  *buf
 * must be '\0'-terminated on entry.
 *
 * Returns the token length (excluding the '\0').  A return of 0
 * means no token was found; a return >= token_size means the token
 * did not fit and nothing was written to the token buffer.
 *
 * In every case *buf is left pointing just past the found token,
 * even when the token was too long to copy.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	const size_t token_len = next_token(buf);
	const char *start = *buf;

	*buf = start + token_len;
	if (token_len >= token_size)
		return token_len;	/* Too big (or no room at all) */

	memcpy(token, start, token_len);
	token[token_len] = '\0';

	return token_len;
}
3935e28fff26SAlex Elder 
3936e28fff26SAlex Elder /*
3937ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
3938ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
3939ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
3940ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
3941ea3352f4SAlex Elder  *
3942ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
3943ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
3944ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
3945ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
3946ea3352f4SAlex Elder  *
3947ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
3948ea3352f4SAlex Elder  * the end of the found token.
3949ea3352f4SAlex Elder  *
3950ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
3951ea3352f4SAlex Elder  */
3952ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
3953ea3352f4SAlex Elder {
3954ea3352f4SAlex Elder 	char *dup;
3955ea3352f4SAlex Elder 	size_t len;
3956ea3352f4SAlex Elder 
3957ea3352f4SAlex Elder 	len = next_token(buf);
39584caf35f9SAlex Elder 	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
3959ea3352f4SAlex Elder 	if (!dup)
3960ea3352f4SAlex Elder 		return NULL;
3961ea3352f4SAlex Elder 	*(dup + len) = '\0';
3962ea3352f4SAlex Elder 	*buf += len;
3963ea3352f4SAlex Elder 
3964ea3352f4SAlex Elder 	if (lenp)
3965ea3352f4SAlex Elder 		*lenp = len;
3966ea3352f4SAlex Elder 
3967ea3352f4SAlex Elder 	return dup;
3968ea3352f4SAlex Elder }
3969ea3352f4SAlex Elder 
3970ea3352f4SAlex Elder /*
3971859c31dfSAlex Elder  * Parse the options provided for an "rbd add" (i.e., rbd image
3972859c31dfSAlex Elder  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
3973859c31dfSAlex Elder  * and the data written is passed here via a NUL-terminated buffer.
3974859c31dfSAlex Elder  * Returns 0 if successful or an error code otherwise.
3975d22f76e7SAlex Elder  *
3976859c31dfSAlex Elder  * The information extracted from these options is recorded in
3977859c31dfSAlex Elder  * the other parameters which return dynamically-allocated
3978859c31dfSAlex Elder  * structures:
3979859c31dfSAlex Elder  *  ceph_opts
3980859c31dfSAlex Elder  *      The address of a pointer that will refer to a ceph options
3981859c31dfSAlex Elder  *      structure.  Caller must release the returned pointer using
3982859c31dfSAlex Elder  *      ceph_destroy_options() when it is no longer needed.
3983859c31dfSAlex Elder  *  rbd_opts
3984859c31dfSAlex Elder  *	Address of an rbd options pointer.  Fully initialized by
3985859c31dfSAlex Elder  *	this function; caller must release with kfree().
3986859c31dfSAlex Elder  *  spec
3987859c31dfSAlex Elder  *	Address of an rbd image specification pointer.  Fully
3988859c31dfSAlex Elder  *	initialized by this function based on parsed options.
3989859c31dfSAlex Elder  *	Caller must release with rbd_spec_put().
3990859c31dfSAlex Elder  *
3991859c31dfSAlex Elder  * The options passed take this form:
3992859c31dfSAlex Elder  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
3993859c31dfSAlex Elder  * where:
3994859c31dfSAlex Elder  *  <mon_addrs>
3995859c31dfSAlex Elder  *      A comma-separated list of one or more monitor addresses.
3996859c31dfSAlex Elder  *      A monitor address is an ip address, optionally followed
3997859c31dfSAlex Elder  *      by a port number (separated by a colon).
3998859c31dfSAlex Elder  *        I.e.:  ip1[:port1][,ip2[:port2]...]
3999859c31dfSAlex Elder  *  <options>
4000859c31dfSAlex Elder  *      A comma-separated list of ceph and/or rbd options.
4001859c31dfSAlex Elder  *  <pool_name>
4002859c31dfSAlex Elder  *      The name of the rados pool containing the rbd image.
4003859c31dfSAlex Elder  *  <image_name>
4004859c31dfSAlex Elder  *      The name of the image in that pool to map.
4005859c31dfSAlex Elder  *  <snap_id>
4006859c31dfSAlex Elder  *      An optional snapshot id.  If provided, the mapping will
4007859c31dfSAlex Elder  *      present data from the image at the time that snapshot was
4008859c31dfSAlex Elder  *      created.  The image head is used if no snapshot id is
4009859c31dfSAlex Elder  *      provided.  Snapshot mappings are always read-only.
4010a725f65eSAlex Elder  */
4011859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf,
4012dc79b113SAlex Elder 				struct ceph_options **ceph_opts,
4013859c31dfSAlex Elder 				struct rbd_options **opts,
4014859c31dfSAlex Elder 				struct rbd_spec **rbd_spec)
4015a725f65eSAlex Elder {
4016e28fff26SAlex Elder 	size_t len;
4017859c31dfSAlex Elder 	char *options;
40180ddebc0cSAlex Elder 	const char *mon_addrs;
40190ddebc0cSAlex Elder 	size_t mon_addrs_size;
4020859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
40214e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
4022859c31dfSAlex Elder 	struct ceph_options *copts;
4023dc79b113SAlex Elder 	int ret;
4024e28fff26SAlex Elder 
4025e28fff26SAlex Elder 	/* The first four tokens are required */
4026e28fff26SAlex Elder 
40277ef3214aSAlex Elder 	len = next_token(&buf);
40284fb5d671SAlex Elder 	if (!len) {
40294fb5d671SAlex Elder 		rbd_warn(NULL, "no monitor address(es) provided");
40304fb5d671SAlex Elder 		return -EINVAL;
40314fb5d671SAlex Elder 	}
40320ddebc0cSAlex Elder 	mon_addrs = buf;
4033f28e565aSAlex Elder 	mon_addrs_size = len + 1;
40347ef3214aSAlex Elder 	buf += len;
4035a725f65eSAlex Elder 
4036dc79b113SAlex Elder 	ret = -EINVAL;
4037f28e565aSAlex Elder 	options = dup_token(&buf, NULL);
4038f28e565aSAlex Elder 	if (!options)
4039dc79b113SAlex Elder 		return -ENOMEM;
40404fb5d671SAlex Elder 	if (!*options) {
40414fb5d671SAlex Elder 		rbd_warn(NULL, "no options provided");
40424fb5d671SAlex Elder 		goto out_err;
40434fb5d671SAlex Elder 	}
4044a725f65eSAlex Elder 
4045859c31dfSAlex Elder 	spec = rbd_spec_alloc();
4046859c31dfSAlex Elder 	if (!spec)
4047f28e565aSAlex Elder 		goto out_mem;
4048859c31dfSAlex Elder 
4049859c31dfSAlex Elder 	spec->pool_name = dup_token(&buf, NULL);
4050859c31dfSAlex Elder 	if (!spec->pool_name)
4051859c31dfSAlex Elder 		goto out_mem;
40524fb5d671SAlex Elder 	if (!*spec->pool_name) {
40534fb5d671SAlex Elder 		rbd_warn(NULL, "no pool name provided");
40544fb5d671SAlex Elder 		goto out_err;
40554fb5d671SAlex Elder 	}
4056e28fff26SAlex Elder 
405769e7a02fSAlex Elder 	spec->image_name = dup_token(&buf, NULL);
4058859c31dfSAlex Elder 	if (!spec->image_name)
4059f28e565aSAlex Elder 		goto out_mem;
40604fb5d671SAlex Elder 	if (!*spec->image_name) {
40614fb5d671SAlex Elder 		rbd_warn(NULL, "no image name provided");
40624fb5d671SAlex Elder 		goto out_err;
40634fb5d671SAlex Elder 	}
4064e28fff26SAlex Elder 
4065f28e565aSAlex Elder 	/*
4066f28e565aSAlex Elder 	 * Snapshot name is optional; default is to use "-"
4067f28e565aSAlex Elder 	 * (indicating the head/no snapshot).
4068f28e565aSAlex Elder 	 */
40693feeb894SAlex Elder 	len = next_token(&buf);
4070820a5f3eSAlex Elder 	if (!len) {
40713feeb894SAlex Elder 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
40723feeb894SAlex Elder 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
4073f28e565aSAlex Elder 	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
4074dc79b113SAlex Elder 		ret = -ENAMETOOLONG;
4075f28e565aSAlex Elder 		goto out_err;
4076849b4260SAlex Elder 	}
40774caf35f9SAlex Elder 	spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
4078859c31dfSAlex Elder 	if (!spec->snap_name)
4079f28e565aSAlex Elder 		goto out_mem;
4080859c31dfSAlex Elder 	*(spec->snap_name + len) = '\0';
4081e5c35534SAlex Elder 
40820ddebc0cSAlex Elder 	/* Initialize all rbd options to the defaults */
4083e28fff26SAlex Elder 
40844e9afebaSAlex Elder 	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
40854e9afebaSAlex Elder 	if (!rbd_opts)
40864e9afebaSAlex Elder 		goto out_mem;
40874e9afebaSAlex Elder 
40884e9afebaSAlex Elder 	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
4089d22f76e7SAlex Elder 
4090859c31dfSAlex Elder 	copts = ceph_parse_options(options, mon_addrs,
40910ddebc0cSAlex Elder 					mon_addrs + mon_addrs_size - 1,
40924e9afebaSAlex Elder 					parse_rbd_opts_token, rbd_opts);
4093859c31dfSAlex Elder 	if (IS_ERR(copts)) {
4094859c31dfSAlex Elder 		ret = PTR_ERR(copts);
4095dc79b113SAlex Elder 		goto out_err;
4096dc79b113SAlex Elder 	}
4097859c31dfSAlex Elder 	kfree(options);
4098859c31dfSAlex Elder 
4099859c31dfSAlex Elder 	*ceph_opts = copts;
41004e9afebaSAlex Elder 	*opts = rbd_opts;
4101859c31dfSAlex Elder 	*rbd_spec = spec;
41020ddebc0cSAlex Elder 
4103dc79b113SAlex Elder 	return 0;
4104f28e565aSAlex Elder out_mem:
4105dc79b113SAlex Elder 	ret = -ENOMEM;
4106d22f76e7SAlex Elder out_err:
4107859c31dfSAlex Elder 	kfree(rbd_opts);
4108859c31dfSAlex Elder 	rbd_spec_put(spec);
4109f28e565aSAlex Elder 	kfree(options);
4110d22f76e7SAlex Elder 
4111dc79b113SAlex Elder 	return ret;
4112a725f65eSAlex Elder }
4113a725f65eSAlex Elder 
4114589d30e0SAlex Elder /*
4115589d30e0SAlex Elder  * An rbd format 2 image has a unique identifier, distinct from the
4116589d30e0SAlex Elder  * name given to it by the user.  Internally, that identifier is
4117589d30e0SAlex Elder  * what's used to specify the names of objects related to the image.
4118589d30e0SAlex Elder  *
4119589d30e0SAlex Elder  * A special "rbd id" object is used to map an rbd image name to its
4120589d30e0SAlex Elder  * id.  If that object doesn't exist, then there is no v2 rbd image
4121589d30e0SAlex Elder  * with the supplied name.
4122589d30e0SAlex Elder  *
4123589d30e0SAlex Elder  * This function will record the given rbd_dev's image_id field if
4124589d30e0SAlex Elder  * it can be determined, and in that case will return 0.  If any
4125589d30e0SAlex Elder  * errors occur a negative errno will be returned and the rbd_dev's
4126589d30e0SAlex Elder  * image_id field will be unchanged (and should be NULL).
4127589d30e0SAlex Elder  */
4128589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev)
4129589d30e0SAlex Elder {
4130589d30e0SAlex Elder 	int ret;
4131589d30e0SAlex Elder 	size_t size;
4132589d30e0SAlex Elder 	char *object_name;
4133589d30e0SAlex Elder 	void *response;
4134589d30e0SAlex Elder 	void *p;
4135589d30e0SAlex Elder 
41362f82ee54SAlex Elder 	/* If we already have it we don't need to look it up */
41372f82ee54SAlex Elder 
41382f82ee54SAlex Elder 	if (rbd_dev->spec->image_id)
41392f82ee54SAlex Elder 		return 0;
41402f82ee54SAlex Elder 
4141589d30e0SAlex Elder 	/*
41422c0d0a10SAlex Elder 	 * When probing a parent image, the image id is already
41432c0d0a10SAlex Elder 	 * known (and the image name likely is not).  There's no
41442c0d0a10SAlex Elder 	 * need to fetch the image id again in this case.
41452c0d0a10SAlex Elder 	 */
41462c0d0a10SAlex Elder 	if (rbd_dev->spec->image_id)
41472c0d0a10SAlex Elder 		return 0;
41482c0d0a10SAlex Elder 
41492c0d0a10SAlex Elder 	/*
4150589d30e0SAlex Elder 	 * First, see if the format 2 image id file exists, and if
4151589d30e0SAlex Elder 	 * so, get the image's persistent id from it.
4152589d30e0SAlex Elder 	 */
415369e7a02fSAlex Elder 	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
4154589d30e0SAlex Elder 	object_name = kmalloc(size, GFP_NOIO);
4155589d30e0SAlex Elder 	if (!object_name)
4156589d30e0SAlex Elder 		return -ENOMEM;
41570d7dbfceSAlex Elder 	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
4158589d30e0SAlex Elder 	dout("rbd id object name is %s\n", object_name);
4159589d30e0SAlex Elder 
4160589d30e0SAlex Elder 	/* Response will be an encoded string, which includes a length */
4161589d30e0SAlex Elder 
4162589d30e0SAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
4163589d30e0SAlex Elder 	response = kzalloc(size, GFP_NOIO);
4164589d30e0SAlex Elder 	if (!response) {
4165589d30e0SAlex Elder 		ret = -ENOMEM;
4166589d30e0SAlex Elder 		goto out;
4167589d30e0SAlex Elder 	}
4168589d30e0SAlex Elder 
416936be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, object_name,
4170589d30e0SAlex Elder 				"rbd", "get_id",
4171589d30e0SAlex Elder 				NULL, 0,
417207b2391fSAlex Elder 				response, RBD_IMAGE_ID_LEN_MAX, NULL);
417336be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4174589d30e0SAlex Elder 	if (ret < 0)
4175589d30e0SAlex Elder 		goto out;
4176589d30e0SAlex Elder 
4177589d30e0SAlex Elder 	p = response;
41780d7dbfceSAlex Elder 	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
4179589d30e0SAlex Elder 						p + RBD_IMAGE_ID_LEN_MAX,
4180979ed480SAlex Elder 						NULL, GFP_NOIO);
41810d7dbfceSAlex Elder 	if (IS_ERR(rbd_dev->spec->image_id)) {
41820d7dbfceSAlex Elder 		ret = PTR_ERR(rbd_dev->spec->image_id);
41830d7dbfceSAlex Elder 		rbd_dev->spec->image_id = NULL;
4184589d30e0SAlex Elder 	} else {
41850d7dbfceSAlex Elder 		dout("image_id is %s\n", rbd_dev->spec->image_id);
4186589d30e0SAlex Elder 	}
4187589d30e0SAlex Elder out:
4188589d30e0SAlex Elder 	kfree(response);
4189589d30e0SAlex Elder 	kfree(object_name);
4190589d30e0SAlex Elder 
4191589d30e0SAlex Elder 	return ret;
4192589d30e0SAlex Elder }
4193589d30e0SAlex Elder 
4194a30b71b9SAlex Elder static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
4195a30b71b9SAlex Elder {
4196a30b71b9SAlex Elder 	int ret;
4197a30b71b9SAlex Elder 	size_t size;
4198a30b71b9SAlex Elder 
4199a30b71b9SAlex Elder 	/* Version 1 images have no id; empty string is used */
4200a30b71b9SAlex Elder 
42010d7dbfceSAlex Elder 	rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
42020d7dbfceSAlex Elder 	if (!rbd_dev->spec->image_id)
4203a30b71b9SAlex Elder 		return -ENOMEM;
4204a30b71b9SAlex Elder 
4205a30b71b9SAlex Elder 	/* Record the header object name for this rbd image. */
4206a30b71b9SAlex Elder 
420769e7a02fSAlex Elder 	size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
4208a30b71b9SAlex Elder 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
4209a30b71b9SAlex Elder 	if (!rbd_dev->header_name) {
4210a30b71b9SAlex Elder 		ret = -ENOMEM;
4211a30b71b9SAlex Elder 		goto out_err;
4212a30b71b9SAlex Elder 	}
42130d7dbfceSAlex Elder 	sprintf(rbd_dev->header_name, "%s%s",
42140d7dbfceSAlex Elder 		rbd_dev->spec->image_name, RBD_SUFFIX);
4215a30b71b9SAlex Elder 
4216a30b71b9SAlex Elder 	/* Populate rbd image metadata */
4217a30b71b9SAlex Elder 
4218a30b71b9SAlex Elder 	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
4219a30b71b9SAlex Elder 	if (ret < 0)
4220a30b71b9SAlex Elder 		goto out_err;
422186b00e0dSAlex Elder 
422286b00e0dSAlex Elder 	/* Version 1 images have no parent (no layering) */
422386b00e0dSAlex Elder 
422486b00e0dSAlex Elder 	rbd_dev->parent_spec = NULL;
422586b00e0dSAlex Elder 	rbd_dev->parent_overlap = 0;
422686b00e0dSAlex Elder 
4227a30b71b9SAlex Elder 	rbd_dev->image_format = 1;
4228a30b71b9SAlex Elder 
4229a30b71b9SAlex Elder 	dout("discovered version 1 image, header name is %s\n",
4230a30b71b9SAlex Elder 		rbd_dev->header_name);
4231a30b71b9SAlex Elder 
4232a30b71b9SAlex Elder 	return 0;
4233a30b71b9SAlex Elder 
4234a30b71b9SAlex Elder out_err:
4235a30b71b9SAlex Elder 	kfree(rbd_dev->header_name);
4236a30b71b9SAlex Elder 	rbd_dev->header_name = NULL;
42370d7dbfceSAlex Elder 	kfree(rbd_dev->spec->image_id);
42380d7dbfceSAlex Elder 	rbd_dev->spec->image_id = NULL;
4239a30b71b9SAlex Elder 
4240a30b71b9SAlex Elder 	return ret;
4241a30b71b9SAlex Elder }
4242a30b71b9SAlex Elder 
4243a30b71b9SAlex Elder static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
4244a30b71b9SAlex Elder {
4245a30b71b9SAlex Elder 	size_t size;
42469d475de5SAlex Elder 	int ret;
42476e14b1a6SAlex Elder 	u64 ver = 0;
4248a30b71b9SAlex Elder 
4249a30b71b9SAlex Elder 	/*
4250a30b71b9SAlex Elder 	 * Image id was filled in by the caller.  Record the header
4251a30b71b9SAlex Elder 	 * object name for this rbd image.
4252a30b71b9SAlex Elder 	 */
4253979ed480SAlex Elder 	size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
4254a30b71b9SAlex Elder 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
4255a30b71b9SAlex Elder 	if (!rbd_dev->header_name)
4256a30b71b9SAlex Elder 		return -ENOMEM;
4257a30b71b9SAlex Elder 	sprintf(rbd_dev->header_name, "%s%s",
42580d7dbfceSAlex Elder 			RBD_HEADER_PREFIX, rbd_dev->spec->image_id);
42599d475de5SAlex Elder 
42609d475de5SAlex Elder 	/* Get the size and object order for the image */
42619d475de5SAlex Elder 
42629d475de5SAlex Elder 	ret = rbd_dev_v2_image_size(rbd_dev);
42639d475de5SAlex Elder 	if (ret < 0)
42649d475de5SAlex Elder 		goto out_err;
42651e130199SAlex Elder 
42661e130199SAlex Elder 	/* Get the object prefix (a.k.a. block_name) for the image */
42671e130199SAlex Elder 
42681e130199SAlex Elder 	ret = rbd_dev_v2_object_prefix(rbd_dev);
42691e130199SAlex Elder 	if (ret < 0)
42701e130199SAlex Elder 		goto out_err;
4271b1b5402aSAlex Elder 
4272d889140cSAlex Elder 	/* Get the and check features for the image */
4273b1b5402aSAlex Elder 
4274b1b5402aSAlex Elder 	ret = rbd_dev_v2_features(rbd_dev);
4275b1b5402aSAlex Elder 	if (ret < 0)
4276b1b5402aSAlex Elder 		goto out_err;
427735d489f9SAlex Elder 
427886b00e0dSAlex Elder 	/* If the image supports layering, get the parent info */
427986b00e0dSAlex Elder 
428086b00e0dSAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
428186b00e0dSAlex Elder 		ret = rbd_dev_v2_parent_info(rbd_dev);
428286b00e0dSAlex Elder 		if (ret < 0)
428386b00e0dSAlex Elder 			goto out_err;
428486b00e0dSAlex Elder 	}
428586b00e0dSAlex Elder 
42866e14b1a6SAlex Elder 	/* crypto and compression type aren't (yet) supported for v2 images */
428735d489f9SAlex Elder 
42886e14b1a6SAlex Elder 	rbd_dev->header.crypt_type = 0;
42896e14b1a6SAlex Elder 	rbd_dev->header.comp_type = 0;
42906e14b1a6SAlex Elder 
42916e14b1a6SAlex Elder 	/* Get the snapshot context, plus the header version */
42926e14b1a6SAlex Elder 
42936e14b1a6SAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
429435d489f9SAlex Elder 	if (ret)
429535d489f9SAlex Elder 		goto out_err;
42966e14b1a6SAlex Elder 	rbd_dev->header.obj_version = ver;
42976e14b1a6SAlex Elder 
4298a30b71b9SAlex Elder 	rbd_dev->image_format = 2;
4299a30b71b9SAlex Elder 
4300a30b71b9SAlex Elder 	dout("discovered version 2 image, header name is %s\n",
4301a30b71b9SAlex Elder 		rbd_dev->header_name);
4302a30b71b9SAlex Elder 
430335152979SAlex Elder 	return 0;
43049d475de5SAlex Elder out_err:
430586b00e0dSAlex Elder 	rbd_dev->parent_overlap = 0;
430686b00e0dSAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
430786b00e0dSAlex Elder 	rbd_dev->parent_spec = NULL;
43089d475de5SAlex Elder 	kfree(rbd_dev->header_name);
43099d475de5SAlex Elder 	rbd_dev->header_name = NULL;
43101e130199SAlex Elder 	kfree(rbd_dev->header.object_prefix);
43111e130199SAlex Elder 	rbd_dev->header.object_prefix = NULL;
43129d475de5SAlex Elder 
43139d475de5SAlex Elder 	return ret;
4314a30b71b9SAlex Elder }
4315a30b71b9SAlex Elder 
431683a06263SAlex Elder static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
431783a06263SAlex Elder {
43182f82ee54SAlex Elder 	struct rbd_device *parent = NULL;
43192f82ee54SAlex Elder 	struct rbd_spec *parent_spec = NULL;
43202f82ee54SAlex Elder 	struct rbd_client *rbdc = NULL;
432183a06263SAlex Elder 	int ret;
432283a06263SAlex Elder 
432383a06263SAlex Elder 	/* no need to lock here, as rbd_dev is not registered yet */
432483a06263SAlex Elder 	ret = rbd_dev_snaps_update(rbd_dev);
432583a06263SAlex Elder 	if (ret)
432683a06263SAlex Elder 		return ret;
432783a06263SAlex Elder 
43289e15b77dSAlex Elder 	ret = rbd_dev_probe_update_spec(rbd_dev);
43299e15b77dSAlex Elder 	if (ret)
43309e15b77dSAlex Elder 		goto err_out_snaps;
43319e15b77dSAlex Elder 
433283a06263SAlex Elder 	ret = rbd_dev_set_mapping(rbd_dev);
433383a06263SAlex Elder 	if (ret)
433483a06263SAlex Elder 		goto err_out_snaps;
433583a06263SAlex Elder 
433683a06263SAlex Elder 	/* generate unique id: find highest unique id, add one */
433783a06263SAlex Elder 	rbd_dev_id_get(rbd_dev);
433883a06263SAlex Elder 
433983a06263SAlex Elder 	/* Fill in the device name, now that we have its id. */
434083a06263SAlex Elder 	BUILD_BUG_ON(DEV_NAME_LEN
434183a06263SAlex Elder 			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
434283a06263SAlex Elder 	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
434383a06263SAlex Elder 
434483a06263SAlex Elder 	/* Get our block major device number. */
434583a06263SAlex Elder 
434683a06263SAlex Elder 	ret = register_blkdev(0, rbd_dev->name);
434783a06263SAlex Elder 	if (ret < 0)
434883a06263SAlex Elder 		goto err_out_id;
434983a06263SAlex Elder 	rbd_dev->major = ret;
435083a06263SAlex Elder 
435183a06263SAlex Elder 	/* Set up the blkdev mapping. */
435283a06263SAlex Elder 
435383a06263SAlex Elder 	ret = rbd_init_disk(rbd_dev);
435483a06263SAlex Elder 	if (ret)
435583a06263SAlex Elder 		goto err_out_blkdev;
435683a06263SAlex Elder 
435783a06263SAlex Elder 	ret = rbd_bus_add_dev(rbd_dev);
435883a06263SAlex Elder 	if (ret)
435983a06263SAlex Elder 		goto err_out_disk;
436083a06263SAlex Elder 
436183a06263SAlex Elder 	/*
436283a06263SAlex Elder 	 * At this point cleanup in the event of an error is the job
436383a06263SAlex Elder 	 * of the sysfs code (initiated by rbd_bus_del_dev()).
436483a06263SAlex Elder 	 */
43652f82ee54SAlex Elder 	/* Probe the parent if there is one */
43662f82ee54SAlex Elder 
43672f82ee54SAlex Elder 	if (rbd_dev->parent_spec) {
43682f82ee54SAlex Elder 		/*
43692f82ee54SAlex Elder 		 * We need to pass a reference to the client and the
43702f82ee54SAlex Elder 		 * parent spec when creating the parent rbd_dev.
43712f82ee54SAlex Elder 		 * Images related by parent/child relationships
43722f82ee54SAlex Elder 		 * always share both.
43732f82ee54SAlex Elder 		 */
43742f82ee54SAlex Elder 		parent_spec = rbd_spec_get(rbd_dev->parent_spec);
43752f82ee54SAlex Elder 		rbdc = __rbd_get_client(rbd_dev->rbd_client);
43762f82ee54SAlex Elder 
43772f82ee54SAlex Elder 		parent = rbd_dev_create(rbdc, parent_spec);
43782f82ee54SAlex Elder 		if (!parent) {
43792f82ee54SAlex Elder 			ret = -ENOMEM;
43802f82ee54SAlex Elder 			goto err_out_spec;
43812f82ee54SAlex Elder 		}
43822f82ee54SAlex Elder 		rbdc = NULL;		/* parent now owns reference */
43832f82ee54SAlex Elder 		parent_spec = NULL;	/* parent now owns reference */
43842f82ee54SAlex Elder 		ret = rbd_dev_probe(parent);
43852f82ee54SAlex Elder 		if (ret < 0)
43862f82ee54SAlex Elder 			goto err_out_parent;
43872f82ee54SAlex Elder 		rbd_dev->parent = parent;
43882f82ee54SAlex Elder 	}
43892f82ee54SAlex Elder 
439083a06263SAlex Elder 	down_write(&rbd_dev->header_rwsem);
439183a06263SAlex Elder 	ret = rbd_dev_snaps_register(rbd_dev);
439283a06263SAlex Elder 	up_write(&rbd_dev->header_rwsem);
439383a06263SAlex Elder 	if (ret)
439483a06263SAlex Elder 		goto err_out_bus;
439583a06263SAlex Elder 
43969969ebc5SAlex Elder 	ret = rbd_dev_header_watch_sync(rbd_dev, 1);
439783a06263SAlex Elder 	if (ret)
439883a06263SAlex Elder 		goto err_out_bus;
439983a06263SAlex Elder 
440083a06263SAlex Elder 	/* Everything's ready.  Announce the disk to the world. */
440183a06263SAlex Elder 
440283a06263SAlex Elder 	add_disk(rbd_dev->disk);
440383a06263SAlex Elder 
440483a06263SAlex Elder 	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
440583a06263SAlex Elder 		(unsigned long long) rbd_dev->mapping.size);
440683a06263SAlex Elder 
440783a06263SAlex Elder 	return ret;
44082f82ee54SAlex Elder 
44092f82ee54SAlex Elder err_out_parent:
44102f82ee54SAlex Elder 	rbd_dev_destroy(parent);
44112f82ee54SAlex Elder err_out_spec:
44122f82ee54SAlex Elder 	rbd_spec_put(parent_spec);
44132f82ee54SAlex Elder 	rbd_put_client(rbdc);
441483a06263SAlex Elder err_out_bus:
441583a06263SAlex Elder 	/* this will also clean up rest of rbd_dev stuff */
441683a06263SAlex Elder 
441783a06263SAlex Elder 	rbd_bus_del_dev(rbd_dev);
441883a06263SAlex Elder 
441983a06263SAlex Elder 	return ret;
442083a06263SAlex Elder err_out_disk:
442183a06263SAlex Elder 	rbd_free_disk(rbd_dev);
442283a06263SAlex Elder err_out_blkdev:
442383a06263SAlex Elder 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
442483a06263SAlex Elder err_out_id:
442583a06263SAlex Elder 	rbd_dev_id_put(rbd_dev);
442683a06263SAlex Elder err_out_snaps:
442783a06263SAlex Elder 	rbd_remove_all_snaps(rbd_dev);
442883a06263SAlex Elder 
442983a06263SAlex Elder 	return ret;
443083a06263SAlex Elder }
443183a06263SAlex Elder 
4432a30b71b9SAlex Elder /*
4433a30b71b9SAlex Elder  * Probe for the existence of the header object for the given rbd
4434a30b71b9SAlex Elder  * device.  For format 2 images this includes determining the image
4435a30b71b9SAlex Elder  * id.
4436a30b71b9SAlex Elder  */
4437a30b71b9SAlex Elder static int rbd_dev_probe(struct rbd_device *rbd_dev)
4438a30b71b9SAlex Elder {
4439a30b71b9SAlex Elder 	int ret;
4440a30b71b9SAlex Elder 
4441a30b71b9SAlex Elder 	/*
4442a30b71b9SAlex Elder 	 * Get the id from the image id object.  If it's not a
4443a30b71b9SAlex Elder 	 * format 2 image, we'll get ENOENT back, and we'll assume
4444a30b71b9SAlex Elder 	 * it's a format 1 image.
4445a30b71b9SAlex Elder 	 */
4446a30b71b9SAlex Elder 	ret = rbd_dev_image_id(rbd_dev);
4447a30b71b9SAlex Elder 	if (ret)
4448a30b71b9SAlex Elder 		ret = rbd_dev_v1_probe(rbd_dev);
4449a30b71b9SAlex Elder 	else
4450a30b71b9SAlex Elder 		ret = rbd_dev_v2_probe(rbd_dev);
445183a06263SAlex Elder 	if (ret) {
4452a30b71b9SAlex Elder 		dout("probe failed, returning %d\n", ret);
4453a30b71b9SAlex Elder 
4454a30b71b9SAlex Elder 		return ret;
4455a30b71b9SAlex Elder 	}
4456a30b71b9SAlex Elder 
445783a06263SAlex Elder 	ret = rbd_dev_probe_finish(rbd_dev);
445883a06263SAlex Elder 	if (ret)
445983a06263SAlex Elder 		rbd_header_free(&rbd_dev->header);
446083a06263SAlex Elder 
446183a06263SAlex Elder 	return ret;
446283a06263SAlex Elder }
446383a06263SAlex Elder 
446459c2be1eSYehuda Sadeh static ssize_t rbd_add(struct bus_type *bus,
446559c2be1eSYehuda Sadeh 		       const char *buf,
446659c2be1eSYehuda Sadeh 		       size_t count)
4467602adf40SYehuda Sadeh {
4468cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
4469dc79b113SAlex Elder 	struct ceph_options *ceph_opts = NULL;
44704e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
4471859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
44729d3997fdSAlex Elder 	struct rbd_client *rbdc;
447327cc2594SAlex Elder 	struct ceph_osd_client *osdc;
447427cc2594SAlex Elder 	int rc = -ENOMEM;
4475602adf40SYehuda Sadeh 
4476602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
4477602adf40SYehuda Sadeh 		return -ENODEV;
4478602adf40SYehuda Sadeh 
4479a725f65eSAlex Elder 	/* parse add command */
4480859c31dfSAlex Elder 	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
4481dc79b113SAlex Elder 	if (rc < 0)
4482bd4ba655SAlex Elder 		goto err_out_module;
4483a725f65eSAlex Elder 
44849d3997fdSAlex Elder 	rbdc = rbd_get_client(ceph_opts);
44859d3997fdSAlex Elder 	if (IS_ERR(rbdc)) {
44869d3997fdSAlex Elder 		rc = PTR_ERR(rbdc);
44870ddebc0cSAlex Elder 		goto err_out_args;
44889d3997fdSAlex Elder 	}
4489c53d5893SAlex Elder 	ceph_opts = NULL;	/* rbd_dev client now owns this */
4490602adf40SYehuda Sadeh 
4491602adf40SYehuda Sadeh 	/* pick the pool */
44929d3997fdSAlex Elder 	osdc = &rbdc->client->osdc;
4493859c31dfSAlex Elder 	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
4494602adf40SYehuda Sadeh 	if (rc < 0)
4495602adf40SYehuda Sadeh 		goto err_out_client;
4496859c31dfSAlex Elder 	spec->pool_id = (u64) rc;
4497859c31dfSAlex Elder 
44980903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
44990903e875SAlex Elder 
45000903e875SAlex Elder 	if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
45010903e875SAlex Elder 		rc = -EIO;
45020903e875SAlex Elder 		goto err_out_client;
45030903e875SAlex Elder 	}
45040903e875SAlex Elder 
4505c53d5893SAlex Elder 	rbd_dev = rbd_dev_create(rbdc, spec);
4506bd4ba655SAlex Elder 	if (!rbd_dev)
4507bd4ba655SAlex Elder 		goto err_out_client;
4508c53d5893SAlex Elder 	rbdc = NULL;		/* rbd_dev now owns this */
4509c53d5893SAlex Elder 	spec = NULL;		/* rbd_dev now owns this */
4510602adf40SYehuda Sadeh 
4511bd4ba655SAlex Elder 	rbd_dev->mapping.read_only = rbd_opts->read_only;
4512c53d5893SAlex Elder 	kfree(rbd_opts);
4513c53d5893SAlex Elder 	rbd_opts = NULL;	/* done with this */
4514bd4ba655SAlex Elder 
4515a30b71b9SAlex Elder 	rc = rbd_dev_probe(rbd_dev);
4516a30b71b9SAlex Elder 	if (rc < 0)
4517c53d5893SAlex Elder 		goto err_out_rbd_dev;
451805fd6f6fSAlex Elder 
4519602adf40SYehuda Sadeh 	return count;
4520c53d5893SAlex Elder err_out_rbd_dev:
4521c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
4522bd4ba655SAlex Elder err_out_client:
45239d3997fdSAlex Elder 	rbd_put_client(rbdc);
45240ddebc0cSAlex Elder err_out_args:
452578cea76eSAlex Elder 	if (ceph_opts)
452678cea76eSAlex Elder 		ceph_destroy_options(ceph_opts);
45274e9afebaSAlex Elder 	kfree(rbd_opts);
4528859c31dfSAlex Elder 	rbd_spec_put(spec);
4529bd4ba655SAlex Elder err_out_module:
4530bd4ba655SAlex Elder 	module_put(THIS_MODULE);
453127cc2594SAlex Elder 
4532602adf40SYehuda Sadeh 	dout("Error adding device %s\n", buf);
453327cc2594SAlex Elder 
453427cc2594SAlex Elder 	return (ssize_t) rc;
4535602adf40SYehuda Sadeh }
4536602adf40SYehuda Sadeh 
4537de71a297SAlex Elder static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
4538602adf40SYehuda Sadeh {
4539602adf40SYehuda Sadeh 	struct list_head *tmp;
4540602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev;
4541602adf40SYehuda Sadeh 
4542e124a82fSAlex Elder 	spin_lock(&rbd_dev_list_lock);
4543602adf40SYehuda Sadeh 	list_for_each(tmp, &rbd_dev_list) {
4544602adf40SYehuda Sadeh 		rbd_dev = list_entry(tmp, struct rbd_device, node);
4545de71a297SAlex Elder 		if (rbd_dev->dev_id == dev_id) {
4546e124a82fSAlex Elder 			spin_unlock(&rbd_dev_list_lock);
4547602adf40SYehuda Sadeh 			return rbd_dev;
4548602adf40SYehuda Sadeh 		}
4549e124a82fSAlex Elder 	}
4550e124a82fSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
4551602adf40SYehuda Sadeh 	return NULL;
4552602adf40SYehuda Sadeh }
4553602adf40SYehuda Sadeh 
4554dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev)
4555602adf40SYehuda Sadeh {
4556593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4557602adf40SYehuda Sadeh 
455859c2be1eSYehuda Sadeh 	if (rbd_dev->watch_event)
45599969ebc5SAlex Elder 		rbd_dev_header_watch_sync(rbd_dev, 0);
4560602adf40SYehuda Sadeh 
4561602adf40SYehuda Sadeh 	/* clean up and free blkdev */
4562602adf40SYehuda Sadeh 	rbd_free_disk(rbd_dev);
4563602adf40SYehuda Sadeh 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
456432eec68dSAlex Elder 
45652ac4e75dSAlex Elder 	/* release allocated disk header fields */
45662ac4e75dSAlex Elder 	rbd_header_free(&rbd_dev->header);
45672ac4e75dSAlex Elder 
456832eec68dSAlex Elder 	/* done with the id, and with the rbd_dev */
4569e2839308SAlex Elder 	rbd_dev_id_put(rbd_dev);
4570c53d5893SAlex Elder 	rbd_assert(rbd_dev->rbd_client != NULL);
4571c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
4572602adf40SYehuda Sadeh 
4573602adf40SYehuda Sadeh 	/* release module ref */
4574602adf40SYehuda Sadeh 	module_put(THIS_MODULE);
4575602adf40SYehuda Sadeh }
4576602adf40SYehuda Sadeh 
/*
 * Remove a single rbd device: first all of its snapshots, then the
 * device itself from the rbd bus.  Parent devices are not touched
 * here; rbd_remove() walks the parent chain and calls this once per
 * device.
 */
static void __rbd_remove(struct rbd_device *rbd_dev)
{
	rbd_remove_all_snaps(rbd_dev);
	rbd_bus_del_dev(rbd_dev);
}
45822f82ee54SAlex Elder 
4583dfc5606dSYehuda Sadeh static ssize_t rbd_remove(struct bus_type *bus,
4584602adf40SYehuda Sadeh 			  const char *buf,
4585602adf40SYehuda Sadeh 			  size_t count)
4586602adf40SYehuda Sadeh {
4587602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
4588602adf40SYehuda Sadeh 	int target_id, rc;
4589602adf40SYehuda Sadeh 	unsigned long ul;
4590602adf40SYehuda Sadeh 	int ret = count;
4591602adf40SYehuda Sadeh 
4592602adf40SYehuda Sadeh 	rc = strict_strtoul(buf, 10, &ul);
4593602adf40SYehuda Sadeh 	if (rc)
4594602adf40SYehuda Sadeh 		return rc;
4595602adf40SYehuda Sadeh 
4596602adf40SYehuda Sadeh 	/* convert to int; abort if we lost anything in the conversion */
4597602adf40SYehuda Sadeh 	target_id = (int) ul;
4598602adf40SYehuda Sadeh 	if (target_id != ul)
4599602adf40SYehuda Sadeh 		return -EINVAL;
4600602adf40SYehuda Sadeh 
4601602adf40SYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
4602602adf40SYehuda Sadeh 
4603602adf40SYehuda Sadeh 	rbd_dev = __rbd_get_dev(target_id);
4604602adf40SYehuda Sadeh 	if (!rbd_dev) {
4605602adf40SYehuda Sadeh 		ret = -ENOENT;
4606602adf40SYehuda Sadeh 		goto done;
4607602adf40SYehuda Sadeh 	}
4608602adf40SYehuda Sadeh 
4609a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
4610b82d167bSAlex Elder 	if (rbd_dev->open_count)
461142382b70SAlex Elder 		ret = -EBUSY;
4612b82d167bSAlex Elder 	else
4613b82d167bSAlex Elder 		set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
4614a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
4615b82d167bSAlex Elder 	if (ret < 0)
461642382b70SAlex Elder 		goto done;
461742382b70SAlex Elder 
46182f82ee54SAlex Elder 	while (rbd_dev->parent_spec) {
46192f82ee54SAlex Elder 		struct rbd_device *first = rbd_dev;
46202f82ee54SAlex Elder 		struct rbd_device *second = first->parent;
46212f82ee54SAlex Elder 		struct rbd_device *third;
46222f82ee54SAlex Elder 
46232f82ee54SAlex Elder 		/*
46242f82ee54SAlex Elder 		 * Follow to the parent with no grandparent and
46252f82ee54SAlex Elder 		 * remove it.
46262f82ee54SAlex Elder 		 */
46272f82ee54SAlex Elder 		while (second && (third = second->parent)) {
46282f82ee54SAlex Elder 			first = second;
46292f82ee54SAlex Elder 			second = third;
46302f82ee54SAlex Elder 		}
46312f82ee54SAlex Elder 		__rbd_remove(second);
46322f82ee54SAlex Elder 		rbd_spec_put(first->parent_spec);
46332f82ee54SAlex Elder 		first->parent_spec = NULL;
46342f82ee54SAlex Elder 		first->parent_overlap = 0;
46352f82ee54SAlex Elder 		first->parent = NULL;
46362f82ee54SAlex Elder 	}
46372f82ee54SAlex Elder 	__rbd_remove(rbd_dev);
4638602adf40SYehuda Sadeh 
4639602adf40SYehuda Sadeh done:
4640602adf40SYehuda Sadeh 	mutex_unlock(&ctl_mutex);
4641aafb230eSAlex Elder 
4642602adf40SYehuda Sadeh 	return ret;
4643602adf40SYehuda Sadeh }
4644602adf40SYehuda Sadeh 
4645602adf40SYehuda Sadeh /*
4646602adf40SYehuda Sadeh  * create control files in sysfs
4647dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
4648602adf40SYehuda Sadeh  */
4649602adf40SYehuda Sadeh static int rbd_sysfs_init(void)
4650602adf40SYehuda Sadeh {
4651dfc5606dSYehuda Sadeh 	int ret;
4652602adf40SYehuda Sadeh 
4653fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
4654dfc5606dSYehuda Sadeh 	if (ret < 0)
4655dfc5606dSYehuda Sadeh 		return ret;
4656602adf40SYehuda Sadeh 
4657fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
4658fed4c143SAlex Elder 	if (ret < 0)
4659fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
4660602adf40SYehuda Sadeh 
4661602adf40SYehuda Sadeh 	return ret;
4662602adf40SYehuda Sadeh }
4663602adf40SYehuda Sadeh 
4664602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void)
4665602adf40SYehuda Sadeh {
4666dfc5606dSYehuda Sadeh 	bus_unregister(&rbd_bus_type);
4667fed4c143SAlex Elder 	device_unregister(&rbd_root_dev);
4668602adf40SYehuda Sadeh }
4669602adf40SYehuda Sadeh 
4670cc344fa1SAlex Elder static int __init rbd_init(void)
4671602adf40SYehuda Sadeh {
4672602adf40SYehuda Sadeh 	int rc;
4673602adf40SYehuda Sadeh 
46741e32d34cSAlex Elder 	if (!libceph_compatible(NULL)) {
46751e32d34cSAlex Elder 		rbd_warn(NULL, "libceph incompatibility (quitting)");
46761e32d34cSAlex Elder 
46771e32d34cSAlex Elder 		return -EINVAL;
46781e32d34cSAlex Elder 	}
4679602adf40SYehuda Sadeh 	rc = rbd_sysfs_init();
4680602adf40SYehuda Sadeh 	if (rc)
4681602adf40SYehuda Sadeh 		return rc;
4682f0f8cef5SAlex Elder 	pr_info("loaded " RBD_DRV_NAME_LONG "\n");
4683602adf40SYehuda Sadeh 	return 0;
4684602adf40SYehuda Sadeh }
4685602adf40SYehuda Sadeh 
4686cc344fa1SAlex Elder static void __exit rbd_exit(void)
4687602adf40SYehuda Sadeh {
4688602adf40SYehuda Sadeh 	rbd_sysfs_cleanup();
4689602adf40SYehuda Sadeh }
4690602adf40SYehuda Sadeh 
4691602adf40SYehuda Sadeh module_init(rbd_init);
4692602adf40SYehuda Sadeh module_exit(rbd_exit);
4693602adf40SYehuda Sadeh 
4694602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
4695602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
4696602adf40SYehuda Sadeh MODULE_DESCRIPTION("rados block device");
4697602adf40SYehuda Sadeh 
4698602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */
4699602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
4700602adf40SYehuda Sadeh 
4701602adf40SYehuda Sadeh MODULE_LICENSE("GPL");
4702