xref: /openbmc/linux/drivers/block/rbd.c (revision 6e584f52)
1602adf40SYehuda Sadeh /*
2602adf40SYehuda Sadeh    rbd.c -- Export ceph rados objects as a Linux block device
3602adf40SYehuda Sadeh 
4602adf40SYehuda Sadeh 
5602adf40SYehuda Sadeh    based on drivers/block/osdblk.c:
6602adf40SYehuda Sadeh 
7602adf40SYehuda Sadeh    Copyright 2009 Red Hat, Inc.
8602adf40SYehuda Sadeh 
9602adf40SYehuda Sadeh    This program is free software; you can redistribute it and/or modify
10602adf40SYehuda Sadeh    it under the terms of the GNU General Public License as published by
11602adf40SYehuda Sadeh    the Free Software Foundation.
12602adf40SYehuda Sadeh 
13602adf40SYehuda Sadeh    This program is distributed in the hope that it will be useful,
14602adf40SYehuda Sadeh    but WITHOUT ANY WARRANTY; without even the implied warranty of
15602adf40SYehuda Sadeh    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16602adf40SYehuda Sadeh    GNU General Public License for more details.
17602adf40SYehuda Sadeh 
18602adf40SYehuda Sadeh    You should have received a copy of the GNU General Public License
19602adf40SYehuda Sadeh    along with this program; see the file COPYING.  If not, write to
20602adf40SYehuda Sadeh    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21602adf40SYehuda Sadeh 
22602adf40SYehuda Sadeh 
23602adf40SYehuda Sadeh 
24dfc5606dSYehuda Sadeh    For usage instructions, please refer to:
25602adf40SYehuda Sadeh 
26dfc5606dSYehuda Sadeh                  Documentation/ABI/testing/sysfs-bus-rbd
27602adf40SYehuda Sadeh 
28602adf40SYehuda Sadeh  */
29602adf40SYehuda Sadeh 
30602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
31602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
32602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3459c2be1eSYehuda Sadeh #include <linux/parser.h>
35602adf40SYehuda Sadeh 
36602adf40SYehuda Sadeh #include <linux/kernel.h>
37602adf40SYehuda Sadeh #include <linux/device.h>
38602adf40SYehuda Sadeh #include <linux/module.h>
39602adf40SYehuda Sadeh #include <linux/fs.h>
40602adf40SYehuda Sadeh #include <linux/blkdev.h>
41602adf40SYehuda Sadeh 
42602adf40SYehuda Sadeh #include "rbd_types.h"
43602adf40SYehuda Sadeh 
44aafb230eSAlex Elder #define RBD_DEBUG	/* Activate rbd_assert() calls */
45aafb230eSAlex Elder 
46593a9e7bSAlex Elder /*
47593a9e7bSAlex Elder  * The basic unit of block I/O is a sector.  It is interpreted in a
48593a9e7bSAlex Elder  * number of contexts in Linux (blk, bio, genhd), but the default is
49593a9e7bSAlex Elder  * universally 512 bytes.  These symbols are just slightly more
50593a9e7bSAlex Elder  * meaningful than the bare numbers they represent.
51593a9e7bSAlex Elder  */
52593a9e7bSAlex Elder #define	SECTOR_SHIFT	9
53593a9e7bSAlex Elder #define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)
54593a9e7bSAlex Elder 
55f0f8cef5SAlex Elder #define RBD_DRV_NAME "rbd"
56f0f8cef5SAlex Elder #define RBD_DRV_NAME_LONG "rbd (rados block device)"
57602adf40SYehuda Sadeh 
58602adf40SYehuda Sadeh #define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */
59602adf40SYehuda Sadeh 
60d4b125e9SAlex Elder #define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
61d4b125e9SAlex Elder #define RBD_MAX_SNAP_NAME_LEN	\
62d4b125e9SAlex Elder 			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
63d4b125e9SAlex Elder 
6435d489f9SAlex Elder #define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */
65602adf40SYehuda Sadeh 
66602adf40SYehuda Sadeh #define RBD_SNAP_HEAD_NAME	"-"
67602adf40SYehuda Sadeh 
689e15b77dSAlex Elder /* This allows a single page to hold an image name sent by OSD */
699e15b77dSAlex Elder #define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
70589d30e0SAlex Elder #define RBD_IMAGE_ID_LEN_MAX	64
719e15b77dSAlex Elder 
721e130199SAlex Elder #define RBD_OBJ_PREFIX_LEN_MAX	64
73589d30e0SAlex Elder 
74d889140cSAlex Elder /* Feature bits */
75d889140cSAlex Elder 
765cbf6f12SAlex Elder #define RBD_FEATURE_LAYERING	(1<<0)
775cbf6f12SAlex Elder #define RBD_FEATURE_STRIPINGV2	(1<<1)
785cbf6f12SAlex Elder #define RBD_FEATURES_ALL \
795cbf6f12SAlex Elder 	    (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)
80d889140cSAlex Elder 
81d889140cSAlex Elder /* Features supported by this (client software) implementation. */
82d889140cSAlex Elder 
83770eba6eSAlex Elder #define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)
84d889140cSAlex Elder 
8581a89793SAlex Elder /*
8681a89793SAlex Elder  * An RBD device name will be "rbd#", where the "rbd" comes from
8781a89793SAlex Elder  * RBD_DRV_NAME above, and # is a unique integer identifier.
8881a89793SAlex Elder  * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
8981a89793SAlex Elder  * enough to hold all possible device names.
9081a89793SAlex Elder  */
91602adf40SYehuda Sadeh #define DEV_NAME_LEN		32
9281a89793SAlex Elder #define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)
93602adf40SYehuda Sadeh 
94602adf40SYehuda Sadeh /*
95602adf40SYehuda Sadeh  * block device image metadata (in-memory version)
96602adf40SYehuda Sadeh  */
97602adf40SYehuda Sadeh struct rbd_image_header {
98f84344f3SAlex Elder 	/* These four fields never change for a given rbd image */
99849b4260SAlex Elder 	char *object_prefix;
10034b13184SAlex Elder 	u64 features;
101602adf40SYehuda Sadeh 	__u8 obj_order;
102602adf40SYehuda Sadeh 	__u8 crypt_type;
103602adf40SYehuda Sadeh 	__u8 comp_type;
104602adf40SYehuda Sadeh 
105f84344f3SAlex Elder 	/* The remaining fields need to be updated occasionally */
106f84344f3SAlex Elder 	u64 image_size;
107f84344f3SAlex Elder 	struct ceph_snap_context *snapc;
108602adf40SYehuda Sadeh 	char *snap_names;
109602adf40SYehuda Sadeh 	u64 *snap_sizes;
11059c2be1eSYehuda Sadeh 
11159c2be1eSYehuda Sadeh 	u64 obj_version;
11259c2be1eSYehuda Sadeh };
11359c2be1eSYehuda Sadeh 
1140d7dbfceSAlex Elder /*
1150d7dbfceSAlex Elder  * An rbd image specification.
1160d7dbfceSAlex Elder  *
1170d7dbfceSAlex Elder  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
118c66c6e0cSAlex Elder  * identify an image.  Each rbd_dev structure includes a pointer to
119c66c6e0cSAlex Elder  * an rbd_spec structure that encapsulates this identity.
120c66c6e0cSAlex Elder  *
121c66c6e0cSAlex Elder  * Each of the id's in an rbd_spec has an associated name.  For a
122c66c6e0cSAlex Elder  * user-mapped image, the names are supplied and the id's associated
123c66c6e0cSAlex Elder  * with them are looked up.  For a layered image, a parent image is
124c66c6e0cSAlex Elder  * defined by the tuple, and the names are looked up.
125c66c6e0cSAlex Elder  *
126c66c6e0cSAlex Elder  * An rbd_dev structure contains a parent_spec pointer which is
127c66c6e0cSAlex Elder  * non-null if the image it represents is a child in a layered
128c66c6e0cSAlex Elder  * image.  This pointer will refer to the rbd_spec structure used
129c66c6e0cSAlex Elder  * by the parent rbd_dev for its own identity (i.e., the structure
130c66c6e0cSAlex Elder  * is shared between the parent and child).
131c66c6e0cSAlex Elder  *
132c66c6e0cSAlex Elder  * Since these structures are populated once, during the discovery
133c66c6e0cSAlex Elder  * phase of image construction, they are effectively immutable so
134c66c6e0cSAlex Elder  * we make no effort to synchronize access to them.
135c66c6e0cSAlex Elder  *
136c66c6e0cSAlex Elder  * Note that code herein does not assume the image name is known (it
137c66c6e0cSAlex Elder  * could be a null pointer).
1380d7dbfceSAlex Elder  */
1390d7dbfceSAlex Elder struct rbd_spec {
1400d7dbfceSAlex Elder 	u64		pool_id;
1410d7dbfceSAlex Elder 	char		*pool_name;
1420d7dbfceSAlex Elder 
1430d7dbfceSAlex Elder 	char		*image_id;
1440d7dbfceSAlex Elder 	char		*image_name;
1450d7dbfceSAlex Elder 
1460d7dbfceSAlex Elder 	u64		snap_id;
1470d7dbfceSAlex Elder 	char		*snap_name;
1480d7dbfceSAlex Elder 
1490d7dbfceSAlex Elder 	struct kref	kref;
1500d7dbfceSAlex Elder };
1510d7dbfceSAlex Elder 
152602adf40SYehuda Sadeh /*
153f0f8cef5SAlex Elder  * an instance of the client.  multiple devices may share an rbd client.
154602adf40SYehuda Sadeh  */
155602adf40SYehuda Sadeh struct rbd_client {
156602adf40SYehuda Sadeh 	struct ceph_client	*client;
157602adf40SYehuda Sadeh 	struct kref		kref;
158602adf40SYehuda Sadeh 	struct list_head	node;
159602adf40SYehuda Sadeh };
160602adf40SYehuda Sadeh 
161bf0d5f50SAlex Elder struct rbd_img_request;
162bf0d5f50SAlex Elder typedef void (*rbd_img_callback_t)(struct rbd_img_request *);
163bf0d5f50SAlex Elder 
164bf0d5f50SAlex Elder #define	BAD_WHICH	U32_MAX		/* Good which or bad which, which? */
165bf0d5f50SAlex Elder 
166bf0d5f50SAlex Elder struct rbd_obj_request;
167bf0d5f50SAlex Elder typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);
168bf0d5f50SAlex Elder 
/* Stored in rbd_obj_request->type */
enum obj_request_type {
	OBJ_REQUEST_NODATA,
	OBJ_REQUEST_BIO,
	OBJ_REQUEST_PAGES
};
172bf0d5f50SAlex Elder 
/* Flag values for an object request */
enum obj_req_flags {
	/* completion flag: not done = 0, done = 1 */
	OBJ_REQ_DONE,
	/* object usage: standalone = 0, image = 1 */
	OBJ_REQ_IMG_DATA,
	/* EXISTS flag valid: no = 0, yes = 1 */
	OBJ_REQ_KNOWN,
	/* target exists: no = 0, yes = 1 */
	OBJ_REQ_EXISTS,
};
179926f9b3fSAlex Elder 
180bf0d5f50SAlex Elder struct rbd_obj_request {
181bf0d5f50SAlex Elder 	const char		*object_name;
182bf0d5f50SAlex Elder 	u64			offset;		/* object start byte */
183bf0d5f50SAlex Elder 	u64			length;		/* bytes from offset */
184926f9b3fSAlex Elder 	unsigned long		flags;
185bf0d5f50SAlex Elder 
186c5b5ef6cSAlex Elder 	/*
187c5b5ef6cSAlex Elder 	 * An object request associated with an image will have its
188c5b5ef6cSAlex Elder 	 * img_data flag set; a standalone object request will not.
189c5b5ef6cSAlex Elder 	 *
190c5b5ef6cSAlex Elder 	 * A standalone object request will have which == BAD_WHICH
191c5b5ef6cSAlex Elder 	 * and a null obj_request pointer.
192c5b5ef6cSAlex Elder 	 *
193c5b5ef6cSAlex Elder 	 * An object request initiated in support of a layered image
194c5b5ef6cSAlex Elder 	 * object (to check for its existence before a write) will
195c5b5ef6cSAlex Elder 	 * have which == BAD_WHICH and a non-null obj_request pointer.
196c5b5ef6cSAlex Elder 	 *
197c5b5ef6cSAlex Elder 	 * Finally, an object request for rbd image data will have
198c5b5ef6cSAlex Elder 	 * which != BAD_WHICH, and will have a non-null img_request
199c5b5ef6cSAlex Elder 	 * pointer.  The value of which will be in the range
200c5b5ef6cSAlex Elder 	 * 0..(img_request->obj_request_count-1).
201c5b5ef6cSAlex Elder 	 */
202c5b5ef6cSAlex Elder 	union {
203c5b5ef6cSAlex Elder 		struct rbd_obj_request	*obj_request;	/* STAT op */
204c5b5ef6cSAlex Elder 		struct {
205bf0d5f50SAlex Elder 			struct rbd_img_request	*img_request;
206c5b5ef6cSAlex Elder 			u64			img_offset;
207c5b5ef6cSAlex Elder 			/* links for img_request->obj_requests list */
208c5b5ef6cSAlex Elder 			struct list_head	links;
209c5b5ef6cSAlex Elder 		};
210c5b5ef6cSAlex Elder 	};
211bf0d5f50SAlex Elder 	u32			which;		/* posn image request list */
212bf0d5f50SAlex Elder 
213bf0d5f50SAlex Elder 	enum obj_request_type	type;
214788e2df3SAlex Elder 	union {
215bf0d5f50SAlex Elder 		struct bio	*bio_list;
216788e2df3SAlex Elder 		struct {
217788e2df3SAlex Elder 			struct page	**pages;
218788e2df3SAlex Elder 			u32		page_count;
219788e2df3SAlex Elder 		};
220788e2df3SAlex Elder 	};
2210eefd470SAlex Elder 	struct page		**copyup_pages;
222bf0d5f50SAlex Elder 
223bf0d5f50SAlex Elder 	struct ceph_osd_request	*osd_req;
224bf0d5f50SAlex Elder 
225bf0d5f50SAlex Elder 	u64			xferred;	/* bytes transferred */
226bf0d5f50SAlex Elder 	u64			version;
2271b83bef2SSage Weil 	int			result;
228bf0d5f50SAlex Elder 
229bf0d5f50SAlex Elder 	rbd_obj_callback_t	callback;
230788e2df3SAlex Elder 	struct completion	completion;
231bf0d5f50SAlex Elder 
232bf0d5f50SAlex Elder 	struct kref		kref;
233bf0d5f50SAlex Elder };
234bf0d5f50SAlex Elder 
/* Flag values for an image request */
enum img_req_flags {
	/* I/O direction: read = 0, write = 1 */
	IMG_REQ_WRITE,
	/* initiator: block = 0, child image = 1 */
	IMG_REQ_CHILD,
	/* ENOENT handling: normal = 0, layered = 1 */
	IMG_REQ_LAYERED,
};
2400c425248SAlex Elder 
241bf0d5f50SAlex Elder struct rbd_img_request {
242bf0d5f50SAlex Elder 	struct rbd_device	*rbd_dev;
243bf0d5f50SAlex Elder 	u64			offset;	/* starting image byte offset */
244bf0d5f50SAlex Elder 	u64			length;	/* byte count from offset */
2450c425248SAlex Elder 	unsigned long		flags;
246bf0d5f50SAlex Elder 	union {
247bf0d5f50SAlex Elder 		u64			snap_id;	/* for reads */
2489849e986SAlex Elder 		struct ceph_snap_context *snapc;	/* for writes */
2499849e986SAlex Elder 	};
2509849e986SAlex Elder 	union {
2519849e986SAlex Elder 		struct request		*rq;		/* block request */
2529849e986SAlex Elder 		struct rbd_obj_request	*obj_request;	/* obj req initiator */
253bf0d5f50SAlex Elder 	};
2543d7efd18SAlex Elder 	struct page		**copyup_pages;
255bf0d5f50SAlex Elder 	spinlock_t		completion_lock;/* protects next_completion */
256bf0d5f50SAlex Elder 	u32			next_completion;
257bf0d5f50SAlex Elder 	rbd_img_callback_t	callback;
25855f27e09SAlex Elder 	u64			xferred;/* aggregate bytes transferred */
259a5a337d4SAlex Elder 	int			result;	/* first nonzero obj_request result */
260bf0d5f50SAlex Elder 
261bf0d5f50SAlex Elder 	u32			obj_request_count;
262bf0d5f50SAlex Elder 	struct list_head	obj_requests;	/* rbd_obj_request structs */
263bf0d5f50SAlex Elder 
264bf0d5f50SAlex Elder 	struct kref		kref;
265bf0d5f50SAlex Elder };
266bf0d5f50SAlex Elder 
/*
 * Iterators over the object requests linked (through their "links"
 * member) on an image request's obj_requests list.  Note that the
 * "_safe" variant walks the list in reverse order.
 */
#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq) \
	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
273bf0d5f50SAlex Elder 
274dfc5606dSYehuda Sadeh struct rbd_snap {
275dfc5606dSYehuda Sadeh 	const char		*name;
2763591538fSJosh Durgin 	u64			size;
277dfc5606dSYehuda Sadeh 	struct list_head	node;
278dfc5606dSYehuda Sadeh 	u64			id;
27934b13184SAlex Elder 	u64			features;
280dfc5606dSYehuda Sadeh };
281dfc5606dSYehuda Sadeh 
282f84344f3SAlex Elder struct rbd_mapping {
28399c1f08fSAlex Elder 	u64                     size;
28434b13184SAlex Elder 	u64                     features;
285f84344f3SAlex Elder 	bool			read_only;
286f84344f3SAlex Elder };
287f84344f3SAlex Elder 
288602adf40SYehuda Sadeh /*
289602adf40SYehuda Sadeh  * a single device
290602adf40SYehuda Sadeh  */
291602adf40SYehuda Sadeh struct rbd_device {
292de71a297SAlex Elder 	int			dev_id;		/* blkdev unique id */
293602adf40SYehuda Sadeh 
294602adf40SYehuda Sadeh 	int			major;		/* blkdev assigned major */
295602adf40SYehuda Sadeh 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
296602adf40SYehuda Sadeh 
297a30b71b9SAlex Elder 	u32			image_format;	/* Either 1 or 2 */
298602adf40SYehuda Sadeh 	struct rbd_client	*rbd_client;
299602adf40SYehuda Sadeh 
300602adf40SYehuda Sadeh 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
301602adf40SYehuda Sadeh 
302b82d167bSAlex Elder 	spinlock_t		lock;		/* queue, flags, open_count */
303602adf40SYehuda Sadeh 
304602adf40SYehuda Sadeh 	struct rbd_image_header	header;
305b82d167bSAlex Elder 	unsigned long		flags;		/* possibly lock protected */
3060d7dbfceSAlex Elder 	struct rbd_spec		*spec;
307602adf40SYehuda Sadeh 
3080d7dbfceSAlex Elder 	char			*header_name;
309971f839aSAlex Elder 
3100903e875SAlex Elder 	struct ceph_file_layout	layout;
3110903e875SAlex Elder 
31259c2be1eSYehuda Sadeh 	struct ceph_osd_event   *watch_event;
313975241afSAlex Elder 	struct rbd_obj_request	*watch_request;
31459c2be1eSYehuda Sadeh 
31586b00e0dSAlex Elder 	struct rbd_spec		*parent_spec;
31686b00e0dSAlex Elder 	u64			parent_overlap;
3172f82ee54SAlex Elder 	struct rbd_device	*parent;
31886b00e0dSAlex Elder 
319cc070d59SAlex Elder 	u64			stripe_unit;
320cc070d59SAlex Elder 	u64			stripe_count;
321cc070d59SAlex Elder 
322c666601aSJosh Durgin 	/* protects updating the header */
323c666601aSJosh Durgin 	struct rw_semaphore     header_rwsem;
324f84344f3SAlex Elder 
325f84344f3SAlex Elder 	struct rbd_mapping	mapping;
326602adf40SYehuda Sadeh 
327602adf40SYehuda Sadeh 	struct list_head	node;
328dfc5606dSYehuda Sadeh 
329dfc5606dSYehuda Sadeh 	/* list of snapshots */
330dfc5606dSYehuda Sadeh 	struct list_head	snaps;
331dfc5606dSYehuda Sadeh 
332dfc5606dSYehuda Sadeh 	/* sysfs related */
333dfc5606dSYehuda Sadeh 	struct device		dev;
334b82d167bSAlex Elder 	unsigned long		open_count;	/* protected by lock */
335dfc5606dSYehuda Sadeh };
336dfc5606dSYehuda Sadeh 
/*
 * Flag bits kept in rbd_dev->flags.  Where atomicity matters,
 * rbd_dev->lock protects access.
 *
 * At present only the "removing" flag (which is coupled with the
 * "open_count" field) needs atomic access.
 */
enum rbd_dev_flags {
	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_EXISTS,
	/* this mapping is being removed */
	RBD_DEV_FLAG_REMOVING,
};
3486d292906SAlex Elder 
349602adf40SYehuda Sadeh static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */
350e124a82fSAlex Elder 
351602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list);    /* devices */
352e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock);
353e124a82fSAlex Elder 
354602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list);		/* clients */
355432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock);
356602adf40SYehuda Sadeh 
3573d7efd18SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request);
3583d7efd18SAlex Elder 
359304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
360304f6808SAlex Elder 
361dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev);
3626087b51bSAlex Elder static void rbd_snap_destroy(struct rbd_snap *snap);
363dfc5606dSYehuda Sadeh 
364f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf,
365f0f8cef5SAlex Elder 		       size_t count);
366f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
367f0f8cef5SAlex Elder 			  size_t count);
3682f82ee54SAlex Elder static int rbd_dev_probe(struct rbd_device *rbd_dev);
369f0f8cef5SAlex Elder 
370f0f8cef5SAlex Elder static struct bus_attribute rbd_bus_attrs[] = {
371f0f8cef5SAlex Elder 	__ATTR(add, S_IWUSR, NULL, rbd_add),
372f0f8cef5SAlex Elder 	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
373f0f8cef5SAlex Elder 	__ATTR_NULL
374f0f8cef5SAlex Elder };
375f0f8cef5SAlex Elder 
376f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = {
377f0f8cef5SAlex Elder 	.name		= "rbd",
378f0f8cef5SAlex Elder 	.bus_attrs	= rbd_bus_attrs,
379f0f8cef5SAlex Elder };
380f0f8cef5SAlex Elder 
/* Release method for rbd_root_dev; intentionally does nothing */
static void rbd_root_dev_release(struct device *dev)
{
	/* nothing to release */
}
384f0f8cef5SAlex Elder 
385f0f8cef5SAlex Elder static struct device rbd_root_dev = {
386f0f8cef5SAlex Elder 	.init_name =    "rbd",
387f0f8cef5SAlex Elder 	.release =      rbd_root_dev_release,
388f0f8cef5SAlex Elder };
389f0f8cef5SAlex Elder 
39006ecc6cbSAlex Elder static __printf(2, 3)
39106ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
39206ecc6cbSAlex Elder {
39306ecc6cbSAlex Elder 	struct va_format vaf;
39406ecc6cbSAlex Elder 	va_list args;
39506ecc6cbSAlex Elder 
39606ecc6cbSAlex Elder 	va_start(args, fmt);
39706ecc6cbSAlex Elder 	vaf.fmt = fmt;
39806ecc6cbSAlex Elder 	vaf.va = &args;
39906ecc6cbSAlex Elder 
40006ecc6cbSAlex Elder 	if (!rbd_dev)
40106ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
40206ecc6cbSAlex Elder 	else if (rbd_dev->disk)
40306ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %s: %pV\n",
40406ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
40506ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_name)
40606ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: image %s: %pV\n",
40706ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
40806ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_id)
40906ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: id %s: %pV\n",
41006ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
41106ecc6cbSAlex Elder 	else	/* punt */
41206ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
41306ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev, &vaf);
41406ecc6cbSAlex Elder 	va_end(args);
41506ecc6cbSAlex Elder }
41606ecc6cbSAlex Elder 
/*
 * rbd_assert(expr): with RBD_DEBUG defined, a false expr logs the
 * function name, line number and the expression text at KERN_ERR,
 * then calls BUG().  Without RBD_DEBUG the macro expands to a no-op
 * and expr is not evaluated.
 *
 * The checking form is wrapped in do { } while (0) so that it acts
 * as a single statement; a bare "if" would break (or misbind the
 * else) when used as the body of an unbraced if/else.
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
429dfc5606dSYehuda Sadeh 
4308b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
431b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
4328b3e1a56SAlex Elder 
433117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
434117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
43559c2be1eSYehuda Sadeh 
436602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
437602adf40SYehuda Sadeh {
438f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
439b82d167bSAlex Elder 	bool removing = false;
440602adf40SYehuda Sadeh 
441f84344f3SAlex Elder 	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
442602adf40SYehuda Sadeh 		return -EROFS;
443602adf40SYehuda Sadeh 
444a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
445b82d167bSAlex Elder 	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
446b82d167bSAlex Elder 		removing = true;
447b82d167bSAlex Elder 	else
448b82d167bSAlex Elder 		rbd_dev->open_count++;
449a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
450b82d167bSAlex Elder 	if (removing)
451b82d167bSAlex Elder 		return -ENOENT;
452b82d167bSAlex Elder 
45342382b70SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
454c3e946ceSAlex Elder 	(void) get_device(&rbd_dev->dev);
455f84344f3SAlex Elder 	set_device_ro(bdev, rbd_dev->mapping.read_only);
45642382b70SAlex Elder 	mutex_unlock(&ctl_mutex);
457340c7a2bSAlex Elder 
458602adf40SYehuda Sadeh 	return 0;
459602adf40SYehuda Sadeh }
460602adf40SYehuda Sadeh 
461dfc5606dSYehuda Sadeh static int rbd_release(struct gendisk *disk, fmode_t mode)
462dfc5606dSYehuda Sadeh {
463dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
464b82d167bSAlex Elder 	unsigned long open_count_before;
465b82d167bSAlex Elder 
466a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
467b82d167bSAlex Elder 	open_count_before = rbd_dev->open_count--;
468a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
469b82d167bSAlex Elder 	rbd_assert(open_count_before > 0);
470dfc5606dSYehuda Sadeh 
47142382b70SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
472c3e946ceSAlex Elder 	put_device(&rbd_dev->dev);
47342382b70SAlex Elder 	mutex_unlock(&ctl_mutex);
474dfc5606dSYehuda Sadeh 
475dfc5606dSYehuda Sadeh 	return 0;
476dfc5606dSYehuda Sadeh }
477dfc5606dSYehuda Sadeh 
478602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = {
479602adf40SYehuda Sadeh 	.owner			= THIS_MODULE,
480602adf40SYehuda Sadeh 	.open			= rbd_open,
481dfc5606dSYehuda Sadeh 	.release		= rbd_release,
482602adf40SYehuda Sadeh };
483602adf40SYehuda Sadeh 
484602adf40SYehuda Sadeh /*
485602adf40SYehuda Sadeh  * Initialize an rbd client instance.
48643ae4701SAlex Elder  * We own *ceph_opts.
487602adf40SYehuda Sadeh  */
488f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
489602adf40SYehuda Sadeh {
490602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
491602adf40SYehuda Sadeh 	int ret = -ENOMEM;
492602adf40SYehuda Sadeh 
49337206ee5SAlex Elder 	dout("%s:\n", __func__);
494602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
495602adf40SYehuda Sadeh 	if (!rbdc)
496602adf40SYehuda Sadeh 		goto out_opt;
497602adf40SYehuda Sadeh 
498602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
499602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
500602adf40SYehuda Sadeh 
501bc534d86SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
502bc534d86SAlex Elder 
50343ae4701SAlex Elder 	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
504602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
505bc534d86SAlex Elder 		goto out_mutex;
50643ae4701SAlex Elder 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
507602adf40SYehuda Sadeh 
508602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
509602adf40SYehuda Sadeh 	if (ret < 0)
510602adf40SYehuda Sadeh 		goto out_err;
511602adf40SYehuda Sadeh 
512432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
513602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
514432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
515602adf40SYehuda Sadeh 
516bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
51737206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
518bc534d86SAlex Elder 
519602adf40SYehuda Sadeh 	return rbdc;
520602adf40SYehuda Sadeh 
521602adf40SYehuda Sadeh out_err:
522602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
523bc534d86SAlex Elder out_mutex:
524bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
525602adf40SYehuda Sadeh 	kfree(rbdc);
526602adf40SYehuda Sadeh out_opt:
52743ae4701SAlex Elder 	if (ceph_opts)
52843ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
52937206ee5SAlex Elder 	dout("%s: error %d\n", __func__, ret);
53037206ee5SAlex Elder 
53128f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
532602adf40SYehuda Sadeh }
533602adf40SYehuda Sadeh 
5342f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
5352f82ee54SAlex Elder {
5362f82ee54SAlex Elder 	kref_get(&rbdc->kref);
5372f82ee54SAlex Elder 
5382f82ee54SAlex Elder 	return rbdc;
5392f82ee54SAlex Elder }
5402f82ee54SAlex Elder 
541602adf40SYehuda Sadeh /*
5421f7ba331SAlex Elder  * Find a ceph client with specific addr and configuration.  If
5431f7ba331SAlex Elder  * found, bump its reference count.
544602adf40SYehuda Sadeh  */
5451f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
546602adf40SYehuda Sadeh {
547602adf40SYehuda Sadeh 	struct rbd_client *client_node;
5481f7ba331SAlex Elder 	bool found = false;
549602adf40SYehuda Sadeh 
55043ae4701SAlex Elder 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
551602adf40SYehuda Sadeh 		return NULL;
552602adf40SYehuda Sadeh 
5531f7ba331SAlex Elder 	spin_lock(&rbd_client_list_lock);
5541f7ba331SAlex Elder 	list_for_each_entry(client_node, &rbd_client_list, node) {
5551f7ba331SAlex Elder 		if (!ceph_compare_options(ceph_opts, client_node->client)) {
5562f82ee54SAlex Elder 			__rbd_get_client(client_node);
5572f82ee54SAlex Elder 
5581f7ba331SAlex Elder 			found = true;
5591f7ba331SAlex Elder 			break;
5601f7ba331SAlex Elder 		}
5611f7ba331SAlex Elder 	}
5621f7ba331SAlex Elder 	spin_unlock(&rbd_client_list_lock);
5631f7ba331SAlex Elder 
5641f7ba331SAlex Elder 	return found ? client_node : NULL;
565602adf40SYehuda Sadeh }
566602adf40SYehuda Sadeh 
/*
 * mount options
 *
 * The Opt_last_* markers delimit the token classes: int tokens come
 * before Opt_last_int, string tokens between it and Opt_last_string,
 * and Boolean tokens between that and Opt_last_bool.
 */
enum {
	/* (no int options defined) */
	Opt_last_int,
	/* (no string options defined) */
	Opt_last_string,
	/* Boolean options */
	Opt_read_only,
	Opt_read_write,
	Opt_last_bool,
};
58059c2be1eSYehuda Sadeh 
58143ae4701SAlex Elder static match_table_t rbd_opts_tokens = {
58259c2be1eSYehuda Sadeh 	/* int args above */
58359c2be1eSYehuda Sadeh 	/* string args above */
584be466c1cSAlex Elder 	{Opt_read_only, "read_only"},
585cc0538b6SAlex Elder 	{Opt_read_only, "ro"},		/* Alternate spelling */
586cc0538b6SAlex Elder 	{Opt_read_write, "read_write"},
587cc0538b6SAlex Elder 	{Opt_read_write, "rw"},		/* Alternate spelling */
588cc0538b6SAlex Elder 	/* Boolean args above */
58959c2be1eSYehuda Sadeh 	{-1, NULL}
59059c2be1eSYehuda Sadeh };
59159c2be1eSYehuda Sadeh 
/* rbd-specific options, filled in by parse_rbd_opts_token() */
struct rbd_options {
	bool	read_only;
};

/* Default value for the read_only option */
#define RBD_READ_ONLY_DEFAULT	false
59798571b5aSAlex Elder 
59859c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
59959c2be1eSYehuda Sadeh {
60043ae4701SAlex Elder 	struct rbd_options *rbd_opts = private;
60159c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
60259c2be1eSYehuda Sadeh 	int token, intval, ret;
60359c2be1eSYehuda Sadeh 
60443ae4701SAlex Elder 	token = match_token(c, rbd_opts_tokens, argstr);
60559c2be1eSYehuda Sadeh 	if (token < 0)
60659c2be1eSYehuda Sadeh 		return -EINVAL;
60759c2be1eSYehuda Sadeh 
60859c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
60959c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
61059c2be1eSYehuda Sadeh 		if (ret < 0) {
61159c2be1eSYehuda Sadeh 			pr_err("bad mount option arg (not int) "
61259c2be1eSYehuda Sadeh 			       "at '%s'\n", c);
61359c2be1eSYehuda Sadeh 			return ret;
61459c2be1eSYehuda Sadeh 		}
61559c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
61659c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
61759c2be1eSYehuda Sadeh 		dout("got string token %d val %s\n", token,
61859c2be1eSYehuda Sadeh 		     argstr[0].from);
619cc0538b6SAlex Elder 	} else if (token > Opt_last_string && token < Opt_last_bool) {
620cc0538b6SAlex Elder 		dout("got Boolean token %d\n", token);
62159c2be1eSYehuda Sadeh 	} else {
62259c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
62359c2be1eSYehuda Sadeh 	}
62459c2be1eSYehuda Sadeh 
62559c2be1eSYehuda Sadeh 	switch (token) {
626cc0538b6SAlex Elder 	case Opt_read_only:
627cc0538b6SAlex Elder 		rbd_opts->read_only = true;
628cc0538b6SAlex Elder 		break;
629cc0538b6SAlex Elder 	case Opt_read_write:
630cc0538b6SAlex Elder 		rbd_opts->read_only = false;
631cc0538b6SAlex Elder 		break;
63259c2be1eSYehuda Sadeh 	default:
633aafb230eSAlex Elder 		rbd_assert(false);
634aafb230eSAlex Elder 		break;
63559c2be1eSYehuda Sadeh 	}
63659c2be1eSYehuda Sadeh 	return 0;
63759c2be1eSYehuda Sadeh }
63859c2be1eSYehuda Sadeh 
/*
 * Get an rbd client matching ceph_opts: reuse an existing one if
 * rbd_client_find() locates it, otherwise create a new one.  Either
 * way the caller gives up ceph_opts: it is destroyed here when an
 * existing client is reused, and handed to rbd_client_create()
 * otherwise.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc = rbd_client_find(ceph_opts);

	if (!rbdc)
		return rbd_client_create(ceph_opts);

	/* using an existing client */
	ceph_destroy_options(ceph_opts);

	return rbdc;
}
655602adf40SYehuda Sadeh 
656602adf40SYehuda Sadeh /*
657602adf40SYehuda Sadeh  * Destroy ceph client
658d23a4b3fSAlex Elder  *
659432b8587SAlex Elder  * Caller must hold rbd_client_list_lock.
660602adf40SYehuda Sadeh  */
661602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
662602adf40SYehuda Sadeh {
663602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
664602adf40SYehuda Sadeh 
66537206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
666cd9d9f5dSAlex Elder 	spin_lock(&rbd_client_list_lock);
667602adf40SYehuda Sadeh 	list_del(&rbdc->node);
668cd9d9f5dSAlex Elder 	spin_unlock(&rbd_client_list_lock);
669602adf40SYehuda Sadeh 
670602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
671602adf40SYehuda Sadeh 	kfree(rbdc);
672602adf40SYehuda Sadeh }
673602adf40SYehuda Sadeh 
674602adf40SYehuda Sadeh /*
675602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
676602adf40SYehuda Sadeh  * it.
677602adf40SYehuda Sadeh  */
6789d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc)
679602adf40SYehuda Sadeh {
680c53d5893SAlex Elder 	if (rbdc)
6819d3997fdSAlex Elder 		kref_put(&rbdc->kref, rbd_client_release);
682602adf40SYehuda Sadeh }
683602adf40SYehuda Sadeh 
684a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format)
685a30b71b9SAlex Elder {
686a30b71b9SAlex Elder 	return image_format == 1 || image_format == 2;
687a30b71b9SAlex Elder }
688a30b71b9SAlex Elder 
6898e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
6908e94af8eSAlex Elder {
691103a150fSAlex Elder 	size_t size;
692103a150fSAlex Elder 	u32 snap_count;
693103a150fSAlex Elder 
694103a150fSAlex Elder 	/* The header has to start with the magic rbd header text */
695103a150fSAlex Elder 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
696103a150fSAlex Elder 		return false;
697103a150fSAlex Elder 
698db2388b6SAlex Elder 	/* The bio layer requires at least sector-sized I/O */
699db2388b6SAlex Elder 
700db2388b6SAlex Elder 	if (ondisk->options.order < SECTOR_SHIFT)
701db2388b6SAlex Elder 		return false;
702db2388b6SAlex Elder 
703db2388b6SAlex Elder 	/* If we use u64 in a few spots we may be able to loosen this */
704db2388b6SAlex Elder 
705db2388b6SAlex Elder 	if (ondisk->options.order > 8 * sizeof (int) - 1)
706db2388b6SAlex Elder 		return false;
707db2388b6SAlex Elder 
708103a150fSAlex Elder 	/*
709103a150fSAlex Elder 	 * The size of a snapshot header has to fit in a size_t, and
710103a150fSAlex Elder 	 * that limits the number of snapshots.
711103a150fSAlex Elder 	 */
712103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
713103a150fSAlex Elder 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
714103a150fSAlex Elder 	if (snap_count > size / sizeof (__le64))
715103a150fSAlex Elder 		return false;
716103a150fSAlex Elder 
717103a150fSAlex Elder 	/*
718103a150fSAlex Elder 	 * Not only that, but the size of the entire the snapshot
719103a150fSAlex Elder 	 * header must also be representable in a size_t.
720103a150fSAlex Elder 	 */
721103a150fSAlex Elder 	size -= snap_count * sizeof (__le64);
722103a150fSAlex Elder 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
723103a150fSAlex Elder 		return false;
724103a150fSAlex Elder 
725103a150fSAlex Elder 	return true;
7268e94af8eSAlex Elder }
7278e94af8eSAlex Elder 
728602adf40SYehuda Sadeh /*
729602adf40SYehuda Sadeh  * Create a new header structure, translate header format from the on-disk
730602adf40SYehuda Sadeh  * header.
731602adf40SYehuda Sadeh  */
732602adf40SYehuda Sadeh static int rbd_header_from_disk(struct rbd_image_header *header,
7334156d998SAlex Elder 				 struct rbd_image_header_ondisk *ondisk)
734602adf40SYehuda Sadeh {
735ccece235SAlex Elder 	u32 snap_count;
73658c17b0eSAlex Elder 	size_t len;
737d2bb24e5SAlex Elder 	size_t size;
738621901d6SAlex Elder 	u32 i;
739602adf40SYehuda Sadeh 
7406a52325fSAlex Elder 	memset(header, 0, sizeof (*header));
7416a52325fSAlex Elder 
742103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
743103a150fSAlex Elder 
74458c17b0eSAlex Elder 	len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
74558c17b0eSAlex Elder 	header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
7466a52325fSAlex Elder 	if (!header->object_prefix)
747602adf40SYehuda Sadeh 		return -ENOMEM;
74858c17b0eSAlex Elder 	memcpy(header->object_prefix, ondisk->object_prefix, len);
74958c17b0eSAlex Elder 	header->object_prefix[len] = '\0';
75000f1f36fSAlex Elder 
751602adf40SYehuda Sadeh 	if (snap_count) {
752f785cc1dSAlex Elder 		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
753f785cc1dSAlex Elder 
754621901d6SAlex Elder 		/* Save a copy of the snapshot names */
755621901d6SAlex Elder 
756f785cc1dSAlex Elder 		if (snap_names_len > (u64) SIZE_MAX)
757f785cc1dSAlex Elder 			return -EIO;
758f785cc1dSAlex Elder 		header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
759602adf40SYehuda Sadeh 		if (!header->snap_names)
7606a52325fSAlex Elder 			goto out_err;
761f785cc1dSAlex Elder 		/*
762f785cc1dSAlex Elder 		 * Note that rbd_dev_v1_header_read() guarantees
763f785cc1dSAlex Elder 		 * the ondisk buffer we're working with has
764f785cc1dSAlex Elder 		 * snap_names_len bytes beyond the end of the
765f785cc1dSAlex Elder 		 * snapshot id array, this memcpy() is safe.
766f785cc1dSAlex Elder 		 */
767f785cc1dSAlex Elder 		memcpy(header->snap_names, &ondisk->snaps[snap_count],
768f785cc1dSAlex Elder 			snap_names_len);
7696a52325fSAlex Elder 
770621901d6SAlex Elder 		/* Record each snapshot's size */
771621901d6SAlex Elder 
772d2bb24e5SAlex Elder 		size = snap_count * sizeof (*header->snap_sizes);
773d2bb24e5SAlex Elder 		header->snap_sizes = kmalloc(size, GFP_KERNEL);
774602adf40SYehuda Sadeh 		if (!header->snap_sizes)
7756a52325fSAlex Elder 			goto out_err;
776621901d6SAlex Elder 		for (i = 0; i < snap_count; i++)
777621901d6SAlex Elder 			header->snap_sizes[i] =
778621901d6SAlex Elder 				le64_to_cpu(ondisk->snaps[i].image_size);
779602adf40SYehuda Sadeh 	} else {
780ccece235SAlex Elder 		WARN_ON(ondisk->snap_names_len);
781602adf40SYehuda Sadeh 		header->snap_names = NULL;
782602adf40SYehuda Sadeh 		header->snap_sizes = NULL;
783602adf40SYehuda Sadeh 	}
784849b4260SAlex Elder 
78534b13184SAlex Elder 	header->features = 0;	/* No features support in v1 images */
786602adf40SYehuda Sadeh 	header->obj_order = ondisk->options.order;
787602adf40SYehuda Sadeh 	header->crypt_type = ondisk->options.crypt_type;
788602adf40SYehuda Sadeh 	header->comp_type = ondisk->options.comp_type;
7896a52325fSAlex Elder 
790621901d6SAlex Elder 	/* Allocate and fill in the snapshot context */
791621901d6SAlex Elder 
792f84344f3SAlex Elder 	header->image_size = le64_to_cpu(ondisk->image_size);
7936a52325fSAlex Elder 	size = sizeof (struct ceph_snap_context);
7946a52325fSAlex Elder 	size += snap_count * sizeof (header->snapc->snaps[0]);
7956a52325fSAlex Elder 	header->snapc = kzalloc(size, GFP_KERNEL);
7966a52325fSAlex Elder 	if (!header->snapc)
7976a52325fSAlex Elder 		goto out_err;
798602adf40SYehuda Sadeh 
799602adf40SYehuda Sadeh 	atomic_set(&header->snapc->nref, 1);
800505cbb9bSAlex Elder 	header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
801602adf40SYehuda Sadeh 	header->snapc->num_snaps = snap_count;
802621901d6SAlex Elder 	for (i = 0; i < snap_count; i++)
803602adf40SYehuda Sadeh 		header->snapc->snaps[i] =
804602adf40SYehuda Sadeh 			le64_to_cpu(ondisk->snaps[i].id);
805602adf40SYehuda Sadeh 
806602adf40SYehuda Sadeh 	return 0;
807602adf40SYehuda Sadeh 
8086a52325fSAlex Elder out_err:
809849b4260SAlex Elder 	kfree(header->snap_sizes);
810ccece235SAlex Elder 	header->snap_sizes = NULL;
811602adf40SYehuda Sadeh 	kfree(header->snap_names);
812ccece235SAlex Elder 	header->snap_names = NULL;
8136a52325fSAlex Elder 	kfree(header->object_prefix);
8146a52325fSAlex Elder 	header->object_prefix = NULL;
815ccece235SAlex Elder 
81600f1f36fSAlex Elder 	return -ENOMEM;
817602adf40SYehuda Sadeh }
818602adf40SYehuda Sadeh 
8199e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
8209e15b77dSAlex Elder {
8219e15b77dSAlex Elder 	struct rbd_snap *snap;
8229e15b77dSAlex Elder 
8239e15b77dSAlex Elder 	if (snap_id == CEPH_NOSNAP)
8249e15b77dSAlex Elder 		return RBD_SNAP_HEAD_NAME;
8259e15b77dSAlex Elder 
8269e15b77dSAlex Elder 	list_for_each_entry(snap, &rbd_dev->snaps, node)
8279e15b77dSAlex Elder 		if (snap_id == snap->id)
8289e15b77dSAlex Elder 			return snap->name;
8299e15b77dSAlex Elder 
8309e15b77dSAlex Elder 	return NULL;
8319e15b77dSAlex Elder }
8329e15b77dSAlex Elder 
8338836b995SAlex Elder static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
834602adf40SYehuda Sadeh {
835602adf40SYehuda Sadeh 
836e86924a8SAlex Elder 	struct rbd_snap *snap;
83700f1f36fSAlex Elder 
838e86924a8SAlex Elder 	list_for_each_entry(snap, &rbd_dev->snaps, node) {
839e86924a8SAlex Elder 		if (!strcmp(snap_name, snap->name)) {
8400d7dbfceSAlex Elder 			rbd_dev->spec->snap_id = snap->id;
841e86924a8SAlex Elder 			rbd_dev->mapping.size = snap->size;
84234b13184SAlex Elder 			rbd_dev->mapping.features = snap->features;
84300f1f36fSAlex Elder 
844e86924a8SAlex Elder 			return 0;
845602adf40SYehuda Sadeh 		}
84600f1f36fSAlex Elder 	}
847e86924a8SAlex Elder 
84800f1f36fSAlex Elder 	return -ENOENT;
84900f1f36fSAlex Elder }
850602adf40SYehuda Sadeh 
851819d52bfSAlex Elder static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
852602adf40SYehuda Sadeh {
85378dc447dSAlex Elder 	int ret;
854602adf40SYehuda Sadeh 
8550d7dbfceSAlex Elder 	if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
856cc9d734cSJosh Durgin 		    sizeof (RBD_SNAP_HEAD_NAME))) {
8570d7dbfceSAlex Elder 		rbd_dev->spec->snap_id = CEPH_NOSNAP;
85899c1f08fSAlex Elder 		rbd_dev->mapping.size = rbd_dev->header.image_size;
85934b13184SAlex Elder 		rbd_dev->mapping.features = rbd_dev->header.features;
860e86924a8SAlex Elder 		ret = 0;
861602adf40SYehuda Sadeh 	} else {
8620d7dbfceSAlex Elder 		ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
863602adf40SYehuda Sadeh 		if (ret < 0)
864602adf40SYehuda Sadeh 			goto done;
865f84344f3SAlex Elder 		rbd_dev->mapping.read_only = true;
866602adf40SYehuda Sadeh 	}
8676d292906SAlex Elder 	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
8686d292906SAlex Elder 
869602adf40SYehuda Sadeh done:
870602adf40SYehuda Sadeh 	return ret;
871602adf40SYehuda Sadeh }
872602adf40SYehuda Sadeh 
873602adf40SYehuda Sadeh static void rbd_header_free(struct rbd_image_header *header)
874602adf40SYehuda Sadeh {
875849b4260SAlex Elder 	kfree(header->object_prefix);
876d78fd7aeSAlex Elder 	header->object_prefix = NULL;
877602adf40SYehuda Sadeh 	kfree(header->snap_sizes);
878d78fd7aeSAlex Elder 	header->snap_sizes = NULL;
879849b4260SAlex Elder 	kfree(header->snap_names);
880d78fd7aeSAlex Elder 	header->snap_names = NULL;
881d1d25646SJosh Durgin 	ceph_put_snap_context(header->snapc);
882d78fd7aeSAlex Elder 	header->snapc = NULL;
883602adf40SYehuda Sadeh }
884602adf40SYehuda Sadeh 
88598571b5aSAlex Elder static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
886602adf40SYehuda Sadeh {
88765ccfe21SAlex Elder 	char *name;
88865ccfe21SAlex Elder 	u64 segment;
88965ccfe21SAlex Elder 	int ret;
890602adf40SYehuda Sadeh 
8912fd82b9eSAlex Elder 	name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
89265ccfe21SAlex Elder 	if (!name)
89365ccfe21SAlex Elder 		return NULL;
89465ccfe21SAlex Elder 	segment = offset >> rbd_dev->header.obj_order;
8952fd82b9eSAlex Elder 	ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
89665ccfe21SAlex Elder 			rbd_dev->header.object_prefix, segment);
8972fd82b9eSAlex Elder 	if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
89865ccfe21SAlex Elder 		pr_err("error formatting segment name for #%llu (%d)\n",
89965ccfe21SAlex Elder 			segment, ret);
90065ccfe21SAlex Elder 		kfree(name);
90165ccfe21SAlex Elder 		name = NULL;
90265ccfe21SAlex Elder 	}
903602adf40SYehuda Sadeh 
90465ccfe21SAlex Elder 	return name;
90565ccfe21SAlex Elder }
906602adf40SYehuda Sadeh 
90765ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
90865ccfe21SAlex Elder {
90965ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
910602adf40SYehuda Sadeh 
91165ccfe21SAlex Elder 	return offset & (segment_size - 1);
91265ccfe21SAlex Elder }
91365ccfe21SAlex Elder 
91465ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev,
91565ccfe21SAlex Elder 				u64 offset, u64 length)
91665ccfe21SAlex Elder {
91765ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
91865ccfe21SAlex Elder 
91965ccfe21SAlex Elder 	offset &= segment_size - 1;
92065ccfe21SAlex Elder 
921aafb230eSAlex Elder 	rbd_assert(length <= U64_MAX - offset);
92265ccfe21SAlex Elder 	if (offset + length > segment_size)
92365ccfe21SAlex Elder 		length = segment_size - offset;
92465ccfe21SAlex Elder 
92565ccfe21SAlex Elder 	return length;
926602adf40SYehuda Sadeh }
927602adf40SYehuda Sadeh 
928602adf40SYehuda Sadeh /*
929029bcbd8SJosh Durgin  * returns the size of an object in the image
930029bcbd8SJosh Durgin  */
931029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header)
932029bcbd8SJosh Durgin {
933029bcbd8SJosh Durgin 	return 1 << header->obj_order;
934029bcbd8SJosh Durgin }
935029bcbd8SJosh Durgin 
936029bcbd8SJosh Durgin /*
937602adf40SYehuda Sadeh  * bio helpers
938602adf40SYehuda Sadeh  */
939602adf40SYehuda Sadeh 
940602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain)
941602adf40SYehuda Sadeh {
942602adf40SYehuda Sadeh 	struct bio *tmp;
943602adf40SYehuda Sadeh 
944602adf40SYehuda Sadeh 	while (chain) {
945602adf40SYehuda Sadeh 		tmp = chain;
946602adf40SYehuda Sadeh 		chain = chain->bi_next;
947602adf40SYehuda Sadeh 		bio_put(tmp);
948602adf40SYehuda Sadeh 	}
949602adf40SYehuda Sadeh }
950602adf40SYehuda Sadeh 
951602adf40SYehuda Sadeh /*
952602adf40SYehuda Sadeh  * zeros a bio chain, starting at specific offset
953602adf40SYehuda Sadeh  */
954602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs)
955602adf40SYehuda Sadeh {
956602adf40SYehuda Sadeh 	struct bio_vec *bv;
957602adf40SYehuda Sadeh 	unsigned long flags;
958602adf40SYehuda Sadeh 	void *buf;
959602adf40SYehuda Sadeh 	int i;
960602adf40SYehuda Sadeh 	int pos = 0;
961602adf40SYehuda Sadeh 
962602adf40SYehuda Sadeh 	while (chain) {
963602adf40SYehuda Sadeh 		bio_for_each_segment(bv, chain, i) {
964602adf40SYehuda Sadeh 			if (pos + bv->bv_len > start_ofs) {
965602adf40SYehuda Sadeh 				int remainder = max(start_ofs - pos, 0);
966602adf40SYehuda Sadeh 				buf = bvec_kmap_irq(bv, &flags);
967602adf40SYehuda Sadeh 				memset(buf + remainder, 0,
968602adf40SYehuda Sadeh 				       bv->bv_len - remainder);
96985b5aaa6SDan Carpenter 				bvec_kunmap_irq(buf, &flags);
970602adf40SYehuda Sadeh 			}
971602adf40SYehuda Sadeh 			pos += bv->bv_len;
972602adf40SYehuda Sadeh 		}
973602adf40SYehuda Sadeh 
974602adf40SYehuda Sadeh 		chain = chain->bi_next;
975602adf40SYehuda Sadeh 	}
976602adf40SYehuda Sadeh }
977602adf40SYehuda Sadeh 
978602adf40SYehuda Sadeh /*
979b9434c5bSAlex Elder  * similar to zero_bio_chain(), zeros data defined by a page array,
980b9434c5bSAlex Elder  * starting at the given byte offset from the start of the array and
981b9434c5bSAlex Elder  * continuing up to the given end offset.  The pages array is
982b9434c5bSAlex Elder  * assumed to be big enough to hold all bytes up to the end.
983b9434c5bSAlex Elder  */
984b9434c5bSAlex Elder static void zero_pages(struct page **pages, u64 offset, u64 end)
985b9434c5bSAlex Elder {
986b9434c5bSAlex Elder 	struct page **page = &pages[offset >> PAGE_SHIFT];
987b9434c5bSAlex Elder 
988b9434c5bSAlex Elder 	rbd_assert(end > offset);
989b9434c5bSAlex Elder 	rbd_assert(end - offset <= (u64)SIZE_MAX);
990b9434c5bSAlex Elder 	while (offset < end) {
991b9434c5bSAlex Elder 		size_t page_offset;
992b9434c5bSAlex Elder 		size_t length;
993b9434c5bSAlex Elder 		unsigned long flags;
994b9434c5bSAlex Elder 		void *kaddr;
995b9434c5bSAlex Elder 
996b9434c5bSAlex Elder 		page_offset = (size_t)(offset & ~PAGE_MASK);
997b9434c5bSAlex Elder 		length = min(PAGE_SIZE - page_offset, (size_t)(end - offset));
998b9434c5bSAlex Elder 		local_irq_save(flags);
999b9434c5bSAlex Elder 		kaddr = kmap_atomic(*page);
1000b9434c5bSAlex Elder 		memset(kaddr + page_offset, 0, length);
1001b9434c5bSAlex Elder 		kunmap_atomic(kaddr);
1002b9434c5bSAlex Elder 		local_irq_restore(flags);
1003b9434c5bSAlex Elder 
1004b9434c5bSAlex Elder 		offset += length;
1005b9434c5bSAlex Elder 		page++;
1006b9434c5bSAlex Elder 	}
1007b9434c5bSAlex Elder }
1008b9434c5bSAlex Elder 
1009b9434c5bSAlex Elder /*
1010f7760dadSAlex Elder  * Clone a portion of a bio, starting at the given byte offset
1011f7760dadSAlex Elder  * and continuing for the number of bytes indicated.
1012602adf40SYehuda Sadeh  */
1013f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src,
1014f7760dadSAlex Elder 					unsigned int offset,
1015f7760dadSAlex Elder 					unsigned int len,
1016f7760dadSAlex Elder 					gfp_t gfpmask)
1017602adf40SYehuda Sadeh {
1018f7760dadSAlex Elder 	struct bio_vec *bv;
1019f7760dadSAlex Elder 	unsigned int resid;
1020f7760dadSAlex Elder 	unsigned short idx;
1021f7760dadSAlex Elder 	unsigned int voff;
1022f7760dadSAlex Elder 	unsigned short end_idx;
1023f7760dadSAlex Elder 	unsigned short vcnt;
1024f7760dadSAlex Elder 	struct bio *bio;
1025602adf40SYehuda Sadeh 
1026f7760dadSAlex Elder 	/* Handle the easy case for the caller */
1027f7760dadSAlex Elder 
1028f7760dadSAlex Elder 	if (!offset && len == bio_src->bi_size)
1029f7760dadSAlex Elder 		return bio_clone(bio_src, gfpmask);
1030f7760dadSAlex Elder 
1031f7760dadSAlex Elder 	if (WARN_ON_ONCE(!len))
1032f7760dadSAlex Elder 		return NULL;
1033f7760dadSAlex Elder 	if (WARN_ON_ONCE(len > bio_src->bi_size))
1034f7760dadSAlex Elder 		return NULL;
1035f7760dadSAlex Elder 	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
1036f7760dadSAlex Elder 		return NULL;
1037f7760dadSAlex Elder 
1038f7760dadSAlex Elder 	/* Find first affected segment... */
1039f7760dadSAlex Elder 
1040f7760dadSAlex Elder 	resid = offset;
1041f7760dadSAlex Elder 	__bio_for_each_segment(bv, bio_src, idx, 0) {
1042f7760dadSAlex Elder 		if (resid < bv->bv_len)
1043f7760dadSAlex Elder 			break;
1044f7760dadSAlex Elder 		resid -= bv->bv_len;
1045602adf40SYehuda Sadeh 	}
1046f7760dadSAlex Elder 	voff = resid;
1047602adf40SYehuda Sadeh 
1048f7760dadSAlex Elder 	/* ...and the last affected segment */
1049542582fcSAlex Elder 
1050f7760dadSAlex Elder 	resid += len;
1051f7760dadSAlex Elder 	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
1052f7760dadSAlex Elder 		if (resid <= bv->bv_len)
1053f7760dadSAlex Elder 			break;
1054f7760dadSAlex Elder 		resid -= bv->bv_len;
1055f7760dadSAlex Elder 	}
1056f7760dadSAlex Elder 	vcnt = end_idx - idx + 1;
1057602adf40SYehuda Sadeh 
1058f7760dadSAlex Elder 	/* Build the clone */
1059f7760dadSAlex Elder 
1060f7760dadSAlex Elder 	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
1061f7760dadSAlex Elder 	if (!bio)
1062f7760dadSAlex Elder 		return NULL;	/* ENOMEM */
1063f7760dadSAlex Elder 
1064f7760dadSAlex Elder 	bio->bi_bdev = bio_src->bi_bdev;
1065f7760dadSAlex Elder 	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
1066f7760dadSAlex Elder 	bio->bi_rw = bio_src->bi_rw;
1067f7760dadSAlex Elder 	bio->bi_flags |= 1 << BIO_CLONED;
1068602adf40SYehuda Sadeh 
1069602adf40SYehuda Sadeh 	/*
1070f7760dadSAlex Elder 	 * Copy over our part of the bio_vec, then update the first
1071f7760dadSAlex Elder 	 * and last (or only) entries.
1072602adf40SYehuda Sadeh 	 */
1073f7760dadSAlex Elder 	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
1074f7760dadSAlex Elder 			vcnt * sizeof (struct bio_vec));
1075f7760dadSAlex Elder 	bio->bi_io_vec[0].bv_offset += voff;
1076f7760dadSAlex Elder 	if (vcnt > 1) {
1077f7760dadSAlex Elder 		bio->bi_io_vec[0].bv_len -= voff;
1078f7760dadSAlex Elder 		bio->bi_io_vec[vcnt - 1].bv_len = resid;
1079602adf40SYehuda Sadeh 	} else {
1080f7760dadSAlex Elder 		bio->bi_io_vec[0].bv_len = len;
1081602adf40SYehuda Sadeh 	}
1082602adf40SYehuda Sadeh 
1083f7760dadSAlex Elder 	bio->bi_vcnt = vcnt;
1084f7760dadSAlex Elder 	bio->bi_size = len;
1085f7760dadSAlex Elder 	bio->bi_idx = 0;
1086602adf40SYehuda Sadeh 
1087f7760dadSAlex Elder 	return bio;
1088602adf40SYehuda Sadeh }
1089602adf40SYehuda Sadeh 
1090f7760dadSAlex Elder /*
1091f7760dadSAlex Elder  * Clone a portion of a bio chain, starting at the given byte offset
1092f7760dadSAlex Elder  * into the first bio in the source chain and continuing for the
1093f7760dadSAlex Elder  * number of bytes indicated.  The result is another bio chain of
1094f7760dadSAlex Elder  * exactly the given length, or a null pointer on error.
1095f7760dadSAlex Elder  *
1096f7760dadSAlex Elder  * The bio_src and offset parameters are both in-out.  On entry they
1097f7760dadSAlex Elder  * refer to the first source bio and the offset into that bio where
1098f7760dadSAlex Elder  * the start of data to be cloned is located.
1099f7760dadSAlex Elder  *
1100f7760dadSAlex Elder  * On return, bio_src is updated to refer to the bio in the source
1101f7760dadSAlex Elder  * chain that contains first un-cloned byte, and *offset will
1102f7760dadSAlex Elder  * contain the offset of that byte within that bio.
1103f7760dadSAlex Elder  */
1104f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src,
1105f7760dadSAlex Elder 					unsigned int *offset,
1106f7760dadSAlex Elder 					unsigned int len,
1107f7760dadSAlex Elder 					gfp_t gfpmask)
1108f7760dadSAlex Elder {
1109f7760dadSAlex Elder 	struct bio *bi = *bio_src;
1110f7760dadSAlex Elder 	unsigned int off = *offset;
1111f7760dadSAlex Elder 	struct bio *chain = NULL;
1112f7760dadSAlex Elder 	struct bio **end;
1113602adf40SYehuda Sadeh 
1114f7760dadSAlex Elder 	/* Build up a chain of clone bios up to the limit */
1115602adf40SYehuda Sadeh 
1116f7760dadSAlex Elder 	if (!bi || off >= bi->bi_size || !len)
1117f7760dadSAlex Elder 		return NULL;		/* Nothing to clone */
1118602adf40SYehuda Sadeh 
1119f7760dadSAlex Elder 	end = &chain;
1120f7760dadSAlex Elder 	while (len) {
1121f7760dadSAlex Elder 		unsigned int bi_size;
1122f7760dadSAlex Elder 		struct bio *bio;
1123f7760dadSAlex Elder 
1124f5400b7aSAlex Elder 		if (!bi) {
1125f5400b7aSAlex Elder 			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
1126f7760dadSAlex Elder 			goto out_err;	/* EINVAL; ran out of bio's */
1127f5400b7aSAlex Elder 		}
1128f7760dadSAlex Elder 		bi_size = min_t(unsigned int, bi->bi_size - off, len);
1129f7760dadSAlex Elder 		bio = bio_clone_range(bi, off, bi_size, gfpmask);
1130f7760dadSAlex Elder 		if (!bio)
1131f7760dadSAlex Elder 			goto out_err;	/* ENOMEM */
1132f7760dadSAlex Elder 
1133f7760dadSAlex Elder 		*end = bio;
1134f7760dadSAlex Elder 		end = &bio->bi_next;
1135f7760dadSAlex Elder 
1136f7760dadSAlex Elder 		off += bi_size;
1137f7760dadSAlex Elder 		if (off == bi->bi_size) {
1138f7760dadSAlex Elder 			bi = bi->bi_next;
1139f7760dadSAlex Elder 			off = 0;
1140f7760dadSAlex Elder 		}
1141f7760dadSAlex Elder 		len -= bi_size;
1142f7760dadSAlex Elder 	}
1143f7760dadSAlex Elder 	*bio_src = bi;
1144f7760dadSAlex Elder 	*offset = off;
1145f7760dadSAlex Elder 
1146f7760dadSAlex Elder 	return chain;
1147f7760dadSAlex Elder out_err:
1148f7760dadSAlex Elder 	bio_chain_put(chain);
1149f7760dadSAlex Elder 
1150602adf40SYehuda Sadeh 	return NULL;
1151602adf40SYehuda Sadeh }
1152602adf40SYehuda Sadeh 
1153926f9b3fSAlex Elder /*
1154926f9b3fSAlex Elder  * The default/initial value for all object request flags is 0.  For
1155926f9b3fSAlex Elder  * each flag, once its value is set to 1 it is never reset to 0
1156926f9b3fSAlex Elder  * again.
1157926f9b3fSAlex Elder  */
11586365d33aSAlex Elder static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
11596365d33aSAlex Elder {
11606365d33aSAlex Elder 	if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
11616365d33aSAlex Elder 		struct rbd_device *rbd_dev;
11626365d33aSAlex Elder 
116357acbaa7SAlex Elder 		rbd_dev = obj_request->img_request->rbd_dev;
11646365d33aSAlex Elder 		rbd_warn(rbd_dev, "obj_request %p already marked img_data\n",
11656365d33aSAlex Elder 			obj_request);
11666365d33aSAlex Elder 	}
11676365d33aSAlex Elder }
11686365d33aSAlex Elder 
11696365d33aSAlex Elder static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
11706365d33aSAlex Elder {
11716365d33aSAlex Elder 	smp_mb();
11726365d33aSAlex Elder 	return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
11736365d33aSAlex Elder }
11746365d33aSAlex Elder 
117557acbaa7SAlex Elder static void obj_request_done_set(struct rbd_obj_request *obj_request)
117657acbaa7SAlex Elder {
117757acbaa7SAlex Elder 	if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
117857acbaa7SAlex Elder 		struct rbd_device *rbd_dev = NULL;
117957acbaa7SAlex Elder 
118057acbaa7SAlex Elder 		if (obj_request_img_data_test(obj_request))
118157acbaa7SAlex Elder 			rbd_dev = obj_request->img_request->rbd_dev;
118257acbaa7SAlex Elder 		rbd_warn(rbd_dev, "obj_request %p already marked done\n",
118357acbaa7SAlex Elder 			obj_request);
118457acbaa7SAlex Elder 	}
118557acbaa7SAlex Elder }
118657acbaa7SAlex Elder 
118757acbaa7SAlex Elder static bool obj_request_done_test(struct rbd_obj_request *obj_request)
118857acbaa7SAlex Elder {
118957acbaa7SAlex Elder 	smp_mb();
119057acbaa7SAlex Elder 	return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
119157acbaa7SAlex Elder }
119257acbaa7SAlex Elder 
11935679c59fSAlex Elder /*
11945679c59fSAlex Elder  * This sets the KNOWN flag after (possibly) setting the EXISTS
11955679c59fSAlex Elder  * flag.  The latter is set based on the "exists" value provided.
11965679c59fSAlex Elder  *
11975679c59fSAlex Elder  * Note that for our purposes once an object exists it never goes
11985679c59fSAlex Elder  * away again.  It's possible that the response from two existence
11995679c59fSAlex Elder  * checks are separated by the creation of the target object, and
12005679c59fSAlex Elder  * the first ("doesn't exist") response arrives *after* the second
12015679c59fSAlex Elder  * ("does exist").  In that case we ignore the second one.
12025679c59fSAlex Elder  */
12035679c59fSAlex Elder static void obj_request_existence_set(struct rbd_obj_request *obj_request,
12045679c59fSAlex Elder 				bool exists)
12055679c59fSAlex Elder {
12065679c59fSAlex Elder 	if (exists)
12075679c59fSAlex Elder 		set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
12085679c59fSAlex Elder 	set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
12095679c59fSAlex Elder 	smp_mb();
12105679c59fSAlex Elder }
12115679c59fSAlex Elder 
12125679c59fSAlex Elder static bool obj_request_known_test(struct rbd_obj_request *obj_request)
12135679c59fSAlex Elder {
12145679c59fSAlex Elder 	smp_mb();
12155679c59fSAlex Elder 	return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
12165679c59fSAlex Elder }
12175679c59fSAlex Elder 
12185679c59fSAlex Elder static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
12195679c59fSAlex Elder {
12205679c59fSAlex Elder 	smp_mb();
12215679c59fSAlex Elder 	return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
12225679c59fSAlex Elder }
12235679c59fSAlex Elder 
1224bf0d5f50SAlex Elder static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1225bf0d5f50SAlex Elder {
122637206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
122737206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1228bf0d5f50SAlex Elder 	kref_get(&obj_request->kref);
1229bf0d5f50SAlex Elder }
1230bf0d5f50SAlex Elder 
1231bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref);
1232bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1233bf0d5f50SAlex Elder {
1234bf0d5f50SAlex Elder 	rbd_assert(obj_request != NULL);
123537206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
123637206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1237bf0d5f50SAlex Elder 	kref_put(&obj_request->kref, rbd_obj_request_destroy);
1238bf0d5f50SAlex Elder }
1239bf0d5f50SAlex Elder 
1240bf0d5f50SAlex Elder static void rbd_img_request_get(struct rbd_img_request *img_request)
1241bf0d5f50SAlex Elder {
124237206ee5SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
124337206ee5SAlex Elder 		atomic_read(&img_request->kref.refcount));
1244bf0d5f50SAlex Elder 	kref_get(&img_request->kref);
1245bf0d5f50SAlex Elder }
1246bf0d5f50SAlex Elder 
1247bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref);
1248bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request)
1249bf0d5f50SAlex Elder {
1250bf0d5f50SAlex Elder 	rbd_assert(img_request != NULL);
125137206ee5SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
125237206ee5SAlex Elder 		atomic_read(&img_request->kref.refcount));
1253bf0d5f50SAlex Elder 	kref_put(&img_request->kref, rbd_img_request_destroy);
1254bf0d5f50SAlex Elder }
1255bf0d5f50SAlex Elder 
1256bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1257bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1258bf0d5f50SAlex Elder {
125925dcf954SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
126025dcf954SAlex Elder 
1261b155e86cSAlex Elder 	/* Image request now owns object's original reference */
1262bf0d5f50SAlex Elder 	obj_request->img_request = img_request;
126325dcf954SAlex Elder 	obj_request->which = img_request->obj_request_count;
12646365d33aSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
12656365d33aSAlex Elder 	obj_request_img_data_set(obj_request);
1266bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
126725dcf954SAlex Elder 	img_request->obj_request_count++;
126825dcf954SAlex Elder 	list_add_tail(&obj_request->links, &img_request->obj_requests);
126937206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
127037206ee5SAlex Elder 		obj_request->which);
1271bf0d5f50SAlex Elder }
1272bf0d5f50SAlex Elder 
1273bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1274bf0d5f50SAlex Elder 					struct rbd_obj_request *obj_request)
1275bf0d5f50SAlex Elder {
1276bf0d5f50SAlex Elder 	rbd_assert(obj_request->which != BAD_WHICH);
127725dcf954SAlex Elder 
127837206ee5SAlex Elder 	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
127937206ee5SAlex Elder 		obj_request->which);
1280bf0d5f50SAlex Elder 	list_del(&obj_request->links);
128125dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
128225dcf954SAlex Elder 	img_request->obj_request_count--;
128325dcf954SAlex Elder 	rbd_assert(obj_request->which == img_request->obj_request_count);
128425dcf954SAlex Elder 	obj_request->which = BAD_WHICH;
12856365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
1286bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == img_request);
1287bf0d5f50SAlex Elder 	obj_request->img_request = NULL;
128825dcf954SAlex Elder 	obj_request->callback = NULL;
1289bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
1290bf0d5f50SAlex Elder }
1291bf0d5f50SAlex Elder 
1292bf0d5f50SAlex Elder static bool obj_request_type_valid(enum obj_request_type type)
1293bf0d5f50SAlex Elder {
1294bf0d5f50SAlex Elder 	switch (type) {
12959969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
1296bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1297788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1298bf0d5f50SAlex Elder 		return true;
1299bf0d5f50SAlex Elder 	default:
1300bf0d5f50SAlex Elder 		return false;
1301bf0d5f50SAlex Elder 	}
1302bf0d5f50SAlex Elder }
1303bf0d5f50SAlex Elder 
1304bf0d5f50SAlex Elder static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1305bf0d5f50SAlex Elder 				struct rbd_obj_request *obj_request)
1306bf0d5f50SAlex Elder {
130737206ee5SAlex Elder 	dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);
130837206ee5SAlex Elder 
1309bf0d5f50SAlex Elder 	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1310bf0d5f50SAlex Elder }
1311bf0d5f50SAlex Elder 
1312bf0d5f50SAlex Elder static void rbd_img_request_complete(struct rbd_img_request *img_request)
1313bf0d5f50SAlex Elder {
131455f27e09SAlex Elder 
131537206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
131655f27e09SAlex Elder 
131755f27e09SAlex Elder 	/*
131855f27e09SAlex Elder 	 * If no error occurred, compute the aggregate transfer
131955f27e09SAlex Elder 	 * count for the image request.  We could instead use
132055f27e09SAlex Elder 	 * atomic64_cmpxchg() to update it as each object request
132155f27e09SAlex Elder 	 * completes; not clear which way is better off hand.
132255f27e09SAlex Elder 	 */
132355f27e09SAlex Elder 	if (!img_request->result) {
132455f27e09SAlex Elder 		struct rbd_obj_request *obj_request;
132555f27e09SAlex Elder 		u64 xferred = 0;
132655f27e09SAlex Elder 
132755f27e09SAlex Elder 		for_each_obj_request(img_request, obj_request)
132855f27e09SAlex Elder 			xferred += obj_request->xferred;
132955f27e09SAlex Elder 		img_request->xferred = xferred;
133055f27e09SAlex Elder 	}
133155f27e09SAlex Elder 
1332bf0d5f50SAlex Elder 	if (img_request->callback)
1333bf0d5f50SAlex Elder 		img_request->callback(img_request);
1334bf0d5f50SAlex Elder 	else
1335bf0d5f50SAlex Elder 		rbd_img_request_put(img_request);
1336bf0d5f50SAlex Elder }
1337bf0d5f50SAlex Elder 
1338788e2df3SAlex Elder /* Caller is responsible for rbd_obj_request_destroy(obj_request) */
1339788e2df3SAlex Elder 
1340788e2df3SAlex Elder static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
1341788e2df3SAlex Elder {
134237206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
134337206ee5SAlex Elder 
1344788e2df3SAlex Elder 	return wait_for_completion_interruptible(&obj_request->completion);
1345788e2df3SAlex Elder }
1346788e2df3SAlex Elder 
13470c425248SAlex Elder /*
13480c425248SAlex Elder  * The default/initial value for all image request flags is 0.  Each
13490c425248SAlex Elder  * is conditionally set to 1 at image request initialization time
13500c425248SAlex Elder  * and currently never change thereafter.
13510c425248SAlex Elder  */
13520c425248SAlex Elder static void img_request_write_set(struct rbd_img_request *img_request)
13530c425248SAlex Elder {
13540c425248SAlex Elder 	set_bit(IMG_REQ_WRITE, &img_request->flags);
13550c425248SAlex Elder 	smp_mb();
13560c425248SAlex Elder }
13570c425248SAlex Elder 
13580c425248SAlex Elder static bool img_request_write_test(struct rbd_img_request *img_request)
13590c425248SAlex Elder {
13600c425248SAlex Elder 	smp_mb();
13610c425248SAlex Elder 	return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
13620c425248SAlex Elder }
13630c425248SAlex Elder 
13649849e986SAlex Elder static void img_request_child_set(struct rbd_img_request *img_request)
13659849e986SAlex Elder {
13669849e986SAlex Elder 	set_bit(IMG_REQ_CHILD, &img_request->flags);
13679849e986SAlex Elder 	smp_mb();
13689849e986SAlex Elder }
13699849e986SAlex Elder 
13709849e986SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request)
13719849e986SAlex Elder {
13729849e986SAlex Elder 	smp_mb();
13739849e986SAlex Elder 	return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
13749849e986SAlex Elder }
13759849e986SAlex Elder 
1376d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request)
1377d0b2e944SAlex Elder {
1378d0b2e944SAlex Elder 	set_bit(IMG_REQ_LAYERED, &img_request->flags);
1379d0b2e944SAlex Elder 	smp_mb();
1380d0b2e944SAlex Elder }
1381d0b2e944SAlex Elder 
1382d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request)
1383d0b2e944SAlex Elder {
1384d0b2e944SAlex Elder 	smp_mb();
1385d0b2e944SAlex Elder 	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1386d0b2e944SAlex Elder }
1387d0b2e944SAlex Elder 
13886e2a4505SAlex Elder static void
13896e2a4505SAlex Elder rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
13906e2a4505SAlex Elder {
1391b9434c5bSAlex Elder 	u64 xferred = obj_request->xferred;
1392b9434c5bSAlex Elder 	u64 length = obj_request->length;
1393b9434c5bSAlex Elder 
13946e2a4505SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
13956e2a4505SAlex Elder 		obj_request, obj_request->img_request, obj_request->result,
1396b9434c5bSAlex Elder 		xferred, length);
13976e2a4505SAlex Elder 	/*
13986e2a4505SAlex Elder 	 * ENOENT means a hole in the image.  We zero-fill the
13996e2a4505SAlex Elder 	 * entire length of the request.  A short read also implies
14006e2a4505SAlex Elder 	 * zero-fill to the end of the request.  Either way we
14016e2a4505SAlex Elder 	 * update the xferred count to indicate the whole request
14026e2a4505SAlex Elder 	 * was satisfied.
14036e2a4505SAlex Elder 	 */
1404b9434c5bSAlex Elder 	rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
14056e2a4505SAlex Elder 	if (obj_request->result == -ENOENT) {
1406b9434c5bSAlex Elder 		if (obj_request->type == OBJ_REQUEST_BIO)
14076e2a4505SAlex Elder 			zero_bio_chain(obj_request->bio_list, 0);
1408b9434c5bSAlex Elder 		else
1409b9434c5bSAlex Elder 			zero_pages(obj_request->pages, 0, length);
14106e2a4505SAlex Elder 		obj_request->result = 0;
1411b9434c5bSAlex Elder 		obj_request->xferred = length;
1412b9434c5bSAlex Elder 	} else if (xferred < length && !obj_request->result) {
1413b9434c5bSAlex Elder 		if (obj_request->type == OBJ_REQUEST_BIO)
1414b9434c5bSAlex Elder 			zero_bio_chain(obj_request->bio_list, xferred);
1415b9434c5bSAlex Elder 		else
1416b9434c5bSAlex Elder 			zero_pages(obj_request->pages, xferred, length);
1417b9434c5bSAlex Elder 		obj_request->xferred = length;
14186e2a4505SAlex Elder 	}
14196e2a4505SAlex Elder 	obj_request_done_set(obj_request);
14206e2a4505SAlex Elder }
14216e2a4505SAlex Elder 
1422bf0d5f50SAlex Elder static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1423bf0d5f50SAlex Elder {
142437206ee5SAlex Elder 	dout("%s: obj %p cb %p\n", __func__, obj_request,
142537206ee5SAlex Elder 		obj_request->callback);
1426bf0d5f50SAlex Elder 	if (obj_request->callback)
1427bf0d5f50SAlex Elder 		obj_request->callback(obj_request);
1428788e2df3SAlex Elder 	else
1429788e2df3SAlex Elder 		complete_all(&obj_request->completion);
1430bf0d5f50SAlex Elder }
1431bf0d5f50SAlex Elder 
/*
 * Completion handler for osd ops that need no post-processing: the
 * object request is simply marked done.
 */
static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
143739bf2c5dSAlex Elder 
1438c47f9371SAlex Elder static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
1439bf0d5f50SAlex Elder {
144057acbaa7SAlex Elder 	struct rbd_img_request *img_request = NULL;
1441a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev = NULL;
144257acbaa7SAlex Elder 	bool layered = false;
144357acbaa7SAlex Elder 
144457acbaa7SAlex Elder 	if (obj_request_img_data_test(obj_request)) {
144557acbaa7SAlex Elder 		img_request = obj_request->img_request;
144657acbaa7SAlex Elder 		layered = img_request && img_request_layered_test(img_request);
1447a9e8ba2cSAlex Elder 		rbd_dev = img_request->rbd_dev;
144857acbaa7SAlex Elder 	}
14498b3e1a56SAlex Elder 
14508b3e1a56SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
14518b3e1a56SAlex Elder 		obj_request, img_request, obj_request->result,
14528b3e1a56SAlex Elder 		obj_request->xferred, obj_request->length);
1453a9e8ba2cSAlex Elder 	if (layered && obj_request->result == -ENOENT &&
1454a9e8ba2cSAlex Elder 			obj_request->img_offset < rbd_dev->parent_overlap)
14558b3e1a56SAlex Elder 		rbd_img_parent_read(obj_request);
14568b3e1a56SAlex Elder 	else if (img_request)
14576e2a4505SAlex Elder 		rbd_img_obj_request_read_callback(obj_request);
14586e2a4505SAlex Elder 	else
145907741308SAlex Elder 		obj_request_done_set(obj_request);
1460bf0d5f50SAlex Elder }
1461bf0d5f50SAlex Elder 
1462c47f9371SAlex Elder static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
1463bf0d5f50SAlex Elder {
14641b83bef2SSage Weil 	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
14651b83bef2SSage Weil 		obj_request->result, obj_request->length);
14661b83bef2SSage Weil 	/*
14678b3e1a56SAlex Elder 	 * There is no such thing as a successful short write.  Set
14688b3e1a56SAlex Elder 	 * it to our originally-requested length.
14691b83bef2SSage Weil 	 */
14701b83bef2SSage Weil 	obj_request->xferred = obj_request->length;
147107741308SAlex Elder 	obj_request_done_set(obj_request);
1472bf0d5f50SAlex Elder }
1473bf0d5f50SAlex Elder 
/*
 * Completion handler for an osd stat op.  A simple stat call needs
 * nothing beyond marking the object request done.  More will be done
 * when the stat is part of a write sequence for a layered image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
1483fbfab539SAlex Elder 
1484bf0d5f50SAlex Elder static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1485bf0d5f50SAlex Elder 				struct ceph_msg *msg)
1486bf0d5f50SAlex Elder {
1487bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = osd_req->r_priv;
1488bf0d5f50SAlex Elder 	u16 opcode;
1489bf0d5f50SAlex Elder 
149037206ee5SAlex Elder 	dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
1491bf0d5f50SAlex Elder 	rbd_assert(osd_req == obj_request->osd_req);
149257acbaa7SAlex Elder 	if (obj_request_img_data_test(obj_request)) {
149357acbaa7SAlex Elder 		rbd_assert(obj_request->img_request);
149457acbaa7SAlex Elder 		rbd_assert(obj_request->which != BAD_WHICH);
149557acbaa7SAlex Elder 	} else {
149657acbaa7SAlex Elder 		rbd_assert(obj_request->which == BAD_WHICH);
149757acbaa7SAlex Elder 	}
1498bf0d5f50SAlex Elder 
14991b83bef2SSage Weil 	if (osd_req->r_result < 0)
15001b83bef2SSage Weil 		obj_request->result = osd_req->r_result;
1501bf0d5f50SAlex Elder 	obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);
1502bf0d5f50SAlex Elder 
15030eefd470SAlex Elder 	BUG_ON(osd_req->r_num_ops > 2);
1504bf0d5f50SAlex Elder 
1505c47f9371SAlex Elder 	/*
1506c47f9371SAlex Elder 	 * We support a 64-bit length, but ultimately it has to be
1507c47f9371SAlex Elder 	 * passed to blk_end_request(), which takes an unsigned int.
1508c47f9371SAlex Elder 	 */
15091b83bef2SSage Weil 	obj_request->xferred = osd_req->r_reply_op_len[0];
1510c47f9371SAlex Elder 	rbd_assert(obj_request->xferred < (u64)UINT_MAX);
151179528734SAlex Elder 	opcode = osd_req->r_ops[0].op;
1512bf0d5f50SAlex Elder 	switch (opcode) {
1513bf0d5f50SAlex Elder 	case CEPH_OSD_OP_READ:
1514c47f9371SAlex Elder 		rbd_osd_read_callback(obj_request);
1515bf0d5f50SAlex Elder 		break;
1516bf0d5f50SAlex Elder 	case CEPH_OSD_OP_WRITE:
1517c47f9371SAlex Elder 		rbd_osd_write_callback(obj_request);
1518bf0d5f50SAlex Elder 		break;
1519fbfab539SAlex Elder 	case CEPH_OSD_OP_STAT:
1520c47f9371SAlex Elder 		rbd_osd_stat_callback(obj_request);
1521fbfab539SAlex Elder 		break;
152236be9a76SAlex Elder 	case CEPH_OSD_OP_CALL:
1523b8d70035SAlex Elder 	case CEPH_OSD_OP_NOTIFY_ACK:
15249969ebc5SAlex Elder 	case CEPH_OSD_OP_WATCH:
1525c47f9371SAlex Elder 		rbd_osd_trivial_callback(obj_request);
15269969ebc5SAlex Elder 		break;
1527bf0d5f50SAlex Elder 	default:
1528bf0d5f50SAlex Elder 		rbd_warn(NULL, "%s: unsupported op %hu\n",
1529bf0d5f50SAlex Elder 			obj_request->object_name, (unsigned short) opcode);
1530bf0d5f50SAlex Elder 		break;
1531bf0d5f50SAlex Elder 	}
1532bf0d5f50SAlex Elder 
153307741308SAlex Elder 	if (obj_request_done_test(obj_request))
1534bf0d5f50SAlex Elder 		rbd_obj_request_complete(obj_request);
1535bf0d5f50SAlex Elder }
1536bf0d5f50SAlex Elder 
15379d4df01fSAlex Elder static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
1538430c28c3SAlex Elder {
1539430c28c3SAlex Elder 	struct rbd_img_request *img_request = obj_request->img_request;
15408c042b0dSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
15419d4df01fSAlex Elder 	u64 snap_id;
1542430c28c3SAlex Elder 
15438c042b0dSAlex Elder 	rbd_assert(osd_req != NULL);
1544430c28c3SAlex Elder 
15459d4df01fSAlex Elder 	snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
15468c042b0dSAlex Elder 	ceph_osdc_build_request(osd_req, obj_request->offset,
15479d4df01fSAlex Elder 			NULL, snap_id, NULL);
15489d4df01fSAlex Elder }
15499d4df01fSAlex Elder 
15509d4df01fSAlex Elder static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
15519d4df01fSAlex Elder {
15529d4df01fSAlex Elder 	struct rbd_img_request *img_request = obj_request->img_request;
15539d4df01fSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
15549d4df01fSAlex Elder 	struct ceph_snap_context *snapc;
15559d4df01fSAlex Elder 	struct timespec mtime = CURRENT_TIME;
15569d4df01fSAlex Elder 
15579d4df01fSAlex Elder 	rbd_assert(osd_req != NULL);
15589d4df01fSAlex Elder 
15599d4df01fSAlex Elder 	snapc = img_request ? img_request->snapc : NULL;
15609d4df01fSAlex Elder 	ceph_osdc_build_request(osd_req, obj_request->offset,
15619d4df01fSAlex Elder 			snapc, CEPH_NOSNAP, &mtime);
1562430c28c3SAlex Elder }
1563430c28c3SAlex Elder 
1564bf0d5f50SAlex Elder static struct ceph_osd_request *rbd_osd_req_create(
1565bf0d5f50SAlex Elder 					struct rbd_device *rbd_dev,
1566bf0d5f50SAlex Elder 					bool write_request,
1567430c28c3SAlex Elder 					struct rbd_obj_request *obj_request)
1568bf0d5f50SAlex Elder {
1569bf0d5f50SAlex Elder 	struct ceph_snap_context *snapc = NULL;
1570bf0d5f50SAlex Elder 	struct ceph_osd_client *osdc;
1571bf0d5f50SAlex Elder 	struct ceph_osd_request *osd_req;
1572bf0d5f50SAlex Elder 
15736365d33aSAlex Elder 	if (obj_request_img_data_test(obj_request)) {
15746365d33aSAlex Elder 		struct rbd_img_request *img_request = obj_request->img_request;
15756365d33aSAlex Elder 
15760c425248SAlex Elder 		rbd_assert(write_request ==
15770c425248SAlex Elder 				img_request_write_test(img_request));
15780c425248SAlex Elder 		if (write_request)
1579bf0d5f50SAlex Elder 			snapc = img_request->snapc;
1580bf0d5f50SAlex Elder 	}
1581bf0d5f50SAlex Elder 
1582bf0d5f50SAlex Elder 	/* Allocate and initialize the request, for the single op */
1583bf0d5f50SAlex Elder 
1584bf0d5f50SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
1585bf0d5f50SAlex Elder 	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
1586bf0d5f50SAlex Elder 	if (!osd_req)
1587bf0d5f50SAlex Elder 		return NULL;	/* ENOMEM */
1588bf0d5f50SAlex Elder 
1589430c28c3SAlex Elder 	if (write_request)
1590bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1591430c28c3SAlex Elder 	else
1592bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_READ;
1593bf0d5f50SAlex Elder 
1594bf0d5f50SAlex Elder 	osd_req->r_callback = rbd_osd_req_callback;
1595bf0d5f50SAlex Elder 	osd_req->r_priv = obj_request;
1596bf0d5f50SAlex Elder 
1597bf0d5f50SAlex Elder 	osd_req->r_oid_len = strlen(obj_request->object_name);
1598bf0d5f50SAlex Elder 	rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1599bf0d5f50SAlex Elder 	memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1600bf0d5f50SAlex Elder 
1601bf0d5f50SAlex Elder 	osd_req->r_file_layout = rbd_dev->layout;	/* struct */
1602bf0d5f50SAlex Elder 
1603bf0d5f50SAlex Elder 	return osd_req;
1604bf0d5f50SAlex Elder }
1605bf0d5f50SAlex Elder 
16060eefd470SAlex Elder /*
16070eefd470SAlex Elder  * Create a copyup osd request based on the information in the
16080eefd470SAlex Elder  * object request supplied.  A copyup request has two osd ops,
16090eefd470SAlex Elder  * a copyup method call, and a "normal" write request.
16100eefd470SAlex Elder  */
16110eefd470SAlex Elder static struct ceph_osd_request *
16120eefd470SAlex Elder rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
16130eefd470SAlex Elder {
16140eefd470SAlex Elder 	struct rbd_img_request *img_request;
16150eefd470SAlex Elder 	struct ceph_snap_context *snapc;
16160eefd470SAlex Elder 	struct rbd_device *rbd_dev;
16170eefd470SAlex Elder 	struct ceph_osd_client *osdc;
16180eefd470SAlex Elder 	struct ceph_osd_request *osd_req;
16190eefd470SAlex Elder 
16200eefd470SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
16210eefd470SAlex Elder 	img_request = obj_request->img_request;
16220eefd470SAlex Elder 	rbd_assert(img_request);
16230eefd470SAlex Elder 	rbd_assert(img_request_write_test(img_request));
16240eefd470SAlex Elder 
16250eefd470SAlex Elder 	/* Allocate and initialize the request, for the two ops */
16260eefd470SAlex Elder 
16270eefd470SAlex Elder 	snapc = img_request->snapc;
16280eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
16290eefd470SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
16300eefd470SAlex Elder 	osd_req = ceph_osdc_alloc_request(osdc, snapc, 2, false, GFP_ATOMIC);
16310eefd470SAlex Elder 	if (!osd_req)
16320eefd470SAlex Elder 		return NULL;	/* ENOMEM */
16330eefd470SAlex Elder 
16340eefd470SAlex Elder 	osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
16350eefd470SAlex Elder 	osd_req->r_callback = rbd_osd_req_callback;
16360eefd470SAlex Elder 	osd_req->r_priv = obj_request;
16370eefd470SAlex Elder 
16380eefd470SAlex Elder 	osd_req->r_oid_len = strlen(obj_request->object_name);
16390eefd470SAlex Elder 	rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
16400eefd470SAlex Elder 	memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
16410eefd470SAlex Elder 
16420eefd470SAlex Elder 	osd_req->r_file_layout = rbd_dev->layout;	/* struct */
16430eefd470SAlex Elder 
16440eefd470SAlex Elder 	return osd_req;
16450eefd470SAlex Elder }
16460eefd470SAlex Elder 
16470eefd470SAlex Elder 
/* Release an osd request by calling ceph_osdc_put_request() on it. */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
1652bf0d5f50SAlex Elder 
1653bf0d5f50SAlex Elder /* object_name is assumed to be a non-null pointer and NUL-terminated */
1654bf0d5f50SAlex Elder 
1655bf0d5f50SAlex Elder static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1656bf0d5f50SAlex Elder 						u64 offset, u64 length,
1657bf0d5f50SAlex Elder 						enum obj_request_type type)
1658bf0d5f50SAlex Elder {
1659bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1660bf0d5f50SAlex Elder 	size_t size;
1661bf0d5f50SAlex Elder 	char *name;
1662bf0d5f50SAlex Elder 
1663bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(type));
1664bf0d5f50SAlex Elder 
1665bf0d5f50SAlex Elder 	size = strlen(object_name) + 1;
1666bf0d5f50SAlex Elder 	obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
1667bf0d5f50SAlex Elder 	if (!obj_request)
1668bf0d5f50SAlex Elder 		return NULL;
1669bf0d5f50SAlex Elder 
1670bf0d5f50SAlex Elder 	name = (char *)(obj_request + 1);
1671bf0d5f50SAlex Elder 	obj_request->object_name = memcpy(name, object_name, size);
1672bf0d5f50SAlex Elder 	obj_request->offset = offset;
1673bf0d5f50SAlex Elder 	obj_request->length = length;
1674926f9b3fSAlex Elder 	obj_request->flags = 0;
1675bf0d5f50SAlex Elder 	obj_request->which = BAD_WHICH;
1676bf0d5f50SAlex Elder 	obj_request->type = type;
1677bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&obj_request->links);
1678788e2df3SAlex Elder 	init_completion(&obj_request->completion);
1679bf0d5f50SAlex Elder 	kref_init(&obj_request->kref);
1680bf0d5f50SAlex Elder 
168137206ee5SAlex Elder 	dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
168237206ee5SAlex Elder 		offset, length, (int)type, obj_request);
168337206ee5SAlex Elder 
1684bf0d5f50SAlex Elder 	return obj_request;
1685bf0d5f50SAlex Elder }
1686bf0d5f50SAlex Elder 
1687bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref)
1688bf0d5f50SAlex Elder {
1689bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1690bf0d5f50SAlex Elder 
1691bf0d5f50SAlex Elder 	obj_request = container_of(kref, struct rbd_obj_request, kref);
1692bf0d5f50SAlex Elder 
169337206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
169437206ee5SAlex Elder 
1695bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
1696bf0d5f50SAlex Elder 	rbd_assert(obj_request->which == BAD_WHICH);
1697bf0d5f50SAlex Elder 
1698bf0d5f50SAlex Elder 	if (obj_request->osd_req)
1699bf0d5f50SAlex Elder 		rbd_osd_req_destroy(obj_request->osd_req);
1700bf0d5f50SAlex Elder 
1701bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
1702bf0d5f50SAlex Elder 	switch (obj_request->type) {
17039969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
17049969ebc5SAlex Elder 		break;		/* Nothing to do */
1705bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1706bf0d5f50SAlex Elder 		if (obj_request->bio_list)
1707bf0d5f50SAlex Elder 			bio_chain_put(obj_request->bio_list);
1708bf0d5f50SAlex Elder 		break;
1709788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1710788e2df3SAlex Elder 		if (obj_request->pages)
1711788e2df3SAlex Elder 			ceph_release_page_vector(obj_request->pages,
1712788e2df3SAlex Elder 						obj_request->page_count);
1713788e2df3SAlex Elder 		break;
1714bf0d5f50SAlex Elder 	}
1715bf0d5f50SAlex Elder 
1716bf0d5f50SAlex Elder 	kfree(obj_request);
1717bf0d5f50SAlex Elder }
1718bf0d5f50SAlex Elder 
1719bf0d5f50SAlex Elder /*
1720bf0d5f50SAlex Elder  * Caller is responsible for filling in the list of object requests
1721bf0d5f50SAlex Elder  * that comprises the image request, and the Linux request pointer
1722bf0d5f50SAlex Elder  * (if there is one).
1723bf0d5f50SAlex Elder  */
1724cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create(
1725cc344fa1SAlex Elder 					struct rbd_device *rbd_dev,
1726bf0d5f50SAlex Elder 					u64 offset, u64 length,
17279849e986SAlex Elder 					bool write_request,
17289849e986SAlex Elder 					bool child_request)
1729bf0d5f50SAlex Elder {
1730bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
1731bf0d5f50SAlex Elder 	struct ceph_snap_context *snapc = NULL;
1732bf0d5f50SAlex Elder 
1733bf0d5f50SAlex Elder 	img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
1734bf0d5f50SAlex Elder 	if (!img_request)
1735bf0d5f50SAlex Elder 		return NULL;
1736bf0d5f50SAlex Elder 
1737bf0d5f50SAlex Elder 	if (write_request) {
1738bf0d5f50SAlex Elder 		down_read(&rbd_dev->header_rwsem);
1739bf0d5f50SAlex Elder 		snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1740bf0d5f50SAlex Elder 		up_read(&rbd_dev->header_rwsem);
1741bf0d5f50SAlex Elder 		if (WARN_ON(!snapc)) {
1742bf0d5f50SAlex Elder 			kfree(img_request);
1743bf0d5f50SAlex Elder 			return NULL;	/* Shouldn't happen */
1744bf0d5f50SAlex Elder 		}
17450c425248SAlex Elder 
1746bf0d5f50SAlex Elder 	}
1747bf0d5f50SAlex Elder 
1748bf0d5f50SAlex Elder 	img_request->rq = NULL;
1749bf0d5f50SAlex Elder 	img_request->rbd_dev = rbd_dev;
1750bf0d5f50SAlex Elder 	img_request->offset = offset;
1751bf0d5f50SAlex Elder 	img_request->length = length;
17520c425248SAlex Elder 	img_request->flags = 0;
17530c425248SAlex Elder 	if (write_request) {
17540c425248SAlex Elder 		img_request_write_set(img_request);
1755bf0d5f50SAlex Elder 		img_request->snapc = snapc;
17560c425248SAlex Elder 	} else {
1757bf0d5f50SAlex Elder 		img_request->snap_id = rbd_dev->spec->snap_id;
17580c425248SAlex Elder 	}
17599849e986SAlex Elder 	if (child_request)
17609849e986SAlex Elder 		img_request_child_set(img_request);
1761d0b2e944SAlex Elder 	if (rbd_dev->parent_spec)
1762d0b2e944SAlex Elder 		img_request_layered_set(img_request);
1763bf0d5f50SAlex Elder 	spin_lock_init(&img_request->completion_lock);
1764bf0d5f50SAlex Elder 	img_request->next_completion = 0;
1765bf0d5f50SAlex Elder 	img_request->callback = NULL;
1766a5a337d4SAlex Elder 	img_request->result = 0;
1767bf0d5f50SAlex Elder 	img_request->obj_request_count = 0;
1768bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&img_request->obj_requests);
1769bf0d5f50SAlex Elder 	kref_init(&img_request->kref);
1770bf0d5f50SAlex Elder 
1771bf0d5f50SAlex Elder 	rbd_img_request_get(img_request);	/* Avoid a warning */
1772bf0d5f50SAlex Elder 	rbd_img_request_put(img_request);	/* TEMPORARY */
1773bf0d5f50SAlex Elder 
177437206ee5SAlex Elder 	dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
177537206ee5SAlex Elder 		write_request ? "write" : "read", offset, length,
177637206ee5SAlex Elder 		img_request);
177737206ee5SAlex Elder 
1778bf0d5f50SAlex Elder 	return img_request;
1779bf0d5f50SAlex Elder }
1780bf0d5f50SAlex Elder 
1781bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref)
1782bf0d5f50SAlex Elder {
1783bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
1784bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1785bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
1786bf0d5f50SAlex Elder 
1787bf0d5f50SAlex Elder 	img_request = container_of(kref, struct rbd_img_request, kref);
1788bf0d5f50SAlex Elder 
178937206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
179037206ee5SAlex Elder 
1791bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1792bf0d5f50SAlex Elder 		rbd_img_obj_request_del(img_request, obj_request);
179325dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count == 0);
1794bf0d5f50SAlex Elder 
17950c425248SAlex Elder 	if (img_request_write_test(img_request))
1796bf0d5f50SAlex Elder 		ceph_put_snap_context(img_request->snapc);
1797bf0d5f50SAlex Elder 
17988b3e1a56SAlex Elder 	if (img_request_child_test(img_request))
17998b3e1a56SAlex Elder 		rbd_obj_request_put(img_request->obj_request);
18008b3e1a56SAlex Elder 
1801bf0d5f50SAlex Elder 	kfree(img_request);
1802bf0d5f50SAlex Elder }
1803bf0d5f50SAlex Elder 
18041217857fSAlex Elder static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
18051217857fSAlex Elder {
18066365d33aSAlex Elder 	struct rbd_img_request *img_request;
18071217857fSAlex Elder 	unsigned int xferred;
18081217857fSAlex Elder 	int result;
18098b3e1a56SAlex Elder 	bool more;
18101217857fSAlex Elder 
18116365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
18126365d33aSAlex Elder 	img_request = obj_request->img_request;
18136365d33aSAlex Elder 
18141217857fSAlex Elder 	rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
18151217857fSAlex Elder 	xferred = (unsigned int)obj_request->xferred;
18161217857fSAlex Elder 	result = obj_request->result;
18171217857fSAlex Elder 	if (result) {
18181217857fSAlex Elder 		struct rbd_device *rbd_dev = img_request->rbd_dev;
18191217857fSAlex Elder 
18201217857fSAlex Elder 		rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n",
18211217857fSAlex Elder 			img_request_write_test(img_request) ? "write" : "read",
18221217857fSAlex Elder 			obj_request->length, obj_request->img_offset,
18231217857fSAlex Elder 			obj_request->offset);
18241217857fSAlex Elder 		rbd_warn(rbd_dev, "  result %d xferred %x\n",
18251217857fSAlex Elder 			result, xferred);
18261217857fSAlex Elder 		if (!img_request->result)
18271217857fSAlex Elder 			img_request->result = result;
18281217857fSAlex Elder 	}
18291217857fSAlex Elder 
1830f1a4739fSAlex Elder 	/* Image object requests don't own their page array */
1831f1a4739fSAlex Elder 
1832f1a4739fSAlex Elder 	if (obj_request->type == OBJ_REQUEST_PAGES) {
1833f1a4739fSAlex Elder 		obj_request->pages = NULL;
1834f1a4739fSAlex Elder 		obj_request->page_count = 0;
1835f1a4739fSAlex Elder 	}
1836f1a4739fSAlex Elder 
18378b3e1a56SAlex Elder 	if (img_request_child_test(img_request)) {
18388b3e1a56SAlex Elder 		rbd_assert(img_request->obj_request != NULL);
18398b3e1a56SAlex Elder 		more = obj_request->which < img_request->obj_request_count - 1;
18408b3e1a56SAlex Elder 	} else {
18418b3e1a56SAlex Elder 		rbd_assert(img_request->rq != NULL);
18428b3e1a56SAlex Elder 		more = blk_end_request(img_request->rq, result, xferred);
18438b3e1a56SAlex Elder 	}
18448b3e1a56SAlex Elder 
18458b3e1a56SAlex Elder 	return more;
18461217857fSAlex Elder }
18471217857fSAlex Elder 
18482169238dSAlex Elder static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
18492169238dSAlex Elder {
18502169238dSAlex Elder 	struct rbd_img_request *img_request;
18512169238dSAlex Elder 	u32 which = obj_request->which;
18522169238dSAlex Elder 	bool more = true;
18532169238dSAlex Elder 
18546365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
18552169238dSAlex Elder 	img_request = obj_request->img_request;
18562169238dSAlex Elder 
18572169238dSAlex Elder 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
18582169238dSAlex Elder 	rbd_assert(img_request != NULL);
18592169238dSAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
18602169238dSAlex Elder 	rbd_assert(which != BAD_WHICH);
18612169238dSAlex Elder 	rbd_assert(which < img_request->obj_request_count);
18622169238dSAlex Elder 	rbd_assert(which >= img_request->next_completion);
18632169238dSAlex Elder 
18642169238dSAlex Elder 	spin_lock_irq(&img_request->completion_lock);
18652169238dSAlex Elder 	if (which != img_request->next_completion)
18662169238dSAlex Elder 		goto out;
18672169238dSAlex Elder 
18682169238dSAlex Elder 	for_each_obj_request_from(img_request, obj_request) {
18692169238dSAlex Elder 		rbd_assert(more);
18702169238dSAlex Elder 		rbd_assert(which < img_request->obj_request_count);
18712169238dSAlex Elder 
18722169238dSAlex Elder 		if (!obj_request_done_test(obj_request))
18732169238dSAlex Elder 			break;
18741217857fSAlex Elder 		more = rbd_img_obj_end_request(obj_request);
18752169238dSAlex Elder 		which++;
18762169238dSAlex Elder 	}
18772169238dSAlex Elder 
18782169238dSAlex Elder 	rbd_assert(more ^ (which == img_request->obj_request_count));
18792169238dSAlex Elder 	img_request->next_completion = which;
18802169238dSAlex Elder out:
18812169238dSAlex Elder 	spin_unlock_irq(&img_request->completion_lock);
18822169238dSAlex Elder 
18832169238dSAlex Elder 	if (!more)
18842169238dSAlex Elder 		rbd_img_request_complete(img_request);
18852169238dSAlex Elder }
18862169238dSAlex Elder 
1887f1a4739fSAlex Elder /*
1888f1a4739fSAlex Elder  * Split up an image request into one or more object requests, each
1889f1a4739fSAlex Elder  * to a different object.  The "type" parameter indicates whether
1890f1a4739fSAlex Elder  * "data_desc" is the pointer to the head of a list of bio
1891f1a4739fSAlex Elder  * structures, or the base of a page array.  In either case this
1892f1a4739fSAlex Elder  * function assumes data_desc describes memory sufficient to hold
1893f1a4739fSAlex Elder  * all data described by the image request.
1894f1a4739fSAlex Elder  */
1895f1a4739fSAlex Elder static int rbd_img_request_fill(struct rbd_img_request *img_request,
1896f1a4739fSAlex Elder 					enum obj_request_type type,
1897f1a4739fSAlex Elder 					void *data_desc)
1898bf0d5f50SAlex Elder {
1899bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = img_request->rbd_dev;
1900bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = NULL;
1901bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
19020c425248SAlex Elder 	bool write_request = img_request_write_test(img_request);
1903f1a4739fSAlex Elder 	struct bio *bio_list;
1904f1a4739fSAlex Elder 	unsigned int bio_offset = 0;
1905f1a4739fSAlex Elder 	struct page **pages;
19067da22d29SAlex Elder 	u64 img_offset;
1907bf0d5f50SAlex Elder 	u64 resid;
1908bf0d5f50SAlex Elder 	u16 opcode;
1909bf0d5f50SAlex Elder 
1910f1a4739fSAlex Elder 	dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
1911f1a4739fSAlex Elder 		(int)type, data_desc);
191237206ee5SAlex Elder 
1913430c28c3SAlex Elder 	opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
19147da22d29SAlex Elder 	img_offset = img_request->offset;
1915bf0d5f50SAlex Elder 	resid = img_request->length;
19164dda41d3SAlex Elder 	rbd_assert(resid > 0);
1917f1a4739fSAlex Elder 
1918f1a4739fSAlex Elder 	if (type == OBJ_REQUEST_BIO) {
1919f1a4739fSAlex Elder 		bio_list = data_desc;
1920f1a4739fSAlex Elder 		rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT);
1921f1a4739fSAlex Elder 	} else {
1922f1a4739fSAlex Elder 		rbd_assert(type == OBJ_REQUEST_PAGES);
1923f1a4739fSAlex Elder 		pages = data_desc;
1924f1a4739fSAlex Elder 	}
1925f1a4739fSAlex Elder 
1926bf0d5f50SAlex Elder 	while (resid) {
19272fa12320SAlex Elder 		struct ceph_osd_request *osd_req;
1928bf0d5f50SAlex Elder 		const char *object_name;
1929bf0d5f50SAlex Elder 		u64 offset;
1930bf0d5f50SAlex Elder 		u64 length;
1931bf0d5f50SAlex Elder 
19327da22d29SAlex Elder 		object_name = rbd_segment_name(rbd_dev, img_offset);
1933bf0d5f50SAlex Elder 		if (!object_name)
1934bf0d5f50SAlex Elder 			goto out_unwind;
19357da22d29SAlex Elder 		offset = rbd_segment_offset(rbd_dev, img_offset);
19367da22d29SAlex Elder 		length = rbd_segment_length(rbd_dev, img_offset, resid);
1937bf0d5f50SAlex Elder 		obj_request = rbd_obj_request_create(object_name,
1938f1a4739fSAlex Elder 						offset, length, type);
1939bf0d5f50SAlex Elder 		kfree(object_name);	/* object request has its own copy */
1940bf0d5f50SAlex Elder 		if (!obj_request)
1941bf0d5f50SAlex Elder 			goto out_unwind;
1942bf0d5f50SAlex Elder 
1943f1a4739fSAlex Elder 		if (type == OBJ_REQUEST_BIO) {
1944f1a4739fSAlex Elder 			unsigned int clone_size;
1945f1a4739fSAlex Elder 
1946bf0d5f50SAlex Elder 			rbd_assert(length <= (u64)UINT_MAX);
1947bf0d5f50SAlex Elder 			clone_size = (unsigned int)length;
1948f1a4739fSAlex Elder 			obj_request->bio_list =
1949f1a4739fSAlex Elder 					bio_chain_clone_range(&bio_list,
1950f1a4739fSAlex Elder 								&bio_offset,
1951f1a4739fSAlex Elder 								clone_size,
1952bf0d5f50SAlex Elder 								GFP_ATOMIC);
1953bf0d5f50SAlex Elder 			if (!obj_request->bio_list)
1954bf0d5f50SAlex Elder 				goto out_partial;
1955f1a4739fSAlex Elder 		} else {
1956f1a4739fSAlex Elder 			unsigned int page_count;
1957f1a4739fSAlex Elder 
1958f1a4739fSAlex Elder 			obj_request->pages = pages;
1959f1a4739fSAlex Elder 			page_count = (u32)calc_pages_for(offset, length);
1960f1a4739fSAlex Elder 			obj_request->page_count = page_count;
1961f1a4739fSAlex Elder 			if ((offset + length) & ~PAGE_MASK)
1962f1a4739fSAlex Elder 				page_count--;	/* more on last page */
1963f1a4739fSAlex Elder 			pages += page_count;
1964f1a4739fSAlex Elder 		}
1965bf0d5f50SAlex Elder 
19662fa12320SAlex Elder 		osd_req = rbd_osd_req_create(rbd_dev, write_request,
19672fa12320SAlex Elder 						obj_request);
19682fa12320SAlex Elder 		if (!osd_req)
1969bf0d5f50SAlex Elder 			goto out_partial;
19702fa12320SAlex Elder 		obj_request->osd_req = osd_req;
19712169238dSAlex Elder 		obj_request->callback = rbd_img_obj_callback;
1972430c28c3SAlex Elder 
19732fa12320SAlex Elder 		osd_req_op_extent_init(osd_req, 0, opcode, offset, length,
19742fa12320SAlex Elder 						0, 0);
1975f1a4739fSAlex Elder 		if (type == OBJ_REQUEST_BIO)
1976406e2c9fSAlex Elder 			osd_req_op_extent_osd_data_bio(osd_req, 0,
1977f1a4739fSAlex Elder 					obj_request->bio_list, length);
1978f1a4739fSAlex Elder 		else
1979f1a4739fSAlex Elder 			osd_req_op_extent_osd_data_pages(osd_req, 0,
1980f1a4739fSAlex Elder 					obj_request->pages, length,
1981f1a4739fSAlex Elder 					offset & ~PAGE_MASK, false, false);
19829d4df01fSAlex Elder 
19839d4df01fSAlex Elder 		if (write_request)
19849d4df01fSAlex Elder 			rbd_osd_req_format_write(obj_request);
19859d4df01fSAlex Elder 		else
19869d4df01fSAlex Elder 			rbd_osd_req_format_read(obj_request);
1987430c28c3SAlex Elder 
19887da22d29SAlex Elder 		obj_request->img_offset = img_offset;
1989bf0d5f50SAlex Elder 		rbd_img_obj_request_add(img_request, obj_request);
1990bf0d5f50SAlex Elder 
19917da22d29SAlex Elder 		img_offset += length;
1992bf0d5f50SAlex Elder 		resid -= length;
1993bf0d5f50SAlex Elder 	}
1994bf0d5f50SAlex Elder 
1995bf0d5f50SAlex Elder 	return 0;
1996bf0d5f50SAlex Elder 
1997bf0d5f50SAlex Elder out_partial:
1998bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
1999bf0d5f50SAlex Elder out_unwind:
2000bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2001bf0d5f50SAlex Elder 		rbd_obj_request_put(obj_request);
2002bf0d5f50SAlex Elder 
2003bf0d5f50SAlex Elder 	return -ENOMEM;
2004bf0d5f50SAlex Elder }
2005bf0d5f50SAlex Elder 
20063d7efd18SAlex Elder static void
20070eefd470SAlex Elder rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request)
20080eefd470SAlex Elder {
20090eefd470SAlex Elder 	struct rbd_img_request *img_request;
20100eefd470SAlex Elder 	struct rbd_device *rbd_dev;
20110eefd470SAlex Elder 	u64 length;
20120eefd470SAlex Elder 	u32 page_count;
20130eefd470SAlex Elder 
20140eefd470SAlex Elder 	rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
20150eefd470SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
20160eefd470SAlex Elder 	img_request = obj_request->img_request;
20170eefd470SAlex Elder 	rbd_assert(img_request);
20180eefd470SAlex Elder 
20190eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
20200eefd470SAlex Elder 	rbd_assert(rbd_dev);
20210eefd470SAlex Elder 	length = (u64)1 << rbd_dev->header.obj_order;
20220eefd470SAlex Elder 	page_count = (u32)calc_pages_for(0, length);
20230eefd470SAlex Elder 
20240eefd470SAlex Elder 	rbd_assert(obj_request->copyup_pages);
20250eefd470SAlex Elder 	ceph_release_page_vector(obj_request->copyup_pages, page_count);
20260eefd470SAlex Elder 	obj_request->copyup_pages = NULL;
20270eefd470SAlex Elder 
20280eefd470SAlex Elder 	/*
20290eefd470SAlex Elder 	 * We want the transfer count to reflect the size of the
20300eefd470SAlex Elder 	 * original write request.  There is no such thing as a
20310eefd470SAlex Elder 	 * successful short write, so if the request was successful
20320eefd470SAlex Elder 	 * we can just set it to the originally-requested length.
20330eefd470SAlex Elder 	 */
20340eefd470SAlex Elder 	if (!obj_request->result)
20350eefd470SAlex Elder 		obj_request->xferred = obj_request->length;
20360eefd470SAlex Elder 
20370eefd470SAlex Elder 	/* Finish up with the normal image object callback */
20380eefd470SAlex Elder 
20390eefd470SAlex Elder 	rbd_img_obj_callback(obj_request);
20400eefd470SAlex Elder }
20410eefd470SAlex Elder 
20420eefd470SAlex Elder static void
20433d7efd18SAlex Elder rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
20443d7efd18SAlex Elder {
20453d7efd18SAlex Elder 	struct rbd_obj_request *orig_request;
20460eefd470SAlex Elder 	struct ceph_osd_request *osd_req;
20470eefd470SAlex Elder 	struct ceph_osd_client *osdc;
20480eefd470SAlex Elder 	struct rbd_device *rbd_dev;
20493d7efd18SAlex Elder 	struct page **pages;
20503d7efd18SAlex Elder 	int result;
20513d7efd18SAlex Elder 	u64 obj_size;
20523d7efd18SAlex Elder 	u64 xferred;
20533d7efd18SAlex Elder 
20543d7efd18SAlex Elder 	rbd_assert(img_request_child_test(img_request));
20553d7efd18SAlex Elder 
20563d7efd18SAlex Elder 	/* First get what we need from the image request */
20573d7efd18SAlex Elder 
20583d7efd18SAlex Elder 	pages = img_request->copyup_pages;
20593d7efd18SAlex Elder 	rbd_assert(pages != NULL);
20603d7efd18SAlex Elder 	img_request->copyup_pages = NULL;
20613d7efd18SAlex Elder 
20623d7efd18SAlex Elder 	orig_request = img_request->obj_request;
20633d7efd18SAlex Elder 	rbd_assert(orig_request != NULL);
20640eefd470SAlex Elder 	rbd_assert(orig_request->type == OBJ_REQUEST_BIO);
20653d7efd18SAlex Elder 	result = img_request->result;
20663d7efd18SAlex Elder 	obj_size = img_request->length;
20673d7efd18SAlex Elder 	xferred = img_request->xferred;
20683d7efd18SAlex Elder 
20690eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
20700eefd470SAlex Elder 	rbd_assert(rbd_dev);
20710eefd470SAlex Elder 	rbd_assert(obj_size == (u64)1 << rbd_dev->header.obj_order);
20720eefd470SAlex Elder 
20733d7efd18SAlex Elder 	rbd_img_request_put(img_request);
20743d7efd18SAlex Elder 
20750eefd470SAlex Elder 	if (result)
20760eefd470SAlex Elder 		goto out_err;
20773d7efd18SAlex Elder 
20780eefd470SAlex Elder 	/* Allocate the new copyup osd request for the original request */
20793d7efd18SAlex Elder 
20800eefd470SAlex Elder 	result = -ENOMEM;
20810eefd470SAlex Elder 	rbd_assert(!orig_request->osd_req);
20820eefd470SAlex Elder 	osd_req = rbd_osd_req_create_copyup(orig_request);
20830eefd470SAlex Elder 	if (!osd_req)
20840eefd470SAlex Elder 		goto out_err;
20850eefd470SAlex Elder 	orig_request->osd_req = osd_req;
20860eefd470SAlex Elder 	orig_request->copyup_pages = pages;
20873d7efd18SAlex Elder 
20880eefd470SAlex Elder 	/* Initialize the copyup op */
20890eefd470SAlex Elder 
20900eefd470SAlex Elder 	osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
20910eefd470SAlex Elder 	osd_req_op_cls_request_data_pages(osd_req, 0, pages, obj_size, 0,
20920eefd470SAlex Elder 						false, false);
20930eefd470SAlex Elder 
20940eefd470SAlex Elder 	/* Then the original write request op */
20950eefd470SAlex Elder 
20960eefd470SAlex Elder 	osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE,
20970eefd470SAlex Elder 					orig_request->offset,
20980eefd470SAlex Elder 					orig_request->length, 0, 0);
20990eefd470SAlex Elder 	osd_req_op_extent_osd_data_bio(osd_req, 1, orig_request->bio_list,
21000eefd470SAlex Elder 					orig_request->length);
21010eefd470SAlex Elder 
21020eefd470SAlex Elder 	rbd_osd_req_format_write(orig_request);
21030eefd470SAlex Elder 
21040eefd470SAlex Elder 	/* All set, send it off. */
21050eefd470SAlex Elder 
21060eefd470SAlex Elder 	orig_request->callback = rbd_img_obj_copyup_callback;
21070eefd470SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
21080eefd470SAlex Elder 	result = rbd_obj_request_submit(osdc, orig_request);
21090eefd470SAlex Elder 	if (!result)
21100eefd470SAlex Elder 		return;
21110eefd470SAlex Elder out_err:
21120eefd470SAlex Elder 	/* Record the error code and complete the request */
21130eefd470SAlex Elder 
21140eefd470SAlex Elder 	orig_request->result = result;
21150eefd470SAlex Elder 	orig_request->xferred = 0;
21163d7efd18SAlex Elder 	obj_request_done_set(orig_request);
21173d7efd18SAlex Elder 	rbd_obj_request_complete(orig_request);
21183d7efd18SAlex Elder }
21193d7efd18SAlex Elder 
21203d7efd18SAlex Elder /*
21213d7efd18SAlex Elder  * Read from the parent image the range of data that covers the
21223d7efd18SAlex Elder  * entire target of the given object request.  This is used for
21233d7efd18SAlex Elder  * satisfying a layered image write request when the target of an
21243d7efd18SAlex Elder  * object request from the image request does not exist.
21253d7efd18SAlex Elder  *
21263d7efd18SAlex Elder  * A page array big enough to hold the returned data is allocated
21273d7efd18SAlex Elder  * and supplied to rbd_img_request_fill() as the "data descriptor."
21283d7efd18SAlex Elder  * When the read completes, this page array will be transferred to
21293d7efd18SAlex Elder  * the original object request for the copyup operation.
21303d7efd18SAlex Elder  *
21313d7efd18SAlex Elder  * If an error occurs, record it as the result of the original
21323d7efd18SAlex Elder  * object request and mark it done so it gets completed.
21333d7efd18SAlex Elder  */
21343d7efd18SAlex Elder static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
21353d7efd18SAlex Elder {
21363d7efd18SAlex Elder 	struct rbd_img_request *img_request = NULL;
21373d7efd18SAlex Elder 	struct rbd_img_request *parent_request = NULL;
21383d7efd18SAlex Elder 	struct rbd_device *rbd_dev;
21393d7efd18SAlex Elder 	u64 img_offset;
21403d7efd18SAlex Elder 	u64 length;
21413d7efd18SAlex Elder 	struct page **pages = NULL;
21423d7efd18SAlex Elder 	u32 page_count;
21433d7efd18SAlex Elder 	int result;
21443d7efd18SAlex Elder 
21453d7efd18SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
21463d7efd18SAlex Elder 	rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
21473d7efd18SAlex Elder 
21483d7efd18SAlex Elder 	img_request = obj_request->img_request;
21493d7efd18SAlex Elder 	rbd_assert(img_request != NULL);
21503d7efd18SAlex Elder 	rbd_dev = img_request->rbd_dev;
21513d7efd18SAlex Elder 	rbd_assert(rbd_dev->parent != NULL);
21523d7efd18SAlex Elder 
21533d7efd18SAlex Elder 	/*
21540eefd470SAlex Elder 	 * First things first.  The original osd request is of no
21550eefd470SAlex Elder 	 * use to use any more, we'll need a new one that can hold
21560eefd470SAlex Elder 	 * the two ops in a copyup request.  We'll get that later,
21570eefd470SAlex Elder 	 * but for now we can release the old one.
21580eefd470SAlex Elder 	 */
21590eefd470SAlex Elder 	rbd_osd_req_destroy(obj_request->osd_req);
21600eefd470SAlex Elder 	obj_request->osd_req = NULL;
21610eefd470SAlex Elder 
21620eefd470SAlex Elder 	/*
21633d7efd18SAlex Elder 	 * Determine the byte range covered by the object in the
21643d7efd18SAlex Elder 	 * child image to which the original request was to be sent.
21653d7efd18SAlex Elder 	 */
21663d7efd18SAlex Elder 	img_offset = obj_request->img_offset - obj_request->offset;
21673d7efd18SAlex Elder 	length = (u64)1 << rbd_dev->header.obj_order;
21683d7efd18SAlex Elder 
21693d7efd18SAlex Elder 	/*
2170a9e8ba2cSAlex Elder 	 * There is no defined parent data beyond the parent
2171a9e8ba2cSAlex Elder 	 * overlap, so limit what we read at that boundary if
2172a9e8ba2cSAlex Elder 	 * necessary.
2173a9e8ba2cSAlex Elder 	 */
2174a9e8ba2cSAlex Elder 	if (img_offset + length > rbd_dev->parent_overlap) {
2175a9e8ba2cSAlex Elder 		rbd_assert(img_offset < rbd_dev->parent_overlap);
2176a9e8ba2cSAlex Elder 		length = rbd_dev->parent_overlap - img_offset;
2177a9e8ba2cSAlex Elder 	}
2178a9e8ba2cSAlex Elder 
2179a9e8ba2cSAlex Elder 	/*
21803d7efd18SAlex Elder 	 * Allocate a page array big enough to receive the data read
21813d7efd18SAlex Elder 	 * from the parent.
21823d7efd18SAlex Elder 	 */
21833d7efd18SAlex Elder 	page_count = (u32)calc_pages_for(0, length);
21843d7efd18SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
21853d7efd18SAlex Elder 	if (IS_ERR(pages)) {
21863d7efd18SAlex Elder 		result = PTR_ERR(pages);
21873d7efd18SAlex Elder 		pages = NULL;
21883d7efd18SAlex Elder 		goto out_err;
21893d7efd18SAlex Elder 	}
21903d7efd18SAlex Elder 
21913d7efd18SAlex Elder 	result = -ENOMEM;
21923d7efd18SAlex Elder 	parent_request = rbd_img_request_create(rbd_dev->parent,
21933d7efd18SAlex Elder 						img_offset, length,
21943d7efd18SAlex Elder 						false, true);
21953d7efd18SAlex Elder 	if (!parent_request)
21963d7efd18SAlex Elder 		goto out_err;
21973d7efd18SAlex Elder 	rbd_obj_request_get(obj_request);
21983d7efd18SAlex Elder 	parent_request->obj_request = obj_request;
21993d7efd18SAlex Elder 
22003d7efd18SAlex Elder 	result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
22013d7efd18SAlex Elder 	if (result)
22023d7efd18SAlex Elder 		goto out_err;
22033d7efd18SAlex Elder 	parent_request->copyup_pages = pages;
22043d7efd18SAlex Elder 
22053d7efd18SAlex Elder 	parent_request->callback = rbd_img_obj_parent_read_full_callback;
22063d7efd18SAlex Elder 	result = rbd_img_request_submit(parent_request);
22073d7efd18SAlex Elder 	if (!result)
22083d7efd18SAlex Elder 		return 0;
22093d7efd18SAlex Elder 
22103d7efd18SAlex Elder 	parent_request->copyup_pages = NULL;
22113d7efd18SAlex Elder 	parent_request->obj_request = NULL;
22123d7efd18SAlex Elder 	rbd_obj_request_put(obj_request);
22133d7efd18SAlex Elder out_err:
22143d7efd18SAlex Elder 	if (pages)
22153d7efd18SAlex Elder 		ceph_release_page_vector(pages, page_count);
22163d7efd18SAlex Elder 	if (parent_request)
22173d7efd18SAlex Elder 		rbd_img_request_put(parent_request);
22183d7efd18SAlex Elder 	obj_request->result = result;
22193d7efd18SAlex Elder 	obj_request->xferred = 0;
22203d7efd18SAlex Elder 	obj_request_done_set(obj_request);
22213d7efd18SAlex Elder 
22223d7efd18SAlex Elder 	return result;
22233d7efd18SAlex Elder }
22243d7efd18SAlex Elder 
2225c5b5ef6cSAlex Elder static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
2226c5b5ef6cSAlex Elder {
2227c5b5ef6cSAlex Elder 	struct rbd_obj_request *orig_request;
2228c5b5ef6cSAlex Elder 	int result;
2229c5b5ef6cSAlex Elder 
2230c5b5ef6cSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
2231c5b5ef6cSAlex Elder 
2232c5b5ef6cSAlex Elder 	/*
2233c5b5ef6cSAlex Elder 	 * All we need from the object request is the original
2234c5b5ef6cSAlex Elder 	 * request and the result of the STAT op.  Grab those, then
2235c5b5ef6cSAlex Elder 	 * we're done with the request.
2236c5b5ef6cSAlex Elder 	 */
2237c5b5ef6cSAlex Elder 	orig_request = obj_request->obj_request;
2238c5b5ef6cSAlex Elder 	obj_request->obj_request = NULL;
2239c5b5ef6cSAlex Elder 	rbd_assert(orig_request);
2240c5b5ef6cSAlex Elder 	rbd_assert(orig_request->img_request);
2241c5b5ef6cSAlex Elder 
2242c5b5ef6cSAlex Elder 	result = obj_request->result;
2243c5b5ef6cSAlex Elder 	obj_request->result = 0;
2244c5b5ef6cSAlex Elder 
2245c5b5ef6cSAlex Elder 	dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
2246c5b5ef6cSAlex Elder 		obj_request, orig_request, result,
2247c5b5ef6cSAlex Elder 		obj_request->xferred, obj_request->length);
2248c5b5ef6cSAlex Elder 	rbd_obj_request_put(obj_request);
2249c5b5ef6cSAlex Elder 
2250c5b5ef6cSAlex Elder 	rbd_assert(orig_request);
2251c5b5ef6cSAlex Elder 	rbd_assert(orig_request->img_request);
2252c5b5ef6cSAlex Elder 
2253c5b5ef6cSAlex Elder 	/*
2254c5b5ef6cSAlex Elder 	 * Our only purpose here is to determine whether the object
2255c5b5ef6cSAlex Elder 	 * exists, and we don't want to treat the non-existence as
2256c5b5ef6cSAlex Elder 	 * an error.  If something else comes back, transfer the
2257c5b5ef6cSAlex Elder 	 * error to the original request and complete it now.
2258c5b5ef6cSAlex Elder 	 */
2259c5b5ef6cSAlex Elder 	if (!result) {
2260c5b5ef6cSAlex Elder 		obj_request_existence_set(orig_request, true);
2261c5b5ef6cSAlex Elder 	} else if (result == -ENOENT) {
2262c5b5ef6cSAlex Elder 		obj_request_existence_set(orig_request, false);
2263c5b5ef6cSAlex Elder 	} else if (result) {
2264c5b5ef6cSAlex Elder 		orig_request->result = result;
22653d7efd18SAlex Elder 		goto out;
2266c5b5ef6cSAlex Elder 	}
2267c5b5ef6cSAlex Elder 
2268c5b5ef6cSAlex Elder 	/*
2269c5b5ef6cSAlex Elder 	 * Resubmit the original request now that we have recorded
2270c5b5ef6cSAlex Elder 	 * whether the target object exists.
2271c5b5ef6cSAlex Elder 	 */
2272b454e36dSAlex Elder 	orig_request->result = rbd_img_obj_request_submit(orig_request);
22733d7efd18SAlex Elder out:
2274c5b5ef6cSAlex Elder 	if (orig_request->result)
2275c5b5ef6cSAlex Elder 		rbd_obj_request_complete(orig_request);
2276c5b5ef6cSAlex Elder 	rbd_obj_request_put(orig_request);
2277c5b5ef6cSAlex Elder }
2278c5b5ef6cSAlex Elder 
2279c5b5ef6cSAlex Elder static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
2280c5b5ef6cSAlex Elder {
2281c5b5ef6cSAlex Elder 	struct rbd_obj_request *stat_request;
2282c5b5ef6cSAlex Elder 	struct rbd_device *rbd_dev;
2283c5b5ef6cSAlex Elder 	struct ceph_osd_client *osdc;
2284c5b5ef6cSAlex Elder 	struct page **pages = NULL;
2285c5b5ef6cSAlex Elder 	u32 page_count;
2286c5b5ef6cSAlex Elder 	size_t size;
2287c5b5ef6cSAlex Elder 	int ret;
2288c5b5ef6cSAlex Elder 
2289c5b5ef6cSAlex Elder 	/*
2290c5b5ef6cSAlex Elder 	 * The response data for a STAT call consists of:
2291c5b5ef6cSAlex Elder 	 *     le64 length;
2292c5b5ef6cSAlex Elder 	 *     struct {
2293c5b5ef6cSAlex Elder 	 *         le32 tv_sec;
2294c5b5ef6cSAlex Elder 	 *         le32 tv_nsec;
2295c5b5ef6cSAlex Elder 	 *     } mtime;
2296c5b5ef6cSAlex Elder 	 */
2297c5b5ef6cSAlex Elder 	size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
2298c5b5ef6cSAlex Elder 	page_count = (u32)calc_pages_for(0, size);
2299c5b5ef6cSAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2300c5b5ef6cSAlex Elder 	if (IS_ERR(pages))
2301c5b5ef6cSAlex Elder 		return PTR_ERR(pages);
2302c5b5ef6cSAlex Elder 
2303c5b5ef6cSAlex Elder 	ret = -ENOMEM;
2304c5b5ef6cSAlex Elder 	stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
2305c5b5ef6cSAlex Elder 							OBJ_REQUEST_PAGES);
2306c5b5ef6cSAlex Elder 	if (!stat_request)
2307c5b5ef6cSAlex Elder 		goto out;
2308c5b5ef6cSAlex Elder 
2309c5b5ef6cSAlex Elder 	rbd_obj_request_get(obj_request);
2310c5b5ef6cSAlex Elder 	stat_request->obj_request = obj_request;
2311c5b5ef6cSAlex Elder 	stat_request->pages = pages;
2312c5b5ef6cSAlex Elder 	stat_request->page_count = page_count;
2313c5b5ef6cSAlex Elder 
2314c5b5ef6cSAlex Elder 	rbd_assert(obj_request->img_request);
2315c5b5ef6cSAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
2316c5b5ef6cSAlex Elder 	stat_request->osd_req = rbd_osd_req_create(rbd_dev, false,
2317c5b5ef6cSAlex Elder 						stat_request);
2318c5b5ef6cSAlex Elder 	if (!stat_request->osd_req)
2319c5b5ef6cSAlex Elder 		goto out;
2320c5b5ef6cSAlex Elder 	stat_request->callback = rbd_img_obj_exists_callback;
2321c5b5ef6cSAlex Elder 
2322c5b5ef6cSAlex Elder 	osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT);
2323c5b5ef6cSAlex Elder 	osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
2324c5b5ef6cSAlex Elder 					false, false);
23259d4df01fSAlex Elder 	rbd_osd_req_format_read(stat_request);
2326c5b5ef6cSAlex Elder 
2327c5b5ef6cSAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
2328c5b5ef6cSAlex Elder 	ret = rbd_obj_request_submit(osdc, stat_request);
2329c5b5ef6cSAlex Elder out:
2330c5b5ef6cSAlex Elder 	if (ret)
2331c5b5ef6cSAlex Elder 		rbd_obj_request_put(obj_request);
2332c5b5ef6cSAlex Elder 
2333c5b5ef6cSAlex Elder 	return ret;
2334c5b5ef6cSAlex Elder }
2335c5b5ef6cSAlex Elder 
2336b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
2337b454e36dSAlex Elder {
2338b454e36dSAlex Elder 	struct rbd_img_request *img_request;
2339a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev;
23403d7efd18SAlex Elder 	bool known;
2341b454e36dSAlex Elder 
2342b454e36dSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
2343b454e36dSAlex Elder 
2344b454e36dSAlex Elder 	img_request = obj_request->img_request;
2345b454e36dSAlex Elder 	rbd_assert(img_request);
2346a9e8ba2cSAlex Elder 	rbd_dev = img_request->rbd_dev;
2347b454e36dSAlex Elder 
2348b454e36dSAlex Elder 	/*
2349a9e8ba2cSAlex Elder 	 * Only writes to layered images need special handling.
2350a9e8ba2cSAlex Elder 	 * Reads and non-layered writes are simple object requests.
2351a9e8ba2cSAlex Elder 	 * Layered writes that start beyond the end of the overlap
2352a9e8ba2cSAlex Elder 	 * with the parent have no parent data, so they too are
2353a9e8ba2cSAlex Elder 	 * simple object requests.  Finally, if the target object is
2354a9e8ba2cSAlex Elder 	 * known to already exist, its parent data has already been
2355a9e8ba2cSAlex Elder 	 * copied, so a write to the object can also be handled as a
2356a9e8ba2cSAlex Elder 	 * simple object request.
2357b454e36dSAlex Elder 	 */
2358b454e36dSAlex Elder 	if (!img_request_write_test(img_request) ||
2359b454e36dSAlex Elder 		!img_request_layered_test(img_request) ||
2360a9e8ba2cSAlex Elder 		rbd_dev->parent_overlap <= obj_request->img_offset ||
23613d7efd18SAlex Elder 		((known = obj_request_known_test(obj_request)) &&
23623d7efd18SAlex Elder 			obj_request_exists_test(obj_request))) {
2363b454e36dSAlex Elder 
2364b454e36dSAlex Elder 		struct rbd_device *rbd_dev;
2365b454e36dSAlex Elder 		struct ceph_osd_client *osdc;
2366b454e36dSAlex Elder 
2367b454e36dSAlex Elder 		rbd_dev = obj_request->img_request->rbd_dev;
2368b454e36dSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
2369b454e36dSAlex Elder 
2370b454e36dSAlex Elder 		return rbd_obj_request_submit(osdc, obj_request);
2371b454e36dSAlex Elder 	}
2372b454e36dSAlex Elder 
2373b454e36dSAlex Elder 	/*
23743d7efd18SAlex Elder 	 * It's a layered write.  The target object might exist but
23753d7efd18SAlex Elder 	 * we may not know that yet.  If we know it doesn't exist,
23763d7efd18SAlex Elder 	 * start by reading the data for the full target object from
23773d7efd18SAlex Elder 	 * the parent so we can use it for a copyup to the target.
2378b454e36dSAlex Elder 	 */
23793d7efd18SAlex Elder 	if (known)
23803d7efd18SAlex Elder 		return rbd_img_obj_parent_read_full(obj_request);
23813d7efd18SAlex Elder 
23823d7efd18SAlex Elder 	/* We don't know whether the target exists.  Go find out. */
2383b454e36dSAlex Elder 
2384b454e36dSAlex Elder 	return rbd_img_obj_exists_submit(obj_request);
2385b454e36dSAlex Elder }
2386b454e36dSAlex Elder 
2387bf0d5f50SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request)
2388bf0d5f50SAlex Elder {
2389bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
239046faeed4SAlex Elder 	struct rbd_obj_request *next_obj_request;
2391bf0d5f50SAlex Elder 
239237206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
239346faeed4SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
2394bf0d5f50SAlex Elder 		int ret;
2395bf0d5f50SAlex Elder 
2396b454e36dSAlex Elder 		ret = rbd_img_obj_request_submit(obj_request);
2397bf0d5f50SAlex Elder 		if (ret)
2398bf0d5f50SAlex Elder 			return ret;
2399bf0d5f50SAlex Elder 	}
2400bf0d5f50SAlex Elder 
2401bf0d5f50SAlex Elder 	return 0;
2402bf0d5f50SAlex Elder }
2403bf0d5f50SAlex Elder 
24048b3e1a56SAlex Elder static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
24058b3e1a56SAlex Elder {
24068b3e1a56SAlex Elder 	struct rbd_obj_request *obj_request;
2407a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev;
2408a9e8ba2cSAlex Elder 	u64 obj_end;
24098b3e1a56SAlex Elder 
24108b3e1a56SAlex Elder 	rbd_assert(img_request_child_test(img_request));
24118b3e1a56SAlex Elder 
24128b3e1a56SAlex Elder 	obj_request = img_request->obj_request;
2413a9e8ba2cSAlex Elder 	rbd_assert(obj_request);
2414a9e8ba2cSAlex Elder 	rbd_assert(obj_request->img_request);
24158b3e1a56SAlex Elder 
2416a9e8ba2cSAlex Elder 	obj_request->result = img_request->result;
2417a9e8ba2cSAlex Elder 	if (obj_request->result)
2418a9e8ba2cSAlex Elder 		goto out;
2419a9e8ba2cSAlex Elder 
2420a9e8ba2cSAlex Elder 	/*
2421a9e8ba2cSAlex Elder 	 * We need to zero anything beyond the parent overlap
2422a9e8ba2cSAlex Elder 	 * boundary.  Since rbd_img_obj_request_read_callback()
2423a9e8ba2cSAlex Elder 	 * will zero anything beyond the end of a short read, an
2424a9e8ba2cSAlex Elder 	 * easy way to do this is to pretend the data from the
2425a9e8ba2cSAlex Elder 	 * parent came up short--ending at the overlap boundary.
2426a9e8ba2cSAlex Elder 	 */
2427a9e8ba2cSAlex Elder 	rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
2428a9e8ba2cSAlex Elder 	obj_end = obj_request->img_offset + obj_request->length;
2429a9e8ba2cSAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
2430a9e8ba2cSAlex Elder 	if (obj_end > rbd_dev->parent_overlap) {
2431a9e8ba2cSAlex Elder 		u64 xferred = 0;
2432a9e8ba2cSAlex Elder 
2433a9e8ba2cSAlex Elder 		if (obj_request->img_offset < rbd_dev->parent_overlap)
2434a9e8ba2cSAlex Elder 			xferred = rbd_dev->parent_overlap -
2435a9e8ba2cSAlex Elder 					obj_request->img_offset;
2436a9e8ba2cSAlex Elder 
2437a9e8ba2cSAlex Elder 		obj_request->xferred = min(img_request->xferred, xferred);
2438a9e8ba2cSAlex Elder 	} else {
2439a9e8ba2cSAlex Elder 		obj_request->xferred = img_request->xferred;
2440a9e8ba2cSAlex Elder 	}
2441a9e8ba2cSAlex Elder out:
24428b3e1a56SAlex Elder 	rbd_img_obj_request_read_callback(obj_request);
24438b3e1a56SAlex Elder 	rbd_obj_request_complete(obj_request);
24448b3e1a56SAlex Elder }
24458b3e1a56SAlex Elder 
24468b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
24478b3e1a56SAlex Elder {
24488b3e1a56SAlex Elder 	struct rbd_device *rbd_dev;
24498b3e1a56SAlex Elder 	struct rbd_img_request *img_request;
24508b3e1a56SAlex Elder 	int result;
24518b3e1a56SAlex Elder 
24528b3e1a56SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
24538b3e1a56SAlex Elder 	rbd_assert(obj_request->img_request != NULL);
24548b3e1a56SAlex Elder 	rbd_assert(obj_request->result == (s32) -ENOENT);
24558b3e1a56SAlex Elder 	rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
24568b3e1a56SAlex Elder 
24578b3e1a56SAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
24588b3e1a56SAlex Elder 	rbd_assert(rbd_dev->parent != NULL);
24598b3e1a56SAlex Elder 	/* rbd_read_finish(obj_request, obj_request->length); */
24608b3e1a56SAlex Elder 	img_request = rbd_img_request_create(rbd_dev->parent,
24618b3e1a56SAlex Elder 						obj_request->img_offset,
24628b3e1a56SAlex Elder 						obj_request->length,
24638b3e1a56SAlex Elder 						false, true);
24648b3e1a56SAlex Elder 	result = -ENOMEM;
24658b3e1a56SAlex Elder 	if (!img_request)
24668b3e1a56SAlex Elder 		goto out_err;
24678b3e1a56SAlex Elder 
24688b3e1a56SAlex Elder 	rbd_obj_request_get(obj_request);
24698b3e1a56SAlex Elder 	img_request->obj_request = obj_request;
24708b3e1a56SAlex Elder 
2471f1a4739fSAlex Elder 	result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
2472f1a4739fSAlex Elder 					obj_request->bio_list);
24738b3e1a56SAlex Elder 	if (result)
24748b3e1a56SAlex Elder 		goto out_err;
24758b3e1a56SAlex Elder 
24768b3e1a56SAlex Elder 	img_request->callback = rbd_img_parent_read_callback;
24778b3e1a56SAlex Elder 	result = rbd_img_request_submit(img_request);
24788b3e1a56SAlex Elder 	if (result)
24798b3e1a56SAlex Elder 		goto out_err;
24808b3e1a56SAlex Elder 
24818b3e1a56SAlex Elder 	return;
24828b3e1a56SAlex Elder out_err:
24838b3e1a56SAlex Elder 	if (img_request)
24848b3e1a56SAlex Elder 		rbd_img_request_put(img_request);
24858b3e1a56SAlex Elder 	obj_request->result = result;
24868b3e1a56SAlex Elder 	obj_request->xferred = 0;
24878b3e1a56SAlex Elder 	obj_request_done_set(obj_request);
24888b3e1a56SAlex Elder }
24898b3e1a56SAlex Elder 
2490cf81b60eSAlex Elder static int rbd_obj_notify_ack(struct rbd_device *rbd_dev,
2491b8d70035SAlex Elder 				   u64 ver, u64 notify_id)
2492b8d70035SAlex Elder {
2493b8d70035SAlex Elder 	struct rbd_obj_request *obj_request;
24942169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2495b8d70035SAlex Elder 	int ret;
2496b8d70035SAlex Elder 
2497b8d70035SAlex Elder 	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
2498b8d70035SAlex Elder 							OBJ_REQUEST_NODATA);
2499b8d70035SAlex Elder 	if (!obj_request)
2500b8d70035SAlex Elder 		return -ENOMEM;
2501b8d70035SAlex Elder 
2502b8d70035SAlex Elder 	ret = -ENOMEM;
2503430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
2504b8d70035SAlex Elder 	if (!obj_request->osd_req)
2505b8d70035SAlex Elder 		goto out;
25062169238dSAlex Elder 	obj_request->callback = rbd_obj_request_put;
2507b8d70035SAlex Elder 
2508c99d2d4aSAlex Elder 	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
2509c99d2d4aSAlex Elder 					notify_id, ver, 0);
25109d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
2511430c28c3SAlex Elder 
2512b8d70035SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
2513b8d70035SAlex Elder out:
2514cf81b60eSAlex Elder 	if (ret)
2515b8d70035SAlex Elder 		rbd_obj_request_put(obj_request);
2516b8d70035SAlex Elder 
2517b8d70035SAlex Elder 	return ret;
2518b8d70035SAlex Elder }
2519b8d70035SAlex Elder 
2520b8d70035SAlex Elder static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
2521b8d70035SAlex Elder {
2522b8d70035SAlex Elder 	struct rbd_device *rbd_dev = (struct rbd_device *)data;
2523b8d70035SAlex Elder 	u64 hver;
2524b8d70035SAlex Elder 
2525b8d70035SAlex Elder 	if (!rbd_dev)
2526b8d70035SAlex Elder 		return;
2527b8d70035SAlex Elder 
252837206ee5SAlex Elder 	dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
2529b8d70035SAlex Elder 		rbd_dev->header_name, (unsigned long long) notify_id,
2530b8d70035SAlex Elder 		(unsigned int) opcode);
2531522a0cc0SAlex Elder 	(void)rbd_dev_refresh(rbd_dev, &hver);
2532b8d70035SAlex Elder 
2533cf81b60eSAlex Elder 	rbd_obj_notify_ack(rbd_dev, hver, notify_id);
2534b8d70035SAlex Elder }
2535b8d70035SAlex Elder 
25369969ebc5SAlex Elder /*
25379969ebc5SAlex Elder  * Request sync osd watch/unwatch.  The value of "start" determines
25389969ebc5SAlex Elder  * whether a watch request is being initiated or torn down.
25399969ebc5SAlex Elder  */
25409969ebc5SAlex Elder static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
25419969ebc5SAlex Elder {
25429969ebc5SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
25439969ebc5SAlex Elder 	struct rbd_obj_request *obj_request;
25449969ebc5SAlex Elder 	int ret;
25459969ebc5SAlex Elder 
25469969ebc5SAlex Elder 	rbd_assert(start ^ !!rbd_dev->watch_event);
25479969ebc5SAlex Elder 	rbd_assert(start ^ !!rbd_dev->watch_request);
25489969ebc5SAlex Elder 
25499969ebc5SAlex Elder 	if (start) {
25503c663bbdSAlex Elder 		ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
25519969ebc5SAlex Elder 						&rbd_dev->watch_event);
25529969ebc5SAlex Elder 		if (ret < 0)
25539969ebc5SAlex Elder 			return ret;
25548eb87565SAlex Elder 		rbd_assert(rbd_dev->watch_event != NULL);
25559969ebc5SAlex Elder 	}
25569969ebc5SAlex Elder 
25579969ebc5SAlex Elder 	ret = -ENOMEM;
25589969ebc5SAlex Elder 	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
25599969ebc5SAlex Elder 							OBJ_REQUEST_NODATA);
25609969ebc5SAlex Elder 	if (!obj_request)
25619969ebc5SAlex Elder 		goto out_cancel;
25629969ebc5SAlex Elder 
2563430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request);
2564430c28c3SAlex Elder 	if (!obj_request->osd_req)
2565430c28c3SAlex Elder 		goto out_cancel;
2566430c28c3SAlex Elder 
25678eb87565SAlex Elder 	if (start)
2568975241afSAlex Elder 		ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
25698eb87565SAlex Elder 	else
25706977c3f9SAlex Elder 		ceph_osdc_unregister_linger_request(osdc,
2571975241afSAlex Elder 					rbd_dev->watch_request->osd_req);
25722169238dSAlex Elder 
25732169238dSAlex Elder 	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
25742169238dSAlex Elder 				rbd_dev->watch_event->cookie,
25752169238dSAlex Elder 				rbd_dev->header.obj_version, start);
25769d4df01fSAlex Elder 	rbd_osd_req_format_write(obj_request);
25772169238dSAlex Elder 
25789969ebc5SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
25799969ebc5SAlex Elder 	if (ret)
25809969ebc5SAlex Elder 		goto out_cancel;
25819969ebc5SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
25829969ebc5SAlex Elder 	if (ret)
25839969ebc5SAlex Elder 		goto out_cancel;
25849969ebc5SAlex Elder 	ret = obj_request->result;
25859969ebc5SAlex Elder 	if (ret)
25869969ebc5SAlex Elder 		goto out_cancel;
25879969ebc5SAlex Elder 
25888eb87565SAlex Elder 	/*
25898eb87565SAlex Elder 	 * A watch request is set to linger, so the underlying osd
25908eb87565SAlex Elder 	 * request won't go away until we unregister it.  We retain
25918eb87565SAlex Elder 	 * a pointer to the object request during that time (in
25928eb87565SAlex Elder 	 * rbd_dev->watch_request), so we'll keep a reference to
25938eb87565SAlex Elder 	 * it.  We'll drop that reference (below) after we've
25948eb87565SAlex Elder 	 * unregistered it.
25958eb87565SAlex Elder 	 */
25968eb87565SAlex Elder 	if (start) {
25978eb87565SAlex Elder 		rbd_dev->watch_request = obj_request;
25988eb87565SAlex Elder 
25998eb87565SAlex Elder 		return 0;
26008eb87565SAlex Elder 	}
26018eb87565SAlex Elder 
26028eb87565SAlex Elder 	/* We have successfully torn down the watch request */
26038eb87565SAlex Elder 
26048eb87565SAlex Elder 	rbd_obj_request_put(rbd_dev->watch_request);
26058eb87565SAlex Elder 	rbd_dev->watch_request = NULL;
26069969ebc5SAlex Elder out_cancel:
26079969ebc5SAlex Elder 	/* Cancel the event if we're tearing down, or on error */
26089969ebc5SAlex Elder 	ceph_osdc_cancel_event(rbd_dev->watch_event);
26099969ebc5SAlex Elder 	rbd_dev->watch_event = NULL;
26109969ebc5SAlex Elder 	if (obj_request)
26119969ebc5SAlex Elder 		rbd_obj_request_put(obj_request);
26129969ebc5SAlex Elder 
26139969ebc5SAlex Elder 	return ret;
26149969ebc5SAlex Elder }
26159969ebc5SAlex Elder 
261636be9a76SAlex Elder /*
261736be9a76SAlex Elder  * Synchronous osd object method call
261836be9a76SAlex Elder  */
261936be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
262036be9a76SAlex Elder 			     const char *object_name,
262136be9a76SAlex Elder 			     const char *class_name,
262236be9a76SAlex Elder 			     const char *method_name,
26234157976bSAlex Elder 			     const void *outbound,
262436be9a76SAlex Elder 			     size_t outbound_size,
26254157976bSAlex Elder 			     void *inbound,
262636be9a76SAlex Elder 			     size_t inbound_size,
262736be9a76SAlex Elder 			     u64 *version)
262836be9a76SAlex Elder {
26292169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
263036be9a76SAlex Elder 	struct rbd_obj_request *obj_request;
263136be9a76SAlex Elder 	struct page **pages;
263236be9a76SAlex Elder 	u32 page_count;
263336be9a76SAlex Elder 	int ret;
263436be9a76SAlex Elder 
263536be9a76SAlex Elder 	/*
26366010a451SAlex Elder 	 * Method calls are ultimately read operations.  The result
26376010a451SAlex Elder 	 * should placed into the inbound buffer provided.  They
26386010a451SAlex Elder 	 * also supply outbound data--parameters for the object
26396010a451SAlex Elder 	 * method.  Currently if this is present it will be a
26406010a451SAlex Elder 	 * snapshot id.
264136be9a76SAlex Elder 	 */
264236be9a76SAlex Elder 	page_count = (u32)calc_pages_for(0, inbound_size);
264336be9a76SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
264436be9a76SAlex Elder 	if (IS_ERR(pages))
264536be9a76SAlex Elder 		return PTR_ERR(pages);
264636be9a76SAlex Elder 
264736be9a76SAlex Elder 	ret = -ENOMEM;
26486010a451SAlex Elder 	obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
264936be9a76SAlex Elder 							OBJ_REQUEST_PAGES);
265036be9a76SAlex Elder 	if (!obj_request)
265136be9a76SAlex Elder 		goto out;
265236be9a76SAlex Elder 
265336be9a76SAlex Elder 	obj_request->pages = pages;
265436be9a76SAlex Elder 	obj_request->page_count = page_count;
265536be9a76SAlex Elder 
2656430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
265736be9a76SAlex Elder 	if (!obj_request->osd_req)
265836be9a76SAlex Elder 		goto out;
265936be9a76SAlex Elder 
2660c99d2d4aSAlex Elder 	osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
266104017e29SAlex Elder 					class_name, method_name);
266204017e29SAlex Elder 	if (outbound_size) {
266304017e29SAlex Elder 		struct ceph_pagelist *pagelist;
266404017e29SAlex Elder 
266504017e29SAlex Elder 		pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
266604017e29SAlex Elder 		if (!pagelist)
266704017e29SAlex Elder 			goto out;
266804017e29SAlex Elder 
266904017e29SAlex Elder 		ceph_pagelist_init(pagelist);
267004017e29SAlex Elder 		ceph_pagelist_append(pagelist, outbound, outbound_size);
267104017e29SAlex Elder 		osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
267204017e29SAlex Elder 						pagelist);
267304017e29SAlex Elder 	}
2674a4ce40a9SAlex Elder 	osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
2675a4ce40a9SAlex Elder 					obj_request->pages, inbound_size,
267644cd188dSAlex Elder 					0, false, false);
26779d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
2678430c28c3SAlex Elder 
267936be9a76SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
268036be9a76SAlex Elder 	if (ret)
268136be9a76SAlex Elder 		goto out;
268236be9a76SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
268336be9a76SAlex Elder 	if (ret)
268436be9a76SAlex Elder 		goto out;
268536be9a76SAlex Elder 
268636be9a76SAlex Elder 	ret = obj_request->result;
268736be9a76SAlex Elder 	if (ret < 0)
268836be9a76SAlex Elder 		goto out;
268957385b51SAlex Elder 
269057385b51SAlex Elder 	rbd_assert(obj_request->xferred < (u64)INT_MAX);
269157385b51SAlex Elder 	ret = (int)obj_request->xferred;
2692903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
269336be9a76SAlex Elder 	if (version)
269436be9a76SAlex Elder 		*version = obj_request->version;
269536be9a76SAlex Elder out:
269636be9a76SAlex Elder 	if (obj_request)
269736be9a76SAlex Elder 		rbd_obj_request_put(obj_request);
269836be9a76SAlex Elder 	else
269936be9a76SAlex Elder 		ceph_release_page_vector(pages, page_count);
270036be9a76SAlex Elder 
270136be9a76SAlex Elder 	return ret;
270236be9a76SAlex Elder }
270336be9a76SAlex Elder 
2704bf0d5f50SAlex Elder static void rbd_request_fn(struct request_queue *q)
2705cc344fa1SAlex Elder 		__releases(q->queue_lock) __acquires(q->queue_lock)
2706bf0d5f50SAlex Elder {
2707bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = q->queuedata;
2708bf0d5f50SAlex Elder 	bool read_only = rbd_dev->mapping.read_only;
2709bf0d5f50SAlex Elder 	struct request *rq;
2710bf0d5f50SAlex Elder 	int result;
2711bf0d5f50SAlex Elder 
2712bf0d5f50SAlex Elder 	while ((rq = blk_fetch_request(q))) {
2713bf0d5f50SAlex Elder 		bool write_request = rq_data_dir(rq) == WRITE;
2714bf0d5f50SAlex Elder 		struct rbd_img_request *img_request;
2715bf0d5f50SAlex Elder 		u64 offset;
2716bf0d5f50SAlex Elder 		u64 length;
2717bf0d5f50SAlex Elder 
2718bf0d5f50SAlex Elder 		/* Ignore any non-FS requests that filter through. */
2719bf0d5f50SAlex Elder 
2720bf0d5f50SAlex Elder 		if (rq->cmd_type != REQ_TYPE_FS) {
27214dda41d3SAlex Elder 			dout("%s: non-fs request type %d\n", __func__,
27224dda41d3SAlex Elder 				(int) rq->cmd_type);
27234dda41d3SAlex Elder 			__blk_end_request_all(rq, 0);
27244dda41d3SAlex Elder 			continue;
27254dda41d3SAlex Elder 		}
27264dda41d3SAlex Elder 
27274dda41d3SAlex Elder 		/* Ignore/skip any zero-length requests */
27284dda41d3SAlex Elder 
27294dda41d3SAlex Elder 		offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
27304dda41d3SAlex Elder 		length = (u64) blk_rq_bytes(rq);
27314dda41d3SAlex Elder 
27324dda41d3SAlex Elder 		if (!length) {
27334dda41d3SAlex Elder 			dout("%s: zero-length request\n", __func__);
2734bf0d5f50SAlex Elder 			__blk_end_request_all(rq, 0);
2735bf0d5f50SAlex Elder 			continue;
2736bf0d5f50SAlex Elder 		}
2737bf0d5f50SAlex Elder 
2738bf0d5f50SAlex Elder 		spin_unlock_irq(q->queue_lock);
2739bf0d5f50SAlex Elder 
2740bf0d5f50SAlex Elder 		/* Disallow writes to a read-only device */
2741bf0d5f50SAlex Elder 
2742bf0d5f50SAlex Elder 		if (write_request) {
2743bf0d5f50SAlex Elder 			result = -EROFS;
2744bf0d5f50SAlex Elder 			if (read_only)
2745bf0d5f50SAlex Elder 				goto end_request;
2746bf0d5f50SAlex Elder 			rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
2747bf0d5f50SAlex Elder 		}
2748bf0d5f50SAlex Elder 
27496d292906SAlex Elder 		/*
27506d292906SAlex Elder 		 * Quit early if the mapped snapshot no longer
27516d292906SAlex Elder 		 * exists.  It's still possible the snapshot will
27526d292906SAlex Elder 		 * have disappeared by the time our request arrives
27536d292906SAlex Elder 		 * at the osd, but there's no sense in sending it if
27546d292906SAlex Elder 		 * we already know.
27556d292906SAlex Elder 		 */
27566d292906SAlex Elder 		if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
2757bf0d5f50SAlex Elder 			dout("request for non-existent snapshot");
2758bf0d5f50SAlex Elder 			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
2759bf0d5f50SAlex Elder 			result = -ENXIO;
2760bf0d5f50SAlex Elder 			goto end_request;
2761bf0d5f50SAlex Elder 		}
2762bf0d5f50SAlex Elder 
2763bf0d5f50SAlex Elder 		result = -EINVAL;
2764bf0d5f50SAlex Elder 		if (WARN_ON(offset && length > U64_MAX - offset + 1))
2765bf0d5f50SAlex Elder 			goto end_request;	/* Shouldn't happen */
2766bf0d5f50SAlex Elder 
2767bf0d5f50SAlex Elder 		result = -ENOMEM;
2768bf0d5f50SAlex Elder 		img_request = rbd_img_request_create(rbd_dev, offset, length,
27699849e986SAlex Elder 							write_request, false);
2770bf0d5f50SAlex Elder 		if (!img_request)
2771bf0d5f50SAlex Elder 			goto end_request;
2772bf0d5f50SAlex Elder 
2773bf0d5f50SAlex Elder 		img_request->rq = rq;
2774bf0d5f50SAlex Elder 
2775f1a4739fSAlex Elder 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
2776f1a4739fSAlex Elder 						rq->bio);
2777bf0d5f50SAlex Elder 		if (!result)
2778bf0d5f50SAlex Elder 			result = rbd_img_request_submit(img_request);
2779bf0d5f50SAlex Elder 		if (result)
2780bf0d5f50SAlex Elder 			rbd_img_request_put(img_request);
2781bf0d5f50SAlex Elder end_request:
2782bf0d5f50SAlex Elder 		spin_lock_irq(q->queue_lock);
2783bf0d5f50SAlex Elder 		if (result < 0) {
27847da22d29SAlex Elder 			rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
27857da22d29SAlex Elder 				write_request ? "write" : "read",
27867da22d29SAlex Elder 				length, offset, result);
27877da22d29SAlex Elder 
2788bf0d5f50SAlex Elder 			__blk_end_request_all(rq, result);
2789bf0d5f50SAlex Elder 		}
2790bf0d5f50SAlex Elder 	}
2791bf0d5f50SAlex Elder }
2792bf0d5f50SAlex Elder 
2793602adf40SYehuda Sadeh /*
2794602adf40SYehuda Sadeh  * a queue callback. Makes sure that we don't create a bio that spans across
2795602adf40SYehuda Sadeh  * multiple osd objects. One exception would be with a single page bios,
2796f7760dadSAlex Elder  * which we handle later at bio_chain_clone_range()
2797602adf40SYehuda Sadeh  */
2798602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
2799602adf40SYehuda Sadeh 			  struct bio_vec *bvec)
2800602adf40SYehuda Sadeh {
2801602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
2802e5cfeed2SAlex Elder 	sector_t sector_offset;
2803e5cfeed2SAlex Elder 	sector_t sectors_per_obj;
2804e5cfeed2SAlex Elder 	sector_t obj_sector_offset;
2805e5cfeed2SAlex Elder 	int ret;
2806602adf40SYehuda Sadeh 
2807e5cfeed2SAlex Elder 	/*
2808e5cfeed2SAlex Elder 	 * Find how far into its rbd object the partition-relative
2809e5cfeed2SAlex Elder 	 * bio start sector is to offset relative to the enclosing
2810e5cfeed2SAlex Elder 	 * device.
2811e5cfeed2SAlex Elder 	 */
2812e5cfeed2SAlex Elder 	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
2813e5cfeed2SAlex Elder 	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
2814e5cfeed2SAlex Elder 	obj_sector_offset = sector_offset & (sectors_per_obj - 1);
2815593a9e7bSAlex Elder 
2816e5cfeed2SAlex Elder 	/*
2817e5cfeed2SAlex Elder 	 * Compute the number of bytes from that offset to the end
2818e5cfeed2SAlex Elder 	 * of the object.  Account for what's already used by the bio.
2819e5cfeed2SAlex Elder 	 */
2820e5cfeed2SAlex Elder 	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
2821e5cfeed2SAlex Elder 	if (ret > bmd->bi_size)
2822e5cfeed2SAlex Elder 		ret -= bmd->bi_size;
2823e5cfeed2SAlex Elder 	else
2824e5cfeed2SAlex Elder 		ret = 0;
2825e5cfeed2SAlex Elder 
2826e5cfeed2SAlex Elder 	/*
2827e5cfeed2SAlex Elder 	 * Don't send back more than was asked for.  And if the bio
2828e5cfeed2SAlex Elder 	 * was empty, let the whole thing through because:  "Note
2829e5cfeed2SAlex Elder 	 * that a block device *must* allow a single page to be
2830e5cfeed2SAlex Elder 	 * added to an empty bio."
2831e5cfeed2SAlex Elder 	 */
2832e5cfeed2SAlex Elder 	rbd_assert(bvec->bv_len <= PAGE_SIZE);
2833e5cfeed2SAlex Elder 	if (ret > (int) bvec->bv_len || !bmd->bi_size)
2834e5cfeed2SAlex Elder 		ret = (int) bvec->bv_len;
2835e5cfeed2SAlex Elder 
2836e5cfeed2SAlex Elder 	return ret;
2837602adf40SYehuda Sadeh }
2838602adf40SYehuda Sadeh 
2839602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
2840602adf40SYehuda Sadeh {
2841602adf40SYehuda Sadeh 	struct gendisk *disk = rbd_dev->disk;
2842602adf40SYehuda Sadeh 
2843602adf40SYehuda Sadeh 	if (!disk)
2844602adf40SYehuda Sadeh 		return;
2845602adf40SYehuda Sadeh 
2846602adf40SYehuda Sadeh 	if (disk->flags & GENHD_FL_UP)
2847602adf40SYehuda Sadeh 		del_gendisk(disk);
2848602adf40SYehuda Sadeh 	if (disk->queue)
2849602adf40SYehuda Sadeh 		blk_cleanup_queue(disk->queue);
2850602adf40SYehuda Sadeh 	put_disk(disk);
2851602adf40SYehuda Sadeh }
2852602adf40SYehuda Sadeh 
2853788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
2854788e2df3SAlex Elder 				const char *object_name,
2855788e2df3SAlex Elder 				u64 offset, u64 length,
285680ef15bfSAlex Elder 				void *buf, u64 *version)
2857788e2df3SAlex Elder 
2858788e2df3SAlex Elder {
28592169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2860788e2df3SAlex Elder 	struct rbd_obj_request *obj_request;
2861788e2df3SAlex Elder 	struct page **pages = NULL;
2862788e2df3SAlex Elder 	u32 page_count;
28631ceae7efSAlex Elder 	size_t size;
2864788e2df3SAlex Elder 	int ret;
2865788e2df3SAlex Elder 
2866788e2df3SAlex Elder 	page_count = (u32) calc_pages_for(offset, length);
2867788e2df3SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2868788e2df3SAlex Elder 	if (IS_ERR(pages))
2869788e2df3SAlex Elder 		ret = PTR_ERR(pages);
2870788e2df3SAlex Elder 
2871788e2df3SAlex Elder 	ret = -ENOMEM;
2872788e2df3SAlex Elder 	obj_request = rbd_obj_request_create(object_name, offset, length,
2873788e2df3SAlex Elder 							OBJ_REQUEST_PAGES);
2874788e2df3SAlex Elder 	if (!obj_request)
2875788e2df3SAlex Elder 		goto out;
2876788e2df3SAlex Elder 
2877788e2df3SAlex Elder 	obj_request->pages = pages;
2878788e2df3SAlex Elder 	obj_request->page_count = page_count;
2879788e2df3SAlex Elder 
2880430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
2881788e2df3SAlex Elder 	if (!obj_request->osd_req)
2882788e2df3SAlex Elder 		goto out;
2883788e2df3SAlex Elder 
2884c99d2d4aSAlex Elder 	osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
2885c99d2d4aSAlex Elder 					offset, length, 0, 0);
2886406e2c9fSAlex Elder 	osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
2887a4ce40a9SAlex Elder 					obj_request->pages,
288844cd188dSAlex Elder 					obj_request->length,
288944cd188dSAlex Elder 					obj_request->offset & ~PAGE_MASK,
289044cd188dSAlex Elder 					false, false);
28919d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
2892430c28c3SAlex Elder 
2893788e2df3SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
2894788e2df3SAlex Elder 	if (ret)
2895788e2df3SAlex Elder 		goto out;
2896788e2df3SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
2897788e2df3SAlex Elder 	if (ret)
2898788e2df3SAlex Elder 		goto out;
2899788e2df3SAlex Elder 
2900788e2df3SAlex Elder 	ret = obj_request->result;
2901788e2df3SAlex Elder 	if (ret < 0)
2902788e2df3SAlex Elder 		goto out;
29031ceae7efSAlex Elder 
29041ceae7efSAlex Elder 	rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
29051ceae7efSAlex Elder 	size = (size_t) obj_request->xferred;
2906903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, buf, 0, size);
290723ed6e13SAlex Elder 	rbd_assert(size <= (size_t) INT_MAX);
290823ed6e13SAlex Elder 	ret = (int) size;
2909788e2df3SAlex Elder 	if (version)
2910788e2df3SAlex Elder 		*version = obj_request->version;
2911788e2df3SAlex Elder out:
2912788e2df3SAlex Elder 	if (obj_request)
2913788e2df3SAlex Elder 		rbd_obj_request_put(obj_request);
2914788e2df3SAlex Elder 	else
2915788e2df3SAlex Elder 		ceph_release_page_vector(pages, page_count);
2916788e2df3SAlex Elder 
2917788e2df3SAlex Elder 	return ret;
2918788e2df3SAlex Elder }
2919788e2df3SAlex Elder 
2920602adf40SYehuda Sadeh /*
29214156d998SAlex Elder  * Read the complete header for the given rbd device.
29224156d998SAlex Elder  *
29234156d998SAlex Elder  * Returns a pointer to a dynamically-allocated buffer containing
29244156d998SAlex Elder  * the complete and validated header.  Caller can pass the address
29254156d998SAlex Elder  * of a variable that will be filled in with the version of the
29264156d998SAlex Elder  * header object at the time it was read.
29274156d998SAlex Elder  *
29284156d998SAlex Elder  * Returns a pointer-coded errno if a failure occurs.
29294156d998SAlex Elder  */
29304156d998SAlex Elder static struct rbd_image_header_ondisk *
29314156d998SAlex Elder rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
29324156d998SAlex Elder {
29334156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk = NULL;
29344156d998SAlex Elder 	u32 snap_count = 0;
29354156d998SAlex Elder 	u64 names_size = 0;
29364156d998SAlex Elder 	u32 want_count;
29374156d998SAlex Elder 	int ret;
29384156d998SAlex Elder 
29394156d998SAlex Elder 	/*
29404156d998SAlex Elder 	 * The complete header will include an array of its 64-bit
29414156d998SAlex Elder 	 * snapshot ids, followed by the names of those snapshots as
29424156d998SAlex Elder 	 * a contiguous block of NUL-terminated strings.  Note that
29434156d998SAlex Elder 	 * the number of snapshots could change by the time we read
29444156d998SAlex Elder 	 * it in, in which case we re-read it.
29454156d998SAlex Elder 	 */
29464156d998SAlex Elder 	do {
29474156d998SAlex Elder 		size_t size;
29484156d998SAlex Elder 
29494156d998SAlex Elder 		kfree(ondisk);
29504156d998SAlex Elder 
29514156d998SAlex Elder 		size = sizeof (*ondisk);
29524156d998SAlex Elder 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
29534156d998SAlex Elder 		size += names_size;
29544156d998SAlex Elder 		ondisk = kmalloc(size, GFP_KERNEL);
29554156d998SAlex Elder 		if (!ondisk)
29564156d998SAlex Elder 			return ERR_PTR(-ENOMEM);
29574156d998SAlex Elder 
2958788e2df3SAlex Elder 		ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
295980ef15bfSAlex Elder 				       0, size, ondisk, version);
29604156d998SAlex Elder 		if (ret < 0)
29614156d998SAlex Elder 			goto out_err;
29624156d998SAlex Elder 		if (WARN_ON((size_t) ret < size)) {
29634156d998SAlex Elder 			ret = -ENXIO;
296406ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
296506ecc6cbSAlex Elder 				size, ret);
29664156d998SAlex Elder 			goto out_err;
29674156d998SAlex Elder 		}
29684156d998SAlex Elder 		if (!rbd_dev_ondisk_valid(ondisk)) {
29694156d998SAlex Elder 			ret = -ENXIO;
297006ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "invalid header");
29714156d998SAlex Elder 			goto out_err;
29724156d998SAlex Elder 		}
29734156d998SAlex Elder 
29744156d998SAlex Elder 		names_size = le64_to_cpu(ondisk->snap_names_len);
29754156d998SAlex Elder 		want_count = snap_count;
29764156d998SAlex Elder 		snap_count = le32_to_cpu(ondisk->snap_count);
29774156d998SAlex Elder 	} while (snap_count != want_count);
29784156d998SAlex Elder 
29794156d998SAlex Elder 	return ondisk;
29804156d998SAlex Elder 
29814156d998SAlex Elder out_err:
29824156d998SAlex Elder 	kfree(ondisk);
29834156d998SAlex Elder 
29844156d998SAlex Elder 	return ERR_PTR(ret);
29854156d998SAlex Elder }
29864156d998SAlex Elder 
29874156d998SAlex Elder /*
2988602adf40SYehuda Sadeh  * reload the ondisk the header
2989602adf40SYehuda Sadeh  */
2990602adf40SYehuda Sadeh static int rbd_read_header(struct rbd_device *rbd_dev,
2991602adf40SYehuda Sadeh 			   struct rbd_image_header *header)
2992602adf40SYehuda Sadeh {
29934156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk;
29944156d998SAlex Elder 	u64 ver = 0;
29954156d998SAlex Elder 	int ret;
2996602adf40SYehuda Sadeh 
29974156d998SAlex Elder 	ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
29984156d998SAlex Elder 	if (IS_ERR(ondisk))
29994156d998SAlex Elder 		return PTR_ERR(ondisk);
30004156d998SAlex Elder 	ret = rbd_header_from_disk(header, ondisk);
30014156d998SAlex Elder 	if (ret >= 0)
300259c2be1eSYehuda Sadeh 		header->obj_version = ver;
30034156d998SAlex Elder 	kfree(ondisk);
3004602adf40SYehuda Sadeh 
30054156d998SAlex Elder 	return ret;
3006602adf40SYehuda Sadeh }
3007602adf40SYehuda Sadeh 
300841f38c2bSAlex Elder static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
3009dfc5606dSYehuda Sadeh {
3010dfc5606dSYehuda Sadeh 	struct rbd_snap *snap;
3011a0593290SAlex Elder 	struct rbd_snap *next;
3012dfc5606dSYehuda Sadeh 
30136087b51bSAlex Elder 	list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node) {
30146087b51bSAlex Elder 		list_del(&snap->node);
30156087b51bSAlex Elder 		rbd_snap_destroy(snap);
30166087b51bSAlex Elder 	}
3017dfc5606dSYehuda Sadeh }
3018dfc5606dSYehuda Sadeh 
30199478554aSAlex Elder static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
30209478554aSAlex Elder {
30219478554aSAlex Elder 	sector_t size;
30229478554aSAlex Elder 
30230d7dbfceSAlex Elder 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
30249478554aSAlex Elder 		return;
30259478554aSAlex Elder 
30269478554aSAlex Elder 	size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
30279478554aSAlex Elder 	dout("setting size to %llu sectors", (unsigned long long) size);
30289478554aSAlex Elder 	rbd_dev->mapping.size = (u64) size;
30299478554aSAlex Elder 	set_capacity(rbd_dev->disk, size);
30309478554aSAlex Elder }
30319478554aSAlex Elder 
3032602adf40SYehuda Sadeh /*
3033602adf40SYehuda Sadeh  * only read the first part of the ondisk header, without the snaps info
3034602adf40SYehuda Sadeh  */
3035117973fbSAlex Elder static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
3036602adf40SYehuda Sadeh {
3037602adf40SYehuda Sadeh 	int ret;
3038602adf40SYehuda Sadeh 	struct rbd_image_header h;
3039602adf40SYehuda Sadeh 
3040602adf40SYehuda Sadeh 	ret = rbd_read_header(rbd_dev, &h);
3041602adf40SYehuda Sadeh 	if (ret < 0)
3042602adf40SYehuda Sadeh 		return ret;
3043602adf40SYehuda Sadeh 
3044a51aa0c0SJosh Durgin 	down_write(&rbd_dev->header_rwsem);
3045a51aa0c0SJosh Durgin 
30469478554aSAlex Elder 	/* Update image size, and check for resize of mapped image */
30479478554aSAlex Elder 	rbd_dev->header.image_size = h.image_size;
30489478554aSAlex Elder 	rbd_update_mapping_size(rbd_dev);
30499db4b3e3SSage Weil 
3050849b4260SAlex Elder 	/* rbd_dev->header.object_prefix shouldn't change */
3051602adf40SYehuda Sadeh 	kfree(rbd_dev->header.snap_sizes);
3052849b4260SAlex Elder 	kfree(rbd_dev->header.snap_names);
3053d1d25646SJosh Durgin 	/* osd requests may still refer to snapc */
3054d1d25646SJosh Durgin 	ceph_put_snap_context(rbd_dev->header.snapc);
3055602adf40SYehuda Sadeh 
3056b813623aSAlex Elder 	if (hver)
3057b813623aSAlex Elder 		*hver = h.obj_version;
3058a71b891bSJosh Durgin 	rbd_dev->header.obj_version = h.obj_version;
305993a24e08SJosh Durgin 	rbd_dev->header.image_size = h.image_size;
3060602adf40SYehuda Sadeh 	rbd_dev->header.snapc = h.snapc;
3061602adf40SYehuda Sadeh 	rbd_dev->header.snap_names = h.snap_names;
3062602adf40SYehuda Sadeh 	rbd_dev->header.snap_sizes = h.snap_sizes;
3063849b4260SAlex Elder 	/* Free the extra copy of the object prefix */
3064849b4260SAlex Elder 	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
3065849b4260SAlex Elder 	kfree(h.object_prefix);
3066849b4260SAlex Elder 
3067304f6808SAlex Elder 	ret = rbd_dev_snaps_update(rbd_dev);
3068dfc5606dSYehuda Sadeh 
3069c666601aSJosh Durgin 	up_write(&rbd_dev->header_rwsem);
3070602adf40SYehuda Sadeh 
3071dfc5606dSYehuda Sadeh 	return ret;
3072602adf40SYehuda Sadeh }
3073602adf40SYehuda Sadeh 
3074117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
30751fe5e993SAlex Elder {
30761fe5e993SAlex Elder 	int ret;
30771fe5e993SAlex Elder 
3078117973fbSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
30791fe5e993SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3080117973fbSAlex Elder 	if (rbd_dev->image_format == 1)
3081117973fbSAlex Elder 		ret = rbd_dev_v1_refresh(rbd_dev, hver);
3082117973fbSAlex Elder 	else
3083117973fbSAlex Elder 		ret = rbd_dev_v2_refresh(rbd_dev, hver);
30841fe5e993SAlex Elder 	mutex_unlock(&ctl_mutex);
3085d98df63eSLaurent Barbe 	revalidate_disk(rbd_dev->disk);
3086522a0cc0SAlex Elder 	if (ret)
3087522a0cc0SAlex Elder 		rbd_warn(rbd_dev, "got notification but failed to "
3088522a0cc0SAlex Elder 			   " update snaps: %d\n", ret);
30891fe5e993SAlex Elder 
30901fe5e993SAlex Elder 	return ret;
30911fe5e993SAlex Elder }
30921fe5e993SAlex Elder 
3093602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
3094602adf40SYehuda Sadeh {
3095602adf40SYehuda Sadeh 	struct gendisk *disk;
3096602adf40SYehuda Sadeh 	struct request_queue *q;
3097593a9e7bSAlex Elder 	u64 segment_size;
3098602adf40SYehuda Sadeh 
3099602adf40SYehuda Sadeh 	/* create gendisk info */
3100602adf40SYehuda Sadeh 	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
3101602adf40SYehuda Sadeh 	if (!disk)
31021fcdb8aaSAlex Elder 		return -ENOMEM;
3103602adf40SYehuda Sadeh 
3104f0f8cef5SAlex Elder 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
3105de71a297SAlex Elder 		 rbd_dev->dev_id);
3106602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
3107602adf40SYehuda Sadeh 	disk->first_minor = 0;
3108602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
3109602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
3110602adf40SYehuda Sadeh 
3111bf0d5f50SAlex Elder 	q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
3112602adf40SYehuda Sadeh 	if (!q)
3113602adf40SYehuda Sadeh 		goto out_disk;
3114029bcbd8SJosh Durgin 
3115593a9e7bSAlex Elder 	/* We use the default size, but let's be explicit about it. */
3116593a9e7bSAlex Elder 	blk_queue_physical_block_size(q, SECTOR_SIZE);
3117593a9e7bSAlex Elder 
3118029bcbd8SJosh Durgin 	/* set io sizes to object size */
3119593a9e7bSAlex Elder 	segment_size = rbd_obj_bytes(&rbd_dev->header);
3120593a9e7bSAlex Elder 	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
3121593a9e7bSAlex Elder 	blk_queue_max_segment_size(q, segment_size);
3122593a9e7bSAlex Elder 	blk_queue_io_min(q, segment_size);
3123593a9e7bSAlex Elder 	blk_queue_io_opt(q, segment_size);
3124029bcbd8SJosh Durgin 
3125602adf40SYehuda Sadeh 	blk_queue_merge_bvec(q, rbd_merge_bvec);
3126602adf40SYehuda Sadeh 	disk->queue = q;
3127602adf40SYehuda Sadeh 
3128602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
3129602adf40SYehuda Sadeh 
3130602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
3131602adf40SYehuda Sadeh 
313212f02944SAlex Elder 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
313312f02944SAlex Elder 
3134602adf40SYehuda Sadeh 	return 0;
3135602adf40SYehuda Sadeh out_disk:
3136602adf40SYehuda Sadeh 	put_disk(disk);
31371fcdb8aaSAlex Elder 
31381fcdb8aaSAlex Elder 	return -ENOMEM;
3139602adf40SYehuda Sadeh }
3140602adf40SYehuda Sadeh 
3141dfc5606dSYehuda Sadeh /*
3142dfc5606dSYehuda Sadeh   sysfs
3143dfc5606dSYehuda Sadeh */
3144602adf40SYehuda Sadeh 
3145593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
3146593a9e7bSAlex Elder {
3147593a9e7bSAlex Elder 	return container_of(dev, struct rbd_device, dev);
3148593a9e7bSAlex Elder }
3149593a9e7bSAlex Elder 
3150dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
3151dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3152602adf40SYehuda Sadeh {
3153593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3154a51aa0c0SJosh Durgin 	sector_t size;
3155dfc5606dSYehuda Sadeh 
3156a51aa0c0SJosh Durgin 	down_read(&rbd_dev->header_rwsem);
3157a51aa0c0SJosh Durgin 	size = get_capacity(rbd_dev->disk);
3158a51aa0c0SJosh Durgin 	up_read(&rbd_dev->header_rwsem);
3159a51aa0c0SJosh Durgin 
3160a51aa0c0SJosh Durgin 	return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
3161602adf40SYehuda Sadeh }
3162602adf40SYehuda Sadeh 
316334b13184SAlex Elder /*
316434b13184SAlex Elder  * Note this shows the features for whatever's mapped, which is not
316534b13184SAlex Elder  * necessarily the base image.
316634b13184SAlex Elder  */
316734b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev,
316834b13184SAlex Elder 			     struct device_attribute *attr, char *buf)
316934b13184SAlex Elder {
317034b13184SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
317134b13184SAlex Elder 
317234b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
317334b13184SAlex Elder 			(unsigned long long) rbd_dev->mapping.features);
317434b13184SAlex Elder }
317534b13184SAlex Elder 
3176dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
3177dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
3178602adf40SYehuda Sadeh {
3179593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3180dfc5606dSYehuda Sadeh 
3181dfc5606dSYehuda Sadeh 	return sprintf(buf, "%d\n", rbd_dev->major);
3182dfc5606dSYehuda Sadeh }
3183dfc5606dSYehuda Sadeh 
3184dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
3185dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
3186dfc5606dSYehuda Sadeh {
3187593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3188dfc5606dSYehuda Sadeh 
31891dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
31901dbb4399SAlex Elder 			ceph_client_id(rbd_dev->rbd_client->client));
3191dfc5606dSYehuda Sadeh }
3192dfc5606dSYehuda Sadeh 
3193dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
3194dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3195dfc5606dSYehuda Sadeh {
3196593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3197dfc5606dSYehuda Sadeh 
31980d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
3199dfc5606dSYehuda Sadeh }
3200dfc5606dSYehuda Sadeh 
32019bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
32029bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
32039bb2f334SAlex Elder {
32049bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
32059bb2f334SAlex Elder 
32060d7dbfceSAlex Elder 	return sprintf(buf, "%llu\n",
32070d7dbfceSAlex Elder 		(unsigned long long) rbd_dev->spec->pool_id);
32089bb2f334SAlex Elder }
32099bb2f334SAlex Elder 
3210dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
3211dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3212dfc5606dSYehuda Sadeh {
3213593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3214dfc5606dSYehuda Sadeh 
3215a92ffdf8SAlex Elder 	if (rbd_dev->spec->image_name)
32160d7dbfceSAlex Elder 		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
3217a92ffdf8SAlex Elder 
3218a92ffdf8SAlex Elder 	return sprintf(buf, "(unknown)\n");
3219dfc5606dSYehuda Sadeh }
3220dfc5606dSYehuda Sadeh 
3221589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev,
3222589d30e0SAlex Elder 			     struct device_attribute *attr, char *buf)
3223589d30e0SAlex Elder {
3224589d30e0SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3225589d30e0SAlex Elder 
32260d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
3227589d30e0SAlex Elder }
3228589d30e0SAlex Elder 
322934b13184SAlex Elder /*
323034b13184SAlex Elder  * Shows the name of the currently-mapped snapshot (or
323134b13184SAlex Elder  * RBD_SNAP_HEAD_NAME for the base image).
323234b13184SAlex Elder  */
3233dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
3234dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
3235dfc5606dSYehuda Sadeh 			     char *buf)
3236dfc5606dSYehuda Sadeh {
3237593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3238dfc5606dSYehuda Sadeh 
32390d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
3240dfc5606dSYehuda Sadeh }
3241dfc5606dSYehuda Sadeh 
324286b00e0dSAlex Elder /*
324386b00e0dSAlex Elder  * For an rbd v2 image, shows the pool id, image id, and snapshot id
324486b00e0dSAlex Elder  * for the parent image.  If there is no parent, simply shows
324586b00e0dSAlex Elder  * "(no parent image)".
324686b00e0dSAlex Elder  */
324786b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev,
324886b00e0dSAlex Elder 			     struct device_attribute *attr,
324986b00e0dSAlex Elder 			     char *buf)
325086b00e0dSAlex Elder {
325186b00e0dSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
325286b00e0dSAlex Elder 	struct rbd_spec *spec = rbd_dev->parent_spec;
325386b00e0dSAlex Elder 	int count;
325486b00e0dSAlex Elder 	char *bufp = buf;
325586b00e0dSAlex Elder 
325686b00e0dSAlex Elder 	if (!spec)
325786b00e0dSAlex Elder 		return sprintf(buf, "(no parent image)\n");
325886b00e0dSAlex Elder 
325986b00e0dSAlex Elder 	count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
326086b00e0dSAlex Elder 			(unsigned long long) spec->pool_id, spec->pool_name);
326186b00e0dSAlex Elder 	if (count < 0)
326286b00e0dSAlex Elder 		return count;
326386b00e0dSAlex Elder 	bufp += count;
326486b00e0dSAlex Elder 
326586b00e0dSAlex Elder 	count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
326686b00e0dSAlex Elder 			spec->image_name ? spec->image_name : "(unknown)");
326786b00e0dSAlex Elder 	if (count < 0)
326886b00e0dSAlex Elder 		return count;
326986b00e0dSAlex Elder 	bufp += count;
327086b00e0dSAlex Elder 
327186b00e0dSAlex Elder 	count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
327286b00e0dSAlex Elder 			(unsigned long long) spec->snap_id, spec->snap_name);
327386b00e0dSAlex Elder 	if (count < 0)
327486b00e0dSAlex Elder 		return count;
327586b00e0dSAlex Elder 	bufp += count;
327686b00e0dSAlex Elder 
327786b00e0dSAlex Elder 	count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
327886b00e0dSAlex Elder 	if (count < 0)
327986b00e0dSAlex Elder 		return count;
328086b00e0dSAlex Elder 	bufp += count;
328186b00e0dSAlex Elder 
328286b00e0dSAlex Elder 	return (ssize_t) (bufp - buf);
328386b00e0dSAlex Elder }
328486b00e0dSAlex Elder 
3285dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
3286dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
3287dfc5606dSYehuda Sadeh 				 const char *buf,
3288dfc5606dSYehuda Sadeh 				 size_t size)
3289dfc5606dSYehuda Sadeh {
3290593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3291b813623aSAlex Elder 	int ret;
3292602adf40SYehuda Sadeh 
3293117973fbSAlex Elder 	ret = rbd_dev_refresh(rbd_dev, NULL);
3294b813623aSAlex Elder 
3295b813623aSAlex Elder 	return ret < 0 ? ret : size;
3296dfc5606dSYehuda Sadeh }
3297602adf40SYehuda Sadeh 
3298dfc5606dSYehuda Sadeh static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
329934b13184SAlex Elder static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
3300dfc5606dSYehuda Sadeh static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
3301dfc5606dSYehuda Sadeh static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
3302dfc5606dSYehuda Sadeh static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
33039bb2f334SAlex Elder static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
3304dfc5606dSYehuda Sadeh static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
3305589d30e0SAlex Elder static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
3306dfc5606dSYehuda Sadeh static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
3307dfc5606dSYehuda Sadeh static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
330886b00e0dSAlex Elder static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
3309dfc5606dSYehuda Sadeh 
3310dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = {
3311dfc5606dSYehuda Sadeh 	&dev_attr_size.attr,
331234b13184SAlex Elder 	&dev_attr_features.attr,
3313dfc5606dSYehuda Sadeh 	&dev_attr_major.attr,
3314dfc5606dSYehuda Sadeh 	&dev_attr_client_id.attr,
3315dfc5606dSYehuda Sadeh 	&dev_attr_pool.attr,
33169bb2f334SAlex Elder 	&dev_attr_pool_id.attr,
3317dfc5606dSYehuda Sadeh 	&dev_attr_name.attr,
3318589d30e0SAlex Elder 	&dev_attr_image_id.attr,
3319dfc5606dSYehuda Sadeh 	&dev_attr_current_snap.attr,
332086b00e0dSAlex Elder 	&dev_attr_parent.attr,
3321dfc5606dSYehuda Sadeh 	&dev_attr_refresh.attr,
3322dfc5606dSYehuda Sadeh 	NULL
3323dfc5606dSYehuda Sadeh };
3324dfc5606dSYehuda Sadeh 
3325dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = {
3326dfc5606dSYehuda Sadeh 	.attrs = rbd_attrs,
3327dfc5606dSYehuda Sadeh };
3328dfc5606dSYehuda Sadeh 
3329dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = {
3330dfc5606dSYehuda Sadeh 	&rbd_attr_group,
3331dfc5606dSYehuda Sadeh 	NULL
3332dfc5606dSYehuda Sadeh };
3333dfc5606dSYehuda Sadeh 
/* Release callback for the rbd device type; intentionally does nothing. */
static void rbd_sysfs_dev_release(struct device *dev)
{
	/* Nothing is released here */
}
3337dfc5606dSYehuda Sadeh 
3338dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = {
3339dfc5606dSYehuda Sadeh 	.name		= "rbd",
3340dfc5606dSYehuda Sadeh 	.groups		= rbd_attr_groups,
3341dfc5606dSYehuda Sadeh 	.release	= rbd_sysfs_dev_release,
3342dfc5606dSYehuda Sadeh };
3343dfc5606dSYehuda Sadeh 
33448b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
33458b8fb99cSAlex Elder {
33468b8fb99cSAlex Elder 	kref_get(&spec->kref);
33478b8fb99cSAlex Elder 
33488b8fb99cSAlex Elder 	return spec;
33498b8fb99cSAlex Elder }
33508b8fb99cSAlex Elder 
33518b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref);
33528b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec)
33538b8fb99cSAlex Elder {
33548b8fb99cSAlex Elder 	if (spec)
33558b8fb99cSAlex Elder 		kref_put(&spec->kref, rbd_spec_free);
33568b8fb99cSAlex Elder }
33578b8fb99cSAlex Elder 
33588b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void)
33598b8fb99cSAlex Elder {
33608b8fb99cSAlex Elder 	struct rbd_spec *spec;
33618b8fb99cSAlex Elder 
33628b8fb99cSAlex Elder 	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
33638b8fb99cSAlex Elder 	if (!spec)
33648b8fb99cSAlex Elder 		return NULL;
33658b8fb99cSAlex Elder 	kref_init(&spec->kref);
33668b8fb99cSAlex Elder 
33678b8fb99cSAlex Elder 	return spec;
33688b8fb99cSAlex Elder }
33698b8fb99cSAlex Elder 
33708b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref)
33718b8fb99cSAlex Elder {
33728b8fb99cSAlex Elder 	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
33738b8fb99cSAlex Elder 
33748b8fb99cSAlex Elder 	kfree(spec->pool_name);
33758b8fb99cSAlex Elder 	kfree(spec->image_id);
33768b8fb99cSAlex Elder 	kfree(spec->image_name);
33778b8fb99cSAlex Elder 	kfree(spec->snap_name);
33788b8fb99cSAlex Elder 	kfree(spec);
33798b8fb99cSAlex Elder }
33808b8fb99cSAlex Elder 
3381cc344fa1SAlex Elder static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
3382c53d5893SAlex Elder 				struct rbd_spec *spec)
3383c53d5893SAlex Elder {
3384c53d5893SAlex Elder 	struct rbd_device *rbd_dev;
3385c53d5893SAlex Elder 
3386c53d5893SAlex Elder 	rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
3387c53d5893SAlex Elder 	if (!rbd_dev)
3388c53d5893SAlex Elder 		return NULL;
3389c53d5893SAlex Elder 
3390c53d5893SAlex Elder 	spin_lock_init(&rbd_dev->lock);
33916d292906SAlex Elder 	rbd_dev->flags = 0;
3392c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->node);
3393c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->snaps);
3394c53d5893SAlex Elder 	init_rwsem(&rbd_dev->header_rwsem);
3395c53d5893SAlex Elder 
3396c53d5893SAlex Elder 	rbd_dev->spec = spec;
3397c53d5893SAlex Elder 	rbd_dev->rbd_client = rbdc;
3398c53d5893SAlex Elder 
33990903e875SAlex Elder 	/* Initialize the layout used for all rbd requests */
34000903e875SAlex Elder 
34010903e875SAlex Elder 	rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
34020903e875SAlex Elder 	rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
34030903e875SAlex Elder 	rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
34040903e875SAlex Elder 	rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
34050903e875SAlex Elder 
3406c53d5893SAlex Elder 	return rbd_dev;
3407c53d5893SAlex Elder }
3408c53d5893SAlex Elder 
3409c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev)
3410c53d5893SAlex Elder {
341186b00e0dSAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
3412c53d5893SAlex Elder 	kfree(rbd_dev->header_name);
3413c53d5893SAlex Elder 	rbd_put_client(rbd_dev->rbd_client);
3414c53d5893SAlex Elder 	rbd_spec_put(rbd_dev->spec);
3415c53d5893SAlex Elder 	kfree(rbd_dev);
3416c53d5893SAlex Elder }
3417c53d5893SAlex Elder 
34186087b51bSAlex Elder static void rbd_snap_destroy(struct rbd_snap *snap)
3419dfc5606dSYehuda Sadeh {
34203e83b65bSAlex Elder 	kfree(snap->name);
34213e83b65bSAlex Elder 	kfree(snap);
3422dfc5606dSYehuda Sadeh }
3423dfc5606dSYehuda Sadeh 
34246087b51bSAlex Elder static struct rbd_snap *rbd_snap_create(struct rbd_device *rbd_dev,
3425c8d18425SAlex Elder 						const char *snap_name,
342634b13184SAlex Elder 						u64 snap_id, u64 snap_size,
342734b13184SAlex Elder 						u64 snap_features)
3428dfc5606dSYehuda Sadeh {
34294e891e0aSAlex Elder 	struct rbd_snap *snap;
34304e891e0aSAlex Elder 
34314e891e0aSAlex Elder 	snap = kzalloc(sizeof (*snap), GFP_KERNEL);
3432dfc5606dSYehuda Sadeh 	if (!snap)
34334e891e0aSAlex Elder 		return ERR_PTR(-ENOMEM);
34344e891e0aSAlex Elder 
34356e584f52SAlex Elder 	snap->name = snap_name;
3436c8d18425SAlex Elder 	snap->id = snap_id;
3437c8d18425SAlex Elder 	snap->size = snap_size;
343834b13184SAlex Elder 	snap->features = snap_features;
34394e891e0aSAlex Elder 
34404e891e0aSAlex Elder 	return snap;
3441dfc5606dSYehuda Sadeh }
3442dfc5606dSYehuda Sadeh 
34436e584f52SAlex Elder /*
34446e584f52SAlex Elder  * Returns a dynamically-allocated snapshot name if successful, or a
34456e584f52SAlex Elder  * pointer-coded error otherwise.
34466e584f52SAlex Elder  */
3447cd892126SAlex Elder static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
3448cd892126SAlex Elder 		u64 *snap_size, u64 *snap_features)
3449cd892126SAlex Elder {
3450cd892126SAlex Elder 	char *snap_name;
34516e584f52SAlex Elder 	int i;
3452cd892126SAlex Elder 
3453cd892126SAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
3454cd892126SAlex Elder 
3455cd892126SAlex Elder 	/* Skip over names until we find the one we are looking for */
3456cd892126SAlex Elder 
3457cd892126SAlex Elder 	snap_name = rbd_dev->header.snap_names;
34586e584f52SAlex Elder 	for (i = 0; i < which; i++)
3459cd892126SAlex Elder 		snap_name += strlen(snap_name) + 1;
3460cd892126SAlex Elder 
34616e584f52SAlex Elder 	snap_name = kstrdup(snap_name, GFP_KERNEL);
34626e584f52SAlex Elder 	if (!snap_name)
34636e584f52SAlex Elder 		return ERR_PTR(-ENOMEM);
34646e584f52SAlex Elder 
34656e584f52SAlex Elder 	*snap_size = rbd_dev->header.snap_sizes[which];
34666e584f52SAlex Elder 	*snap_features = 0;	/* No features for v1 */
34676e584f52SAlex Elder 
3468cd892126SAlex Elder 	return snap_name;
3469cd892126SAlex Elder }
3470cd892126SAlex Elder 
3471dfc5606dSYehuda Sadeh /*
34729d475de5SAlex Elder  * Get the size and object order for an image snapshot, or if
34739d475de5SAlex Elder  * snap_id is CEPH_NOSNAP, gets this information for the base
34749d475de5SAlex Elder  * image.
34759d475de5SAlex Elder  */
34769d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
34779d475de5SAlex Elder 				u8 *order, u64 *snap_size)
34789d475de5SAlex Elder {
34799d475de5SAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
34809d475de5SAlex Elder 	int ret;
34819d475de5SAlex Elder 	struct {
34829d475de5SAlex Elder 		u8 order;
34839d475de5SAlex Elder 		__le64 size;
34849d475de5SAlex Elder 	} __attribute__ ((packed)) size_buf = { 0 };
34859d475de5SAlex Elder 
348636be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
34879d475de5SAlex Elder 				"rbd", "get_size",
34884157976bSAlex Elder 				&snapid, sizeof (snapid),
34894157976bSAlex Elder 				&size_buf, sizeof (size_buf), NULL);
349036be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
34919d475de5SAlex Elder 	if (ret < 0)
34929d475de5SAlex Elder 		return ret;
349357385b51SAlex Elder 	if (ret < sizeof (size_buf))
349457385b51SAlex Elder 		return -ERANGE;
34959d475de5SAlex Elder 
3496c86f86e9SAlex Elder 	if (order)
34979d475de5SAlex Elder 		*order = size_buf.order;
34989d475de5SAlex Elder 	*snap_size = le64_to_cpu(size_buf.size);
34999d475de5SAlex Elder 
35009d475de5SAlex Elder 	dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
35019d475de5SAlex Elder 		(unsigned long long)snap_id, (unsigned int)*order,
35029d475de5SAlex Elder 		(unsigned long long)*snap_size);
35039d475de5SAlex Elder 
35049d475de5SAlex Elder 	return 0;
35059d475de5SAlex Elder }
35069d475de5SAlex Elder 
35079d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
35089d475de5SAlex Elder {
35099d475de5SAlex Elder 	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
35109d475de5SAlex Elder 					&rbd_dev->header.obj_order,
35119d475de5SAlex Elder 					&rbd_dev->header.image_size);
35129d475de5SAlex Elder }
35139d475de5SAlex Elder 
35141e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
35151e130199SAlex Elder {
35161e130199SAlex Elder 	void *reply_buf;
35171e130199SAlex Elder 	int ret;
35181e130199SAlex Elder 	void *p;
35191e130199SAlex Elder 
35201e130199SAlex Elder 	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
35211e130199SAlex Elder 	if (!reply_buf)
35221e130199SAlex Elder 		return -ENOMEM;
35231e130199SAlex Elder 
352436be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
35254157976bSAlex Elder 				"rbd", "get_object_prefix", NULL, 0,
352607b2391fSAlex Elder 				reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
352736be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
35281e130199SAlex Elder 	if (ret < 0)
35291e130199SAlex Elder 		goto out;
35301e130199SAlex Elder 
35311e130199SAlex Elder 	p = reply_buf;
35321e130199SAlex Elder 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
353357385b51SAlex Elder 						p + ret, NULL, GFP_NOIO);
353457385b51SAlex Elder 	ret = 0;
35351e130199SAlex Elder 
35361e130199SAlex Elder 	if (IS_ERR(rbd_dev->header.object_prefix)) {
35371e130199SAlex Elder 		ret = PTR_ERR(rbd_dev->header.object_prefix);
35381e130199SAlex Elder 		rbd_dev->header.object_prefix = NULL;
35391e130199SAlex Elder 	} else {
35401e130199SAlex Elder 		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
35411e130199SAlex Elder 	}
35421e130199SAlex Elder out:
35431e130199SAlex Elder 	kfree(reply_buf);
35441e130199SAlex Elder 
35451e130199SAlex Elder 	return ret;
35461e130199SAlex Elder }
35471e130199SAlex Elder 
3548b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
3549b1b5402aSAlex Elder 		u64 *snap_features)
3550b1b5402aSAlex Elder {
3551b1b5402aSAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
3552b1b5402aSAlex Elder 	struct {
3553b1b5402aSAlex Elder 		__le64 features;
3554b1b5402aSAlex Elder 		__le64 incompat;
35554157976bSAlex Elder 	} __attribute__ ((packed)) features_buf = { 0 };
3556d889140cSAlex Elder 	u64 incompat;
3557b1b5402aSAlex Elder 	int ret;
3558b1b5402aSAlex Elder 
355936be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3560b1b5402aSAlex Elder 				"rbd", "get_features",
35614157976bSAlex Elder 				&snapid, sizeof (snapid),
35624157976bSAlex Elder 				&features_buf, sizeof (features_buf), NULL);
356336be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3564b1b5402aSAlex Elder 	if (ret < 0)
3565b1b5402aSAlex Elder 		return ret;
356657385b51SAlex Elder 	if (ret < sizeof (features_buf))
356757385b51SAlex Elder 		return -ERANGE;
3568d889140cSAlex Elder 
3569d889140cSAlex Elder 	incompat = le64_to_cpu(features_buf.incompat);
35705cbf6f12SAlex Elder 	if (incompat & ~RBD_FEATURES_SUPPORTED)
3571b8f5c6edSAlex Elder 		return -ENXIO;
3572d889140cSAlex Elder 
3573b1b5402aSAlex Elder 	*snap_features = le64_to_cpu(features_buf.features);
3574b1b5402aSAlex Elder 
3575b1b5402aSAlex Elder 	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
3576b1b5402aSAlex Elder 		(unsigned long long)snap_id,
3577b1b5402aSAlex Elder 		(unsigned long long)*snap_features,
3578b1b5402aSAlex Elder 		(unsigned long long)le64_to_cpu(features_buf.incompat));
3579b1b5402aSAlex Elder 
3580b1b5402aSAlex Elder 	return 0;
3581b1b5402aSAlex Elder }
3582b1b5402aSAlex Elder 
3583b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
3584b1b5402aSAlex Elder {
3585b1b5402aSAlex Elder 	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
3586b1b5402aSAlex Elder 						&rbd_dev->header.features);
3587b1b5402aSAlex Elder }
3588b1b5402aSAlex Elder 
358986b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
359086b00e0dSAlex Elder {
359186b00e0dSAlex Elder 	struct rbd_spec *parent_spec;
359286b00e0dSAlex Elder 	size_t size;
359386b00e0dSAlex Elder 	void *reply_buf = NULL;
359486b00e0dSAlex Elder 	__le64 snapid;
359586b00e0dSAlex Elder 	void *p;
359686b00e0dSAlex Elder 	void *end;
359786b00e0dSAlex Elder 	char *image_id;
359886b00e0dSAlex Elder 	u64 overlap;
359986b00e0dSAlex Elder 	int ret;
360086b00e0dSAlex Elder 
360186b00e0dSAlex Elder 	parent_spec = rbd_spec_alloc();
360286b00e0dSAlex Elder 	if (!parent_spec)
360386b00e0dSAlex Elder 		return -ENOMEM;
360486b00e0dSAlex Elder 
360586b00e0dSAlex Elder 	size = sizeof (__le64) +				/* pool_id */
360686b00e0dSAlex Elder 		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
360786b00e0dSAlex Elder 		sizeof (__le64) +				/* snap_id */
360886b00e0dSAlex Elder 		sizeof (__le64);				/* overlap */
360986b00e0dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
361086b00e0dSAlex Elder 	if (!reply_buf) {
361186b00e0dSAlex Elder 		ret = -ENOMEM;
361286b00e0dSAlex Elder 		goto out_err;
361386b00e0dSAlex Elder 	}
361486b00e0dSAlex Elder 
361586b00e0dSAlex Elder 	snapid = cpu_to_le64(CEPH_NOSNAP);
361636be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
361786b00e0dSAlex Elder 				"rbd", "get_parent",
36184157976bSAlex Elder 				&snapid, sizeof (snapid),
36194157976bSAlex Elder 				reply_buf, size, NULL);
362036be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
362186b00e0dSAlex Elder 	if (ret < 0)
362286b00e0dSAlex Elder 		goto out_err;
362386b00e0dSAlex Elder 
362486b00e0dSAlex Elder 	p = reply_buf;
362557385b51SAlex Elder 	end = reply_buf + ret;
362657385b51SAlex Elder 	ret = -ERANGE;
362786b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
362886b00e0dSAlex Elder 	if (parent_spec->pool_id == CEPH_NOPOOL)
362986b00e0dSAlex Elder 		goto out;	/* No parent?  No problem. */
363086b00e0dSAlex Elder 
36310903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
36320903e875SAlex Elder 
36330903e875SAlex Elder 	ret = -EIO;
36340903e875SAlex Elder 	if (WARN_ON(parent_spec->pool_id > (u64)U32_MAX))
363557385b51SAlex Elder 		goto out_err;
36360903e875SAlex Elder 
3637979ed480SAlex Elder 	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
363886b00e0dSAlex Elder 	if (IS_ERR(image_id)) {
363986b00e0dSAlex Elder 		ret = PTR_ERR(image_id);
364086b00e0dSAlex Elder 		goto out_err;
364186b00e0dSAlex Elder 	}
364286b00e0dSAlex Elder 	parent_spec->image_id = image_id;
364386b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
364486b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, overlap, out_err);
364586b00e0dSAlex Elder 
364686b00e0dSAlex Elder 	rbd_dev->parent_overlap = overlap;
364786b00e0dSAlex Elder 	rbd_dev->parent_spec = parent_spec;
364886b00e0dSAlex Elder 	parent_spec = NULL;	/* rbd_dev now owns this */
364986b00e0dSAlex Elder out:
365086b00e0dSAlex Elder 	ret = 0;
365186b00e0dSAlex Elder out_err:
365286b00e0dSAlex Elder 	kfree(reply_buf);
365386b00e0dSAlex Elder 	rbd_spec_put(parent_spec);
365486b00e0dSAlex Elder 
365586b00e0dSAlex Elder 	return ret;
365686b00e0dSAlex Elder }
365786b00e0dSAlex Elder 
3658cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
3659cc070d59SAlex Elder {
3660cc070d59SAlex Elder 	struct {
3661cc070d59SAlex Elder 		__le64 stripe_unit;
3662cc070d59SAlex Elder 		__le64 stripe_count;
3663cc070d59SAlex Elder 	} __attribute__ ((packed)) striping_info_buf = { 0 };
3664cc070d59SAlex Elder 	size_t size = sizeof (striping_info_buf);
3665cc070d59SAlex Elder 	void *p;
3666cc070d59SAlex Elder 	u64 obj_size;
3667cc070d59SAlex Elder 	u64 stripe_unit;
3668cc070d59SAlex Elder 	u64 stripe_count;
3669cc070d59SAlex Elder 	int ret;
3670cc070d59SAlex Elder 
3671cc070d59SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3672cc070d59SAlex Elder 				"rbd", "get_stripe_unit_count", NULL, 0,
3673cc070d59SAlex Elder 				(char *)&striping_info_buf, size, NULL);
3674cc070d59SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3675cc070d59SAlex Elder 	if (ret < 0)
3676cc070d59SAlex Elder 		return ret;
3677cc070d59SAlex Elder 	if (ret < size)
3678cc070d59SAlex Elder 		return -ERANGE;
3679cc070d59SAlex Elder 
3680cc070d59SAlex Elder 	/*
3681cc070d59SAlex Elder 	 * We don't actually support the "fancy striping" feature
3682cc070d59SAlex Elder 	 * (STRIPINGV2) yet, but if the striping sizes are the
3683cc070d59SAlex Elder 	 * defaults the behavior is the same as before.  So find
3684cc070d59SAlex Elder 	 * out, and only fail if the image has non-default values.
3685cc070d59SAlex Elder 	 */
3686cc070d59SAlex Elder 	ret = -EINVAL;
3687cc070d59SAlex Elder 	obj_size = (u64)1 << rbd_dev->header.obj_order;
3688cc070d59SAlex Elder 	p = &striping_info_buf;
3689cc070d59SAlex Elder 	stripe_unit = ceph_decode_64(&p);
3690cc070d59SAlex Elder 	if (stripe_unit != obj_size) {
3691cc070d59SAlex Elder 		rbd_warn(rbd_dev, "unsupported stripe unit "
3692cc070d59SAlex Elder 				"(got %llu want %llu)",
3693cc070d59SAlex Elder 				stripe_unit, obj_size);
3694cc070d59SAlex Elder 		return -EINVAL;
3695cc070d59SAlex Elder 	}
3696cc070d59SAlex Elder 	stripe_count = ceph_decode_64(&p);
3697cc070d59SAlex Elder 	if (stripe_count != 1) {
3698cc070d59SAlex Elder 		rbd_warn(rbd_dev, "unsupported stripe count "
3699cc070d59SAlex Elder 				"(got %llu want 1)", stripe_count);
3700cc070d59SAlex Elder 		return -EINVAL;
3701cc070d59SAlex Elder 	}
3702cc070d59SAlex Elder 	rbd_dev->stripe_unit = stripe_unit;
3703cc070d59SAlex Elder 	rbd_dev->stripe_count = stripe_count;
3704cc070d59SAlex Elder 
3705cc070d59SAlex Elder 	return 0;
3706cc070d59SAlex Elder }
3707cc070d59SAlex Elder 
37089e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
37099e15b77dSAlex Elder {
37109e15b77dSAlex Elder 	size_t image_id_size;
37119e15b77dSAlex Elder 	char *image_id;
37129e15b77dSAlex Elder 	void *p;
37139e15b77dSAlex Elder 	void *end;
37149e15b77dSAlex Elder 	size_t size;
37159e15b77dSAlex Elder 	void *reply_buf = NULL;
37169e15b77dSAlex Elder 	size_t len = 0;
37179e15b77dSAlex Elder 	char *image_name = NULL;
37189e15b77dSAlex Elder 	int ret;
37199e15b77dSAlex Elder 
37209e15b77dSAlex Elder 	rbd_assert(!rbd_dev->spec->image_name);
37219e15b77dSAlex Elder 
372269e7a02fSAlex Elder 	len = strlen(rbd_dev->spec->image_id);
372369e7a02fSAlex Elder 	image_id_size = sizeof (__le32) + len;
37249e15b77dSAlex Elder 	image_id = kmalloc(image_id_size, GFP_KERNEL);
37259e15b77dSAlex Elder 	if (!image_id)
37269e15b77dSAlex Elder 		return NULL;
37279e15b77dSAlex Elder 
37289e15b77dSAlex Elder 	p = image_id;
37294157976bSAlex Elder 	end = image_id + image_id_size;
373069e7a02fSAlex Elder 	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
37319e15b77dSAlex Elder 
37329e15b77dSAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
37339e15b77dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
37349e15b77dSAlex Elder 	if (!reply_buf)
37359e15b77dSAlex Elder 		goto out;
37369e15b77dSAlex Elder 
373736be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
37389e15b77dSAlex Elder 				"rbd", "dir_get_name",
37399e15b77dSAlex Elder 				image_id, image_id_size,
37404157976bSAlex Elder 				reply_buf, size, NULL);
37419e15b77dSAlex Elder 	if (ret < 0)
37429e15b77dSAlex Elder 		goto out;
37439e15b77dSAlex Elder 	p = reply_buf;
37444157976bSAlex Elder 	end = reply_buf + size;
37459e15b77dSAlex Elder 	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
37469e15b77dSAlex Elder 	if (IS_ERR(image_name))
37479e15b77dSAlex Elder 		image_name = NULL;
37489e15b77dSAlex Elder 	else
37499e15b77dSAlex Elder 		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
37509e15b77dSAlex Elder out:
37519e15b77dSAlex Elder 	kfree(reply_buf);
37529e15b77dSAlex Elder 	kfree(image_id);
37539e15b77dSAlex Elder 
37549e15b77dSAlex Elder 	return image_name;
37559e15b77dSAlex Elder }
37569e15b77dSAlex Elder 
37579e15b77dSAlex Elder /*
37589e15b77dSAlex Elder  * When a parent image gets probed, we only have the pool, image,
37599e15b77dSAlex Elder  * and snapshot ids but not the names of any of them.  This call
37609e15b77dSAlex Elder  * is made later to fill in those names.  It has to be done after
37619e15b77dSAlex Elder  * rbd_dev_snaps_update() has completed because some of the
37629e15b77dSAlex Elder  * information (in particular, snapshot name) is not available
37639e15b77dSAlex Elder  * until then.
37649e15b77dSAlex Elder  */
37659e15b77dSAlex Elder static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
37669e15b77dSAlex Elder {
37679e15b77dSAlex Elder 	struct ceph_osd_client *osdc;
37689e15b77dSAlex Elder 	const char *name;
37699e15b77dSAlex Elder 	void *reply_buf = NULL;
37709e15b77dSAlex Elder 	int ret;
37719e15b77dSAlex Elder 
37729e15b77dSAlex Elder 	if (rbd_dev->spec->pool_name)
37739e15b77dSAlex Elder 		return 0;	/* Already have the names */
37749e15b77dSAlex Elder 
37759e15b77dSAlex Elder 	/* Look up the pool name */
37769e15b77dSAlex Elder 
37779e15b77dSAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
37789e15b77dSAlex Elder 	name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
3779935dc89fSAlex Elder 	if (!name) {
3780935dc89fSAlex Elder 		rbd_warn(rbd_dev, "there is no pool with id %llu",
3781935dc89fSAlex Elder 			rbd_dev->spec->pool_id);	/* Really a BUG() */
3782935dc89fSAlex Elder 		return -EIO;
3783935dc89fSAlex Elder 	}
37849e15b77dSAlex Elder 
37859e15b77dSAlex Elder 	rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
37869e15b77dSAlex Elder 	if (!rbd_dev->spec->pool_name)
37879e15b77dSAlex Elder 		return -ENOMEM;
37889e15b77dSAlex Elder 
37899e15b77dSAlex Elder 	/* Fetch the image name; tolerate failure here */
37909e15b77dSAlex Elder 
37919e15b77dSAlex Elder 	name = rbd_dev_image_name(rbd_dev);
379269e7a02fSAlex Elder 	if (name)
37939e15b77dSAlex Elder 		rbd_dev->spec->image_name = (char *)name;
379469e7a02fSAlex Elder 	else
379506ecc6cbSAlex Elder 		rbd_warn(rbd_dev, "unable to get image name");
37969e15b77dSAlex Elder 
37979e15b77dSAlex Elder 	/* Look up the snapshot name. */
37989e15b77dSAlex Elder 
37999e15b77dSAlex Elder 	name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
38009e15b77dSAlex Elder 	if (!name) {
3801935dc89fSAlex Elder 		rbd_warn(rbd_dev, "no snapshot with id %llu",
3802935dc89fSAlex Elder 			rbd_dev->spec->snap_id);	/* Really a BUG() */
38039e15b77dSAlex Elder 		ret = -EIO;
38049e15b77dSAlex Elder 		goto out_err;
38059e15b77dSAlex Elder 	}
38069e15b77dSAlex Elder 	rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
38079e15b77dSAlex Elder 	if(!rbd_dev->spec->snap_name)
38089e15b77dSAlex Elder 		goto out_err;
38099e15b77dSAlex Elder 
38109e15b77dSAlex Elder 	return 0;
38119e15b77dSAlex Elder out_err:
38129e15b77dSAlex Elder 	kfree(reply_buf);
38139e15b77dSAlex Elder 	kfree(rbd_dev->spec->pool_name);
38149e15b77dSAlex Elder 	rbd_dev->spec->pool_name = NULL;
38159e15b77dSAlex Elder 
38169e15b77dSAlex Elder 	return ret;
38179e15b77dSAlex Elder }
38189e15b77dSAlex Elder 
38196e14b1a6SAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
382035d489f9SAlex Elder {
382135d489f9SAlex Elder 	size_t size;
382235d489f9SAlex Elder 	int ret;
382335d489f9SAlex Elder 	void *reply_buf;
382435d489f9SAlex Elder 	void *p;
382535d489f9SAlex Elder 	void *end;
382635d489f9SAlex Elder 	u64 seq;
382735d489f9SAlex Elder 	u32 snap_count;
382835d489f9SAlex Elder 	struct ceph_snap_context *snapc;
382935d489f9SAlex Elder 	u32 i;
383035d489f9SAlex Elder 
383135d489f9SAlex Elder 	/*
383235d489f9SAlex Elder 	 * We'll need room for the seq value (maximum snapshot id),
383335d489f9SAlex Elder 	 * snapshot count, and array of that many snapshot ids.
383435d489f9SAlex Elder 	 * For now we have a fixed upper limit on the number we're
383535d489f9SAlex Elder 	 * prepared to receive.
383635d489f9SAlex Elder 	 */
383735d489f9SAlex Elder 	size = sizeof (__le64) + sizeof (__le32) +
383835d489f9SAlex Elder 			RBD_MAX_SNAP_COUNT * sizeof (__le64);
383935d489f9SAlex Elder 	reply_buf = kzalloc(size, GFP_KERNEL);
384035d489f9SAlex Elder 	if (!reply_buf)
384135d489f9SAlex Elder 		return -ENOMEM;
384235d489f9SAlex Elder 
384336be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
38444157976bSAlex Elder 				"rbd", "get_snapcontext", NULL, 0,
384507b2391fSAlex Elder 				reply_buf, size, ver);
384636be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
384735d489f9SAlex Elder 	if (ret < 0)
384835d489f9SAlex Elder 		goto out;
384935d489f9SAlex Elder 
385035d489f9SAlex Elder 	p = reply_buf;
385157385b51SAlex Elder 	end = reply_buf + ret;
385257385b51SAlex Elder 	ret = -ERANGE;
385335d489f9SAlex Elder 	ceph_decode_64_safe(&p, end, seq, out);
385435d489f9SAlex Elder 	ceph_decode_32_safe(&p, end, snap_count, out);
385535d489f9SAlex Elder 
385635d489f9SAlex Elder 	/*
385735d489f9SAlex Elder 	 * Make sure the reported number of snapshot ids wouldn't go
385835d489f9SAlex Elder 	 * beyond the end of our buffer.  But before checking that,
385935d489f9SAlex Elder 	 * make sure the computed size of the snapshot context we
386035d489f9SAlex Elder 	 * allocate is representable in a size_t.
386135d489f9SAlex Elder 	 */
386235d489f9SAlex Elder 	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
386335d489f9SAlex Elder 				 / sizeof (u64)) {
386435d489f9SAlex Elder 		ret = -EINVAL;
386535d489f9SAlex Elder 		goto out;
386635d489f9SAlex Elder 	}
386735d489f9SAlex Elder 	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
386835d489f9SAlex Elder 		goto out;
386935d489f9SAlex Elder 
387035d489f9SAlex Elder 	size = sizeof (struct ceph_snap_context) +
387135d489f9SAlex Elder 				snap_count * sizeof (snapc->snaps[0]);
387235d489f9SAlex Elder 	snapc = kmalloc(size, GFP_KERNEL);
387335d489f9SAlex Elder 	if (!snapc) {
387435d489f9SAlex Elder 		ret = -ENOMEM;
387535d489f9SAlex Elder 		goto out;
387635d489f9SAlex Elder 	}
387757385b51SAlex Elder 	ret = 0;
387835d489f9SAlex Elder 
387935d489f9SAlex Elder 	atomic_set(&snapc->nref, 1);
388035d489f9SAlex Elder 	snapc->seq = seq;
388135d489f9SAlex Elder 	snapc->num_snaps = snap_count;
388235d489f9SAlex Elder 	for (i = 0; i < snap_count; i++)
388335d489f9SAlex Elder 		snapc->snaps[i] = ceph_decode_64(&p);
388435d489f9SAlex Elder 
388535d489f9SAlex Elder 	rbd_dev->header.snapc = snapc;
388635d489f9SAlex Elder 
388735d489f9SAlex Elder 	dout("  snap context seq = %llu, snap_count = %u\n",
388835d489f9SAlex Elder 		(unsigned long long)seq, (unsigned int)snap_count);
388935d489f9SAlex Elder out:
389035d489f9SAlex Elder 	kfree(reply_buf);
389135d489f9SAlex Elder 
389257385b51SAlex Elder 	return ret;
389335d489f9SAlex Elder }
389435d489f9SAlex Elder 
3895b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
3896b8b1e2dbSAlex Elder {
3897b8b1e2dbSAlex Elder 	size_t size;
3898b8b1e2dbSAlex Elder 	void *reply_buf;
3899b8b1e2dbSAlex Elder 	__le64 snap_id;
3900b8b1e2dbSAlex Elder 	int ret;
3901b8b1e2dbSAlex Elder 	void *p;
3902b8b1e2dbSAlex Elder 	void *end;
3903b8b1e2dbSAlex Elder 	char *snap_name;
3904b8b1e2dbSAlex Elder 
3905b8b1e2dbSAlex Elder 	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
3906b8b1e2dbSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
3907b8b1e2dbSAlex Elder 	if (!reply_buf)
3908b8b1e2dbSAlex Elder 		return ERR_PTR(-ENOMEM);
3909b8b1e2dbSAlex Elder 
3910acb1b6caSAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
3911b8b1e2dbSAlex Elder 	snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
391236be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3913b8b1e2dbSAlex Elder 				"rbd", "get_snapshot_name",
39144157976bSAlex Elder 				&snap_id, sizeof (snap_id),
391507b2391fSAlex Elder 				reply_buf, size, NULL);
391636be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3917b8b1e2dbSAlex Elder 	if (ret < 0)
3918b8b1e2dbSAlex Elder 		goto out;
3919b8b1e2dbSAlex Elder 
3920b8b1e2dbSAlex Elder 	p = reply_buf;
39214157976bSAlex Elder 	end = reply_buf + size;
3922e5c35534SAlex Elder 	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
3923b8b1e2dbSAlex Elder 	if (IS_ERR(snap_name)) {
3924b8b1e2dbSAlex Elder 		ret = PTR_ERR(snap_name);
3925b8b1e2dbSAlex Elder 		goto out;
3926b8b1e2dbSAlex Elder 	} else {
3927b8b1e2dbSAlex Elder 		dout("  snap_id 0x%016llx snap_name = %s\n",
3928b8b1e2dbSAlex Elder 			(unsigned long long)le64_to_cpu(snap_id), snap_name);
3929b8b1e2dbSAlex Elder 	}
3930b8b1e2dbSAlex Elder 	kfree(reply_buf);
3931b8b1e2dbSAlex Elder 
3932b8b1e2dbSAlex Elder 	return snap_name;
3933b8b1e2dbSAlex Elder out:
3934b8b1e2dbSAlex Elder 	kfree(reply_buf);
3935b8b1e2dbSAlex Elder 
3936b8b1e2dbSAlex Elder 	return ERR_PTR(ret);
3937b8b1e2dbSAlex Elder }
3938b8b1e2dbSAlex Elder 
3939b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
3940b8b1e2dbSAlex Elder 		u64 *snap_size, u64 *snap_features)
3941b8b1e2dbSAlex Elder {
3942e0b49868SAlex Elder 	u64 snap_id;
3943acb1b6caSAlex Elder 	u64 size;
3944acb1b6caSAlex Elder 	u64 features;
3945acb1b6caSAlex Elder 	char *snap_name;
3946b8b1e2dbSAlex Elder 	int ret;
3947b8b1e2dbSAlex Elder 
3948acb1b6caSAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
3949b8b1e2dbSAlex Elder 	snap_id = rbd_dev->header.snapc->snaps[which];
3950acb1b6caSAlex Elder 	ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
3951b8b1e2dbSAlex Elder 	if (ret)
3952acb1b6caSAlex Elder 		goto out_err;
3953b8b1e2dbSAlex Elder 
3954acb1b6caSAlex Elder 	ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
3955acb1b6caSAlex Elder 	if (ret)
3956acb1b6caSAlex Elder 		goto out_err;
3957acb1b6caSAlex Elder 
3958acb1b6caSAlex Elder 	snap_name = rbd_dev_v2_snap_name(rbd_dev, which);
3959acb1b6caSAlex Elder 	if (!IS_ERR(snap_name)) {
3960acb1b6caSAlex Elder 		*snap_size = size;
3961acb1b6caSAlex Elder 		*snap_features = features;
3962acb1b6caSAlex Elder 	}
3963acb1b6caSAlex Elder 
3964acb1b6caSAlex Elder 	return snap_name;
3965acb1b6caSAlex Elder out_err:
3966acb1b6caSAlex Elder 	return ERR_PTR(ret);
3967b8b1e2dbSAlex Elder }
3968b8b1e2dbSAlex Elder 
3969b8b1e2dbSAlex Elder static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
3970b8b1e2dbSAlex Elder 		u64 *snap_size, u64 *snap_features)
3971b8b1e2dbSAlex Elder {
3972b8b1e2dbSAlex Elder 	if (rbd_dev->image_format == 1)
3973b8b1e2dbSAlex Elder 		return rbd_dev_v1_snap_info(rbd_dev, which,
3974b8b1e2dbSAlex Elder 					snap_size, snap_features);
3975b8b1e2dbSAlex Elder 	if (rbd_dev->image_format == 2)
3976b8b1e2dbSAlex Elder 		return rbd_dev_v2_snap_info(rbd_dev, which,
3977b8b1e2dbSAlex Elder 					snap_size, snap_features);
3978b8b1e2dbSAlex Elder 	return ERR_PTR(-EINVAL);
3979b8b1e2dbSAlex Elder }
3980b8b1e2dbSAlex Elder 
3981117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
3982117973fbSAlex Elder {
3983117973fbSAlex Elder 	int ret;
3984117973fbSAlex Elder 	__u8 obj_order;
3985117973fbSAlex Elder 
3986117973fbSAlex Elder 	down_write(&rbd_dev->header_rwsem);
3987117973fbSAlex Elder 
3988117973fbSAlex Elder 	/* Grab old order first, to see if it changes */
3989117973fbSAlex Elder 
3990117973fbSAlex Elder 	obj_order = rbd_dev->header.obj_order,
3991117973fbSAlex Elder 	ret = rbd_dev_v2_image_size(rbd_dev);
3992117973fbSAlex Elder 	if (ret)
3993117973fbSAlex Elder 		goto out;
3994117973fbSAlex Elder 	if (rbd_dev->header.obj_order != obj_order) {
3995117973fbSAlex Elder 		ret = -EIO;
3996117973fbSAlex Elder 		goto out;
3997117973fbSAlex Elder 	}
3998117973fbSAlex Elder 	rbd_update_mapping_size(rbd_dev);
3999117973fbSAlex Elder 
4000117973fbSAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev, hver);
4001117973fbSAlex Elder 	dout("rbd_dev_v2_snap_context returned %d\n", ret);
4002117973fbSAlex Elder 	if (ret)
4003117973fbSAlex Elder 		goto out;
4004117973fbSAlex Elder 	ret = rbd_dev_snaps_update(rbd_dev);
4005117973fbSAlex Elder 	dout("rbd_dev_snaps_update returned %d\n", ret);
4006117973fbSAlex Elder 	if (ret)
4007117973fbSAlex Elder 		goto out;
4008117973fbSAlex Elder out:
4009117973fbSAlex Elder 	up_write(&rbd_dev->header_rwsem);
4010117973fbSAlex Elder 
4011117973fbSAlex Elder 	return ret;
4012117973fbSAlex Elder }
4013117973fbSAlex Elder 
40149d475de5SAlex Elder /*
401535938150SAlex Elder  * Scan the rbd device's current snapshot list and compare it to the
401635938150SAlex Elder  * newly-received snapshot context.  Remove any existing snapshots
401735938150SAlex Elder  * not present in the new snapshot context.  Add a new snapshot for
401835938150SAlex Elder  * any snaphots in the snapshot context not in the current list.
401935938150SAlex Elder  * And verify there are no changes to snapshots we already know
402035938150SAlex Elder  * about.
402135938150SAlex Elder  *
402235938150SAlex Elder  * Assumes the snapshots in the snapshot context are sorted by
402335938150SAlex Elder  * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
402435938150SAlex Elder  * are also maintained in that order.)
4025522a0cc0SAlex Elder  *
4026522a0cc0SAlex Elder  * Note that any error occurs while updating the snapshot list
4027522a0cc0SAlex Elder  * aborts the update, and the entire list is cleared.  The snapshot
4028522a0cc0SAlex Elder  * list becomes inconsistent at that point anyway, so it might as
4029522a0cc0SAlex Elder  * well be empty.
4030dfc5606dSYehuda Sadeh  */
4031304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
4032dfc5606dSYehuda Sadeh {
403335938150SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
403435938150SAlex Elder 	const u32 snap_count = snapc->num_snaps;
403535938150SAlex Elder 	struct list_head *head = &rbd_dev->snaps;
403635938150SAlex Elder 	struct list_head *links = head->next;
403735938150SAlex Elder 	u32 index = 0;
4038522a0cc0SAlex Elder 	int ret = 0;
4039dfc5606dSYehuda Sadeh 
40409fcbb800SAlex Elder 	dout("%s: snap count is %u\n", __func__, (unsigned int)snap_count);
404135938150SAlex Elder 	while (index < snap_count || links != head) {
404235938150SAlex Elder 		u64 snap_id;
404335938150SAlex Elder 		struct rbd_snap *snap;
4044cd892126SAlex Elder 		char *snap_name;
4045cd892126SAlex Elder 		u64 snap_size = 0;
4046cd892126SAlex Elder 		u64 snap_features = 0;
4047dfc5606dSYehuda Sadeh 
404835938150SAlex Elder 		snap_id = index < snap_count ? snapc->snaps[index]
404935938150SAlex Elder 					     : CEPH_NOSNAP;
405035938150SAlex Elder 		snap = links != head ? list_entry(links, struct rbd_snap, node)
405135938150SAlex Elder 				     : NULL;
4052aafb230eSAlex Elder 		rbd_assert(!snap || snap->id != CEPH_NOSNAP);
4053dfc5606dSYehuda Sadeh 
405435938150SAlex Elder 		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
405535938150SAlex Elder 			struct list_head *next = links->next;
4056dfc5606dSYehuda Sadeh 
40576d292906SAlex Elder 			/*
40586d292906SAlex Elder 			 * A previously-existing snapshot is not in
40596d292906SAlex Elder 			 * the new snap context.
40606d292906SAlex Elder 			 *
4061522a0cc0SAlex Elder 			 * If the now-missing snapshot is the one
4062522a0cc0SAlex Elder 			 * the image represents, clear its existence
4063522a0cc0SAlex Elder 			 * flag so we can avoid sending any more
4064522a0cc0SAlex Elder 			 * requests to it.
40656d292906SAlex Elder 			 */
40660d7dbfceSAlex Elder 			if (rbd_dev->spec->snap_id == snap->id)
40676d292906SAlex Elder 				clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
40683e83b65bSAlex Elder 			dout("removing %ssnap id %llu\n",
40690d7dbfceSAlex Elder 				rbd_dev->spec->snap_id == snap->id ?
40700d7dbfceSAlex Elder 							"mapped " : "",
40719fcbb800SAlex Elder 				(unsigned long long)snap->id);
40726087b51bSAlex Elder 
40736087b51bSAlex Elder 			list_del(&snap->node);
40746087b51bSAlex Elder 			rbd_snap_destroy(snap);
4075dfc5606dSYehuda Sadeh 
407635938150SAlex Elder 			/* Done with this list entry; advance */
407735938150SAlex Elder 
407835938150SAlex Elder 			links = next;
407935938150SAlex Elder 			continue;
4080dfc5606dSYehuda Sadeh 		}
408135938150SAlex Elder 
4082b8b1e2dbSAlex Elder 		snap_name = rbd_dev_snap_info(rbd_dev, index,
4083cd892126SAlex Elder 					&snap_size, &snap_features);
4084522a0cc0SAlex Elder 		if (IS_ERR(snap_name)) {
4085522a0cc0SAlex Elder 			ret = PTR_ERR(snap_name);
4086522a0cc0SAlex Elder 			dout("failed to get snap info, error %d\n", ret);
4087522a0cc0SAlex Elder 			goto out_err;
4088522a0cc0SAlex Elder 		}
4089cd892126SAlex Elder 
40909fcbb800SAlex Elder 		dout("entry %u: snap_id = %llu\n", (unsigned int)snap_count,
40919fcbb800SAlex Elder 			(unsigned long long)snap_id);
409235938150SAlex Elder 		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
409335938150SAlex Elder 			struct rbd_snap *new_snap;
409435938150SAlex Elder 
409535938150SAlex Elder 			/* We haven't seen this snapshot before */
409635938150SAlex Elder 
40976087b51bSAlex Elder 			new_snap = rbd_snap_create(rbd_dev, snap_name,
4098cd892126SAlex Elder 					snap_id, snap_size, snap_features);
40999fcbb800SAlex Elder 			if (IS_ERR(new_snap)) {
4100522a0cc0SAlex Elder 				ret = PTR_ERR(new_snap);
4101522a0cc0SAlex Elder 				dout("  failed to add dev, error %d\n", ret);
4102522a0cc0SAlex Elder 				goto out_err;
41039fcbb800SAlex Elder 			}
410435938150SAlex Elder 
410535938150SAlex Elder 			/* New goes before existing, or at end of list */
410635938150SAlex Elder 
41079fcbb800SAlex Elder 			dout("  added dev%s\n", snap ? "" : " at end\n");
410835938150SAlex Elder 			if (snap)
410935938150SAlex Elder 				list_add_tail(&new_snap->node, &snap->node);
411035938150SAlex Elder 			else
4111523f3258SAlex Elder 				list_add_tail(&new_snap->node, head);
411235938150SAlex Elder 		} else {
411335938150SAlex Elder 			/* Already have this one */
411435938150SAlex Elder 
41159fcbb800SAlex Elder 			dout("  already present\n");
41169fcbb800SAlex Elder 
4117cd892126SAlex Elder 			rbd_assert(snap->size == snap_size);
4118aafb230eSAlex Elder 			rbd_assert(!strcmp(snap->name, snap_name));
4119cd892126SAlex Elder 			rbd_assert(snap->features == snap_features);
412035938150SAlex Elder 
412135938150SAlex Elder 			/* Done with this list entry; advance */
412235938150SAlex Elder 
412335938150SAlex Elder 			links = links->next;
4124dfc5606dSYehuda Sadeh 		}
412535938150SAlex Elder 
412635938150SAlex Elder 		/* Advance to the next entry in the snapshot context */
412735938150SAlex Elder 
412835938150SAlex Elder 		index++;
4129dfc5606dSYehuda Sadeh 	}
41309fcbb800SAlex Elder 	dout("%s: done\n", __func__);
4131dfc5606dSYehuda Sadeh 
4132dfc5606dSYehuda Sadeh 	return 0;
4133522a0cc0SAlex Elder out_err:
4134522a0cc0SAlex Elder 	rbd_remove_all_snaps(rbd_dev);
4135522a0cc0SAlex Elder 
4136522a0cc0SAlex Elder 	return ret;
4137dfc5606dSYehuda Sadeh }
4138dfc5606dSYehuda Sadeh 
4139dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
4140dfc5606dSYehuda Sadeh {
4141dfc5606dSYehuda Sadeh 	struct device *dev;
4142cd789ab9SAlex Elder 	int ret;
4143dfc5606dSYehuda Sadeh 
4144dfc5606dSYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
4145dfc5606dSYehuda Sadeh 
4146cd789ab9SAlex Elder 	dev = &rbd_dev->dev;
4147dfc5606dSYehuda Sadeh 	dev->bus = &rbd_bus_type;
4148dfc5606dSYehuda Sadeh 	dev->type = &rbd_device_type;
4149dfc5606dSYehuda Sadeh 	dev->parent = &rbd_root_dev;
4150dfc5606dSYehuda Sadeh 	dev->release = rbd_dev_release;
4151de71a297SAlex Elder 	dev_set_name(dev, "%d", rbd_dev->dev_id);
4152dfc5606dSYehuda Sadeh 	ret = device_register(dev);
4153dfc5606dSYehuda Sadeh 
4154dfc5606dSYehuda Sadeh 	mutex_unlock(&ctl_mutex);
4155cd789ab9SAlex Elder 
4156dfc5606dSYehuda Sadeh 	return ret;
4157602adf40SYehuda Sadeh }
4158602adf40SYehuda Sadeh 
4159dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
4160dfc5606dSYehuda Sadeh {
4161dfc5606dSYehuda Sadeh 	device_unregister(&rbd_dev->dev);
4162dfc5606dSYehuda Sadeh }
4163dfc5606dSYehuda Sadeh 
4164e2839308SAlex Elder static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
41651ddbe94eSAlex Elder 
41661ddbe94eSAlex Elder /*
4167499afd5bSAlex Elder  * Get a unique rbd identifier for the given new rbd_dev, and add
4168499afd5bSAlex Elder  * the rbd_dev to the global list.  The minimum rbd id is 1.
41691ddbe94eSAlex Elder  */
4170e2839308SAlex Elder static void rbd_dev_id_get(struct rbd_device *rbd_dev)
4171b7f23c36SAlex Elder {
4172e2839308SAlex Elder 	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
4173499afd5bSAlex Elder 
4174499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
4175499afd5bSAlex Elder 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
4176499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
4177e2839308SAlex Elder 	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
4178e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
4179b7f23c36SAlex Elder }
4180b7f23c36SAlex Elder 
41811ddbe94eSAlex Elder /*
4182499afd5bSAlex Elder  * Remove an rbd_dev from the global list, and record that its
4183499afd5bSAlex Elder  * identifier is no longer in use.
41841ddbe94eSAlex Elder  */
4185e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev)
41861ddbe94eSAlex Elder {
4187d184f6bfSAlex Elder 	struct list_head *tmp;
4188de71a297SAlex Elder 	int rbd_id = rbd_dev->dev_id;
4189d184f6bfSAlex Elder 	int max_id;
4190d184f6bfSAlex Elder 
4191aafb230eSAlex Elder 	rbd_assert(rbd_id > 0);
4192499afd5bSAlex Elder 
4193e2839308SAlex Elder 	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
4194e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
4195499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
4196499afd5bSAlex Elder 	list_del_init(&rbd_dev->node);
4197d184f6bfSAlex Elder 
4198d184f6bfSAlex Elder 	/*
4199d184f6bfSAlex Elder 	 * If the id being "put" is not the current maximum, there
4200d184f6bfSAlex Elder 	 * is nothing special we need to do.
4201d184f6bfSAlex Elder 	 */
4202e2839308SAlex Elder 	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
4203d184f6bfSAlex Elder 		spin_unlock(&rbd_dev_list_lock);
4204d184f6bfSAlex Elder 		return;
4205d184f6bfSAlex Elder 	}
4206d184f6bfSAlex Elder 
4207d184f6bfSAlex Elder 	/*
4208d184f6bfSAlex Elder 	 * We need to update the current maximum id.  Search the
4209d184f6bfSAlex Elder 	 * list to find out what it is.  We're more likely to find
4210d184f6bfSAlex Elder 	 * the maximum at the end, so search the list backward.
4211d184f6bfSAlex Elder 	 */
4212d184f6bfSAlex Elder 	max_id = 0;
4213d184f6bfSAlex Elder 	list_for_each_prev(tmp, &rbd_dev_list) {
4214d184f6bfSAlex Elder 		struct rbd_device *rbd_dev;
4215d184f6bfSAlex Elder 
4216d184f6bfSAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
4217b213e0b1SAlex Elder 		if (rbd_dev->dev_id > max_id)
4218b213e0b1SAlex Elder 			max_id = rbd_dev->dev_id;
4219d184f6bfSAlex Elder 	}
4220499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
42211ddbe94eSAlex Elder 
42221ddbe94eSAlex Elder 	/*
4223e2839308SAlex Elder 	 * The max id could have been updated by rbd_dev_id_get(), in
4224d184f6bfSAlex Elder 	 * which case it now accurately reflects the new maximum.
4225d184f6bfSAlex Elder 	 * Be careful not to overwrite the maximum value in that
4226d184f6bfSAlex Elder 	 * case.
42271ddbe94eSAlex Elder 	 */
4228e2839308SAlex Elder 	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
4229e2839308SAlex Elder 	dout("  max dev id has been reset\n");
4230b7f23c36SAlex Elder }
4231b7f23c36SAlex Elder 
/*
 * Advance *buf past any leading white space, leaving it pointing at
 * the first non-space character (or at the terminating '\0' if
 * there is none).  Returns the number of non-space characters found
 * there, i.e. the token length, which is 0 if no token remains.
 * The string at *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the "C"
	 * and "POSIX" locales.
	 */
	const char *const blanks = " \f\n\r\t\v";
	const char *start = *buf;

	start += strspn(start, blanks);		/* Skip to the token */
	*buf = start;

	return strcspn(start, blanks);		/* Measure the token */
}
4250e28fff26SAlex Elder 
/*
 * Locate the next token in *buf and, provided the supplied token
 * buffer can hold it along with a terminating '\0', copy it there.
 * The string at *buf must be terminated with '\0' on entry.
 *
 * Returns the length of the token (the '\0' is not counted).  That
 * is 0 if there was no token, and is token_size or more if the
 * token did not fit, in which case the token buffer is not touched.
 *
 * Whether or not the token was copied, *buf ends up pointing just
 * past the end of it.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	const size_t token_len = next_token(buf);
	const char *start = *buf;

	*buf = start + token_len;
	if (token_len >= token_size)
		return token_len;	/* No room; report length only */

	memcpy(token, start, token_len);
	token[token_len] = '\0';

	return token_len;
}
4280e28fff26SAlex Elder 
4281e28fff26SAlex Elder /*
4282ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
4283ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
4284ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
4285ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
4286ea3352f4SAlex Elder  *
4287ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
4288ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
4289ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
4290ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
4291ea3352f4SAlex Elder  *
4292ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
4293ea3352f4SAlex Elder  * the end of the found token.
4294ea3352f4SAlex Elder  *
4295ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
4296ea3352f4SAlex Elder  */
4297ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
4298ea3352f4SAlex Elder {
4299ea3352f4SAlex Elder 	char *dup;
4300ea3352f4SAlex Elder 	size_t len;
4301ea3352f4SAlex Elder 
4302ea3352f4SAlex Elder 	len = next_token(buf);
43034caf35f9SAlex Elder 	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
4304ea3352f4SAlex Elder 	if (!dup)
4305ea3352f4SAlex Elder 		return NULL;
4306ea3352f4SAlex Elder 	*(dup + len) = '\0';
4307ea3352f4SAlex Elder 	*buf += len;
4308ea3352f4SAlex Elder 
4309ea3352f4SAlex Elder 	if (lenp)
4310ea3352f4SAlex Elder 		*lenp = len;
4311ea3352f4SAlex Elder 
4312ea3352f4SAlex Elder 	return dup;
4313ea3352f4SAlex Elder }
4314ea3352f4SAlex Elder 
4315ea3352f4SAlex Elder /*
4316859c31dfSAlex Elder  * Parse the options provided for an "rbd add" (i.e., rbd image
4317859c31dfSAlex Elder  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
4318859c31dfSAlex Elder  * and the data written is passed here via a NUL-terminated buffer.
4319859c31dfSAlex Elder  * Returns 0 if successful or an error code otherwise.
4320d22f76e7SAlex Elder  *
4321859c31dfSAlex Elder  * The information extracted from these options is recorded in
4322859c31dfSAlex Elder  * the other parameters which return dynamically-allocated
4323859c31dfSAlex Elder  * structures:
4324859c31dfSAlex Elder  *  ceph_opts
4325859c31dfSAlex Elder  *      The address of a pointer that will refer to a ceph options
4326859c31dfSAlex Elder  *      structure.  Caller must release the returned pointer using
4327859c31dfSAlex Elder  *      ceph_destroy_options() when it is no longer needed.
4328859c31dfSAlex Elder  *  rbd_opts
4329859c31dfSAlex Elder  *	Address of an rbd options pointer.  Fully initialized by
4330859c31dfSAlex Elder  *	this function; caller must release with kfree().
4331859c31dfSAlex Elder  *  spec
4332859c31dfSAlex Elder  *	Address of an rbd image specification pointer.  Fully
4333859c31dfSAlex Elder  *	initialized by this function based on parsed options.
4334859c31dfSAlex Elder  *	Caller must release with rbd_spec_put().
4335859c31dfSAlex Elder  *
4336859c31dfSAlex Elder  * The options passed take this form:
4337859c31dfSAlex Elder  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
4338859c31dfSAlex Elder  * where:
4339859c31dfSAlex Elder  *  <mon_addrs>
4340859c31dfSAlex Elder  *      A comma-separated list of one or more monitor addresses.
4341859c31dfSAlex Elder  *      A monitor address is an ip address, optionally followed
4342859c31dfSAlex Elder  *      by a port number (separated by a colon).
4343859c31dfSAlex Elder  *        I.e.:  ip1[:port1][,ip2[:port2]...]
4344859c31dfSAlex Elder  *  <options>
4345859c31dfSAlex Elder  *      A comma-separated list of ceph and/or rbd options.
4346859c31dfSAlex Elder  *  <pool_name>
4347859c31dfSAlex Elder  *      The name of the rados pool containing the rbd image.
4348859c31dfSAlex Elder  *  <image_name>
4349859c31dfSAlex Elder  *      The name of the image in that pool to map.
4350859c31dfSAlex Elder  *  <snap_id>
4351859c31dfSAlex Elder  *      An optional snapshot id.  If provided, the mapping will
4352859c31dfSAlex Elder  *      present data from the image at the time that snapshot was
4353859c31dfSAlex Elder  *      created.  The image head is used if no snapshot id is
4354859c31dfSAlex Elder  *      provided.  Snapshot mappings are always read-only.
4355a725f65eSAlex Elder  */
4356859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf,
4357dc79b113SAlex Elder 				struct ceph_options **ceph_opts,
4358859c31dfSAlex Elder 				struct rbd_options **opts,
4359859c31dfSAlex Elder 				struct rbd_spec **rbd_spec)
4360a725f65eSAlex Elder {
4361e28fff26SAlex Elder 	size_t len;
4362859c31dfSAlex Elder 	char *options;
43630ddebc0cSAlex Elder 	const char *mon_addrs;
43640ddebc0cSAlex Elder 	size_t mon_addrs_size;
4365859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
43664e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
4367859c31dfSAlex Elder 	struct ceph_options *copts;
4368dc79b113SAlex Elder 	int ret;
4369e28fff26SAlex Elder 
4370e28fff26SAlex Elder 	/* The first four tokens are required */
4371e28fff26SAlex Elder 
43727ef3214aSAlex Elder 	len = next_token(&buf);
43734fb5d671SAlex Elder 	if (!len) {
43744fb5d671SAlex Elder 		rbd_warn(NULL, "no monitor address(es) provided");
43754fb5d671SAlex Elder 		return -EINVAL;
43764fb5d671SAlex Elder 	}
43770ddebc0cSAlex Elder 	mon_addrs = buf;
4378f28e565aSAlex Elder 	mon_addrs_size = len + 1;
43797ef3214aSAlex Elder 	buf += len;
4380a725f65eSAlex Elder 
4381dc79b113SAlex Elder 	ret = -EINVAL;
4382f28e565aSAlex Elder 	options = dup_token(&buf, NULL);
4383f28e565aSAlex Elder 	if (!options)
4384dc79b113SAlex Elder 		return -ENOMEM;
43854fb5d671SAlex Elder 	if (!*options) {
43864fb5d671SAlex Elder 		rbd_warn(NULL, "no options provided");
43874fb5d671SAlex Elder 		goto out_err;
43884fb5d671SAlex Elder 	}
4389a725f65eSAlex Elder 
4390859c31dfSAlex Elder 	spec = rbd_spec_alloc();
4391859c31dfSAlex Elder 	if (!spec)
4392f28e565aSAlex Elder 		goto out_mem;
4393859c31dfSAlex Elder 
4394859c31dfSAlex Elder 	spec->pool_name = dup_token(&buf, NULL);
4395859c31dfSAlex Elder 	if (!spec->pool_name)
4396859c31dfSAlex Elder 		goto out_mem;
43974fb5d671SAlex Elder 	if (!*spec->pool_name) {
43984fb5d671SAlex Elder 		rbd_warn(NULL, "no pool name provided");
43994fb5d671SAlex Elder 		goto out_err;
44004fb5d671SAlex Elder 	}
4401e28fff26SAlex Elder 
440269e7a02fSAlex Elder 	spec->image_name = dup_token(&buf, NULL);
4403859c31dfSAlex Elder 	if (!spec->image_name)
4404f28e565aSAlex Elder 		goto out_mem;
44054fb5d671SAlex Elder 	if (!*spec->image_name) {
44064fb5d671SAlex Elder 		rbd_warn(NULL, "no image name provided");
44074fb5d671SAlex Elder 		goto out_err;
44084fb5d671SAlex Elder 	}
4409e28fff26SAlex Elder 
4410f28e565aSAlex Elder 	/*
4411f28e565aSAlex Elder 	 * Snapshot name is optional; default is to use "-"
4412f28e565aSAlex Elder 	 * (indicating the head/no snapshot).
4413f28e565aSAlex Elder 	 */
44143feeb894SAlex Elder 	len = next_token(&buf);
4415820a5f3eSAlex Elder 	if (!len) {
44163feeb894SAlex Elder 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
44173feeb894SAlex Elder 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
4418f28e565aSAlex Elder 	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
4419dc79b113SAlex Elder 		ret = -ENAMETOOLONG;
4420f28e565aSAlex Elder 		goto out_err;
4421849b4260SAlex Elder 	}
44224caf35f9SAlex Elder 	spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
4423859c31dfSAlex Elder 	if (!spec->snap_name)
4424f28e565aSAlex Elder 		goto out_mem;
4425859c31dfSAlex Elder 	*(spec->snap_name + len) = '\0';
4426e5c35534SAlex Elder 
44270ddebc0cSAlex Elder 	/* Initialize all rbd options to the defaults */
4428e28fff26SAlex Elder 
44294e9afebaSAlex Elder 	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
44304e9afebaSAlex Elder 	if (!rbd_opts)
44314e9afebaSAlex Elder 		goto out_mem;
44324e9afebaSAlex Elder 
44334e9afebaSAlex Elder 	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
4434d22f76e7SAlex Elder 
4435859c31dfSAlex Elder 	copts = ceph_parse_options(options, mon_addrs,
44360ddebc0cSAlex Elder 					mon_addrs + mon_addrs_size - 1,
44374e9afebaSAlex Elder 					parse_rbd_opts_token, rbd_opts);
4438859c31dfSAlex Elder 	if (IS_ERR(copts)) {
4439859c31dfSAlex Elder 		ret = PTR_ERR(copts);
4440dc79b113SAlex Elder 		goto out_err;
4441dc79b113SAlex Elder 	}
4442859c31dfSAlex Elder 	kfree(options);
4443859c31dfSAlex Elder 
4444859c31dfSAlex Elder 	*ceph_opts = copts;
44454e9afebaSAlex Elder 	*opts = rbd_opts;
4446859c31dfSAlex Elder 	*rbd_spec = spec;
44470ddebc0cSAlex Elder 
4448dc79b113SAlex Elder 	return 0;
4449f28e565aSAlex Elder out_mem:
4450dc79b113SAlex Elder 	ret = -ENOMEM;
4451d22f76e7SAlex Elder out_err:
4452859c31dfSAlex Elder 	kfree(rbd_opts);
4453859c31dfSAlex Elder 	rbd_spec_put(spec);
4454f28e565aSAlex Elder 	kfree(options);
4455d22f76e7SAlex Elder 
4456dc79b113SAlex Elder 	return ret;
4457a725f65eSAlex Elder }
4458a725f65eSAlex Elder 
4459589d30e0SAlex Elder /*
4460589d30e0SAlex Elder  * An rbd format 2 image has a unique identifier, distinct from the
4461589d30e0SAlex Elder  * name given to it by the user.  Internally, that identifier is
4462589d30e0SAlex Elder  * what's used to specify the names of objects related to the image.
4463589d30e0SAlex Elder  *
4464589d30e0SAlex Elder  * A special "rbd id" object is used to map an rbd image name to its
4465589d30e0SAlex Elder  * id.  If that object doesn't exist, then there is no v2 rbd image
4466589d30e0SAlex Elder  * with the supplied name.
4467589d30e0SAlex Elder  *
4468589d30e0SAlex Elder  * This function will record the given rbd_dev's image_id field if
4469589d30e0SAlex Elder  * it can be determined, and in that case will return 0.  If any
4470589d30e0SAlex Elder  * errors occur a negative errno will be returned and the rbd_dev's
4471589d30e0SAlex Elder  * image_id field will be unchanged (and should be NULL).
4472589d30e0SAlex Elder  */
4473589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev)
4474589d30e0SAlex Elder {
4475589d30e0SAlex Elder 	int ret;
4476589d30e0SAlex Elder 	size_t size;
4477589d30e0SAlex Elder 	char *object_name;
4478589d30e0SAlex Elder 	void *response;
4479589d30e0SAlex Elder 	void *p;
4480589d30e0SAlex Elder 
44812f82ee54SAlex Elder 	/* If we already have it we don't need to look it up */
44822f82ee54SAlex Elder 
44832f82ee54SAlex Elder 	if (rbd_dev->spec->image_id)
44842f82ee54SAlex Elder 		return 0;
44852f82ee54SAlex Elder 
4486589d30e0SAlex Elder 	/*
44872c0d0a10SAlex Elder 	 * When probing a parent image, the image id is already
44882c0d0a10SAlex Elder 	 * known (and the image name likely is not).  There's no
44892c0d0a10SAlex Elder 	 * need to fetch the image id again in this case.
44902c0d0a10SAlex Elder 	 */
44912c0d0a10SAlex Elder 	if (rbd_dev->spec->image_id)
44922c0d0a10SAlex Elder 		return 0;
44932c0d0a10SAlex Elder 
44942c0d0a10SAlex Elder 	/*
4495589d30e0SAlex Elder 	 * First, see if the format 2 image id file exists, and if
4496589d30e0SAlex Elder 	 * so, get the image's persistent id from it.
4497589d30e0SAlex Elder 	 */
449869e7a02fSAlex Elder 	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
4499589d30e0SAlex Elder 	object_name = kmalloc(size, GFP_NOIO);
4500589d30e0SAlex Elder 	if (!object_name)
4501589d30e0SAlex Elder 		return -ENOMEM;
45020d7dbfceSAlex Elder 	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
4503589d30e0SAlex Elder 	dout("rbd id object name is %s\n", object_name);
4504589d30e0SAlex Elder 
4505589d30e0SAlex Elder 	/* Response will be an encoded string, which includes a length */
4506589d30e0SAlex Elder 
4507589d30e0SAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
4508589d30e0SAlex Elder 	response = kzalloc(size, GFP_NOIO);
4509589d30e0SAlex Elder 	if (!response) {
4510589d30e0SAlex Elder 		ret = -ENOMEM;
4511589d30e0SAlex Elder 		goto out;
4512589d30e0SAlex Elder 	}
4513589d30e0SAlex Elder 
451436be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, object_name,
45154157976bSAlex Elder 				"rbd", "get_id", NULL, 0,
451607b2391fSAlex Elder 				response, RBD_IMAGE_ID_LEN_MAX, NULL);
451736be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4518589d30e0SAlex Elder 	if (ret < 0)
4519589d30e0SAlex Elder 		goto out;
4520589d30e0SAlex Elder 
4521589d30e0SAlex Elder 	p = response;
45220d7dbfceSAlex Elder 	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
452357385b51SAlex Elder 						p + ret,
4524979ed480SAlex Elder 						NULL, GFP_NOIO);
452557385b51SAlex Elder 	ret = 0;
452657385b51SAlex Elder 
45270d7dbfceSAlex Elder 	if (IS_ERR(rbd_dev->spec->image_id)) {
45280d7dbfceSAlex Elder 		ret = PTR_ERR(rbd_dev->spec->image_id);
45290d7dbfceSAlex Elder 		rbd_dev->spec->image_id = NULL;
4530589d30e0SAlex Elder 	} else {
45310d7dbfceSAlex Elder 		dout("image_id is %s\n", rbd_dev->spec->image_id);
4532589d30e0SAlex Elder 	}
4533589d30e0SAlex Elder out:
4534589d30e0SAlex Elder 	kfree(response);
4535589d30e0SAlex Elder 	kfree(object_name);
4536589d30e0SAlex Elder 
4537589d30e0SAlex Elder 	return ret;
4538589d30e0SAlex Elder }
4539589d30e0SAlex Elder 
4540a30b71b9SAlex Elder static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
4541a30b71b9SAlex Elder {
4542a30b71b9SAlex Elder 	int ret;
4543a30b71b9SAlex Elder 	size_t size;
4544a30b71b9SAlex Elder 
4545a30b71b9SAlex Elder 	/* Version 1 images have no id; empty string is used */
4546a30b71b9SAlex Elder 
45470d7dbfceSAlex Elder 	rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
45480d7dbfceSAlex Elder 	if (!rbd_dev->spec->image_id)
4549a30b71b9SAlex Elder 		return -ENOMEM;
4550a30b71b9SAlex Elder 
4551a30b71b9SAlex Elder 	/* Record the header object name for this rbd image. */
4552a30b71b9SAlex Elder 
455369e7a02fSAlex Elder 	size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
4554a30b71b9SAlex Elder 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
4555a30b71b9SAlex Elder 	if (!rbd_dev->header_name) {
4556a30b71b9SAlex Elder 		ret = -ENOMEM;
4557a30b71b9SAlex Elder 		goto out_err;
4558a30b71b9SAlex Elder 	}
45590d7dbfceSAlex Elder 	sprintf(rbd_dev->header_name, "%s%s",
45600d7dbfceSAlex Elder 		rbd_dev->spec->image_name, RBD_SUFFIX);
4561a30b71b9SAlex Elder 
4562a30b71b9SAlex Elder 	/* Populate rbd image metadata */
4563a30b71b9SAlex Elder 
4564a30b71b9SAlex Elder 	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
4565a30b71b9SAlex Elder 	if (ret < 0)
4566a30b71b9SAlex Elder 		goto out_err;
456786b00e0dSAlex Elder 
456886b00e0dSAlex Elder 	/* Version 1 images have no parent (no layering) */
456986b00e0dSAlex Elder 
457086b00e0dSAlex Elder 	rbd_dev->parent_spec = NULL;
457186b00e0dSAlex Elder 	rbd_dev->parent_overlap = 0;
457286b00e0dSAlex Elder 
4573a30b71b9SAlex Elder 	rbd_dev->image_format = 1;
4574a30b71b9SAlex Elder 
4575a30b71b9SAlex Elder 	dout("discovered version 1 image, header name is %s\n",
4576a30b71b9SAlex Elder 		rbd_dev->header_name);
4577a30b71b9SAlex Elder 
4578a30b71b9SAlex Elder 	return 0;
4579a30b71b9SAlex Elder 
4580a30b71b9SAlex Elder out_err:
4581a30b71b9SAlex Elder 	kfree(rbd_dev->header_name);
4582a30b71b9SAlex Elder 	rbd_dev->header_name = NULL;
45830d7dbfceSAlex Elder 	kfree(rbd_dev->spec->image_id);
45840d7dbfceSAlex Elder 	rbd_dev->spec->image_id = NULL;
4585a30b71b9SAlex Elder 
4586a30b71b9SAlex Elder 	return ret;
4587a30b71b9SAlex Elder }
4588a30b71b9SAlex Elder 
4589a30b71b9SAlex Elder static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
4590a30b71b9SAlex Elder {
4591a30b71b9SAlex Elder 	size_t size;
45929d475de5SAlex Elder 	int ret;
45936e14b1a6SAlex Elder 	u64 ver = 0;
4594a30b71b9SAlex Elder 
4595a30b71b9SAlex Elder 	/*
4596a30b71b9SAlex Elder 	 * Image id was filled in by the caller.  Record the header
4597a30b71b9SAlex Elder 	 * object name for this rbd image.
4598a30b71b9SAlex Elder 	 */
4599979ed480SAlex Elder 	size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
4600a30b71b9SAlex Elder 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
4601a30b71b9SAlex Elder 	if (!rbd_dev->header_name)
4602a30b71b9SAlex Elder 		return -ENOMEM;
4603a30b71b9SAlex Elder 	sprintf(rbd_dev->header_name, "%s%s",
46040d7dbfceSAlex Elder 			RBD_HEADER_PREFIX, rbd_dev->spec->image_id);
46059d475de5SAlex Elder 
46069d475de5SAlex Elder 	/* Get the size and object order for the image */
46079d475de5SAlex Elder 	ret = rbd_dev_v2_image_size(rbd_dev);
460857385b51SAlex Elder 	if (ret)
46099d475de5SAlex Elder 		goto out_err;
46101e130199SAlex Elder 
46111e130199SAlex Elder 	/* Get the object prefix (a.k.a. block_name) for the image */
46121e130199SAlex Elder 
46131e130199SAlex Elder 	ret = rbd_dev_v2_object_prefix(rbd_dev);
461457385b51SAlex Elder 	if (ret)
46151e130199SAlex Elder 		goto out_err;
4616b1b5402aSAlex Elder 
4617d889140cSAlex Elder 	/* Get the and check features for the image */
4618b1b5402aSAlex Elder 
4619b1b5402aSAlex Elder 	ret = rbd_dev_v2_features(rbd_dev);
462057385b51SAlex Elder 	if (ret)
4621b1b5402aSAlex Elder 		goto out_err;
462235d489f9SAlex Elder 
462386b00e0dSAlex Elder 	/* If the image supports layering, get the parent info */
462486b00e0dSAlex Elder 
462586b00e0dSAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
462686b00e0dSAlex Elder 		ret = rbd_dev_v2_parent_info(rbd_dev);
462757385b51SAlex Elder 		if (ret)
462886b00e0dSAlex Elder 			goto out_err;
4629770eba6eSAlex Elder 		rbd_warn(rbd_dev, "WARNING: kernel support for "
4630770eba6eSAlex Elder 					"layered rbd images is EXPERIMENTAL!");
463186b00e0dSAlex Elder 	}
463286b00e0dSAlex Elder 
4633cc070d59SAlex Elder 	/* If the image supports fancy striping, get its parameters */
4634cc070d59SAlex Elder 
4635cc070d59SAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
4636cc070d59SAlex Elder 		ret = rbd_dev_v2_striping_info(rbd_dev);
4637cc070d59SAlex Elder 		if (ret < 0)
4638cc070d59SAlex Elder 			goto out_err;
4639cc070d59SAlex Elder 	}
4640cc070d59SAlex Elder 
46416e14b1a6SAlex Elder 	/* crypto and compression type aren't (yet) supported for v2 images */
464235d489f9SAlex Elder 
46436e14b1a6SAlex Elder 	rbd_dev->header.crypt_type = 0;
46446e14b1a6SAlex Elder 	rbd_dev->header.comp_type = 0;
46456e14b1a6SAlex Elder 
46466e14b1a6SAlex Elder 	/* Get the snapshot context, plus the header version */
46476e14b1a6SAlex Elder 
46486e14b1a6SAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
464935d489f9SAlex Elder 	if (ret)
465035d489f9SAlex Elder 		goto out_err;
46516e14b1a6SAlex Elder 	rbd_dev->header.obj_version = ver;
46526e14b1a6SAlex Elder 
4653a30b71b9SAlex Elder 	rbd_dev->image_format = 2;
4654a30b71b9SAlex Elder 
4655a30b71b9SAlex Elder 	dout("discovered version 2 image, header name is %s\n",
4656a30b71b9SAlex Elder 		rbd_dev->header_name);
4657a30b71b9SAlex Elder 
465835152979SAlex Elder 	return 0;
46599d475de5SAlex Elder out_err:
466086b00e0dSAlex Elder 	rbd_dev->parent_overlap = 0;
466186b00e0dSAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
466286b00e0dSAlex Elder 	rbd_dev->parent_spec = NULL;
46639d475de5SAlex Elder 	kfree(rbd_dev->header_name);
46649d475de5SAlex Elder 	rbd_dev->header_name = NULL;
46651e130199SAlex Elder 	kfree(rbd_dev->header.object_prefix);
46661e130199SAlex Elder 	rbd_dev->header.object_prefix = NULL;
46679d475de5SAlex Elder 
46689d475de5SAlex Elder 	return ret;
4669a30b71b9SAlex Elder }
4670a30b71b9SAlex Elder 
467183a06263SAlex Elder static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
467283a06263SAlex Elder {
46732f82ee54SAlex Elder 	struct rbd_device *parent = NULL;
46742f82ee54SAlex Elder 	struct rbd_spec *parent_spec = NULL;
46752f82ee54SAlex Elder 	struct rbd_client *rbdc = NULL;
467683a06263SAlex Elder 	int ret;
467783a06263SAlex Elder 
467883a06263SAlex Elder 	/* no need to lock here, as rbd_dev is not registered yet */
467983a06263SAlex Elder 	ret = rbd_dev_snaps_update(rbd_dev);
468083a06263SAlex Elder 	if (ret)
468183a06263SAlex Elder 		return ret;
468283a06263SAlex Elder 
46839e15b77dSAlex Elder 	ret = rbd_dev_probe_update_spec(rbd_dev);
46849e15b77dSAlex Elder 	if (ret)
46859e15b77dSAlex Elder 		goto err_out_snaps;
46869e15b77dSAlex Elder 
468783a06263SAlex Elder 	ret = rbd_dev_set_mapping(rbd_dev);
468883a06263SAlex Elder 	if (ret)
468983a06263SAlex Elder 		goto err_out_snaps;
469083a06263SAlex Elder 
469183a06263SAlex Elder 	/* generate unique id: find highest unique id, add one */
469283a06263SAlex Elder 	rbd_dev_id_get(rbd_dev);
469383a06263SAlex Elder 
469483a06263SAlex Elder 	/* Fill in the device name, now that we have its id. */
469583a06263SAlex Elder 	BUILD_BUG_ON(DEV_NAME_LEN
469683a06263SAlex Elder 			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
469783a06263SAlex Elder 	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
469883a06263SAlex Elder 
469983a06263SAlex Elder 	/* Get our block major device number. */
470083a06263SAlex Elder 
470183a06263SAlex Elder 	ret = register_blkdev(0, rbd_dev->name);
470283a06263SAlex Elder 	if (ret < 0)
470383a06263SAlex Elder 		goto err_out_id;
470483a06263SAlex Elder 	rbd_dev->major = ret;
470583a06263SAlex Elder 
470683a06263SAlex Elder 	/* Set up the blkdev mapping. */
470783a06263SAlex Elder 
470883a06263SAlex Elder 	ret = rbd_init_disk(rbd_dev);
470983a06263SAlex Elder 	if (ret)
471083a06263SAlex Elder 		goto err_out_blkdev;
471183a06263SAlex Elder 
471283a06263SAlex Elder 	ret = rbd_bus_add_dev(rbd_dev);
471383a06263SAlex Elder 	if (ret)
471483a06263SAlex Elder 		goto err_out_disk;
471583a06263SAlex Elder 
471683a06263SAlex Elder 	/*
471783a06263SAlex Elder 	 * At this point cleanup in the event of an error is the job
471883a06263SAlex Elder 	 * of the sysfs code (initiated by rbd_bus_del_dev()).
471983a06263SAlex Elder 	 */
47202f82ee54SAlex Elder 	/* Probe the parent if there is one */
47212f82ee54SAlex Elder 
47222f82ee54SAlex Elder 	if (rbd_dev->parent_spec) {
47232f82ee54SAlex Elder 		/*
47242f82ee54SAlex Elder 		 * We need to pass a reference to the client and the
47252f82ee54SAlex Elder 		 * parent spec when creating the parent rbd_dev.
47262f82ee54SAlex Elder 		 * Images related by parent/child relationships
47272f82ee54SAlex Elder 		 * always share both.
47282f82ee54SAlex Elder 		 */
47292f82ee54SAlex Elder 		parent_spec = rbd_spec_get(rbd_dev->parent_spec);
47302f82ee54SAlex Elder 		rbdc = __rbd_get_client(rbd_dev->rbd_client);
47312f82ee54SAlex Elder 
47322f82ee54SAlex Elder 		parent = rbd_dev_create(rbdc, parent_spec);
47332f82ee54SAlex Elder 		if (!parent) {
47342f82ee54SAlex Elder 			ret = -ENOMEM;
47352f82ee54SAlex Elder 			goto err_out_spec;
47362f82ee54SAlex Elder 		}
47372f82ee54SAlex Elder 		rbdc = NULL;		/* parent now owns reference */
47382f82ee54SAlex Elder 		parent_spec = NULL;	/* parent now owns reference */
47392f82ee54SAlex Elder 		ret = rbd_dev_probe(parent);
47402f82ee54SAlex Elder 		if (ret < 0)
47412f82ee54SAlex Elder 			goto err_out_parent;
47422f82ee54SAlex Elder 		rbd_dev->parent = parent;
47432f82ee54SAlex Elder 	}
47442f82ee54SAlex Elder 
47459969ebc5SAlex Elder 	ret = rbd_dev_header_watch_sync(rbd_dev, 1);
474683a06263SAlex Elder 	if (ret)
474783a06263SAlex Elder 		goto err_out_bus;
474883a06263SAlex Elder 
474983a06263SAlex Elder 	/* Everything's ready.  Announce the disk to the world. */
475083a06263SAlex Elder 
475183a06263SAlex Elder 	add_disk(rbd_dev->disk);
475283a06263SAlex Elder 
475383a06263SAlex Elder 	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
475483a06263SAlex Elder 		(unsigned long long) rbd_dev->mapping.size);
475583a06263SAlex Elder 
475683a06263SAlex Elder 	return ret;
47572f82ee54SAlex Elder 
47582f82ee54SAlex Elder err_out_parent:
47592f82ee54SAlex Elder 	rbd_dev_destroy(parent);
47602f82ee54SAlex Elder err_out_spec:
47612f82ee54SAlex Elder 	rbd_spec_put(parent_spec);
47622f82ee54SAlex Elder 	rbd_put_client(rbdc);
476383a06263SAlex Elder err_out_bus:
476483a06263SAlex Elder 	/* this will also clean up rest of rbd_dev stuff */
476583a06263SAlex Elder 
476683a06263SAlex Elder 	rbd_bus_del_dev(rbd_dev);
476783a06263SAlex Elder 
476883a06263SAlex Elder 	return ret;
476983a06263SAlex Elder err_out_disk:
477083a06263SAlex Elder 	rbd_free_disk(rbd_dev);
477183a06263SAlex Elder err_out_blkdev:
477283a06263SAlex Elder 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
477383a06263SAlex Elder err_out_id:
477483a06263SAlex Elder 	rbd_dev_id_put(rbd_dev);
477583a06263SAlex Elder err_out_snaps:
477683a06263SAlex Elder 	rbd_remove_all_snaps(rbd_dev);
477783a06263SAlex Elder 
477883a06263SAlex Elder 	return ret;
477983a06263SAlex Elder }
478083a06263SAlex Elder 
4781a30b71b9SAlex Elder /*
4782a30b71b9SAlex Elder  * Probe for the existence of the header object for the given rbd
4783a30b71b9SAlex Elder  * device.  For format 2 images this includes determining the image
4784a30b71b9SAlex Elder  * id.
4785a30b71b9SAlex Elder  */
4786a30b71b9SAlex Elder static int rbd_dev_probe(struct rbd_device *rbd_dev)
4787a30b71b9SAlex Elder {
4788a30b71b9SAlex Elder 	int ret;
4789a30b71b9SAlex Elder 
4790a30b71b9SAlex Elder 	/*
4791a30b71b9SAlex Elder 	 * Get the id from the image id object.  If it's not a
4792a30b71b9SAlex Elder 	 * format 2 image, we'll get ENOENT back, and we'll assume
4793a30b71b9SAlex Elder 	 * it's a format 1 image.
4794a30b71b9SAlex Elder 	 */
4795a30b71b9SAlex Elder 	ret = rbd_dev_image_id(rbd_dev);
4796a30b71b9SAlex Elder 	if (ret)
4797a30b71b9SAlex Elder 		ret = rbd_dev_v1_probe(rbd_dev);
4798a30b71b9SAlex Elder 	else
4799a30b71b9SAlex Elder 		ret = rbd_dev_v2_probe(rbd_dev);
480083a06263SAlex Elder 	if (ret) {
4801a30b71b9SAlex Elder 		dout("probe failed, returning %d\n", ret);
4802a30b71b9SAlex Elder 
4803a30b71b9SAlex Elder 		return ret;
4804a30b71b9SAlex Elder 	}
4805a30b71b9SAlex Elder 
480683a06263SAlex Elder 	ret = rbd_dev_probe_finish(rbd_dev);
480783a06263SAlex Elder 	if (ret)
480883a06263SAlex Elder 		rbd_header_free(&rbd_dev->header);
480983a06263SAlex Elder 
481083a06263SAlex Elder 	return ret;
481183a06263SAlex Elder }
481283a06263SAlex Elder 
481359c2be1eSYehuda Sadeh static ssize_t rbd_add(struct bus_type *bus,
481459c2be1eSYehuda Sadeh 		       const char *buf,
481559c2be1eSYehuda Sadeh 		       size_t count)
4816602adf40SYehuda Sadeh {
4817cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
4818dc79b113SAlex Elder 	struct ceph_options *ceph_opts = NULL;
48194e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
4820859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
48219d3997fdSAlex Elder 	struct rbd_client *rbdc;
482227cc2594SAlex Elder 	struct ceph_osd_client *osdc;
482327cc2594SAlex Elder 	int rc = -ENOMEM;
4824602adf40SYehuda Sadeh 
4825602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
4826602adf40SYehuda Sadeh 		return -ENODEV;
4827602adf40SYehuda Sadeh 
4828a725f65eSAlex Elder 	/* parse add command */
4829859c31dfSAlex Elder 	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
4830dc79b113SAlex Elder 	if (rc < 0)
4831bd4ba655SAlex Elder 		goto err_out_module;
4832a725f65eSAlex Elder 
48339d3997fdSAlex Elder 	rbdc = rbd_get_client(ceph_opts);
48349d3997fdSAlex Elder 	if (IS_ERR(rbdc)) {
48359d3997fdSAlex Elder 		rc = PTR_ERR(rbdc);
48360ddebc0cSAlex Elder 		goto err_out_args;
48379d3997fdSAlex Elder 	}
4838c53d5893SAlex Elder 	ceph_opts = NULL;	/* rbd_dev client now owns this */
4839602adf40SYehuda Sadeh 
4840602adf40SYehuda Sadeh 	/* pick the pool */
48419d3997fdSAlex Elder 	osdc = &rbdc->client->osdc;
4842859c31dfSAlex Elder 	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
4843602adf40SYehuda Sadeh 	if (rc < 0)
4844602adf40SYehuda Sadeh 		goto err_out_client;
4845859c31dfSAlex Elder 	spec->pool_id = (u64) rc;
4846859c31dfSAlex Elder 
48470903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
48480903e875SAlex Elder 
48490903e875SAlex Elder 	if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
48500903e875SAlex Elder 		rc = -EIO;
48510903e875SAlex Elder 		goto err_out_client;
48520903e875SAlex Elder 	}
48530903e875SAlex Elder 
4854c53d5893SAlex Elder 	rbd_dev = rbd_dev_create(rbdc, spec);
4855bd4ba655SAlex Elder 	if (!rbd_dev)
4856bd4ba655SAlex Elder 		goto err_out_client;
4857c53d5893SAlex Elder 	rbdc = NULL;		/* rbd_dev now owns this */
4858c53d5893SAlex Elder 	spec = NULL;		/* rbd_dev now owns this */
4859602adf40SYehuda Sadeh 
4860bd4ba655SAlex Elder 	rbd_dev->mapping.read_only = rbd_opts->read_only;
4861c53d5893SAlex Elder 	kfree(rbd_opts);
4862c53d5893SAlex Elder 	rbd_opts = NULL;	/* done with this */
4863bd4ba655SAlex Elder 
4864a30b71b9SAlex Elder 	rc = rbd_dev_probe(rbd_dev);
4865a30b71b9SAlex Elder 	if (rc < 0)
4866c53d5893SAlex Elder 		goto err_out_rbd_dev;
486705fd6f6fSAlex Elder 
4868602adf40SYehuda Sadeh 	return count;
4869c53d5893SAlex Elder err_out_rbd_dev:
4870c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
4871bd4ba655SAlex Elder err_out_client:
48729d3997fdSAlex Elder 	rbd_put_client(rbdc);
48730ddebc0cSAlex Elder err_out_args:
487478cea76eSAlex Elder 	if (ceph_opts)
487578cea76eSAlex Elder 		ceph_destroy_options(ceph_opts);
48764e9afebaSAlex Elder 	kfree(rbd_opts);
4877859c31dfSAlex Elder 	rbd_spec_put(spec);
4878bd4ba655SAlex Elder err_out_module:
4879bd4ba655SAlex Elder 	module_put(THIS_MODULE);
488027cc2594SAlex Elder 
4881602adf40SYehuda Sadeh 	dout("Error adding device %s\n", buf);
488227cc2594SAlex Elder 
488327cc2594SAlex Elder 	return (ssize_t) rc;
4884602adf40SYehuda Sadeh }
4885602adf40SYehuda Sadeh 
4886de71a297SAlex Elder static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
4887602adf40SYehuda Sadeh {
4888602adf40SYehuda Sadeh 	struct list_head *tmp;
4889602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev;
4890602adf40SYehuda Sadeh 
4891e124a82fSAlex Elder 	spin_lock(&rbd_dev_list_lock);
4892602adf40SYehuda Sadeh 	list_for_each(tmp, &rbd_dev_list) {
4893602adf40SYehuda Sadeh 		rbd_dev = list_entry(tmp, struct rbd_device, node);
4894de71a297SAlex Elder 		if (rbd_dev->dev_id == dev_id) {
4895e124a82fSAlex Elder 			spin_unlock(&rbd_dev_list_lock);
4896602adf40SYehuda Sadeh 			return rbd_dev;
4897602adf40SYehuda Sadeh 		}
4898e124a82fSAlex Elder 	}
4899e124a82fSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
4900602adf40SYehuda Sadeh 	return NULL;
4901602adf40SYehuda Sadeh }
4902602adf40SYehuda Sadeh 
4903dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev)
4904602adf40SYehuda Sadeh {
4905593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4906602adf40SYehuda Sadeh 
490759c2be1eSYehuda Sadeh 	if (rbd_dev->watch_event)
49089969ebc5SAlex Elder 		rbd_dev_header_watch_sync(rbd_dev, 0);
4909602adf40SYehuda Sadeh 
4910602adf40SYehuda Sadeh 	/* clean up and free blkdev */
4911602adf40SYehuda Sadeh 	rbd_free_disk(rbd_dev);
4912602adf40SYehuda Sadeh 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
491332eec68dSAlex Elder 
49142ac4e75dSAlex Elder 	/* release allocated disk header fields */
49152ac4e75dSAlex Elder 	rbd_header_free(&rbd_dev->header);
49162ac4e75dSAlex Elder 
491732eec68dSAlex Elder 	/* done with the id, and with the rbd_dev */
4918e2839308SAlex Elder 	rbd_dev_id_put(rbd_dev);
4919c53d5893SAlex Elder 	rbd_assert(rbd_dev->rbd_client != NULL);
4920c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
4921602adf40SYehuda Sadeh 
4922602adf40SYehuda Sadeh 	/* release module ref */
4923602adf40SYehuda Sadeh 	module_put(THIS_MODULE);
4924602adf40SYehuda Sadeh }
4925602adf40SYehuda Sadeh 
/*
 * Remove a single rbd device: drop all of its snapshots, then
 * delete it from the rbd bus.
 */
static void __rbd_remove(struct rbd_device *rbd_dev)
{
	rbd_remove_all_snaps(rbd_dev);
	rbd_bus_del_dev(rbd_dev);
}
49312f82ee54SAlex Elder 
4932dfc5606dSYehuda Sadeh static ssize_t rbd_remove(struct bus_type *bus,
4933602adf40SYehuda Sadeh 			  const char *buf,
4934602adf40SYehuda Sadeh 			  size_t count)
4935602adf40SYehuda Sadeh {
4936602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
4937602adf40SYehuda Sadeh 	int target_id, rc;
4938602adf40SYehuda Sadeh 	unsigned long ul;
4939602adf40SYehuda Sadeh 	int ret = count;
4940602adf40SYehuda Sadeh 
4941602adf40SYehuda Sadeh 	rc = strict_strtoul(buf, 10, &ul);
4942602adf40SYehuda Sadeh 	if (rc)
4943602adf40SYehuda Sadeh 		return rc;
4944602adf40SYehuda Sadeh 
4945602adf40SYehuda Sadeh 	/* convert to int; abort if we lost anything in the conversion */
4946602adf40SYehuda Sadeh 	target_id = (int) ul;
4947602adf40SYehuda Sadeh 	if (target_id != ul)
4948602adf40SYehuda Sadeh 		return -EINVAL;
4949602adf40SYehuda Sadeh 
4950602adf40SYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
4951602adf40SYehuda Sadeh 
4952602adf40SYehuda Sadeh 	rbd_dev = __rbd_get_dev(target_id);
4953602adf40SYehuda Sadeh 	if (!rbd_dev) {
4954602adf40SYehuda Sadeh 		ret = -ENOENT;
4955602adf40SYehuda Sadeh 		goto done;
4956602adf40SYehuda Sadeh 	}
4957602adf40SYehuda Sadeh 
4958a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
4959b82d167bSAlex Elder 	if (rbd_dev->open_count)
496042382b70SAlex Elder 		ret = -EBUSY;
4961b82d167bSAlex Elder 	else
4962b82d167bSAlex Elder 		set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
4963a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
4964b82d167bSAlex Elder 	if (ret < 0)
496542382b70SAlex Elder 		goto done;
496642382b70SAlex Elder 
49672f82ee54SAlex Elder 	while (rbd_dev->parent_spec) {
49682f82ee54SAlex Elder 		struct rbd_device *first = rbd_dev;
49692f82ee54SAlex Elder 		struct rbd_device *second = first->parent;
49702f82ee54SAlex Elder 		struct rbd_device *third;
49712f82ee54SAlex Elder 
49722f82ee54SAlex Elder 		/*
49732f82ee54SAlex Elder 		 * Follow to the parent with no grandparent and
49742f82ee54SAlex Elder 		 * remove it.
49752f82ee54SAlex Elder 		 */
49762f82ee54SAlex Elder 		while (second && (third = second->parent)) {
49772f82ee54SAlex Elder 			first = second;
49782f82ee54SAlex Elder 			second = third;
49792f82ee54SAlex Elder 		}
49802f82ee54SAlex Elder 		__rbd_remove(second);
49812f82ee54SAlex Elder 		rbd_spec_put(first->parent_spec);
49822f82ee54SAlex Elder 		first->parent_spec = NULL;
49832f82ee54SAlex Elder 		first->parent_overlap = 0;
49842f82ee54SAlex Elder 		first->parent = NULL;
49852f82ee54SAlex Elder 	}
49862f82ee54SAlex Elder 	__rbd_remove(rbd_dev);
4987602adf40SYehuda Sadeh 
4988602adf40SYehuda Sadeh done:
4989602adf40SYehuda Sadeh 	mutex_unlock(&ctl_mutex);
4990aafb230eSAlex Elder 
4991602adf40SYehuda Sadeh 	return ret;
4992602adf40SYehuda Sadeh }
4993602adf40SYehuda Sadeh 
4994602adf40SYehuda Sadeh /*
4995602adf40SYehuda Sadeh  * create control files in sysfs
4996dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
4997602adf40SYehuda Sadeh  */
4998602adf40SYehuda Sadeh static int rbd_sysfs_init(void)
4999602adf40SYehuda Sadeh {
5000dfc5606dSYehuda Sadeh 	int ret;
5001602adf40SYehuda Sadeh 
5002fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
5003dfc5606dSYehuda Sadeh 	if (ret < 0)
5004dfc5606dSYehuda Sadeh 		return ret;
5005602adf40SYehuda Sadeh 
5006fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
5007fed4c143SAlex Elder 	if (ret < 0)
5008fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
5009602adf40SYehuda Sadeh 
5010602adf40SYehuda Sadeh 	return ret;
5011602adf40SYehuda Sadeh }
5012602adf40SYehuda Sadeh 
/*
 * Undo rbd_sysfs_init(): unregister the rbd bus type and then the
 * rbd root device, in the reverse of the order they were registered.
 */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
5018602adf40SYehuda Sadeh 
5019cc344fa1SAlex Elder static int __init rbd_init(void)
5020602adf40SYehuda Sadeh {
5021602adf40SYehuda Sadeh 	int rc;
5022602adf40SYehuda Sadeh 
50231e32d34cSAlex Elder 	if (!libceph_compatible(NULL)) {
50241e32d34cSAlex Elder 		rbd_warn(NULL, "libceph incompatibility (quitting)");
50251e32d34cSAlex Elder 
50261e32d34cSAlex Elder 		return -EINVAL;
50271e32d34cSAlex Elder 	}
5028602adf40SYehuda Sadeh 	rc = rbd_sysfs_init();
5029602adf40SYehuda Sadeh 	if (rc)
5030602adf40SYehuda Sadeh 		return rc;
5031f0f8cef5SAlex Elder 	pr_info("loaded " RBD_DRV_NAME_LONG "\n");
5032602adf40SYehuda Sadeh 	return 0;
5033602adf40SYehuda Sadeh }
5034602adf40SYehuda Sadeh 
/* Module exit: remove the sysfs control files set up by rbd_init(). */
static void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
5039602adf40SYehuda Sadeh 
/* Module entry and exit points */
module_init(rbd_init);
module_exit(rbd_exit);

/* Module metadata */
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");
5051