xref: /openbmc/linux/drivers/block/rbd.c (revision f40eb349)
1602adf40SYehuda Sadeh /*
2602adf40SYehuda Sadeh    rbd.c -- Export ceph rados objects as a Linux block device
3602adf40SYehuda Sadeh 
4602adf40SYehuda Sadeh 
5602adf40SYehuda Sadeh    based on drivers/block/osdblk.c:
6602adf40SYehuda Sadeh 
7602adf40SYehuda Sadeh    Copyright 2009 Red Hat, Inc.
8602adf40SYehuda Sadeh 
9602adf40SYehuda Sadeh    This program is free software; you can redistribute it and/or modify
10602adf40SYehuda Sadeh    it under the terms of the GNU General Public License as published by
11602adf40SYehuda Sadeh    the Free Software Foundation.
12602adf40SYehuda Sadeh 
13602adf40SYehuda Sadeh    This program is distributed in the hope that it will be useful,
14602adf40SYehuda Sadeh    but WITHOUT ANY WARRANTY; without even the implied warranty of
15602adf40SYehuda Sadeh    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16602adf40SYehuda Sadeh    GNU General Public License for more details.
17602adf40SYehuda Sadeh 
18602adf40SYehuda Sadeh    You should have received a copy of the GNU General Public License
19602adf40SYehuda Sadeh    along with this program; see the file COPYING.  If not, write to
20602adf40SYehuda Sadeh    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21602adf40SYehuda Sadeh 
22602adf40SYehuda Sadeh 
23602adf40SYehuda Sadeh 
24dfc5606dSYehuda Sadeh    For usage instructions, please refer to:
25602adf40SYehuda Sadeh 
26dfc5606dSYehuda Sadeh                  Documentation/ABI/testing/sysfs-bus-rbd
27602adf40SYehuda Sadeh 
28602adf40SYehuda Sadeh  */
29602adf40SYehuda Sadeh 
30602adf40SYehuda Sadeh #include <linux/ceph/libceph.h>
31602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h>
32602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h>
33602adf40SYehuda Sadeh #include <linux/ceph/decode.h>
3459c2be1eSYehuda Sadeh #include <linux/parser.h>
35602adf40SYehuda Sadeh 
36602adf40SYehuda Sadeh #include <linux/kernel.h>
37602adf40SYehuda Sadeh #include <linux/device.h>
38602adf40SYehuda Sadeh #include <linux/module.h>
39602adf40SYehuda Sadeh #include <linux/fs.h>
40602adf40SYehuda Sadeh #include <linux/blkdev.h>
41602adf40SYehuda Sadeh 
42602adf40SYehuda Sadeh #include "rbd_types.h"
43602adf40SYehuda Sadeh 
#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

/* NAME_MAX less the prefix length (the prefix's NUL is not counted) */
#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

/*
 * Feature bits.
 *
 * These are tested against 64-bit feature masks, so they are defined
 * as unsigned long long; that keeps the masks (and their complement)
 * 64 bits wide no matter which bit positions are in use.
 */

#define RBD_FEATURE_LAYERING	(1ULL << 0)
#define RBD_FEATURE_STRIPINGV2	(1ULL << 1)
#define RBD_FEATURES_ALL \
	    (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)
93602adf40SYehuda Sadeh 
94602adf40SYehuda Sadeh /*
95602adf40SYehuda Sadeh  * block device image metadata (in-memory version)
96602adf40SYehuda Sadeh  */
97602adf40SYehuda Sadeh struct rbd_image_header {
98f84344f3SAlex Elder 	/* These four fields never change for a given rbd image */
99849b4260SAlex Elder 	char *object_prefix;
10034b13184SAlex Elder 	u64 features;
101602adf40SYehuda Sadeh 	__u8 obj_order;
102602adf40SYehuda Sadeh 	__u8 crypt_type;
103602adf40SYehuda Sadeh 	__u8 comp_type;
104602adf40SYehuda Sadeh 
105f84344f3SAlex Elder 	/* The remaining fields need to be updated occasionally */
106f84344f3SAlex Elder 	u64 image_size;
107f84344f3SAlex Elder 	struct ceph_snap_context *snapc;
108602adf40SYehuda Sadeh 	char *snap_names;
109602adf40SYehuda Sadeh 	u64 *snap_sizes;
11059c2be1eSYehuda Sadeh 
11159c2be1eSYehuda Sadeh 	u64 obj_version;
11259c2be1eSYehuda Sadeh };
11359c2be1eSYehuda Sadeh 
1140d7dbfceSAlex Elder /*
1150d7dbfceSAlex Elder  * An rbd image specification.
1160d7dbfceSAlex Elder  *
1170d7dbfceSAlex Elder  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
118c66c6e0cSAlex Elder  * identify an image.  Each rbd_dev structure includes a pointer to
119c66c6e0cSAlex Elder  * an rbd_spec structure that encapsulates this identity.
120c66c6e0cSAlex Elder  *
121c66c6e0cSAlex Elder  * Each of the id's in an rbd_spec has an associated name.  For a
122c66c6e0cSAlex Elder  * user-mapped image, the names are supplied and the id's associated
123c66c6e0cSAlex Elder  * with them are looked up.  For a layered image, a parent image is
124c66c6e0cSAlex Elder  * defined by the tuple, and the names are looked up.
125c66c6e0cSAlex Elder  *
126c66c6e0cSAlex Elder  * An rbd_dev structure contains a parent_spec pointer which is
127c66c6e0cSAlex Elder  * non-null if the image it represents is a child in a layered
128c66c6e0cSAlex Elder  * image.  This pointer will refer to the rbd_spec structure used
129c66c6e0cSAlex Elder  * by the parent rbd_dev for its own identity (i.e., the structure
130c66c6e0cSAlex Elder  * is shared between the parent and child).
131c66c6e0cSAlex Elder  *
132c66c6e0cSAlex Elder  * Since these structures are populated once, during the discovery
133c66c6e0cSAlex Elder  * phase of image construction, they are effectively immutable so
134c66c6e0cSAlex Elder  * we make no effort to synchronize access to them.
135c66c6e0cSAlex Elder  *
136c66c6e0cSAlex Elder  * Note that code herein does not assume the image name is known (it
137c66c6e0cSAlex Elder  * could be a null pointer).
1380d7dbfceSAlex Elder  */
1390d7dbfceSAlex Elder struct rbd_spec {
1400d7dbfceSAlex Elder 	u64		pool_id;
1410d7dbfceSAlex Elder 	char		*pool_name;
1420d7dbfceSAlex Elder 
1430d7dbfceSAlex Elder 	char		*image_id;
1440d7dbfceSAlex Elder 	char		*image_name;
1450d7dbfceSAlex Elder 
1460d7dbfceSAlex Elder 	u64		snap_id;
1470d7dbfceSAlex Elder 	char		*snap_name;
1480d7dbfceSAlex Elder 
1490d7dbfceSAlex Elder 	struct kref	kref;
1500d7dbfceSAlex Elder };
1510d7dbfceSAlex Elder 
152602adf40SYehuda Sadeh /*
153f0f8cef5SAlex Elder  * an instance of the client.  multiple devices may share an rbd client.
154602adf40SYehuda Sadeh  */
155602adf40SYehuda Sadeh struct rbd_client {
156602adf40SYehuda Sadeh 	struct ceph_client	*client;
157602adf40SYehuda Sadeh 	struct kref		kref;
158602adf40SYehuda Sadeh 	struct list_head	node;
159602adf40SYehuda Sadeh };
160602adf40SYehuda Sadeh 
/* Completion callback types for image requests and object requests */
struct rbd_img_request;
struct rbd_obj_request;

typedef void (*rbd_img_callback_t)(struct rbd_img_request *);
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);

/* "which" value of an object request with no image request position */
#define	BAD_WHICH	U32_MAX
168bf0d5f50SAlex Elder 
/* Kind of data buffer attached to an object request */
enum obj_request_type {
	OBJ_REQUEST_NODATA,
	OBJ_REQUEST_BIO,
	OBJ_REQUEST_PAGES
};

/* Bit numbers used in rbd_obj_request->flags */
enum obj_req_flags {
	OBJ_REQ_DONE,		/* completion: 0 = not done, 1 = done */
	OBJ_REQ_IMG_DATA,	/* usage: 0 = standalone, 1 = image data */
	OBJ_REQ_KNOWN,		/* is EXISTS valid: 0 = no, 1 = yes */
	OBJ_REQ_EXISTS,		/* target object exists: 0 = no, 1 = yes */
};
179926f9b3fSAlex Elder 
180bf0d5f50SAlex Elder struct rbd_obj_request {
181bf0d5f50SAlex Elder 	const char		*object_name;
182bf0d5f50SAlex Elder 	u64			offset;		/* object start byte */
183bf0d5f50SAlex Elder 	u64			length;		/* bytes from offset */
184926f9b3fSAlex Elder 	unsigned long		flags;
185bf0d5f50SAlex Elder 
186c5b5ef6cSAlex Elder 	/*
187c5b5ef6cSAlex Elder 	 * An object request associated with an image will have its
188c5b5ef6cSAlex Elder 	 * img_data flag set; a standalone object request will not.
189c5b5ef6cSAlex Elder 	 *
190c5b5ef6cSAlex Elder 	 * A standalone object request will have which == BAD_WHICH
191c5b5ef6cSAlex Elder 	 * and a null obj_request pointer.
192c5b5ef6cSAlex Elder 	 *
193c5b5ef6cSAlex Elder 	 * An object request initiated in support of a layered image
194c5b5ef6cSAlex Elder 	 * object (to check for its existence before a write) will
195c5b5ef6cSAlex Elder 	 * have which == BAD_WHICH and a non-null obj_request pointer.
196c5b5ef6cSAlex Elder 	 *
197c5b5ef6cSAlex Elder 	 * Finally, an object request for rbd image data will have
198c5b5ef6cSAlex Elder 	 * which != BAD_WHICH, and will have a non-null img_request
199c5b5ef6cSAlex Elder 	 * pointer.  The value of which will be in the range
200c5b5ef6cSAlex Elder 	 * 0..(img_request->obj_request_count-1).
201c5b5ef6cSAlex Elder 	 */
202c5b5ef6cSAlex Elder 	union {
203c5b5ef6cSAlex Elder 		struct rbd_obj_request	*obj_request;	/* STAT op */
204c5b5ef6cSAlex Elder 		struct {
205bf0d5f50SAlex Elder 			struct rbd_img_request	*img_request;
206c5b5ef6cSAlex Elder 			u64			img_offset;
207c5b5ef6cSAlex Elder 			/* links for img_request->obj_requests list */
208c5b5ef6cSAlex Elder 			struct list_head	links;
209c5b5ef6cSAlex Elder 		};
210c5b5ef6cSAlex Elder 	};
211bf0d5f50SAlex Elder 	u32			which;		/* posn image request list */
212bf0d5f50SAlex Elder 
213bf0d5f50SAlex Elder 	enum obj_request_type	type;
214788e2df3SAlex Elder 	union {
215bf0d5f50SAlex Elder 		struct bio	*bio_list;
216788e2df3SAlex Elder 		struct {
217788e2df3SAlex Elder 			struct page	**pages;
218788e2df3SAlex Elder 			u32		page_count;
219788e2df3SAlex Elder 		};
220788e2df3SAlex Elder 	};
2210eefd470SAlex Elder 	struct page		**copyup_pages;
222bf0d5f50SAlex Elder 
223bf0d5f50SAlex Elder 	struct ceph_osd_request	*osd_req;
224bf0d5f50SAlex Elder 
225bf0d5f50SAlex Elder 	u64			xferred;	/* bytes transferred */
226bf0d5f50SAlex Elder 	u64			version;
2271b83bef2SSage Weil 	int			result;
228bf0d5f50SAlex Elder 
229bf0d5f50SAlex Elder 	rbd_obj_callback_t	callback;
230788e2df3SAlex Elder 	struct completion	completion;
231bf0d5f50SAlex Elder 
232bf0d5f50SAlex Elder 	struct kref		kref;
233bf0d5f50SAlex Elder };
234bf0d5f50SAlex Elder 
/* Bit numbers used in rbd_img_request->flags */
enum img_req_flags {
	IMG_REQ_WRITE,		/* direction: 0 = read, 1 = write */
	IMG_REQ_CHILD,		/* initiator: 0 = block, 1 = child image */
	IMG_REQ_LAYERED,	/* ENOENT handling: 0 = normal, 1 = layered */
};
2400c425248SAlex Elder 
241bf0d5f50SAlex Elder struct rbd_img_request {
242bf0d5f50SAlex Elder 	struct rbd_device	*rbd_dev;
243bf0d5f50SAlex Elder 	u64			offset;	/* starting image byte offset */
244bf0d5f50SAlex Elder 	u64			length;	/* byte count from offset */
2450c425248SAlex Elder 	unsigned long		flags;
246bf0d5f50SAlex Elder 	union {
247bf0d5f50SAlex Elder 		u64			snap_id;	/* for reads */
2489849e986SAlex Elder 		struct ceph_snap_context *snapc;	/* for writes */
2499849e986SAlex Elder 	};
2509849e986SAlex Elder 	union {
2519849e986SAlex Elder 		struct request		*rq;		/* block request */
2529849e986SAlex Elder 		struct rbd_obj_request	*obj_request;	/* obj req initiator */
253bf0d5f50SAlex Elder 	};
2543d7efd18SAlex Elder 	struct page		**copyup_pages;
255bf0d5f50SAlex Elder 	spinlock_t		completion_lock;/* protects next_completion */
256bf0d5f50SAlex Elder 	u32			next_completion;
257bf0d5f50SAlex Elder 	rbd_img_callback_t	callback;
25855f27e09SAlex Elder 	u64			xferred;/* aggregate bytes transferred */
259a5a337d4SAlex Elder 	int			result;	/* first nonzero obj_request result */
260bf0d5f50SAlex Elder 
261bf0d5f50SAlex Elder 	u32			obj_request_count;
262bf0d5f50SAlex Elder 	struct list_head	obj_requests;	/* rbd_obj_request structs */
263bf0d5f50SAlex Elder 
264bf0d5f50SAlex Elder 	struct kref		kref;
265bf0d5f50SAlex Elder };
266bf0d5f50SAlex Elder 
/*
 * Iterators over the object requests linked on an image request's
 * obj_requests list.  Note that the _safe variant walks the list in
 * reverse order.
 */
#define for_each_obj_request(ireq, oreq)				\
	list_for_each_entry(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq)				\
	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_safe(ireq, oreq, n)			\
	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
273bf0d5f50SAlex Elder 
274dfc5606dSYehuda Sadeh struct rbd_snap {
275dfc5606dSYehuda Sadeh 	const char		*name;
2763591538fSJosh Durgin 	u64			size;
277dfc5606dSYehuda Sadeh 	struct list_head	node;
278dfc5606dSYehuda Sadeh 	u64			id;
27934b13184SAlex Elder 	u64			features;
280dfc5606dSYehuda Sadeh };
281dfc5606dSYehuda Sadeh 
282f84344f3SAlex Elder struct rbd_mapping {
28399c1f08fSAlex Elder 	u64                     size;
28434b13184SAlex Elder 	u64                     features;
285f84344f3SAlex Elder 	bool			read_only;
286f84344f3SAlex Elder };
287f84344f3SAlex Elder 
288602adf40SYehuda Sadeh /*
289602adf40SYehuda Sadeh  * a single device
290602adf40SYehuda Sadeh  */
291602adf40SYehuda Sadeh struct rbd_device {
292de71a297SAlex Elder 	int			dev_id;		/* blkdev unique id */
293602adf40SYehuda Sadeh 
294602adf40SYehuda Sadeh 	int			major;		/* blkdev assigned major */
295602adf40SYehuda Sadeh 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
296602adf40SYehuda Sadeh 
297a30b71b9SAlex Elder 	u32			image_format;	/* Either 1 or 2 */
298602adf40SYehuda Sadeh 	struct rbd_client	*rbd_client;
299602adf40SYehuda Sadeh 
300602adf40SYehuda Sadeh 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
301602adf40SYehuda Sadeh 
302b82d167bSAlex Elder 	spinlock_t		lock;		/* queue, flags, open_count */
303602adf40SYehuda Sadeh 
304602adf40SYehuda Sadeh 	struct rbd_image_header	header;
305b82d167bSAlex Elder 	unsigned long		flags;		/* possibly lock protected */
3060d7dbfceSAlex Elder 	struct rbd_spec		*spec;
307602adf40SYehuda Sadeh 
3080d7dbfceSAlex Elder 	char			*header_name;
309971f839aSAlex Elder 
3100903e875SAlex Elder 	struct ceph_file_layout	layout;
3110903e875SAlex Elder 
31259c2be1eSYehuda Sadeh 	struct ceph_osd_event   *watch_event;
313975241afSAlex Elder 	struct rbd_obj_request	*watch_request;
31459c2be1eSYehuda Sadeh 
31586b00e0dSAlex Elder 	struct rbd_spec		*parent_spec;
31686b00e0dSAlex Elder 	u64			parent_overlap;
3172f82ee54SAlex Elder 	struct rbd_device	*parent;
31886b00e0dSAlex Elder 
319cc070d59SAlex Elder 	u64			stripe_unit;
320cc070d59SAlex Elder 	u64			stripe_count;
321cc070d59SAlex Elder 
322c666601aSJosh Durgin 	/* protects updating the header */
323c666601aSJosh Durgin 	struct rw_semaphore     header_rwsem;
324f84344f3SAlex Elder 
325f84344f3SAlex Elder 	struct rbd_mapping	mapping;
326602adf40SYehuda Sadeh 
327602adf40SYehuda Sadeh 	struct list_head	node;
328dfc5606dSYehuda Sadeh 
329dfc5606dSYehuda Sadeh 	/* list of snapshots */
330dfc5606dSYehuda Sadeh 	struct list_head	snaps;
331dfc5606dSYehuda Sadeh 
332dfc5606dSYehuda Sadeh 	/* sysfs related */
333dfc5606dSYehuda Sadeh 	struct device		dev;
334b82d167bSAlex Elder 	unsigned long		open_count;	/* protected by lock */
335dfc5606dSYehuda Sadeh };
336dfc5606dSYehuda Sadeh 
/*
 * Bit numbers for rbd_dev->flags.  Where an update has to be atomic,
 * rbd_dev->lock protects it.
 *
 * At present that only applies to the "removing" flag, which is tied
 * to the "open_count" field.
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* the mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* the mapping is in the middle of removal */
};
3486d292906SAlex Elder 
349602adf40SYehuda Sadeh static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */
350e124a82fSAlex Elder 
351602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list);    /* devices */
352e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock);
353e124a82fSAlex Elder 
354602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list);		/* clients */
355432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock);
356602adf40SYehuda Sadeh 
3573d7efd18SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request);
3583d7efd18SAlex Elder 
359304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
360304f6808SAlex Elder 
361dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev);
3626087b51bSAlex Elder static void rbd_snap_destroy(struct rbd_snap *snap);
363dfc5606dSYehuda Sadeh 
364f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf,
365f0f8cef5SAlex Elder 		       size_t count);
366f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
367f0f8cef5SAlex Elder 			  size_t count);
3682f82ee54SAlex Elder static int rbd_dev_probe(struct rbd_device *rbd_dev);
369f0f8cef5SAlex Elder 
370f0f8cef5SAlex Elder static struct bus_attribute rbd_bus_attrs[] = {
371f0f8cef5SAlex Elder 	__ATTR(add, S_IWUSR, NULL, rbd_add),
372f0f8cef5SAlex Elder 	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
373f0f8cef5SAlex Elder 	__ATTR_NULL
374f0f8cef5SAlex Elder };
375f0f8cef5SAlex Elder 
376f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = {
377f0f8cef5SAlex Elder 	.name		= "rbd",
378f0f8cef5SAlex Elder 	.bus_attrs	= rbd_bus_attrs,
379f0f8cef5SAlex Elder };
380f0f8cef5SAlex Elder 
/* Release callback for rbd_root_dev; deliberately does nothing. */
static void rbd_root_dev_release(struct device *dev)
{
	/* intentionally empty */
}
384f0f8cef5SAlex Elder 
385f0f8cef5SAlex Elder static struct device rbd_root_dev = {
386f0f8cef5SAlex Elder 	.init_name =    "rbd",
387f0f8cef5SAlex Elder 	.release =      rbd_root_dev_release,
388f0f8cef5SAlex Elder };
389f0f8cef5SAlex Elder 
39006ecc6cbSAlex Elder static __printf(2, 3)
39106ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
39206ecc6cbSAlex Elder {
39306ecc6cbSAlex Elder 	struct va_format vaf;
39406ecc6cbSAlex Elder 	va_list args;
39506ecc6cbSAlex Elder 
39606ecc6cbSAlex Elder 	va_start(args, fmt);
39706ecc6cbSAlex Elder 	vaf.fmt = fmt;
39806ecc6cbSAlex Elder 	vaf.va = &args;
39906ecc6cbSAlex Elder 
40006ecc6cbSAlex Elder 	if (!rbd_dev)
40106ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
40206ecc6cbSAlex Elder 	else if (rbd_dev->disk)
40306ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: %s: %pV\n",
40406ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
40506ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_name)
40606ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: image %s: %pV\n",
40706ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
40806ecc6cbSAlex Elder 	else if (rbd_dev->spec && rbd_dev->spec->image_id)
40906ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: id %s: %pV\n",
41006ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
41106ecc6cbSAlex Elder 	else	/* punt */
41206ecc6cbSAlex Elder 		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
41306ecc6cbSAlex Elder 			RBD_DRV_NAME, rbd_dev, &vaf);
41406ecc6cbSAlex Elder 	va_end(args);
41506ecc6cbSAlex Elder }
41606ecc6cbSAlex Elder 
/*
 * rbd_assert(expr) - driver-local assertion.
 *
 * With RBD_DEBUG defined, a false @expr logs the enclosing function,
 * the line number and the expression text at KERN_ERR level and then
 * calls BUG().  Without RBD_DEBUG it expands to a no-op and @expr is
 * not evaluated.
 *
 * The debug form is wrapped in do { } while (0) so that the macro is
 * a single statement and can be used safely as the body of an
 * if/else without capturing the else.
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
429dfc5606dSYehuda Sadeh 
4308b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
431b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
4328b3e1a56SAlex Elder 
433117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
434117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
43559c2be1eSYehuda Sadeh 
436602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode)
437602adf40SYehuda Sadeh {
438f0f8cef5SAlex Elder 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
439b82d167bSAlex Elder 	bool removing = false;
440602adf40SYehuda Sadeh 
441f84344f3SAlex Elder 	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
442602adf40SYehuda Sadeh 		return -EROFS;
443602adf40SYehuda Sadeh 
444a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
445b82d167bSAlex Elder 	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
446b82d167bSAlex Elder 		removing = true;
447b82d167bSAlex Elder 	else
448b82d167bSAlex Elder 		rbd_dev->open_count++;
449a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
450b82d167bSAlex Elder 	if (removing)
451b82d167bSAlex Elder 		return -ENOENT;
452b82d167bSAlex Elder 
45342382b70SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
454c3e946ceSAlex Elder 	(void) get_device(&rbd_dev->dev);
455f84344f3SAlex Elder 	set_device_ro(bdev, rbd_dev->mapping.read_only);
45642382b70SAlex Elder 	mutex_unlock(&ctl_mutex);
457340c7a2bSAlex Elder 
458602adf40SYehuda Sadeh 	return 0;
459602adf40SYehuda Sadeh }
460602adf40SYehuda Sadeh 
461dfc5606dSYehuda Sadeh static int rbd_release(struct gendisk *disk, fmode_t mode)
462dfc5606dSYehuda Sadeh {
463dfc5606dSYehuda Sadeh 	struct rbd_device *rbd_dev = disk->private_data;
464b82d167bSAlex Elder 	unsigned long open_count_before;
465b82d167bSAlex Elder 
466a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
467b82d167bSAlex Elder 	open_count_before = rbd_dev->open_count--;
468a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
469b82d167bSAlex Elder 	rbd_assert(open_count_before > 0);
470dfc5606dSYehuda Sadeh 
47142382b70SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
472c3e946ceSAlex Elder 	put_device(&rbd_dev->dev);
47342382b70SAlex Elder 	mutex_unlock(&ctl_mutex);
474dfc5606dSYehuda Sadeh 
475dfc5606dSYehuda Sadeh 	return 0;
476dfc5606dSYehuda Sadeh }
477dfc5606dSYehuda Sadeh 
478602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = {
479602adf40SYehuda Sadeh 	.owner			= THIS_MODULE,
480602adf40SYehuda Sadeh 	.open			= rbd_open,
481dfc5606dSYehuda Sadeh 	.release		= rbd_release,
482602adf40SYehuda Sadeh };
483602adf40SYehuda Sadeh 
484602adf40SYehuda Sadeh /*
485602adf40SYehuda Sadeh  * Initialize an rbd client instance.
48643ae4701SAlex Elder  * We own *ceph_opts.
487602adf40SYehuda Sadeh  */
488f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
489602adf40SYehuda Sadeh {
490602adf40SYehuda Sadeh 	struct rbd_client *rbdc;
491602adf40SYehuda Sadeh 	int ret = -ENOMEM;
492602adf40SYehuda Sadeh 
49337206ee5SAlex Elder 	dout("%s:\n", __func__);
494602adf40SYehuda Sadeh 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
495602adf40SYehuda Sadeh 	if (!rbdc)
496602adf40SYehuda Sadeh 		goto out_opt;
497602adf40SYehuda Sadeh 
498602adf40SYehuda Sadeh 	kref_init(&rbdc->kref);
499602adf40SYehuda Sadeh 	INIT_LIST_HEAD(&rbdc->node);
500602adf40SYehuda Sadeh 
501bc534d86SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
502bc534d86SAlex Elder 
50343ae4701SAlex Elder 	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
504602adf40SYehuda Sadeh 	if (IS_ERR(rbdc->client))
505bc534d86SAlex Elder 		goto out_mutex;
50643ae4701SAlex Elder 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
507602adf40SYehuda Sadeh 
508602adf40SYehuda Sadeh 	ret = ceph_open_session(rbdc->client);
509602adf40SYehuda Sadeh 	if (ret < 0)
510602adf40SYehuda Sadeh 		goto out_err;
511602adf40SYehuda Sadeh 
512432b8587SAlex Elder 	spin_lock(&rbd_client_list_lock);
513602adf40SYehuda Sadeh 	list_add_tail(&rbdc->node, &rbd_client_list);
514432b8587SAlex Elder 	spin_unlock(&rbd_client_list_lock);
515602adf40SYehuda Sadeh 
516bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
51737206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
518bc534d86SAlex Elder 
519602adf40SYehuda Sadeh 	return rbdc;
520602adf40SYehuda Sadeh 
521602adf40SYehuda Sadeh out_err:
522602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
523bc534d86SAlex Elder out_mutex:
524bc534d86SAlex Elder 	mutex_unlock(&ctl_mutex);
525602adf40SYehuda Sadeh 	kfree(rbdc);
526602adf40SYehuda Sadeh out_opt:
52743ae4701SAlex Elder 	if (ceph_opts)
52843ae4701SAlex Elder 		ceph_destroy_options(ceph_opts);
52937206ee5SAlex Elder 	dout("%s: error %d\n", __func__, ret);
53037206ee5SAlex Elder 
53128f259b7SVasiliy Kulikov 	return ERR_PTR(ret);
532602adf40SYehuda Sadeh }
533602adf40SYehuda Sadeh 
5342f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
5352f82ee54SAlex Elder {
5362f82ee54SAlex Elder 	kref_get(&rbdc->kref);
5372f82ee54SAlex Elder 
5382f82ee54SAlex Elder 	return rbdc;
5392f82ee54SAlex Elder }
5402f82ee54SAlex Elder 
541602adf40SYehuda Sadeh /*
5421f7ba331SAlex Elder  * Find a ceph client with specific addr and configuration.  If
5431f7ba331SAlex Elder  * found, bump its reference count.
544602adf40SYehuda Sadeh  */
5451f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
546602adf40SYehuda Sadeh {
547602adf40SYehuda Sadeh 	struct rbd_client *client_node;
5481f7ba331SAlex Elder 	bool found = false;
549602adf40SYehuda Sadeh 
55043ae4701SAlex Elder 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
551602adf40SYehuda Sadeh 		return NULL;
552602adf40SYehuda Sadeh 
5531f7ba331SAlex Elder 	spin_lock(&rbd_client_list_lock);
5541f7ba331SAlex Elder 	list_for_each_entry(client_node, &rbd_client_list, node) {
5551f7ba331SAlex Elder 		if (!ceph_compare_options(ceph_opts, client_node->client)) {
5562f82ee54SAlex Elder 			__rbd_get_client(client_node);
5572f82ee54SAlex Elder 
5581f7ba331SAlex Elder 			found = true;
5591f7ba331SAlex Elder 			break;
5601f7ba331SAlex Elder 		}
5611f7ba331SAlex Elder 	}
5621f7ba331SAlex Elder 	spin_unlock(&rbd_client_list_lock);
5631f7ba331SAlex Elder 
5641f7ba331SAlex Elder 	return found ? client_node : NULL;
565602adf40SYehuda Sadeh }
566602adf40SYehuda Sadeh 
567602adf40SYehuda Sadeh /*
56859c2be1eSYehuda Sadeh  * mount options
56959c2be1eSYehuda Sadeh  */
57059c2be1eSYehuda Sadeh enum {
57159c2be1eSYehuda Sadeh 	Opt_last_int,
57259c2be1eSYehuda Sadeh 	/* int args above */
57359c2be1eSYehuda Sadeh 	Opt_last_string,
57459c2be1eSYehuda Sadeh 	/* string args above */
575cc0538b6SAlex Elder 	Opt_read_only,
576cc0538b6SAlex Elder 	Opt_read_write,
577cc0538b6SAlex Elder 	/* Boolean args above */
578cc0538b6SAlex Elder 	Opt_last_bool,
57959c2be1eSYehuda Sadeh };
58059c2be1eSYehuda Sadeh 
58143ae4701SAlex Elder static match_table_t rbd_opts_tokens = {
58259c2be1eSYehuda Sadeh 	/* int args above */
58359c2be1eSYehuda Sadeh 	/* string args above */
584be466c1cSAlex Elder 	{Opt_read_only, "read_only"},
585cc0538b6SAlex Elder 	{Opt_read_only, "ro"},		/* Alternate spelling */
586cc0538b6SAlex Elder 	{Opt_read_write, "read_write"},
587cc0538b6SAlex Elder 	{Opt_read_write, "rw"},		/* Alternate spelling */
588cc0538b6SAlex Elder 	/* Boolean args above */
58959c2be1eSYehuda Sadeh 	{-1, NULL}
59059c2be1eSYehuda Sadeh };
59159c2be1eSYehuda Sadeh 
59298571b5aSAlex Elder struct rbd_options {
59398571b5aSAlex Elder 	bool	read_only;
59498571b5aSAlex Elder };
59598571b5aSAlex Elder 
59698571b5aSAlex Elder #define RBD_READ_ONLY_DEFAULT	false
59798571b5aSAlex Elder 
59859c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private)
59959c2be1eSYehuda Sadeh {
60043ae4701SAlex Elder 	struct rbd_options *rbd_opts = private;
60159c2be1eSYehuda Sadeh 	substring_t argstr[MAX_OPT_ARGS];
60259c2be1eSYehuda Sadeh 	int token, intval, ret;
60359c2be1eSYehuda Sadeh 
60443ae4701SAlex Elder 	token = match_token(c, rbd_opts_tokens, argstr);
60559c2be1eSYehuda Sadeh 	if (token < 0)
60659c2be1eSYehuda Sadeh 		return -EINVAL;
60759c2be1eSYehuda Sadeh 
60859c2be1eSYehuda Sadeh 	if (token < Opt_last_int) {
60959c2be1eSYehuda Sadeh 		ret = match_int(&argstr[0], &intval);
61059c2be1eSYehuda Sadeh 		if (ret < 0) {
61159c2be1eSYehuda Sadeh 			pr_err("bad mount option arg (not int) "
61259c2be1eSYehuda Sadeh 			       "at '%s'\n", c);
61359c2be1eSYehuda Sadeh 			return ret;
61459c2be1eSYehuda Sadeh 		}
61559c2be1eSYehuda Sadeh 		dout("got int token %d val %d\n", token, intval);
61659c2be1eSYehuda Sadeh 	} else if (token > Opt_last_int && token < Opt_last_string) {
61759c2be1eSYehuda Sadeh 		dout("got string token %d val %s\n", token,
61859c2be1eSYehuda Sadeh 		     argstr[0].from);
619cc0538b6SAlex Elder 	} else if (token > Opt_last_string && token < Opt_last_bool) {
620cc0538b6SAlex Elder 		dout("got Boolean token %d\n", token);
62159c2be1eSYehuda Sadeh 	} else {
62259c2be1eSYehuda Sadeh 		dout("got token %d\n", token);
62359c2be1eSYehuda Sadeh 	}
62459c2be1eSYehuda Sadeh 
62559c2be1eSYehuda Sadeh 	switch (token) {
626cc0538b6SAlex Elder 	case Opt_read_only:
627cc0538b6SAlex Elder 		rbd_opts->read_only = true;
628cc0538b6SAlex Elder 		break;
629cc0538b6SAlex Elder 	case Opt_read_write:
630cc0538b6SAlex Elder 		rbd_opts->read_only = false;
631cc0538b6SAlex Elder 		break;
63259c2be1eSYehuda Sadeh 	default:
633aafb230eSAlex Elder 		rbd_assert(false);
634aafb230eSAlex Elder 		break;
63559c2be1eSYehuda Sadeh 	}
63659c2be1eSYehuda Sadeh 	return 0;
63759c2be1eSYehuda Sadeh }
63859c2be1eSYehuda Sadeh 
/*
 * Return a client matching @ceph_opts, creating one if none exists
 * yet.  Either way @ceph_opts is consumed: it is destroyed here when
 * an existing client is reused, and handed to rbd_client_create()
 * otherwise.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc = rbd_client_find(ceph_opts);

	if (!rbdc)
		return rbd_client_create(ceph_opts);

	/* Reusing an existing client, so these options are not needed */
	ceph_destroy_options(ceph_opts);

	return rbdc;
}
655602adf40SYehuda Sadeh 
656602adf40SYehuda Sadeh /*
657602adf40SYehuda Sadeh  * Destroy ceph client
658d23a4b3fSAlex Elder  *
659432b8587SAlex Elder  * Caller must hold rbd_client_list_lock.
660602adf40SYehuda Sadeh  */
661602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
662602adf40SYehuda Sadeh {
663602adf40SYehuda Sadeh 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
664602adf40SYehuda Sadeh 
66537206ee5SAlex Elder 	dout("%s: rbdc %p\n", __func__, rbdc);
666cd9d9f5dSAlex Elder 	spin_lock(&rbd_client_list_lock);
667602adf40SYehuda Sadeh 	list_del(&rbdc->node);
668cd9d9f5dSAlex Elder 	spin_unlock(&rbd_client_list_lock);
669602adf40SYehuda Sadeh 
670602adf40SYehuda Sadeh 	ceph_destroy_client(rbdc->client);
671602adf40SYehuda Sadeh 	kfree(rbdc);
672602adf40SYehuda Sadeh }
673602adf40SYehuda Sadeh 
674602adf40SYehuda Sadeh /*
675602adf40SYehuda Sadeh  * Drop reference to ceph client node. If it's not referenced anymore, release
676602adf40SYehuda Sadeh  * it.
677602adf40SYehuda Sadeh  */
6789d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc)
679602adf40SYehuda Sadeh {
680c53d5893SAlex Elder 	if (rbdc)
6819d3997fdSAlex Elder 		kref_put(&rbdc->kref, rbd_client_release);
682602adf40SYehuda Sadeh }
683602adf40SYehuda Sadeh 
684a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format)
685a30b71b9SAlex Elder {
686a30b71b9SAlex Elder 	return image_format == 1 || image_format == 2;
687a30b71b9SAlex Elder }
688a30b71b9SAlex Elder 
6898e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
6908e94af8eSAlex Elder {
691103a150fSAlex Elder 	size_t size;
692103a150fSAlex Elder 	u32 snap_count;
693103a150fSAlex Elder 
694103a150fSAlex Elder 	/* The header has to start with the magic rbd header text */
695103a150fSAlex Elder 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
696103a150fSAlex Elder 		return false;
697103a150fSAlex Elder 
698db2388b6SAlex Elder 	/* The bio layer requires at least sector-sized I/O */
699db2388b6SAlex Elder 
700db2388b6SAlex Elder 	if (ondisk->options.order < SECTOR_SHIFT)
701db2388b6SAlex Elder 		return false;
702db2388b6SAlex Elder 
703db2388b6SAlex Elder 	/* If we use u64 in a few spots we may be able to loosen this */
704db2388b6SAlex Elder 
705db2388b6SAlex Elder 	if (ondisk->options.order > 8 * sizeof (int) - 1)
706db2388b6SAlex Elder 		return false;
707db2388b6SAlex Elder 
708103a150fSAlex Elder 	/*
709103a150fSAlex Elder 	 * The size of a snapshot header has to fit in a size_t, and
710103a150fSAlex Elder 	 * that limits the number of snapshots.
711103a150fSAlex Elder 	 */
712103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
713103a150fSAlex Elder 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
714103a150fSAlex Elder 	if (snap_count > size / sizeof (__le64))
715103a150fSAlex Elder 		return false;
716103a150fSAlex Elder 
717103a150fSAlex Elder 	/*
718103a150fSAlex Elder 	 * Not only that, but the size of the entire the snapshot
719103a150fSAlex Elder 	 * header must also be representable in a size_t.
720103a150fSAlex Elder 	 */
721103a150fSAlex Elder 	size -= snap_count * sizeof (__le64);
722103a150fSAlex Elder 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
723103a150fSAlex Elder 		return false;
724103a150fSAlex Elder 
725103a150fSAlex Elder 	return true;
7268e94af8eSAlex Elder }
7278e94af8eSAlex Elder 
728602adf40SYehuda Sadeh /*
729602adf40SYehuda Sadeh  * Create a new header structure, translate header format from the on-disk
730602adf40SYehuda Sadeh  * header.
731602adf40SYehuda Sadeh  */
732602adf40SYehuda Sadeh static int rbd_header_from_disk(struct rbd_image_header *header,
7334156d998SAlex Elder 				 struct rbd_image_header_ondisk *ondisk)
734602adf40SYehuda Sadeh {
735ccece235SAlex Elder 	u32 snap_count;
73658c17b0eSAlex Elder 	size_t len;
737d2bb24e5SAlex Elder 	size_t size;
738621901d6SAlex Elder 	u32 i;
739602adf40SYehuda Sadeh 
7406a52325fSAlex Elder 	memset(header, 0, sizeof (*header));
7416a52325fSAlex Elder 
742103a150fSAlex Elder 	snap_count = le32_to_cpu(ondisk->snap_count);
743103a150fSAlex Elder 
74458c17b0eSAlex Elder 	len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
74558c17b0eSAlex Elder 	header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
7466a52325fSAlex Elder 	if (!header->object_prefix)
747602adf40SYehuda Sadeh 		return -ENOMEM;
74858c17b0eSAlex Elder 	memcpy(header->object_prefix, ondisk->object_prefix, len);
74958c17b0eSAlex Elder 	header->object_prefix[len] = '\0';
75000f1f36fSAlex Elder 
751602adf40SYehuda Sadeh 	if (snap_count) {
752f785cc1dSAlex Elder 		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
753f785cc1dSAlex Elder 
754621901d6SAlex Elder 		/* Save a copy of the snapshot names */
755621901d6SAlex Elder 
756f785cc1dSAlex Elder 		if (snap_names_len > (u64) SIZE_MAX)
757f785cc1dSAlex Elder 			return -EIO;
758f785cc1dSAlex Elder 		header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
759602adf40SYehuda Sadeh 		if (!header->snap_names)
7606a52325fSAlex Elder 			goto out_err;
761f785cc1dSAlex Elder 		/*
762f785cc1dSAlex Elder 		 * Note that rbd_dev_v1_header_read() guarantees
763f785cc1dSAlex Elder 		 * the ondisk buffer we're working with has
764f785cc1dSAlex Elder 		 * snap_names_len bytes beyond the end of the
765f785cc1dSAlex Elder 		 * snapshot id array, this memcpy() is safe.
766f785cc1dSAlex Elder 		 */
767f785cc1dSAlex Elder 		memcpy(header->snap_names, &ondisk->snaps[snap_count],
768f785cc1dSAlex Elder 			snap_names_len);
7696a52325fSAlex Elder 
770621901d6SAlex Elder 		/* Record each snapshot's size */
771621901d6SAlex Elder 
772d2bb24e5SAlex Elder 		size = snap_count * sizeof (*header->snap_sizes);
773d2bb24e5SAlex Elder 		header->snap_sizes = kmalloc(size, GFP_KERNEL);
774602adf40SYehuda Sadeh 		if (!header->snap_sizes)
7756a52325fSAlex Elder 			goto out_err;
776621901d6SAlex Elder 		for (i = 0; i < snap_count; i++)
777621901d6SAlex Elder 			header->snap_sizes[i] =
778621901d6SAlex Elder 				le64_to_cpu(ondisk->snaps[i].image_size);
779602adf40SYehuda Sadeh 	} else {
780ccece235SAlex Elder 		WARN_ON(ondisk->snap_names_len);
781602adf40SYehuda Sadeh 		header->snap_names = NULL;
782602adf40SYehuda Sadeh 		header->snap_sizes = NULL;
783602adf40SYehuda Sadeh 	}
784849b4260SAlex Elder 
78534b13184SAlex Elder 	header->features = 0;	/* No features support in v1 images */
786602adf40SYehuda Sadeh 	header->obj_order = ondisk->options.order;
787602adf40SYehuda Sadeh 	header->crypt_type = ondisk->options.crypt_type;
788602adf40SYehuda Sadeh 	header->comp_type = ondisk->options.comp_type;
7896a52325fSAlex Elder 
790621901d6SAlex Elder 	/* Allocate and fill in the snapshot context */
791621901d6SAlex Elder 
792f84344f3SAlex Elder 	header->image_size = le64_to_cpu(ondisk->image_size);
7936a52325fSAlex Elder 	size = sizeof (struct ceph_snap_context);
7946a52325fSAlex Elder 	size += snap_count * sizeof (header->snapc->snaps[0]);
7956a52325fSAlex Elder 	header->snapc = kzalloc(size, GFP_KERNEL);
7966a52325fSAlex Elder 	if (!header->snapc)
7976a52325fSAlex Elder 		goto out_err;
798602adf40SYehuda Sadeh 
799602adf40SYehuda Sadeh 	atomic_set(&header->snapc->nref, 1);
800505cbb9bSAlex Elder 	header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
801602adf40SYehuda Sadeh 	header->snapc->num_snaps = snap_count;
802621901d6SAlex Elder 	for (i = 0; i < snap_count; i++)
803602adf40SYehuda Sadeh 		header->snapc->snaps[i] =
804602adf40SYehuda Sadeh 			le64_to_cpu(ondisk->snaps[i].id);
805602adf40SYehuda Sadeh 
806602adf40SYehuda Sadeh 	return 0;
807602adf40SYehuda Sadeh 
8086a52325fSAlex Elder out_err:
809849b4260SAlex Elder 	kfree(header->snap_sizes);
810ccece235SAlex Elder 	header->snap_sizes = NULL;
811602adf40SYehuda Sadeh 	kfree(header->snap_names);
812ccece235SAlex Elder 	header->snap_names = NULL;
8136a52325fSAlex Elder 	kfree(header->object_prefix);
8146a52325fSAlex Elder 	header->object_prefix = NULL;
815ccece235SAlex Elder 
81600f1f36fSAlex Elder 	return -ENOMEM;
817602adf40SYehuda Sadeh }
818602adf40SYehuda Sadeh 
8199e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
8209e15b77dSAlex Elder {
8219e15b77dSAlex Elder 	struct rbd_snap *snap;
8229e15b77dSAlex Elder 
8239e15b77dSAlex Elder 	if (snap_id == CEPH_NOSNAP)
8249e15b77dSAlex Elder 		return RBD_SNAP_HEAD_NAME;
8259e15b77dSAlex Elder 
8269e15b77dSAlex Elder 	list_for_each_entry(snap, &rbd_dev->snaps, node)
8279e15b77dSAlex Elder 		if (snap_id == snap->id)
8289e15b77dSAlex Elder 			return snap->name;
8299e15b77dSAlex Elder 
8309e15b77dSAlex Elder 	return NULL;
8319e15b77dSAlex Elder }
8329e15b77dSAlex Elder 
8338836b995SAlex Elder static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
834602adf40SYehuda Sadeh {
835602adf40SYehuda Sadeh 
836e86924a8SAlex Elder 	struct rbd_snap *snap;
83700f1f36fSAlex Elder 
838e86924a8SAlex Elder 	list_for_each_entry(snap, &rbd_dev->snaps, node) {
839e86924a8SAlex Elder 		if (!strcmp(snap_name, snap->name)) {
8400d7dbfceSAlex Elder 			rbd_dev->spec->snap_id = snap->id;
841e86924a8SAlex Elder 			rbd_dev->mapping.size = snap->size;
84234b13184SAlex Elder 			rbd_dev->mapping.features = snap->features;
84300f1f36fSAlex Elder 
844e86924a8SAlex Elder 			return 0;
845602adf40SYehuda Sadeh 		}
84600f1f36fSAlex Elder 	}
847e86924a8SAlex Elder 
84800f1f36fSAlex Elder 	return -ENOENT;
84900f1f36fSAlex Elder }
850602adf40SYehuda Sadeh 
851819d52bfSAlex Elder static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
852602adf40SYehuda Sadeh {
85378dc447dSAlex Elder 	int ret;
854602adf40SYehuda Sadeh 
8550d7dbfceSAlex Elder 	if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
856cc9d734cSJosh Durgin 		    sizeof (RBD_SNAP_HEAD_NAME))) {
8570d7dbfceSAlex Elder 		rbd_dev->spec->snap_id = CEPH_NOSNAP;
85899c1f08fSAlex Elder 		rbd_dev->mapping.size = rbd_dev->header.image_size;
85934b13184SAlex Elder 		rbd_dev->mapping.features = rbd_dev->header.features;
860e86924a8SAlex Elder 		ret = 0;
861602adf40SYehuda Sadeh 	} else {
8620d7dbfceSAlex Elder 		ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
863602adf40SYehuda Sadeh 		if (ret < 0)
864602adf40SYehuda Sadeh 			goto done;
865f84344f3SAlex Elder 		rbd_dev->mapping.read_only = true;
866602adf40SYehuda Sadeh 	}
8676d292906SAlex Elder 	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
8686d292906SAlex Elder 
869602adf40SYehuda Sadeh done:
870602adf40SYehuda Sadeh 	return ret;
871602adf40SYehuda Sadeh }
872602adf40SYehuda Sadeh 
873602adf40SYehuda Sadeh static void rbd_header_free(struct rbd_image_header *header)
874602adf40SYehuda Sadeh {
875849b4260SAlex Elder 	kfree(header->object_prefix);
876d78fd7aeSAlex Elder 	header->object_prefix = NULL;
877602adf40SYehuda Sadeh 	kfree(header->snap_sizes);
878d78fd7aeSAlex Elder 	header->snap_sizes = NULL;
879849b4260SAlex Elder 	kfree(header->snap_names);
880d78fd7aeSAlex Elder 	header->snap_names = NULL;
881d1d25646SJosh Durgin 	ceph_put_snap_context(header->snapc);
882d78fd7aeSAlex Elder 	header->snapc = NULL;
883602adf40SYehuda Sadeh }
884602adf40SYehuda Sadeh 
88598571b5aSAlex Elder static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
886602adf40SYehuda Sadeh {
88765ccfe21SAlex Elder 	char *name;
88865ccfe21SAlex Elder 	u64 segment;
88965ccfe21SAlex Elder 	int ret;
890602adf40SYehuda Sadeh 
8912fd82b9eSAlex Elder 	name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
89265ccfe21SAlex Elder 	if (!name)
89365ccfe21SAlex Elder 		return NULL;
89465ccfe21SAlex Elder 	segment = offset >> rbd_dev->header.obj_order;
8952fd82b9eSAlex Elder 	ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
89665ccfe21SAlex Elder 			rbd_dev->header.object_prefix, segment);
8972fd82b9eSAlex Elder 	if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
89865ccfe21SAlex Elder 		pr_err("error formatting segment name for #%llu (%d)\n",
89965ccfe21SAlex Elder 			segment, ret);
90065ccfe21SAlex Elder 		kfree(name);
90165ccfe21SAlex Elder 		name = NULL;
90265ccfe21SAlex Elder 	}
903602adf40SYehuda Sadeh 
90465ccfe21SAlex Elder 	return name;
90565ccfe21SAlex Elder }
906602adf40SYehuda Sadeh 
90765ccfe21SAlex Elder static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
90865ccfe21SAlex Elder {
90965ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
910602adf40SYehuda Sadeh 
91165ccfe21SAlex Elder 	return offset & (segment_size - 1);
91265ccfe21SAlex Elder }
91365ccfe21SAlex Elder 
91465ccfe21SAlex Elder static u64 rbd_segment_length(struct rbd_device *rbd_dev,
91565ccfe21SAlex Elder 				u64 offset, u64 length)
91665ccfe21SAlex Elder {
91765ccfe21SAlex Elder 	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
91865ccfe21SAlex Elder 
91965ccfe21SAlex Elder 	offset &= segment_size - 1;
92065ccfe21SAlex Elder 
921aafb230eSAlex Elder 	rbd_assert(length <= U64_MAX - offset);
92265ccfe21SAlex Elder 	if (offset + length > segment_size)
92365ccfe21SAlex Elder 		length = segment_size - offset;
92465ccfe21SAlex Elder 
92565ccfe21SAlex Elder 	return length;
926602adf40SYehuda Sadeh }
927602adf40SYehuda Sadeh 
928602adf40SYehuda Sadeh /*
929029bcbd8SJosh Durgin  * returns the size of an object in the image
930029bcbd8SJosh Durgin  */
931029bcbd8SJosh Durgin static u64 rbd_obj_bytes(struct rbd_image_header *header)
932029bcbd8SJosh Durgin {
933029bcbd8SJosh Durgin 	return 1 << header->obj_order;
934029bcbd8SJosh Durgin }
935029bcbd8SJosh Durgin 
936029bcbd8SJosh Durgin /*
937602adf40SYehuda Sadeh  * bio helpers
938602adf40SYehuda Sadeh  */
939602adf40SYehuda Sadeh 
940602adf40SYehuda Sadeh static void bio_chain_put(struct bio *chain)
941602adf40SYehuda Sadeh {
942602adf40SYehuda Sadeh 	struct bio *tmp;
943602adf40SYehuda Sadeh 
944602adf40SYehuda Sadeh 	while (chain) {
945602adf40SYehuda Sadeh 		tmp = chain;
946602adf40SYehuda Sadeh 		chain = chain->bi_next;
947602adf40SYehuda Sadeh 		bio_put(tmp);
948602adf40SYehuda Sadeh 	}
949602adf40SYehuda Sadeh }
950602adf40SYehuda Sadeh 
951602adf40SYehuda Sadeh /*
952602adf40SYehuda Sadeh  * zeros a bio chain, starting at specific offset
953602adf40SYehuda Sadeh  */
954602adf40SYehuda Sadeh static void zero_bio_chain(struct bio *chain, int start_ofs)
955602adf40SYehuda Sadeh {
956602adf40SYehuda Sadeh 	struct bio_vec *bv;
957602adf40SYehuda Sadeh 	unsigned long flags;
958602adf40SYehuda Sadeh 	void *buf;
959602adf40SYehuda Sadeh 	int i;
960602adf40SYehuda Sadeh 	int pos = 0;
961602adf40SYehuda Sadeh 
962602adf40SYehuda Sadeh 	while (chain) {
963602adf40SYehuda Sadeh 		bio_for_each_segment(bv, chain, i) {
964602adf40SYehuda Sadeh 			if (pos + bv->bv_len > start_ofs) {
965602adf40SYehuda Sadeh 				int remainder = max(start_ofs - pos, 0);
966602adf40SYehuda Sadeh 				buf = bvec_kmap_irq(bv, &flags);
967602adf40SYehuda Sadeh 				memset(buf + remainder, 0,
968602adf40SYehuda Sadeh 				       bv->bv_len - remainder);
96985b5aaa6SDan Carpenter 				bvec_kunmap_irq(buf, &flags);
970602adf40SYehuda Sadeh 			}
971602adf40SYehuda Sadeh 			pos += bv->bv_len;
972602adf40SYehuda Sadeh 		}
973602adf40SYehuda Sadeh 
974602adf40SYehuda Sadeh 		chain = chain->bi_next;
975602adf40SYehuda Sadeh 	}
976602adf40SYehuda Sadeh }
977602adf40SYehuda Sadeh 
978602adf40SYehuda Sadeh /*
979b9434c5bSAlex Elder  * similar to zero_bio_chain(), zeros data defined by a page array,
980b9434c5bSAlex Elder  * starting at the given byte offset from the start of the array and
981b9434c5bSAlex Elder  * continuing up to the given end offset.  The pages array is
982b9434c5bSAlex Elder  * assumed to be big enough to hold all bytes up to the end.
983b9434c5bSAlex Elder  */
984b9434c5bSAlex Elder static void zero_pages(struct page **pages, u64 offset, u64 end)
985b9434c5bSAlex Elder {
986b9434c5bSAlex Elder 	struct page **page = &pages[offset >> PAGE_SHIFT];
987b9434c5bSAlex Elder 
988b9434c5bSAlex Elder 	rbd_assert(end > offset);
989b9434c5bSAlex Elder 	rbd_assert(end - offset <= (u64)SIZE_MAX);
990b9434c5bSAlex Elder 	while (offset < end) {
991b9434c5bSAlex Elder 		size_t page_offset;
992b9434c5bSAlex Elder 		size_t length;
993b9434c5bSAlex Elder 		unsigned long flags;
994b9434c5bSAlex Elder 		void *kaddr;
995b9434c5bSAlex Elder 
996b9434c5bSAlex Elder 		page_offset = (size_t)(offset & ~PAGE_MASK);
997b9434c5bSAlex Elder 		length = min(PAGE_SIZE - page_offset, (size_t)(end - offset));
998b9434c5bSAlex Elder 		local_irq_save(flags);
999b9434c5bSAlex Elder 		kaddr = kmap_atomic(*page);
1000b9434c5bSAlex Elder 		memset(kaddr + page_offset, 0, length);
1001b9434c5bSAlex Elder 		kunmap_atomic(kaddr);
1002b9434c5bSAlex Elder 		local_irq_restore(flags);
1003b9434c5bSAlex Elder 
1004b9434c5bSAlex Elder 		offset += length;
1005b9434c5bSAlex Elder 		page++;
1006b9434c5bSAlex Elder 	}
1007b9434c5bSAlex Elder }
1008b9434c5bSAlex Elder 
1009b9434c5bSAlex Elder /*
1010f7760dadSAlex Elder  * Clone a portion of a bio, starting at the given byte offset
1011f7760dadSAlex Elder  * and continuing for the number of bytes indicated.
1012602adf40SYehuda Sadeh  */
1013f7760dadSAlex Elder static struct bio *bio_clone_range(struct bio *bio_src,
1014f7760dadSAlex Elder 					unsigned int offset,
1015f7760dadSAlex Elder 					unsigned int len,
1016f7760dadSAlex Elder 					gfp_t gfpmask)
1017602adf40SYehuda Sadeh {
1018f7760dadSAlex Elder 	struct bio_vec *bv;
1019f7760dadSAlex Elder 	unsigned int resid;
1020f7760dadSAlex Elder 	unsigned short idx;
1021f7760dadSAlex Elder 	unsigned int voff;
1022f7760dadSAlex Elder 	unsigned short end_idx;
1023f7760dadSAlex Elder 	unsigned short vcnt;
1024f7760dadSAlex Elder 	struct bio *bio;
1025602adf40SYehuda Sadeh 
1026f7760dadSAlex Elder 	/* Handle the easy case for the caller */
1027f7760dadSAlex Elder 
1028f7760dadSAlex Elder 	if (!offset && len == bio_src->bi_size)
1029f7760dadSAlex Elder 		return bio_clone(bio_src, gfpmask);
1030f7760dadSAlex Elder 
1031f7760dadSAlex Elder 	if (WARN_ON_ONCE(!len))
1032f7760dadSAlex Elder 		return NULL;
1033f7760dadSAlex Elder 	if (WARN_ON_ONCE(len > bio_src->bi_size))
1034f7760dadSAlex Elder 		return NULL;
1035f7760dadSAlex Elder 	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
1036f7760dadSAlex Elder 		return NULL;
1037f7760dadSAlex Elder 
1038f7760dadSAlex Elder 	/* Find first affected segment... */
1039f7760dadSAlex Elder 
1040f7760dadSAlex Elder 	resid = offset;
1041f7760dadSAlex Elder 	__bio_for_each_segment(bv, bio_src, idx, 0) {
1042f7760dadSAlex Elder 		if (resid < bv->bv_len)
1043f7760dadSAlex Elder 			break;
1044f7760dadSAlex Elder 		resid -= bv->bv_len;
1045602adf40SYehuda Sadeh 	}
1046f7760dadSAlex Elder 	voff = resid;
1047602adf40SYehuda Sadeh 
1048f7760dadSAlex Elder 	/* ...and the last affected segment */
1049542582fcSAlex Elder 
1050f7760dadSAlex Elder 	resid += len;
1051f7760dadSAlex Elder 	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
1052f7760dadSAlex Elder 		if (resid <= bv->bv_len)
1053f7760dadSAlex Elder 			break;
1054f7760dadSAlex Elder 		resid -= bv->bv_len;
1055f7760dadSAlex Elder 	}
1056f7760dadSAlex Elder 	vcnt = end_idx - idx + 1;
1057602adf40SYehuda Sadeh 
1058f7760dadSAlex Elder 	/* Build the clone */
1059f7760dadSAlex Elder 
1060f7760dadSAlex Elder 	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
1061f7760dadSAlex Elder 	if (!bio)
1062f7760dadSAlex Elder 		return NULL;	/* ENOMEM */
1063f7760dadSAlex Elder 
1064f7760dadSAlex Elder 	bio->bi_bdev = bio_src->bi_bdev;
1065f7760dadSAlex Elder 	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
1066f7760dadSAlex Elder 	bio->bi_rw = bio_src->bi_rw;
1067f7760dadSAlex Elder 	bio->bi_flags |= 1 << BIO_CLONED;
1068602adf40SYehuda Sadeh 
1069602adf40SYehuda Sadeh 	/*
1070f7760dadSAlex Elder 	 * Copy over our part of the bio_vec, then update the first
1071f7760dadSAlex Elder 	 * and last (or only) entries.
1072602adf40SYehuda Sadeh 	 */
1073f7760dadSAlex Elder 	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
1074f7760dadSAlex Elder 			vcnt * sizeof (struct bio_vec));
1075f7760dadSAlex Elder 	bio->bi_io_vec[0].bv_offset += voff;
1076f7760dadSAlex Elder 	if (vcnt > 1) {
1077f7760dadSAlex Elder 		bio->bi_io_vec[0].bv_len -= voff;
1078f7760dadSAlex Elder 		bio->bi_io_vec[vcnt - 1].bv_len = resid;
1079602adf40SYehuda Sadeh 	} else {
1080f7760dadSAlex Elder 		bio->bi_io_vec[0].bv_len = len;
1081602adf40SYehuda Sadeh 	}
1082602adf40SYehuda Sadeh 
1083f7760dadSAlex Elder 	bio->bi_vcnt = vcnt;
1084f7760dadSAlex Elder 	bio->bi_size = len;
1085f7760dadSAlex Elder 	bio->bi_idx = 0;
1086602adf40SYehuda Sadeh 
1087f7760dadSAlex Elder 	return bio;
1088602adf40SYehuda Sadeh }
1089602adf40SYehuda Sadeh 
1090f7760dadSAlex Elder /*
1091f7760dadSAlex Elder  * Clone a portion of a bio chain, starting at the given byte offset
1092f7760dadSAlex Elder  * into the first bio in the source chain and continuing for the
1093f7760dadSAlex Elder  * number of bytes indicated.  The result is another bio chain of
1094f7760dadSAlex Elder  * exactly the given length, or a null pointer on error.
1095f7760dadSAlex Elder  *
1096f7760dadSAlex Elder  * The bio_src and offset parameters are both in-out.  On entry they
1097f7760dadSAlex Elder  * refer to the first source bio and the offset into that bio where
1098f7760dadSAlex Elder  * the start of data to be cloned is located.
1099f7760dadSAlex Elder  *
1100f7760dadSAlex Elder  * On return, bio_src is updated to refer to the bio in the source
1101f7760dadSAlex Elder  * chain that contains first un-cloned byte, and *offset will
1102f7760dadSAlex Elder  * contain the offset of that byte within that bio.
1103f7760dadSAlex Elder  */
1104f7760dadSAlex Elder static struct bio *bio_chain_clone_range(struct bio **bio_src,
1105f7760dadSAlex Elder 					unsigned int *offset,
1106f7760dadSAlex Elder 					unsigned int len,
1107f7760dadSAlex Elder 					gfp_t gfpmask)
1108f7760dadSAlex Elder {
1109f7760dadSAlex Elder 	struct bio *bi = *bio_src;
1110f7760dadSAlex Elder 	unsigned int off = *offset;
1111f7760dadSAlex Elder 	struct bio *chain = NULL;
1112f7760dadSAlex Elder 	struct bio **end;
1113602adf40SYehuda Sadeh 
1114f7760dadSAlex Elder 	/* Build up a chain of clone bios up to the limit */
1115602adf40SYehuda Sadeh 
1116f7760dadSAlex Elder 	if (!bi || off >= bi->bi_size || !len)
1117f7760dadSAlex Elder 		return NULL;		/* Nothing to clone */
1118602adf40SYehuda Sadeh 
1119f7760dadSAlex Elder 	end = &chain;
1120f7760dadSAlex Elder 	while (len) {
1121f7760dadSAlex Elder 		unsigned int bi_size;
1122f7760dadSAlex Elder 		struct bio *bio;
1123f7760dadSAlex Elder 
1124f5400b7aSAlex Elder 		if (!bi) {
1125f5400b7aSAlex Elder 			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
1126f7760dadSAlex Elder 			goto out_err;	/* EINVAL; ran out of bio's */
1127f5400b7aSAlex Elder 		}
1128f7760dadSAlex Elder 		bi_size = min_t(unsigned int, bi->bi_size - off, len);
1129f7760dadSAlex Elder 		bio = bio_clone_range(bi, off, bi_size, gfpmask);
1130f7760dadSAlex Elder 		if (!bio)
1131f7760dadSAlex Elder 			goto out_err;	/* ENOMEM */
1132f7760dadSAlex Elder 
1133f7760dadSAlex Elder 		*end = bio;
1134f7760dadSAlex Elder 		end = &bio->bi_next;
1135f7760dadSAlex Elder 
1136f7760dadSAlex Elder 		off += bi_size;
1137f7760dadSAlex Elder 		if (off == bi->bi_size) {
1138f7760dadSAlex Elder 			bi = bi->bi_next;
1139f7760dadSAlex Elder 			off = 0;
1140f7760dadSAlex Elder 		}
1141f7760dadSAlex Elder 		len -= bi_size;
1142f7760dadSAlex Elder 	}
1143f7760dadSAlex Elder 	*bio_src = bi;
1144f7760dadSAlex Elder 	*offset = off;
1145f7760dadSAlex Elder 
1146f7760dadSAlex Elder 	return chain;
1147f7760dadSAlex Elder out_err:
1148f7760dadSAlex Elder 	bio_chain_put(chain);
1149f7760dadSAlex Elder 
1150602adf40SYehuda Sadeh 	return NULL;
1151602adf40SYehuda Sadeh }
1152602adf40SYehuda Sadeh 
1153926f9b3fSAlex Elder /*
1154926f9b3fSAlex Elder  * The default/initial value for all object request flags is 0.  For
1155926f9b3fSAlex Elder  * each flag, once its value is set to 1 it is never reset to 0
1156926f9b3fSAlex Elder  * again.
1157926f9b3fSAlex Elder  */
11586365d33aSAlex Elder static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
11596365d33aSAlex Elder {
11606365d33aSAlex Elder 	if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
11616365d33aSAlex Elder 		struct rbd_device *rbd_dev;
11626365d33aSAlex Elder 
116357acbaa7SAlex Elder 		rbd_dev = obj_request->img_request->rbd_dev;
11646365d33aSAlex Elder 		rbd_warn(rbd_dev, "obj_request %p already marked img_data\n",
11656365d33aSAlex Elder 			obj_request);
11666365d33aSAlex Elder 	}
11676365d33aSAlex Elder }
11686365d33aSAlex Elder 
11696365d33aSAlex Elder static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
11706365d33aSAlex Elder {
11716365d33aSAlex Elder 	smp_mb();
11726365d33aSAlex Elder 	return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
11736365d33aSAlex Elder }
11746365d33aSAlex Elder 
117557acbaa7SAlex Elder static void obj_request_done_set(struct rbd_obj_request *obj_request)
117657acbaa7SAlex Elder {
117757acbaa7SAlex Elder 	if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
117857acbaa7SAlex Elder 		struct rbd_device *rbd_dev = NULL;
117957acbaa7SAlex Elder 
118057acbaa7SAlex Elder 		if (obj_request_img_data_test(obj_request))
118157acbaa7SAlex Elder 			rbd_dev = obj_request->img_request->rbd_dev;
118257acbaa7SAlex Elder 		rbd_warn(rbd_dev, "obj_request %p already marked done\n",
118357acbaa7SAlex Elder 			obj_request);
118457acbaa7SAlex Elder 	}
118557acbaa7SAlex Elder }
118657acbaa7SAlex Elder 
118757acbaa7SAlex Elder static bool obj_request_done_test(struct rbd_obj_request *obj_request)
118857acbaa7SAlex Elder {
118957acbaa7SAlex Elder 	smp_mb();
119057acbaa7SAlex Elder 	return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
119157acbaa7SAlex Elder }
119257acbaa7SAlex Elder 
11935679c59fSAlex Elder /*
11945679c59fSAlex Elder  * This sets the KNOWN flag after (possibly) setting the EXISTS
11955679c59fSAlex Elder  * flag.  The latter is set based on the "exists" value provided.
11965679c59fSAlex Elder  *
11975679c59fSAlex Elder  * Note that for our purposes once an object exists it never goes
11985679c59fSAlex Elder  * away again.  It's possible that the response from two existence
11995679c59fSAlex Elder  * checks are separated by the creation of the target object, and
12005679c59fSAlex Elder  * the first ("doesn't exist") response arrives *after* the second
12015679c59fSAlex Elder  * ("does exist").  In that case we ignore the second one.
12025679c59fSAlex Elder  */
12035679c59fSAlex Elder static void obj_request_existence_set(struct rbd_obj_request *obj_request,
12045679c59fSAlex Elder 				bool exists)
12055679c59fSAlex Elder {
12065679c59fSAlex Elder 	if (exists)
12075679c59fSAlex Elder 		set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
12085679c59fSAlex Elder 	set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
12095679c59fSAlex Elder 	smp_mb();
12105679c59fSAlex Elder }
12115679c59fSAlex Elder 
12125679c59fSAlex Elder static bool obj_request_known_test(struct rbd_obj_request *obj_request)
12135679c59fSAlex Elder {
12145679c59fSAlex Elder 	smp_mb();
12155679c59fSAlex Elder 	return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
12165679c59fSAlex Elder }
12175679c59fSAlex Elder 
12185679c59fSAlex Elder static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
12195679c59fSAlex Elder {
12205679c59fSAlex Elder 	smp_mb();
12215679c59fSAlex Elder 	return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
12225679c59fSAlex Elder }
12235679c59fSAlex Elder 
1224bf0d5f50SAlex Elder static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1225bf0d5f50SAlex Elder {
122637206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
122737206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1228bf0d5f50SAlex Elder 	kref_get(&obj_request->kref);
1229bf0d5f50SAlex Elder }
1230bf0d5f50SAlex Elder 
1231bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref);
1232bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1233bf0d5f50SAlex Elder {
1234bf0d5f50SAlex Elder 	rbd_assert(obj_request != NULL);
123537206ee5SAlex Elder 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
123637206ee5SAlex Elder 		atomic_read(&obj_request->kref.refcount));
1237bf0d5f50SAlex Elder 	kref_put(&obj_request->kref, rbd_obj_request_destroy);
1238bf0d5f50SAlex Elder }
1239bf0d5f50SAlex Elder 
1240bf0d5f50SAlex Elder static void rbd_img_request_get(struct rbd_img_request *img_request)
1241bf0d5f50SAlex Elder {
124237206ee5SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
124337206ee5SAlex Elder 		atomic_read(&img_request->kref.refcount));
1244bf0d5f50SAlex Elder 	kref_get(&img_request->kref);
1245bf0d5f50SAlex Elder }
1246bf0d5f50SAlex Elder 
1247bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref);
1248bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request)
1249bf0d5f50SAlex Elder {
1250bf0d5f50SAlex Elder 	rbd_assert(img_request != NULL);
125137206ee5SAlex Elder 	dout("%s: img %p (was %d)\n", __func__, img_request,
125237206ee5SAlex Elder 		atomic_read(&img_request->kref.refcount));
1253bf0d5f50SAlex Elder 	kref_put(&img_request->kref, rbd_img_request_destroy);
1254bf0d5f50SAlex Elder }
1255bf0d5f50SAlex Elder 
/*
 * Add an object request to the tail of an image request's list of
 * object requests.  The object request must not already be
 * associated with an image request.  Its "which" value is set to
 * the image request's object request count as it was before this
 * addition, and the object request is flagged as image data.
 */
static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->img_request == NULL);

	/* Image request now owns object's original reference */
	obj_request->img_request = img_request;
	obj_request->which = img_request->obj_request_count;
	/* The IMG_DATA flag must not have been set before now */
	rbd_assert(!obj_request_img_data_test(obj_request));
	obj_request_img_data_set(obj_request);
	rbd_assert(obj_request->which != BAD_WHICH);
	img_request->obj_request_count++;
	list_add_tail(&obj_request->links, &img_request->obj_requests);
	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
		obj_request->which);
}
1272bf0d5f50SAlex Elder 
/*
 * Remove an object request from its image request's list, clear
 * its association with the image request, and drop a reference to
 * it.  The assertions require that the object request's "which"
 * value match the image request's object request count after it
 * has been decremented.
 */
static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->which != BAD_WHICH);

	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
		obj_request->which);
	list_del(&obj_request->links);
	rbd_assert(img_request->obj_request_count > 0);
	img_request->obj_request_count--;
	rbd_assert(obj_request->which == img_request->obj_request_count);
	obj_request->which = BAD_WHICH;
	/* The IMG_DATA flag stays set; flags are never reset */
	rbd_assert(obj_request_img_data_test(obj_request));
	rbd_assert(obj_request->img_request == img_request);
	obj_request->img_request = NULL;
	obj_request->callback = NULL;
	rbd_obj_request_put(obj_request);
}
1291bf0d5f50SAlex Elder 
1292bf0d5f50SAlex Elder static bool obj_request_type_valid(enum obj_request_type type)
1293bf0d5f50SAlex Elder {
1294bf0d5f50SAlex Elder 	switch (type) {
12959969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
1296bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1297788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1298bf0d5f50SAlex Elder 		return true;
1299bf0d5f50SAlex Elder 	default:
1300bf0d5f50SAlex Elder 		return false;
1301bf0d5f50SAlex Elder 	}
1302bf0d5f50SAlex Elder }
1303bf0d5f50SAlex Elder 
1304bf0d5f50SAlex Elder static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1305bf0d5f50SAlex Elder 				struct rbd_obj_request *obj_request)
1306bf0d5f50SAlex Elder {
130737206ee5SAlex Elder 	dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);
130837206ee5SAlex Elder 
1309bf0d5f50SAlex Elder 	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1310bf0d5f50SAlex Elder }
1311bf0d5f50SAlex Elder 
1312bf0d5f50SAlex Elder static void rbd_img_request_complete(struct rbd_img_request *img_request)
1313bf0d5f50SAlex Elder {
131455f27e09SAlex Elder 
131537206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
131655f27e09SAlex Elder 
131755f27e09SAlex Elder 	/*
131855f27e09SAlex Elder 	 * If no error occurred, compute the aggregate transfer
131955f27e09SAlex Elder 	 * count for the image request.  We could instead use
132055f27e09SAlex Elder 	 * atomic64_cmpxchg() to update it as each object request
132155f27e09SAlex Elder 	 * completes; not clear which way is better off hand.
132255f27e09SAlex Elder 	 */
132355f27e09SAlex Elder 	if (!img_request->result) {
132455f27e09SAlex Elder 		struct rbd_obj_request *obj_request;
132555f27e09SAlex Elder 		u64 xferred = 0;
132655f27e09SAlex Elder 
132755f27e09SAlex Elder 		for_each_obj_request(img_request, obj_request)
132855f27e09SAlex Elder 			xferred += obj_request->xferred;
132955f27e09SAlex Elder 		img_request->xferred = xferred;
133055f27e09SAlex Elder 	}
133155f27e09SAlex Elder 
1332bf0d5f50SAlex Elder 	if (img_request->callback)
1333bf0d5f50SAlex Elder 		img_request->callback(img_request);
1334bf0d5f50SAlex Elder 	else
1335bf0d5f50SAlex Elder 		rbd_img_request_put(img_request);
1336bf0d5f50SAlex Elder }
1337bf0d5f50SAlex Elder 
1338788e2df3SAlex Elder /* Caller is responsible for rbd_obj_request_destroy(obj_request) */
1339788e2df3SAlex Elder 
1340788e2df3SAlex Elder static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
1341788e2df3SAlex Elder {
134237206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
134337206ee5SAlex Elder 
1344788e2df3SAlex Elder 	return wait_for_completion_interruptible(&obj_request->completion);
1345788e2df3SAlex Elder }
1346788e2df3SAlex Elder 
/*
 * The default/initial value for all image request flags is 0.  Each
 * flag is conditionally set to 1 at image request initialization
 * time and currently never changes thereafter.
 */
13520c425248SAlex Elder static void img_request_write_set(struct rbd_img_request *img_request)
13530c425248SAlex Elder {
13540c425248SAlex Elder 	set_bit(IMG_REQ_WRITE, &img_request->flags);
13550c425248SAlex Elder 	smp_mb();
13560c425248SAlex Elder }
13570c425248SAlex Elder 
13580c425248SAlex Elder static bool img_request_write_test(struct rbd_img_request *img_request)
13590c425248SAlex Elder {
13600c425248SAlex Elder 	smp_mb();
13610c425248SAlex Elder 	return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
13620c425248SAlex Elder }
13630c425248SAlex Elder 
13649849e986SAlex Elder static void img_request_child_set(struct rbd_img_request *img_request)
13659849e986SAlex Elder {
13669849e986SAlex Elder 	set_bit(IMG_REQ_CHILD, &img_request->flags);
13679849e986SAlex Elder 	smp_mb();
13689849e986SAlex Elder }
13699849e986SAlex Elder 
13709849e986SAlex Elder static bool img_request_child_test(struct rbd_img_request *img_request)
13719849e986SAlex Elder {
13729849e986SAlex Elder 	smp_mb();
13739849e986SAlex Elder 	return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
13749849e986SAlex Elder }
13759849e986SAlex Elder 
1376d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request)
1377d0b2e944SAlex Elder {
1378d0b2e944SAlex Elder 	set_bit(IMG_REQ_LAYERED, &img_request->flags);
1379d0b2e944SAlex Elder 	smp_mb();
1380d0b2e944SAlex Elder }
1381d0b2e944SAlex Elder 
1382d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request)
1383d0b2e944SAlex Elder {
1384d0b2e944SAlex Elder 	smp_mb();
1385d0b2e944SAlex Elder 	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1386d0b2e944SAlex Elder }
1387d0b2e944SAlex Elder 
13886e2a4505SAlex Elder static void
13896e2a4505SAlex Elder rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
13906e2a4505SAlex Elder {
1391b9434c5bSAlex Elder 	u64 xferred = obj_request->xferred;
1392b9434c5bSAlex Elder 	u64 length = obj_request->length;
1393b9434c5bSAlex Elder 
13946e2a4505SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
13956e2a4505SAlex Elder 		obj_request, obj_request->img_request, obj_request->result,
1396b9434c5bSAlex Elder 		xferred, length);
13976e2a4505SAlex Elder 	/*
13986e2a4505SAlex Elder 	 * ENOENT means a hole in the image.  We zero-fill the
13996e2a4505SAlex Elder 	 * entire length of the request.  A short read also implies
14006e2a4505SAlex Elder 	 * zero-fill to the end of the request.  Either way we
14016e2a4505SAlex Elder 	 * update the xferred count to indicate the whole request
14026e2a4505SAlex Elder 	 * was satisfied.
14036e2a4505SAlex Elder 	 */
1404b9434c5bSAlex Elder 	rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
14056e2a4505SAlex Elder 	if (obj_request->result == -ENOENT) {
1406b9434c5bSAlex Elder 		if (obj_request->type == OBJ_REQUEST_BIO)
14076e2a4505SAlex Elder 			zero_bio_chain(obj_request->bio_list, 0);
1408b9434c5bSAlex Elder 		else
1409b9434c5bSAlex Elder 			zero_pages(obj_request->pages, 0, length);
14106e2a4505SAlex Elder 		obj_request->result = 0;
1411b9434c5bSAlex Elder 		obj_request->xferred = length;
1412b9434c5bSAlex Elder 	} else if (xferred < length && !obj_request->result) {
1413b9434c5bSAlex Elder 		if (obj_request->type == OBJ_REQUEST_BIO)
1414b9434c5bSAlex Elder 			zero_bio_chain(obj_request->bio_list, xferred);
1415b9434c5bSAlex Elder 		else
1416b9434c5bSAlex Elder 			zero_pages(obj_request->pages, xferred, length);
1417b9434c5bSAlex Elder 		obj_request->xferred = length;
14186e2a4505SAlex Elder 	}
14196e2a4505SAlex Elder 	obj_request_done_set(obj_request);
14206e2a4505SAlex Elder }
14216e2a4505SAlex Elder 
1422bf0d5f50SAlex Elder static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1423bf0d5f50SAlex Elder {
142437206ee5SAlex Elder 	dout("%s: obj %p cb %p\n", __func__, obj_request,
142537206ee5SAlex Elder 		obj_request->callback);
1426bf0d5f50SAlex Elder 	if (obj_request->callback)
1427bf0d5f50SAlex Elder 		obj_request->callback(obj_request);
1428788e2df3SAlex Elder 	else
1429788e2df3SAlex Elder 		complete_all(&obj_request->completion);
1430bf0d5f50SAlex Elder }
1431bf0d5f50SAlex Elder 
/*
 * Completion handler for osd ops that need no post-processing:
 * the object request is simply marked done.
 */
static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
143739bf2c5dSAlex Elder 
1438c47f9371SAlex Elder static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
1439bf0d5f50SAlex Elder {
144057acbaa7SAlex Elder 	struct rbd_img_request *img_request = NULL;
1441a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev = NULL;
144257acbaa7SAlex Elder 	bool layered = false;
144357acbaa7SAlex Elder 
144457acbaa7SAlex Elder 	if (obj_request_img_data_test(obj_request)) {
144557acbaa7SAlex Elder 		img_request = obj_request->img_request;
144657acbaa7SAlex Elder 		layered = img_request && img_request_layered_test(img_request);
1447a9e8ba2cSAlex Elder 		rbd_dev = img_request->rbd_dev;
144857acbaa7SAlex Elder 	}
14498b3e1a56SAlex Elder 
14508b3e1a56SAlex Elder 	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
14518b3e1a56SAlex Elder 		obj_request, img_request, obj_request->result,
14528b3e1a56SAlex Elder 		obj_request->xferred, obj_request->length);
1453a9e8ba2cSAlex Elder 	if (layered && obj_request->result == -ENOENT &&
1454a9e8ba2cSAlex Elder 			obj_request->img_offset < rbd_dev->parent_overlap)
14558b3e1a56SAlex Elder 		rbd_img_parent_read(obj_request);
14568b3e1a56SAlex Elder 	else if (img_request)
14576e2a4505SAlex Elder 		rbd_img_obj_request_read_callback(obj_request);
14586e2a4505SAlex Elder 	else
145907741308SAlex Elder 		obj_request_done_set(obj_request);
1460bf0d5f50SAlex Elder }
1461bf0d5f50SAlex Elder 
1462c47f9371SAlex Elder static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
1463bf0d5f50SAlex Elder {
14641b83bef2SSage Weil 	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
14651b83bef2SSage Weil 		obj_request->result, obj_request->length);
14661b83bef2SSage Weil 	/*
14678b3e1a56SAlex Elder 	 * There is no such thing as a successful short write.  Set
14688b3e1a56SAlex Elder 	 * it to our originally-requested length.
14691b83bef2SSage Weil 	 */
14701b83bef2SSage Weil 	obj_request->xferred = obj_request->length;
147107741308SAlex Elder 	obj_request_done_set(obj_request);
1472bf0d5f50SAlex Elder }
1473bf0d5f50SAlex Elder 
/*
 * Handle completion of an osd stat op.  A plain stat call needs no
 * further work, so the object request is just marked done.  More
 * will be done here when the stat is part of a write sequence for
 * a layered image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
1483fbfab539SAlex Elder 
1484bf0d5f50SAlex Elder static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1485bf0d5f50SAlex Elder 				struct ceph_msg *msg)
1486bf0d5f50SAlex Elder {
1487bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = osd_req->r_priv;
1488bf0d5f50SAlex Elder 	u16 opcode;
1489bf0d5f50SAlex Elder 
149037206ee5SAlex Elder 	dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
1491bf0d5f50SAlex Elder 	rbd_assert(osd_req == obj_request->osd_req);
149257acbaa7SAlex Elder 	if (obj_request_img_data_test(obj_request)) {
149357acbaa7SAlex Elder 		rbd_assert(obj_request->img_request);
149457acbaa7SAlex Elder 		rbd_assert(obj_request->which != BAD_WHICH);
149557acbaa7SAlex Elder 	} else {
149657acbaa7SAlex Elder 		rbd_assert(obj_request->which == BAD_WHICH);
149757acbaa7SAlex Elder 	}
1498bf0d5f50SAlex Elder 
14991b83bef2SSage Weil 	if (osd_req->r_result < 0)
15001b83bef2SSage Weil 		obj_request->result = osd_req->r_result;
1501bf0d5f50SAlex Elder 	obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);
1502bf0d5f50SAlex Elder 
15030eefd470SAlex Elder 	BUG_ON(osd_req->r_num_ops > 2);
1504bf0d5f50SAlex Elder 
1505c47f9371SAlex Elder 	/*
1506c47f9371SAlex Elder 	 * We support a 64-bit length, but ultimately it has to be
1507c47f9371SAlex Elder 	 * passed to blk_end_request(), which takes an unsigned int.
1508c47f9371SAlex Elder 	 */
15091b83bef2SSage Weil 	obj_request->xferred = osd_req->r_reply_op_len[0];
1510c47f9371SAlex Elder 	rbd_assert(obj_request->xferred < (u64)UINT_MAX);
151179528734SAlex Elder 	opcode = osd_req->r_ops[0].op;
1512bf0d5f50SAlex Elder 	switch (opcode) {
1513bf0d5f50SAlex Elder 	case CEPH_OSD_OP_READ:
1514c47f9371SAlex Elder 		rbd_osd_read_callback(obj_request);
1515bf0d5f50SAlex Elder 		break;
1516bf0d5f50SAlex Elder 	case CEPH_OSD_OP_WRITE:
1517c47f9371SAlex Elder 		rbd_osd_write_callback(obj_request);
1518bf0d5f50SAlex Elder 		break;
1519fbfab539SAlex Elder 	case CEPH_OSD_OP_STAT:
1520c47f9371SAlex Elder 		rbd_osd_stat_callback(obj_request);
1521fbfab539SAlex Elder 		break;
152236be9a76SAlex Elder 	case CEPH_OSD_OP_CALL:
1523b8d70035SAlex Elder 	case CEPH_OSD_OP_NOTIFY_ACK:
15249969ebc5SAlex Elder 	case CEPH_OSD_OP_WATCH:
1525c47f9371SAlex Elder 		rbd_osd_trivial_callback(obj_request);
15269969ebc5SAlex Elder 		break;
1527bf0d5f50SAlex Elder 	default:
1528bf0d5f50SAlex Elder 		rbd_warn(NULL, "%s: unsupported op %hu\n",
1529bf0d5f50SAlex Elder 			obj_request->object_name, (unsigned short) opcode);
1530bf0d5f50SAlex Elder 		break;
1531bf0d5f50SAlex Elder 	}
1532bf0d5f50SAlex Elder 
153307741308SAlex Elder 	if (obj_request_done_test(obj_request))
1534bf0d5f50SAlex Elder 		rbd_obj_request_complete(obj_request);
1535bf0d5f50SAlex Elder }
1536bf0d5f50SAlex Elder 
15379d4df01fSAlex Elder static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
1538430c28c3SAlex Elder {
1539430c28c3SAlex Elder 	struct rbd_img_request *img_request = obj_request->img_request;
15408c042b0dSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
15419d4df01fSAlex Elder 	u64 snap_id;
1542430c28c3SAlex Elder 
15438c042b0dSAlex Elder 	rbd_assert(osd_req != NULL);
1544430c28c3SAlex Elder 
15459d4df01fSAlex Elder 	snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
15468c042b0dSAlex Elder 	ceph_osdc_build_request(osd_req, obj_request->offset,
15479d4df01fSAlex Elder 			NULL, snap_id, NULL);
15489d4df01fSAlex Elder }
15499d4df01fSAlex Elder 
15509d4df01fSAlex Elder static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
15519d4df01fSAlex Elder {
15529d4df01fSAlex Elder 	struct rbd_img_request *img_request = obj_request->img_request;
15539d4df01fSAlex Elder 	struct ceph_osd_request *osd_req = obj_request->osd_req;
15549d4df01fSAlex Elder 	struct ceph_snap_context *snapc;
15559d4df01fSAlex Elder 	struct timespec mtime = CURRENT_TIME;
15569d4df01fSAlex Elder 
15579d4df01fSAlex Elder 	rbd_assert(osd_req != NULL);
15589d4df01fSAlex Elder 
15599d4df01fSAlex Elder 	snapc = img_request ? img_request->snapc : NULL;
15609d4df01fSAlex Elder 	ceph_osdc_build_request(osd_req, obj_request->offset,
15619d4df01fSAlex Elder 			snapc, CEPH_NOSNAP, &mtime);
1562430c28c3SAlex Elder }
1563430c28c3SAlex Elder 
1564bf0d5f50SAlex Elder static struct ceph_osd_request *rbd_osd_req_create(
1565bf0d5f50SAlex Elder 					struct rbd_device *rbd_dev,
1566bf0d5f50SAlex Elder 					bool write_request,
1567430c28c3SAlex Elder 					struct rbd_obj_request *obj_request)
1568bf0d5f50SAlex Elder {
1569bf0d5f50SAlex Elder 	struct ceph_snap_context *snapc = NULL;
1570bf0d5f50SAlex Elder 	struct ceph_osd_client *osdc;
1571bf0d5f50SAlex Elder 	struct ceph_osd_request *osd_req;
1572bf0d5f50SAlex Elder 
15736365d33aSAlex Elder 	if (obj_request_img_data_test(obj_request)) {
15746365d33aSAlex Elder 		struct rbd_img_request *img_request = obj_request->img_request;
15756365d33aSAlex Elder 
15760c425248SAlex Elder 		rbd_assert(write_request ==
15770c425248SAlex Elder 				img_request_write_test(img_request));
15780c425248SAlex Elder 		if (write_request)
1579bf0d5f50SAlex Elder 			snapc = img_request->snapc;
1580bf0d5f50SAlex Elder 	}
1581bf0d5f50SAlex Elder 
1582bf0d5f50SAlex Elder 	/* Allocate and initialize the request, for the single op */
1583bf0d5f50SAlex Elder 
1584bf0d5f50SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
1585bf0d5f50SAlex Elder 	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
1586bf0d5f50SAlex Elder 	if (!osd_req)
1587bf0d5f50SAlex Elder 		return NULL;	/* ENOMEM */
1588bf0d5f50SAlex Elder 
1589430c28c3SAlex Elder 	if (write_request)
1590bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1591430c28c3SAlex Elder 	else
1592bf0d5f50SAlex Elder 		osd_req->r_flags = CEPH_OSD_FLAG_READ;
1593bf0d5f50SAlex Elder 
1594bf0d5f50SAlex Elder 	osd_req->r_callback = rbd_osd_req_callback;
1595bf0d5f50SAlex Elder 	osd_req->r_priv = obj_request;
1596bf0d5f50SAlex Elder 
1597bf0d5f50SAlex Elder 	osd_req->r_oid_len = strlen(obj_request->object_name);
1598bf0d5f50SAlex Elder 	rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1599bf0d5f50SAlex Elder 	memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1600bf0d5f50SAlex Elder 
1601bf0d5f50SAlex Elder 	osd_req->r_file_layout = rbd_dev->layout;	/* struct */
1602bf0d5f50SAlex Elder 
1603bf0d5f50SAlex Elder 	return osd_req;
1604bf0d5f50SAlex Elder }
1605bf0d5f50SAlex Elder 
16060eefd470SAlex Elder /*
16070eefd470SAlex Elder  * Create a copyup osd request based on the information in the
16080eefd470SAlex Elder  * object request supplied.  A copyup request has two osd ops,
16090eefd470SAlex Elder  * a copyup method call, and a "normal" write request.
16100eefd470SAlex Elder  */
16110eefd470SAlex Elder static struct ceph_osd_request *
16120eefd470SAlex Elder rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
16130eefd470SAlex Elder {
16140eefd470SAlex Elder 	struct rbd_img_request *img_request;
16150eefd470SAlex Elder 	struct ceph_snap_context *snapc;
16160eefd470SAlex Elder 	struct rbd_device *rbd_dev;
16170eefd470SAlex Elder 	struct ceph_osd_client *osdc;
16180eefd470SAlex Elder 	struct ceph_osd_request *osd_req;
16190eefd470SAlex Elder 
16200eefd470SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
16210eefd470SAlex Elder 	img_request = obj_request->img_request;
16220eefd470SAlex Elder 	rbd_assert(img_request);
16230eefd470SAlex Elder 	rbd_assert(img_request_write_test(img_request));
16240eefd470SAlex Elder 
16250eefd470SAlex Elder 	/* Allocate and initialize the request, for the two ops */
16260eefd470SAlex Elder 
16270eefd470SAlex Elder 	snapc = img_request->snapc;
16280eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
16290eefd470SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
16300eefd470SAlex Elder 	osd_req = ceph_osdc_alloc_request(osdc, snapc, 2, false, GFP_ATOMIC);
16310eefd470SAlex Elder 	if (!osd_req)
16320eefd470SAlex Elder 		return NULL;	/* ENOMEM */
16330eefd470SAlex Elder 
16340eefd470SAlex Elder 	osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
16350eefd470SAlex Elder 	osd_req->r_callback = rbd_osd_req_callback;
16360eefd470SAlex Elder 	osd_req->r_priv = obj_request;
16370eefd470SAlex Elder 
16380eefd470SAlex Elder 	osd_req->r_oid_len = strlen(obj_request->object_name);
16390eefd470SAlex Elder 	rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
16400eefd470SAlex Elder 	memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
16410eefd470SAlex Elder 
16420eefd470SAlex Elder 	osd_req->r_file_layout = rbd_dev->layout;	/* struct */
16430eefd470SAlex Elder 
16440eefd470SAlex Elder 	return osd_req;
16450eefd470SAlex Elder }
16460eefd470SAlex Elder 
16470eefd470SAlex Elder 
/* Release an osd request by calling ceph_osdc_put_request() on it */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
1652bf0d5f50SAlex Elder 
1653bf0d5f50SAlex Elder /* object_name is assumed to be a non-null pointer and NUL-terminated */
1654bf0d5f50SAlex Elder 
1655bf0d5f50SAlex Elder static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1656bf0d5f50SAlex Elder 						u64 offset, u64 length,
1657bf0d5f50SAlex Elder 						enum obj_request_type type)
1658bf0d5f50SAlex Elder {
1659bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1660bf0d5f50SAlex Elder 	size_t size;
1661bf0d5f50SAlex Elder 	char *name;
1662bf0d5f50SAlex Elder 
1663bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(type));
1664bf0d5f50SAlex Elder 
1665bf0d5f50SAlex Elder 	size = strlen(object_name) + 1;
1666bf0d5f50SAlex Elder 	obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
1667bf0d5f50SAlex Elder 	if (!obj_request)
1668bf0d5f50SAlex Elder 		return NULL;
1669bf0d5f50SAlex Elder 
1670bf0d5f50SAlex Elder 	name = (char *)(obj_request + 1);
1671bf0d5f50SAlex Elder 	obj_request->object_name = memcpy(name, object_name, size);
1672bf0d5f50SAlex Elder 	obj_request->offset = offset;
1673bf0d5f50SAlex Elder 	obj_request->length = length;
1674926f9b3fSAlex Elder 	obj_request->flags = 0;
1675bf0d5f50SAlex Elder 	obj_request->which = BAD_WHICH;
1676bf0d5f50SAlex Elder 	obj_request->type = type;
1677bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&obj_request->links);
1678788e2df3SAlex Elder 	init_completion(&obj_request->completion);
1679bf0d5f50SAlex Elder 	kref_init(&obj_request->kref);
1680bf0d5f50SAlex Elder 
168137206ee5SAlex Elder 	dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
168237206ee5SAlex Elder 		offset, length, (int)type, obj_request);
168337206ee5SAlex Elder 
1684bf0d5f50SAlex Elder 	return obj_request;
1685bf0d5f50SAlex Elder }
1686bf0d5f50SAlex Elder 
1687bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref)
1688bf0d5f50SAlex Elder {
1689bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1690bf0d5f50SAlex Elder 
1691bf0d5f50SAlex Elder 	obj_request = container_of(kref, struct rbd_obj_request, kref);
1692bf0d5f50SAlex Elder 
169337206ee5SAlex Elder 	dout("%s: obj %p\n", __func__, obj_request);
169437206ee5SAlex Elder 
1695bf0d5f50SAlex Elder 	rbd_assert(obj_request->img_request == NULL);
1696bf0d5f50SAlex Elder 	rbd_assert(obj_request->which == BAD_WHICH);
1697bf0d5f50SAlex Elder 
1698bf0d5f50SAlex Elder 	if (obj_request->osd_req)
1699bf0d5f50SAlex Elder 		rbd_osd_req_destroy(obj_request->osd_req);
1700bf0d5f50SAlex Elder 
1701bf0d5f50SAlex Elder 	rbd_assert(obj_request_type_valid(obj_request->type));
1702bf0d5f50SAlex Elder 	switch (obj_request->type) {
17039969ebc5SAlex Elder 	case OBJ_REQUEST_NODATA:
17049969ebc5SAlex Elder 		break;		/* Nothing to do */
1705bf0d5f50SAlex Elder 	case OBJ_REQUEST_BIO:
1706bf0d5f50SAlex Elder 		if (obj_request->bio_list)
1707bf0d5f50SAlex Elder 			bio_chain_put(obj_request->bio_list);
1708bf0d5f50SAlex Elder 		break;
1709788e2df3SAlex Elder 	case OBJ_REQUEST_PAGES:
1710788e2df3SAlex Elder 		if (obj_request->pages)
1711788e2df3SAlex Elder 			ceph_release_page_vector(obj_request->pages,
1712788e2df3SAlex Elder 						obj_request->page_count);
1713788e2df3SAlex Elder 		break;
1714bf0d5f50SAlex Elder 	}
1715bf0d5f50SAlex Elder 
1716bf0d5f50SAlex Elder 	kfree(obj_request);
1717bf0d5f50SAlex Elder }
1718bf0d5f50SAlex Elder 
1719bf0d5f50SAlex Elder /*
1720bf0d5f50SAlex Elder  * Caller is responsible for filling in the list of object requests
1721bf0d5f50SAlex Elder  * that comprises the image request, and the Linux request pointer
1722bf0d5f50SAlex Elder  * (if there is one).
1723bf0d5f50SAlex Elder  */
1724cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create(
1725cc344fa1SAlex Elder 					struct rbd_device *rbd_dev,
1726bf0d5f50SAlex Elder 					u64 offset, u64 length,
17279849e986SAlex Elder 					bool write_request,
17289849e986SAlex Elder 					bool child_request)
1729bf0d5f50SAlex Elder {
1730bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
1731bf0d5f50SAlex Elder 	struct ceph_snap_context *snapc = NULL;
1732bf0d5f50SAlex Elder 
1733bf0d5f50SAlex Elder 	img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
1734bf0d5f50SAlex Elder 	if (!img_request)
1735bf0d5f50SAlex Elder 		return NULL;
1736bf0d5f50SAlex Elder 
1737bf0d5f50SAlex Elder 	if (write_request) {
1738bf0d5f50SAlex Elder 		down_read(&rbd_dev->header_rwsem);
1739bf0d5f50SAlex Elder 		snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1740bf0d5f50SAlex Elder 		up_read(&rbd_dev->header_rwsem);
1741bf0d5f50SAlex Elder 		if (WARN_ON(!snapc)) {
1742bf0d5f50SAlex Elder 			kfree(img_request);
1743bf0d5f50SAlex Elder 			return NULL;	/* Shouldn't happen */
1744bf0d5f50SAlex Elder 		}
17450c425248SAlex Elder 
1746bf0d5f50SAlex Elder 	}
1747bf0d5f50SAlex Elder 
1748bf0d5f50SAlex Elder 	img_request->rq = NULL;
1749bf0d5f50SAlex Elder 	img_request->rbd_dev = rbd_dev;
1750bf0d5f50SAlex Elder 	img_request->offset = offset;
1751bf0d5f50SAlex Elder 	img_request->length = length;
17520c425248SAlex Elder 	img_request->flags = 0;
17530c425248SAlex Elder 	if (write_request) {
17540c425248SAlex Elder 		img_request_write_set(img_request);
1755bf0d5f50SAlex Elder 		img_request->snapc = snapc;
17560c425248SAlex Elder 	} else {
1757bf0d5f50SAlex Elder 		img_request->snap_id = rbd_dev->spec->snap_id;
17580c425248SAlex Elder 	}
17599849e986SAlex Elder 	if (child_request)
17609849e986SAlex Elder 		img_request_child_set(img_request);
1761d0b2e944SAlex Elder 	if (rbd_dev->parent_spec)
1762d0b2e944SAlex Elder 		img_request_layered_set(img_request);
1763bf0d5f50SAlex Elder 	spin_lock_init(&img_request->completion_lock);
1764bf0d5f50SAlex Elder 	img_request->next_completion = 0;
1765bf0d5f50SAlex Elder 	img_request->callback = NULL;
1766a5a337d4SAlex Elder 	img_request->result = 0;
1767bf0d5f50SAlex Elder 	img_request->obj_request_count = 0;
1768bf0d5f50SAlex Elder 	INIT_LIST_HEAD(&img_request->obj_requests);
1769bf0d5f50SAlex Elder 	kref_init(&img_request->kref);
1770bf0d5f50SAlex Elder 
1771bf0d5f50SAlex Elder 	rbd_img_request_get(img_request);	/* Avoid a warning */
1772bf0d5f50SAlex Elder 	rbd_img_request_put(img_request);	/* TEMPORARY */
1773bf0d5f50SAlex Elder 
177437206ee5SAlex Elder 	dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
177537206ee5SAlex Elder 		write_request ? "write" : "read", offset, length,
177637206ee5SAlex Elder 		img_request);
177737206ee5SAlex Elder 
1778bf0d5f50SAlex Elder 	return img_request;
1779bf0d5f50SAlex Elder }
1780bf0d5f50SAlex Elder 
1781bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref)
1782bf0d5f50SAlex Elder {
1783bf0d5f50SAlex Elder 	struct rbd_img_request *img_request;
1784bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
1785bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
1786bf0d5f50SAlex Elder 
1787bf0d5f50SAlex Elder 	img_request = container_of(kref, struct rbd_img_request, kref);
1788bf0d5f50SAlex Elder 
178937206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
179037206ee5SAlex Elder 
1791bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1792bf0d5f50SAlex Elder 		rbd_img_obj_request_del(img_request, obj_request);
179325dcf954SAlex Elder 	rbd_assert(img_request->obj_request_count == 0);
1794bf0d5f50SAlex Elder 
17950c425248SAlex Elder 	if (img_request_write_test(img_request))
1796bf0d5f50SAlex Elder 		ceph_put_snap_context(img_request->snapc);
1797bf0d5f50SAlex Elder 
17988b3e1a56SAlex Elder 	if (img_request_child_test(img_request))
17998b3e1a56SAlex Elder 		rbd_obj_request_put(img_request->obj_request);
18008b3e1a56SAlex Elder 
1801bf0d5f50SAlex Elder 	kfree(img_request);
1802bf0d5f50SAlex Elder }
1803bf0d5f50SAlex Elder 
18041217857fSAlex Elder static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
18051217857fSAlex Elder {
18066365d33aSAlex Elder 	struct rbd_img_request *img_request;
18071217857fSAlex Elder 	unsigned int xferred;
18081217857fSAlex Elder 	int result;
18098b3e1a56SAlex Elder 	bool more;
18101217857fSAlex Elder 
18116365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
18126365d33aSAlex Elder 	img_request = obj_request->img_request;
18136365d33aSAlex Elder 
18141217857fSAlex Elder 	rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
18151217857fSAlex Elder 	xferred = (unsigned int)obj_request->xferred;
18161217857fSAlex Elder 	result = obj_request->result;
18171217857fSAlex Elder 	if (result) {
18181217857fSAlex Elder 		struct rbd_device *rbd_dev = img_request->rbd_dev;
18191217857fSAlex Elder 
18201217857fSAlex Elder 		rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n",
18211217857fSAlex Elder 			img_request_write_test(img_request) ? "write" : "read",
18221217857fSAlex Elder 			obj_request->length, obj_request->img_offset,
18231217857fSAlex Elder 			obj_request->offset);
18241217857fSAlex Elder 		rbd_warn(rbd_dev, "  result %d xferred %x\n",
18251217857fSAlex Elder 			result, xferred);
18261217857fSAlex Elder 		if (!img_request->result)
18271217857fSAlex Elder 			img_request->result = result;
18281217857fSAlex Elder 	}
18291217857fSAlex Elder 
1830f1a4739fSAlex Elder 	/* Image object requests don't own their page array */
1831f1a4739fSAlex Elder 
1832f1a4739fSAlex Elder 	if (obj_request->type == OBJ_REQUEST_PAGES) {
1833f1a4739fSAlex Elder 		obj_request->pages = NULL;
1834f1a4739fSAlex Elder 		obj_request->page_count = 0;
1835f1a4739fSAlex Elder 	}
1836f1a4739fSAlex Elder 
18378b3e1a56SAlex Elder 	if (img_request_child_test(img_request)) {
18388b3e1a56SAlex Elder 		rbd_assert(img_request->obj_request != NULL);
18398b3e1a56SAlex Elder 		more = obj_request->which < img_request->obj_request_count - 1;
18408b3e1a56SAlex Elder 	} else {
18418b3e1a56SAlex Elder 		rbd_assert(img_request->rq != NULL);
18428b3e1a56SAlex Elder 		more = blk_end_request(img_request->rq, result, xferred);
18438b3e1a56SAlex Elder 	}
18448b3e1a56SAlex Elder 
18458b3e1a56SAlex Elder 	return more;
18461217857fSAlex Elder }
18471217857fSAlex Elder 
18482169238dSAlex Elder static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
18492169238dSAlex Elder {
18502169238dSAlex Elder 	struct rbd_img_request *img_request;
18512169238dSAlex Elder 	u32 which = obj_request->which;
18522169238dSAlex Elder 	bool more = true;
18532169238dSAlex Elder 
18546365d33aSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
18552169238dSAlex Elder 	img_request = obj_request->img_request;
18562169238dSAlex Elder 
18572169238dSAlex Elder 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
18582169238dSAlex Elder 	rbd_assert(img_request != NULL);
18592169238dSAlex Elder 	rbd_assert(img_request->obj_request_count > 0);
18602169238dSAlex Elder 	rbd_assert(which != BAD_WHICH);
18612169238dSAlex Elder 	rbd_assert(which < img_request->obj_request_count);
18622169238dSAlex Elder 	rbd_assert(which >= img_request->next_completion);
18632169238dSAlex Elder 
18642169238dSAlex Elder 	spin_lock_irq(&img_request->completion_lock);
18652169238dSAlex Elder 	if (which != img_request->next_completion)
18662169238dSAlex Elder 		goto out;
18672169238dSAlex Elder 
18682169238dSAlex Elder 	for_each_obj_request_from(img_request, obj_request) {
18692169238dSAlex Elder 		rbd_assert(more);
18702169238dSAlex Elder 		rbd_assert(which < img_request->obj_request_count);
18712169238dSAlex Elder 
18722169238dSAlex Elder 		if (!obj_request_done_test(obj_request))
18732169238dSAlex Elder 			break;
18741217857fSAlex Elder 		more = rbd_img_obj_end_request(obj_request);
18752169238dSAlex Elder 		which++;
18762169238dSAlex Elder 	}
18772169238dSAlex Elder 
18782169238dSAlex Elder 	rbd_assert(more ^ (which == img_request->obj_request_count));
18792169238dSAlex Elder 	img_request->next_completion = which;
18802169238dSAlex Elder out:
18812169238dSAlex Elder 	spin_unlock_irq(&img_request->completion_lock);
18822169238dSAlex Elder 
18832169238dSAlex Elder 	if (!more)
18842169238dSAlex Elder 		rbd_img_request_complete(img_request);
18852169238dSAlex Elder }
18862169238dSAlex Elder 
1887f1a4739fSAlex Elder /*
1888f1a4739fSAlex Elder  * Split up an image request into one or more object requests, each
1889f1a4739fSAlex Elder  * to a different object.  The "type" parameter indicates whether
1890f1a4739fSAlex Elder  * "data_desc" is the pointer to the head of a list of bio
1891f1a4739fSAlex Elder  * structures, or the base of a page array.  In either case this
1892f1a4739fSAlex Elder  * function assumes data_desc describes memory sufficient to hold
1893f1a4739fSAlex Elder  * all data described by the image request.
1894f1a4739fSAlex Elder  */
1895f1a4739fSAlex Elder static int rbd_img_request_fill(struct rbd_img_request *img_request,
1896f1a4739fSAlex Elder 					enum obj_request_type type,
1897f1a4739fSAlex Elder 					void *data_desc)
1898bf0d5f50SAlex Elder {
1899bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = img_request->rbd_dev;
1900bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request = NULL;
1901bf0d5f50SAlex Elder 	struct rbd_obj_request *next_obj_request;
19020c425248SAlex Elder 	bool write_request = img_request_write_test(img_request);
1903f1a4739fSAlex Elder 	struct bio *bio_list;
1904f1a4739fSAlex Elder 	unsigned int bio_offset = 0;
1905f1a4739fSAlex Elder 	struct page **pages;
19067da22d29SAlex Elder 	u64 img_offset;
1907bf0d5f50SAlex Elder 	u64 resid;
1908bf0d5f50SAlex Elder 	u16 opcode;
1909bf0d5f50SAlex Elder 
1910f1a4739fSAlex Elder 	dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
1911f1a4739fSAlex Elder 		(int)type, data_desc);
191237206ee5SAlex Elder 
1913430c28c3SAlex Elder 	opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
19147da22d29SAlex Elder 	img_offset = img_request->offset;
1915bf0d5f50SAlex Elder 	resid = img_request->length;
19164dda41d3SAlex Elder 	rbd_assert(resid > 0);
1917f1a4739fSAlex Elder 
1918f1a4739fSAlex Elder 	if (type == OBJ_REQUEST_BIO) {
1919f1a4739fSAlex Elder 		bio_list = data_desc;
1920f1a4739fSAlex Elder 		rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT);
1921f1a4739fSAlex Elder 	} else {
1922f1a4739fSAlex Elder 		rbd_assert(type == OBJ_REQUEST_PAGES);
1923f1a4739fSAlex Elder 		pages = data_desc;
1924f1a4739fSAlex Elder 	}
1925f1a4739fSAlex Elder 
1926bf0d5f50SAlex Elder 	while (resid) {
19272fa12320SAlex Elder 		struct ceph_osd_request *osd_req;
1928bf0d5f50SAlex Elder 		const char *object_name;
1929bf0d5f50SAlex Elder 		u64 offset;
1930bf0d5f50SAlex Elder 		u64 length;
1931bf0d5f50SAlex Elder 
19327da22d29SAlex Elder 		object_name = rbd_segment_name(rbd_dev, img_offset);
1933bf0d5f50SAlex Elder 		if (!object_name)
1934bf0d5f50SAlex Elder 			goto out_unwind;
19357da22d29SAlex Elder 		offset = rbd_segment_offset(rbd_dev, img_offset);
19367da22d29SAlex Elder 		length = rbd_segment_length(rbd_dev, img_offset, resid);
1937bf0d5f50SAlex Elder 		obj_request = rbd_obj_request_create(object_name,
1938f1a4739fSAlex Elder 						offset, length, type);
1939bf0d5f50SAlex Elder 		kfree(object_name);	/* object request has its own copy */
1940bf0d5f50SAlex Elder 		if (!obj_request)
1941bf0d5f50SAlex Elder 			goto out_unwind;
1942bf0d5f50SAlex Elder 
1943f1a4739fSAlex Elder 		if (type == OBJ_REQUEST_BIO) {
1944f1a4739fSAlex Elder 			unsigned int clone_size;
1945f1a4739fSAlex Elder 
1946bf0d5f50SAlex Elder 			rbd_assert(length <= (u64)UINT_MAX);
1947bf0d5f50SAlex Elder 			clone_size = (unsigned int)length;
1948f1a4739fSAlex Elder 			obj_request->bio_list =
1949f1a4739fSAlex Elder 					bio_chain_clone_range(&bio_list,
1950f1a4739fSAlex Elder 								&bio_offset,
1951f1a4739fSAlex Elder 								clone_size,
1952bf0d5f50SAlex Elder 								GFP_ATOMIC);
1953bf0d5f50SAlex Elder 			if (!obj_request->bio_list)
1954bf0d5f50SAlex Elder 				goto out_partial;
1955f1a4739fSAlex Elder 		} else {
1956f1a4739fSAlex Elder 			unsigned int page_count;
1957f1a4739fSAlex Elder 
1958f1a4739fSAlex Elder 			obj_request->pages = pages;
1959f1a4739fSAlex Elder 			page_count = (u32)calc_pages_for(offset, length);
1960f1a4739fSAlex Elder 			obj_request->page_count = page_count;
1961f1a4739fSAlex Elder 			if ((offset + length) & ~PAGE_MASK)
1962f1a4739fSAlex Elder 				page_count--;	/* more on last page */
1963f1a4739fSAlex Elder 			pages += page_count;
1964f1a4739fSAlex Elder 		}
1965bf0d5f50SAlex Elder 
19662fa12320SAlex Elder 		osd_req = rbd_osd_req_create(rbd_dev, write_request,
19672fa12320SAlex Elder 						obj_request);
19682fa12320SAlex Elder 		if (!osd_req)
1969bf0d5f50SAlex Elder 			goto out_partial;
19702fa12320SAlex Elder 		obj_request->osd_req = osd_req;
19712169238dSAlex Elder 		obj_request->callback = rbd_img_obj_callback;
1972430c28c3SAlex Elder 
19732fa12320SAlex Elder 		osd_req_op_extent_init(osd_req, 0, opcode, offset, length,
19742fa12320SAlex Elder 						0, 0);
1975f1a4739fSAlex Elder 		if (type == OBJ_REQUEST_BIO)
1976406e2c9fSAlex Elder 			osd_req_op_extent_osd_data_bio(osd_req, 0,
1977f1a4739fSAlex Elder 					obj_request->bio_list, length);
1978f1a4739fSAlex Elder 		else
1979f1a4739fSAlex Elder 			osd_req_op_extent_osd_data_pages(osd_req, 0,
1980f1a4739fSAlex Elder 					obj_request->pages, length,
1981f1a4739fSAlex Elder 					offset & ~PAGE_MASK, false, false);
19829d4df01fSAlex Elder 
19839d4df01fSAlex Elder 		if (write_request)
19849d4df01fSAlex Elder 			rbd_osd_req_format_write(obj_request);
19859d4df01fSAlex Elder 		else
19869d4df01fSAlex Elder 			rbd_osd_req_format_read(obj_request);
1987430c28c3SAlex Elder 
19887da22d29SAlex Elder 		obj_request->img_offset = img_offset;
1989bf0d5f50SAlex Elder 		rbd_img_obj_request_add(img_request, obj_request);
1990bf0d5f50SAlex Elder 
19917da22d29SAlex Elder 		img_offset += length;
1992bf0d5f50SAlex Elder 		resid -= length;
1993bf0d5f50SAlex Elder 	}
1994bf0d5f50SAlex Elder 
1995bf0d5f50SAlex Elder 	return 0;
1996bf0d5f50SAlex Elder 
1997bf0d5f50SAlex Elder out_partial:
1998bf0d5f50SAlex Elder 	rbd_obj_request_put(obj_request);
1999bf0d5f50SAlex Elder out_unwind:
2000bf0d5f50SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2001bf0d5f50SAlex Elder 		rbd_obj_request_put(obj_request);
2002bf0d5f50SAlex Elder 
2003bf0d5f50SAlex Elder 	return -ENOMEM;
2004bf0d5f50SAlex Elder }
2005bf0d5f50SAlex Elder 
20063d7efd18SAlex Elder static void
20070eefd470SAlex Elder rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request)
20080eefd470SAlex Elder {
20090eefd470SAlex Elder 	struct rbd_img_request *img_request;
20100eefd470SAlex Elder 	struct rbd_device *rbd_dev;
20110eefd470SAlex Elder 	u64 length;
20120eefd470SAlex Elder 	u32 page_count;
20130eefd470SAlex Elder 
20140eefd470SAlex Elder 	rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
20150eefd470SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
20160eefd470SAlex Elder 	img_request = obj_request->img_request;
20170eefd470SAlex Elder 	rbd_assert(img_request);
20180eefd470SAlex Elder 
20190eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
20200eefd470SAlex Elder 	rbd_assert(rbd_dev);
20210eefd470SAlex Elder 	length = (u64)1 << rbd_dev->header.obj_order;
20220eefd470SAlex Elder 	page_count = (u32)calc_pages_for(0, length);
20230eefd470SAlex Elder 
20240eefd470SAlex Elder 	rbd_assert(obj_request->copyup_pages);
20250eefd470SAlex Elder 	ceph_release_page_vector(obj_request->copyup_pages, page_count);
20260eefd470SAlex Elder 	obj_request->copyup_pages = NULL;
20270eefd470SAlex Elder 
20280eefd470SAlex Elder 	/*
20290eefd470SAlex Elder 	 * We want the transfer count to reflect the size of the
20300eefd470SAlex Elder 	 * original write request.  There is no such thing as a
20310eefd470SAlex Elder 	 * successful short write, so if the request was successful
20320eefd470SAlex Elder 	 * we can just set it to the originally-requested length.
20330eefd470SAlex Elder 	 */
20340eefd470SAlex Elder 	if (!obj_request->result)
20350eefd470SAlex Elder 		obj_request->xferred = obj_request->length;
20360eefd470SAlex Elder 
20370eefd470SAlex Elder 	/* Finish up with the normal image object callback */
20380eefd470SAlex Elder 
20390eefd470SAlex Elder 	rbd_img_obj_callback(obj_request);
20400eefd470SAlex Elder }
20410eefd470SAlex Elder 
20420eefd470SAlex Elder static void
20433d7efd18SAlex Elder rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
20443d7efd18SAlex Elder {
20453d7efd18SAlex Elder 	struct rbd_obj_request *orig_request;
20460eefd470SAlex Elder 	struct ceph_osd_request *osd_req;
20470eefd470SAlex Elder 	struct ceph_osd_client *osdc;
20480eefd470SAlex Elder 	struct rbd_device *rbd_dev;
20493d7efd18SAlex Elder 	struct page **pages;
20503d7efd18SAlex Elder 	int result;
20513d7efd18SAlex Elder 	u64 obj_size;
20523d7efd18SAlex Elder 	u64 xferred;
20533d7efd18SAlex Elder 
20543d7efd18SAlex Elder 	rbd_assert(img_request_child_test(img_request));
20553d7efd18SAlex Elder 
20563d7efd18SAlex Elder 	/* First get what we need from the image request */
20573d7efd18SAlex Elder 
20583d7efd18SAlex Elder 	pages = img_request->copyup_pages;
20593d7efd18SAlex Elder 	rbd_assert(pages != NULL);
20603d7efd18SAlex Elder 	img_request->copyup_pages = NULL;
20613d7efd18SAlex Elder 
20623d7efd18SAlex Elder 	orig_request = img_request->obj_request;
20633d7efd18SAlex Elder 	rbd_assert(orig_request != NULL);
20640eefd470SAlex Elder 	rbd_assert(orig_request->type == OBJ_REQUEST_BIO);
20653d7efd18SAlex Elder 	result = img_request->result;
20663d7efd18SAlex Elder 	obj_size = img_request->length;
20673d7efd18SAlex Elder 	xferred = img_request->xferred;
20683d7efd18SAlex Elder 
20690eefd470SAlex Elder 	rbd_dev = img_request->rbd_dev;
20700eefd470SAlex Elder 	rbd_assert(rbd_dev);
20710eefd470SAlex Elder 	rbd_assert(obj_size == (u64)1 << rbd_dev->header.obj_order);
20720eefd470SAlex Elder 
20733d7efd18SAlex Elder 	rbd_img_request_put(img_request);
20743d7efd18SAlex Elder 
20750eefd470SAlex Elder 	if (result)
20760eefd470SAlex Elder 		goto out_err;
20773d7efd18SAlex Elder 
20780eefd470SAlex Elder 	/* Allocate the new copyup osd request for the original request */
20793d7efd18SAlex Elder 
20800eefd470SAlex Elder 	result = -ENOMEM;
20810eefd470SAlex Elder 	rbd_assert(!orig_request->osd_req);
20820eefd470SAlex Elder 	osd_req = rbd_osd_req_create_copyup(orig_request);
20830eefd470SAlex Elder 	if (!osd_req)
20840eefd470SAlex Elder 		goto out_err;
20850eefd470SAlex Elder 	orig_request->osd_req = osd_req;
20860eefd470SAlex Elder 	orig_request->copyup_pages = pages;
20873d7efd18SAlex Elder 
20880eefd470SAlex Elder 	/* Initialize the copyup op */
20890eefd470SAlex Elder 
20900eefd470SAlex Elder 	osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
20910eefd470SAlex Elder 	osd_req_op_cls_request_data_pages(osd_req, 0, pages, obj_size, 0,
20920eefd470SAlex Elder 						false, false);
20930eefd470SAlex Elder 
20940eefd470SAlex Elder 	/* Then the original write request op */
20950eefd470SAlex Elder 
20960eefd470SAlex Elder 	osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE,
20970eefd470SAlex Elder 					orig_request->offset,
20980eefd470SAlex Elder 					orig_request->length, 0, 0);
20990eefd470SAlex Elder 	osd_req_op_extent_osd_data_bio(osd_req, 1, orig_request->bio_list,
21000eefd470SAlex Elder 					orig_request->length);
21010eefd470SAlex Elder 
21020eefd470SAlex Elder 	rbd_osd_req_format_write(orig_request);
21030eefd470SAlex Elder 
21040eefd470SAlex Elder 	/* All set, send it off. */
21050eefd470SAlex Elder 
21060eefd470SAlex Elder 	orig_request->callback = rbd_img_obj_copyup_callback;
21070eefd470SAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
21080eefd470SAlex Elder 	result = rbd_obj_request_submit(osdc, orig_request);
21090eefd470SAlex Elder 	if (!result)
21100eefd470SAlex Elder 		return;
21110eefd470SAlex Elder out_err:
21120eefd470SAlex Elder 	/* Record the error code and complete the request */
21130eefd470SAlex Elder 
21140eefd470SAlex Elder 	orig_request->result = result;
21150eefd470SAlex Elder 	orig_request->xferred = 0;
21163d7efd18SAlex Elder 	obj_request_done_set(orig_request);
21173d7efd18SAlex Elder 	rbd_obj_request_complete(orig_request);
21183d7efd18SAlex Elder }
21193d7efd18SAlex Elder 
21203d7efd18SAlex Elder /*
21213d7efd18SAlex Elder  * Read from the parent image the range of data that covers the
21223d7efd18SAlex Elder  * entire target of the given object request.  This is used for
21233d7efd18SAlex Elder  * satisfying a layered image write request when the target of an
21243d7efd18SAlex Elder  * object request from the image request does not exist.
21253d7efd18SAlex Elder  *
21263d7efd18SAlex Elder  * A page array big enough to hold the returned data is allocated
21273d7efd18SAlex Elder  * and supplied to rbd_img_request_fill() as the "data descriptor."
21283d7efd18SAlex Elder  * When the read completes, this page array will be transferred to
21293d7efd18SAlex Elder  * the original object request for the copyup operation.
21303d7efd18SAlex Elder  *
21313d7efd18SAlex Elder  * If an error occurs, record it as the result of the original
21323d7efd18SAlex Elder  * object request and mark it done so it gets completed.
21333d7efd18SAlex Elder  */
21343d7efd18SAlex Elder static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
21353d7efd18SAlex Elder {
21363d7efd18SAlex Elder 	struct rbd_img_request *img_request = NULL;
21373d7efd18SAlex Elder 	struct rbd_img_request *parent_request = NULL;
21383d7efd18SAlex Elder 	struct rbd_device *rbd_dev;
21393d7efd18SAlex Elder 	u64 img_offset;
21403d7efd18SAlex Elder 	u64 length;
21413d7efd18SAlex Elder 	struct page **pages = NULL;
21423d7efd18SAlex Elder 	u32 page_count;
21433d7efd18SAlex Elder 	int result;
21443d7efd18SAlex Elder 
21453d7efd18SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
21463d7efd18SAlex Elder 	rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
21473d7efd18SAlex Elder 
21483d7efd18SAlex Elder 	img_request = obj_request->img_request;
21493d7efd18SAlex Elder 	rbd_assert(img_request != NULL);
21503d7efd18SAlex Elder 	rbd_dev = img_request->rbd_dev;
21513d7efd18SAlex Elder 	rbd_assert(rbd_dev->parent != NULL);
21523d7efd18SAlex Elder 
21533d7efd18SAlex Elder 	/*
21540eefd470SAlex Elder 	 * First things first.  The original osd request is of no
21550eefd470SAlex Elder 	 * use to use any more, we'll need a new one that can hold
21560eefd470SAlex Elder 	 * the two ops in a copyup request.  We'll get that later,
21570eefd470SAlex Elder 	 * but for now we can release the old one.
21580eefd470SAlex Elder 	 */
21590eefd470SAlex Elder 	rbd_osd_req_destroy(obj_request->osd_req);
21600eefd470SAlex Elder 	obj_request->osd_req = NULL;
21610eefd470SAlex Elder 
21620eefd470SAlex Elder 	/*
21633d7efd18SAlex Elder 	 * Determine the byte range covered by the object in the
21643d7efd18SAlex Elder 	 * child image to which the original request was to be sent.
21653d7efd18SAlex Elder 	 */
21663d7efd18SAlex Elder 	img_offset = obj_request->img_offset - obj_request->offset;
21673d7efd18SAlex Elder 	length = (u64)1 << rbd_dev->header.obj_order;
21683d7efd18SAlex Elder 
21693d7efd18SAlex Elder 	/*
2170a9e8ba2cSAlex Elder 	 * There is no defined parent data beyond the parent
2171a9e8ba2cSAlex Elder 	 * overlap, so limit what we read at that boundary if
2172a9e8ba2cSAlex Elder 	 * necessary.
2173a9e8ba2cSAlex Elder 	 */
2174a9e8ba2cSAlex Elder 	if (img_offset + length > rbd_dev->parent_overlap) {
2175a9e8ba2cSAlex Elder 		rbd_assert(img_offset < rbd_dev->parent_overlap);
2176a9e8ba2cSAlex Elder 		length = rbd_dev->parent_overlap - img_offset;
2177a9e8ba2cSAlex Elder 	}
2178a9e8ba2cSAlex Elder 
2179a9e8ba2cSAlex Elder 	/*
21803d7efd18SAlex Elder 	 * Allocate a page array big enough to receive the data read
21813d7efd18SAlex Elder 	 * from the parent.
21823d7efd18SAlex Elder 	 */
21833d7efd18SAlex Elder 	page_count = (u32)calc_pages_for(0, length);
21843d7efd18SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
21853d7efd18SAlex Elder 	if (IS_ERR(pages)) {
21863d7efd18SAlex Elder 		result = PTR_ERR(pages);
21873d7efd18SAlex Elder 		pages = NULL;
21883d7efd18SAlex Elder 		goto out_err;
21893d7efd18SAlex Elder 	}
21903d7efd18SAlex Elder 
21913d7efd18SAlex Elder 	result = -ENOMEM;
21923d7efd18SAlex Elder 	parent_request = rbd_img_request_create(rbd_dev->parent,
21933d7efd18SAlex Elder 						img_offset, length,
21943d7efd18SAlex Elder 						false, true);
21953d7efd18SAlex Elder 	if (!parent_request)
21963d7efd18SAlex Elder 		goto out_err;
21973d7efd18SAlex Elder 	rbd_obj_request_get(obj_request);
21983d7efd18SAlex Elder 	parent_request->obj_request = obj_request;
21993d7efd18SAlex Elder 
22003d7efd18SAlex Elder 	result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
22013d7efd18SAlex Elder 	if (result)
22023d7efd18SAlex Elder 		goto out_err;
22033d7efd18SAlex Elder 	parent_request->copyup_pages = pages;
22043d7efd18SAlex Elder 
22053d7efd18SAlex Elder 	parent_request->callback = rbd_img_obj_parent_read_full_callback;
22063d7efd18SAlex Elder 	result = rbd_img_request_submit(parent_request);
22073d7efd18SAlex Elder 	if (!result)
22083d7efd18SAlex Elder 		return 0;
22093d7efd18SAlex Elder 
22103d7efd18SAlex Elder 	parent_request->copyup_pages = NULL;
22113d7efd18SAlex Elder 	parent_request->obj_request = NULL;
22123d7efd18SAlex Elder 	rbd_obj_request_put(obj_request);
22133d7efd18SAlex Elder out_err:
22143d7efd18SAlex Elder 	if (pages)
22153d7efd18SAlex Elder 		ceph_release_page_vector(pages, page_count);
22163d7efd18SAlex Elder 	if (parent_request)
22173d7efd18SAlex Elder 		rbd_img_request_put(parent_request);
22183d7efd18SAlex Elder 	obj_request->result = result;
22193d7efd18SAlex Elder 	obj_request->xferred = 0;
22203d7efd18SAlex Elder 	obj_request_done_set(obj_request);
22213d7efd18SAlex Elder 
22223d7efd18SAlex Elder 	return result;
22233d7efd18SAlex Elder }
22243d7efd18SAlex Elder 
2225c5b5ef6cSAlex Elder static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
2226c5b5ef6cSAlex Elder {
2227c5b5ef6cSAlex Elder 	struct rbd_obj_request *orig_request;
2228c5b5ef6cSAlex Elder 	int result;
2229c5b5ef6cSAlex Elder 
2230c5b5ef6cSAlex Elder 	rbd_assert(!obj_request_img_data_test(obj_request));
2231c5b5ef6cSAlex Elder 
2232c5b5ef6cSAlex Elder 	/*
2233c5b5ef6cSAlex Elder 	 * All we need from the object request is the original
2234c5b5ef6cSAlex Elder 	 * request and the result of the STAT op.  Grab those, then
2235c5b5ef6cSAlex Elder 	 * we're done with the request.
2236c5b5ef6cSAlex Elder 	 */
2237c5b5ef6cSAlex Elder 	orig_request = obj_request->obj_request;
2238c5b5ef6cSAlex Elder 	obj_request->obj_request = NULL;
2239c5b5ef6cSAlex Elder 	rbd_assert(orig_request);
2240c5b5ef6cSAlex Elder 	rbd_assert(orig_request->img_request);
2241c5b5ef6cSAlex Elder 
2242c5b5ef6cSAlex Elder 	result = obj_request->result;
2243c5b5ef6cSAlex Elder 	obj_request->result = 0;
2244c5b5ef6cSAlex Elder 
2245c5b5ef6cSAlex Elder 	dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
2246c5b5ef6cSAlex Elder 		obj_request, orig_request, result,
2247c5b5ef6cSAlex Elder 		obj_request->xferred, obj_request->length);
2248c5b5ef6cSAlex Elder 	rbd_obj_request_put(obj_request);
2249c5b5ef6cSAlex Elder 
2250c5b5ef6cSAlex Elder 	rbd_assert(orig_request);
2251c5b5ef6cSAlex Elder 	rbd_assert(orig_request->img_request);
2252c5b5ef6cSAlex Elder 
2253c5b5ef6cSAlex Elder 	/*
2254c5b5ef6cSAlex Elder 	 * Our only purpose here is to determine whether the object
2255c5b5ef6cSAlex Elder 	 * exists, and we don't want to treat the non-existence as
2256c5b5ef6cSAlex Elder 	 * an error.  If something else comes back, transfer the
2257c5b5ef6cSAlex Elder 	 * error to the original request and complete it now.
2258c5b5ef6cSAlex Elder 	 */
2259c5b5ef6cSAlex Elder 	if (!result) {
2260c5b5ef6cSAlex Elder 		obj_request_existence_set(orig_request, true);
2261c5b5ef6cSAlex Elder 	} else if (result == -ENOENT) {
2262c5b5ef6cSAlex Elder 		obj_request_existence_set(orig_request, false);
2263c5b5ef6cSAlex Elder 	} else if (result) {
2264c5b5ef6cSAlex Elder 		orig_request->result = result;
22653d7efd18SAlex Elder 		goto out;
2266c5b5ef6cSAlex Elder 	}
2267c5b5ef6cSAlex Elder 
2268c5b5ef6cSAlex Elder 	/*
2269c5b5ef6cSAlex Elder 	 * Resubmit the original request now that we have recorded
2270c5b5ef6cSAlex Elder 	 * whether the target object exists.
2271c5b5ef6cSAlex Elder 	 */
2272b454e36dSAlex Elder 	orig_request->result = rbd_img_obj_request_submit(orig_request);
22733d7efd18SAlex Elder out:
2274c5b5ef6cSAlex Elder 	if (orig_request->result)
2275c5b5ef6cSAlex Elder 		rbd_obj_request_complete(orig_request);
2276c5b5ef6cSAlex Elder 	rbd_obj_request_put(orig_request);
2277c5b5ef6cSAlex Elder }
2278c5b5ef6cSAlex Elder 
2279c5b5ef6cSAlex Elder static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
2280c5b5ef6cSAlex Elder {
2281c5b5ef6cSAlex Elder 	struct rbd_obj_request *stat_request;
2282c5b5ef6cSAlex Elder 	struct rbd_device *rbd_dev;
2283c5b5ef6cSAlex Elder 	struct ceph_osd_client *osdc;
2284c5b5ef6cSAlex Elder 	struct page **pages = NULL;
2285c5b5ef6cSAlex Elder 	u32 page_count;
2286c5b5ef6cSAlex Elder 	size_t size;
2287c5b5ef6cSAlex Elder 	int ret;
2288c5b5ef6cSAlex Elder 
2289c5b5ef6cSAlex Elder 	/*
2290c5b5ef6cSAlex Elder 	 * The response data for a STAT call consists of:
2291c5b5ef6cSAlex Elder 	 *     le64 length;
2292c5b5ef6cSAlex Elder 	 *     struct {
2293c5b5ef6cSAlex Elder 	 *         le32 tv_sec;
2294c5b5ef6cSAlex Elder 	 *         le32 tv_nsec;
2295c5b5ef6cSAlex Elder 	 *     } mtime;
2296c5b5ef6cSAlex Elder 	 */
2297c5b5ef6cSAlex Elder 	size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
2298c5b5ef6cSAlex Elder 	page_count = (u32)calc_pages_for(0, size);
2299c5b5ef6cSAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2300c5b5ef6cSAlex Elder 	if (IS_ERR(pages))
2301c5b5ef6cSAlex Elder 		return PTR_ERR(pages);
2302c5b5ef6cSAlex Elder 
2303c5b5ef6cSAlex Elder 	ret = -ENOMEM;
2304c5b5ef6cSAlex Elder 	stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
2305c5b5ef6cSAlex Elder 							OBJ_REQUEST_PAGES);
2306c5b5ef6cSAlex Elder 	if (!stat_request)
2307c5b5ef6cSAlex Elder 		goto out;
2308c5b5ef6cSAlex Elder 
2309c5b5ef6cSAlex Elder 	rbd_obj_request_get(obj_request);
2310c5b5ef6cSAlex Elder 	stat_request->obj_request = obj_request;
2311c5b5ef6cSAlex Elder 	stat_request->pages = pages;
2312c5b5ef6cSAlex Elder 	stat_request->page_count = page_count;
2313c5b5ef6cSAlex Elder 
2314c5b5ef6cSAlex Elder 	rbd_assert(obj_request->img_request);
2315c5b5ef6cSAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
2316c5b5ef6cSAlex Elder 	stat_request->osd_req = rbd_osd_req_create(rbd_dev, false,
2317c5b5ef6cSAlex Elder 						stat_request);
2318c5b5ef6cSAlex Elder 	if (!stat_request->osd_req)
2319c5b5ef6cSAlex Elder 		goto out;
2320c5b5ef6cSAlex Elder 	stat_request->callback = rbd_img_obj_exists_callback;
2321c5b5ef6cSAlex Elder 
2322c5b5ef6cSAlex Elder 	osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT);
2323c5b5ef6cSAlex Elder 	osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
2324c5b5ef6cSAlex Elder 					false, false);
23259d4df01fSAlex Elder 	rbd_osd_req_format_read(stat_request);
2326c5b5ef6cSAlex Elder 
2327c5b5ef6cSAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
2328c5b5ef6cSAlex Elder 	ret = rbd_obj_request_submit(osdc, stat_request);
2329c5b5ef6cSAlex Elder out:
2330c5b5ef6cSAlex Elder 	if (ret)
2331c5b5ef6cSAlex Elder 		rbd_obj_request_put(obj_request);
2332c5b5ef6cSAlex Elder 
2333c5b5ef6cSAlex Elder 	return ret;
2334c5b5ef6cSAlex Elder }
2335c5b5ef6cSAlex Elder 
2336b454e36dSAlex Elder static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
2337b454e36dSAlex Elder {
2338b454e36dSAlex Elder 	struct rbd_img_request *img_request;
2339a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev;
23403d7efd18SAlex Elder 	bool known;
2341b454e36dSAlex Elder 
2342b454e36dSAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
2343b454e36dSAlex Elder 
2344b454e36dSAlex Elder 	img_request = obj_request->img_request;
2345b454e36dSAlex Elder 	rbd_assert(img_request);
2346a9e8ba2cSAlex Elder 	rbd_dev = img_request->rbd_dev;
2347b454e36dSAlex Elder 
2348b454e36dSAlex Elder 	/*
2349a9e8ba2cSAlex Elder 	 * Only writes to layered images need special handling.
2350a9e8ba2cSAlex Elder 	 * Reads and non-layered writes are simple object requests.
2351a9e8ba2cSAlex Elder 	 * Layered writes that start beyond the end of the overlap
2352a9e8ba2cSAlex Elder 	 * with the parent have no parent data, so they too are
2353a9e8ba2cSAlex Elder 	 * simple object requests.  Finally, if the target object is
2354a9e8ba2cSAlex Elder 	 * known to already exist, its parent data has already been
2355a9e8ba2cSAlex Elder 	 * copied, so a write to the object can also be handled as a
2356a9e8ba2cSAlex Elder 	 * simple object request.
2357b454e36dSAlex Elder 	 */
2358b454e36dSAlex Elder 	if (!img_request_write_test(img_request) ||
2359b454e36dSAlex Elder 		!img_request_layered_test(img_request) ||
2360a9e8ba2cSAlex Elder 		rbd_dev->parent_overlap <= obj_request->img_offset ||
23613d7efd18SAlex Elder 		((known = obj_request_known_test(obj_request)) &&
23623d7efd18SAlex Elder 			obj_request_exists_test(obj_request))) {
2363b454e36dSAlex Elder 
2364b454e36dSAlex Elder 		struct rbd_device *rbd_dev;
2365b454e36dSAlex Elder 		struct ceph_osd_client *osdc;
2366b454e36dSAlex Elder 
2367b454e36dSAlex Elder 		rbd_dev = obj_request->img_request->rbd_dev;
2368b454e36dSAlex Elder 		osdc = &rbd_dev->rbd_client->client->osdc;
2369b454e36dSAlex Elder 
2370b454e36dSAlex Elder 		return rbd_obj_request_submit(osdc, obj_request);
2371b454e36dSAlex Elder 	}
2372b454e36dSAlex Elder 
2373b454e36dSAlex Elder 	/*
23743d7efd18SAlex Elder 	 * It's a layered write.  The target object might exist but
23753d7efd18SAlex Elder 	 * we may not know that yet.  If we know it doesn't exist,
23763d7efd18SAlex Elder 	 * start by reading the data for the full target object from
23773d7efd18SAlex Elder 	 * the parent so we can use it for a copyup to the target.
2378b454e36dSAlex Elder 	 */
23793d7efd18SAlex Elder 	if (known)
23803d7efd18SAlex Elder 		return rbd_img_obj_parent_read_full(obj_request);
23813d7efd18SAlex Elder 
23823d7efd18SAlex Elder 	/* We don't know whether the target exists.  Go find out. */
2383b454e36dSAlex Elder 
2384b454e36dSAlex Elder 	return rbd_img_obj_exists_submit(obj_request);
2385b454e36dSAlex Elder }
2386b454e36dSAlex Elder 
2387bf0d5f50SAlex Elder static int rbd_img_request_submit(struct rbd_img_request *img_request)
2388bf0d5f50SAlex Elder {
2389bf0d5f50SAlex Elder 	struct rbd_obj_request *obj_request;
239046faeed4SAlex Elder 	struct rbd_obj_request *next_obj_request;
2391bf0d5f50SAlex Elder 
239237206ee5SAlex Elder 	dout("%s: img %p\n", __func__, img_request);
239346faeed4SAlex Elder 	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
2394bf0d5f50SAlex Elder 		int ret;
2395bf0d5f50SAlex Elder 
2396b454e36dSAlex Elder 		ret = rbd_img_obj_request_submit(obj_request);
2397bf0d5f50SAlex Elder 		if (ret)
2398bf0d5f50SAlex Elder 			return ret;
2399bf0d5f50SAlex Elder 	}
2400bf0d5f50SAlex Elder 
2401bf0d5f50SAlex Elder 	return 0;
2402bf0d5f50SAlex Elder }
2403bf0d5f50SAlex Elder 
24048b3e1a56SAlex Elder static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
24058b3e1a56SAlex Elder {
24068b3e1a56SAlex Elder 	struct rbd_obj_request *obj_request;
2407a9e8ba2cSAlex Elder 	struct rbd_device *rbd_dev;
2408a9e8ba2cSAlex Elder 	u64 obj_end;
24098b3e1a56SAlex Elder 
24108b3e1a56SAlex Elder 	rbd_assert(img_request_child_test(img_request));
24118b3e1a56SAlex Elder 
24128b3e1a56SAlex Elder 	obj_request = img_request->obj_request;
2413a9e8ba2cSAlex Elder 	rbd_assert(obj_request);
2414a9e8ba2cSAlex Elder 	rbd_assert(obj_request->img_request);
24158b3e1a56SAlex Elder 
2416a9e8ba2cSAlex Elder 	obj_request->result = img_request->result;
2417a9e8ba2cSAlex Elder 	if (obj_request->result)
2418a9e8ba2cSAlex Elder 		goto out;
2419a9e8ba2cSAlex Elder 
2420a9e8ba2cSAlex Elder 	/*
2421a9e8ba2cSAlex Elder 	 * We need to zero anything beyond the parent overlap
2422a9e8ba2cSAlex Elder 	 * boundary.  Since rbd_img_obj_request_read_callback()
2423a9e8ba2cSAlex Elder 	 * will zero anything beyond the end of a short read, an
2424a9e8ba2cSAlex Elder 	 * easy way to do this is to pretend the data from the
2425a9e8ba2cSAlex Elder 	 * parent came up short--ending at the overlap boundary.
2426a9e8ba2cSAlex Elder 	 */
2427a9e8ba2cSAlex Elder 	rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
2428a9e8ba2cSAlex Elder 	obj_end = obj_request->img_offset + obj_request->length;
2429a9e8ba2cSAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
2430a9e8ba2cSAlex Elder 	if (obj_end > rbd_dev->parent_overlap) {
2431a9e8ba2cSAlex Elder 		u64 xferred = 0;
2432a9e8ba2cSAlex Elder 
2433a9e8ba2cSAlex Elder 		if (obj_request->img_offset < rbd_dev->parent_overlap)
2434a9e8ba2cSAlex Elder 			xferred = rbd_dev->parent_overlap -
2435a9e8ba2cSAlex Elder 					obj_request->img_offset;
2436a9e8ba2cSAlex Elder 
2437a9e8ba2cSAlex Elder 		obj_request->xferred = min(img_request->xferred, xferred);
2438a9e8ba2cSAlex Elder 	} else {
2439a9e8ba2cSAlex Elder 		obj_request->xferred = img_request->xferred;
2440a9e8ba2cSAlex Elder 	}
2441a9e8ba2cSAlex Elder out:
24428b3e1a56SAlex Elder 	rbd_img_obj_request_read_callback(obj_request);
24438b3e1a56SAlex Elder 	rbd_obj_request_complete(obj_request);
24448b3e1a56SAlex Elder }
24458b3e1a56SAlex Elder 
24468b3e1a56SAlex Elder static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
24478b3e1a56SAlex Elder {
24488b3e1a56SAlex Elder 	struct rbd_device *rbd_dev;
24498b3e1a56SAlex Elder 	struct rbd_img_request *img_request;
24508b3e1a56SAlex Elder 	int result;
24518b3e1a56SAlex Elder 
24528b3e1a56SAlex Elder 	rbd_assert(obj_request_img_data_test(obj_request));
24538b3e1a56SAlex Elder 	rbd_assert(obj_request->img_request != NULL);
24548b3e1a56SAlex Elder 	rbd_assert(obj_request->result == (s32) -ENOENT);
24558b3e1a56SAlex Elder 	rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
24568b3e1a56SAlex Elder 
24578b3e1a56SAlex Elder 	rbd_dev = obj_request->img_request->rbd_dev;
24588b3e1a56SAlex Elder 	rbd_assert(rbd_dev->parent != NULL);
24598b3e1a56SAlex Elder 	/* rbd_read_finish(obj_request, obj_request->length); */
24608b3e1a56SAlex Elder 	img_request = rbd_img_request_create(rbd_dev->parent,
24618b3e1a56SAlex Elder 						obj_request->img_offset,
24628b3e1a56SAlex Elder 						obj_request->length,
24638b3e1a56SAlex Elder 						false, true);
24648b3e1a56SAlex Elder 	result = -ENOMEM;
24658b3e1a56SAlex Elder 	if (!img_request)
24668b3e1a56SAlex Elder 		goto out_err;
24678b3e1a56SAlex Elder 
24688b3e1a56SAlex Elder 	rbd_obj_request_get(obj_request);
24698b3e1a56SAlex Elder 	img_request->obj_request = obj_request;
24708b3e1a56SAlex Elder 
2471f1a4739fSAlex Elder 	result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
2472f1a4739fSAlex Elder 					obj_request->bio_list);
24738b3e1a56SAlex Elder 	if (result)
24748b3e1a56SAlex Elder 		goto out_err;
24758b3e1a56SAlex Elder 
24768b3e1a56SAlex Elder 	img_request->callback = rbd_img_parent_read_callback;
24778b3e1a56SAlex Elder 	result = rbd_img_request_submit(img_request);
24788b3e1a56SAlex Elder 	if (result)
24798b3e1a56SAlex Elder 		goto out_err;
24808b3e1a56SAlex Elder 
24818b3e1a56SAlex Elder 	return;
24828b3e1a56SAlex Elder out_err:
24838b3e1a56SAlex Elder 	if (img_request)
24848b3e1a56SAlex Elder 		rbd_img_request_put(img_request);
24858b3e1a56SAlex Elder 	obj_request->result = result;
24868b3e1a56SAlex Elder 	obj_request->xferred = 0;
24878b3e1a56SAlex Elder 	obj_request_done_set(obj_request);
24888b3e1a56SAlex Elder }
24898b3e1a56SAlex Elder 
2490cf81b60eSAlex Elder static int rbd_obj_notify_ack(struct rbd_device *rbd_dev,
2491b8d70035SAlex Elder 				   u64 ver, u64 notify_id)
2492b8d70035SAlex Elder {
2493b8d70035SAlex Elder 	struct rbd_obj_request *obj_request;
24942169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2495b8d70035SAlex Elder 	int ret;
2496b8d70035SAlex Elder 
2497b8d70035SAlex Elder 	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
2498b8d70035SAlex Elder 							OBJ_REQUEST_NODATA);
2499b8d70035SAlex Elder 	if (!obj_request)
2500b8d70035SAlex Elder 		return -ENOMEM;
2501b8d70035SAlex Elder 
2502b8d70035SAlex Elder 	ret = -ENOMEM;
2503430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
2504b8d70035SAlex Elder 	if (!obj_request->osd_req)
2505b8d70035SAlex Elder 		goto out;
25062169238dSAlex Elder 	obj_request->callback = rbd_obj_request_put;
2507b8d70035SAlex Elder 
2508c99d2d4aSAlex Elder 	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
2509c99d2d4aSAlex Elder 					notify_id, ver, 0);
25109d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
2511430c28c3SAlex Elder 
2512b8d70035SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
2513b8d70035SAlex Elder out:
2514cf81b60eSAlex Elder 	if (ret)
2515b8d70035SAlex Elder 		rbd_obj_request_put(obj_request);
2516b8d70035SAlex Elder 
2517b8d70035SAlex Elder 	return ret;
2518b8d70035SAlex Elder }
2519b8d70035SAlex Elder 
2520b8d70035SAlex Elder static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
2521b8d70035SAlex Elder {
2522b8d70035SAlex Elder 	struct rbd_device *rbd_dev = (struct rbd_device *)data;
2523b8d70035SAlex Elder 	u64 hver;
2524b8d70035SAlex Elder 
2525b8d70035SAlex Elder 	if (!rbd_dev)
2526b8d70035SAlex Elder 		return;
2527b8d70035SAlex Elder 
252837206ee5SAlex Elder 	dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
2529b8d70035SAlex Elder 		rbd_dev->header_name, (unsigned long long) notify_id,
2530b8d70035SAlex Elder 		(unsigned int) opcode);
2531522a0cc0SAlex Elder 	(void)rbd_dev_refresh(rbd_dev, &hver);
2532b8d70035SAlex Elder 
2533cf81b60eSAlex Elder 	rbd_obj_notify_ack(rbd_dev, hver, notify_id);
2534b8d70035SAlex Elder }
2535b8d70035SAlex Elder 
25369969ebc5SAlex Elder /*
25379969ebc5SAlex Elder  * Request sync osd watch/unwatch.  The value of "start" determines
25389969ebc5SAlex Elder  * whether a watch request is being initiated or torn down.
25399969ebc5SAlex Elder  */
25409969ebc5SAlex Elder static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
25419969ebc5SAlex Elder {
25429969ebc5SAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
25439969ebc5SAlex Elder 	struct rbd_obj_request *obj_request;
25449969ebc5SAlex Elder 	int ret;
25459969ebc5SAlex Elder 
25469969ebc5SAlex Elder 	rbd_assert(start ^ !!rbd_dev->watch_event);
25479969ebc5SAlex Elder 	rbd_assert(start ^ !!rbd_dev->watch_request);
25489969ebc5SAlex Elder 
25499969ebc5SAlex Elder 	if (start) {
25503c663bbdSAlex Elder 		ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
25519969ebc5SAlex Elder 						&rbd_dev->watch_event);
25529969ebc5SAlex Elder 		if (ret < 0)
25539969ebc5SAlex Elder 			return ret;
25548eb87565SAlex Elder 		rbd_assert(rbd_dev->watch_event != NULL);
25559969ebc5SAlex Elder 	}
25569969ebc5SAlex Elder 
25579969ebc5SAlex Elder 	ret = -ENOMEM;
25589969ebc5SAlex Elder 	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
25599969ebc5SAlex Elder 							OBJ_REQUEST_NODATA);
25609969ebc5SAlex Elder 	if (!obj_request)
25619969ebc5SAlex Elder 		goto out_cancel;
25629969ebc5SAlex Elder 
2563430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request);
2564430c28c3SAlex Elder 	if (!obj_request->osd_req)
2565430c28c3SAlex Elder 		goto out_cancel;
2566430c28c3SAlex Elder 
25678eb87565SAlex Elder 	if (start)
2568975241afSAlex Elder 		ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
25698eb87565SAlex Elder 	else
25706977c3f9SAlex Elder 		ceph_osdc_unregister_linger_request(osdc,
2571975241afSAlex Elder 					rbd_dev->watch_request->osd_req);
25722169238dSAlex Elder 
25732169238dSAlex Elder 	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
25742169238dSAlex Elder 				rbd_dev->watch_event->cookie,
25752169238dSAlex Elder 				rbd_dev->header.obj_version, start);
25769d4df01fSAlex Elder 	rbd_osd_req_format_write(obj_request);
25772169238dSAlex Elder 
25789969ebc5SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
25799969ebc5SAlex Elder 	if (ret)
25809969ebc5SAlex Elder 		goto out_cancel;
25819969ebc5SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
25829969ebc5SAlex Elder 	if (ret)
25839969ebc5SAlex Elder 		goto out_cancel;
25849969ebc5SAlex Elder 	ret = obj_request->result;
25859969ebc5SAlex Elder 	if (ret)
25869969ebc5SAlex Elder 		goto out_cancel;
25879969ebc5SAlex Elder 
25888eb87565SAlex Elder 	/*
25898eb87565SAlex Elder 	 * A watch request is set to linger, so the underlying osd
25908eb87565SAlex Elder 	 * request won't go away until we unregister it.  We retain
25918eb87565SAlex Elder 	 * a pointer to the object request during that time (in
25928eb87565SAlex Elder 	 * rbd_dev->watch_request), so we'll keep a reference to
25938eb87565SAlex Elder 	 * it.  We'll drop that reference (below) after we've
25948eb87565SAlex Elder 	 * unregistered it.
25958eb87565SAlex Elder 	 */
25968eb87565SAlex Elder 	if (start) {
25978eb87565SAlex Elder 		rbd_dev->watch_request = obj_request;
25988eb87565SAlex Elder 
25998eb87565SAlex Elder 		return 0;
26008eb87565SAlex Elder 	}
26018eb87565SAlex Elder 
26028eb87565SAlex Elder 	/* We have successfully torn down the watch request */
26038eb87565SAlex Elder 
26048eb87565SAlex Elder 	rbd_obj_request_put(rbd_dev->watch_request);
26058eb87565SAlex Elder 	rbd_dev->watch_request = NULL;
26069969ebc5SAlex Elder out_cancel:
26079969ebc5SAlex Elder 	/* Cancel the event if we're tearing down, or on error */
26089969ebc5SAlex Elder 	ceph_osdc_cancel_event(rbd_dev->watch_event);
26099969ebc5SAlex Elder 	rbd_dev->watch_event = NULL;
26109969ebc5SAlex Elder 	if (obj_request)
26119969ebc5SAlex Elder 		rbd_obj_request_put(obj_request);
26129969ebc5SAlex Elder 
26139969ebc5SAlex Elder 	return ret;
26149969ebc5SAlex Elder }
26159969ebc5SAlex Elder 
261636be9a76SAlex Elder /*
2617f40eb349SAlex Elder  * Synchronous osd object method call.  Returns the number of bytes
2618f40eb349SAlex Elder  * returned in the outbound buffer, or a negative error code.
261936be9a76SAlex Elder  */
262036be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
262136be9a76SAlex Elder 			     const char *object_name,
262236be9a76SAlex Elder 			     const char *class_name,
262336be9a76SAlex Elder 			     const char *method_name,
26244157976bSAlex Elder 			     const void *outbound,
262536be9a76SAlex Elder 			     size_t outbound_size,
26264157976bSAlex Elder 			     void *inbound,
262736be9a76SAlex Elder 			     size_t inbound_size,
262836be9a76SAlex Elder 			     u64 *version)
262936be9a76SAlex Elder {
26302169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
263136be9a76SAlex Elder 	struct rbd_obj_request *obj_request;
263236be9a76SAlex Elder 	struct page **pages;
263336be9a76SAlex Elder 	u32 page_count;
263436be9a76SAlex Elder 	int ret;
263536be9a76SAlex Elder 
263636be9a76SAlex Elder 	/*
26376010a451SAlex Elder 	 * Method calls are ultimately read operations.  The result
26386010a451SAlex Elder 	 * should placed into the inbound buffer provided.  They
26396010a451SAlex Elder 	 * also supply outbound data--parameters for the object
26406010a451SAlex Elder 	 * method.  Currently if this is present it will be a
26416010a451SAlex Elder 	 * snapshot id.
264236be9a76SAlex Elder 	 */
264336be9a76SAlex Elder 	page_count = (u32)calc_pages_for(0, inbound_size);
264436be9a76SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
264536be9a76SAlex Elder 	if (IS_ERR(pages))
264636be9a76SAlex Elder 		return PTR_ERR(pages);
264736be9a76SAlex Elder 
264836be9a76SAlex Elder 	ret = -ENOMEM;
26496010a451SAlex Elder 	obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
265036be9a76SAlex Elder 							OBJ_REQUEST_PAGES);
265136be9a76SAlex Elder 	if (!obj_request)
265236be9a76SAlex Elder 		goto out;
265336be9a76SAlex Elder 
265436be9a76SAlex Elder 	obj_request->pages = pages;
265536be9a76SAlex Elder 	obj_request->page_count = page_count;
265636be9a76SAlex Elder 
2657430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
265836be9a76SAlex Elder 	if (!obj_request->osd_req)
265936be9a76SAlex Elder 		goto out;
266036be9a76SAlex Elder 
2661c99d2d4aSAlex Elder 	osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
266204017e29SAlex Elder 					class_name, method_name);
266304017e29SAlex Elder 	if (outbound_size) {
266404017e29SAlex Elder 		struct ceph_pagelist *pagelist;
266504017e29SAlex Elder 
266604017e29SAlex Elder 		pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
266704017e29SAlex Elder 		if (!pagelist)
266804017e29SAlex Elder 			goto out;
266904017e29SAlex Elder 
267004017e29SAlex Elder 		ceph_pagelist_init(pagelist);
267104017e29SAlex Elder 		ceph_pagelist_append(pagelist, outbound, outbound_size);
267204017e29SAlex Elder 		osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
267304017e29SAlex Elder 						pagelist);
267404017e29SAlex Elder 	}
2675a4ce40a9SAlex Elder 	osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
2676a4ce40a9SAlex Elder 					obj_request->pages, inbound_size,
267744cd188dSAlex Elder 					0, false, false);
26789d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
2679430c28c3SAlex Elder 
268036be9a76SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
268136be9a76SAlex Elder 	if (ret)
268236be9a76SAlex Elder 		goto out;
268336be9a76SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
268436be9a76SAlex Elder 	if (ret)
268536be9a76SAlex Elder 		goto out;
268636be9a76SAlex Elder 
268736be9a76SAlex Elder 	ret = obj_request->result;
268836be9a76SAlex Elder 	if (ret < 0)
268936be9a76SAlex Elder 		goto out;
269057385b51SAlex Elder 
269157385b51SAlex Elder 	rbd_assert(obj_request->xferred < (u64)INT_MAX);
269257385b51SAlex Elder 	ret = (int)obj_request->xferred;
2693903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
269436be9a76SAlex Elder 	if (version)
269536be9a76SAlex Elder 		*version = obj_request->version;
269636be9a76SAlex Elder out:
269736be9a76SAlex Elder 	if (obj_request)
269836be9a76SAlex Elder 		rbd_obj_request_put(obj_request);
269936be9a76SAlex Elder 	else
270036be9a76SAlex Elder 		ceph_release_page_vector(pages, page_count);
270136be9a76SAlex Elder 
270236be9a76SAlex Elder 	return ret;
270336be9a76SAlex Elder }
270436be9a76SAlex Elder 
2705bf0d5f50SAlex Elder static void rbd_request_fn(struct request_queue *q)
2706cc344fa1SAlex Elder 		__releases(q->queue_lock) __acquires(q->queue_lock)
2707bf0d5f50SAlex Elder {
2708bf0d5f50SAlex Elder 	struct rbd_device *rbd_dev = q->queuedata;
2709bf0d5f50SAlex Elder 	bool read_only = rbd_dev->mapping.read_only;
2710bf0d5f50SAlex Elder 	struct request *rq;
2711bf0d5f50SAlex Elder 	int result;
2712bf0d5f50SAlex Elder 
2713bf0d5f50SAlex Elder 	while ((rq = blk_fetch_request(q))) {
2714bf0d5f50SAlex Elder 		bool write_request = rq_data_dir(rq) == WRITE;
2715bf0d5f50SAlex Elder 		struct rbd_img_request *img_request;
2716bf0d5f50SAlex Elder 		u64 offset;
2717bf0d5f50SAlex Elder 		u64 length;
2718bf0d5f50SAlex Elder 
2719bf0d5f50SAlex Elder 		/* Ignore any non-FS requests that filter through. */
2720bf0d5f50SAlex Elder 
2721bf0d5f50SAlex Elder 		if (rq->cmd_type != REQ_TYPE_FS) {
27224dda41d3SAlex Elder 			dout("%s: non-fs request type %d\n", __func__,
27234dda41d3SAlex Elder 				(int) rq->cmd_type);
27244dda41d3SAlex Elder 			__blk_end_request_all(rq, 0);
27254dda41d3SAlex Elder 			continue;
27264dda41d3SAlex Elder 		}
27274dda41d3SAlex Elder 
27284dda41d3SAlex Elder 		/* Ignore/skip any zero-length requests */
27294dda41d3SAlex Elder 
27304dda41d3SAlex Elder 		offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
27314dda41d3SAlex Elder 		length = (u64) blk_rq_bytes(rq);
27324dda41d3SAlex Elder 
27334dda41d3SAlex Elder 		if (!length) {
27344dda41d3SAlex Elder 			dout("%s: zero-length request\n", __func__);
2735bf0d5f50SAlex Elder 			__blk_end_request_all(rq, 0);
2736bf0d5f50SAlex Elder 			continue;
2737bf0d5f50SAlex Elder 		}
2738bf0d5f50SAlex Elder 
2739bf0d5f50SAlex Elder 		spin_unlock_irq(q->queue_lock);
2740bf0d5f50SAlex Elder 
2741bf0d5f50SAlex Elder 		/* Disallow writes to a read-only device */
2742bf0d5f50SAlex Elder 
2743bf0d5f50SAlex Elder 		if (write_request) {
2744bf0d5f50SAlex Elder 			result = -EROFS;
2745bf0d5f50SAlex Elder 			if (read_only)
2746bf0d5f50SAlex Elder 				goto end_request;
2747bf0d5f50SAlex Elder 			rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
2748bf0d5f50SAlex Elder 		}
2749bf0d5f50SAlex Elder 
27506d292906SAlex Elder 		/*
27516d292906SAlex Elder 		 * Quit early if the mapped snapshot no longer
27526d292906SAlex Elder 		 * exists.  It's still possible the snapshot will
27536d292906SAlex Elder 		 * have disappeared by the time our request arrives
27546d292906SAlex Elder 		 * at the osd, but there's no sense in sending it if
27556d292906SAlex Elder 		 * we already know.
27566d292906SAlex Elder 		 */
27576d292906SAlex Elder 		if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
2758bf0d5f50SAlex Elder 			dout("request for non-existent snapshot");
2759bf0d5f50SAlex Elder 			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
2760bf0d5f50SAlex Elder 			result = -ENXIO;
2761bf0d5f50SAlex Elder 			goto end_request;
2762bf0d5f50SAlex Elder 		}
2763bf0d5f50SAlex Elder 
2764bf0d5f50SAlex Elder 		result = -EINVAL;
2765bf0d5f50SAlex Elder 		if (WARN_ON(offset && length > U64_MAX - offset + 1))
2766bf0d5f50SAlex Elder 			goto end_request;	/* Shouldn't happen */
2767bf0d5f50SAlex Elder 
2768bf0d5f50SAlex Elder 		result = -ENOMEM;
2769bf0d5f50SAlex Elder 		img_request = rbd_img_request_create(rbd_dev, offset, length,
27709849e986SAlex Elder 							write_request, false);
2771bf0d5f50SAlex Elder 		if (!img_request)
2772bf0d5f50SAlex Elder 			goto end_request;
2773bf0d5f50SAlex Elder 
2774bf0d5f50SAlex Elder 		img_request->rq = rq;
2775bf0d5f50SAlex Elder 
2776f1a4739fSAlex Elder 		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
2777f1a4739fSAlex Elder 						rq->bio);
2778bf0d5f50SAlex Elder 		if (!result)
2779bf0d5f50SAlex Elder 			result = rbd_img_request_submit(img_request);
2780bf0d5f50SAlex Elder 		if (result)
2781bf0d5f50SAlex Elder 			rbd_img_request_put(img_request);
2782bf0d5f50SAlex Elder end_request:
2783bf0d5f50SAlex Elder 		spin_lock_irq(q->queue_lock);
2784bf0d5f50SAlex Elder 		if (result < 0) {
27857da22d29SAlex Elder 			rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
27867da22d29SAlex Elder 				write_request ? "write" : "read",
27877da22d29SAlex Elder 				length, offset, result);
27887da22d29SAlex Elder 
2789bf0d5f50SAlex Elder 			__blk_end_request_all(rq, result);
2790bf0d5f50SAlex Elder 		}
2791bf0d5f50SAlex Elder 	}
2792bf0d5f50SAlex Elder }
2793bf0d5f50SAlex Elder 
2794602adf40SYehuda Sadeh /*
2795602adf40SYehuda Sadeh  * a queue callback. Makes sure that we don't create a bio that spans across
2796602adf40SYehuda Sadeh  * multiple osd objects. One exception would be with a single page bios,
2797f7760dadSAlex Elder  * which we handle later at bio_chain_clone_range()
2798602adf40SYehuda Sadeh  */
2799602adf40SYehuda Sadeh static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
2800602adf40SYehuda Sadeh 			  struct bio_vec *bvec)
2801602adf40SYehuda Sadeh {
2802602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = q->queuedata;
2803e5cfeed2SAlex Elder 	sector_t sector_offset;
2804e5cfeed2SAlex Elder 	sector_t sectors_per_obj;
2805e5cfeed2SAlex Elder 	sector_t obj_sector_offset;
2806e5cfeed2SAlex Elder 	int ret;
2807602adf40SYehuda Sadeh 
2808e5cfeed2SAlex Elder 	/*
2809e5cfeed2SAlex Elder 	 * Find how far into its rbd object the partition-relative
2810e5cfeed2SAlex Elder 	 * bio start sector is to offset relative to the enclosing
2811e5cfeed2SAlex Elder 	 * device.
2812e5cfeed2SAlex Elder 	 */
2813e5cfeed2SAlex Elder 	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
2814e5cfeed2SAlex Elder 	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
2815e5cfeed2SAlex Elder 	obj_sector_offset = sector_offset & (sectors_per_obj - 1);
2816593a9e7bSAlex Elder 
2817e5cfeed2SAlex Elder 	/*
2818e5cfeed2SAlex Elder 	 * Compute the number of bytes from that offset to the end
2819e5cfeed2SAlex Elder 	 * of the object.  Account for what's already used by the bio.
2820e5cfeed2SAlex Elder 	 */
2821e5cfeed2SAlex Elder 	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
2822e5cfeed2SAlex Elder 	if (ret > bmd->bi_size)
2823e5cfeed2SAlex Elder 		ret -= bmd->bi_size;
2824e5cfeed2SAlex Elder 	else
2825e5cfeed2SAlex Elder 		ret = 0;
2826e5cfeed2SAlex Elder 
2827e5cfeed2SAlex Elder 	/*
2828e5cfeed2SAlex Elder 	 * Don't send back more than was asked for.  And if the bio
2829e5cfeed2SAlex Elder 	 * was empty, let the whole thing through because:  "Note
2830e5cfeed2SAlex Elder 	 * that a block device *must* allow a single page to be
2831e5cfeed2SAlex Elder 	 * added to an empty bio."
2832e5cfeed2SAlex Elder 	 */
2833e5cfeed2SAlex Elder 	rbd_assert(bvec->bv_len <= PAGE_SIZE);
2834e5cfeed2SAlex Elder 	if (ret > (int) bvec->bv_len || !bmd->bi_size)
2835e5cfeed2SAlex Elder 		ret = (int) bvec->bv_len;
2836e5cfeed2SAlex Elder 
2837e5cfeed2SAlex Elder 	return ret;
2838602adf40SYehuda Sadeh }
2839602adf40SYehuda Sadeh 
2840602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
2841602adf40SYehuda Sadeh {
2842602adf40SYehuda Sadeh 	struct gendisk *disk = rbd_dev->disk;
2843602adf40SYehuda Sadeh 
2844602adf40SYehuda Sadeh 	if (!disk)
2845602adf40SYehuda Sadeh 		return;
2846602adf40SYehuda Sadeh 
2847602adf40SYehuda Sadeh 	if (disk->flags & GENHD_FL_UP)
2848602adf40SYehuda Sadeh 		del_gendisk(disk);
2849602adf40SYehuda Sadeh 	if (disk->queue)
2850602adf40SYehuda Sadeh 		blk_cleanup_queue(disk->queue);
2851602adf40SYehuda Sadeh 	put_disk(disk);
2852602adf40SYehuda Sadeh }
2853602adf40SYehuda Sadeh 
2854788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
2855788e2df3SAlex Elder 				const char *object_name,
2856788e2df3SAlex Elder 				u64 offset, u64 length,
285780ef15bfSAlex Elder 				void *buf, u64 *version)
2858788e2df3SAlex Elder 
2859788e2df3SAlex Elder {
28602169238dSAlex Elder 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2861788e2df3SAlex Elder 	struct rbd_obj_request *obj_request;
2862788e2df3SAlex Elder 	struct page **pages = NULL;
2863788e2df3SAlex Elder 	u32 page_count;
28641ceae7efSAlex Elder 	size_t size;
2865788e2df3SAlex Elder 	int ret;
2866788e2df3SAlex Elder 
2867788e2df3SAlex Elder 	page_count = (u32) calc_pages_for(offset, length);
2868788e2df3SAlex Elder 	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2869788e2df3SAlex Elder 	if (IS_ERR(pages))
2870788e2df3SAlex Elder 		ret = PTR_ERR(pages);
2871788e2df3SAlex Elder 
2872788e2df3SAlex Elder 	ret = -ENOMEM;
2873788e2df3SAlex Elder 	obj_request = rbd_obj_request_create(object_name, offset, length,
2874788e2df3SAlex Elder 							OBJ_REQUEST_PAGES);
2875788e2df3SAlex Elder 	if (!obj_request)
2876788e2df3SAlex Elder 		goto out;
2877788e2df3SAlex Elder 
2878788e2df3SAlex Elder 	obj_request->pages = pages;
2879788e2df3SAlex Elder 	obj_request->page_count = page_count;
2880788e2df3SAlex Elder 
2881430c28c3SAlex Elder 	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
2882788e2df3SAlex Elder 	if (!obj_request->osd_req)
2883788e2df3SAlex Elder 		goto out;
2884788e2df3SAlex Elder 
2885c99d2d4aSAlex Elder 	osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
2886c99d2d4aSAlex Elder 					offset, length, 0, 0);
2887406e2c9fSAlex Elder 	osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
2888a4ce40a9SAlex Elder 					obj_request->pages,
288944cd188dSAlex Elder 					obj_request->length,
289044cd188dSAlex Elder 					obj_request->offset & ~PAGE_MASK,
289144cd188dSAlex Elder 					false, false);
28929d4df01fSAlex Elder 	rbd_osd_req_format_read(obj_request);
2893430c28c3SAlex Elder 
2894788e2df3SAlex Elder 	ret = rbd_obj_request_submit(osdc, obj_request);
2895788e2df3SAlex Elder 	if (ret)
2896788e2df3SAlex Elder 		goto out;
2897788e2df3SAlex Elder 	ret = rbd_obj_request_wait(obj_request);
2898788e2df3SAlex Elder 	if (ret)
2899788e2df3SAlex Elder 		goto out;
2900788e2df3SAlex Elder 
2901788e2df3SAlex Elder 	ret = obj_request->result;
2902788e2df3SAlex Elder 	if (ret < 0)
2903788e2df3SAlex Elder 		goto out;
29041ceae7efSAlex Elder 
29051ceae7efSAlex Elder 	rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
29061ceae7efSAlex Elder 	size = (size_t) obj_request->xferred;
2907903bb32eSAlex Elder 	ceph_copy_from_page_vector(pages, buf, 0, size);
290823ed6e13SAlex Elder 	rbd_assert(size <= (size_t) INT_MAX);
290923ed6e13SAlex Elder 	ret = (int) size;
2910788e2df3SAlex Elder 	if (version)
2911788e2df3SAlex Elder 		*version = obj_request->version;
2912788e2df3SAlex Elder out:
2913788e2df3SAlex Elder 	if (obj_request)
2914788e2df3SAlex Elder 		rbd_obj_request_put(obj_request);
2915788e2df3SAlex Elder 	else
2916788e2df3SAlex Elder 		ceph_release_page_vector(pages, page_count);
2917788e2df3SAlex Elder 
2918788e2df3SAlex Elder 	return ret;
2919788e2df3SAlex Elder }
2920788e2df3SAlex Elder 
2921602adf40SYehuda Sadeh /*
29224156d998SAlex Elder  * Read the complete header for the given rbd device.
29234156d998SAlex Elder  *
29244156d998SAlex Elder  * Returns a pointer to a dynamically-allocated buffer containing
29254156d998SAlex Elder  * the complete and validated header.  Caller can pass the address
29264156d998SAlex Elder  * of a variable that will be filled in with the version of the
29274156d998SAlex Elder  * header object at the time it was read.
29284156d998SAlex Elder  *
29294156d998SAlex Elder  * Returns a pointer-coded errno if a failure occurs.
29304156d998SAlex Elder  */
29314156d998SAlex Elder static struct rbd_image_header_ondisk *
29324156d998SAlex Elder rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
29334156d998SAlex Elder {
29344156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk = NULL;
29354156d998SAlex Elder 	u32 snap_count = 0;
29364156d998SAlex Elder 	u64 names_size = 0;
29374156d998SAlex Elder 	u32 want_count;
29384156d998SAlex Elder 	int ret;
29394156d998SAlex Elder 
29404156d998SAlex Elder 	/*
29414156d998SAlex Elder 	 * The complete header will include an array of its 64-bit
29424156d998SAlex Elder 	 * snapshot ids, followed by the names of those snapshots as
29434156d998SAlex Elder 	 * a contiguous block of NUL-terminated strings.  Note that
29444156d998SAlex Elder 	 * the number of snapshots could change by the time we read
29454156d998SAlex Elder 	 * it in, in which case we re-read it.
29464156d998SAlex Elder 	 */
29474156d998SAlex Elder 	do {
29484156d998SAlex Elder 		size_t size;
29494156d998SAlex Elder 
29504156d998SAlex Elder 		kfree(ondisk);
29514156d998SAlex Elder 
29524156d998SAlex Elder 		size = sizeof (*ondisk);
29534156d998SAlex Elder 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
29544156d998SAlex Elder 		size += names_size;
29554156d998SAlex Elder 		ondisk = kmalloc(size, GFP_KERNEL);
29564156d998SAlex Elder 		if (!ondisk)
29574156d998SAlex Elder 			return ERR_PTR(-ENOMEM);
29584156d998SAlex Elder 
2959788e2df3SAlex Elder 		ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
296080ef15bfSAlex Elder 				       0, size, ondisk, version);
29614156d998SAlex Elder 		if (ret < 0)
29624156d998SAlex Elder 			goto out_err;
29634156d998SAlex Elder 		if (WARN_ON((size_t) ret < size)) {
29644156d998SAlex Elder 			ret = -ENXIO;
296506ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
296606ecc6cbSAlex Elder 				size, ret);
29674156d998SAlex Elder 			goto out_err;
29684156d998SAlex Elder 		}
29694156d998SAlex Elder 		if (!rbd_dev_ondisk_valid(ondisk)) {
29704156d998SAlex Elder 			ret = -ENXIO;
297106ecc6cbSAlex Elder 			rbd_warn(rbd_dev, "invalid header");
29724156d998SAlex Elder 			goto out_err;
29734156d998SAlex Elder 		}
29744156d998SAlex Elder 
29754156d998SAlex Elder 		names_size = le64_to_cpu(ondisk->snap_names_len);
29764156d998SAlex Elder 		want_count = snap_count;
29774156d998SAlex Elder 		snap_count = le32_to_cpu(ondisk->snap_count);
29784156d998SAlex Elder 	} while (snap_count != want_count);
29794156d998SAlex Elder 
29804156d998SAlex Elder 	return ondisk;
29814156d998SAlex Elder 
29824156d998SAlex Elder out_err:
29834156d998SAlex Elder 	kfree(ondisk);
29844156d998SAlex Elder 
29854156d998SAlex Elder 	return ERR_PTR(ret);
29864156d998SAlex Elder }
29874156d998SAlex Elder 
29884156d998SAlex Elder /*
2989602adf40SYehuda Sadeh  * reload the ondisk the header
2990602adf40SYehuda Sadeh  */
2991602adf40SYehuda Sadeh static int rbd_read_header(struct rbd_device *rbd_dev,
2992602adf40SYehuda Sadeh 			   struct rbd_image_header *header)
2993602adf40SYehuda Sadeh {
29944156d998SAlex Elder 	struct rbd_image_header_ondisk *ondisk;
29954156d998SAlex Elder 	u64 ver = 0;
29964156d998SAlex Elder 	int ret;
2997602adf40SYehuda Sadeh 
29984156d998SAlex Elder 	ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
29994156d998SAlex Elder 	if (IS_ERR(ondisk))
30004156d998SAlex Elder 		return PTR_ERR(ondisk);
30014156d998SAlex Elder 	ret = rbd_header_from_disk(header, ondisk);
30024156d998SAlex Elder 	if (ret >= 0)
300359c2be1eSYehuda Sadeh 		header->obj_version = ver;
30044156d998SAlex Elder 	kfree(ondisk);
3005602adf40SYehuda Sadeh 
30064156d998SAlex Elder 	return ret;
3007602adf40SYehuda Sadeh }
3008602adf40SYehuda Sadeh 
300941f38c2bSAlex Elder static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
3010dfc5606dSYehuda Sadeh {
3011dfc5606dSYehuda Sadeh 	struct rbd_snap *snap;
3012a0593290SAlex Elder 	struct rbd_snap *next;
3013dfc5606dSYehuda Sadeh 
30146087b51bSAlex Elder 	list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node) {
30156087b51bSAlex Elder 		list_del(&snap->node);
30166087b51bSAlex Elder 		rbd_snap_destroy(snap);
30176087b51bSAlex Elder 	}
3018dfc5606dSYehuda Sadeh }
3019dfc5606dSYehuda Sadeh 
30209478554aSAlex Elder static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
30219478554aSAlex Elder {
30229478554aSAlex Elder 	sector_t size;
30239478554aSAlex Elder 
30240d7dbfceSAlex Elder 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
30259478554aSAlex Elder 		return;
30269478554aSAlex Elder 
30279478554aSAlex Elder 	size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
30289478554aSAlex Elder 	dout("setting size to %llu sectors", (unsigned long long) size);
30299478554aSAlex Elder 	rbd_dev->mapping.size = (u64) size;
30309478554aSAlex Elder 	set_capacity(rbd_dev->disk, size);
30319478554aSAlex Elder }
30329478554aSAlex Elder 
3033602adf40SYehuda Sadeh /*
3034602adf40SYehuda Sadeh  * only read the first part of the ondisk header, without the snaps info
3035602adf40SYehuda Sadeh  */
3036117973fbSAlex Elder static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
3037602adf40SYehuda Sadeh {
3038602adf40SYehuda Sadeh 	int ret;
3039602adf40SYehuda Sadeh 	struct rbd_image_header h;
3040602adf40SYehuda Sadeh 
3041602adf40SYehuda Sadeh 	ret = rbd_read_header(rbd_dev, &h);
3042602adf40SYehuda Sadeh 	if (ret < 0)
3043602adf40SYehuda Sadeh 		return ret;
3044602adf40SYehuda Sadeh 
3045a51aa0c0SJosh Durgin 	down_write(&rbd_dev->header_rwsem);
3046a51aa0c0SJosh Durgin 
30479478554aSAlex Elder 	/* Update image size, and check for resize of mapped image */
30489478554aSAlex Elder 	rbd_dev->header.image_size = h.image_size;
30499478554aSAlex Elder 	rbd_update_mapping_size(rbd_dev);
30509db4b3e3SSage Weil 
3051849b4260SAlex Elder 	/* rbd_dev->header.object_prefix shouldn't change */
3052602adf40SYehuda Sadeh 	kfree(rbd_dev->header.snap_sizes);
3053849b4260SAlex Elder 	kfree(rbd_dev->header.snap_names);
3054d1d25646SJosh Durgin 	/* osd requests may still refer to snapc */
3055d1d25646SJosh Durgin 	ceph_put_snap_context(rbd_dev->header.snapc);
3056602adf40SYehuda Sadeh 
3057b813623aSAlex Elder 	if (hver)
3058b813623aSAlex Elder 		*hver = h.obj_version;
3059a71b891bSJosh Durgin 	rbd_dev->header.obj_version = h.obj_version;
306093a24e08SJosh Durgin 	rbd_dev->header.image_size = h.image_size;
3061602adf40SYehuda Sadeh 	rbd_dev->header.snapc = h.snapc;
3062602adf40SYehuda Sadeh 	rbd_dev->header.snap_names = h.snap_names;
3063602adf40SYehuda Sadeh 	rbd_dev->header.snap_sizes = h.snap_sizes;
3064849b4260SAlex Elder 	/* Free the extra copy of the object prefix */
3065849b4260SAlex Elder 	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
3066849b4260SAlex Elder 	kfree(h.object_prefix);
3067849b4260SAlex Elder 
3068304f6808SAlex Elder 	ret = rbd_dev_snaps_update(rbd_dev);
3069dfc5606dSYehuda Sadeh 
3070c666601aSJosh Durgin 	up_write(&rbd_dev->header_rwsem);
3071602adf40SYehuda Sadeh 
3072dfc5606dSYehuda Sadeh 	return ret;
3073602adf40SYehuda Sadeh }
3074602adf40SYehuda Sadeh 
3075117973fbSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
30761fe5e993SAlex Elder {
30771fe5e993SAlex Elder 	int ret;
30781fe5e993SAlex Elder 
3079117973fbSAlex Elder 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
30801fe5e993SAlex Elder 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3081117973fbSAlex Elder 	if (rbd_dev->image_format == 1)
3082117973fbSAlex Elder 		ret = rbd_dev_v1_refresh(rbd_dev, hver);
3083117973fbSAlex Elder 	else
3084117973fbSAlex Elder 		ret = rbd_dev_v2_refresh(rbd_dev, hver);
30851fe5e993SAlex Elder 	mutex_unlock(&ctl_mutex);
3086d98df63eSLaurent Barbe 	revalidate_disk(rbd_dev->disk);
3087522a0cc0SAlex Elder 	if (ret)
3088522a0cc0SAlex Elder 		rbd_warn(rbd_dev, "got notification but failed to "
3089522a0cc0SAlex Elder 			   " update snaps: %d\n", ret);
30901fe5e993SAlex Elder 
30911fe5e993SAlex Elder 	return ret;
30921fe5e993SAlex Elder }
30931fe5e993SAlex Elder 
3094602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
3095602adf40SYehuda Sadeh {
3096602adf40SYehuda Sadeh 	struct gendisk *disk;
3097602adf40SYehuda Sadeh 	struct request_queue *q;
3098593a9e7bSAlex Elder 	u64 segment_size;
3099602adf40SYehuda Sadeh 
3100602adf40SYehuda Sadeh 	/* create gendisk info */
3101602adf40SYehuda Sadeh 	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
3102602adf40SYehuda Sadeh 	if (!disk)
31031fcdb8aaSAlex Elder 		return -ENOMEM;
3104602adf40SYehuda Sadeh 
3105f0f8cef5SAlex Elder 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
3106de71a297SAlex Elder 		 rbd_dev->dev_id);
3107602adf40SYehuda Sadeh 	disk->major = rbd_dev->major;
3108602adf40SYehuda Sadeh 	disk->first_minor = 0;
3109602adf40SYehuda Sadeh 	disk->fops = &rbd_bd_ops;
3110602adf40SYehuda Sadeh 	disk->private_data = rbd_dev;
3111602adf40SYehuda Sadeh 
3112bf0d5f50SAlex Elder 	q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
3113602adf40SYehuda Sadeh 	if (!q)
3114602adf40SYehuda Sadeh 		goto out_disk;
3115029bcbd8SJosh Durgin 
3116593a9e7bSAlex Elder 	/* We use the default size, but let's be explicit about it. */
3117593a9e7bSAlex Elder 	blk_queue_physical_block_size(q, SECTOR_SIZE);
3118593a9e7bSAlex Elder 
3119029bcbd8SJosh Durgin 	/* set io sizes to object size */
3120593a9e7bSAlex Elder 	segment_size = rbd_obj_bytes(&rbd_dev->header);
3121593a9e7bSAlex Elder 	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
3122593a9e7bSAlex Elder 	blk_queue_max_segment_size(q, segment_size);
3123593a9e7bSAlex Elder 	blk_queue_io_min(q, segment_size);
3124593a9e7bSAlex Elder 	blk_queue_io_opt(q, segment_size);
3125029bcbd8SJosh Durgin 
3126602adf40SYehuda Sadeh 	blk_queue_merge_bvec(q, rbd_merge_bvec);
3127602adf40SYehuda Sadeh 	disk->queue = q;
3128602adf40SYehuda Sadeh 
3129602adf40SYehuda Sadeh 	q->queuedata = rbd_dev;
3130602adf40SYehuda Sadeh 
3131602adf40SYehuda Sadeh 	rbd_dev->disk = disk;
3132602adf40SYehuda Sadeh 
313312f02944SAlex Elder 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
313412f02944SAlex Elder 
3135602adf40SYehuda Sadeh 	return 0;
3136602adf40SYehuda Sadeh out_disk:
3137602adf40SYehuda Sadeh 	put_disk(disk);
31381fcdb8aaSAlex Elder 
31391fcdb8aaSAlex Elder 	return -ENOMEM;
3140602adf40SYehuda Sadeh }
3141602adf40SYehuda Sadeh 
3142dfc5606dSYehuda Sadeh /*
3143dfc5606dSYehuda Sadeh   sysfs
3144dfc5606dSYehuda Sadeh */
3145602adf40SYehuda Sadeh 
3146593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
3147593a9e7bSAlex Elder {
3148593a9e7bSAlex Elder 	return container_of(dev, struct rbd_device, dev);
3149593a9e7bSAlex Elder }
3150593a9e7bSAlex Elder 
3151dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
3152dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3153602adf40SYehuda Sadeh {
3154593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3155a51aa0c0SJosh Durgin 	sector_t size;
3156dfc5606dSYehuda Sadeh 
3157a51aa0c0SJosh Durgin 	down_read(&rbd_dev->header_rwsem);
3158a51aa0c0SJosh Durgin 	size = get_capacity(rbd_dev->disk);
3159a51aa0c0SJosh Durgin 	up_read(&rbd_dev->header_rwsem);
3160a51aa0c0SJosh Durgin 
3161a51aa0c0SJosh Durgin 	return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
3162602adf40SYehuda Sadeh }
3163602adf40SYehuda Sadeh 
316434b13184SAlex Elder /*
316534b13184SAlex Elder  * Note this shows the features for whatever's mapped, which is not
316634b13184SAlex Elder  * necessarily the base image.
316734b13184SAlex Elder  */
316834b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev,
316934b13184SAlex Elder 			     struct device_attribute *attr, char *buf)
317034b13184SAlex Elder {
317134b13184SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
317234b13184SAlex Elder 
317334b13184SAlex Elder 	return sprintf(buf, "0x%016llx\n",
317434b13184SAlex Elder 			(unsigned long long) rbd_dev->mapping.features);
317534b13184SAlex Elder }
317634b13184SAlex Elder 
3177dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
3178dfc5606dSYehuda Sadeh 			      struct device_attribute *attr, char *buf)
3179602adf40SYehuda Sadeh {
3180593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3181dfc5606dSYehuda Sadeh 
3182dfc5606dSYehuda Sadeh 	return sprintf(buf, "%d\n", rbd_dev->major);
3183dfc5606dSYehuda Sadeh }
3184dfc5606dSYehuda Sadeh 
3185dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
3186dfc5606dSYehuda Sadeh 				  struct device_attribute *attr, char *buf)
3187dfc5606dSYehuda Sadeh {
3188593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3189dfc5606dSYehuda Sadeh 
31901dbb4399SAlex Elder 	return sprintf(buf, "client%lld\n",
31911dbb4399SAlex Elder 			ceph_client_id(rbd_dev->rbd_client->client));
3192dfc5606dSYehuda Sadeh }
3193dfc5606dSYehuda Sadeh 
3194dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
3195dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3196dfc5606dSYehuda Sadeh {
3197593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3198dfc5606dSYehuda Sadeh 
31990d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
3200dfc5606dSYehuda Sadeh }
3201dfc5606dSYehuda Sadeh 
32029bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
32039bb2f334SAlex Elder 			     struct device_attribute *attr, char *buf)
32049bb2f334SAlex Elder {
32059bb2f334SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
32069bb2f334SAlex Elder 
32070d7dbfceSAlex Elder 	return sprintf(buf, "%llu\n",
32080d7dbfceSAlex Elder 		(unsigned long long) rbd_dev->spec->pool_id);
32099bb2f334SAlex Elder }
32109bb2f334SAlex Elder 
3211dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
3212dfc5606dSYehuda Sadeh 			     struct device_attribute *attr, char *buf)
3213dfc5606dSYehuda Sadeh {
3214593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3215dfc5606dSYehuda Sadeh 
3216a92ffdf8SAlex Elder 	if (rbd_dev->spec->image_name)
32170d7dbfceSAlex Elder 		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
3218a92ffdf8SAlex Elder 
3219a92ffdf8SAlex Elder 	return sprintf(buf, "(unknown)\n");
3220dfc5606dSYehuda Sadeh }
3221dfc5606dSYehuda Sadeh 
3222589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev,
3223589d30e0SAlex Elder 			     struct device_attribute *attr, char *buf)
3224589d30e0SAlex Elder {
3225589d30e0SAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3226589d30e0SAlex Elder 
32270d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
3228589d30e0SAlex Elder }
3229589d30e0SAlex Elder 
323034b13184SAlex Elder /*
323134b13184SAlex Elder  * Shows the name of the currently-mapped snapshot (or
323234b13184SAlex Elder  * RBD_SNAP_HEAD_NAME for the base image).
323334b13184SAlex Elder  */
3234dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
3235dfc5606dSYehuda Sadeh 			     struct device_attribute *attr,
3236dfc5606dSYehuda Sadeh 			     char *buf)
3237dfc5606dSYehuda Sadeh {
3238593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3239dfc5606dSYehuda Sadeh 
32400d7dbfceSAlex Elder 	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
3241dfc5606dSYehuda Sadeh }
3242dfc5606dSYehuda Sadeh 
324386b00e0dSAlex Elder /*
324486b00e0dSAlex Elder  * For an rbd v2 image, shows the pool id, image id, and snapshot id
324586b00e0dSAlex Elder  * for the parent image.  If there is no parent, simply shows
324686b00e0dSAlex Elder  * "(no parent image)".
324786b00e0dSAlex Elder  */
324886b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev,
324986b00e0dSAlex Elder 			     struct device_attribute *attr,
325086b00e0dSAlex Elder 			     char *buf)
325186b00e0dSAlex Elder {
325286b00e0dSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
325386b00e0dSAlex Elder 	struct rbd_spec *spec = rbd_dev->parent_spec;
325486b00e0dSAlex Elder 	int count;
325586b00e0dSAlex Elder 	char *bufp = buf;
325686b00e0dSAlex Elder 
325786b00e0dSAlex Elder 	if (!spec)
325886b00e0dSAlex Elder 		return sprintf(buf, "(no parent image)\n");
325986b00e0dSAlex Elder 
326086b00e0dSAlex Elder 	count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
326186b00e0dSAlex Elder 			(unsigned long long) spec->pool_id, spec->pool_name);
326286b00e0dSAlex Elder 	if (count < 0)
326386b00e0dSAlex Elder 		return count;
326486b00e0dSAlex Elder 	bufp += count;
326586b00e0dSAlex Elder 
326686b00e0dSAlex Elder 	count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
326786b00e0dSAlex Elder 			spec->image_name ? spec->image_name : "(unknown)");
326886b00e0dSAlex Elder 	if (count < 0)
326986b00e0dSAlex Elder 		return count;
327086b00e0dSAlex Elder 	bufp += count;
327186b00e0dSAlex Elder 
327286b00e0dSAlex Elder 	count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
327386b00e0dSAlex Elder 			(unsigned long long) spec->snap_id, spec->snap_name);
327486b00e0dSAlex Elder 	if (count < 0)
327586b00e0dSAlex Elder 		return count;
327686b00e0dSAlex Elder 	bufp += count;
327786b00e0dSAlex Elder 
327886b00e0dSAlex Elder 	count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
327986b00e0dSAlex Elder 	if (count < 0)
328086b00e0dSAlex Elder 		return count;
328186b00e0dSAlex Elder 	bufp += count;
328286b00e0dSAlex Elder 
328386b00e0dSAlex Elder 	return (ssize_t) (bufp - buf);
328486b00e0dSAlex Elder }
328586b00e0dSAlex Elder 
3286dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
3287dfc5606dSYehuda Sadeh 				 struct device_attribute *attr,
3288dfc5606dSYehuda Sadeh 				 const char *buf,
3289dfc5606dSYehuda Sadeh 				 size_t size)
3290dfc5606dSYehuda Sadeh {
3291593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3292b813623aSAlex Elder 	int ret;
3293602adf40SYehuda Sadeh 
3294117973fbSAlex Elder 	ret = rbd_dev_refresh(rbd_dev, NULL);
3295b813623aSAlex Elder 
3296b813623aSAlex Elder 	return ret < 0 ? ret : size;
3297dfc5606dSYehuda Sadeh }
3298602adf40SYehuda Sadeh 
3299dfc5606dSYehuda Sadeh static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
330034b13184SAlex Elder static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
3301dfc5606dSYehuda Sadeh static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
3302dfc5606dSYehuda Sadeh static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
3303dfc5606dSYehuda Sadeh static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
33049bb2f334SAlex Elder static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
3305dfc5606dSYehuda Sadeh static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
3306589d30e0SAlex Elder static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
3307dfc5606dSYehuda Sadeh static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
3308dfc5606dSYehuda Sadeh static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
330986b00e0dSAlex Elder static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
3310dfc5606dSYehuda Sadeh 
3311dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = {
3312dfc5606dSYehuda Sadeh 	&dev_attr_size.attr,
331334b13184SAlex Elder 	&dev_attr_features.attr,
3314dfc5606dSYehuda Sadeh 	&dev_attr_major.attr,
3315dfc5606dSYehuda Sadeh 	&dev_attr_client_id.attr,
3316dfc5606dSYehuda Sadeh 	&dev_attr_pool.attr,
33179bb2f334SAlex Elder 	&dev_attr_pool_id.attr,
3318dfc5606dSYehuda Sadeh 	&dev_attr_name.attr,
3319589d30e0SAlex Elder 	&dev_attr_image_id.attr,
3320dfc5606dSYehuda Sadeh 	&dev_attr_current_snap.attr,
332186b00e0dSAlex Elder 	&dev_attr_parent.attr,
3322dfc5606dSYehuda Sadeh 	&dev_attr_refresh.attr,
3323dfc5606dSYehuda Sadeh 	NULL
3324dfc5606dSYehuda Sadeh };
3325dfc5606dSYehuda Sadeh 
3326dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = {
3327dfc5606dSYehuda Sadeh 	.attrs = rbd_attrs,
3328dfc5606dSYehuda Sadeh };
3329dfc5606dSYehuda Sadeh 
3330dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = {
3331dfc5606dSYehuda Sadeh 	&rbd_attr_group,
3332dfc5606dSYehuda Sadeh 	NULL
3333dfc5606dSYehuda Sadeh };
3334dfc5606dSYehuda Sadeh 
/*
 * Release callback installed in rbd_device_type.  It deliberately
 * does nothing.
 * NOTE(review): presumably the rbd_device is freed through another
 * path (rbd_dev_destroy() does a kfree) -- confirm that the lifetime
 * is consistent with the embedded struct device's reference count.
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
	/* nothing to do */
}
3338dfc5606dSYehuda Sadeh 
3339dfc5606dSYehuda Sadeh static struct device_type rbd_device_type = {
3340dfc5606dSYehuda Sadeh 	.name		= "rbd",
3341dfc5606dSYehuda Sadeh 	.groups		= rbd_attr_groups,
3342dfc5606dSYehuda Sadeh 	.release	= rbd_sysfs_dev_release,
3343dfc5606dSYehuda Sadeh };
3344dfc5606dSYehuda Sadeh 
33458b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
33468b8fb99cSAlex Elder {
33478b8fb99cSAlex Elder 	kref_get(&spec->kref);
33488b8fb99cSAlex Elder 
33498b8fb99cSAlex Elder 	return spec;
33508b8fb99cSAlex Elder }
33518b8fb99cSAlex Elder 
33528b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref);
33538b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec)
33548b8fb99cSAlex Elder {
33558b8fb99cSAlex Elder 	if (spec)
33568b8fb99cSAlex Elder 		kref_put(&spec->kref, rbd_spec_free);
33578b8fb99cSAlex Elder }
33588b8fb99cSAlex Elder 
33598b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void)
33608b8fb99cSAlex Elder {
33618b8fb99cSAlex Elder 	struct rbd_spec *spec;
33628b8fb99cSAlex Elder 
33638b8fb99cSAlex Elder 	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
33648b8fb99cSAlex Elder 	if (!spec)
33658b8fb99cSAlex Elder 		return NULL;
33668b8fb99cSAlex Elder 	kref_init(&spec->kref);
33678b8fb99cSAlex Elder 
33688b8fb99cSAlex Elder 	return spec;
33698b8fb99cSAlex Elder }
33708b8fb99cSAlex Elder 
33718b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref)
33728b8fb99cSAlex Elder {
33738b8fb99cSAlex Elder 	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
33748b8fb99cSAlex Elder 
33758b8fb99cSAlex Elder 	kfree(spec->pool_name);
33768b8fb99cSAlex Elder 	kfree(spec->image_id);
33778b8fb99cSAlex Elder 	kfree(spec->image_name);
33788b8fb99cSAlex Elder 	kfree(spec->snap_name);
33798b8fb99cSAlex Elder 	kfree(spec);
33808b8fb99cSAlex Elder }
33818b8fb99cSAlex Elder 
3382cc344fa1SAlex Elder static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
3383c53d5893SAlex Elder 				struct rbd_spec *spec)
3384c53d5893SAlex Elder {
3385c53d5893SAlex Elder 	struct rbd_device *rbd_dev;
3386c53d5893SAlex Elder 
3387c53d5893SAlex Elder 	rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
3388c53d5893SAlex Elder 	if (!rbd_dev)
3389c53d5893SAlex Elder 		return NULL;
3390c53d5893SAlex Elder 
3391c53d5893SAlex Elder 	spin_lock_init(&rbd_dev->lock);
33926d292906SAlex Elder 	rbd_dev->flags = 0;
3393c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->node);
3394c53d5893SAlex Elder 	INIT_LIST_HEAD(&rbd_dev->snaps);
3395c53d5893SAlex Elder 	init_rwsem(&rbd_dev->header_rwsem);
3396c53d5893SAlex Elder 
3397c53d5893SAlex Elder 	rbd_dev->spec = spec;
3398c53d5893SAlex Elder 	rbd_dev->rbd_client = rbdc;
3399c53d5893SAlex Elder 
34000903e875SAlex Elder 	/* Initialize the layout used for all rbd requests */
34010903e875SAlex Elder 
34020903e875SAlex Elder 	rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
34030903e875SAlex Elder 	rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
34040903e875SAlex Elder 	rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
34050903e875SAlex Elder 	rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
34060903e875SAlex Elder 
3407c53d5893SAlex Elder 	return rbd_dev;
3408c53d5893SAlex Elder }
3409c53d5893SAlex Elder 
3410c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev)
3411c53d5893SAlex Elder {
341286b00e0dSAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
3413c53d5893SAlex Elder 	kfree(rbd_dev->header_name);
3414c53d5893SAlex Elder 	rbd_put_client(rbd_dev->rbd_client);
3415c53d5893SAlex Elder 	rbd_spec_put(rbd_dev->spec);
3416c53d5893SAlex Elder 	kfree(rbd_dev);
3417c53d5893SAlex Elder }
3418c53d5893SAlex Elder 
34196087b51bSAlex Elder static void rbd_snap_destroy(struct rbd_snap *snap)
3420dfc5606dSYehuda Sadeh {
34213e83b65bSAlex Elder 	kfree(snap->name);
34223e83b65bSAlex Elder 	kfree(snap);
3423dfc5606dSYehuda Sadeh }
3424dfc5606dSYehuda Sadeh 
34256087b51bSAlex Elder static struct rbd_snap *rbd_snap_create(struct rbd_device *rbd_dev,
3426c8d18425SAlex Elder 						const char *snap_name,
342734b13184SAlex Elder 						u64 snap_id, u64 snap_size,
342834b13184SAlex Elder 						u64 snap_features)
3429dfc5606dSYehuda Sadeh {
34304e891e0aSAlex Elder 	struct rbd_snap *snap;
34314e891e0aSAlex Elder 
34324e891e0aSAlex Elder 	snap = kzalloc(sizeof (*snap), GFP_KERNEL);
3433dfc5606dSYehuda Sadeh 	if (!snap)
34344e891e0aSAlex Elder 		return ERR_PTR(-ENOMEM);
34354e891e0aSAlex Elder 
34366e584f52SAlex Elder 	snap->name = snap_name;
3437c8d18425SAlex Elder 	snap->id = snap_id;
3438c8d18425SAlex Elder 	snap->size = snap_size;
343934b13184SAlex Elder 	snap->features = snap_features;
34404e891e0aSAlex Elder 
34414e891e0aSAlex Elder 	return snap;
3442dfc5606dSYehuda Sadeh }
3443dfc5606dSYehuda Sadeh 
34446e584f52SAlex Elder /*
34456e584f52SAlex Elder  * Returns a dynamically-allocated snapshot name if successful, or a
34466e584f52SAlex Elder  * pointer-coded error otherwise.
34476e584f52SAlex Elder  */
3448cd892126SAlex Elder static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
3449cd892126SAlex Elder 		u64 *snap_size, u64 *snap_features)
3450cd892126SAlex Elder {
3451cd892126SAlex Elder 	char *snap_name;
34526e584f52SAlex Elder 	int i;
3453cd892126SAlex Elder 
3454cd892126SAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
3455cd892126SAlex Elder 
3456cd892126SAlex Elder 	/* Skip over names until we find the one we are looking for */
3457cd892126SAlex Elder 
3458cd892126SAlex Elder 	snap_name = rbd_dev->header.snap_names;
34596e584f52SAlex Elder 	for (i = 0; i < which; i++)
3460cd892126SAlex Elder 		snap_name += strlen(snap_name) + 1;
3461cd892126SAlex Elder 
34626e584f52SAlex Elder 	snap_name = kstrdup(snap_name, GFP_KERNEL);
34636e584f52SAlex Elder 	if (!snap_name)
34646e584f52SAlex Elder 		return ERR_PTR(-ENOMEM);
34656e584f52SAlex Elder 
34666e584f52SAlex Elder 	*snap_size = rbd_dev->header.snap_sizes[which];
34676e584f52SAlex Elder 	*snap_features = 0;	/* No features for v1 */
34686e584f52SAlex Elder 
3469cd892126SAlex Elder 	return snap_name;
3470cd892126SAlex Elder }
3471cd892126SAlex Elder 
3472dfc5606dSYehuda Sadeh /*
34739d475de5SAlex Elder  * Get the size and object order for an image snapshot, or if
34749d475de5SAlex Elder  * snap_id is CEPH_NOSNAP, gets this information for the base
34759d475de5SAlex Elder  * image.
34769d475de5SAlex Elder  */
34779d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
34789d475de5SAlex Elder 				u8 *order, u64 *snap_size)
34799d475de5SAlex Elder {
34809d475de5SAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
34819d475de5SAlex Elder 	int ret;
34829d475de5SAlex Elder 	struct {
34839d475de5SAlex Elder 		u8 order;
34849d475de5SAlex Elder 		__le64 size;
34859d475de5SAlex Elder 	} __attribute__ ((packed)) size_buf = { 0 };
34869d475de5SAlex Elder 
348736be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
34889d475de5SAlex Elder 				"rbd", "get_size",
34894157976bSAlex Elder 				&snapid, sizeof (snapid),
34904157976bSAlex Elder 				&size_buf, sizeof (size_buf), NULL);
349136be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
34929d475de5SAlex Elder 	if (ret < 0)
34939d475de5SAlex Elder 		return ret;
349457385b51SAlex Elder 	if (ret < sizeof (size_buf))
349557385b51SAlex Elder 		return -ERANGE;
34969d475de5SAlex Elder 
3497c86f86e9SAlex Elder 	if (order)
34989d475de5SAlex Elder 		*order = size_buf.order;
34999d475de5SAlex Elder 	*snap_size = le64_to_cpu(size_buf.size);
35009d475de5SAlex Elder 
35019d475de5SAlex Elder 	dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
35029d475de5SAlex Elder 		(unsigned long long)snap_id, (unsigned int)*order,
35039d475de5SAlex Elder 		(unsigned long long)*snap_size);
35049d475de5SAlex Elder 
35059d475de5SAlex Elder 	return 0;
35069d475de5SAlex Elder }
35079d475de5SAlex Elder 
35089d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
35099d475de5SAlex Elder {
35109d475de5SAlex Elder 	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
35119d475de5SAlex Elder 					&rbd_dev->header.obj_order,
35129d475de5SAlex Elder 					&rbd_dev->header.image_size);
35139d475de5SAlex Elder }
35149d475de5SAlex Elder 
35151e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
35161e130199SAlex Elder {
35171e130199SAlex Elder 	void *reply_buf;
35181e130199SAlex Elder 	int ret;
35191e130199SAlex Elder 	void *p;
35201e130199SAlex Elder 
35211e130199SAlex Elder 	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
35221e130199SAlex Elder 	if (!reply_buf)
35231e130199SAlex Elder 		return -ENOMEM;
35241e130199SAlex Elder 
352536be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
35264157976bSAlex Elder 				"rbd", "get_object_prefix", NULL, 0,
352707b2391fSAlex Elder 				reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
352836be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
35291e130199SAlex Elder 	if (ret < 0)
35301e130199SAlex Elder 		goto out;
35311e130199SAlex Elder 
35321e130199SAlex Elder 	p = reply_buf;
35331e130199SAlex Elder 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
353457385b51SAlex Elder 						p + ret, NULL, GFP_NOIO);
353557385b51SAlex Elder 	ret = 0;
35361e130199SAlex Elder 
35371e130199SAlex Elder 	if (IS_ERR(rbd_dev->header.object_prefix)) {
35381e130199SAlex Elder 		ret = PTR_ERR(rbd_dev->header.object_prefix);
35391e130199SAlex Elder 		rbd_dev->header.object_prefix = NULL;
35401e130199SAlex Elder 	} else {
35411e130199SAlex Elder 		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
35421e130199SAlex Elder 	}
35431e130199SAlex Elder out:
35441e130199SAlex Elder 	kfree(reply_buf);
35451e130199SAlex Elder 
35461e130199SAlex Elder 	return ret;
35471e130199SAlex Elder }
35481e130199SAlex Elder 
3549b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
3550b1b5402aSAlex Elder 		u64 *snap_features)
3551b1b5402aSAlex Elder {
3552b1b5402aSAlex Elder 	__le64 snapid = cpu_to_le64(snap_id);
3553b1b5402aSAlex Elder 	struct {
3554b1b5402aSAlex Elder 		__le64 features;
3555b1b5402aSAlex Elder 		__le64 incompat;
35564157976bSAlex Elder 	} __attribute__ ((packed)) features_buf = { 0 };
3557d889140cSAlex Elder 	u64 incompat;
3558b1b5402aSAlex Elder 	int ret;
3559b1b5402aSAlex Elder 
356036be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3561b1b5402aSAlex Elder 				"rbd", "get_features",
35624157976bSAlex Elder 				&snapid, sizeof (snapid),
35634157976bSAlex Elder 				&features_buf, sizeof (features_buf), NULL);
356436be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3565b1b5402aSAlex Elder 	if (ret < 0)
3566b1b5402aSAlex Elder 		return ret;
356757385b51SAlex Elder 	if (ret < sizeof (features_buf))
356857385b51SAlex Elder 		return -ERANGE;
3569d889140cSAlex Elder 
3570d889140cSAlex Elder 	incompat = le64_to_cpu(features_buf.incompat);
35715cbf6f12SAlex Elder 	if (incompat & ~RBD_FEATURES_SUPPORTED)
3572b8f5c6edSAlex Elder 		return -ENXIO;
3573d889140cSAlex Elder 
3574b1b5402aSAlex Elder 	*snap_features = le64_to_cpu(features_buf.features);
3575b1b5402aSAlex Elder 
3576b1b5402aSAlex Elder 	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
3577b1b5402aSAlex Elder 		(unsigned long long)snap_id,
3578b1b5402aSAlex Elder 		(unsigned long long)*snap_features,
3579b1b5402aSAlex Elder 		(unsigned long long)le64_to_cpu(features_buf.incompat));
3580b1b5402aSAlex Elder 
3581b1b5402aSAlex Elder 	return 0;
3582b1b5402aSAlex Elder }
3583b1b5402aSAlex Elder 
3584b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
3585b1b5402aSAlex Elder {
3586b1b5402aSAlex Elder 	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
3587b1b5402aSAlex Elder 						&rbd_dev->header.features);
3588b1b5402aSAlex Elder }
3589b1b5402aSAlex Elder 
359086b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
359186b00e0dSAlex Elder {
359286b00e0dSAlex Elder 	struct rbd_spec *parent_spec;
359386b00e0dSAlex Elder 	size_t size;
359486b00e0dSAlex Elder 	void *reply_buf = NULL;
359586b00e0dSAlex Elder 	__le64 snapid;
359686b00e0dSAlex Elder 	void *p;
359786b00e0dSAlex Elder 	void *end;
359886b00e0dSAlex Elder 	char *image_id;
359986b00e0dSAlex Elder 	u64 overlap;
360086b00e0dSAlex Elder 	int ret;
360186b00e0dSAlex Elder 
360286b00e0dSAlex Elder 	parent_spec = rbd_spec_alloc();
360386b00e0dSAlex Elder 	if (!parent_spec)
360486b00e0dSAlex Elder 		return -ENOMEM;
360586b00e0dSAlex Elder 
360686b00e0dSAlex Elder 	size = sizeof (__le64) +				/* pool_id */
360786b00e0dSAlex Elder 		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
360886b00e0dSAlex Elder 		sizeof (__le64) +				/* snap_id */
360986b00e0dSAlex Elder 		sizeof (__le64);				/* overlap */
361086b00e0dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
361186b00e0dSAlex Elder 	if (!reply_buf) {
361286b00e0dSAlex Elder 		ret = -ENOMEM;
361386b00e0dSAlex Elder 		goto out_err;
361486b00e0dSAlex Elder 	}
361586b00e0dSAlex Elder 
361686b00e0dSAlex Elder 	snapid = cpu_to_le64(CEPH_NOSNAP);
361736be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
361886b00e0dSAlex Elder 				"rbd", "get_parent",
36194157976bSAlex Elder 				&snapid, sizeof (snapid),
36204157976bSAlex Elder 				reply_buf, size, NULL);
362136be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
362286b00e0dSAlex Elder 	if (ret < 0)
362386b00e0dSAlex Elder 		goto out_err;
362486b00e0dSAlex Elder 
362586b00e0dSAlex Elder 	p = reply_buf;
362657385b51SAlex Elder 	end = reply_buf + ret;
362757385b51SAlex Elder 	ret = -ERANGE;
362886b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
362986b00e0dSAlex Elder 	if (parent_spec->pool_id == CEPH_NOPOOL)
363086b00e0dSAlex Elder 		goto out;	/* No parent?  No problem. */
363186b00e0dSAlex Elder 
36320903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
36330903e875SAlex Elder 
36340903e875SAlex Elder 	ret = -EIO;
36350903e875SAlex Elder 	if (WARN_ON(parent_spec->pool_id > (u64)U32_MAX))
363657385b51SAlex Elder 		goto out_err;
36370903e875SAlex Elder 
3638979ed480SAlex Elder 	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
363986b00e0dSAlex Elder 	if (IS_ERR(image_id)) {
364086b00e0dSAlex Elder 		ret = PTR_ERR(image_id);
364186b00e0dSAlex Elder 		goto out_err;
364286b00e0dSAlex Elder 	}
364386b00e0dSAlex Elder 	parent_spec->image_id = image_id;
364486b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
364586b00e0dSAlex Elder 	ceph_decode_64_safe(&p, end, overlap, out_err);
364686b00e0dSAlex Elder 
364786b00e0dSAlex Elder 	rbd_dev->parent_overlap = overlap;
364886b00e0dSAlex Elder 	rbd_dev->parent_spec = parent_spec;
364986b00e0dSAlex Elder 	parent_spec = NULL;	/* rbd_dev now owns this */
365086b00e0dSAlex Elder out:
365186b00e0dSAlex Elder 	ret = 0;
365286b00e0dSAlex Elder out_err:
365386b00e0dSAlex Elder 	kfree(reply_buf);
365486b00e0dSAlex Elder 	rbd_spec_put(parent_spec);
365586b00e0dSAlex Elder 
365686b00e0dSAlex Elder 	return ret;
365786b00e0dSAlex Elder }
365886b00e0dSAlex Elder 
3659cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
3660cc070d59SAlex Elder {
3661cc070d59SAlex Elder 	struct {
3662cc070d59SAlex Elder 		__le64 stripe_unit;
3663cc070d59SAlex Elder 		__le64 stripe_count;
3664cc070d59SAlex Elder 	} __attribute__ ((packed)) striping_info_buf = { 0 };
3665cc070d59SAlex Elder 	size_t size = sizeof (striping_info_buf);
3666cc070d59SAlex Elder 	void *p;
3667cc070d59SAlex Elder 	u64 obj_size;
3668cc070d59SAlex Elder 	u64 stripe_unit;
3669cc070d59SAlex Elder 	u64 stripe_count;
3670cc070d59SAlex Elder 	int ret;
3671cc070d59SAlex Elder 
3672cc070d59SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3673cc070d59SAlex Elder 				"rbd", "get_stripe_unit_count", NULL, 0,
3674cc070d59SAlex Elder 				(char *)&striping_info_buf, size, NULL);
3675cc070d59SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3676cc070d59SAlex Elder 	if (ret < 0)
3677cc070d59SAlex Elder 		return ret;
3678cc070d59SAlex Elder 	if (ret < size)
3679cc070d59SAlex Elder 		return -ERANGE;
3680cc070d59SAlex Elder 
3681cc070d59SAlex Elder 	/*
3682cc070d59SAlex Elder 	 * We don't actually support the "fancy striping" feature
3683cc070d59SAlex Elder 	 * (STRIPINGV2) yet, but if the striping sizes are the
3684cc070d59SAlex Elder 	 * defaults the behavior is the same as before.  So find
3685cc070d59SAlex Elder 	 * out, and only fail if the image has non-default values.
3686cc070d59SAlex Elder 	 */
3687cc070d59SAlex Elder 	ret = -EINVAL;
3688cc070d59SAlex Elder 	obj_size = (u64)1 << rbd_dev->header.obj_order;
3689cc070d59SAlex Elder 	p = &striping_info_buf;
3690cc070d59SAlex Elder 	stripe_unit = ceph_decode_64(&p);
3691cc070d59SAlex Elder 	if (stripe_unit != obj_size) {
3692cc070d59SAlex Elder 		rbd_warn(rbd_dev, "unsupported stripe unit "
3693cc070d59SAlex Elder 				"(got %llu want %llu)",
3694cc070d59SAlex Elder 				stripe_unit, obj_size);
3695cc070d59SAlex Elder 		return -EINVAL;
3696cc070d59SAlex Elder 	}
3697cc070d59SAlex Elder 	stripe_count = ceph_decode_64(&p);
3698cc070d59SAlex Elder 	if (stripe_count != 1) {
3699cc070d59SAlex Elder 		rbd_warn(rbd_dev, "unsupported stripe count "
3700cc070d59SAlex Elder 				"(got %llu want 1)", stripe_count);
3701cc070d59SAlex Elder 		return -EINVAL;
3702cc070d59SAlex Elder 	}
3703cc070d59SAlex Elder 	rbd_dev->stripe_unit = stripe_unit;
3704cc070d59SAlex Elder 	rbd_dev->stripe_count = stripe_count;
3705cc070d59SAlex Elder 
3706cc070d59SAlex Elder 	return 0;
3707cc070d59SAlex Elder }
3708cc070d59SAlex Elder 
37099e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
37109e15b77dSAlex Elder {
37119e15b77dSAlex Elder 	size_t image_id_size;
37129e15b77dSAlex Elder 	char *image_id;
37139e15b77dSAlex Elder 	void *p;
37149e15b77dSAlex Elder 	void *end;
37159e15b77dSAlex Elder 	size_t size;
37169e15b77dSAlex Elder 	void *reply_buf = NULL;
37179e15b77dSAlex Elder 	size_t len = 0;
37189e15b77dSAlex Elder 	char *image_name = NULL;
37199e15b77dSAlex Elder 	int ret;
37209e15b77dSAlex Elder 
37219e15b77dSAlex Elder 	rbd_assert(!rbd_dev->spec->image_name);
37229e15b77dSAlex Elder 
372369e7a02fSAlex Elder 	len = strlen(rbd_dev->spec->image_id);
372469e7a02fSAlex Elder 	image_id_size = sizeof (__le32) + len;
37259e15b77dSAlex Elder 	image_id = kmalloc(image_id_size, GFP_KERNEL);
37269e15b77dSAlex Elder 	if (!image_id)
37279e15b77dSAlex Elder 		return NULL;
37289e15b77dSAlex Elder 
37299e15b77dSAlex Elder 	p = image_id;
37304157976bSAlex Elder 	end = image_id + image_id_size;
373169e7a02fSAlex Elder 	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
37329e15b77dSAlex Elder 
37339e15b77dSAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
37349e15b77dSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
37359e15b77dSAlex Elder 	if (!reply_buf)
37369e15b77dSAlex Elder 		goto out;
37379e15b77dSAlex Elder 
373836be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
37399e15b77dSAlex Elder 				"rbd", "dir_get_name",
37409e15b77dSAlex Elder 				image_id, image_id_size,
37414157976bSAlex Elder 				reply_buf, size, NULL);
37429e15b77dSAlex Elder 	if (ret < 0)
37439e15b77dSAlex Elder 		goto out;
37449e15b77dSAlex Elder 	p = reply_buf;
3745f40eb349SAlex Elder 	end = reply_buf + ret;
3746f40eb349SAlex Elder 
37479e15b77dSAlex Elder 	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
37489e15b77dSAlex Elder 	if (IS_ERR(image_name))
37499e15b77dSAlex Elder 		image_name = NULL;
37509e15b77dSAlex Elder 	else
37519e15b77dSAlex Elder 		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
37529e15b77dSAlex Elder out:
37539e15b77dSAlex Elder 	kfree(reply_buf);
37549e15b77dSAlex Elder 	kfree(image_id);
37559e15b77dSAlex Elder 
37569e15b77dSAlex Elder 	return image_name;
37579e15b77dSAlex Elder }
37589e15b77dSAlex Elder 
37599e15b77dSAlex Elder /*
37609e15b77dSAlex Elder  * When a parent image gets probed, we only have the pool, image,
37619e15b77dSAlex Elder  * and snapshot ids but not the names of any of them.  This call
37629e15b77dSAlex Elder  * is made later to fill in those names.  It has to be done after
37639e15b77dSAlex Elder  * rbd_dev_snaps_update() has completed because some of the
37649e15b77dSAlex Elder  * information (in particular, snapshot name) is not available
37659e15b77dSAlex Elder  * until then.
37669e15b77dSAlex Elder  */
37679e15b77dSAlex Elder static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
37689e15b77dSAlex Elder {
37699e15b77dSAlex Elder 	struct ceph_osd_client *osdc;
37709e15b77dSAlex Elder 	const char *name;
37719e15b77dSAlex Elder 	void *reply_buf = NULL;
37729e15b77dSAlex Elder 	int ret;
37739e15b77dSAlex Elder 
37749e15b77dSAlex Elder 	if (rbd_dev->spec->pool_name)
37759e15b77dSAlex Elder 		return 0;	/* Already have the names */
37769e15b77dSAlex Elder 
37779e15b77dSAlex Elder 	/* Look up the pool name */
37789e15b77dSAlex Elder 
37799e15b77dSAlex Elder 	osdc = &rbd_dev->rbd_client->client->osdc;
37809e15b77dSAlex Elder 	name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
3781935dc89fSAlex Elder 	if (!name) {
3782935dc89fSAlex Elder 		rbd_warn(rbd_dev, "there is no pool with id %llu",
3783935dc89fSAlex Elder 			rbd_dev->spec->pool_id);	/* Really a BUG() */
3784935dc89fSAlex Elder 		return -EIO;
3785935dc89fSAlex Elder 	}
37869e15b77dSAlex Elder 
37879e15b77dSAlex Elder 	rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
37889e15b77dSAlex Elder 	if (!rbd_dev->spec->pool_name)
37899e15b77dSAlex Elder 		return -ENOMEM;
37909e15b77dSAlex Elder 
37919e15b77dSAlex Elder 	/* Fetch the image name; tolerate failure here */
37929e15b77dSAlex Elder 
37939e15b77dSAlex Elder 	name = rbd_dev_image_name(rbd_dev);
379469e7a02fSAlex Elder 	if (name)
37959e15b77dSAlex Elder 		rbd_dev->spec->image_name = (char *)name;
379669e7a02fSAlex Elder 	else
379706ecc6cbSAlex Elder 		rbd_warn(rbd_dev, "unable to get image name");
37989e15b77dSAlex Elder 
37999e15b77dSAlex Elder 	/* Look up the snapshot name. */
38009e15b77dSAlex Elder 
38019e15b77dSAlex Elder 	name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
38029e15b77dSAlex Elder 	if (!name) {
3803935dc89fSAlex Elder 		rbd_warn(rbd_dev, "no snapshot with id %llu",
3804935dc89fSAlex Elder 			rbd_dev->spec->snap_id);	/* Really a BUG() */
38059e15b77dSAlex Elder 		ret = -EIO;
38069e15b77dSAlex Elder 		goto out_err;
38079e15b77dSAlex Elder 	}
38089e15b77dSAlex Elder 	rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
38099e15b77dSAlex Elder 	if(!rbd_dev->spec->snap_name)
38109e15b77dSAlex Elder 		goto out_err;
38119e15b77dSAlex Elder 
38129e15b77dSAlex Elder 	return 0;
38139e15b77dSAlex Elder out_err:
38149e15b77dSAlex Elder 	kfree(reply_buf);
38159e15b77dSAlex Elder 	kfree(rbd_dev->spec->pool_name);
38169e15b77dSAlex Elder 	rbd_dev->spec->pool_name = NULL;
38179e15b77dSAlex Elder 
38189e15b77dSAlex Elder 	return ret;
38199e15b77dSAlex Elder }
38209e15b77dSAlex Elder 
38216e14b1a6SAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
382235d489f9SAlex Elder {
382335d489f9SAlex Elder 	size_t size;
382435d489f9SAlex Elder 	int ret;
382535d489f9SAlex Elder 	void *reply_buf;
382635d489f9SAlex Elder 	void *p;
382735d489f9SAlex Elder 	void *end;
382835d489f9SAlex Elder 	u64 seq;
382935d489f9SAlex Elder 	u32 snap_count;
383035d489f9SAlex Elder 	struct ceph_snap_context *snapc;
383135d489f9SAlex Elder 	u32 i;
383235d489f9SAlex Elder 
383335d489f9SAlex Elder 	/*
383435d489f9SAlex Elder 	 * We'll need room for the seq value (maximum snapshot id),
383535d489f9SAlex Elder 	 * snapshot count, and array of that many snapshot ids.
383635d489f9SAlex Elder 	 * For now we have a fixed upper limit on the number we're
383735d489f9SAlex Elder 	 * prepared to receive.
383835d489f9SAlex Elder 	 */
383935d489f9SAlex Elder 	size = sizeof (__le64) + sizeof (__le32) +
384035d489f9SAlex Elder 			RBD_MAX_SNAP_COUNT * sizeof (__le64);
384135d489f9SAlex Elder 	reply_buf = kzalloc(size, GFP_KERNEL);
384235d489f9SAlex Elder 	if (!reply_buf)
384335d489f9SAlex Elder 		return -ENOMEM;
384435d489f9SAlex Elder 
384536be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
38464157976bSAlex Elder 				"rbd", "get_snapcontext", NULL, 0,
384707b2391fSAlex Elder 				reply_buf, size, ver);
384836be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
384935d489f9SAlex Elder 	if (ret < 0)
385035d489f9SAlex Elder 		goto out;
385135d489f9SAlex Elder 
385235d489f9SAlex Elder 	p = reply_buf;
385357385b51SAlex Elder 	end = reply_buf + ret;
385457385b51SAlex Elder 	ret = -ERANGE;
385535d489f9SAlex Elder 	ceph_decode_64_safe(&p, end, seq, out);
385635d489f9SAlex Elder 	ceph_decode_32_safe(&p, end, snap_count, out);
385735d489f9SAlex Elder 
385835d489f9SAlex Elder 	/*
385935d489f9SAlex Elder 	 * Make sure the reported number of snapshot ids wouldn't go
386035d489f9SAlex Elder 	 * beyond the end of our buffer.  But before checking that,
386135d489f9SAlex Elder 	 * make sure the computed size of the snapshot context we
386235d489f9SAlex Elder 	 * allocate is representable in a size_t.
386335d489f9SAlex Elder 	 */
386435d489f9SAlex Elder 	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
386535d489f9SAlex Elder 				 / sizeof (u64)) {
386635d489f9SAlex Elder 		ret = -EINVAL;
386735d489f9SAlex Elder 		goto out;
386835d489f9SAlex Elder 	}
386935d489f9SAlex Elder 	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
387035d489f9SAlex Elder 		goto out;
387135d489f9SAlex Elder 
387235d489f9SAlex Elder 	size = sizeof (struct ceph_snap_context) +
387335d489f9SAlex Elder 				snap_count * sizeof (snapc->snaps[0]);
387435d489f9SAlex Elder 	snapc = kmalloc(size, GFP_KERNEL);
387535d489f9SAlex Elder 	if (!snapc) {
387635d489f9SAlex Elder 		ret = -ENOMEM;
387735d489f9SAlex Elder 		goto out;
387835d489f9SAlex Elder 	}
387957385b51SAlex Elder 	ret = 0;
388035d489f9SAlex Elder 
388135d489f9SAlex Elder 	atomic_set(&snapc->nref, 1);
388235d489f9SAlex Elder 	snapc->seq = seq;
388335d489f9SAlex Elder 	snapc->num_snaps = snap_count;
388435d489f9SAlex Elder 	for (i = 0; i < snap_count; i++)
388535d489f9SAlex Elder 		snapc->snaps[i] = ceph_decode_64(&p);
388635d489f9SAlex Elder 
388735d489f9SAlex Elder 	rbd_dev->header.snapc = snapc;
388835d489f9SAlex Elder 
388935d489f9SAlex Elder 	dout("  snap context seq = %llu, snap_count = %u\n",
389035d489f9SAlex Elder 		(unsigned long long)seq, (unsigned int)snap_count);
389135d489f9SAlex Elder out:
389235d489f9SAlex Elder 	kfree(reply_buf);
389335d489f9SAlex Elder 
389457385b51SAlex Elder 	return ret;
389535d489f9SAlex Elder }
389635d489f9SAlex Elder 
3897b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
3898b8b1e2dbSAlex Elder {
3899b8b1e2dbSAlex Elder 	size_t size;
3900b8b1e2dbSAlex Elder 	void *reply_buf;
3901b8b1e2dbSAlex Elder 	__le64 snap_id;
3902b8b1e2dbSAlex Elder 	int ret;
3903b8b1e2dbSAlex Elder 	void *p;
3904b8b1e2dbSAlex Elder 	void *end;
3905b8b1e2dbSAlex Elder 	char *snap_name;
3906b8b1e2dbSAlex Elder 
3907b8b1e2dbSAlex Elder 	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
3908b8b1e2dbSAlex Elder 	reply_buf = kmalloc(size, GFP_KERNEL);
3909b8b1e2dbSAlex Elder 	if (!reply_buf)
3910b8b1e2dbSAlex Elder 		return ERR_PTR(-ENOMEM);
3911b8b1e2dbSAlex Elder 
3912acb1b6caSAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
3913b8b1e2dbSAlex Elder 	snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
391436be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3915b8b1e2dbSAlex Elder 				"rbd", "get_snapshot_name",
39164157976bSAlex Elder 				&snap_id, sizeof (snap_id),
391707b2391fSAlex Elder 				reply_buf, size, NULL);
391836be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3919f40eb349SAlex Elder 	if (ret < 0) {
3920f40eb349SAlex Elder 		snap_name = ERR_PTR(ret);
3921b8b1e2dbSAlex Elder 		goto out;
3922f40eb349SAlex Elder 	}
3923b8b1e2dbSAlex Elder 
3924b8b1e2dbSAlex Elder 	p = reply_buf;
3925f40eb349SAlex Elder 	end = reply_buf + ret;
3926e5c35534SAlex Elder 	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
3927f40eb349SAlex Elder 	if (IS_ERR(snap_name))
3928b8b1e2dbSAlex Elder 		goto out;
3929f40eb349SAlex Elder 
3930b8b1e2dbSAlex Elder 	dout("  snap_id 0x%016llx snap_name = %s\n",
3931b8b1e2dbSAlex Elder 		(unsigned long long)le64_to_cpu(snap_id), snap_name);
3932b8b1e2dbSAlex Elder out:
3933b8b1e2dbSAlex Elder 	kfree(reply_buf);
3934b8b1e2dbSAlex Elder 
3935f40eb349SAlex Elder 	return snap_name;
3936b8b1e2dbSAlex Elder }
3937b8b1e2dbSAlex Elder 
3938b8b1e2dbSAlex Elder static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
3939b8b1e2dbSAlex Elder 		u64 *snap_size, u64 *snap_features)
3940b8b1e2dbSAlex Elder {
3941e0b49868SAlex Elder 	u64 snap_id;
3942acb1b6caSAlex Elder 	u64 size;
3943acb1b6caSAlex Elder 	u64 features;
3944acb1b6caSAlex Elder 	char *snap_name;
3945b8b1e2dbSAlex Elder 	int ret;
3946b8b1e2dbSAlex Elder 
3947acb1b6caSAlex Elder 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
3948b8b1e2dbSAlex Elder 	snap_id = rbd_dev->header.snapc->snaps[which];
3949acb1b6caSAlex Elder 	ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
3950b8b1e2dbSAlex Elder 	if (ret)
3951acb1b6caSAlex Elder 		goto out_err;
3952b8b1e2dbSAlex Elder 
3953acb1b6caSAlex Elder 	ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
3954acb1b6caSAlex Elder 	if (ret)
3955acb1b6caSAlex Elder 		goto out_err;
3956acb1b6caSAlex Elder 
3957acb1b6caSAlex Elder 	snap_name = rbd_dev_v2_snap_name(rbd_dev, which);
3958acb1b6caSAlex Elder 	if (!IS_ERR(snap_name)) {
3959acb1b6caSAlex Elder 		*snap_size = size;
3960acb1b6caSAlex Elder 		*snap_features = features;
3961acb1b6caSAlex Elder 	}
3962acb1b6caSAlex Elder 
3963acb1b6caSAlex Elder 	return snap_name;
3964acb1b6caSAlex Elder out_err:
3965acb1b6caSAlex Elder 	return ERR_PTR(ret);
3966b8b1e2dbSAlex Elder }
3967b8b1e2dbSAlex Elder 
3968b8b1e2dbSAlex Elder static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
3969b8b1e2dbSAlex Elder 		u64 *snap_size, u64 *snap_features)
3970b8b1e2dbSAlex Elder {
3971b8b1e2dbSAlex Elder 	if (rbd_dev->image_format == 1)
3972b8b1e2dbSAlex Elder 		return rbd_dev_v1_snap_info(rbd_dev, which,
3973b8b1e2dbSAlex Elder 					snap_size, snap_features);
3974b8b1e2dbSAlex Elder 	if (rbd_dev->image_format == 2)
3975b8b1e2dbSAlex Elder 		return rbd_dev_v2_snap_info(rbd_dev, which,
3976b8b1e2dbSAlex Elder 					snap_size, snap_features);
3977b8b1e2dbSAlex Elder 	return ERR_PTR(-EINVAL);
3978b8b1e2dbSAlex Elder }
3979b8b1e2dbSAlex Elder 
3980117973fbSAlex Elder static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
3981117973fbSAlex Elder {
3982117973fbSAlex Elder 	int ret;
3983117973fbSAlex Elder 	__u8 obj_order;
3984117973fbSAlex Elder 
3985117973fbSAlex Elder 	down_write(&rbd_dev->header_rwsem);
3986117973fbSAlex Elder 
3987117973fbSAlex Elder 	/* Grab old order first, to see if it changes */
3988117973fbSAlex Elder 
3989117973fbSAlex Elder 	obj_order = rbd_dev->header.obj_order,
3990117973fbSAlex Elder 	ret = rbd_dev_v2_image_size(rbd_dev);
3991117973fbSAlex Elder 	if (ret)
3992117973fbSAlex Elder 		goto out;
3993117973fbSAlex Elder 	if (rbd_dev->header.obj_order != obj_order) {
3994117973fbSAlex Elder 		ret = -EIO;
3995117973fbSAlex Elder 		goto out;
3996117973fbSAlex Elder 	}
3997117973fbSAlex Elder 	rbd_update_mapping_size(rbd_dev);
3998117973fbSAlex Elder 
3999117973fbSAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev, hver);
4000117973fbSAlex Elder 	dout("rbd_dev_v2_snap_context returned %d\n", ret);
4001117973fbSAlex Elder 	if (ret)
4002117973fbSAlex Elder 		goto out;
4003117973fbSAlex Elder 	ret = rbd_dev_snaps_update(rbd_dev);
4004117973fbSAlex Elder 	dout("rbd_dev_snaps_update returned %d\n", ret);
4005117973fbSAlex Elder 	if (ret)
4006117973fbSAlex Elder 		goto out;
4007117973fbSAlex Elder out:
4008117973fbSAlex Elder 	up_write(&rbd_dev->header_rwsem);
4009117973fbSAlex Elder 
4010117973fbSAlex Elder 	return ret;
4011117973fbSAlex Elder }
4012117973fbSAlex Elder 
40139d475de5SAlex Elder /*
401435938150SAlex Elder  * Scan the rbd device's current snapshot list and compare it to the
401535938150SAlex Elder  * newly-received snapshot context.  Remove any existing snapshots
401635938150SAlex Elder  * not present in the new snapshot context.  Add a new snapshot for
401735938150SAlex Elder  * any snaphots in the snapshot context not in the current list.
401835938150SAlex Elder  * And verify there are no changes to snapshots we already know
401935938150SAlex Elder  * about.
402035938150SAlex Elder  *
402135938150SAlex Elder  * Assumes the snapshots in the snapshot context are sorted by
402235938150SAlex Elder  * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
402335938150SAlex Elder  * are also maintained in that order.)
4024522a0cc0SAlex Elder  *
4025522a0cc0SAlex Elder  * Note that any error occurs while updating the snapshot list
4026522a0cc0SAlex Elder  * aborts the update, and the entire list is cleared.  The snapshot
4027522a0cc0SAlex Elder  * list becomes inconsistent at that point anyway, so it might as
4028522a0cc0SAlex Elder  * well be empty.
4029dfc5606dSYehuda Sadeh  */
4030304f6808SAlex Elder static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
4031dfc5606dSYehuda Sadeh {
403235938150SAlex Elder 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
403335938150SAlex Elder 	const u32 snap_count = snapc->num_snaps;
403435938150SAlex Elder 	struct list_head *head = &rbd_dev->snaps;
403535938150SAlex Elder 	struct list_head *links = head->next;
403635938150SAlex Elder 	u32 index = 0;
4037522a0cc0SAlex Elder 	int ret = 0;
4038dfc5606dSYehuda Sadeh 
40399fcbb800SAlex Elder 	dout("%s: snap count is %u\n", __func__, (unsigned int)snap_count);
404035938150SAlex Elder 	while (index < snap_count || links != head) {
404135938150SAlex Elder 		u64 snap_id;
404235938150SAlex Elder 		struct rbd_snap *snap;
4043cd892126SAlex Elder 		char *snap_name;
4044cd892126SAlex Elder 		u64 snap_size = 0;
4045cd892126SAlex Elder 		u64 snap_features = 0;
4046dfc5606dSYehuda Sadeh 
404735938150SAlex Elder 		snap_id = index < snap_count ? snapc->snaps[index]
404835938150SAlex Elder 					     : CEPH_NOSNAP;
404935938150SAlex Elder 		snap = links != head ? list_entry(links, struct rbd_snap, node)
405035938150SAlex Elder 				     : NULL;
4051aafb230eSAlex Elder 		rbd_assert(!snap || snap->id != CEPH_NOSNAP);
4052dfc5606dSYehuda Sadeh 
405335938150SAlex Elder 		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
405435938150SAlex Elder 			struct list_head *next = links->next;
4055dfc5606dSYehuda Sadeh 
40566d292906SAlex Elder 			/*
40576d292906SAlex Elder 			 * A previously-existing snapshot is not in
40586d292906SAlex Elder 			 * the new snap context.
40596d292906SAlex Elder 			 *
4060522a0cc0SAlex Elder 			 * If the now-missing snapshot is the one
4061522a0cc0SAlex Elder 			 * the image represents, clear its existence
4062522a0cc0SAlex Elder 			 * flag so we can avoid sending any more
4063522a0cc0SAlex Elder 			 * requests to it.
40646d292906SAlex Elder 			 */
40650d7dbfceSAlex Elder 			if (rbd_dev->spec->snap_id == snap->id)
40666d292906SAlex Elder 				clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
40673e83b65bSAlex Elder 			dout("removing %ssnap id %llu\n",
40680d7dbfceSAlex Elder 				rbd_dev->spec->snap_id == snap->id ?
40690d7dbfceSAlex Elder 							"mapped " : "",
40709fcbb800SAlex Elder 				(unsigned long long)snap->id);
40716087b51bSAlex Elder 
40726087b51bSAlex Elder 			list_del(&snap->node);
40736087b51bSAlex Elder 			rbd_snap_destroy(snap);
4074dfc5606dSYehuda Sadeh 
407535938150SAlex Elder 			/* Done with this list entry; advance */
407635938150SAlex Elder 
407735938150SAlex Elder 			links = next;
407835938150SAlex Elder 			continue;
4079dfc5606dSYehuda Sadeh 		}
408035938150SAlex Elder 
4081b8b1e2dbSAlex Elder 		snap_name = rbd_dev_snap_info(rbd_dev, index,
4082cd892126SAlex Elder 					&snap_size, &snap_features);
4083522a0cc0SAlex Elder 		if (IS_ERR(snap_name)) {
4084522a0cc0SAlex Elder 			ret = PTR_ERR(snap_name);
4085522a0cc0SAlex Elder 			dout("failed to get snap info, error %d\n", ret);
4086522a0cc0SAlex Elder 			goto out_err;
4087522a0cc0SAlex Elder 		}
4088cd892126SAlex Elder 
40899fcbb800SAlex Elder 		dout("entry %u: snap_id = %llu\n", (unsigned int)snap_count,
40909fcbb800SAlex Elder 			(unsigned long long)snap_id);
409135938150SAlex Elder 		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
409235938150SAlex Elder 			struct rbd_snap *new_snap;
409335938150SAlex Elder 
409435938150SAlex Elder 			/* We haven't seen this snapshot before */
409535938150SAlex Elder 
40966087b51bSAlex Elder 			new_snap = rbd_snap_create(rbd_dev, snap_name,
4097cd892126SAlex Elder 					snap_id, snap_size, snap_features);
40989fcbb800SAlex Elder 			if (IS_ERR(new_snap)) {
4099522a0cc0SAlex Elder 				ret = PTR_ERR(new_snap);
4100522a0cc0SAlex Elder 				dout("  failed to add dev, error %d\n", ret);
4101522a0cc0SAlex Elder 				goto out_err;
41029fcbb800SAlex Elder 			}
410335938150SAlex Elder 
410435938150SAlex Elder 			/* New goes before existing, or at end of list */
410535938150SAlex Elder 
41069fcbb800SAlex Elder 			dout("  added dev%s\n", snap ? "" : " at end\n");
410735938150SAlex Elder 			if (snap)
410835938150SAlex Elder 				list_add_tail(&new_snap->node, &snap->node);
410935938150SAlex Elder 			else
4110523f3258SAlex Elder 				list_add_tail(&new_snap->node, head);
411135938150SAlex Elder 		} else {
411235938150SAlex Elder 			/* Already have this one */
411335938150SAlex Elder 
41149fcbb800SAlex Elder 			dout("  already present\n");
41159fcbb800SAlex Elder 
4116cd892126SAlex Elder 			rbd_assert(snap->size == snap_size);
4117aafb230eSAlex Elder 			rbd_assert(!strcmp(snap->name, snap_name));
4118cd892126SAlex Elder 			rbd_assert(snap->features == snap_features);
411935938150SAlex Elder 
412035938150SAlex Elder 			/* Done with this list entry; advance */
412135938150SAlex Elder 
412235938150SAlex Elder 			links = links->next;
4123dfc5606dSYehuda Sadeh 		}
412435938150SAlex Elder 
412535938150SAlex Elder 		/* Advance to the next entry in the snapshot context */
412635938150SAlex Elder 
412735938150SAlex Elder 		index++;
4128dfc5606dSYehuda Sadeh 	}
41299fcbb800SAlex Elder 	dout("%s: done\n", __func__);
4130dfc5606dSYehuda Sadeh 
4131dfc5606dSYehuda Sadeh 	return 0;
4132522a0cc0SAlex Elder out_err:
4133522a0cc0SAlex Elder 	rbd_remove_all_snaps(rbd_dev);
4134522a0cc0SAlex Elder 
4135522a0cc0SAlex Elder 	return ret;
4136dfc5606dSYehuda Sadeh }
4137dfc5606dSYehuda Sadeh 
4138dfc5606dSYehuda Sadeh static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
4139dfc5606dSYehuda Sadeh {
4140dfc5606dSYehuda Sadeh 	struct device *dev;
4141cd789ab9SAlex Elder 	int ret;
4142dfc5606dSYehuda Sadeh 
4143dfc5606dSYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
4144dfc5606dSYehuda Sadeh 
4145cd789ab9SAlex Elder 	dev = &rbd_dev->dev;
4146dfc5606dSYehuda Sadeh 	dev->bus = &rbd_bus_type;
4147dfc5606dSYehuda Sadeh 	dev->type = &rbd_device_type;
4148dfc5606dSYehuda Sadeh 	dev->parent = &rbd_root_dev;
4149dfc5606dSYehuda Sadeh 	dev->release = rbd_dev_release;
4150de71a297SAlex Elder 	dev_set_name(dev, "%d", rbd_dev->dev_id);
4151dfc5606dSYehuda Sadeh 	ret = device_register(dev);
4152dfc5606dSYehuda Sadeh 
4153dfc5606dSYehuda Sadeh 	mutex_unlock(&ctl_mutex);
4154cd789ab9SAlex Elder 
4155dfc5606dSYehuda Sadeh 	return ret;
4156602adf40SYehuda Sadeh }
4157602adf40SYehuda Sadeh 
4158dfc5606dSYehuda Sadeh static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
4159dfc5606dSYehuda Sadeh {
4160dfc5606dSYehuda Sadeh 	device_unregister(&rbd_dev->dev);
4161dfc5606dSYehuda Sadeh }
4162dfc5606dSYehuda Sadeh 
4163e2839308SAlex Elder static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
41641ddbe94eSAlex Elder 
41651ddbe94eSAlex Elder /*
4166499afd5bSAlex Elder  * Get a unique rbd identifier for the given new rbd_dev, and add
4167499afd5bSAlex Elder  * the rbd_dev to the global list.  The minimum rbd id is 1.
41681ddbe94eSAlex Elder  */
4169e2839308SAlex Elder static void rbd_dev_id_get(struct rbd_device *rbd_dev)
4170b7f23c36SAlex Elder {
4171e2839308SAlex Elder 	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
4172499afd5bSAlex Elder 
4173499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
4174499afd5bSAlex Elder 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
4175499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
4176e2839308SAlex Elder 	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
4177e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
4178b7f23c36SAlex Elder }
4179b7f23c36SAlex Elder 
41801ddbe94eSAlex Elder /*
4181499afd5bSAlex Elder  * Remove an rbd_dev from the global list, and record that its
4182499afd5bSAlex Elder  * identifier is no longer in use.
41831ddbe94eSAlex Elder  */
4184e2839308SAlex Elder static void rbd_dev_id_put(struct rbd_device *rbd_dev)
41851ddbe94eSAlex Elder {
4186d184f6bfSAlex Elder 	struct list_head *tmp;
4187de71a297SAlex Elder 	int rbd_id = rbd_dev->dev_id;
4188d184f6bfSAlex Elder 	int max_id;
4189d184f6bfSAlex Elder 
4190aafb230eSAlex Elder 	rbd_assert(rbd_id > 0);
4191499afd5bSAlex Elder 
4192e2839308SAlex Elder 	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
4193e2839308SAlex Elder 		(unsigned long long) rbd_dev->dev_id);
4194499afd5bSAlex Elder 	spin_lock(&rbd_dev_list_lock);
4195499afd5bSAlex Elder 	list_del_init(&rbd_dev->node);
4196d184f6bfSAlex Elder 
4197d184f6bfSAlex Elder 	/*
4198d184f6bfSAlex Elder 	 * If the id being "put" is not the current maximum, there
4199d184f6bfSAlex Elder 	 * is nothing special we need to do.
4200d184f6bfSAlex Elder 	 */
4201e2839308SAlex Elder 	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
4202d184f6bfSAlex Elder 		spin_unlock(&rbd_dev_list_lock);
4203d184f6bfSAlex Elder 		return;
4204d184f6bfSAlex Elder 	}
4205d184f6bfSAlex Elder 
4206d184f6bfSAlex Elder 	/*
4207d184f6bfSAlex Elder 	 * We need to update the current maximum id.  Search the
4208d184f6bfSAlex Elder 	 * list to find out what it is.  We're more likely to find
4209d184f6bfSAlex Elder 	 * the maximum at the end, so search the list backward.
4210d184f6bfSAlex Elder 	 */
4211d184f6bfSAlex Elder 	max_id = 0;
4212d184f6bfSAlex Elder 	list_for_each_prev(tmp, &rbd_dev_list) {
4213d184f6bfSAlex Elder 		struct rbd_device *rbd_dev;
4214d184f6bfSAlex Elder 
4215d184f6bfSAlex Elder 		rbd_dev = list_entry(tmp, struct rbd_device, node);
4216b213e0b1SAlex Elder 		if (rbd_dev->dev_id > max_id)
4217b213e0b1SAlex Elder 			max_id = rbd_dev->dev_id;
4218d184f6bfSAlex Elder 	}
4219499afd5bSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
42201ddbe94eSAlex Elder 
42211ddbe94eSAlex Elder 	/*
4222e2839308SAlex Elder 	 * The max id could have been updated by rbd_dev_id_get(), in
4223d184f6bfSAlex Elder 	 * which case it now accurately reflects the new maximum.
4224d184f6bfSAlex Elder 	 * Be careful not to overwrite the maximum value in that
4225d184f6bfSAlex Elder 	 * case.
42261ddbe94eSAlex Elder 	 */
4227e2839308SAlex Elder 	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
4228e2839308SAlex Elder 	dout("  max dev id has been reset\n");
4229b7f23c36SAlex Elder }
4230b7f23c36SAlex Elder 
/*
 * Advance *buf past any leading white space so that it points at
 * the first non-space character (or at the terminating '\0' when
 * there is none), and return the length of the token, i.e. the run
 * of non-white-space characters, that starts there.  The string at
 * *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the "C"
	 * and "POSIX" locales.
	 */
	static const char whitespace[] = " \f\n\r\t\v";
	const char *start = *buf;

	start += strspn(start, whitespace);	/* Skip to token start */
	*buf = start;

	return strcspn(start, whitespace);	/* Length of the token */
}
4249e28fff26SAlex Elder 
/*
 * Locate the next token in *buf and, provided it fits in the
 * supplied token buffer together with a terminating '\0', copy it
 * there.  A token that does not fit is not copied at all.  The
 * string at *buf must be terminated with '\0' on entry.
 *
 * Returns the length of the token (excluding the '\0').  The
 * result is 0 when no token was found, and is >= token_size when
 * the token was too long to be copied.
 *
 * On return *buf points just past the token that was found; this
 * holds whether or not the token was copied.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t token_len = next_token(buf);
	const char *start = *buf;

	if (token_len < token_size) {
		memcpy(token, start, token_len);
		token[token_len] = '\0';
	}
	*buf = start + token_len;

	return token_len;
}
4279e28fff26SAlex Elder 
4280e28fff26SAlex Elder /*
4281ea3352f4SAlex Elder  * Finds the next token in *buf, dynamically allocates a buffer big
4282ea3352f4SAlex Elder  * enough to hold a copy of it, and copies the token into the new
4283ea3352f4SAlex Elder  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
4284ea3352f4SAlex Elder  * that a duplicate buffer is created even for a zero-length token.
4285ea3352f4SAlex Elder  *
4286ea3352f4SAlex Elder  * Returns a pointer to the newly-allocated duplicate, or a null
4287ea3352f4SAlex Elder  * pointer if memory for the duplicate was not available.  If
4288ea3352f4SAlex Elder  * the lenp argument is a non-null pointer, the length of the token
4289ea3352f4SAlex Elder  * (not including the '\0') is returned in *lenp.
4290ea3352f4SAlex Elder  *
4291ea3352f4SAlex Elder  * If successful, the *buf pointer will be updated to point beyond
4292ea3352f4SAlex Elder  * the end of the found token.
4293ea3352f4SAlex Elder  *
4294ea3352f4SAlex Elder  * Note: uses GFP_KERNEL for allocation.
4295ea3352f4SAlex Elder  */
4296ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
4297ea3352f4SAlex Elder {
4298ea3352f4SAlex Elder 	char *dup;
4299ea3352f4SAlex Elder 	size_t len;
4300ea3352f4SAlex Elder 
4301ea3352f4SAlex Elder 	len = next_token(buf);
43024caf35f9SAlex Elder 	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
4303ea3352f4SAlex Elder 	if (!dup)
4304ea3352f4SAlex Elder 		return NULL;
4305ea3352f4SAlex Elder 	*(dup + len) = '\0';
4306ea3352f4SAlex Elder 	*buf += len;
4307ea3352f4SAlex Elder 
4308ea3352f4SAlex Elder 	if (lenp)
4309ea3352f4SAlex Elder 		*lenp = len;
4310ea3352f4SAlex Elder 
4311ea3352f4SAlex Elder 	return dup;
4312ea3352f4SAlex Elder }
4313ea3352f4SAlex Elder 
4314ea3352f4SAlex Elder /*
4315859c31dfSAlex Elder  * Parse the options provided for an "rbd add" (i.e., rbd image
4316859c31dfSAlex Elder  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
4317859c31dfSAlex Elder  * and the data written is passed here via a NUL-terminated buffer.
4318859c31dfSAlex Elder  * Returns 0 if successful or an error code otherwise.
4319d22f76e7SAlex Elder  *
4320859c31dfSAlex Elder  * The information extracted from these options is recorded in
4321859c31dfSAlex Elder  * the other parameters which return dynamically-allocated
4322859c31dfSAlex Elder  * structures:
4323859c31dfSAlex Elder  *  ceph_opts
4324859c31dfSAlex Elder  *      The address of a pointer that will refer to a ceph options
4325859c31dfSAlex Elder  *      structure.  Caller must release the returned pointer using
4326859c31dfSAlex Elder  *      ceph_destroy_options() when it is no longer needed.
4327859c31dfSAlex Elder  *  rbd_opts
4328859c31dfSAlex Elder  *	Address of an rbd options pointer.  Fully initialized by
4329859c31dfSAlex Elder  *	this function; caller must release with kfree().
4330859c31dfSAlex Elder  *  spec
4331859c31dfSAlex Elder  *	Address of an rbd image specification pointer.  Fully
4332859c31dfSAlex Elder  *	initialized by this function based on parsed options.
4333859c31dfSAlex Elder  *	Caller must release with rbd_spec_put().
4334859c31dfSAlex Elder  *
4335859c31dfSAlex Elder  * The options passed take this form:
4336859c31dfSAlex Elder  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
4337859c31dfSAlex Elder  * where:
4338859c31dfSAlex Elder  *  <mon_addrs>
4339859c31dfSAlex Elder  *      A comma-separated list of one or more monitor addresses.
4340859c31dfSAlex Elder  *      A monitor address is an ip address, optionally followed
4341859c31dfSAlex Elder  *      by a port number (separated by a colon).
4342859c31dfSAlex Elder  *        I.e.:  ip1[:port1][,ip2[:port2]...]
4343859c31dfSAlex Elder  *  <options>
4344859c31dfSAlex Elder  *      A comma-separated list of ceph and/or rbd options.
4345859c31dfSAlex Elder  *  <pool_name>
4346859c31dfSAlex Elder  *      The name of the rados pool containing the rbd image.
4347859c31dfSAlex Elder  *  <image_name>
4348859c31dfSAlex Elder  *      The name of the image in that pool to map.
4349859c31dfSAlex Elder  *  <snap_id>
4350859c31dfSAlex Elder  *      An optional snapshot id.  If provided, the mapping will
4351859c31dfSAlex Elder  *      present data from the image at the time that snapshot was
4352859c31dfSAlex Elder  *      created.  The image head is used if no snapshot id is
4353859c31dfSAlex Elder  *      provided.  Snapshot mappings are always read-only.
4354a725f65eSAlex Elder  */
4355859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf,
4356dc79b113SAlex Elder 				struct ceph_options **ceph_opts,
4357859c31dfSAlex Elder 				struct rbd_options **opts,
4358859c31dfSAlex Elder 				struct rbd_spec **rbd_spec)
4359a725f65eSAlex Elder {
4360e28fff26SAlex Elder 	size_t len;
4361859c31dfSAlex Elder 	char *options;
43620ddebc0cSAlex Elder 	const char *mon_addrs;
43630ddebc0cSAlex Elder 	size_t mon_addrs_size;
4364859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
43654e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
4366859c31dfSAlex Elder 	struct ceph_options *copts;
4367dc79b113SAlex Elder 	int ret;
4368e28fff26SAlex Elder 
4369e28fff26SAlex Elder 	/* The first four tokens are required */
4370e28fff26SAlex Elder 
43717ef3214aSAlex Elder 	len = next_token(&buf);
43724fb5d671SAlex Elder 	if (!len) {
43734fb5d671SAlex Elder 		rbd_warn(NULL, "no monitor address(es) provided");
43744fb5d671SAlex Elder 		return -EINVAL;
43754fb5d671SAlex Elder 	}
43760ddebc0cSAlex Elder 	mon_addrs = buf;
4377f28e565aSAlex Elder 	mon_addrs_size = len + 1;
43787ef3214aSAlex Elder 	buf += len;
4379a725f65eSAlex Elder 
4380dc79b113SAlex Elder 	ret = -EINVAL;
4381f28e565aSAlex Elder 	options = dup_token(&buf, NULL);
4382f28e565aSAlex Elder 	if (!options)
4383dc79b113SAlex Elder 		return -ENOMEM;
43844fb5d671SAlex Elder 	if (!*options) {
43854fb5d671SAlex Elder 		rbd_warn(NULL, "no options provided");
43864fb5d671SAlex Elder 		goto out_err;
43874fb5d671SAlex Elder 	}
4388a725f65eSAlex Elder 
4389859c31dfSAlex Elder 	spec = rbd_spec_alloc();
4390859c31dfSAlex Elder 	if (!spec)
4391f28e565aSAlex Elder 		goto out_mem;
4392859c31dfSAlex Elder 
4393859c31dfSAlex Elder 	spec->pool_name = dup_token(&buf, NULL);
4394859c31dfSAlex Elder 	if (!spec->pool_name)
4395859c31dfSAlex Elder 		goto out_mem;
43964fb5d671SAlex Elder 	if (!*spec->pool_name) {
43974fb5d671SAlex Elder 		rbd_warn(NULL, "no pool name provided");
43984fb5d671SAlex Elder 		goto out_err;
43994fb5d671SAlex Elder 	}
4400e28fff26SAlex Elder 
440169e7a02fSAlex Elder 	spec->image_name = dup_token(&buf, NULL);
4402859c31dfSAlex Elder 	if (!spec->image_name)
4403f28e565aSAlex Elder 		goto out_mem;
44044fb5d671SAlex Elder 	if (!*spec->image_name) {
44054fb5d671SAlex Elder 		rbd_warn(NULL, "no image name provided");
44064fb5d671SAlex Elder 		goto out_err;
44074fb5d671SAlex Elder 	}
4408e28fff26SAlex Elder 
4409f28e565aSAlex Elder 	/*
4410f28e565aSAlex Elder 	 * Snapshot name is optional; default is to use "-"
4411f28e565aSAlex Elder 	 * (indicating the head/no snapshot).
4412f28e565aSAlex Elder 	 */
44133feeb894SAlex Elder 	len = next_token(&buf);
4414820a5f3eSAlex Elder 	if (!len) {
44153feeb894SAlex Elder 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
44163feeb894SAlex Elder 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
4417f28e565aSAlex Elder 	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
4418dc79b113SAlex Elder 		ret = -ENAMETOOLONG;
4419f28e565aSAlex Elder 		goto out_err;
4420849b4260SAlex Elder 	}
44214caf35f9SAlex Elder 	spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
4422859c31dfSAlex Elder 	if (!spec->snap_name)
4423f28e565aSAlex Elder 		goto out_mem;
4424859c31dfSAlex Elder 	*(spec->snap_name + len) = '\0';
4425e5c35534SAlex Elder 
44260ddebc0cSAlex Elder 	/* Initialize all rbd options to the defaults */
4427e28fff26SAlex Elder 
44284e9afebaSAlex Elder 	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
44294e9afebaSAlex Elder 	if (!rbd_opts)
44304e9afebaSAlex Elder 		goto out_mem;
44314e9afebaSAlex Elder 
44324e9afebaSAlex Elder 	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
4433d22f76e7SAlex Elder 
4434859c31dfSAlex Elder 	copts = ceph_parse_options(options, mon_addrs,
44350ddebc0cSAlex Elder 					mon_addrs + mon_addrs_size - 1,
44364e9afebaSAlex Elder 					parse_rbd_opts_token, rbd_opts);
4437859c31dfSAlex Elder 	if (IS_ERR(copts)) {
4438859c31dfSAlex Elder 		ret = PTR_ERR(copts);
4439dc79b113SAlex Elder 		goto out_err;
4440dc79b113SAlex Elder 	}
4441859c31dfSAlex Elder 	kfree(options);
4442859c31dfSAlex Elder 
4443859c31dfSAlex Elder 	*ceph_opts = copts;
44444e9afebaSAlex Elder 	*opts = rbd_opts;
4445859c31dfSAlex Elder 	*rbd_spec = spec;
44460ddebc0cSAlex Elder 
4447dc79b113SAlex Elder 	return 0;
4448f28e565aSAlex Elder out_mem:
4449dc79b113SAlex Elder 	ret = -ENOMEM;
4450d22f76e7SAlex Elder out_err:
4451859c31dfSAlex Elder 	kfree(rbd_opts);
4452859c31dfSAlex Elder 	rbd_spec_put(spec);
4453f28e565aSAlex Elder 	kfree(options);
4454d22f76e7SAlex Elder 
4455dc79b113SAlex Elder 	return ret;
4456a725f65eSAlex Elder }
4457a725f65eSAlex Elder 
4458589d30e0SAlex Elder /*
4459589d30e0SAlex Elder  * An rbd format 2 image has a unique identifier, distinct from the
4460589d30e0SAlex Elder  * name given to it by the user.  Internally, that identifier is
4461589d30e0SAlex Elder  * what's used to specify the names of objects related to the image.
4462589d30e0SAlex Elder  *
4463589d30e0SAlex Elder  * A special "rbd id" object is used to map an rbd image name to its
4464589d30e0SAlex Elder  * id.  If that object doesn't exist, then there is no v2 rbd image
4465589d30e0SAlex Elder  * with the supplied name.
4466589d30e0SAlex Elder  *
4467589d30e0SAlex Elder  * This function will record the given rbd_dev's image_id field if
4468589d30e0SAlex Elder  * it can be determined, and in that case will return 0.  If any
4469589d30e0SAlex Elder  * errors occur a negative errno will be returned and the rbd_dev's
4470589d30e0SAlex Elder  * image_id field will be unchanged (and should be NULL).
4471589d30e0SAlex Elder  */
4472589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev)
4473589d30e0SAlex Elder {
4474589d30e0SAlex Elder 	int ret;
4475589d30e0SAlex Elder 	size_t size;
4476589d30e0SAlex Elder 	char *object_name;
4477589d30e0SAlex Elder 	void *response;
4478589d30e0SAlex Elder 	void *p;
4479589d30e0SAlex Elder 
44802f82ee54SAlex Elder 	/* If we already have it we don't need to look it up */
44812f82ee54SAlex Elder 
44822f82ee54SAlex Elder 	if (rbd_dev->spec->image_id)
44832f82ee54SAlex Elder 		return 0;
44842f82ee54SAlex Elder 
4485589d30e0SAlex Elder 	/*
44862c0d0a10SAlex Elder 	 * When probing a parent image, the image id is already
44872c0d0a10SAlex Elder 	 * known (and the image name likely is not).  There's no
44882c0d0a10SAlex Elder 	 * need to fetch the image id again in this case.
44892c0d0a10SAlex Elder 	 */
44902c0d0a10SAlex Elder 	if (rbd_dev->spec->image_id)
44912c0d0a10SAlex Elder 		return 0;
44922c0d0a10SAlex Elder 
44932c0d0a10SAlex Elder 	/*
4494589d30e0SAlex Elder 	 * First, see if the format 2 image id file exists, and if
4495589d30e0SAlex Elder 	 * so, get the image's persistent id from it.
4496589d30e0SAlex Elder 	 */
449769e7a02fSAlex Elder 	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
4498589d30e0SAlex Elder 	object_name = kmalloc(size, GFP_NOIO);
4499589d30e0SAlex Elder 	if (!object_name)
4500589d30e0SAlex Elder 		return -ENOMEM;
45010d7dbfceSAlex Elder 	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
4502589d30e0SAlex Elder 	dout("rbd id object name is %s\n", object_name);
4503589d30e0SAlex Elder 
4504589d30e0SAlex Elder 	/* Response will be an encoded string, which includes a length */
4505589d30e0SAlex Elder 
4506589d30e0SAlex Elder 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
4507589d30e0SAlex Elder 	response = kzalloc(size, GFP_NOIO);
4508589d30e0SAlex Elder 	if (!response) {
4509589d30e0SAlex Elder 		ret = -ENOMEM;
4510589d30e0SAlex Elder 		goto out;
4511589d30e0SAlex Elder 	}
4512589d30e0SAlex Elder 
451336be9a76SAlex Elder 	ret = rbd_obj_method_sync(rbd_dev, object_name,
45144157976bSAlex Elder 				"rbd", "get_id", NULL, 0,
451507b2391fSAlex Elder 				response, RBD_IMAGE_ID_LEN_MAX, NULL);
451636be9a76SAlex Elder 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4517589d30e0SAlex Elder 	if (ret < 0)
4518589d30e0SAlex Elder 		goto out;
4519589d30e0SAlex Elder 
4520589d30e0SAlex Elder 	p = response;
45210d7dbfceSAlex Elder 	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
452257385b51SAlex Elder 						p + ret,
4523979ed480SAlex Elder 						NULL, GFP_NOIO);
452457385b51SAlex Elder 	ret = 0;
452557385b51SAlex Elder 
45260d7dbfceSAlex Elder 	if (IS_ERR(rbd_dev->spec->image_id)) {
45270d7dbfceSAlex Elder 		ret = PTR_ERR(rbd_dev->spec->image_id);
45280d7dbfceSAlex Elder 		rbd_dev->spec->image_id = NULL;
4529589d30e0SAlex Elder 	} else {
45300d7dbfceSAlex Elder 		dout("image_id is %s\n", rbd_dev->spec->image_id);
4531589d30e0SAlex Elder 	}
4532589d30e0SAlex Elder out:
4533589d30e0SAlex Elder 	kfree(response);
4534589d30e0SAlex Elder 	kfree(object_name);
4535589d30e0SAlex Elder 
4536589d30e0SAlex Elder 	return ret;
4537589d30e0SAlex Elder }
4538589d30e0SAlex Elder 
4539a30b71b9SAlex Elder static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
4540a30b71b9SAlex Elder {
4541a30b71b9SAlex Elder 	int ret;
4542a30b71b9SAlex Elder 	size_t size;
4543a30b71b9SAlex Elder 
4544a30b71b9SAlex Elder 	/* Version 1 images have no id; empty string is used */
4545a30b71b9SAlex Elder 
45460d7dbfceSAlex Elder 	rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
45470d7dbfceSAlex Elder 	if (!rbd_dev->spec->image_id)
4548a30b71b9SAlex Elder 		return -ENOMEM;
4549a30b71b9SAlex Elder 
4550a30b71b9SAlex Elder 	/* Record the header object name for this rbd image. */
4551a30b71b9SAlex Elder 
455269e7a02fSAlex Elder 	size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
4553a30b71b9SAlex Elder 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
4554a30b71b9SAlex Elder 	if (!rbd_dev->header_name) {
4555a30b71b9SAlex Elder 		ret = -ENOMEM;
4556a30b71b9SAlex Elder 		goto out_err;
4557a30b71b9SAlex Elder 	}
45580d7dbfceSAlex Elder 	sprintf(rbd_dev->header_name, "%s%s",
45590d7dbfceSAlex Elder 		rbd_dev->spec->image_name, RBD_SUFFIX);
4560a30b71b9SAlex Elder 
4561a30b71b9SAlex Elder 	/* Populate rbd image metadata */
4562a30b71b9SAlex Elder 
4563a30b71b9SAlex Elder 	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
4564a30b71b9SAlex Elder 	if (ret < 0)
4565a30b71b9SAlex Elder 		goto out_err;
456686b00e0dSAlex Elder 
456786b00e0dSAlex Elder 	/* Version 1 images have no parent (no layering) */
456886b00e0dSAlex Elder 
456986b00e0dSAlex Elder 	rbd_dev->parent_spec = NULL;
457086b00e0dSAlex Elder 	rbd_dev->parent_overlap = 0;
457186b00e0dSAlex Elder 
4572a30b71b9SAlex Elder 	rbd_dev->image_format = 1;
4573a30b71b9SAlex Elder 
4574a30b71b9SAlex Elder 	dout("discovered version 1 image, header name is %s\n",
4575a30b71b9SAlex Elder 		rbd_dev->header_name);
4576a30b71b9SAlex Elder 
4577a30b71b9SAlex Elder 	return 0;
4578a30b71b9SAlex Elder 
4579a30b71b9SAlex Elder out_err:
4580a30b71b9SAlex Elder 	kfree(rbd_dev->header_name);
4581a30b71b9SAlex Elder 	rbd_dev->header_name = NULL;
45820d7dbfceSAlex Elder 	kfree(rbd_dev->spec->image_id);
45830d7dbfceSAlex Elder 	rbd_dev->spec->image_id = NULL;
4584a30b71b9SAlex Elder 
4585a30b71b9SAlex Elder 	return ret;
4586a30b71b9SAlex Elder }
4587a30b71b9SAlex Elder 
4588a30b71b9SAlex Elder static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
4589a30b71b9SAlex Elder {
4590a30b71b9SAlex Elder 	size_t size;
45919d475de5SAlex Elder 	int ret;
45926e14b1a6SAlex Elder 	u64 ver = 0;
4593a30b71b9SAlex Elder 
4594a30b71b9SAlex Elder 	/*
4595a30b71b9SAlex Elder 	 * Image id was filled in by the caller.  Record the header
4596a30b71b9SAlex Elder 	 * object name for this rbd image.
4597a30b71b9SAlex Elder 	 */
4598979ed480SAlex Elder 	size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
4599a30b71b9SAlex Elder 	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
4600a30b71b9SAlex Elder 	if (!rbd_dev->header_name)
4601a30b71b9SAlex Elder 		return -ENOMEM;
4602a30b71b9SAlex Elder 	sprintf(rbd_dev->header_name, "%s%s",
46030d7dbfceSAlex Elder 			RBD_HEADER_PREFIX, rbd_dev->spec->image_id);
46049d475de5SAlex Elder 
46059d475de5SAlex Elder 	/* Get the size and object order for the image */
46069d475de5SAlex Elder 	ret = rbd_dev_v2_image_size(rbd_dev);
460757385b51SAlex Elder 	if (ret)
46089d475de5SAlex Elder 		goto out_err;
46091e130199SAlex Elder 
46101e130199SAlex Elder 	/* Get the object prefix (a.k.a. block_name) for the image */
46111e130199SAlex Elder 
46121e130199SAlex Elder 	ret = rbd_dev_v2_object_prefix(rbd_dev);
461357385b51SAlex Elder 	if (ret)
46141e130199SAlex Elder 		goto out_err;
4615b1b5402aSAlex Elder 
4616d889140cSAlex Elder 	/* Get the and check features for the image */
4617b1b5402aSAlex Elder 
4618b1b5402aSAlex Elder 	ret = rbd_dev_v2_features(rbd_dev);
461957385b51SAlex Elder 	if (ret)
4620b1b5402aSAlex Elder 		goto out_err;
462135d489f9SAlex Elder 
462286b00e0dSAlex Elder 	/* If the image supports layering, get the parent info */
462386b00e0dSAlex Elder 
462486b00e0dSAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
462586b00e0dSAlex Elder 		ret = rbd_dev_v2_parent_info(rbd_dev);
462657385b51SAlex Elder 		if (ret)
462786b00e0dSAlex Elder 			goto out_err;
4628770eba6eSAlex Elder 		rbd_warn(rbd_dev, "WARNING: kernel support for "
4629770eba6eSAlex Elder 					"layered rbd images is EXPERIMENTAL!");
463086b00e0dSAlex Elder 	}
463186b00e0dSAlex Elder 
4632cc070d59SAlex Elder 	/* If the image supports fancy striping, get its parameters */
4633cc070d59SAlex Elder 
4634cc070d59SAlex Elder 	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
4635cc070d59SAlex Elder 		ret = rbd_dev_v2_striping_info(rbd_dev);
4636cc070d59SAlex Elder 		if (ret < 0)
4637cc070d59SAlex Elder 			goto out_err;
4638cc070d59SAlex Elder 	}
4639cc070d59SAlex Elder 
46406e14b1a6SAlex Elder 	/* crypto and compression type aren't (yet) supported for v2 images */
464135d489f9SAlex Elder 
46426e14b1a6SAlex Elder 	rbd_dev->header.crypt_type = 0;
46436e14b1a6SAlex Elder 	rbd_dev->header.comp_type = 0;
46446e14b1a6SAlex Elder 
46456e14b1a6SAlex Elder 	/* Get the snapshot context, plus the header version */
46466e14b1a6SAlex Elder 
46476e14b1a6SAlex Elder 	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
464835d489f9SAlex Elder 	if (ret)
464935d489f9SAlex Elder 		goto out_err;
46506e14b1a6SAlex Elder 	rbd_dev->header.obj_version = ver;
46516e14b1a6SAlex Elder 
4652a30b71b9SAlex Elder 	rbd_dev->image_format = 2;
4653a30b71b9SAlex Elder 
4654a30b71b9SAlex Elder 	dout("discovered version 2 image, header name is %s\n",
4655a30b71b9SAlex Elder 		rbd_dev->header_name);
4656a30b71b9SAlex Elder 
465735152979SAlex Elder 	return 0;
46589d475de5SAlex Elder out_err:
465986b00e0dSAlex Elder 	rbd_dev->parent_overlap = 0;
466086b00e0dSAlex Elder 	rbd_spec_put(rbd_dev->parent_spec);
466186b00e0dSAlex Elder 	rbd_dev->parent_spec = NULL;
46629d475de5SAlex Elder 	kfree(rbd_dev->header_name);
46639d475de5SAlex Elder 	rbd_dev->header_name = NULL;
46641e130199SAlex Elder 	kfree(rbd_dev->header.object_prefix);
46651e130199SAlex Elder 	rbd_dev->header.object_prefix = NULL;
46669d475de5SAlex Elder 
46679d475de5SAlex Elder 	return ret;
4668a30b71b9SAlex Elder }
4669a30b71b9SAlex Elder 
467083a06263SAlex Elder static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
467183a06263SAlex Elder {
46722f82ee54SAlex Elder 	struct rbd_device *parent = NULL;
46732f82ee54SAlex Elder 	struct rbd_spec *parent_spec = NULL;
46742f82ee54SAlex Elder 	struct rbd_client *rbdc = NULL;
467583a06263SAlex Elder 	int ret;
467683a06263SAlex Elder 
467783a06263SAlex Elder 	/* no need to lock here, as rbd_dev is not registered yet */
467883a06263SAlex Elder 	ret = rbd_dev_snaps_update(rbd_dev);
467983a06263SAlex Elder 	if (ret)
468083a06263SAlex Elder 		return ret;
468183a06263SAlex Elder 
46829e15b77dSAlex Elder 	ret = rbd_dev_probe_update_spec(rbd_dev);
46839e15b77dSAlex Elder 	if (ret)
46849e15b77dSAlex Elder 		goto err_out_snaps;
46859e15b77dSAlex Elder 
468683a06263SAlex Elder 	ret = rbd_dev_set_mapping(rbd_dev);
468783a06263SAlex Elder 	if (ret)
468883a06263SAlex Elder 		goto err_out_snaps;
468983a06263SAlex Elder 
469083a06263SAlex Elder 	/* generate unique id: find highest unique id, add one */
469183a06263SAlex Elder 	rbd_dev_id_get(rbd_dev);
469283a06263SAlex Elder 
469383a06263SAlex Elder 	/* Fill in the device name, now that we have its id. */
469483a06263SAlex Elder 	BUILD_BUG_ON(DEV_NAME_LEN
469583a06263SAlex Elder 			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
469683a06263SAlex Elder 	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
469783a06263SAlex Elder 
469883a06263SAlex Elder 	/* Get our block major device number. */
469983a06263SAlex Elder 
470083a06263SAlex Elder 	ret = register_blkdev(0, rbd_dev->name);
470183a06263SAlex Elder 	if (ret < 0)
470283a06263SAlex Elder 		goto err_out_id;
470383a06263SAlex Elder 	rbd_dev->major = ret;
470483a06263SAlex Elder 
470583a06263SAlex Elder 	/* Set up the blkdev mapping. */
470683a06263SAlex Elder 
470783a06263SAlex Elder 	ret = rbd_init_disk(rbd_dev);
470883a06263SAlex Elder 	if (ret)
470983a06263SAlex Elder 		goto err_out_blkdev;
471083a06263SAlex Elder 
471183a06263SAlex Elder 	ret = rbd_bus_add_dev(rbd_dev);
471283a06263SAlex Elder 	if (ret)
471383a06263SAlex Elder 		goto err_out_disk;
471483a06263SAlex Elder 
471583a06263SAlex Elder 	/*
471683a06263SAlex Elder 	 * At this point cleanup in the event of an error is the job
471783a06263SAlex Elder 	 * of the sysfs code (initiated by rbd_bus_del_dev()).
471883a06263SAlex Elder 	 */
47192f82ee54SAlex Elder 	/* Probe the parent if there is one */
47202f82ee54SAlex Elder 
47212f82ee54SAlex Elder 	if (rbd_dev->parent_spec) {
47222f82ee54SAlex Elder 		/*
47232f82ee54SAlex Elder 		 * We need to pass a reference to the client and the
47242f82ee54SAlex Elder 		 * parent spec when creating the parent rbd_dev.
47252f82ee54SAlex Elder 		 * Images related by parent/child relationships
47262f82ee54SAlex Elder 		 * always share both.
47272f82ee54SAlex Elder 		 */
47282f82ee54SAlex Elder 		parent_spec = rbd_spec_get(rbd_dev->parent_spec);
47292f82ee54SAlex Elder 		rbdc = __rbd_get_client(rbd_dev->rbd_client);
47302f82ee54SAlex Elder 
47312f82ee54SAlex Elder 		parent = rbd_dev_create(rbdc, parent_spec);
47322f82ee54SAlex Elder 		if (!parent) {
47332f82ee54SAlex Elder 			ret = -ENOMEM;
47342f82ee54SAlex Elder 			goto err_out_spec;
47352f82ee54SAlex Elder 		}
47362f82ee54SAlex Elder 		rbdc = NULL;		/* parent now owns reference */
47372f82ee54SAlex Elder 		parent_spec = NULL;	/* parent now owns reference */
47382f82ee54SAlex Elder 		ret = rbd_dev_probe(parent);
47392f82ee54SAlex Elder 		if (ret < 0)
47402f82ee54SAlex Elder 			goto err_out_parent;
47412f82ee54SAlex Elder 		rbd_dev->parent = parent;
47422f82ee54SAlex Elder 	}
47432f82ee54SAlex Elder 
47449969ebc5SAlex Elder 	ret = rbd_dev_header_watch_sync(rbd_dev, 1);
474583a06263SAlex Elder 	if (ret)
474683a06263SAlex Elder 		goto err_out_bus;
474783a06263SAlex Elder 
474883a06263SAlex Elder 	/* Everything's ready.  Announce the disk to the world. */
474983a06263SAlex Elder 
475083a06263SAlex Elder 	add_disk(rbd_dev->disk);
475183a06263SAlex Elder 
475283a06263SAlex Elder 	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
475383a06263SAlex Elder 		(unsigned long long) rbd_dev->mapping.size);
475483a06263SAlex Elder 
475583a06263SAlex Elder 	return ret;
47562f82ee54SAlex Elder 
47572f82ee54SAlex Elder err_out_parent:
47582f82ee54SAlex Elder 	rbd_dev_destroy(parent);
47592f82ee54SAlex Elder err_out_spec:
47602f82ee54SAlex Elder 	rbd_spec_put(parent_spec);
47612f82ee54SAlex Elder 	rbd_put_client(rbdc);
476283a06263SAlex Elder err_out_bus:
476383a06263SAlex Elder 	/* this will also clean up rest of rbd_dev stuff */
476483a06263SAlex Elder 
476583a06263SAlex Elder 	rbd_bus_del_dev(rbd_dev);
476683a06263SAlex Elder 
476783a06263SAlex Elder 	return ret;
476883a06263SAlex Elder err_out_disk:
476983a06263SAlex Elder 	rbd_free_disk(rbd_dev);
477083a06263SAlex Elder err_out_blkdev:
477183a06263SAlex Elder 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
477283a06263SAlex Elder err_out_id:
477383a06263SAlex Elder 	rbd_dev_id_put(rbd_dev);
477483a06263SAlex Elder err_out_snaps:
477583a06263SAlex Elder 	rbd_remove_all_snaps(rbd_dev);
477683a06263SAlex Elder 
477783a06263SAlex Elder 	return ret;
477883a06263SAlex Elder }
477983a06263SAlex Elder 
4780a30b71b9SAlex Elder /*
4781a30b71b9SAlex Elder  * Probe for the existence of the header object for the given rbd
4782a30b71b9SAlex Elder  * device.  For format 2 images this includes determining the image
4783a30b71b9SAlex Elder  * id.
4784a30b71b9SAlex Elder  */
4785a30b71b9SAlex Elder static int rbd_dev_probe(struct rbd_device *rbd_dev)
4786a30b71b9SAlex Elder {
4787a30b71b9SAlex Elder 	int ret;
4788a30b71b9SAlex Elder 
4789a30b71b9SAlex Elder 	/*
4790a30b71b9SAlex Elder 	 * Get the id from the image id object.  If it's not a
4791a30b71b9SAlex Elder 	 * format 2 image, we'll get ENOENT back, and we'll assume
4792a30b71b9SAlex Elder 	 * it's a format 1 image.
4793a30b71b9SAlex Elder 	 */
4794a30b71b9SAlex Elder 	ret = rbd_dev_image_id(rbd_dev);
4795a30b71b9SAlex Elder 	if (ret)
4796a30b71b9SAlex Elder 		ret = rbd_dev_v1_probe(rbd_dev);
4797a30b71b9SAlex Elder 	else
4798a30b71b9SAlex Elder 		ret = rbd_dev_v2_probe(rbd_dev);
479983a06263SAlex Elder 	if (ret) {
4800a30b71b9SAlex Elder 		dout("probe failed, returning %d\n", ret);
4801a30b71b9SAlex Elder 
4802a30b71b9SAlex Elder 		return ret;
4803a30b71b9SAlex Elder 	}
4804a30b71b9SAlex Elder 
480583a06263SAlex Elder 	ret = rbd_dev_probe_finish(rbd_dev);
480683a06263SAlex Elder 	if (ret)
480783a06263SAlex Elder 		rbd_header_free(&rbd_dev->header);
480883a06263SAlex Elder 
480983a06263SAlex Elder 	return ret;
481083a06263SAlex Elder }
481183a06263SAlex Elder 
481259c2be1eSYehuda Sadeh static ssize_t rbd_add(struct bus_type *bus,
481359c2be1eSYehuda Sadeh 		       const char *buf,
481459c2be1eSYehuda Sadeh 		       size_t count)
4815602adf40SYehuda Sadeh {
4816cb8627c7SAlex Elder 	struct rbd_device *rbd_dev = NULL;
4817dc79b113SAlex Elder 	struct ceph_options *ceph_opts = NULL;
48184e9afebaSAlex Elder 	struct rbd_options *rbd_opts = NULL;
4819859c31dfSAlex Elder 	struct rbd_spec *spec = NULL;
48209d3997fdSAlex Elder 	struct rbd_client *rbdc;
482127cc2594SAlex Elder 	struct ceph_osd_client *osdc;
482227cc2594SAlex Elder 	int rc = -ENOMEM;
4823602adf40SYehuda Sadeh 
4824602adf40SYehuda Sadeh 	if (!try_module_get(THIS_MODULE))
4825602adf40SYehuda Sadeh 		return -ENODEV;
4826602adf40SYehuda Sadeh 
4827a725f65eSAlex Elder 	/* parse add command */
4828859c31dfSAlex Elder 	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
4829dc79b113SAlex Elder 	if (rc < 0)
4830bd4ba655SAlex Elder 		goto err_out_module;
4831a725f65eSAlex Elder 
48329d3997fdSAlex Elder 	rbdc = rbd_get_client(ceph_opts);
48339d3997fdSAlex Elder 	if (IS_ERR(rbdc)) {
48349d3997fdSAlex Elder 		rc = PTR_ERR(rbdc);
48350ddebc0cSAlex Elder 		goto err_out_args;
48369d3997fdSAlex Elder 	}
4837c53d5893SAlex Elder 	ceph_opts = NULL;	/* rbd_dev client now owns this */
4838602adf40SYehuda Sadeh 
4839602adf40SYehuda Sadeh 	/* pick the pool */
48409d3997fdSAlex Elder 	osdc = &rbdc->client->osdc;
4841859c31dfSAlex Elder 	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
4842602adf40SYehuda Sadeh 	if (rc < 0)
4843602adf40SYehuda Sadeh 		goto err_out_client;
4844859c31dfSAlex Elder 	spec->pool_id = (u64) rc;
4845859c31dfSAlex Elder 
48460903e875SAlex Elder 	/* The ceph file layout needs to fit pool id in 32 bits */
48470903e875SAlex Elder 
48480903e875SAlex Elder 	if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
48490903e875SAlex Elder 		rc = -EIO;
48500903e875SAlex Elder 		goto err_out_client;
48510903e875SAlex Elder 	}
48520903e875SAlex Elder 
4853c53d5893SAlex Elder 	rbd_dev = rbd_dev_create(rbdc, spec);
4854bd4ba655SAlex Elder 	if (!rbd_dev)
4855bd4ba655SAlex Elder 		goto err_out_client;
4856c53d5893SAlex Elder 	rbdc = NULL;		/* rbd_dev now owns this */
4857c53d5893SAlex Elder 	spec = NULL;		/* rbd_dev now owns this */
4858602adf40SYehuda Sadeh 
4859bd4ba655SAlex Elder 	rbd_dev->mapping.read_only = rbd_opts->read_only;
4860c53d5893SAlex Elder 	kfree(rbd_opts);
4861c53d5893SAlex Elder 	rbd_opts = NULL;	/* done with this */
4862bd4ba655SAlex Elder 
4863a30b71b9SAlex Elder 	rc = rbd_dev_probe(rbd_dev);
4864a30b71b9SAlex Elder 	if (rc < 0)
4865c53d5893SAlex Elder 		goto err_out_rbd_dev;
486605fd6f6fSAlex Elder 
4867602adf40SYehuda Sadeh 	return count;
4868c53d5893SAlex Elder err_out_rbd_dev:
4869c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
4870bd4ba655SAlex Elder err_out_client:
48719d3997fdSAlex Elder 	rbd_put_client(rbdc);
48720ddebc0cSAlex Elder err_out_args:
487378cea76eSAlex Elder 	if (ceph_opts)
487478cea76eSAlex Elder 		ceph_destroy_options(ceph_opts);
48754e9afebaSAlex Elder 	kfree(rbd_opts);
4876859c31dfSAlex Elder 	rbd_spec_put(spec);
4877bd4ba655SAlex Elder err_out_module:
4878bd4ba655SAlex Elder 	module_put(THIS_MODULE);
487927cc2594SAlex Elder 
4880602adf40SYehuda Sadeh 	dout("Error adding device %s\n", buf);
488127cc2594SAlex Elder 
488227cc2594SAlex Elder 	return (ssize_t) rc;
4883602adf40SYehuda Sadeh }
4884602adf40SYehuda Sadeh 
4885de71a297SAlex Elder static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
4886602adf40SYehuda Sadeh {
4887602adf40SYehuda Sadeh 	struct list_head *tmp;
4888602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev;
4889602adf40SYehuda Sadeh 
4890e124a82fSAlex Elder 	spin_lock(&rbd_dev_list_lock);
4891602adf40SYehuda Sadeh 	list_for_each(tmp, &rbd_dev_list) {
4892602adf40SYehuda Sadeh 		rbd_dev = list_entry(tmp, struct rbd_device, node);
4893de71a297SAlex Elder 		if (rbd_dev->dev_id == dev_id) {
4894e124a82fSAlex Elder 			spin_unlock(&rbd_dev_list_lock);
4895602adf40SYehuda Sadeh 			return rbd_dev;
4896602adf40SYehuda Sadeh 		}
4897e124a82fSAlex Elder 	}
4898e124a82fSAlex Elder 	spin_unlock(&rbd_dev_list_lock);
4899602adf40SYehuda Sadeh 	return NULL;
4900602adf40SYehuda Sadeh }
4901602adf40SYehuda Sadeh 
4902dfc5606dSYehuda Sadeh static void rbd_dev_release(struct device *dev)
4903602adf40SYehuda Sadeh {
4904593a9e7bSAlex Elder 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4905602adf40SYehuda Sadeh 
490659c2be1eSYehuda Sadeh 	if (rbd_dev->watch_event)
49079969ebc5SAlex Elder 		rbd_dev_header_watch_sync(rbd_dev, 0);
4908602adf40SYehuda Sadeh 
4909602adf40SYehuda Sadeh 	/* clean up and free blkdev */
4910602adf40SYehuda Sadeh 	rbd_free_disk(rbd_dev);
4911602adf40SYehuda Sadeh 	unregister_blkdev(rbd_dev->major, rbd_dev->name);
491232eec68dSAlex Elder 
49132ac4e75dSAlex Elder 	/* release allocated disk header fields */
49142ac4e75dSAlex Elder 	rbd_header_free(&rbd_dev->header);
49152ac4e75dSAlex Elder 
491632eec68dSAlex Elder 	/* done with the id, and with the rbd_dev */
4917e2839308SAlex Elder 	rbd_dev_id_put(rbd_dev);
4918c53d5893SAlex Elder 	rbd_assert(rbd_dev->rbd_client != NULL);
4919c53d5893SAlex Elder 	rbd_dev_destroy(rbd_dev);
4920602adf40SYehuda Sadeh 
4921602adf40SYehuda Sadeh 	/* release module ref */
4922602adf40SYehuda Sadeh 	module_put(THIS_MODULE);
4923602adf40SYehuda Sadeh }
4924602adf40SYehuda Sadeh 
/*
 * Remove one rbd device: drop all of its snapshots, then take it
 * off the rbd bus.
 */
static void __rbd_remove(struct rbd_device *rbd_dev)
{
	rbd_remove_all_snaps(rbd_dev);
	rbd_bus_del_dev(rbd_dev);
}
49302f82ee54SAlex Elder 
4931dfc5606dSYehuda Sadeh static ssize_t rbd_remove(struct bus_type *bus,
4932602adf40SYehuda Sadeh 			  const char *buf,
4933602adf40SYehuda Sadeh 			  size_t count)
4934602adf40SYehuda Sadeh {
4935602adf40SYehuda Sadeh 	struct rbd_device *rbd_dev = NULL;
4936602adf40SYehuda Sadeh 	int target_id, rc;
4937602adf40SYehuda Sadeh 	unsigned long ul;
4938602adf40SYehuda Sadeh 	int ret = count;
4939602adf40SYehuda Sadeh 
4940602adf40SYehuda Sadeh 	rc = strict_strtoul(buf, 10, &ul);
4941602adf40SYehuda Sadeh 	if (rc)
4942602adf40SYehuda Sadeh 		return rc;
4943602adf40SYehuda Sadeh 
4944602adf40SYehuda Sadeh 	/* convert to int; abort if we lost anything in the conversion */
4945602adf40SYehuda Sadeh 	target_id = (int) ul;
4946602adf40SYehuda Sadeh 	if (target_id != ul)
4947602adf40SYehuda Sadeh 		return -EINVAL;
4948602adf40SYehuda Sadeh 
4949602adf40SYehuda Sadeh 	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
4950602adf40SYehuda Sadeh 
4951602adf40SYehuda Sadeh 	rbd_dev = __rbd_get_dev(target_id);
4952602adf40SYehuda Sadeh 	if (!rbd_dev) {
4953602adf40SYehuda Sadeh 		ret = -ENOENT;
4954602adf40SYehuda Sadeh 		goto done;
4955602adf40SYehuda Sadeh 	}
4956602adf40SYehuda Sadeh 
4957a14ea269SAlex Elder 	spin_lock_irq(&rbd_dev->lock);
4958b82d167bSAlex Elder 	if (rbd_dev->open_count)
495942382b70SAlex Elder 		ret = -EBUSY;
4960b82d167bSAlex Elder 	else
4961b82d167bSAlex Elder 		set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
4962a14ea269SAlex Elder 	spin_unlock_irq(&rbd_dev->lock);
4963b82d167bSAlex Elder 	if (ret < 0)
496442382b70SAlex Elder 		goto done;
496542382b70SAlex Elder 
49662f82ee54SAlex Elder 	while (rbd_dev->parent_spec) {
49672f82ee54SAlex Elder 		struct rbd_device *first = rbd_dev;
49682f82ee54SAlex Elder 		struct rbd_device *second = first->parent;
49692f82ee54SAlex Elder 		struct rbd_device *third;
49702f82ee54SAlex Elder 
49712f82ee54SAlex Elder 		/*
49722f82ee54SAlex Elder 		 * Follow to the parent with no grandparent and
49732f82ee54SAlex Elder 		 * remove it.
49742f82ee54SAlex Elder 		 */
49752f82ee54SAlex Elder 		while (second && (third = second->parent)) {
49762f82ee54SAlex Elder 			first = second;
49772f82ee54SAlex Elder 			second = third;
49782f82ee54SAlex Elder 		}
49792f82ee54SAlex Elder 		__rbd_remove(second);
49802f82ee54SAlex Elder 		rbd_spec_put(first->parent_spec);
49812f82ee54SAlex Elder 		first->parent_spec = NULL;
49822f82ee54SAlex Elder 		first->parent_overlap = 0;
49832f82ee54SAlex Elder 		first->parent = NULL;
49842f82ee54SAlex Elder 	}
49852f82ee54SAlex Elder 	__rbd_remove(rbd_dev);
4986602adf40SYehuda Sadeh 
4987602adf40SYehuda Sadeh done:
4988602adf40SYehuda Sadeh 	mutex_unlock(&ctl_mutex);
4989aafb230eSAlex Elder 
4990602adf40SYehuda Sadeh 	return ret;
4991602adf40SYehuda Sadeh }
4992602adf40SYehuda Sadeh 
4993602adf40SYehuda Sadeh /*
4994602adf40SYehuda Sadeh  * create control files in sysfs
4995dfc5606dSYehuda Sadeh  * /sys/bus/rbd/...
4996602adf40SYehuda Sadeh  */
4997602adf40SYehuda Sadeh static int rbd_sysfs_init(void)
4998602adf40SYehuda Sadeh {
4999dfc5606dSYehuda Sadeh 	int ret;
5000602adf40SYehuda Sadeh 
5001fed4c143SAlex Elder 	ret = device_register(&rbd_root_dev);
5002dfc5606dSYehuda Sadeh 	if (ret < 0)
5003dfc5606dSYehuda Sadeh 		return ret;
5004602adf40SYehuda Sadeh 
5005fed4c143SAlex Elder 	ret = bus_register(&rbd_bus_type);
5006fed4c143SAlex Elder 	if (ret < 0)
5007fed4c143SAlex Elder 		device_unregister(&rbd_root_dev);
5008602adf40SYehuda Sadeh 
5009602adf40SYehuda Sadeh 	return ret;
5010602adf40SYehuda Sadeh }
5011602adf40SYehuda Sadeh 
5012602adf40SYehuda Sadeh static void rbd_sysfs_cleanup(void)
5013602adf40SYehuda Sadeh {
5014dfc5606dSYehuda Sadeh 	bus_unregister(&rbd_bus_type);
5015fed4c143SAlex Elder 	device_unregister(&rbd_root_dev);
5016602adf40SYehuda Sadeh }
5017602adf40SYehuda Sadeh 
5018cc344fa1SAlex Elder static int __init rbd_init(void)
5019602adf40SYehuda Sadeh {
5020602adf40SYehuda Sadeh 	int rc;
5021602adf40SYehuda Sadeh 
50221e32d34cSAlex Elder 	if (!libceph_compatible(NULL)) {
50231e32d34cSAlex Elder 		rbd_warn(NULL, "libceph incompatibility (quitting)");
50241e32d34cSAlex Elder 
50251e32d34cSAlex Elder 		return -EINVAL;
50261e32d34cSAlex Elder 	}
5027602adf40SYehuda Sadeh 	rc = rbd_sysfs_init();
5028602adf40SYehuda Sadeh 	if (rc)
5029602adf40SYehuda Sadeh 		return rc;
5030f0f8cef5SAlex Elder 	pr_info("loaded " RBD_DRV_NAME_LONG "\n");
5031602adf40SYehuda Sadeh 	return 0;
5032602adf40SYehuda Sadeh }
5033602adf40SYehuda Sadeh 
5034cc344fa1SAlex Elder static void __exit rbd_exit(void)
5035602adf40SYehuda Sadeh {
5036602adf40SYehuda Sadeh 	rbd_sysfs_cleanup();
5037602adf40SYehuda Sadeh }
5038602adf40SYehuda Sadeh 
5039602adf40SYehuda Sadeh module_init(rbd_init);
5040602adf40SYehuda Sadeh module_exit(rbd_exit);
5041602adf40SYehuda Sadeh 
5042602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
5043602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
5044602adf40SYehuda Sadeh MODULE_DESCRIPTION("rados block device");
5045602adf40SYehuda Sadeh 
5046602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */
5047602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
5048602adf40SYehuda Sadeh 
5049602adf40SYehuda Sadeh MODULE_LICENSE("GPL");
5050