xref: /openbmc/linux/drivers/block/rbd.c (revision 8795a739)
1 
2 /*
3    rbd.c -- Export ceph rados objects as a Linux block device
4 
5 
6    based on drivers/block/osdblk.c:
7 
8    Copyright 2009 Red Hat, Inc.
9 
10    This program is free software; you can redistribute it and/or modify
11    it under the terms of the GNU General Public License as published by
12    the Free Software Foundation.
13 
14    This program is distributed in the hope that it will be useful,
15    but WITHOUT ANY WARRANTY; without even the implied warranty of
16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17    GNU General Public License for more details.
18 
19    You should have received a copy of the GNU General Public License
20    along with this program; see the file COPYING.  If not, write to
21    the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
22 
23 
24 
25    For usage instructions, please refer to:
26 
27                  Documentation/ABI/testing/sysfs-bus-rbd
28 
29  */
30 
31 #include <linux/ceph/libceph.h>
32 #include <linux/ceph/osd_client.h>
33 #include <linux/ceph/mon_client.h>
34 #include <linux/ceph/cls_lock_client.h>
35 #include <linux/ceph/striper.h>
36 #include <linux/ceph/decode.h>
37 #include <linux/parser.h>
38 #include <linux/bsearch.h>
39 
40 #include <linux/kernel.h>
41 #include <linux/device.h>
42 #include <linux/module.h>
43 #include <linux/blk-mq.h>
44 #include <linux/fs.h>
45 #include <linux/blkdev.h>
46 #include <linux/slab.h>
47 #include <linux/idr.h>
48 #include <linux/workqueue.h>
49 
50 #include "rbd_types.h"
51 
/* Defined unconditionally, so the checking variant of rbd_assert() (selected by #ifdef RBD_DEBUG) is always built. */
52 #define RBD_DEBUG	/* Activate rbd_assert() calls */
53 
54 /*
55  * Increment the given counter and return its updated value.
56  * If the counter is already 0 it will not be incremented.
57  * If the counter is already at its maximum value returns
58  * -EINVAL without updating it.
59  */
60 static int atomic_inc_return_safe(atomic_t *v)
61 {
62 	unsigned int counter;
63 
64 	counter = (unsigned int)atomic_fetch_add_unless(v, 1, 0);
65 	if (counter <= (unsigned int)INT_MAX)
66 		return (int)counter;
67 
68 	atomic_dec(v);
69 
70 	return -EINVAL;
71 }
72 
73 /* Decrement the counter.  Return the resulting value, or -EINVAL */
74 static int atomic_dec_return_safe(atomic_t *v)
75 {
76 	int counter;
77 
78 	counter = atomic_dec_return(v);
79 	if (counter >= 0)
80 		return counter;
81 
82 	atomic_inc(v);
83 
84 	return -EINVAL;
85 }
86 
/* Driver name; rbd_warn() prefixes every message with it. */
87 #define RBD_DRV_NAME "rbd"
88 
89 #define RBD_MINORS_PER_MAJOR		256
/* Shift used by rbd_dev_id_to_minor() / minor_to_rbd_dev_id() to convert between dev_id and minor. */
90 #define RBD_SINGLE_MAJOR_PART_SHIFT	4
91 
92 #define RBD_MAX_PARENT_CHAIN_LEN	16
93 
94 #define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
/* NAME_MAX minus the prefix length (the "- 1" leaves out the prefix's NUL terminator). */
95 #define RBD_MAX_SNAP_NAME_LEN	\
96 			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
97 
98 #define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */
99 
100 #define RBD_SNAP_HEAD_NAME	"-"
101 
102 #define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */
103 
104 /* This allows a single page to hold an image name sent by OSD */
105 #define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
106 #define RBD_IMAGE_ID_LEN_MAX	64
107 
108 #define RBD_OBJ_PREFIX_LEN_MAX	64
109 
110 #define RBD_NOTIFY_TIMEOUT	5	/* seconds */
111 #define RBD_RETRY_DELAY		msecs_to_jiffies(1000)	/* 1000 ms, converted to jiffies */
112 
113 /* Feature bits */
/* Each feature is one bit of a 64-bit mask; note that bit 6 has no define here. */
114 
115 #define RBD_FEATURE_LAYERING		(1ULL<<0)
116 #define RBD_FEATURE_STRIPINGV2		(1ULL<<1)
117 #define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL<<2)
118 #define RBD_FEATURE_OBJECT_MAP		(1ULL<<3)
119 #define RBD_FEATURE_FAST_DIFF		(1ULL<<4)
120 #define RBD_FEATURE_DEEP_FLATTEN	(1ULL<<5)
121 #define RBD_FEATURE_DATA_POOL		(1ULL<<7)
122 #define RBD_FEATURE_OPERATIONS		(1ULL<<8)
123 
/* Union of every feature bit defined above. */
124 #define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
125 				 RBD_FEATURE_STRIPINGV2 |	\
126 				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
127 				 RBD_FEATURE_OBJECT_MAP |	\
128 				 RBD_FEATURE_FAST_DIFF |	\
129 				 RBD_FEATURE_DEEP_FLATTEN |	\
130 				 RBD_FEATURE_DATA_POOL |	\
131 				 RBD_FEATURE_OPERATIONS)
132 
133 /* Features supported by this (client software) implementation. */
/* Currently identical to RBD_FEATURES_ALL; supported_features_show() prints this mask in hex. */
134 
135 #define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)
136 
137 /*
138  * An RBD device name will be "rbd#", where the "rbd" comes from
139  * RBD_DRV_NAME above, and # is a unique integer identifier.
140  */
141 #define DEV_NAME_LEN		32	/* size of rbd_device.name[] */
142 
143 /*
144  * block device image metadata (in-memory version)
145  */
146 struct rbd_image_header {
147 	/* These six fields never change for a given rbd image */
148 	char *object_prefix;	/* NULL until first filled in; rbd_header_from_disk() uses that as its "first time" test */
149 	__u8 obj_order;		/* object size is 1 << obj_order bytes, see rbd_obj_bytes() */
150 	u64 stripe_unit;	/* 0 here or in stripe_count: rbd_init_layout() substitutes one object-sized stripe */
151 	u64 stripe_count;
152 	s64 data_pool_id;	/* CEPH_NOPOOL: rbd_init_layout() falls back to spec->pool_id */
153 	u64 features;		/* Might be changeable someday? */
154 
155 	/* The remaining fields need to be updated occasionally */
156 	u64 image_size;
157 	struct ceph_snap_context *snapc;
158 	char *snap_names;	/* format 1 only */
159 	u64 *snap_sizes;	/* format 1 only */
160 };
161 
162 /*
163  * An rbd image specification.
164  *
165  * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
166  * identify an image.  Each rbd_dev structure includes a pointer to
167  * an rbd_spec structure that encapsulates this identity.
168  *
169  * Each of the id's in an rbd_spec has an associated name.  For a
170  * user-mapped image, the names are supplied and the id's associated
171  * with them are looked up.  For a layered image, a parent image is
172  * defined by the tuple, and the names are looked up.
173  *
174  * An rbd_dev structure contains a parent_spec pointer which is
175  * non-null if the image it represents is a child in a layered
176  * image.  This pointer will refer to the rbd_spec structure used
177  * by the parent rbd_dev for its own identity (i.e., the structure
178  * is shared between the parent and child).
179  *
180  * Since these structures are populated once, during the discovery
181  * phase of image construction, they are effectively immutable so
182  * we make no effort to synchronize access to them.
183  *
184  * Note that code herein does not assume the image name is known (it
185  * could be a null pointer).
186  */
187 struct rbd_spec {
188 	u64		pool_id;
189 	const char	*pool_name;
190 	const char	*pool_ns;	/* NULL if default, never "" */
191 
192 	const char	*image_id;
193 	const char	*image_name;	/* may be NULL; rbd_warn() falls back to image_id */
194 
195 	u64		snap_id;	/* compared against CEPH_NOSNAP in rbd_ioctl_set_ro() to detect a mapped snapshot */
196 	const char	*snap_name;
197 
198 	struct kref	kref;		/* presumably counts the sharers described above -- confirm */
199 };
200 
201 /*
202  * an instance of the client.  multiple devices may share an rbd client.
203  */
204 struct rbd_client {
205 	struct ceph_client	*client;	/* created by ceph_create_client() in rbd_client_create() */
206 	struct kref		kref;		/* taken in __rbd_get_client(), dropped in rbd_put_client() */
207 	struct list_head	node;		/* link on rbd_client_list, under rbd_client_list_lock */
208 };
209 
/*
 * Tracks a set of outstanding sub-operations: how many are still
 * pending and the first error any of them reported.  Consumed by
 * pending_result_dec().
 */
210 struct pending_result {
211 	int			result;		/* first nonzero result */
212 	int			num_pending;	/* decremented by pending_result_dec(); must be > 0 on entry */
213 };
214 
/* Forward declaration: struct rbd_obj_request points back at its image request. */
215 struct rbd_img_request;
216 
/* How the data buffer of a request is described (stored in rbd_img_request.data_type). */
217 enum obj_request_type {
218 	OBJ_REQUEST_NODATA = 1,
219 	OBJ_REQUEST_BIO,	/* pointer into provided bio (list) */
220 	OBJ_REQUEST_BVECS,	/* pointer into provided bio_vec array */
221 	OBJ_REQUEST_OWN_BVECS,	/* private bio_vec array, doesn't own pages */
222 };
223 
/* Kind of I/O an image request performs; obj_op_name() maps each value to a printable name. */
224 enum obj_operation_type {
225 	OBJ_OP_READ = 1,
226 	OBJ_OP_WRITE,
227 	OBJ_OP_DISCARD,
228 	OBJ_OP_ZEROOUT,
229 };
230 
/* Bit flags stored in rbd_obj_request.flags. */
231 #define RBD_OBJ_FLAG_DELETION			(1U << 0)
232 #define RBD_OBJ_FLAG_COPYUP_ENABLED		(1U << 1)
233 #define RBD_OBJ_FLAG_COPYUP_ZEROS		(1U << 2)
234 #define RBD_OBJ_FLAG_MAY_EXIST			(1U << 3)
235 #define RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT	(1U << 4)
236 
/* Per-object read progress; stored in rbd_obj_request.read_state. */
237 enum rbd_obj_read_state {
238 	RBD_OBJ_READ_START = 1,
239 	RBD_OBJ_READ_OBJECT,
240 	RBD_OBJ_READ_PARENT,
241 };
242 
243 /*
244  * Writes go through the following state machine to deal with
245  * layering:
246  *
247  *            . . . . . RBD_OBJ_WRITE_GUARD. . . . . . . . . . . . . .
248  *            .                 |                                    .
249  *            .                 v                                    .
250  *            .    RBD_OBJ_WRITE_READ_FROM_PARENT. . .               .
251  *            .                 |                    .               .
252  *            .                 v                    v (deep-copyup  .
253  *    (image  .   RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC   .  not needed)  .
254  * flattened) v                 |                    .               .
255  *            .                 v                    .               .
256  *            . . . .RBD_OBJ_WRITE_COPYUP_OPS. . . . .      (copyup  .
257  *                              |                        not needed) v
258  *                              v                                    .
259  *                            done . . . . . . . . . . . . . . . . . .
260  *                              ^
261  *                              |
262  *                     RBD_OBJ_WRITE_FLAT
263  *
264  * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether
265  * assert_exists guard is needed or not (in some cases it's not needed
266  * even if there is a parent).
267  */
/*
 * NOTE(review): none of the state names used in the diagram above
 * (RBD_OBJ_WRITE_GUARD, _READ_FROM_PARENT, _COPYUP_EMPTY_SNAPC,
 * _COPYUP_OPS, _FLAT) exist in the enum below; the diagram appears to
 * describe an earlier version of the state machine -- confirm and
 * refresh it.
 */
/* Per-object write progress; stored in rbd_obj_request.write_state. */
268 enum rbd_obj_write_state {
269 	RBD_OBJ_WRITE_START = 1,
270 	RBD_OBJ_WRITE_PRE_OBJECT_MAP,
271 	RBD_OBJ_WRITE_OBJECT,
272 	__RBD_OBJ_WRITE_COPYUP,
273 	RBD_OBJ_WRITE_COPYUP,
274 	RBD_OBJ_WRITE_POST_OBJECT_MAP,
275 };
276 
/* Copyup sub-state of a write; stored in rbd_obj_request.copyup_state. */
277 enum rbd_obj_copyup_state {
278 	RBD_OBJ_COPYUP_START = 1,
279 	RBD_OBJ_COPYUP_READ_PARENT,
280 	__RBD_OBJ_COPYUP_OBJECT_MAPS,
281 	RBD_OBJ_COPYUP_OBJECT_MAPS,
282 	__RBD_OBJ_COPYUP_WRITE_OBJECT,
283 	RBD_OBJ_COPYUP_WRITE_OBJECT,
284 };
285 
/*
 * Per-object request.  Instances are linked on their image request's
 * object_extents list through ex.oe_item (see for_each_obj_request()).
 */
286 struct rbd_obj_request {
287 	struct ceph_object_extent ex;
288 	unsigned int		flags;	/* RBD_OBJ_FLAG_* */
289 	union {
290 		enum rbd_obj_read_state	 read_state;	/* for reads */
291 		enum rbd_obj_write_state write_state;	/* for writes */
292 	};
293 
294 	struct rbd_img_request	*img_request;
295 	struct ceph_file_extent	*img_extents;
296 	u32			num_img_extents;
297 
	/* Data position; which member applies presumably follows the image request's data_type -- confirm */
298 	union {
299 		struct ceph_bio_iter	bio_pos;
300 		struct {
301 			struct ceph_bvec_iter	bvec_pos;
302 			u32			bvec_count;
303 			u32			bvec_idx;
304 		};
305 	};
306 
307 	enum rbd_obj_copyup_state copyup_state;
308 	struct bio_vec		*copyup_bvecs;
309 	u32			copyup_bvec_count;
310 
311 	struct list_head	osd_reqs;	/* w/ r_private_item */
312 
313 	struct mutex		state_mutex;
314 	struct pending_result	pending;
315 	struct kref		kref;
316 };
317 
/* Flag values for rbd_img_request.flags. */
318 enum img_req_flags {
319 	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
320 	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
321 };
322 
/* Image request progress; stored in rbd_img_request.state. */
323 enum rbd_img_state {
324 	RBD_IMG_START = 1,
325 	RBD_IMG_EXCLUSIVE_LOCK,
326 	__RBD_IMG_OBJECT_REQUESTS,
327 	RBD_IMG_OBJECT_REQUESTS,
328 };
329 
/*
 * Image-level request.  Its per-object pieces hang off object_extents
 * and are walked with for_each_obj_request().
 */
330 struct rbd_img_request {
331 	struct rbd_device	*rbd_dev;
332 	enum obj_operation_type	op_type;
333 	enum obj_request_type	data_type;
334 	unsigned long		flags;		/* see enum img_req_flags */
335 	enum rbd_img_state	state;
336 	union {
337 		u64			snap_id;	/* for reads */
338 		struct ceph_snap_context *snapc;	/* for writes */
339 	};
	/* Initiator; which member is valid presumably follows IMG_REQ_CHILD -- confirm */
340 	union {
341 		struct request		*rq;		/* block request */
342 		struct rbd_obj_request	*obj_request;	/* obj req initiator */
343 	};
344 
345 	struct list_head	lock_item;
346 	struct list_head	object_extents;	/* obj_req.ex structs */
347 
348 	struct mutex		state_mutex;
349 	struct pending_result	pending;
350 	struct work_struct	work;
351 	int			work_result;
352 	struct kref		kref;
353 };
354 
/*
 * Iterate over the object requests of image request @ireq, using
 * @oreq as the cursor.  The _safe variant takes an extra cursor @n and
 * expands to list_for_each_entry_safe().
 */
355 #define for_each_obj_request(ireq, oreq) \
356 	list_for_each_entry(oreq, &(ireq)->object_extents, ex.oe_item)
357 #define for_each_obj_request_safe(ireq, oreq, n) \
358 	list_for_each_entry_safe(oreq, n, &(ireq)->object_extents, ex.oe_item)
359 
/* Values of rbd_device.watch_state. */
360 enum rbd_watch_state {
361 	RBD_WATCH_STATE_UNREGISTERED,
362 	RBD_WATCH_STATE_REGISTERED,
363 	RBD_WATCH_STATE_ERROR,
364 };
365 
/*
 * Values of rbd_device.lock_state.  __rbd_is_lock_owner() treats both
 * LOCKED and RELEASING as "this device owns the lock".
 */
366 enum rbd_lock_state {
367 	RBD_LOCK_STATE_UNLOCKED,
368 	RBD_LOCK_STATE_LOCKED,
369 	RBD_LOCK_STATE_RELEASING,
370 };
371 
372 /* WatchNotify::ClientId */
/* Used for rbd_device.owner_cid. */
373 struct rbd_client_id {
374 	u64 gid;
375 	u64 handle;
376 };
377 
/* Embedded in struct rbd_device as ->mapping; presumably the size/features of what is mapped -- confirm. */
378 struct rbd_mapping {
379 	u64                     size;
380 	u64                     features;
381 };
382 
383 /*
384  * a single device
385  */
/* The bits kept in ->flags are listed in enum rbd_dev_flags. */
386 struct rbd_device {
387 	int			dev_id;		/* blkdev unique id */
388 
389 	int			major;		/* blkdev assigned major */
390 	int			minor;
391 	struct gendisk		*disk;		/* blkdev's gendisk and rq */
392 
393 	u32			image_format;	/* Either 1 or 2 */
394 	struct rbd_client	*rbd_client;
395 
396 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
397 
398 	spinlock_t		lock;		/* queue, flags, open_count */
399 
400 	struct rbd_image_header	header;
401 	unsigned long		flags;		/* possibly lock protected */
402 	struct rbd_spec		*spec;
403 	struct rbd_options	*opts;
404 	char			*config_info;	/* add{,_single_major} string */
405 
406 	struct ceph_object_id	header_oid;
407 	struct ceph_object_locator header_oloc;
408 
409 	struct ceph_file_layout	layout;		/* used for all rbd requests */
410 
411 	struct mutex		watch_mutex;
412 	enum rbd_watch_state	watch_state;
413 	struct ceph_osd_linger_request *watch_handle;
414 	u64			watch_cookie;
415 	struct delayed_work	watch_dwork;
416 
417 	struct rw_semaphore	lock_rwsem;	/* must be held to read lock_state, see __rbd_is_lock_owner() */
418 	enum rbd_lock_state	lock_state;
419 	char			lock_cookie[32];
420 	struct rbd_client_id	owner_cid;
421 	struct work_struct	acquired_lock_work;
422 	struct work_struct	released_lock_work;
423 	struct delayed_work	lock_dwork;
424 	struct work_struct	unlock_work;
425 	spinlock_t		lock_lists_lock;
426 	struct list_head	acquiring_list;
427 	struct list_head	running_list;
428 	struct completion	acquire_wait;
429 	int			acquire_err;
430 	struct completion	releasing_wait;
431 
432 	spinlock_t		object_map_lock;
433 	u8			*object_map;
434 	u64			object_map_size;	/* in objects */
435 	u64			object_map_flags;
436 
437 	struct workqueue_struct	*task_wq;
438 
439 	struct rbd_spec		*parent_spec;
440 	u64			parent_overlap;
441 	atomic_t		parent_ref;
442 	struct rbd_device	*parent;
443 
444 	/* Block layer tags. */
445 	struct blk_mq_tag_set	tag_set;
446 
447 	/* protects updating the header */
448 	struct rw_semaphore     header_rwsem;
449 
450 	struct rbd_mapping	mapping;
451 
452 	struct list_head	node;
453 
454 	/* sysfs related */
455 	struct device		dev;		/* pinned by rbd_open(), released by rbd_release() */
456 	unsigned long		open_count;	/* protected by lock */
457 };
458 
459 /*
460  * Flag bits for rbd_dev->flags:
461  * - REMOVING (which is coupled with rbd_dev->open_count) is protected
462  *   by rbd_dev->lock
463  */
/* rbd_open() refuses new opens with -ENOENT once REMOVING is set. */
464 enum rbd_dev_flags {
465 	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
466 	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
467 };
468 
/* Held by rbd_get_client() across its find-or-create sequence. */
469 static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */
470 
471 static LIST_HEAD(rbd_dev_list);    /* devices */
472 static DEFINE_SPINLOCK(rbd_dev_list_lock);
473 
474 static LIST_HEAD(rbd_client_list);		/* clients */
/* Taken around every rbd_client_list access in rbd_client_create(), rbd_client_find() and rbd_client_release(). */
475 static DEFINE_SPINLOCK(rbd_client_list_lock);
476 
477 /* Slab caches for frequently-allocated structures */
478 
479 static struct kmem_cache	*rbd_img_request_cache;
480 static struct kmem_cache	*rbd_obj_request_cache;
481 
482 static int rbd_major;
483 static DEFINE_IDA(rbd_dev_id_ida);
484 
485 static struct workqueue_struct *rbd_wq;
486 
/* Statically allocated snap context whose refcount starts at 1; all other fields are zero-initialised. */
487 static struct ceph_snap_context rbd_empty_snapc = {
488 	.nref = REFCOUNT_INIT(1),
489 };
490 
491 /*
492  * single-major requires >= 0.75 version of userspace rbd utility.
493  */
/* When false, rbd_bus_is_visible() hides the add_single_major / remove_single_major bus attributes. */
494 static bool single_major = true;
495 module_param(single_major, bool, 0444);
496 MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");
497 
/*
 * Forward declarations of the store handlers behind the write-only bus
 * attributes declared with BUS_ATTR_WO() further down, plus
 * rbd_dev_image_probe().  Their definitions are not in this part of
 * the file.
 */
498 static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count);
499 static ssize_t remove_store(struct bus_type *bus, const char *buf,
500 			    size_t count);
501 static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
502 				      size_t count);
503 static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
504 					 size_t count);
505 static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);
506 
507 static int rbd_dev_id_to_minor(int dev_id)
508 {
509 	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
510 }
511 
512 static int minor_to_rbd_dev_id(int minor)
513 {
514 	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
515 }
516 
517 static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
518 {
519 	lockdep_assert_held(&rbd_dev->lock_rwsem);
520 
521 	return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
522 	       rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
523 }
524 
525 static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
526 {
527 	bool is_lock_owner;
528 
529 	down_read(&rbd_dev->lock_rwsem);
530 	is_lock_owner = __rbd_is_lock_owner(rbd_dev);
531 	up_read(&rbd_dev->lock_rwsem);
532 	return is_lock_owner;
533 }
534 
535 static ssize_t supported_features_show(struct bus_type *bus, char *buf)
536 {
537 	return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
538 }
539 
/*
 * Bus attributes.  The bus_attr_<name> objects referenced in the array
 * below come from these macros; presumably each BUS_ATTR_WO(x) binds
 * to the x_store() handler declared above and BUS_ATTR_RO(x) to
 * x_show() -- confirm against the driver-core macros.
 */
540 static BUS_ATTR_WO(add);
541 static BUS_ATTR_WO(remove);
542 static BUS_ATTR_WO(add_single_major);
543 static BUS_ATTR_WO(remove_single_major);
544 static BUS_ATTR_RO(supported_features);
545 
/* NULL-terminated attribute list; visibility is filtered by rbd_bus_is_visible(). */
546 static struct attribute *rbd_bus_attrs[] = {
547 	&bus_attr_add.attr,
548 	&bus_attr_remove.attr,
549 	&bus_attr_add_single_major.attr,
550 	&bus_attr_remove_single_major.attr,
551 	&bus_attr_supported_features.attr,
552 	NULL,
553 };
554 
555 static umode_t rbd_bus_is_visible(struct kobject *kobj,
556 				  struct attribute *attr, int index)
557 {
558 	if (!single_major &&
559 	    (attr == &bus_attr_add_single_major.attr ||
560 	     attr == &bus_attr_remove_single_major.attr))
561 		return 0;
562 
563 	return attr->mode;
564 }
565 
/* Attribute group for the bus: the attribute array plus its visibility filter. */
566 static const struct attribute_group rbd_bus_group = {
567 	.attrs = rbd_bus_attrs,
568 	.is_visible = rbd_bus_is_visible,
569 };
/* Presumably generates the rbd_bus_groups array used just below -- confirm against the macro. */
570 __ATTRIBUTE_GROUPS(rbd_bus);
571 
/* The "rbd" bus, exposing the attribute groups above. */
572 static struct bus_type rbd_bus_type = {
573 	.name		= "rbd",
574 	.bus_groups	= rbd_bus_groups,
575 };
576 
/*
 * Release callback for rbd_root_dev.  Intentionally empty: the device
 * below is a statically allocated object, so there is nothing to free.
 */
577 static void rbd_root_dev_release(struct device *dev)
578 {
579 }
580 
/* Statically allocated device named "rbd". */
581 static struct device rbd_root_dev = {
582 	.init_name =    "rbd",
583 	.release =      rbd_root_dev_release,
584 };
585 
586 static __printf(2, 3)
587 void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
588 {
589 	struct va_format vaf;
590 	va_list args;
591 
592 	va_start(args, fmt);
593 	vaf.fmt = fmt;
594 	vaf.va = &args;
595 
596 	if (!rbd_dev)
597 		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
598 	else if (rbd_dev->disk)
599 		printk(KERN_WARNING "%s: %s: %pV\n",
600 			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
601 	else if (rbd_dev->spec && rbd_dev->spec->image_name)
602 		printk(KERN_WARNING "%s: image %s: %pV\n",
603 			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
604 	else if (rbd_dev->spec && rbd_dev->spec->image_id)
605 		printk(KERN_WARNING "%s: id %s: %pV\n",
606 			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
607 	else	/* punt */
608 		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
609 			RBD_DRV_NAME, rbd_dev, &vaf);
610 	va_end(args);
611 }
612 
/*
 * rbd_assert() -- driver-local assertion.
 *
 * With RBD_DEBUG defined, a false @expr logs the enclosing function,
 * the line number and the stringified expression at KERN_ERR and then
 * calls BUG().  Without RBD_DEBUG the macro expands to a no-op and
 * @expr is not evaluated, so @expr must be free of side effects.
 *
 * The checking variant is wrapped in do { } while (0) so that it is a
 * single statement, exactly like the ((void) 0) variant: it needs the
 * trailing semicolon and is safe as the body of an unbraced if/else.
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
625 
/*
 * Forward declarations for static helpers whose definitions are not
 * in this part of the file: device refresh/header helpers, the format
 * 2 snapshot queries, and the object/image request state handlers.
 */
626 static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
627 
628 static int rbd_dev_refresh(struct rbd_device *rbd_dev);
629 static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
630 static int rbd_dev_header_info(struct rbd_device *rbd_dev);
631 static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
632 static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
633 					u64 snap_id);
634 static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
635 				u8 *order, u64 *snap_size);
636 static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
637 		u64 *snap_features);
638 static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev);
639 
640 static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result);
641 static void rbd_img_handle_request(struct rbd_img_request *img_req, int result);
642 
643 /*
644  * Return true if nothing else is pending.
645  */
646 static bool pending_result_dec(struct pending_result *pending, int *result)
647 {
648 	rbd_assert(pending->num_pending > 0);
649 
650 	if (*result && !pending->result)
651 		pending->result = *result;
652 	if (--pending->num_pending)
653 		return false;
654 
655 	*result = pending->result;
656 	return true;
657 }
658 
659 static int rbd_open(struct block_device *bdev, fmode_t mode)
660 {
661 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
662 	bool removing = false;
663 
664 	spin_lock_irq(&rbd_dev->lock);
665 	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
666 		removing = true;
667 	else
668 		rbd_dev->open_count++;
669 	spin_unlock_irq(&rbd_dev->lock);
670 	if (removing)
671 		return -ENOENT;
672 
673 	(void) get_device(&rbd_dev->dev);
674 
675 	return 0;
676 }
677 
678 static void rbd_release(struct gendisk *disk, fmode_t mode)
679 {
680 	struct rbd_device *rbd_dev = disk->private_data;
681 	unsigned long open_count_before;
682 
683 	spin_lock_irq(&rbd_dev->lock);
684 	open_count_before = rbd_dev->open_count--;
685 	spin_unlock_irq(&rbd_dev->lock);
686 	rbd_assert(open_count_before > 0);
687 
688 	put_device(&rbd_dev->dev);
689 }
690 
691 static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
692 {
693 	int ro;
694 
695 	if (get_user(ro, (int __user *)arg))
696 		return -EFAULT;
697 
698 	/* Snapshots can't be marked read-write */
699 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro)
700 		return -EROFS;
701 
702 	/* Let blkdev_roset() handle it */
703 	return -ENOTTY;
704 }
705 
706 static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
707 			unsigned int cmd, unsigned long arg)
708 {
709 	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
710 	int ret;
711 
712 	switch (cmd) {
713 	case BLKROSET:
714 		ret = rbd_ioctl_set_ro(rbd_dev, arg);
715 		break;
716 	default:
717 		ret = -ENOTTY;
718 	}
719 
720 	return ret;
721 }
722 
#ifdef CONFIG_COMPAT
/*
 * Compat ioctl entry point; only built with CONFIG_COMPAT.  Forwards
 * all arguments unchanged to rbd_ioctl() and returns its result.
 */
static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
				unsigned int cmd, unsigned long arg)
{
	int ret = rbd_ioctl(bdev, mode, cmd, arg);

	return ret;
}
#endif /* CONFIG_COMPAT */
730 
/* Block device operations for rbd disks; the compat ioctl hook exists only with CONFIG_COMPAT. */
731 static const struct block_device_operations rbd_bd_ops = {
732 	.owner			= THIS_MODULE,
733 	.open			= rbd_open,
734 	.release		= rbd_release,
735 	.ioctl			= rbd_ioctl,
736 #ifdef CONFIG_COMPAT
737 	.compat_ioctl		= rbd_compat_ioctl,
738 #endif
739 };
740 
741 /*
742  * Initialize an rbd client instance.  Success or not, this function
743  * consumes ceph_opts.  Caller holds client_mutex.
744  */
745 static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
746 {
747 	struct rbd_client *rbdc;
748 	int ret = -ENOMEM;
749 
750 	dout("%s:\n", __func__);
751 	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
752 	if (!rbdc)
753 		goto out_opt;
754 
755 	kref_init(&rbdc->kref);
756 	INIT_LIST_HEAD(&rbdc->node);
757 
758 	rbdc->client = ceph_create_client(ceph_opts, rbdc);
759 	if (IS_ERR(rbdc->client))
760 		goto out_rbdc;
761 	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
762 
763 	ret = ceph_open_session(rbdc->client);
764 	if (ret < 0)
765 		goto out_client;
766 
767 	spin_lock(&rbd_client_list_lock);
768 	list_add_tail(&rbdc->node, &rbd_client_list);
769 	spin_unlock(&rbd_client_list_lock);
770 
771 	dout("%s: rbdc %p\n", __func__, rbdc);
772 
773 	return rbdc;
774 out_client:
775 	ceph_destroy_client(rbdc->client);
776 out_rbdc:
777 	kfree(rbdc);
778 out_opt:
779 	if (ceph_opts)
780 		ceph_destroy_options(ceph_opts);
781 	dout("%s: error %d\n", __func__, ret);
782 
783 	return ERR_PTR(ret);
784 }
785 
786 static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
787 {
788 	kref_get(&rbdc->kref);
789 
790 	return rbdc;
791 }
792 
793 /*
794  * Find a ceph client with specific addr and configuration.  If
795  * found, bump its reference count.
796  */
797 static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
798 {
799 	struct rbd_client *client_node;
800 	bool found = false;
801 
802 	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
803 		return NULL;
804 
805 	spin_lock(&rbd_client_list_lock);
806 	list_for_each_entry(client_node, &rbd_client_list, node) {
807 		if (!ceph_compare_options(ceph_opts, client_node->client)) {
808 			__rbd_get_client(client_node);
809 
810 			found = true;
811 			break;
812 		}
813 	}
814 	spin_unlock(&rbd_client_list_lock);
815 
816 	return found ? client_node : NULL;
817 }
818 
819 /*
820  * (Per device) rbd map options
821  */
/*
 * Ordering matters: parse_rbd_opts_token() treats every token below
 * Opt_last_int as taking an int argument and every token between
 * Opt_last_int and Opt_last_string as taking a string argument.
 */
822 enum {
823 	Opt_queue_depth,
824 	Opt_alloc_size,
825 	Opt_lock_timeout,
826 	Opt_last_int,
827 	/* int args above */
828 	Opt_pool_ns,
829 	Opt_last_string,
830 	/* string args above */
831 	Opt_read_only,
832 	Opt_read_write,
833 	Opt_lock_on_read,
834 	Opt_exclusive,
835 	Opt_notrim,
836 	Opt_err
837 };
838 
/*
 * Option patterns handed to match_token() by parse_rbd_opts_token().
 * The table ends with the {Opt_err, NULL} entry.
 */
839 static match_table_t rbd_opts_tokens = {
840 	{Opt_queue_depth, "queue_depth=%d"},
841 	{Opt_alloc_size, "alloc_size=%d"},
842 	{Opt_lock_timeout, "lock_timeout=%d"},
843 	/* int args above */
844 	{Opt_pool_ns, "_pool_ns=%s"},
845 	/* string args above */
846 	{Opt_read_only, "read_only"},
847 	{Opt_read_only, "ro"},		/* Alternate spelling */
848 	{Opt_read_write, "read_write"},
849 	{Opt_read_write, "rw"},		/* Alternate spelling */
850 	{Opt_lock_on_read, "lock_on_read"},
851 	{Opt_exclusive, "exclusive"},
852 	{Opt_notrim, "notrim"},
853 	{Opt_err, NULL}
854 };
855 
/* Per-mapping options as filled in by parse_rbd_opts_token(). */
856 struct rbd_options {
857 	int	queue_depth;	/* >= 1 */
858 	int	alloc_size;	/* power of 2, >= SECTOR_SIZE */
859 	unsigned long	lock_timeout;	/* in jiffies; 0 means wait forever */
860 	bool	read_only;
861 	bool	lock_on_read;
862 	bool	exclusive;
863 	bool	trim;		/* cleared by the "notrim" option */
864 };
865 
/* Default values for the options above; where they get applied is not shown in this part of the file. */
866 #define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_MAX_RQ
867 #define RBD_ALLOC_SIZE_DEFAULT	(64 * 1024)
868 #define RBD_LOCK_TIMEOUT_DEFAULT 0  /* no timeout */
869 #define RBD_READ_ONLY_DEFAULT	false
870 #define RBD_LOCK_ON_READ_DEFAULT false
871 #define RBD_EXCLUSIVE_DEFAULT	false
872 #define RBD_TRIM_DEFAULT	true
873 
/* Context passed to parse_rbd_opts_token() through its void *private argument. */
874 struct parse_rbd_opts_ctx {
875 	struct rbd_spec		*spec;	/* receives pool_ns */
876 	struct rbd_options	*opts;	/* receives every other option */
877 };
878 
879 static int parse_rbd_opts_token(char *c, void *private)
880 {
881 	struct parse_rbd_opts_ctx *pctx = private;
882 	substring_t argstr[MAX_OPT_ARGS];
883 	int token, intval, ret;
884 
885 	token = match_token(c, rbd_opts_tokens, argstr);
886 	if (token < Opt_last_int) {
887 		ret = match_int(&argstr[0], &intval);
888 		if (ret < 0) {
889 			pr_err("bad option arg (not int) at '%s'\n", c);
890 			return ret;
891 		}
892 		dout("got int token %d val %d\n", token, intval);
893 	} else if (token > Opt_last_int && token < Opt_last_string) {
894 		dout("got string token %d val %s\n", token, argstr[0].from);
895 	} else {
896 		dout("got token %d\n", token);
897 	}
898 
899 	switch (token) {
900 	case Opt_queue_depth:
901 		if (intval < 1) {
902 			pr_err("queue_depth out of range\n");
903 			return -EINVAL;
904 		}
905 		pctx->opts->queue_depth = intval;
906 		break;
907 	case Opt_alloc_size:
908 		if (intval < SECTOR_SIZE) {
909 			pr_err("alloc_size out of range\n");
910 			return -EINVAL;
911 		}
912 		if (!is_power_of_2(intval)) {
913 			pr_err("alloc_size must be a power of 2\n");
914 			return -EINVAL;
915 		}
916 		pctx->opts->alloc_size = intval;
917 		break;
918 	case Opt_lock_timeout:
919 		/* 0 is "wait forever" (i.e. infinite timeout) */
920 		if (intval < 0 || intval > INT_MAX / 1000) {
921 			pr_err("lock_timeout out of range\n");
922 			return -EINVAL;
923 		}
924 		pctx->opts->lock_timeout = msecs_to_jiffies(intval * 1000);
925 		break;
926 	case Opt_pool_ns:
927 		kfree(pctx->spec->pool_ns);
928 		pctx->spec->pool_ns = match_strdup(argstr);
929 		if (!pctx->spec->pool_ns)
930 			return -ENOMEM;
931 		break;
932 	case Opt_read_only:
933 		pctx->opts->read_only = true;
934 		break;
935 	case Opt_read_write:
936 		pctx->opts->read_only = false;
937 		break;
938 	case Opt_lock_on_read:
939 		pctx->opts->lock_on_read = true;
940 		break;
941 	case Opt_exclusive:
942 		pctx->opts->exclusive = true;
943 		break;
944 	case Opt_notrim:
945 		pctx->opts->trim = false;
946 		break;
947 	default:
948 		/* libceph prints "bad option" msg */
949 		return -EINVAL;
950 	}
951 
952 	return 0;
953 }
954 
955 static char* obj_op_name(enum obj_operation_type op_type)
956 {
957 	switch (op_type) {
958 	case OBJ_OP_READ:
959 		return "read";
960 	case OBJ_OP_WRITE:
961 		return "write";
962 	case OBJ_OP_DISCARD:
963 		return "discard";
964 	case OBJ_OP_ZEROOUT:
965 		return "zeroout";
966 	default:
967 		return "???";
968 	}
969 }
970 
971 /*
972  * Destroy ceph client
973  *
974  * Caller must hold rbd_client_list_lock.
975  */
976 static void rbd_client_release(struct kref *kref)
977 {
978 	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
979 
980 	dout("%s: rbdc %p\n", __func__, rbdc);
981 	spin_lock(&rbd_client_list_lock);
982 	list_del(&rbdc->node);
983 	spin_unlock(&rbd_client_list_lock);
984 
985 	ceph_destroy_client(rbdc->client);
986 	kfree(rbdc);
987 }
988 
989 /*
990  * Drop reference to ceph client node. If it's not referenced anymore, release
991  * it.
992  */
993 static void rbd_put_client(struct rbd_client *rbdc)
994 {
995 	if (rbdc)
996 		kref_put(&rbdc->kref, rbd_client_release);
997 }
998 
999 /*
1000  * Get a ceph client with specific addr and configuration, if one does
1001  * not exist create it.  Either way, ceph_opts is consumed by this
1002  * function.
1003  */
1004 static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
1005 {
1006 	struct rbd_client *rbdc;
1007 	int ret;
1008 
1009 	mutex_lock(&client_mutex);
1010 	rbdc = rbd_client_find(ceph_opts);
1011 	if (rbdc) {
1012 		ceph_destroy_options(ceph_opts);
1013 
1014 		/*
1015 		 * Using an existing client.  Make sure ->pg_pools is up to
1016 		 * date before we look up the pool id in do_rbd_add().
1017 		 */
1018 		ret = ceph_wait_for_latest_osdmap(rbdc->client,
1019 					rbdc->client->options->mount_timeout);
1020 		if (ret) {
1021 			rbd_warn(NULL, "failed to get latest osdmap: %d", ret);
1022 			rbd_put_client(rbdc);
1023 			rbdc = ERR_PTR(ret);
1024 		}
1025 	} else {
1026 		rbdc = rbd_client_create(ceph_opts);
1027 	}
1028 	mutex_unlock(&client_mutex);
1029 
1030 	return rbdc;
1031 }
1032 
1033 static bool rbd_image_format_valid(u32 image_format)
1034 {
1035 	return image_format == 1 || image_format == 2;
1036 }
1037 
1038 static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
1039 {
1040 	size_t size;
1041 	u32 snap_count;
1042 
1043 	/* The header has to start with the magic rbd header text */
1044 	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
1045 		return false;
1046 
1047 	/* The bio layer requires at least sector-sized I/O */
1048 
1049 	if (ondisk->options.order < SECTOR_SHIFT)
1050 		return false;
1051 
1052 	/* If we use u64 in a few spots we may be able to loosen this */
1053 
1054 	if (ondisk->options.order > 8 * sizeof (int) - 1)
1055 		return false;
1056 
1057 	/*
1058 	 * The size of a snapshot header has to fit in a size_t, and
1059 	 * that limits the number of snapshots.
1060 	 */
1061 	snap_count = le32_to_cpu(ondisk->snap_count);
1062 	size = SIZE_MAX - sizeof (struct ceph_snap_context);
1063 	if (snap_count > size / sizeof (__le64))
1064 		return false;
1065 
1066 	/*
1067 	 * Not only that, but the size of the entire the snapshot
1068 	 * header must also be representable in a size_t.
1069 	 */
1070 	size -= snap_count * sizeof (__le64);
1071 	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
1072 		return false;
1073 
1074 	return true;
1075 }
1076 
1077 /*
1078  * returns the size of an object in the image
1079  */
1080 static u32 rbd_obj_bytes(struct rbd_image_header *header)
1081 {
1082 	return 1U << header->obj_order;
1083 }
1084 
1085 static void rbd_init_layout(struct rbd_device *rbd_dev)
1086 {
1087 	if (rbd_dev->header.stripe_unit == 0 ||
1088 	    rbd_dev->header.stripe_count == 0) {
1089 		rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
1090 		rbd_dev->header.stripe_count = 1;
1091 	}
1092 
1093 	rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
1094 	rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
1095 	rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
1096 	rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
1097 			  rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
1098 	RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
1099 }
1100 
/*
 * Fill an rbd image header with information from the given format 1
 * on-disk header.
 *
 * On the first call for a device (header->object_prefix is still
 * NULL) the object prefix, object order and layout are set up as
 * well.  On later calls only the image size and the snapshot data
 * (context, names, sizes) are replaced, and the previous snapshot
 * data is released.
 *
 * Returns 0 on success, -ENOMEM if an allocation fails, or -EIO if
 * the on-disk snapshot names length does not fit in a size_t.  On
 * failure the in-memory header is left untouched.
 */
static int rbd_header_from_disk(struct rbd_device *rbd_dev,
				 struct rbd_image_header_ondisk *ondisk)
{
	struct rbd_image_header *header = &rbd_dev->header;
	bool first_time = header->object_prefix == NULL;
	struct ceph_snap_context *snapc;
	char *object_prefix = NULL;
	char *snap_names = NULL;
	u64 *snap_sizes = NULL;
	u32 snap_count;
	int ret = -ENOMEM;	/* default error for the out_err path */
	u32 i;

	/* Allocate this now to avoid having to handle failure below */

	if (first_time) {
		object_prefix = kstrndup(ondisk->object_prefix,
					 sizeof(ondisk->object_prefix),
					 GFP_KERNEL);
		if (!object_prefix)
			return -ENOMEM;
	}

	/* Allocate the snapshot context and fill it in */

	snap_count = le32_to_cpu(ondisk->snap_count);
	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
	if (!snapc)
		goto out_err;
	snapc->seq = le64_to_cpu(ondisk->snap_seq);
	if (snap_count) {
		struct rbd_image_snap_ondisk *snaps;
		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);

		/* We'll keep a copy of the snapshot names... */

		if (snap_names_len > (u64)SIZE_MAX)
			goto out_2big;
		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
		if (!snap_names)
			goto out_err;

		/* ...as well as the array of their sizes. */
		snap_sizes = kmalloc_array(snap_count,
					   sizeof(*header->snap_sizes),
					   GFP_KERNEL);
		if (!snap_sizes)
			goto out_err;

		/*
		 * Copy the names, and fill in each snapshot's id
		 * and size.
		 *
		 * Note that rbd_dev_v1_header_info() guarantees the
		 * ondisk buffer we're working with has
		 * snap_names_len bytes beyond the end of the
		 * snapshot id array, so this memcpy() is safe.
		 */
		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
		snaps = ondisk->snaps;
		for (i = 0; i < snap_count; i++) {
			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
		}
	}

	/* We won't fail any more, fill in the header */

	if (first_time) {
		header->object_prefix = object_prefix;
		header->obj_order = ondisk->options.order;
		rbd_init_layout(rbd_dev);
	} else {
		/* Refresh: drop the snapshot data being replaced */
		ceph_put_snap_context(header->snapc);
		kfree(header->snap_names);
		kfree(header->snap_sizes);
	}

	/* The remaining fields always get updated (when we refresh) */

	header->image_size = le64_to_cpu(ondisk->image_size);
	header->snapc = snapc;
	header->snap_names = snap_names;
	header->snap_sizes = snap_sizes;

	return 0;
out_2big:
	ret = -EIO;
out_err:
	/* Release whatever was allocated so far; unset pointers are NULL */
	kfree(snap_sizes);
	kfree(snap_names);
	ceph_put_snap_context(snapc);
	kfree(object_prefix);

	return ret;
}
1201 
1202 static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
1203 {
1204 	const char *snap_name;
1205 
1206 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
1207 
1208 	/* Skip over names until we find the one we are looking for */
1209 
1210 	snap_name = rbd_dev->header.snap_names;
1211 	while (which--)
1212 		snap_name += strlen(snap_name) + 1;
1213 
1214 	return kstrdup(snap_name, GFP_KERNEL);
1215 }
1216 
1217 /*
1218  * Snapshot id comparison function for use with qsort()/bsearch().
1219  * Note that result is for snapshots in *descending* order.
1220  */
1221 static int snapid_compare_reverse(const void *s1, const void *s2)
1222 {
1223 	u64 snap_id1 = *(u64 *)s1;
1224 	u64 snap_id2 = *(u64 *)s2;
1225 
1226 	if (snap_id1 < snap_id2)
1227 		return 1;
1228 	return snap_id1 == snap_id2 ? 0 : -1;
1229 }
1230 
1231 /*
1232  * Search a snapshot context to see if the given snapshot id is
1233  * present.
1234  *
1235  * Returns the position of the snapshot id in the array if it's found,
1236  * or BAD_SNAP_INDEX otherwise.
1237  *
1238  * Note: The snapshot array is in kept sorted (by the osd) in
1239  * reverse order, highest snapshot id first.
1240  */
1241 static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
1242 {
1243 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
1244 	u64 *found;
1245 
1246 	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
1247 				sizeof (snap_id), snapid_compare_reverse);
1248 
1249 	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
1250 }
1251 
1252 static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
1253 					u64 snap_id)
1254 {
1255 	u32 which;
1256 	const char *snap_name;
1257 
1258 	which = rbd_dev_snap_index(rbd_dev, snap_id);
1259 	if (which == BAD_SNAP_INDEX)
1260 		return ERR_PTR(-ENOENT);
1261 
1262 	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
1263 	return snap_name ? snap_name : ERR_PTR(-ENOMEM);
1264 }
1265 
1266 static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
1267 {
1268 	if (snap_id == CEPH_NOSNAP)
1269 		return RBD_SNAP_HEAD_NAME;
1270 
1271 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1272 	if (rbd_dev->image_format == 1)
1273 		return rbd_dev_v1_snap_name(rbd_dev, snap_id);
1274 
1275 	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
1276 }
1277 
1278 static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
1279 				u64 *snap_size)
1280 {
1281 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1282 	if (snap_id == CEPH_NOSNAP) {
1283 		*snap_size = rbd_dev->header.image_size;
1284 	} else if (rbd_dev->image_format == 1) {
1285 		u32 which;
1286 
1287 		which = rbd_dev_snap_index(rbd_dev, snap_id);
1288 		if (which == BAD_SNAP_INDEX)
1289 			return -ENOENT;
1290 
1291 		*snap_size = rbd_dev->header.snap_sizes[which];
1292 	} else {
1293 		u64 size = 0;
1294 		int ret;
1295 
1296 		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
1297 		if (ret)
1298 			return ret;
1299 
1300 		*snap_size = size;
1301 	}
1302 	return 0;
1303 }
1304 
1305 static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
1306 			u64 *snap_features)
1307 {
1308 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1309 	if (snap_id == CEPH_NOSNAP) {
1310 		*snap_features = rbd_dev->header.features;
1311 	} else if (rbd_dev->image_format == 1) {
1312 		*snap_features = 0;	/* No features for format 1 */
1313 	} else {
1314 		u64 features = 0;
1315 		int ret;
1316 
1317 		ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
1318 		if (ret)
1319 			return ret;
1320 
1321 		*snap_features = features;
1322 	}
1323 	return 0;
1324 }
1325 
1326 static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
1327 {
1328 	u64 snap_id = rbd_dev->spec->snap_id;
1329 	u64 size = 0;
1330 	u64 features = 0;
1331 	int ret;
1332 
1333 	ret = rbd_snap_size(rbd_dev, snap_id, &size);
1334 	if (ret)
1335 		return ret;
1336 	ret = rbd_snap_features(rbd_dev, snap_id, &features);
1337 	if (ret)
1338 		return ret;
1339 
1340 	rbd_dev->mapping.size = size;
1341 	rbd_dev->mapping.features = features;
1342 
1343 	return 0;
1344 }
1345 
1346 static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
1347 {
1348 	rbd_dev->mapping.size = 0;
1349 	rbd_dev->mapping.features = 0;
1350 }
1351 
1352 static void zero_bvec(struct bio_vec *bv)
1353 {
1354 	void *buf;
1355 	unsigned long flags;
1356 
1357 	buf = bvec_kmap_irq(bv, &flags);
1358 	memset(buf, 0, bv->bv_len);
1359 	flush_dcache_page(bv->bv_page);
1360 	bvec_kunmap_irq(buf, &flags);
1361 }
1362 
1363 static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes)
1364 {
1365 	struct ceph_bio_iter it = *bio_pos;
1366 
1367 	ceph_bio_iter_advance(&it, off);
1368 	ceph_bio_iter_advance_step(&it, bytes, ({
1369 		zero_bvec(&bv);
1370 	}));
1371 }
1372 
1373 static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
1374 {
1375 	struct ceph_bvec_iter it = *bvec_pos;
1376 
1377 	ceph_bvec_iter_advance(&it, off);
1378 	ceph_bvec_iter_advance_step(&it, bytes, ({
1379 		zero_bvec(&bv);
1380 	}));
1381 }
1382 
1383 /*
1384  * Zero a range in @obj_req data buffer defined by a bio (list) or
1385  * (private) bio_vec array.
1386  *
1387  * @off is relative to the start of the data buffer.
1388  */
1389 static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
1390 			       u32 bytes)
1391 {
1392 	dout("%s %p data buf %u~%u\n", __func__, obj_req, off, bytes);
1393 
1394 	switch (obj_req->img_request->data_type) {
1395 	case OBJ_REQUEST_BIO:
1396 		zero_bios(&obj_req->bio_pos, off, bytes);
1397 		break;
1398 	case OBJ_REQUEST_BVECS:
1399 	case OBJ_REQUEST_OWN_BVECS:
1400 		zero_bvecs(&obj_req->bvec_pos, off, bytes);
1401 		break;
1402 	default:
1403 		BUG();
1404 	}
1405 }
1406 
1407 static void rbd_obj_request_destroy(struct kref *kref);
1408 static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1409 {
1410 	rbd_assert(obj_request != NULL);
1411 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
1412 		kref_read(&obj_request->kref));
1413 	kref_put(&obj_request->kref, rbd_obj_request_destroy);
1414 }
1415 
1416 static void rbd_img_request_destroy(struct kref *kref);
1417 static void rbd_img_request_put(struct rbd_img_request *img_request)
1418 {
1419 	rbd_assert(img_request != NULL);
1420 	dout("%s: img %p (was %d)\n", __func__, img_request,
1421 		kref_read(&img_request->kref));
1422 	kref_put(&img_request->kref, rbd_img_request_destroy);
1423 }
1424 
1425 static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1426 					struct rbd_obj_request *obj_request)
1427 {
1428 	rbd_assert(obj_request->img_request == NULL);
1429 
1430 	/* Image request now owns object's original reference */
1431 	obj_request->img_request = img_request;
1432 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
1433 }
1434 
1435 static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1436 					struct rbd_obj_request *obj_request)
1437 {
1438 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
1439 	list_del(&obj_request->ex.oe_item);
1440 	rbd_assert(obj_request->img_request == img_request);
1441 	rbd_obj_request_put(obj_request);
1442 }
1443 
1444 static void rbd_osd_submit(struct ceph_osd_request *osd_req)
1445 {
1446 	struct rbd_obj_request *obj_req = osd_req->r_priv;
1447 
1448 	dout("%s osd_req %p for obj_req %p objno %llu %llu~%llu\n",
1449 	     __func__, osd_req, obj_req, obj_req->ex.oe_objno,
1450 	     obj_req->ex.oe_off, obj_req->ex.oe_len);
1451 	ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
1452 }
1453 
/*
 * The default/initial value for all image request flags is 0.  Each
 * is conditionally set to 1 at image request initialization time
 * and currently never changes thereafter.
 */
1459 static void img_request_layered_set(struct rbd_img_request *img_request)
1460 {
1461 	set_bit(IMG_REQ_LAYERED, &img_request->flags);
1462 	smp_mb();
1463 }
1464 
1465 static void img_request_layered_clear(struct rbd_img_request *img_request)
1466 {
1467 	clear_bit(IMG_REQ_LAYERED, &img_request->flags);
1468 	smp_mb();
1469 }
1470 
1471 static bool img_request_layered_test(struct rbd_img_request *img_request)
1472 {
1473 	smp_mb();
1474 	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1475 }
1476 
1477 static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req)
1478 {
1479 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1480 
1481 	return !obj_req->ex.oe_off &&
1482 	       obj_req->ex.oe_len == rbd_dev->layout.object_size;
1483 }
1484 
1485 static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req)
1486 {
1487 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1488 
1489 	return obj_req->ex.oe_off + obj_req->ex.oe_len ==
1490 					rbd_dev->layout.object_size;
1491 }
1492 
1493 /*
1494  * Must be called after rbd_obj_calc_img_extents().
1495  */
1496 static bool rbd_obj_copyup_enabled(struct rbd_obj_request *obj_req)
1497 {
1498 	if (!obj_req->num_img_extents ||
1499 	    (rbd_obj_is_entire(obj_req) &&
1500 	     !obj_req->img_request->snapc->num_snaps))
1501 		return false;
1502 
1503 	return true;
1504 }
1505 
1506 static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req)
1507 {
1508 	return ceph_file_extents_bytes(obj_req->img_extents,
1509 				       obj_req->num_img_extents);
1510 }
1511 
1512 static bool rbd_img_is_write(struct rbd_img_request *img_req)
1513 {
1514 	switch (img_req->op_type) {
1515 	case OBJ_OP_READ:
1516 		return false;
1517 	case OBJ_OP_WRITE:
1518 	case OBJ_OP_DISCARD:
1519 	case OBJ_OP_ZEROOUT:
1520 		return true;
1521 	default:
1522 		BUG();
1523 	}
1524 }
1525 
1526 static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
1527 {
1528 	struct rbd_obj_request *obj_req = osd_req->r_priv;
1529 	int result;
1530 
1531 	dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
1532 	     osd_req->r_result, obj_req);
1533 
1534 	/*
1535 	 * Writes aren't allowed to return a data payload.  In some
1536 	 * guarded write cases (e.g. stat + zero on an empty object)
1537 	 * a stat response makes it through, but we don't care.
1538 	 */
1539 	if (osd_req->r_result > 0 && rbd_img_is_write(obj_req->img_request))
1540 		result = 0;
1541 	else
1542 		result = osd_req->r_result;
1543 
1544 	rbd_obj_handle_request(obj_req, result);
1545 }
1546 
1547 static void rbd_osd_format_read(struct ceph_osd_request *osd_req)
1548 {
1549 	struct rbd_obj_request *obj_request = osd_req->r_priv;
1550 
1551 	osd_req->r_flags = CEPH_OSD_FLAG_READ;
1552 	osd_req->r_snapid = obj_request->img_request->snap_id;
1553 }
1554 
1555 static void rbd_osd_format_write(struct ceph_osd_request *osd_req)
1556 {
1557 	struct rbd_obj_request *obj_request = osd_req->r_priv;
1558 
1559 	osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
1560 	ktime_get_real_ts64(&osd_req->r_mtime);
1561 	osd_req->r_data_offset = obj_request->ex.oe_off;
1562 }
1563 
1564 static struct ceph_osd_request *
1565 __rbd_obj_add_osd_request(struct rbd_obj_request *obj_req,
1566 			  struct ceph_snap_context *snapc, int num_ops)
1567 {
1568 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1569 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1570 	struct ceph_osd_request *req;
1571 	const char *name_format = rbd_dev->image_format == 1 ?
1572 				      RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
1573 	int ret;
1574 
1575 	req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO);
1576 	if (!req)
1577 		return ERR_PTR(-ENOMEM);
1578 
1579 	list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
1580 	req->r_callback = rbd_osd_req_callback;
1581 	req->r_priv = obj_req;
1582 
1583 	/*
1584 	 * Data objects may be stored in a separate pool, but always in
1585 	 * the same namespace in that pool as the header in its pool.
1586 	 */
1587 	ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
1588 	req->r_base_oloc.pool = rbd_dev->layout.pool_id;
1589 
1590 	ret = ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
1591 			       rbd_dev->header.object_prefix,
1592 			       obj_req->ex.oe_objno);
1593 	if (ret)
1594 		return ERR_PTR(ret);
1595 
1596 	return req;
1597 }
1598 
1599 static struct ceph_osd_request *
1600 rbd_obj_add_osd_request(struct rbd_obj_request *obj_req, int num_ops)
1601 {
1602 	return __rbd_obj_add_osd_request(obj_req, obj_req->img_request->snapc,
1603 					 num_ops);
1604 }
1605 
1606 static struct rbd_obj_request *rbd_obj_request_create(void)
1607 {
1608 	struct rbd_obj_request *obj_request;
1609 
1610 	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
1611 	if (!obj_request)
1612 		return NULL;
1613 
1614 	ceph_object_extent_init(&obj_request->ex);
1615 	INIT_LIST_HEAD(&obj_request->osd_reqs);
1616 	mutex_init(&obj_request->state_mutex);
1617 	kref_init(&obj_request->kref);
1618 
1619 	dout("%s %p\n", __func__, obj_request);
1620 	return obj_request;
1621 }
1622 
1623 static void rbd_obj_request_destroy(struct kref *kref)
1624 {
1625 	struct rbd_obj_request *obj_request;
1626 	struct ceph_osd_request *osd_req;
1627 	u32 i;
1628 
1629 	obj_request = container_of(kref, struct rbd_obj_request, kref);
1630 
1631 	dout("%s: obj %p\n", __func__, obj_request);
1632 
1633 	while (!list_empty(&obj_request->osd_reqs)) {
1634 		osd_req = list_first_entry(&obj_request->osd_reqs,
1635 				    struct ceph_osd_request, r_private_item);
1636 		list_del_init(&osd_req->r_private_item);
1637 		ceph_osdc_put_request(osd_req);
1638 	}
1639 
1640 	switch (obj_request->img_request->data_type) {
1641 	case OBJ_REQUEST_NODATA:
1642 	case OBJ_REQUEST_BIO:
1643 	case OBJ_REQUEST_BVECS:
1644 		break;		/* Nothing to do */
1645 	case OBJ_REQUEST_OWN_BVECS:
1646 		kfree(obj_request->bvec_pos.bvecs);
1647 		break;
1648 	default:
1649 		BUG();
1650 	}
1651 
1652 	kfree(obj_request->img_extents);
1653 	if (obj_request->copyup_bvecs) {
1654 		for (i = 0; i < obj_request->copyup_bvec_count; i++) {
1655 			if (obj_request->copyup_bvecs[i].bv_page)
1656 				__free_page(obj_request->copyup_bvecs[i].bv_page);
1657 		}
1658 		kfree(obj_request->copyup_bvecs);
1659 	}
1660 
1661 	kmem_cache_free(rbd_obj_request_cache, obj_request);
1662 }
1663 
1664 /* It's OK to call this for a device with no parent */
1665 
1666 static void rbd_spec_put(struct rbd_spec *spec);
1667 static void rbd_dev_unparent(struct rbd_device *rbd_dev)
1668 {
1669 	rbd_dev_remove_parent(rbd_dev);
1670 	rbd_spec_put(rbd_dev->parent_spec);
1671 	rbd_dev->parent_spec = NULL;
1672 	rbd_dev->parent_overlap = 0;
1673 }
1674 
1675 /*
1676  * Parent image reference counting is used to determine when an
1677  * image's parent fields can be safely torn down--after there are no
1678  * more in-flight requests to the parent image.  When the last
1679  * reference is dropped, cleaning them up is safe.
1680  */
1681 static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
1682 {
1683 	int counter;
1684 
1685 	if (!rbd_dev->parent_spec)
1686 		return;
1687 
1688 	counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
1689 	if (counter > 0)
1690 		return;
1691 
1692 	/* Last reference; clean up parent data structures */
1693 
1694 	if (!counter)
1695 		rbd_dev_unparent(rbd_dev);
1696 	else
1697 		rbd_warn(rbd_dev, "parent reference underflow");
1698 }
1699 
1700 /*
1701  * If an image has a non-zero parent overlap, get a reference to its
1702  * parent.
1703  *
1704  * Returns true if the rbd device has a parent with a non-zero
1705  * overlap and a reference for it was successfully taken, or
1706  * false otherwise.
1707  */
1708 static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
1709 {
1710 	int counter = 0;
1711 
1712 	if (!rbd_dev->parent_spec)
1713 		return false;
1714 
1715 	down_read(&rbd_dev->header_rwsem);
1716 	if (rbd_dev->parent_overlap)
1717 		counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
1718 	up_read(&rbd_dev->header_rwsem);
1719 
1720 	if (counter < 0)
1721 		rbd_warn(rbd_dev, "parent reference overflow");
1722 
1723 	return counter > 0;
1724 }
1725 
1726 /*
1727  * Caller is responsible for filling in the list of object requests
1728  * that comprises the image request, and the Linux request pointer
1729  * (if there is one).
1730  */
1731 static struct rbd_img_request *rbd_img_request_create(
1732 					struct rbd_device *rbd_dev,
1733 					enum obj_operation_type op_type,
1734 					struct ceph_snap_context *snapc)
1735 {
1736 	struct rbd_img_request *img_request;
1737 
1738 	img_request = kmem_cache_zalloc(rbd_img_request_cache, GFP_NOIO);
1739 	if (!img_request)
1740 		return NULL;
1741 
1742 	img_request->rbd_dev = rbd_dev;
1743 	img_request->op_type = op_type;
1744 	if (!rbd_img_is_write(img_request))
1745 		img_request->snap_id = rbd_dev->spec->snap_id;
1746 	else
1747 		img_request->snapc = snapc;
1748 
1749 	if (rbd_dev_parent_get(rbd_dev))
1750 		img_request_layered_set(img_request);
1751 
1752 	INIT_LIST_HEAD(&img_request->lock_item);
1753 	INIT_LIST_HEAD(&img_request->object_extents);
1754 	mutex_init(&img_request->state_mutex);
1755 	kref_init(&img_request->kref);
1756 
1757 	return img_request;
1758 }
1759 
1760 static void rbd_img_request_destroy(struct kref *kref)
1761 {
1762 	struct rbd_img_request *img_request;
1763 	struct rbd_obj_request *obj_request;
1764 	struct rbd_obj_request *next_obj_request;
1765 
1766 	img_request = container_of(kref, struct rbd_img_request, kref);
1767 
1768 	dout("%s: img %p\n", __func__, img_request);
1769 
1770 	WARN_ON(!list_empty(&img_request->lock_item));
1771 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1772 		rbd_img_obj_request_del(img_request, obj_request);
1773 
1774 	if (img_request_layered_test(img_request)) {
1775 		img_request_layered_clear(img_request);
1776 		rbd_dev_parent_put(img_request->rbd_dev);
1777 	}
1778 
1779 	if (rbd_img_is_write(img_request))
1780 		ceph_put_snap_context(img_request->snapc);
1781 
1782 	kmem_cache_free(rbd_img_request_cache, img_request);
1783 }
1784 
/*
 * Object map packing: each object's state takes BITS_PER_OBJ bits,
 * so one byte of the map holds OBJS_PER_BYTE states.  OBJ_MASK
 * selects a single state once it has been shifted down to bit 0.
 */
#define BITS_PER_OBJ	2
#define OBJS_PER_BYTE	(BITS_PER_BYTE / BITS_PER_OBJ)
#define OBJ_MASK	((1 << BITS_PER_OBJ) - 1)
1788 
1789 static void __rbd_object_map_index(struct rbd_device *rbd_dev, u64 objno,
1790 				   u64 *index, u8 *shift)
1791 {
1792 	u32 off;
1793 
1794 	rbd_assert(objno < rbd_dev->object_map_size);
1795 	*index = div_u64_rem(objno, OBJS_PER_BYTE, &off);
1796 	*shift = (OBJS_PER_BYTE - off - 1) * BITS_PER_OBJ;
1797 }
1798 
1799 static u8 __rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
1800 {
1801 	u64 index;
1802 	u8 shift;
1803 
1804 	lockdep_assert_held(&rbd_dev->object_map_lock);
1805 	__rbd_object_map_index(rbd_dev, objno, &index, &shift);
1806 	return (rbd_dev->object_map[index] >> shift) & OBJ_MASK;
1807 }
1808 
1809 static void __rbd_object_map_set(struct rbd_device *rbd_dev, u64 objno, u8 val)
1810 {
1811 	u64 index;
1812 	u8 shift;
1813 	u8 *p;
1814 
1815 	lockdep_assert_held(&rbd_dev->object_map_lock);
1816 	rbd_assert(!(val & ~OBJ_MASK));
1817 
1818 	__rbd_object_map_index(rbd_dev, objno, &index, &shift);
1819 	p = &rbd_dev->object_map[index];
1820 	*p = (*p & ~(OBJ_MASK << shift)) | (val << shift);
1821 }
1822 
1823 static u8 rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
1824 {
1825 	u8 state;
1826 
1827 	spin_lock(&rbd_dev->object_map_lock);
1828 	state = __rbd_object_map_get(rbd_dev, objno);
1829 	spin_unlock(&rbd_dev->object_map_lock);
1830 	return state;
1831 }
1832 
1833 static bool use_object_map(struct rbd_device *rbd_dev)
1834 {
1835 	return ((rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) &&
1836 		!(rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID));
1837 }
1838 
1839 static bool rbd_object_map_may_exist(struct rbd_device *rbd_dev, u64 objno)
1840 {
1841 	u8 state;
1842 
1843 	/* fall back to default logic if object map is disabled or invalid */
1844 	if (!use_object_map(rbd_dev))
1845 		return true;
1846 
1847 	state = rbd_object_map_get(rbd_dev, objno);
1848 	return state != OBJECT_NONEXISTENT;
1849 }
1850 
1851 static void rbd_object_map_name(struct rbd_device *rbd_dev, u64 snap_id,
1852 				struct ceph_object_id *oid)
1853 {
1854 	if (snap_id == CEPH_NOSNAP)
1855 		ceph_oid_printf(oid, "%s%s", RBD_OBJECT_MAP_PREFIX,
1856 				rbd_dev->spec->image_id);
1857 	else
1858 		ceph_oid_printf(oid, "%s%s.%016llx", RBD_OBJECT_MAP_PREFIX,
1859 				rbd_dev->spec->image_id, snap_id);
1860 }
1861 
/*
 * Take the exclusive RBD_LOCK_NAME class lock on the HEAD object map
 * object.
 *
 * If the lock attempt returns -EBUSY, the current lockers are looked
 * up and the first one's lock is broken, after which the lock is
 * attempted again.  At most one lock is broken per call: once
 * broke_lock is set, whatever the next attempt returns is final.
 *
 * Returns 0 on success (including -EEXIST from the lock call, which
 * is treated as already holding the lock) or a negative error code.
 */
static int rbd_object_map_lock(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	CEPH_DEFINE_OID_ONSTACK(oid);
	u8 lock_type;
	char *lock_tag;
	struct ceph_locker *lockers;
	u32 num_lockers;
	bool broke_lock = false;
	int ret;

	rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);

again:
	ret = ceph_cls_lock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
			    CEPH_CLS_LOCK_EXCLUSIVE, "", "", "", 0);
	/* Anything but -EBUSY is final; so is -EBUSY after a break */
	if (ret != -EBUSY || broke_lock) {
		if (ret == -EEXIST)
			ret = 0; /* already locked by myself */
		if (ret)
			rbd_warn(rbd_dev, "failed to lock object map: %d", ret);
		return ret;
	}

	ret = ceph_cls_lock_info(osdc, &oid, &rbd_dev->header_oloc,
				 RBD_LOCK_NAME, &lock_type, &lock_tag,
				 &lockers, &num_lockers);
	if (ret) {
		/* -ENOENT from the info query: just retry the lock */
		if (ret == -ENOENT)
			goto again;

		rbd_warn(rbd_dev, "failed to get object map lockers: %d", ret);
		return ret;
	}

	kfree(lock_tag);
	/*
	 * No lockers reported any more: retry the lock.
	 * NOTE(review): lockers is not passed to ceph_free_lockers() on
	 * this path -- confirm nothing needs freeing when num_lockers
	 * is 0.
	 */
	if (num_lockers == 0)
		goto again;

	rbd_warn(rbd_dev, "breaking object map lock owned by %s%llu",
		 ENTITY_NAME(lockers[0].id.name));

	ret = ceph_cls_break_lock(osdc, &oid, &rbd_dev->header_oloc,
				  RBD_LOCK_NAME, lockers[0].id.cookie,
				  &lockers[0].id.name);
	ceph_free_lockers(lockers, num_lockers);
	if (ret) {
		/* -ENOENT from the break: retry without marking a break */
		if (ret == -ENOENT)
			goto again;

		rbd_warn(rbd_dev, "failed to break object map lock: %d", ret);
		return ret;
	}

	broke_lock = true;
	goto again;
}
1919 
1920 static void rbd_object_map_unlock(struct rbd_device *rbd_dev)
1921 {
1922 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1923 	CEPH_DEFINE_OID_ONSTACK(oid);
1924 	int ret;
1925 
1926 	rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);
1927 
1928 	ret = ceph_cls_unlock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
1929 			      "");
1930 	if (ret && ret != -ENOENT)
1931 		rbd_warn(rbd_dev, "failed to unlock object map: %d", ret);
1932 }
1933 
1934 static int decode_object_map_header(void **p, void *end, u64 *object_map_size)
1935 {
1936 	u8 struct_v;
1937 	u32 struct_len;
1938 	u32 header_len;
1939 	void *header_end;
1940 	int ret;
1941 
1942 	ceph_decode_32_safe(p, end, header_len, e_inval);
1943 	header_end = *p + header_len;
1944 
1945 	ret = ceph_start_decoding(p, end, 1, "BitVector header", &struct_v,
1946 				  &struct_len);
1947 	if (ret)
1948 		return ret;
1949 
1950 	ceph_decode_64_safe(p, end, *object_map_size, e_inval);
1951 
1952 	*p = header_end;
1953 	return 0;
1954 
1955 e_inval:
1956 	return -EINVAL;
1957 }
1958 
/*
 * Fetch the object map for the mapped snapshot (or HEAD) with the
 * "rbd" class method "object_map_load" and install a copy of it in
 * rbd_dev->object_map / rbd_dev->object_map_size.
 *
 * The device must not already have an object map loaded.
 *
 * Returns 0 on success, -EINVAL if the reply is malformed or its
 * size does not match the number of objects computed from the
 * mapping size, -ENOMEM if the copy can't be allocated, or the
 * error from the page vector allocation or the OSD call.
 */
static int __rbd_object_map_load(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	CEPH_DEFINE_OID_ONSTACK(oid);
	struct page **pages;
	void *p, *end;
	size_t reply_len;
	u64 num_objects;
	u64 object_map_bytes;
	u64 object_map_size;
	int num_pages;
	int ret;

	rbd_assert(!rbd_dev->object_map && !rbd_dev->object_map_size);

	/* Expected map size: BITS_PER_OBJ bits per object, rounded up */
	num_objects = ceph_get_num_objects(&rbd_dev->layout,
					   rbd_dev->mapping.size);
	object_map_bytes = DIV_ROUND_UP_ULL(num_objects * BITS_PER_OBJ,
					    BITS_PER_BYTE);
	/*
	 * One page more than the map data itself needs -- presumably
	 * to leave room for the header (and anything else) in the
	 * reply; TODO confirm.
	 */
	num_pages = calc_pages_for(0, object_map_bytes) + 1;
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	reply_len = num_pages * PAGE_SIZE;
	rbd_object_map_name(rbd_dev, rbd_dev->spec->snap_id, &oid);
	ret = ceph_osdc_call(osdc, &oid, &rbd_dev->header_oloc,
			     "rbd", "object_map_load", CEPH_OSD_FLAG_READ,
			     NULL, 0, pages, &reply_len);
	if (ret)
		goto out;

	/* The header is decoded from the first page only */
	p = page_address(pages[0]);
	end = p + min(reply_len, (size_t)PAGE_SIZE);
	ret = decode_object_map_header(&p, end, &object_map_size);
	if (ret)
		goto out;

	if (object_map_size != num_objects) {
		rbd_warn(rbd_dev, "object map size mismatch: %llu vs %llu",
			 object_map_size, num_objects);
		ret = -EINVAL;
		goto out;
	}

	/* The map data following the header must be fully present */
	if (offset_in_page(p) + object_map_bytes > reply_len) {
		ret = -EINVAL;
		goto out;
	}

	rbd_dev->object_map = kvmalloc(object_map_bytes, GFP_KERNEL);
	if (!rbd_dev->object_map) {
		ret = -ENOMEM;
		goto out;
	}

	rbd_dev->object_map_size = object_map_size;
	ceph_copy_from_page_vector(pages, rbd_dev->object_map,
				   offset_in_page(p), object_map_bytes);

out:
	/* The reply pages are released on success and failure alike */
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
2023 
2024 static void rbd_object_map_free(struct rbd_device *rbd_dev)
2025 {
2026 	kvfree(rbd_dev->object_map);
2027 	rbd_dev->object_map = NULL;
2028 	rbd_dev->object_map_size = 0;
2029 }
2030 
2031 static int rbd_object_map_load(struct rbd_device *rbd_dev)
2032 {
2033 	int ret;
2034 
2035 	ret = __rbd_object_map_load(rbd_dev);
2036 	if (ret)
2037 		return ret;
2038 
2039 	ret = rbd_dev_v2_get_flags(rbd_dev);
2040 	if (ret) {
2041 		rbd_object_map_free(rbd_dev);
2042 		return ret;
2043 	}
2044 
2045 	if (rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID)
2046 		rbd_warn(rbd_dev, "object map is invalid");
2047 
2048 	return 0;
2049 }
2050 
/*
 * Lock the object map and load it.  If loading fails the lock is
 * released again before the error is returned.
 */
static int rbd_object_map_open(struct rbd_device *rbd_dev)
{
	int err;

	err = rbd_object_map_lock(rbd_dev);
	if (err)
		return err;

	err = rbd_object_map_load(rbd_dev);
	if (err)
		rbd_object_map_unlock(rbd_dev);

	return err;
}
2067 
/*
 * Counterpart of rbd_object_map_open(): free the in-memory object
 * map, then release the object map lock.
 */
static void rbd_object_map_close(struct rbd_device *rbd_dev)
{
	rbd_object_map_free(rbd_dev);
	rbd_object_map_unlock(rbd_dev);
}
2073 
2074 /*
2075  * This function needs snap_id (or more precisely just something to
2076  * distinguish between HEAD and snapshot object maps), new_state and
2077  * current_state that were passed to rbd_object_map_update().
2078  *
2079  * To avoid allocating and stashing a context we piggyback on the OSD
2080  * request.  A HEAD update has two ops (assert_locked).  For new_state
2081  * and current_state we decode our own object_map_update op, encoded in
2082  * rbd_cls_object_map_update().
2083  */
2084 static int rbd_object_map_update_finish(struct rbd_obj_request *obj_req,
2085 					struct ceph_osd_request *osd_req)
2086 {
2087 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2088 	struct ceph_osd_data *osd_data;
2089 	u64 objno;
2090 	u8 state, new_state, current_state;
2091 	bool has_current_state;
2092 	void *p;
2093 
2094 	if (osd_req->r_result)
2095 		return osd_req->r_result;
2096 
2097 	/*
2098 	 * Nothing to do for a snapshot object map.
2099 	 */
2100 	if (osd_req->r_num_ops == 1)
2101 		return 0;
2102 
2103 	/*
2104 	 * Update in-memory HEAD object map.
2105 	 */
2106 	rbd_assert(osd_req->r_num_ops == 2);
2107 	osd_data = osd_req_op_data(osd_req, 1, cls, request_data);
2108 	rbd_assert(osd_data->type == CEPH_OSD_DATA_TYPE_PAGES);
2109 
2110 	p = page_address(osd_data->pages[0]);
2111 	objno = ceph_decode_64(&p);
2112 	rbd_assert(objno == obj_req->ex.oe_objno);
2113 	rbd_assert(ceph_decode_64(&p) == objno + 1);
2114 	new_state = ceph_decode_8(&p);
2115 	has_current_state = ceph_decode_8(&p);
2116 	if (has_current_state)
2117 		current_state = ceph_decode_8(&p);
2118 
2119 	spin_lock(&rbd_dev->object_map_lock);
2120 	state = __rbd_object_map_get(rbd_dev, objno);
2121 	if (!has_current_state || current_state == state ||
2122 	    (current_state == OBJECT_EXISTS && state == OBJECT_EXISTS_CLEAN))
2123 		__rbd_object_map_set(rbd_dev, objno, new_state);
2124 	spin_unlock(&rbd_dev->object_map_lock);
2125 
2126 	return 0;
2127 }
2128 
2129 static void rbd_object_map_callback(struct ceph_osd_request *osd_req)
2130 {
2131 	struct rbd_obj_request *obj_req = osd_req->r_priv;
2132 	int result;
2133 
2134 	dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
2135 	     osd_req->r_result, obj_req);
2136 
2137 	result = rbd_object_map_update_finish(obj_req, osd_req);
2138 	rbd_obj_handle_request(obj_req, result);
2139 }
2140 
2141 static bool update_needed(struct rbd_device *rbd_dev, u64 objno, u8 new_state)
2142 {
2143 	u8 state = rbd_object_map_get(rbd_dev, objno);
2144 
2145 	if (state == new_state ||
2146 	    (new_state == OBJECT_PENDING && state == OBJECT_NONEXISTENT) ||
2147 	    (new_state == OBJECT_NONEXISTENT && state != OBJECT_PENDING))
2148 		return false;
2149 
2150 	return true;
2151 }
2152 
2153 static int rbd_cls_object_map_update(struct ceph_osd_request *req,
2154 				     int which, u64 objno, u8 new_state,
2155 				     const u8 *current_state)
2156 {
2157 	struct page **pages;
2158 	void *p, *start;
2159 	int ret;
2160 
2161 	ret = osd_req_op_cls_init(req, which, "rbd", "object_map_update");
2162 	if (ret)
2163 		return ret;
2164 
2165 	pages = ceph_alloc_page_vector(1, GFP_NOIO);
2166 	if (IS_ERR(pages))
2167 		return PTR_ERR(pages);
2168 
2169 	p = start = page_address(pages[0]);
2170 	ceph_encode_64(&p, objno);
2171 	ceph_encode_64(&p, objno + 1);
2172 	ceph_encode_8(&p, new_state);
2173 	if (current_state) {
2174 		ceph_encode_8(&p, 1);
2175 		ceph_encode_8(&p, *current_state);
2176 	} else {
2177 		ceph_encode_8(&p, 0);
2178 	}
2179 
2180 	osd_req_op_cls_request_data_pages(req, which, pages, p - start, 0,
2181 					  false, true);
2182 	return 0;
2183 }
2184 
2185 /*
2186  * Return:
2187  *   0 - object map update sent
2188  *   1 - object map update isn't needed
2189  *  <0 - error
2190  */
2191 static int rbd_object_map_update(struct rbd_obj_request *obj_req, u64 snap_id,
2192 				 u8 new_state, const u8 *current_state)
2193 {
2194 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2195 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2196 	struct ceph_osd_request *req;
2197 	int num_ops = 1;
2198 	int which = 0;
2199 	int ret;
2200 
2201 	if (snap_id == CEPH_NOSNAP) {
2202 		if (!update_needed(rbd_dev, obj_req->ex.oe_objno, new_state))
2203 			return 1;
2204 
2205 		num_ops++; /* assert_locked */
2206 	}
2207 
2208 	req = ceph_osdc_alloc_request(osdc, NULL, num_ops, false, GFP_NOIO);
2209 	if (!req)
2210 		return -ENOMEM;
2211 
2212 	list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
2213 	req->r_callback = rbd_object_map_callback;
2214 	req->r_priv = obj_req;
2215 
2216 	rbd_object_map_name(rbd_dev, snap_id, &req->r_base_oid);
2217 	ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
2218 	req->r_flags = CEPH_OSD_FLAG_WRITE;
2219 	ktime_get_real_ts64(&req->r_mtime);
2220 
2221 	if (snap_id == CEPH_NOSNAP) {
2222 		/*
2223 		 * Protect against possible race conditions during lock
2224 		 * ownership transitions.
2225 		 */
2226 		ret = ceph_cls_assert_locked(req, which++, RBD_LOCK_NAME,
2227 					     CEPH_CLS_LOCK_EXCLUSIVE, "", "");
2228 		if (ret)
2229 			return ret;
2230 	}
2231 
2232 	ret = rbd_cls_object_map_update(req, which, obj_req->ex.oe_objno,
2233 					new_state, current_state);
2234 	if (ret)
2235 		return ret;
2236 
2237 	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
2238 	if (ret)
2239 		return ret;
2240 
2241 	ceph_osdc_start_request(osdc, req, false);
2242 	return 0;
2243 }
2244 
2245 static void prune_extents(struct ceph_file_extent *img_extents,
2246 			  u32 *num_img_extents, u64 overlap)
2247 {
2248 	u32 cnt = *num_img_extents;
2249 
2250 	/* drop extents completely beyond the overlap */
2251 	while (cnt && img_extents[cnt - 1].fe_off >= overlap)
2252 		cnt--;
2253 
2254 	if (cnt) {
2255 		struct ceph_file_extent *ex = &img_extents[cnt - 1];
2256 
2257 		/* trim final overlapping extent */
2258 		if (ex->fe_off + ex->fe_len > overlap)
2259 			ex->fe_len = overlap - ex->fe_off;
2260 	}
2261 
2262 	*num_img_extents = cnt;
2263 }
2264 
2265 /*
2266  * Determine the byte range(s) covered by either just the object extent
2267  * or the entire object in the parent image.
2268  */
2269 static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req,
2270 				    bool entire)
2271 {
2272 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2273 	int ret;
2274 
2275 	if (!rbd_dev->parent_overlap)
2276 		return 0;
2277 
2278 	ret = ceph_extent_to_file(&rbd_dev->layout, obj_req->ex.oe_objno,
2279 				  entire ? 0 : obj_req->ex.oe_off,
2280 				  entire ? rbd_dev->layout.object_size :
2281 							obj_req->ex.oe_len,
2282 				  &obj_req->img_extents,
2283 				  &obj_req->num_img_extents);
2284 	if (ret)
2285 		return ret;
2286 
2287 	prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
2288 		      rbd_dev->parent_overlap);
2289 	return 0;
2290 }
2291 
2292 static void rbd_osd_setup_data(struct ceph_osd_request *osd_req, int which)
2293 {
2294 	struct rbd_obj_request *obj_req = osd_req->r_priv;
2295 
2296 	switch (obj_req->img_request->data_type) {
2297 	case OBJ_REQUEST_BIO:
2298 		osd_req_op_extent_osd_data_bio(osd_req, which,
2299 					       &obj_req->bio_pos,
2300 					       obj_req->ex.oe_len);
2301 		break;
2302 	case OBJ_REQUEST_BVECS:
2303 	case OBJ_REQUEST_OWN_BVECS:
2304 		rbd_assert(obj_req->bvec_pos.iter.bi_size ==
2305 							obj_req->ex.oe_len);
2306 		rbd_assert(obj_req->bvec_idx == obj_req->bvec_count);
2307 		osd_req_op_extent_osd_data_bvec_pos(osd_req, which,
2308 						    &obj_req->bvec_pos);
2309 		break;
2310 	default:
2311 		BUG();
2312 	}
2313 }
2314 
2315 static int rbd_osd_setup_stat(struct ceph_osd_request *osd_req, int which)
2316 {
2317 	struct page **pages;
2318 
2319 	/*
2320 	 * The response data for a STAT call consists of:
2321 	 *     le64 length;
2322 	 *     struct {
2323 	 *         le32 tv_sec;
2324 	 *         le32 tv_nsec;
2325 	 *     } mtime;
2326 	 */
2327 	pages = ceph_alloc_page_vector(1, GFP_NOIO);
2328 	if (IS_ERR(pages))
2329 		return PTR_ERR(pages);
2330 
2331 	osd_req_op_init(osd_req, which, CEPH_OSD_OP_STAT, 0);
2332 	osd_req_op_raw_data_in_pages(osd_req, which, pages,
2333 				     8 + sizeof(struct ceph_timespec),
2334 				     0, false, true);
2335 	return 0;
2336 }
2337 
2338 static int rbd_osd_setup_copyup(struct ceph_osd_request *osd_req, int which,
2339 				u32 bytes)
2340 {
2341 	struct rbd_obj_request *obj_req = osd_req->r_priv;
2342 	int ret;
2343 
2344 	ret = osd_req_op_cls_init(osd_req, which, "rbd", "copyup");
2345 	if (ret)
2346 		return ret;
2347 
2348 	osd_req_op_cls_request_data_bvecs(osd_req, which, obj_req->copyup_bvecs,
2349 					  obj_req->copyup_bvec_count, bytes);
2350 	return 0;
2351 }
2352 
2353 static int rbd_obj_init_read(struct rbd_obj_request *obj_req)
2354 {
2355 	obj_req->read_state = RBD_OBJ_READ_START;
2356 	return 0;
2357 }
2358 
2359 static void __rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
2360 				      int which)
2361 {
2362 	struct rbd_obj_request *obj_req = osd_req->r_priv;
2363 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2364 	u16 opcode;
2365 
2366 	if (!use_object_map(rbd_dev) ||
2367 	    !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST)) {
2368 		osd_req_op_alloc_hint_init(osd_req, which++,
2369 					   rbd_dev->layout.object_size,
2370 					   rbd_dev->layout.object_size);
2371 	}
2372 
2373 	if (rbd_obj_is_entire(obj_req))
2374 		opcode = CEPH_OSD_OP_WRITEFULL;
2375 	else
2376 		opcode = CEPH_OSD_OP_WRITE;
2377 
2378 	osd_req_op_extent_init(osd_req, which, opcode,
2379 			       obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
2380 	rbd_osd_setup_data(osd_req, which);
2381 }
2382 
2383 static int rbd_obj_init_write(struct rbd_obj_request *obj_req)
2384 {
2385 	int ret;
2386 
2387 	/* reverse map the entire object onto the parent */
2388 	ret = rbd_obj_calc_img_extents(obj_req, true);
2389 	if (ret)
2390 		return ret;
2391 
2392 	if (rbd_obj_copyup_enabled(obj_req))
2393 		obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
2394 
2395 	obj_req->write_state = RBD_OBJ_WRITE_START;
2396 	return 0;
2397 }
2398 
2399 static u16 truncate_or_zero_opcode(struct rbd_obj_request *obj_req)
2400 {
2401 	return rbd_obj_is_tail(obj_req) ? CEPH_OSD_OP_TRUNCATE :
2402 					  CEPH_OSD_OP_ZERO;
2403 }
2404 
2405 static void __rbd_osd_setup_discard_ops(struct ceph_osd_request *osd_req,
2406 					int which)
2407 {
2408 	struct rbd_obj_request *obj_req = osd_req->r_priv;
2409 
2410 	if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) {
2411 		rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
2412 		osd_req_op_init(osd_req, which, CEPH_OSD_OP_DELETE, 0);
2413 	} else {
2414 		osd_req_op_extent_init(osd_req, which,
2415 				       truncate_or_zero_opcode(obj_req),
2416 				       obj_req->ex.oe_off, obj_req->ex.oe_len,
2417 				       0, 0);
2418 	}
2419 }
2420 
2421 static int rbd_obj_init_discard(struct rbd_obj_request *obj_req)
2422 {
2423 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2424 	u64 off, next_off;
2425 	int ret;
2426 
2427 	/*
2428 	 * Align the range to alloc_size boundary and punt on discards
2429 	 * that are too small to free up any space.
2430 	 *
2431 	 * alloc_size == object_size && is_tail() is a special case for
2432 	 * filestore with filestore_punch_hole = false, needed to allow
2433 	 * truncate (in addition to delete).
2434 	 */
2435 	if (rbd_dev->opts->alloc_size != rbd_dev->layout.object_size ||
2436 	    !rbd_obj_is_tail(obj_req)) {
2437 		off = round_up(obj_req->ex.oe_off, rbd_dev->opts->alloc_size);
2438 		next_off = round_down(obj_req->ex.oe_off + obj_req->ex.oe_len,
2439 				      rbd_dev->opts->alloc_size);
2440 		if (off >= next_off)
2441 			return 1;
2442 
2443 		dout("%s %p %llu~%llu -> %llu~%llu\n", __func__,
2444 		     obj_req, obj_req->ex.oe_off, obj_req->ex.oe_len,
2445 		     off, next_off - off);
2446 		obj_req->ex.oe_off = off;
2447 		obj_req->ex.oe_len = next_off - off;
2448 	}
2449 
2450 	/* reverse map the entire object onto the parent */
2451 	ret = rbd_obj_calc_img_extents(obj_req, true);
2452 	if (ret)
2453 		return ret;
2454 
2455 	obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
2456 	if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents)
2457 		obj_req->flags |= RBD_OBJ_FLAG_DELETION;
2458 
2459 	obj_req->write_state = RBD_OBJ_WRITE_START;
2460 	return 0;
2461 }
2462 
2463 static void __rbd_osd_setup_zeroout_ops(struct ceph_osd_request *osd_req,
2464 					int which)
2465 {
2466 	struct rbd_obj_request *obj_req = osd_req->r_priv;
2467 	u16 opcode;
2468 
2469 	if (rbd_obj_is_entire(obj_req)) {
2470 		if (obj_req->num_img_extents) {
2471 			if (!(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
2472 				osd_req_op_init(osd_req, which++,
2473 						CEPH_OSD_OP_CREATE, 0);
2474 			opcode = CEPH_OSD_OP_TRUNCATE;
2475 		} else {
2476 			rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
2477 			osd_req_op_init(osd_req, which++,
2478 					CEPH_OSD_OP_DELETE, 0);
2479 			opcode = 0;
2480 		}
2481 	} else {
2482 		opcode = truncate_or_zero_opcode(obj_req);
2483 	}
2484 
2485 	if (opcode)
2486 		osd_req_op_extent_init(osd_req, which, opcode,
2487 				       obj_req->ex.oe_off, obj_req->ex.oe_len,
2488 				       0, 0);
2489 }
2490 
2491 static int rbd_obj_init_zeroout(struct rbd_obj_request *obj_req)
2492 {
2493 	int ret;
2494 
2495 	/* reverse map the entire object onto the parent */
2496 	ret = rbd_obj_calc_img_extents(obj_req, true);
2497 	if (ret)
2498 		return ret;
2499 
2500 	if (rbd_obj_copyup_enabled(obj_req))
2501 		obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
2502 	if (!obj_req->num_img_extents) {
2503 		obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
2504 		if (rbd_obj_is_entire(obj_req))
2505 			obj_req->flags |= RBD_OBJ_FLAG_DELETION;
2506 	}
2507 
2508 	obj_req->write_state = RBD_OBJ_WRITE_START;
2509 	return 0;
2510 }
2511 
2512 static int count_write_ops(struct rbd_obj_request *obj_req)
2513 {
2514 	struct rbd_img_request *img_req = obj_req->img_request;
2515 
2516 	switch (img_req->op_type) {
2517 	case OBJ_OP_WRITE:
2518 		if (!use_object_map(img_req->rbd_dev) ||
2519 		    !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST))
2520 			return 2; /* setallochint + write/writefull */
2521 
2522 		return 1; /* write/writefull */
2523 	case OBJ_OP_DISCARD:
2524 		return 1; /* delete/truncate/zero */
2525 	case OBJ_OP_ZEROOUT:
2526 		if (rbd_obj_is_entire(obj_req) && obj_req->num_img_extents &&
2527 		    !(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
2528 			return 2; /* create + truncate */
2529 
2530 		return 1; /* delete/truncate/zero */
2531 	default:
2532 		BUG();
2533 	}
2534 }
2535 
2536 static void rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
2537 				    int which)
2538 {
2539 	struct rbd_obj_request *obj_req = osd_req->r_priv;
2540 
2541 	switch (obj_req->img_request->op_type) {
2542 	case OBJ_OP_WRITE:
2543 		__rbd_osd_setup_write_ops(osd_req, which);
2544 		break;
2545 	case OBJ_OP_DISCARD:
2546 		__rbd_osd_setup_discard_ops(osd_req, which);
2547 		break;
2548 	case OBJ_OP_ZEROOUT:
2549 		__rbd_osd_setup_zeroout_ops(osd_req, which);
2550 		break;
2551 	default:
2552 		BUG();
2553 	}
2554 }
2555 
2556 /*
2557  * Prune the list of object requests (adjust offset and/or length, drop
2558  * redundant requests).  Prepare object request state machines and image
2559  * request state machine for execution.
2560  */
2561 static int __rbd_img_fill_request(struct rbd_img_request *img_req)
2562 {
2563 	struct rbd_obj_request *obj_req, *next_obj_req;
2564 	int ret;
2565 
2566 	for_each_obj_request_safe(img_req, obj_req, next_obj_req) {
2567 		switch (img_req->op_type) {
2568 		case OBJ_OP_READ:
2569 			ret = rbd_obj_init_read(obj_req);
2570 			break;
2571 		case OBJ_OP_WRITE:
2572 			ret = rbd_obj_init_write(obj_req);
2573 			break;
2574 		case OBJ_OP_DISCARD:
2575 			ret = rbd_obj_init_discard(obj_req);
2576 			break;
2577 		case OBJ_OP_ZEROOUT:
2578 			ret = rbd_obj_init_zeroout(obj_req);
2579 			break;
2580 		default:
2581 			BUG();
2582 		}
2583 		if (ret < 0)
2584 			return ret;
2585 		if (ret > 0) {
2586 			rbd_img_obj_request_del(img_req, obj_req);
2587 			continue;
2588 		}
2589 	}
2590 
2591 	img_req->state = RBD_IMG_START;
2592 	return 0;
2593 }
2594 
2595 union rbd_img_fill_iter {
2596 	struct ceph_bio_iter	bio_iter;
2597 	struct ceph_bvec_iter	bvec_iter;
2598 };
2599 
2600 struct rbd_img_fill_ctx {
2601 	enum obj_request_type	pos_type;
2602 	union rbd_img_fill_iter	*pos;
2603 	union rbd_img_fill_iter	iter;
2604 	ceph_object_extent_fn_t	set_pos_fn;
2605 	ceph_object_extent_fn_t	count_fn;
2606 	ceph_object_extent_fn_t	copy_fn;
2607 };
2608 
2609 static struct ceph_object_extent *alloc_object_extent(void *arg)
2610 {
2611 	struct rbd_img_request *img_req = arg;
2612 	struct rbd_obj_request *obj_req;
2613 
2614 	obj_req = rbd_obj_request_create();
2615 	if (!obj_req)
2616 		return NULL;
2617 
2618 	rbd_img_obj_request_add(img_req, obj_req);
2619 	return &obj_req->ex;
2620 }
2621 
2622 /*
2623  * While su != os && sc == 1 is technically not fancy (it's the same
2624  * layout as su == os && sc == 1), we can't use the nocopy path for it
2625  * because ->set_pos_fn() should be called only once per object.
2626  * ceph_file_to_extents() invokes action_fn once per stripe unit, so
2627  * treat su != os && sc == 1 as fancy.
2628  */
2629 static bool rbd_layout_is_fancy(struct ceph_file_layout *l)
2630 {
2631 	return l->stripe_unit != l->object_size;
2632 }
2633 
2634 static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req,
2635 				       struct ceph_file_extent *img_extents,
2636 				       u32 num_img_extents,
2637 				       struct rbd_img_fill_ctx *fctx)
2638 {
2639 	u32 i;
2640 	int ret;
2641 
2642 	img_req->data_type = fctx->pos_type;
2643 
2644 	/*
2645 	 * Create object requests and set each object request's starting
2646 	 * position in the provided bio (list) or bio_vec array.
2647 	 */
2648 	fctx->iter = *fctx->pos;
2649 	for (i = 0; i < num_img_extents; i++) {
2650 		ret = ceph_file_to_extents(&img_req->rbd_dev->layout,
2651 					   img_extents[i].fe_off,
2652 					   img_extents[i].fe_len,
2653 					   &img_req->object_extents,
2654 					   alloc_object_extent, img_req,
2655 					   fctx->set_pos_fn, &fctx->iter);
2656 		if (ret)
2657 			return ret;
2658 	}
2659 
2660 	return __rbd_img_fill_request(img_req);
2661 }
2662 
2663 /*
2664  * Map a list of image extents to a list of object extents, create the
2665  * corresponding object requests (normally each to a different object,
2666  * but not always) and add them to @img_req.  For each object request,
2667  * set up its data descriptor to point to the corresponding chunk(s) of
2668  * @fctx->pos data buffer.
2669  *
2670  * Because ceph_file_to_extents() will merge adjacent object extents
2671  * together, each object request's data descriptor may point to multiple
2672  * different chunks of @fctx->pos data buffer.
2673  *
2674  * @fctx->pos data buffer is assumed to be large enough.
2675  */
2676 static int rbd_img_fill_request(struct rbd_img_request *img_req,
2677 				struct ceph_file_extent *img_extents,
2678 				u32 num_img_extents,
2679 				struct rbd_img_fill_ctx *fctx)
2680 {
2681 	struct rbd_device *rbd_dev = img_req->rbd_dev;
2682 	struct rbd_obj_request *obj_req;
2683 	u32 i;
2684 	int ret;
2685 
2686 	if (fctx->pos_type == OBJ_REQUEST_NODATA ||
2687 	    !rbd_layout_is_fancy(&rbd_dev->layout))
2688 		return rbd_img_fill_request_nocopy(img_req, img_extents,
2689 						   num_img_extents, fctx);
2690 
2691 	img_req->data_type = OBJ_REQUEST_OWN_BVECS;
2692 
2693 	/*
2694 	 * Create object requests and determine ->bvec_count for each object
2695 	 * request.  Note that ->bvec_count sum over all object requests may
2696 	 * be greater than the number of bio_vecs in the provided bio (list)
2697 	 * or bio_vec array because when mapped, those bio_vecs can straddle
2698 	 * stripe unit boundaries.
2699 	 */
2700 	fctx->iter = *fctx->pos;
2701 	for (i = 0; i < num_img_extents; i++) {
2702 		ret = ceph_file_to_extents(&rbd_dev->layout,
2703 					   img_extents[i].fe_off,
2704 					   img_extents[i].fe_len,
2705 					   &img_req->object_extents,
2706 					   alloc_object_extent, img_req,
2707 					   fctx->count_fn, &fctx->iter);
2708 		if (ret)
2709 			return ret;
2710 	}
2711 
2712 	for_each_obj_request(img_req, obj_req) {
2713 		obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count,
2714 					      sizeof(*obj_req->bvec_pos.bvecs),
2715 					      GFP_NOIO);
2716 		if (!obj_req->bvec_pos.bvecs)
2717 			return -ENOMEM;
2718 	}
2719 
2720 	/*
2721 	 * Fill in each object request's private bio_vec array, splitting and
2722 	 * rearranging the provided bio_vecs in stripe unit chunks as needed.
2723 	 */
2724 	fctx->iter = *fctx->pos;
2725 	for (i = 0; i < num_img_extents; i++) {
2726 		ret = ceph_iterate_extents(&rbd_dev->layout,
2727 					   img_extents[i].fe_off,
2728 					   img_extents[i].fe_len,
2729 					   &img_req->object_extents,
2730 					   fctx->copy_fn, &fctx->iter);
2731 		if (ret)
2732 			return ret;
2733 	}
2734 
2735 	return __rbd_img_fill_request(img_req);
2736 }
2737 
2738 static int rbd_img_fill_nodata(struct rbd_img_request *img_req,
2739 			       u64 off, u64 len)
2740 {
2741 	struct ceph_file_extent ex = { off, len };
2742 	union rbd_img_fill_iter dummy;
2743 	struct rbd_img_fill_ctx fctx = {
2744 		.pos_type = OBJ_REQUEST_NODATA,
2745 		.pos = &dummy,
2746 	};
2747 
2748 	return rbd_img_fill_request(img_req, &ex, 1, &fctx);
2749 }
2750 
2751 static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
2752 {
2753 	struct rbd_obj_request *obj_req =
2754 	    container_of(ex, struct rbd_obj_request, ex);
2755 	struct ceph_bio_iter *it = arg;
2756 
2757 	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2758 	obj_req->bio_pos = *it;
2759 	ceph_bio_iter_advance(it, bytes);
2760 }
2761 
2762 static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2763 {
2764 	struct rbd_obj_request *obj_req =
2765 	    container_of(ex, struct rbd_obj_request, ex);
2766 	struct ceph_bio_iter *it = arg;
2767 
2768 	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2769 	ceph_bio_iter_advance_step(it, bytes, ({
2770 		obj_req->bvec_count++;
2771 	}));
2772 
2773 }
2774 
2775 static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2776 {
2777 	struct rbd_obj_request *obj_req =
2778 	    container_of(ex, struct rbd_obj_request, ex);
2779 	struct ceph_bio_iter *it = arg;
2780 
2781 	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2782 	ceph_bio_iter_advance_step(it, bytes, ({
2783 		obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2784 		obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2785 	}));
2786 }
2787 
2788 static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req,
2789 				   struct ceph_file_extent *img_extents,
2790 				   u32 num_img_extents,
2791 				   struct ceph_bio_iter *bio_pos)
2792 {
2793 	struct rbd_img_fill_ctx fctx = {
2794 		.pos_type = OBJ_REQUEST_BIO,
2795 		.pos = (union rbd_img_fill_iter *)bio_pos,
2796 		.set_pos_fn = set_bio_pos,
2797 		.count_fn = count_bio_bvecs,
2798 		.copy_fn = copy_bio_bvecs,
2799 	};
2800 
2801 	return rbd_img_fill_request(img_req, img_extents, num_img_extents,
2802 				    &fctx);
2803 }
2804 
2805 static int rbd_img_fill_from_bio(struct rbd_img_request *img_req,
2806 				 u64 off, u64 len, struct bio *bio)
2807 {
2808 	struct ceph_file_extent ex = { off, len };
2809 	struct ceph_bio_iter it = { .bio = bio, .iter = bio->bi_iter };
2810 
2811 	return __rbd_img_fill_from_bio(img_req, &ex, 1, &it);
2812 }
2813 
2814 static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
2815 {
2816 	struct rbd_obj_request *obj_req =
2817 	    container_of(ex, struct rbd_obj_request, ex);
2818 	struct ceph_bvec_iter *it = arg;
2819 
2820 	obj_req->bvec_pos = *it;
2821 	ceph_bvec_iter_shorten(&obj_req->bvec_pos, bytes);
2822 	ceph_bvec_iter_advance(it, bytes);
2823 }
2824 
2825 static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2826 {
2827 	struct rbd_obj_request *obj_req =
2828 	    container_of(ex, struct rbd_obj_request, ex);
2829 	struct ceph_bvec_iter *it = arg;
2830 
2831 	ceph_bvec_iter_advance_step(it, bytes, ({
2832 		obj_req->bvec_count++;
2833 	}));
2834 }
2835 
2836 static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2837 {
2838 	struct rbd_obj_request *obj_req =
2839 	    container_of(ex, struct rbd_obj_request, ex);
2840 	struct ceph_bvec_iter *it = arg;
2841 
2842 	ceph_bvec_iter_advance_step(it, bytes, ({
2843 		obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2844 		obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2845 	}));
2846 }
2847 
2848 static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2849 				     struct ceph_file_extent *img_extents,
2850 				     u32 num_img_extents,
2851 				     struct ceph_bvec_iter *bvec_pos)
2852 {
2853 	struct rbd_img_fill_ctx fctx = {
2854 		.pos_type = OBJ_REQUEST_BVECS,
2855 		.pos = (union rbd_img_fill_iter *)bvec_pos,
2856 		.set_pos_fn = set_bvec_pos,
2857 		.count_fn = count_bvecs,
2858 		.copy_fn = copy_bvecs,
2859 	};
2860 
2861 	return rbd_img_fill_request(img_req, img_extents, num_img_extents,
2862 				    &fctx);
2863 }
2864 
2865 static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2866 				   struct ceph_file_extent *img_extents,
2867 				   u32 num_img_extents,
2868 				   struct bio_vec *bvecs)
2869 {
2870 	struct ceph_bvec_iter it = {
2871 		.bvecs = bvecs,
2872 		.iter = { .bi_size = ceph_file_extents_bytes(img_extents,
2873 							     num_img_extents) },
2874 	};
2875 
2876 	return __rbd_img_fill_from_bvecs(img_req, img_extents, num_img_extents,
2877 					 &it);
2878 }
2879 
2880 static void rbd_img_handle_request_work(struct work_struct *work)
2881 {
2882 	struct rbd_img_request *img_req =
2883 	    container_of(work, struct rbd_img_request, work);
2884 
2885 	rbd_img_handle_request(img_req, img_req->work_result);
2886 }
2887 
2888 static void rbd_img_schedule(struct rbd_img_request *img_req, int result)
2889 {
2890 	INIT_WORK(&img_req->work, rbd_img_handle_request_work);
2891 	img_req->work_result = result;
2892 	queue_work(rbd_wq, &img_req->work);
2893 }
2894 
2895 static bool rbd_obj_may_exist(struct rbd_obj_request *obj_req)
2896 {
2897 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2898 
2899 	if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno)) {
2900 		obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;
2901 		return true;
2902 	}
2903 
2904 	dout("%s %p objno %llu assuming dne\n", __func__, obj_req,
2905 	     obj_req->ex.oe_objno);
2906 	return false;
2907 }
2908 
2909 static int rbd_obj_read_object(struct rbd_obj_request *obj_req)
2910 {
2911 	struct ceph_osd_request *osd_req;
2912 	int ret;
2913 
2914 	osd_req = __rbd_obj_add_osd_request(obj_req, NULL, 1);
2915 	if (IS_ERR(osd_req))
2916 		return PTR_ERR(osd_req);
2917 
2918 	osd_req_op_extent_init(osd_req, 0, CEPH_OSD_OP_READ,
2919 			       obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
2920 	rbd_osd_setup_data(osd_req, 0);
2921 	rbd_osd_format_read(osd_req);
2922 
2923 	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
2924 	if (ret)
2925 		return ret;
2926 
2927 	rbd_osd_submit(osd_req);
2928 	return 0;
2929 }
2930 
2931 static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
2932 {
2933 	struct rbd_img_request *img_req = obj_req->img_request;
2934 	struct rbd_img_request *child_img_req;
2935 	int ret;
2936 
2937 	child_img_req = rbd_img_request_create(img_req->rbd_dev->parent,
2938 					       OBJ_OP_READ, NULL);
2939 	if (!child_img_req)
2940 		return -ENOMEM;
2941 
2942 	__set_bit(IMG_REQ_CHILD, &child_img_req->flags);
2943 	child_img_req->obj_request = obj_req;
2944 
2945 	dout("%s child_img_req %p for obj_req %p\n", __func__, child_img_req,
2946 	     obj_req);
2947 
2948 	if (!rbd_img_is_write(img_req)) {
2949 		switch (img_req->data_type) {
2950 		case OBJ_REQUEST_BIO:
2951 			ret = __rbd_img_fill_from_bio(child_img_req,
2952 						      obj_req->img_extents,
2953 						      obj_req->num_img_extents,
2954 						      &obj_req->bio_pos);
2955 			break;
2956 		case OBJ_REQUEST_BVECS:
2957 		case OBJ_REQUEST_OWN_BVECS:
2958 			ret = __rbd_img_fill_from_bvecs(child_img_req,
2959 						      obj_req->img_extents,
2960 						      obj_req->num_img_extents,
2961 						      &obj_req->bvec_pos);
2962 			break;
2963 		default:
2964 			BUG();
2965 		}
2966 	} else {
2967 		ret = rbd_img_fill_from_bvecs(child_img_req,
2968 					      obj_req->img_extents,
2969 					      obj_req->num_img_extents,
2970 					      obj_req->copyup_bvecs);
2971 	}
2972 	if (ret) {
2973 		rbd_img_request_put(child_img_req);
2974 		return ret;
2975 	}
2976 
2977 	/* avoid parent chain recursion */
2978 	rbd_img_schedule(child_img_req, 0);
2979 	return 0;
2980 }
2981 
2982 static bool rbd_obj_advance_read(struct rbd_obj_request *obj_req, int *result)
2983 {
2984 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2985 	int ret;
2986 
2987 again:
2988 	switch (obj_req->read_state) {
2989 	case RBD_OBJ_READ_START:
2990 		rbd_assert(!*result);
2991 
2992 		if (!rbd_obj_may_exist(obj_req)) {
2993 			*result = -ENOENT;
2994 			obj_req->read_state = RBD_OBJ_READ_OBJECT;
2995 			goto again;
2996 		}
2997 
2998 		ret = rbd_obj_read_object(obj_req);
2999 		if (ret) {
3000 			*result = ret;
3001 			return true;
3002 		}
3003 		obj_req->read_state = RBD_OBJ_READ_OBJECT;
3004 		return false;
3005 	case RBD_OBJ_READ_OBJECT:
3006 		if (*result == -ENOENT && rbd_dev->parent_overlap) {
3007 			/* reverse map this object extent onto the parent */
3008 			ret = rbd_obj_calc_img_extents(obj_req, false);
3009 			if (ret) {
3010 				*result = ret;
3011 				return true;
3012 			}
3013 			if (obj_req->num_img_extents) {
3014 				ret = rbd_obj_read_from_parent(obj_req);
3015 				if (ret) {
3016 					*result = ret;
3017 					return true;
3018 				}
3019 				obj_req->read_state = RBD_OBJ_READ_PARENT;
3020 				return false;
3021 			}
3022 		}
3023 
3024 		/*
3025 		 * -ENOENT means a hole in the image -- zero-fill the entire
3026 		 * length of the request.  A short read also implies zero-fill
3027 		 * to the end of the request.
3028 		 */
3029 		if (*result == -ENOENT) {
3030 			rbd_obj_zero_range(obj_req, 0, obj_req->ex.oe_len);
3031 			*result = 0;
3032 		} else if (*result >= 0) {
3033 			if (*result < obj_req->ex.oe_len)
3034 				rbd_obj_zero_range(obj_req, *result,
3035 						obj_req->ex.oe_len - *result);
3036 			else
3037 				rbd_assert(*result == obj_req->ex.oe_len);
3038 			*result = 0;
3039 		}
3040 		return true;
3041 	case RBD_OBJ_READ_PARENT:
3042 		/*
3043 		 * The parent image is read only up to the overlap -- zero-fill
3044 		 * from the overlap to the end of the request.
3045 		 */
3046 		if (!*result) {
3047 			u32 obj_overlap = rbd_obj_img_extents_bytes(obj_req);
3048 
3049 			if (obj_overlap < obj_req->ex.oe_len)
3050 				rbd_obj_zero_range(obj_req, obj_overlap,
3051 					    obj_req->ex.oe_len - obj_overlap);
3052 		}
3053 		return true;
3054 	default:
3055 		BUG();
3056 	}
3057 }
3058 
3059 static bool rbd_obj_write_is_noop(struct rbd_obj_request *obj_req)
3060 {
3061 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3062 
3063 	if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno))
3064 		obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;
3065 
3066 	if (!(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST) &&
3067 	    (obj_req->flags & RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT)) {
3068 		dout("%s %p noop for nonexistent\n", __func__, obj_req);
3069 		return true;
3070 	}
3071 
3072 	return false;
3073 }
3074 
3075 /*
3076  * Return:
3077  *   0 - object map update sent
3078  *   1 - object map update isn't needed
3079  *  <0 - error
3080  */
3081 static int rbd_obj_write_pre_object_map(struct rbd_obj_request *obj_req)
3082 {
3083 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3084 	u8 new_state;
3085 
3086 	if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
3087 		return 1;
3088 
3089 	if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
3090 		new_state = OBJECT_PENDING;
3091 	else
3092 		new_state = OBJECT_EXISTS;
3093 
3094 	return rbd_object_map_update(obj_req, CEPH_NOSNAP, new_state, NULL);
3095 }
3096 
3097 static int rbd_obj_write_object(struct rbd_obj_request *obj_req)
3098 {
3099 	struct ceph_osd_request *osd_req;
3100 	int num_ops = count_write_ops(obj_req);
3101 	int which = 0;
3102 	int ret;
3103 
3104 	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED)
3105 		num_ops++; /* stat */
3106 
3107 	osd_req = rbd_obj_add_osd_request(obj_req, num_ops);
3108 	if (IS_ERR(osd_req))
3109 		return PTR_ERR(osd_req);
3110 
3111 	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
3112 		ret = rbd_osd_setup_stat(osd_req, which++);
3113 		if (ret)
3114 			return ret;
3115 	}
3116 
3117 	rbd_osd_setup_write_ops(osd_req, which);
3118 	rbd_osd_format_write(osd_req);
3119 
3120 	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
3121 	if (ret)
3122 		return ret;
3123 
3124 	rbd_osd_submit(osd_req);
3125 	return 0;
3126 }
3127 
3128 /*
3129  * copyup_bvecs pages are never highmem pages
3130  */
3131 static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes)
3132 {
3133 	struct ceph_bvec_iter it = {
3134 		.bvecs = bvecs,
3135 		.iter = { .bi_size = bytes },
3136 	};
3137 
3138 	ceph_bvec_iter_advance_step(&it, bytes, ({
3139 		if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0,
3140 			       bv.bv_len))
3141 			return false;
3142 	}));
3143 	return true;
3144 }
3145 
/*
 * Special @bytes value for the copyup helpers: no copyup op is added
 * to the request, only the modification ops.
 */
#define MODS_ONLY	U32_MAX
3147 
3148 static int rbd_obj_copyup_empty_snapc(struct rbd_obj_request *obj_req,
3149 				      u32 bytes)
3150 {
3151 	struct ceph_osd_request *osd_req;
3152 	int ret;
3153 
3154 	dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
3155 	rbd_assert(bytes > 0 && bytes != MODS_ONLY);
3156 
3157 	osd_req = __rbd_obj_add_osd_request(obj_req, &rbd_empty_snapc, 1);
3158 	if (IS_ERR(osd_req))
3159 		return PTR_ERR(osd_req);
3160 
3161 	ret = rbd_osd_setup_copyup(osd_req, 0, bytes);
3162 	if (ret)
3163 		return ret;
3164 
3165 	rbd_osd_format_write(osd_req);
3166 
3167 	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
3168 	if (ret)
3169 		return ret;
3170 
3171 	rbd_osd_submit(osd_req);
3172 	return 0;
3173 }
3174 
3175 static int rbd_obj_copyup_current_snapc(struct rbd_obj_request *obj_req,
3176 					u32 bytes)
3177 {
3178 	struct ceph_osd_request *osd_req;
3179 	int num_ops = count_write_ops(obj_req);
3180 	int which = 0;
3181 	int ret;
3182 
3183 	dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
3184 
3185 	if (bytes != MODS_ONLY)
3186 		num_ops++; /* copyup */
3187 
3188 	osd_req = rbd_obj_add_osd_request(obj_req, num_ops);
3189 	if (IS_ERR(osd_req))
3190 		return PTR_ERR(osd_req);
3191 
3192 	if (bytes != MODS_ONLY) {
3193 		ret = rbd_osd_setup_copyup(osd_req, which++, bytes);
3194 		if (ret)
3195 			return ret;
3196 	}
3197 
3198 	rbd_osd_setup_write_ops(osd_req, which);
3199 	rbd_osd_format_write(osd_req);
3200 
3201 	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
3202 	if (ret)
3203 		return ret;
3204 
3205 	rbd_osd_submit(osd_req);
3206 	return 0;
3207 }
3208 
3209 static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
3210 {
3211 	u32 i;
3212 
3213 	rbd_assert(!obj_req->copyup_bvecs);
3214 	obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap);
3215 	obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count,
3216 					sizeof(*obj_req->copyup_bvecs),
3217 					GFP_NOIO);
3218 	if (!obj_req->copyup_bvecs)
3219 		return -ENOMEM;
3220 
3221 	for (i = 0; i < obj_req->copyup_bvec_count; i++) {
3222 		unsigned int len = min(obj_overlap, (u64)PAGE_SIZE);
3223 
3224 		obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO);
3225 		if (!obj_req->copyup_bvecs[i].bv_page)
3226 			return -ENOMEM;
3227 
3228 		obj_req->copyup_bvecs[i].bv_offset = 0;
3229 		obj_req->copyup_bvecs[i].bv_len = len;
3230 		obj_overlap -= len;
3231 	}
3232 
3233 	rbd_assert(!obj_overlap);
3234 	return 0;
3235 }
3236 
3237 /*
3238  * The target object doesn't exist.  Read the data for the entire
3239  * target object up to the overlap point (if any) from the parent,
3240  * so we can use it for a copyup.
3241  */
3242 static int rbd_obj_copyup_read_parent(struct rbd_obj_request *obj_req)
3243 {
3244 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3245 	int ret;
3246 
3247 	rbd_assert(obj_req->num_img_extents);
3248 	prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
3249 		      rbd_dev->parent_overlap);
3250 	if (!obj_req->num_img_extents) {
3251 		/*
3252 		 * The overlap has become 0 (most likely because the
3253 		 * image has been flattened).  Re-submit the original write
3254 		 * request -- pass MODS_ONLY since the copyup isn't needed
3255 		 * anymore.
3256 		 */
3257 		return rbd_obj_copyup_current_snapc(obj_req, MODS_ONLY);
3258 	}
3259 
3260 	ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req));
3261 	if (ret)
3262 		return ret;
3263 
3264 	return rbd_obj_read_from_parent(obj_req);
3265 }
3266 
3267 static void rbd_obj_copyup_object_maps(struct rbd_obj_request *obj_req)
3268 {
3269 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3270 	struct ceph_snap_context *snapc = obj_req->img_request->snapc;
3271 	u8 new_state;
3272 	u32 i;
3273 	int ret;
3274 
3275 	rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);
3276 
3277 	if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
3278 		return;
3279 
3280 	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
3281 		return;
3282 
3283 	for (i = 0; i < snapc->num_snaps; i++) {
3284 		if ((rbd_dev->header.features & RBD_FEATURE_FAST_DIFF) &&
3285 		    i + 1 < snapc->num_snaps)
3286 			new_state = OBJECT_EXISTS_CLEAN;
3287 		else
3288 			new_state = OBJECT_EXISTS;
3289 
3290 		ret = rbd_object_map_update(obj_req, snapc->snaps[i],
3291 					    new_state, NULL);
3292 		if (ret < 0) {
3293 			obj_req->pending.result = ret;
3294 			return;
3295 		}
3296 
3297 		rbd_assert(!ret);
3298 		obj_req->pending.num_pending++;
3299 	}
3300 }
3301 
3302 static void rbd_obj_copyup_write_object(struct rbd_obj_request *obj_req)
3303 {
3304 	u32 bytes = rbd_obj_img_extents_bytes(obj_req);
3305 	int ret;
3306 
3307 	rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);
3308 
3309 	/*
3310 	 * Only send non-zero copyup data to save some I/O and network
3311 	 * bandwidth -- zero copyup data is equivalent to the object not
3312 	 * existing.
3313 	 */
3314 	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
3315 		bytes = 0;
3316 
3317 	if (obj_req->img_request->snapc->num_snaps && bytes > 0) {
3318 		/*
3319 		 * Send a copyup request with an empty snapshot context to
3320 		 * deep-copyup the object through all existing snapshots.
3321 		 * A second request with the current snapshot context will be
3322 		 * sent for the actual modification.
3323 		 */
3324 		ret = rbd_obj_copyup_empty_snapc(obj_req, bytes);
3325 		if (ret) {
3326 			obj_req->pending.result = ret;
3327 			return;
3328 		}
3329 
3330 		obj_req->pending.num_pending++;
3331 		bytes = MODS_ONLY;
3332 	}
3333 
3334 	ret = rbd_obj_copyup_current_snapc(obj_req, bytes);
3335 	if (ret) {
3336 		obj_req->pending.result = ret;
3337 		return;
3338 	}
3339 
3340 	obj_req->pending.num_pending++;
3341 }
3342 
3343 static bool rbd_obj_advance_copyup(struct rbd_obj_request *obj_req, int *result)
3344 {
3345 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3346 	int ret;
3347 
3348 again:
3349 	switch (obj_req->copyup_state) {
3350 	case RBD_OBJ_COPYUP_START:
3351 		rbd_assert(!*result);
3352 
3353 		ret = rbd_obj_copyup_read_parent(obj_req);
3354 		if (ret) {
3355 			*result = ret;
3356 			return true;
3357 		}
3358 		if (obj_req->num_img_extents)
3359 			obj_req->copyup_state = RBD_OBJ_COPYUP_READ_PARENT;
3360 		else
3361 			obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
3362 		return false;
3363 	case RBD_OBJ_COPYUP_READ_PARENT:
3364 		if (*result)
3365 			return true;
3366 
3367 		if (is_zero_bvecs(obj_req->copyup_bvecs,
3368 				  rbd_obj_img_extents_bytes(obj_req))) {
3369 			dout("%s %p detected zeros\n", __func__, obj_req);
3370 			obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ZEROS;
3371 		}
3372 
3373 		rbd_obj_copyup_object_maps(obj_req);
3374 		if (!obj_req->pending.num_pending) {
3375 			*result = obj_req->pending.result;
3376 			obj_req->copyup_state = RBD_OBJ_COPYUP_OBJECT_MAPS;
3377 			goto again;
3378 		}
3379 		obj_req->copyup_state = __RBD_OBJ_COPYUP_OBJECT_MAPS;
3380 		return false;
3381 	case __RBD_OBJ_COPYUP_OBJECT_MAPS:
3382 		if (!pending_result_dec(&obj_req->pending, result))
3383 			return false;
3384 		/* fall through */
3385 	case RBD_OBJ_COPYUP_OBJECT_MAPS:
3386 		if (*result) {
3387 			rbd_warn(rbd_dev, "snap object map update failed: %d",
3388 				 *result);
3389 			return true;
3390 		}
3391 
3392 		rbd_obj_copyup_write_object(obj_req);
3393 		if (!obj_req->pending.num_pending) {
3394 			*result = obj_req->pending.result;
3395 			obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
3396 			goto again;
3397 		}
3398 		obj_req->copyup_state = __RBD_OBJ_COPYUP_WRITE_OBJECT;
3399 		return false;
3400 	case __RBD_OBJ_COPYUP_WRITE_OBJECT:
3401 		if (!pending_result_dec(&obj_req->pending, result))
3402 			return false;
3403 		/* fall through */
3404 	case RBD_OBJ_COPYUP_WRITE_OBJECT:
3405 		return true;
3406 	default:
3407 		BUG();
3408 	}
3409 }
3410 
3411 /*
3412  * Return:
3413  *   0 - object map update sent
3414  *   1 - object map update isn't needed
3415  *  <0 - error
3416  */
3417 static int rbd_obj_write_post_object_map(struct rbd_obj_request *obj_req)
3418 {
3419 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3420 	u8 current_state = OBJECT_PENDING;
3421 
3422 	if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
3423 		return 1;
3424 
3425 	if (!(obj_req->flags & RBD_OBJ_FLAG_DELETION))
3426 		return 1;
3427 
3428 	return rbd_object_map_update(obj_req, CEPH_NOSNAP, OBJECT_NONEXISTENT,
3429 				     &current_state);
3430 }
3431 
3432 static bool rbd_obj_advance_write(struct rbd_obj_request *obj_req, int *result)
3433 {
3434 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3435 	int ret;
3436 
3437 again:
3438 	switch (obj_req->write_state) {
3439 	case RBD_OBJ_WRITE_START:
3440 		rbd_assert(!*result);
3441 
3442 		if (rbd_obj_write_is_noop(obj_req))
3443 			return true;
3444 
3445 		ret = rbd_obj_write_pre_object_map(obj_req);
3446 		if (ret < 0) {
3447 			*result = ret;
3448 			return true;
3449 		}
3450 		obj_req->write_state = RBD_OBJ_WRITE_PRE_OBJECT_MAP;
3451 		if (ret > 0)
3452 			goto again;
3453 		return false;
3454 	case RBD_OBJ_WRITE_PRE_OBJECT_MAP:
3455 		if (*result) {
3456 			rbd_warn(rbd_dev, "pre object map update failed: %d",
3457 				 *result);
3458 			return true;
3459 		}
3460 		ret = rbd_obj_write_object(obj_req);
3461 		if (ret) {
3462 			*result = ret;
3463 			return true;
3464 		}
3465 		obj_req->write_state = RBD_OBJ_WRITE_OBJECT;
3466 		return false;
3467 	case RBD_OBJ_WRITE_OBJECT:
3468 		if (*result == -ENOENT) {
3469 			if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
3470 				*result = 0;
3471 				obj_req->copyup_state = RBD_OBJ_COPYUP_START;
3472 				obj_req->write_state = __RBD_OBJ_WRITE_COPYUP;
3473 				goto again;
3474 			}
3475 			/*
3476 			 * On a non-existent object:
3477 			 *   delete - -ENOENT, truncate/zero - 0
3478 			 */
3479 			if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
3480 				*result = 0;
3481 		}
3482 		if (*result)
3483 			return true;
3484 
3485 		obj_req->write_state = RBD_OBJ_WRITE_COPYUP;
3486 		goto again;
3487 	case __RBD_OBJ_WRITE_COPYUP:
3488 		if (!rbd_obj_advance_copyup(obj_req, result))
3489 			return false;
3490 		/* fall through */
3491 	case RBD_OBJ_WRITE_COPYUP:
3492 		if (*result) {
3493 			rbd_warn(rbd_dev, "copyup failed: %d", *result);
3494 			return true;
3495 		}
3496 		ret = rbd_obj_write_post_object_map(obj_req);
3497 		if (ret < 0) {
3498 			*result = ret;
3499 			return true;
3500 		}
3501 		obj_req->write_state = RBD_OBJ_WRITE_POST_OBJECT_MAP;
3502 		if (ret > 0)
3503 			goto again;
3504 		return false;
3505 	case RBD_OBJ_WRITE_POST_OBJECT_MAP:
3506 		if (*result)
3507 			rbd_warn(rbd_dev, "post object map update failed: %d",
3508 				 *result);
3509 		return true;
3510 	default:
3511 		BUG();
3512 	}
3513 }
3514 
3515 /*
3516  * Return true if @obj_req is completed.
3517  */
3518 static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req,
3519 				     int *result)
3520 {
3521 	struct rbd_img_request *img_req = obj_req->img_request;
3522 	struct rbd_device *rbd_dev = img_req->rbd_dev;
3523 	bool done;
3524 
3525 	mutex_lock(&obj_req->state_mutex);
3526 	if (!rbd_img_is_write(img_req))
3527 		done = rbd_obj_advance_read(obj_req, result);
3528 	else
3529 		done = rbd_obj_advance_write(obj_req, result);
3530 	mutex_unlock(&obj_req->state_mutex);
3531 
3532 	if (done && *result) {
3533 		rbd_assert(*result < 0);
3534 		rbd_warn(rbd_dev, "%s at objno %llu %llu~%llu result %d",
3535 			 obj_op_name(img_req->op_type), obj_req->ex.oe_objno,
3536 			 obj_req->ex.oe_off, obj_req->ex.oe_len, *result);
3537 	}
3538 	return done;
3539 }
3540 
3541 /*
3542  * This is open-coded in rbd_img_handle_request() to avoid parent chain
3543  * recursion.
3544  */
3545 static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result)
3546 {
3547 	if (__rbd_obj_handle_request(obj_req, &result))
3548 		rbd_img_handle_request(obj_req->img_request, result);
3549 }
3550 
3551 static bool need_exclusive_lock(struct rbd_img_request *img_req)
3552 {
3553 	struct rbd_device *rbd_dev = img_req->rbd_dev;
3554 
3555 	if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK))
3556 		return false;
3557 
3558 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
3559 		return false;
3560 
3561 	rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
3562 	if (rbd_dev->opts->lock_on_read ||
3563 	    (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
3564 		return true;
3565 
3566 	return rbd_img_is_write(img_req);
3567 }
3568 
3569 static bool rbd_lock_add_request(struct rbd_img_request *img_req)
3570 {
3571 	struct rbd_device *rbd_dev = img_req->rbd_dev;
3572 	bool locked;
3573 
3574 	lockdep_assert_held(&rbd_dev->lock_rwsem);
3575 	locked = rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED;
3576 	spin_lock(&rbd_dev->lock_lists_lock);
3577 	rbd_assert(list_empty(&img_req->lock_item));
3578 	if (!locked)
3579 		list_add_tail(&img_req->lock_item, &rbd_dev->acquiring_list);
3580 	else
3581 		list_add_tail(&img_req->lock_item, &rbd_dev->running_list);
3582 	spin_unlock(&rbd_dev->lock_lists_lock);
3583 	return locked;
3584 }
3585 
3586 static void rbd_lock_del_request(struct rbd_img_request *img_req)
3587 {
3588 	struct rbd_device *rbd_dev = img_req->rbd_dev;
3589 	bool need_wakeup;
3590 
3591 	lockdep_assert_held(&rbd_dev->lock_rwsem);
3592 	spin_lock(&rbd_dev->lock_lists_lock);
3593 	rbd_assert(!list_empty(&img_req->lock_item));
3594 	list_del_init(&img_req->lock_item);
3595 	need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING &&
3596 		       list_empty(&rbd_dev->running_list));
3597 	spin_unlock(&rbd_dev->lock_lists_lock);
3598 	if (need_wakeup)
3599 		complete(&rbd_dev->releasing_wait);
3600 }
3601 
3602 static int rbd_img_exclusive_lock(struct rbd_img_request *img_req)
3603 {
3604 	struct rbd_device *rbd_dev = img_req->rbd_dev;
3605 
3606 	if (!need_exclusive_lock(img_req))
3607 		return 1;
3608 
3609 	if (rbd_lock_add_request(img_req))
3610 		return 1;
3611 
3612 	if (rbd_dev->opts->exclusive) {
3613 		WARN_ON(1); /* lock got released? */
3614 		return -EROFS;
3615 	}
3616 
3617 	/*
3618 	 * Note the use of mod_delayed_work() in rbd_acquire_lock()
3619 	 * and cancel_delayed_work() in wake_lock_waiters().
3620 	 */
3621 	dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
3622 	queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
3623 	return 0;
3624 }
3625 
3626 static void rbd_img_object_requests(struct rbd_img_request *img_req)
3627 {
3628 	struct rbd_obj_request *obj_req;
3629 
3630 	rbd_assert(!img_req->pending.result && !img_req->pending.num_pending);
3631 
3632 	for_each_obj_request(img_req, obj_req) {
3633 		int result = 0;
3634 
3635 		if (__rbd_obj_handle_request(obj_req, &result)) {
3636 			if (result) {
3637 				img_req->pending.result = result;
3638 				return;
3639 			}
3640 		} else {
3641 			img_req->pending.num_pending++;
3642 		}
3643 	}
3644 }
3645 
3646 static bool rbd_img_advance(struct rbd_img_request *img_req, int *result)
3647 {
3648 	struct rbd_device *rbd_dev = img_req->rbd_dev;
3649 	int ret;
3650 
3651 again:
3652 	switch (img_req->state) {
3653 	case RBD_IMG_START:
3654 		rbd_assert(!*result);
3655 
3656 		ret = rbd_img_exclusive_lock(img_req);
3657 		if (ret < 0) {
3658 			*result = ret;
3659 			return true;
3660 		}
3661 		img_req->state = RBD_IMG_EXCLUSIVE_LOCK;
3662 		if (ret > 0)
3663 			goto again;
3664 		return false;
3665 	case RBD_IMG_EXCLUSIVE_LOCK:
3666 		if (*result)
3667 			return true;
3668 
3669 		rbd_assert(!need_exclusive_lock(img_req) ||
3670 			   __rbd_is_lock_owner(rbd_dev));
3671 
3672 		rbd_img_object_requests(img_req);
3673 		if (!img_req->pending.num_pending) {
3674 			*result = img_req->pending.result;
3675 			img_req->state = RBD_IMG_OBJECT_REQUESTS;
3676 			goto again;
3677 		}
3678 		img_req->state = __RBD_IMG_OBJECT_REQUESTS;
3679 		return false;
3680 	case __RBD_IMG_OBJECT_REQUESTS:
3681 		if (!pending_result_dec(&img_req->pending, result))
3682 			return false;
3683 		/* fall through */
3684 	case RBD_IMG_OBJECT_REQUESTS:
3685 		return true;
3686 	default:
3687 		BUG();
3688 	}
3689 }
3690 
3691 /*
3692  * Return true if @img_req is completed.
3693  */
3694 static bool __rbd_img_handle_request(struct rbd_img_request *img_req,
3695 				     int *result)
3696 {
3697 	struct rbd_device *rbd_dev = img_req->rbd_dev;
3698 	bool done;
3699 
3700 	if (need_exclusive_lock(img_req)) {
3701 		down_read(&rbd_dev->lock_rwsem);
3702 		mutex_lock(&img_req->state_mutex);
3703 		done = rbd_img_advance(img_req, result);
3704 		if (done)
3705 			rbd_lock_del_request(img_req);
3706 		mutex_unlock(&img_req->state_mutex);
3707 		up_read(&rbd_dev->lock_rwsem);
3708 	} else {
3709 		mutex_lock(&img_req->state_mutex);
3710 		done = rbd_img_advance(img_req, result);
3711 		mutex_unlock(&img_req->state_mutex);
3712 	}
3713 
3714 	if (done && *result) {
3715 		rbd_assert(*result < 0);
3716 		rbd_warn(rbd_dev, "%s%s result %d",
3717 		      test_bit(IMG_REQ_CHILD, &img_req->flags) ? "child " : "",
3718 		      obj_op_name(img_req->op_type), *result);
3719 	}
3720 	return done;
3721 }
3722 
3723 static void rbd_img_handle_request(struct rbd_img_request *img_req, int result)
3724 {
3725 again:
3726 	if (!__rbd_img_handle_request(img_req, &result))
3727 		return;
3728 
3729 	if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
3730 		struct rbd_obj_request *obj_req = img_req->obj_request;
3731 
3732 		rbd_img_request_put(img_req);
3733 		if (__rbd_obj_handle_request(obj_req, &result)) {
3734 			img_req = obj_req->img_request;
3735 			goto again;
3736 		}
3737 	} else {
3738 		struct request *rq = img_req->rq;
3739 
3740 		rbd_img_request_put(img_req);
3741 		blk_mq_end_request(rq, errno_to_blk_status(result));
3742 	}
3743 }
3744 
3745 static const struct rbd_client_id rbd_empty_cid;
3746 
3747 static bool rbd_cid_equal(const struct rbd_client_id *lhs,
3748 			  const struct rbd_client_id *rhs)
3749 {
3750 	return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
3751 }
3752 
3753 static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
3754 {
3755 	struct rbd_client_id cid;
3756 
3757 	mutex_lock(&rbd_dev->watch_mutex);
3758 	cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
3759 	cid.handle = rbd_dev->watch_cookie;
3760 	mutex_unlock(&rbd_dev->watch_mutex);
3761 	return cid;
3762 }
3763 
3764 /*
3765  * lock_rwsem must be held for write
3766  */
3767 static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
3768 			      const struct rbd_client_id *cid)
3769 {
3770 	dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
3771 	     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
3772 	     cid->gid, cid->handle);
3773 	rbd_dev->owner_cid = *cid; /* struct */
3774 }
3775 
3776 static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
3777 {
3778 	mutex_lock(&rbd_dev->watch_mutex);
3779 	sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
3780 	mutex_unlock(&rbd_dev->watch_mutex);
3781 }
3782 
3783 static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie)
3784 {
3785 	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
3786 
3787 	rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
3788 	strcpy(rbd_dev->lock_cookie, cookie);
3789 	rbd_set_owner_cid(rbd_dev, &cid);
3790 	queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
3791 }
3792 
3793 /*
3794  * lock_rwsem must be held for write
3795  */
3796 static int rbd_lock(struct rbd_device *rbd_dev)
3797 {
3798 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3799 	char cookie[32];
3800 	int ret;
3801 
3802 	WARN_ON(__rbd_is_lock_owner(rbd_dev) ||
3803 		rbd_dev->lock_cookie[0] != '\0');
3804 
3805 	format_lock_cookie(rbd_dev, cookie);
3806 	ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
3807 			    RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
3808 			    RBD_LOCK_TAG, "", 0);
3809 	if (ret)
3810 		return ret;
3811 
3812 	__rbd_lock(rbd_dev, cookie);
3813 	return 0;
3814 }
3815 
3816 /*
3817  * lock_rwsem must be held for write
3818  */
3819 static void rbd_unlock(struct rbd_device *rbd_dev)
3820 {
3821 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3822 	int ret;
3823 
3824 	WARN_ON(!__rbd_is_lock_owner(rbd_dev) ||
3825 		rbd_dev->lock_cookie[0] == '\0');
3826 
3827 	ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
3828 			      RBD_LOCK_NAME, rbd_dev->lock_cookie);
3829 	if (ret && ret != -ENOENT)
3830 		rbd_warn(rbd_dev, "failed to unlock header: %d", ret);
3831 
3832 	/* treat errors as the image is unlocked */
3833 	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
3834 	rbd_dev->lock_cookie[0] = '\0';
3835 	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3836 	queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
3837 }
3838 
3839 static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
3840 				enum rbd_notify_op notify_op,
3841 				struct page ***preply_pages,
3842 				size_t *preply_len)
3843 {
3844 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3845 	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
3846 	char buf[4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN];
3847 	int buf_size = sizeof(buf);
3848 	void *p = buf;
3849 
3850 	dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);
3851 
3852 	/* encode *LockPayload NotifyMessage (op + ClientId) */
3853 	ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
3854 	ceph_encode_32(&p, notify_op);
3855 	ceph_encode_64(&p, cid.gid);
3856 	ceph_encode_64(&p, cid.handle);
3857 
3858 	return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
3859 				&rbd_dev->header_oloc, buf, buf_size,
3860 				RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
3861 }
3862 
3863 static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
3864 			       enum rbd_notify_op notify_op)
3865 {
3866 	struct page **reply_pages;
3867 	size_t reply_len;
3868 
3869 	__rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len);
3870 	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
3871 }
3872 
3873 static void rbd_notify_acquired_lock(struct work_struct *work)
3874 {
3875 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3876 						  acquired_lock_work);
3877 
3878 	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
3879 }
3880 
3881 static void rbd_notify_released_lock(struct work_struct *work)
3882 {
3883 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3884 						  released_lock_work);
3885 
3886 	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
3887 }
3888 
3889 static int rbd_request_lock(struct rbd_device *rbd_dev)
3890 {
3891 	struct page **reply_pages;
3892 	size_t reply_len;
3893 	bool lock_owner_responded = false;
3894 	int ret;
3895 
3896 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3897 
3898 	ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
3899 				   &reply_pages, &reply_len);
3900 	if (ret && ret != -ETIMEDOUT) {
3901 		rbd_warn(rbd_dev, "failed to request lock: %d", ret);
3902 		goto out;
3903 	}
3904 
3905 	if (reply_len > 0 && reply_len <= PAGE_SIZE) {
3906 		void *p = page_address(reply_pages[0]);
3907 		void *const end = p + reply_len;
3908 		u32 n;
3909 
3910 		ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
3911 		while (n--) {
3912 			u8 struct_v;
3913 			u32 len;
3914 
3915 			ceph_decode_need(&p, end, 8 + 8, e_inval);
3916 			p += 8 + 8; /* skip gid and cookie */
3917 
3918 			ceph_decode_32_safe(&p, end, len, e_inval);
3919 			if (!len)
3920 				continue;
3921 
3922 			if (lock_owner_responded) {
3923 				rbd_warn(rbd_dev,
3924 					 "duplicate lock owners detected");
3925 				ret = -EIO;
3926 				goto out;
3927 			}
3928 
3929 			lock_owner_responded = true;
3930 			ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
3931 						  &struct_v, &len);
3932 			if (ret) {
3933 				rbd_warn(rbd_dev,
3934 					 "failed to decode ResponseMessage: %d",
3935 					 ret);
3936 				goto e_inval;
3937 			}
3938 
3939 			ret = ceph_decode_32(&p);
3940 		}
3941 	}
3942 
3943 	if (!lock_owner_responded) {
3944 		rbd_warn(rbd_dev, "no lock owners detected");
3945 		ret = -ETIMEDOUT;
3946 	}
3947 
3948 out:
3949 	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
3950 	return ret;
3951 
3952 e_inval:
3953 	ret = -EINVAL;
3954 	goto out;
3955 }
3956 
3957 /*
3958  * Either image request state machine(s) or rbd_add_acquire_lock()
3959  * (i.e. "rbd map").
3960  */
3961 static void wake_lock_waiters(struct rbd_device *rbd_dev, int result)
3962 {
3963 	struct rbd_img_request *img_req;
3964 
3965 	dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
3966 	lockdep_assert_held_write(&rbd_dev->lock_rwsem);
3967 
3968 	cancel_delayed_work(&rbd_dev->lock_dwork);
3969 	if (!completion_done(&rbd_dev->acquire_wait)) {
3970 		rbd_assert(list_empty(&rbd_dev->acquiring_list) &&
3971 			   list_empty(&rbd_dev->running_list));
3972 		rbd_dev->acquire_err = result;
3973 		complete_all(&rbd_dev->acquire_wait);
3974 		return;
3975 	}
3976 
3977 	list_for_each_entry(img_req, &rbd_dev->acquiring_list, lock_item) {
3978 		mutex_lock(&img_req->state_mutex);
3979 		rbd_assert(img_req->state == RBD_IMG_EXCLUSIVE_LOCK);
3980 		rbd_img_schedule(img_req, result);
3981 		mutex_unlock(&img_req->state_mutex);
3982 	}
3983 
3984 	list_splice_tail_init(&rbd_dev->acquiring_list, &rbd_dev->running_list);
3985 }
3986 
3987 static int get_lock_owner_info(struct rbd_device *rbd_dev,
3988 			       struct ceph_locker **lockers, u32 *num_lockers)
3989 {
3990 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3991 	u8 lock_type;
3992 	char *lock_tag;
3993 	int ret;
3994 
3995 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3996 
3997 	ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
3998 				 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3999 				 &lock_type, &lock_tag, lockers, num_lockers);
4000 	if (ret)
4001 		return ret;
4002 
4003 	if (*num_lockers == 0) {
4004 		dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
4005 		goto out;
4006 	}
4007 
4008 	if (strcmp(lock_tag, RBD_LOCK_TAG)) {
4009 		rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
4010 			 lock_tag);
4011 		ret = -EBUSY;
4012 		goto out;
4013 	}
4014 
4015 	if (lock_type == CEPH_CLS_LOCK_SHARED) {
4016 		rbd_warn(rbd_dev, "shared lock type detected");
4017 		ret = -EBUSY;
4018 		goto out;
4019 	}
4020 
4021 	if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
4022 		    strlen(RBD_LOCK_COOKIE_PREFIX))) {
4023 		rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
4024 			 (*lockers)[0].id.cookie);
4025 		ret = -EBUSY;
4026 		goto out;
4027 	}
4028 
4029 out:
4030 	kfree(lock_tag);
4031 	return ret;
4032 }
4033 
4034 static int find_watcher(struct rbd_device *rbd_dev,
4035 			const struct ceph_locker *locker)
4036 {
4037 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4038 	struct ceph_watch_item *watchers;
4039 	u32 num_watchers;
4040 	u64 cookie;
4041 	int i;
4042 	int ret;
4043 
4044 	ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
4045 				      &rbd_dev->header_oloc, &watchers,
4046 				      &num_watchers);
4047 	if (ret)
4048 		return ret;
4049 
4050 	sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
4051 	for (i = 0; i < num_watchers; i++) {
4052 		if (!memcmp(&watchers[i].addr, &locker->info.addr,
4053 			    sizeof(locker->info.addr)) &&
4054 		    watchers[i].cookie == cookie) {
4055 			struct rbd_client_id cid = {
4056 				.gid = le64_to_cpu(watchers[i].name.num),
4057 				.handle = cookie,
4058 			};
4059 
4060 			dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
4061 			     rbd_dev, cid.gid, cid.handle);
4062 			rbd_set_owner_cid(rbd_dev, &cid);
4063 			ret = 1;
4064 			goto out;
4065 		}
4066 	}
4067 
4068 	dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
4069 	ret = 0;
4070 out:
4071 	kfree(watchers);
4072 	return ret;
4073 }
4074 
4075 /*
4076  * lock_rwsem must be held for write
4077  */
4078 static int rbd_try_lock(struct rbd_device *rbd_dev)
4079 {
4080 	struct ceph_client *client = rbd_dev->rbd_client->client;
4081 	struct ceph_locker *lockers;
4082 	u32 num_lockers;
4083 	int ret;
4084 
4085 	for (;;) {
4086 		ret = rbd_lock(rbd_dev);
4087 		if (ret != -EBUSY)
4088 			return ret;
4089 
4090 		/* determine if the current lock holder is still alive */
4091 		ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
4092 		if (ret)
4093 			return ret;
4094 
4095 		if (num_lockers == 0)
4096 			goto again;
4097 
4098 		ret = find_watcher(rbd_dev, lockers);
4099 		if (ret)
4100 			goto out; /* request lock or error */
4101 
4102 		rbd_warn(rbd_dev, "breaking header lock owned by %s%llu",
4103 			 ENTITY_NAME(lockers[0].id.name));
4104 
4105 		ret = ceph_monc_blacklist_add(&client->monc,
4106 					      &lockers[0].info.addr);
4107 		if (ret) {
4108 			rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d",
4109 				 ENTITY_NAME(lockers[0].id.name), ret);
4110 			goto out;
4111 		}
4112 
4113 		ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
4114 					  &rbd_dev->header_oloc, RBD_LOCK_NAME,
4115 					  lockers[0].id.cookie,
4116 					  &lockers[0].id.name);
4117 		if (ret && ret != -ENOENT)
4118 			goto out;
4119 
4120 again:
4121 		ceph_free_lockers(lockers, num_lockers);
4122 	}
4123 
4124 out:
4125 	ceph_free_lockers(lockers, num_lockers);
4126 	return ret;
4127 }
4128 
4129 static int rbd_post_acquire_action(struct rbd_device *rbd_dev)
4130 {
4131 	int ret;
4132 
4133 	if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) {
4134 		ret = rbd_object_map_open(rbd_dev);
4135 		if (ret)
4136 			return ret;
4137 	}
4138 
4139 	return 0;
4140 }
4141 
4142 /*
4143  * Return:
4144  *   0 - lock acquired
4145  *   1 - caller should call rbd_request_lock()
4146  *  <0 - error
4147  */
4148 static int rbd_try_acquire_lock(struct rbd_device *rbd_dev)
4149 {
4150 	int ret;
4151 
4152 	down_read(&rbd_dev->lock_rwsem);
4153 	dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
4154 	     rbd_dev->lock_state);
4155 	if (__rbd_is_lock_owner(rbd_dev)) {
4156 		up_read(&rbd_dev->lock_rwsem);
4157 		return 0;
4158 	}
4159 
4160 	up_read(&rbd_dev->lock_rwsem);
4161 	down_write(&rbd_dev->lock_rwsem);
4162 	dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
4163 	     rbd_dev->lock_state);
4164 	if (__rbd_is_lock_owner(rbd_dev)) {
4165 		up_write(&rbd_dev->lock_rwsem);
4166 		return 0;
4167 	}
4168 
4169 	ret = rbd_try_lock(rbd_dev);
4170 	if (ret < 0) {
4171 		rbd_warn(rbd_dev, "failed to lock header: %d", ret);
4172 		if (ret == -EBLACKLISTED)
4173 			goto out;
4174 
4175 		ret = 1; /* request lock anyway */
4176 	}
4177 	if (ret > 0) {
4178 		up_write(&rbd_dev->lock_rwsem);
4179 		return ret;
4180 	}
4181 
4182 	rbd_assert(rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED);
4183 	rbd_assert(list_empty(&rbd_dev->running_list));
4184 
4185 	ret = rbd_post_acquire_action(rbd_dev);
4186 	if (ret) {
4187 		rbd_warn(rbd_dev, "post-acquire action failed: %d", ret);
4188 		/*
4189 		 * Can't stay in RBD_LOCK_STATE_LOCKED because
4190 		 * rbd_lock_add_request() would let the request through,
4191 		 * assuming that e.g. object map is locked and loaded.
4192 		 */
4193 		rbd_unlock(rbd_dev);
4194 	}
4195 
4196 out:
4197 	wake_lock_waiters(rbd_dev, ret);
4198 	up_write(&rbd_dev->lock_rwsem);
4199 	return ret;
4200 }
4201 
4202 static void rbd_acquire_lock(struct work_struct *work)
4203 {
4204 	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
4205 					    struct rbd_device, lock_dwork);
4206 	int ret;
4207 
4208 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4209 again:
4210 	ret = rbd_try_acquire_lock(rbd_dev);
4211 	if (ret <= 0) {
4212 		dout("%s rbd_dev %p ret %d - done\n", __func__, rbd_dev, ret);
4213 		return;
4214 	}
4215 
4216 	ret = rbd_request_lock(rbd_dev);
4217 	if (ret == -ETIMEDOUT) {
4218 		goto again; /* treat this as a dead client */
4219 	} else if (ret == -EROFS) {
4220 		rbd_warn(rbd_dev, "peer will not release lock");
4221 		down_write(&rbd_dev->lock_rwsem);
4222 		wake_lock_waiters(rbd_dev, ret);
4223 		up_write(&rbd_dev->lock_rwsem);
4224 	} else if (ret < 0) {
4225 		rbd_warn(rbd_dev, "error requesting lock: %d", ret);
4226 		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
4227 				 RBD_RETRY_DELAY);
4228 	} else {
4229 		/*
4230 		 * lock owner acked, but resend if we don't see them
4231 		 * release the lock
4232 		 */
4233 		dout("%s rbd_dev %p requeueing lock_dwork\n", __func__,
4234 		     rbd_dev);
4235 		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
4236 		    msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
4237 	}
4238 }
4239 
4240 static bool rbd_quiesce_lock(struct rbd_device *rbd_dev)
4241 {
4242 	bool need_wait;
4243 
4244 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4245 	lockdep_assert_held_write(&rbd_dev->lock_rwsem);
4246 
4247 	if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
4248 		return false;
4249 
4250 	/*
4251 	 * Ensure that all in-flight IO is flushed.
4252 	 */
4253 	rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
4254 	rbd_assert(!completion_done(&rbd_dev->releasing_wait));
4255 	need_wait = !list_empty(&rbd_dev->running_list);
4256 	downgrade_write(&rbd_dev->lock_rwsem);
4257 	if (need_wait)
4258 		wait_for_completion(&rbd_dev->releasing_wait);
4259 	up_read(&rbd_dev->lock_rwsem);
4260 
4261 	down_write(&rbd_dev->lock_rwsem);
4262 	if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
4263 		return false;
4264 
4265 	rbd_assert(list_empty(&rbd_dev->running_list));
4266 	return true;
4267 }
4268 
4269 static void rbd_pre_release_action(struct rbd_device *rbd_dev)
4270 {
4271 	if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)
4272 		rbd_object_map_close(rbd_dev);
4273 }
4274 
4275 static void __rbd_release_lock(struct rbd_device *rbd_dev)
4276 {
4277 	rbd_assert(list_empty(&rbd_dev->running_list));
4278 
4279 	rbd_pre_release_action(rbd_dev);
4280 	rbd_unlock(rbd_dev);
4281 }
4282 
4283 /*
4284  * lock_rwsem must be held for write
4285  */
4286 static void rbd_release_lock(struct rbd_device *rbd_dev)
4287 {
4288 	if (!rbd_quiesce_lock(rbd_dev))
4289 		return;
4290 
4291 	__rbd_release_lock(rbd_dev);
4292 
4293 	/*
4294 	 * Give others a chance to grab the lock - we would re-acquire
4295 	 * almost immediately if we got new IO while draining the running
4296 	 * list otherwise.  We need to ack our own notifications, so this
4297 	 * lock_dwork will be requeued from rbd_handle_released_lock() by
4298 	 * way of maybe_kick_acquire().
4299 	 */
4300 	cancel_delayed_work(&rbd_dev->lock_dwork);
4301 }
4302 
4303 static void rbd_release_lock_work(struct work_struct *work)
4304 {
4305 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
4306 						  unlock_work);
4307 
4308 	down_write(&rbd_dev->lock_rwsem);
4309 	rbd_release_lock(rbd_dev);
4310 	up_write(&rbd_dev->lock_rwsem);
4311 }
4312 
4313 static void maybe_kick_acquire(struct rbd_device *rbd_dev)
4314 {
4315 	bool have_requests;
4316 
4317 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4318 	if (__rbd_is_lock_owner(rbd_dev))
4319 		return;
4320 
4321 	spin_lock(&rbd_dev->lock_lists_lock);
4322 	have_requests = !list_empty(&rbd_dev->acquiring_list);
4323 	spin_unlock(&rbd_dev->lock_lists_lock);
4324 	if (have_requests || delayed_work_pending(&rbd_dev->lock_dwork)) {
4325 		dout("%s rbd_dev %p kicking lock_dwork\n", __func__, rbd_dev);
4326 		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
4327 	}
4328 }
4329 
4330 static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
4331 				     void **p)
4332 {
4333 	struct rbd_client_id cid = { 0 };
4334 
4335 	if (struct_v >= 2) {
4336 		cid.gid = ceph_decode_64(p);
4337 		cid.handle = ceph_decode_64(p);
4338 	}
4339 
4340 	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
4341 	     cid.handle);
4342 	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
4343 		down_write(&rbd_dev->lock_rwsem);
4344 		if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
4345 			/*
4346 			 * we already know that the remote client is
4347 			 * the owner
4348 			 */
4349 			up_write(&rbd_dev->lock_rwsem);
4350 			return;
4351 		}
4352 
4353 		rbd_set_owner_cid(rbd_dev, &cid);
4354 		downgrade_write(&rbd_dev->lock_rwsem);
4355 	} else {
4356 		down_read(&rbd_dev->lock_rwsem);
4357 	}
4358 
4359 	maybe_kick_acquire(rbd_dev);
4360 	up_read(&rbd_dev->lock_rwsem);
4361 }
4362 
4363 static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
4364 				     void **p)
4365 {
4366 	struct rbd_client_id cid = { 0 };
4367 
4368 	if (struct_v >= 2) {
4369 		cid.gid = ceph_decode_64(p);
4370 		cid.handle = ceph_decode_64(p);
4371 	}
4372 
4373 	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
4374 	     cid.handle);
4375 	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
4376 		down_write(&rbd_dev->lock_rwsem);
4377 		if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
4378 			dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n",
4379 			     __func__, rbd_dev, cid.gid, cid.handle,
4380 			     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
4381 			up_write(&rbd_dev->lock_rwsem);
4382 			return;
4383 		}
4384 
4385 		rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
4386 		downgrade_write(&rbd_dev->lock_rwsem);
4387 	} else {
4388 		down_read(&rbd_dev->lock_rwsem);
4389 	}
4390 
4391 	maybe_kick_acquire(rbd_dev);
4392 	up_read(&rbd_dev->lock_rwsem);
4393 }
4394 
4395 /*
4396  * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no
4397  * ResponseMessage is needed.
4398  */
4399 static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
4400 				   void **p)
4401 {
4402 	struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
4403 	struct rbd_client_id cid = { 0 };
4404 	int result = 1;
4405 
4406 	if (struct_v >= 2) {
4407 		cid.gid = ceph_decode_64(p);
4408 		cid.handle = ceph_decode_64(p);
4409 	}
4410 
4411 	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
4412 	     cid.handle);
4413 	if (rbd_cid_equal(&cid, &my_cid))
4414 		return result;
4415 
4416 	down_read(&rbd_dev->lock_rwsem);
4417 	if (__rbd_is_lock_owner(rbd_dev)) {
4418 		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED &&
4419 		    rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid))
4420 			goto out_unlock;
4421 
4422 		/*
4423 		 * encode ResponseMessage(0) so the peer can detect
4424 		 * a missing owner
4425 		 */
4426 		result = 0;
4427 
4428 		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
4429 			if (!rbd_dev->opts->exclusive) {
4430 				dout("%s rbd_dev %p queueing unlock_work\n",
4431 				     __func__, rbd_dev);
4432 				queue_work(rbd_dev->task_wq,
4433 					   &rbd_dev->unlock_work);
4434 			} else {
4435 				/* refuse to release the lock */
4436 				result = -EROFS;
4437 			}
4438 		}
4439 	}
4440 
4441 out_unlock:
4442 	up_read(&rbd_dev->lock_rwsem);
4443 	return result;
4444 }
4445 
4446 static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
4447 				     u64 notify_id, u64 cookie, s32 *result)
4448 {
4449 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4450 	char buf[4 + CEPH_ENCODING_START_BLK_LEN];
4451 	int buf_size = sizeof(buf);
4452 	int ret;
4453 
4454 	if (result) {
4455 		void *p = buf;
4456 
4457 		/* encode ResponseMessage */
4458 		ceph_start_encoding(&p, 1, 1,
4459 				    buf_size - CEPH_ENCODING_START_BLK_LEN);
4460 		ceph_encode_32(&p, *result);
4461 	} else {
4462 		buf_size = 0;
4463 	}
4464 
4465 	ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
4466 				   &rbd_dev->header_oloc, notify_id, cookie,
4467 				   buf, buf_size);
4468 	if (ret)
4469 		rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
4470 }
4471 
4472 static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
4473 				   u64 cookie)
4474 {
4475 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4476 	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
4477 }
4478 
4479 static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
4480 					  u64 notify_id, u64 cookie, s32 result)
4481 {
4482 	dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
4483 	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
4484 }
4485 
4486 static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
4487 			 u64 notifier_id, void *data, size_t data_len)
4488 {
4489 	struct rbd_device *rbd_dev = arg;
4490 	void *p = data;
4491 	void *const end = p + data_len;
4492 	u8 struct_v = 0;
4493 	u32 len;
4494 	u32 notify_op;
4495 	int ret;
4496 
4497 	dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
4498 	     __func__, rbd_dev, cookie, notify_id, data_len);
4499 	if (data_len) {
4500 		ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
4501 					  &struct_v, &len);
4502 		if (ret) {
4503 			rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
4504 				 ret);
4505 			return;
4506 		}
4507 
4508 		notify_op = ceph_decode_32(&p);
4509 	} else {
4510 		/* legacy notification for header updates */
4511 		notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
4512 		len = 0;
4513 	}
4514 
4515 	dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
4516 	switch (notify_op) {
4517 	case RBD_NOTIFY_OP_ACQUIRED_LOCK:
4518 		rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
4519 		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4520 		break;
4521 	case RBD_NOTIFY_OP_RELEASED_LOCK:
4522 		rbd_handle_released_lock(rbd_dev, struct_v, &p);
4523 		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4524 		break;
4525 	case RBD_NOTIFY_OP_REQUEST_LOCK:
4526 		ret = rbd_handle_request_lock(rbd_dev, struct_v, &p);
4527 		if (ret <= 0)
4528 			rbd_acknowledge_notify_result(rbd_dev, notify_id,
4529 						      cookie, ret);
4530 		else
4531 			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4532 		break;
4533 	case RBD_NOTIFY_OP_HEADER_UPDATE:
4534 		ret = rbd_dev_refresh(rbd_dev);
4535 		if (ret)
4536 			rbd_warn(rbd_dev, "refresh failed: %d", ret);
4537 
4538 		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4539 		break;
4540 	default:
4541 		if (rbd_is_lock_owner(rbd_dev))
4542 			rbd_acknowledge_notify_result(rbd_dev, notify_id,
4543 						      cookie, -EOPNOTSUPP);
4544 		else
4545 			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4546 		break;
4547 	}
4548 }
4549 
4550 static void __rbd_unregister_watch(struct rbd_device *rbd_dev);
4551 
4552 static void rbd_watch_errcb(void *arg, u64 cookie, int err)
4553 {
4554 	struct rbd_device *rbd_dev = arg;
4555 
4556 	rbd_warn(rbd_dev, "encountered watch error: %d", err);
4557 
4558 	down_write(&rbd_dev->lock_rwsem);
4559 	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
4560 	up_write(&rbd_dev->lock_rwsem);
4561 
4562 	mutex_lock(&rbd_dev->watch_mutex);
4563 	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
4564 		__rbd_unregister_watch(rbd_dev);
4565 		rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;
4566 
4567 		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
4568 	}
4569 	mutex_unlock(&rbd_dev->watch_mutex);
4570 }
4571 
4572 /*
4573  * watch_mutex must be locked
4574  */
4575 static int __rbd_register_watch(struct rbd_device *rbd_dev)
4576 {
4577 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4578 	struct ceph_osd_linger_request *handle;
4579 
4580 	rbd_assert(!rbd_dev->watch_handle);
4581 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4582 
4583 	handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
4584 				 &rbd_dev->header_oloc, rbd_watch_cb,
4585 				 rbd_watch_errcb, rbd_dev);
4586 	if (IS_ERR(handle))
4587 		return PTR_ERR(handle);
4588 
4589 	rbd_dev->watch_handle = handle;
4590 	return 0;
4591 }
4592 
4593 /*
4594  * watch_mutex must be locked
4595  */
4596 static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
4597 {
4598 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4599 	int ret;
4600 
4601 	rbd_assert(rbd_dev->watch_handle);
4602 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4603 
4604 	ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
4605 	if (ret)
4606 		rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
4607 
4608 	rbd_dev->watch_handle = NULL;
4609 }
4610 
4611 static int rbd_register_watch(struct rbd_device *rbd_dev)
4612 {
4613 	int ret;
4614 
4615 	mutex_lock(&rbd_dev->watch_mutex);
4616 	rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
4617 	ret = __rbd_register_watch(rbd_dev);
4618 	if (ret)
4619 		goto out;
4620 
4621 	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
4622 	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
4623 
4624 out:
4625 	mutex_unlock(&rbd_dev->watch_mutex);
4626 	return ret;
4627 }
4628 
4629 static void cancel_tasks_sync(struct rbd_device *rbd_dev)
4630 {
4631 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4632 
4633 	cancel_work_sync(&rbd_dev->acquired_lock_work);
4634 	cancel_work_sync(&rbd_dev->released_lock_work);
4635 	cancel_delayed_work_sync(&rbd_dev->lock_dwork);
4636 	cancel_work_sync(&rbd_dev->unlock_work);
4637 }
4638 
4639 static void rbd_unregister_watch(struct rbd_device *rbd_dev)
4640 {
4641 	cancel_tasks_sync(rbd_dev);
4642 
4643 	mutex_lock(&rbd_dev->watch_mutex);
4644 	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
4645 		__rbd_unregister_watch(rbd_dev);
4646 	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
4647 	mutex_unlock(&rbd_dev->watch_mutex);
4648 
4649 	cancel_delayed_work_sync(&rbd_dev->watch_dwork);
4650 	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
4651 }
4652 
4653 /*
4654  * lock_rwsem must be held for write
4655  */
4656 static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
4657 {
4658 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4659 	char cookie[32];
4660 	int ret;
4661 
4662 	if (!rbd_quiesce_lock(rbd_dev))
4663 		return;
4664 
4665 	format_lock_cookie(rbd_dev, cookie);
4666 	ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
4667 				  &rbd_dev->header_oloc, RBD_LOCK_NAME,
4668 				  CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie,
4669 				  RBD_LOCK_TAG, cookie);
4670 	if (ret) {
4671 		if (ret != -EOPNOTSUPP)
4672 			rbd_warn(rbd_dev, "failed to update lock cookie: %d",
4673 				 ret);
4674 
4675 		/*
4676 		 * Lock cookie cannot be updated on older OSDs, so do
4677 		 * a manual release and queue an acquire.
4678 		 */
4679 		__rbd_release_lock(rbd_dev);
4680 		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
4681 	} else {
4682 		__rbd_lock(rbd_dev, cookie);
4683 		wake_lock_waiters(rbd_dev, 0);
4684 	}
4685 }
4686 
4687 static void rbd_reregister_watch(struct work_struct *work)
4688 {
4689 	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
4690 					    struct rbd_device, watch_dwork);
4691 	int ret;
4692 
4693 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4694 
4695 	mutex_lock(&rbd_dev->watch_mutex);
4696 	if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
4697 		mutex_unlock(&rbd_dev->watch_mutex);
4698 		return;
4699 	}
4700 
4701 	ret = __rbd_register_watch(rbd_dev);
4702 	if (ret) {
4703 		rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
4704 		if (ret != -EBLACKLISTED && ret != -ENOENT) {
4705 			queue_delayed_work(rbd_dev->task_wq,
4706 					   &rbd_dev->watch_dwork,
4707 					   RBD_RETRY_DELAY);
4708 			mutex_unlock(&rbd_dev->watch_mutex);
4709 			return;
4710 		}
4711 
4712 		mutex_unlock(&rbd_dev->watch_mutex);
4713 		down_write(&rbd_dev->lock_rwsem);
4714 		wake_lock_waiters(rbd_dev, ret);
4715 		up_write(&rbd_dev->lock_rwsem);
4716 		return;
4717 	}
4718 
4719 	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
4720 	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
4721 	mutex_unlock(&rbd_dev->watch_mutex);
4722 
4723 	down_write(&rbd_dev->lock_rwsem);
4724 	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
4725 		rbd_reacquire_lock(rbd_dev);
4726 	up_write(&rbd_dev->lock_rwsem);
4727 
4728 	ret = rbd_dev_refresh(rbd_dev);
4729 	if (ret)
4730 		rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret);
4731 }
4732 
4733 /*
4734  * Synchronous osd object method call.  Returns the number of bytes
4735  * returned in the outbound buffer, or a negative error code.
4736  */
4737 static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
4738 			     struct ceph_object_id *oid,
4739 			     struct ceph_object_locator *oloc,
4740 			     const char *method_name,
4741 			     const void *outbound,
4742 			     size_t outbound_size,
4743 			     void *inbound,
4744 			     size_t inbound_size)
4745 {
4746 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4747 	struct page *req_page = NULL;
4748 	struct page *reply_page;
4749 	int ret;
4750 
4751 	/*
4752 	 * Method calls are ultimately read operations.  The result
4753 	 * should placed into the inbound buffer provided.  They
4754 	 * also supply outbound data--parameters for the object
4755 	 * method.  Currently if this is present it will be a
4756 	 * snapshot id.
4757 	 */
4758 	if (outbound) {
4759 		if (outbound_size > PAGE_SIZE)
4760 			return -E2BIG;
4761 
4762 		req_page = alloc_page(GFP_KERNEL);
4763 		if (!req_page)
4764 			return -ENOMEM;
4765 
4766 		memcpy(page_address(req_page), outbound, outbound_size);
4767 	}
4768 
4769 	reply_page = alloc_page(GFP_KERNEL);
4770 	if (!reply_page) {
4771 		if (req_page)
4772 			__free_page(req_page);
4773 		return -ENOMEM;
4774 	}
4775 
4776 	ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
4777 			     CEPH_OSD_FLAG_READ, req_page, outbound_size,
4778 			     &reply_page, &inbound_size);
4779 	if (!ret) {
4780 		memcpy(inbound, page_address(reply_page), inbound_size);
4781 		ret = inbound_size;
4782 	}
4783 
4784 	if (req_page)
4785 		__free_page(req_page);
4786 	__free_page(reply_page);
4787 	return ret;
4788 }
4789 
4790 static void rbd_queue_workfn(struct work_struct *work)
4791 {
4792 	struct request *rq = blk_mq_rq_from_pdu(work);
4793 	struct rbd_device *rbd_dev = rq->q->queuedata;
4794 	struct rbd_img_request *img_request;
4795 	struct ceph_snap_context *snapc = NULL;
4796 	u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
4797 	u64 length = blk_rq_bytes(rq);
4798 	enum obj_operation_type op_type;
4799 	u64 mapping_size;
4800 	int result;
4801 
4802 	switch (req_op(rq)) {
4803 	case REQ_OP_DISCARD:
4804 		op_type = OBJ_OP_DISCARD;
4805 		break;
4806 	case REQ_OP_WRITE_ZEROES:
4807 		op_type = OBJ_OP_ZEROOUT;
4808 		break;
4809 	case REQ_OP_WRITE:
4810 		op_type = OBJ_OP_WRITE;
4811 		break;
4812 	case REQ_OP_READ:
4813 		op_type = OBJ_OP_READ;
4814 		break;
4815 	default:
4816 		dout("%s: non-fs request type %d\n", __func__, req_op(rq));
4817 		result = -EIO;
4818 		goto err;
4819 	}
4820 
4821 	/* Ignore/skip any zero-length requests */
4822 
4823 	if (!length) {
4824 		dout("%s: zero-length request\n", __func__);
4825 		result = 0;
4826 		goto err_rq;
4827 	}
4828 
4829 	if (op_type != OBJ_OP_READ && rbd_dev->spec->snap_id != CEPH_NOSNAP) {
4830 		rbd_warn(rbd_dev, "%s on read-only snapshot",
4831 			 obj_op_name(op_type));
4832 		result = -EIO;
4833 		goto err;
4834 	}
4835 
4836 	/*
4837 	 * Quit early if the mapped snapshot no longer exists.  It's
4838 	 * still possible the snapshot will have disappeared by the
4839 	 * time our request arrives at the osd, but there's no sense in
4840 	 * sending it if we already know.
4841 	 */
4842 	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
4843 		dout("request for non-existent snapshot");
4844 		rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
4845 		result = -ENXIO;
4846 		goto err_rq;
4847 	}
4848 
4849 	if (offset && length > U64_MAX - offset + 1) {
4850 		rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
4851 			 length);
4852 		result = -EINVAL;
4853 		goto err_rq;	/* Shouldn't happen */
4854 	}
4855 
4856 	blk_mq_start_request(rq);
4857 
4858 	down_read(&rbd_dev->header_rwsem);
4859 	mapping_size = rbd_dev->mapping.size;
4860 	if (op_type != OBJ_OP_READ) {
4861 		snapc = rbd_dev->header.snapc;
4862 		ceph_get_snap_context(snapc);
4863 	}
4864 	up_read(&rbd_dev->header_rwsem);
4865 
4866 	if (offset + length > mapping_size) {
4867 		rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
4868 			 length, mapping_size);
4869 		result = -EIO;
4870 		goto err_rq;
4871 	}
4872 
4873 	img_request = rbd_img_request_create(rbd_dev, op_type, snapc);
4874 	if (!img_request) {
4875 		result = -ENOMEM;
4876 		goto err_rq;
4877 	}
4878 	img_request->rq = rq;
4879 	snapc = NULL; /* img_request consumes a ref */
4880 
4881 	dout("%s rbd_dev %p img_req %p %s %llu~%llu\n", __func__, rbd_dev,
4882 	     img_request, obj_op_name(op_type), offset, length);
4883 
4884 	if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT)
4885 		result = rbd_img_fill_nodata(img_request, offset, length);
4886 	else
4887 		result = rbd_img_fill_from_bio(img_request, offset, length,
4888 					       rq->bio);
4889 	if (result)
4890 		goto err_img_request;
4891 
4892 	rbd_img_handle_request(img_request, 0);
4893 	return;
4894 
4895 err_img_request:
4896 	rbd_img_request_put(img_request);
4897 err_rq:
4898 	if (result)
4899 		rbd_warn(rbd_dev, "%s %llx at %llx result %d",
4900 			 obj_op_name(op_type), length, offset, result);
4901 	ceph_put_snap_context(snapc);
4902 err:
4903 	blk_mq_end_request(rq, errno_to_blk_status(result));
4904 }
4905 
4906 static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
4907 		const struct blk_mq_queue_data *bd)
4908 {
4909 	struct request *rq = bd->rq;
4910 	struct work_struct *work = blk_mq_rq_to_pdu(rq);
4911 
4912 	queue_work(rbd_wq, work);
4913 	return BLK_STS_OK;
4914 }
4915 
4916 static void rbd_free_disk(struct rbd_device *rbd_dev)
4917 {
4918 	blk_cleanup_queue(rbd_dev->disk->queue);
4919 	blk_mq_free_tag_set(&rbd_dev->tag_set);
4920 	put_disk(rbd_dev->disk);
4921 	rbd_dev->disk = NULL;
4922 }
4923 
4924 static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
4925 			     struct ceph_object_id *oid,
4926 			     struct ceph_object_locator *oloc,
4927 			     void *buf, int buf_len)
4928 
4929 {
4930 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4931 	struct ceph_osd_request *req;
4932 	struct page **pages;
4933 	int num_pages = calc_pages_for(0, buf_len);
4934 	int ret;
4935 
4936 	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
4937 	if (!req)
4938 		return -ENOMEM;
4939 
4940 	ceph_oid_copy(&req->r_base_oid, oid);
4941 	ceph_oloc_copy(&req->r_base_oloc, oloc);
4942 	req->r_flags = CEPH_OSD_FLAG_READ;
4943 
4944 	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
4945 	if (IS_ERR(pages)) {
4946 		ret = PTR_ERR(pages);
4947 		goto out_req;
4948 	}
4949 
4950 	osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
4951 	osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
4952 					 true);
4953 
4954 	ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
4955 	if (ret)
4956 		goto out_req;
4957 
4958 	ceph_osdc_start_request(osdc, req, false);
4959 	ret = ceph_osdc_wait_request(osdc, req);
4960 	if (ret >= 0)
4961 		ceph_copy_from_page_vector(pages, buf, 0, ret);
4962 
4963 out_req:
4964 	ceph_osdc_put_request(req);
4965 	return ret;
4966 }
4967 
4968 /*
4969  * Read the complete header for the given rbd device.  On successful
4970  * return, the rbd_dev->header field will contain up-to-date
4971  * information about the image.
4972  */
4973 static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
4974 {
4975 	struct rbd_image_header_ondisk *ondisk = NULL;
4976 	u32 snap_count = 0;
4977 	u64 names_size = 0;
4978 	u32 want_count;
4979 	int ret;
4980 
4981 	/*
4982 	 * The complete header will include an array of its 64-bit
4983 	 * snapshot ids, followed by the names of those snapshots as
4984 	 * a contiguous block of NUL-terminated strings.  Note that
4985 	 * the number of snapshots could change by the time we read
4986 	 * it in, in which case we re-read it.
4987 	 */
4988 	do {
4989 		size_t size;
4990 
4991 		kfree(ondisk);
4992 
4993 		size = sizeof (*ondisk);
4994 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
4995 		size += names_size;
4996 		ondisk = kmalloc(size, GFP_KERNEL);
4997 		if (!ondisk)
4998 			return -ENOMEM;
4999 
5000 		ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
5001 					&rbd_dev->header_oloc, ondisk, size);
5002 		if (ret < 0)
5003 			goto out;
5004 		if ((size_t)ret < size) {
5005 			ret = -ENXIO;
5006 			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
5007 				size, ret);
5008 			goto out;
5009 		}
5010 		if (!rbd_dev_ondisk_valid(ondisk)) {
5011 			ret = -ENXIO;
5012 			rbd_warn(rbd_dev, "invalid header");
5013 			goto out;
5014 		}
5015 
5016 		names_size = le64_to_cpu(ondisk->snap_names_len);
5017 		want_count = snap_count;
5018 		snap_count = le32_to_cpu(ondisk->snap_count);
5019 	} while (snap_count != want_count);
5020 
5021 	ret = rbd_header_from_disk(rbd_dev, ondisk);
5022 out:
5023 	kfree(ondisk);
5024 
5025 	return ret;
5026 }
5027 
5028 /*
5029  * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
5030  * has disappeared from the (just updated) snapshot context.
5031  */
5032 static void rbd_exists_validate(struct rbd_device *rbd_dev)
5033 {
5034 	u64 snap_id;
5035 
5036 	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
5037 		return;
5038 
5039 	snap_id = rbd_dev->spec->snap_id;
5040 	if (snap_id == CEPH_NOSNAP)
5041 		return;
5042 
5043 	if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
5044 		clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
5045 }
5046 
5047 static void rbd_dev_update_size(struct rbd_device *rbd_dev)
5048 {
5049 	sector_t size;
5050 
5051 	/*
5052 	 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
5053 	 * try to update its size.  If REMOVING is set, updating size
5054 	 * is just useless work since the device can't be opened.
5055 	 */
5056 	if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
5057 	    !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
5058 		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
5059 		dout("setting size to %llu sectors", (unsigned long long)size);
5060 		set_capacity(rbd_dev->disk, size);
5061 		revalidate_disk(rbd_dev->disk);
5062 	}
5063 }
5064 
5065 static int rbd_dev_refresh(struct rbd_device *rbd_dev)
5066 {
5067 	u64 mapping_size;
5068 	int ret;
5069 
5070 	down_write(&rbd_dev->header_rwsem);
5071 	mapping_size = rbd_dev->mapping.size;
5072 
5073 	ret = rbd_dev_header_info(rbd_dev);
5074 	if (ret)
5075 		goto out;
5076 
5077 	/*
5078 	 * If there is a parent, see if it has disappeared due to the
5079 	 * mapped image getting flattened.
5080 	 */
5081 	if (rbd_dev->parent) {
5082 		ret = rbd_dev_v2_parent_info(rbd_dev);
5083 		if (ret)
5084 			goto out;
5085 	}
5086 
5087 	if (rbd_dev->spec->snap_id == CEPH_NOSNAP) {
5088 		rbd_dev->mapping.size = rbd_dev->header.image_size;
5089 	} else {
5090 		/* validate mapped snapshot's EXISTS flag */
5091 		rbd_exists_validate(rbd_dev);
5092 	}
5093 
5094 out:
5095 	up_write(&rbd_dev->header_rwsem);
5096 	if (!ret && mapping_size != rbd_dev->mapping.size)
5097 		rbd_dev_update_size(rbd_dev);
5098 
5099 	return ret;
5100 }
5101 
5102 static int rbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
5103 		unsigned int hctx_idx, unsigned int numa_node)
5104 {
5105 	struct work_struct *work = blk_mq_rq_to_pdu(rq);
5106 
5107 	INIT_WORK(work, rbd_queue_workfn);
5108 	return 0;
5109 }
5110 
5111 static const struct blk_mq_ops rbd_mq_ops = {
5112 	.queue_rq	= rbd_queue_rq,
5113 	.init_request	= rbd_init_request,
5114 };
5115 
5116 static int rbd_init_disk(struct rbd_device *rbd_dev)
5117 {
5118 	struct gendisk *disk;
5119 	struct request_queue *q;
5120 	unsigned int objset_bytes =
5121 	    rbd_dev->layout.object_size * rbd_dev->layout.stripe_count;
5122 	int err;
5123 
5124 	/* create gendisk info */
5125 	disk = alloc_disk(single_major ?
5126 			  (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
5127 			  RBD_MINORS_PER_MAJOR);
5128 	if (!disk)
5129 		return -ENOMEM;
5130 
5131 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
5132 		 rbd_dev->dev_id);
5133 	disk->major = rbd_dev->major;
5134 	disk->first_minor = rbd_dev->minor;
5135 	if (single_major)
5136 		disk->flags |= GENHD_FL_EXT_DEVT;
5137 	disk->fops = &rbd_bd_ops;
5138 	disk->private_data = rbd_dev;
5139 
5140 	memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
5141 	rbd_dev->tag_set.ops = &rbd_mq_ops;
5142 	rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
5143 	rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
5144 	rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
5145 	rbd_dev->tag_set.nr_hw_queues = 1;
5146 	rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
5147 
5148 	err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
5149 	if (err)
5150 		goto out_disk;
5151 
5152 	q = blk_mq_init_queue(&rbd_dev->tag_set);
5153 	if (IS_ERR(q)) {
5154 		err = PTR_ERR(q);
5155 		goto out_tag_set;
5156 	}
5157 
5158 	blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
5159 	/* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
5160 
5161 	blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT);
5162 	q->limits.max_sectors = queue_max_hw_sectors(q);
5163 	blk_queue_max_segments(q, USHRT_MAX);
5164 	blk_queue_max_segment_size(q, UINT_MAX);
5165 	blk_queue_io_min(q, rbd_dev->opts->alloc_size);
5166 	blk_queue_io_opt(q, rbd_dev->opts->alloc_size);
5167 
5168 	if (rbd_dev->opts->trim) {
5169 		blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
5170 		q->limits.discard_granularity = rbd_dev->opts->alloc_size;
5171 		blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT);
5172 		blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT);
5173 	}
5174 
5175 	if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
5176 		q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
5177 
5178 	/*
5179 	 * disk_release() expects a queue ref from add_disk() and will
5180 	 * put it.  Hold an extra ref until add_disk() is called.
5181 	 */
5182 	WARN_ON(!blk_get_queue(q));
5183 	disk->queue = q;
5184 	q->queuedata = rbd_dev;
5185 
5186 	rbd_dev->disk = disk;
5187 
5188 	return 0;
5189 out_tag_set:
5190 	blk_mq_free_tag_set(&rbd_dev->tag_set);
5191 out_disk:
5192 	put_disk(disk);
5193 	return err;
5194 }
5195 
5196 /*
5197   sysfs
5198 */
5199 
5200 static struct rbd_device *dev_to_rbd_dev(struct device *dev)
5201 {
5202 	return container_of(dev, struct rbd_device, dev);
5203 }
5204 
5205 static ssize_t rbd_size_show(struct device *dev,
5206 			     struct device_attribute *attr, char *buf)
5207 {
5208 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5209 
5210 	return sprintf(buf, "%llu\n",
5211 		(unsigned long long)rbd_dev->mapping.size);
5212 }
5213 
5214 /*
5215  * Note this shows the features for whatever's mapped, which is not
5216  * necessarily the base image.
5217  */
5218 static ssize_t rbd_features_show(struct device *dev,
5219 			     struct device_attribute *attr, char *buf)
5220 {
5221 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5222 
5223 	return sprintf(buf, "0x%016llx\n",
5224 			(unsigned long long)rbd_dev->mapping.features);
5225 }
5226 
5227 static ssize_t rbd_major_show(struct device *dev,
5228 			      struct device_attribute *attr, char *buf)
5229 {
5230 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5231 
5232 	if (rbd_dev->major)
5233 		return sprintf(buf, "%d\n", rbd_dev->major);
5234 
5235 	return sprintf(buf, "(none)\n");
5236 }
5237 
5238 static ssize_t rbd_minor_show(struct device *dev,
5239 			      struct device_attribute *attr, char *buf)
5240 {
5241 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5242 
5243 	return sprintf(buf, "%d\n", rbd_dev->minor);
5244 }
5245 
5246 static ssize_t rbd_client_addr_show(struct device *dev,
5247 				    struct device_attribute *attr, char *buf)
5248 {
5249 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5250 	struct ceph_entity_addr *client_addr =
5251 	    ceph_client_addr(rbd_dev->rbd_client->client);
5252 
5253 	return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
5254 		       le32_to_cpu(client_addr->nonce));
5255 }
5256 
5257 static ssize_t rbd_client_id_show(struct device *dev,
5258 				  struct device_attribute *attr, char *buf)
5259 {
5260 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5261 
5262 	return sprintf(buf, "client%lld\n",
5263 		       ceph_client_gid(rbd_dev->rbd_client->client));
5264 }
5265 
5266 static ssize_t rbd_cluster_fsid_show(struct device *dev,
5267 				     struct device_attribute *attr, char *buf)
5268 {
5269 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5270 
5271 	return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
5272 }
5273 
5274 static ssize_t rbd_config_info_show(struct device *dev,
5275 				    struct device_attribute *attr, char *buf)
5276 {
5277 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5278 
5279 	return sprintf(buf, "%s\n", rbd_dev->config_info);
5280 }
5281 
5282 static ssize_t rbd_pool_show(struct device *dev,
5283 			     struct device_attribute *attr, char *buf)
5284 {
5285 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5286 
5287 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
5288 }
5289 
5290 static ssize_t rbd_pool_id_show(struct device *dev,
5291 			     struct device_attribute *attr, char *buf)
5292 {
5293 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5294 
5295 	return sprintf(buf, "%llu\n",
5296 			(unsigned long long) rbd_dev->spec->pool_id);
5297 }
5298 
5299 static ssize_t rbd_pool_ns_show(struct device *dev,
5300 				struct device_attribute *attr, char *buf)
5301 {
5302 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5303 
5304 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_ns ?: "");
5305 }
5306 
5307 static ssize_t rbd_name_show(struct device *dev,
5308 			     struct device_attribute *attr, char *buf)
5309 {
5310 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5311 
5312 	if (rbd_dev->spec->image_name)
5313 		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
5314 
5315 	return sprintf(buf, "(unknown)\n");
5316 }
5317 
5318 static ssize_t rbd_image_id_show(struct device *dev,
5319 			     struct device_attribute *attr, char *buf)
5320 {
5321 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5322 
5323 	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
5324 }
5325 
5326 /*
5327  * Shows the name of the currently-mapped snapshot (or
5328  * RBD_SNAP_HEAD_NAME for the base image).
5329  */
5330 static ssize_t rbd_snap_show(struct device *dev,
5331 			     struct device_attribute *attr,
5332 			     char *buf)
5333 {
5334 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5335 
5336 	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
5337 }
5338 
5339 static ssize_t rbd_snap_id_show(struct device *dev,
5340 				struct device_attribute *attr, char *buf)
5341 {
5342 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5343 
5344 	return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
5345 }
5346 
5347 /*
5348  * For a v2 image, shows the chain of parent images, separated by empty
5349  * lines.  For v1 images or if there is no parent, shows "(no parent
5350  * image)".
5351  */
5352 static ssize_t rbd_parent_show(struct device *dev,
5353 			       struct device_attribute *attr,
5354 			       char *buf)
5355 {
5356 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5357 	ssize_t count = 0;
5358 
5359 	if (!rbd_dev->parent)
5360 		return sprintf(buf, "(no parent image)\n");
5361 
5362 	for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
5363 		struct rbd_spec *spec = rbd_dev->parent_spec;
5364 
5365 		count += sprintf(&buf[count], "%s"
5366 			    "pool_id %llu\npool_name %s\n"
5367 			    "pool_ns %s\n"
5368 			    "image_id %s\nimage_name %s\n"
5369 			    "snap_id %llu\nsnap_name %s\n"
5370 			    "overlap %llu\n",
5371 			    !count ? "" : "\n", /* first? */
5372 			    spec->pool_id, spec->pool_name,
5373 			    spec->pool_ns ?: "",
5374 			    spec->image_id, spec->image_name ?: "(unknown)",
5375 			    spec->snap_id, spec->snap_name,
5376 			    rbd_dev->parent_overlap);
5377 	}
5378 
5379 	return count;
5380 }
5381 
5382 static ssize_t rbd_image_refresh(struct device *dev,
5383 				 struct device_attribute *attr,
5384 				 const char *buf,
5385 				 size_t size)
5386 {
5387 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5388 	int ret;
5389 
5390 	ret = rbd_dev_refresh(rbd_dev);
5391 	if (ret)
5392 		return ret;
5393 
5394 	return size;
5395 }
5396 
/*
 * Per-device sysfs attributes.  Each line ties an attribute name and
 * its permission bits to the handler defined above.  Everything is
 * world-readable (0444) except config_info, which is owner-read-only
 * (0400), and refresh, which is owner-write-only (0200) and has a
 * store handler but no show handler.
 */
static DEVICE_ATTR(size, 0444, rbd_size_show, NULL);
static DEVICE_ATTR(features, 0444, rbd_features_show, NULL);
static DEVICE_ATTR(major, 0444, rbd_major_show, NULL);
static DEVICE_ATTR(minor, 0444, rbd_minor_show, NULL);
static DEVICE_ATTR(client_addr, 0444, rbd_client_addr_show, NULL);
static DEVICE_ATTR(client_id, 0444, rbd_client_id_show, NULL);
static DEVICE_ATTR(cluster_fsid, 0444, rbd_cluster_fsid_show, NULL);
static DEVICE_ATTR(config_info, 0400, rbd_config_info_show, NULL);
static DEVICE_ATTR(pool, 0444, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, 0444, rbd_pool_id_show, NULL);
static DEVICE_ATTR(pool_ns, 0444, rbd_pool_ns_show, NULL);
static DEVICE_ATTR(name, 0444, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, 0444, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, 0200, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, 0444, rbd_snap_show, NULL);
static DEVICE_ATTR(snap_id, 0444, rbd_snap_id_show, NULL);
static DEVICE_ATTR(parent, 0444, rbd_parent_show, NULL);
5414 
/*
 * NULL-terminated list of the attributes declared above; referenced by
 * rbd_attr_group below.
 */
static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_minor.attr,
	&dev_attr_client_addr.attr,
	&dev_attr_client_id.attr,
	&dev_attr_cluster_fsid.attr,
	&dev_attr_config_info.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_pool_ns.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_parent.attr,
	&dev_attr_refresh.attr,
	NULL
};
5435 
/* The single (unnamed) attribute group holding all rbd attributes. */
static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

/* NULL-terminated group list installed through rbd_device_type.groups. */
static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};
5444 
/* Defined further down; needed here for the device type below. */
static void rbd_dev_release(struct device *dev);

/*
 * Device type assigned to every rbd_device in __rbd_dev_create(): it
 * supplies the sysfs attribute groups and the release callback.
 */
static const struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_dev_release,
};
5452 
5453 static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
5454 {
5455 	kref_get(&spec->kref);
5456 
5457 	return spec;
5458 }
5459 
5460 static void rbd_spec_free(struct kref *kref);
5461 static void rbd_spec_put(struct rbd_spec *spec)
5462 {
5463 	if (spec)
5464 		kref_put(&spec->kref, rbd_spec_free);
5465 }
5466 
5467 static struct rbd_spec *rbd_spec_alloc(void)
5468 {
5469 	struct rbd_spec *spec;
5470 
5471 	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
5472 	if (!spec)
5473 		return NULL;
5474 
5475 	spec->pool_id = CEPH_NOPOOL;
5476 	spec->snap_id = CEPH_NOSNAP;
5477 	kref_init(&spec->kref);
5478 
5479 	return spec;
5480 }
5481 
5482 static void rbd_spec_free(struct kref *kref)
5483 {
5484 	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
5485 
5486 	kfree(spec->pool_name);
5487 	kfree(spec->pool_ns);
5488 	kfree(spec->image_id);
5489 	kfree(spec->image_name);
5490 	kfree(spec->snap_name);
5491 	kfree(spec);
5492 }
5493 
5494 static void rbd_dev_free(struct rbd_device *rbd_dev)
5495 {
5496 	WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
5497 	WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);
5498 
5499 	ceph_oid_destroy(&rbd_dev->header_oid);
5500 	ceph_oloc_destroy(&rbd_dev->header_oloc);
5501 	kfree(rbd_dev->config_info);
5502 
5503 	rbd_put_client(rbd_dev->rbd_client);
5504 	rbd_spec_put(rbd_dev->spec);
5505 	kfree(rbd_dev->opts);
5506 	kfree(rbd_dev);
5507 }
5508 
5509 static void rbd_dev_release(struct device *dev)
5510 {
5511 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5512 	bool need_put = !!rbd_dev->opts;
5513 
5514 	if (need_put) {
5515 		destroy_workqueue(rbd_dev->task_wq);
5516 		ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
5517 	}
5518 
5519 	rbd_dev_free(rbd_dev);
5520 
5521 	/*
5522 	 * This is racy, but way better than putting module outside of
5523 	 * the release callback.  The race window is pretty small, so
5524 	 * doing something similar to dm (dm-builtin.c) is overkill.
5525 	 */
5526 	if (need_put)
5527 		module_put(THIS_MODULE);
5528 }
5529 
/*
 * Allocate an rbd_device and initialise everything that does not
 * depend on mapping options: locks, lists, work items, completions,
 * the header object locator and the embedded struct device.
 *
 * @rbdc and @spec are stored in the new device as-is; no additional
 * reference is taken here, and rbd_dev_free() later puts both.
 *
 * Returns the new device, or NULL if the allocation fails.
 */
static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
					   struct rbd_spec *spec)
{
	struct rbd_device *rbd_dev;

	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		return NULL;

	spin_lock_init(&rbd_dev->lock);
	INIT_LIST_HEAD(&rbd_dev->node);
	init_rwsem(&rbd_dev->header_rwsem);

	/* Header object location: pool (and namespace) come from @spec. */
	rbd_dev->header.data_pool_id = CEPH_NOPOOL;
	ceph_oid_init(&rbd_dev->header_oid);
	rbd_dev->header_oloc.pool = spec->pool_id;
	if (spec->pool_ns) {
		/* A namespace that is present is expected to be non-empty. */
		WARN_ON(!*spec->pool_ns);
		rbd_dev->header_oloc.pool_ns =
		    ceph_find_or_create_string(spec->pool_ns,
					       strlen(spec->pool_ns));
	}

	/* Watch state. */
	mutex_init(&rbd_dev->watch_mutex);
	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
	INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);

	/* Lock state. */
	init_rwsem(&rbd_dev->lock_rwsem);
	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
	INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
	INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
	INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
	INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
	spin_lock_init(&rbd_dev->lock_lists_lock);
	INIT_LIST_HEAD(&rbd_dev->acquiring_list);
	INIT_LIST_HEAD(&rbd_dev->running_list);
	init_completion(&rbd_dev->acquire_wait);
	init_completion(&rbd_dev->releasing_wait);

	spin_lock_init(&rbd_dev->object_map_lock);

	/* Embedded device: bus, type and parent are set before initialising. */
	rbd_dev->dev.bus = &rbd_bus_type;
	rbd_dev->dev.type = &rbd_device_type;
	rbd_dev->dev.parent = &rbd_root_dev;
	device_initialize(&rbd_dev->dev);

	rbd_dev->rbd_client = rbdc;
	rbd_dev->spec = spec;

	return rbd_dev;
}
5581 
5582 /*
5583  * Create a mapping rbd_dev.
5584  */
5585 static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
5586 					 struct rbd_spec *spec,
5587 					 struct rbd_options *opts)
5588 {
5589 	struct rbd_device *rbd_dev;
5590 
5591 	rbd_dev = __rbd_dev_create(rbdc, spec);
5592 	if (!rbd_dev)
5593 		return NULL;
5594 
5595 	rbd_dev->opts = opts;
5596 
5597 	/* get an id and fill in device name */
5598 	rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
5599 					 minor_to_rbd_dev_id(1 << MINORBITS),
5600 					 GFP_KERNEL);
5601 	if (rbd_dev->dev_id < 0)
5602 		goto fail_rbd_dev;
5603 
5604 	sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
5605 	rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
5606 						   rbd_dev->name);
5607 	if (!rbd_dev->task_wq)
5608 		goto fail_dev_id;
5609 
5610 	/* we have a ref from do_rbd_add() */
5611 	__module_get(THIS_MODULE);
5612 
5613 	dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
5614 	return rbd_dev;
5615 
5616 fail_dev_id:
5617 	ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
5618 fail_rbd_dev:
5619 	rbd_dev_free(rbd_dev);
5620 	return NULL;
5621 }
5622 
5623 static void rbd_dev_destroy(struct rbd_device *rbd_dev)
5624 {
5625 	if (rbd_dev)
5626 		put_device(&rbd_dev->dev);
5627 }
5628 
5629 /*
5630  * Get the size and object order for an image snapshot, or if
5631  * snap_id is CEPH_NOSNAP, gets this information for the base
5632  * image.
5633  */
5634 static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
5635 				u8 *order, u64 *snap_size)
5636 {
5637 	__le64 snapid = cpu_to_le64(snap_id);
5638 	int ret;
5639 	struct {
5640 		u8 order;
5641 		__le64 size;
5642 	} __attribute__ ((packed)) size_buf = { 0 };
5643 
5644 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5645 				  &rbd_dev->header_oloc, "get_size",
5646 				  &snapid, sizeof(snapid),
5647 				  &size_buf, sizeof(size_buf));
5648 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5649 	if (ret < 0)
5650 		return ret;
5651 	if (ret < sizeof (size_buf))
5652 		return -ERANGE;
5653 
5654 	if (order) {
5655 		*order = size_buf.order;
5656 		dout("  order %u", (unsigned int)*order);
5657 	}
5658 	*snap_size = le64_to_cpu(size_buf.size);
5659 
5660 	dout("  snap_id 0x%016llx snap_size = %llu\n",
5661 		(unsigned long long)snap_id,
5662 		(unsigned long long)*snap_size);
5663 
5664 	return 0;
5665 }
5666 
5667 static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
5668 {
5669 	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
5670 					&rbd_dev->header.obj_order,
5671 					&rbd_dev->header.image_size);
5672 }
5673 
5674 static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
5675 {
5676 	size_t size;
5677 	void *reply_buf;
5678 	int ret;
5679 	void *p;
5680 
5681 	/* Response will be an encoded string, which includes a length */
5682 	size = sizeof(__le32) + RBD_OBJ_PREFIX_LEN_MAX;
5683 	reply_buf = kzalloc(size, GFP_KERNEL);
5684 	if (!reply_buf)
5685 		return -ENOMEM;
5686 
5687 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5688 				  &rbd_dev->header_oloc, "get_object_prefix",
5689 				  NULL, 0, reply_buf, size);
5690 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5691 	if (ret < 0)
5692 		goto out;
5693 
5694 	p = reply_buf;
5695 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
5696 						p + ret, NULL, GFP_NOIO);
5697 	ret = 0;
5698 
5699 	if (IS_ERR(rbd_dev->header.object_prefix)) {
5700 		ret = PTR_ERR(rbd_dev->header.object_prefix);
5701 		rbd_dev->header.object_prefix = NULL;
5702 	} else {
5703 		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
5704 	}
5705 out:
5706 	kfree(reply_buf);
5707 
5708 	return ret;
5709 }
5710 
5711 static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
5712 		u64 *snap_features)
5713 {
5714 	__le64 snapid = cpu_to_le64(snap_id);
5715 	struct {
5716 		__le64 features;
5717 		__le64 incompat;
5718 	} __attribute__ ((packed)) features_buf = { 0 };
5719 	u64 unsup;
5720 	int ret;
5721 
5722 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5723 				  &rbd_dev->header_oloc, "get_features",
5724 				  &snapid, sizeof(snapid),
5725 				  &features_buf, sizeof(features_buf));
5726 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5727 	if (ret < 0)
5728 		return ret;
5729 	if (ret < sizeof (features_buf))
5730 		return -ERANGE;
5731 
5732 	unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
5733 	if (unsup) {
5734 		rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
5735 			 unsup);
5736 		return -ENXIO;
5737 	}
5738 
5739 	*snap_features = le64_to_cpu(features_buf.features);
5740 
5741 	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
5742 		(unsigned long long)snap_id,
5743 		(unsigned long long)*snap_features,
5744 		(unsigned long long)le64_to_cpu(features_buf.incompat));
5745 
5746 	return 0;
5747 }
5748 
5749 static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
5750 {
5751 	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
5752 						&rbd_dev->header.features);
5753 }
5754 
5755 /*
5756  * These are generic image flags, but since they are used only for
5757  * object map, store them in rbd_dev->object_map_flags.
5758  *
5759  * For the same reason, this function is called only on object map
5760  * (re)load and not on header refresh.
5761  */
5762 static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev)
5763 {
5764 	__le64 snapid = cpu_to_le64(rbd_dev->spec->snap_id);
5765 	__le64 flags;
5766 	int ret;
5767 
5768 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5769 				  &rbd_dev->header_oloc, "get_flags",
5770 				  &snapid, sizeof(snapid),
5771 				  &flags, sizeof(flags));
5772 	if (ret < 0)
5773 		return ret;
5774 	if (ret < sizeof(flags))
5775 		return -EBADMSG;
5776 
5777 	rbd_dev->object_map_flags = le64_to_cpu(flags);
5778 	return 0;
5779 }
5780 
/*
 * Decoded description of an image's parent, filled in by the
 * get_parent_info() helpers.  The two strings are heap-allocated by
 * the decoders; rbd_dev_v2_parent_info() either hands them over to a
 * spec or kfree()s them.
 */
struct parent_image_info {
	u64		pool_id;
	const char	*pool_ns;	/* stays NULL on the legacy path */
	const char	*image_id;
	u64		snap_id;

	bool		has_overlap;
	u64		overlap;	/* only decoded when has_overlap is set */
};
5790 
5791 /*
5792  * The caller is responsible for @pii.
5793  */
5794 static int decode_parent_image_spec(void **p, void *end,
5795 				    struct parent_image_info *pii)
5796 {
5797 	u8 struct_v;
5798 	u32 struct_len;
5799 	int ret;
5800 
5801 	ret = ceph_start_decoding(p, end, 1, "ParentImageSpec",
5802 				  &struct_v, &struct_len);
5803 	if (ret)
5804 		return ret;
5805 
5806 	ceph_decode_64_safe(p, end, pii->pool_id, e_inval);
5807 	pii->pool_ns = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
5808 	if (IS_ERR(pii->pool_ns)) {
5809 		ret = PTR_ERR(pii->pool_ns);
5810 		pii->pool_ns = NULL;
5811 		return ret;
5812 	}
5813 	pii->image_id = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
5814 	if (IS_ERR(pii->image_id)) {
5815 		ret = PTR_ERR(pii->image_id);
5816 		pii->image_id = NULL;
5817 		return ret;
5818 	}
5819 	ceph_decode_64_safe(p, end, pii->snap_id, e_inval);
5820 	return 0;
5821 
5822 e_inval:
5823 	return -EINVAL;
5824 }
5825 
5826 static int __get_parent_info(struct rbd_device *rbd_dev,
5827 			     struct page *req_page,
5828 			     struct page *reply_page,
5829 			     struct parent_image_info *pii)
5830 {
5831 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
5832 	size_t reply_len = PAGE_SIZE;
5833 	void *p, *end;
5834 	int ret;
5835 
5836 	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5837 			     "rbd", "parent_get", CEPH_OSD_FLAG_READ,
5838 			     req_page, sizeof(u64), &reply_page, &reply_len);
5839 	if (ret)
5840 		return ret == -EOPNOTSUPP ? 1 : ret;
5841 
5842 	p = page_address(reply_page);
5843 	end = p + reply_len;
5844 	ret = decode_parent_image_spec(&p, end, pii);
5845 	if (ret)
5846 		return ret;
5847 
5848 	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5849 			     "rbd", "parent_overlap_get", CEPH_OSD_FLAG_READ,
5850 			     req_page, sizeof(u64), &reply_page, &reply_len);
5851 	if (ret)
5852 		return ret;
5853 
5854 	p = page_address(reply_page);
5855 	end = p + reply_len;
5856 	ceph_decode_8_safe(&p, end, pii->has_overlap, e_inval);
5857 	if (pii->has_overlap)
5858 		ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
5859 
5860 	return 0;
5861 
5862 e_inval:
5863 	return -EINVAL;
5864 }
5865 
5866 /*
5867  * The caller is responsible for @pii.
5868  */
5869 static int __get_parent_info_legacy(struct rbd_device *rbd_dev,
5870 				    struct page *req_page,
5871 				    struct page *reply_page,
5872 				    struct parent_image_info *pii)
5873 {
5874 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
5875 	size_t reply_len = PAGE_SIZE;
5876 	void *p, *end;
5877 	int ret;
5878 
5879 	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5880 			     "rbd", "get_parent", CEPH_OSD_FLAG_READ,
5881 			     req_page, sizeof(u64), &reply_page, &reply_len);
5882 	if (ret)
5883 		return ret;
5884 
5885 	p = page_address(reply_page);
5886 	end = p + reply_len;
5887 	ceph_decode_64_safe(&p, end, pii->pool_id, e_inval);
5888 	pii->image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
5889 	if (IS_ERR(pii->image_id)) {
5890 		ret = PTR_ERR(pii->image_id);
5891 		pii->image_id = NULL;
5892 		return ret;
5893 	}
5894 	ceph_decode_64_safe(&p, end, pii->snap_id, e_inval);
5895 	pii->has_overlap = true;
5896 	ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
5897 
5898 	return 0;
5899 
5900 e_inval:
5901 	return -EINVAL;
5902 }
5903 
5904 static int get_parent_info(struct rbd_device *rbd_dev,
5905 			   struct parent_image_info *pii)
5906 {
5907 	struct page *req_page, *reply_page;
5908 	void *p;
5909 	int ret;
5910 
5911 	req_page = alloc_page(GFP_KERNEL);
5912 	if (!req_page)
5913 		return -ENOMEM;
5914 
5915 	reply_page = alloc_page(GFP_KERNEL);
5916 	if (!reply_page) {
5917 		__free_page(req_page);
5918 		return -ENOMEM;
5919 	}
5920 
5921 	p = page_address(req_page);
5922 	ceph_encode_64(&p, rbd_dev->spec->snap_id);
5923 	ret = __get_parent_info(rbd_dev, req_page, reply_page, pii);
5924 	if (ret > 0)
5925 		ret = __get_parent_info_legacy(rbd_dev, req_page, reply_page,
5926 					       pii);
5927 
5928 	__free_page(req_page);
5929 	__free_page(reply_page);
5930 	return ret;
5931 }
5932 
/*
 * Fetch the parent description and update rbd_dev->parent_spec and
 * rbd_dev->parent_overlap accordingly.
 *
 * A spec is allocated up front; it is handed over to rbd_dev only when
 * no parent spec has been recorded yet, otherwise it is put again on
 * the way out.  Strings in @pii that were not moved into the spec are
 * freed at the end.
 *
 * Returns 0 on success (including "no parent"), -ENOMEM, -EIO when the
 * parent pool id does not fit in 32 bits, or the error from
 * get_parent_info().
 */
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
{
	struct rbd_spec *parent_spec;
	struct parent_image_info pii = { 0 };
	int ret;

	parent_spec = rbd_spec_alloc();
	if (!parent_spec)
		return -ENOMEM;

	ret = get_parent_info(rbd_dev, &pii);
	if (ret)
		goto out_err;

	dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
	     __func__, pii.pool_id, pii.pool_ns, pii.image_id, pii.snap_id,
	     pii.has_overlap, pii.overlap);

	if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap) {
		/*
		 * Either the parent never existed, or we have
		 * record of it but the image got flattened so it no
		 * longer has a parent.  When the parent of a
		 * layered image disappears we immediately set the
		 * overlap to 0.  The effect of this is that all new
		 * requests will be treated as if the image had no
		 * parent.
		 *
		 * If !pii.has_overlap, the parent image spec is not
		 * applicable.  It's there to avoid duplication in each
		 * snapshot record.
		 */
		if (rbd_dev->parent_overlap) {
			rbd_dev->parent_overlap = 0;
			rbd_dev_parent_put(rbd_dev);
			pr_info("%s: clone image has been flattened\n",
				rbd_dev->disk->disk_name);
		}

		goto out;	/* No parent?  No problem. */
	}

	/* The ceph file layout needs to fit pool id in 32 bits */

	ret = -EIO;
	if (pii.pool_id > (u64)U32_MAX) {
		rbd_warn(NULL, "parent pool id too large (%llu > %u)",
			(unsigned long long)pii.pool_id, U32_MAX);
		goto out_err;
	}

	/*
	 * The parent won't change (except when the clone is
	 * flattened, which was already handled above).  So we only
	 * need to record the parent spec if we have not already done
	 * so.  An empty pool namespace is not recorded: the spec's
	 * pool_ns stays NULL in that case.
	 */
	if (!rbd_dev->parent_spec) {
		parent_spec->pool_id = pii.pool_id;
		if (pii.pool_ns && *pii.pool_ns) {
			parent_spec->pool_ns = pii.pool_ns;
			pii.pool_ns = NULL;
		}
		parent_spec->image_id = pii.image_id;
		pii.image_id = NULL;
		parent_spec->snap_id = pii.snap_id;

		rbd_dev->parent_spec = parent_spec;
		parent_spec = NULL;	/* rbd_dev now owns this */
	}

	/*
	 * We always update the parent overlap.  If it's zero we issue
	 * a warning, as we will proceed as if there was no parent.
	 * A still-unconsumed parent_spec means a spec was already
	 * recorded earlier, i.e. this is a refresh.
	 */
	if (!pii.overlap) {
		if (parent_spec) {
			/* refresh, careful to warn just once */
			if (rbd_dev->parent_overlap)
				rbd_warn(rbd_dev,
				    "clone now standalone (overlap became 0)");
		} else {
			/* initial probe */
			rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
		}
	}
	rbd_dev->parent_overlap = pii.overlap;

out:
	/* Success: also clears the -EIO preset above. */
	ret = 0;
out_err:
	/* Whatever was moved into the spec has been NULLed out of pii. */
	kfree(pii.pool_ns);
	kfree(pii.image_id);
	rbd_spec_put(parent_spec);
	return ret;
}
6028 
6029 static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
6030 {
6031 	struct {
6032 		__le64 stripe_unit;
6033 		__le64 stripe_count;
6034 	} __attribute__ ((packed)) striping_info_buf = { 0 };
6035 	size_t size = sizeof (striping_info_buf);
6036 	void *p;
6037 	int ret;
6038 
6039 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
6040 				&rbd_dev->header_oloc, "get_stripe_unit_count",
6041 				NULL, 0, &striping_info_buf, size);
6042 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
6043 	if (ret < 0)
6044 		return ret;
6045 	if (ret < size)
6046 		return -ERANGE;
6047 
6048 	p = &striping_info_buf;
6049 	rbd_dev->header.stripe_unit = ceph_decode_64(&p);
6050 	rbd_dev->header.stripe_count = ceph_decode_64(&p);
6051 	return 0;
6052 }
6053 
6054 static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
6055 {
6056 	__le64 data_pool_id;
6057 	int ret;
6058 
6059 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
6060 				  &rbd_dev->header_oloc, "get_data_pool",
6061 				  NULL, 0, &data_pool_id, sizeof(data_pool_id));
6062 	if (ret < 0)
6063 		return ret;
6064 	if (ret < sizeof(data_pool_id))
6065 		return -EBADMSG;
6066 
6067 	rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
6068 	WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
6069 	return 0;
6070 }
6071 
6072 static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
6073 {
6074 	CEPH_DEFINE_OID_ONSTACK(oid);
6075 	size_t image_id_size;
6076 	char *image_id;
6077 	void *p;
6078 	void *end;
6079 	size_t size;
6080 	void *reply_buf = NULL;
6081 	size_t len = 0;
6082 	char *image_name = NULL;
6083 	int ret;
6084 
6085 	rbd_assert(!rbd_dev->spec->image_name);
6086 
6087 	len = strlen(rbd_dev->spec->image_id);
6088 	image_id_size = sizeof (__le32) + len;
6089 	image_id = kmalloc(image_id_size, GFP_KERNEL);
6090 	if (!image_id)
6091 		return NULL;
6092 
6093 	p = image_id;
6094 	end = image_id + image_id_size;
6095 	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
6096 
6097 	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
6098 	reply_buf = kmalloc(size, GFP_KERNEL);
6099 	if (!reply_buf)
6100 		goto out;
6101 
6102 	ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
6103 	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
6104 				  "dir_get_name", image_id, image_id_size,
6105 				  reply_buf, size);
6106 	if (ret < 0)
6107 		goto out;
6108 	p = reply_buf;
6109 	end = reply_buf + ret;
6110 
6111 	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
6112 	if (IS_ERR(image_name))
6113 		image_name = NULL;
6114 	else
6115 		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
6116 out:
6117 	kfree(reply_buf);
6118 	kfree(image_id);
6119 
6120 	return image_name;
6121 }
6122 
6123 static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
6124 {
6125 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
6126 	const char *snap_name;
6127 	u32 which = 0;
6128 
6129 	/* Skip over names until we find the one we are looking for */
6130 
6131 	snap_name = rbd_dev->header.snap_names;
6132 	while (which < snapc->num_snaps) {
6133 		if (!strcmp(name, snap_name))
6134 			return snapc->snaps[which];
6135 		snap_name += strlen(snap_name) + 1;
6136 		which++;
6137 	}
6138 	return CEPH_NOSNAP;
6139 }
6140 
6141 static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
6142 {
6143 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
6144 	u32 which;
6145 	bool found = false;
6146 	u64 snap_id;
6147 
6148 	for (which = 0; !found && which < snapc->num_snaps; which++) {
6149 		const char *snap_name;
6150 
6151 		snap_id = snapc->snaps[which];
6152 		snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
6153 		if (IS_ERR(snap_name)) {
6154 			/* ignore no-longer existing snapshots */
6155 			if (PTR_ERR(snap_name) == -ENOENT)
6156 				continue;
6157 			else
6158 				break;
6159 		}
6160 		found = !strcmp(name, snap_name);
6161 		kfree(snap_name);
6162 	}
6163 	return found ? snap_id : CEPH_NOSNAP;
6164 }
6165 
6166 /*
6167  * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
6168  * no snapshot by that name is found, or if an error occurs.
6169  */
6170 static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
6171 {
6172 	if (rbd_dev->image_format == 1)
6173 		return rbd_v1_snap_id_by_name(rbd_dev, name);
6174 
6175 	return rbd_v2_snap_id_by_name(rbd_dev, name);
6176 }
6177 
6178 /*
6179  * An image being mapped will have everything but the snap id.
6180  */
6181 static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
6182 {
6183 	struct rbd_spec *spec = rbd_dev->spec;
6184 
6185 	rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
6186 	rbd_assert(spec->image_id && spec->image_name);
6187 	rbd_assert(spec->snap_name);
6188 
6189 	if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
6190 		u64 snap_id;
6191 
6192 		snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
6193 		if (snap_id == CEPH_NOSNAP)
6194 			return -ENOENT;
6195 
6196 		spec->snap_id = snap_id;
6197 	} else {
6198 		spec->snap_id = CEPH_NOSNAP;
6199 	}
6200 
6201 	return 0;
6202 }
6203 
6204 /*
6205  * A parent image will have all ids but none of the names.
6206  *
6207  * All names in an rbd spec are dynamically allocated.  It's OK if we
6208  * can't figure out the name for an image id.
6209  */
6210 static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
6211 {
6212 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
6213 	struct rbd_spec *spec = rbd_dev->spec;
6214 	const char *pool_name;
6215 	const char *image_name;
6216 	const char *snap_name;
6217 	int ret;
6218 
6219 	rbd_assert(spec->pool_id != CEPH_NOPOOL);
6220 	rbd_assert(spec->image_id);
6221 	rbd_assert(spec->snap_id != CEPH_NOSNAP);
6222 
6223 	/* Get the pool name; we have to make our own copy of this */
6224 
6225 	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
6226 	if (!pool_name) {
6227 		rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
6228 		return -EIO;
6229 	}
6230 	pool_name = kstrdup(pool_name, GFP_KERNEL);
6231 	if (!pool_name)
6232 		return -ENOMEM;
6233 
6234 	/* Fetch the image name; tolerate failure here */
6235 
6236 	image_name = rbd_dev_image_name(rbd_dev);
6237 	if (!image_name)
6238 		rbd_warn(rbd_dev, "unable to get image name");
6239 
6240 	/* Fetch the snapshot name */
6241 
6242 	snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
6243 	if (IS_ERR(snap_name)) {
6244 		ret = PTR_ERR(snap_name);
6245 		goto out_err;
6246 	}
6247 
6248 	spec->pool_name = pool_name;
6249 	spec->image_name = image_name;
6250 	spec->snap_name = snap_name;
6251 
6252 	return 0;
6253 
6254 out_err:
6255 	kfree(image_name);
6256 	kfree(pool_name);
6257 	return ret;
6258 }
6259 
/*
 * Query the "get_snapcontext" class method, build a new snapshot
 * context from the reply (seq, snapshot count, then that many snapshot
 * ids) and install it in rbd_dev->header.snapc, dropping the reference
 * on the previous context.
 *
 * Returns 0 on success, -ENOMEM, -ERANGE when the reply is shorter
 * than its contents claim, -EINVAL when the snapshot count is too
 * large to size a context, or a negative error from the method call.
 */
static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	void *reply_buf;
	void *p;
	void *end;
	u64 seq;
	u32 snap_count;
	struct ceph_snap_context *snapc;
	u32 i;

	/*
	 * We'll need room for the seq value (maximum snapshot id),
	 * snapshot count, and array of that many snapshot ids.
	 * For now we have a fixed upper limit on the number we're
	 * prepared to receive.
	 */
	size = sizeof (__le64) + sizeof (__le32) +
			RBD_MAX_SNAP_COUNT * sizeof (__le64);
	reply_buf = kzalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_snapcontext",
				  NULL, 0, reply_buf, size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	p = reply_buf;
	end = reply_buf + ret;
	/* Preset the error the *_safe decoders report by jumping to out. */
	ret = -ERANGE;
	ceph_decode_64_safe(&p, end, seq, out);
	ceph_decode_32_safe(&p, end, snap_count, out);

	/*
	 * Make sure the reported number of snapshot ids wouldn't go
	 * beyond the end of our buffer.  But before checking that,
	 * make sure the computed size of the snapshot context we
	 * allocate is representable in a size_t.
	 */
	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
				 / sizeof (u64)) {
		ret = -EINVAL;
		goto out;
	}
	/* Still -ERANGE here if the ids do not fit in what was received. */
	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
		goto out;
	ret = 0;

	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
	if (!snapc) {
		ret = -ENOMEM;
		goto out;
	}
	snapc->seq = seq;
	/* Room for all ids was verified above, so decode unchecked. */
	for (i = 0; i < snap_count; i++)
		snapc->snaps[i] = ceph_decode_64(&p);

	/* Replace the previous context with the new one. */
	ceph_put_snap_context(rbd_dev->header.snapc);
	rbd_dev->header.snapc = snapc;

	dout("  snap context seq = %llu, snap_count = %u\n",
		(unsigned long long)seq, (unsigned int)snap_count);
out:
	kfree(reply_buf);

	return ret;
}
6331 
6332 static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
6333 					u64 snap_id)
6334 {
6335 	size_t size;
6336 	void *reply_buf;
6337 	__le64 snapid;
6338 	int ret;
6339 	void *p;
6340 	void *end;
6341 	char *snap_name;
6342 
6343 	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
6344 	reply_buf = kmalloc(size, GFP_KERNEL);
6345 	if (!reply_buf)
6346 		return ERR_PTR(-ENOMEM);
6347 
6348 	snapid = cpu_to_le64(snap_id);
6349 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
6350 				  &rbd_dev->header_oloc, "get_snapshot_name",
6351 				  &snapid, sizeof(snapid), reply_buf, size);
6352 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
6353 	if (ret < 0) {
6354 		snap_name = ERR_PTR(ret);
6355 		goto out;
6356 	}
6357 
6358 	p = reply_buf;
6359 	end = reply_buf + ret;
6360 	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
6361 	if (IS_ERR(snap_name))
6362 		goto out;
6363 
6364 	dout("  snap_id 0x%016llx snap_name = %s\n",
6365 		(unsigned long long)snap_id, snap_name);
6366 out:
6367 	kfree(reply_buf);
6368 
6369 	return snap_name;
6370 }
6371 
6372 static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
6373 {
6374 	bool first_time = rbd_dev->header.object_prefix == NULL;
6375 	int ret;
6376 
6377 	ret = rbd_dev_v2_image_size(rbd_dev);
6378 	if (ret)
6379 		return ret;
6380 
6381 	if (first_time) {
6382 		ret = rbd_dev_v2_header_onetime(rbd_dev);
6383 		if (ret)
6384 			return ret;
6385 	}
6386 
6387 	ret = rbd_dev_v2_snap_context(rbd_dev);
6388 	if (ret && first_time) {
6389 		kfree(rbd_dev->header.object_prefix);
6390 		rbd_dev->header.object_prefix = NULL;
6391 	}
6392 
6393 	return ret;
6394 }
6395 
6396 static int rbd_dev_header_info(struct rbd_device *rbd_dev)
6397 {
6398 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
6399 
6400 	if (rbd_dev->image_format == 1)
6401 		return rbd_dev_v1_header_info(rbd_dev);
6402 
6403 	return rbd_dev_v2_header_info(rbd_dev);
6404 }
6405 
/*
 * Advance *buf past any leading white space so that it points at the
 * first character of the next token, or at the terminating '\0' if
 * no token is left.  Returns the length of that token, i.e. the
 * number of consecutive non-white-space characters found there
 * (0 if there is none).  *buf must be a '\0'-terminated string.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * Exactly the characters for which isspace() is nonzero in
	 * the "C" and "POSIX" locales.
	 */
	static const char whitespace[] = " \f\n\r\t\v";
	const char *start = *buf;

	start += strspn(start, whitespace);	/* skip leading white space */
	*buf = start;

	return strcspn(start, whitespace);	/* length of the token */
}
6424 
6425 /*
6426  * Finds the next token in *buf, dynamically allocates a buffer big
6427  * enough to hold a copy of it, and copies the token into the new
6428  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
6429  * that a duplicate buffer is created even for a zero-length token.
6430  *
6431  * Returns a pointer to the newly-allocated duplicate, or a null
6432  * pointer if memory for the duplicate was not available.  If
6433  * the lenp argument is a non-null pointer, the length of the token
6434  * (not including the '\0') is returned in *lenp.
6435  *
6436  * If successful, the *buf pointer will be updated to point beyond
6437  * the end of the found token.
6438  *
6439  * Note: uses GFP_KERNEL for allocation.
6440  */
6441 static inline char *dup_token(const char **buf, size_t *lenp)
6442 {
6443 	char *dup;
6444 	size_t len;
6445 
6446 	len = next_token(buf);
6447 	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
6448 	if (!dup)
6449 		return NULL;
6450 	*(dup + len) = '\0';
6451 	*buf += len;
6452 
6453 	if (lenp)
6454 		*lenp = len;
6455 
6456 	return dup;
6457 }
6458 
/*
 * Parse the options provided for an "rbd add" (i.e., rbd image
 * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
 * and the data written is passed here via a NUL-terminated buffer.
 *
 * Returns 0 if successful or a negative error code otherwise:
 * -EINVAL if a required token is missing, -ENAMETOOLONG if the
 * snapshot name is longer than RBD_MAX_SNAP_NAME_LEN, -ENOMEM if an
 * allocation fails, or the error reported by ceph_parse_options().
 * On failure nothing is returned through the output parameters and
 * everything allocated here has been released.
 *
 * The information extracted from these options is recorded in
 * the other parameters which return dynamically-allocated
 * structures:
 *  ceph_opts
 *      The address of a pointer that will refer to a ceph options
 *      structure.  Caller must release the returned pointer using
 *      ceph_destroy_options() when it is no longer needed.
 *  opts
 *	Address of an rbd options pointer.  Fully initialized by
 *	this function; caller must release with kfree().
 *  rbd_spec
 *	Address of an rbd image specification pointer.  Its pool,
 *	image and snapshot names are filled in by this function
 *	based on the parsed tokens.
 *	Caller must release with rbd_spec_put().
 *
 * The options passed take this form:
 *  <mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
 * where:
 *  <mon_addrs>
 *      A comma-separated list of one or more monitor addresses.
 *      A monitor address is an ip address, optionally followed
 *      by a port number (separated by a colon).
 *        I.e.:  ip1[:port1][,ip2[:port2]...]
 *  <options>
 *      A comma-separated list of ceph and/or rbd options.
 *  <pool_name>
 *      The name of the rados pool containing the rbd image.
 *  <image_name>
 *      The name of the image in that pool to map.
 *  <snap_name>
 *      An optional snapshot name.  If provided, the mapping will
 *      present data from the image at the time that snapshot was
 *      created.  The image head (RBD_SNAP_HEAD_NAME) is used if no
 *      snapshot name is provided.  Snapshot mappings are always
 *      read-only.
 */
static int rbd_add_parse_args(const char *buf,
				struct ceph_options **ceph_opts,
				struct rbd_options **opts,
				struct rbd_spec **rbd_spec)
{
	size_t len;
	char *options;
	const char *mon_addrs;
	char *snap_name;
	size_t mon_addrs_size;
	struct parse_rbd_opts_ctx pctx = { 0 };
	struct ceph_options *copts;
	int ret;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len) {
		rbd_warn(NULL, "no monitor address(es) provided");
		return -EINVAL;
	}
	/* The monitor list is not copied; it is used in place within buf */
	mon_addrs = buf;
	mon_addrs_size = len + 1;
	buf += len;

	/* Default error for the "missing token" exits via out_err below */
	ret = -EINVAL;
	options = dup_token(&buf, NULL);
	if (!options)
		return -ENOMEM;
	if (!*options) {
		rbd_warn(NULL, "no options provided");
		goto out_err;
	}

	pctx.spec = rbd_spec_alloc();
	if (!pctx.spec)
		goto out_mem;

	pctx.spec->pool_name = dup_token(&buf, NULL);
	if (!pctx.spec->pool_name)
		goto out_mem;
	if (!*pctx.spec->pool_name) {
		rbd_warn(NULL, "no pool name provided");
		goto out_err;
	}

	pctx.spec->image_name = dup_token(&buf, NULL);
	if (!pctx.spec->image_name)
		goto out_mem;
	if (!*pctx.spec->image_name) {
		rbd_warn(NULL, "no image name provided");
		goto out_err;
	}

	/*
	 * Snapshot name is optional; default is to use "-"
	 * (indicating the head/no snapshot).
	 */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
		ret = -ENAMETOOLONG;
		goto out_err;
	}
	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
	if (!snap_name)
		goto out_mem;
	*(snap_name + len) = '\0';
	pctx.spec->snap_name = snap_name;

	/* Initialize all rbd options to the defaults */

	pctx.opts = kzalloc(sizeof(*pctx.opts), GFP_KERNEL);
	if (!pctx.opts)
		goto out_mem;

	pctx.opts->read_only = RBD_READ_ONLY_DEFAULT;
	pctx.opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
	pctx.opts->alloc_size = RBD_ALLOC_SIZE_DEFAULT;
	pctx.opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT;
	pctx.opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
	pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT;
	pctx.opts->trim = RBD_TRIM_DEFAULT;

	/*
	 * The monitor list is handed over as a [start, end) range;
	 * parse_rbd_opts_token is passed along as a callback together
	 * with pctx.
	 */
	copts = ceph_parse_options(options, mon_addrs,
				   mon_addrs + mon_addrs_size - 1,
				   parse_rbd_opts_token, &pctx);
	if (IS_ERR(copts)) {
		ret = PTR_ERR(copts);
		goto out_err;
	}
	kfree(options);

	*ceph_opts = copts;
	*opts = pctx.opts;
	*rbd_spec = pctx.spec;

	return 0;
out_mem:
	ret = -ENOMEM;
out_err:
	/* pctx starts out zeroed, so members not yet set are still NULL here */
	kfree(pctx.opts);
	rbd_spec_put(pctx.spec);
	kfree(options);

	return ret;
}
6609 
6610 static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
6611 {
6612 	down_write(&rbd_dev->lock_rwsem);
6613 	if (__rbd_is_lock_owner(rbd_dev))
6614 		__rbd_release_lock(rbd_dev);
6615 	up_write(&rbd_dev->lock_rwsem);
6616 }
6617 
6618 /*
6619  * If the wait is interrupted, an error is returned even if the lock
6620  * was successfully acquired.  rbd_dev_image_unlock() will release it
6621  * if needed.
6622  */
6623 static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
6624 {
6625 	long ret;
6626 
6627 	if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
6628 		if (!rbd_dev->opts->exclusive && !rbd_dev->opts->lock_on_read)
6629 			return 0;
6630 
6631 		rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
6632 		return -EINVAL;
6633 	}
6634 
6635 	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
6636 		return 0;
6637 
6638 	rbd_assert(!rbd_is_lock_owner(rbd_dev));
6639 	queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
6640 	ret = wait_for_completion_killable_timeout(&rbd_dev->acquire_wait,
6641 			    ceph_timeout_jiffies(rbd_dev->opts->lock_timeout));
6642 	if (ret > 0)
6643 		ret = rbd_dev->acquire_err;
6644 	else if (!ret)
6645 		ret = -ETIMEDOUT;
6646 
6647 	if (ret) {
6648 		rbd_warn(rbd_dev, "failed to acquire exclusive lock: %ld", ret);
6649 		return ret;
6650 	}
6651 
6652 	/*
6653 	 * The lock may have been released by now, unless automatic lock
6654 	 * transitions are disabled.
6655 	 */
6656 	rbd_assert(!rbd_dev->opts->exclusive || rbd_is_lock_owner(rbd_dev));
6657 	return 0;
6658 }
6659 
6660 /*
6661  * An rbd format 2 image has a unique identifier, distinct from the
6662  * name given to it by the user.  Internally, that identifier is
6663  * what's used to specify the names of objects related to the image.
6664  *
6665  * A special "rbd id" object is used to map an rbd image name to its
6666  * id.  If that object doesn't exist, then there is no v2 rbd image
6667  * with the supplied name.
6668  *
6669  * This function will record the given rbd_dev's image_id field if
6670  * it can be determined, and in that case will return 0.  If any
6671  * errors occur a negative errno will be returned and the rbd_dev's
6672  * image_id field will be unchanged (and should be NULL).
6673  */
6674 static int rbd_dev_image_id(struct rbd_device *rbd_dev)
6675 {
6676 	int ret;
6677 	size_t size;
6678 	CEPH_DEFINE_OID_ONSTACK(oid);
6679 	void *response;
6680 	char *image_id;
6681 
6682 	/*
6683 	 * When probing a parent image, the image id is already
6684 	 * known (and the image name likely is not).  There's no
6685 	 * need to fetch the image id again in this case.  We
6686 	 * do still need to set the image format though.
6687 	 */
6688 	if (rbd_dev->spec->image_id) {
6689 		rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
6690 
6691 		return 0;
6692 	}
6693 
6694 	/*
6695 	 * First, see if the format 2 image id file exists, and if
6696 	 * so, get the image's persistent id from it.
6697 	 */
6698 	ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
6699 			       rbd_dev->spec->image_name);
6700 	if (ret)
6701 		return ret;
6702 
6703 	dout("rbd id object name is %s\n", oid.name);
6704 
6705 	/* Response will be an encoded string, which includes a length */
6706 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
6707 	response = kzalloc(size, GFP_NOIO);
6708 	if (!response) {
6709 		ret = -ENOMEM;
6710 		goto out;
6711 	}
6712 
6713 	/* If it doesn't exist we'll assume it's a format 1 image */
6714 
6715 	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
6716 				  "get_id", NULL, 0,
6717 				  response, size);
6718 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
6719 	if (ret == -ENOENT) {
6720 		image_id = kstrdup("", GFP_KERNEL);
6721 		ret = image_id ? 0 : -ENOMEM;
6722 		if (!ret)
6723 			rbd_dev->image_format = 1;
6724 	} else if (ret >= 0) {
6725 		void *p = response;
6726 
6727 		image_id = ceph_extract_encoded_string(&p, p + ret,
6728 						NULL, GFP_NOIO);
6729 		ret = PTR_ERR_OR_ZERO(image_id);
6730 		if (!ret)
6731 			rbd_dev->image_format = 2;
6732 	}
6733 
6734 	if (!ret) {
6735 		rbd_dev->spec->image_id = image_id;
6736 		dout("image_id is %s\n", image_id);
6737 	}
6738 out:
6739 	kfree(response);
6740 	ceph_oid_destroy(&oid);
6741 	return ret;
6742 }
6743 
6744 /*
6745  * Undo whatever state changes are made by v1 or v2 header info
6746  * call.
6747  */
6748 static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
6749 {
6750 	struct rbd_image_header	*header;
6751 
6752 	rbd_dev_parent_put(rbd_dev);
6753 	rbd_object_map_free(rbd_dev);
6754 	rbd_dev_mapping_clear(rbd_dev);
6755 
6756 	/* Free dynamic fields from the header, then zero it out */
6757 
6758 	header = &rbd_dev->header;
6759 	ceph_put_snap_context(header->snapc);
6760 	kfree(header->snap_sizes);
6761 	kfree(header->snap_names);
6762 	kfree(header->object_prefix);
6763 	memset(header, 0, sizeof (*header));
6764 }
6765 
6766 static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
6767 {
6768 	int ret;
6769 
6770 	ret = rbd_dev_v2_object_prefix(rbd_dev);
6771 	if (ret)
6772 		goto out_err;
6773 
6774 	/*
6775 	 * Get the and check features for the image.  Currently the
6776 	 * features are assumed to never change.
6777 	 */
6778 	ret = rbd_dev_v2_features(rbd_dev);
6779 	if (ret)
6780 		goto out_err;
6781 
6782 	/* If the image supports fancy striping, get its parameters */
6783 
6784 	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
6785 		ret = rbd_dev_v2_striping_info(rbd_dev);
6786 		if (ret < 0)
6787 			goto out_err;
6788 	}
6789 
6790 	if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
6791 		ret = rbd_dev_v2_data_pool(rbd_dev);
6792 		if (ret)
6793 			goto out_err;
6794 	}
6795 
6796 	rbd_init_layout(rbd_dev);
6797 	return 0;
6798 
6799 out_err:
6800 	rbd_dev->header.features = 0;
6801 	kfree(rbd_dev->header.object_prefix);
6802 	rbd_dev->header.object_prefix = NULL;
6803 	return ret;
6804 }
6805 
6806 /*
6807  * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
6808  * rbd_dev_image_probe() recursion depth, which means it's also the
6809  * length of the already discovered part of the parent chain.
6810  */
6811 static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
6812 {
6813 	struct rbd_device *parent = NULL;
6814 	int ret;
6815 
6816 	if (!rbd_dev->parent_spec)
6817 		return 0;
6818 
6819 	if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
6820 		pr_info("parent chain is too long (%d)\n", depth);
6821 		ret = -EINVAL;
6822 		goto out_err;
6823 	}
6824 
6825 	parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
6826 	if (!parent) {
6827 		ret = -ENOMEM;
6828 		goto out_err;
6829 	}
6830 
6831 	/*
6832 	 * Images related by parent/child relationships always share
6833 	 * rbd_client and spec/parent_spec, so bump their refcounts.
6834 	 */
6835 	__rbd_get_client(rbd_dev->rbd_client);
6836 	rbd_spec_get(rbd_dev->parent_spec);
6837 
6838 	ret = rbd_dev_image_probe(parent, depth);
6839 	if (ret < 0)
6840 		goto out_err;
6841 
6842 	rbd_dev->parent = parent;
6843 	atomic_set(&rbd_dev->parent_ref, 1);
6844 	return 0;
6845 
6846 out_err:
6847 	rbd_dev_unparent(rbd_dev);
6848 	rbd_dev_destroy(parent);
6849 	return ret;
6850 }
6851 
6852 static void rbd_dev_device_release(struct rbd_device *rbd_dev)
6853 {
6854 	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
6855 	rbd_free_disk(rbd_dev);
6856 	if (!single_major)
6857 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
6858 }
6859 
6860 /*
6861  * rbd_dev->header_rwsem must be locked for write and will be unlocked
6862  * upon return.
6863  */
6864 static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
6865 {
6866 	int ret;
6867 
6868 	/* Record our major and minor device numbers. */
6869 
6870 	if (!single_major) {
6871 		ret = register_blkdev(0, rbd_dev->name);
6872 		if (ret < 0)
6873 			goto err_out_unlock;
6874 
6875 		rbd_dev->major = ret;
6876 		rbd_dev->minor = 0;
6877 	} else {
6878 		rbd_dev->major = rbd_major;
6879 		rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
6880 	}
6881 
6882 	/* Set up the blkdev mapping. */
6883 
6884 	ret = rbd_init_disk(rbd_dev);
6885 	if (ret)
6886 		goto err_out_blkdev;
6887 
6888 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
6889 	set_disk_ro(rbd_dev->disk, rbd_dev->opts->read_only);
6890 
6891 	ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
6892 	if (ret)
6893 		goto err_out_disk;
6894 
6895 	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
6896 	up_write(&rbd_dev->header_rwsem);
6897 	return 0;
6898 
6899 err_out_disk:
6900 	rbd_free_disk(rbd_dev);
6901 err_out_blkdev:
6902 	if (!single_major)
6903 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
6904 err_out_unlock:
6905 	up_write(&rbd_dev->header_rwsem);
6906 	return ret;
6907 }
6908 
6909 static int rbd_dev_header_name(struct rbd_device *rbd_dev)
6910 {
6911 	struct rbd_spec *spec = rbd_dev->spec;
6912 	int ret;
6913 
6914 	/* Record the header object name for this rbd image. */
6915 
6916 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
6917 	if (rbd_dev->image_format == 1)
6918 		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6919 				       spec->image_name, RBD_SUFFIX);
6920 	else
6921 		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6922 				       RBD_HEADER_PREFIX, spec->image_id);
6923 
6924 	return ret;
6925 }
6926 
6927 static void rbd_dev_image_release(struct rbd_device *rbd_dev)
6928 {
6929 	rbd_dev_unprobe(rbd_dev);
6930 	if (rbd_dev->opts)
6931 		rbd_unregister_watch(rbd_dev);
6932 	rbd_dev->image_format = 0;
6933 	kfree(rbd_dev->spec->image_id);
6934 	rbd_dev->spec->image_id = NULL;
6935 }
6936 
/*
 * Probe for the existence of the header object for the given rbd
 * device.  If this image is the one being mapped (i.e., not a
 * parent), initiate a watch on its header object before using that
 * object to get detailed information about the rbd image.
 *
 * @depth is 0 for the image being mapped and greater than 0 for its
 * ancestors (see rbd_dev_probe_parent()).
 *
 * Returns 0 on success.  On failure a negative errno is returned
 * after undoing the steps that had completed; in particular
 * rbd_dev->image_format is reset to 0 and spec->image_id is freed
 * and cleared.
 */
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
{
	int ret;

	/*
	 * Get the id from the image id object.  Unless there's an
	 * error, rbd_dev->spec->image_id will be filled in with
	 * a dynamically-allocated string, and rbd_dev->image_format
	 * will be set to either 1 or 2.
	 */
	ret = rbd_dev_image_id(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_header_name(rbd_dev);
	if (ret)
		goto err_out_format;

	/* Only the image being mapped is watched, not its ancestors */
	if (!depth) {
		ret = rbd_register_watch(rbd_dev);
		if (ret) {
			if (ret == -ENOENT)
				pr_info("image %s/%s%s%s does not exist\n",
					rbd_dev->spec->pool_name,
					rbd_dev->spec->pool_ns ?: "",
					rbd_dev->spec->pool_ns ? "/" : "",
					rbd_dev->spec->image_name);
			goto err_out_format;
		}
	}

	ret = rbd_dev_header_info(rbd_dev);
	if (ret)
		goto err_out_watch;

	/*
	 * If this image is the one being mapped, we have pool name and
	 * id, image name and id, and snap name - need to fill snap id.
	 * Otherwise this is a parent image, identified by pool, image
	 * and snap ids - need to fill in names for those ids.
	 */
	if (!depth)
		ret = rbd_spec_fill_snap_id(rbd_dev);
	else
		ret = rbd_spec_fill_names(rbd_dev);
	if (ret) {
		if (ret == -ENOENT)
			pr_info("snap %s/%s%s%s@%s does not exist\n",
				rbd_dev->spec->pool_name,
				rbd_dev->spec->pool_ns ?: "",
				rbd_dev->spec->pool_ns ? "/" : "",
				rbd_dev->spec->image_name,
				rbd_dev->spec->snap_name);
		goto err_out_probe;
	}

	ret = rbd_dev_mapping_set(rbd_dev);
	if (ret)
		goto err_out_probe;

	/* The object map is loaded here only when a snapshot is mapped */
	if (rbd_dev->spec->snap_id != CEPH_NOSNAP &&
	    (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)) {
		ret = rbd_object_map_load(rbd_dev);
		if (ret)
			goto err_out_probe;
	}

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret)
			goto err_out_probe;
	}

	/* Recurses back into this function for the parent, if there is one */
	ret = rbd_dev_probe_parent(rbd_dev, depth);
	if (ret)
		goto err_out_probe;

	dout("discovered format %u image, header name is %s\n",
		rbd_dev->image_format, rbd_dev->header_oid.name);
	return 0;

err_out_probe:
	rbd_dev_unprobe(rbd_dev);
err_out_watch:
	/* A watch was registered only for depth 0 (see above) */
	if (!depth)
		rbd_unregister_watch(rbd_dev);
err_out_format:
	rbd_dev->image_format = 0;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;
	return ret;
}
7035 
/*
 * Map an rbd image as described by the NUL-terminated request in
 * @buf (see rbd_add_parse_args() for the format): parse the request,
 * get a client, resolve the pool, probe the image, set up the block
 * device, acquire the exclusive lock if needed and finally announce
 * the device and its disk.
 *
 * A reference on this module is held for the duration of the call.
 *
 * Returns @count on success or a negative errno on failure, in which
 * case everything set up so far has been torn down again.
 */
static ssize_t do_rbd_add(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct rbd_spec *spec = NULL;
	struct rbd_client *rbdc;
	int rc;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* parse add command */
	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
	if (rc < 0)
		goto out;

	rbdc = rbd_get_client(ceph_opts);
	if (IS_ERR(rbdc)) {
		rc = PTR_ERR(rbdc);
		goto err_out_args;
	}

	/* pick the pool */
	rc = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, spec->pool_name);
	if (rc < 0) {
		if (rc == -ENOENT)
			pr_info("pool %s does not exist\n", spec->pool_name);
		goto err_out_client;
	}
	spec->pool_id = (u64)rc;

	rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
	if (!rbd_dev) {
		rc = -ENOMEM;
		goto err_out_client;
	}
	/*
	 * The locals are cleared so that the puts/kfree at the end of
	 * the error ladder do not touch what rbd_dev now owns.
	 */
	rbdc = NULL;		/* rbd_dev now owns this */
	spec = NULL;		/* rbd_dev now owns this */
	rbd_opts = NULL;	/* rbd_dev now owns this */

	/* Keep a copy of the original request string */
	rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
	if (!rbd_dev->config_info) {
		rc = -ENOMEM;
		goto err_out_rbd_dev;
	}

	/*
	 * header_rwsem is taken for write here and released either just
	 * below on probe failure or by rbd_dev_device_setup().
	 */
	down_write(&rbd_dev->header_rwsem);
	rc = rbd_dev_image_probe(rbd_dev, 0);
	if (rc < 0) {
		up_write(&rbd_dev->header_rwsem);
		goto err_out_rbd_dev;
	}

	/* If we are mapping a snapshot it must be marked read-only */
	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
		rbd_dev->opts->read_only = true;

	/* alloc_size is capped at the object size */
	if (rbd_dev->opts->alloc_size > rbd_dev->layout.object_size) {
		rbd_warn(rbd_dev, "alloc_size adjusted to %u",
			 rbd_dev->layout.object_size);
		rbd_dev->opts->alloc_size = rbd_dev->layout.object_size;
	}

	rc = rbd_dev_device_setup(rbd_dev);
	if (rc)
		goto err_out_image_probe;

	rc = rbd_add_acquire_lock(rbd_dev);
	if (rc)
		goto err_out_image_lock;

	/* Everything's ready.  Announce the disk to the world. */

	rc = device_add(&rbd_dev->dev);
	if (rc)
		goto err_out_image_lock;

	add_disk(rbd_dev->disk);
	/* see rbd_init_disk() */
	blk_put_queue(rbd_dev->disk->queue);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);

	pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
		rbd_dev->header.features);
	rc = count;
out:
	module_put(THIS_MODULE);
	return rc;

	/*
	 * Error unwinding, in reverse order of setup.  Every label ends
	 * up jumping back to "out" so the module reference is dropped.
	 */
err_out_image_lock:
	rbd_dev_image_unlock(rbd_dev);
	rbd_dev_device_release(rbd_dev);
err_out_image_probe:
	rbd_dev_image_release(rbd_dev);
err_out_rbd_dev:
	rbd_dev_destroy(rbd_dev);
err_out_client:
	rbd_put_client(rbdc);
err_out_args:
	rbd_spec_put(spec);
	kfree(rbd_opts);
	goto out;
}
7146 
7147 static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count)
7148 {
7149 	if (single_major)
7150 		return -EINVAL;
7151 
7152 	return do_rbd_add(bus, buf, count);
7153 }
7154 
/*
 * Store handler for mapping requests that passes the request on to
 * do_rbd_add() regardless of the single_major setting (compare
 * add_store(), which refuses when single_major is set).
 */
static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
				      size_t count)
{
	return do_rbd_add(bus, buf, count);
}
7160 
7161 static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
7162 {
7163 	while (rbd_dev->parent) {
7164 		struct rbd_device *first = rbd_dev;
7165 		struct rbd_device *second = first->parent;
7166 		struct rbd_device *third;
7167 
7168 		/*
7169 		 * Follow to the parent with no grandparent and
7170 		 * remove it.
7171 		 */
7172 		while (second && (third = second->parent)) {
7173 			first = second;
7174 			second = third;
7175 		}
7176 		rbd_assert(second);
7177 		rbd_dev_image_release(second);
7178 		rbd_dev_destroy(second);
7179 		first->parent = NULL;
7180 		first->parent_overlap = 0;
7181 
7182 		rbd_assert(first->parent_spec);
7183 		rbd_spec_put(first->parent_spec);
7184 		first->parent_spec = NULL;
7185 	}
7186 }
7187 
7188 static ssize_t do_rbd_remove(struct bus_type *bus,
7189 			     const char *buf,
7190 			     size_t count)
7191 {
7192 	struct rbd_device *rbd_dev = NULL;
7193 	struct list_head *tmp;
7194 	int dev_id;
7195 	char opt_buf[6];
7196 	bool force = false;
7197 	int ret;
7198 
7199 	dev_id = -1;
7200 	opt_buf[0] = '\0';
7201 	sscanf(buf, "%d %5s", &dev_id, opt_buf);
7202 	if (dev_id < 0) {
7203 		pr_err("dev_id out of range\n");
7204 		return -EINVAL;
7205 	}
7206 	if (opt_buf[0] != '\0') {
7207 		if (!strcmp(opt_buf, "force")) {
7208 			force = true;
7209 		} else {
7210 			pr_err("bad remove option at '%s'\n", opt_buf);
7211 			return -EINVAL;
7212 		}
7213 	}
7214 
7215 	ret = -ENOENT;
7216 	spin_lock(&rbd_dev_list_lock);
7217 	list_for_each(tmp, &rbd_dev_list) {
7218 		rbd_dev = list_entry(tmp, struct rbd_device, node);
7219 		if (rbd_dev->dev_id == dev_id) {
7220 			ret = 0;
7221 			break;
7222 		}
7223 	}
7224 	if (!ret) {
7225 		spin_lock_irq(&rbd_dev->lock);
7226 		if (rbd_dev->open_count && !force)
7227 			ret = -EBUSY;
7228 		else if (test_and_set_bit(RBD_DEV_FLAG_REMOVING,
7229 					  &rbd_dev->flags))
7230 			ret = -EINPROGRESS;
7231 		spin_unlock_irq(&rbd_dev->lock);
7232 	}
7233 	spin_unlock(&rbd_dev_list_lock);
7234 	if (ret)
7235 		return ret;
7236 
7237 	if (force) {
7238 		/*
7239 		 * Prevent new IO from being queued and wait for existing
7240 		 * IO to complete/fail.
7241 		 */
7242 		blk_mq_freeze_queue(rbd_dev->disk->queue);
7243 		blk_set_queue_dying(rbd_dev->disk->queue);
7244 	}
7245 
7246 	del_gendisk(rbd_dev->disk);
7247 	spin_lock(&rbd_dev_list_lock);
7248 	list_del_init(&rbd_dev->node);
7249 	spin_unlock(&rbd_dev_list_lock);
7250 	device_del(&rbd_dev->dev);
7251 
7252 	rbd_dev_image_unlock(rbd_dev);
7253 	rbd_dev_device_release(rbd_dev);
7254 	rbd_dev_image_release(rbd_dev);
7255 	rbd_dev_destroy(rbd_dev);
7256 	return count;
7257 }
7258 
7259 static ssize_t remove_store(struct bus_type *bus, const char *buf, size_t count)
7260 {
7261 	if (single_major)
7262 		return -EINVAL;
7263 
7264 	return do_rbd_remove(bus, buf, count);
7265 }
7266 
/*
 * Store handler for unmapping requests that passes the request on to
 * do_rbd_remove() regardless of the single_major setting (compare
 * remove_store(), which refuses when single_major is set).
 */
static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
					 size_t count)
{
	return do_rbd_remove(bus, buf, count);
}
7272 
7273 /*
7274  * create control files in sysfs
7275  * /sys/bus/rbd/...
7276  */
7277 static int __init rbd_sysfs_init(void)
7278 {
7279 	int ret;
7280 
7281 	ret = device_register(&rbd_root_dev);
7282 	if (ret < 0)
7283 		return ret;
7284 
7285 	ret = bus_register(&rbd_bus_type);
7286 	if (ret < 0)
7287 		device_unregister(&rbd_root_dev);
7288 
7289 	return ret;
7290 }
7291 
/*
 * Undo rbd_sysfs_init(): unregister the rbd bus type and the rbd root
 * device, in the reverse of the order in which they were registered.
 */
static void __exit rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
7297 
7298 static int __init rbd_slab_init(void)
7299 {
7300 	rbd_assert(!rbd_img_request_cache);
7301 	rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
7302 	if (!rbd_img_request_cache)
7303 		return -ENOMEM;
7304 
7305 	rbd_assert(!rbd_obj_request_cache);
7306 	rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
7307 	if (!rbd_obj_request_cache)
7308 		goto out_err;
7309 
7310 	return 0;
7311 
7312 out_err:
7313 	kmem_cache_destroy(rbd_img_request_cache);
7314 	rbd_img_request_cache = NULL;
7315 	return -ENOMEM;
7316 }
7317 
/*
 * Destroy the two slab caches created by rbd_slab_init() and reset
 * their pointers to NULL.  Both caches are asserted to exist.
 */
static void rbd_slab_exit(void)
{
	rbd_assert(rbd_obj_request_cache);
	kmem_cache_destroy(rbd_obj_request_cache);
	rbd_obj_request_cache = NULL;

	rbd_assert(rbd_img_request_cache);
	kmem_cache_destroy(rbd_img_request_cache);
	rbd_img_request_cache = NULL;
}
7328 
7329 static int __init rbd_init(void)
7330 {
7331 	int rc;
7332 
7333 	if (!libceph_compatible(NULL)) {
7334 		rbd_warn(NULL, "libceph incompatibility (quitting)");
7335 		return -EINVAL;
7336 	}
7337 
7338 	rc = rbd_slab_init();
7339 	if (rc)
7340 		return rc;
7341 
7342 	/*
7343 	 * The number of active work items is limited by the number of
7344 	 * rbd devices * queue depth, so leave @max_active at default.
7345 	 */
7346 	rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
7347 	if (!rbd_wq) {
7348 		rc = -ENOMEM;
7349 		goto err_out_slab;
7350 	}
7351 
7352 	if (single_major) {
7353 		rbd_major = register_blkdev(0, RBD_DRV_NAME);
7354 		if (rbd_major < 0) {
7355 			rc = rbd_major;
7356 			goto err_out_wq;
7357 		}
7358 	}
7359 
7360 	rc = rbd_sysfs_init();
7361 	if (rc)
7362 		goto err_out_blkdev;
7363 
7364 	if (single_major)
7365 		pr_info("loaded (major %d)\n", rbd_major);
7366 	else
7367 		pr_info("loaded\n");
7368 
7369 	return 0;
7370 
7371 err_out_blkdev:
7372 	if (single_major)
7373 		unregister_blkdev(rbd_major, RBD_DRV_NAME);
7374 err_out_wq:
7375 	destroy_workqueue(rbd_wq);
7376 err_out_slab:
7377 	rbd_slab_exit();
7378 	return rc;
7379 }
7380 
/*
 * Module exit: destroy the device id allocator, then undo rbd_init()
 * in reverse order - sysfs entries, the shared block major (only
 * registered when single_major is set), the workqueue and the slab
 * caches.
 */
static void __exit rbd_exit(void)
{
	ida_destroy(&rbd_dev_id_ida);
	rbd_sysfs_cleanup();
	if (single_major)
		unregister_blkdev(rbd_major, RBD_DRV_NAME);
	destroy_workqueue(rbd_wq);
	rbd_slab_exit();
}
7390 
7391 module_init(rbd_init);
7392 module_exit(rbd_exit);
7393 
7394 MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
7395 MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
7396 MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
7397 /* following authorship retained from original osdblk.c */
7398 MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
7399 
7400 MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
7401 MODULE_LICENSE("GPL");
7402