1e2a58ee5SAlex Elder
2602adf40SYehuda Sadeh /*
3602adf40SYehuda Sadeh rbd.c -- Export ceph rados objects as a Linux block device
4602adf40SYehuda Sadeh
5602adf40SYehuda Sadeh
6602adf40SYehuda Sadeh based on drivers/block/osdblk.c:
7602adf40SYehuda Sadeh
8602adf40SYehuda Sadeh Copyright 2009 Red Hat, Inc.
9602adf40SYehuda Sadeh
10602adf40SYehuda Sadeh This program is free software; you can redistribute it and/or modify
11602adf40SYehuda Sadeh it under the terms of the GNU General Public License as published by
12602adf40SYehuda Sadeh the Free Software Foundation.
13602adf40SYehuda Sadeh
14602adf40SYehuda Sadeh This program is distributed in the hope that it will be useful,
15602adf40SYehuda Sadeh but WITHOUT ANY WARRANTY; without even the implied warranty of
16602adf40SYehuda Sadeh MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17602adf40SYehuda Sadeh GNU General Public License for more details.
18602adf40SYehuda Sadeh
19602adf40SYehuda Sadeh You should have received a copy of the GNU General Public License
20602adf40SYehuda Sadeh along with this program; see the file COPYING. If not, write to
21602adf40SYehuda Sadeh the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
22602adf40SYehuda Sadeh
23602adf40SYehuda Sadeh
24602adf40SYehuda Sadeh
25dfc5606dSYehuda Sadeh For usage instructions, please refer to:
26602adf40SYehuda Sadeh
27dfc5606dSYehuda Sadeh Documentation/ABI/testing/sysfs-bus-rbd
28602adf40SYehuda Sadeh
29602adf40SYehuda Sadeh */
30602adf40SYehuda Sadeh
#include <linux/ceph/libceph.h>
#include <linux/ceph/osd_client.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/cls_lock_client.h>
#include <linux/ceph/striper.h>
#include <linux/ceph/decode.h>
#include <linux/fs_parser.h>
#include <linux/bsearch.h>

#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/blk-mq.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/workqueue.h>
#include <linux/sysfs.h>

#include "rbd_types.h"
51602adf40SYehuda Sadeh
#define RBD_DEBUG	/* selects the checking variant of rbd_assert() */
53aafb230eSAlex Elder
54593a9e7bSAlex Elder /*
55a2acd00eSAlex Elder * Increment the given counter and return its updated value.
56a2acd00eSAlex Elder * If the counter is already 0 it will not be incremented.
57a2acd00eSAlex Elder * If the counter is already at its maximum value returns
58a2acd00eSAlex Elder * -EINVAL without updating it.
59a2acd00eSAlex Elder */
atomic_inc_return_safe(atomic_t * v)60a2acd00eSAlex Elder static int atomic_inc_return_safe(atomic_t *v)
61a2acd00eSAlex Elder {
62a2acd00eSAlex Elder unsigned int counter;
63a2acd00eSAlex Elder
64bfc18e38SMark Rutland counter = (unsigned int)atomic_fetch_add_unless(v, 1, 0);
65a2acd00eSAlex Elder if (counter <= (unsigned int)INT_MAX)
66a2acd00eSAlex Elder return (int)counter;
67a2acd00eSAlex Elder
68a2acd00eSAlex Elder atomic_dec(v);
69a2acd00eSAlex Elder
70a2acd00eSAlex Elder return -EINVAL;
71a2acd00eSAlex Elder }
72a2acd00eSAlex Elder
73a2acd00eSAlex Elder /* Decrement the counter. Return the resulting value, or -EINVAL */
atomic_dec_return_safe(atomic_t * v)74a2acd00eSAlex Elder static int atomic_dec_return_safe(atomic_t *v)
75a2acd00eSAlex Elder {
76a2acd00eSAlex Elder int counter;
77a2acd00eSAlex Elder
78a2acd00eSAlex Elder counter = atomic_dec_return(v);
79a2acd00eSAlex Elder if (counter >= 0)
80a2acd00eSAlex Elder return counter;
81a2acd00eSAlex Elder
82a2acd00eSAlex Elder atomic_inc(v);
83a2acd00eSAlex Elder
84a2acd00eSAlex Elder return -EINVAL;
85a2acd00eSAlex Elder }
86a2acd00eSAlex Elder
#define RBD_DRV_NAME			"rbd"

#define RBD_MINORS_PER_MAJOR		256
#define RBD_SINGLE_MAJOR_PART_SHIFT	4

#define RBD_MAX_PARENT_CHAIN_LEN	16

/* A snapshot name must still fit in NAME_MAX once the prefix is added */
#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN \
		(NAME_MAX - (sizeof(RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT		510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME		"-"

#define BAD_SNAP_INDEX			U32_MAX	/* invalid index into snap array */

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX		(PAGE_SIZE - sizeof(__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX		64

#define RBD_OBJ_PREFIX_LEN_MAX		64

#define RBD_NOTIFY_TIMEOUT		5	/* seconds */
#define RBD_RETRY_DELAY			msecs_to_jiffies(1000)
11299d16943SIlya Dryomov
/* Feature bits (bit 6 is not defined here) */

#define RBD_FEATURE_LAYERING		(1ULL << 0)
#define RBD_FEATURE_STRIPINGV2		(1ULL << 1)
#define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL << 2)
#define RBD_FEATURE_OBJECT_MAP		(1ULL << 3)
#define RBD_FEATURE_FAST_DIFF		(1ULL << 4)
#define RBD_FEATURE_DEEP_FLATTEN	(1ULL << 5)
#define RBD_FEATURE_DATA_POOL		(1ULL << 7)
#define RBD_FEATURE_OPERATIONS		(1ULL << 8)

/* Union of every feature bit defined above */
#define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
				 RBD_FEATURE_STRIPINGV2 |	\
				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
				 RBD_FEATURE_OBJECT_MAP |	\
				 RBD_FEATURE_FAST_DIFF |	\
				 RBD_FEATURE_DEEP_FLATTEN |	\
				 RBD_FEATURE_DATA_POOL |	\
				 RBD_FEATURE_OPERATIONS)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)
136d889140cSAlex Elder
/*
 * RBD devices are named "rbd#": the "rbd" part is RBD_DRV_NAME (see
 * above) and # is a unique integer identifier.
 */
#define DEV_NAME_LEN		32
142602adf40SYehuda Sadeh
143602adf40SYehuda Sadeh /*
144602adf40SYehuda Sadeh * block device image metadata (in-memory version)
145602adf40SYehuda Sadeh */
146602adf40SYehuda Sadeh struct rbd_image_header {
147f35a4deeSAlex Elder /* These six fields never change for a given rbd image */
148849b4260SAlex Elder char *object_prefix;
149602adf40SYehuda Sadeh __u8 obj_order;
150f35a4deeSAlex Elder u64 stripe_unit;
151f35a4deeSAlex Elder u64 stripe_count;
1527e97332eSIlya Dryomov s64 data_pool_id;
153f35a4deeSAlex Elder u64 features; /* Might be changeable someday? */
154602adf40SYehuda Sadeh
155f84344f3SAlex Elder /* The remaining fields need to be updated occasionally */
156f84344f3SAlex Elder u64 image_size;
157f84344f3SAlex Elder struct ceph_snap_context *snapc;
158f35a4deeSAlex Elder char *snap_names; /* format 1 only */
159f35a4deeSAlex Elder u64 *snap_sizes; /* format 1 only */
16059c2be1eSYehuda Sadeh };
16159c2be1eSYehuda Sadeh
1620d7dbfceSAlex Elder /*
1630d7dbfceSAlex Elder * An rbd image specification.
1640d7dbfceSAlex Elder *
1650d7dbfceSAlex Elder * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
166c66c6e0cSAlex Elder * identify an image. Each rbd_dev structure includes a pointer to
167c66c6e0cSAlex Elder * an rbd_spec structure that encapsulates this identity.
168c66c6e0cSAlex Elder *
169c66c6e0cSAlex Elder * Each of the id's in an rbd_spec has an associated name. For a
170c66c6e0cSAlex Elder * user-mapped image, the names are supplied and the id's associated
171c66c6e0cSAlex Elder * with them are looked up. For a layered image, a parent image is
172c66c6e0cSAlex Elder * defined by the tuple, and the names are looked up.
173c66c6e0cSAlex Elder *
174c66c6e0cSAlex Elder * An rbd_dev structure contains a parent_spec pointer which is
175c66c6e0cSAlex Elder * non-null if the image it represents is a child in a layered
176c66c6e0cSAlex Elder * image. This pointer will refer to the rbd_spec structure used
177c66c6e0cSAlex Elder * by the parent rbd_dev for its own identity (i.e., the structure
178c66c6e0cSAlex Elder * is shared between the parent and child).
179c66c6e0cSAlex Elder *
180c66c6e0cSAlex Elder * Since these structures are populated once, during the discovery
181c66c6e0cSAlex Elder * phase of image construction, they are effectively immutable so
182c66c6e0cSAlex Elder * we make no effort to synchronize access to them.
183c66c6e0cSAlex Elder *
184c66c6e0cSAlex Elder * Note that code herein does not assume the image name is known (it
185c66c6e0cSAlex Elder * could be a null pointer).
1860d7dbfceSAlex Elder */
1870d7dbfceSAlex Elder struct rbd_spec {
1880d7dbfceSAlex Elder u64 pool_id;
189ecb4dc22SAlex Elder const char *pool_name;
190b26c047bSIlya Dryomov const char *pool_ns; /* NULL if default, never "" */
1910d7dbfceSAlex Elder
192ecb4dc22SAlex Elder const char *image_id;
193ecb4dc22SAlex Elder const char *image_name;
1940d7dbfceSAlex Elder
1950d7dbfceSAlex Elder u64 snap_id;
196ecb4dc22SAlex Elder const char *snap_name;
1970d7dbfceSAlex Elder
1980d7dbfceSAlex Elder struct kref kref;
1990d7dbfceSAlex Elder };
2000d7dbfceSAlex Elder
201602adf40SYehuda Sadeh /*
202f0f8cef5SAlex Elder * an instance of the client. multiple devices may share an rbd client.
203602adf40SYehuda Sadeh */
204602adf40SYehuda Sadeh struct rbd_client {
205602adf40SYehuda Sadeh struct ceph_client *client;
206602adf40SYehuda Sadeh struct kref kref;
207602adf40SYehuda Sadeh struct list_head node;
208602adf40SYehuda Sadeh };
209602adf40SYehuda Sadeh
/* Outstanding-completion counter plus the first error seen among them */
struct pending_result {
	int result;		/* first nonzero result */
	int num_pending;
};
2140192ce2eSIlya Dryomov
struct rbd_img_request;		/* forward declaration */
216bf0d5f50SAlex Elder
/* How the data of an object request is referenced (values start at 1) */
enum obj_request_type {
	OBJ_REQUEST_NODATA = 1,
	OBJ_REQUEST_BIO,	/* pointer into provided bio (list) */
	OBJ_REQUEST_BVECS,	/* pointer into provided bio_vec array */
	OBJ_REQUEST_OWN_BVECS,	/* private bio_vec array, doesn't own pages */
};
223bf0d5f50SAlex Elder
/* Operation carried out by a request (values start at 1) */
enum obj_operation_type {
	OBJ_OP_READ = 1,
	OBJ_OP_WRITE,
	OBJ_OP_DISCARD,
	OBJ_OP_ZEROOUT,
};
2306d2940c8SGuangliang Zhao
/* Bits for the flags field of struct rbd_obj_request */
#define RBD_OBJ_FLAG_DELETION			(1U << 0)
#define RBD_OBJ_FLAG_COPYUP_ENABLED		(1U << 1)
#define RBD_OBJ_FLAG_COPYUP_ZEROS		(1U << 2)
#define RBD_OBJ_FLAG_MAY_EXIST			(1U << 3)
#define RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT	(1U << 4)
2360ad5d953SIlya Dryomov
/* States of an object request that is a read (values start at 1) */
enum rbd_obj_read_state {
	RBD_OBJ_READ_START = 1,
	RBD_OBJ_READ_OBJECT,
	RBD_OBJ_READ_PARENT,
};
242a9b67e69SIlya Dryomov
/*
 * Writes go through the following state machine to deal with
 * layering:
 *
 *            . . . . . RBD_OBJ_WRITE_GUARD. . . . . . . . . . . . . .
 *            .                 |                                    .
 *            .                 v                                    .
 *            .    RBD_OBJ_WRITE_READ_FROM_PARENT. . .               .
 *            .                 |                    .               .
 *            .                 v                    v (deep-copyup  .
 *     (image .   RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC   .  not needed)  .
 * flattened) v                 |                    .               .
 *            .                 v                    .               .
 *            . . . .RBD_OBJ_WRITE_COPYUP_OPS. . . . .      (copyup  .
 *                              |                        not needed) v
 *                              v                                    .
 *                            done . . . . . . . . . . . . . . . . . .
 *                              ^
 *                              |
 *                     RBD_OBJ_WRITE_FLAT
 *
 * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether
 * assert_exists guard is needed or not (in some cases it's not needed
 * even if there is a parent).
 */
/*
 * States of an object request that is a write (values start at 1).
 *
 * NOTE(review): the state diagram in the comment preceding this enum
 * names states (RBD_OBJ_WRITE_GUARD, RBD_OBJ_WRITE_FLAT, ...) that are
 * not members of this enum; it appears to describe an earlier version
 * of the state machine -- confirm and refresh it.
 */
enum rbd_obj_write_state {
	RBD_OBJ_WRITE_START = 1,
	RBD_OBJ_WRITE_PRE_OBJECT_MAP,
	RBD_OBJ_WRITE_OBJECT,
	__RBD_OBJ_WRITE_COPYUP,
	RBD_OBJ_WRITE_COPYUP,
	RBD_OBJ_WRITE_POST_OBJECT_MAP,
};
276793333a3SIlya Dryomov
/* States of the copyup part of an object request (values start at 1) */
enum rbd_obj_copyup_state {
	RBD_OBJ_COPYUP_START = 1,
	RBD_OBJ_COPYUP_READ_PARENT,
	__RBD_OBJ_COPYUP_OBJECT_MAPS,
	RBD_OBJ_COPYUP_OBJECT_MAPS,
	__RBD_OBJ_COPYUP_WRITE_OBJECT,
	RBD_OBJ_COPYUP_WRITE_OBJECT,
};
285926f9b3fSAlex Elder
286bf0d5f50SAlex Elder struct rbd_obj_request {
28743df3d35SIlya Dryomov struct ceph_object_extent ex;
2880ad5d953SIlya Dryomov unsigned int flags; /* RBD_OBJ_FLAG_* */
289c5b5ef6cSAlex Elder union {
290a9b67e69SIlya Dryomov enum rbd_obj_read_state read_state; /* for reads */
2913da691bfSIlya Dryomov enum rbd_obj_write_state write_state; /* for writes */
2923da691bfSIlya Dryomov };
293bf0d5f50SAlex Elder
294bf0d5f50SAlex Elder struct rbd_img_request *img_request;
29586bd7998SIlya Dryomov struct ceph_file_extent *img_extents;
29686bd7998SIlya Dryomov u32 num_img_extents;
297bf0d5f50SAlex Elder
298788e2df3SAlex Elder union {
2995359a17dSIlya Dryomov struct ceph_bio_iter bio_pos;
300788e2df3SAlex Elder struct {
3017e07efb1SIlya Dryomov struct ceph_bvec_iter bvec_pos;
3027e07efb1SIlya Dryomov u32 bvec_count;
303afb97888SIlya Dryomov u32 bvec_idx;
304788e2df3SAlex Elder };
305788e2df3SAlex Elder };
306793333a3SIlya Dryomov
307793333a3SIlya Dryomov enum rbd_obj_copyup_state copyup_state;
3087e07efb1SIlya Dryomov struct bio_vec *copyup_bvecs;
3097e07efb1SIlya Dryomov u32 copyup_bvec_count;
310bf0d5f50SAlex Elder
311bcbab1dbSIlya Dryomov struct list_head osd_reqs; /* w/ r_private_item */
312bf0d5f50SAlex Elder
31385b5e6d1SIlya Dryomov struct mutex state_mutex;
314793333a3SIlya Dryomov struct pending_result pending;
315bf0d5f50SAlex Elder struct kref kref;
316bf0d5f50SAlex Elder };
317bf0d5f50SAlex Elder
/* Values stored in the flags field of struct rbd_img_request */
enum img_req_flags {
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
};
3220c425248SAlex Elder
/* States of an image request (values start at 1) */
enum rbd_img_state {
	RBD_IMG_START = 1,
	RBD_IMG_EXCLUSIVE_LOCK,
	__RBD_IMG_OBJECT_REQUESTS,
	RBD_IMG_OBJECT_REQUESTS,
};
3290192ce2eSIlya Dryomov
330bf0d5f50SAlex Elder struct rbd_img_request {
331bf0d5f50SAlex Elder struct rbd_device *rbd_dev;
3329bb0248dSIlya Dryomov enum obj_operation_type op_type;
333ecc633caSIlya Dryomov enum obj_request_type data_type;
3340c425248SAlex Elder unsigned long flags;
3350192ce2eSIlya Dryomov enum rbd_img_state state;
336bf0d5f50SAlex Elder union {
337bf0d5f50SAlex Elder u64 snap_id; /* for reads */
3389849e986SAlex Elder struct ceph_snap_context *snapc; /* for writes */
3399849e986SAlex Elder };
3409849e986SAlex Elder struct rbd_obj_request *obj_request; /* obj req initiator */
341bf0d5f50SAlex Elder
342e1fddc8fSIlya Dryomov struct list_head lock_item;
34343df3d35SIlya Dryomov struct list_head object_extents; /* obj_req.ex structs */
344bf0d5f50SAlex Elder
3450192ce2eSIlya Dryomov struct mutex state_mutex;
3460192ce2eSIlya Dryomov struct pending_result pending;
3470192ce2eSIlya Dryomov struct work_struct work;
3480192ce2eSIlya Dryomov int work_result;
349bf0d5f50SAlex Elder };
350bf0d5f50SAlex Elder
/* Walk the object requests linked on an image request's object_extents */
#define for_each_obj_request(ireq, oreq)				\
	list_for_each_entry(oreq, &(ireq)->object_extents, ex.oe_item)

/* Same walk, using n as the list_for_each_entry_safe() cursor */
#define for_each_obj_request_safe(ireq, oreq, n)			\
	list_for_each_entry_safe(oreq, n, &(ireq)->object_extents, ex.oe_item)
355bf0d5f50SAlex Elder
/* Values of rbd_device.watch_state */
enum rbd_watch_state {
	RBD_WATCH_STATE_UNREGISTERED,
	RBD_WATCH_STATE_REGISTERED,
	RBD_WATCH_STATE_ERROR,
};
36199d16943SIlya Dryomov
/* Values of rbd_device.lock_state */
enum rbd_lock_state {
	RBD_LOCK_STATE_UNLOCKED,
	RBD_LOCK_STATE_LOCKED,
	RBD_LOCK_STATE_RELEASING,
};
367ed95b21aSIlya Dryomov
368ed95b21aSIlya Dryomov /* WatchNotify::ClientId */
369ed95b21aSIlya Dryomov struct rbd_client_id {
370ed95b21aSIlya Dryomov u64 gid;
371ed95b21aSIlya Dryomov u64 handle;
372ed95b21aSIlya Dryomov };
373ed95b21aSIlya Dryomov
374f84344f3SAlex Elder struct rbd_mapping {
37599c1f08fSAlex Elder u64 size;
376f84344f3SAlex Elder };
377f84344f3SAlex Elder
378602adf40SYehuda Sadeh /*
379602adf40SYehuda Sadeh * a single device
380602adf40SYehuda Sadeh */
381602adf40SYehuda Sadeh struct rbd_device {
382de71a297SAlex Elder int dev_id; /* blkdev unique id */
383602adf40SYehuda Sadeh
384602adf40SYehuda Sadeh int major; /* blkdev assigned major */
385dd82fff1SIlya Dryomov int minor;
386602adf40SYehuda Sadeh struct gendisk *disk; /* blkdev's gendisk and rq */
387602adf40SYehuda Sadeh
388a30b71b9SAlex Elder u32 image_format; /* Either 1 or 2 */
389602adf40SYehuda Sadeh struct rbd_client *rbd_client;
390602adf40SYehuda Sadeh
391602adf40SYehuda Sadeh char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
392602adf40SYehuda Sadeh
393b82d167bSAlex Elder spinlock_t lock; /* queue, flags, open_count */
394602adf40SYehuda Sadeh
395602adf40SYehuda Sadeh struct rbd_image_header header;
396b82d167bSAlex Elder unsigned long flags; /* possibly lock protected */
3970d7dbfceSAlex Elder struct rbd_spec *spec;
398d147543dSIlya Dryomov struct rbd_options *opts;
3990d6d1e9cSMike Christie char *config_info; /* add{,_single_major} string */
400602adf40SYehuda Sadeh
401c41d13a3SIlya Dryomov struct ceph_object_id header_oid;
402922dab61SIlya Dryomov struct ceph_object_locator header_oloc;
403971f839aSAlex Elder
4041643dfa4SIlya Dryomov struct ceph_file_layout layout; /* used for all rbd requests */
4050903e875SAlex Elder
40699d16943SIlya Dryomov struct mutex watch_mutex;
40799d16943SIlya Dryomov enum rbd_watch_state watch_state;
408922dab61SIlya Dryomov struct ceph_osd_linger_request *watch_handle;
40999d16943SIlya Dryomov u64 watch_cookie;
41099d16943SIlya Dryomov struct delayed_work watch_dwork;
41159c2be1eSYehuda Sadeh
412ed95b21aSIlya Dryomov struct rw_semaphore lock_rwsem;
413ed95b21aSIlya Dryomov enum rbd_lock_state lock_state;
414cbbfb0ffSIlya Dryomov char lock_cookie[32];
415ed95b21aSIlya Dryomov struct rbd_client_id owner_cid;
416ed95b21aSIlya Dryomov struct work_struct acquired_lock_work;
417ed95b21aSIlya Dryomov struct work_struct released_lock_work;
418ed95b21aSIlya Dryomov struct delayed_work lock_dwork;
419ed95b21aSIlya Dryomov struct work_struct unlock_work;
420e1fddc8fSIlya Dryomov spinlock_t lock_lists_lock;
421637cd060SIlya Dryomov struct list_head acquiring_list;
422e1fddc8fSIlya Dryomov struct list_head running_list;
423637cd060SIlya Dryomov struct completion acquire_wait;
424637cd060SIlya Dryomov int acquire_err;
425e1fddc8fSIlya Dryomov struct completion releasing_wait;
426ed95b21aSIlya Dryomov
42722e8bd51SIlya Dryomov spinlock_t object_map_lock;
42822e8bd51SIlya Dryomov u8 *object_map;
42922e8bd51SIlya Dryomov u64 object_map_size; /* in objects */
43022e8bd51SIlya Dryomov u64 object_map_flags;
431602adf40SYehuda Sadeh
4321643dfa4SIlya Dryomov struct workqueue_struct *task_wq;
433602adf40SYehuda Sadeh
43486b00e0dSAlex Elder struct rbd_spec *parent_spec;
43586b00e0dSAlex Elder u64 parent_overlap;
436a2acd00eSAlex Elder atomic_t parent_ref;
4372f82ee54SAlex Elder struct rbd_device *parent;
43886b00e0dSAlex Elder
4397ad18afaSChristoph Hellwig /* Block layer tags. */
4407ad18afaSChristoph Hellwig struct blk_mq_tag_set tag_set;
4417ad18afaSChristoph Hellwig
442c666601aSJosh Durgin /* protects updating the header */
443c666601aSJosh Durgin struct rw_semaphore header_rwsem;
444f84344f3SAlex Elder
445f84344f3SAlex Elder struct rbd_mapping mapping;
446602adf40SYehuda Sadeh
447602adf40SYehuda Sadeh struct list_head node;
448dfc5606dSYehuda Sadeh
449dfc5606dSYehuda Sadeh /* sysfs related */
450dfc5606dSYehuda Sadeh struct device dev;
451b82d167bSAlex Elder unsigned long open_count; /* protected by lock */
452dfc5606dSYehuda Sadeh };
453dfc5606dSYehuda Sadeh
454b82d167bSAlex Elder /*
45587c0fdedSIlya Dryomov * Flag bits for rbd_dev->flags:
45687c0fdedSIlya Dryomov * - REMOVING (which is coupled with rbd_dev->open_count) is protected
45787c0fdedSIlya Dryomov * by rbd_dev->lock
458b82d167bSAlex Elder */
4596d292906SAlex Elder enum rbd_dev_flags {
460686238b7SIlya Dryomov RBD_DEV_FLAG_EXISTS, /* rbd_dev_device_setup() ran */
461b82d167bSAlex Elder RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */
46239258aa2SIlya Dryomov RBD_DEV_FLAG_READONLY, /* -o ro or snapshot */
4636d292906SAlex Elder };
4646d292906SAlex Elder
465cfbf6377SAlex Elder static DEFINE_MUTEX(client_mutex); /* Serialize client creation */
466e124a82fSAlex Elder
467602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list); /* devices */
468e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock);
469e124a82fSAlex Elder
470602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list); /* clients */
471432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock);
472602adf40SYehuda Sadeh
47378c2a44aSAlex Elder /* Slab caches for frequently-allocated structures */
47478c2a44aSAlex Elder
4751c2a9dfeSAlex Elder static struct kmem_cache *rbd_img_request_cache;
476868311b1SAlex Elder static struct kmem_cache *rbd_obj_request_cache;
4771c2a9dfeSAlex Elder
4789b60e70bSIlya Dryomov static int rbd_major;
479f8a22fc2SIlya Dryomov static DEFINE_IDA(rbd_dev_id_ida);
480f8a22fc2SIlya Dryomov
481f5ee37bdSIlya Dryomov static struct workqueue_struct *rbd_wq;
482f5ee37bdSIlya Dryomov
48389a59c1cSIlya Dryomov static struct ceph_snap_context rbd_empty_snapc = {
48489a59c1cSIlya Dryomov .nref = REFCOUNT_INIT(1),
48589a59c1cSIlya Dryomov };
48689a59c1cSIlya Dryomov
4879b60e70bSIlya Dryomov /*
4883cfa3b16SIlya Dryomov * single-major requires >= 0.75 version of userspace rbd utility.
4899b60e70bSIlya Dryomov */
4903cfa3b16SIlya Dryomov static bool single_major = true;
4915657a819SJoe Perches module_param(single_major, bool, 0444);
4923cfa3b16SIlya Dryomov MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");
4939b60e70bSIlya Dryomov
49475cff725SGreg Kroah-Hartman static ssize_t add_store(const struct bus_type *bus, const char *buf, size_t count);
49575cff725SGreg Kroah-Hartman static ssize_t remove_store(const struct bus_type *bus, const char *buf,
496f0f8cef5SAlex Elder size_t count);
49775cff725SGreg Kroah-Hartman static ssize_t add_single_major_store(const struct bus_type *bus, const char *buf,
498f0f8cef5SAlex Elder size_t count);
49975cff725SGreg Kroah-Hartman static ssize_t remove_single_major_store(const struct bus_type *bus, const char *buf,
5009b60e70bSIlya Dryomov size_t count);
5016d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);
502f0f8cef5SAlex Elder
rbd_dev_id_to_minor(int dev_id)5039b60e70bSIlya Dryomov static int rbd_dev_id_to_minor(int dev_id)
5049b60e70bSIlya Dryomov {
5057e513d43SIlya Dryomov return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
5069b60e70bSIlya Dryomov }
5079b60e70bSIlya Dryomov
minor_to_rbd_dev_id(int minor)5089b60e70bSIlya Dryomov static int minor_to_rbd_dev_id(int minor)
5099b60e70bSIlya Dryomov {
5107e513d43SIlya Dryomov return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
5119b60e70bSIlya Dryomov }
5129b60e70bSIlya Dryomov
rbd_is_ro(struct rbd_device * rbd_dev)51339258aa2SIlya Dryomov static bool rbd_is_ro(struct rbd_device *rbd_dev)
51439258aa2SIlya Dryomov {
51539258aa2SIlya Dryomov return test_bit(RBD_DEV_FLAG_READONLY, &rbd_dev->flags);
51639258aa2SIlya Dryomov }
51739258aa2SIlya Dryomov
rbd_is_snap(struct rbd_device * rbd_dev)518f3c0e459SIlya Dryomov static bool rbd_is_snap(struct rbd_device *rbd_dev)
519f3c0e459SIlya Dryomov {
520f3c0e459SIlya Dryomov return rbd_dev->spec->snap_id != CEPH_NOSNAP;
521f3c0e459SIlya Dryomov }
522f3c0e459SIlya Dryomov
__rbd_is_lock_owner(struct rbd_device * rbd_dev)523ed95b21aSIlya Dryomov static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
524ed95b21aSIlya Dryomov {
525637cd060SIlya Dryomov lockdep_assert_held(&rbd_dev->lock_rwsem);
526637cd060SIlya Dryomov
527ed95b21aSIlya Dryomov return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
528ed95b21aSIlya Dryomov rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
529ed95b21aSIlya Dryomov }
530ed95b21aSIlya Dryomov
rbd_is_lock_owner(struct rbd_device * rbd_dev)531ed95b21aSIlya Dryomov static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
532ed95b21aSIlya Dryomov {
533ed95b21aSIlya Dryomov bool is_lock_owner;
534ed95b21aSIlya Dryomov
535ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem);
536ed95b21aSIlya Dryomov is_lock_owner = __rbd_is_lock_owner(rbd_dev);
537ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem);
538ed95b21aSIlya Dryomov return is_lock_owner;
539ed95b21aSIlya Dryomov }
540ed95b21aSIlya Dryomov
supported_features_show(const struct bus_type * bus,char * buf)54175cff725SGreg Kroah-Hartman static ssize_t supported_features_show(const struct bus_type *bus, char *buf)
5428767b293SIlya Dryomov {
5438767b293SIlya Dryomov return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
5448767b293SIlya Dryomov }
5458767b293SIlya Dryomov
5467e9586baSGreg Kroah-Hartman static BUS_ATTR_WO(add);
5477e9586baSGreg Kroah-Hartman static BUS_ATTR_WO(remove);
5487e9586baSGreg Kroah-Hartman static BUS_ATTR_WO(add_single_major);
5497e9586baSGreg Kroah-Hartman static BUS_ATTR_WO(remove_single_major);
5507e9586baSGreg Kroah-Hartman static BUS_ATTR_RO(supported_features);
551b15a21ddSGreg Kroah-Hartman
552b15a21ddSGreg Kroah-Hartman static struct attribute *rbd_bus_attrs[] = {
553b15a21ddSGreg Kroah-Hartman &bus_attr_add.attr,
554b15a21ddSGreg Kroah-Hartman &bus_attr_remove.attr,
5559b60e70bSIlya Dryomov &bus_attr_add_single_major.attr,
5569b60e70bSIlya Dryomov &bus_attr_remove_single_major.attr,
5578767b293SIlya Dryomov &bus_attr_supported_features.attr,
558b15a21ddSGreg Kroah-Hartman NULL,
559f0f8cef5SAlex Elder };
56092c76dc0SIlya Dryomov
rbd_bus_is_visible(struct kobject * kobj,struct attribute * attr,int index)56192c76dc0SIlya Dryomov static umode_t rbd_bus_is_visible(struct kobject *kobj,
56292c76dc0SIlya Dryomov struct attribute *attr, int index)
56392c76dc0SIlya Dryomov {
5649b60e70bSIlya Dryomov if (!single_major &&
5659b60e70bSIlya Dryomov (attr == &bus_attr_add_single_major.attr ||
5669b60e70bSIlya Dryomov attr == &bus_attr_remove_single_major.attr))
5679b60e70bSIlya Dryomov return 0;
5689b60e70bSIlya Dryomov
56992c76dc0SIlya Dryomov return attr->mode;
57092c76dc0SIlya Dryomov }
57192c76dc0SIlya Dryomov
57292c76dc0SIlya Dryomov static const struct attribute_group rbd_bus_group = {
57392c76dc0SIlya Dryomov .attrs = rbd_bus_attrs,
57492c76dc0SIlya Dryomov .is_visible = rbd_bus_is_visible,
57592c76dc0SIlya Dryomov };
57692c76dc0SIlya Dryomov __ATTRIBUTE_GROUPS(rbd_bus);
577f0f8cef5SAlex Elder
578f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = {
579f0f8cef5SAlex Elder .name = "rbd",
580b15a21ddSGreg Kroah-Hartman .bus_groups = rbd_bus_groups,
581f0f8cef5SAlex Elder };
582f0f8cef5SAlex Elder
/* Release callback for rbd_root_dev; intentionally does nothing */
static void rbd_root_dev_release(struct device *dev)
{
	/* rbd_root_dev is a static object, so there is nothing to free */
}
586f0f8cef5SAlex Elder
587f0f8cef5SAlex Elder static struct device rbd_root_dev = {
588f0f8cef5SAlex Elder .init_name = "rbd",
589f0f8cef5SAlex Elder .release = rbd_root_dev_release,
590f0f8cef5SAlex Elder };
591f0f8cef5SAlex Elder
59206ecc6cbSAlex Elder static __printf(2, 3)
rbd_warn(struct rbd_device * rbd_dev,const char * fmt,...)59306ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
59406ecc6cbSAlex Elder {
59506ecc6cbSAlex Elder struct va_format vaf;
59606ecc6cbSAlex Elder va_list args;
59706ecc6cbSAlex Elder
59806ecc6cbSAlex Elder va_start(args, fmt);
59906ecc6cbSAlex Elder vaf.fmt = fmt;
60006ecc6cbSAlex Elder vaf.va = &args;
60106ecc6cbSAlex Elder
60206ecc6cbSAlex Elder if (!rbd_dev)
60306ecc6cbSAlex Elder printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
60406ecc6cbSAlex Elder else if (rbd_dev->disk)
60506ecc6cbSAlex Elder printk(KERN_WARNING "%s: %s: %pV\n",
60606ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
60706ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_name)
60806ecc6cbSAlex Elder printk(KERN_WARNING "%s: image %s: %pV\n",
60906ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
61006ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_id)
61106ecc6cbSAlex Elder printk(KERN_WARNING "%s: id %s: %pV\n",
61206ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
61306ecc6cbSAlex Elder else /* punt */
61406ecc6cbSAlex Elder printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
61506ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev, &vaf);
61606ecc6cbSAlex Elder va_end(args);
61706ecc6cbSAlex Elder }
61806ecc6cbSAlex Elder
/*
 * rbd_assert(expr): with RBD_DEBUG defined, log the failed expression
 * together with the function name and line number, then BUG(), whenever
 * expr evaluates false.  Without RBD_DEBUG it expands to a no-op and
 * expr is not evaluated.
 *
 * The checking variant is wrapped in do { } while (0) so that it is a
 * single statement: it needs a trailing semicolon exactly like the
 * no-op variant and cannot capture a following "else".
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
631dfc5606dSYehuda Sadeh
63205a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
6338b3e1a56SAlex Elder
634cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev);
635510a7330SIlya Dryomov static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev,
636510a7330SIlya Dryomov struct rbd_image_header *header);
63754cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
63854cac61fSAlex Elder u64 snap_id);
6392ad3d716SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
6402ad3d716SAlex Elder u8 *order, u64 *snap_size);
64122e8bd51SIlya Dryomov static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev);
64259c2be1eSYehuda Sadeh
64354ab3b24SIlya Dryomov static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result);
6440192ce2eSIlya Dryomov static void rbd_img_handle_request(struct rbd_img_request *img_req, int result);
6450192ce2eSIlya Dryomov
6460192ce2eSIlya Dryomov /*
6470192ce2eSIlya Dryomov * Return true if nothing else is pending.
6480192ce2eSIlya Dryomov */
pending_result_dec(struct pending_result * pending,int * result)6490192ce2eSIlya Dryomov static bool pending_result_dec(struct pending_result *pending, int *result)
6500192ce2eSIlya Dryomov {
6510192ce2eSIlya Dryomov rbd_assert(pending->num_pending > 0);
6520192ce2eSIlya Dryomov
6530192ce2eSIlya Dryomov if (*result && !pending->result)
6540192ce2eSIlya Dryomov pending->result = *result;
6550192ce2eSIlya Dryomov if (--pending->num_pending)
6560192ce2eSIlya Dryomov return false;
6570192ce2eSIlya Dryomov
6580192ce2eSIlya Dryomov *result = pending->result;
6590192ce2eSIlya Dryomov return true;
6600192ce2eSIlya Dryomov }
661602adf40SYehuda Sadeh
/*
 * Block device open handler (see rbd_bd_ops).
 *
 * Under rbd_dev->lock, either bumps open_count or notices that
 * RBD_DEV_FLAG_REMOVING is set.  Returns -ENOENT in the latter case;
 * otherwise takes a reference on the embedded struct device and
 * returns 0.  @mode is not used.
 */
static int rbd_open(struct gendisk *disk, blk_mode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;
	bool removing;

	spin_lock_irq(&rbd_dev->lock);
	removing = test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
	if (!removing)
		rbd_dev->open_count++;
	spin_unlock_irq(&rbd_dev->lock);

	if (removing)
		return -ENOENT;

	(void) get_device(&rbd_dev->dev);

	return 0;
}
680602adf40SYehuda Sadeh
rbd_release(struct gendisk * disk)681ae220766SChristoph Hellwig static void rbd_release(struct gendisk *disk)
682dfc5606dSYehuda Sadeh {
683dfc5606dSYehuda Sadeh struct rbd_device *rbd_dev = disk->private_data;
684b82d167bSAlex Elder unsigned long open_count_before;
685b82d167bSAlex Elder
686a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock);
687b82d167bSAlex Elder open_count_before = rbd_dev->open_count--;
688a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock);
689b82d167bSAlex Elder rbd_assert(open_count_before > 0);
690dfc5606dSYehuda Sadeh
691c3e946ceSAlex Elder put_device(&rbd_dev->dev);
692dfc5606dSYehuda Sadeh }
693dfc5606dSYehuda Sadeh
/*
 * Block device operations for rbd disks: open and release are handled
 * by rbd_open() and rbd_release(); no other operations are provided.
 */
694602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = {
695602adf40SYehuda Sadeh .owner = THIS_MODULE,
696602adf40SYehuda Sadeh .open = rbd_open,
697dfc5606dSYehuda Sadeh .release = rbd_release,
698602adf40SYehuda Sadeh };
699602adf40SYehuda Sadeh
700602adf40SYehuda Sadeh /*
7017262cfcaSAlex Elder * Initialize an rbd client instance. Success or not, this function
702cfbf6377SAlex Elder * consumes ceph_opts. Caller holds client_mutex.
703602adf40SYehuda Sadeh */
rbd_client_create(struct ceph_options * ceph_opts)704f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
705602adf40SYehuda Sadeh {
706602adf40SYehuda Sadeh struct rbd_client *rbdc;
707602adf40SYehuda Sadeh int ret = -ENOMEM;
708602adf40SYehuda Sadeh
70937206ee5SAlex Elder dout("%s:\n", __func__);
710602adf40SYehuda Sadeh rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
711602adf40SYehuda Sadeh if (!rbdc)
712602adf40SYehuda Sadeh goto out_opt;
713602adf40SYehuda Sadeh
714602adf40SYehuda Sadeh kref_init(&rbdc->kref);
715602adf40SYehuda Sadeh INIT_LIST_HEAD(&rbdc->node);
716602adf40SYehuda Sadeh
71774da4a0fSIlya Dryomov rbdc->client = ceph_create_client(ceph_opts, rbdc);
718602adf40SYehuda Sadeh if (IS_ERR(rbdc->client))
71908f75463SAlex Elder goto out_rbdc;
72043ae4701SAlex Elder ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
721602adf40SYehuda Sadeh
722602adf40SYehuda Sadeh ret = ceph_open_session(rbdc->client);
723602adf40SYehuda Sadeh if (ret < 0)
72408f75463SAlex Elder goto out_client;
725602adf40SYehuda Sadeh
726432b8587SAlex Elder spin_lock(&rbd_client_list_lock);
727602adf40SYehuda Sadeh list_add_tail(&rbdc->node, &rbd_client_list);
728432b8587SAlex Elder spin_unlock(&rbd_client_list_lock);
729602adf40SYehuda Sadeh
73037206ee5SAlex Elder dout("%s: rbdc %p\n", __func__, rbdc);
731bc534d86SAlex Elder
732602adf40SYehuda Sadeh return rbdc;
73308f75463SAlex Elder out_client:
734602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client);
73508f75463SAlex Elder out_rbdc:
736602adf40SYehuda Sadeh kfree(rbdc);
737602adf40SYehuda Sadeh out_opt:
73843ae4701SAlex Elder if (ceph_opts)
73943ae4701SAlex Elder ceph_destroy_options(ceph_opts);
74037206ee5SAlex Elder dout("%s: error %d\n", __func__, ret);
74137206ee5SAlex Elder
74228f259b7SVasiliy Kulikov return ERR_PTR(ret);
743602adf40SYehuda Sadeh }
744602adf40SYehuda Sadeh
__rbd_get_client(struct rbd_client * rbdc)7452f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
7462f82ee54SAlex Elder {
7472f82ee54SAlex Elder kref_get(&rbdc->kref);
7482f82ee54SAlex Elder
7492f82ee54SAlex Elder return rbdc;
7502f82ee54SAlex Elder }
7512f82ee54SAlex Elder
752602adf40SYehuda Sadeh /*
7531f7ba331SAlex Elder * Find a ceph client with specific addr and configuration. If
7541f7ba331SAlex Elder * found, bump its reference count.
755602adf40SYehuda Sadeh */
rbd_client_find(struct ceph_options * ceph_opts)7561f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
757602adf40SYehuda Sadeh {
7583302ffd4SJakob Koschel struct rbd_client *rbdc = NULL, *iter;
759602adf40SYehuda Sadeh
76043ae4701SAlex Elder if (ceph_opts->flags & CEPH_OPT_NOSHARE)
761602adf40SYehuda Sadeh return NULL;
762602adf40SYehuda Sadeh
7631f7ba331SAlex Elder spin_lock(&rbd_client_list_lock);
7643302ffd4SJakob Koschel list_for_each_entry(iter, &rbd_client_list, node) {
7653302ffd4SJakob Koschel if (!ceph_compare_options(ceph_opts, iter->client)) {
7663302ffd4SJakob Koschel __rbd_get_client(iter);
7672f82ee54SAlex Elder
7683302ffd4SJakob Koschel rbdc = iter;
7691f7ba331SAlex Elder break;
7701f7ba331SAlex Elder }
7711f7ba331SAlex Elder }
7721f7ba331SAlex Elder spin_unlock(&rbd_client_list_lock);
7731f7ba331SAlex Elder
7743302ffd4SJakob Koschel return rbdc;
775602adf40SYehuda Sadeh }
776602adf40SYehuda Sadeh
777602adf40SYehuda Sadeh /*
778210c104cSIlya Dryomov * (Per device) rbd map options
77959c2be1eSYehuda Sadeh */
/*
 * Option tokens referenced by the rbd_parameters[] table.  The marker
 * comments inside the enum group the tokens by argument kind; tokens
 * after the last marker are the ones declared as flags in that table.
 */
78059c2be1eSYehuda Sadeh enum {
781b5584180SIlya Dryomov Opt_queue_depth,
7820c93e1b7SIlya Dryomov Opt_alloc_size,
78334f55d0bSDongsheng Yang Opt_lock_timeout,
78459c2be1eSYehuda Sadeh /* int args above */
785b26c047bSIlya Dryomov Opt_pool_ns,
786dc1dad8eSIlya Dryomov Opt_compression_hint,
78759c2be1eSYehuda Sadeh /* string args above */
788cc0538b6SAlex Elder Opt_read_only,
789cc0538b6SAlex Elder Opt_read_write,
79080de1912SIlya Dryomov Opt_lock_on_read,
791e010dd0aSIlya Dryomov Opt_exclusive,
792d9360540SIlya Dryomov Opt_notrim,
79359c2be1eSYehuda Sadeh };
79459c2be1eSYehuda Sadeh
/*
 * Values of the "compression_hint" option; the names they are spelled
 * with are listed in rbd_param_compression_hint[].
 */
795dc1dad8eSIlya Dryomov enum {
796dc1dad8eSIlya Dryomov Opt_compression_hint_none,
797dc1dad8eSIlya Dryomov Opt_compression_hint_compressible,
798dc1dad8eSIlya Dryomov Opt_compression_hint_incompressible,
799dc1dad8eSIlya Dryomov };
800dc1dad8eSIlya Dryomov
/*
 * Name -> value table for the "compression_hint" option, handed to
 * fsparam_enum() in rbd_parameters[].  The empty entry ends the table.
 */
801dc1dad8eSIlya Dryomov static const struct constant_table rbd_param_compression_hint[] = {
802dc1dad8eSIlya Dryomov {"none", Opt_compression_hint_none},
803dc1dad8eSIlya Dryomov {"compressible", Opt_compression_hint_compressible},
804dc1dad8eSIlya Dryomov {"incompressible", Opt_compression_hint_incompressible},
805dc1dad8eSIlya Dryomov {}
806dc1dad8eSIlya Dryomov };
807dc1dad8eSIlya Dryomov
/*
 * Specification table for the per-device map options: each entry pairs
 * an option name with its Opt_* token and argument kind.  "ro" and "rw"
 * are aliases that share the tokens of "read_only" and "read_write".
 * Note that the pool namespace option is spelled "_pool_ns", with a
 * leading underscore.  The empty entry ends the table.
 */
808d7167b14SAl Viro static const struct fs_parameter_spec rbd_parameters[] = {
80982995cc6SDavid Howells fsparam_u32 ("alloc_size", Opt_alloc_size),
810dc1dad8eSIlya Dryomov fsparam_enum ("compression_hint", Opt_compression_hint,
811dc1dad8eSIlya Dryomov rbd_param_compression_hint),
81282995cc6SDavid Howells fsparam_flag ("exclusive", Opt_exclusive),
81382995cc6SDavid Howells fsparam_flag ("lock_on_read", Opt_lock_on_read),
81482995cc6SDavid Howells fsparam_u32 ("lock_timeout", Opt_lock_timeout),
81582995cc6SDavid Howells fsparam_flag ("notrim", Opt_notrim),
81682995cc6SDavid Howells fsparam_string ("_pool_ns", Opt_pool_ns),
81782995cc6SDavid Howells fsparam_u32 ("queue_depth", Opt_queue_depth),
81882995cc6SDavid Howells fsparam_flag ("read_only", Opt_read_only),
81982995cc6SDavid Howells fsparam_flag ("read_write", Opt_read_write),
82082995cc6SDavid Howells fsparam_flag ("ro", Opt_read_only),
82182995cc6SDavid Howells fsparam_flag ("rw", Opt_read_write),
82282995cc6SDavid Howells {}
82382995cc6SDavid Howells };
82482995cc6SDavid Howells
/*
 * Per-device map option values.  Each of the first seven fields has a
 * like-named RBD_*_DEFAULT macro defined right after this struct.
 * NOTE(review): the code that parses options into this struct is not
 * part of this excerpt -- confirm field semantics against it.
 */
82598571b5aSAlex Elder struct rbd_options {
826b5584180SIlya Dryomov int queue_depth;
8270c93e1b7SIlya Dryomov int alloc_size;
/* NOTE(review): unit of lock_timeout is not visible here -- confirm */
82834f55d0bSDongsheng Yang unsigned long lock_timeout;
82998571b5aSAlex Elder bool read_only;
83080de1912SIlya Dryomov bool lock_on_read;
831e010dd0aSIlya Dryomov bool exclusive;
/* NOTE(review): presumably cleared by the "notrim" option -- confirm */
832d9360540SIlya Dryomov bool trim;
833dc1dad8eSIlya Dryomov
834dc1dad8eSIlya Dryomov u32 alloc_hint_flags; /* CEPH_OSD_OP_ALLOC_HINT_FLAG_* */
83598571b5aSAlex Elder };
83698571b5aSAlex Elder
/*
 * Default values, one per like-named field of struct rbd_options.
 * NOTE(review): the code applying these defaults is outside this
 * excerpt.
 */
837d2a27964SJohn Garry #define RBD_QUEUE_DEPTH_DEFAULT BLKDEV_DEFAULT_RQ
8380c93e1b7SIlya Dryomov #define RBD_ALLOC_SIZE_DEFAULT (64 * 1024)
83934f55d0bSDongsheng Yang #define RBD_LOCK_TIMEOUT_DEFAULT 0 /* no timeout */
84098571b5aSAlex Elder #define RBD_READ_ONLY_DEFAULT false
84180de1912SIlya Dryomov #define RBD_LOCK_ON_READ_DEFAULT false
842e010dd0aSIlya Dryomov #define RBD_EXCLUSIVE_DEFAULT false
843d9360540SIlya Dryomov #define RBD_TRIM_DEFAULT true
84498571b5aSAlex Elder
/*
 * Bundles the image spec, the ceph options and the rbd options.
 * NOTE(review): presumably the context handed around while parsing a
 * map request -- its users are outside this excerpt, confirm there.
 */
84582995cc6SDavid Howells struct rbd_parse_opts_ctx {
846c300156bSIlya Dryomov struct rbd_spec *spec;
84782995cc6SDavid Howells struct ceph_options *copts;
848c300156bSIlya Dryomov struct rbd_options *opts;
849c300156bSIlya Dryomov };
850c300156bSIlya Dryomov
obj_op_name(enum obj_operation_type op_type)8516d2940c8SGuangliang Zhao static char* obj_op_name(enum obj_operation_type op_type)
8526d2940c8SGuangliang Zhao {
8536d2940c8SGuangliang Zhao switch (op_type) {
8546d2940c8SGuangliang Zhao case OBJ_OP_READ:
8556d2940c8SGuangliang Zhao return "read";
8566d2940c8SGuangliang Zhao case OBJ_OP_WRITE:
8576d2940c8SGuangliang Zhao return "write";
85890e98c52SGuangliang Zhao case OBJ_OP_DISCARD:
85990e98c52SGuangliang Zhao return "discard";
8606484cbe9SIlya Dryomov case OBJ_OP_ZEROOUT:
8616484cbe9SIlya Dryomov return "zeroout";
8626d2940c8SGuangliang Zhao default:
8636d2940c8SGuangliang Zhao return "???";
8646d2940c8SGuangliang Zhao }
8656d2940c8SGuangliang Zhao }
8666d2940c8SGuangliang Zhao
86759c2be1eSYehuda Sadeh /*
868602adf40SYehuda Sadeh * Destroy ceph client
869d23a4b3fSAlex Elder *
870432b8587SAlex Elder * Caller must hold rbd_client_list_lock.
871602adf40SYehuda Sadeh */
rbd_client_release(struct kref * kref)872602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref)
873602adf40SYehuda Sadeh {
874602adf40SYehuda Sadeh struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
875602adf40SYehuda Sadeh
87637206ee5SAlex Elder dout("%s: rbdc %p\n", __func__, rbdc);
877cd9d9f5dSAlex Elder spin_lock(&rbd_client_list_lock);
878602adf40SYehuda Sadeh list_del(&rbdc->node);
879cd9d9f5dSAlex Elder spin_unlock(&rbd_client_list_lock);
880602adf40SYehuda Sadeh
881602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client);
882602adf40SYehuda Sadeh kfree(rbdc);
883602adf40SYehuda Sadeh }
884602adf40SYehuda Sadeh
885602adf40SYehuda Sadeh /*
886602adf40SYehuda Sadeh * Drop reference to ceph client node. If it's not referenced anymore, release
887602adf40SYehuda Sadeh * it.
888602adf40SYehuda Sadeh */
rbd_put_client(struct rbd_client * rbdc)8899d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc)
890602adf40SYehuda Sadeh {
891c53d5893SAlex Elder if (rbdc)
8929d3997fdSAlex Elder kref_put(&rbdc->kref, rbd_client_release);
893602adf40SYehuda Sadeh }
894602adf40SYehuda Sadeh
8955feb0d8dSIlya Dryomov /*
8965feb0d8dSIlya Dryomov * Get a ceph client with specific addr and configuration, if one does
8975feb0d8dSIlya Dryomov * not exist create it. Either way, ceph_opts is consumed by this
8985feb0d8dSIlya Dryomov * function.
8995feb0d8dSIlya Dryomov */
rbd_get_client(struct ceph_options * ceph_opts)9005feb0d8dSIlya Dryomov static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
9015feb0d8dSIlya Dryomov {
9025feb0d8dSIlya Dryomov struct rbd_client *rbdc;
903dd435855SIlya Dryomov int ret;
9045feb0d8dSIlya Dryomov
905a32e4143SIlya Dryomov mutex_lock(&client_mutex);
9065feb0d8dSIlya Dryomov rbdc = rbd_client_find(ceph_opts);
907dd435855SIlya Dryomov if (rbdc) {
9085feb0d8dSIlya Dryomov ceph_destroy_options(ceph_opts);
909dd435855SIlya Dryomov
910dd435855SIlya Dryomov /*
911dd435855SIlya Dryomov * Using an existing client. Make sure ->pg_pools is up to
912dd435855SIlya Dryomov * date before we look up the pool id in do_rbd_add().
913dd435855SIlya Dryomov */
9149d4a227fSIlya Dryomov ret = ceph_wait_for_latest_osdmap(rbdc->client,
9159d4a227fSIlya Dryomov rbdc->client->options->mount_timeout);
916dd435855SIlya Dryomov if (ret) {
917dd435855SIlya Dryomov rbd_warn(NULL, "failed to get latest osdmap: %d", ret);
918dd435855SIlya Dryomov rbd_put_client(rbdc);
919dd435855SIlya Dryomov rbdc = ERR_PTR(ret);
920dd435855SIlya Dryomov }
921dd435855SIlya Dryomov } else {
9225feb0d8dSIlya Dryomov rbdc = rbd_client_create(ceph_opts);
923dd435855SIlya Dryomov }
9245feb0d8dSIlya Dryomov mutex_unlock(&client_mutex);
9255feb0d8dSIlya Dryomov
9265feb0d8dSIlya Dryomov return rbdc;
9275feb0d8dSIlya Dryomov }
9285feb0d8dSIlya Dryomov
/* Only image formats 1 and 2 are accepted. */
static bool rbd_image_format_valid(u32 image_format)
{
	switch (image_format) {
	case 1:
	case 2:
		return true;
	default:
		return false;
	}
}
933a30b71b9SAlex Elder
rbd_dev_ondisk_valid(struct rbd_image_header_ondisk * ondisk)9348e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
9358e94af8eSAlex Elder {
936103a150fSAlex Elder size_t size;
937103a150fSAlex Elder u32 snap_count;
938103a150fSAlex Elder
939103a150fSAlex Elder /* The header has to start with the magic rbd header text */
940103a150fSAlex Elder if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
941103a150fSAlex Elder return false;
942103a150fSAlex Elder
943db2388b6SAlex Elder /* The bio layer requires at least sector-sized I/O */
944db2388b6SAlex Elder
945db2388b6SAlex Elder if (ondisk->options.order < SECTOR_SHIFT)
946db2388b6SAlex Elder return false;
947db2388b6SAlex Elder
948db2388b6SAlex Elder /* If we use u64 in a few spots we may be able to loosen this */
949db2388b6SAlex Elder
950db2388b6SAlex Elder if (ondisk->options.order > 8 * sizeof (int) - 1)
951db2388b6SAlex Elder return false;
952db2388b6SAlex Elder
953103a150fSAlex Elder /*
954103a150fSAlex Elder * The size of a snapshot header has to fit in a size_t, and
955103a150fSAlex Elder * that limits the number of snapshots.
956103a150fSAlex Elder */
957103a150fSAlex Elder snap_count = le32_to_cpu(ondisk->snap_count);
958103a150fSAlex Elder size = SIZE_MAX - sizeof (struct ceph_snap_context);
959103a150fSAlex Elder if (snap_count > size / sizeof (__le64))
960103a150fSAlex Elder return false;
961103a150fSAlex Elder
962103a150fSAlex Elder /*
963103a150fSAlex Elder * Not only that, but the size of the entire the snapshot
964103a150fSAlex Elder * header must also be representable in a size_t.
965103a150fSAlex Elder */
966103a150fSAlex Elder size -= snap_count * sizeof (__le64);
967103a150fSAlex Elder if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
968103a150fSAlex Elder return false;
969103a150fSAlex Elder
970103a150fSAlex Elder return true;
9718e94af8eSAlex Elder }
9728e94af8eSAlex Elder
973602adf40SYehuda Sadeh /*
9745bc3fb17SIlya Dryomov * returns the size of an object in the image
9755bc3fb17SIlya Dryomov */
rbd_obj_bytes(struct rbd_image_header * header)9765bc3fb17SIlya Dryomov static u32 rbd_obj_bytes(struct rbd_image_header *header)
9775bc3fb17SIlya Dryomov {
9785bc3fb17SIlya Dryomov return 1U << header->obj_order;
9795bc3fb17SIlya Dryomov }
9805bc3fb17SIlya Dryomov
rbd_init_layout(struct rbd_device * rbd_dev)981263423f8SIlya Dryomov static void rbd_init_layout(struct rbd_device *rbd_dev)
982263423f8SIlya Dryomov {
983263423f8SIlya Dryomov if (rbd_dev->header.stripe_unit == 0 ||
984263423f8SIlya Dryomov rbd_dev->header.stripe_count == 0) {
985263423f8SIlya Dryomov rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
986263423f8SIlya Dryomov rbd_dev->header.stripe_count = 1;
987263423f8SIlya Dryomov }
988263423f8SIlya Dryomov
989263423f8SIlya Dryomov rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
990263423f8SIlya Dryomov rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
991263423f8SIlya Dryomov rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
9927e97332eSIlya Dryomov rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
9937e97332eSIlya Dryomov rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
994263423f8SIlya Dryomov RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
995263423f8SIlya Dryomov }
996263423f8SIlya Dryomov
rbd_image_header_cleanup(struct rbd_image_header * header)997510a7330SIlya Dryomov static void rbd_image_header_cleanup(struct rbd_image_header *header)
998510a7330SIlya Dryomov {
999510a7330SIlya Dryomov kfree(header->object_prefix);
1000510a7330SIlya Dryomov ceph_put_snap_context(header->snapc);
1001510a7330SIlya Dryomov kfree(header->snap_sizes);
1002510a7330SIlya Dryomov kfree(header->snap_names);
1003510a7330SIlya Dryomov
1004510a7330SIlya Dryomov memset(header, 0, sizeof(*header));
1005510a7330SIlya Dryomov }
1006510a7330SIlya Dryomov
10075bc3fb17SIlya Dryomov /*
1008bb23e37aSAlex Elder * Fill an rbd image header with information from the given format 1
1009bb23e37aSAlex Elder * on-disk header.
1010602adf40SYehuda Sadeh */
rbd_header_from_disk(struct rbd_image_header * header,struct rbd_image_header_ondisk * ondisk,bool first_time)1011510a7330SIlya Dryomov static int rbd_header_from_disk(struct rbd_image_header *header,
1012510a7330SIlya Dryomov struct rbd_image_header_ondisk *ondisk,
1013510a7330SIlya Dryomov bool first_time)
1014602adf40SYehuda Sadeh {
1015bb23e37aSAlex Elder struct ceph_snap_context *snapc;
1016bb23e37aSAlex Elder char *object_prefix = NULL;
1017bb23e37aSAlex Elder char *snap_names = NULL;
1018bb23e37aSAlex Elder u64 *snap_sizes = NULL;
1019ccece235SAlex Elder u32 snap_count;
1020bb23e37aSAlex Elder int ret = -ENOMEM;
1021621901d6SAlex Elder u32 i;
1022602adf40SYehuda Sadeh
1023bb23e37aSAlex Elder /* Allocate this now to avoid having to handle failure below */
1024103a150fSAlex Elder
1025bb23e37aSAlex Elder if (first_time) {
1026848d796cSIlya Dryomov object_prefix = kstrndup(ondisk->object_prefix,
1027848d796cSIlya Dryomov sizeof(ondisk->object_prefix),
1028848d796cSIlya Dryomov GFP_KERNEL);
1029bb23e37aSAlex Elder if (!object_prefix)
1030602adf40SYehuda Sadeh return -ENOMEM;
1031bb23e37aSAlex Elder }
103200f1f36fSAlex Elder
1033bb23e37aSAlex Elder /* Allocate the snapshot context and fill it in */
1034d2bb24e5SAlex Elder
1035602adf40SYehuda Sadeh snap_count = le32_to_cpu(ondisk->snap_count);
1036bb23e37aSAlex Elder snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
1037bb23e37aSAlex Elder if (!snapc)
1038bb23e37aSAlex Elder goto out_err;
1039bb23e37aSAlex Elder snapc->seq = le64_to_cpu(ondisk->snap_seq);
1040602adf40SYehuda Sadeh if (snap_count) {
1041bb23e37aSAlex Elder struct rbd_image_snap_ondisk *snaps;
1042f785cc1dSAlex Elder u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
1043f785cc1dSAlex Elder
1044bb23e37aSAlex Elder /* We'll keep a copy of the snapshot names... */
1045621901d6SAlex Elder
1046f785cc1dSAlex Elder if (snap_names_len > (u64)SIZE_MAX)
1047bb23e37aSAlex Elder goto out_2big;
1048bb23e37aSAlex Elder snap_names = kmalloc(snap_names_len, GFP_KERNEL);
1049bb23e37aSAlex Elder if (!snap_names)
1050602adf40SYehuda Sadeh goto out_err;
1051bb23e37aSAlex Elder
1052bb23e37aSAlex Elder /* ...as well as the array of their sizes. */
105388a25a5fSMarkus Elfring snap_sizes = kmalloc_array(snap_count,
105488a25a5fSMarkus Elfring sizeof(*header->snap_sizes),
105588a25a5fSMarkus Elfring GFP_KERNEL);
1056bb23e37aSAlex Elder if (!snap_sizes)
1057bb23e37aSAlex Elder goto out_err;
1058bb23e37aSAlex Elder
1059f785cc1dSAlex Elder /*
1060bb23e37aSAlex Elder * Copy the names, and fill in each snapshot's id
1061bb23e37aSAlex Elder * and size.
1062bb23e37aSAlex Elder *
106399a41ebcSAlex Elder * Note that rbd_dev_v1_header_info() guarantees the
1064bb23e37aSAlex Elder * ondisk buffer we're working with has
1065f785cc1dSAlex Elder * snap_names_len bytes beyond the end of the
1066f785cc1dSAlex Elder * snapshot id array, this memcpy() is safe.
1067f785cc1dSAlex Elder */
1068bb23e37aSAlex Elder memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
1069bb23e37aSAlex Elder snaps = ondisk->snaps;
1070bb23e37aSAlex Elder for (i = 0; i < snap_count; i++) {
1071bb23e37aSAlex Elder snapc->snaps[i] = le64_to_cpu(snaps[i].id);
1072bb23e37aSAlex Elder snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
1073bb23e37aSAlex Elder }
1074602adf40SYehuda Sadeh }
1075849b4260SAlex Elder
1076bb23e37aSAlex Elder /* We won't fail any more, fill in the header */
1077bb23e37aSAlex Elder
1078bb23e37aSAlex Elder if (first_time) {
1079bb23e37aSAlex Elder header->object_prefix = object_prefix;
1080602adf40SYehuda Sadeh header->obj_order = ondisk->options.order;
1081bb23e37aSAlex Elder }
10826a52325fSAlex Elder
1083bb23e37aSAlex Elder /* The remaining fields always get updated (when we refresh) */
1084621901d6SAlex Elder
1085f84344f3SAlex Elder header->image_size = le64_to_cpu(ondisk->image_size);
1086bb23e37aSAlex Elder header->snapc = snapc;
1087bb23e37aSAlex Elder header->snap_names = snap_names;
1088bb23e37aSAlex Elder header->snap_sizes = snap_sizes;
1089468521c1SAlex Elder
1090602adf40SYehuda Sadeh return 0;
1091bb23e37aSAlex Elder out_2big:
1092bb23e37aSAlex Elder ret = -EIO;
10936a52325fSAlex Elder out_err:
1094bb23e37aSAlex Elder kfree(snap_sizes);
1095bb23e37aSAlex Elder kfree(snap_names);
1096bb23e37aSAlex Elder ceph_put_snap_context(snapc);
1097bb23e37aSAlex Elder kfree(object_prefix);
1098ccece235SAlex Elder
1099bb23e37aSAlex Elder return ret;
1100602adf40SYehuda Sadeh }
1101602adf40SYehuda Sadeh
/*
 * Return a kstrdup()'d copy of the name of the snapshot at position
 * @which in a format 1 image header, or NULL if the copy could not be
 * allocated.
 *
 * header.snap_names is walked as a sequence of NUL-terminated strings
 * stored back to back; @which must be below snapc->num_snaps
 * (asserted).
 */
static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	const char *name = rbd_dev->header.snap_names;
	u32 skipped;

	rbd_assert(which < rbd_dev->header.snapc->num_snaps);

	/* Step over the names that precede the requested one */
	for (skipped = 0; skipped < which; skipped++)
		name += strlen(name) + 1;

	return kstrdup(name, GFP_KERNEL);
}
11169682fc6dSAlex Elder
111730d1cff8SAlex Elder /*
111830d1cff8SAlex Elder * Snapshot id comparison function for use with qsort()/bsearch().
111930d1cff8SAlex Elder * Note that result is for snapshots in *descending* order.
112030d1cff8SAlex Elder */
snapid_compare_reverse(const void * s1,const void * s2)112130d1cff8SAlex Elder static int snapid_compare_reverse(const void *s1, const void *s2)
112230d1cff8SAlex Elder {
112330d1cff8SAlex Elder u64 snap_id1 = *(u64 *)s1;
112430d1cff8SAlex Elder u64 snap_id2 = *(u64 *)s2;
112530d1cff8SAlex Elder
112630d1cff8SAlex Elder if (snap_id1 < snap_id2)
112730d1cff8SAlex Elder return 1;
112830d1cff8SAlex Elder return snap_id1 == snap_id2 ? 0 : -1;
112930d1cff8SAlex Elder }
113030d1cff8SAlex Elder
113130d1cff8SAlex Elder /*
113230d1cff8SAlex Elder * Search a snapshot context to see if the given snapshot id is
113330d1cff8SAlex Elder * present.
113430d1cff8SAlex Elder *
113530d1cff8SAlex Elder * Returns the position of the snapshot id in the array if it's found,
113630d1cff8SAlex Elder * or BAD_SNAP_INDEX otherwise.
113730d1cff8SAlex Elder *
113830d1cff8SAlex Elder * Note: The snapshot array is in kept sorted (by the osd) in
113930d1cff8SAlex Elder * reverse order, highest snapshot id first.
114030d1cff8SAlex Elder */
rbd_dev_snap_index(struct rbd_device * rbd_dev,u64 snap_id)11419682fc6dSAlex Elder static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
11429682fc6dSAlex Elder {
11439682fc6dSAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc;
114430d1cff8SAlex Elder u64 *found;
11459682fc6dSAlex Elder
114630d1cff8SAlex Elder found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
114730d1cff8SAlex Elder sizeof (snap_id), snapid_compare_reverse);
11489682fc6dSAlex Elder
114930d1cff8SAlex Elder return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
11509682fc6dSAlex Elder }
11519682fc6dSAlex Elder
/*
 * Look up the name of snapshot @snap_id in a format 1 image.
 *
 * Returns the name obtained from _rbd_dev_v1_snap_name(),
 * ERR_PTR(-ENOENT) if the id is not in the snapshot context, or
 * ERR_PTR(-ENOMEM) if the name could not be copied.
 */
static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id)
{
	const char *snap_name;
	u32 which = rbd_dev_snap_index(rbd_dev, snap_id);

	if (which == BAD_SNAP_INDEX)
		return ERR_PTR(-ENOENT);

	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
	if (!snap_name)
		return ERR_PTR(-ENOMEM);

	return snap_name;
}
116554cac61fSAlex Elder
/*
 * Return the name for @snap_id: RBD_SNAP_HEAD_NAME for CEPH_NOSNAP,
 * otherwise whatever the format-specific lookup (v1 or v2) returns.
 * The image format is asserted to be valid.
 */
static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
{
	if (snap_id == CEPH_NOSNAP)
		return RBD_SNAP_HEAD_NAME;

	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));

	return rbd_dev->image_format == 1 ?
			rbd_dev_v1_snap_name(rbd_dev, snap_id) :
			rbd_dev_v2_snap_name(rbd_dev, snap_id);
}
11779e15b77dSAlex Elder
/*
 * Store the size for @snap_id in *@snap_size.
 *
 * CEPH_NOSNAP yields the image size from the header.  For a format 1
 * image the size comes from header.snap_sizes; otherwise it is
 * obtained through _rbd_dev_v2_snap_size().
 *
 * Returns 0 on success, -ENOENT if a format 1 snapshot id is unknown,
 * or the error from _rbd_dev_v2_snap_size().  *@snap_size is left
 * untouched on failure.
 */
static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
			 u64 *snap_size)
{
	u64 size = 0;
	u32 which;
	int ret;

	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));

	if (snap_id == CEPH_NOSNAP) {
		*snap_size = rbd_dev->header.image_size;
		return 0;
	}

	if (rbd_dev->image_format == 1) {
		which = rbd_dev_snap_index(rbd_dev, snap_id);
		if (which == BAD_SNAP_INDEX)
			return -ENOENT;

		*snap_size = rbd_dev->header.snap_sizes[which];
		return 0;
	}

	ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
	if (ret)
		return ret;

	*snap_size = size;
	return 0;
}
12042ad3d716SAlex Elder
rbd_dev_mapping_set(struct rbd_device * rbd_dev)1205d1cf5788SAlex Elder static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
1206602adf40SYehuda Sadeh {
12078f4b7d98SAlex Elder u64 snap_id = rbd_dev->spec->snap_id;
12082ad3d716SAlex Elder u64 size = 0;
12092ad3d716SAlex Elder int ret;
12108b0241f8SAlex Elder
12112ad3d716SAlex Elder ret = rbd_snap_size(rbd_dev, snap_id, &size);
12122ad3d716SAlex Elder if (ret)
12132ad3d716SAlex Elder return ret;
12142ad3d716SAlex Elder
12152ad3d716SAlex Elder rbd_dev->mapping.size = size;
12168b0241f8SAlex Elder return 0;
1217602adf40SYehuda Sadeh }
1218602adf40SYehuda Sadeh
rbd_dev_mapping_clear(struct rbd_device * rbd_dev)1219d1cf5788SAlex Elder static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
1220d1cf5788SAlex Elder {
1221d1cf5788SAlex Elder rbd_dev->mapping.size = 0;
1222200a6a8bSAlex Elder }
1223200a6a8bSAlex Elder
/*
 * Zero @bytes bytes of bio data starting @off bytes past @bio_pos.
 * The walk is done on a local copy of the iterator, so @bio_pos itself
 * is not advanced.
 */
static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes)
{
	struct ceph_bio_iter cursor = *bio_pos;

	ceph_bio_iter_advance(&cursor, off);
	ceph_bio_iter_advance_step(&cursor, bytes, ({
		memzero_bvec(&bv);
	}));
}
1233b9434c5bSAlex Elder
/*
 * Zero @bytes bytes of bio_vec data starting @off bytes past
 * @bvec_pos.  The walk is done on a local copy of the iterator, so
 * @bvec_pos itself is not advanced.
 */
static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
{
	struct ceph_bvec_iter cursor = *bvec_pos;

	ceph_bvec_iter_advance(&cursor, off);
	ceph_bvec_iter_advance_step(&cursor, bytes, ({
		memzero_bvec(&bv);
	}));
}
1243602adf40SYehuda Sadeh
1244f7760dadSAlex Elder /*
12453da691bfSIlya Dryomov * Zero a range in @obj_req data buffer defined by a bio (list) or
1246afb97888SIlya Dryomov * (private) bio_vec array.
1247f7760dadSAlex Elder *
12483da691bfSIlya Dryomov * @off is relative to the start of the data buffer.
1249f7760dadSAlex Elder */
rbd_obj_zero_range(struct rbd_obj_request * obj_req,u32 off,u32 bytes)12503da691bfSIlya Dryomov static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
12513da691bfSIlya Dryomov u32 bytes)
1252f7760dadSAlex Elder {
125354ab3b24SIlya Dryomov dout("%s %p data buf %u~%u\n", __func__, obj_req, off, bytes);
125454ab3b24SIlya Dryomov
1255ecc633caSIlya Dryomov switch (obj_req->img_request->data_type) {
12563da691bfSIlya Dryomov case OBJ_REQUEST_BIO:
12573da691bfSIlya Dryomov zero_bios(&obj_req->bio_pos, off, bytes);
12583da691bfSIlya Dryomov break;
12593da691bfSIlya Dryomov case OBJ_REQUEST_BVECS:
1260afb97888SIlya Dryomov case OBJ_REQUEST_OWN_BVECS:
12613da691bfSIlya Dryomov zero_bvecs(&obj_req->bvec_pos, off, bytes);
12623da691bfSIlya Dryomov break;
12633da691bfSIlya Dryomov default:
126416809372SArnd Bergmann BUG();
1265f5400b7aSAlex Elder }
1266bf0d5f50SAlex Elder }
1267bf0d5f50SAlex Elder
1268bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref);
rbd_obj_request_put(struct rbd_obj_request * obj_request)1269bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1270bf0d5f50SAlex Elder {
1271bf0d5f50SAlex Elder rbd_assert(obj_request != NULL);
127237206ee5SAlex Elder dout("%s: obj %p (was %d)\n", __func__, obj_request,
12732c935bc5SPeter Zijlstra kref_read(&obj_request->kref));
1274bf0d5f50SAlex Elder kref_put(&obj_request->kref, rbd_obj_request_destroy);
1275bf0d5f50SAlex Elder }
1276bf0d5f50SAlex Elder
rbd_img_obj_request_add(struct rbd_img_request * img_request,struct rbd_obj_request * obj_request)1277bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1278bf0d5f50SAlex Elder struct rbd_obj_request *obj_request)
1279bf0d5f50SAlex Elder {
128025dcf954SAlex Elder rbd_assert(obj_request->img_request == NULL);
128125dcf954SAlex Elder
1282b155e86cSAlex Elder /* Image request now owns object's original reference */
1283bf0d5f50SAlex Elder obj_request->img_request = img_request;
128415961b44SIlya Dryomov dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
1285bf0d5f50SAlex Elder }
1286bf0d5f50SAlex Elder
rbd_img_obj_request_del(struct rbd_img_request * img_request,struct rbd_obj_request * obj_request)1287bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1288bf0d5f50SAlex Elder struct rbd_obj_request *obj_request)
1289bf0d5f50SAlex Elder {
129015961b44SIlya Dryomov dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
129143df3d35SIlya Dryomov list_del(&obj_request->ex.oe_item);
1292bf0d5f50SAlex Elder rbd_assert(obj_request->img_request == img_request);
1293bf0d5f50SAlex Elder rbd_obj_request_put(obj_request);
1294bf0d5f50SAlex Elder }
1295bf0d5f50SAlex Elder
rbd_osd_submit(struct ceph_osd_request * osd_req)1296a086a1b8SIlya Dryomov static void rbd_osd_submit(struct ceph_osd_request *osd_req)
1297bf0d5f50SAlex Elder {
1298a086a1b8SIlya Dryomov struct rbd_obj_request *obj_req = osd_req->r_priv;
1299980917fcSIlya Dryomov
1300a086a1b8SIlya Dryomov dout("%s osd_req %p for obj_req %p objno %llu %llu~%llu\n",
1301a086a1b8SIlya Dryomov __func__, osd_req, obj_req, obj_req->ex.oe_objno,
1302a086a1b8SIlya Dryomov obj_req->ex.oe_off, obj_req->ex.oe_len);
1303a8af0d68SJeff Layton ceph_osdc_start_request(osd_req->r_osdc, osd_req);
1304bf0d5f50SAlex Elder }
1305bf0d5f50SAlex Elder
13060c425248SAlex Elder /*
13070c425248SAlex Elder * The default/initial value for all image request flags is 0. Each
13080c425248SAlex Elder * is conditionally set to 1 at image request initialization time
13090c425248SAlex Elder * and currently never change thereafter.
13100c425248SAlex Elder */
img_request_layered_set(struct rbd_img_request * img_request)1311d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request)
1312d0b2e944SAlex Elder {
1313d0b2e944SAlex Elder set_bit(IMG_REQ_LAYERED, &img_request->flags);
1314d0b2e944SAlex Elder }
1315d0b2e944SAlex Elder
img_request_layered_test(struct rbd_img_request * img_request)1316d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request)
1317d0b2e944SAlex Elder {
1318d0b2e944SAlex Elder return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1319d0b2e944SAlex Elder }
1320d0b2e944SAlex Elder
rbd_obj_is_entire(struct rbd_obj_request * obj_req)13213da691bfSIlya Dryomov static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req)
13223b434a2aSJosh Durgin {
13233da691bfSIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
13243da691bfSIlya Dryomov
132543df3d35SIlya Dryomov return !obj_req->ex.oe_off &&
132643df3d35SIlya Dryomov obj_req->ex.oe_len == rbd_dev->layout.object_size;
13273b434a2aSJosh Durgin }
13283b434a2aSJosh Durgin
rbd_obj_is_tail(struct rbd_obj_request * obj_req)13293da691bfSIlya Dryomov static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req)
13306e2a4505SAlex Elder {
13313da691bfSIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1332b9434c5bSAlex Elder
133343df3d35SIlya Dryomov return obj_req->ex.oe_off + obj_req->ex.oe_len ==
13343da691bfSIlya Dryomov rbd_dev->layout.object_size;
13356e2a4505SAlex Elder }
13366e2a4505SAlex Elder
133713488d53SIlya Dryomov /*
133813488d53SIlya Dryomov * Must be called after rbd_obj_calc_img_extents().
133913488d53SIlya Dryomov */
rbd_obj_set_copyup_enabled(struct rbd_obj_request * obj_req)134009fe05c5SIlya Dryomov static void rbd_obj_set_copyup_enabled(struct rbd_obj_request *obj_req)
134113488d53SIlya Dryomov {
1342870611e4SIlya Dryomov rbd_assert(obj_req->img_request->snapc);
1343870611e4SIlya Dryomov
134409fe05c5SIlya Dryomov if (obj_req->img_request->op_type == OBJ_OP_DISCARD) {
134509fe05c5SIlya Dryomov dout("%s %p objno %llu discard\n", __func__, obj_req,
134609fe05c5SIlya Dryomov obj_req->ex.oe_objno);
134709fe05c5SIlya Dryomov return;
134809fe05c5SIlya Dryomov }
134913488d53SIlya Dryomov
135009fe05c5SIlya Dryomov if (!obj_req->num_img_extents) {
135109fe05c5SIlya Dryomov dout("%s %p objno %llu not overlapping\n", __func__, obj_req,
135209fe05c5SIlya Dryomov obj_req->ex.oe_objno);
135309fe05c5SIlya Dryomov return;
135409fe05c5SIlya Dryomov }
135509fe05c5SIlya Dryomov
135609fe05c5SIlya Dryomov if (rbd_obj_is_entire(obj_req) &&
135709fe05c5SIlya Dryomov !obj_req->img_request->snapc->num_snaps) {
135809fe05c5SIlya Dryomov dout("%s %p objno %llu entire\n", __func__, obj_req,
135909fe05c5SIlya Dryomov obj_req->ex.oe_objno);
136009fe05c5SIlya Dryomov return;
136109fe05c5SIlya Dryomov }
136209fe05c5SIlya Dryomov
136309fe05c5SIlya Dryomov obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
136413488d53SIlya Dryomov }
136513488d53SIlya Dryomov
rbd_obj_img_extents_bytes(struct rbd_obj_request * obj_req)136686bd7998SIlya Dryomov static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req)
1367bf0d5f50SAlex Elder {
136886bd7998SIlya Dryomov return ceph_file_extents_bytes(obj_req->img_extents,
136986bd7998SIlya Dryomov obj_req->num_img_extents);
1370bf0d5f50SAlex Elder }
1371bf0d5f50SAlex Elder
rbd_img_is_write(struct rbd_img_request * img_req)13723da691bfSIlya Dryomov static bool rbd_img_is_write(struct rbd_img_request *img_req)
13730dcc685eSIlya Dryomov {
13749bb0248dSIlya Dryomov switch (img_req->op_type) {
13753da691bfSIlya Dryomov case OBJ_OP_READ:
13763da691bfSIlya Dryomov return false;
13773da691bfSIlya Dryomov case OBJ_OP_WRITE:
13783da691bfSIlya Dryomov case OBJ_OP_DISCARD:
13796484cbe9SIlya Dryomov case OBJ_OP_ZEROOUT:
13803da691bfSIlya Dryomov return true;
13813da691bfSIlya Dryomov default:
1382c6244b3bSArnd Bergmann BUG();
13830dcc685eSIlya Dryomov }
13840dcc685eSIlya Dryomov }
13850dcc685eSIlya Dryomov
rbd_osd_req_callback(struct ceph_osd_request * osd_req)138685e084feSIlya Dryomov static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
1387bf0d5f50SAlex Elder {
13883da691bfSIlya Dryomov struct rbd_obj_request *obj_req = osd_req->r_priv;
138954ab3b24SIlya Dryomov int result;
1390bf0d5f50SAlex Elder
13913da691bfSIlya Dryomov dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
13923da691bfSIlya Dryomov osd_req->r_result, obj_req);
1393bf0d5f50SAlex Elder
1394c47f9371SAlex Elder /*
13953da691bfSIlya Dryomov * Writes aren't allowed to return a data payload. In some
13963da691bfSIlya Dryomov * guarded write cases (e.g. stat + zero on an empty object)
13973da691bfSIlya Dryomov * a stat response makes it through, but we don't care.
1398c47f9371SAlex Elder */
139954ab3b24SIlya Dryomov if (osd_req->r_result > 0 && rbd_img_is_write(obj_req->img_request))
140054ab3b24SIlya Dryomov result = 0;
140154ab3b24SIlya Dryomov else
140254ab3b24SIlya Dryomov result = osd_req->r_result;
14030ccd5926SIlya Dryomov
140454ab3b24SIlya Dryomov rbd_obj_handle_request(obj_req, result);
1405bf0d5f50SAlex Elder }
1406bf0d5f50SAlex Elder
rbd_osd_format_read(struct ceph_osd_request * osd_req)1407bcbab1dbSIlya Dryomov static void rbd_osd_format_read(struct ceph_osd_request *osd_req)
1408430c28c3SAlex Elder {
1409bcbab1dbSIlya Dryomov struct rbd_obj_request *obj_request = osd_req->r_priv;
141022d2cfdfSIlya Dryomov struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
141122d2cfdfSIlya Dryomov struct ceph_options *opt = rbd_dev->rbd_client->client->options;
1412430c28c3SAlex Elder
141322d2cfdfSIlya Dryomov osd_req->r_flags = CEPH_OSD_FLAG_READ | opt->read_from_replica;
14147c84883aSIlya Dryomov osd_req->r_snapid = obj_request->img_request->snap_id;
14159d4df01fSAlex Elder }
14169d4df01fSAlex Elder
rbd_osd_format_write(struct ceph_osd_request * osd_req)1417bcbab1dbSIlya Dryomov static void rbd_osd_format_write(struct ceph_osd_request *osd_req)
14189d4df01fSAlex Elder {
1419bcbab1dbSIlya Dryomov struct rbd_obj_request *obj_request = osd_req->r_priv;
14209d4df01fSAlex Elder
1421a162b308SIlya Dryomov osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
1422fac02ddfSArnd Bergmann ktime_get_real_ts64(&osd_req->r_mtime);
142343df3d35SIlya Dryomov osd_req->r_data_offset = obj_request->ex.oe_off;
1424430c28c3SAlex Elder }
1425430c28c3SAlex Elder
1426bc81207eSIlya Dryomov static struct ceph_osd_request *
__rbd_obj_add_osd_request(struct rbd_obj_request * obj_req,struct ceph_snap_context * snapc,int num_ops)1427bcbab1dbSIlya Dryomov __rbd_obj_add_osd_request(struct rbd_obj_request *obj_req,
1428bcbab1dbSIlya Dryomov struct ceph_snap_context *snapc, int num_ops)
1429bc81207eSIlya Dryomov {
1430e28eded5SIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1431bc81207eSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1432bc81207eSIlya Dryomov struct ceph_osd_request *req;
1433a90bb0c1SIlya Dryomov const char *name_format = rbd_dev->image_format == 1 ?
1434a90bb0c1SIlya Dryomov RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
1435bcbab1dbSIlya Dryomov int ret;
1436bc81207eSIlya Dryomov
1437e28eded5SIlya Dryomov req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO);
1438bc81207eSIlya Dryomov if (!req)
1439bcbab1dbSIlya Dryomov return ERR_PTR(-ENOMEM);
1440bc81207eSIlya Dryomov
1441bcbab1dbSIlya Dryomov list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
1442bc81207eSIlya Dryomov req->r_callback = rbd_osd_req_callback;
1443a162b308SIlya Dryomov req->r_priv = obj_req;
1444bc81207eSIlya Dryomov
1445b26c047bSIlya Dryomov /*
1446b26c047bSIlya Dryomov * Data objects may be stored in a separate pool, but always in
1447b26c047bSIlya Dryomov * the same namespace in that pool as the header in its pool.
1448b26c047bSIlya Dryomov */
1449b26c047bSIlya Dryomov ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
1450bc81207eSIlya Dryomov req->r_base_oloc.pool = rbd_dev->layout.pool_id;
1451b26c047bSIlya Dryomov
1452bcbab1dbSIlya Dryomov ret = ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
1453bcbab1dbSIlya Dryomov rbd_dev->header.object_prefix,
1454bcbab1dbSIlya Dryomov obj_req->ex.oe_objno);
1455bcbab1dbSIlya Dryomov if (ret)
1456bcbab1dbSIlya Dryomov return ERR_PTR(ret);
1457bc81207eSIlya Dryomov
1458bc81207eSIlya Dryomov return req;
1459bc81207eSIlya Dryomov }
1460bc81207eSIlya Dryomov
1461e28eded5SIlya Dryomov static struct ceph_osd_request *
rbd_obj_add_osd_request(struct rbd_obj_request * obj_req,int num_ops)1462bcbab1dbSIlya Dryomov rbd_obj_add_osd_request(struct rbd_obj_request *obj_req, int num_ops)
1463e28eded5SIlya Dryomov {
1464870611e4SIlya Dryomov rbd_assert(obj_req->img_request->snapc);
1465bcbab1dbSIlya Dryomov return __rbd_obj_add_osd_request(obj_req, obj_req->img_request->snapc,
1466e28eded5SIlya Dryomov num_ops);
1467e28eded5SIlya Dryomov }
1468e28eded5SIlya Dryomov
rbd_obj_request_create(void)1469ecc633caSIlya Dryomov static struct rbd_obj_request *rbd_obj_request_create(void)
1470bf0d5f50SAlex Elder {
1471bf0d5f50SAlex Elder struct rbd_obj_request *obj_request;
1472bf0d5f50SAlex Elder
14735a60e876SIlya Dryomov obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
14746c696d85SIlya Dryomov if (!obj_request)
1475f907ad55SAlex Elder return NULL;
1476f907ad55SAlex Elder
147743df3d35SIlya Dryomov ceph_object_extent_init(&obj_request->ex);
1478bcbab1dbSIlya Dryomov INIT_LIST_HEAD(&obj_request->osd_reqs);
147985b5e6d1SIlya Dryomov mutex_init(&obj_request->state_mutex);
1480bf0d5f50SAlex Elder kref_init(&obj_request->kref);
1481bf0d5f50SAlex Elder
148267e2b652SIlya Dryomov dout("%s %p\n", __func__, obj_request);
1483bf0d5f50SAlex Elder return obj_request;
1484bf0d5f50SAlex Elder }
1485bf0d5f50SAlex Elder
rbd_obj_request_destroy(struct kref * kref)1486bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref)
1487bf0d5f50SAlex Elder {
1488bf0d5f50SAlex Elder struct rbd_obj_request *obj_request;
1489bcbab1dbSIlya Dryomov struct ceph_osd_request *osd_req;
14907e07efb1SIlya Dryomov u32 i;
1491bf0d5f50SAlex Elder
1492bf0d5f50SAlex Elder obj_request = container_of(kref, struct rbd_obj_request, kref);
1493bf0d5f50SAlex Elder
149437206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request);
149537206ee5SAlex Elder
1496bcbab1dbSIlya Dryomov while (!list_empty(&obj_request->osd_reqs)) {
1497bcbab1dbSIlya Dryomov osd_req = list_first_entry(&obj_request->osd_reqs,
1498bcbab1dbSIlya Dryomov struct ceph_osd_request, r_private_item);
1499bcbab1dbSIlya Dryomov list_del_init(&osd_req->r_private_item);
1500bcbab1dbSIlya Dryomov ceph_osdc_put_request(osd_req);
1501bcbab1dbSIlya Dryomov }
1502bf0d5f50SAlex Elder
1503ecc633caSIlya Dryomov switch (obj_request->img_request->data_type) {
15049969ebc5SAlex Elder case OBJ_REQUEST_NODATA:
1505bf0d5f50SAlex Elder case OBJ_REQUEST_BIO:
15067e07efb1SIlya Dryomov case OBJ_REQUEST_BVECS:
15075359a17dSIlya Dryomov break; /* Nothing to do */
1508afb97888SIlya Dryomov case OBJ_REQUEST_OWN_BVECS:
1509afb97888SIlya Dryomov kfree(obj_request->bvec_pos.bvecs);
1510bf0d5f50SAlex Elder break;
15117e07efb1SIlya Dryomov default:
151216809372SArnd Bergmann BUG();
1513bf0d5f50SAlex Elder }
1514bf0d5f50SAlex Elder
151586bd7998SIlya Dryomov kfree(obj_request->img_extents);
15167e07efb1SIlya Dryomov if (obj_request->copyup_bvecs) {
15177e07efb1SIlya Dryomov for (i = 0; i < obj_request->copyup_bvec_count; i++) {
15187e07efb1SIlya Dryomov if (obj_request->copyup_bvecs[i].bv_page)
15197e07efb1SIlya Dryomov __free_page(obj_request->copyup_bvecs[i].bv_page);
15207e07efb1SIlya Dryomov }
15217e07efb1SIlya Dryomov kfree(obj_request->copyup_bvecs);
1522bf0d5f50SAlex Elder }
1523bf0d5f50SAlex Elder
1524868311b1SAlex Elder kmem_cache_free(rbd_obj_request_cache, obj_request);
1525bf0d5f50SAlex Elder }
1526bf0d5f50SAlex Elder
1527fb65d228SAlex Elder /* It's OK to call this for a device with no parent */
1528fb65d228SAlex Elder
1529fb65d228SAlex Elder static void rbd_spec_put(struct rbd_spec *spec);
rbd_dev_unparent(struct rbd_device * rbd_dev)1530fb65d228SAlex Elder static void rbd_dev_unparent(struct rbd_device *rbd_dev)
1531fb65d228SAlex Elder {
1532fb65d228SAlex Elder rbd_dev_remove_parent(rbd_dev);
1533fb65d228SAlex Elder rbd_spec_put(rbd_dev->parent_spec);
1534fb65d228SAlex Elder rbd_dev->parent_spec = NULL;
1535fb65d228SAlex Elder rbd_dev->parent_overlap = 0;
1536fb65d228SAlex Elder }
1537fb65d228SAlex Elder
1538bf0d5f50SAlex Elder /*
1539a2acd00eSAlex Elder * Parent image reference counting is used to determine when an
1540a2acd00eSAlex Elder * image's parent fields can be safely torn down--after there are no
1541a2acd00eSAlex Elder * more in-flight requests to the parent image. When the last
1542a2acd00eSAlex Elder * reference is dropped, cleaning them up is safe.
1543a2acd00eSAlex Elder */
rbd_dev_parent_put(struct rbd_device * rbd_dev)1544a2acd00eSAlex Elder static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
1545a2acd00eSAlex Elder {
1546a2acd00eSAlex Elder int counter;
1547a2acd00eSAlex Elder
1548a2acd00eSAlex Elder if (!rbd_dev->parent_spec)
1549a2acd00eSAlex Elder return;
1550a2acd00eSAlex Elder
1551a2acd00eSAlex Elder counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
1552a2acd00eSAlex Elder if (counter > 0)
1553a2acd00eSAlex Elder return;
1554a2acd00eSAlex Elder
1555a2acd00eSAlex Elder /* Last reference; clean up parent data structures */
1556a2acd00eSAlex Elder
1557a2acd00eSAlex Elder if (!counter)
1558a2acd00eSAlex Elder rbd_dev_unparent(rbd_dev);
1559a2acd00eSAlex Elder else
15609584d508SIlya Dryomov rbd_warn(rbd_dev, "parent reference underflow");
1561a2acd00eSAlex Elder }
1562a2acd00eSAlex Elder
1563a2acd00eSAlex Elder /*
1564a2acd00eSAlex Elder * If an image has a non-zero parent overlap, get a reference to its
1565a2acd00eSAlex Elder * parent.
1566a2acd00eSAlex Elder *
1567a2acd00eSAlex Elder * Returns true if the rbd device has a parent with a non-zero
1568a2acd00eSAlex Elder * overlap and a reference for it was successfully taken, or
1569a2acd00eSAlex Elder * false otherwise.
1570a2acd00eSAlex Elder */
rbd_dev_parent_get(struct rbd_device * rbd_dev)1571a2acd00eSAlex Elder static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
1572a2acd00eSAlex Elder {
1573ae43e9d0SIlya Dryomov int counter = 0;
1574a2acd00eSAlex Elder
1575a2acd00eSAlex Elder if (!rbd_dev->parent_spec)
1576a2acd00eSAlex Elder return false;
1577a2acd00eSAlex Elder
1578ae43e9d0SIlya Dryomov if (rbd_dev->parent_overlap)
1579a2acd00eSAlex Elder counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
1580a2acd00eSAlex Elder
1581a2acd00eSAlex Elder if (counter < 0)
15829584d508SIlya Dryomov rbd_warn(rbd_dev, "parent reference overflow");
1583a2acd00eSAlex Elder
1584ae43e9d0SIlya Dryomov return counter > 0;
1585a2acd00eSAlex Elder }
1586a2acd00eSAlex Elder
rbd_img_request_init(struct rbd_img_request * img_request,struct rbd_device * rbd_dev,enum obj_operation_type op_type)158759e542c8SIlya Dryomov static void rbd_img_request_init(struct rbd_img_request *img_request,
1588cc344fa1SAlex Elder struct rbd_device *rbd_dev,
1589a52cc685SIlya Dryomov enum obj_operation_type op_type)
1590bf0d5f50SAlex Elder {
159159e542c8SIlya Dryomov memset(img_request, 0, sizeof(*img_request));
1592bf0d5f50SAlex Elder
1593bf0d5f50SAlex Elder img_request->rbd_dev = rbd_dev;
15949bb0248dSIlya Dryomov img_request->op_type = op_type;
1595a0c5895bSIlya Dryomov
1596e1fddc8fSIlya Dryomov INIT_LIST_HEAD(&img_request->lock_item);
159743df3d35SIlya Dryomov INIT_LIST_HEAD(&img_request->object_extents);
15980192ce2eSIlya Dryomov mutex_init(&img_request->state_mutex);
1599bf0d5f50SAlex Elder }
1600bf0d5f50SAlex Elder
1601870611e4SIlya Dryomov /*
1602870611e4SIlya Dryomov * Only snap_id is captured here, for reads. For writes, snapshot
1603870611e4SIlya Dryomov * context is captured in rbd_img_object_requests() after exclusive
1604870611e4SIlya Dryomov * lock is ensured to be held.
1605870611e4SIlya Dryomov */
rbd_img_capture_header(struct rbd_img_request * img_req)1606a52cc685SIlya Dryomov static void rbd_img_capture_header(struct rbd_img_request *img_req)
1607a52cc685SIlya Dryomov {
1608a52cc685SIlya Dryomov struct rbd_device *rbd_dev = img_req->rbd_dev;
1609a52cc685SIlya Dryomov
1610a52cc685SIlya Dryomov lockdep_assert_held(&rbd_dev->header_rwsem);
1611a52cc685SIlya Dryomov
1612870611e4SIlya Dryomov if (!rbd_img_is_write(img_req))
1613a52cc685SIlya Dryomov img_req->snap_id = rbd_dev->spec->snap_id;
1614a52cc685SIlya Dryomov
1615a52cc685SIlya Dryomov if (rbd_dev_parent_get(rbd_dev))
1616a52cc685SIlya Dryomov img_request_layered_set(img_req);
1617a52cc685SIlya Dryomov }
1618a52cc685SIlya Dryomov
rbd_img_request_destroy(struct rbd_img_request * img_request)1619679a97d2SHannes Reinecke static void rbd_img_request_destroy(struct rbd_img_request *img_request)
1620bf0d5f50SAlex Elder {
1621bf0d5f50SAlex Elder struct rbd_obj_request *obj_request;
1622bf0d5f50SAlex Elder struct rbd_obj_request *next_obj_request;
1623bf0d5f50SAlex Elder
162437206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request);
162537206ee5SAlex Elder
1626e1fddc8fSIlya Dryomov WARN_ON(!list_empty(&img_request->lock_item));
1627bf0d5f50SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1628bf0d5f50SAlex Elder rbd_img_obj_request_del(img_request, obj_request);
1629bf0d5f50SAlex Elder
163078b42a87SIlya Dryomov if (img_request_layered_test(img_request))
1631a2acd00eSAlex Elder rbd_dev_parent_put(img_request->rbd_dev);
1632a2acd00eSAlex Elder
16339bb0248dSIlya Dryomov if (rbd_img_is_write(img_request))
1634812164f8SAlex Elder ceph_put_snap_context(img_request->snapc);
1635bf0d5f50SAlex Elder
163659e542c8SIlya Dryomov if (test_bit(IMG_REQ_CHILD, &img_request->flags))
16371c2a9dfeSAlex Elder kmem_cache_free(rbd_img_request_cache, img_request);
1638bf0d5f50SAlex Elder }
1639bf0d5f50SAlex Elder
/*
 * Object map geometry: each object's state takes BITS_PER_OBJ bits,
 * so OBJS_PER_BYTE states are packed into every byte and OBJ_MASK
 * selects a single state once it has been shifted down.
 */
#define BITS_PER_OBJ	2
#define OBJS_PER_BYTE	(BITS_PER_BYTE / BITS_PER_OBJ)
#define OBJ_MASK	((1 << BITS_PER_OBJ) - 1)
164322e8bd51SIlya Dryomov
/*
 * Locate the state of object @objno in the object map.
 *
 * @index receives the byte offset into the map and @shift the bit
 * position of the state within that byte.  The first object of a byte
 * gets the largest shift, i.e. occupies the most significant bits.
 * @objno must be below the loaded object map size.
 */
static void __rbd_object_map_index(struct rbd_device *rbd_dev, u64 objno,
				   u64 *index, u8 *shift)
{
	u32 slot;	/* position of the object within its byte */

	rbd_assert(objno < rbd_dev->object_map_size);

	*index = div_u64_rem(objno, OBJS_PER_BYTE, &slot);
	*shift = (OBJS_PER_BYTE - 1 - slot) * BITS_PER_OBJ;
}
165322e8bd51SIlya Dryomov
__rbd_object_map_get(struct rbd_device * rbd_dev,u64 objno)165422e8bd51SIlya Dryomov static u8 __rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
165522e8bd51SIlya Dryomov {
165622e8bd51SIlya Dryomov u64 index;
165722e8bd51SIlya Dryomov u8 shift;
165822e8bd51SIlya Dryomov
165922e8bd51SIlya Dryomov lockdep_assert_held(&rbd_dev->object_map_lock);
166022e8bd51SIlya Dryomov __rbd_object_map_index(rbd_dev, objno, &index, &shift);
166122e8bd51SIlya Dryomov return (rbd_dev->object_map[index] >> shift) & OBJ_MASK;
166222e8bd51SIlya Dryomov }
166322e8bd51SIlya Dryomov
/*
 * Store state @val for object @objno in the object map, leaving the
 * other states in the same byte untouched.  @val must fit in OBJ_MASK
 * and the caller must hold object_map_lock.
 */
static void __rbd_object_map_set(struct rbd_device *rbd_dev, u64 objno, u8 val)
{
	u64 index;
	u8 shift;
	u8 byte;

	lockdep_assert_held(&rbd_dev->object_map_lock);
	rbd_assert(!(val & ~OBJ_MASK));

	__rbd_object_map_index(rbd_dev, objno, &index, &shift);

	byte = rbd_dev->object_map[index];
	byte &= ~(OBJ_MASK << shift);	/* clear the old state */
	byte |= val << shift;		/* insert the new one */
	rbd_dev->object_map[index] = byte;
}
167722e8bd51SIlya Dryomov
rbd_object_map_get(struct rbd_device * rbd_dev,u64 objno)167822e8bd51SIlya Dryomov static u8 rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
167922e8bd51SIlya Dryomov {
168022e8bd51SIlya Dryomov u8 state;
168122e8bd51SIlya Dryomov
168222e8bd51SIlya Dryomov spin_lock(&rbd_dev->object_map_lock);
168322e8bd51SIlya Dryomov state = __rbd_object_map_get(rbd_dev, objno);
168422e8bd51SIlya Dryomov spin_unlock(&rbd_dev->object_map_lock);
168522e8bd51SIlya Dryomov return state;
168622e8bd51SIlya Dryomov }
168722e8bd51SIlya Dryomov
use_object_map(struct rbd_device * rbd_dev)168822e8bd51SIlya Dryomov static bool use_object_map(struct rbd_device *rbd_dev)
168922e8bd51SIlya Dryomov {
16903fe69921SIlya Dryomov /*
16913fe69921SIlya Dryomov * An image mapped read-only can't use the object map -- it isn't
16923fe69921SIlya Dryomov * loaded because the header lock isn't acquired. Someone else can
16933fe69921SIlya Dryomov * write to the image and update the object map behind our back.
16943fe69921SIlya Dryomov *
16953fe69921SIlya Dryomov * A snapshot can't be written to, so using the object map is always
16963fe69921SIlya Dryomov * safe.
16973fe69921SIlya Dryomov */
16983fe69921SIlya Dryomov if (!rbd_is_snap(rbd_dev) && rbd_is_ro(rbd_dev))
16993fe69921SIlya Dryomov return false;
17003fe69921SIlya Dryomov
170122e8bd51SIlya Dryomov return ((rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) &&
170222e8bd51SIlya Dryomov !(rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID));
170322e8bd51SIlya Dryomov }
170422e8bd51SIlya Dryomov
rbd_object_map_may_exist(struct rbd_device * rbd_dev,u64 objno)170522e8bd51SIlya Dryomov static bool rbd_object_map_may_exist(struct rbd_device *rbd_dev, u64 objno)
170622e8bd51SIlya Dryomov {
170722e8bd51SIlya Dryomov u8 state;
170822e8bd51SIlya Dryomov
170922e8bd51SIlya Dryomov /* fall back to default logic if object map is disabled or invalid */
171022e8bd51SIlya Dryomov if (!use_object_map(rbd_dev))
171122e8bd51SIlya Dryomov return true;
171222e8bd51SIlya Dryomov
171322e8bd51SIlya Dryomov state = rbd_object_map_get(rbd_dev, objno);
171422e8bd51SIlya Dryomov return state != OBJECT_NONEXISTENT;
171522e8bd51SIlya Dryomov }
171622e8bd51SIlya Dryomov
/*
 * Format the name of the object map object into @oid: the object map
 * prefix followed by the image id, plus a ".<16 hex digit snap id>"
 * suffix unless @snap_id is CEPH_NOSNAP.
 */
static void rbd_object_map_name(struct rbd_device *rbd_dev, u64 snap_id,
				struct ceph_object_id *oid)
{
	if (snap_id != CEPH_NOSNAP) {
		ceph_oid_printf(oid, "%s%s.%016llx", RBD_OBJECT_MAP_PREFIX,
				rbd_dev->spec->image_id, snap_id);
		return;
	}

	ceph_oid_printf(oid, "%s%s", RBD_OBJECT_MAP_PREFIX,
			rbd_dev->spec->image_id);
}
172722e8bd51SIlya Dryomov
rbd_object_map_lock(struct rbd_device * rbd_dev)172822e8bd51SIlya Dryomov static int rbd_object_map_lock(struct rbd_device *rbd_dev)
172922e8bd51SIlya Dryomov {
173022e8bd51SIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
173122e8bd51SIlya Dryomov CEPH_DEFINE_OID_ONSTACK(oid);
173222e8bd51SIlya Dryomov u8 lock_type;
173322e8bd51SIlya Dryomov char *lock_tag;
173422e8bd51SIlya Dryomov struct ceph_locker *lockers;
173522e8bd51SIlya Dryomov u32 num_lockers;
173622e8bd51SIlya Dryomov bool broke_lock = false;
173722e8bd51SIlya Dryomov int ret;
173822e8bd51SIlya Dryomov
173922e8bd51SIlya Dryomov rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);
174022e8bd51SIlya Dryomov
174122e8bd51SIlya Dryomov again:
174222e8bd51SIlya Dryomov ret = ceph_cls_lock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
174322e8bd51SIlya Dryomov CEPH_CLS_LOCK_EXCLUSIVE, "", "", "", 0);
174422e8bd51SIlya Dryomov if (ret != -EBUSY || broke_lock) {
174522e8bd51SIlya Dryomov if (ret == -EEXIST)
174622e8bd51SIlya Dryomov ret = 0; /* already locked by myself */
174722e8bd51SIlya Dryomov if (ret)
174822e8bd51SIlya Dryomov rbd_warn(rbd_dev, "failed to lock object map: %d", ret);
174922e8bd51SIlya Dryomov return ret;
175022e8bd51SIlya Dryomov }
175122e8bd51SIlya Dryomov
175222e8bd51SIlya Dryomov ret = ceph_cls_lock_info(osdc, &oid, &rbd_dev->header_oloc,
175322e8bd51SIlya Dryomov RBD_LOCK_NAME, &lock_type, &lock_tag,
175422e8bd51SIlya Dryomov &lockers, &num_lockers);
175522e8bd51SIlya Dryomov if (ret) {
175622e8bd51SIlya Dryomov if (ret == -ENOENT)
175722e8bd51SIlya Dryomov goto again;
175822e8bd51SIlya Dryomov
175922e8bd51SIlya Dryomov rbd_warn(rbd_dev, "failed to get object map lockers: %d", ret);
176022e8bd51SIlya Dryomov return ret;
176122e8bd51SIlya Dryomov }
176222e8bd51SIlya Dryomov
176322e8bd51SIlya Dryomov kfree(lock_tag);
176422e8bd51SIlya Dryomov if (num_lockers == 0)
176522e8bd51SIlya Dryomov goto again;
176622e8bd51SIlya Dryomov
176722e8bd51SIlya Dryomov rbd_warn(rbd_dev, "breaking object map lock owned by %s%llu",
176822e8bd51SIlya Dryomov ENTITY_NAME(lockers[0].id.name));
176922e8bd51SIlya Dryomov
177022e8bd51SIlya Dryomov ret = ceph_cls_break_lock(osdc, &oid, &rbd_dev->header_oloc,
177122e8bd51SIlya Dryomov RBD_LOCK_NAME, lockers[0].id.cookie,
177222e8bd51SIlya Dryomov &lockers[0].id.name);
177322e8bd51SIlya Dryomov ceph_free_lockers(lockers, num_lockers);
177422e8bd51SIlya Dryomov if (ret) {
177522e8bd51SIlya Dryomov if (ret == -ENOENT)
177622e8bd51SIlya Dryomov goto again;
177722e8bd51SIlya Dryomov
177822e8bd51SIlya Dryomov rbd_warn(rbd_dev, "failed to break object map lock: %d", ret);
177922e8bd51SIlya Dryomov return ret;
178022e8bd51SIlya Dryomov }
178122e8bd51SIlya Dryomov
178222e8bd51SIlya Dryomov broke_lock = true;
178322e8bd51SIlya Dryomov goto again;
178422e8bd51SIlya Dryomov }
178522e8bd51SIlya Dryomov
rbd_object_map_unlock(struct rbd_device * rbd_dev)178622e8bd51SIlya Dryomov static void rbd_object_map_unlock(struct rbd_device *rbd_dev)
178722e8bd51SIlya Dryomov {
178822e8bd51SIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
178922e8bd51SIlya Dryomov CEPH_DEFINE_OID_ONSTACK(oid);
179022e8bd51SIlya Dryomov int ret;
179122e8bd51SIlya Dryomov
179222e8bd51SIlya Dryomov rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);
179322e8bd51SIlya Dryomov
179422e8bd51SIlya Dryomov ret = ceph_cls_unlock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
179522e8bd51SIlya Dryomov "");
179622e8bd51SIlya Dryomov if (ret && ret != -ENOENT)
179722e8bd51SIlya Dryomov rbd_warn(rbd_dev, "failed to unlock object map: %d", ret);
179822e8bd51SIlya Dryomov }
179922e8bd51SIlya Dryomov
/*
 * Decode the header that precedes the object map data.
 *
 * The encoding is a 32-bit header length followed by that many header
 * bytes.  The header bytes start with a versioned "BitVector header"
 * struct whose first field is a 64-bit value, which is returned in
 * @object_map_size.
 *
 * On success, *@p is advanced past the whole header (length prefix and
 * header bytes) and 0 is returned.  Returns -EINVAL if the buffer is
 * too short -- including when the advertised header length would run
 * past @end -- or the error from ceph_start_decoding().
 */
static int decode_object_map_header(void **p, void *end, u64 *object_map_size)
{
	u8 struct_v;
	u32 struct_len;
	u32 header_len;
	void *header_end;
	int ret;

	ceph_decode_32_safe(p, end, header_len, e_inval);

	/*
	 * header_len comes off the wire: make sure the header really
	 * fits in the buffer before *p is moved to its end below.
	 */
	ceph_decode_need(p, end, header_len, e_inval);
	header_end = *p + header_len;

	ret = ceph_start_decoding(p, end, 1, "BitVector header", &struct_v,
				  &struct_len);
	if (ret)
		return ret;

	ceph_decode_64_safe(p, end, *object_map_size, e_inval);

	/* skip whatever else the header contains */
	*p = header_end;
	return 0;

e_inval:
	return -EINVAL;
}
182422e8bd51SIlya Dryomov
__rbd_object_map_load(struct rbd_device * rbd_dev)182522e8bd51SIlya Dryomov static int __rbd_object_map_load(struct rbd_device *rbd_dev)
182622e8bd51SIlya Dryomov {
182722e8bd51SIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
182822e8bd51SIlya Dryomov CEPH_DEFINE_OID_ONSTACK(oid);
182922e8bd51SIlya Dryomov struct page **pages;
183022e8bd51SIlya Dryomov void *p, *end;
183122e8bd51SIlya Dryomov size_t reply_len;
183222e8bd51SIlya Dryomov u64 num_objects;
183322e8bd51SIlya Dryomov u64 object_map_bytes;
183422e8bd51SIlya Dryomov u64 object_map_size;
183522e8bd51SIlya Dryomov int num_pages;
183622e8bd51SIlya Dryomov int ret;
183722e8bd51SIlya Dryomov
183822e8bd51SIlya Dryomov rbd_assert(!rbd_dev->object_map && !rbd_dev->object_map_size);
183922e8bd51SIlya Dryomov
184022e8bd51SIlya Dryomov num_objects = ceph_get_num_objects(&rbd_dev->layout,
184122e8bd51SIlya Dryomov rbd_dev->mapping.size);
184222e8bd51SIlya Dryomov object_map_bytes = DIV_ROUND_UP_ULL(num_objects * BITS_PER_OBJ,
184322e8bd51SIlya Dryomov BITS_PER_BYTE);
184422e8bd51SIlya Dryomov num_pages = calc_pages_for(0, object_map_bytes) + 1;
184522e8bd51SIlya Dryomov pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
184622e8bd51SIlya Dryomov if (IS_ERR(pages))
184722e8bd51SIlya Dryomov return PTR_ERR(pages);
184822e8bd51SIlya Dryomov
184922e8bd51SIlya Dryomov reply_len = num_pages * PAGE_SIZE;
185022e8bd51SIlya Dryomov rbd_object_map_name(rbd_dev, rbd_dev->spec->snap_id, &oid);
185122e8bd51SIlya Dryomov ret = ceph_osdc_call(osdc, &oid, &rbd_dev->header_oloc,
185222e8bd51SIlya Dryomov "rbd", "object_map_load", CEPH_OSD_FLAG_READ,
185322e8bd51SIlya Dryomov NULL, 0, pages, &reply_len);
185422e8bd51SIlya Dryomov if (ret)
185522e8bd51SIlya Dryomov goto out;
185622e8bd51SIlya Dryomov
185722e8bd51SIlya Dryomov p = page_address(pages[0]);
185822e8bd51SIlya Dryomov end = p + min(reply_len, (size_t)PAGE_SIZE);
185922e8bd51SIlya Dryomov ret = decode_object_map_header(&p, end, &object_map_size);
186022e8bd51SIlya Dryomov if (ret)
186122e8bd51SIlya Dryomov goto out;
186222e8bd51SIlya Dryomov
186322e8bd51SIlya Dryomov if (object_map_size != num_objects) {
186422e8bd51SIlya Dryomov rbd_warn(rbd_dev, "object map size mismatch: %llu vs %llu",
186522e8bd51SIlya Dryomov object_map_size, num_objects);
186622e8bd51SIlya Dryomov ret = -EINVAL;
186722e8bd51SIlya Dryomov goto out;
186822e8bd51SIlya Dryomov }
186922e8bd51SIlya Dryomov
187022e8bd51SIlya Dryomov if (offset_in_page(p) + object_map_bytes > reply_len) {
187122e8bd51SIlya Dryomov ret = -EINVAL;
187222e8bd51SIlya Dryomov goto out;
187322e8bd51SIlya Dryomov }
187422e8bd51SIlya Dryomov
187522e8bd51SIlya Dryomov rbd_dev->object_map = kvmalloc(object_map_bytes, GFP_KERNEL);
187622e8bd51SIlya Dryomov if (!rbd_dev->object_map) {
187722e8bd51SIlya Dryomov ret = -ENOMEM;
187822e8bd51SIlya Dryomov goto out;
187922e8bd51SIlya Dryomov }
188022e8bd51SIlya Dryomov
188122e8bd51SIlya Dryomov rbd_dev->object_map_size = object_map_size;
188222e8bd51SIlya Dryomov ceph_copy_from_page_vector(pages, rbd_dev->object_map,
188322e8bd51SIlya Dryomov offset_in_page(p), object_map_bytes);
188422e8bd51SIlya Dryomov
188522e8bd51SIlya Dryomov out:
188622e8bd51SIlya Dryomov ceph_release_page_vector(pages, num_pages);
188722e8bd51SIlya Dryomov return ret;
188822e8bd51SIlya Dryomov }
188922e8bd51SIlya Dryomov
rbd_object_map_free(struct rbd_device * rbd_dev)189022e8bd51SIlya Dryomov static void rbd_object_map_free(struct rbd_device *rbd_dev)
189122e8bd51SIlya Dryomov {
189222e8bd51SIlya Dryomov kvfree(rbd_dev->object_map);
189322e8bd51SIlya Dryomov rbd_dev->object_map = NULL;
189422e8bd51SIlya Dryomov rbd_dev->object_map_size = 0;
189522e8bd51SIlya Dryomov }
189622e8bd51SIlya Dryomov
rbd_object_map_load(struct rbd_device * rbd_dev)189722e8bd51SIlya Dryomov static int rbd_object_map_load(struct rbd_device *rbd_dev)
189822e8bd51SIlya Dryomov {
189922e8bd51SIlya Dryomov int ret;
190022e8bd51SIlya Dryomov
190122e8bd51SIlya Dryomov ret = __rbd_object_map_load(rbd_dev);
190222e8bd51SIlya Dryomov if (ret)
190322e8bd51SIlya Dryomov return ret;
190422e8bd51SIlya Dryomov
190522e8bd51SIlya Dryomov ret = rbd_dev_v2_get_flags(rbd_dev);
190622e8bd51SIlya Dryomov if (ret) {
190722e8bd51SIlya Dryomov rbd_object_map_free(rbd_dev);
190822e8bd51SIlya Dryomov return ret;
190922e8bd51SIlya Dryomov }
191022e8bd51SIlya Dryomov
191122e8bd51SIlya Dryomov if (rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID)
191222e8bd51SIlya Dryomov rbd_warn(rbd_dev, "object map is invalid");
191322e8bd51SIlya Dryomov
191422e8bd51SIlya Dryomov return 0;
191522e8bd51SIlya Dryomov }
191622e8bd51SIlya Dryomov
/*
 * Take the object map lock and load the object map.
 *
 * If loading fails, rbd_object_map_unlock() is called before
 * returning so that a failed open leaves nothing held.
 *
 * Returns 0 on success or the error from the failing step.
 */
static int rbd_object_map_open(struct rbd_device *rbd_dev)
{
	int err = rbd_object_map_lock(rbd_dev);

	if (err)
		return err;

	err = rbd_object_map_load(rbd_dev);
	if (err)
		rbd_object_map_unlock(rbd_dev);

	return err;
}
193322e8bd51SIlya Dryomov
/*
 * Counterpart of rbd_object_map_open(): free the in-memory object map
 * first, then call rbd_object_map_unlock().
 */
static void rbd_object_map_close(struct rbd_device *rbd_dev)
{
	rbd_object_map_free(rbd_dev);
	rbd_object_map_unlock(rbd_dev);
}
193922e8bd51SIlya Dryomov
194022e8bd51SIlya Dryomov /*
194122e8bd51SIlya Dryomov * This function needs snap_id (or more precisely just something to
194222e8bd51SIlya Dryomov * distinguish between HEAD and snapshot object maps), new_state and
194322e8bd51SIlya Dryomov * current_state that were passed to rbd_object_map_update().
194422e8bd51SIlya Dryomov *
194522e8bd51SIlya Dryomov * To avoid allocating and stashing a context we piggyback on the OSD
194622e8bd51SIlya Dryomov * request. A HEAD update has two ops (assert_locked). For new_state
194722e8bd51SIlya Dryomov * and current_state we decode our own object_map_update op, encoded in
194822e8bd51SIlya Dryomov * rbd_cls_object_map_update().
194922e8bd51SIlya Dryomov */
rbd_object_map_update_finish(struct rbd_obj_request * obj_req,struct ceph_osd_request * osd_req)195022e8bd51SIlya Dryomov static int rbd_object_map_update_finish(struct rbd_obj_request *obj_req,
195122e8bd51SIlya Dryomov struct ceph_osd_request *osd_req)
195222e8bd51SIlya Dryomov {
195322e8bd51SIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
195422e8bd51SIlya Dryomov struct ceph_osd_data *osd_data;
195522e8bd51SIlya Dryomov u64 objno;
19563f649ab7SKees Cook u8 state, new_state, current_state;
195722e8bd51SIlya Dryomov bool has_current_state;
195822e8bd51SIlya Dryomov void *p;
195922e8bd51SIlya Dryomov
196022e8bd51SIlya Dryomov if (osd_req->r_result)
196122e8bd51SIlya Dryomov return osd_req->r_result;
196222e8bd51SIlya Dryomov
196322e8bd51SIlya Dryomov /*
196422e8bd51SIlya Dryomov * Nothing to do for a snapshot object map.
196522e8bd51SIlya Dryomov */
196622e8bd51SIlya Dryomov if (osd_req->r_num_ops == 1)
196722e8bd51SIlya Dryomov return 0;
196822e8bd51SIlya Dryomov
196922e8bd51SIlya Dryomov /*
197022e8bd51SIlya Dryomov * Update in-memory HEAD object map.
197122e8bd51SIlya Dryomov */
197222e8bd51SIlya Dryomov rbd_assert(osd_req->r_num_ops == 2);
197322e8bd51SIlya Dryomov osd_data = osd_req_op_data(osd_req, 1, cls, request_data);
197422e8bd51SIlya Dryomov rbd_assert(osd_data->type == CEPH_OSD_DATA_TYPE_PAGES);
197522e8bd51SIlya Dryomov
197622e8bd51SIlya Dryomov p = page_address(osd_data->pages[0]);
197722e8bd51SIlya Dryomov objno = ceph_decode_64(&p);
197822e8bd51SIlya Dryomov rbd_assert(objno == obj_req->ex.oe_objno);
197922e8bd51SIlya Dryomov rbd_assert(ceph_decode_64(&p) == objno + 1);
198022e8bd51SIlya Dryomov new_state = ceph_decode_8(&p);
198122e8bd51SIlya Dryomov has_current_state = ceph_decode_8(&p);
198222e8bd51SIlya Dryomov if (has_current_state)
198322e8bd51SIlya Dryomov current_state = ceph_decode_8(&p);
198422e8bd51SIlya Dryomov
198522e8bd51SIlya Dryomov spin_lock(&rbd_dev->object_map_lock);
198622e8bd51SIlya Dryomov state = __rbd_object_map_get(rbd_dev, objno);
198722e8bd51SIlya Dryomov if (!has_current_state || current_state == state ||
198822e8bd51SIlya Dryomov (current_state == OBJECT_EXISTS && state == OBJECT_EXISTS_CLEAN))
198922e8bd51SIlya Dryomov __rbd_object_map_set(rbd_dev, objno, new_state);
199022e8bd51SIlya Dryomov spin_unlock(&rbd_dev->object_map_lock);
199122e8bd51SIlya Dryomov
199222e8bd51SIlya Dryomov return 0;
199322e8bd51SIlya Dryomov }
199422e8bd51SIlya Dryomov
rbd_object_map_callback(struct ceph_osd_request * osd_req)199522e8bd51SIlya Dryomov static void rbd_object_map_callback(struct ceph_osd_request *osd_req)
199622e8bd51SIlya Dryomov {
199722e8bd51SIlya Dryomov struct rbd_obj_request *obj_req = osd_req->r_priv;
199822e8bd51SIlya Dryomov int result;
199922e8bd51SIlya Dryomov
200022e8bd51SIlya Dryomov dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
200122e8bd51SIlya Dryomov osd_req->r_result, obj_req);
200222e8bd51SIlya Dryomov
200322e8bd51SIlya Dryomov result = rbd_object_map_update_finish(obj_req, osd_req);
200422e8bd51SIlya Dryomov rbd_obj_handle_request(obj_req, result);
200522e8bd51SIlya Dryomov }
200622e8bd51SIlya Dryomov
update_needed(struct rbd_device * rbd_dev,u64 objno,u8 new_state)200722e8bd51SIlya Dryomov static bool update_needed(struct rbd_device *rbd_dev, u64 objno, u8 new_state)
200822e8bd51SIlya Dryomov {
200922e8bd51SIlya Dryomov u8 state = rbd_object_map_get(rbd_dev, objno);
201022e8bd51SIlya Dryomov
201122e8bd51SIlya Dryomov if (state == new_state ||
201222e8bd51SIlya Dryomov (new_state == OBJECT_PENDING && state == OBJECT_NONEXISTENT) ||
201322e8bd51SIlya Dryomov (new_state == OBJECT_NONEXISTENT && state != OBJECT_PENDING))
201422e8bd51SIlya Dryomov return false;
201522e8bd51SIlya Dryomov
201622e8bd51SIlya Dryomov return true;
201722e8bd51SIlya Dryomov }
201822e8bd51SIlya Dryomov
/*
 * Set up op @which of @req as an "rbd" class "object_map_update" call
 * covering the single object @objno.
 *
 * Request payload, encoded into a freshly allocated page:
 *   le64 start object number (objno)
 *   le64 end object number   (objno + 1)
 *   u8   new_state
 *   u8   1 if current_state follows, 0 otherwise
 *   u8   current_state (only when @current_state is not NULL)
 *
 * rbd_object_map_update_finish() decodes this same payload on
 * completion.
 *
 * Returns 0 on success or a negative error code.
 */
static int rbd_cls_object_map_update(struct ceph_osd_request *req,
				     int which, u64 objno, u8 new_state,
				     const u8 *current_state)
{
	struct page **pages;
	void *start;
	void *p;
	int ret;

	ret = osd_req_op_cls_init(req, which, "rbd", "object_map_update");
	if (ret)
		return ret;

	pages = ceph_alloc_page_vector(1, GFP_NOIO);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	start = page_address(pages[0]);
	p = start;

	ceph_encode_64(&p, objno);
	ceph_encode_64(&p, objno + 1);
	ceph_encode_8(&p, new_state);
	ceph_encode_8(&p, current_state ? 1 : 0);
	if (current_state)
		ceph_encode_8(&p, *current_state);

	osd_req_op_cls_request_data_pages(req, which, pages, p - start, 0,
					  false, true);
	return 0;
}
205022e8bd51SIlya Dryomov
205122e8bd51SIlya Dryomov /*
205222e8bd51SIlya Dryomov * Return:
205322e8bd51SIlya Dryomov * 0 - object map update sent
205422e8bd51SIlya Dryomov * 1 - object map update isn't needed
205522e8bd51SIlya Dryomov * <0 - error
205622e8bd51SIlya Dryomov */
rbd_object_map_update(struct rbd_obj_request * obj_req,u64 snap_id,u8 new_state,const u8 * current_state)205722e8bd51SIlya Dryomov static int rbd_object_map_update(struct rbd_obj_request *obj_req, u64 snap_id,
205822e8bd51SIlya Dryomov u8 new_state, const u8 *current_state)
205922e8bd51SIlya Dryomov {
206022e8bd51SIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
206122e8bd51SIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
206222e8bd51SIlya Dryomov struct ceph_osd_request *req;
206322e8bd51SIlya Dryomov int num_ops = 1;
206422e8bd51SIlya Dryomov int which = 0;
206522e8bd51SIlya Dryomov int ret;
206622e8bd51SIlya Dryomov
206722e8bd51SIlya Dryomov if (snap_id == CEPH_NOSNAP) {
206822e8bd51SIlya Dryomov if (!update_needed(rbd_dev, obj_req->ex.oe_objno, new_state))
206922e8bd51SIlya Dryomov return 1;
207022e8bd51SIlya Dryomov
207122e8bd51SIlya Dryomov num_ops++; /* assert_locked */
207222e8bd51SIlya Dryomov }
207322e8bd51SIlya Dryomov
207422e8bd51SIlya Dryomov req = ceph_osdc_alloc_request(osdc, NULL, num_ops, false, GFP_NOIO);
207522e8bd51SIlya Dryomov if (!req)
207622e8bd51SIlya Dryomov return -ENOMEM;
207722e8bd51SIlya Dryomov
207822e8bd51SIlya Dryomov list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
207922e8bd51SIlya Dryomov req->r_callback = rbd_object_map_callback;
208022e8bd51SIlya Dryomov req->r_priv = obj_req;
208122e8bd51SIlya Dryomov
208222e8bd51SIlya Dryomov rbd_object_map_name(rbd_dev, snap_id, &req->r_base_oid);
208322e8bd51SIlya Dryomov ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
208422e8bd51SIlya Dryomov req->r_flags = CEPH_OSD_FLAG_WRITE;
208522e8bd51SIlya Dryomov ktime_get_real_ts64(&req->r_mtime);
208622e8bd51SIlya Dryomov
208722e8bd51SIlya Dryomov if (snap_id == CEPH_NOSNAP) {
208822e8bd51SIlya Dryomov /*
208922e8bd51SIlya Dryomov * Protect against possible race conditions during lock
209022e8bd51SIlya Dryomov * ownership transitions.
209122e8bd51SIlya Dryomov */
209222e8bd51SIlya Dryomov ret = ceph_cls_assert_locked(req, which++, RBD_LOCK_NAME,
209322e8bd51SIlya Dryomov CEPH_CLS_LOCK_EXCLUSIVE, "", "");
209422e8bd51SIlya Dryomov if (ret)
209522e8bd51SIlya Dryomov return ret;
209622e8bd51SIlya Dryomov }
209722e8bd51SIlya Dryomov
209822e8bd51SIlya Dryomov ret = rbd_cls_object_map_update(req, which, obj_req->ex.oe_objno,
209922e8bd51SIlya Dryomov new_state, current_state);
210022e8bd51SIlya Dryomov if (ret)
210122e8bd51SIlya Dryomov return ret;
210222e8bd51SIlya Dryomov
210322e8bd51SIlya Dryomov ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
210422e8bd51SIlya Dryomov if (ret)
210522e8bd51SIlya Dryomov return ret;
210622e8bd51SIlya Dryomov
2107a8af0d68SJeff Layton ceph_osdc_start_request(osdc, req);
210822e8bd51SIlya Dryomov return 0;
210922e8bd51SIlya Dryomov }
211022e8bd51SIlya Dryomov
/*
 * Clip an array of image extents to @overlap bytes.
 *
 * Trailing extents that start at or beyond @overlap are dropped, and
 * the last remaining extent is shortened if it crosses @overlap.  The
 * resulting count is written back through @num_img_extents.
 */
static void prune_extents(struct ceph_file_extent *img_extents,
			  u32 *num_img_extents, u64 overlap)
{
	struct ceph_file_extent *last;
	u32 n;

	/* drop extents completely beyond the overlap */
	for (n = *num_img_extents; n > 0; n--) {
		if (img_extents[n - 1].fe_off < overlap)
			break;
	}

	/* trim final overlapping extent */
	if (n > 0) {
		last = &img_extents[n - 1];
		if (last->fe_off + last->fe_len > overlap)
			last->fe_len = overlap - last->fe_off;
	}

	*num_img_extents = n;
}
213086bd7998SIlya Dryomov
213186bd7998SIlya Dryomov /*
213286bd7998SIlya Dryomov * Determine the byte range(s) covered by either just the object extent
213386bd7998SIlya Dryomov * or the entire object in the parent image.
213486bd7998SIlya Dryomov */
rbd_obj_calc_img_extents(struct rbd_obj_request * obj_req,bool entire)213586bd7998SIlya Dryomov static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req,
213686bd7998SIlya Dryomov bool entire)
2137e93f3152SAlex Elder {
213886bd7998SIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2139c5b5ef6cSAlex Elder int ret;
2140c5b5ef6cSAlex Elder
214186bd7998SIlya Dryomov if (!rbd_dev->parent_overlap)
214286bd7998SIlya Dryomov return 0;
214386bd7998SIlya Dryomov
214486bd7998SIlya Dryomov ret = ceph_extent_to_file(&rbd_dev->layout, obj_req->ex.oe_objno,
214586bd7998SIlya Dryomov entire ? 0 : obj_req->ex.oe_off,
214686bd7998SIlya Dryomov entire ? rbd_dev->layout.object_size :
214786bd7998SIlya Dryomov obj_req->ex.oe_len,
214886bd7998SIlya Dryomov &obj_req->img_extents,
214986bd7998SIlya Dryomov &obj_req->num_img_extents);
215086bd7998SIlya Dryomov if (ret)
215186bd7998SIlya Dryomov return ret;
215286bd7998SIlya Dryomov
215386bd7998SIlya Dryomov prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
215486bd7998SIlya Dryomov rbd_dev->parent_overlap);
215586bd7998SIlya Dryomov return 0;
215686bd7998SIlya Dryomov }
215786bd7998SIlya Dryomov
rbd_osd_setup_data(struct ceph_osd_request * osd_req,int which)2158bcbab1dbSIlya Dryomov static void rbd_osd_setup_data(struct ceph_osd_request *osd_req, int which)
21593da691bfSIlya Dryomov {
2160bcbab1dbSIlya Dryomov struct rbd_obj_request *obj_req = osd_req->r_priv;
2161bcbab1dbSIlya Dryomov
2162ecc633caSIlya Dryomov switch (obj_req->img_request->data_type) {
21633da691bfSIlya Dryomov case OBJ_REQUEST_BIO:
2164bcbab1dbSIlya Dryomov osd_req_op_extent_osd_data_bio(osd_req, which,
21653da691bfSIlya Dryomov &obj_req->bio_pos,
216643df3d35SIlya Dryomov obj_req->ex.oe_len);
21673da691bfSIlya Dryomov break;
21683da691bfSIlya Dryomov case OBJ_REQUEST_BVECS:
2169afb97888SIlya Dryomov case OBJ_REQUEST_OWN_BVECS:
21703da691bfSIlya Dryomov rbd_assert(obj_req->bvec_pos.iter.bi_size ==
217143df3d35SIlya Dryomov obj_req->ex.oe_len);
2172afb97888SIlya Dryomov rbd_assert(obj_req->bvec_idx == obj_req->bvec_count);
2173bcbab1dbSIlya Dryomov osd_req_op_extent_osd_data_bvec_pos(osd_req, which,
21743da691bfSIlya Dryomov &obj_req->bvec_pos);
21753da691bfSIlya Dryomov break;
21763da691bfSIlya Dryomov default:
217716809372SArnd Bergmann BUG();
21783da691bfSIlya Dryomov }
21793da691bfSIlya Dryomov }
21803da691bfSIlya Dryomov
rbd_osd_setup_stat(struct ceph_osd_request * osd_req,int which)2181bcbab1dbSIlya Dryomov static int rbd_osd_setup_stat(struct ceph_osd_request *osd_req, int which)
21823da691bfSIlya Dryomov {
21833da691bfSIlya Dryomov struct page **pages;
21843da691bfSIlya Dryomov
2185c5b5ef6cSAlex Elder /*
2186c5b5ef6cSAlex Elder * The response data for a STAT call consists of:
2187c5b5ef6cSAlex Elder * le64 length;
2188c5b5ef6cSAlex Elder * struct {
2189c5b5ef6cSAlex Elder * le32 tv_sec;
2190c5b5ef6cSAlex Elder * le32 tv_nsec;
2191c5b5ef6cSAlex Elder * } mtime;
2192c5b5ef6cSAlex Elder */
21933da691bfSIlya Dryomov pages = ceph_alloc_page_vector(1, GFP_NOIO);
21943da691bfSIlya Dryomov if (IS_ERR(pages))
21953da691bfSIlya Dryomov return PTR_ERR(pages);
21963da691bfSIlya Dryomov
2197bcbab1dbSIlya Dryomov osd_req_op_init(osd_req, which, CEPH_OSD_OP_STAT, 0);
2198bcbab1dbSIlya Dryomov osd_req_op_raw_data_in_pages(osd_req, which, pages,
21993da691bfSIlya Dryomov 8 + sizeof(struct ceph_timespec),
22003da691bfSIlya Dryomov 0, false, true);
22013da691bfSIlya Dryomov return 0;
2202710214e3SIlya Dryomov }
2203c5b5ef6cSAlex Elder
/*
 * Set up op @which of @osd_req as an "rbd" class "copyup" call whose
 * request data is the object request's copyup bio_vec array, limited
 * to @bytes.
 *
 * Returns 0 on success or the error from osd_req_op_cls_init().
 */
static int rbd_osd_setup_copyup(struct ceph_osd_request *osd_req, int which,
				u32 bytes)
{
	struct rbd_obj_request *obj_req = osd_req->r_priv;
	int ret = osd_req_op_cls_init(osd_req, which, "rbd", "copyup");

	if (ret)
		return ret;

	osd_req_op_cls_request_data_bvecs(osd_req, which,
					  obj_req->copyup_bvecs,
					  obj_req->copyup_bvec_count, bytes);
	return 0;
}
221813488d53SIlya Dryomov
rbd_obj_init_read(struct rbd_obj_request * obj_req)2219ea9b743cSIlya Dryomov static int rbd_obj_init_read(struct rbd_obj_request *obj_req)
22203da691bfSIlya Dryomov {
2221ea9b743cSIlya Dryomov obj_req->read_state = RBD_OBJ_READ_START;
2222ea9b743cSIlya Dryomov return 0;
2223ea9b743cSIlya Dryomov }
2224ea9b743cSIlya Dryomov
__rbd_osd_setup_write_ops(struct ceph_osd_request * osd_req,int which)2225bcbab1dbSIlya Dryomov static void __rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
2226bcbab1dbSIlya Dryomov int which)
22273da691bfSIlya Dryomov {
2228bcbab1dbSIlya Dryomov struct rbd_obj_request *obj_req = osd_req->r_priv;
22293da691bfSIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
22303da691bfSIlya Dryomov u16 opcode;
2231c5b5ef6cSAlex Elder
22328b5bec5cSIlya Dryomov if (!use_object_map(rbd_dev) ||
22338b5bec5cSIlya Dryomov !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST)) {
2234bcbab1dbSIlya Dryomov osd_req_op_alloc_hint_init(osd_req, which++,
22353da691bfSIlya Dryomov rbd_dev->layout.object_size,
2236d3798accSIlya Dryomov rbd_dev->layout.object_size,
2237dc1dad8eSIlya Dryomov rbd_dev->opts->alloc_hint_flags);
22388b5bec5cSIlya Dryomov }
2239c5b5ef6cSAlex Elder
22403da691bfSIlya Dryomov if (rbd_obj_is_entire(obj_req))
22413da691bfSIlya Dryomov opcode = CEPH_OSD_OP_WRITEFULL;
22423da691bfSIlya Dryomov else
22433da691bfSIlya Dryomov opcode = CEPH_OSD_OP_WRITE;
2244c5b5ef6cSAlex Elder
2245bcbab1dbSIlya Dryomov osd_req_op_extent_init(osd_req, which, opcode,
224643df3d35SIlya Dryomov obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
2247bcbab1dbSIlya Dryomov rbd_osd_setup_data(osd_req, which);
22483da691bfSIlya Dryomov }
22493da691bfSIlya Dryomov
rbd_obj_init_write(struct rbd_obj_request * obj_req)2250ea9b743cSIlya Dryomov static int rbd_obj_init_write(struct rbd_obj_request *obj_req)
22513da691bfSIlya Dryomov {
22523da691bfSIlya Dryomov int ret;
22533da691bfSIlya Dryomov
225486bd7998SIlya Dryomov /* reverse map the entire object onto the parent */
225586bd7998SIlya Dryomov ret = rbd_obj_calc_img_extents(obj_req, true);
225686bd7998SIlya Dryomov if (ret)
225786bd7998SIlya Dryomov return ret;
225886bd7998SIlya Dryomov
225985b5e6d1SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_START;
22603da691bfSIlya Dryomov return 0;
226170d045f6SIlya Dryomov }
226270d045f6SIlya Dryomov
truncate_or_zero_opcode(struct rbd_obj_request * obj_req)22636484cbe9SIlya Dryomov static u16 truncate_or_zero_opcode(struct rbd_obj_request *obj_req)
22646484cbe9SIlya Dryomov {
22656484cbe9SIlya Dryomov return rbd_obj_is_tail(obj_req) ? CEPH_OSD_OP_TRUNCATE :
22666484cbe9SIlya Dryomov CEPH_OSD_OP_ZERO;
22676484cbe9SIlya Dryomov }
22686484cbe9SIlya Dryomov
__rbd_osd_setup_discard_ops(struct ceph_osd_request * osd_req,int which)226927bbd911SIlya Dryomov static void __rbd_osd_setup_discard_ops(struct ceph_osd_request *osd_req,
227027bbd911SIlya Dryomov int which)
227127bbd911SIlya Dryomov {
227227bbd911SIlya Dryomov struct rbd_obj_request *obj_req = osd_req->r_priv;
227327bbd911SIlya Dryomov
227427bbd911SIlya Dryomov if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) {
227527bbd911SIlya Dryomov rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
227627bbd911SIlya Dryomov osd_req_op_init(osd_req, which, CEPH_OSD_OP_DELETE, 0);
227727bbd911SIlya Dryomov } else {
227827bbd911SIlya Dryomov osd_req_op_extent_init(osd_req, which,
227927bbd911SIlya Dryomov truncate_or_zero_opcode(obj_req),
228027bbd911SIlya Dryomov obj_req->ex.oe_off, obj_req->ex.oe_len,
228127bbd911SIlya Dryomov 0, 0);
228227bbd911SIlya Dryomov }
228327bbd911SIlya Dryomov }
228427bbd911SIlya Dryomov
rbd_obj_init_discard(struct rbd_obj_request * obj_req)2285ea9b743cSIlya Dryomov static int rbd_obj_init_discard(struct rbd_obj_request *obj_req)
22866484cbe9SIlya Dryomov {
22870c93e1b7SIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
228827bbd911SIlya Dryomov u64 off, next_off;
22896484cbe9SIlya Dryomov int ret;
22906484cbe9SIlya Dryomov
22910c93e1b7SIlya Dryomov /*
22920c93e1b7SIlya Dryomov * Align the range to alloc_size boundary and punt on discards
22930c93e1b7SIlya Dryomov * that are too small to free up any space.
22940c93e1b7SIlya Dryomov *
22950c93e1b7SIlya Dryomov * alloc_size == object_size && is_tail() is a special case for
22960c93e1b7SIlya Dryomov * filestore with filestore_punch_hole = false, needed to allow
22970c93e1b7SIlya Dryomov * truncate (in addition to delete).
22980c93e1b7SIlya Dryomov */
22990c93e1b7SIlya Dryomov if (rbd_dev->opts->alloc_size != rbd_dev->layout.object_size ||
23000c93e1b7SIlya Dryomov !rbd_obj_is_tail(obj_req)) {
230127bbd911SIlya Dryomov off = round_up(obj_req->ex.oe_off, rbd_dev->opts->alloc_size);
230227bbd911SIlya Dryomov next_off = round_down(obj_req->ex.oe_off + obj_req->ex.oe_len,
230327bbd911SIlya Dryomov rbd_dev->opts->alloc_size);
23040c93e1b7SIlya Dryomov if (off >= next_off)
23050c93e1b7SIlya Dryomov return 1;
230627bbd911SIlya Dryomov
230727bbd911SIlya Dryomov dout("%s %p %llu~%llu -> %llu~%llu\n", __func__,
230827bbd911SIlya Dryomov obj_req, obj_req->ex.oe_off, obj_req->ex.oe_len,
230927bbd911SIlya Dryomov off, next_off - off);
231027bbd911SIlya Dryomov obj_req->ex.oe_off = off;
231127bbd911SIlya Dryomov obj_req->ex.oe_len = next_off - off;
23120c93e1b7SIlya Dryomov }
23130c93e1b7SIlya Dryomov
23146484cbe9SIlya Dryomov /* reverse map the entire object onto the parent */
23156484cbe9SIlya Dryomov ret = rbd_obj_calc_img_extents(obj_req, true);
23166484cbe9SIlya Dryomov if (ret)
23176484cbe9SIlya Dryomov return ret;
23186484cbe9SIlya Dryomov
231922e8bd51SIlya Dryomov obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
23200ad5d953SIlya Dryomov if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents)
23210ad5d953SIlya Dryomov obj_req->flags |= RBD_OBJ_FLAG_DELETION;
23226484cbe9SIlya Dryomov
232385b5e6d1SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_START;
23246484cbe9SIlya Dryomov return 0;
23256484cbe9SIlya Dryomov }
23266484cbe9SIlya Dryomov
__rbd_osd_setup_zeroout_ops(struct ceph_osd_request * osd_req,int which)2327bcbab1dbSIlya Dryomov static void __rbd_osd_setup_zeroout_ops(struct ceph_osd_request *osd_req,
2328bcbab1dbSIlya Dryomov int which)
232913488d53SIlya Dryomov {
2330bcbab1dbSIlya Dryomov struct rbd_obj_request *obj_req = osd_req->r_priv;
23313da691bfSIlya Dryomov u16 opcode;
2332058aa991SIlya Dryomov
23333da691bfSIlya Dryomov if (rbd_obj_is_entire(obj_req)) {
233486bd7998SIlya Dryomov if (obj_req->num_img_extents) {
23350ad5d953SIlya Dryomov if (!(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
2336bcbab1dbSIlya Dryomov osd_req_op_init(osd_req, which++,
23372bb1e56eSIlya Dryomov CEPH_OSD_OP_CREATE, 0);
23383da691bfSIlya Dryomov opcode = CEPH_OSD_OP_TRUNCATE;
23393da691bfSIlya Dryomov } else {
23400ad5d953SIlya Dryomov rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
2341bcbab1dbSIlya Dryomov osd_req_op_init(osd_req, which++,
23423da691bfSIlya Dryomov CEPH_OSD_OP_DELETE, 0);
23433da691bfSIlya Dryomov opcode = 0;
23443da691bfSIlya Dryomov }
23453da691bfSIlya Dryomov } else {
23466484cbe9SIlya Dryomov opcode = truncate_or_zero_opcode(obj_req);
23473da691bfSIlya Dryomov }
23483da691bfSIlya Dryomov
23493da691bfSIlya Dryomov if (opcode)
2350bcbab1dbSIlya Dryomov osd_req_op_extent_init(osd_req, which, opcode,
235143df3d35SIlya Dryomov obj_req->ex.oe_off, obj_req->ex.oe_len,
23523da691bfSIlya Dryomov 0, 0);
23533da691bfSIlya Dryomov }
23543da691bfSIlya Dryomov
rbd_obj_init_zeroout(struct rbd_obj_request * obj_req)2355ea9b743cSIlya Dryomov static int rbd_obj_init_zeroout(struct rbd_obj_request *obj_req)
23563da691bfSIlya Dryomov {
23573da691bfSIlya Dryomov int ret;
23583da691bfSIlya Dryomov
235986bd7998SIlya Dryomov /* reverse map the entire object onto the parent */
236086bd7998SIlya Dryomov ret = rbd_obj_calc_img_extents(obj_req, true);
236186bd7998SIlya Dryomov if (ret)
236286bd7998SIlya Dryomov return ret;
236386bd7998SIlya Dryomov
23640ad5d953SIlya Dryomov if (!obj_req->num_img_extents) {
236522e8bd51SIlya Dryomov obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
23660ad5d953SIlya Dryomov if (rbd_obj_is_entire(obj_req))
23670ad5d953SIlya Dryomov obj_req->flags |= RBD_OBJ_FLAG_DELETION;
23683da691bfSIlya Dryomov }
23693da691bfSIlya Dryomov
237085b5e6d1SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_START;
2371980917fcSIlya Dryomov return 0;
2372b454e36dSAlex Elder }
2373b454e36dSAlex Elder
count_write_ops(struct rbd_obj_request * obj_req)2374a086a1b8SIlya Dryomov static int count_write_ops(struct rbd_obj_request *obj_req)
2375a086a1b8SIlya Dryomov {
23768b5bec5cSIlya Dryomov struct rbd_img_request *img_req = obj_req->img_request;
23778b5bec5cSIlya Dryomov
23788b5bec5cSIlya Dryomov switch (img_req->op_type) {
2379a086a1b8SIlya Dryomov case OBJ_OP_WRITE:
23808b5bec5cSIlya Dryomov if (!use_object_map(img_req->rbd_dev) ||
23818b5bec5cSIlya Dryomov !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST))
2382a086a1b8SIlya Dryomov return 2; /* setallochint + write/writefull */
23838b5bec5cSIlya Dryomov
23848b5bec5cSIlya Dryomov return 1; /* write/writefull */
2385a086a1b8SIlya Dryomov case OBJ_OP_DISCARD:
2386a086a1b8SIlya Dryomov return 1; /* delete/truncate/zero */
2387a086a1b8SIlya Dryomov case OBJ_OP_ZEROOUT:
2388a086a1b8SIlya Dryomov if (rbd_obj_is_entire(obj_req) && obj_req->num_img_extents &&
2389a086a1b8SIlya Dryomov !(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
2390a086a1b8SIlya Dryomov return 2; /* create + truncate */
2391a086a1b8SIlya Dryomov
2392a086a1b8SIlya Dryomov return 1; /* delete/truncate/zero */
2393a086a1b8SIlya Dryomov default:
2394a086a1b8SIlya Dryomov BUG();
2395a086a1b8SIlya Dryomov }
2396a086a1b8SIlya Dryomov }
2397a086a1b8SIlya Dryomov
rbd_osd_setup_write_ops(struct ceph_osd_request * osd_req,int which)2398a086a1b8SIlya Dryomov static void rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
2399a086a1b8SIlya Dryomov int which)
2400a086a1b8SIlya Dryomov {
2401a086a1b8SIlya Dryomov struct rbd_obj_request *obj_req = osd_req->r_priv;
2402a086a1b8SIlya Dryomov
2403a086a1b8SIlya Dryomov switch (obj_req->img_request->op_type) {
2404a086a1b8SIlya Dryomov case OBJ_OP_WRITE:
2405a086a1b8SIlya Dryomov __rbd_osd_setup_write_ops(osd_req, which);
2406a086a1b8SIlya Dryomov break;
2407a086a1b8SIlya Dryomov case OBJ_OP_DISCARD:
2408a086a1b8SIlya Dryomov __rbd_osd_setup_discard_ops(osd_req, which);
2409a086a1b8SIlya Dryomov break;
2410a086a1b8SIlya Dryomov case OBJ_OP_ZEROOUT:
2411a086a1b8SIlya Dryomov __rbd_osd_setup_zeroout_ops(osd_req, which);
2412a086a1b8SIlya Dryomov break;
2413a086a1b8SIlya Dryomov default:
2414a086a1b8SIlya Dryomov BUG();
2415a086a1b8SIlya Dryomov }
2416a086a1b8SIlya Dryomov }
2417a086a1b8SIlya Dryomov
2418b454e36dSAlex Elder /*
2419a086a1b8SIlya Dryomov * Prune the list of object requests (adjust offset and/or length, drop
2420a086a1b8SIlya Dryomov * redundant requests). Prepare object request state machines and image
2421a086a1b8SIlya Dryomov * request state machine for execution.
2422b454e36dSAlex Elder */
__rbd_img_fill_request(struct rbd_img_request * img_req)24233da691bfSIlya Dryomov static int __rbd_img_fill_request(struct rbd_img_request *img_req)
24243da691bfSIlya Dryomov {
24250c93e1b7SIlya Dryomov struct rbd_obj_request *obj_req, *next_obj_req;
24263da691bfSIlya Dryomov int ret;
24273d7efd18SAlex Elder
24280c93e1b7SIlya Dryomov for_each_obj_request_safe(img_req, obj_req, next_obj_req) {
24299bb0248dSIlya Dryomov switch (img_req->op_type) {
24303da691bfSIlya Dryomov case OBJ_OP_READ:
2431ea9b743cSIlya Dryomov ret = rbd_obj_init_read(obj_req);
24323da691bfSIlya Dryomov break;
24333da691bfSIlya Dryomov case OBJ_OP_WRITE:
2434ea9b743cSIlya Dryomov ret = rbd_obj_init_write(obj_req);
24353da691bfSIlya Dryomov break;
24363da691bfSIlya Dryomov case OBJ_OP_DISCARD:
2437ea9b743cSIlya Dryomov ret = rbd_obj_init_discard(obj_req);
24383da691bfSIlya Dryomov break;
24396484cbe9SIlya Dryomov case OBJ_OP_ZEROOUT:
2440ea9b743cSIlya Dryomov ret = rbd_obj_init_zeroout(obj_req);
24416484cbe9SIlya Dryomov break;
24423da691bfSIlya Dryomov default:
244316809372SArnd Bergmann BUG();
24443da691bfSIlya Dryomov }
24450c93e1b7SIlya Dryomov if (ret < 0)
24463da691bfSIlya Dryomov return ret;
24470c93e1b7SIlya Dryomov if (ret > 0) {
24480c93e1b7SIlya Dryomov rbd_img_obj_request_del(img_req, obj_req);
24490c93e1b7SIlya Dryomov continue;
24500c93e1b7SIlya Dryomov }
2451b454e36dSAlex Elder }
2452b454e36dSAlex Elder
24530192ce2eSIlya Dryomov img_req->state = RBD_IMG_START;
24543da691bfSIlya Dryomov return 0;
24553da691bfSIlya Dryomov }
24563da691bfSIlya Dryomov
24575a237819SIlya Dryomov union rbd_img_fill_iter {
24585a237819SIlya Dryomov struct ceph_bio_iter bio_iter;
24595a237819SIlya Dryomov struct ceph_bvec_iter bvec_iter;
24605a237819SIlya Dryomov };
24615a237819SIlya Dryomov
24625a237819SIlya Dryomov struct rbd_img_fill_ctx {
24635a237819SIlya Dryomov enum obj_request_type pos_type;
24645a237819SIlya Dryomov union rbd_img_fill_iter *pos;
24655a237819SIlya Dryomov union rbd_img_fill_iter iter;
24665a237819SIlya Dryomov ceph_object_extent_fn_t set_pos_fn;
2467afb97888SIlya Dryomov ceph_object_extent_fn_t count_fn;
2468afb97888SIlya Dryomov ceph_object_extent_fn_t copy_fn;
24695a237819SIlya Dryomov };
24705a237819SIlya Dryomov
alloc_object_extent(void * arg)24715a237819SIlya Dryomov static struct ceph_object_extent *alloc_object_extent(void *arg)
24725a237819SIlya Dryomov {
24735a237819SIlya Dryomov struct rbd_img_request *img_req = arg;
24745a237819SIlya Dryomov struct rbd_obj_request *obj_req;
24755a237819SIlya Dryomov
24765a237819SIlya Dryomov obj_req = rbd_obj_request_create();
24775a237819SIlya Dryomov if (!obj_req)
24785a237819SIlya Dryomov return NULL;
24795a237819SIlya Dryomov
24805a237819SIlya Dryomov rbd_img_obj_request_add(img_req, obj_req);
24815a237819SIlya Dryomov return &obj_req->ex;
24825a237819SIlya Dryomov }
24835a237819SIlya Dryomov
24845a237819SIlya Dryomov /*
2485afb97888SIlya Dryomov * While su != os && sc == 1 is technically not fancy (it's the same
2486afb97888SIlya Dryomov * layout as su == os && sc == 1), we can't use the nocopy path for it
2487afb97888SIlya Dryomov * because ->set_pos_fn() should be called only once per object.
2488afb97888SIlya Dryomov * ceph_file_to_extents() invokes action_fn once per stripe unit, so
2489afb97888SIlya Dryomov * treat su != os && sc == 1 as fancy.
24905a237819SIlya Dryomov */
rbd_layout_is_fancy(struct ceph_file_layout * l)2491afb97888SIlya Dryomov static bool rbd_layout_is_fancy(struct ceph_file_layout *l)
2492afb97888SIlya Dryomov {
2493afb97888SIlya Dryomov return l->stripe_unit != l->object_size;
2494afb97888SIlya Dryomov }
2495afb97888SIlya Dryomov
/*
 * Fill @img_req without copying bio_vecs: every object request simply
 * records where its data starts in the caller's buffer.
 *
 * Returns the first non-zero value from ceph_file_to_extents(),
 * otherwise whatever __rbd_img_fill_request() returns.
 */
static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req,
				       struct ceph_file_extent *img_extents,
				       u32 num_img_extents,
				       struct rbd_img_fill_ctx *fctx)
{
	struct ceph_file_layout *layout = &img_req->rbd_dev->layout;
	u32 idx = 0;

	img_req->data_type = fctx->pos_type;

	/*
	 * Walk the image extents, creating object requests as we go and
	 * letting ->set_pos_fn() note each one's starting position in
	 * the supplied bio (list) or bio_vec array.
	 */
	fctx->iter = *fctx->pos;
	while (idx < num_img_extents) {
		struct ceph_file_extent *fe = &img_extents[idx++];
		int err;

		err = ceph_file_to_extents(layout, fe->fe_off, fe->fe_len,
					   &img_req->object_extents,
					   alloc_object_extent, img_req,
					   fctx->set_pos_fn, &fctx->iter);
		if (err)
			return err;
	}

	return __rbd_img_fill_request(img_req);
}
25245a237819SIlya Dryomov
2525afb97888SIlya Dryomov /*
2526afb97888SIlya Dryomov * Map a list of image extents to a list of object extents, create the
2527afb97888SIlya Dryomov * corresponding object requests (normally each to a different object,
2528afb97888SIlya Dryomov * but not always) and add them to @img_req. For each object request,
2529afb97888SIlya Dryomov * set up its data descriptor to point to the corresponding chunk(s) of
2530afb97888SIlya Dryomov * @fctx->pos data buffer.
2531afb97888SIlya Dryomov *
2532afb97888SIlya Dryomov * Because ceph_file_to_extents() will merge adjacent object extents
2533afb97888SIlya Dryomov * together, each object request's data descriptor may point to multiple
2534afb97888SIlya Dryomov * different chunks of @fctx->pos data buffer.
2535afb97888SIlya Dryomov *
2536afb97888SIlya Dryomov * @fctx->pos data buffer is assumed to be large enough.
2537afb97888SIlya Dryomov */
rbd_img_fill_request(struct rbd_img_request * img_req,struct ceph_file_extent * img_extents,u32 num_img_extents,struct rbd_img_fill_ctx * fctx)2538afb97888SIlya Dryomov static int rbd_img_fill_request(struct rbd_img_request *img_req,
2539afb97888SIlya Dryomov struct ceph_file_extent *img_extents,
2540afb97888SIlya Dryomov u32 num_img_extents,
2541afb97888SIlya Dryomov struct rbd_img_fill_ctx *fctx)
2542afb97888SIlya Dryomov {
2543afb97888SIlya Dryomov struct rbd_device *rbd_dev = img_req->rbd_dev;
2544afb97888SIlya Dryomov struct rbd_obj_request *obj_req;
2545afb97888SIlya Dryomov u32 i;
2546afb97888SIlya Dryomov int ret;
2547afb97888SIlya Dryomov
2548afb97888SIlya Dryomov if (fctx->pos_type == OBJ_REQUEST_NODATA ||
2549afb97888SIlya Dryomov !rbd_layout_is_fancy(&rbd_dev->layout))
2550afb97888SIlya Dryomov return rbd_img_fill_request_nocopy(img_req, img_extents,
2551afb97888SIlya Dryomov num_img_extents, fctx);
2552afb97888SIlya Dryomov
2553afb97888SIlya Dryomov img_req->data_type = OBJ_REQUEST_OWN_BVECS;
2554afb97888SIlya Dryomov
2555afb97888SIlya Dryomov /*
2556afb97888SIlya Dryomov * Create object requests and determine ->bvec_count for each object
2557afb97888SIlya Dryomov * request. Note that ->bvec_count sum over all object requests may
2558afb97888SIlya Dryomov * be greater than the number of bio_vecs in the provided bio (list)
2559afb97888SIlya Dryomov * or bio_vec array because when mapped, those bio_vecs can straddle
2560afb97888SIlya Dryomov * stripe unit boundaries.
2561afb97888SIlya Dryomov */
2562afb97888SIlya Dryomov fctx->iter = *fctx->pos;
2563afb97888SIlya Dryomov for (i = 0; i < num_img_extents; i++) {
2564afb97888SIlya Dryomov ret = ceph_file_to_extents(&rbd_dev->layout,
2565afb97888SIlya Dryomov img_extents[i].fe_off,
2566afb97888SIlya Dryomov img_extents[i].fe_len,
2567afb97888SIlya Dryomov &img_req->object_extents,
2568afb97888SIlya Dryomov alloc_object_extent, img_req,
2569afb97888SIlya Dryomov fctx->count_fn, &fctx->iter);
2570afb97888SIlya Dryomov if (ret)
2571afb97888SIlya Dryomov return ret;
2572afb97888SIlya Dryomov }
2573afb97888SIlya Dryomov
2574afb97888SIlya Dryomov for_each_obj_request(img_req, obj_req) {
2575afb97888SIlya Dryomov obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count,
2576afb97888SIlya Dryomov sizeof(*obj_req->bvec_pos.bvecs),
2577afb97888SIlya Dryomov GFP_NOIO);
2578afb97888SIlya Dryomov if (!obj_req->bvec_pos.bvecs)
2579afb97888SIlya Dryomov return -ENOMEM;
2580afb97888SIlya Dryomov }
2581afb97888SIlya Dryomov
2582afb97888SIlya Dryomov /*
2583afb97888SIlya Dryomov * Fill in each object request's private bio_vec array, splitting and
2584afb97888SIlya Dryomov * rearranging the provided bio_vecs in stripe unit chunks as needed.
2585afb97888SIlya Dryomov */
2586afb97888SIlya Dryomov fctx->iter = *fctx->pos;
2587afb97888SIlya Dryomov for (i = 0; i < num_img_extents; i++) {
2588afb97888SIlya Dryomov ret = ceph_iterate_extents(&rbd_dev->layout,
2589afb97888SIlya Dryomov img_extents[i].fe_off,
2590afb97888SIlya Dryomov img_extents[i].fe_len,
2591afb97888SIlya Dryomov &img_req->object_extents,
2592afb97888SIlya Dryomov fctx->copy_fn, &fctx->iter);
2593afb97888SIlya Dryomov if (ret)
2594afb97888SIlya Dryomov return ret;
2595afb97888SIlya Dryomov }
2596afb97888SIlya Dryomov
2597afb97888SIlya Dryomov return __rbd_img_fill_request(img_req);
2598afb97888SIlya Dryomov }
2599afb97888SIlya Dryomov
/*
 * Fill @img_req for the image range [@off, @off + @len) with object
 * requests that carry no data buffer (OBJ_REQUEST_NODATA).
 */
static int rbd_img_fill_nodata(struct rbd_img_request *img_req,
			       u64 off, u64 len)
{
	/* placeholder position: there is no data buffer to walk */
	union rbd_img_fill_iter no_pos = {};
	struct rbd_img_fill_ctx fctx = {
		.pos		= &no_pos,
		.pos_type	= OBJ_REQUEST_NODATA,
	};
	struct ceph_file_extent ex = {
		.fe_off	= off,
		.fe_len	= len,
	};

	return rbd_img_fill_request(img_req, &ex, 1, &fctx);
}
26125a237819SIlya Dryomov
/*
 * ->set_pos_fn() for bio-backed requests: remember the current bio
 * position as the start of this object request's data, then move the
 * shared position (@arg) forward by @bytes.
 */
static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
	struct ceph_bio_iter *bio_it = arg;
	struct rbd_obj_request *obj_req;

	obj_req = container_of(ex, struct rbd_obj_request, ex);

	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
	obj_req->bio_pos = *bio_it;
	ceph_bio_iter_advance(bio_it, bytes);
}
26235a237819SIlya Dryomov
/*
 * ->count_fn() for bio-backed requests: advance the bio position
 * (@arg) by @bytes, bumping the object request's ->bvec_count once for
 * every step the iterator takes.
 */
static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
	struct ceph_bio_iter *bio_it = arg;
	struct rbd_obj_request *obj_req;

	obj_req = container_of(ex, struct rbd_obj_request, ex);

	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
	ceph_bio_iter_advance_step(bio_it, bytes, ({
		obj_req->bvec_count += 1;
	}));
}
2636afb97888SIlya Dryomov
/*
 * ->copy_fn() for bio-backed requests: advance the bio position (@arg)
 * by @bytes, appending each bio_vec produced along the way to the
 * object request's private array and growing its byte count to match.
 */
static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
	struct ceph_bio_iter *bio_it = arg;
	struct rbd_obj_request *obj_req;

	obj_req = container_of(ex, struct rbd_obj_request, ex);

	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
	ceph_bio_iter_advance_step(bio_it, bytes, ({
		/* 'bv' is the current bio_vec supplied by the step macro */
		obj_req->bvec_pos.bvecs[obj_req->bvec_idx] = bv;
		obj_req->bvec_idx++;
		obj_req->bvec_pos.iter.bi_size += bv.bv_len;
	}));
}
2649afb97888SIlya Dryomov
/*
 * Fill @img_req for @img_extents, taking the data from a bio (list)
 * starting at @bio_pos.
 */
static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req,
				   struct ceph_file_extent *img_extents,
				   u32 num_img_extents,
				   struct ceph_bio_iter *bio_pos)
{
	struct rbd_img_fill_ctx fctx = {
		.pos_type	= OBJ_REQUEST_BIO,
		.pos		= (union rbd_img_fill_iter *)bio_pos,
		.set_pos_fn	= set_bio_pos,
		.count_fn	= count_bio_bvecs,
		.copy_fn	= copy_bio_bvecs,
	};
	int err;

	err = rbd_img_fill_request(img_req, img_extents, num_img_extents,
				   &fctx);
	return err;
}
26665a237819SIlya Dryomov
/*
 * Fill @img_req for the single image extent [@off, @off + @len), with
 * data taken from @bio starting at the bio's current iterator.
 */
static int rbd_img_fill_from_bio(struct rbd_img_request *img_req,
				 u64 off, u64 len, struct bio *bio)
{
	struct ceph_bio_iter start = {
		.bio	= bio,
		.iter	= bio->bi_iter,
	};
	struct ceph_file_extent ex = {
		.fe_off	= off,
		.fe_len	= len,
	};

	return __rbd_img_fill_from_bio(img_req, &ex, 1, &start);
}
26755a237819SIlya Dryomov
/*
 * ->set_pos_fn() for bio_vec array backed requests: give the object
 * request a copy of the current position limited to @bytes, then move
 * the shared position (@arg) forward by @bytes.
 */
static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
	struct ceph_bvec_iter *bvec_it = arg;
	struct rbd_obj_request *obj_req;

	obj_req = container_of(ex, struct rbd_obj_request, ex);

	obj_req->bvec_pos = *bvec_it;
	ceph_bvec_iter_shorten(&obj_req->bvec_pos, bytes);
	ceph_bvec_iter_advance(bvec_it, bytes);
}
26865a237819SIlya Dryomov
/*
 * ->count_fn() for bio_vec array backed requests: advance the position
 * (@arg) by @bytes, bumping the object request's ->bvec_count once for
 * every step the iterator takes.
 */
static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
	struct ceph_bvec_iter *bvec_it = arg;
	struct rbd_obj_request *obj_req;

	obj_req = container_of(ex, struct rbd_obj_request, ex);

	ceph_bvec_iter_advance_step(bvec_it, bytes, ({
		obj_req->bvec_count += 1;
	}));
}
2697afb97888SIlya Dryomov
/*
 * ->copy_fn() for bio_vec array backed requests: advance the position
 * (@arg) by @bytes, appending each bio_vec produced along the way to
 * the object request's private array and growing its byte count to
 * match.
 */
static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
	struct ceph_bvec_iter *bvec_it = arg;
	struct rbd_obj_request *obj_req;

	obj_req = container_of(ex, struct rbd_obj_request, ex);

	ceph_bvec_iter_advance_step(bvec_it, bytes, ({
		/* 'bv' is the current bio_vec supplied by the step macro */
		obj_req->bvec_pos.bvecs[obj_req->bvec_idx] = bv;
		obj_req->bvec_idx++;
		obj_req->bvec_pos.iter.bi_size += bv.bv_len;
	}));
}
2709afb97888SIlya Dryomov
/*
 * Fill @img_req for @img_extents, taking the data from a bio_vec array
 * starting at @bvec_pos.
 */
static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
				     struct ceph_file_extent *img_extents,
				     u32 num_img_extents,
				     struct ceph_bvec_iter *bvec_pos)
{
	struct rbd_img_fill_ctx fctx = {
		.pos_type	= OBJ_REQUEST_BVECS,
		.pos		= (union rbd_img_fill_iter *)bvec_pos,
		.set_pos_fn	= set_bvec_pos,
		.count_fn	= count_bvecs,
		.copy_fn	= copy_bvecs,
	};
	int err;

	err = rbd_img_fill_request(img_req, img_extents, num_img_extents,
				   &fctx);
	return err;
}
27265a237819SIlya Dryomov
/*
 * Fill @img_req for @img_extents, taking the data from the start of
 * the @bvecs array.  The iterator is sized to the total number of
 * bytes covered by @img_extents.
 */
static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
				   struct ceph_file_extent *img_extents,
				   u32 num_img_extents,
				   struct bio_vec *bvecs)
{
	struct ceph_bvec_iter start = {
		.bvecs		= bvecs,
		.iter.bi_size	= ceph_file_extents_bytes(img_extents,
							  num_img_extents),
	};

	return __rbd_img_fill_from_bvecs(img_req, img_extents, num_img_extents,
					 &start);
}
27415a237819SIlya Dryomov
rbd_img_handle_request_work(struct work_struct * work)27420192ce2eSIlya Dryomov static void rbd_img_handle_request_work(struct work_struct *work)
2743bf0d5f50SAlex Elder {
27440192ce2eSIlya Dryomov struct rbd_img_request *img_req =
27450192ce2eSIlya Dryomov container_of(work, struct rbd_img_request, work);
2746bf0d5f50SAlex Elder
27470192ce2eSIlya Dryomov rbd_img_handle_request(img_req, img_req->work_result);
27480192ce2eSIlya Dryomov }
2749bf0d5f50SAlex Elder
rbd_img_schedule(struct rbd_img_request * img_req,int result)27500192ce2eSIlya Dryomov static void rbd_img_schedule(struct rbd_img_request *img_req, int result)
27510192ce2eSIlya Dryomov {
27520192ce2eSIlya Dryomov INIT_WORK(&img_req->work, rbd_img_handle_request_work);
27530192ce2eSIlya Dryomov img_req->work_result = result;
27540192ce2eSIlya Dryomov queue_work(rbd_wq, &img_req->work);
2755bf0d5f50SAlex Elder }
2756bf0d5f50SAlex Elder
rbd_obj_may_exist(struct rbd_obj_request * obj_req)275722e8bd51SIlya Dryomov static bool rbd_obj_may_exist(struct rbd_obj_request *obj_req)
275822e8bd51SIlya Dryomov {
275922e8bd51SIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
276022e8bd51SIlya Dryomov
276122e8bd51SIlya Dryomov if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno)) {
276222e8bd51SIlya Dryomov obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;
276322e8bd51SIlya Dryomov return true;
276422e8bd51SIlya Dryomov }
276522e8bd51SIlya Dryomov
276622e8bd51SIlya Dryomov dout("%s %p objno %llu assuming dne\n", __func__, obj_req,
276722e8bd51SIlya Dryomov obj_req->ex.oe_objno);
276822e8bd51SIlya Dryomov return false;
276922e8bd51SIlya Dryomov }
277022e8bd51SIlya Dryomov
rbd_obj_read_object(struct rbd_obj_request * obj_req)277185b5e6d1SIlya Dryomov static int rbd_obj_read_object(struct rbd_obj_request *obj_req)
277285b5e6d1SIlya Dryomov {
2773a086a1b8SIlya Dryomov struct ceph_osd_request *osd_req;
2774a086a1b8SIlya Dryomov int ret;
2775a086a1b8SIlya Dryomov
2776a086a1b8SIlya Dryomov osd_req = __rbd_obj_add_osd_request(obj_req, NULL, 1);
2777a086a1b8SIlya Dryomov if (IS_ERR(osd_req))
2778a086a1b8SIlya Dryomov return PTR_ERR(osd_req);
2779a086a1b8SIlya Dryomov
2780a086a1b8SIlya Dryomov osd_req_op_extent_init(osd_req, 0, CEPH_OSD_OP_READ,
2781a086a1b8SIlya Dryomov obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
2782a086a1b8SIlya Dryomov rbd_osd_setup_data(osd_req, 0);
2783a086a1b8SIlya Dryomov rbd_osd_format_read(osd_req);
2784a086a1b8SIlya Dryomov
2785a086a1b8SIlya Dryomov ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
2786a086a1b8SIlya Dryomov if (ret)
2787a086a1b8SIlya Dryomov return ret;
2788a086a1b8SIlya Dryomov
2789a086a1b8SIlya Dryomov rbd_osd_submit(osd_req);
279085b5e6d1SIlya Dryomov return 0;
2791bf0d5f50SAlex Elder }
2792bf0d5f50SAlex Elder
rbd_obj_read_from_parent(struct rbd_obj_request * obj_req)279386bd7998SIlya Dryomov static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
27943da691bfSIlya Dryomov {
27953da691bfSIlya Dryomov struct rbd_img_request *img_req = obj_req->img_request;
2796a52cc685SIlya Dryomov struct rbd_device *parent = img_req->rbd_dev->parent;
27973da691bfSIlya Dryomov struct rbd_img_request *child_img_req;
27983da691bfSIlya Dryomov int ret;
27993da691bfSIlya Dryomov
280059e542c8SIlya Dryomov child_img_req = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO);
28013da691bfSIlya Dryomov if (!child_img_req)
28023da691bfSIlya Dryomov return -ENOMEM;
28033da691bfSIlya Dryomov
280459e542c8SIlya Dryomov rbd_img_request_init(child_img_req, parent, OBJ_OP_READ);
2805e93aca0aSIlya Dryomov __set_bit(IMG_REQ_CHILD, &child_img_req->flags);
2806e93aca0aSIlya Dryomov child_img_req->obj_request = obj_req;
2807e93aca0aSIlya Dryomov
2808a52cc685SIlya Dryomov down_read(&parent->header_rwsem);
2809a52cc685SIlya Dryomov rbd_img_capture_header(child_img_req);
2810a52cc685SIlya Dryomov up_read(&parent->header_rwsem);
2811a52cc685SIlya Dryomov
281221ed05a8SIlya Dryomov dout("%s child_img_req %p for obj_req %p\n", __func__, child_img_req,
281321ed05a8SIlya Dryomov obj_req);
281421ed05a8SIlya Dryomov
28153da691bfSIlya Dryomov if (!rbd_img_is_write(img_req)) {
2816ecc633caSIlya Dryomov switch (img_req->data_type) {
28173da691bfSIlya Dryomov case OBJ_REQUEST_BIO:
28185a237819SIlya Dryomov ret = __rbd_img_fill_from_bio(child_img_req,
28195a237819SIlya Dryomov obj_req->img_extents,
28205a237819SIlya Dryomov obj_req->num_img_extents,
28213da691bfSIlya Dryomov &obj_req->bio_pos);
28223da691bfSIlya Dryomov break;
28233da691bfSIlya Dryomov case OBJ_REQUEST_BVECS:
2824afb97888SIlya Dryomov case OBJ_REQUEST_OWN_BVECS:
28255a237819SIlya Dryomov ret = __rbd_img_fill_from_bvecs(child_img_req,
28265a237819SIlya Dryomov obj_req->img_extents,
28275a237819SIlya Dryomov obj_req->num_img_extents,
28283da691bfSIlya Dryomov &obj_req->bvec_pos);
28293da691bfSIlya Dryomov break;
28303da691bfSIlya Dryomov default:
2831d342a15bSArnd Bergmann BUG();
28323da691bfSIlya Dryomov }
28333da691bfSIlya Dryomov } else {
28345a237819SIlya Dryomov ret = rbd_img_fill_from_bvecs(child_img_req,
28355a237819SIlya Dryomov obj_req->img_extents,
28365a237819SIlya Dryomov obj_req->num_img_extents,
28375a237819SIlya Dryomov obj_req->copyup_bvecs);
28383da691bfSIlya Dryomov }
28393da691bfSIlya Dryomov if (ret) {
2840679a97d2SHannes Reinecke rbd_img_request_destroy(child_img_req);
2841663ae2ccSIlya Dryomov return ret;
2842bf0d5f50SAlex Elder }
2843bf0d5f50SAlex Elder
28440192ce2eSIlya Dryomov /* avoid parent chain recursion */
28450192ce2eSIlya Dryomov rbd_img_schedule(child_img_req, 0);
28463da691bfSIlya Dryomov return 0;
28473da691bfSIlya Dryomov }
28483da691bfSIlya Dryomov
rbd_obj_advance_read(struct rbd_obj_request * obj_req,int * result)284985b5e6d1SIlya Dryomov static bool rbd_obj_advance_read(struct rbd_obj_request *obj_req, int *result)
28508b3e1a56SAlex Elder {
28513da691bfSIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
28523da691bfSIlya Dryomov int ret;
28538b3e1a56SAlex Elder
285422e8bd51SIlya Dryomov again:
2855a9b67e69SIlya Dryomov switch (obj_req->read_state) {
285685b5e6d1SIlya Dryomov case RBD_OBJ_READ_START:
285785b5e6d1SIlya Dryomov rbd_assert(!*result);
285885b5e6d1SIlya Dryomov
285922e8bd51SIlya Dryomov if (!rbd_obj_may_exist(obj_req)) {
286022e8bd51SIlya Dryomov *result = -ENOENT;
286122e8bd51SIlya Dryomov obj_req->read_state = RBD_OBJ_READ_OBJECT;
286222e8bd51SIlya Dryomov goto again;
286322e8bd51SIlya Dryomov }
286422e8bd51SIlya Dryomov
286585b5e6d1SIlya Dryomov ret = rbd_obj_read_object(obj_req);
286685b5e6d1SIlya Dryomov if (ret) {
286785b5e6d1SIlya Dryomov *result = ret;
286885b5e6d1SIlya Dryomov return true;
286985b5e6d1SIlya Dryomov }
287085b5e6d1SIlya Dryomov obj_req->read_state = RBD_OBJ_READ_OBJECT;
287185b5e6d1SIlya Dryomov return false;
2872a9b67e69SIlya Dryomov case RBD_OBJ_READ_OBJECT:
2873a9b67e69SIlya Dryomov if (*result == -ENOENT && rbd_dev->parent_overlap) {
287486bd7998SIlya Dryomov /* reverse map this object extent onto the parent */
287586bd7998SIlya Dryomov ret = rbd_obj_calc_img_extents(obj_req, false);
287686bd7998SIlya Dryomov if (ret) {
287754ab3b24SIlya Dryomov *result = ret;
287886bd7998SIlya Dryomov return true;
287986bd7998SIlya Dryomov }
288086bd7998SIlya Dryomov if (obj_req->num_img_extents) {
288186bd7998SIlya Dryomov ret = rbd_obj_read_from_parent(obj_req);
28823da691bfSIlya Dryomov if (ret) {
288354ab3b24SIlya Dryomov *result = ret;
28843da691bfSIlya Dryomov return true;
28853da691bfSIlya Dryomov }
2886a9b67e69SIlya Dryomov obj_req->read_state = RBD_OBJ_READ_PARENT;
28873da691bfSIlya Dryomov return false;
28883da691bfSIlya Dryomov }
288986bd7998SIlya Dryomov }
289002c74fbaSAlex Elder
289102c74fbaSAlex Elder /*
28923da691bfSIlya Dryomov * -ENOENT means a hole in the image -- zero-fill the entire
28933da691bfSIlya Dryomov * length of the request. A short read also implies zero-fill
289454ab3b24SIlya Dryomov * to the end of the request.
289502c74fbaSAlex Elder */
289654ab3b24SIlya Dryomov if (*result == -ENOENT) {
289754ab3b24SIlya Dryomov rbd_obj_zero_range(obj_req, 0, obj_req->ex.oe_len);
289854ab3b24SIlya Dryomov *result = 0;
289954ab3b24SIlya Dryomov } else if (*result >= 0) {
290054ab3b24SIlya Dryomov if (*result < obj_req->ex.oe_len)
290154ab3b24SIlya Dryomov rbd_obj_zero_range(obj_req, *result,
290254ab3b24SIlya Dryomov obj_req->ex.oe_len - *result);
290354ab3b24SIlya Dryomov else
290454ab3b24SIlya Dryomov rbd_assert(*result == obj_req->ex.oe_len);
290554ab3b24SIlya Dryomov *result = 0;
29063da691bfSIlya Dryomov }
29073da691bfSIlya Dryomov return true;
2908a9b67e69SIlya Dryomov case RBD_OBJ_READ_PARENT:
2909d435c9a7SIlya Dryomov /*
2910d435c9a7SIlya Dryomov * The parent image is read only up to the overlap -- zero-fill
2911d435c9a7SIlya Dryomov * from the overlap to the end of the request.
2912d435c9a7SIlya Dryomov */
2913d435c9a7SIlya Dryomov if (!*result) {
2914d435c9a7SIlya Dryomov u32 obj_overlap = rbd_obj_img_extents_bytes(obj_req);
2915d435c9a7SIlya Dryomov
2916d435c9a7SIlya Dryomov if (obj_overlap < obj_req->ex.oe_len)
2917d435c9a7SIlya Dryomov rbd_obj_zero_range(obj_req, obj_overlap,
2918d435c9a7SIlya Dryomov obj_req->ex.oe_len - obj_overlap);
2919d435c9a7SIlya Dryomov }
2920a9b67e69SIlya Dryomov return true;
2921a9b67e69SIlya Dryomov default:
2922a9b67e69SIlya Dryomov BUG();
2923a9b67e69SIlya Dryomov }
29243da691bfSIlya Dryomov }
29253da691bfSIlya Dryomov
rbd_obj_write_is_noop(struct rbd_obj_request * obj_req)292622e8bd51SIlya Dryomov static bool rbd_obj_write_is_noop(struct rbd_obj_request *obj_req)
292722e8bd51SIlya Dryomov {
292822e8bd51SIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
292922e8bd51SIlya Dryomov
293022e8bd51SIlya Dryomov if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno))
293122e8bd51SIlya Dryomov obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;
293222e8bd51SIlya Dryomov
293322e8bd51SIlya Dryomov if (!(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST) &&
293422e8bd51SIlya Dryomov (obj_req->flags & RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT)) {
293522e8bd51SIlya Dryomov dout("%s %p noop for nonexistent\n", __func__, obj_req);
29363da691bfSIlya Dryomov return true;
29373da691bfSIlya Dryomov }
29383da691bfSIlya Dryomov
293922e8bd51SIlya Dryomov return false;
294022e8bd51SIlya Dryomov }
294122e8bd51SIlya Dryomov
294222e8bd51SIlya Dryomov /*
294322e8bd51SIlya Dryomov * Return:
294422e8bd51SIlya Dryomov * 0 - object map update sent
294522e8bd51SIlya Dryomov * 1 - object map update isn't needed
294622e8bd51SIlya Dryomov * <0 - error
294722e8bd51SIlya Dryomov */
rbd_obj_write_pre_object_map(struct rbd_obj_request * obj_req)294822e8bd51SIlya Dryomov static int rbd_obj_write_pre_object_map(struct rbd_obj_request *obj_req)
294922e8bd51SIlya Dryomov {
295022e8bd51SIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
295122e8bd51SIlya Dryomov u8 new_state;
295222e8bd51SIlya Dryomov
295322e8bd51SIlya Dryomov if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
295422e8bd51SIlya Dryomov return 1;
295522e8bd51SIlya Dryomov
295622e8bd51SIlya Dryomov if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
295722e8bd51SIlya Dryomov new_state = OBJECT_PENDING;
295822e8bd51SIlya Dryomov else
295922e8bd51SIlya Dryomov new_state = OBJECT_EXISTS;
296022e8bd51SIlya Dryomov
296122e8bd51SIlya Dryomov return rbd_object_map_update(obj_req, CEPH_NOSNAP, new_state, NULL);
296222e8bd51SIlya Dryomov }
296322e8bd51SIlya Dryomov
rbd_obj_write_object(struct rbd_obj_request * obj_req)296485b5e6d1SIlya Dryomov static int rbd_obj_write_object(struct rbd_obj_request *obj_req)
296585b5e6d1SIlya Dryomov {
2966a086a1b8SIlya Dryomov struct ceph_osd_request *osd_req;
2967a086a1b8SIlya Dryomov int num_ops = count_write_ops(obj_req);
2968a086a1b8SIlya Dryomov int which = 0;
2969a086a1b8SIlya Dryomov int ret;
2970a086a1b8SIlya Dryomov
2971a086a1b8SIlya Dryomov if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED)
2972a086a1b8SIlya Dryomov num_ops++; /* stat */
2973a086a1b8SIlya Dryomov
2974a086a1b8SIlya Dryomov osd_req = rbd_obj_add_osd_request(obj_req, num_ops);
2975a086a1b8SIlya Dryomov if (IS_ERR(osd_req))
2976a086a1b8SIlya Dryomov return PTR_ERR(osd_req);
2977a086a1b8SIlya Dryomov
2978a086a1b8SIlya Dryomov if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
2979a086a1b8SIlya Dryomov ret = rbd_osd_setup_stat(osd_req, which++);
2980a086a1b8SIlya Dryomov if (ret)
2981a086a1b8SIlya Dryomov return ret;
2982a086a1b8SIlya Dryomov }
2983a086a1b8SIlya Dryomov
2984a086a1b8SIlya Dryomov rbd_osd_setup_write_ops(osd_req, which);
2985a086a1b8SIlya Dryomov rbd_osd_format_write(osd_req);
2986a086a1b8SIlya Dryomov
2987a086a1b8SIlya Dryomov ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
2988a086a1b8SIlya Dryomov if (ret)
2989a086a1b8SIlya Dryomov return ret;
2990a086a1b8SIlya Dryomov
2991a086a1b8SIlya Dryomov rbd_osd_submit(osd_req);
299285b5e6d1SIlya Dryomov return 0;
299385b5e6d1SIlya Dryomov }
299485b5e6d1SIlya Dryomov
29953da691bfSIlya Dryomov /*
29963da691bfSIlya Dryomov * copyup_bvecs pages are never highmem pages
29973da691bfSIlya Dryomov */
is_zero_bvecs(struct bio_vec * bvecs,u32 bytes)29983da691bfSIlya Dryomov static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes)
29993da691bfSIlya Dryomov {
30003da691bfSIlya Dryomov struct ceph_bvec_iter it = {
30013da691bfSIlya Dryomov .bvecs = bvecs,
30023da691bfSIlya Dryomov .iter = { .bi_size = bytes },
30033da691bfSIlya Dryomov };
30043da691bfSIlya Dryomov
30053da691bfSIlya Dryomov ceph_bvec_iter_advance_step(&it, bytes, ({
3006cf58b537SChristoph Hellwig if (memchr_inv(bvec_virt(&bv), 0, bv.bv_len))
30073da691bfSIlya Dryomov return false;
30083da691bfSIlya Dryomov }));
30093da691bfSIlya Dryomov return true;
30103da691bfSIlya Dryomov }
30113da691bfSIlya Dryomov
/*
 * Special "bytes" value for the copyup helpers below: no copyup op is
 * added to the OSD request, only the write ops are sent.
 */
#define MODS_ONLY U32_MAX
30133a482501SIlya Dryomov
/*
 * Build and submit an OSD request that holds a single copyup op of
 * @bytes bytes and is created with rbd_empty_snapc.  @bytes must be
 * non-zero and must not be MODS_ONLY.
 *
 * Returns 0 once the request has been submitted, or an error code if
 * any setup step failed.
 */
static int rbd_obj_copyup_empty_snapc(struct rbd_obj_request *obj_req,
				      u32 bytes)
{
	struct ceph_osd_request *osd_req;
	int err;

	dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
	rbd_assert(bytes > 0 && bytes != MODS_ONLY);

	osd_req = __rbd_obj_add_osd_request(obj_req, &rbd_empty_snapc, 1);
	if (IS_ERR(osd_req))
		return PTR_ERR(osd_req);

	err = rbd_osd_setup_copyup(osd_req, 0, bytes);
	if (err)
		return err;

	rbd_osd_format_write(osd_req);

	err = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
	if (!err)
		rbd_osd_submit(osd_req);

	return err;
}
304089a59c1cSIlya Dryomov
/*
 * Build and submit the OSD request carrying @obj_req's write ops,
 * preceded by a copyup op of @bytes bytes unless @bytes is MODS_ONLY.
 *
 * Returns 0 once the request has been submitted, or an error code if
 * any setup step failed.
 */
static int rbd_obj_copyup_current_snapc(struct rbd_obj_request *obj_req,
					u32 bytes)
{
	struct ceph_osd_request *osd_req;
	int num_ops, which = 0;
	int err;

	dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);

	num_ops = count_write_ops(obj_req);
	if (bytes != MODS_ONLY)
		num_ops += 1; /* copyup */

	osd_req = rbd_obj_add_osd_request(obj_req, num_ops);
	if (IS_ERR(osd_req))
		return PTR_ERR(osd_req);

	if (bytes != MODS_ONLY) {
		err = rbd_osd_setup_copyup(osd_req, which, bytes);
		if (err)
			return err;
		which++;
	}

	rbd_osd_setup_write_ops(osd_req, which);
	rbd_osd_format_write(osd_req);

	err = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
	if (!err)
		rbd_osd_submit(osd_req);

	return err;
}
30743da691bfSIlya Dryomov
/*
 * Allocate @obj_req->copyup_bvecs: a zeroed bio_vec array with one
 * freshly allocated page per PAGE_SIZE chunk of @obj_overlap bytes,
 * the last entry being shortened to whatever remains.
 *
 * Must not be called when ->copyup_bvecs is already set.  Returns 0 on
 * success or -ENOMEM; on failure, anything allocated so far is left
 * attached to @obj_req.
 */
static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
{
	u32 idx;

	rbd_assert(!obj_req->copyup_bvecs);

	obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap);
	obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count,
					sizeof(*obj_req->copyup_bvecs),
					GFP_NOIO);
	if (!obj_req->copyup_bvecs)
		return -ENOMEM;

	for (idx = 0; idx < obj_req->copyup_bvec_count; idx++) {
		unsigned int chunk = min(obj_overlap, (u64)PAGE_SIZE);
		struct page *page;

		page = alloc_page(GFP_NOIO);
		if (!page)
			return -ENOMEM;

		bvec_set_page(&obj_req->copyup_bvecs[idx], page, chunk, 0);
		obj_overlap -= chunk;
	}

	/* the page count must cover the overlap exactly */
	rbd_assert(!obj_overlap);
	return 0;
}
31017e07efb1SIlya Dryomov
31020ad5d953SIlya Dryomov /*
31030ad5d953SIlya Dryomov * The target object doesn't exist. Read the data for the entire
31040ad5d953SIlya Dryomov * target object up to the overlap point (if any) from the parent,
31050ad5d953SIlya Dryomov * so we can use it for a copyup.
31060ad5d953SIlya Dryomov */
rbd_obj_copyup_read_parent(struct rbd_obj_request * obj_req)3107793333a3SIlya Dryomov static int rbd_obj_copyup_read_parent(struct rbd_obj_request *obj_req)
31083da691bfSIlya Dryomov {
31093da691bfSIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
31103da691bfSIlya Dryomov int ret;
31113da691bfSIlya Dryomov
311286bd7998SIlya Dryomov rbd_assert(obj_req->num_img_extents);
311386bd7998SIlya Dryomov prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
311486bd7998SIlya Dryomov rbd_dev->parent_overlap);
311586bd7998SIlya Dryomov if (!obj_req->num_img_extents) {
31163da691bfSIlya Dryomov /*
31173da691bfSIlya Dryomov * The overlap has become 0 (most likely because the
31183a482501SIlya Dryomov * image has been flattened). Re-submit the original write
31193a482501SIlya Dryomov * request -- pass MODS_ONLY since the copyup isn't needed
31203a482501SIlya Dryomov * anymore.
31213da691bfSIlya Dryomov */
3122793333a3SIlya Dryomov return rbd_obj_copyup_current_snapc(obj_req, MODS_ONLY);
31233da691bfSIlya Dryomov }
31243da691bfSIlya Dryomov
312586bd7998SIlya Dryomov ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req));
31263da691bfSIlya Dryomov if (ret)
31273da691bfSIlya Dryomov return ret;
31283da691bfSIlya Dryomov
312986bd7998SIlya Dryomov return rbd_obj_read_from_parent(obj_req);
31303da691bfSIlya Dryomov }
31313da691bfSIlya Dryomov
rbd_obj_copyup_object_maps(struct rbd_obj_request * obj_req)313222e8bd51SIlya Dryomov static void rbd_obj_copyup_object_maps(struct rbd_obj_request *obj_req)
31333da691bfSIlya Dryomov {
313422e8bd51SIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
313522e8bd51SIlya Dryomov struct ceph_snap_context *snapc = obj_req->img_request->snapc;
313622e8bd51SIlya Dryomov u8 new_state;
313722e8bd51SIlya Dryomov u32 i;
31383da691bfSIlya Dryomov int ret;
31393da691bfSIlya Dryomov
314022e8bd51SIlya Dryomov rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);
31413da691bfSIlya Dryomov
314222e8bd51SIlya Dryomov if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
314322e8bd51SIlya Dryomov return;
314489a59c1cSIlya Dryomov
314522e8bd51SIlya Dryomov if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
314622e8bd51SIlya Dryomov return;
31473da691bfSIlya Dryomov
314822e8bd51SIlya Dryomov for (i = 0; i < snapc->num_snaps; i++) {
314922e8bd51SIlya Dryomov if ((rbd_dev->header.features & RBD_FEATURE_FAST_DIFF) &&
315022e8bd51SIlya Dryomov i + 1 < snapc->num_snaps)
315122e8bd51SIlya Dryomov new_state = OBJECT_EXISTS_CLEAN;
315222e8bd51SIlya Dryomov else
315322e8bd51SIlya Dryomov new_state = OBJECT_EXISTS;
31543da691bfSIlya Dryomov
315522e8bd51SIlya Dryomov ret = rbd_object_map_update(obj_req, snapc->snaps[i],
315622e8bd51SIlya Dryomov new_state, NULL);
315722e8bd51SIlya Dryomov if (ret < 0) {
315822e8bd51SIlya Dryomov obj_req->pending.result = ret;
315902c74fbaSAlex Elder return;
316002c74fbaSAlex Elder }
316102c74fbaSAlex Elder
316222e8bd51SIlya Dryomov rbd_assert(!ret);
316322e8bd51SIlya Dryomov obj_req->pending.num_pending++;
3164a9e8ba2cSAlex Elder }
31658b3e1a56SAlex Elder }
31668b3e1a56SAlex Elder
rbd_obj_copyup_write_object(struct rbd_obj_request * obj_req)3167793333a3SIlya Dryomov static void rbd_obj_copyup_write_object(struct rbd_obj_request *obj_req)
31688b3e1a56SAlex Elder {
3169793333a3SIlya Dryomov u32 bytes = rbd_obj_img_extents_bytes(obj_req);
3170793333a3SIlya Dryomov int ret;
31718b3e1a56SAlex Elder
3172793333a3SIlya Dryomov rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);
31738b3e1a56SAlex Elder
3174793333a3SIlya Dryomov /*
3175793333a3SIlya Dryomov * Only send non-zero copyup data to save some I/O and network
3176793333a3SIlya Dryomov * bandwidth -- zero copyup data is equivalent to the object not
3177793333a3SIlya Dryomov * existing.
3178793333a3SIlya Dryomov */
3179793333a3SIlya Dryomov if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
3180793333a3SIlya Dryomov bytes = 0;
3181793333a3SIlya Dryomov
3182793333a3SIlya Dryomov if (obj_req->img_request->snapc->num_snaps && bytes > 0) {
3183793333a3SIlya Dryomov /*
3184793333a3SIlya Dryomov * Send a copyup request with an empty snapshot context to
3185793333a3SIlya Dryomov * deep-copyup the object through all existing snapshots.
3186793333a3SIlya Dryomov * A second request with the current snapshot context will be
3187793333a3SIlya Dryomov * sent for the actual modification.
3188793333a3SIlya Dryomov */
3189793333a3SIlya Dryomov ret = rbd_obj_copyup_empty_snapc(obj_req, bytes);
3190793333a3SIlya Dryomov if (ret) {
3191793333a3SIlya Dryomov obj_req->pending.result = ret;
3192793333a3SIlya Dryomov return;
31937114edacSIlya Dryomov }
31948b3e1a56SAlex Elder
3195793333a3SIlya Dryomov obj_req->pending.num_pending++;
3196793333a3SIlya Dryomov bytes = MODS_ONLY;
31973da691bfSIlya Dryomov }
31988b3e1a56SAlex Elder
3199793333a3SIlya Dryomov ret = rbd_obj_copyup_current_snapc(obj_req, bytes);
3200793333a3SIlya Dryomov if (ret) {
3201793333a3SIlya Dryomov obj_req->pending.result = ret;
3202793333a3SIlya Dryomov return;
3203793333a3SIlya Dryomov }
3204793333a3SIlya Dryomov
3205793333a3SIlya Dryomov obj_req->pending.num_pending++;
3206793333a3SIlya Dryomov }
3207793333a3SIlya Dryomov
rbd_obj_advance_copyup(struct rbd_obj_request * obj_req,int * result)3208793333a3SIlya Dryomov static bool rbd_obj_advance_copyup(struct rbd_obj_request *obj_req, int *result)
32093da691bfSIlya Dryomov {
321022e8bd51SIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3211793333a3SIlya Dryomov int ret;
32127114edacSIlya Dryomov
32137114edacSIlya Dryomov again:
3214793333a3SIlya Dryomov switch (obj_req->copyup_state) {
3215793333a3SIlya Dryomov case RBD_OBJ_COPYUP_START:
3216793333a3SIlya Dryomov rbd_assert(!*result);
32173da691bfSIlya Dryomov
3218793333a3SIlya Dryomov ret = rbd_obj_copyup_read_parent(obj_req);
3219793333a3SIlya Dryomov if (ret) {
3220793333a3SIlya Dryomov *result = ret;
3221793333a3SIlya Dryomov return true;
3222793333a3SIlya Dryomov }
3223793333a3SIlya Dryomov if (obj_req->num_img_extents)
3224793333a3SIlya Dryomov obj_req->copyup_state = RBD_OBJ_COPYUP_READ_PARENT;
3225793333a3SIlya Dryomov else
3226793333a3SIlya Dryomov obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
3227793333a3SIlya Dryomov return false;
3228793333a3SIlya Dryomov case RBD_OBJ_COPYUP_READ_PARENT:
3229793333a3SIlya Dryomov if (*result)
3230793333a3SIlya Dryomov return true;
3231793333a3SIlya Dryomov
3232793333a3SIlya Dryomov if (is_zero_bvecs(obj_req->copyup_bvecs,
3233793333a3SIlya Dryomov rbd_obj_img_extents_bytes(obj_req))) {
3234793333a3SIlya Dryomov dout("%s %p detected zeros\n", __func__, obj_req);
3235793333a3SIlya Dryomov obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ZEROS;
32367114edacSIlya Dryomov }
32377114edacSIlya Dryomov
323822e8bd51SIlya Dryomov rbd_obj_copyup_object_maps(obj_req);
323922e8bd51SIlya Dryomov if (!obj_req->pending.num_pending) {
324022e8bd51SIlya Dryomov *result = obj_req->pending.result;
324122e8bd51SIlya Dryomov obj_req->copyup_state = RBD_OBJ_COPYUP_OBJECT_MAPS;
32427114edacSIlya Dryomov goto again;
32437114edacSIlya Dryomov }
324422e8bd51SIlya Dryomov obj_req->copyup_state = __RBD_OBJ_COPYUP_OBJECT_MAPS;
324522e8bd51SIlya Dryomov return false;
324622e8bd51SIlya Dryomov case __RBD_OBJ_COPYUP_OBJECT_MAPS:
324722e8bd51SIlya Dryomov if (!pending_result_dec(&obj_req->pending, result))
324822e8bd51SIlya Dryomov return false;
3249df561f66SGustavo A. R. Silva fallthrough;
325022e8bd51SIlya Dryomov case RBD_OBJ_COPYUP_OBJECT_MAPS:
325122e8bd51SIlya Dryomov if (*result) {
325222e8bd51SIlya Dryomov rbd_warn(rbd_dev, "snap object map update failed: %d",
325322e8bd51SIlya Dryomov *result);
325422e8bd51SIlya Dryomov return true;
325522e8bd51SIlya Dryomov }
325622e8bd51SIlya Dryomov
3257793333a3SIlya Dryomov rbd_obj_copyup_write_object(obj_req);
3258793333a3SIlya Dryomov if (!obj_req->pending.num_pending) {
3259793333a3SIlya Dryomov *result = obj_req->pending.result;
3260793333a3SIlya Dryomov obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
3261793333a3SIlya Dryomov goto again;
3262793333a3SIlya Dryomov }
3263793333a3SIlya Dryomov obj_req->copyup_state = __RBD_OBJ_COPYUP_WRITE_OBJECT;
3264793333a3SIlya Dryomov return false;
3265793333a3SIlya Dryomov case __RBD_OBJ_COPYUP_WRITE_OBJECT:
3266793333a3SIlya Dryomov if (!pending_result_dec(&obj_req->pending, result))
3267793333a3SIlya Dryomov return false;
3268df561f66SGustavo A. R. Silva fallthrough;
3269793333a3SIlya Dryomov case RBD_OBJ_COPYUP_WRITE_OBJECT:
3270793333a3SIlya Dryomov return true;
3271793333a3SIlya Dryomov default:
3272793333a3SIlya Dryomov BUG();
3273793333a3SIlya Dryomov }
3274793333a3SIlya Dryomov }
3275793333a3SIlya Dryomov
327622e8bd51SIlya Dryomov /*
327722e8bd51SIlya Dryomov * Return:
327822e8bd51SIlya Dryomov * 0 - object map update sent
327922e8bd51SIlya Dryomov * 1 - object map update isn't needed
328022e8bd51SIlya Dryomov * <0 - error
328122e8bd51SIlya Dryomov */
rbd_obj_write_post_object_map(struct rbd_obj_request * obj_req)328222e8bd51SIlya Dryomov static int rbd_obj_write_post_object_map(struct rbd_obj_request *obj_req)
328322e8bd51SIlya Dryomov {
328422e8bd51SIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
328522e8bd51SIlya Dryomov u8 current_state = OBJECT_PENDING;
328622e8bd51SIlya Dryomov
328722e8bd51SIlya Dryomov if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
328822e8bd51SIlya Dryomov return 1;
328922e8bd51SIlya Dryomov
329022e8bd51SIlya Dryomov if (!(obj_req->flags & RBD_OBJ_FLAG_DELETION))
329122e8bd51SIlya Dryomov return 1;
329222e8bd51SIlya Dryomov
329322e8bd51SIlya Dryomov return rbd_object_map_update(obj_req, CEPH_NOSNAP, OBJECT_NONEXISTENT,
329422e8bd51SIlya Dryomov ¤t_state);
329522e8bd51SIlya Dryomov }
329622e8bd51SIlya Dryomov
rbd_obj_advance_write(struct rbd_obj_request * obj_req,int * result)329785b5e6d1SIlya Dryomov static bool rbd_obj_advance_write(struct rbd_obj_request *obj_req, int *result)
3298b8d70035SAlex Elder {
3299793333a3SIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3300b8d70035SAlex Elder int ret;
3301b8d70035SAlex Elder
3302793333a3SIlya Dryomov again:
3303cf81b60eSAlex Elder switch (obj_req->write_state) {
330485b5e6d1SIlya Dryomov case RBD_OBJ_WRITE_START:
330585b5e6d1SIlya Dryomov rbd_assert(!*result);
330685b5e6d1SIlya Dryomov
330709fe05c5SIlya Dryomov rbd_obj_set_copyup_enabled(obj_req);
330822e8bd51SIlya Dryomov if (rbd_obj_write_is_noop(obj_req))
330922e8bd51SIlya Dryomov return true;
331022e8bd51SIlya Dryomov
331122e8bd51SIlya Dryomov ret = rbd_obj_write_pre_object_map(obj_req);
331222e8bd51SIlya Dryomov if (ret < 0) {
331322e8bd51SIlya Dryomov *result = ret;
331422e8bd51SIlya Dryomov return true;
331522e8bd51SIlya Dryomov }
331622e8bd51SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_PRE_OBJECT_MAP;
331722e8bd51SIlya Dryomov if (ret > 0)
331822e8bd51SIlya Dryomov goto again;
331922e8bd51SIlya Dryomov return false;
332022e8bd51SIlya Dryomov case RBD_OBJ_WRITE_PRE_OBJECT_MAP:
332122e8bd51SIlya Dryomov if (*result) {
332222e8bd51SIlya Dryomov rbd_warn(rbd_dev, "pre object map update failed: %d",
332322e8bd51SIlya Dryomov *result);
332422e8bd51SIlya Dryomov return true;
332522e8bd51SIlya Dryomov }
332685b5e6d1SIlya Dryomov ret = rbd_obj_write_object(obj_req);
332785b5e6d1SIlya Dryomov if (ret) {
332885b5e6d1SIlya Dryomov *result = ret;
332985b5e6d1SIlya Dryomov return true;
333085b5e6d1SIlya Dryomov }
333185b5e6d1SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_OBJECT;
333285b5e6d1SIlya Dryomov return false;
33330ad5d953SIlya Dryomov case RBD_OBJ_WRITE_OBJECT:
333454ab3b24SIlya Dryomov if (*result == -ENOENT) {
33350ad5d953SIlya Dryomov if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
3336793333a3SIlya Dryomov *result = 0;
3337793333a3SIlya Dryomov obj_req->copyup_state = RBD_OBJ_COPYUP_START;
3338793333a3SIlya Dryomov obj_req->write_state = __RBD_OBJ_WRITE_COPYUP;
3339793333a3SIlya Dryomov goto again;
3340b8d70035SAlex Elder }
33410ad5d953SIlya Dryomov /*
33420ad5d953SIlya Dryomov * On a non-existent object:
33430ad5d953SIlya Dryomov * delete - -ENOENT, truncate/zero - 0
33440ad5d953SIlya Dryomov */
33450ad5d953SIlya Dryomov if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
33460ad5d953SIlya Dryomov *result = 0;
33470ad5d953SIlya Dryomov }
3348793333a3SIlya Dryomov if (*result)
3349793333a3SIlya Dryomov return true;
3350793333a3SIlya Dryomov
3351793333a3SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_COPYUP;
3352793333a3SIlya Dryomov goto again;
3353793333a3SIlya Dryomov case __RBD_OBJ_WRITE_COPYUP:
3354793333a3SIlya Dryomov if (!rbd_obj_advance_copyup(obj_req, result))
3355793333a3SIlya Dryomov return false;
3356df561f66SGustavo A. R. Silva fallthrough;
3357793333a3SIlya Dryomov case RBD_OBJ_WRITE_COPYUP:
335822e8bd51SIlya Dryomov if (*result) {
3359793333a3SIlya Dryomov rbd_warn(rbd_dev, "copyup failed: %d", *result);
3360b8d70035SAlex Elder return true;
336122e8bd51SIlya Dryomov }
336222e8bd51SIlya Dryomov ret = rbd_obj_write_post_object_map(obj_req);
336322e8bd51SIlya Dryomov if (ret < 0) {
336422e8bd51SIlya Dryomov *result = ret;
336522e8bd51SIlya Dryomov return true;
336622e8bd51SIlya Dryomov }
336722e8bd51SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_POST_OBJECT_MAP;
336822e8bd51SIlya Dryomov if (ret > 0)
336922e8bd51SIlya Dryomov goto again;
337022e8bd51SIlya Dryomov return false;
337122e8bd51SIlya Dryomov case RBD_OBJ_WRITE_POST_OBJECT_MAP:
337222e8bd51SIlya Dryomov if (*result)
337322e8bd51SIlya Dryomov rbd_warn(rbd_dev, "post object map update failed: %d",
337422e8bd51SIlya Dryomov *result);
337522e8bd51SIlya Dryomov return true;
3376b8d70035SAlex Elder default:
3377b8d70035SAlex Elder BUG();
3378b8d70035SAlex Elder }
3379b8d70035SAlex Elder }
3380b8d70035SAlex Elder
3381b8d70035SAlex Elder /*
33820ad5d953SIlya Dryomov * Return true if @obj_req is completed.
3383b8d70035SAlex Elder */
__rbd_obj_handle_request(struct rbd_obj_request * obj_req,int * result)338454ab3b24SIlya Dryomov static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req,
338554ab3b24SIlya Dryomov int *result)
3386b8d70035SAlex Elder {
33870ad5d953SIlya Dryomov struct rbd_img_request *img_req = obj_req->img_request;
33880192ce2eSIlya Dryomov struct rbd_device *rbd_dev = img_req->rbd_dev;
33890ad5d953SIlya Dryomov bool done;
33900ad5d953SIlya Dryomov
339185b5e6d1SIlya Dryomov mutex_lock(&obj_req->state_mutex);
33920ad5d953SIlya Dryomov if (!rbd_img_is_write(img_req))
339385b5e6d1SIlya Dryomov done = rbd_obj_advance_read(obj_req, result);
33940ad5d953SIlya Dryomov else
339585b5e6d1SIlya Dryomov done = rbd_obj_advance_write(obj_req, result);
339685b5e6d1SIlya Dryomov mutex_unlock(&obj_req->state_mutex);
33970ad5d953SIlya Dryomov
33980192ce2eSIlya Dryomov if (done && *result) {
33990192ce2eSIlya Dryomov rbd_assert(*result < 0);
34000192ce2eSIlya Dryomov rbd_warn(rbd_dev, "%s at objno %llu %llu~%llu result %d",
34010192ce2eSIlya Dryomov obj_op_name(img_req->op_type), obj_req->ex.oe_objno,
34020192ce2eSIlya Dryomov obj_req->ex.oe_off, obj_req->ex.oe_len, *result);
34030192ce2eSIlya Dryomov }
34040ad5d953SIlya Dryomov return done;
34059969ebc5SAlex Elder }
34069969ebc5SAlex Elder
34070192ce2eSIlya Dryomov /*
34080192ce2eSIlya Dryomov * This is open-coded in rbd_img_handle_request() to avoid parent chain
34090192ce2eSIlya Dryomov * recursion.
34100192ce2eSIlya Dryomov */
rbd_obj_handle_request(struct rbd_obj_request * obj_req,int result)341154ab3b24SIlya Dryomov static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result)
34129969ebc5SAlex Elder {
34130192ce2eSIlya Dryomov if (__rbd_obj_handle_request(obj_req, &result))
34140192ce2eSIlya Dryomov rbd_img_handle_request(obj_req->img_request, result);
34157114edacSIlya Dryomov }
34167114edacSIlya Dryomov
need_exclusive_lock(struct rbd_img_request * img_req)3417e1fddc8fSIlya Dryomov static bool need_exclusive_lock(struct rbd_img_request *img_req)
3418e1fddc8fSIlya Dryomov {
3419e1fddc8fSIlya Dryomov struct rbd_device *rbd_dev = img_req->rbd_dev;
3420e1fddc8fSIlya Dryomov
3421e1fddc8fSIlya Dryomov if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK))
3422e1fddc8fSIlya Dryomov return false;
3423e1fddc8fSIlya Dryomov
34243fe69921SIlya Dryomov if (rbd_is_ro(rbd_dev))
3425e1fddc8fSIlya Dryomov return false;
3426e1fddc8fSIlya Dryomov
3427e1fddc8fSIlya Dryomov rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
342822e8bd51SIlya Dryomov if (rbd_dev->opts->lock_on_read ||
342922e8bd51SIlya Dryomov (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
3430e1fddc8fSIlya Dryomov return true;
3431e1fddc8fSIlya Dryomov
3432e1fddc8fSIlya Dryomov return rbd_img_is_write(img_req);
3433e1fddc8fSIlya Dryomov }
3434e1fddc8fSIlya Dryomov
rbd_lock_add_request(struct rbd_img_request * img_req)3435637cd060SIlya Dryomov static bool rbd_lock_add_request(struct rbd_img_request *img_req)
3436e1fddc8fSIlya Dryomov {
3437e1fddc8fSIlya Dryomov struct rbd_device *rbd_dev = img_req->rbd_dev;
3438637cd060SIlya Dryomov bool locked;
3439e1fddc8fSIlya Dryomov
3440e1fddc8fSIlya Dryomov lockdep_assert_held(&rbd_dev->lock_rwsem);
3441637cd060SIlya Dryomov locked = rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED;
3442e1fddc8fSIlya Dryomov spin_lock(&rbd_dev->lock_lists_lock);
3443e1fddc8fSIlya Dryomov rbd_assert(list_empty(&img_req->lock_item));
3444637cd060SIlya Dryomov if (!locked)
3445637cd060SIlya Dryomov list_add_tail(&img_req->lock_item, &rbd_dev->acquiring_list);
3446637cd060SIlya Dryomov else
3447e1fddc8fSIlya Dryomov list_add_tail(&img_req->lock_item, &rbd_dev->running_list);
3448e1fddc8fSIlya Dryomov spin_unlock(&rbd_dev->lock_lists_lock);
3449637cd060SIlya Dryomov return locked;
3450e1fddc8fSIlya Dryomov }
3451e1fddc8fSIlya Dryomov
rbd_lock_del_request(struct rbd_img_request * img_req)3452e1fddc8fSIlya Dryomov static void rbd_lock_del_request(struct rbd_img_request *img_req)
3453e1fddc8fSIlya Dryomov {
3454e1fddc8fSIlya Dryomov struct rbd_device *rbd_dev = img_req->rbd_dev;
3455*801474eaSIlya Dryomov bool need_wakeup = false;
3456e1fddc8fSIlya Dryomov
3457e1fddc8fSIlya Dryomov lockdep_assert_held(&rbd_dev->lock_rwsem);
3458e1fddc8fSIlya Dryomov spin_lock(&rbd_dev->lock_lists_lock);
3459*801474eaSIlya Dryomov if (!list_empty(&img_req->lock_item)) {
3460e1fddc8fSIlya Dryomov list_del_init(&img_req->lock_item);
3461e1fddc8fSIlya Dryomov need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING &&
3462e1fddc8fSIlya Dryomov list_empty(&rbd_dev->running_list));
3463*801474eaSIlya Dryomov }
3464e1fddc8fSIlya Dryomov spin_unlock(&rbd_dev->lock_lists_lock);
3465e1fddc8fSIlya Dryomov if (need_wakeup)
3466e1fddc8fSIlya Dryomov complete(&rbd_dev->releasing_wait);
3467e1fddc8fSIlya Dryomov }
3468e1fddc8fSIlya Dryomov
rbd_img_exclusive_lock(struct rbd_img_request * img_req)3469637cd060SIlya Dryomov static int rbd_img_exclusive_lock(struct rbd_img_request *img_req)
3470637cd060SIlya Dryomov {
3471637cd060SIlya Dryomov struct rbd_device *rbd_dev = img_req->rbd_dev;
3472637cd060SIlya Dryomov
3473637cd060SIlya Dryomov if (!need_exclusive_lock(img_req))
3474637cd060SIlya Dryomov return 1;
3475637cd060SIlya Dryomov
3476637cd060SIlya Dryomov if (rbd_lock_add_request(img_req))
3477637cd060SIlya Dryomov return 1;
3478637cd060SIlya Dryomov
3479637cd060SIlya Dryomov if (rbd_dev->opts->exclusive) {
3480637cd060SIlya Dryomov WARN_ON(1); /* lock got released? */
3481637cd060SIlya Dryomov return -EROFS;
3482637cd060SIlya Dryomov }
3483637cd060SIlya Dryomov
3484637cd060SIlya Dryomov /*
3485637cd060SIlya Dryomov * Note the use of mod_delayed_work() in rbd_acquire_lock()
3486637cd060SIlya Dryomov * and cancel_delayed_work() in wake_lock_waiters().
3487637cd060SIlya Dryomov */
3488637cd060SIlya Dryomov dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
3489637cd060SIlya Dryomov queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
3490637cd060SIlya Dryomov return 0;
3491637cd060SIlya Dryomov }
3492637cd060SIlya Dryomov
rbd_img_object_requests(struct rbd_img_request * img_req)34930192ce2eSIlya Dryomov static void rbd_img_object_requests(struct rbd_img_request *img_req)
34940192ce2eSIlya Dryomov {
3495870611e4SIlya Dryomov struct rbd_device *rbd_dev = img_req->rbd_dev;
34960192ce2eSIlya Dryomov struct rbd_obj_request *obj_req;
34970192ce2eSIlya Dryomov
34980192ce2eSIlya Dryomov rbd_assert(!img_req->pending.result && !img_req->pending.num_pending);
3499870611e4SIlya Dryomov rbd_assert(!need_exclusive_lock(img_req) ||
3500870611e4SIlya Dryomov __rbd_is_lock_owner(rbd_dev));
3501870611e4SIlya Dryomov
3502870611e4SIlya Dryomov if (rbd_img_is_write(img_req)) {
3503870611e4SIlya Dryomov rbd_assert(!img_req->snapc);
3504870611e4SIlya Dryomov down_read(&rbd_dev->header_rwsem);
3505870611e4SIlya Dryomov img_req->snapc = ceph_get_snap_context(rbd_dev->header.snapc);
3506870611e4SIlya Dryomov up_read(&rbd_dev->header_rwsem);
3507870611e4SIlya Dryomov }
35080192ce2eSIlya Dryomov
35090192ce2eSIlya Dryomov for_each_obj_request(img_req, obj_req) {
35100192ce2eSIlya Dryomov int result = 0;
35110192ce2eSIlya Dryomov
35120192ce2eSIlya Dryomov if (__rbd_obj_handle_request(obj_req, &result)) {
35130192ce2eSIlya Dryomov if (result) {
35140192ce2eSIlya Dryomov img_req->pending.result = result;
35150192ce2eSIlya Dryomov return;
35160192ce2eSIlya Dryomov }
35170192ce2eSIlya Dryomov } else {
35180192ce2eSIlya Dryomov img_req->pending.num_pending++;
35190192ce2eSIlya Dryomov }
35200192ce2eSIlya Dryomov }
35210192ce2eSIlya Dryomov }
35220192ce2eSIlya Dryomov
rbd_img_advance(struct rbd_img_request * img_req,int * result)35230192ce2eSIlya Dryomov static bool rbd_img_advance(struct rbd_img_request *img_req, int *result)
35240192ce2eSIlya Dryomov {
3525637cd060SIlya Dryomov int ret;
3526637cd060SIlya Dryomov
35270192ce2eSIlya Dryomov again:
35280192ce2eSIlya Dryomov switch (img_req->state) {
35290192ce2eSIlya Dryomov case RBD_IMG_START:
35300192ce2eSIlya Dryomov rbd_assert(!*result);
35310192ce2eSIlya Dryomov
3532637cd060SIlya Dryomov ret = rbd_img_exclusive_lock(img_req);
3533637cd060SIlya Dryomov if (ret < 0) {
3534637cd060SIlya Dryomov *result = ret;
3535637cd060SIlya Dryomov return true;
3536637cd060SIlya Dryomov }
3537637cd060SIlya Dryomov img_req->state = RBD_IMG_EXCLUSIVE_LOCK;
3538637cd060SIlya Dryomov if (ret > 0)
3539637cd060SIlya Dryomov goto again;
3540637cd060SIlya Dryomov return false;
3541637cd060SIlya Dryomov case RBD_IMG_EXCLUSIVE_LOCK:
3542637cd060SIlya Dryomov if (*result)
3543637cd060SIlya Dryomov return true;
3544637cd060SIlya Dryomov
35450192ce2eSIlya Dryomov rbd_img_object_requests(img_req);
35460192ce2eSIlya Dryomov if (!img_req->pending.num_pending) {
35470192ce2eSIlya Dryomov *result = img_req->pending.result;
35480192ce2eSIlya Dryomov img_req->state = RBD_IMG_OBJECT_REQUESTS;
35497114edacSIlya Dryomov goto again;
35507114edacSIlya Dryomov }
35510192ce2eSIlya Dryomov img_req->state = __RBD_IMG_OBJECT_REQUESTS;
35520192ce2eSIlya Dryomov return false;
35530192ce2eSIlya Dryomov case __RBD_IMG_OBJECT_REQUESTS:
35540192ce2eSIlya Dryomov if (!pending_result_dec(&img_req->pending, result))
35550192ce2eSIlya Dryomov return false;
3556df561f66SGustavo A. R. Silva fallthrough;
35570192ce2eSIlya Dryomov case RBD_IMG_OBJECT_REQUESTS:
35580192ce2eSIlya Dryomov return true;
35590192ce2eSIlya Dryomov default:
35600192ce2eSIlya Dryomov BUG();
35610192ce2eSIlya Dryomov }
35620192ce2eSIlya Dryomov }
35630192ce2eSIlya Dryomov
35640192ce2eSIlya Dryomov /*
35650192ce2eSIlya Dryomov * Return true if @img_req is completed.
35660192ce2eSIlya Dryomov */
__rbd_img_handle_request(struct rbd_img_request * img_req,int * result)35670192ce2eSIlya Dryomov static bool __rbd_img_handle_request(struct rbd_img_request *img_req,
35680192ce2eSIlya Dryomov int *result)
35690192ce2eSIlya Dryomov {
35700192ce2eSIlya Dryomov struct rbd_device *rbd_dev = img_req->rbd_dev;
35710192ce2eSIlya Dryomov bool done;
35720192ce2eSIlya Dryomov
3573e1fddc8fSIlya Dryomov if (need_exclusive_lock(img_req)) {
3574e1fddc8fSIlya Dryomov down_read(&rbd_dev->lock_rwsem);
3575e1fddc8fSIlya Dryomov mutex_lock(&img_req->state_mutex);
3576e1fddc8fSIlya Dryomov done = rbd_img_advance(img_req, result);
3577e1fddc8fSIlya Dryomov if (done)
3578e1fddc8fSIlya Dryomov rbd_lock_del_request(img_req);
3579e1fddc8fSIlya Dryomov mutex_unlock(&img_req->state_mutex);
3580e1fddc8fSIlya Dryomov up_read(&rbd_dev->lock_rwsem);
3581e1fddc8fSIlya Dryomov } else {
35820192ce2eSIlya Dryomov mutex_lock(&img_req->state_mutex);
35830192ce2eSIlya Dryomov done = rbd_img_advance(img_req, result);
35840192ce2eSIlya Dryomov mutex_unlock(&img_req->state_mutex);
3585e1fddc8fSIlya Dryomov }
35860192ce2eSIlya Dryomov
35870192ce2eSIlya Dryomov if (done && *result) {
35880192ce2eSIlya Dryomov rbd_assert(*result < 0);
35890192ce2eSIlya Dryomov rbd_warn(rbd_dev, "%s%s result %d",
35900192ce2eSIlya Dryomov test_bit(IMG_REQ_CHILD, &img_req->flags) ? "child " : "",
35910192ce2eSIlya Dryomov obj_op_name(img_req->op_type), *result);
35920192ce2eSIlya Dryomov }
35930192ce2eSIlya Dryomov return done;
35940192ce2eSIlya Dryomov }
35950192ce2eSIlya Dryomov
rbd_img_handle_request(struct rbd_img_request * img_req,int result)35960192ce2eSIlya Dryomov static void rbd_img_handle_request(struct rbd_img_request *img_req, int result)
35970192ce2eSIlya Dryomov {
35980192ce2eSIlya Dryomov again:
35990192ce2eSIlya Dryomov if (!__rbd_img_handle_request(img_req, &result))
36000192ce2eSIlya Dryomov return;
36010192ce2eSIlya Dryomov
36020192ce2eSIlya Dryomov if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
36030192ce2eSIlya Dryomov struct rbd_obj_request *obj_req = img_req->obj_request;
36040192ce2eSIlya Dryomov
3605679a97d2SHannes Reinecke rbd_img_request_destroy(img_req);
36060192ce2eSIlya Dryomov if (__rbd_obj_handle_request(obj_req, &result)) {
36070192ce2eSIlya Dryomov img_req = obj_req->img_request;
36080192ce2eSIlya Dryomov goto again;
36090192ce2eSIlya Dryomov }
36100192ce2eSIlya Dryomov } else {
361159e542c8SIlya Dryomov struct request *rq = blk_mq_rq_from_pdu(img_req);
36120192ce2eSIlya Dryomov
3613679a97d2SHannes Reinecke rbd_img_request_destroy(img_req);
36140192ce2eSIlya Dryomov blk_mq_end_request(rq, errno_to_blk_status(result));
36150192ce2eSIlya Dryomov }
36169969ebc5SAlex Elder }
36179969ebc5SAlex Elder
3618ed95b21aSIlya Dryomov static const struct rbd_client_id rbd_empty_cid;
3619ed95b21aSIlya Dryomov
rbd_cid_equal(const struct rbd_client_id * lhs,const struct rbd_client_id * rhs)3620ed95b21aSIlya Dryomov static bool rbd_cid_equal(const struct rbd_client_id *lhs,
3621ed95b21aSIlya Dryomov const struct rbd_client_id *rhs)
3622ed95b21aSIlya Dryomov {
3623ed95b21aSIlya Dryomov return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
3624ed95b21aSIlya Dryomov }
3625ed95b21aSIlya Dryomov
rbd_get_cid(struct rbd_device * rbd_dev)3626ed95b21aSIlya Dryomov static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
3627ed95b21aSIlya Dryomov {
3628ed95b21aSIlya Dryomov struct rbd_client_id cid;
3629ed95b21aSIlya Dryomov
3630ed95b21aSIlya Dryomov mutex_lock(&rbd_dev->watch_mutex);
3631ed95b21aSIlya Dryomov cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
3632ed95b21aSIlya Dryomov cid.handle = rbd_dev->watch_cookie;
3633ed95b21aSIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex);
3634ed95b21aSIlya Dryomov return cid;
3635ed95b21aSIlya Dryomov }
3636ed95b21aSIlya Dryomov
3637ed95b21aSIlya Dryomov /*
3638ed95b21aSIlya Dryomov * lock_rwsem must be held for write
3639ed95b21aSIlya Dryomov */
rbd_set_owner_cid(struct rbd_device * rbd_dev,const struct rbd_client_id * cid)3640ed95b21aSIlya Dryomov static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
3641ed95b21aSIlya Dryomov const struct rbd_client_id *cid)
3642ed95b21aSIlya Dryomov {
3643ed95b21aSIlya Dryomov dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
3644ed95b21aSIlya Dryomov rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
3645ed95b21aSIlya Dryomov cid->gid, cid->handle);
3646ed95b21aSIlya Dryomov rbd_dev->owner_cid = *cid; /* struct */
3647ed95b21aSIlya Dryomov }
3648ed95b21aSIlya Dryomov
format_lock_cookie(struct rbd_device * rbd_dev,char * buf)3649ed95b21aSIlya Dryomov static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
3650ed95b21aSIlya Dryomov {
3651ed95b21aSIlya Dryomov mutex_lock(&rbd_dev->watch_mutex);
3652ed95b21aSIlya Dryomov sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
3653ed95b21aSIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex);
3654ed95b21aSIlya Dryomov }
3655ed95b21aSIlya Dryomov
__rbd_lock(struct rbd_device * rbd_dev,const char * cookie)3656edd8ca80SFlorian Margaine static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie)
3657edd8ca80SFlorian Margaine {
3658edd8ca80SFlorian Margaine struct rbd_client_id cid = rbd_get_cid(rbd_dev);
3659edd8ca80SFlorian Margaine
3660a2b1da09SIlya Dryomov rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
3661edd8ca80SFlorian Margaine strcpy(rbd_dev->lock_cookie, cookie);
3662edd8ca80SFlorian Margaine rbd_set_owner_cid(rbd_dev, &cid);
3663edd8ca80SFlorian Margaine queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
3664edd8ca80SFlorian Margaine }
3665edd8ca80SFlorian Margaine
3666ed95b21aSIlya Dryomov /*
3667ed95b21aSIlya Dryomov * lock_rwsem must be held for write
3668ed95b21aSIlya Dryomov */
rbd_lock(struct rbd_device * rbd_dev)3669ed95b21aSIlya Dryomov static int rbd_lock(struct rbd_device *rbd_dev)
3670ed95b21aSIlya Dryomov {
3671ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3672ed95b21aSIlya Dryomov char cookie[32];
3673ed95b21aSIlya Dryomov int ret;
3674ed95b21aSIlya Dryomov
3675cbbfb0ffSIlya Dryomov WARN_ON(__rbd_is_lock_owner(rbd_dev) ||
3676cbbfb0ffSIlya Dryomov rbd_dev->lock_cookie[0] != '\0');
3677ed95b21aSIlya Dryomov
3678ed95b21aSIlya Dryomov format_lock_cookie(rbd_dev, cookie);
3679ed95b21aSIlya Dryomov ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
3680ed95b21aSIlya Dryomov RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
3681ed95b21aSIlya Dryomov RBD_LOCK_TAG, "", 0);
36829d01e07fSIlya Dryomov if (ret && ret != -EEXIST)
3683ed95b21aSIlya Dryomov return ret;
3684ed95b21aSIlya Dryomov
3685edd8ca80SFlorian Margaine __rbd_lock(rbd_dev, cookie);
3686ed95b21aSIlya Dryomov return 0;
3687ed95b21aSIlya Dryomov }
3688ed95b21aSIlya Dryomov
3689ed95b21aSIlya Dryomov /*
3690ed95b21aSIlya Dryomov * lock_rwsem must be held for write
3691ed95b21aSIlya Dryomov */
rbd_unlock(struct rbd_device * rbd_dev)3692bbead745SIlya Dryomov static void rbd_unlock(struct rbd_device *rbd_dev)
3693ed95b21aSIlya Dryomov {
3694ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3695ed95b21aSIlya Dryomov int ret;
3696ed95b21aSIlya Dryomov
3697cbbfb0ffSIlya Dryomov WARN_ON(!__rbd_is_lock_owner(rbd_dev) ||
3698cbbfb0ffSIlya Dryomov rbd_dev->lock_cookie[0] == '\0');
3699ed95b21aSIlya Dryomov
3700ed95b21aSIlya Dryomov ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
3701cbbfb0ffSIlya Dryomov RBD_LOCK_NAME, rbd_dev->lock_cookie);
3702bbead745SIlya Dryomov if (ret && ret != -ENOENT)
3703637cd060SIlya Dryomov rbd_warn(rbd_dev, "failed to unlock header: %d", ret);
3704ed95b21aSIlya Dryomov
3705bbead745SIlya Dryomov /* treat errors as the image is unlocked */
3706bbead745SIlya Dryomov rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
3707cbbfb0ffSIlya Dryomov rbd_dev->lock_cookie[0] = '\0';
3708ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3709ed95b21aSIlya Dryomov queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
3710ed95b21aSIlya Dryomov }
3711ed95b21aSIlya Dryomov
/*
 * Send a lock-related notify on the image header object.
 *
 * The payload is a versioned NotifyMessage carrying @notify_op followed
 * by our client id (gid, then handle).  @preply_pages and @preply_len are
 * handed straight to ceph_osdc_notify(), whose return value is returned.
 */
static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
				enum rbd_notify_op notify_op,
				struct page ***preply_pages,
				size_t *preply_len)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
	/* op (4) + gid (8) + handle (8) + encoding start block */
	char payload[4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN];
	const int payload_len = sizeof(payload);
	void *pos = payload;

	dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);

	/* encode *LockPayload NotifyMessage (op + ClientId) */
	ceph_start_encoding(&pos, 2, 1,
			    payload_len - CEPH_ENCODING_START_BLK_LEN);
	ceph_encode_32(&pos, notify_op);
	ceph_encode_64(&pos, cid.gid);
	ceph_encode_64(&pos, cid.handle);

	return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
				&rbd_dev->header_oloc, payload, payload_len,
				RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
}
3735ed95b21aSIlya Dryomov
rbd_notify_op_lock(struct rbd_device * rbd_dev,enum rbd_notify_op notify_op)3736ed95b21aSIlya Dryomov static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
3737ed95b21aSIlya Dryomov enum rbd_notify_op notify_op)
3738ed95b21aSIlya Dryomov {
37398ae0299aSIlya Dryomov __rbd_notify_op_lock(rbd_dev, notify_op, NULL, NULL);
3740ed95b21aSIlya Dryomov }
3741ed95b21aSIlya Dryomov
rbd_notify_acquired_lock(struct work_struct * work)3742ed95b21aSIlya Dryomov static void rbd_notify_acquired_lock(struct work_struct *work)
3743ed95b21aSIlya Dryomov {
3744ed95b21aSIlya Dryomov struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3745ed95b21aSIlya Dryomov acquired_lock_work);
3746ed95b21aSIlya Dryomov
3747ed95b21aSIlya Dryomov rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
3748ed95b21aSIlya Dryomov }
3749ed95b21aSIlya Dryomov
rbd_notify_released_lock(struct work_struct * work)3750ed95b21aSIlya Dryomov static void rbd_notify_released_lock(struct work_struct *work)
3751ed95b21aSIlya Dryomov {
3752ed95b21aSIlya Dryomov struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3753ed95b21aSIlya Dryomov released_lock_work);
3754ed95b21aSIlya Dryomov
3755ed95b21aSIlya Dryomov rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
3756ed95b21aSIlya Dryomov }
3757ed95b21aSIlya Dryomov
rbd_request_lock(struct rbd_device * rbd_dev)3758ed95b21aSIlya Dryomov static int rbd_request_lock(struct rbd_device *rbd_dev)
3759ed95b21aSIlya Dryomov {
3760ed95b21aSIlya Dryomov struct page **reply_pages;
3761ed95b21aSIlya Dryomov size_t reply_len;
3762ed95b21aSIlya Dryomov bool lock_owner_responded = false;
3763ed95b21aSIlya Dryomov int ret;
3764ed95b21aSIlya Dryomov
3765ed95b21aSIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev);
3766ed95b21aSIlya Dryomov
3767ed95b21aSIlya Dryomov ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
3768ed95b21aSIlya Dryomov &reply_pages, &reply_len);
3769ed95b21aSIlya Dryomov if (ret && ret != -ETIMEDOUT) {
3770ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "failed to request lock: %d", ret);
3771ed95b21aSIlya Dryomov goto out;
3772ed95b21aSIlya Dryomov }
3773ed95b21aSIlya Dryomov
3774ed95b21aSIlya Dryomov if (reply_len > 0 && reply_len <= PAGE_SIZE) {
3775ed95b21aSIlya Dryomov void *p = page_address(reply_pages[0]);
3776ed95b21aSIlya Dryomov void *const end = p + reply_len;
3777ed95b21aSIlya Dryomov u32 n;
3778ed95b21aSIlya Dryomov
3779ed95b21aSIlya Dryomov ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
3780ed95b21aSIlya Dryomov while (n--) {
3781ed95b21aSIlya Dryomov u8 struct_v;
3782ed95b21aSIlya Dryomov u32 len;
3783ed95b21aSIlya Dryomov
3784ed95b21aSIlya Dryomov ceph_decode_need(&p, end, 8 + 8, e_inval);
3785ed95b21aSIlya Dryomov p += 8 + 8; /* skip gid and cookie */
3786ed95b21aSIlya Dryomov
3787ed95b21aSIlya Dryomov ceph_decode_32_safe(&p, end, len, e_inval);
3788ed95b21aSIlya Dryomov if (!len)
3789ed95b21aSIlya Dryomov continue;
3790ed95b21aSIlya Dryomov
3791ed95b21aSIlya Dryomov if (lock_owner_responded) {
3792ed95b21aSIlya Dryomov rbd_warn(rbd_dev,
3793ed95b21aSIlya Dryomov "duplicate lock owners detected");
3794ed95b21aSIlya Dryomov ret = -EIO;
3795ed95b21aSIlya Dryomov goto out;
3796ed95b21aSIlya Dryomov }
3797ed95b21aSIlya Dryomov
3798ed95b21aSIlya Dryomov lock_owner_responded = true;
3799ed95b21aSIlya Dryomov ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
3800ed95b21aSIlya Dryomov &struct_v, &len);
3801ed95b21aSIlya Dryomov if (ret) {
3802ed95b21aSIlya Dryomov rbd_warn(rbd_dev,
3803ed95b21aSIlya Dryomov "failed to decode ResponseMessage: %d",
3804ed95b21aSIlya Dryomov ret);
3805ed95b21aSIlya Dryomov goto e_inval;
3806ed95b21aSIlya Dryomov }
3807ed95b21aSIlya Dryomov
3808ed95b21aSIlya Dryomov ret = ceph_decode_32(&p);
3809ed95b21aSIlya Dryomov }
3810ed95b21aSIlya Dryomov }
3811ed95b21aSIlya Dryomov
3812ed95b21aSIlya Dryomov if (!lock_owner_responded) {
3813ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "no lock owners detected");
3814ed95b21aSIlya Dryomov ret = -ETIMEDOUT;
3815ed95b21aSIlya Dryomov }
3816ed95b21aSIlya Dryomov
3817ed95b21aSIlya Dryomov out:
3818ed95b21aSIlya Dryomov ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
3819ed95b21aSIlya Dryomov return ret;
3820ed95b21aSIlya Dryomov
3821ed95b21aSIlya Dryomov e_inval:
3822ed95b21aSIlya Dryomov ret = -EINVAL;
3823ed95b21aSIlya Dryomov goto out;
3824ed95b21aSIlya Dryomov }
3825ed95b21aSIlya Dryomov
3826637cd060SIlya Dryomov /*
3827637cd060SIlya Dryomov * Either image request state machine(s) or rbd_add_acquire_lock()
3828637cd060SIlya Dryomov * (i.e. "rbd map").
3829637cd060SIlya Dryomov */
wake_lock_waiters(struct rbd_device * rbd_dev,int result)3830637cd060SIlya Dryomov static void wake_lock_waiters(struct rbd_device *rbd_dev, int result)
3831ed95b21aSIlya Dryomov {
3832637cd060SIlya Dryomov struct rbd_img_request *img_req;
3833637cd060SIlya Dryomov
3834637cd060SIlya Dryomov dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
3835d9b9c893SLinus Torvalds lockdep_assert_held_write(&rbd_dev->lock_rwsem);
3836ed95b21aSIlya Dryomov
3837ed95b21aSIlya Dryomov cancel_delayed_work(&rbd_dev->lock_dwork);
3838637cd060SIlya Dryomov if (!completion_done(&rbd_dev->acquire_wait)) {
3839637cd060SIlya Dryomov rbd_assert(list_empty(&rbd_dev->acquiring_list) &&
3840637cd060SIlya Dryomov list_empty(&rbd_dev->running_list));
3841637cd060SIlya Dryomov rbd_dev->acquire_err = result;
3842637cd060SIlya Dryomov complete_all(&rbd_dev->acquire_wait);
3843637cd060SIlya Dryomov return;
3844637cd060SIlya Dryomov }
3845637cd060SIlya Dryomov
3846*801474eaSIlya Dryomov while (!list_empty(&rbd_dev->acquiring_list)) {
3847*801474eaSIlya Dryomov img_req = list_first_entry(&rbd_dev->acquiring_list,
3848*801474eaSIlya Dryomov struct rbd_img_request, lock_item);
3849637cd060SIlya Dryomov mutex_lock(&img_req->state_mutex);
3850637cd060SIlya Dryomov rbd_assert(img_req->state == RBD_IMG_EXCLUSIVE_LOCK);
3851*801474eaSIlya Dryomov if (!result)
3852*801474eaSIlya Dryomov list_move_tail(&img_req->lock_item,
3853*801474eaSIlya Dryomov &rbd_dev->running_list);
3854*801474eaSIlya Dryomov else
3855*801474eaSIlya Dryomov list_del_init(&img_req->lock_item);
3856637cd060SIlya Dryomov rbd_img_schedule(img_req, result);
3857637cd060SIlya Dryomov mutex_unlock(&img_req->state_mutex);
3858637cd060SIlya Dryomov }
3859ed95b21aSIlya Dryomov }
3860ed95b21aSIlya Dryomov
locker_equal(const struct ceph_locker * lhs,const struct ceph_locker * rhs)386158815900SIlya Dryomov static bool locker_equal(const struct ceph_locker *lhs,
386258815900SIlya Dryomov const struct ceph_locker *rhs)
386358815900SIlya Dryomov {
386458815900SIlya Dryomov return lhs->id.name.type == rhs->id.name.type &&
386558815900SIlya Dryomov lhs->id.name.num == rhs->id.name.num &&
386658815900SIlya Dryomov !strcmp(lhs->id.cookie, rhs->id.cookie) &&
386758815900SIlya Dryomov ceph_addr_equal_no_type(&lhs->info.addr, &rhs->info.addr);
386858815900SIlya Dryomov }
386958815900SIlya Dryomov
/*
 * Free a single locker via ceph_free_lockers(); a NULL @locker is
 * accepted and ignored.
 */
static void free_locker(struct ceph_locker *locker)
{
	if (!locker)
		return;

	ceph_free_lockers(locker, 1);
}
3875f38cb9d9SIlya Dryomov
get_lock_owner_info(struct rbd_device * rbd_dev)3876f38cb9d9SIlya Dryomov static struct ceph_locker *get_lock_owner_info(struct rbd_device *rbd_dev)
3877ed95b21aSIlya Dryomov {
3878ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3879f38cb9d9SIlya Dryomov struct ceph_locker *lockers;
3880f38cb9d9SIlya Dryomov u32 num_lockers;
3881ed95b21aSIlya Dryomov u8 lock_type;
3882ed95b21aSIlya Dryomov char *lock_tag;
38838ff2c64cSIlya Dryomov u64 handle;
3884ed95b21aSIlya Dryomov int ret;
3885ed95b21aSIlya Dryomov
3886ed95b21aSIlya Dryomov ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
3887ed95b21aSIlya Dryomov &rbd_dev->header_oloc, RBD_LOCK_NAME,
3888f38cb9d9SIlya Dryomov &lock_type, &lock_tag, &lockers, &num_lockers);
3889f38cb9d9SIlya Dryomov if (ret) {
38909d01e07fSIlya Dryomov rbd_warn(rbd_dev, "failed to get header lockers: %d", ret);
3891f38cb9d9SIlya Dryomov return ERR_PTR(ret);
3892f38cb9d9SIlya Dryomov }
3893ed95b21aSIlya Dryomov
3894f38cb9d9SIlya Dryomov if (num_lockers == 0) {
3895ed95b21aSIlya Dryomov dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
3896f38cb9d9SIlya Dryomov lockers = NULL;
3897ed95b21aSIlya Dryomov goto out;
3898ed95b21aSIlya Dryomov }
3899ed95b21aSIlya Dryomov
3900ed95b21aSIlya Dryomov if (strcmp(lock_tag, RBD_LOCK_TAG)) {
3901ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
3902ed95b21aSIlya Dryomov lock_tag);
3903f38cb9d9SIlya Dryomov goto err_busy;
3904ed95b21aSIlya Dryomov }
3905ed95b21aSIlya Dryomov
39068ff2c64cSIlya Dryomov if (lock_type != CEPH_CLS_LOCK_EXCLUSIVE) {
39078ff2c64cSIlya Dryomov rbd_warn(rbd_dev, "incompatible lock type detected");
3908f38cb9d9SIlya Dryomov goto err_busy;
3909ed95b21aSIlya Dryomov }
3910ed95b21aSIlya Dryomov
3911f38cb9d9SIlya Dryomov WARN_ON(num_lockers != 1);
39128ff2c64cSIlya Dryomov ret = sscanf(lockers[0].id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu",
39138ff2c64cSIlya Dryomov &handle);
39148ff2c64cSIlya Dryomov if (ret != 1) {
3915ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
3916f38cb9d9SIlya Dryomov lockers[0].id.cookie);
3917f38cb9d9SIlya Dryomov goto err_busy;
3918ed95b21aSIlya Dryomov }
39198ff2c64cSIlya Dryomov if (ceph_addr_is_blank(&lockers[0].info.addr)) {
39208ff2c64cSIlya Dryomov rbd_warn(rbd_dev, "locker has a blank address");
39218ff2c64cSIlya Dryomov goto err_busy;
39228ff2c64cSIlya Dryomov }
39238ff2c64cSIlya Dryomov
39248ff2c64cSIlya Dryomov dout("%s rbd_dev %p got locker %s%llu@%pISpc/%u handle %llu\n",
39258ff2c64cSIlya Dryomov __func__, rbd_dev, ENTITY_NAME(lockers[0].id.name),
39268ff2c64cSIlya Dryomov &lockers[0].info.addr.in_addr,
39278ff2c64cSIlya Dryomov le32_to_cpu(lockers[0].info.addr.nonce), handle);
3928ed95b21aSIlya Dryomov
3929ed95b21aSIlya Dryomov out:
3930ed95b21aSIlya Dryomov kfree(lock_tag);
3931f38cb9d9SIlya Dryomov return lockers;
3932f38cb9d9SIlya Dryomov
3933f38cb9d9SIlya Dryomov err_busy:
3934f38cb9d9SIlya Dryomov kfree(lock_tag);
3935f38cb9d9SIlya Dryomov ceph_free_lockers(lockers, num_lockers);
3936f38cb9d9SIlya Dryomov return ERR_PTR(-EBUSY);
3937ed95b21aSIlya Dryomov }
3938ed95b21aSIlya Dryomov
find_watcher(struct rbd_device * rbd_dev,const struct ceph_locker * locker)3939ed95b21aSIlya Dryomov static int find_watcher(struct rbd_device *rbd_dev,
3940ed95b21aSIlya Dryomov const struct ceph_locker *locker)
3941ed95b21aSIlya Dryomov {
3942ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3943ed95b21aSIlya Dryomov struct ceph_watch_item *watchers;
3944ed95b21aSIlya Dryomov u32 num_watchers;
3945ed95b21aSIlya Dryomov u64 cookie;
3946ed95b21aSIlya Dryomov int i;
3947ed95b21aSIlya Dryomov int ret;
3948ed95b21aSIlya Dryomov
3949ed95b21aSIlya Dryomov ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
3950ed95b21aSIlya Dryomov &rbd_dev->header_oloc, &watchers,
3951ed95b21aSIlya Dryomov &num_watchers);
39529d01e07fSIlya Dryomov if (ret) {
39539d01e07fSIlya Dryomov rbd_warn(rbd_dev, "failed to get watchers: %d", ret);
3954ed95b21aSIlya Dryomov return ret;
39559d01e07fSIlya Dryomov }
3956ed95b21aSIlya Dryomov
3957ed95b21aSIlya Dryomov sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
3958ed95b21aSIlya Dryomov for (i = 0; i < num_watchers; i++) {
3959313771e8SIlya Dryomov /*
3960313771e8SIlya Dryomov * Ignore addr->type while comparing. This mimics
3961313771e8SIlya Dryomov * entity_addr_t::get_legacy_str() + strcmp().
3962313771e8SIlya Dryomov */
3963313771e8SIlya Dryomov if (ceph_addr_equal_no_type(&watchers[i].addr,
3964313771e8SIlya Dryomov &locker->info.addr) &&
3965ed95b21aSIlya Dryomov watchers[i].cookie == cookie) {
3966ed95b21aSIlya Dryomov struct rbd_client_id cid = {
3967ed95b21aSIlya Dryomov .gid = le64_to_cpu(watchers[i].name.num),
3968ed95b21aSIlya Dryomov .handle = cookie,
3969ed95b21aSIlya Dryomov };
3970ed95b21aSIlya Dryomov
3971ed95b21aSIlya Dryomov dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
3972ed95b21aSIlya Dryomov rbd_dev, cid.gid, cid.handle);
3973ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &cid);
3974ed95b21aSIlya Dryomov ret = 1;
3975ed95b21aSIlya Dryomov goto out;
3976ed95b21aSIlya Dryomov }
3977ed95b21aSIlya Dryomov }
3978ed95b21aSIlya Dryomov
3979ed95b21aSIlya Dryomov dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
3980ed95b21aSIlya Dryomov ret = 0;
3981ed95b21aSIlya Dryomov out:
3982ed95b21aSIlya Dryomov kfree(watchers);
3983ed95b21aSIlya Dryomov return ret;
3984ed95b21aSIlya Dryomov }
3985ed95b21aSIlya Dryomov
3986ed95b21aSIlya Dryomov /*
3987ed95b21aSIlya Dryomov * lock_rwsem must be held for write
3988ed95b21aSIlya Dryomov */
rbd_try_lock(struct rbd_device * rbd_dev)3989ed95b21aSIlya Dryomov static int rbd_try_lock(struct rbd_device *rbd_dev)
3990ed95b21aSIlya Dryomov {
3991ed95b21aSIlya Dryomov struct ceph_client *client = rbd_dev->rbd_client->client;
399258815900SIlya Dryomov struct ceph_locker *locker, *refreshed_locker;
3993ed95b21aSIlya Dryomov int ret;
3994ed95b21aSIlya Dryomov
3995ed95b21aSIlya Dryomov for (;;) {
399658815900SIlya Dryomov locker = refreshed_locker = NULL;
3997f38cb9d9SIlya Dryomov
3998ed95b21aSIlya Dryomov ret = rbd_lock(rbd_dev);
39999d01e07fSIlya Dryomov if (!ret)
4000f38cb9d9SIlya Dryomov goto out;
40019d01e07fSIlya Dryomov if (ret != -EBUSY) {
40029d01e07fSIlya Dryomov rbd_warn(rbd_dev, "failed to lock header: %d", ret);
40039d01e07fSIlya Dryomov goto out;
40049d01e07fSIlya Dryomov }
4005ed95b21aSIlya Dryomov
4006ed95b21aSIlya Dryomov /* determine if the current lock holder is still alive */
4007f38cb9d9SIlya Dryomov locker = get_lock_owner_info(rbd_dev);
4008f38cb9d9SIlya Dryomov if (IS_ERR(locker)) {
4009f38cb9d9SIlya Dryomov ret = PTR_ERR(locker);
4010f38cb9d9SIlya Dryomov locker = NULL;
4011f38cb9d9SIlya Dryomov goto out;
4012f38cb9d9SIlya Dryomov }
4013f38cb9d9SIlya Dryomov if (!locker)
4014ed95b21aSIlya Dryomov goto again;
4015ed95b21aSIlya Dryomov
4016f38cb9d9SIlya Dryomov ret = find_watcher(rbd_dev, locker);
4017637cd060SIlya Dryomov if (ret)
4018637cd060SIlya Dryomov goto out; /* request lock or error */
4019ed95b21aSIlya Dryomov
402058815900SIlya Dryomov refreshed_locker = get_lock_owner_info(rbd_dev);
402158815900SIlya Dryomov if (IS_ERR(refreshed_locker)) {
402258815900SIlya Dryomov ret = PTR_ERR(refreshed_locker);
402358815900SIlya Dryomov refreshed_locker = NULL;
402458815900SIlya Dryomov goto out;
402558815900SIlya Dryomov }
402658815900SIlya Dryomov if (!refreshed_locker ||
402758815900SIlya Dryomov !locker_equal(locker, refreshed_locker))
402858815900SIlya Dryomov goto again;
402958815900SIlya Dryomov
403022e8bd51SIlya Dryomov rbd_warn(rbd_dev, "breaking header lock owned by %s%llu",
4031f38cb9d9SIlya Dryomov ENTITY_NAME(locker->id.name));
4032ed95b21aSIlya Dryomov
40330b98acd6SIlya Dryomov ret = ceph_monc_blocklist_add(&client->monc,
4034f38cb9d9SIlya Dryomov &locker->info.addr);
4035ed95b21aSIlya Dryomov if (ret) {
4036f38cb9d9SIlya Dryomov rbd_warn(rbd_dev, "failed to blocklist %s%llu: %d",
4037f38cb9d9SIlya Dryomov ENTITY_NAME(locker->id.name), ret);
4038ed95b21aSIlya Dryomov goto out;
4039ed95b21aSIlya Dryomov }
4040ed95b21aSIlya Dryomov
4041ed95b21aSIlya Dryomov ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
4042ed95b21aSIlya Dryomov &rbd_dev->header_oloc, RBD_LOCK_NAME,
4043f38cb9d9SIlya Dryomov locker->id.cookie, &locker->id.name);
4044f38cb9d9SIlya Dryomov if (ret && ret != -ENOENT) {
4045f38cb9d9SIlya Dryomov rbd_warn(rbd_dev, "failed to break header lock: %d",
4046f38cb9d9SIlya Dryomov ret);
4047ed95b21aSIlya Dryomov goto out;
4048f38cb9d9SIlya Dryomov }
4049ed95b21aSIlya Dryomov
4050ed95b21aSIlya Dryomov again:
405158815900SIlya Dryomov free_locker(refreshed_locker);
4052f38cb9d9SIlya Dryomov free_locker(locker);
4053ed95b21aSIlya Dryomov }
4054ed95b21aSIlya Dryomov
4055ed95b21aSIlya Dryomov out:
405658815900SIlya Dryomov free_locker(refreshed_locker);
4057f38cb9d9SIlya Dryomov free_locker(locker);
4058ed95b21aSIlya Dryomov return ret;
4059ed95b21aSIlya Dryomov }
4060ed95b21aSIlya Dryomov
rbd_post_acquire_action(struct rbd_device * rbd_dev)406122e8bd51SIlya Dryomov static int rbd_post_acquire_action(struct rbd_device *rbd_dev)
4062ed95b21aSIlya Dryomov {
406322e8bd51SIlya Dryomov int ret;
406422e8bd51SIlya Dryomov
4065870611e4SIlya Dryomov ret = rbd_dev_refresh(rbd_dev);
4066870611e4SIlya Dryomov if (ret)
4067870611e4SIlya Dryomov return ret;
4068870611e4SIlya Dryomov
406922e8bd51SIlya Dryomov if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) {
407022e8bd51SIlya Dryomov ret = rbd_object_map_open(rbd_dev);
407122e8bd51SIlya Dryomov if (ret)
407222e8bd51SIlya Dryomov return ret;
407322e8bd51SIlya Dryomov }
407422e8bd51SIlya Dryomov
407522e8bd51SIlya Dryomov return 0;
407622e8bd51SIlya Dryomov }
407722e8bd51SIlya Dryomov
4078ed95b21aSIlya Dryomov /*
4079637cd060SIlya Dryomov * Return:
4080637cd060SIlya Dryomov * 0 - lock acquired
4081637cd060SIlya Dryomov * 1 - caller should call rbd_request_lock()
4082637cd060SIlya Dryomov * <0 - error
4083ed95b21aSIlya Dryomov */
rbd_try_acquire_lock(struct rbd_device * rbd_dev)4084637cd060SIlya Dryomov static int rbd_try_acquire_lock(struct rbd_device *rbd_dev)
4085ed95b21aSIlya Dryomov {
4086637cd060SIlya Dryomov int ret;
4087ed95b21aSIlya Dryomov
4088ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem);
4089ed95b21aSIlya Dryomov dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
4090ed95b21aSIlya Dryomov rbd_dev->lock_state);
4091ed95b21aSIlya Dryomov if (__rbd_is_lock_owner(rbd_dev)) {
4092ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem);
4093637cd060SIlya Dryomov return 0;
4094ed95b21aSIlya Dryomov }
4095ed95b21aSIlya Dryomov
4096ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem);
4097ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem);
4098ed95b21aSIlya Dryomov dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
4099ed95b21aSIlya Dryomov rbd_dev->lock_state);
4100637cd060SIlya Dryomov if (__rbd_is_lock_owner(rbd_dev)) {
4101637cd060SIlya Dryomov up_write(&rbd_dev->lock_rwsem);
4102637cd060SIlya Dryomov return 0;
4103ed95b21aSIlya Dryomov }
4104ed95b21aSIlya Dryomov
4105637cd060SIlya Dryomov ret = rbd_try_lock(rbd_dev);
4106637cd060SIlya Dryomov if (ret < 0) {
41079d01e07fSIlya Dryomov rbd_warn(rbd_dev, "failed to acquire lock: %d", ret);
4108637cd060SIlya Dryomov goto out;
4109637cd060SIlya Dryomov }
4110637cd060SIlya Dryomov if (ret > 0) {
4111ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem);
4112637cd060SIlya Dryomov return ret;
4113637cd060SIlya Dryomov }
4114637cd060SIlya Dryomov
4115637cd060SIlya Dryomov rbd_assert(rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED);
4116637cd060SIlya Dryomov rbd_assert(list_empty(&rbd_dev->running_list));
4117637cd060SIlya Dryomov
411822e8bd51SIlya Dryomov ret = rbd_post_acquire_action(rbd_dev);
411922e8bd51SIlya Dryomov if (ret) {
412022e8bd51SIlya Dryomov rbd_warn(rbd_dev, "post-acquire action failed: %d", ret);
412122e8bd51SIlya Dryomov /*
412222e8bd51SIlya Dryomov * Can't stay in RBD_LOCK_STATE_LOCKED because
412322e8bd51SIlya Dryomov * rbd_lock_add_request() would let the request through,
412422e8bd51SIlya Dryomov * assuming that e.g. object map is locked and loaded.
412522e8bd51SIlya Dryomov */
412622e8bd51SIlya Dryomov rbd_unlock(rbd_dev);
412722e8bd51SIlya Dryomov }
412822e8bd51SIlya Dryomov
4129637cd060SIlya Dryomov out:
4130637cd060SIlya Dryomov wake_lock_waiters(rbd_dev, ret);
4131637cd060SIlya Dryomov up_write(&rbd_dev->lock_rwsem);
4132637cd060SIlya Dryomov return ret;
4133ed95b21aSIlya Dryomov }
4134ed95b21aSIlya Dryomov
rbd_acquire_lock(struct work_struct * work)4135ed95b21aSIlya Dryomov static void rbd_acquire_lock(struct work_struct *work)
4136ed95b21aSIlya Dryomov {
4137ed95b21aSIlya Dryomov struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
4138ed95b21aSIlya Dryomov struct rbd_device, lock_dwork);
4139637cd060SIlya Dryomov int ret;
4140ed95b21aSIlya Dryomov
4141ed95b21aSIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev);
4142ed95b21aSIlya Dryomov again:
4143637cd060SIlya Dryomov ret = rbd_try_acquire_lock(rbd_dev);
4144637cd060SIlya Dryomov if (ret <= 0) {
4145637cd060SIlya Dryomov dout("%s rbd_dev %p ret %d - done\n", __func__, rbd_dev, ret);
4146ed95b21aSIlya Dryomov return;
4147ed95b21aSIlya Dryomov }
4148ed95b21aSIlya Dryomov
4149ed95b21aSIlya Dryomov ret = rbd_request_lock(rbd_dev);
4150ed95b21aSIlya Dryomov if (ret == -ETIMEDOUT) {
4151ed95b21aSIlya Dryomov goto again; /* treat this as a dead client */
4152e010dd0aSIlya Dryomov } else if (ret == -EROFS) {
4153e010dd0aSIlya Dryomov rbd_warn(rbd_dev, "peer will not release lock");
4154637cd060SIlya Dryomov down_write(&rbd_dev->lock_rwsem);
4155637cd060SIlya Dryomov wake_lock_waiters(rbd_dev, ret);
4156637cd060SIlya Dryomov up_write(&rbd_dev->lock_rwsem);
4157ed95b21aSIlya Dryomov } else if (ret < 0) {
4158ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "error requesting lock: %d", ret);
4159ed95b21aSIlya Dryomov mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
4160ed95b21aSIlya Dryomov RBD_RETRY_DELAY);
4161ed95b21aSIlya Dryomov } else {
4162ed95b21aSIlya Dryomov /*
4163ed95b21aSIlya Dryomov * lock owner acked, but resend if we don't see them
4164ed95b21aSIlya Dryomov * release the lock
4165ed95b21aSIlya Dryomov */
41666b0a8774SColin Ian King dout("%s rbd_dev %p requeuing lock_dwork\n", __func__,
4167ed95b21aSIlya Dryomov rbd_dev);
4168ed95b21aSIlya Dryomov mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
4169ed95b21aSIlya Dryomov msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
4170ed95b21aSIlya Dryomov }
4171ed95b21aSIlya Dryomov }
4172ed95b21aSIlya Dryomov
rbd_quiesce_lock(struct rbd_device * rbd_dev)4173a2b1da09SIlya Dryomov static bool rbd_quiesce_lock(struct rbd_device *rbd_dev)
4174ed95b21aSIlya Dryomov {
4175a2b1da09SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev);
4176d9b9c893SLinus Torvalds lockdep_assert_held_write(&rbd_dev->lock_rwsem);
4177a2b1da09SIlya Dryomov
4178ed95b21aSIlya Dryomov if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
4179ed95b21aSIlya Dryomov return false;
4180ed95b21aSIlya Dryomov
4181ed95b21aSIlya Dryomov /*
4182ed95b21aSIlya Dryomov * Ensure that all in-flight IO is flushed.
4183ed95b21aSIlya Dryomov */
4184e1fddc8fSIlya Dryomov rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
4185e1fddc8fSIlya Dryomov rbd_assert(!completion_done(&rbd_dev->releasing_wait));
4186ed9eb710SIlya Dryomov if (list_empty(&rbd_dev->running_list))
4187ed9eb710SIlya Dryomov return true;
4188ed9eb710SIlya Dryomov
4189ed9eb710SIlya Dryomov up_write(&rbd_dev->lock_rwsem);
4190e1fddc8fSIlya Dryomov wait_for_completion(&rbd_dev->releasing_wait);
4191ed95b21aSIlya Dryomov
4192ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem);
4193ed95b21aSIlya Dryomov if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
4194ed95b21aSIlya Dryomov return false;
4195ed95b21aSIlya Dryomov
4196e1fddc8fSIlya Dryomov rbd_assert(list_empty(&rbd_dev->running_list));
4197a2b1da09SIlya Dryomov return true;
4198a2b1da09SIlya Dryomov }
4199a2b1da09SIlya Dryomov
rbd_pre_release_action(struct rbd_device * rbd_dev)420022e8bd51SIlya Dryomov static void rbd_pre_release_action(struct rbd_device *rbd_dev)
420122e8bd51SIlya Dryomov {
420222e8bd51SIlya Dryomov if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)
420322e8bd51SIlya Dryomov rbd_object_map_close(rbd_dev);
420422e8bd51SIlya Dryomov }
420522e8bd51SIlya Dryomov
__rbd_release_lock(struct rbd_device * rbd_dev)4206e1fddc8fSIlya Dryomov static void __rbd_release_lock(struct rbd_device *rbd_dev)
4207e1fddc8fSIlya Dryomov {
4208e1fddc8fSIlya Dryomov rbd_assert(list_empty(&rbd_dev->running_list));
4209e1fddc8fSIlya Dryomov
421022e8bd51SIlya Dryomov rbd_pre_release_action(rbd_dev);
4211bbead745SIlya Dryomov rbd_unlock(rbd_dev);
4212e1fddc8fSIlya Dryomov }
4213e1fddc8fSIlya Dryomov
4214a2b1da09SIlya Dryomov /*
4215a2b1da09SIlya Dryomov * lock_rwsem must be held for write
4216a2b1da09SIlya Dryomov */
rbd_release_lock(struct rbd_device * rbd_dev)4217a2b1da09SIlya Dryomov static void rbd_release_lock(struct rbd_device *rbd_dev)
4218a2b1da09SIlya Dryomov {
4219a2b1da09SIlya Dryomov if (!rbd_quiesce_lock(rbd_dev))
4220a2b1da09SIlya Dryomov return;
4221a2b1da09SIlya Dryomov
4222e1fddc8fSIlya Dryomov __rbd_release_lock(rbd_dev);
4223a2b1da09SIlya Dryomov
4224ed95b21aSIlya Dryomov /*
4225ed95b21aSIlya Dryomov * Give others a chance to grab the lock - we would re-acquire
4226637cd060SIlya Dryomov * almost immediately if we got new IO while draining the running
4227637cd060SIlya Dryomov * list otherwise. We need to ack our own notifications, so this
4228637cd060SIlya Dryomov * lock_dwork will be requeued from rbd_handle_released_lock() by
4229637cd060SIlya Dryomov * way of maybe_kick_acquire().
4230ed95b21aSIlya Dryomov */
4231ed95b21aSIlya Dryomov cancel_delayed_work(&rbd_dev->lock_dwork);
4232ed95b21aSIlya Dryomov }
4233ed95b21aSIlya Dryomov
rbd_release_lock_work(struct work_struct * work)4234ed95b21aSIlya Dryomov static void rbd_release_lock_work(struct work_struct *work)
4235ed95b21aSIlya Dryomov {
4236ed95b21aSIlya Dryomov struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
4237ed95b21aSIlya Dryomov unlock_work);
4238ed95b21aSIlya Dryomov
4239ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem);
4240ed95b21aSIlya Dryomov rbd_release_lock(rbd_dev);
4241ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem);
4242ed95b21aSIlya Dryomov }
4243ed95b21aSIlya Dryomov
maybe_kick_acquire(struct rbd_device * rbd_dev)4244637cd060SIlya Dryomov static void maybe_kick_acquire(struct rbd_device *rbd_dev)
4245637cd060SIlya Dryomov {
4246637cd060SIlya Dryomov bool have_requests;
4247637cd060SIlya Dryomov
4248637cd060SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev);
4249637cd060SIlya Dryomov if (__rbd_is_lock_owner(rbd_dev))
4250637cd060SIlya Dryomov return;
4251637cd060SIlya Dryomov
4252637cd060SIlya Dryomov spin_lock(&rbd_dev->lock_lists_lock);
4253637cd060SIlya Dryomov have_requests = !list_empty(&rbd_dev->acquiring_list);
4254637cd060SIlya Dryomov spin_unlock(&rbd_dev->lock_lists_lock);
4255637cd060SIlya Dryomov if (have_requests || delayed_work_pending(&rbd_dev->lock_dwork)) {
4256637cd060SIlya Dryomov dout("%s rbd_dev %p kicking lock_dwork\n", __func__, rbd_dev);
4257637cd060SIlya Dryomov mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
4258637cd060SIlya Dryomov }
4259637cd060SIlya Dryomov }
4260637cd060SIlya Dryomov
/*
 * Handle a notification that some client acquired the lock.
 *
 * For @struct_v >= 2 the sender's client id (gid, handle) is decoded
 * from *@p; otherwise it stays empty.  A non-empty id that differs from
 * the recorded owner becomes the new owner_cid.  Finally lock_dwork is
 * kicked if needed, with lock_rwsem held for read.
 */
static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
				     void **p)
{
	struct rbd_client_id cid = { 0 };

	if (struct_v >= 2) {
		cid.gid = ceph_decode_64(p);
		cid.handle = ceph_decode_64(p);
	}

	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
	     cid.handle);

	if (rbd_cid_equal(&cid, &rbd_empty_cid)) {
		down_read(&rbd_dev->lock_rwsem);
	} else {
		/* owner_cid is updated under the write lock */
		down_write(&rbd_dev->lock_rwsem);
		if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid))
			rbd_set_owner_cid(rbd_dev, &cid);
		else
			dout("%s rbd_dev %p cid %llu-%llu == owner_cid\n",
			     __func__, rbd_dev, cid.gid, cid.handle);
		downgrade_write(&rbd_dev->lock_rwsem);
	}

	/* lock_rwsem is held for read on both paths from here on */
	maybe_kick_acquire(rbd_dev);
	up_read(&rbd_dev->lock_rwsem);
}
4289ed95b21aSIlya Dryomov
/*
 * Handle a notification that some client released the lock.
 *
 * For @struct_v >= 2 the sender's client id (gid, handle) is decoded
 * from *@p; otherwise it stays empty.  If a non-empty id matches the
 * recorded owner, owner_cid is reset to the empty id.  Finally
 * lock_dwork is kicked if needed, with lock_rwsem held for read.
 */
static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
				     void **p)
{
	struct rbd_client_id cid = { 0 };

	if (struct_v >= 2) {
		cid.gid = ceph_decode_64(p);
		cid.handle = ceph_decode_64(p);
	}

	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
	     cid.handle);

	if (rbd_cid_equal(&cid, &rbd_empty_cid)) {
		down_read(&rbd_dev->lock_rwsem);
	} else {
		/* owner_cid is updated under the write lock */
		down_write(&rbd_dev->lock_rwsem);
		if (rbd_cid_equal(&cid, &rbd_dev->owner_cid))
			rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
		else
			dout("%s rbd_dev %p cid %llu-%llu != owner_cid %llu-%llu\n",
			     __func__, rbd_dev, cid.gid, cid.handle,
			     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
		downgrade_write(&rbd_dev->lock_rwsem);
	}

	/* lock_rwsem is held for read on both paths from here on */
	maybe_kick_acquire(rbd_dev);
	up_read(&rbd_dev->lock_rwsem);
}
4319ed95b21aSIlya Dryomov
43203b77faa0SIlya Dryomov /*
43213b77faa0SIlya Dryomov * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no
43223b77faa0SIlya Dryomov * ResponseMessage is needed.
43233b77faa0SIlya Dryomov */
rbd_handle_request_lock(struct rbd_device * rbd_dev,u8 struct_v,void ** p)43243b77faa0SIlya Dryomov static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
4325ed95b21aSIlya Dryomov void **p)
4326ed95b21aSIlya Dryomov {
4327ed95b21aSIlya Dryomov struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
4328ed95b21aSIlya Dryomov struct rbd_client_id cid = { 0 };
43293b77faa0SIlya Dryomov int result = 1;
4330ed95b21aSIlya Dryomov
4331ed95b21aSIlya Dryomov if (struct_v >= 2) {
4332ed95b21aSIlya Dryomov cid.gid = ceph_decode_64(p);
4333ed95b21aSIlya Dryomov cid.handle = ceph_decode_64(p);
4334ed95b21aSIlya Dryomov }
4335ed95b21aSIlya Dryomov
4336ed95b21aSIlya Dryomov dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
4337ed95b21aSIlya Dryomov cid.handle);
4338ed95b21aSIlya Dryomov if (rbd_cid_equal(&cid, &my_cid))
43393b77faa0SIlya Dryomov return result;
4340ed95b21aSIlya Dryomov
4341ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem);
43423b77faa0SIlya Dryomov if (__rbd_is_lock_owner(rbd_dev)) {
43433b77faa0SIlya Dryomov if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED &&
43443b77faa0SIlya Dryomov rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid))
43453b77faa0SIlya Dryomov goto out_unlock;
43463b77faa0SIlya Dryomov
43473b77faa0SIlya Dryomov /*
43483b77faa0SIlya Dryomov * encode ResponseMessage(0) so the peer can detect
43493b77faa0SIlya Dryomov * a missing owner
43503b77faa0SIlya Dryomov */
43513b77faa0SIlya Dryomov result = 0;
43523b77faa0SIlya Dryomov
4353ed95b21aSIlya Dryomov if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
4354e010dd0aSIlya Dryomov if (!rbd_dev->opts->exclusive) {
4355e010dd0aSIlya Dryomov dout("%s rbd_dev %p queueing unlock_work\n",
4356e010dd0aSIlya Dryomov __func__, rbd_dev);
4357e010dd0aSIlya Dryomov queue_work(rbd_dev->task_wq,
4358e010dd0aSIlya Dryomov &rbd_dev->unlock_work);
4359e010dd0aSIlya Dryomov } else {
4360e010dd0aSIlya Dryomov /* refuse to release the lock */
4361e010dd0aSIlya Dryomov result = -EROFS;
4362ed95b21aSIlya Dryomov }
4363ed95b21aSIlya Dryomov }
4364ed95b21aSIlya Dryomov }
43653b77faa0SIlya Dryomov
43663b77faa0SIlya Dryomov out_unlock:
4367ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem);
43683b77faa0SIlya Dryomov return result;
4369ed95b21aSIlya Dryomov }
4370ed95b21aSIlya Dryomov
/*
 * Acknowledge a notify on the header object.
 *
 * When @result is non-NULL, a ResponseMessage holding *result is encoded
 * into an on-stack buffer and sent as the ack payload; when it is NULL
 * the ack is sent with a zero-length payload.  A failure of
 * ceph_osdc_notify_ack() is only logged.
 */
static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
				     u64 notify_id, u64 cookie, s32 *result)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	char buf[4 + CEPH_ENCODING_START_BLK_LEN];
	int payload_len = 0;
	int err;

	if (result) {
		void *pos = buf;

		payload_len = sizeof(buf);

		/* encode ResponseMessage */
		ceph_start_encoding(&pos, 1, 1,
				    payload_len - CEPH_ENCODING_START_BLK_LEN);
		ceph_encode_32(&pos, *result);
	}

	err = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
				   &rbd_dev->header_oloc, notify_id, cookie,
				   buf, payload_len);
	if (err)
		rbd_warn(rbd_dev, "acknowledge_notify failed: %d", err);
}
4396ed95b21aSIlya Dryomov
/*
 * Acknowledge a notify without a ResponseMessage payload.
 */
static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
				   u64 cookie)
{
	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	/* NULL result => empty ack payload */
	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
}
4403ed95b21aSIlya Dryomov
/*
 * Acknowledge a notify and include @result as an encoded ResponseMessage.
 */
static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
					  u64 notify_id, u64 cookie, s32 result)
{
	dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);

	/* pass the by-value result by address so it gets encoded */
	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
}
4410922dab61SIlya Dryomov
/*
 * Watch callback registered through ceph_osdc_watch().
 *
 * @arg is the rbd_device.  A zero-length payload is treated as a legacy
 * header-update notification; otherwise the payload is decoded as a
 * NotifyMessage and its notify_op selects the handler.  Every handled
 * notification is acknowledged; a payload that fails to decode is only
 * logged and not acknowledged.  @notifier_id is not used.
 */
static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
			 u64 notifier_id, void *data, size_t data_len)
{
	struct rbd_device *rbd_dev = arg;
	void *p = data;
	void *const end = p + data_len;
	u8 struct_v = 0;
	u32 notify_op;
	u32 len;
	int ret;

	dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
	     __func__, rbd_dev, cookie, notify_id, data_len);

	if (!data_len) {
		/* legacy notification for header updates */
		notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
		len = 0;
	} else {
		ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
					  &struct_v, &len);
		if (ret) {
			rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
				 ret);
			return;
		}

		notify_op = ceph_decode_32(&p);
	}

	dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);

	switch (notify_op) {
	case RBD_NOTIFY_OP_ACQUIRED_LOCK:
		rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	case RBD_NOTIFY_OP_RELEASED_LOCK:
		rbd_handle_released_lock(rbd_dev, struct_v, &p);
		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	case RBD_NOTIFY_OP_REQUEST_LOCK:
		/* > 0 means "no ResponseMessage", <= 0 is the value to send */
		ret = rbd_handle_request_lock(rbd_dev, struct_v, &p);
		if (ret > 0)
			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		else
			rbd_acknowledge_notify_result(rbd_dev, notify_id,
						      cookie, ret);
		break;
	case RBD_NOTIFY_OP_HEADER_UPDATE:
		ret = rbd_dev_refresh(rbd_dev);
		if (ret)
			rbd_warn(rbd_dev, "refresh failed: %d", ret);

		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	default:
		/* unknown op: only a lock owner answers with -EOPNOTSUPP */
		if (!rbd_is_lock_owner(rbd_dev))
			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		else
			rbd_acknowledge_notify_result(rbd_dev, notify_id,
						      cookie, -EOPNOTSUPP);
		break;
	}
}
44749969ebc5SAlex Elder
447599d16943SIlya Dryomov static void __rbd_unregister_watch(struct rbd_device *rbd_dev);
44769969ebc5SAlex Elder
/*
 * Watch error callback registered through ceph_osdc_watch().
 *
 * Logs the error and resets owner_cid to rbd_empty_cid under lock_rwsem.
 * If the watch is currently registered it is then torn down, watch_state
 * becomes RBD_WATCH_STATE_ERROR and watch_dwork is queued with no delay.
 * @cookie is not used.
 */
static void rbd_watch_errcb(void *arg, u64 cookie, int err)
{
	struct rbd_device *rbd_dev = arg;

	rbd_warn(rbd_dev, "encountered watch error: %d", err);

	down_write(&rbd_dev->lock_rwsem);
	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
	up_write(&rbd_dev->lock_rwsem);

	mutex_lock(&rbd_dev->watch_mutex);
	if (rbd_dev->watch_state != RBD_WATCH_STATE_REGISTERED)
		goto out_unlock;

	__rbd_unregister_watch(rbd_dev);
	rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;

	queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);

out_unlock:
	mutex_unlock(&rbd_dev->watch_mutex);
}
4496bb040aa0SIlya Dryomov
4497bb040aa0SIlya Dryomov /*
449899d16943SIlya Dryomov * watch_mutex must be locked
44999969ebc5SAlex Elder */
__rbd_register_watch(struct rbd_device * rbd_dev)450099d16943SIlya Dryomov static int __rbd_register_watch(struct rbd_device *rbd_dev)
45019969ebc5SAlex Elder {
45029969ebc5SAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4503922dab61SIlya Dryomov struct ceph_osd_linger_request *handle;
45049969ebc5SAlex Elder
4505922dab61SIlya Dryomov rbd_assert(!rbd_dev->watch_handle);
450699d16943SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev);
45079969ebc5SAlex Elder
4508922dab61SIlya Dryomov handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
4509922dab61SIlya Dryomov &rbd_dev->header_oloc, rbd_watch_cb,
4510922dab61SIlya Dryomov rbd_watch_errcb, rbd_dev);
4511922dab61SIlya Dryomov if (IS_ERR(handle))
4512922dab61SIlya Dryomov return PTR_ERR(handle);
45139969ebc5SAlex Elder
4514922dab61SIlya Dryomov rbd_dev->watch_handle = handle;
45158eb87565SAlex Elder return 0;
45169969ebc5SAlex Elder }
45179969ebc5SAlex Elder
451899d16943SIlya Dryomov /*
451999d16943SIlya Dryomov * watch_mutex must be locked
452099d16943SIlya Dryomov */
__rbd_unregister_watch(struct rbd_device * rbd_dev)452199d16943SIlya Dryomov static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
4522fca27065SIlya Dryomov {
4523922dab61SIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4524922dab61SIlya Dryomov int ret;
4525b30a01f2SIlya Dryomov
452699d16943SIlya Dryomov rbd_assert(rbd_dev->watch_handle);
452799d16943SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev);
4528b30a01f2SIlya Dryomov
4529922dab61SIlya Dryomov ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
4530922dab61SIlya Dryomov if (ret)
4531922dab61SIlya Dryomov rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
4532b30a01f2SIlya Dryomov
4533922dab61SIlya Dryomov rbd_dev->watch_handle = NULL;
4534c525f036SIlya Dryomov }
4535c525f036SIlya Dryomov
rbd_register_watch(struct rbd_device * rbd_dev)453699d16943SIlya Dryomov static int rbd_register_watch(struct rbd_device *rbd_dev)
4537c525f036SIlya Dryomov {
453899d16943SIlya Dryomov int ret;
4539811c6688SIlya Dryomov
454099d16943SIlya Dryomov mutex_lock(&rbd_dev->watch_mutex);
454199d16943SIlya Dryomov rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
454299d16943SIlya Dryomov ret = __rbd_register_watch(rbd_dev);
454399d16943SIlya Dryomov if (ret)
454499d16943SIlya Dryomov goto out;
454599d16943SIlya Dryomov
454699d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
454799d16943SIlya Dryomov rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
454899d16943SIlya Dryomov
454999d16943SIlya Dryomov out:
455099d16943SIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex);
455199d16943SIlya Dryomov return ret;
455299d16943SIlya Dryomov }
455399d16943SIlya Dryomov
cancel_tasks_sync(struct rbd_device * rbd_dev)455499d16943SIlya Dryomov static void cancel_tasks_sync(struct rbd_device *rbd_dev)
455599d16943SIlya Dryomov {
455699d16943SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev);
455799d16943SIlya Dryomov
4558ed95b21aSIlya Dryomov cancel_work_sync(&rbd_dev->acquired_lock_work);
4559ed95b21aSIlya Dryomov cancel_work_sync(&rbd_dev->released_lock_work);
4560ed95b21aSIlya Dryomov cancel_delayed_work_sync(&rbd_dev->lock_dwork);
4561ed95b21aSIlya Dryomov cancel_work_sync(&rbd_dev->unlock_work);
456299d16943SIlya Dryomov }
456399d16943SIlya Dryomov
45640e4e1de5SIlya Dryomov /*
45650e4e1de5SIlya Dryomov * header_rwsem must not be held to avoid a deadlock with
45660e4e1de5SIlya Dryomov * rbd_dev_refresh() when flushing notifies.
45670e4e1de5SIlya Dryomov */
rbd_unregister_watch(struct rbd_device * rbd_dev)456899d16943SIlya Dryomov static void rbd_unregister_watch(struct rbd_device *rbd_dev)
456999d16943SIlya Dryomov {
457099d16943SIlya Dryomov cancel_tasks_sync(rbd_dev);
457199d16943SIlya Dryomov
457299d16943SIlya Dryomov mutex_lock(&rbd_dev->watch_mutex);
457399d16943SIlya Dryomov if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
457499d16943SIlya Dryomov __rbd_unregister_watch(rbd_dev);
457599d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
457699d16943SIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex);
457799d16943SIlya Dryomov
457823edca86SDongsheng Yang cancel_delayed_work_sync(&rbd_dev->watch_dwork);
4579811c6688SIlya Dryomov ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
4580fca27065SIlya Dryomov }
4581fca27065SIlya Dryomov
458214bb211dSIlya Dryomov /*
458314bb211dSIlya Dryomov * lock_rwsem must be held for write
458414bb211dSIlya Dryomov */
rbd_reacquire_lock(struct rbd_device * rbd_dev)458514bb211dSIlya Dryomov static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
458614bb211dSIlya Dryomov {
458714bb211dSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
458814bb211dSIlya Dryomov char cookie[32];
458914bb211dSIlya Dryomov int ret;
459014bb211dSIlya Dryomov
4591a2b1da09SIlya Dryomov if (!rbd_quiesce_lock(rbd_dev))
4592a2b1da09SIlya Dryomov return;
459314bb211dSIlya Dryomov
459414bb211dSIlya Dryomov format_lock_cookie(rbd_dev, cookie);
459514bb211dSIlya Dryomov ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
459614bb211dSIlya Dryomov &rbd_dev->header_oloc, RBD_LOCK_NAME,
459714bb211dSIlya Dryomov CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie,
459814bb211dSIlya Dryomov RBD_LOCK_TAG, cookie);
459914bb211dSIlya Dryomov if (ret) {
460014bb211dSIlya Dryomov if (ret != -EOPNOTSUPP)
460114bb211dSIlya Dryomov rbd_warn(rbd_dev, "failed to update lock cookie: %d",
460214bb211dSIlya Dryomov ret);
460314bb211dSIlya Dryomov
460414bb211dSIlya Dryomov /*
460514bb211dSIlya Dryomov * Lock cookie cannot be updated on older OSDs, so do
460614bb211dSIlya Dryomov * a manual release and queue an acquire.
460714bb211dSIlya Dryomov */
4608e1fddc8fSIlya Dryomov __rbd_release_lock(rbd_dev);
4609a2b1da09SIlya Dryomov queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
461014bb211dSIlya Dryomov } else {
4611edd8ca80SFlorian Margaine __rbd_lock(rbd_dev, cookie);
4612637cd060SIlya Dryomov wake_lock_waiters(rbd_dev, 0);
461314bb211dSIlya Dryomov }
461414bb211dSIlya Dryomov }
461514bb211dSIlya Dryomov
rbd_reregister_watch(struct work_struct * work)461699d16943SIlya Dryomov static void rbd_reregister_watch(struct work_struct *work)
461799d16943SIlya Dryomov {
461899d16943SIlya Dryomov struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
461999d16943SIlya Dryomov struct rbd_device, watch_dwork);
462099d16943SIlya Dryomov int ret;
462199d16943SIlya Dryomov
462299d16943SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev);
462399d16943SIlya Dryomov
462499d16943SIlya Dryomov mutex_lock(&rbd_dev->watch_mutex);
462587c0fdedSIlya Dryomov if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
462687c0fdedSIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex);
462714bb211dSIlya Dryomov return;
462887c0fdedSIlya Dryomov }
462999d16943SIlya Dryomov
463099d16943SIlya Dryomov ret = __rbd_register_watch(rbd_dev);
463199d16943SIlya Dryomov if (ret) {
463299d16943SIlya Dryomov rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
46330b98acd6SIlya Dryomov if (ret != -EBLOCKLISTED && ret != -ENOENT) {
463499d16943SIlya Dryomov queue_delayed_work(rbd_dev->task_wq,
463599d16943SIlya Dryomov &rbd_dev->watch_dwork,
463699d16943SIlya Dryomov RBD_RETRY_DELAY);
463787c0fdedSIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex);
463814bb211dSIlya Dryomov return;
463999d16943SIlya Dryomov }
464099d16943SIlya Dryomov
4641637cd060SIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex);
4642637cd060SIlya Dryomov down_write(&rbd_dev->lock_rwsem);
4643637cd060SIlya Dryomov wake_lock_waiters(rbd_dev, ret);
4644637cd060SIlya Dryomov up_write(&rbd_dev->lock_rwsem);
4645637cd060SIlya Dryomov return;
4646637cd060SIlya Dryomov }
4647637cd060SIlya Dryomov
464899d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
464999d16943SIlya Dryomov rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
465099d16943SIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex);
465199d16943SIlya Dryomov
465214bb211dSIlya Dryomov down_write(&rbd_dev->lock_rwsem);
465314bb211dSIlya Dryomov if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
465414bb211dSIlya Dryomov rbd_reacquire_lock(rbd_dev);
465514bb211dSIlya Dryomov up_write(&rbd_dev->lock_rwsem);
465614bb211dSIlya Dryomov
465799d16943SIlya Dryomov ret = rbd_dev_refresh(rbd_dev);
465899d16943SIlya Dryomov if (ret)
4659f6870cc9SColin Ian King rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret);
466099d16943SIlya Dryomov }
466199d16943SIlya Dryomov
466236be9a76SAlex Elder /*
4663f40eb349SAlex Elder * Synchronous osd object method call. Returns the number of bytes
4664f40eb349SAlex Elder * returned in the outbound buffer, or a negative error code.
466536be9a76SAlex Elder */
rbd_obj_method_sync(struct rbd_device * rbd_dev,struct ceph_object_id * oid,struct ceph_object_locator * oloc,const char * method_name,const void * outbound,size_t outbound_size,void * inbound,size_t inbound_size)466636be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
4667ecd4a68aSIlya Dryomov struct ceph_object_id *oid,
4668ecd4a68aSIlya Dryomov struct ceph_object_locator *oloc,
466936be9a76SAlex Elder const char *method_name,
46704157976bSAlex Elder const void *outbound,
467136be9a76SAlex Elder size_t outbound_size,
46724157976bSAlex Elder void *inbound,
4673e2a58ee5SAlex Elder size_t inbound_size)
467436be9a76SAlex Elder {
4675ecd4a68aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4676ecd4a68aSIlya Dryomov struct page *req_page = NULL;
4677ecd4a68aSIlya Dryomov struct page *reply_page;
467836be9a76SAlex Elder int ret;
467936be9a76SAlex Elder
468036be9a76SAlex Elder /*
46816010a451SAlex Elder * Method calls are ultimately read operations. The result
46826010a451SAlex Elder * should placed into the inbound buffer provided. They
46836010a451SAlex Elder * also supply outbound data--parameters for the object
46846010a451SAlex Elder * method. Currently if this is present it will be a
46856010a451SAlex Elder * snapshot id.
468636be9a76SAlex Elder */
4687ecd4a68aSIlya Dryomov if (outbound) {
4688ecd4a68aSIlya Dryomov if (outbound_size > PAGE_SIZE)
4689ecd4a68aSIlya Dryomov return -E2BIG;
469036be9a76SAlex Elder
4691ecd4a68aSIlya Dryomov req_page = alloc_page(GFP_KERNEL);
4692ecd4a68aSIlya Dryomov if (!req_page)
4693ecd4a68aSIlya Dryomov return -ENOMEM;
469436be9a76SAlex Elder
4695ecd4a68aSIlya Dryomov memcpy(page_address(req_page), outbound, outbound_size);
469604017e29SAlex Elder }
4697430c28c3SAlex Elder
4698ecd4a68aSIlya Dryomov reply_page = alloc_page(GFP_KERNEL);
4699ecd4a68aSIlya Dryomov if (!reply_page) {
4700ecd4a68aSIlya Dryomov if (req_page)
4701ecd4a68aSIlya Dryomov __free_page(req_page);
4702ecd4a68aSIlya Dryomov return -ENOMEM;
4703ecd4a68aSIlya Dryomov }
470436be9a76SAlex Elder
4705ecd4a68aSIlya Dryomov ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
4706ecd4a68aSIlya Dryomov CEPH_OSD_FLAG_READ, req_page, outbound_size,
470768ada915SIlya Dryomov &reply_page, &inbound_size);
4708ecd4a68aSIlya Dryomov if (!ret) {
4709ecd4a68aSIlya Dryomov memcpy(inbound, page_address(reply_page), inbound_size);
4710ecd4a68aSIlya Dryomov ret = inbound_size;
4711ecd4a68aSIlya Dryomov }
471257385b51SAlex Elder
4713ecd4a68aSIlya Dryomov if (req_page)
4714ecd4a68aSIlya Dryomov __free_page(req_page);
4715ecd4a68aSIlya Dryomov __free_page(reply_page);
471636be9a76SAlex Elder return ret;
471736be9a76SAlex Elder }
471836be9a76SAlex Elder
rbd_queue_workfn(struct work_struct * work)47197ad18afaSChristoph Hellwig static void rbd_queue_workfn(struct work_struct *work)
4720bc1ecc65SIlya Dryomov {
472159e542c8SIlya Dryomov struct rbd_img_request *img_request =
472259e542c8SIlya Dryomov container_of(work, struct rbd_img_request, work);
472359e542c8SIlya Dryomov struct rbd_device *rbd_dev = img_request->rbd_dev;
472459e542c8SIlya Dryomov enum obj_operation_type op_type = img_request->op_type;
472559e542c8SIlya Dryomov struct request *rq = blk_mq_rq_from_pdu(img_request);
4726bc1ecc65SIlya Dryomov u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
4727bc1ecc65SIlya Dryomov u64 length = blk_rq_bytes(rq);
47284e752f0aSJosh Durgin u64 mapping_size;
4729bc1ecc65SIlya Dryomov int result;
4730bc1ecc65SIlya Dryomov
4731bc1ecc65SIlya Dryomov /* Ignore/skip any zero-length requests */
4732bc1ecc65SIlya Dryomov if (!length) {
4733bc1ecc65SIlya Dryomov dout("%s: zero-length request\n", __func__);
4734bc1ecc65SIlya Dryomov result = 0;
473559e542c8SIlya Dryomov goto err_img_request;
4736bc1ecc65SIlya Dryomov }
4737bc1ecc65SIlya Dryomov
47387ad18afaSChristoph Hellwig blk_mq_start_request(rq);
47397ad18afaSChristoph Hellwig
47404e752f0aSJosh Durgin down_read(&rbd_dev->header_rwsem);
47414e752f0aSJosh Durgin mapping_size = rbd_dev->mapping.size;
4742a52cc685SIlya Dryomov rbd_img_capture_header(img_request);
47434e752f0aSJosh Durgin up_read(&rbd_dev->header_rwsem);
47444e752f0aSJosh Durgin
47454e752f0aSJosh Durgin if (offset + length > mapping_size) {
4746bc1ecc65SIlya Dryomov rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
47474e752f0aSJosh Durgin length, mapping_size);
4748bc1ecc65SIlya Dryomov result = -EIO;
4749a52cc685SIlya Dryomov goto err_img_request;
4750bc1ecc65SIlya Dryomov }
4751bc1ecc65SIlya Dryomov
475221ed05a8SIlya Dryomov dout("%s rbd_dev %p img_req %p %s %llu~%llu\n", __func__, rbd_dev,
475321ed05a8SIlya Dryomov img_request, obj_op_name(op_type), offset, length);
475421ed05a8SIlya Dryomov
47556484cbe9SIlya Dryomov if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT)
47565a237819SIlya Dryomov result = rbd_img_fill_nodata(img_request, offset, length);
475790e98c52SGuangliang Zhao else
47585a237819SIlya Dryomov result = rbd_img_fill_from_bio(img_request, offset, length,
475990e98c52SGuangliang Zhao rq->bio);
47600192ce2eSIlya Dryomov if (result)
4761bc1ecc65SIlya Dryomov goto err_img_request;
4762bc1ecc65SIlya Dryomov
4763e1fddc8fSIlya Dryomov rbd_img_handle_request(img_request, 0);
4764bc1ecc65SIlya Dryomov return;
4765bc1ecc65SIlya Dryomov
4766bc1ecc65SIlya Dryomov err_img_request:
4767679a97d2SHannes Reinecke rbd_img_request_destroy(img_request);
4768bc1ecc65SIlya Dryomov if (result)
4769bc1ecc65SIlya Dryomov rbd_warn(rbd_dev, "%s %llx at %llx result %d",
47706d2940c8SGuangliang Zhao obj_op_name(op_type), length, offset, result);
47712a842acaSChristoph Hellwig blk_mq_end_request(rq, errno_to_blk_status(result));
4772bc1ecc65SIlya Dryomov }
4773bc1ecc65SIlya Dryomov
rbd_queue_rq(struct blk_mq_hw_ctx * hctx,const struct blk_mq_queue_data * bd)4774fc17b653SChristoph Hellwig static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
47757ad18afaSChristoph Hellwig const struct blk_mq_queue_data *bd)
4776bc1ecc65SIlya Dryomov {
477759e542c8SIlya Dryomov struct rbd_device *rbd_dev = hctx->queue->queuedata;
477859e542c8SIlya Dryomov struct rbd_img_request *img_req = blk_mq_rq_to_pdu(bd->rq);
477959e542c8SIlya Dryomov enum obj_operation_type op_type;
4780bc1ecc65SIlya Dryomov
478159e542c8SIlya Dryomov switch (req_op(bd->rq)) {
478259e542c8SIlya Dryomov case REQ_OP_DISCARD:
478359e542c8SIlya Dryomov op_type = OBJ_OP_DISCARD;
478459e542c8SIlya Dryomov break;
478559e542c8SIlya Dryomov case REQ_OP_WRITE_ZEROES:
478659e542c8SIlya Dryomov op_type = OBJ_OP_ZEROOUT;
478759e542c8SIlya Dryomov break;
478859e542c8SIlya Dryomov case REQ_OP_WRITE:
478959e542c8SIlya Dryomov op_type = OBJ_OP_WRITE;
479059e542c8SIlya Dryomov break;
479159e542c8SIlya Dryomov case REQ_OP_READ:
479259e542c8SIlya Dryomov op_type = OBJ_OP_READ;
479359e542c8SIlya Dryomov break;
479459e542c8SIlya Dryomov default:
479559e542c8SIlya Dryomov rbd_warn(rbd_dev, "unknown req_op %d", req_op(bd->rq));
479659e542c8SIlya Dryomov return BLK_STS_IOERR;
479759e542c8SIlya Dryomov }
479859e542c8SIlya Dryomov
479959e542c8SIlya Dryomov rbd_img_request_init(img_req, rbd_dev, op_type);
480059e542c8SIlya Dryomov
480159e542c8SIlya Dryomov if (rbd_img_is_write(img_req)) {
480259e542c8SIlya Dryomov if (rbd_is_ro(rbd_dev)) {
480359e542c8SIlya Dryomov rbd_warn(rbd_dev, "%s on read-only mapping",
480459e542c8SIlya Dryomov obj_op_name(img_req->op_type));
480559e542c8SIlya Dryomov return BLK_STS_IOERR;
480659e542c8SIlya Dryomov }
480759e542c8SIlya Dryomov rbd_assert(!rbd_is_snap(rbd_dev));
480859e542c8SIlya Dryomov }
480959e542c8SIlya Dryomov
481059e542c8SIlya Dryomov INIT_WORK(&img_req->work, rbd_queue_workfn);
481159e542c8SIlya Dryomov queue_work(rbd_wq, &img_req->work);
4812fc17b653SChristoph Hellwig return BLK_STS_OK;
4813bf0d5f50SAlex Elder }
4814bf0d5f50SAlex Elder
rbd_free_disk(struct rbd_device * rbd_dev)4815602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev)
4816602adf40SYehuda Sadeh {
48178b9ab626SChristoph Hellwig put_disk(rbd_dev->disk);
48187ad18afaSChristoph Hellwig blk_mq_free_tag_set(&rbd_dev->tag_set);
48195769ed0cSIlya Dryomov rbd_dev->disk = NULL;
4820602adf40SYehuda Sadeh }
4821602adf40SYehuda Sadeh
rbd_obj_read_sync(struct rbd_device * rbd_dev,struct ceph_object_id * oid,struct ceph_object_locator * oloc,void * buf,int buf_len)4822788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
4823fe5478e0SIlya Dryomov struct ceph_object_id *oid,
4824fe5478e0SIlya Dryomov struct ceph_object_locator *oloc,
4825fe5478e0SIlya Dryomov void *buf, int buf_len)
4826788e2df3SAlex Elder
4827788e2df3SAlex Elder {
4828fe5478e0SIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4829fe5478e0SIlya Dryomov struct ceph_osd_request *req;
4830fe5478e0SIlya Dryomov struct page **pages;
4831fe5478e0SIlya Dryomov int num_pages = calc_pages_for(0, buf_len);
4832788e2df3SAlex Elder int ret;
4833788e2df3SAlex Elder
4834fe5478e0SIlya Dryomov req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
4835fe5478e0SIlya Dryomov if (!req)
4836fe5478e0SIlya Dryomov return -ENOMEM;
4837788e2df3SAlex Elder
4838fe5478e0SIlya Dryomov ceph_oid_copy(&req->r_base_oid, oid);
4839fe5478e0SIlya Dryomov ceph_oloc_copy(&req->r_base_oloc, oloc);
4840fe5478e0SIlya Dryomov req->r_flags = CEPH_OSD_FLAG_READ;
4841788e2df3SAlex Elder
4842fe5478e0SIlya Dryomov pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
4843fe5478e0SIlya Dryomov if (IS_ERR(pages)) {
4844fe5478e0SIlya Dryomov ret = PTR_ERR(pages);
4845fe5478e0SIlya Dryomov goto out_req;
4846fe5478e0SIlya Dryomov }
48471ceae7efSAlex Elder
4848fe5478e0SIlya Dryomov osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
4849fe5478e0SIlya Dryomov osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
4850fe5478e0SIlya Dryomov true);
4851788e2df3SAlex Elder
485226f887e0SIlya Dryomov ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
485326f887e0SIlya Dryomov if (ret)
485426f887e0SIlya Dryomov goto out_req;
485526f887e0SIlya Dryomov
4856a8af0d68SJeff Layton ceph_osdc_start_request(osdc, req);
4857fe5478e0SIlya Dryomov ret = ceph_osdc_wait_request(osdc, req);
4858fe5478e0SIlya Dryomov if (ret >= 0)
4859fe5478e0SIlya Dryomov ceph_copy_from_page_vector(pages, buf, 0, ret);
4860fe5478e0SIlya Dryomov
4861fe5478e0SIlya Dryomov out_req:
4862fe5478e0SIlya Dryomov ceph_osdc_put_request(req);
4863788e2df3SAlex Elder return ret;
4864788e2df3SAlex Elder }
4865788e2df3SAlex Elder
4866602adf40SYehuda Sadeh /*
4867662518b1SAlex Elder * Read the complete header for the given rbd device. On successful
4868662518b1SAlex Elder * return, the rbd_dev->header field will contain up-to-date
4869662518b1SAlex Elder * information about the image.
48704156d998SAlex Elder */
rbd_dev_v1_header_info(struct rbd_device * rbd_dev,struct rbd_image_header * header,bool first_time)4871510a7330SIlya Dryomov static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev,
4872510a7330SIlya Dryomov struct rbd_image_header *header,
4873510a7330SIlya Dryomov bool first_time)
48744156d998SAlex Elder {
48754156d998SAlex Elder struct rbd_image_header_ondisk *ondisk = NULL;
48764156d998SAlex Elder u32 snap_count = 0;
48774156d998SAlex Elder u64 names_size = 0;
48784156d998SAlex Elder u32 want_count;
48794156d998SAlex Elder int ret;
48804156d998SAlex Elder
48814156d998SAlex Elder /*
48824156d998SAlex Elder * The complete header will include an array of its 64-bit
48834156d998SAlex Elder * snapshot ids, followed by the names of those snapshots as
48844156d998SAlex Elder * a contiguous block of NUL-terminated strings. Note that
48854156d998SAlex Elder * the number of snapshots could change by the time we read
48864156d998SAlex Elder * it in, in which case we re-read it.
48874156d998SAlex Elder */
48884156d998SAlex Elder do {
48894156d998SAlex Elder size_t size;
48904156d998SAlex Elder
48914156d998SAlex Elder kfree(ondisk);
48924156d998SAlex Elder
48934156d998SAlex Elder size = sizeof (*ondisk);
48944156d998SAlex Elder size += snap_count * sizeof (struct rbd_image_snap_ondisk);
48954156d998SAlex Elder size += names_size;
48964156d998SAlex Elder ondisk = kmalloc(size, GFP_KERNEL);
48974156d998SAlex Elder if (!ondisk)
4898662518b1SAlex Elder return -ENOMEM;
48994156d998SAlex Elder
4900fe5478e0SIlya Dryomov ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
4901fe5478e0SIlya Dryomov &rbd_dev->header_oloc, ondisk, size);
49024156d998SAlex Elder if (ret < 0)
4903662518b1SAlex Elder goto out;
4904c0cd10dbSAlex Elder if ((size_t)ret < size) {
49054156d998SAlex Elder ret = -ENXIO;
490606ecc6cbSAlex Elder rbd_warn(rbd_dev, "short header read (want %zd got %d)",
490706ecc6cbSAlex Elder size, ret);
4908662518b1SAlex Elder goto out;
49094156d998SAlex Elder }
49104156d998SAlex Elder if (!rbd_dev_ondisk_valid(ondisk)) {
49114156d998SAlex Elder ret = -ENXIO;
491206ecc6cbSAlex Elder rbd_warn(rbd_dev, "invalid header");
4913662518b1SAlex Elder goto out;
49144156d998SAlex Elder }
49154156d998SAlex Elder
49164156d998SAlex Elder names_size = le64_to_cpu(ondisk->snap_names_len);
49174156d998SAlex Elder want_count = snap_count;
49184156d998SAlex Elder snap_count = le32_to_cpu(ondisk->snap_count);
49194156d998SAlex Elder } while (snap_count != want_count);
49204156d998SAlex Elder
4921510a7330SIlya Dryomov ret = rbd_header_from_disk(header, ondisk, first_time);
4922662518b1SAlex Elder out:
49234156d998SAlex Elder kfree(ondisk);
49244156d998SAlex Elder
4925dfc5606dSYehuda Sadeh return ret;
4926602adf40SYehuda Sadeh }
4927602adf40SYehuda Sadeh
rbd_dev_update_size(struct rbd_device * rbd_dev)49289875201eSJosh Durgin static void rbd_dev_update_size(struct rbd_device *rbd_dev)
49299875201eSJosh Durgin {
49309875201eSJosh Durgin sector_t size;
49319875201eSJosh Durgin
49329875201eSJosh Durgin /*
4933811c6688SIlya Dryomov * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
4934811c6688SIlya Dryomov * try to update its size. If REMOVING is set, updating size
4935811c6688SIlya Dryomov * is just useless work since the device can't be opened.
49369875201eSJosh Durgin */
4937811c6688SIlya Dryomov if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
4938811c6688SIlya Dryomov !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
49399875201eSJosh Durgin size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
49409875201eSJosh Durgin dout("setting size to %llu sectors", (unsigned long long)size);
4941e864e49aSChristoph Hellwig set_capacity_and_notify(rbd_dev->disk, size);
49429875201eSJosh Durgin }
49439875201eSJosh Durgin }
49449875201eSJosh Durgin
4945f363b089SEric Biggers static const struct blk_mq_ops rbd_mq_ops = {
49467ad18afaSChristoph Hellwig .queue_rq = rbd_queue_rq,
49477ad18afaSChristoph Hellwig };
49487ad18afaSChristoph Hellwig
rbd_init_disk(struct rbd_device * rbd_dev)4949602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev)
4950602adf40SYehuda Sadeh {
4951602adf40SYehuda Sadeh struct gendisk *disk;
4952602adf40SYehuda Sadeh struct request_queue *q;
4953420efbdfSIlya Dryomov unsigned int objset_bytes =
4954420efbdfSIlya Dryomov rbd_dev->layout.object_size * rbd_dev->layout.stripe_count;
49557ad18afaSChristoph Hellwig int err;
4956602adf40SYehuda Sadeh
49577ad18afaSChristoph Hellwig memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
49587ad18afaSChristoph Hellwig rbd_dev->tag_set.ops = &rbd_mq_ops;
4959b5584180SIlya Dryomov rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
49607ad18afaSChristoph Hellwig rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
496156d18f62SMing Lei rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
4962f9b6b98dSHannes Reinecke rbd_dev->tag_set.nr_hw_queues = num_present_cpus();
496359e542c8SIlya Dryomov rbd_dev->tag_set.cmd_size = sizeof(struct rbd_img_request);
49647ad18afaSChristoph Hellwig
49657ad18afaSChristoph Hellwig err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
49667ad18afaSChristoph Hellwig if (err)
4967195b1956SChristoph Hellwig return err;
4968029bcbd8SJosh Durgin
4969195b1956SChristoph Hellwig disk = blk_mq_alloc_disk(&rbd_dev->tag_set, rbd_dev);
4970195b1956SChristoph Hellwig if (IS_ERR(disk)) {
4971195b1956SChristoph Hellwig err = PTR_ERR(disk);
49727ad18afaSChristoph Hellwig goto out_tag_set;
49737ad18afaSChristoph Hellwig }
4974195b1956SChristoph Hellwig q = disk->queue;
4975195b1956SChristoph Hellwig
4976195b1956SChristoph Hellwig snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
4977195b1956SChristoph Hellwig rbd_dev->dev_id);
4978195b1956SChristoph Hellwig disk->major = rbd_dev->major;
4979195b1956SChristoph Hellwig disk->first_minor = rbd_dev->minor;
49801ebe2e5fSChristoph Hellwig if (single_major)
4981195b1956SChristoph Hellwig disk->minors = (1 << RBD_SINGLE_MAJOR_PART_SHIFT);
49821ebe2e5fSChristoph Hellwig else
4983195b1956SChristoph Hellwig disk->minors = RBD_MINORS_PER_MAJOR;
4984195b1956SChristoph Hellwig disk->fops = &rbd_bd_ops;
49850077a500SIlya Dryomov disk->private_data = rbd_dev;
49867ad18afaSChristoph Hellwig
49878b904b5bSBart Van Assche blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
4988d8a2c89cSIlya Dryomov /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
4989593a9e7bSAlex Elder
4990420efbdfSIlya Dryomov blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT);
49910d9fde4fSIlya Dryomov q->limits.max_sectors = queue_max_hw_sectors(q);
499221acdf45SIlya Dryomov blk_queue_max_segments(q, USHRT_MAX);
499324f1df60SIlya Dryomov blk_queue_max_segment_size(q, UINT_MAX);
499416d80c54SIlya Dryomov blk_queue_io_min(q, rbd_dev->opts->alloc_size);
499516d80c54SIlya Dryomov blk_queue_io_opt(q, rbd_dev->opts->alloc_size);
4996029bcbd8SJosh Durgin
4997d9360540SIlya Dryomov if (rbd_dev->opts->trim) {
499816d80c54SIlya Dryomov q->limits.discard_granularity = rbd_dev->opts->alloc_size;
4999420efbdfSIlya Dryomov blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT);
5000420efbdfSIlya Dryomov blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT);
5001d9360540SIlya Dryomov }
500290e98c52SGuangliang Zhao
5003bae818eeSRonny Hegewald if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
50041cb039f3SChristoph Hellwig blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);
5005bae818eeSRonny Hegewald
5006602adf40SYehuda Sadeh rbd_dev->disk = disk;
5007602adf40SYehuda Sadeh
5008602adf40SYehuda Sadeh return 0;
50097ad18afaSChristoph Hellwig out_tag_set:
50107ad18afaSChristoph Hellwig blk_mq_free_tag_set(&rbd_dev->tag_set);
50117ad18afaSChristoph Hellwig return err;
5012602adf40SYehuda Sadeh }
5013602adf40SYehuda Sadeh
5014dfc5606dSYehuda Sadeh /*
5015dfc5606dSYehuda Sadeh sysfs
5016dfc5606dSYehuda Sadeh */
5017602adf40SYehuda Sadeh
dev_to_rbd_dev(struct device * dev)5018593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev)
5019593a9e7bSAlex Elder {
5020593a9e7bSAlex Elder return container_of(dev, struct rbd_device, dev);
5021593a9e7bSAlex Elder }
5022593a9e7bSAlex Elder
rbd_size_show(struct device * dev,struct device_attribute * attr,char * buf)5023dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev,
5024dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf)
5025602adf40SYehuda Sadeh {
5026593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5027dfc5606dSYehuda Sadeh
5028fc71d833SAlex Elder return sprintf(buf, "%llu\n",
5029fc71d833SAlex Elder (unsigned long long)rbd_dev->mapping.size);
5030602adf40SYehuda Sadeh }
5031602adf40SYehuda Sadeh
rbd_features_show(struct device * dev,struct device_attribute * attr,char * buf)503234b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev,
503334b13184SAlex Elder struct device_attribute *attr, char *buf)
503434b13184SAlex Elder {
503534b13184SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
503634b13184SAlex Elder
5037fa58bcadSIlya Dryomov return sprintf(buf, "0x%016llx\n", rbd_dev->header.features);
503834b13184SAlex Elder }
503934b13184SAlex Elder
rbd_major_show(struct device * dev,struct device_attribute * attr,char * buf)5040dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev,
5041dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf)
5042602adf40SYehuda Sadeh {
5043593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5044dfc5606dSYehuda Sadeh
5045fc71d833SAlex Elder if (rbd_dev->major)
5046dfc5606dSYehuda Sadeh return sprintf(buf, "%d\n", rbd_dev->major);
5047fc71d833SAlex Elder
5048fc71d833SAlex Elder return sprintf(buf, "(none)\n");
5049dd82fff1SIlya Dryomov }
5050fc71d833SAlex Elder
rbd_minor_show(struct device * dev,struct device_attribute * attr,char * buf)5051dd82fff1SIlya Dryomov static ssize_t rbd_minor_show(struct device *dev,
5052dd82fff1SIlya Dryomov struct device_attribute *attr, char *buf)
5053dd82fff1SIlya Dryomov {
5054dd82fff1SIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5055dd82fff1SIlya Dryomov
5056dd82fff1SIlya Dryomov return sprintf(buf, "%d\n", rbd_dev->minor);
5057dfc5606dSYehuda Sadeh }
5058dfc5606dSYehuda Sadeh
rbd_client_addr_show(struct device * dev,struct device_attribute * attr,char * buf)5059005a07bfSIlya Dryomov static ssize_t rbd_client_addr_show(struct device *dev,
5060005a07bfSIlya Dryomov struct device_attribute *attr, char *buf)
5061005a07bfSIlya Dryomov {
5062005a07bfSIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5063005a07bfSIlya Dryomov struct ceph_entity_addr *client_addr =
5064005a07bfSIlya Dryomov ceph_client_addr(rbd_dev->rbd_client->client);
5065005a07bfSIlya Dryomov
5066005a07bfSIlya Dryomov return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
5067005a07bfSIlya Dryomov le32_to_cpu(client_addr->nonce));
5068005a07bfSIlya Dryomov }
5069005a07bfSIlya Dryomov
rbd_client_id_show(struct device * dev,struct device_attribute * attr,char * buf)5070dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev,
5071dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf)
5072dfc5606dSYehuda Sadeh {
5073593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5074dfc5606dSYehuda Sadeh
50751dbb4399SAlex Elder return sprintf(buf, "client%lld\n",
5076033268a5SIlya Dryomov ceph_client_gid(rbd_dev->rbd_client->client));
5077dfc5606dSYehuda Sadeh }
5078dfc5606dSYehuda Sadeh
rbd_cluster_fsid_show(struct device * dev,struct device_attribute * attr,char * buf)5079267fb90bSMike Christie static ssize_t rbd_cluster_fsid_show(struct device *dev,
5080267fb90bSMike Christie struct device_attribute *attr, char *buf)
5081267fb90bSMike Christie {
5082267fb90bSMike Christie struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5083267fb90bSMike Christie
5084267fb90bSMike Christie return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
5085267fb90bSMike Christie }
5086267fb90bSMike Christie
rbd_config_info_show(struct device * dev,struct device_attribute * attr,char * buf)50870d6d1e9cSMike Christie static ssize_t rbd_config_info_show(struct device *dev,
50880d6d1e9cSMike Christie struct device_attribute *attr, char *buf)
50890d6d1e9cSMike Christie {
50900d6d1e9cSMike Christie struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
50910d6d1e9cSMike Christie
5092f44d04e6SIlya Dryomov if (!capable(CAP_SYS_ADMIN))
5093f44d04e6SIlya Dryomov return -EPERM;
5094f44d04e6SIlya Dryomov
50950d6d1e9cSMike Christie return sprintf(buf, "%s\n", rbd_dev->config_info);
5096dfc5606dSYehuda Sadeh }
5097dfc5606dSYehuda Sadeh
rbd_pool_show(struct device * dev,struct device_attribute * attr,char * buf)5098dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev,
5099dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf)
5100dfc5606dSYehuda Sadeh {
5101593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5102dfc5606dSYehuda Sadeh
51030d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
5104dfc5606dSYehuda Sadeh }
5105dfc5606dSYehuda Sadeh
rbd_pool_id_show(struct device * dev,struct device_attribute * attr,char * buf)51069bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev,
51079bb2f334SAlex Elder struct device_attribute *attr, char *buf)
51089bb2f334SAlex Elder {
51099bb2f334SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
51109bb2f334SAlex Elder
51110d7dbfceSAlex Elder return sprintf(buf, "%llu\n",
51120d7dbfceSAlex Elder (unsigned long long) rbd_dev->spec->pool_id);
51139bb2f334SAlex Elder }
51149bb2f334SAlex Elder
rbd_pool_ns_show(struct device * dev,struct device_attribute * attr,char * buf)5115b26c047bSIlya Dryomov static ssize_t rbd_pool_ns_show(struct device *dev,
5116b26c047bSIlya Dryomov struct device_attribute *attr, char *buf)
5117b26c047bSIlya Dryomov {
5118b26c047bSIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5119b26c047bSIlya Dryomov
5120b26c047bSIlya Dryomov return sprintf(buf, "%s\n", rbd_dev->spec->pool_ns ?: "");
5121b26c047bSIlya Dryomov }
5122b26c047bSIlya Dryomov
rbd_name_show(struct device * dev,struct device_attribute * attr,char * buf)5123dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev,
5124dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf)
5125dfc5606dSYehuda Sadeh {
5126593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5127dfc5606dSYehuda Sadeh
5128a92ffdf8SAlex Elder if (rbd_dev->spec->image_name)
51290d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
5130a92ffdf8SAlex Elder
5131a92ffdf8SAlex Elder return sprintf(buf, "(unknown)\n");
5132dfc5606dSYehuda Sadeh }
5133dfc5606dSYehuda Sadeh
rbd_image_id_show(struct device * dev,struct device_attribute * attr,char * buf)5134589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev,
5135589d30e0SAlex Elder struct device_attribute *attr, char *buf)
5136589d30e0SAlex Elder {
5137589d30e0SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5138589d30e0SAlex Elder
51390d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
5140589d30e0SAlex Elder }
5141589d30e0SAlex Elder
514234b13184SAlex Elder /*
514334b13184SAlex Elder * Shows the name of the currently-mapped snapshot (or
514434b13184SAlex Elder * RBD_SNAP_HEAD_NAME for the base image).
514534b13184SAlex Elder */
rbd_snap_show(struct device * dev,struct device_attribute * attr,char * buf)5146dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev,
5147dfc5606dSYehuda Sadeh struct device_attribute *attr,
5148dfc5606dSYehuda Sadeh char *buf)
5149dfc5606dSYehuda Sadeh {
5150593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5151dfc5606dSYehuda Sadeh
51520d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
5153dfc5606dSYehuda Sadeh }
5154dfc5606dSYehuda Sadeh
rbd_snap_id_show(struct device * dev,struct device_attribute * attr,char * buf)515592a58671SMike Christie static ssize_t rbd_snap_id_show(struct device *dev,
515692a58671SMike Christie struct device_attribute *attr, char *buf)
515792a58671SMike Christie {
515892a58671SMike Christie struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
515992a58671SMike Christie
516092a58671SMike Christie return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
516192a58671SMike Christie }
516292a58671SMike Christie
516386b00e0dSAlex Elder /*
5164ff96128fSIlya Dryomov * For a v2 image, shows the chain of parent images, separated by empty
5165ff96128fSIlya Dryomov * lines. For v1 images or if there is no parent, shows "(no parent
5166ff96128fSIlya Dryomov * image)".
516786b00e0dSAlex Elder */
rbd_parent_show(struct device * dev,struct device_attribute * attr,char * buf)516886b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev,
516986b00e0dSAlex Elder struct device_attribute *attr,
517086b00e0dSAlex Elder char *buf)
517186b00e0dSAlex Elder {
517286b00e0dSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5173ff96128fSIlya Dryomov ssize_t count = 0;
517486b00e0dSAlex Elder
5175ff96128fSIlya Dryomov if (!rbd_dev->parent)
517686b00e0dSAlex Elder return sprintf(buf, "(no parent image)\n");
517786b00e0dSAlex Elder
5178ff96128fSIlya Dryomov for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
5179ff96128fSIlya Dryomov struct rbd_spec *spec = rbd_dev->parent_spec;
518086b00e0dSAlex Elder
5181ff96128fSIlya Dryomov count += sprintf(&buf[count], "%s"
5182ff96128fSIlya Dryomov "pool_id %llu\npool_name %s\n"
5183e92c0eafSIlya Dryomov "pool_ns %s\n"
5184ff96128fSIlya Dryomov "image_id %s\nimage_name %s\n"
5185ff96128fSIlya Dryomov "snap_id %llu\nsnap_name %s\n"
5186ff96128fSIlya Dryomov "overlap %llu\n",
5187ff96128fSIlya Dryomov !count ? "" : "\n", /* first? */
5188ff96128fSIlya Dryomov spec->pool_id, spec->pool_name,
5189e92c0eafSIlya Dryomov spec->pool_ns ?: "",
5190ff96128fSIlya Dryomov spec->image_id, spec->image_name ?: "(unknown)",
5191ff96128fSIlya Dryomov spec->snap_id, spec->snap_name,
5192ff96128fSIlya Dryomov rbd_dev->parent_overlap);
5193ff96128fSIlya Dryomov }
519486b00e0dSAlex Elder
519586b00e0dSAlex Elder return count;
519686b00e0dSAlex Elder }
519786b00e0dSAlex Elder
rbd_image_refresh(struct device * dev,struct device_attribute * attr,const char * buf,size_t size)5198dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev,
5199dfc5606dSYehuda Sadeh struct device_attribute *attr,
5200dfc5606dSYehuda Sadeh const char *buf,
5201dfc5606dSYehuda Sadeh size_t size)
5202dfc5606dSYehuda Sadeh {
5203593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5204b813623aSAlex Elder int ret;
5205602adf40SYehuda Sadeh
5206f44d04e6SIlya Dryomov if (!capable(CAP_SYS_ADMIN))
5207f44d04e6SIlya Dryomov return -EPERM;
5208f44d04e6SIlya Dryomov
5209cc4a38bdSAlex Elder ret = rbd_dev_refresh(rbd_dev);
5210e627db08SAlex Elder if (ret)
521152bb1f9bSIlya Dryomov return ret;
5212b813623aSAlex Elder
521352bb1f9bSIlya Dryomov return size;
5214dfc5606dSYehuda Sadeh }
5215602adf40SYehuda Sadeh
52165657a819SJoe Perches static DEVICE_ATTR(size, 0444, rbd_size_show, NULL);
52175657a819SJoe Perches static DEVICE_ATTR(features, 0444, rbd_features_show, NULL);
52185657a819SJoe Perches static DEVICE_ATTR(major, 0444, rbd_major_show, NULL);
52195657a819SJoe Perches static DEVICE_ATTR(minor, 0444, rbd_minor_show, NULL);
52205657a819SJoe Perches static DEVICE_ATTR(client_addr, 0444, rbd_client_addr_show, NULL);
52215657a819SJoe Perches static DEVICE_ATTR(client_id, 0444, rbd_client_id_show, NULL);
52225657a819SJoe Perches static DEVICE_ATTR(cluster_fsid, 0444, rbd_cluster_fsid_show, NULL);
52235657a819SJoe Perches static DEVICE_ATTR(config_info, 0400, rbd_config_info_show, NULL);
52245657a819SJoe Perches static DEVICE_ATTR(pool, 0444, rbd_pool_show, NULL);
52255657a819SJoe Perches static DEVICE_ATTR(pool_id, 0444, rbd_pool_id_show, NULL);
5226b26c047bSIlya Dryomov static DEVICE_ATTR(pool_ns, 0444, rbd_pool_ns_show, NULL);
52275657a819SJoe Perches static DEVICE_ATTR(name, 0444, rbd_name_show, NULL);
52285657a819SJoe Perches static DEVICE_ATTR(image_id, 0444, rbd_image_id_show, NULL);
52295657a819SJoe Perches static DEVICE_ATTR(refresh, 0200, NULL, rbd_image_refresh);
52305657a819SJoe Perches static DEVICE_ATTR(current_snap, 0444, rbd_snap_show, NULL);
52315657a819SJoe Perches static DEVICE_ATTR(snap_id, 0444, rbd_snap_id_show, NULL);
52325657a819SJoe Perches static DEVICE_ATTR(parent, 0444, rbd_parent_show, NULL);
5233dfc5606dSYehuda Sadeh
5234dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = {
5235dfc5606dSYehuda Sadeh &dev_attr_size.attr,
523634b13184SAlex Elder &dev_attr_features.attr,
5237dfc5606dSYehuda Sadeh &dev_attr_major.attr,
5238dd82fff1SIlya Dryomov &dev_attr_minor.attr,
5239005a07bfSIlya Dryomov &dev_attr_client_addr.attr,
5240dfc5606dSYehuda Sadeh &dev_attr_client_id.attr,
5241267fb90bSMike Christie &dev_attr_cluster_fsid.attr,
52420d6d1e9cSMike Christie &dev_attr_config_info.attr,
5243dfc5606dSYehuda Sadeh &dev_attr_pool.attr,
52449bb2f334SAlex Elder &dev_attr_pool_id.attr,
5245b26c047bSIlya Dryomov &dev_attr_pool_ns.attr,
5246dfc5606dSYehuda Sadeh &dev_attr_name.attr,
5247589d30e0SAlex Elder &dev_attr_image_id.attr,
5248dfc5606dSYehuda Sadeh &dev_attr_current_snap.attr,
524992a58671SMike Christie &dev_attr_snap_id.attr,
525086b00e0dSAlex Elder &dev_attr_parent.attr,
5251dfc5606dSYehuda Sadeh &dev_attr_refresh.attr,
5252dfc5606dSYehuda Sadeh NULL
5253dfc5606dSYehuda Sadeh };
5254dfc5606dSYehuda Sadeh
5255dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = {
5256dfc5606dSYehuda Sadeh .attrs = rbd_attrs,
5257dfc5606dSYehuda Sadeh };
5258dfc5606dSYehuda Sadeh
5259dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = {
5260dfc5606dSYehuda Sadeh &rbd_attr_group,
5261dfc5606dSYehuda Sadeh NULL
5262dfc5606dSYehuda Sadeh };
5263dfc5606dSYehuda Sadeh
52646cac4695SIlya Dryomov static void rbd_dev_release(struct device *dev);
5265dfc5606dSYehuda Sadeh
5266b9942bc9SBhumika Goyal static const struct device_type rbd_device_type = {
5267dfc5606dSYehuda Sadeh .name = "rbd",
5268dfc5606dSYehuda Sadeh .groups = rbd_attr_groups,
52696cac4695SIlya Dryomov .release = rbd_dev_release,
5270dfc5606dSYehuda Sadeh };
5271dfc5606dSYehuda Sadeh
rbd_spec_get(struct rbd_spec * spec)52728b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
52738b8fb99cSAlex Elder {
52748b8fb99cSAlex Elder kref_get(&spec->kref);
52758b8fb99cSAlex Elder
52768b8fb99cSAlex Elder return spec;
52778b8fb99cSAlex Elder }
52788b8fb99cSAlex Elder
52798b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref);
rbd_spec_put(struct rbd_spec * spec)52808b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec)
52818b8fb99cSAlex Elder {
52828b8fb99cSAlex Elder if (spec)
52838b8fb99cSAlex Elder kref_put(&spec->kref, rbd_spec_free);
52848b8fb99cSAlex Elder }
52858b8fb99cSAlex Elder
rbd_spec_alloc(void)52868b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void)
52878b8fb99cSAlex Elder {
52888b8fb99cSAlex Elder struct rbd_spec *spec;
52898b8fb99cSAlex Elder
52908b8fb99cSAlex Elder spec = kzalloc(sizeof (*spec), GFP_KERNEL);
52918b8fb99cSAlex Elder if (!spec)
52928b8fb99cSAlex Elder return NULL;
529304077599SIlya Dryomov
529404077599SIlya Dryomov spec->pool_id = CEPH_NOPOOL;
529504077599SIlya Dryomov spec->snap_id = CEPH_NOSNAP;
52968b8fb99cSAlex Elder kref_init(&spec->kref);
52978b8fb99cSAlex Elder
52988b8fb99cSAlex Elder return spec;
52998b8fb99cSAlex Elder }
53008b8fb99cSAlex Elder
rbd_spec_free(struct kref * kref)53018b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref)
53028b8fb99cSAlex Elder {
53038b8fb99cSAlex Elder struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
53048b8fb99cSAlex Elder
53058b8fb99cSAlex Elder kfree(spec->pool_name);
5306b26c047bSIlya Dryomov kfree(spec->pool_ns);
53078b8fb99cSAlex Elder kfree(spec->image_id);
53088b8fb99cSAlex Elder kfree(spec->image_name);
53098b8fb99cSAlex Elder kfree(spec->snap_name);
53108b8fb99cSAlex Elder kfree(spec);
53118b8fb99cSAlex Elder }
53128b8fb99cSAlex Elder
rbd_dev_free(struct rbd_device * rbd_dev)53131643dfa4SIlya Dryomov static void rbd_dev_free(struct rbd_device *rbd_dev)
5314dd5ac32dSIlya Dryomov {
531599d16943SIlya Dryomov WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
5316ed95b21aSIlya Dryomov WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);
5317dd5ac32dSIlya Dryomov
5318c41d13a3SIlya Dryomov ceph_oid_destroy(&rbd_dev->header_oid);
53196b6dddbeSIlya Dryomov ceph_oloc_destroy(&rbd_dev->header_oloc);
53200d6d1e9cSMike Christie kfree(rbd_dev->config_info);
5321c41d13a3SIlya Dryomov
5322dd5ac32dSIlya Dryomov rbd_put_client(rbd_dev->rbd_client);
5323dd5ac32dSIlya Dryomov rbd_spec_put(rbd_dev->spec);
5324dd5ac32dSIlya Dryomov kfree(rbd_dev->opts);
5325dd5ac32dSIlya Dryomov kfree(rbd_dev);
53261643dfa4SIlya Dryomov }
53271643dfa4SIlya Dryomov
rbd_dev_release(struct device * dev)53281643dfa4SIlya Dryomov static void rbd_dev_release(struct device *dev)
53291643dfa4SIlya Dryomov {
53301643dfa4SIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
53311643dfa4SIlya Dryomov bool need_put = !!rbd_dev->opts;
53321643dfa4SIlya Dryomov
53331643dfa4SIlya Dryomov if (need_put) {
53341643dfa4SIlya Dryomov destroy_workqueue(rbd_dev->task_wq);
53351643dfa4SIlya Dryomov ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
53361643dfa4SIlya Dryomov }
53371643dfa4SIlya Dryomov
53381643dfa4SIlya Dryomov rbd_dev_free(rbd_dev);
5339dd5ac32dSIlya Dryomov
5340dd5ac32dSIlya Dryomov /*
5341dd5ac32dSIlya Dryomov * This is racy, but way better than putting module outside of
5342dd5ac32dSIlya Dryomov * the release callback. The race window is pretty small, so
5343dd5ac32dSIlya Dryomov * doing something similar to dm (dm-builtin.c) is overkill.
5344dd5ac32dSIlya Dryomov */
5345dd5ac32dSIlya Dryomov if (need_put)
5346dd5ac32dSIlya Dryomov module_put(THIS_MODULE);
5347dd5ac32dSIlya Dryomov }
5348dd5ac32dSIlya Dryomov
__rbd_dev_create(struct rbd_spec * spec)5349f7c4d9b1SIlya Dryomov static struct rbd_device *__rbd_dev_create(struct rbd_spec *spec)
5350c53d5893SAlex Elder {
5351c53d5893SAlex Elder struct rbd_device *rbd_dev;
5352c53d5893SAlex Elder
5353c53d5893SAlex Elder rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
5354c53d5893SAlex Elder if (!rbd_dev)
5355c53d5893SAlex Elder return NULL;
5356c53d5893SAlex Elder
5357c53d5893SAlex Elder spin_lock_init(&rbd_dev->lock);
5358c53d5893SAlex Elder INIT_LIST_HEAD(&rbd_dev->node);
5359c53d5893SAlex Elder init_rwsem(&rbd_dev->header_rwsem);
5360c53d5893SAlex Elder
53617e97332eSIlya Dryomov rbd_dev->header.data_pool_id = CEPH_NOPOOL;
5362c41d13a3SIlya Dryomov ceph_oid_init(&rbd_dev->header_oid);
5363431a02cdSIlya Dryomov rbd_dev->header_oloc.pool = spec->pool_id;
5364b26c047bSIlya Dryomov if (spec->pool_ns) {
5365b26c047bSIlya Dryomov WARN_ON(!*spec->pool_ns);
5366b26c047bSIlya Dryomov rbd_dev->header_oloc.pool_ns =
5367b26c047bSIlya Dryomov ceph_find_or_create_string(spec->pool_ns,
5368b26c047bSIlya Dryomov strlen(spec->pool_ns));
5369b26c047bSIlya Dryomov }
5370c41d13a3SIlya Dryomov
537199d16943SIlya Dryomov mutex_init(&rbd_dev->watch_mutex);
537299d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
537399d16943SIlya Dryomov INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);
537499d16943SIlya Dryomov
5375ed95b21aSIlya Dryomov init_rwsem(&rbd_dev->lock_rwsem);
5376ed95b21aSIlya Dryomov rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
5377ed95b21aSIlya Dryomov INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
5378ed95b21aSIlya Dryomov INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
5379ed95b21aSIlya Dryomov INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
5380ed95b21aSIlya Dryomov INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
5381e1fddc8fSIlya Dryomov spin_lock_init(&rbd_dev->lock_lists_lock);
5382637cd060SIlya Dryomov INIT_LIST_HEAD(&rbd_dev->acquiring_list);
5383e1fddc8fSIlya Dryomov INIT_LIST_HEAD(&rbd_dev->running_list);
5384637cd060SIlya Dryomov init_completion(&rbd_dev->acquire_wait);
5385e1fddc8fSIlya Dryomov init_completion(&rbd_dev->releasing_wait);
5386ed95b21aSIlya Dryomov
538722e8bd51SIlya Dryomov spin_lock_init(&rbd_dev->object_map_lock);
5388c53d5893SAlex Elder
5389dd5ac32dSIlya Dryomov rbd_dev->dev.bus = &rbd_bus_type;
5390dd5ac32dSIlya Dryomov rbd_dev->dev.type = &rbd_device_type;
5391dd5ac32dSIlya Dryomov rbd_dev->dev.parent = &rbd_root_dev;
5392dd5ac32dSIlya Dryomov device_initialize(&rbd_dev->dev);
5393dd5ac32dSIlya Dryomov
53941643dfa4SIlya Dryomov return rbd_dev;
53951643dfa4SIlya Dryomov }
53961643dfa4SIlya Dryomov
5397dd5ac32dSIlya Dryomov /*
53981643dfa4SIlya Dryomov * Create a mapping rbd_dev.
5399dd5ac32dSIlya Dryomov */
rbd_dev_create(struct rbd_client * rbdc,struct rbd_spec * spec,struct rbd_options * opts)54001643dfa4SIlya Dryomov static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
54011643dfa4SIlya Dryomov struct rbd_spec *spec,
54021643dfa4SIlya Dryomov struct rbd_options *opts)
54031643dfa4SIlya Dryomov {
54041643dfa4SIlya Dryomov struct rbd_device *rbd_dev;
54051643dfa4SIlya Dryomov
5406f7c4d9b1SIlya Dryomov rbd_dev = __rbd_dev_create(spec);
54071643dfa4SIlya Dryomov if (!rbd_dev)
54081643dfa4SIlya Dryomov return NULL;
54091643dfa4SIlya Dryomov
54101643dfa4SIlya Dryomov /* get an id and fill in device name */
54111643dfa4SIlya Dryomov rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
54121643dfa4SIlya Dryomov minor_to_rbd_dev_id(1 << MINORBITS),
54131643dfa4SIlya Dryomov GFP_KERNEL);
54141643dfa4SIlya Dryomov if (rbd_dev->dev_id < 0)
54151643dfa4SIlya Dryomov goto fail_rbd_dev;
54161643dfa4SIlya Dryomov
54171643dfa4SIlya Dryomov sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
54181643dfa4SIlya Dryomov rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
54191643dfa4SIlya Dryomov rbd_dev->name);
54201643dfa4SIlya Dryomov if (!rbd_dev->task_wq)
54211643dfa4SIlya Dryomov goto fail_dev_id;
54221643dfa4SIlya Dryomov
54231643dfa4SIlya Dryomov /* we have a ref from do_rbd_add() */
5424dd5ac32dSIlya Dryomov __module_get(THIS_MODULE);
5425dd5ac32dSIlya Dryomov
5426f7c4d9b1SIlya Dryomov rbd_dev->rbd_client = rbdc;
5427f7c4d9b1SIlya Dryomov rbd_dev->spec = spec;
5428f7c4d9b1SIlya Dryomov rbd_dev->opts = opts;
5429f7c4d9b1SIlya Dryomov
54301643dfa4SIlya Dryomov dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
5431c53d5893SAlex Elder return rbd_dev;
54321643dfa4SIlya Dryomov
54331643dfa4SIlya Dryomov fail_dev_id:
54341643dfa4SIlya Dryomov ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
54351643dfa4SIlya Dryomov fail_rbd_dev:
54361643dfa4SIlya Dryomov rbd_dev_free(rbd_dev);
54371643dfa4SIlya Dryomov return NULL;
5438c53d5893SAlex Elder }
5439c53d5893SAlex Elder
rbd_dev_destroy(struct rbd_device * rbd_dev)5440c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev)
5441c53d5893SAlex Elder {
5442dd5ac32dSIlya Dryomov if (rbd_dev)
5443dd5ac32dSIlya Dryomov put_device(&rbd_dev->dev);
5444c53d5893SAlex Elder }
5445c53d5893SAlex Elder
5446dfc5606dSYehuda Sadeh /*
54479d475de5SAlex Elder * Get the size and object order for an image snapshot, or if
54489d475de5SAlex Elder * snap_id is CEPH_NOSNAP, gets this information for the base
54499d475de5SAlex Elder * image.
54509d475de5SAlex Elder */
_rbd_dev_v2_snap_size(struct rbd_device * rbd_dev,u64 snap_id,u8 * order,u64 * snap_size)54519d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
54529d475de5SAlex Elder u8 *order, u64 *snap_size)
54539d475de5SAlex Elder {
54549d475de5SAlex Elder __le64 snapid = cpu_to_le64(snap_id);
54559d475de5SAlex Elder int ret;
54569d475de5SAlex Elder struct {
54579d475de5SAlex Elder u8 order;
54589d475de5SAlex Elder __le64 size;
54599d475de5SAlex Elder } __attribute__ ((packed)) size_buf = { 0 };
54609d475de5SAlex Elder
5461ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5462ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_size",
54634157976bSAlex Elder &snapid, sizeof(snapid),
5464e2a58ee5SAlex Elder &size_buf, sizeof(size_buf));
546536be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
54669d475de5SAlex Elder if (ret < 0)
54679d475de5SAlex Elder return ret;
546857385b51SAlex Elder if (ret < sizeof (size_buf))
546957385b51SAlex Elder return -ERANGE;
54709d475de5SAlex Elder
5471c3545579SJosh Durgin if (order) {
54729d475de5SAlex Elder *order = size_buf.order;
5473c3545579SJosh Durgin dout(" order %u", (unsigned int)*order);
5474c3545579SJosh Durgin }
54759d475de5SAlex Elder *snap_size = le64_to_cpu(size_buf.size);
54769d475de5SAlex Elder
5477c3545579SJosh Durgin dout(" snap_id 0x%016llx snap_size = %llu\n",
5478c3545579SJosh Durgin (unsigned long long)snap_id,
54799d475de5SAlex Elder (unsigned long long)*snap_size);
54809d475de5SAlex Elder
54819d475de5SAlex Elder return 0;
54829d475de5SAlex Elder }
54839d475de5SAlex Elder
rbd_dev_v2_object_prefix(struct rbd_device * rbd_dev,char ** pobject_prefix)5484510a7330SIlya Dryomov static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev,
5485510a7330SIlya Dryomov char **pobject_prefix)
54861e130199SAlex Elder {
54875435d206SDongsheng Yang size_t size;
54881e130199SAlex Elder void *reply_buf;
5489510a7330SIlya Dryomov char *object_prefix;
54901e130199SAlex Elder int ret;
54911e130199SAlex Elder void *p;
54921e130199SAlex Elder
54935435d206SDongsheng Yang /* Response will be an encoded string, which includes a length */
54945435d206SDongsheng Yang size = sizeof(__le32) + RBD_OBJ_PREFIX_LEN_MAX;
54955435d206SDongsheng Yang reply_buf = kzalloc(size, GFP_KERNEL);
54961e130199SAlex Elder if (!reply_buf)
54971e130199SAlex Elder return -ENOMEM;
54981e130199SAlex Elder
5499ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5500ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_object_prefix",
55015435d206SDongsheng Yang NULL, 0, reply_buf, size);
550236be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
55031e130199SAlex Elder if (ret < 0)
55041e130199SAlex Elder goto out;
55051e130199SAlex Elder
55061e130199SAlex Elder p = reply_buf;
5507510a7330SIlya Dryomov object_prefix = ceph_extract_encoded_string(&p, p + ret, NULL,
5508510a7330SIlya Dryomov GFP_NOIO);
5509510a7330SIlya Dryomov if (IS_ERR(object_prefix)) {
5510510a7330SIlya Dryomov ret = PTR_ERR(object_prefix);
5511510a7330SIlya Dryomov goto out;
5512510a7330SIlya Dryomov }
551357385b51SAlex Elder ret = 0;
55141e130199SAlex Elder
5515510a7330SIlya Dryomov *pobject_prefix = object_prefix;
5516510a7330SIlya Dryomov dout(" object_prefix = %s\n", object_prefix);
55171e130199SAlex Elder out:
55181e130199SAlex Elder kfree(reply_buf);
55191e130199SAlex Elder
55201e130199SAlex Elder return ret;
55211e130199SAlex Elder }
55221e130199SAlex Elder
/*
 * Fetch the feature bits of the given snapshot (or of the image head,
 * depending on @snap_id) using the "get_features" method on the header
 * object.
 *
 * @read_only is passed through to the method as a single byte.
 *
 * Returns 0 and stores the feature mask in *snap_features on success.
 * Returns -ERANGE if the reply is shorter than expected, -ENXIO if the
 * image requires incompatible features outside RBD_FEATURES_SUPPORTED,
 * or the error returned by rbd_obj_method_sync().
 */
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
				     bool read_only, u64 *snap_features)
{
	struct {
		__le64 snap_id;
		u8 read_only;
	} features_in;
	struct {
		__le64 features;
		__le64 incompat;
	} __attribute__ ((packed)) features_buf = { 0 };
	u64 unsup;
	int ret;

	/*
	 * features_in is not packed, so sizeof(features_in) includes
	 * trailing padding, and all of it is handed to the method call
	 * below.  Clear the whole struct first so that no uninitialized
	 * stack bytes end up in the request.
	 */
	memset(&features_in, 0, sizeof(features_in));
	features_in.snap_id = cpu_to_le64(snap_id);
	features_in.read_only = read_only;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_features",
				  &features_in, sizeof(features_in),
				  &features_buf, sizeof(features_buf));
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;
	if (ret < sizeof (features_buf))
		return -ERANGE;

	/* Refuse images that need incompat features we don't implement */
	unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
	if (unsup) {
		rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
			 unsup);
		return -ENXIO;
	}

	*snap_features = le64_to_cpu(features_buf.features);

	dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
		(unsigned long long)snap_id,
		(unsigned long long)*snap_features,
		(unsigned long long)le64_to_cpu(features_buf.incompat));

	return 0;
}
5566b1b5402aSAlex Elder
556722e8bd51SIlya Dryomov /*
556822e8bd51SIlya Dryomov * These are generic image flags, but since they are used only for
556922e8bd51SIlya Dryomov * object map, store them in rbd_dev->object_map_flags.
557022e8bd51SIlya Dryomov *
557122e8bd51SIlya Dryomov * For the same reason, this function is called only on object map
557222e8bd51SIlya Dryomov * (re)load and not on header refresh.
557322e8bd51SIlya Dryomov */
rbd_dev_v2_get_flags(struct rbd_device * rbd_dev)557422e8bd51SIlya Dryomov static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev)
557522e8bd51SIlya Dryomov {
557622e8bd51SIlya Dryomov __le64 snapid = cpu_to_le64(rbd_dev->spec->snap_id);
557722e8bd51SIlya Dryomov __le64 flags;
557822e8bd51SIlya Dryomov int ret;
557922e8bd51SIlya Dryomov
558022e8bd51SIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
558122e8bd51SIlya Dryomov &rbd_dev->header_oloc, "get_flags",
558222e8bd51SIlya Dryomov &snapid, sizeof(snapid),
558322e8bd51SIlya Dryomov &flags, sizeof(flags));
558422e8bd51SIlya Dryomov if (ret < 0)
558522e8bd51SIlya Dryomov return ret;
558622e8bd51SIlya Dryomov if (ret < sizeof(flags))
558722e8bd51SIlya Dryomov return -EBADMSG;
558822e8bd51SIlya Dryomov
558922e8bd51SIlya Dryomov rbd_dev->object_map_flags = le64_to_cpu(flags);
559022e8bd51SIlya Dryomov return 0;
559122e8bd51SIlya Dryomov }
559222e8bd51SIlya Dryomov
5593eb3b2d6bSIlya Dryomov struct parent_image_info {
5594eb3b2d6bSIlya Dryomov u64 pool_id;
5595e92c0eafSIlya Dryomov const char *pool_ns;
5596eb3b2d6bSIlya Dryomov const char *image_id;
5597eb3b2d6bSIlya Dryomov u64 snap_id;
5598eb3b2d6bSIlya Dryomov
5599e92c0eafSIlya Dryomov bool has_overlap;
5600eb3b2d6bSIlya Dryomov u64 overlap;
5601eb3b2d6bSIlya Dryomov };
5602eb3b2d6bSIlya Dryomov
rbd_parent_info_cleanup(struct parent_image_info * pii)5603c1031177SIlya Dryomov static void rbd_parent_info_cleanup(struct parent_image_info *pii)
5604c1031177SIlya Dryomov {
5605c1031177SIlya Dryomov kfree(pii->pool_ns);
5606c1031177SIlya Dryomov kfree(pii->image_id);
5607c1031177SIlya Dryomov
5608c1031177SIlya Dryomov memset(pii, 0, sizeof(*pii));
5609c1031177SIlya Dryomov }
5610c1031177SIlya Dryomov
5611eb3b2d6bSIlya Dryomov /*
5612eb3b2d6bSIlya Dryomov * The caller is responsible for @pii.
5613eb3b2d6bSIlya Dryomov */
decode_parent_image_spec(void ** p,void * end,struct parent_image_info * pii)5614e92c0eafSIlya Dryomov static int decode_parent_image_spec(void **p, void *end,
5615e92c0eafSIlya Dryomov struct parent_image_info *pii)
5616e92c0eafSIlya Dryomov {
5617e92c0eafSIlya Dryomov u8 struct_v;
5618e92c0eafSIlya Dryomov u32 struct_len;
5619e92c0eafSIlya Dryomov int ret;
5620e92c0eafSIlya Dryomov
5621e92c0eafSIlya Dryomov ret = ceph_start_decoding(p, end, 1, "ParentImageSpec",
5622e92c0eafSIlya Dryomov &struct_v, &struct_len);
5623e92c0eafSIlya Dryomov if (ret)
5624e92c0eafSIlya Dryomov return ret;
5625e92c0eafSIlya Dryomov
5626e92c0eafSIlya Dryomov ceph_decode_64_safe(p, end, pii->pool_id, e_inval);
5627e92c0eafSIlya Dryomov pii->pool_ns = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
5628e92c0eafSIlya Dryomov if (IS_ERR(pii->pool_ns)) {
5629e92c0eafSIlya Dryomov ret = PTR_ERR(pii->pool_ns);
5630e92c0eafSIlya Dryomov pii->pool_ns = NULL;
5631e92c0eafSIlya Dryomov return ret;
5632e92c0eafSIlya Dryomov }
5633e92c0eafSIlya Dryomov pii->image_id = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
5634e92c0eafSIlya Dryomov if (IS_ERR(pii->image_id)) {
5635e92c0eafSIlya Dryomov ret = PTR_ERR(pii->image_id);
5636e92c0eafSIlya Dryomov pii->image_id = NULL;
5637e92c0eafSIlya Dryomov return ret;
5638e92c0eafSIlya Dryomov }
5639e92c0eafSIlya Dryomov ceph_decode_64_safe(p, end, pii->snap_id, e_inval);
5640e92c0eafSIlya Dryomov return 0;
5641e92c0eafSIlya Dryomov
5642e92c0eafSIlya Dryomov e_inval:
5643e92c0eafSIlya Dryomov return -EINVAL;
5644e92c0eafSIlya Dryomov }
5645e92c0eafSIlya Dryomov
__get_parent_info(struct rbd_device * rbd_dev,struct page * req_page,struct page * reply_page,struct parent_image_info * pii)5646e92c0eafSIlya Dryomov static int __get_parent_info(struct rbd_device *rbd_dev,
5647e92c0eafSIlya Dryomov struct page *req_page,
5648e92c0eafSIlya Dryomov struct page *reply_page,
5649e92c0eafSIlya Dryomov struct parent_image_info *pii)
5650e92c0eafSIlya Dryomov {
5651e92c0eafSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
5652e92c0eafSIlya Dryomov size_t reply_len = PAGE_SIZE;
5653e92c0eafSIlya Dryomov void *p, *end;
5654e92c0eafSIlya Dryomov int ret;
5655e92c0eafSIlya Dryomov
5656e92c0eafSIlya Dryomov ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5657e92c0eafSIlya Dryomov "rbd", "parent_get", CEPH_OSD_FLAG_READ,
565868ada915SIlya Dryomov req_page, sizeof(u64), &reply_page, &reply_len);
5659e92c0eafSIlya Dryomov if (ret)
5660e92c0eafSIlya Dryomov return ret == -EOPNOTSUPP ? 1 : ret;
5661e92c0eafSIlya Dryomov
5662e92c0eafSIlya Dryomov p = page_address(reply_page);
5663e92c0eafSIlya Dryomov end = p + reply_len;
5664e92c0eafSIlya Dryomov ret = decode_parent_image_spec(&p, end, pii);
5665e92c0eafSIlya Dryomov if (ret)
5666e92c0eafSIlya Dryomov return ret;
5667e92c0eafSIlya Dryomov
5668e92c0eafSIlya Dryomov ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5669e92c0eafSIlya Dryomov "rbd", "parent_overlap_get", CEPH_OSD_FLAG_READ,
567068ada915SIlya Dryomov req_page, sizeof(u64), &reply_page, &reply_len);
5671e92c0eafSIlya Dryomov if (ret)
5672e92c0eafSIlya Dryomov return ret;
5673e92c0eafSIlya Dryomov
5674e92c0eafSIlya Dryomov p = page_address(reply_page);
5675e92c0eafSIlya Dryomov end = p + reply_len;
5676e92c0eafSIlya Dryomov ceph_decode_8_safe(&p, end, pii->has_overlap, e_inval);
5677e92c0eafSIlya Dryomov if (pii->has_overlap)
5678e92c0eafSIlya Dryomov ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
5679e92c0eafSIlya Dryomov
5680c1031177SIlya Dryomov dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
5681c1031177SIlya Dryomov __func__, pii->pool_id, pii->pool_ns, pii->image_id, pii->snap_id,
5682c1031177SIlya Dryomov pii->has_overlap, pii->overlap);
5683e92c0eafSIlya Dryomov return 0;
5684e92c0eafSIlya Dryomov
5685e92c0eafSIlya Dryomov e_inval:
5686e92c0eafSIlya Dryomov return -EINVAL;
5687e92c0eafSIlya Dryomov }
5688e92c0eafSIlya Dryomov
5689e92c0eafSIlya Dryomov /*
5690e92c0eafSIlya Dryomov * The caller is responsible for @pii.
5691e92c0eafSIlya Dryomov */
__get_parent_info_legacy(struct rbd_device * rbd_dev,struct page * req_page,struct page * reply_page,struct parent_image_info * pii)5692eb3b2d6bSIlya Dryomov static int __get_parent_info_legacy(struct rbd_device *rbd_dev,
5693eb3b2d6bSIlya Dryomov struct page *req_page,
5694eb3b2d6bSIlya Dryomov struct page *reply_page,
5695eb3b2d6bSIlya Dryomov struct parent_image_info *pii)
5696eb3b2d6bSIlya Dryomov {
5697eb3b2d6bSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
5698eb3b2d6bSIlya Dryomov size_t reply_len = PAGE_SIZE;
5699eb3b2d6bSIlya Dryomov void *p, *end;
5700eb3b2d6bSIlya Dryomov int ret;
5701eb3b2d6bSIlya Dryomov
5702eb3b2d6bSIlya Dryomov ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5703eb3b2d6bSIlya Dryomov "rbd", "get_parent", CEPH_OSD_FLAG_READ,
570468ada915SIlya Dryomov req_page, sizeof(u64), &reply_page, &reply_len);
5705eb3b2d6bSIlya Dryomov if (ret)
5706eb3b2d6bSIlya Dryomov return ret;
5707eb3b2d6bSIlya Dryomov
5708eb3b2d6bSIlya Dryomov p = page_address(reply_page);
5709eb3b2d6bSIlya Dryomov end = p + reply_len;
5710eb3b2d6bSIlya Dryomov ceph_decode_64_safe(&p, end, pii->pool_id, e_inval);
5711eb3b2d6bSIlya Dryomov pii->image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
5712eb3b2d6bSIlya Dryomov if (IS_ERR(pii->image_id)) {
5713eb3b2d6bSIlya Dryomov ret = PTR_ERR(pii->image_id);
5714eb3b2d6bSIlya Dryomov pii->image_id = NULL;
5715eb3b2d6bSIlya Dryomov return ret;
5716eb3b2d6bSIlya Dryomov }
5717eb3b2d6bSIlya Dryomov ceph_decode_64_safe(&p, end, pii->snap_id, e_inval);
5718e92c0eafSIlya Dryomov pii->has_overlap = true;
5719eb3b2d6bSIlya Dryomov ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
5720eb3b2d6bSIlya Dryomov
5721c1031177SIlya Dryomov dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
5722c1031177SIlya Dryomov __func__, pii->pool_id, pii->pool_ns, pii->image_id, pii->snap_id,
5723c1031177SIlya Dryomov pii->has_overlap, pii->overlap);
5724eb3b2d6bSIlya Dryomov return 0;
5725eb3b2d6bSIlya Dryomov
5726eb3b2d6bSIlya Dryomov e_inval:
5727eb3b2d6bSIlya Dryomov return -EINVAL;
5728eb3b2d6bSIlya Dryomov }
5729eb3b2d6bSIlya Dryomov
rbd_dev_v2_parent_info(struct rbd_device * rbd_dev,struct parent_image_info * pii)5730c1031177SIlya Dryomov static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev,
5731eb3b2d6bSIlya Dryomov struct parent_image_info *pii)
5732eb3b2d6bSIlya Dryomov {
5733eb3b2d6bSIlya Dryomov struct page *req_page, *reply_page;
5734eb3b2d6bSIlya Dryomov void *p;
5735eb3b2d6bSIlya Dryomov int ret;
5736eb3b2d6bSIlya Dryomov
5737eb3b2d6bSIlya Dryomov req_page = alloc_page(GFP_KERNEL);
5738eb3b2d6bSIlya Dryomov if (!req_page)
5739eb3b2d6bSIlya Dryomov return -ENOMEM;
5740eb3b2d6bSIlya Dryomov
5741eb3b2d6bSIlya Dryomov reply_page = alloc_page(GFP_KERNEL);
5742eb3b2d6bSIlya Dryomov if (!reply_page) {
5743eb3b2d6bSIlya Dryomov __free_page(req_page);
5744eb3b2d6bSIlya Dryomov return -ENOMEM;
5745eb3b2d6bSIlya Dryomov }
5746eb3b2d6bSIlya Dryomov
5747eb3b2d6bSIlya Dryomov p = page_address(req_page);
5748eb3b2d6bSIlya Dryomov ceph_encode_64(&p, rbd_dev->spec->snap_id);
5749e92c0eafSIlya Dryomov ret = __get_parent_info(rbd_dev, req_page, reply_page, pii);
5750e92c0eafSIlya Dryomov if (ret > 0)
5751e92c0eafSIlya Dryomov ret = __get_parent_info_legacy(rbd_dev, req_page, reply_page,
5752e92c0eafSIlya Dryomov pii);
5753eb3b2d6bSIlya Dryomov
5754eb3b2d6bSIlya Dryomov __free_page(req_page);
5755eb3b2d6bSIlya Dryomov __free_page(reply_page);
5756eb3b2d6bSIlya Dryomov return ret;
5757eb3b2d6bSIlya Dryomov }
5758eb3b2d6bSIlya Dryomov
rbd_dev_setup_parent(struct rbd_device * rbd_dev)5759c1031177SIlya Dryomov static int rbd_dev_setup_parent(struct rbd_device *rbd_dev)
576086b00e0dSAlex Elder {
576186b00e0dSAlex Elder struct rbd_spec *parent_spec;
5762eb3b2d6bSIlya Dryomov struct parent_image_info pii = { 0 };
576386b00e0dSAlex Elder int ret;
576486b00e0dSAlex Elder
576586b00e0dSAlex Elder parent_spec = rbd_spec_alloc();
576686b00e0dSAlex Elder if (!parent_spec)
576786b00e0dSAlex Elder return -ENOMEM;
576886b00e0dSAlex Elder
5769c1031177SIlya Dryomov ret = rbd_dev_v2_parent_info(rbd_dev, &pii);
5770eb3b2d6bSIlya Dryomov if (ret)
577186b00e0dSAlex Elder goto out_err;
577286b00e0dSAlex Elder
5773c1031177SIlya Dryomov if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap)
577486b00e0dSAlex Elder goto out; /* No parent? No problem. */
577586b00e0dSAlex Elder
57760903e875SAlex Elder /* The ceph file layout needs to fit pool id in 32 bits */
57770903e875SAlex Elder
57780903e875SAlex Elder ret = -EIO;
5779eb3b2d6bSIlya Dryomov if (pii.pool_id > (u64)U32_MAX) {
57809584d508SIlya Dryomov rbd_warn(NULL, "parent pool id too large (%llu > %u)",
5781eb3b2d6bSIlya Dryomov (unsigned long long)pii.pool_id, U32_MAX);
578257385b51SAlex Elder goto out_err;
5783c0cd10dbSAlex Elder }
57840903e875SAlex Elder
57853b5cf2a2SAlex Elder /*
5786c1031177SIlya Dryomov * The parent won't change except when the clone is flattened,
5787c1031177SIlya Dryomov * so we only need to record the parent image spec once.
57883b5cf2a2SAlex Elder */
5789eb3b2d6bSIlya Dryomov parent_spec->pool_id = pii.pool_id;
5790e92c0eafSIlya Dryomov if (pii.pool_ns && *pii.pool_ns) {
5791e92c0eafSIlya Dryomov parent_spec->pool_ns = pii.pool_ns;
5792e92c0eafSIlya Dryomov pii.pool_ns = NULL;
5793e92c0eafSIlya Dryomov }
5794eb3b2d6bSIlya Dryomov parent_spec->image_id = pii.image_id;
5795eb3b2d6bSIlya Dryomov pii.image_id = NULL;
5796eb3b2d6bSIlya Dryomov parent_spec->snap_id = pii.snap_id;
5797b26c047bSIlya Dryomov
5798c1031177SIlya Dryomov rbd_assert(!rbd_dev->parent_spec);
579986b00e0dSAlex Elder rbd_dev->parent_spec = parent_spec;
580086b00e0dSAlex Elder parent_spec = NULL; /* rbd_dev now owns this */
58013b5cf2a2SAlex Elder
58023b5cf2a2SAlex Elder /*
5803c1031177SIlya Dryomov * Record the parent overlap. If it's zero, issue a warning as
5804c1031177SIlya Dryomov * we will proceed as if there is no parent.
58053b5cf2a2SAlex Elder */
5806c1031177SIlya Dryomov if (!pii.overlap)
5807cf32bd9cSIlya Dryomov rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
5808eb3b2d6bSIlya Dryomov rbd_dev->parent_overlap = pii.overlap;
5809cf32bd9cSIlya Dryomov
581086b00e0dSAlex Elder out:
581186b00e0dSAlex Elder ret = 0;
581286b00e0dSAlex Elder out_err:
5813c1031177SIlya Dryomov rbd_parent_info_cleanup(&pii);
581486b00e0dSAlex Elder rbd_spec_put(parent_spec);
581586b00e0dSAlex Elder return ret;
581686b00e0dSAlex Elder }
581786b00e0dSAlex Elder
/*
 * Fetch the striping parameters of a format 2 image with the
 * "get_stripe_unit_count" method on the header object.
 *
 * Returns 0 and fills *stripe_unit and *stripe_count on success,
 * -ERANGE on a short reply, or the error from rbd_obj_method_sync().
 */
static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev,
				    u64 *stripe_unit, u64 *stripe_count)
{
	struct {
		__le64 stripe_unit;
		__le64 stripe_count;
	} __attribute__ ((packed)) reply = { 0 };
	size_t reply_size = sizeof (reply);
	int len;

	len = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc,
				  "get_stripe_unit_count",
				  NULL, 0, &reply, reply_size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, len);
	if (len < 0)
		return len;
	if (len < reply_size)
		return -ERANGE;

	*stripe_unit = le64_to_cpu(reply.stripe_unit);
	*stripe_count = le64_to_cpu(reply.stripe_count);
	dout(" stripe_unit = %llu stripe_count = %llu\n", *stripe_unit,
	     *stripe_count);

	return 0;
}
5844cc070d59SAlex Elder
/*
 * Fetch the data pool id of a format 2 image with the "get_data_pool"
 * method on the header object.
 *
 * Returns 0 and stores the id in *data_pool_id on success, -EBADMSG on
 * a short reply, or the error from rbd_obj_method_sync().  A reply of
 * CEPH_NOPOOL triggers a warning but is still returned as success.
 */
static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev, s64 *data_pool_id)
{
	__le64 pool_id_le;
	int len;

	len = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_data_pool",
				  NULL, 0, &pool_id_le, sizeof(pool_id_le));
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, len);
	if (len < 0)
		return len;
	if (len < sizeof(pool_id_le))
		return -EBADMSG;

	*data_pool_id = le64_to_cpu(pool_id_le);
	dout(" data_pool_id = %lld\n", *data_pool_id);
	WARN_ON(*data_pool_id == CEPH_NOPOOL);

	return 0;
}
58667e97332eSIlya Dryomov
rbd_dev_image_name(struct rbd_device * rbd_dev)58679e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
58689e15b77dSAlex Elder {
5869ecd4a68aSIlya Dryomov CEPH_DEFINE_OID_ONSTACK(oid);
58709e15b77dSAlex Elder size_t image_id_size;
58719e15b77dSAlex Elder char *image_id;
58729e15b77dSAlex Elder void *p;
58739e15b77dSAlex Elder void *end;
58749e15b77dSAlex Elder size_t size;
58759e15b77dSAlex Elder void *reply_buf = NULL;
58769e15b77dSAlex Elder size_t len = 0;
58779e15b77dSAlex Elder char *image_name = NULL;
58789e15b77dSAlex Elder int ret;
58799e15b77dSAlex Elder
58809e15b77dSAlex Elder rbd_assert(!rbd_dev->spec->image_name);
58819e15b77dSAlex Elder
588269e7a02fSAlex Elder len = strlen(rbd_dev->spec->image_id);
588369e7a02fSAlex Elder image_id_size = sizeof (__le32) + len;
58849e15b77dSAlex Elder image_id = kmalloc(image_id_size, GFP_KERNEL);
58859e15b77dSAlex Elder if (!image_id)
58869e15b77dSAlex Elder return NULL;
58879e15b77dSAlex Elder
58889e15b77dSAlex Elder p = image_id;
58894157976bSAlex Elder end = image_id + image_id_size;
589069e7a02fSAlex Elder ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
58919e15b77dSAlex Elder
58929e15b77dSAlex Elder size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
58939e15b77dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL);
58949e15b77dSAlex Elder if (!reply_buf)
58959e15b77dSAlex Elder goto out;
58969e15b77dSAlex Elder
5897ecd4a68aSIlya Dryomov ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
5898ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
5899ecd4a68aSIlya Dryomov "dir_get_name", image_id, image_id_size,
5900e2a58ee5SAlex Elder reply_buf, size);
59019e15b77dSAlex Elder if (ret < 0)
59029e15b77dSAlex Elder goto out;
59039e15b77dSAlex Elder p = reply_buf;
5904f40eb349SAlex Elder end = reply_buf + ret;
5905f40eb349SAlex Elder
59069e15b77dSAlex Elder image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
59079e15b77dSAlex Elder if (IS_ERR(image_name))
59089e15b77dSAlex Elder image_name = NULL;
59099e15b77dSAlex Elder else
59109e15b77dSAlex Elder dout("%s: name is %s len is %zd\n", __func__, image_name, len);
59119e15b77dSAlex Elder out:
59129e15b77dSAlex Elder kfree(reply_buf);
59139e15b77dSAlex Elder kfree(image_id);
59149e15b77dSAlex Elder
59159e15b77dSAlex Elder return image_name;
59169e15b77dSAlex Elder }
59179e15b77dSAlex Elder
rbd_v1_snap_id_by_name(struct rbd_device * rbd_dev,const char * name)59182ad3d716SAlex Elder static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
59192ad3d716SAlex Elder {
59202ad3d716SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc;
59212ad3d716SAlex Elder const char *snap_name;
59222ad3d716SAlex Elder u32 which = 0;
59232ad3d716SAlex Elder
59242ad3d716SAlex Elder /* Skip over names until we find the one we are looking for */
59252ad3d716SAlex Elder
59262ad3d716SAlex Elder snap_name = rbd_dev->header.snap_names;
59272ad3d716SAlex Elder while (which < snapc->num_snaps) {
59282ad3d716SAlex Elder if (!strcmp(name, snap_name))
59292ad3d716SAlex Elder return snapc->snaps[which];
59302ad3d716SAlex Elder snap_name += strlen(snap_name) + 1;
59312ad3d716SAlex Elder which++;
59322ad3d716SAlex Elder }
59332ad3d716SAlex Elder return CEPH_NOSNAP;
59342ad3d716SAlex Elder }
59352ad3d716SAlex Elder
rbd_v2_snap_id_by_name(struct rbd_device * rbd_dev,const char * name)59362ad3d716SAlex Elder static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
59372ad3d716SAlex Elder {
59382ad3d716SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc;
59392ad3d716SAlex Elder u32 which;
59402ad3d716SAlex Elder bool found = false;
59412ad3d716SAlex Elder u64 snap_id;
59422ad3d716SAlex Elder
59432ad3d716SAlex Elder for (which = 0; !found && which < snapc->num_snaps; which++) {
59442ad3d716SAlex Elder const char *snap_name;
59452ad3d716SAlex Elder
59462ad3d716SAlex Elder snap_id = snapc->snaps[which];
59472ad3d716SAlex Elder snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
5948efadc98aSJosh Durgin if (IS_ERR(snap_name)) {
5949efadc98aSJosh Durgin /* ignore no-longer existing snapshots */
5950efadc98aSJosh Durgin if (PTR_ERR(snap_name) == -ENOENT)
5951efadc98aSJosh Durgin continue;
5952efadc98aSJosh Durgin else
59532ad3d716SAlex Elder break;
5954efadc98aSJosh Durgin }
59552ad3d716SAlex Elder found = !strcmp(name, snap_name);
59562ad3d716SAlex Elder kfree(snap_name);
59572ad3d716SAlex Elder }
59582ad3d716SAlex Elder return found ? snap_id : CEPH_NOSNAP;
59592ad3d716SAlex Elder }
59602ad3d716SAlex Elder
59612ad3d716SAlex Elder /*
59622ad3d716SAlex Elder * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
59632ad3d716SAlex Elder * no snapshot by that name is found, or if an error occurs.
59642ad3d716SAlex Elder */
rbd_snap_id_by_name(struct rbd_device * rbd_dev,const char * name)59652ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
59662ad3d716SAlex Elder {
59672ad3d716SAlex Elder if (rbd_dev->image_format == 1)
59682ad3d716SAlex Elder return rbd_v1_snap_id_by_name(rbd_dev, name);
59692ad3d716SAlex Elder
59702ad3d716SAlex Elder return rbd_v2_snap_id_by_name(rbd_dev, name);
59712ad3d716SAlex Elder }
59722ad3d716SAlex Elder
59739e15b77dSAlex Elder /*
597404077599SIlya Dryomov * An image being mapped will have everything but the snap id.
59759e15b77dSAlex Elder */
rbd_spec_fill_snap_id(struct rbd_device * rbd_dev)597604077599SIlya Dryomov static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
597704077599SIlya Dryomov {
597804077599SIlya Dryomov struct rbd_spec *spec = rbd_dev->spec;
597904077599SIlya Dryomov
598004077599SIlya Dryomov rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
598104077599SIlya Dryomov rbd_assert(spec->image_id && spec->image_name);
598204077599SIlya Dryomov rbd_assert(spec->snap_name);
598304077599SIlya Dryomov
598404077599SIlya Dryomov if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
598504077599SIlya Dryomov u64 snap_id;
598604077599SIlya Dryomov
598704077599SIlya Dryomov snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
598804077599SIlya Dryomov if (snap_id == CEPH_NOSNAP)
598904077599SIlya Dryomov return -ENOENT;
599004077599SIlya Dryomov
599104077599SIlya Dryomov spec->snap_id = snap_id;
599204077599SIlya Dryomov } else {
599304077599SIlya Dryomov spec->snap_id = CEPH_NOSNAP;
599404077599SIlya Dryomov }
599504077599SIlya Dryomov
599604077599SIlya Dryomov return 0;
599704077599SIlya Dryomov }
599804077599SIlya Dryomov
599904077599SIlya Dryomov /*
600004077599SIlya Dryomov * A parent image will have all ids but none of the names.
600104077599SIlya Dryomov *
600204077599SIlya Dryomov * All names in an rbd spec are dynamically allocated. It's OK if we
600304077599SIlya Dryomov * can't figure out the name for an image id.
600404077599SIlya Dryomov */
rbd_spec_fill_names(struct rbd_device * rbd_dev)600504077599SIlya Dryomov static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
60069e15b77dSAlex Elder {
60072e9f7f1cSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
60082e9f7f1cSAlex Elder struct rbd_spec *spec = rbd_dev->spec;
60092e9f7f1cSAlex Elder const char *pool_name;
60102e9f7f1cSAlex Elder const char *image_name;
60112e9f7f1cSAlex Elder const char *snap_name;
60129e15b77dSAlex Elder int ret;
60139e15b77dSAlex Elder
601404077599SIlya Dryomov rbd_assert(spec->pool_id != CEPH_NOPOOL);
601504077599SIlya Dryomov rbd_assert(spec->image_id);
601604077599SIlya Dryomov rbd_assert(spec->snap_id != CEPH_NOSNAP);
60179e15b77dSAlex Elder
60182e9f7f1cSAlex Elder /* Get the pool name; we have to make our own copy of this */
60199e15b77dSAlex Elder
60202e9f7f1cSAlex Elder pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
60212e9f7f1cSAlex Elder if (!pool_name) {
60222e9f7f1cSAlex Elder rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
6023935dc89fSAlex Elder return -EIO;
6024935dc89fSAlex Elder }
60252e9f7f1cSAlex Elder pool_name = kstrdup(pool_name, GFP_KERNEL);
60262e9f7f1cSAlex Elder if (!pool_name)
60279e15b77dSAlex Elder return -ENOMEM;
60289e15b77dSAlex Elder
60299e15b77dSAlex Elder /* Fetch the image name; tolerate failure here */
60309e15b77dSAlex Elder
60312e9f7f1cSAlex Elder image_name = rbd_dev_image_name(rbd_dev);
60322e9f7f1cSAlex Elder if (!image_name)
603306ecc6cbSAlex Elder rbd_warn(rbd_dev, "unable to get image name");
60349e15b77dSAlex Elder
603504077599SIlya Dryomov /* Fetch the snapshot name */
60369e15b77dSAlex Elder
60372e9f7f1cSAlex Elder snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
6038da6a6b63SJosh Durgin if (IS_ERR(snap_name)) {
6039da6a6b63SJosh Durgin ret = PTR_ERR(snap_name);
60409e15b77dSAlex Elder goto out_err;
60412e9f7f1cSAlex Elder }
60422e9f7f1cSAlex Elder
60432e9f7f1cSAlex Elder spec->pool_name = pool_name;
60442e9f7f1cSAlex Elder spec->image_name = image_name;
60452e9f7f1cSAlex Elder spec->snap_name = snap_name;
60469e15b77dSAlex Elder
60479e15b77dSAlex Elder return 0;
604804077599SIlya Dryomov
60499e15b77dSAlex Elder out_err:
60502e9f7f1cSAlex Elder kfree(image_name);
60512e9f7f1cSAlex Elder kfree(pool_name);
60529e15b77dSAlex Elder return ret;
60539e15b77dSAlex Elder }
60549e15b77dSAlex Elder
rbd_dev_v2_snap_context(struct rbd_device * rbd_dev,struct ceph_snap_context ** psnapc)6055510a7330SIlya Dryomov static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev,
6056510a7330SIlya Dryomov struct ceph_snap_context **psnapc)
605735d489f9SAlex Elder {
605835d489f9SAlex Elder size_t size;
605935d489f9SAlex Elder int ret;
606035d489f9SAlex Elder void *reply_buf;
606135d489f9SAlex Elder void *p;
606235d489f9SAlex Elder void *end;
606335d489f9SAlex Elder u64 seq;
606435d489f9SAlex Elder u32 snap_count;
606535d489f9SAlex Elder struct ceph_snap_context *snapc;
606635d489f9SAlex Elder u32 i;
606735d489f9SAlex Elder
606835d489f9SAlex Elder /*
606935d489f9SAlex Elder * We'll need room for the seq value (maximum snapshot id),
607035d489f9SAlex Elder * snapshot count, and array of that many snapshot ids.
607135d489f9SAlex Elder * For now we have a fixed upper limit on the number we're
607235d489f9SAlex Elder * prepared to receive.
607335d489f9SAlex Elder */
607435d489f9SAlex Elder size = sizeof (__le64) + sizeof (__le32) +
607535d489f9SAlex Elder RBD_MAX_SNAP_COUNT * sizeof (__le64);
607635d489f9SAlex Elder reply_buf = kzalloc(size, GFP_KERNEL);
607735d489f9SAlex Elder if (!reply_buf)
607835d489f9SAlex Elder return -ENOMEM;
607935d489f9SAlex Elder
6080ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
6081ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_snapcontext",
6082ecd4a68aSIlya Dryomov NULL, 0, reply_buf, size);
608336be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
608435d489f9SAlex Elder if (ret < 0)
608535d489f9SAlex Elder goto out;
608635d489f9SAlex Elder
608735d489f9SAlex Elder p = reply_buf;
608857385b51SAlex Elder end = reply_buf + ret;
608957385b51SAlex Elder ret = -ERANGE;
609035d489f9SAlex Elder ceph_decode_64_safe(&p, end, seq, out);
609135d489f9SAlex Elder ceph_decode_32_safe(&p, end, snap_count, out);
609235d489f9SAlex Elder
609335d489f9SAlex Elder /*
609435d489f9SAlex Elder * Make sure the reported number of snapshot ids wouldn't go
609535d489f9SAlex Elder * beyond the end of our buffer. But before checking that,
609635d489f9SAlex Elder * make sure the computed size of the snapshot context we
609735d489f9SAlex Elder * allocate is representable in a size_t.
609835d489f9SAlex Elder */
609935d489f9SAlex Elder if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
610035d489f9SAlex Elder / sizeof (u64)) {
610135d489f9SAlex Elder ret = -EINVAL;
610235d489f9SAlex Elder goto out;
610335d489f9SAlex Elder }
610435d489f9SAlex Elder if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
610535d489f9SAlex Elder goto out;
6106468521c1SAlex Elder ret = 0;
610735d489f9SAlex Elder
6108812164f8SAlex Elder snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
610935d489f9SAlex Elder if (!snapc) {
611035d489f9SAlex Elder ret = -ENOMEM;
611135d489f9SAlex Elder goto out;
611235d489f9SAlex Elder }
611335d489f9SAlex Elder snapc->seq = seq;
611435d489f9SAlex Elder for (i = 0; i < snap_count; i++)
611535d489f9SAlex Elder snapc->snaps[i] = ceph_decode_64(&p);
611635d489f9SAlex Elder
6117510a7330SIlya Dryomov *psnapc = snapc;
611835d489f9SAlex Elder dout(" snap context seq = %llu, snap_count = %u\n",
611935d489f9SAlex Elder (unsigned long long)seq, (unsigned int)snap_count);
612035d489f9SAlex Elder out:
612135d489f9SAlex Elder kfree(reply_buf);
612235d489f9SAlex Elder
612357385b51SAlex Elder return ret;
612435d489f9SAlex Elder }
612535d489f9SAlex Elder
/*
 * Fetch the name of snapshot @snap_id of a format 2 image with the
 * "get_snapshot_name" method on the header object.
 *
 * Returns the extracted name on success, or an ERR_PTR() carrying
 * -ENOMEM, the error from rbd_obj_method_sync(), or the error from
 * decoding the reply.
 */
static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id)
{
	/* The reply is an encoded string: a 32-bit length, then the bytes */
	size_t buf_size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
	__le64 snap_id_le = cpu_to_le64(snap_id);
	char *snap_name;
	void *buf;
	void *pos;
	int ret;

	buf = kmalloc(buf_size, GFP_KERNEL);
	if (!buf)
		return ERR_PTR(-ENOMEM);

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_snapshot_name",
				  &snap_id_le, sizeof(snap_id_le),
				  buf, buf_size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0) {
		snap_name = ERR_PTR(ret);
		goto out_free;
	}

	pos = buf;
	snap_name = ceph_extract_encoded_string(&pos, buf + ret, NULL,
						GFP_KERNEL);
	if (!IS_ERR(snap_name))
		dout(" snap_id 0x%016llx snap_name = %s\n",
			(unsigned long long)snap_id, snap_name);
out_free:
	kfree(buf);

	return snap_name;
}
6165b8b1e2dbSAlex Elder
/*
 * Populate @header for a format 2 image.
 *
 * The image size and snapshot context are fetched on every call.  The
 * object order and the fields filled in by rbd_dev_v2_header_onetime()
 * are fetched only when @first_time is true.
 *
 * Returns 0 on success, or the error from the first step that failed.
 */
static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev,
				  struct rbd_image_header *header,
				  bool first_time)
{
	int ret;

	ret = _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
				    first_time ? &header->obj_order : NULL,
				    &header->image_size);
	if (!ret && first_time)
		ret = rbd_dev_v2_header_onetime(rbd_dev, header);
	if (!ret)
		ret = rbd_dev_v2_snap_context(rbd_dev, &header->snapc);

	return ret;
}
6190117973fbSAlex Elder
/*
 * Fetch header information into @header, dispatching on the image
 * format recorded in @rbd_dev.
 *
 * @header must not already carry an object prefix or a snapshot
 * context (asserted below).  Returns whatever the format-specific
 * helper returns.
 */
static int rbd_dev_header_info(struct rbd_device *rbd_dev,
			       struct rbd_image_header *header,
			       bool first_time)
{
	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	rbd_assert(!header->object_prefix && !header->snapc);

	/* Anything that is not format 1 is handled by the v2 path */
	if (rbd_dev->image_format != 1)
		return rbd_dev_v2_header_info(rbd_dev, header, first_time);

	return rbd_dev_v1_header_info(rbd_dev, header, first_time);
}
6203a720ae09SIlya Dryomov
/*
 * Advances *buf past any leading white space so that it points at the
 * first character of the next token, or at the terminating '\0' if no
 * token remains.  Returns the length of that token, i.e. the number of
 * consecutive non-white-space characters starting at the updated *buf
 * (0 if there is none).  *buf itself is not moved past the token.
 *
 * The string at *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The set of characters for which isspace() is nonzero in the
	 * "C" and "POSIX" locales.
	 */
	static const char whitespace[] = " \f\n\r\t\v";
	const char *token = *buf;

	token += strspn(token, whitespace);	/* Skip leading white space */
	*buf = token;

	return strcspn(token, whitespace);	/* Length of the token */
}
6222e28fff26SAlex Elder
6223e28fff26SAlex Elder /*
6224ea3352f4SAlex Elder * Finds the next token in *buf, dynamically allocates a buffer big
6225ea3352f4SAlex Elder * enough to hold a copy of it, and copies the token into the new
6226ea3352f4SAlex Elder * buffer. The copy is guaranteed to be terminated with '\0'. Note
6227ea3352f4SAlex Elder * that a duplicate buffer is created even for a zero-length token.
6228ea3352f4SAlex Elder *
6229ea3352f4SAlex Elder * Returns a pointer to the newly-allocated duplicate, or a null
6230ea3352f4SAlex Elder * pointer if memory for the duplicate was not available. If
6231ea3352f4SAlex Elder * the lenp argument is a non-null pointer, the length of the token
6232ea3352f4SAlex Elder * (not including the '\0') is returned in *lenp.
6233ea3352f4SAlex Elder *
6234ea3352f4SAlex Elder * If successful, the *buf pointer will be updated to point beyond
6235ea3352f4SAlex Elder * the end of the found token.
6236ea3352f4SAlex Elder *
6237ea3352f4SAlex Elder * Note: uses GFP_KERNEL for allocation.
6238ea3352f4SAlex Elder */
dup_token(const char ** buf,size_t * lenp)6239ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp)
6240ea3352f4SAlex Elder {
6241ea3352f4SAlex Elder char *dup;
6242ea3352f4SAlex Elder size_t len;
6243ea3352f4SAlex Elder
6244ea3352f4SAlex Elder len = next_token(buf);
62454caf35f9SAlex Elder dup = kmemdup(*buf, len + 1, GFP_KERNEL);
6246ea3352f4SAlex Elder if (!dup)
6247ea3352f4SAlex Elder return NULL;
6248ea3352f4SAlex Elder *(dup + len) = '\0';
6249ea3352f4SAlex Elder *buf += len;
6250ea3352f4SAlex Elder
6251ea3352f4SAlex Elder if (lenp)
6252ea3352f4SAlex Elder *lenp = len;
6253ea3352f4SAlex Elder
6254ea3352f4SAlex Elder return dup;
6255ea3352f4SAlex Elder }
6256ea3352f4SAlex Elder
rbd_parse_param(struct fs_parameter * param,struct rbd_parse_opts_ctx * pctx)625782995cc6SDavid Howells static int rbd_parse_param(struct fs_parameter *param,
625882995cc6SDavid Howells struct rbd_parse_opts_ctx *pctx)
625982995cc6SDavid Howells {
626082995cc6SDavid Howells struct rbd_options *opt = pctx->opts;
626182995cc6SDavid Howells struct fs_parse_result result;
62623fbb8d55SAl Viro struct p_log log = {.prefix = "rbd"};
626382995cc6SDavid Howells int token, ret;
626482995cc6SDavid Howells
626582995cc6SDavid Howells ret = ceph_parse_param(param, pctx->copts, NULL);
626682995cc6SDavid Howells if (ret != -ENOPARAM)
626782995cc6SDavid Howells return ret;
626882995cc6SDavid Howells
6269d7167b14SAl Viro token = __fs_parse(&log, rbd_parameters, param, &result);
627082995cc6SDavid Howells dout("%s fs_parse '%s' token %d\n", __func__, param->key, token);
627182995cc6SDavid Howells if (token < 0) {
62722c3f3dc3SAl Viro if (token == -ENOPARAM)
62732c3f3dc3SAl Viro return inval_plog(&log, "Unknown parameter '%s'",
627482995cc6SDavid Howells param->key);
627582995cc6SDavid Howells return token;
627682995cc6SDavid Howells }
627782995cc6SDavid Howells
627882995cc6SDavid Howells switch (token) {
627982995cc6SDavid Howells case Opt_queue_depth:
628082995cc6SDavid Howells if (result.uint_32 < 1)
628182995cc6SDavid Howells goto out_of_range;
628282995cc6SDavid Howells opt->queue_depth = result.uint_32;
628382995cc6SDavid Howells break;
628482995cc6SDavid Howells case Opt_alloc_size:
628582995cc6SDavid Howells if (result.uint_32 < SECTOR_SIZE)
628682995cc6SDavid Howells goto out_of_range;
62872c3f3dc3SAl Viro if (!is_power_of_2(result.uint_32))
62882c3f3dc3SAl Viro return inval_plog(&log, "alloc_size must be a power of 2");
628982995cc6SDavid Howells opt->alloc_size = result.uint_32;
629082995cc6SDavid Howells break;
629182995cc6SDavid Howells case Opt_lock_timeout:
629282995cc6SDavid Howells /* 0 is "wait forever" (i.e. infinite timeout) */
629382995cc6SDavid Howells if (result.uint_32 > INT_MAX / 1000)
629482995cc6SDavid Howells goto out_of_range;
629582995cc6SDavid Howells opt->lock_timeout = msecs_to_jiffies(result.uint_32 * 1000);
629682995cc6SDavid Howells break;
629782995cc6SDavid Howells case Opt_pool_ns:
629882995cc6SDavid Howells kfree(pctx->spec->pool_ns);
629982995cc6SDavid Howells pctx->spec->pool_ns = param->string;
630082995cc6SDavid Howells param->string = NULL;
630182995cc6SDavid Howells break;
6302dc1dad8eSIlya Dryomov case Opt_compression_hint:
6303dc1dad8eSIlya Dryomov switch (result.uint_32) {
6304dc1dad8eSIlya Dryomov case Opt_compression_hint_none:
6305dc1dad8eSIlya Dryomov opt->alloc_hint_flags &=
6306dc1dad8eSIlya Dryomov ~(CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE |
6307dc1dad8eSIlya Dryomov CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE);
6308dc1dad8eSIlya Dryomov break;
6309dc1dad8eSIlya Dryomov case Opt_compression_hint_compressible:
6310dc1dad8eSIlya Dryomov opt->alloc_hint_flags |=
6311dc1dad8eSIlya Dryomov CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE;
6312dc1dad8eSIlya Dryomov opt->alloc_hint_flags &=
6313dc1dad8eSIlya Dryomov ~CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE;
6314dc1dad8eSIlya Dryomov break;
6315dc1dad8eSIlya Dryomov case Opt_compression_hint_incompressible:
6316dc1dad8eSIlya Dryomov opt->alloc_hint_flags |=
6317dc1dad8eSIlya Dryomov CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE;
6318dc1dad8eSIlya Dryomov opt->alloc_hint_flags &=
6319dc1dad8eSIlya Dryomov ~CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE;
6320dc1dad8eSIlya Dryomov break;
6321dc1dad8eSIlya Dryomov default:
6322dc1dad8eSIlya Dryomov BUG();
6323dc1dad8eSIlya Dryomov }
6324dc1dad8eSIlya Dryomov break;
632582995cc6SDavid Howells case Opt_read_only:
632682995cc6SDavid Howells opt->read_only = true;
632782995cc6SDavid Howells break;
632882995cc6SDavid Howells case Opt_read_write:
632982995cc6SDavid Howells opt->read_only = false;
633082995cc6SDavid Howells break;
633182995cc6SDavid Howells case Opt_lock_on_read:
633282995cc6SDavid Howells opt->lock_on_read = true;
633382995cc6SDavid Howells break;
633482995cc6SDavid Howells case Opt_exclusive:
633582995cc6SDavid Howells opt->exclusive = true;
633682995cc6SDavid Howells break;
633782995cc6SDavid Howells case Opt_notrim:
633882995cc6SDavid Howells opt->trim = false;
633982995cc6SDavid Howells break;
634082995cc6SDavid Howells default:
634182995cc6SDavid Howells BUG();
634282995cc6SDavid Howells }
634382995cc6SDavid Howells
634482995cc6SDavid Howells return 0;
634582995cc6SDavid Howells
634682995cc6SDavid Howells out_of_range:
63472c3f3dc3SAl Viro return inval_plog(&log, "%s out of range", param->key);
634882995cc6SDavid Howells }
634982995cc6SDavid Howells
635082995cc6SDavid Howells /*
635182995cc6SDavid Howells * This duplicates most of generic_parse_monolithic(), untying it from
635282995cc6SDavid Howells * fs_context and skipping standard superblock and security options.
635382995cc6SDavid Howells */
rbd_parse_options(char * options,struct rbd_parse_opts_ctx * pctx)635482995cc6SDavid Howells static int rbd_parse_options(char *options, struct rbd_parse_opts_ctx *pctx)
635582995cc6SDavid Howells {
635682995cc6SDavid Howells char *key;
635782995cc6SDavid Howells int ret = 0;
635882995cc6SDavid Howells
635982995cc6SDavid Howells dout("%s '%s'\n", __func__, options);
636082995cc6SDavid Howells while ((key = strsep(&options, ",")) != NULL) {
636182995cc6SDavid Howells if (*key) {
636282995cc6SDavid Howells struct fs_parameter param = {
636382995cc6SDavid Howells .key = key,
63640f89589aSAl Viro .type = fs_value_is_flag,
636582995cc6SDavid Howells };
636682995cc6SDavid Howells char *value = strchr(key, '=');
636782995cc6SDavid Howells size_t v_len = 0;
636882995cc6SDavid Howells
636982995cc6SDavid Howells if (value) {
637082995cc6SDavid Howells if (value == key)
637182995cc6SDavid Howells continue;
637282995cc6SDavid Howells *value++ = 0;
637382995cc6SDavid Howells v_len = strlen(value);
637482995cc6SDavid Howells param.string = kmemdup_nul(value, v_len,
637582995cc6SDavid Howells GFP_KERNEL);
637682995cc6SDavid Howells if (!param.string)
637782995cc6SDavid Howells return -ENOMEM;
63780f89589aSAl Viro param.type = fs_value_is_string;
637982995cc6SDavid Howells }
638082995cc6SDavid Howells param.size = v_len;
638182995cc6SDavid Howells
638282995cc6SDavid Howells ret = rbd_parse_param(¶m, pctx);
638382995cc6SDavid Howells kfree(param.string);
638482995cc6SDavid Howells if (ret)
638582995cc6SDavid Howells break;
638682995cc6SDavid Howells }
638782995cc6SDavid Howells }
638882995cc6SDavid Howells
638982995cc6SDavid Howells return ret;
639082995cc6SDavid Howells }
639182995cc6SDavid Howells
6392ea3352f4SAlex Elder /*
6393859c31dfSAlex Elder * Parse the options provided for an "rbd add" (i.e., rbd image
6394859c31dfSAlex Elder * mapping) request. These arrive via a write to /sys/bus/rbd/add,
6395859c31dfSAlex Elder * and the data written is passed here via a NUL-terminated buffer.
6396859c31dfSAlex Elder * Returns 0 if successful or an error code otherwise.
6397d22f76e7SAlex Elder *
6398859c31dfSAlex Elder * The information extracted from these options is recorded in
6399859c31dfSAlex Elder * the other parameters which return dynamically-allocated
6400859c31dfSAlex Elder * structures:
6401859c31dfSAlex Elder * ceph_opts
6402859c31dfSAlex Elder * The address of a pointer that will refer to a ceph options
6403859c31dfSAlex Elder * structure. Caller must release the returned pointer using
6404859c31dfSAlex Elder * ceph_destroy_options() when it is no longer needed.
6405859c31dfSAlex Elder * rbd_opts
6406859c31dfSAlex Elder * Address of an rbd options pointer. Fully initialized by
6407859c31dfSAlex Elder * this function; caller must release with kfree().
6408859c31dfSAlex Elder * spec
6409859c31dfSAlex Elder * Address of an rbd image specification pointer. Fully
6410859c31dfSAlex Elder * initialized by this function based on parsed options.
6411859c31dfSAlex Elder * Caller must release with rbd_spec_put().
6412859c31dfSAlex Elder *
6413859c31dfSAlex Elder * The options passed take this form:
6414859c31dfSAlex Elder * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
6415859c31dfSAlex Elder * where:
6416859c31dfSAlex Elder * <mon_addrs>
6417859c31dfSAlex Elder * A comma-separated list of one or more monitor addresses.
6418859c31dfSAlex Elder * A monitor address is an ip address, optionally followed
6419859c31dfSAlex Elder * by a port number (separated by a colon).
6420859c31dfSAlex Elder * I.e.: ip1[:port1][,ip2[:port2]...]
6421859c31dfSAlex Elder * <options>
6422859c31dfSAlex Elder * A comma-separated list of ceph and/or rbd options.
6423859c31dfSAlex Elder * <pool_name>
6424859c31dfSAlex Elder * The name of the rados pool containing the rbd image.
6425859c31dfSAlex Elder * <image_name>
6426859c31dfSAlex Elder * The name of the image in that pool to map.
6427859c31dfSAlex Elder * <snap_id>
6428859c31dfSAlex Elder * An optional snapshot id. If provided, the mapping will
6429859c31dfSAlex Elder * present data from the image at the time that snapshot was
6430859c31dfSAlex Elder * created. The image head is used if no snapshot id is
6431859c31dfSAlex Elder * provided. Snapshot mappings are always read-only.
6432a725f65eSAlex Elder */
rbd_add_parse_args(const char * buf,struct ceph_options ** ceph_opts,struct rbd_options ** opts,struct rbd_spec ** rbd_spec)6433859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf,
6434dc79b113SAlex Elder struct ceph_options **ceph_opts,
6435859c31dfSAlex Elder struct rbd_options **opts,
6436859c31dfSAlex Elder struct rbd_spec **rbd_spec)
6437a725f65eSAlex Elder {
6438e28fff26SAlex Elder size_t len;
6439859c31dfSAlex Elder char *options;
64400ddebc0cSAlex Elder const char *mon_addrs;
6441ecb4dc22SAlex Elder char *snap_name;
64420ddebc0cSAlex Elder size_t mon_addrs_size;
644382995cc6SDavid Howells struct rbd_parse_opts_ctx pctx = { 0 };
6444dc79b113SAlex Elder int ret;
6445e28fff26SAlex Elder
6446e28fff26SAlex Elder /* The first four tokens are required */
6447e28fff26SAlex Elder
64487ef3214aSAlex Elder len = next_token(&buf);
64494fb5d671SAlex Elder if (!len) {
64504fb5d671SAlex Elder rbd_warn(NULL, "no monitor address(es) provided");
64514fb5d671SAlex Elder return -EINVAL;
64524fb5d671SAlex Elder }
64530ddebc0cSAlex Elder mon_addrs = buf;
645482995cc6SDavid Howells mon_addrs_size = len;
64557ef3214aSAlex Elder buf += len;
6456a725f65eSAlex Elder
6457dc79b113SAlex Elder ret = -EINVAL;
6458f28e565aSAlex Elder options = dup_token(&buf, NULL);
6459f28e565aSAlex Elder if (!options)
6460dc79b113SAlex Elder return -ENOMEM;
64614fb5d671SAlex Elder if (!*options) {
64624fb5d671SAlex Elder rbd_warn(NULL, "no options provided");
64634fb5d671SAlex Elder goto out_err;
64644fb5d671SAlex Elder }
6465a725f65eSAlex Elder
6466c300156bSIlya Dryomov pctx.spec = rbd_spec_alloc();
6467c300156bSIlya Dryomov if (!pctx.spec)
6468f28e565aSAlex Elder goto out_mem;
6469859c31dfSAlex Elder
6470c300156bSIlya Dryomov pctx.spec->pool_name = dup_token(&buf, NULL);
6471c300156bSIlya Dryomov if (!pctx.spec->pool_name)
6472859c31dfSAlex Elder goto out_mem;
6473c300156bSIlya Dryomov if (!*pctx.spec->pool_name) {
64744fb5d671SAlex Elder rbd_warn(NULL, "no pool name provided");
64754fb5d671SAlex Elder goto out_err;
64764fb5d671SAlex Elder }
6477e28fff26SAlex Elder
6478c300156bSIlya Dryomov pctx.spec->image_name = dup_token(&buf, NULL);
6479c300156bSIlya Dryomov if (!pctx.spec->image_name)
6480f28e565aSAlex Elder goto out_mem;
6481c300156bSIlya Dryomov if (!*pctx.spec->image_name) {
64824fb5d671SAlex Elder rbd_warn(NULL, "no image name provided");
64834fb5d671SAlex Elder goto out_err;
64844fb5d671SAlex Elder }
6485e28fff26SAlex Elder
6486f28e565aSAlex Elder /*
6487f28e565aSAlex Elder * Snapshot name is optional; default is to use "-"
6488f28e565aSAlex Elder * (indicating the head/no snapshot).
6489f28e565aSAlex Elder */
64903feeb894SAlex Elder len = next_token(&buf);
6491820a5f3eSAlex Elder if (!len) {
64923feeb894SAlex Elder buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
64933feeb894SAlex Elder len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
6494f28e565aSAlex Elder } else if (len > RBD_MAX_SNAP_NAME_LEN) {
6495dc79b113SAlex Elder ret = -ENAMETOOLONG;
6496f28e565aSAlex Elder goto out_err;
6497849b4260SAlex Elder }
6498ecb4dc22SAlex Elder snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
6499ecb4dc22SAlex Elder if (!snap_name)
6500f28e565aSAlex Elder goto out_mem;
6501ecb4dc22SAlex Elder *(snap_name + len) = '\0';
6502c300156bSIlya Dryomov pctx.spec->snap_name = snap_name;
6503e5c35534SAlex Elder
650482995cc6SDavid Howells pctx.copts = ceph_alloc_options();
650582995cc6SDavid Howells if (!pctx.copts)
650682995cc6SDavid Howells goto out_mem;
650782995cc6SDavid Howells
65080ddebc0cSAlex Elder /* Initialize all rbd options to the defaults */
6509e28fff26SAlex Elder
6510c300156bSIlya Dryomov pctx.opts = kzalloc(sizeof(*pctx.opts), GFP_KERNEL);
6511c300156bSIlya Dryomov if (!pctx.opts)
65124e9afebaSAlex Elder goto out_mem;
65134e9afebaSAlex Elder
6514c300156bSIlya Dryomov pctx.opts->read_only = RBD_READ_ONLY_DEFAULT;
6515c300156bSIlya Dryomov pctx.opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
65160c93e1b7SIlya Dryomov pctx.opts->alloc_size = RBD_ALLOC_SIZE_DEFAULT;
6517c300156bSIlya Dryomov pctx.opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT;
6518c300156bSIlya Dryomov pctx.opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
6519c300156bSIlya Dryomov pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT;
6520c300156bSIlya Dryomov pctx.opts->trim = RBD_TRIM_DEFAULT;
6521d22f76e7SAlex Elder
65222d7c86a8SVenky Shankar ret = ceph_parse_mon_ips(mon_addrs, mon_addrs_size, pctx.copts, NULL,
65232d7c86a8SVenky Shankar ',');
652482995cc6SDavid Howells if (ret)
6525dc79b113SAlex Elder goto out_err;
6526859c31dfSAlex Elder
652782995cc6SDavid Howells ret = rbd_parse_options(options, &pctx);
652882995cc6SDavid Howells if (ret)
652982995cc6SDavid Howells goto out_err;
653082995cc6SDavid Howells
653182995cc6SDavid Howells *ceph_opts = pctx.copts;
6532c300156bSIlya Dryomov *opts = pctx.opts;
6533c300156bSIlya Dryomov *rbd_spec = pctx.spec;
653482995cc6SDavid Howells kfree(options);
6535dc79b113SAlex Elder return 0;
653682995cc6SDavid Howells
6537f28e565aSAlex Elder out_mem:
6538dc79b113SAlex Elder ret = -ENOMEM;
6539d22f76e7SAlex Elder out_err:
6540c300156bSIlya Dryomov kfree(pctx.opts);
654182995cc6SDavid Howells ceph_destroy_options(pctx.copts);
6542c300156bSIlya Dryomov rbd_spec_put(pctx.spec);
6543f28e565aSAlex Elder kfree(options);
6544dc79b113SAlex Elder return ret;
6545a725f65eSAlex Elder }
6546a725f65eSAlex Elder
rbd_dev_image_unlock(struct rbd_device * rbd_dev)6547e010dd0aSIlya Dryomov static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
6548e010dd0aSIlya Dryomov {
6549e010dd0aSIlya Dryomov down_write(&rbd_dev->lock_rwsem);
6550e010dd0aSIlya Dryomov if (__rbd_is_lock_owner(rbd_dev))
6551e1fddc8fSIlya Dryomov __rbd_release_lock(rbd_dev);
6552e010dd0aSIlya Dryomov up_write(&rbd_dev->lock_rwsem);
6553e010dd0aSIlya Dryomov }
6554e010dd0aSIlya Dryomov
6555637cd060SIlya Dryomov /*
6556637cd060SIlya Dryomov * If the wait is interrupted, an error is returned even if the lock
6557637cd060SIlya Dryomov * was successfully acquired. rbd_dev_image_unlock() will release it
6558637cd060SIlya Dryomov * if needed.
6559637cd060SIlya Dryomov */
rbd_add_acquire_lock(struct rbd_device * rbd_dev)6560e010dd0aSIlya Dryomov static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
6561e010dd0aSIlya Dryomov {
6562637cd060SIlya Dryomov long ret;
65632f18d466SIlya Dryomov
6564e010dd0aSIlya Dryomov if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
6565637cd060SIlya Dryomov if (!rbd_dev->opts->exclusive && !rbd_dev->opts->lock_on_read)
6566637cd060SIlya Dryomov return 0;
6567637cd060SIlya Dryomov
6568e010dd0aSIlya Dryomov rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
6569e010dd0aSIlya Dryomov return -EINVAL;
6570e010dd0aSIlya Dryomov }
6571e010dd0aSIlya Dryomov
65723fe69921SIlya Dryomov if (rbd_is_ro(rbd_dev))
6573637cd060SIlya Dryomov return 0;
6574637cd060SIlya Dryomov
6575637cd060SIlya Dryomov rbd_assert(!rbd_is_lock_owner(rbd_dev));
6576637cd060SIlya Dryomov queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
6577637cd060SIlya Dryomov ret = wait_for_completion_killable_timeout(&rbd_dev->acquire_wait,
6578637cd060SIlya Dryomov ceph_timeout_jiffies(rbd_dev->opts->lock_timeout));
657925e6be21SDongsheng Yang if (ret > 0) {
6580637cd060SIlya Dryomov ret = rbd_dev->acquire_err;
658125e6be21SDongsheng Yang } else {
658225e6be21SDongsheng Yang cancel_delayed_work_sync(&rbd_dev->lock_dwork);
658325e6be21SDongsheng Yang if (!ret)
6584637cd060SIlya Dryomov ret = -ETIMEDOUT;
6585637cd060SIlya Dryomov
65869d01e07fSIlya Dryomov rbd_warn(rbd_dev, "failed to acquire lock: %ld", ret);
6587e010dd0aSIlya Dryomov }
65889d01e07fSIlya Dryomov if (ret)
65899d01e07fSIlya Dryomov return ret;
6590e010dd0aSIlya Dryomov
6591637cd060SIlya Dryomov /*
6592637cd060SIlya Dryomov * The lock may have been released by now, unless automatic lock
6593637cd060SIlya Dryomov * transitions are disabled.
6594637cd060SIlya Dryomov */
6595637cd060SIlya Dryomov rbd_assert(!rbd_dev->opts->exclusive || rbd_is_lock_owner(rbd_dev));
6596e010dd0aSIlya Dryomov return 0;
6597e010dd0aSIlya Dryomov }
6598e010dd0aSIlya Dryomov
659930ba1f02SIlya Dryomov /*
6600589d30e0SAlex Elder * An rbd format 2 image has a unique identifier, distinct from the
6601589d30e0SAlex Elder * name given to it by the user. Internally, that identifier is
6602589d30e0SAlex Elder * what's used to specify the names of objects related to the image.
6603589d30e0SAlex Elder *
6604589d30e0SAlex Elder * A special "rbd id" object is used to map an rbd image name to its
6605589d30e0SAlex Elder * id. If that object doesn't exist, then there is no v2 rbd image
6606589d30e0SAlex Elder * with the supplied name.
6607589d30e0SAlex Elder *
6608589d30e0SAlex Elder * This function will record the given rbd_dev's image_id field if
6609589d30e0SAlex Elder * it can be determined, and in that case will return 0. If any
6610589d30e0SAlex Elder * errors occur a negative errno will be returned and the rbd_dev's
6611589d30e0SAlex Elder * image_id field will be unchanged (and should be NULL).
6612589d30e0SAlex Elder */
rbd_dev_image_id(struct rbd_device * rbd_dev)6613589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev)
6614589d30e0SAlex Elder {
6615589d30e0SAlex Elder int ret;
6616589d30e0SAlex Elder size_t size;
6617ecd4a68aSIlya Dryomov CEPH_DEFINE_OID_ONSTACK(oid);
6618589d30e0SAlex Elder void *response;
6619c0fba368SAlex Elder char *image_id;
66202f82ee54SAlex Elder
6621589d30e0SAlex Elder /*
66222c0d0a10SAlex Elder * When probing a parent image, the image id is already
66232c0d0a10SAlex Elder * known (and the image name likely is not). There's no
6624c0fba368SAlex Elder * need to fetch the image id again in this case. We
6625c0fba368SAlex Elder * do still need to set the image format though.
66262c0d0a10SAlex Elder */
6627c0fba368SAlex Elder if (rbd_dev->spec->image_id) {
6628c0fba368SAlex Elder rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
6629c0fba368SAlex Elder
66302c0d0a10SAlex Elder return 0;
6631c0fba368SAlex Elder }
66322c0d0a10SAlex Elder
66332c0d0a10SAlex Elder /*
6634589d30e0SAlex Elder * First, see if the format 2 image id file exists, and if
6635589d30e0SAlex Elder * so, get the image's persistent id from it.
6636589d30e0SAlex Elder */
6637ecd4a68aSIlya Dryomov ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
6638ecd4a68aSIlya Dryomov rbd_dev->spec->image_name);
6639ecd4a68aSIlya Dryomov if (ret)
6640ecd4a68aSIlya Dryomov return ret;
6641ecd4a68aSIlya Dryomov
6642ecd4a68aSIlya Dryomov dout("rbd id object name is %s\n", oid.name);
6643589d30e0SAlex Elder
6644589d30e0SAlex Elder /* Response will be an encoded string, which includes a length */
6645589d30e0SAlex Elder size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
6646589d30e0SAlex Elder response = kzalloc(size, GFP_NOIO);
6647589d30e0SAlex Elder if (!response) {
6648589d30e0SAlex Elder ret = -ENOMEM;
6649589d30e0SAlex Elder goto out;
6650589d30e0SAlex Elder }
6651589d30e0SAlex Elder
6652c0fba368SAlex Elder /* If it doesn't exist we'll assume it's a format 1 image */
6653c0fba368SAlex Elder
6654ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
6655ecd4a68aSIlya Dryomov "get_id", NULL, 0,
66565435d206SDongsheng Yang response, size);
665736be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
6658c0fba368SAlex Elder if (ret == -ENOENT) {
6659c0fba368SAlex Elder image_id = kstrdup("", GFP_KERNEL);
6660c0fba368SAlex Elder ret = image_id ? 0 : -ENOMEM;
6661c0fba368SAlex Elder if (!ret)
6662c0fba368SAlex Elder rbd_dev->image_format = 1;
66637dd440c9SIlya Dryomov } else if (ret >= 0) {
6664c0fba368SAlex Elder void *p = response;
6665589d30e0SAlex Elder
6666c0fba368SAlex Elder image_id = ceph_extract_encoded_string(&p, p + ret,
6667979ed480SAlex Elder NULL, GFP_NOIO);
6668461f758aSDuan Jiong ret = PTR_ERR_OR_ZERO(image_id);
6669c0fba368SAlex Elder if (!ret)
6670c0fba368SAlex Elder rbd_dev->image_format = 2;
6671c0fba368SAlex Elder }
6672c0fba368SAlex Elder
6673c0fba368SAlex Elder if (!ret) {
6674c0fba368SAlex Elder rbd_dev->spec->image_id = image_id;
6675c0fba368SAlex Elder dout("image_id is %s\n", image_id);
6676589d30e0SAlex Elder }
6677589d30e0SAlex Elder out:
6678589d30e0SAlex Elder kfree(response);
6679ecd4a68aSIlya Dryomov ceph_oid_destroy(&oid);
6680589d30e0SAlex Elder return ret;
6681589d30e0SAlex Elder }
6682589d30e0SAlex Elder
66833abef3b3SAlex Elder /*
66843abef3b3SAlex Elder * Undo whatever state changes are made by v1 or v2 header info
66853abef3b3SAlex Elder * call.
66863abef3b3SAlex Elder */
rbd_dev_unprobe(struct rbd_device * rbd_dev)66876fd48b3bSAlex Elder static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
66886fd48b3bSAlex Elder {
6689a2acd00eSAlex Elder rbd_dev_parent_put(rbd_dev);
669022e8bd51SIlya Dryomov rbd_object_map_free(rbd_dev);
6691da5ef6beSIlya Dryomov rbd_dev_mapping_clear(rbd_dev);
66926fd48b3bSAlex Elder
66936fd48b3bSAlex Elder /* Free dynamic fields from the header, then zero it out */
66946fd48b3bSAlex Elder
6695510a7330SIlya Dryomov rbd_image_header_cleanup(&rbd_dev->header);
66966fd48b3bSAlex Elder }
66976fd48b3bSAlex Elder
rbd_dev_v2_header_onetime(struct rbd_device * rbd_dev,struct rbd_image_header * header)6698510a7330SIlya Dryomov static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev,
6699510a7330SIlya Dryomov struct rbd_image_header *header)
6700a30b71b9SAlex Elder {
6701a30b71b9SAlex Elder int ret;
6702a30b71b9SAlex Elder
6703510a7330SIlya Dryomov ret = rbd_dev_v2_object_prefix(rbd_dev, &header->object_prefix);
670457385b51SAlex Elder if (ret)
6705510a7330SIlya Dryomov return ret;
6706b1b5402aSAlex Elder
67072df3fac7SAlex Elder /*
67082df3fac7SAlex Elder * Get the and check features for the image. Currently the
67092df3fac7SAlex Elder * features are assumed to never change.
67102df3fac7SAlex Elder */
6711510a7330SIlya Dryomov ret = _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
6712510a7330SIlya Dryomov rbd_is_ro(rbd_dev), &header->features);
671357385b51SAlex Elder if (ret)
6714510a7330SIlya Dryomov return ret;
671535d489f9SAlex Elder
6716cc070d59SAlex Elder /* If the image supports fancy striping, get its parameters */
6717cc070d59SAlex Elder
6718510a7330SIlya Dryomov if (header->features & RBD_FEATURE_STRIPINGV2) {
6719510a7330SIlya Dryomov ret = rbd_dev_v2_striping_info(rbd_dev, &header->stripe_unit,
6720510a7330SIlya Dryomov &header->stripe_count);
67217e97332eSIlya Dryomov if (ret)
6722510a7330SIlya Dryomov return ret;
67237e97332eSIlya Dryomov }
67247e97332eSIlya Dryomov
6725510a7330SIlya Dryomov if (header->features & RBD_FEATURE_DATA_POOL) {
6726510a7330SIlya Dryomov ret = rbd_dev_v2_data_pool(rbd_dev, &header->data_pool_id);
6727510a7330SIlya Dryomov if (ret)
67289d475de5SAlex Elder return ret;
6729a30b71b9SAlex Elder }
6730a30b71b9SAlex Elder
6731510a7330SIlya Dryomov return 0;
6732510a7330SIlya Dryomov }
6733510a7330SIlya Dryomov
67346d69bb53SIlya Dryomov /*
67356d69bb53SIlya Dryomov * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
67366d69bb53SIlya Dryomov * rbd_dev_image_probe() recursion depth, which means it's also the
67376d69bb53SIlya Dryomov * length of the already discovered part of the parent chain.
67386d69bb53SIlya Dryomov */
rbd_dev_probe_parent(struct rbd_device * rbd_dev,int depth)67396d69bb53SIlya Dryomov static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
674083a06263SAlex Elder {
67412f82ee54SAlex Elder struct rbd_device *parent = NULL;
6742124afba2SAlex Elder int ret;
6743124afba2SAlex Elder
6744124afba2SAlex Elder if (!rbd_dev->parent_spec)
6745124afba2SAlex Elder return 0;
6746124afba2SAlex Elder
67476d69bb53SIlya Dryomov if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
67486d69bb53SIlya Dryomov pr_info("parent chain is too long (%d)\n", depth);
67496d69bb53SIlya Dryomov ret = -EINVAL;
67506d69bb53SIlya Dryomov goto out_err;
67516d69bb53SIlya Dryomov }
67526d69bb53SIlya Dryomov
6753f7c4d9b1SIlya Dryomov parent = __rbd_dev_create(rbd_dev->parent_spec);
67541f2c6651SIlya Dryomov if (!parent) {
6755124afba2SAlex Elder ret = -ENOMEM;
6756124afba2SAlex Elder goto out_err;
67571f2c6651SIlya Dryomov }
67581f2c6651SIlya Dryomov
67591f2c6651SIlya Dryomov /*
67601f2c6651SIlya Dryomov * Images related by parent/child relationships always share
67611f2c6651SIlya Dryomov * rbd_client and spec/parent_spec, so bump their refcounts.
67621f2c6651SIlya Dryomov */
6763f7c4d9b1SIlya Dryomov parent->rbd_client = __rbd_get_client(rbd_dev->rbd_client);
6764f7c4d9b1SIlya Dryomov parent->spec = rbd_spec_get(rbd_dev->parent_spec);
6765124afba2SAlex Elder
676639258aa2SIlya Dryomov __set_bit(RBD_DEV_FLAG_READONLY, &parent->flags);
676739258aa2SIlya Dryomov
67686d69bb53SIlya Dryomov ret = rbd_dev_image_probe(parent, depth);
6769124afba2SAlex Elder if (ret < 0)
6770124afba2SAlex Elder goto out_err;
67711f2c6651SIlya Dryomov
6772124afba2SAlex Elder rbd_dev->parent = parent;
6773a2acd00eSAlex Elder atomic_set(&rbd_dev->parent_ref, 1);
6774124afba2SAlex Elder return 0;
6775124afba2SAlex Elder
67761f2c6651SIlya Dryomov out_err:
67771f2c6651SIlya Dryomov rbd_dev_unparent(rbd_dev);
67781f2c6651SIlya Dryomov rbd_dev_destroy(parent);
6779124afba2SAlex Elder return ret;
6780124afba2SAlex Elder }
6781124afba2SAlex Elder
rbd_dev_device_release(struct rbd_device * rbd_dev)67825769ed0cSIlya Dryomov static void rbd_dev_device_release(struct rbd_device *rbd_dev)
67835769ed0cSIlya Dryomov {
67845769ed0cSIlya Dryomov clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
67855769ed0cSIlya Dryomov rbd_free_disk(rbd_dev);
67865769ed0cSIlya Dryomov if (!single_major)
67875769ed0cSIlya Dryomov unregister_blkdev(rbd_dev->major, rbd_dev->name);
67885769ed0cSIlya Dryomov }
67895769ed0cSIlya Dryomov
6790811c6688SIlya Dryomov /*
6791811c6688SIlya Dryomov * rbd_dev->header_rwsem must be locked for write and will be unlocked
6792811c6688SIlya Dryomov * upon return.
6793811c6688SIlya Dryomov */
rbd_dev_device_setup(struct rbd_device * rbd_dev)6794200a6a8bSAlex Elder static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
6795124afba2SAlex Elder {
679683a06263SAlex Elder int ret;
679783a06263SAlex Elder
67989b60e70bSIlya Dryomov /* Record our major and minor device numbers. */
679983a06263SAlex Elder
68009b60e70bSIlya Dryomov if (!single_major) {
680183a06263SAlex Elder ret = register_blkdev(0, rbd_dev->name);
680283a06263SAlex Elder if (ret < 0)
68031643dfa4SIlya Dryomov goto err_out_unlock;
68049b60e70bSIlya Dryomov
680583a06263SAlex Elder rbd_dev->major = ret;
6806dd82fff1SIlya Dryomov rbd_dev->minor = 0;
68079b60e70bSIlya Dryomov } else {
68089b60e70bSIlya Dryomov rbd_dev->major = rbd_major;
68099b60e70bSIlya Dryomov rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
68109b60e70bSIlya Dryomov }
681183a06263SAlex Elder
681283a06263SAlex Elder /* Set up the blkdev mapping. */
681383a06263SAlex Elder
681483a06263SAlex Elder ret = rbd_init_disk(rbd_dev);
681583a06263SAlex Elder if (ret)
681683a06263SAlex Elder goto err_out_blkdev;
681783a06263SAlex Elder
6818f35a4deeSAlex Elder set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
681939258aa2SIlya Dryomov set_disk_ro(rbd_dev->disk, rbd_is_ro(rbd_dev));
6820f35a4deeSAlex Elder
68215769ed0cSIlya Dryomov ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
6822f35a4deeSAlex Elder if (ret)
6823da5ef6beSIlya Dryomov goto err_out_disk;
682483a06263SAlex Elder
6825129b79d4SAlex Elder set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
6826811c6688SIlya Dryomov up_write(&rbd_dev->header_rwsem);
68275769ed0cSIlya Dryomov return 0;
68282f82ee54SAlex Elder
682983a06263SAlex Elder err_out_disk:
683083a06263SAlex Elder rbd_free_disk(rbd_dev);
683183a06263SAlex Elder err_out_blkdev:
68329b60e70bSIlya Dryomov if (!single_major)
683383a06263SAlex Elder unregister_blkdev(rbd_dev->major, rbd_dev->name);
6834811c6688SIlya Dryomov err_out_unlock:
6835811c6688SIlya Dryomov up_write(&rbd_dev->header_rwsem);
683683a06263SAlex Elder return ret;
683783a06263SAlex Elder }
683883a06263SAlex Elder
rbd_dev_header_name(struct rbd_device * rbd_dev)6839332bb12dSAlex Elder static int rbd_dev_header_name(struct rbd_device *rbd_dev)
6840332bb12dSAlex Elder {
6841332bb12dSAlex Elder struct rbd_spec *spec = rbd_dev->spec;
6842c41d13a3SIlya Dryomov int ret;
6843332bb12dSAlex Elder
6844332bb12dSAlex Elder /* Record the header object name for this rbd image. */
6845332bb12dSAlex Elder
6846332bb12dSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
6847332bb12dSAlex Elder if (rbd_dev->image_format == 1)
6848c41d13a3SIlya Dryomov ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6849332bb12dSAlex Elder spec->image_name, RBD_SUFFIX);
6850332bb12dSAlex Elder else
6851c41d13a3SIlya Dryomov ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6852332bb12dSAlex Elder RBD_HEADER_PREFIX, spec->image_id);
6853c41d13a3SIlya Dryomov
6854c41d13a3SIlya Dryomov return ret;
6855332bb12dSAlex Elder }
6856332bb12dSAlex Elder
/*
 * Log that the image (or, if @is_snap, the snapshot) named by
 * rbd_dev->spec does not exist.  The pool namespace and its trailing
 * "/" are printed only when a namespace is set.
 */
static void rbd_print_dne(struct rbd_device *rbd_dev, bool is_snap)
{
	const struct rbd_spec *spec = rbd_dev->spec;
	const char *ns = spec->pool_ns ? spec->pool_ns : "";
	const char *sep = spec->pool_ns ? "/" : "";

	if (is_snap) {
		pr_info("snap %s/%s%s%s@%s does not exist\n",
			spec->pool_name, ns, sep,
			spec->image_name, spec->snap_name);
		return;
	}

	pr_info("image %s/%s%s%s does not exist\n",
		spec->pool_name, ns, sep, spec->image_name);
}
6874b9ef2b88SIlya Dryomov
rbd_dev_image_release(struct rbd_device * rbd_dev)6875200a6a8bSAlex Elder static void rbd_dev_image_release(struct rbd_device *rbd_dev)
6876200a6a8bSAlex Elder {
6877b8776051SIlya Dryomov if (!rbd_is_ro(rbd_dev))
6878fd22aef8SIlya Dryomov rbd_unregister_watch(rbd_dev);
6879952c48b0SIlya Dryomov
6880952c48b0SIlya Dryomov rbd_dev_unprobe(rbd_dev);
68816fd48b3bSAlex Elder rbd_dev->image_format = 0;
68826fd48b3bSAlex Elder kfree(rbd_dev->spec->image_id);
68836fd48b3bSAlex Elder rbd_dev->spec->image_id = NULL;
6884200a6a8bSAlex Elder }
6885200a6a8bSAlex Elder
6886a30b71b9SAlex Elder /*
6887a30b71b9SAlex Elder * Probe for the existence of the header object for the given rbd
68881f3ef788SAlex Elder * device. If this image is the one being mapped (i.e., not a
68891f3ef788SAlex Elder * parent), initiate a watch on its header object before using that
68901f3ef788SAlex Elder * object to get detailed information about the rbd image.
68910e4e1de5SIlya Dryomov *
68920e4e1de5SIlya Dryomov * On success, returns with header_rwsem held for write if called
68930e4e1de5SIlya Dryomov * with @depth == 0.
6894a30b71b9SAlex Elder */
rbd_dev_image_probe(struct rbd_device * rbd_dev,int depth)68956d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
6896a30b71b9SAlex Elder {
6897b9ef2b88SIlya Dryomov bool need_watch = !rbd_is_ro(rbd_dev);
6898a30b71b9SAlex Elder int ret;
6899a30b71b9SAlex Elder
6900a30b71b9SAlex Elder /*
69013abef3b3SAlex Elder * Get the id from the image id object. Unless there's an
69023abef3b3SAlex Elder * error, rbd_dev->spec->image_id will be filled in with
69033abef3b3SAlex Elder * a dynamically-allocated string, and rbd_dev->image_format
69043abef3b3SAlex Elder * will be set to either 1 or 2.
6905a30b71b9SAlex Elder */
6906a30b71b9SAlex Elder ret = rbd_dev_image_id(rbd_dev);
6907a30b71b9SAlex Elder if (ret)
6908c0fba368SAlex Elder return ret;
6909c0fba368SAlex Elder
6910332bb12dSAlex Elder ret = rbd_dev_header_name(rbd_dev);
6911332bb12dSAlex Elder if (ret)
6912332bb12dSAlex Elder goto err_out_format;
6913332bb12dSAlex Elder
6914b9ef2b88SIlya Dryomov if (need_watch) {
691599d16943SIlya Dryomov ret = rbd_register_watch(rbd_dev);
69161fe48023SIlya Dryomov if (ret) {
69171fe48023SIlya Dryomov if (ret == -ENOENT)
6918b9ef2b88SIlya Dryomov rbd_print_dne(rbd_dev, false);
6919c41d13a3SIlya Dryomov goto err_out_format;
69201f3ef788SAlex Elder }
69211fe48023SIlya Dryomov }
6922b644de2bSAlex Elder
69230e4e1de5SIlya Dryomov if (!depth)
69240e4e1de5SIlya Dryomov down_write(&rbd_dev->header_rwsem);
69250e4e1de5SIlya Dryomov
6926510a7330SIlya Dryomov ret = rbd_dev_header_info(rbd_dev, &rbd_dev->header, true);
6927b9ef2b88SIlya Dryomov if (ret) {
6928b9ef2b88SIlya Dryomov if (ret == -ENOENT && !need_watch)
6929b9ef2b88SIlya Dryomov rbd_print_dne(rbd_dev, false);
6930952c48b0SIlya Dryomov goto err_out_probe;
6931b9ef2b88SIlya Dryomov }
6932a30b71b9SAlex Elder
6933510a7330SIlya Dryomov rbd_init_layout(rbd_dev);
6934510a7330SIlya Dryomov
693504077599SIlya Dryomov /*
693604077599SIlya Dryomov * If this image is the one being mapped, we have pool name and
693704077599SIlya Dryomov * id, image name and id, and snap name - need to fill snap id.
693804077599SIlya Dryomov * Otherwise this is a parent image, identified by pool, image
693904077599SIlya Dryomov * and snap ids - need to fill in names for those ids.
694004077599SIlya Dryomov */
69416d69bb53SIlya Dryomov if (!depth)
694204077599SIlya Dryomov ret = rbd_spec_fill_snap_id(rbd_dev);
694304077599SIlya Dryomov else
694404077599SIlya Dryomov ret = rbd_spec_fill_names(rbd_dev);
69451fe48023SIlya Dryomov if (ret) {
69461fe48023SIlya Dryomov if (ret == -ENOENT)
6947b9ef2b88SIlya Dryomov rbd_print_dne(rbd_dev, true);
694833dca39fSAlex Elder goto err_out_probe;
69491fe48023SIlya Dryomov }
69509bb81c9bSAlex Elder
6951da5ef6beSIlya Dryomov ret = rbd_dev_mapping_set(rbd_dev);
6952da5ef6beSIlya Dryomov if (ret)
6953da5ef6beSIlya Dryomov goto err_out_probe;
6954da5ef6beSIlya Dryomov
6955f3c0e459SIlya Dryomov if (rbd_is_snap(rbd_dev) &&
695622e8bd51SIlya Dryomov (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)) {
695722e8bd51SIlya Dryomov ret = rbd_object_map_load(rbd_dev);
695822e8bd51SIlya Dryomov if (ret)
695922e8bd51SIlya Dryomov goto err_out_probe;
696022e8bd51SIlya Dryomov }
696122e8bd51SIlya Dryomov
6962e8f59b59SIlya Dryomov if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
6963c1031177SIlya Dryomov ret = rbd_dev_setup_parent(rbd_dev);
6964e8f59b59SIlya Dryomov if (ret)
6965e8f59b59SIlya Dryomov goto err_out_probe;
6966e8f59b59SIlya Dryomov }
6967e8f59b59SIlya Dryomov
69686d69bb53SIlya Dryomov ret = rbd_dev_probe_parent(rbd_dev, depth);
696930d60ba2SAlex Elder if (ret)
697030d60ba2SAlex Elder goto err_out_probe;
697183a06263SAlex Elder
697230d60ba2SAlex Elder dout("discovered format %u image, header name is %s\n",
6973c41d13a3SIlya Dryomov rbd_dev->image_format, rbd_dev->header_oid.name);
697430d60ba2SAlex Elder return 0;
6975e8f59b59SIlya Dryomov
69766fd48b3bSAlex Elder err_out_probe:
69770e4e1de5SIlya Dryomov if (!depth)
69780e4e1de5SIlya Dryomov up_write(&rbd_dev->header_rwsem);
6979b9ef2b88SIlya Dryomov if (need_watch)
698099d16943SIlya Dryomov rbd_unregister_watch(rbd_dev);
6981952c48b0SIlya Dryomov rbd_dev_unprobe(rbd_dev);
6982332bb12dSAlex Elder err_out_format:
6983332bb12dSAlex Elder rbd_dev->image_format = 0;
69845655c4d9SAlex Elder kfree(rbd_dev->spec->image_id);
69855655c4d9SAlex Elder rbd_dev->spec->image_id = NULL;
69865655c4d9SAlex Elder return ret;
698783a06263SAlex Elder }
698883a06263SAlex Elder
rbd_dev_update_header(struct rbd_device * rbd_dev,struct rbd_image_header * header)6989510a7330SIlya Dryomov static void rbd_dev_update_header(struct rbd_device *rbd_dev,
6990510a7330SIlya Dryomov struct rbd_image_header *header)
6991510a7330SIlya Dryomov {
6992510a7330SIlya Dryomov rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
6993510a7330SIlya Dryomov rbd_assert(rbd_dev->header.object_prefix); /* !first_time */
6994510a7330SIlya Dryomov
69950b207d02SIlya Dryomov if (rbd_dev->header.image_size != header->image_size) {
6996510a7330SIlya Dryomov rbd_dev->header.image_size = header->image_size;
6997510a7330SIlya Dryomov
69980b207d02SIlya Dryomov if (!rbd_is_snap(rbd_dev)) {
69990b207d02SIlya Dryomov rbd_dev->mapping.size = header->image_size;
70000b207d02SIlya Dryomov rbd_dev_update_size(rbd_dev);
70010b207d02SIlya Dryomov }
70020b207d02SIlya Dryomov }
70030b207d02SIlya Dryomov
7004510a7330SIlya Dryomov ceph_put_snap_context(rbd_dev->header.snapc);
7005510a7330SIlya Dryomov rbd_dev->header.snapc = header->snapc;
7006510a7330SIlya Dryomov header->snapc = NULL;
7007510a7330SIlya Dryomov
7008510a7330SIlya Dryomov if (rbd_dev->image_format == 1) {
7009510a7330SIlya Dryomov kfree(rbd_dev->header.snap_names);
7010510a7330SIlya Dryomov rbd_dev->header.snap_names = header->snap_names;
7011510a7330SIlya Dryomov header->snap_names = NULL;
7012510a7330SIlya Dryomov
7013510a7330SIlya Dryomov kfree(rbd_dev->header.snap_sizes);
7014510a7330SIlya Dryomov rbd_dev->header.snap_sizes = header->snap_sizes;
7015510a7330SIlya Dryomov header->snap_sizes = NULL;
7016510a7330SIlya Dryomov }
7017510a7330SIlya Dryomov }
7018510a7330SIlya Dryomov
rbd_dev_update_parent(struct rbd_device * rbd_dev,struct parent_image_info * pii)7019c1031177SIlya Dryomov static void rbd_dev_update_parent(struct rbd_device *rbd_dev,
7020c1031177SIlya Dryomov struct parent_image_info *pii)
7021c1031177SIlya Dryomov {
7022c1031177SIlya Dryomov if (pii->pool_id == CEPH_NOPOOL || !pii->has_overlap) {
7023c1031177SIlya Dryomov /*
7024c1031177SIlya Dryomov * Either the parent never existed, or we have
7025c1031177SIlya Dryomov * record of it but the image got flattened so it no
7026c1031177SIlya Dryomov * longer has a parent. When the parent of a
7027c1031177SIlya Dryomov * layered image disappears we immediately set the
7028c1031177SIlya Dryomov * overlap to 0. The effect of this is that all new
7029c1031177SIlya Dryomov * requests will be treated as if the image had no
7030c1031177SIlya Dryomov * parent.
7031c1031177SIlya Dryomov *
7032c1031177SIlya Dryomov * If !pii.has_overlap, the parent image spec is not
7033c1031177SIlya Dryomov * applicable. It's there to avoid duplication in each
7034c1031177SIlya Dryomov * snapshot record.
7035c1031177SIlya Dryomov */
7036c1031177SIlya Dryomov if (rbd_dev->parent_overlap) {
7037c1031177SIlya Dryomov rbd_dev->parent_overlap = 0;
7038c1031177SIlya Dryomov rbd_dev_parent_put(rbd_dev);
7039c1031177SIlya Dryomov pr_info("%s: clone has been flattened\n",
7040c1031177SIlya Dryomov rbd_dev->disk->disk_name);
7041c1031177SIlya Dryomov }
7042c1031177SIlya Dryomov } else {
7043c1031177SIlya Dryomov rbd_assert(rbd_dev->parent_spec);
7044c1031177SIlya Dryomov
7045c1031177SIlya Dryomov /*
7046c1031177SIlya Dryomov * Update the parent overlap. If it became zero, issue
7047c1031177SIlya Dryomov * a warning as we will proceed as if there is no parent.
7048c1031177SIlya Dryomov */
7049c1031177SIlya Dryomov if (!pii->overlap && rbd_dev->parent_overlap)
7050c1031177SIlya Dryomov rbd_warn(rbd_dev,
7051c1031177SIlya Dryomov "clone has become standalone (overlap 0)");
7052c1031177SIlya Dryomov rbd_dev->parent_overlap = pii->overlap;
7053c1031177SIlya Dryomov }
7054c1031177SIlya Dryomov }
7055c1031177SIlya Dryomov
rbd_dev_refresh(struct rbd_device * rbd_dev)70560b035401SIlya Dryomov static int rbd_dev_refresh(struct rbd_device *rbd_dev)
70570b035401SIlya Dryomov {
7058510a7330SIlya Dryomov struct rbd_image_header header = { 0 };
7059c1031177SIlya Dryomov struct parent_image_info pii = { 0 };
70600b035401SIlya Dryomov int ret;
70610b035401SIlya Dryomov
70620b207d02SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev);
70630b035401SIlya Dryomov
7064510a7330SIlya Dryomov ret = rbd_dev_header_info(rbd_dev, &header, false);
70650b035401SIlya Dryomov if (ret)
70660b035401SIlya Dryomov goto out;
70670b035401SIlya Dryomov
70680b035401SIlya Dryomov /*
70690b035401SIlya Dryomov * If there is a parent, see if it has disappeared due to the
70700b035401SIlya Dryomov * mapped image getting flattened.
70710b035401SIlya Dryomov */
70720b035401SIlya Dryomov if (rbd_dev->parent) {
7073c1031177SIlya Dryomov ret = rbd_dev_v2_parent_info(rbd_dev, &pii);
70740b035401SIlya Dryomov if (ret)
70750b035401SIlya Dryomov goto out;
70760b035401SIlya Dryomov }
70770b035401SIlya Dryomov
70780b207d02SIlya Dryomov down_write(&rbd_dev->header_rwsem);
7079510a7330SIlya Dryomov rbd_dev_update_header(rbd_dev, &header);
7080c1031177SIlya Dryomov if (rbd_dev->parent)
7081c1031177SIlya Dryomov rbd_dev_update_parent(rbd_dev, &pii);
70820b207d02SIlya Dryomov up_write(&rbd_dev->header_rwsem);
70830b035401SIlya Dryomov
70840b035401SIlya Dryomov out:
7085c1031177SIlya Dryomov rbd_parent_info_cleanup(&pii);
7086510a7330SIlya Dryomov rbd_image_header_cleanup(&header);
70870b035401SIlya Dryomov return ret;
70880b035401SIlya Dryomov }
70890b035401SIlya Dryomov
do_rbd_add(const char * buf,size_t count)709075cff725SGreg Kroah-Hartman static ssize_t do_rbd_add(const char *buf, size_t count)
7091602adf40SYehuda Sadeh {
7092cb8627c7SAlex Elder struct rbd_device *rbd_dev = NULL;
7093dc79b113SAlex Elder struct ceph_options *ceph_opts = NULL;
70944e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL;
7095859c31dfSAlex Elder struct rbd_spec *spec = NULL;
70969d3997fdSAlex Elder struct rbd_client *rbdc;
7097b51c83c2SIlya Dryomov int rc;
7098602adf40SYehuda Sadeh
7099f44d04e6SIlya Dryomov if (!capable(CAP_SYS_ADMIN))
7100f44d04e6SIlya Dryomov return -EPERM;
7101f44d04e6SIlya Dryomov
7102602adf40SYehuda Sadeh if (!try_module_get(THIS_MODULE))
7103602adf40SYehuda Sadeh return -ENODEV;
7104602adf40SYehuda Sadeh
7105a725f65eSAlex Elder /* parse add command */
7106859c31dfSAlex Elder rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
7107dc79b113SAlex Elder if (rc < 0)
7108dd5ac32dSIlya Dryomov goto out;
7109a725f65eSAlex Elder
71109d3997fdSAlex Elder rbdc = rbd_get_client(ceph_opts);
71119d3997fdSAlex Elder if (IS_ERR(rbdc)) {
71129d3997fdSAlex Elder rc = PTR_ERR(rbdc);
71130ddebc0cSAlex Elder goto err_out_args;
71149d3997fdSAlex Elder }
7115602adf40SYehuda Sadeh
7116602adf40SYehuda Sadeh /* pick the pool */
7117dd435855SIlya Dryomov rc = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, spec->pool_name);
71181fe48023SIlya Dryomov if (rc < 0) {
71191fe48023SIlya Dryomov if (rc == -ENOENT)
71201fe48023SIlya Dryomov pr_info("pool %s does not exist\n", spec->pool_name);
7121602adf40SYehuda Sadeh goto err_out_client;
71221fe48023SIlya Dryomov }
7123859c31dfSAlex Elder spec->pool_id = (u64)rc;
7124859c31dfSAlex Elder
7125d147543dSIlya Dryomov rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
7126b51c83c2SIlya Dryomov if (!rbd_dev) {
7127b51c83c2SIlya Dryomov rc = -ENOMEM;
7128bd4ba655SAlex Elder goto err_out_client;
7129b51c83c2SIlya Dryomov }
7130c53d5893SAlex Elder rbdc = NULL; /* rbd_dev now owns this */
7131c53d5893SAlex Elder spec = NULL; /* rbd_dev now owns this */
7132d147543dSIlya Dryomov rbd_opts = NULL; /* rbd_dev now owns this */
7133602adf40SYehuda Sadeh
713439258aa2SIlya Dryomov /* if we are mapping a snapshot it will be a read-only mapping */
713539258aa2SIlya Dryomov if (rbd_dev->opts->read_only ||
713639258aa2SIlya Dryomov strcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME))
713739258aa2SIlya Dryomov __set_bit(RBD_DEV_FLAG_READONLY, &rbd_dev->flags);
713839258aa2SIlya Dryomov
71390d6d1e9cSMike Christie rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
71400d6d1e9cSMike Christie if (!rbd_dev->config_info) {
71410d6d1e9cSMike Christie rc = -ENOMEM;
71420d6d1e9cSMike Christie goto err_out_rbd_dev;
71430d6d1e9cSMike Christie }
71440d6d1e9cSMike Christie
71456d69bb53SIlya Dryomov rc = rbd_dev_image_probe(rbd_dev, 0);
71460e4e1de5SIlya Dryomov if (rc < 0)
7147c53d5893SAlex Elder goto err_out_rbd_dev;
714805fd6f6fSAlex Elder
71490c93e1b7SIlya Dryomov if (rbd_dev->opts->alloc_size > rbd_dev->layout.object_size) {
71500c93e1b7SIlya Dryomov rbd_warn(rbd_dev, "alloc_size adjusted to %u",
71510c93e1b7SIlya Dryomov rbd_dev->layout.object_size);
71520c93e1b7SIlya Dryomov rbd_dev->opts->alloc_size = rbd_dev->layout.object_size;
71530c93e1b7SIlya Dryomov }
71540c93e1b7SIlya Dryomov
7155b536f69aSAlex Elder rc = rbd_dev_device_setup(rbd_dev);
7156fd22aef8SIlya Dryomov if (rc)
71578b679ec5SIlya Dryomov goto err_out_image_probe;
71583abef3b3SAlex Elder
7159e010dd0aSIlya Dryomov rc = rbd_add_acquire_lock(rbd_dev);
7160e010dd0aSIlya Dryomov if (rc)
7161637cd060SIlya Dryomov goto err_out_image_lock;
7162b536f69aSAlex Elder
71635769ed0cSIlya Dryomov /* Everything's ready. Announce the disk to the world. */
71645769ed0cSIlya Dryomov
71655769ed0cSIlya Dryomov rc = device_add(&rbd_dev->dev);
71665769ed0cSIlya Dryomov if (rc)
7167e010dd0aSIlya Dryomov goto err_out_image_lock;
71685769ed0cSIlya Dryomov
716927c97abcSLuis Chamberlain rc = device_add_disk(&rbd_dev->dev, rbd_dev->disk, NULL);
717027c97abcSLuis Chamberlain if (rc)
717127c97abcSLuis Chamberlain goto err_out_cleanup_disk;
71725769ed0cSIlya Dryomov
71735769ed0cSIlya Dryomov spin_lock(&rbd_dev_list_lock);
71745769ed0cSIlya Dryomov list_add_tail(&rbd_dev->node, &rbd_dev_list);
71755769ed0cSIlya Dryomov spin_unlock(&rbd_dev_list_lock);
71765769ed0cSIlya Dryomov
71775769ed0cSIlya Dryomov pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
71785769ed0cSIlya Dryomov (unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
71795769ed0cSIlya Dryomov rbd_dev->header.features);
7180dd5ac32dSIlya Dryomov rc = count;
7181dd5ac32dSIlya Dryomov out:
7182dd5ac32dSIlya Dryomov module_put(THIS_MODULE);
7183dd5ac32dSIlya Dryomov return rc;
7184b536f69aSAlex Elder
718527c97abcSLuis Chamberlain err_out_cleanup_disk:
718627c97abcSLuis Chamberlain rbd_free_disk(rbd_dev);
7187e010dd0aSIlya Dryomov err_out_image_lock:
7188e010dd0aSIlya Dryomov rbd_dev_image_unlock(rbd_dev);
71895769ed0cSIlya Dryomov rbd_dev_device_release(rbd_dev);
71908b679ec5SIlya Dryomov err_out_image_probe:
71918b679ec5SIlya Dryomov rbd_dev_image_release(rbd_dev);
7192c53d5893SAlex Elder err_out_rbd_dev:
7193c53d5893SAlex Elder rbd_dev_destroy(rbd_dev);
7194bd4ba655SAlex Elder err_out_client:
71959d3997fdSAlex Elder rbd_put_client(rbdc);
71960ddebc0cSAlex Elder err_out_args:
7197859c31dfSAlex Elder rbd_spec_put(spec);
7198d147543dSIlya Dryomov kfree(rbd_opts);
7199dd5ac32dSIlya Dryomov goto out;
7200602adf40SYehuda Sadeh }
7201602adf40SYehuda Sadeh
add_store(const struct bus_type * bus,const char * buf,size_t count)720275cff725SGreg Kroah-Hartman static ssize_t add_store(const struct bus_type *bus, const char *buf, size_t count)
72039b60e70bSIlya Dryomov {
72049b60e70bSIlya Dryomov if (single_major)
72059b60e70bSIlya Dryomov return -EINVAL;
72069b60e70bSIlya Dryomov
720775cff725SGreg Kroah-Hartman return do_rbd_add(buf, count);
72089b60e70bSIlya Dryomov }
72099b60e70bSIlya Dryomov
add_single_major_store(const struct bus_type * bus,const char * buf,size_t count)721075cff725SGreg Kroah-Hartman static ssize_t add_single_major_store(const struct bus_type *bus, const char *buf,
72119b60e70bSIlya Dryomov size_t count)
72129b60e70bSIlya Dryomov {
721375cff725SGreg Kroah-Hartman return do_rbd_add(buf, count);
72149b60e70bSIlya Dryomov }
72159b60e70bSIlya Dryomov
rbd_dev_remove_parent(struct rbd_device * rbd_dev)721605a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
721705a46afdSAlex Elder {
7218ad945fc1SAlex Elder while (rbd_dev->parent) {
721905a46afdSAlex Elder struct rbd_device *first = rbd_dev;
722005a46afdSAlex Elder struct rbd_device *second = first->parent;
722105a46afdSAlex Elder struct rbd_device *third;
722205a46afdSAlex Elder
722305a46afdSAlex Elder /*
722405a46afdSAlex Elder * Follow to the parent with no grandparent and
722505a46afdSAlex Elder * remove it.
722605a46afdSAlex Elder */
722705a46afdSAlex Elder while (second && (third = second->parent)) {
722805a46afdSAlex Elder first = second;
722905a46afdSAlex Elder second = third;
723005a46afdSAlex Elder }
7231ad945fc1SAlex Elder rbd_assert(second);
72328ad42cd0SAlex Elder rbd_dev_image_release(second);
72338b679ec5SIlya Dryomov rbd_dev_destroy(second);
7234ad945fc1SAlex Elder first->parent = NULL;
7235ad945fc1SAlex Elder first->parent_overlap = 0;
7236ad945fc1SAlex Elder
7237ad945fc1SAlex Elder rbd_assert(first->parent_spec);
723805a46afdSAlex Elder rbd_spec_put(first->parent_spec);
723905a46afdSAlex Elder first->parent_spec = NULL;
724005a46afdSAlex Elder }
724105a46afdSAlex Elder }
724205a46afdSAlex Elder
do_rbd_remove(const char * buf,size_t count)724375cff725SGreg Kroah-Hartman static ssize_t do_rbd_remove(const char *buf, size_t count)
7244602adf40SYehuda Sadeh {
7245602adf40SYehuda Sadeh struct rbd_device *rbd_dev = NULL;
7246751cc0e3SAlex Elder int dev_id;
72470276dca6SMike Christie char opt_buf[6];
72480276dca6SMike Christie bool force = false;
72490d8189e1SAlex Elder int ret;
7250602adf40SYehuda Sadeh
7251f44d04e6SIlya Dryomov if (!capable(CAP_SYS_ADMIN))
7252f44d04e6SIlya Dryomov return -EPERM;
7253f44d04e6SIlya Dryomov
72540276dca6SMike Christie dev_id = -1;
72550276dca6SMike Christie opt_buf[0] = '\0';
72560276dca6SMike Christie sscanf(buf, "%d %5s", &dev_id, opt_buf);
72570276dca6SMike Christie if (dev_id < 0) {
72580276dca6SMike Christie pr_err("dev_id out of range\n");
7259602adf40SYehuda Sadeh return -EINVAL;
72600276dca6SMike Christie }
72610276dca6SMike Christie if (opt_buf[0] != '\0') {
72620276dca6SMike Christie if (!strcmp(opt_buf, "force")) {
72630276dca6SMike Christie force = true;
72640276dca6SMike Christie } else {
72650276dca6SMike Christie pr_err("bad remove option at '%s'\n", opt_buf);
72660276dca6SMike Christie return -EINVAL;
72670276dca6SMike Christie }
72680276dca6SMike Christie }
7269602adf40SYehuda Sadeh
7270602adf40SYehuda Sadeh ret = -ENOENT;
7271751cc0e3SAlex Elder spin_lock(&rbd_dev_list_lock);
7272cd59cdefSJinjie Ruan list_for_each_entry(rbd_dev, &rbd_dev_list, node) {
7273751cc0e3SAlex Elder if (rbd_dev->dev_id == dev_id) {
7274751cc0e3SAlex Elder ret = 0;
7275751cc0e3SAlex Elder break;
7276602adf40SYehuda Sadeh }
7277751cc0e3SAlex Elder }
7278751cc0e3SAlex Elder if (!ret) {
7279a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock);
72800276dca6SMike Christie if (rbd_dev->open_count && !force)
728142382b70SAlex Elder ret = -EBUSY;
728285f5a4d6SIlya Dryomov else if (test_and_set_bit(RBD_DEV_FLAG_REMOVING,
728385f5a4d6SIlya Dryomov &rbd_dev->flags))
728485f5a4d6SIlya Dryomov ret = -EINPROGRESS;
7285a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock);
7286751cc0e3SAlex Elder }
7287751cc0e3SAlex Elder spin_unlock(&rbd_dev_list_lock);
728885f5a4d6SIlya Dryomov if (ret)
72891ba0f1e7SAlex Elder return ret;
7290751cc0e3SAlex Elder
72910276dca6SMike Christie if (force) {
72920276dca6SMike Christie /*
72930276dca6SMike Christie * Prevent new IO from being queued and wait for existing
72940276dca6SMike Christie * IO to complete/fail.
72950276dca6SMike Christie */
72960276dca6SMike Christie blk_mq_freeze_queue(rbd_dev->disk->queue);
72977a5428dcSChristoph Hellwig blk_mark_disk_dead(rbd_dev->disk);
72980276dca6SMike Christie }
72990276dca6SMike Christie
73005769ed0cSIlya Dryomov del_gendisk(rbd_dev->disk);
73015769ed0cSIlya Dryomov spin_lock(&rbd_dev_list_lock);
73025769ed0cSIlya Dryomov list_del_init(&rbd_dev->node);
73035769ed0cSIlya Dryomov spin_unlock(&rbd_dev_list_lock);
73045769ed0cSIlya Dryomov device_del(&rbd_dev->dev);
7305fca27065SIlya Dryomov
7306e010dd0aSIlya Dryomov rbd_dev_image_unlock(rbd_dev);
7307dd5ac32dSIlya Dryomov rbd_dev_device_release(rbd_dev);
73088ad42cd0SAlex Elder rbd_dev_image_release(rbd_dev);
73098b679ec5SIlya Dryomov rbd_dev_destroy(rbd_dev);
73101ba0f1e7SAlex Elder return count;
7311602adf40SYehuda Sadeh }
7312602adf40SYehuda Sadeh
remove_store(const struct bus_type * bus,const char * buf,size_t count)731375cff725SGreg Kroah-Hartman static ssize_t remove_store(const struct bus_type *bus, const char *buf, size_t count)
73149b60e70bSIlya Dryomov {
73159b60e70bSIlya Dryomov if (single_major)
73169b60e70bSIlya Dryomov return -EINVAL;
73179b60e70bSIlya Dryomov
731875cff725SGreg Kroah-Hartman return do_rbd_remove(buf, count);
73199b60e70bSIlya Dryomov }
73209b60e70bSIlya Dryomov
remove_single_major_store(const struct bus_type * bus,const char * buf,size_t count)732175cff725SGreg Kroah-Hartman static ssize_t remove_single_major_store(const struct bus_type *bus, const char *buf,
73229b60e70bSIlya Dryomov size_t count)
73239b60e70bSIlya Dryomov {
732475cff725SGreg Kroah-Hartman return do_rbd_remove(buf, count);
73259b60e70bSIlya Dryomov }
73269b60e70bSIlya Dryomov
7327602adf40SYehuda Sadeh /*
7328602adf40SYehuda Sadeh * create control files in sysfs
7329dfc5606dSYehuda Sadeh * /sys/bus/rbd/...
7330602adf40SYehuda Sadeh */
rbd_sysfs_init(void)73317d8dc534SChengguang Xu static int __init rbd_sysfs_init(void)
7332602adf40SYehuda Sadeh {
7333dfc5606dSYehuda Sadeh int ret;
7334602adf40SYehuda Sadeh
7335fed4c143SAlex Elder ret = device_register(&rbd_root_dev);
73367f21735fSYang Yingliang if (ret < 0) {
73377f21735fSYang Yingliang put_device(&rbd_root_dev);
7338dfc5606dSYehuda Sadeh return ret;
73397f21735fSYang Yingliang }
7340602adf40SYehuda Sadeh
7341fed4c143SAlex Elder ret = bus_register(&rbd_bus_type);
7342fed4c143SAlex Elder if (ret < 0)
7343fed4c143SAlex Elder device_unregister(&rbd_root_dev);
7344602adf40SYehuda Sadeh
7345602adf40SYehuda Sadeh return ret;
7346602adf40SYehuda Sadeh }
7347602adf40SYehuda Sadeh
rbd_sysfs_cleanup(void)73487d8dc534SChengguang Xu static void __exit rbd_sysfs_cleanup(void)
7349602adf40SYehuda Sadeh {
7350dfc5606dSYehuda Sadeh bus_unregister(&rbd_bus_type);
7351fed4c143SAlex Elder device_unregister(&rbd_root_dev);
7352602adf40SYehuda Sadeh }
7353602adf40SYehuda Sadeh
rbd_slab_init(void)73547d8dc534SChengguang Xu static int __init rbd_slab_init(void)
73551c2a9dfeSAlex Elder {
73561c2a9dfeSAlex Elder rbd_assert(!rbd_img_request_cache);
735703d94406SGeliang Tang rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
7358868311b1SAlex Elder if (!rbd_img_request_cache)
7359868311b1SAlex Elder return -ENOMEM;
7360868311b1SAlex Elder
7361868311b1SAlex Elder rbd_assert(!rbd_obj_request_cache);
736203d94406SGeliang Tang rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
736378c2a44aSAlex Elder if (!rbd_obj_request_cache)
736478c2a44aSAlex Elder goto out_err;
736578c2a44aSAlex Elder
73661c2a9dfeSAlex Elder return 0;
73671c2a9dfeSAlex Elder
73686c696d85SIlya Dryomov out_err:
7369868311b1SAlex Elder kmem_cache_destroy(rbd_img_request_cache);
7370868311b1SAlex Elder rbd_img_request_cache = NULL;
73711c2a9dfeSAlex Elder return -ENOMEM;
73721c2a9dfeSAlex Elder }
73731c2a9dfeSAlex Elder
rbd_slab_exit(void)73741c2a9dfeSAlex Elder static void rbd_slab_exit(void)
73751c2a9dfeSAlex Elder {
7376868311b1SAlex Elder rbd_assert(rbd_obj_request_cache);
7377868311b1SAlex Elder kmem_cache_destroy(rbd_obj_request_cache);
7378868311b1SAlex Elder rbd_obj_request_cache = NULL;
7379868311b1SAlex Elder
73801c2a9dfeSAlex Elder rbd_assert(rbd_img_request_cache);
73811c2a9dfeSAlex Elder kmem_cache_destroy(rbd_img_request_cache);
73821c2a9dfeSAlex Elder rbd_img_request_cache = NULL;
73831c2a9dfeSAlex Elder }
73841c2a9dfeSAlex Elder
rbd_init(void)7385cc344fa1SAlex Elder static int __init rbd_init(void)
7386602adf40SYehuda Sadeh {
7387602adf40SYehuda Sadeh int rc;
7388602adf40SYehuda Sadeh
73891e32d34cSAlex Elder if (!libceph_compatible(NULL)) {
73901e32d34cSAlex Elder rbd_warn(NULL, "libceph incompatibility (quitting)");
73911e32d34cSAlex Elder return -EINVAL;
73921e32d34cSAlex Elder }
7393e1b4d96dSIlya Dryomov
73941c2a9dfeSAlex Elder rc = rbd_slab_init();
7395602adf40SYehuda Sadeh if (rc)
7396602adf40SYehuda Sadeh return rc;
7397e1b4d96dSIlya Dryomov
7398f5ee37bdSIlya Dryomov /*
7399f5ee37bdSIlya Dryomov * The number of active work items is limited by the number of
7400f77303bdSIlya Dryomov * rbd devices * queue depth, so leave @max_active at default.
7401f5ee37bdSIlya Dryomov */
7402f5ee37bdSIlya Dryomov rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
7403f5ee37bdSIlya Dryomov if (!rbd_wq) {
7404f5ee37bdSIlya Dryomov rc = -ENOMEM;
7405f5ee37bdSIlya Dryomov goto err_out_slab;
7406f5ee37bdSIlya Dryomov }
7407f5ee37bdSIlya Dryomov
74089b60e70bSIlya Dryomov if (single_major) {
74099b60e70bSIlya Dryomov rbd_major = register_blkdev(0, RBD_DRV_NAME);
74109b60e70bSIlya Dryomov if (rbd_major < 0) {
74119b60e70bSIlya Dryomov rc = rbd_major;
7412f5ee37bdSIlya Dryomov goto err_out_wq;
74139b60e70bSIlya Dryomov }
74149b60e70bSIlya Dryomov }
74159b60e70bSIlya Dryomov
74161c2a9dfeSAlex Elder rc = rbd_sysfs_init();
74171c2a9dfeSAlex Elder if (rc)
74189b60e70bSIlya Dryomov goto err_out_blkdev;
74191c2a9dfeSAlex Elder
74209b60e70bSIlya Dryomov if (single_major)
74219b60e70bSIlya Dryomov pr_info("loaded (major %d)\n", rbd_major);
74229b60e70bSIlya Dryomov else
7423e1b4d96dSIlya Dryomov pr_info("loaded\n");
74249b60e70bSIlya Dryomov
7425e1b4d96dSIlya Dryomov return 0;
7426e1b4d96dSIlya Dryomov
74279b60e70bSIlya Dryomov err_out_blkdev:
74289b60e70bSIlya Dryomov if (single_major)
74299b60e70bSIlya Dryomov unregister_blkdev(rbd_major, RBD_DRV_NAME);
7430f5ee37bdSIlya Dryomov err_out_wq:
7431f5ee37bdSIlya Dryomov destroy_workqueue(rbd_wq);
7432e1b4d96dSIlya Dryomov err_out_slab:
7433e1b4d96dSIlya Dryomov rbd_slab_exit();
74341c2a9dfeSAlex Elder return rc;
7435602adf40SYehuda Sadeh }
7436602adf40SYehuda Sadeh
rbd_exit(void)7437cc344fa1SAlex Elder static void __exit rbd_exit(void)
7438602adf40SYehuda Sadeh {
7439ffe312cfSIlya Dryomov ida_destroy(&rbd_dev_id_ida);
7440602adf40SYehuda Sadeh rbd_sysfs_cleanup();
74419b60e70bSIlya Dryomov if (single_major)
74429b60e70bSIlya Dryomov unregister_blkdev(rbd_major, RBD_DRV_NAME);
7443f5ee37bdSIlya Dryomov destroy_workqueue(rbd_wq);
74441c2a9dfeSAlex Elder rbd_slab_exit();
7445602adf40SYehuda Sadeh }
7446602adf40SYehuda Sadeh
/* Module entry and exit points. */
7447602adf40SYehuda Sadeh module_init(rbd_init);
7448602adf40SYehuda Sadeh module_exit(rbd_exit);
7449602adf40SYehuda Sadeh
/* Module authorship. */
7450d552c619SAlex Elder MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
7451602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
7452602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
7453602adf40SYehuda Sadeh /* The following authorship is retained from the original osdblk.c. */
7454602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
7455602adf40SYehuda Sadeh
/* Module description and license. */
745690da258bSIlya Dryomov MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
7457602adf40SYehuda Sadeh MODULE_LICENSE("GPL");
7458