1e2a58ee5SAlex Elder 2602adf40SYehuda Sadeh /* 3602adf40SYehuda Sadeh rbd.c -- Export ceph rados objects as a Linux block device 4602adf40SYehuda Sadeh 5602adf40SYehuda Sadeh 6602adf40SYehuda Sadeh based on drivers/block/osdblk.c: 7602adf40SYehuda Sadeh 8602adf40SYehuda Sadeh Copyright 2009 Red Hat, Inc. 9602adf40SYehuda Sadeh 10602adf40SYehuda Sadeh This program is free software; you can redistribute it and/or modify 11602adf40SYehuda Sadeh it under the terms of the GNU General Public License as published by 12602adf40SYehuda Sadeh the Free Software Foundation. 13602adf40SYehuda Sadeh 14602adf40SYehuda Sadeh This program is distributed in the hope that it will be useful, 15602adf40SYehuda Sadeh but WITHOUT ANY WARRANTY; without even the implied warranty of 16602adf40SYehuda Sadeh MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17602adf40SYehuda Sadeh GNU General Public License for more details. 18602adf40SYehuda Sadeh 19602adf40SYehuda Sadeh You should have received a copy of the GNU General Public License 20602adf40SYehuda Sadeh along with this program; see the file COPYING. If not, write to 21602adf40SYehuda Sadeh the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 22602adf40SYehuda Sadeh 23602adf40SYehuda Sadeh 24602adf40SYehuda Sadeh 25dfc5606dSYehuda Sadeh For usage instructions, please refer to: 26602adf40SYehuda Sadeh 27dfc5606dSYehuda Sadeh Documentation/ABI/testing/sysfs-bus-rbd 28602adf40SYehuda Sadeh 29602adf40SYehuda Sadeh */ 30602adf40SYehuda Sadeh 31602adf40SYehuda Sadeh #include <linux/ceph/libceph.h> 32602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h> 33602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h> 34ed95b21aSIlya Dryomov #include <linux/ceph/cls_lock_client.h> 3543df3d35SIlya Dryomov #include <linux/ceph/striper.h> 36602adf40SYehuda Sadeh #include <linux/ceph/decode.h> 3782995cc6SDavid Howells #include <linux/fs_parser.h> 3830d1cff8SAlex Elder #include <linux/bsearch.h> 39602adf40SYehuda Sadeh 40602adf40SYehuda Sadeh #include <linux/kernel.h> 41602adf40SYehuda Sadeh #include <linux/device.h> 42602adf40SYehuda Sadeh #include <linux/module.h> 437ad18afaSChristoph Hellwig #include <linux/blk-mq.h> 44602adf40SYehuda Sadeh #include <linux/fs.h> 45602adf40SYehuda Sadeh #include <linux/blkdev.h> 461c2a9dfeSAlex Elder #include <linux/slab.h> 47f8a22fc2SIlya Dryomov #include <linux/idr.h> 48bc1ecc65SIlya Dryomov #include <linux/workqueue.h> 49602adf40SYehuda Sadeh 50602adf40SYehuda Sadeh #include "rbd_types.h" 51602adf40SYehuda Sadeh 52aafb230eSAlex Elder #define RBD_DEBUG /* Activate rbd_assert() calls */ 53aafb230eSAlex Elder 54593a9e7bSAlex Elder /* 55a2acd00eSAlex Elder * Increment the given counter and return its updated value. 56a2acd00eSAlex Elder * If the counter is already 0 it will not be incremented. 57a2acd00eSAlex Elder * If the counter is already at its maximum value returns 58a2acd00eSAlex Elder * -EINVAL without updating it. 59a2acd00eSAlex Elder */ 60a2acd00eSAlex Elder static int atomic_inc_return_safe(atomic_t *v) 61a2acd00eSAlex Elder { 62a2acd00eSAlex Elder unsigned int counter; 63a2acd00eSAlex Elder 64bfc18e38SMark Rutland counter = (unsigned int)atomic_fetch_add_unless(v, 1, 0); 65a2acd00eSAlex Elder if (counter <= (unsigned int)INT_MAX) 66a2acd00eSAlex Elder return (int)counter; 67a2acd00eSAlex Elder 68a2acd00eSAlex Elder atomic_dec(v); 69a2acd00eSAlex Elder 70a2acd00eSAlex Elder return -EINVAL; 71a2acd00eSAlex Elder } 72a2acd00eSAlex Elder 73a2acd00eSAlex Elder /* Decrement the counter. Return the resulting value, or -EINVAL */ 74a2acd00eSAlex Elder static int atomic_dec_return_safe(atomic_t *v) 75a2acd00eSAlex Elder { 76a2acd00eSAlex Elder int counter; 77a2acd00eSAlex Elder 78a2acd00eSAlex Elder counter = atomic_dec_return(v); 79a2acd00eSAlex Elder if (counter >= 0) 80a2acd00eSAlex Elder return counter; 81a2acd00eSAlex Elder 82a2acd00eSAlex Elder atomic_inc(v); 83a2acd00eSAlex Elder 84a2acd00eSAlex Elder return -EINVAL; 85a2acd00eSAlex Elder } 86a2acd00eSAlex Elder 87f0f8cef5SAlex Elder #define RBD_DRV_NAME "rbd" 88602adf40SYehuda Sadeh 897e513d43SIlya Dryomov #define RBD_MINORS_PER_MAJOR 256 907e513d43SIlya Dryomov #define RBD_SINGLE_MAJOR_PART_SHIFT 4 91602adf40SYehuda Sadeh 926d69bb53SIlya Dryomov #define RBD_MAX_PARENT_CHAIN_LEN 16 936d69bb53SIlya Dryomov 94d4b125e9SAlex Elder #define RBD_SNAP_DEV_NAME_PREFIX "snap_" 95d4b125e9SAlex Elder #define RBD_MAX_SNAP_NAME_LEN \ 96d4b125e9SAlex Elder (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1)) 97d4b125e9SAlex Elder 9835d489f9SAlex Elder #define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ 99602adf40SYehuda Sadeh 100602adf40SYehuda Sadeh #define RBD_SNAP_HEAD_NAME "-" 101602adf40SYehuda Sadeh 1029682fc6dSAlex Elder #define BAD_SNAP_INDEX U32_MAX /* invalid index into snap array */ 1039682fc6dSAlex Elder 1049e15b77dSAlex Elder /* This allows a single page to hold an image name sent by OSD */ 1059e15b77dSAlex Elder #define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1) 106589d30e0SAlex Elder #define RBD_IMAGE_ID_LEN_MAX 64 1079e15b77dSAlex Elder 1081e130199SAlex Elder #define RBD_OBJ_PREFIX_LEN_MAX 64 109589d30e0SAlex Elder 110ed95b21aSIlya Dryomov #define RBD_NOTIFY_TIMEOUT 5 /* seconds */ 11199d16943SIlya Dryomov #define RBD_RETRY_DELAY msecs_to_jiffies(1000) 11299d16943SIlya Dryomov 113d889140cSAlex Elder /* Feature bits */ 114d889140cSAlex Elder 1158767b293SIlya Dryomov #define RBD_FEATURE_LAYERING (1ULL<<0) 1168767b293SIlya Dryomov #define RBD_FEATURE_STRIPINGV2 (1ULL<<1) 1178767b293SIlya Dryomov #define RBD_FEATURE_EXCLUSIVE_LOCK (1ULL<<2) 11822e8bd51SIlya Dryomov #define RBD_FEATURE_OBJECT_MAP (1ULL<<3) 11922e8bd51SIlya Dryomov #define RBD_FEATURE_FAST_DIFF (1ULL<<4) 120b9f6d447SIlya Dryomov #define RBD_FEATURE_DEEP_FLATTEN (1ULL<<5) 1218767b293SIlya Dryomov #define RBD_FEATURE_DATA_POOL (1ULL<<7) 122e573427aSIlya Dryomov #define RBD_FEATURE_OPERATIONS (1ULL<<8) 1238767b293SIlya Dryomov 124ed95b21aSIlya Dryomov #define RBD_FEATURES_ALL (RBD_FEATURE_LAYERING | \ 125ed95b21aSIlya Dryomov RBD_FEATURE_STRIPINGV2 | \ 1267e97332eSIlya Dryomov RBD_FEATURE_EXCLUSIVE_LOCK | \ 12722e8bd51SIlya Dryomov RBD_FEATURE_OBJECT_MAP | \ 12822e8bd51SIlya Dryomov RBD_FEATURE_FAST_DIFF | \ 129b9f6d447SIlya Dryomov RBD_FEATURE_DEEP_FLATTEN | \ 130e573427aSIlya Dryomov RBD_FEATURE_DATA_POOL | \ 131e573427aSIlya Dryomov RBD_FEATURE_OPERATIONS) 132d889140cSAlex Elder 133d889140cSAlex Elder /* Features supported by this (client software) implementation. */ 134d889140cSAlex Elder 135770eba6eSAlex Elder #define RBD_FEATURES_SUPPORTED (RBD_FEATURES_ALL) 136d889140cSAlex Elder 13781a89793SAlex Elder /* 13881a89793SAlex Elder * An RBD device name will be "rbd#", where the "rbd" comes from 13981a89793SAlex Elder * RBD_DRV_NAME above, and # is a unique integer identifier. 14081a89793SAlex Elder */ 141602adf40SYehuda Sadeh #define DEV_NAME_LEN 32 142602adf40SYehuda Sadeh 143602adf40SYehuda Sadeh /* 144602adf40SYehuda Sadeh * block device image metadata (in-memory version) 145602adf40SYehuda Sadeh */ 146602adf40SYehuda Sadeh struct rbd_image_header { 147f35a4deeSAlex Elder /* These six fields never change for a given rbd image */ 148849b4260SAlex Elder char *object_prefix; 149602adf40SYehuda Sadeh __u8 obj_order; 150f35a4deeSAlex Elder u64 stripe_unit; 151f35a4deeSAlex Elder u64 stripe_count; 1527e97332eSIlya Dryomov s64 data_pool_id; 153f35a4deeSAlex Elder u64 features; /* Might be changeable someday? */ 154602adf40SYehuda Sadeh 155f84344f3SAlex Elder /* The remaining fields need to be updated occasionally */ 156f84344f3SAlex Elder u64 image_size; 157f84344f3SAlex Elder struct ceph_snap_context *snapc; 158f35a4deeSAlex Elder char *snap_names; /* format 1 only */ 159f35a4deeSAlex Elder u64 *snap_sizes; /* format 1 only */ 16059c2be1eSYehuda Sadeh }; 16159c2be1eSYehuda Sadeh 1620d7dbfceSAlex Elder /* 1630d7dbfceSAlex Elder * An rbd image specification. 1640d7dbfceSAlex Elder * 1650d7dbfceSAlex Elder * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely 166c66c6e0cSAlex Elder * identify an image. Each rbd_dev structure includes a pointer to 167c66c6e0cSAlex Elder * an rbd_spec structure that encapsulates this identity. 168c66c6e0cSAlex Elder * 169c66c6e0cSAlex Elder * Each of the id's in an rbd_spec has an associated name. For a 170c66c6e0cSAlex Elder * user-mapped image, the names are supplied and the id's associated 171c66c6e0cSAlex Elder * with them are looked up. For a layered image, a parent image is 172c66c6e0cSAlex Elder * defined by the tuple, and the names are looked up. 173c66c6e0cSAlex Elder * 174c66c6e0cSAlex Elder * An rbd_dev structure contains a parent_spec pointer which is 175c66c6e0cSAlex Elder * non-null if the image it represents is a child in a layered 176c66c6e0cSAlex Elder * image. This pointer will refer to the rbd_spec structure used 177c66c6e0cSAlex Elder * by the parent rbd_dev for its own identity (i.e., the structure 178c66c6e0cSAlex Elder * is shared between the parent and child). 179c66c6e0cSAlex Elder * 180c66c6e0cSAlex Elder * Since these structures are populated once, during the discovery 181c66c6e0cSAlex Elder * phase of image construction, they are effectively immutable so 182c66c6e0cSAlex Elder * we make no effort to synchronize access to them. 183c66c6e0cSAlex Elder * 184c66c6e0cSAlex Elder * Note that code herein does not assume the image name is known (it 185c66c6e0cSAlex Elder * could be a null pointer). 1860d7dbfceSAlex Elder */ 1870d7dbfceSAlex Elder struct rbd_spec { 1880d7dbfceSAlex Elder u64 pool_id; 189ecb4dc22SAlex Elder const char *pool_name; 190b26c047bSIlya Dryomov const char *pool_ns; /* NULL if default, never "" */ 1910d7dbfceSAlex Elder 192ecb4dc22SAlex Elder const char *image_id; 193ecb4dc22SAlex Elder const char *image_name; 1940d7dbfceSAlex Elder 1950d7dbfceSAlex Elder u64 snap_id; 196ecb4dc22SAlex Elder const char *snap_name; 1970d7dbfceSAlex Elder 1980d7dbfceSAlex Elder struct kref kref; 1990d7dbfceSAlex Elder }; 2000d7dbfceSAlex Elder 201602adf40SYehuda Sadeh /* 202f0f8cef5SAlex Elder * an instance of the client. multiple devices may share an rbd client. 203602adf40SYehuda Sadeh */ 204602adf40SYehuda Sadeh struct rbd_client { 205602adf40SYehuda Sadeh struct ceph_client *client; 206602adf40SYehuda Sadeh struct kref kref; 207602adf40SYehuda Sadeh struct list_head node; 208602adf40SYehuda Sadeh }; 209602adf40SYehuda Sadeh 2100192ce2eSIlya Dryomov struct pending_result { 2110192ce2eSIlya Dryomov int result; /* first nonzero result */ 2120192ce2eSIlya Dryomov int num_pending; 2130192ce2eSIlya Dryomov }; 2140192ce2eSIlya Dryomov 215bf0d5f50SAlex Elder struct rbd_img_request; 216bf0d5f50SAlex Elder 2179969ebc5SAlex Elder enum obj_request_type { 218a1fbb5e7SIlya Dryomov OBJ_REQUEST_NODATA = 1, 2195359a17dSIlya Dryomov OBJ_REQUEST_BIO, /* pointer into provided bio (list) */ 2207e07efb1SIlya Dryomov OBJ_REQUEST_BVECS, /* pointer into provided bio_vec array */ 221afb97888SIlya Dryomov OBJ_REQUEST_OWN_BVECS, /* private bio_vec array, doesn't own pages */ 2229969ebc5SAlex Elder }; 223bf0d5f50SAlex Elder 2246d2940c8SGuangliang Zhao enum obj_operation_type { 225a1fbb5e7SIlya Dryomov OBJ_OP_READ = 1, 2266d2940c8SGuangliang Zhao OBJ_OP_WRITE, 22790e98c52SGuangliang Zhao OBJ_OP_DISCARD, 2286484cbe9SIlya Dryomov OBJ_OP_ZEROOUT, 2296d2940c8SGuangliang Zhao }; 2306d2940c8SGuangliang Zhao 2310ad5d953SIlya Dryomov #define RBD_OBJ_FLAG_DELETION (1U << 0) 2320ad5d953SIlya Dryomov #define RBD_OBJ_FLAG_COPYUP_ENABLED (1U << 1) 233793333a3SIlya Dryomov #define RBD_OBJ_FLAG_COPYUP_ZEROS (1U << 2) 23422e8bd51SIlya Dryomov #define RBD_OBJ_FLAG_MAY_EXIST (1U << 3) 23522e8bd51SIlya Dryomov #define RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT (1U << 4) 2360ad5d953SIlya Dryomov 237a9b67e69SIlya Dryomov enum rbd_obj_read_state { 23885b5e6d1SIlya Dryomov RBD_OBJ_READ_START = 1, 23985b5e6d1SIlya Dryomov RBD_OBJ_READ_OBJECT, 240a9b67e69SIlya Dryomov RBD_OBJ_READ_PARENT, 241a9b67e69SIlya Dryomov }; 242a9b67e69SIlya Dryomov 2433da691bfSIlya Dryomov /* 2443da691bfSIlya Dryomov * Writes go through the following state machine to deal with 2453da691bfSIlya Dryomov * layering: 2463da691bfSIlya Dryomov * 24789a59c1cSIlya Dryomov * . . . . . RBD_OBJ_WRITE_GUARD. . . . . . . . . . . . . . 24889a59c1cSIlya Dryomov * . | . 24989a59c1cSIlya Dryomov * . v . 25089a59c1cSIlya Dryomov * . RBD_OBJ_WRITE_READ_FROM_PARENT. . . . 25189a59c1cSIlya Dryomov * . | . . 25289a59c1cSIlya Dryomov * . v v (deep-copyup . 25389a59c1cSIlya Dryomov * (image . RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC . not needed) . 25489a59c1cSIlya Dryomov * flattened) v | . . 25589a59c1cSIlya Dryomov * . v . . 25689a59c1cSIlya Dryomov * . . . .RBD_OBJ_WRITE_COPYUP_OPS. . . . . (copyup . 25789a59c1cSIlya Dryomov * | not needed) v 25889a59c1cSIlya Dryomov * v . 25989a59c1cSIlya Dryomov * done . . . . . . . . . . . . . . . . . . 2603da691bfSIlya Dryomov * ^ 2613da691bfSIlya Dryomov * | 2623da691bfSIlya Dryomov * RBD_OBJ_WRITE_FLAT 2633da691bfSIlya Dryomov * 2643da691bfSIlya Dryomov * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether 26589a59c1cSIlya Dryomov * assert_exists guard is needed or not (in some cases it's not needed 26689a59c1cSIlya Dryomov * even if there is a parent). 2673da691bfSIlya Dryomov */ 2683da691bfSIlya Dryomov enum rbd_obj_write_state { 26985b5e6d1SIlya Dryomov RBD_OBJ_WRITE_START = 1, 27022e8bd51SIlya Dryomov RBD_OBJ_WRITE_PRE_OBJECT_MAP, 27185b5e6d1SIlya Dryomov RBD_OBJ_WRITE_OBJECT, 272793333a3SIlya Dryomov __RBD_OBJ_WRITE_COPYUP, 273793333a3SIlya Dryomov RBD_OBJ_WRITE_COPYUP, 27422e8bd51SIlya Dryomov RBD_OBJ_WRITE_POST_OBJECT_MAP, 275793333a3SIlya Dryomov }; 276793333a3SIlya Dryomov 277793333a3SIlya Dryomov enum rbd_obj_copyup_state { 278793333a3SIlya Dryomov RBD_OBJ_COPYUP_START = 1, 279793333a3SIlya Dryomov RBD_OBJ_COPYUP_READ_PARENT, 28022e8bd51SIlya Dryomov __RBD_OBJ_COPYUP_OBJECT_MAPS, 28122e8bd51SIlya Dryomov RBD_OBJ_COPYUP_OBJECT_MAPS, 282793333a3SIlya Dryomov __RBD_OBJ_COPYUP_WRITE_OBJECT, 283793333a3SIlya Dryomov RBD_OBJ_COPYUP_WRITE_OBJECT, 284926f9b3fSAlex Elder }; 285926f9b3fSAlex Elder 286bf0d5f50SAlex Elder struct rbd_obj_request { 28743df3d35SIlya Dryomov struct ceph_object_extent ex; 2880ad5d953SIlya Dryomov unsigned int flags; /* RBD_OBJ_FLAG_* */ 289c5b5ef6cSAlex Elder union { 290a9b67e69SIlya Dryomov enum rbd_obj_read_state read_state; /* for reads */ 2913da691bfSIlya Dryomov enum rbd_obj_write_state write_state; /* for writes */ 2923da691bfSIlya Dryomov }; 293bf0d5f50SAlex Elder 294bf0d5f50SAlex Elder struct rbd_img_request *img_request; 29586bd7998SIlya Dryomov struct ceph_file_extent *img_extents; 29686bd7998SIlya Dryomov u32 num_img_extents; 297bf0d5f50SAlex Elder 298788e2df3SAlex Elder union { 2995359a17dSIlya Dryomov struct ceph_bio_iter bio_pos; 300788e2df3SAlex Elder struct { 3017e07efb1SIlya Dryomov struct ceph_bvec_iter bvec_pos; 3027e07efb1SIlya Dryomov u32 bvec_count; 303afb97888SIlya Dryomov u32 bvec_idx; 304788e2df3SAlex Elder }; 305788e2df3SAlex Elder }; 306793333a3SIlya Dryomov 307793333a3SIlya Dryomov enum rbd_obj_copyup_state copyup_state; 3087e07efb1SIlya Dryomov struct bio_vec *copyup_bvecs; 3097e07efb1SIlya Dryomov u32 copyup_bvec_count; 310bf0d5f50SAlex Elder 311bcbab1dbSIlya Dryomov struct list_head osd_reqs; /* w/ r_private_item */ 312bf0d5f50SAlex Elder 31385b5e6d1SIlya Dryomov struct mutex state_mutex; 314793333a3SIlya Dryomov struct pending_result pending; 315bf0d5f50SAlex Elder struct kref kref; 316bf0d5f50SAlex Elder }; 317bf0d5f50SAlex Elder 3180c425248SAlex Elder enum img_req_flags { 3199849e986SAlex Elder IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */ 320d0b2e944SAlex Elder IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */ 3210c425248SAlex Elder }; 3220c425248SAlex Elder 3230192ce2eSIlya Dryomov enum rbd_img_state { 3240192ce2eSIlya Dryomov RBD_IMG_START = 1, 325637cd060SIlya Dryomov RBD_IMG_EXCLUSIVE_LOCK, 3260192ce2eSIlya Dryomov __RBD_IMG_OBJECT_REQUESTS, 3270192ce2eSIlya Dryomov RBD_IMG_OBJECT_REQUESTS, 3280192ce2eSIlya Dryomov }; 3290192ce2eSIlya Dryomov 330bf0d5f50SAlex Elder struct rbd_img_request { 331bf0d5f50SAlex Elder struct rbd_device *rbd_dev; 3329bb0248dSIlya Dryomov enum obj_operation_type op_type; 333ecc633caSIlya Dryomov enum obj_request_type data_type; 3340c425248SAlex Elder unsigned long flags; 3350192ce2eSIlya Dryomov enum rbd_img_state state; 336bf0d5f50SAlex Elder union { 337bf0d5f50SAlex Elder u64 snap_id; /* for reads */ 3389849e986SAlex Elder struct ceph_snap_context *snapc; /* for writes */ 3399849e986SAlex Elder }; 3409849e986SAlex Elder struct rbd_obj_request *obj_request; /* obj req initiator */ 341bf0d5f50SAlex Elder 342e1fddc8fSIlya Dryomov struct list_head lock_item; 34343df3d35SIlya Dryomov struct list_head object_extents; /* obj_req.ex structs */ 344bf0d5f50SAlex Elder 3450192ce2eSIlya Dryomov struct mutex state_mutex; 3460192ce2eSIlya Dryomov struct pending_result pending; 3470192ce2eSIlya Dryomov struct work_struct work; 3480192ce2eSIlya Dryomov int work_result; 349bf0d5f50SAlex Elder }; 350bf0d5f50SAlex Elder 351bf0d5f50SAlex Elder #define for_each_obj_request(ireq, oreq) \ 35243df3d35SIlya Dryomov list_for_each_entry(oreq, &(ireq)->object_extents, ex.oe_item) 353bf0d5f50SAlex Elder #define for_each_obj_request_safe(ireq, oreq, n) \ 35443df3d35SIlya Dryomov list_for_each_entry_safe(oreq, n, &(ireq)->object_extents, ex.oe_item) 355bf0d5f50SAlex Elder 35699d16943SIlya Dryomov enum rbd_watch_state { 35799d16943SIlya Dryomov RBD_WATCH_STATE_UNREGISTERED, 35899d16943SIlya Dryomov RBD_WATCH_STATE_REGISTERED, 35999d16943SIlya Dryomov RBD_WATCH_STATE_ERROR, 36099d16943SIlya Dryomov }; 36199d16943SIlya Dryomov 362ed95b21aSIlya Dryomov enum rbd_lock_state { 363ed95b21aSIlya Dryomov RBD_LOCK_STATE_UNLOCKED, 364ed95b21aSIlya Dryomov RBD_LOCK_STATE_LOCKED, 365ed95b21aSIlya Dryomov RBD_LOCK_STATE_RELEASING, 366ed95b21aSIlya Dryomov }; 367ed95b21aSIlya Dryomov 368ed95b21aSIlya Dryomov /* WatchNotify::ClientId */ 369ed95b21aSIlya Dryomov struct rbd_client_id { 370ed95b21aSIlya Dryomov u64 gid; 371ed95b21aSIlya Dryomov u64 handle; 372ed95b21aSIlya Dryomov }; 373ed95b21aSIlya Dryomov 374f84344f3SAlex Elder struct rbd_mapping { 37599c1f08fSAlex Elder u64 size; 376f84344f3SAlex Elder }; 377f84344f3SAlex Elder 378602adf40SYehuda Sadeh /* 379602adf40SYehuda Sadeh * a single device 380602adf40SYehuda Sadeh */ 381602adf40SYehuda Sadeh struct rbd_device { 382de71a297SAlex Elder int dev_id; /* blkdev unique id */ 383602adf40SYehuda Sadeh 384602adf40SYehuda Sadeh int major; /* blkdev assigned major */ 385dd82fff1SIlya Dryomov int minor; 386602adf40SYehuda Sadeh struct gendisk *disk; /* blkdev's gendisk and rq */ 387602adf40SYehuda Sadeh 388a30b71b9SAlex Elder u32 image_format; /* Either 1 or 2 */ 389602adf40SYehuda Sadeh struct rbd_client *rbd_client; 390602adf40SYehuda Sadeh 391602adf40SYehuda Sadeh char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ 392602adf40SYehuda Sadeh 393b82d167bSAlex Elder spinlock_t lock; /* queue, flags, open_count */ 394602adf40SYehuda Sadeh 395602adf40SYehuda Sadeh struct rbd_image_header header; 396b82d167bSAlex Elder unsigned long flags; /* possibly lock protected */ 3970d7dbfceSAlex Elder struct rbd_spec *spec; 398d147543dSIlya Dryomov struct rbd_options *opts; 3990d6d1e9cSMike Christie char *config_info; /* add{,_single_major} string */ 400602adf40SYehuda Sadeh 401c41d13a3SIlya Dryomov struct ceph_object_id header_oid; 402922dab61SIlya Dryomov struct ceph_object_locator header_oloc; 403971f839aSAlex Elder 4041643dfa4SIlya Dryomov struct ceph_file_layout layout; /* used for all rbd requests */ 4050903e875SAlex Elder 40699d16943SIlya Dryomov struct mutex watch_mutex; 40799d16943SIlya Dryomov enum rbd_watch_state watch_state; 408922dab61SIlya Dryomov struct ceph_osd_linger_request *watch_handle; 40999d16943SIlya Dryomov u64 watch_cookie; 41099d16943SIlya Dryomov struct delayed_work watch_dwork; 41159c2be1eSYehuda Sadeh 412ed95b21aSIlya Dryomov struct rw_semaphore lock_rwsem; 413ed95b21aSIlya Dryomov enum rbd_lock_state lock_state; 414cbbfb0ffSIlya Dryomov char lock_cookie[32]; 415ed95b21aSIlya Dryomov struct rbd_client_id owner_cid; 416ed95b21aSIlya Dryomov struct work_struct acquired_lock_work; 417ed95b21aSIlya Dryomov struct work_struct released_lock_work; 418ed95b21aSIlya Dryomov struct delayed_work lock_dwork; 419ed95b21aSIlya Dryomov struct work_struct unlock_work; 420e1fddc8fSIlya Dryomov spinlock_t lock_lists_lock; 421637cd060SIlya Dryomov struct list_head acquiring_list; 422e1fddc8fSIlya Dryomov struct list_head running_list; 423637cd060SIlya Dryomov struct completion acquire_wait; 424637cd060SIlya Dryomov int acquire_err; 425e1fddc8fSIlya Dryomov struct completion releasing_wait; 426ed95b21aSIlya Dryomov 42722e8bd51SIlya Dryomov spinlock_t object_map_lock; 42822e8bd51SIlya Dryomov u8 *object_map; 42922e8bd51SIlya Dryomov u64 object_map_size; /* in objects */ 43022e8bd51SIlya Dryomov u64 object_map_flags; 431602adf40SYehuda Sadeh 4321643dfa4SIlya Dryomov struct workqueue_struct *task_wq; 433602adf40SYehuda Sadeh 43486b00e0dSAlex Elder struct rbd_spec *parent_spec; 43586b00e0dSAlex Elder u64 parent_overlap; 436a2acd00eSAlex Elder atomic_t parent_ref; 4372f82ee54SAlex Elder struct rbd_device *parent; 43886b00e0dSAlex Elder 4397ad18afaSChristoph Hellwig /* Block layer tags. */ 4407ad18afaSChristoph Hellwig struct blk_mq_tag_set tag_set; 4417ad18afaSChristoph Hellwig 442c666601aSJosh Durgin /* protects updating the header */ 443c666601aSJosh Durgin struct rw_semaphore header_rwsem; 444f84344f3SAlex Elder 445f84344f3SAlex Elder struct rbd_mapping mapping; 446602adf40SYehuda Sadeh 447602adf40SYehuda Sadeh struct list_head node; 448dfc5606dSYehuda Sadeh 449dfc5606dSYehuda Sadeh /* sysfs related */ 450dfc5606dSYehuda Sadeh struct device dev; 451b82d167bSAlex Elder unsigned long open_count; /* protected by lock */ 452dfc5606dSYehuda Sadeh }; 453dfc5606dSYehuda Sadeh 454b82d167bSAlex Elder /* 45587c0fdedSIlya Dryomov * Flag bits for rbd_dev->flags: 45687c0fdedSIlya Dryomov * - REMOVING (which is coupled with rbd_dev->open_count) is protected 45787c0fdedSIlya Dryomov * by rbd_dev->lock 458b82d167bSAlex Elder */ 4596d292906SAlex Elder enum rbd_dev_flags { 460686238b7SIlya Dryomov RBD_DEV_FLAG_EXISTS, /* rbd_dev_device_setup() ran */ 461b82d167bSAlex Elder RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */ 46239258aa2SIlya Dryomov RBD_DEV_FLAG_READONLY, /* -o ro or snapshot */ 4636d292906SAlex Elder }; 4646d292906SAlex Elder 465cfbf6377SAlex Elder static DEFINE_MUTEX(client_mutex); /* Serialize client creation */ 466e124a82fSAlex Elder 467602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list); /* devices */ 468e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock); 469e124a82fSAlex Elder 470602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list); /* clients */ 471432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock); 472602adf40SYehuda Sadeh 47378c2a44aSAlex Elder /* Slab caches for frequently-allocated structures */ 47478c2a44aSAlex Elder 4751c2a9dfeSAlex Elder static struct kmem_cache *rbd_img_request_cache; 476868311b1SAlex Elder static struct kmem_cache *rbd_obj_request_cache; 4771c2a9dfeSAlex Elder 4789b60e70bSIlya Dryomov static int rbd_major; 479f8a22fc2SIlya Dryomov static DEFINE_IDA(rbd_dev_id_ida); 480f8a22fc2SIlya Dryomov 481f5ee37bdSIlya Dryomov static struct workqueue_struct *rbd_wq; 482f5ee37bdSIlya Dryomov 48389a59c1cSIlya Dryomov static struct ceph_snap_context rbd_empty_snapc = { 48489a59c1cSIlya Dryomov .nref = REFCOUNT_INIT(1), 48589a59c1cSIlya Dryomov }; 48689a59c1cSIlya Dryomov 4879b60e70bSIlya Dryomov /* 4883cfa3b16SIlya Dryomov * single-major requires >= 0.75 version of userspace rbd utility. 4899b60e70bSIlya Dryomov */ 4903cfa3b16SIlya Dryomov static bool single_major = true; 4915657a819SJoe Perches module_param(single_major, bool, 0444); 4923cfa3b16SIlya Dryomov MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)"); 4939b60e70bSIlya Dryomov 4947e9586baSGreg Kroah-Hartman static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count); 4957e9586baSGreg Kroah-Hartman static ssize_t remove_store(struct bus_type *bus, const char *buf, 496f0f8cef5SAlex Elder size_t count); 4977e9586baSGreg Kroah-Hartman static ssize_t add_single_major_store(struct bus_type *bus, const char *buf, 498f0f8cef5SAlex Elder size_t count); 4997e9586baSGreg Kroah-Hartman static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf, 5009b60e70bSIlya Dryomov size_t count); 5016d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth); 502f0f8cef5SAlex Elder 5039b60e70bSIlya Dryomov static int rbd_dev_id_to_minor(int dev_id) 5049b60e70bSIlya Dryomov { 5057e513d43SIlya Dryomov return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT; 5069b60e70bSIlya Dryomov } 5079b60e70bSIlya Dryomov 5089b60e70bSIlya Dryomov static int minor_to_rbd_dev_id(int minor) 5099b60e70bSIlya Dryomov { 5107e513d43SIlya Dryomov return minor >> RBD_SINGLE_MAJOR_PART_SHIFT; 5119b60e70bSIlya Dryomov } 5129b60e70bSIlya Dryomov 51339258aa2SIlya Dryomov static bool rbd_is_ro(struct rbd_device *rbd_dev) 51439258aa2SIlya Dryomov { 51539258aa2SIlya Dryomov return test_bit(RBD_DEV_FLAG_READONLY, &rbd_dev->flags); 51639258aa2SIlya Dryomov } 51739258aa2SIlya Dryomov 518f3c0e459SIlya Dryomov static bool rbd_is_snap(struct rbd_device *rbd_dev) 519f3c0e459SIlya Dryomov { 520f3c0e459SIlya Dryomov return rbd_dev->spec->snap_id != CEPH_NOSNAP; 521f3c0e459SIlya Dryomov } 522f3c0e459SIlya Dryomov 523ed95b21aSIlya Dryomov static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev) 524ed95b21aSIlya Dryomov { 525637cd060SIlya Dryomov lockdep_assert_held(&rbd_dev->lock_rwsem); 526637cd060SIlya Dryomov 527ed95b21aSIlya Dryomov return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED || 528ed95b21aSIlya Dryomov rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING; 529ed95b21aSIlya Dryomov } 530ed95b21aSIlya Dryomov 531ed95b21aSIlya Dryomov static bool rbd_is_lock_owner(struct rbd_device *rbd_dev) 532ed95b21aSIlya Dryomov { 533ed95b21aSIlya Dryomov bool is_lock_owner; 534ed95b21aSIlya Dryomov 535ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 536ed95b21aSIlya Dryomov is_lock_owner = __rbd_is_lock_owner(rbd_dev); 537ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 538ed95b21aSIlya Dryomov return is_lock_owner; 539ed95b21aSIlya Dryomov } 540ed95b21aSIlya Dryomov 5417e9586baSGreg Kroah-Hartman static ssize_t supported_features_show(struct bus_type *bus, char *buf) 5428767b293SIlya Dryomov { 5438767b293SIlya Dryomov return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED); 5448767b293SIlya Dryomov } 5458767b293SIlya Dryomov 5467e9586baSGreg Kroah-Hartman static BUS_ATTR_WO(add); 5477e9586baSGreg Kroah-Hartman static BUS_ATTR_WO(remove); 5487e9586baSGreg Kroah-Hartman static BUS_ATTR_WO(add_single_major); 5497e9586baSGreg Kroah-Hartman static BUS_ATTR_WO(remove_single_major); 5507e9586baSGreg Kroah-Hartman static BUS_ATTR_RO(supported_features); 551b15a21ddSGreg Kroah-Hartman 552b15a21ddSGreg Kroah-Hartman static struct attribute *rbd_bus_attrs[] = { 553b15a21ddSGreg Kroah-Hartman &bus_attr_add.attr, 554b15a21ddSGreg Kroah-Hartman &bus_attr_remove.attr, 5559b60e70bSIlya Dryomov &bus_attr_add_single_major.attr, 5569b60e70bSIlya Dryomov &bus_attr_remove_single_major.attr, 5578767b293SIlya Dryomov &bus_attr_supported_features.attr, 558b15a21ddSGreg Kroah-Hartman NULL, 559f0f8cef5SAlex Elder }; 56092c76dc0SIlya Dryomov 56192c76dc0SIlya Dryomov static umode_t rbd_bus_is_visible(struct kobject *kobj, 56292c76dc0SIlya Dryomov struct attribute *attr, int index) 56392c76dc0SIlya Dryomov { 5649b60e70bSIlya Dryomov if (!single_major && 5659b60e70bSIlya Dryomov (attr == &bus_attr_add_single_major.attr || 5669b60e70bSIlya Dryomov attr == &bus_attr_remove_single_major.attr)) 5679b60e70bSIlya Dryomov return 0; 5689b60e70bSIlya Dryomov 56992c76dc0SIlya Dryomov return attr->mode; 57092c76dc0SIlya Dryomov } 57192c76dc0SIlya Dryomov 57292c76dc0SIlya Dryomov static const struct attribute_group rbd_bus_group = { 57392c76dc0SIlya Dryomov .attrs = rbd_bus_attrs, 57492c76dc0SIlya Dryomov .is_visible = rbd_bus_is_visible, 57592c76dc0SIlya Dryomov }; 57692c76dc0SIlya Dryomov __ATTRIBUTE_GROUPS(rbd_bus); 577f0f8cef5SAlex Elder 578f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = { 579f0f8cef5SAlex Elder .name = "rbd", 580b15a21ddSGreg Kroah-Hartman .bus_groups = rbd_bus_groups, 581f0f8cef5SAlex Elder }; 582f0f8cef5SAlex Elder 583f0f8cef5SAlex Elder static void rbd_root_dev_release(struct device *dev) 584f0f8cef5SAlex Elder { 585f0f8cef5SAlex Elder } 586f0f8cef5SAlex Elder 587f0f8cef5SAlex Elder static struct device rbd_root_dev = { 588f0f8cef5SAlex Elder .init_name = "rbd", 589f0f8cef5SAlex Elder .release = rbd_root_dev_release, 590f0f8cef5SAlex Elder }; 591f0f8cef5SAlex Elder 59206ecc6cbSAlex Elder static __printf(2, 3) 59306ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...) 59406ecc6cbSAlex Elder { 59506ecc6cbSAlex Elder struct va_format vaf; 59606ecc6cbSAlex Elder va_list args; 59706ecc6cbSAlex Elder 59806ecc6cbSAlex Elder va_start(args, fmt); 59906ecc6cbSAlex Elder vaf.fmt = fmt; 60006ecc6cbSAlex Elder vaf.va = &args; 60106ecc6cbSAlex Elder 60206ecc6cbSAlex Elder if (!rbd_dev) 60306ecc6cbSAlex Elder printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf); 60406ecc6cbSAlex Elder else if (rbd_dev->disk) 60506ecc6cbSAlex Elder printk(KERN_WARNING "%s: %s: %pV\n", 60606ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf); 60706ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_name) 60806ecc6cbSAlex Elder printk(KERN_WARNING "%s: image %s: %pV\n", 60906ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf); 61006ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_id) 61106ecc6cbSAlex Elder printk(KERN_WARNING "%s: id %s: %pV\n", 61206ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf); 61306ecc6cbSAlex Elder else /* punt */ 61406ecc6cbSAlex Elder printk(KERN_WARNING "%s: rbd_dev %p: %pV\n", 61506ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev, &vaf); 61606ecc6cbSAlex Elder va_end(args); 61706ecc6cbSAlex Elder } 61806ecc6cbSAlex Elder 619aafb230eSAlex Elder #ifdef RBD_DEBUG 620aafb230eSAlex Elder #define rbd_assert(expr) \ 621aafb230eSAlex Elder if (unlikely(!(expr))) { \ 622aafb230eSAlex Elder printk(KERN_ERR "\nAssertion failure in %s() " \ 623aafb230eSAlex Elder "at line %d:\n\n" \ 624aafb230eSAlex Elder "\trbd_assert(%s);\n\n", \ 625aafb230eSAlex Elder __func__, __LINE__, #expr); \ 626aafb230eSAlex Elder BUG(); \ 627aafb230eSAlex Elder } 628aafb230eSAlex Elder #else /* !RBD_DEBUG */ 629aafb230eSAlex Elder # define rbd_assert(expr) ((void) 0) 630aafb230eSAlex Elder #endif /* !RBD_DEBUG */ 631dfc5606dSYehuda Sadeh 63205a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev); 6338b3e1a56SAlex Elder 634cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev); 6352df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev); 636a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev); 637e8f59b59SIlya Dryomov static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev); 63854cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, 63954cac61fSAlex Elder u64 snap_id); 6402ad3d716SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 6412ad3d716SAlex Elder u8 *order, u64 *snap_size); 64222e8bd51SIlya Dryomov static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev); 64359c2be1eSYehuda Sadeh 64454ab3b24SIlya Dryomov static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result); 6450192ce2eSIlya Dryomov static void rbd_img_handle_request(struct rbd_img_request *img_req, int result); 6460192ce2eSIlya Dryomov 6470192ce2eSIlya Dryomov /* 6480192ce2eSIlya Dryomov * Return true if nothing else is pending. 6490192ce2eSIlya Dryomov */ 6500192ce2eSIlya Dryomov static bool pending_result_dec(struct pending_result *pending, int *result) 6510192ce2eSIlya Dryomov { 6520192ce2eSIlya Dryomov rbd_assert(pending->num_pending > 0); 6530192ce2eSIlya Dryomov 6540192ce2eSIlya Dryomov if (*result && !pending->result) 6550192ce2eSIlya Dryomov pending->result = *result; 6560192ce2eSIlya Dryomov if (--pending->num_pending) 6570192ce2eSIlya Dryomov return false; 6580192ce2eSIlya Dryomov 6590192ce2eSIlya Dryomov *result = pending->result; 6600192ce2eSIlya Dryomov return true; 6610192ce2eSIlya Dryomov } 662602adf40SYehuda Sadeh 663602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode) 664602adf40SYehuda Sadeh { 665f0f8cef5SAlex Elder struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 666b82d167bSAlex Elder bool removing = false; 667602adf40SYehuda Sadeh 668a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 669b82d167bSAlex Elder if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) 670b82d167bSAlex Elder removing = true; 671b82d167bSAlex Elder else 672b82d167bSAlex Elder rbd_dev->open_count++; 673a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 674b82d167bSAlex Elder if (removing) 675b82d167bSAlex Elder return -ENOENT; 676b82d167bSAlex Elder 677c3e946ceSAlex Elder (void) get_device(&rbd_dev->dev); 678340c7a2bSAlex Elder 679602adf40SYehuda Sadeh return 0; 680602adf40SYehuda Sadeh } 681602adf40SYehuda Sadeh 682db2a144bSAl Viro static void rbd_release(struct gendisk *disk, fmode_t mode) 683dfc5606dSYehuda Sadeh { 684dfc5606dSYehuda Sadeh struct rbd_device *rbd_dev = disk->private_data; 685b82d167bSAlex Elder unsigned long open_count_before; 686b82d167bSAlex Elder 687a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 688b82d167bSAlex Elder open_count_before = rbd_dev->open_count--; 689a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 690b82d167bSAlex Elder rbd_assert(open_count_before > 0); 691dfc5606dSYehuda Sadeh 692c3e946ceSAlex Elder put_device(&rbd_dev->dev); 693dfc5606dSYehuda Sadeh } 694dfc5606dSYehuda Sadeh 695602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = { 696602adf40SYehuda Sadeh .owner = THIS_MODULE, 697602adf40SYehuda Sadeh .open = rbd_open, 698dfc5606dSYehuda Sadeh .release = rbd_release, 699602adf40SYehuda Sadeh }; 700602adf40SYehuda Sadeh 701602adf40SYehuda Sadeh /* 7027262cfcaSAlex Elder * Initialize an rbd client instance. Success or not, this function 703cfbf6377SAlex Elder * consumes ceph_opts. Caller holds client_mutex. 704602adf40SYehuda Sadeh */ 705f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) 706602adf40SYehuda Sadeh { 707602adf40SYehuda Sadeh struct rbd_client *rbdc; 708602adf40SYehuda Sadeh int ret = -ENOMEM; 709602adf40SYehuda Sadeh 71037206ee5SAlex Elder dout("%s:\n", __func__); 711602adf40SYehuda Sadeh rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); 712602adf40SYehuda Sadeh if (!rbdc) 713602adf40SYehuda Sadeh goto out_opt; 714602adf40SYehuda Sadeh 715602adf40SYehuda Sadeh kref_init(&rbdc->kref); 716602adf40SYehuda Sadeh INIT_LIST_HEAD(&rbdc->node); 717602adf40SYehuda Sadeh 71874da4a0fSIlya Dryomov rbdc->client = ceph_create_client(ceph_opts, rbdc); 719602adf40SYehuda Sadeh if (IS_ERR(rbdc->client)) 72008f75463SAlex Elder goto out_rbdc; 72143ae4701SAlex Elder ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */ 722602adf40SYehuda Sadeh 723602adf40SYehuda Sadeh ret = ceph_open_session(rbdc->client); 724602adf40SYehuda Sadeh if (ret < 0) 72508f75463SAlex Elder goto out_client; 726602adf40SYehuda Sadeh 727432b8587SAlex Elder spin_lock(&rbd_client_list_lock); 728602adf40SYehuda Sadeh list_add_tail(&rbdc->node, &rbd_client_list); 729432b8587SAlex Elder spin_unlock(&rbd_client_list_lock); 730602adf40SYehuda Sadeh 73137206ee5SAlex Elder dout("%s: rbdc %p\n", __func__, rbdc); 732bc534d86SAlex Elder 733602adf40SYehuda Sadeh return rbdc; 73408f75463SAlex Elder out_client: 735602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 73608f75463SAlex Elder out_rbdc: 737602adf40SYehuda Sadeh kfree(rbdc); 738602adf40SYehuda Sadeh out_opt: 73943ae4701SAlex Elder if (ceph_opts) 74043ae4701SAlex Elder ceph_destroy_options(ceph_opts); 74137206ee5SAlex Elder dout("%s: error %d\n", __func__, ret); 74237206ee5SAlex Elder 74328f259b7SVasiliy Kulikov return ERR_PTR(ret); 744602adf40SYehuda Sadeh } 745602adf40SYehuda Sadeh 7462f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc) 7472f82ee54SAlex Elder { 7482f82ee54SAlex Elder kref_get(&rbdc->kref); 7492f82ee54SAlex Elder 7502f82ee54SAlex Elder return rbdc; 7512f82ee54SAlex Elder } 7522f82ee54SAlex Elder 753602adf40SYehuda Sadeh /* 7541f7ba331SAlex Elder * Find a ceph client with specific addr and configuration. If 7551f7ba331SAlex Elder * found, bump its reference count. 756602adf40SYehuda Sadeh */ 7571f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) 758602adf40SYehuda Sadeh { 759*3302ffd4SJakob Koschel struct rbd_client *rbdc = NULL, *iter; 760602adf40SYehuda Sadeh 76143ae4701SAlex Elder if (ceph_opts->flags & CEPH_OPT_NOSHARE) 762602adf40SYehuda Sadeh return NULL; 763602adf40SYehuda Sadeh 7641f7ba331SAlex Elder spin_lock(&rbd_client_list_lock); 765*3302ffd4SJakob Koschel list_for_each_entry(iter, &rbd_client_list, node) { 766*3302ffd4SJakob Koschel if (!ceph_compare_options(ceph_opts, iter->client)) { 767*3302ffd4SJakob Koschel __rbd_get_client(iter); 7682f82ee54SAlex Elder 769*3302ffd4SJakob Koschel rbdc = iter; 7701f7ba331SAlex Elder break; 7711f7ba331SAlex Elder } 7721f7ba331SAlex Elder } 7731f7ba331SAlex Elder spin_unlock(&rbd_client_list_lock); 7741f7ba331SAlex Elder 775*3302ffd4SJakob Koschel return rbdc; 776602adf40SYehuda Sadeh } 777602adf40SYehuda Sadeh 778602adf40SYehuda Sadeh /* 779210c104cSIlya Dryomov * (Per device) rbd map options 78059c2be1eSYehuda Sadeh */ 78159c2be1eSYehuda Sadeh enum { 782b5584180SIlya Dryomov Opt_queue_depth, 7830c93e1b7SIlya Dryomov Opt_alloc_size, 78434f55d0bSDongsheng Yang Opt_lock_timeout, 78559c2be1eSYehuda Sadeh /* int args above */ 786b26c047bSIlya Dryomov Opt_pool_ns, 787dc1dad8eSIlya Dryomov Opt_compression_hint, 78859c2be1eSYehuda Sadeh /* string args above */ 789cc0538b6SAlex Elder Opt_read_only, 790cc0538b6SAlex Elder Opt_read_write, 79180de1912SIlya Dryomov Opt_lock_on_read, 792e010dd0aSIlya Dryomov Opt_exclusive, 793d9360540SIlya Dryomov Opt_notrim, 79459c2be1eSYehuda Sadeh }; 79559c2be1eSYehuda Sadeh 796dc1dad8eSIlya Dryomov enum { 797dc1dad8eSIlya Dryomov Opt_compression_hint_none, 798dc1dad8eSIlya Dryomov Opt_compression_hint_compressible, 799dc1dad8eSIlya Dryomov Opt_compression_hint_incompressible, 800dc1dad8eSIlya Dryomov }; 801dc1dad8eSIlya Dryomov 802dc1dad8eSIlya Dryomov static const struct constant_table rbd_param_compression_hint[] = { 803dc1dad8eSIlya Dryomov {"none", Opt_compression_hint_none}, 804dc1dad8eSIlya Dryomov {"compressible", Opt_compression_hint_compressible}, 805dc1dad8eSIlya Dryomov {"incompressible", Opt_compression_hint_incompressible}, 806dc1dad8eSIlya Dryomov {} 807dc1dad8eSIlya Dryomov }; 808dc1dad8eSIlya Dryomov 809d7167b14SAl Viro static const struct fs_parameter_spec rbd_parameters[] = { 81082995cc6SDavid Howells fsparam_u32 ("alloc_size", Opt_alloc_size), 811dc1dad8eSIlya Dryomov fsparam_enum ("compression_hint", Opt_compression_hint, 812dc1dad8eSIlya Dryomov rbd_param_compression_hint), 81382995cc6SDavid Howells fsparam_flag ("exclusive", Opt_exclusive), 81482995cc6SDavid Howells fsparam_flag ("lock_on_read", Opt_lock_on_read), 81582995cc6SDavid Howells fsparam_u32 ("lock_timeout", Opt_lock_timeout), 81682995cc6SDavid Howells fsparam_flag ("notrim", Opt_notrim), 81782995cc6SDavid Howells fsparam_string ("_pool_ns", Opt_pool_ns), 81882995cc6SDavid Howells fsparam_u32 ("queue_depth", Opt_queue_depth), 81982995cc6SDavid Howells fsparam_flag ("read_only", Opt_read_only), 82082995cc6SDavid Howells fsparam_flag ("read_write", Opt_read_write), 82182995cc6SDavid Howells fsparam_flag ("ro", Opt_read_only), 82282995cc6SDavid Howells fsparam_flag ("rw", Opt_read_write), 82382995cc6SDavid Howells {} 82482995cc6SDavid Howells }; 82582995cc6SDavid Howells 82698571b5aSAlex Elder struct rbd_options { 827b5584180SIlya Dryomov int queue_depth; 8280c93e1b7SIlya Dryomov int alloc_size; 82934f55d0bSDongsheng Yang unsigned long lock_timeout; 83098571b5aSAlex Elder bool read_only; 83180de1912SIlya Dryomov bool lock_on_read; 832e010dd0aSIlya Dryomov bool exclusive; 833d9360540SIlya Dryomov bool trim; 834dc1dad8eSIlya Dryomov 835dc1dad8eSIlya Dryomov u32 alloc_hint_flags; /* CEPH_OSD_OP_ALLOC_HINT_FLAG_* */ 83698571b5aSAlex Elder }; 83798571b5aSAlex Elder 838d2a27964SJohn Garry #define RBD_QUEUE_DEPTH_DEFAULT BLKDEV_DEFAULT_RQ 8390c93e1b7SIlya Dryomov #define RBD_ALLOC_SIZE_DEFAULT (64 * 1024) 84034f55d0bSDongsheng Yang #define RBD_LOCK_TIMEOUT_DEFAULT 0 /* no timeout */ 84198571b5aSAlex Elder #define RBD_READ_ONLY_DEFAULT false 84280de1912SIlya Dryomov #define RBD_LOCK_ON_READ_DEFAULT false 843e010dd0aSIlya Dryomov #define RBD_EXCLUSIVE_DEFAULT false 844d9360540SIlya Dryomov #define RBD_TRIM_DEFAULT true 84598571b5aSAlex Elder 84682995cc6SDavid Howells struct rbd_parse_opts_ctx { 847c300156bSIlya Dryomov struct rbd_spec *spec; 84882995cc6SDavid Howells struct ceph_options *copts; 849c300156bSIlya Dryomov struct rbd_options *opts; 850c300156bSIlya Dryomov }; 851c300156bSIlya Dryomov 8526d2940c8SGuangliang Zhao static char* obj_op_name(enum obj_operation_type op_type) 8536d2940c8SGuangliang Zhao { 8546d2940c8SGuangliang Zhao switch (op_type) { 8556d2940c8SGuangliang Zhao case OBJ_OP_READ: 8566d2940c8SGuangliang Zhao return "read"; 8576d2940c8SGuangliang Zhao case OBJ_OP_WRITE: 8586d2940c8SGuangliang Zhao return "write"; 85990e98c52SGuangliang Zhao case OBJ_OP_DISCARD: 86090e98c52SGuangliang Zhao return "discard"; 8616484cbe9SIlya Dryomov case OBJ_OP_ZEROOUT: 8626484cbe9SIlya Dryomov return "zeroout"; 8636d2940c8SGuangliang Zhao default: 8646d2940c8SGuangliang Zhao return "???"; 8656d2940c8SGuangliang Zhao } 8666d2940c8SGuangliang Zhao } 8676d2940c8SGuangliang Zhao 86859c2be1eSYehuda Sadeh /* 869602adf40SYehuda Sadeh * Destroy ceph client 870d23a4b3fSAlex Elder * 871432b8587SAlex Elder * Caller must hold rbd_client_list_lock. 872602adf40SYehuda Sadeh */ 873602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref) 874602adf40SYehuda Sadeh { 875602adf40SYehuda Sadeh struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); 876602adf40SYehuda Sadeh 87737206ee5SAlex Elder dout("%s: rbdc %p\n", __func__, rbdc); 878cd9d9f5dSAlex Elder spin_lock(&rbd_client_list_lock); 879602adf40SYehuda Sadeh list_del(&rbdc->node); 880cd9d9f5dSAlex Elder spin_unlock(&rbd_client_list_lock); 881602adf40SYehuda Sadeh 882602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 883602adf40SYehuda Sadeh kfree(rbdc); 884602adf40SYehuda Sadeh } 885602adf40SYehuda Sadeh 886602adf40SYehuda Sadeh /* 887602adf40SYehuda Sadeh * Drop reference to ceph client node. If it's not referenced anymore, release 888602adf40SYehuda Sadeh * it. 889602adf40SYehuda Sadeh */ 8909d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc) 891602adf40SYehuda Sadeh { 892c53d5893SAlex Elder if (rbdc) 8939d3997fdSAlex Elder kref_put(&rbdc->kref, rbd_client_release); 894602adf40SYehuda Sadeh } 895602adf40SYehuda Sadeh 8965feb0d8dSIlya Dryomov /* 8975feb0d8dSIlya Dryomov * Get a ceph client with specific addr and configuration, if one does 8985feb0d8dSIlya Dryomov * not exist create it. Either way, ceph_opts is consumed by this 8995feb0d8dSIlya Dryomov * function. 9005feb0d8dSIlya Dryomov */ 9015feb0d8dSIlya Dryomov static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) 9025feb0d8dSIlya Dryomov { 9035feb0d8dSIlya Dryomov struct rbd_client *rbdc; 904dd435855SIlya Dryomov int ret; 9055feb0d8dSIlya Dryomov 906a32e4143SIlya Dryomov mutex_lock(&client_mutex); 9075feb0d8dSIlya Dryomov rbdc = rbd_client_find(ceph_opts); 908dd435855SIlya Dryomov if (rbdc) { 9095feb0d8dSIlya Dryomov ceph_destroy_options(ceph_opts); 910dd435855SIlya Dryomov 911dd435855SIlya Dryomov /* 912dd435855SIlya Dryomov * Using an existing client. Make sure ->pg_pools is up to 913dd435855SIlya Dryomov * date before we look up the pool id in do_rbd_add(). 914dd435855SIlya Dryomov */ 9159d4a227fSIlya Dryomov ret = ceph_wait_for_latest_osdmap(rbdc->client, 9169d4a227fSIlya Dryomov rbdc->client->options->mount_timeout); 917dd435855SIlya Dryomov if (ret) { 918dd435855SIlya Dryomov rbd_warn(NULL, "failed to get latest osdmap: %d", ret); 919dd435855SIlya Dryomov rbd_put_client(rbdc); 920dd435855SIlya Dryomov rbdc = ERR_PTR(ret); 921dd435855SIlya Dryomov } 922dd435855SIlya Dryomov } else { 9235feb0d8dSIlya Dryomov rbdc = rbd_client_create(ceph_opts); 924dd435855SIlya Dryomov } 9255feb0d8dSIlya Dryomov mutex_unlock(&client_mutex); 9265feb0d8dSIlya Dryomov 9275feb0d8dSIlya Dryomov return rbdc; 9285feb0d8dSIlya Dryomov } 9295feb0d8dSIlya Dryomov 930a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format) 931a30b71b9SAlex Elder { 932a30b71b9SAlex Elder return image_format == 1 || image_format == 2; 933a30b71b9SAlex Elder } 934a30b71b9SAlex Elder 9358e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) 9368e94af8eSAlex Elder { 937103a150fSAlex Elder size_t size; 938103a150fSAlex Elder u32 snap_count; 939103a150fSAlex Elder 940103a150fSAlex Elder /* The header has to start with the magic rbd header text */ 941103a150fSAlex Elder if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT))) 942103a150fSAlex Elder return false; 943103a150fSAlex Elder 944db2388b6SAlex Elder /* The bio layer requires at least sector-sized I/O */ 945db2388b6SAlex Elder 946db2388b6SAlex Elder if (ondisk->options.order < SECTOR_SHIFT) 947db2388b6SAlex Elder return false; 948db2388b6SAlex Elder 949db2388b6SAlex Elder /* If we use u64 in a few spots we may be able to loosen this */ 950db2388b6SAlex Elder 951db2388b6SAlex Elder if (ondisk->options.order > 8 * sizeof (int) - 1) 952db2388b6SAlex Elder return false; 953db2388b6SAlex Elder 954103a150fSAlex Elder /* 955103a150fSAlex Elder * The size of a snapshot header has to fit in a size_t, and 956103a150fSAlex Elder * that limits the number of snapshots. 957103a150fSAlex Elder */ 958103a150fSAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 959103a150fSAlex Elder size = SIZE_MAX - sizeof (struct ceph_snap_context); 960103a150fSAlex Elder if (snap_count > size / sizeof (__le64)) 961103a150fSAlex Elder return false; 962103a150fSAlex Elder 963103a150fSAlex Elder /* 964103a150fSAlex Elder * Not only that, but the size of the entire the snapshot 965103a150fSAlex Elder * header must also be representable in a size_t. 966103a150fSAlex Elder */ 967103a150fSAlex Elder size -= snap_count * sizeof (__le64); 968103a150fSAlex Elder if ((u64) size < le64_to_cpu(ondisk->snap_names_len)) 969103a150fSAlex Elder return false; 970103a150fSAlex Elder 971103a150fSAlex Elder return true; 9728e94af8eSAlex Elder } 9738e94af8eSAlex Elder 974602adf40SYehuda Sadeh /* 9755bc3fb17SIlya Dryomov * returns the size of an object in the image 9765bc3fb17SIlya Dryomov */ 9775bc3fb17SIlya Dryomov static u32 rbd_obj_bytes(struct rbd_image_header *header) 9785bc3fb17SIlya Dryomov { 9795bc3fb17SIlya Dryomov return 1U << header->obj_order; 9805bc3fb17SIlya Dryomov } 9815bc3fb17SIlya Dryomov 982263423f8SIlya Dryomov static void rbd_init_layout(struct rbd_device *rbd_dev) 983263423f8SIlya Dryomov { 984263423f8SIlya Dryomov if (rbd_dev->header.stripe_unit == 0 || 985263423f8SIlya Dryomov rbd_dev->header.stripe_count == 0) { 986263423f8SIlya Dryomov rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header); 987263423f8SIlya Dryomov rbd_dev->header.stripe_count = 1; 988263423f8SIlya Dryomov } 989263423f8SIlya Dryomov 990263423f8SIlya Dryomov rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit; 991263423f8SIlya Dryomov rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count; 992263423f8SIlya Dryomov rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header); 9937e97332eSIlya Dryomov rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ? 9947e97332eSIlya Dryomov rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id; 995263423f8SIlya Dryomov RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL); 996263423f8SIlya Dryomov } 997263423f8SIlya Dryomov 9985bc3fb17SIlya Dryomov /* 999bb23e37aSAlex Elder * Fill an rbd image header with information from the given format 1 1000bb23e37aSAlex Elder * on-disk header. 1001602adf40SYehuda Sadeh */ 1002662518b1SAlex Elder static int rbd_header_from_disk(struct rbd_device *rbd_dev, 10034156d998SAlex Elder struct rbd_image_header_ondisk *ondisk) 1004602adf40SYehuda Sadeh { 1005662518b1SAlex Elder struct rbd_image_header *header = &rbd_dev->header; 1006bb23e37aSAlex Elder bool first_time = header->object_prefix == NULL; 1007bb23e37aSAlex Elder struct ceph_snap_context *snapc; 1008bb23e37aSAlex Elder char *object_prefix = NULL; 1009bb23e37aSAlex Elder char *snap_names = NULL; 1010bb23e37aSAlex Elder u64 *snap_sizes = NULL; 1011ccece235SAlex Elder u32 snap_count; 1012bb23e37aSAlex Elder int ret = -ENOMEM; 1013621901d6SAlex Elder u32 i; 1014602adf40SYehuda Sadeh 1015bb23e37aSAlex Elder /* Allocate this now to avoid having to handle failure below */ 1016103a150fSAlex Elder 1017bb23e37aSAlex Elder if (first_time) { 1018848d796cSIlya Dryomov object_prefix = kstrndup(ondisk->object_prefix, 1019848d796cSIlya Dryomov sizeof(ondisk->object_prefix), 1020848d796cSIlya Dryomov GFP_KERNEL); 1021bb23e37aSAlex Elder if (!object_prefix) 1022602adf40SYehuda Sadeh return -ENOMEM; 1023bb23e37aSAlex Elder } 102400f1f36fSAlex Elder 1025bb23e37aSAlex Elder /* Allocate the snapshot context and fill it in */ 1026d2bb24e5SAlex Elder 1027602adf40SYehuda Sadeh snap_count = le32_to_cpu(ondisk->snap_count); 1028bb23e37aSAlex Elder snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); 1029bb23e37aSAlex Elder if (!snapc) 1030bb23e37aSAlex Elder goto out_err; 1031bb23e37aSAlex Elder snapc->seq = le64_to_cpu(ondisk->snap_seq); 1032602adf40SYehuda Sadeh if (snap_count) { 1033bb23e37aSAlex Elder struct rbd_image_snap_ondisk *snaps; 1034f785cc1dSAlex Elder u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len); 1035f785cc1dSAlex Elder 1036bb23e37aSAlex Elder /* We'll keep a copy of the snapshot names... */ 1037621901d6SAlex Elder 1038f785cc1dSAlex Elder if (snap_names_len > (u64)SIZE_MAX) 1039bb23e37aSAlex Elder goto out_2big; 1040bb23e37aSAlex Elder snap_names = kmalloc(snap_names_len, GFP_KERNEL); 1041bb23e37aSAlex Elder if (!snap_names) 1042602adf40SYehuda Sadeh goto out_err; 1043bb23e37aSAlex Elder 1044bb23e37aSAlex Elder /* ...as well as the array of their sizes. */ 104588a25a5fSMarkus Elfring snap_sizes = kmalloc_array(snap_count, 104688a25a5fSMarkus Elfring sizeof(*header->snap_sizes), 104788a25a5fSMarkus Elfring GFP_KERNEL); 1048bb23e37aSAlex Elder if (!snap_sizes) 1049bb23e37aSAlex Elder goto out_err; 1050bb23e37aSAlex Elder 1051f785cc1dSAlex Elder /* 1052bb23e37aSAlex Elder * Copy the names, and fill in each snapshot's id 1053bb23e37aSAlex Elder * and size. 1054bb23e37aSAlex Elder * 105599a41ebcSAlex Elder * Note that rbd_dev_v1_header_info() guarantees the 1056bb23e37aSAlex Elder * ondisk buffer we're working with has 1057f785cc1dSAlex Elder * snap_names_len bytes beyond the end of the 1058f785cc1dSAlex Elder * snapshot id array, this memcpy() is safe. 1059f785cc1dSAlex Elder */ 1060bb23e37aSAlex Elder memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len); 1061bb23e37aSAlex Elder snaps = ondisk->snaps; 1062bb23e37aSAlex Elder for (i = 0; i < snap_count; i++) { 1063bb23e37aSAlex Elder snapc->snaps[i] = le64_to_cpu(snaps[i].id); 1064bb23e37aSAlex Elder snap_sizes[i] = le64_to_cpu(snaps[i].image_size); 1065bb23e37aSAlex Elder } 1066602adf40SYehuda Sadeh } 1067849b4260SAlex Elder 1068bb23e37aSAlex Elder /* We won't fail any more, fill in the header */ 1069bb23e37aSAlex Elder 1070bb23e37aSAlex Elder if (first_time) { 1071bb23e37aSAlex Elder header->object_prefix = object_prefix; 1072602adf40SYehuda Sadeh header->obj_order = ondisk->options.order; 1073263423f8SIlya Dryomov rbd_init_layout(rbd_dev); 1074662518b1SAlex Elder } else { 1075662518b1SAlex Elder ceph_put_snap_context(header->snapc); 1076662518b1SAlex Elder kfree(header->snap_names); 1077662518b1SAlex Elder kfree(header->snap_sizes); 1078bb23e37aSAlex Elder } 10796a52325fSAlex Elder 1080bb23e37aSAlex Elder /* The remaining fields always get updated (when we refresh) */ 1081621901d6SAlex Elder 1082f84344f3SAlex Elder header->image_size = le64_to_cpu(ondisk->image_size); 1083bb23e37aSAlex Elder header->snapc = snapc; 1084bb23e37aSAlex Elder header->snap_names = snap_names; 1085bb23e37aSAlex Elder header->snap_sizes = snap_sizes; 1086468521c1SAlex Elder 1087602adf40SYehuda Sadeh return 0; 1088bb23e37aSAlex Elder out_2big: 1089bb23e37aSAlex Elder ret = -EIO; 10906a52325fSAlex Elder out_err: 1091bb23e37aSAlex Elder kfree(snap_sizes); 1092bb23e37aSAlex Elder kfree(snap_names); 1093bb23e37aSAlex Elder ceph_put_snap_context(snapc); 1094bb23e37aSAlex Elder kfree(object_prefix); 1095ccece235SAlex Elder 1096bb23e37aSAlex Elder return ret; 1097602adf40SYehuda Sadeh } 1098602adf40SYehuda Sadeh 10999682fc6dSAlex Elder static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which) 11009682fc6dSAlex Elder { 11019682fc6dSAlex Elder const char *snap_name; 11029682fc6dSAlex Elder 11039682fc6dSAlex Elder rbd_assert(which < rbd_dev->header.snapc->num_snaps); 11049682fc6dSAlex Elder 11059682fc6dSAlex Elder /* Skip over names until we find the one we are looking for */ 11069682fc6dSAlex Elder 11079682fc6dSAlex Elder snap_name = rbd_dev->header.snap_names; 11089682fc6dSAlex Elder while (which--) 11099682fc6dSAlex Elder snap_name += strlen(snap_name) + 1; 11109682fc6dSAlex Elder 11119682fc6dSAlex Elder return kstrdup(snap_name, GFP_KERNEL); 11129682fc6dSAlex Elder } 11139682fc6dSAlex Elder 111430d1cff8SAlex Elder /* 111530d1cff8SAlex Elder * Snapshot id comparison function for use with qsort()/bsearch(). 111630d1cff8SAlex Elder * Note that result is for snapshots in *descending* order. 111730d1cff8SAlex Elder */ 111830d1cff8SAlex Elder static int snapid_compare_reverse(const void *s1, const void *s2) 111930d1cff8SAlex Elder { 112030d1cff8SAlex Elder u64 snap_id1 = *(u64 *)s1; 112130d1cff8SAlex Elder u64 snap_id2 = *(u64 *)s2; 112230d1cff8SAlex Elder 112330d1cff8SAlex Elder if (snap_id1 < snap_id2) 112430d1cff8SAlex Elder return 1; 112530d1cff8SAlex Elder return snap_id1 == snap_id2 ? 0 : -1; 112630d1cff8SAlex Elder } 112730d1cff8SAlex Elder 112830d1cff8SAlex Elder /* 112930d1cff8SAlex Elder * Search a snapshot context to see if the given snapshot id is 113030d1cff8SAlex Elder * present. 113130d1cff8SAlex Elder * 113230d1cff8SAlex Elder * Returns the position of the snapshot id in the array if it's found, 113330d1cff8SAlex Elder * or BAD_SNAP_INDEX otherwise. 113430d1cff8SAlex Elder * 113530d1cff8SAlex Elder * Note: The snapshot array is in kept sorted (by the osd) in 113630d1cff8SAlex Elder * reverse order, highest snapshot id first. 113730d1cff8SAlex Elder */ 11389682fc6dSAlex Elder static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id) 11399682fc6dSAlex Elder { 11409682fc6dSAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 114130d1cff8SAlex Elder u64 *found; 11429682fc6dSAlex Elder 114330d1cff8SAlex Elder found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps, 114430d1cff8SAlex Elder sizeof (snap_id), snapid_compare_reverse); 11459682fc6dSAlex Elder 114630d1cff8SAlex Elder return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX; 11479682fc6dSAlex Elder } 11489682fc6dSAlex Elder 11492ad3d716SAlex Elder static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, 11502ad3d716SAlex Elder u64 snap_id) 115154cac61fSAlex Elder { 115254cac61fSAlex Elder u32 which; 1153da6a6b63SJosh Durgin const char *snap_name; 115454cac61fSAlex Elder 115554cac61fSAlex Elder which = rbd_dev_snap_index(rbd_dev, snap_id); 115654cac61fSAlex Elder if (which == BAD_SNAP_INDEX) 1157da6a6b63SJosh Durgin return ERR_PTR(-ENOENT); 115854cac61fSAlex Elder 1159da6a6b63SJosh Durgin snap_name = _rbd_dev_v1_snap_name(rbd_dev, which); 1160da6a6b63SJosh Durgin return snap_name ? snap_name : ERR_PTR(-ENOMEM); 116154cac61fSAlex Elder } 116254cac61fSAlex Elder 11639e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id) 11649e15b77dSAlex Elder { 11659e15b77dSAlex Elder if (snap_id == CEPH_NOSNAP) 11669e15b77dSAlex Elder return RBD_SNAP_HEAD_NAME; 11679e15b77dSAlex Elder 116854cac61fSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 116954cac61fSAlex Elder if (rbd_dev->image_format == 1) 117054cac61fSAlex Elder return rbd_dev_v1_snap_name(rbd_dev, snap_id); 11719e15b77dSAlex Elder 117254cac61fSAlex Elder return rbd_dev_v2_snap_name(rbd_dev, snap_id); 11739e15b77dSAlex Elder } 11749e15b77dSAlex Elder 11752ad3d716SAlex Elder static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 11762ad3d716SAlex Elder u64 *snap_size) 1177602adf40SYehuda Sadeh { 11782ad3d716SAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 11792ad3d716SAlex Elder if (snap_id == CEPH_NOSNAP) { 11802ad3d716SAlex Elder *snap_size = rbd_dev->header.image_size; 11812ad3d716SAlex Elder } else if (rbd_dev->image_format == 1) { 11822ad3d716SAlex Elder u32 which; 118300f1f36fSAlex Elder 11842ad3d716SAlex Elder which = rbd_dev_snap_index(rbd_dev, snap_id); 11852ad3d716SAlex Elder if (which == BAD_SNAP_INDEX) 11862ad3d716SAlex Elder return -ENOENT; 118700f1f36fSAlex Elder 11882ad3d716SAlex Elder *snap_size = rbd_dev->header.snap_sizes[which]; 11892ad3d716SAlex Elder } else { 11902ad3d716SAlex Elder u64 size = 0; 11912ad3d716SAlex Elder int ret; 11922ad3d716SAlex Elder 11932ad3d716SAlex Elder ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size); 11942ad3d716SAlex Elder if (ret) 11952ad3d716SAlex Elder return ret; 11962ad3d716SAlex Elder 11972ad3d716SAlex Elder *snap_size = size; 11982ad3d716SAlex Elder } 11992ad3d716SAlex Elder return 0; 12002ad3d716SAlex Elder } 12012ad3d716SAlex Elder 1202d1cf5788SAlex Elder static int rbd_dev_mapping_set(struct rbd_device *rbd_dev) 1203602adf40SYehuda Sadeh { 12048f4b7d98SAlex Elder u64 snap_id = rbd_dev->spec->snap_id; 12052ad3d716SAlex Elder u64 size = 0; 12062ad3d716SAlex Elder int ret; 12078b0241f8SAlex Elder 12082ad3d716SAlex Elder ret = rbd_snap_size(rbd_dev, snap_id, &size); 12092ad3d716SAlex Elder if (ret) 12102ad3d716SAlex Elder return ret; 12112ad3d716SAlex Elder 12122ad3d716SAlex Elder rbd_dev->mapping.size = size; 12138b0241f8SAlex Elder return 0; 1214602adf40SYehuda Sadeh } 1215602adf40SYehuda Sadeh 1216d1cf5788SAlex Elder static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev) 1217d1cf5788SAlex Elder { 1218d1cf5788SAlex Elder rbd_dev->mapping.size = 0; 1219200a6a8bSAlex Elder } 1220200a6a8bSAlex Elder 12215359a17dSIlya Dryomov static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes) 1222b9434c5bSAlex Elder { 12235359a17dSIlya Dryomov struct ceph_bio_iter it = *bio_pos; 1224b9434c5bSAlex Elder 12255359a17dSIlya Dryomov ceph_bio_iter_advance(&it, off); 12265359a17dSIlya Dryomov ceph_bio_iter_advance_step(&it, bytes, ({ 1227732022b8SChristoph Hellwig memzero_bvec(&bv); 12285359a17dSIlya Dryomov })); 1229b9434c5bSAlex Elder } 1230b9434c5bSAlex Elder 12317e07efb1SIlya Dryomov static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes) 1232602adf40SYehuda Sadeh { 12337e07efb1SIlya Dryomov struct ceph_bvec_iter it = *bvec_pos; 1234602adf40SYehuda Sadeh 12357e07efb1SIlya Dryomov ceph_bvec_iter_advance(&it, off); 12367e07efb1SIlya Dryomov ceph_bvec_iter_advance_step(&it, bytes, ({ 1237732022b8SChristoph Hellwig memzero_bvec(&bv); 12387e07efb1SIlya Dryomov })); 1239602adf40SYehuda Sadeh } 1240602adf40SYehuda Sadeh 1241f7760dadSAlex Elder /* 12423da691bfSIlya Dryomov * Zero a range in @obj_req data buffer defined by a bio (list) or 1243afb97888SIlya Dryomov * (private) bio_vec array. 1244f7760dadSAlex Elder * 12453da691bfSIlya Dryomov * @off is relative to the start of the data buffer. 1246f7760dadSAlex Elder */ 12473da691bfSIlya Dryomov static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off, 12483da691bfSIlya Dryomov u32 bytes) 1249f7760dadSAlex Elder { 125054ab3b24SIlya Dryomov dout("%s %p data buf %u~%u\n", __func__, obj_req, off, bytes); 125154ab3b24SIlya Dryomov 1252ecc633caSIlya Dryomov switch (obj_req->img_request->data_type) { 12533da691bfSIlya Dryomov case OBJ_REQUEST_BIO: 12543da691bfSIlya Dryomov zero_bios(&obj_req->bio_pos, off, bytes); 12553da691bfSIlya Dryomov break; 12563da691bfSIlya Dryomov case OBJ_REQUEST_BVECS: 1257afb97888SIlya Dryomov case OBJ_REQUEST_OWN_BVECS: 12583da691bfSIlya Dryomov zero_bvecs(&obj_req->bvec_pos, off, bytes); 12593da691bfSIlya Dryomov break; 12603da691bfSIlya Dryomov default: 126116809372SArnd Bergmann BUG(); 1262f5400b7aSAlex Elder } 1263bf0d5f50SAlex Elder } 1264bf0d5f50SAlex Elder 1265bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref); 1266bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request) 1267bf0d5f50SAlex Elder { 1268bf0d5f50SAlex Elder rbd_assert(obj_request != NULL); 126937206ee5SAlex Elder dout("%s: obj %p (was %d)\n", __func__, obj_request, 12702c935bc5SPeter Zijlstra kref_read(&obj_request->kref)); 1271bf0d5f50SAlex Elder kref_put(&obj_request->kref, rbd_obj_request_destroy); 1272bf0d5f50SAlex Elder } 1273bf0d5f50SAlex Elder 1274bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, 1275bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1276bf0d5f50SAlex Elder { 127725dcf954SAlex Elder rbd_assert(obj_request->img_request == NULL); 127825dcf954SAlex Elder 1279b155e86cSAlex Elder /* Image request now owns object's original reference */ 1280bf0d5f50SAlex Elder obj_request->img_request = img_request; 128115961b44SIlya Dryomov dout("%s: img %p obj %p\n", __func__, img_request, obj_request); 1282bf0d5f50SAlex Elder } 1283bf0d5f50SAlex Elder 1284bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request, 1285bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1286bf0d5f50SAlex Elder { 128715961b44SIlya Dryomov dout("%s: img %p obj %p\n", __func__, img_request, obj_request); 128843df3d35SIlya Dryomov list_del(&obj_request->ex.oe_item); 1289bf0d5f50SAlex Elder rbd_assert(obj_request->img_request == img_request); 1290bf0d5f50SAlex Elder rbd_obj_request_put(obj_request); 1291bf0d5f50SAlex Elder } 1292bf0d5f50SAlex Elder 1293a086a1b8SIlya Dryomov static void rbd_osd_submit(struct ceph_osd_request *osd_req) 1294bf0d5f50SAlex Elder { 1295a086a1b8SIlya Dryomov struct rbd_obj_request *obj_req = osd_req->r_priv; 1296980917fcSIlya Dryomov 1297a086a1b8SIlya Dryomov dout("%s osd_req %p for obj_req %p objno %llu %llu~%llu\n", 1298a086a1b8SIlya Dryomov __func__, osd_req, obj_req, obj_req->ex.oe_objno, 1299a086a1b8SIlya Dryomov obj_req->ex.oe_off, obj_req->ex.oe_len); 1300980917fcSIlya Dryomov ceph_osdc_start_request(osd_req->r_osdc, osd_req, false); 1301bf0d5f50SAlex Elder } 1302bf0d5f50SAlex Elder 13030c425248SAlex Elder /* 13040c425248SAlex Elder * The default/initial value for all image request flags is 0. Each 13050c425248SAlex Elder * is conditionally set to 1 at image request initialization time 13060c425248SAlex Elder * and currently never change thereafter. 13070c425248SAlex Elder */ 1308d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request) 1309d0b2e944SAlex Elder { 1310d0b2e944SAlex Elder set_bit(IMG_REQ_LAYERED, &img_request->flags); 1311d0b2e944SAlex Elder } 1312d0b2e944SAlex Elder 1313d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request) 1314d0b2e944SAlex Elder { 1315d0b2e944SAlex Elder return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0; 1316d0b2e944SAlex Elder } 1317d0b2e944SAlex Elder 13183da691bfSIlya Dryomov static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req) 13193b434a2aSJosh Durgin { 13203da691bfSIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 13213da691bfSIlya Dryomov 132243df3d35SIlya Dryomov return !obj_req->ex.oe_off && 132343df3d35SIlya Dryomov obj_req->ex.oe_len == rbd_dev->layout.object_size; 13243b434a2aSJosh Durgin } 13253b434a2aSJosh Durgin 13263da691bfSIlya Dryomov static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req) 13276e2a4505SAlex Elder { 13283da691bfSIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 1329b9434c5bSAlex Elder 133043df3d35SIlya Dryomov return obj_req->ex.oe_off + obj_req->ex.oe_len == 13313da691bfSIlya Dryomov rbd_dev->layout.object_size; 13326e2a4505SAlex Elder } 13336e2a4505SAlex Elder 133413488d53SIlya Dryomov /* 133513488d53SIlya Dryomov * Must be called after rbd_obj_calc_img_extents(). 133613488d53SIlya Dryomov */ 133713488d53SIlya Dryomov static bool rbd_obj_copyup_enabled(struct rbd_obj_request *obj_req) 133813488d53SIlya Dryomov { 133913488d53SIlya Dryomov if (!obj_req->num_img_extents || 13409b17eb2cSIlya Dryomov (rbd_obj_is_entire(obj_req) && 13419b17eb2cSIlya Dryomov !obj_req->img_request->snapc->num_snaps)) 134213488d53SIlya Dryomov return false; 134313488d53SIlya Dryomov 134413488d53SIlya Dryomov return true; 134513488d53SIlya Dryomov } 134613488d53SIlya Dryomov 134786bd7998SIlya Dryomov static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req) 1348bf0d5f50SAlex Elder { 134986bd7998SIlya Dryomov return ceph_file_extents_bytes(obj_req->img_extents, 135086bd7998SIlya Dryomov obj_req->num_img_extents); 1351bf0d5f50SAlex Elder } 1352bf0d5f50SAlex Elder 13533da691bfSIlya Dryomov static bool rbd_img_is_write(struct rbd_img_request *img_req) 13540dcc685eSIlya Dryomov { 13559bb0248dSIlya Dryomov switch (img_req->op_type) { 13563da691bfSIlya Dryomov case OBJ_OP_READ: 13573da691bfSIlya Dryomov return false; 13583da691bfSIlya Dryomov case OBJ_OP_WRITE: 13593da691bfSIlya Dryomov case OBJ_OP_DISCARD: 13606484cbe9SIlya Dryomov case OBJ_OP_ZEROOUT: 13613da691bfSIlya Dryomov return true; 13623da691bfSIlya Dryomov default: 1363c6244b3bSArnd Bergmann BUG(); 13640dcc685eSIlya Dryomov } 13650dcc685eSIlya Dryomov } 13660dcc685eSIlya Dryomov 136785e084feSIlya Dryomov static void rbd_osd_req_callback(struct ceph_osd_request *osd_req) 1368bf0d5f50SAlex Elder { 13693da691bfSIlya Dryomov struct rbd_obj_request *obj_req = osd_req->r_priv; 137054ab3b24SIlya Dryomov int result; 1371bf0d5f50SAlex Elder 13723da691bfSIlya Dryomov dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req, 13733da691bfSIlya Dryomov osd_req->r_result, obj_req); 1374bf0d5f50SAlex Elder 1375c47f9371SAlex Elder /* 13763da691bfSIlya Dryomov * Writes aren't allowed to return a data payload. In some 13773da691bfSIlya Dryomov * guarded write cases (e.g. stat + zero on an empty object) 13783da691bfSIlya Dryomov * a stat response makes it through, but we don't care. 1379c47f9371SAlex Elder */ 138054ab3b24SIlya Dryomov if (osd_req->r_result > 0 && rbd_img_is_write(obj_req->img_request)) 138154ab3b24SIlya Dryomov result = 0; 138254ab3b24SIlya Dryomov else 138354ab3b24SIlya Dryomov result = osd_req->r_result; 13840ccd5926SIlya Dryomov 138554ab3b24SIlya Dryomov rbd_obj_handle_request(obj_req, result); 1386bf0d5f50SAlex Elder } 1387bf0d5f50SAlex Elder 1388bcbab1dbSIlya Dryomov static void rbd_osd_format_read(struct ceph_osd_request *osd_req) 1389430c28c3SAlex Elder { 1390bcbab1dbSIlya Dryomov struct rbd_obj_request *obj_request = osd_req->r_priv; 139122d2cfdfSIlya Dryomov struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev; 139222d2cfdfSIlya Dryomov struct ceph_options *opt = rbd_dev->rbd_client->client->options; 1393430c28c3SAlex Elder 139422d2cfdfSIlya Dryomov osd_req->r_flags = CEPH_OSD_FLAG_READ | opt->read_from_replica; 13957c84883aSIlya Dryomov osd_req->r_snapid = obj_request->img_request->snap_id; 13969d4df01fSAlex Elder } 13979d4df01fSAlex Elder 1398bcbab1dbSIlya Dryomov static void rbd_osd_format_write(struct ceph_osd_request *osd_req) 13999d4df01fSAlex Elder { 1400bcbab1dbSIlya Dryomov struct rbd_obj_request *obj_request = osd_req->r_priv; 14019d4df01fSAlex Elder 1402a162b308SIlya Dryomov osd_req->r_flags = CEPH_OSD_FLAG_WRITE; 1403fac02ddfSArnd Bergmann ktime_get_real_ts64(&osd_req->r_mtime); 140443df3d35SIlya Dryomov osd_req->r_data_offset = obj_request->ex.oe_off; 1405430c28c3SAlex Elder } 1406430c28c3SAlex Elder 1407bc81207eSIlya Dryomov static struct ceph_osd_request * 1408bcbab1dbSIlya Dryomov __rbd_obj_add_osd_request(struct rbd_obj_request *obj_req, 1409bcbab1dbSIlya Dryomov struct ceph_snap_context *snapc, int num_ops) 1410bc81207eSIlya Dryomov { 1411e28eded5SIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 1412bc81207eSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 1413bc81207eSIlya Dryomov struct ceph_osd_request *req; 1414a90bb0c1SIlya Dryomov const char *name_format = rbd_dev->image_format == 1 ? 1415a90bb0c1SIlya Dryomov RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT; 1416bcbab1dbSIlya Dryomov int ret; 1417bc81207eSIlya Dryomov 1418e28eded5SIlya Dryomov req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO); 1419bc81207eSIlya Dryomov if (!req) 1420bcbab1dbSIlya Dryomov return ERR_PTR(-ENOMEM); 1421bc81207eSIlya Dryomov 1422bcbab1dbSIlya Dryomov list_add_tail(&req->r_private_item, &obj_req->osd_reqs); 1423bc81207eSIlya Dryomov req->r_callback = rbd_osd_req_callback; 1424a162b308SIlya Dryomov req->r_priv = obj_req; 1425bc81207eSIlya Dryomov 1426b26c047bSIlya Dryomov /* 1427b26c047bSIlya Dryomov * Data objects may be stored in a separate pool, but always in 1428b26c047bSIlya Dryomov * the same namespace in that pool as the header in its pool. 1429b26c047bSIlya Dryomov */ 1430b26c047bSIlya Dryomov ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc); 1431bc81207eSIlya Dryomov req->r_base_oloc.pool = rbd_dev->layout.pool_id; 1432b26c047bSIlya Dryomov 1433bcbab1dbSIlya Dryomov ret = ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format, 1434bcbab1dbSIlya Dryomov rbd_dev->header.object_prefix, 1435bcbab1dbSIlya Dryomov obj_req->ex.oe_objno); 1436bcbab1dbSIlya Dryomov if (ret) 1437bcbab1dbSIlya Dryomov return ERR_PTR(ret); 1438bc81207eSIlya Dryomov 1439bc81207eSIlya Dryomov return req; 1440bc81207eSIlya Dryomov } 1441bc81207eSIlya Dryomov 1442e28eded5SIlya Dryomov static struct ceph_osd_request * 1443bcbab1dbSIlya Dryomov rbd_obj_add_osd_request(struct rbd_obj_request *obj_req, int num_ops) 1444e28eded5SIlya Dryomov { 1445bcbab1dbSIlya Dryomov return __rbd_obj_add_osd_request(obj_req, obj_req->img_request->snapc, 1446e28eded5SIlya Dryomov num_ops); 1447e28eded5SIlya Dryomov } 1448e28eded5SIlya Dryomov 1449ecc633caSIlya Dryomov static struct rbd_obj_request *rbd_obj_request_create(void) 1450bf0d5f50SAlex Elder { 1451bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 1452bf0d5f50SAlex Elder 14535a60e876SIlya Dryomov obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO); 14546c696d85SIlya Dryomov if (!obj_request) 1455f907ad55SAlex Elder return NULL; 1456f907ad55SAlex Elder 145743df3d35SIlya Dryomov ceph_object_extent_init(&obj_request->ex); 1458bcbab1dbSIlya Dryomov INIT_LIST_HEAD(&obj_request->osd_reqs); 145985b5e6d1SIlya Dryomov mutex_init(&obj_request->state_mutex); 1460bf0d5f50SAlex Elder kref_init(&obj_request->kref); 1461bf0d5f50SAlex Elder 146267e2b652SIlya Dryomov dout("%s %p\n", __func__, obj_request); 1463bf0d5f50SAlex Elder return obj_request; 1464bf0d5f50SAlex Elder } 1465bf0d5f50SAlex Elder 1466bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref) 1467bf0d5f50SAlex Elder { 1468bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 1469bcbab1dbSIlya Dryomov struct ceph_osd_request *osd_req; 14707e07efb1SIlya Dryomov u32 i; 1471bf0d5f50SAlex Elder 1472bf0d5f50SAlex Elder obj_request = container_of(kref, struct rbd_obj_request, kref); 1473bf0d5f50SAlex Elder 147437206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request); 147537206ee5SAlex Elder 1476bcbab1dbSIlya Dryomov while (!list_empty(&obj_request->osd_reqs)) { 1477bcbab1dbSIlya Dryomov osd_req = list_first_entry(&obj_request->osd_reqs, 1478bcbab1dbSIlya Dryomov struct ceph_osd_request, r_private_item); 1479bcbab1dbSIlya Dryomov list_del_init(&osd_req->r_private_item); 1480bcbab1dbSIlya Dryomov ceph_osdc_put_request(osd_req); 1481bcbab1dbSIlya Dryomov } 1482bf0d5f50SAlex Elder 1483ecc633caSIlya Dryomov switch (obj_request->img_request->data_type) { 14849969ebc5SAlex Elder case OBJ_REQUEST_NODATA: 1485bf0d5f50SAlex Elder case OBJ_REQUEST_BIO: 14867e07efb1SIlya Dryomov case OBJ_REQUEST_BVECS: 14875359a17dSIlya Dryomov break; /* Nothing to do */ 1488afb97888SIlya Dryomov case OBJ_REQUEST_OWN_BVECS: 1489afb97888SIlya Dryomov kfree(obj_request->bvec_pos.bvecs); 1490bf0d5f50SAlex Elder break; 14917e07efb1SIlya Dryomov default: 149216809372SArnd Bergmann BUG(); 1493bf0d5f50SAlex Elder } 1494bf0d5f50SAlex Elder 149586bd7998SIlya Dryomov kfree(obj_request->img_extents); 14967e07efb1SIlya Dryomov if (obj_request->copyup_bvecs) { 14977e07efb1SIlya Dryomov for (i = 0; i < obj_request->copyup_bvec_count; i++) { 14987e07efb1SIlya Dryomov if (obj_request->copyup_bvecs[i].bv_page) 14997e07efb1SIlya Dryomov __free_page(obj_request->copyup_bvecs[i].bv_page); 15007e07efb1SIlya Dryomov } 15017e07efb1SIlya Dryomov kfree(obj_request->copyup_bvecs); 1502bf0d5f50SAlex Elder } 1503bf0d5f50SAlex Elder 1504868311b1SAlex Elder kmem_cache_free(rbd_obj_request_cache, obj_request); 1505bf0d5f50SAlex Elder } 1506bf0d5f50SAlex Elder 1507fb65d228SAlex Elder /* It's OK to call this for a device with no parent */ 1508fb65d228SAlex Elder 1509fb65d228SAlex Elder static void rbd_spec_put(struct rbd_spec *spec); 1510fb65d228SAlex Elder static void rbd_dev_unparent(struct rbd_device *rbd_dev) 1511fb65d228SAlex Elder { 1512fb65d228SAlex Elder rbd_dev_remove_parent(rbd_dev); 1513fb65d228SAlex Elder rbd_spec_put(rbd_dev->parent_spec); 1514fb65d228SAlex Elder rbd_dev->parent_spec = NULL; 1515fb65d228SAlex Elder rbd_dev->parent_overlap = 0; 1516fb65d228SAlex Elder } 1517fb65d228SAlex Elder 1518bf0d5f50SAlex Elder /* 1519a2acd00eSAlex Elder * Parent image reference counting is used to determine when an 1520a2acd00eSAlex Elder * image's parent fields can be safely torn down--after there are no 1521a2acd00eSAlex Elder * more in-flight requests to the parent image. When the last 1522a2acd00eSAlex Elder * reference is dropped, cleaning them up is safe. 1523a2acd00eSAlex Elder */ 1524a2acd00eSAlex Elder static void rbd_dev_parent_put(struct rbd_device *rbd_dev) 1525a2acd00eSAlex Elder { 1526a2acd00eSAlex Elder int counter; 1527a2acd00eSAlex Elder 1528a2acd00eSAlex Elder if (!rbd_dev->parent_spec) 1529a2acd00eSAlex Elder return; 1530a2acd00eSAlex Elder 1531a2acd00eSAlex Elder counter = atomic_dec_return_safe(&rbd_dev->parent_ref); 1532a2acd00eSAlex Elder if (counter > 0) 1533a2acd00eSAlex Elder return; 1534a2acd00eSAlex Elder 1535a2acd00eSAlex Elder /* Last reference; clean up parent data structures */ 1536a2acd00eSAlex Elder 1537a2acd00eSAlex Elder if (!counter) 1538a2acd00eSAlex Elder rbd_dev_unparent(rbd_dev); 1539a2acd00eSAlex Elder else 15409584d508SIlya Dryomov rbd_warn(rbd_dev, "parent reference underflow"); 1541a2acd00eSAlex Elder } 1542a2acd00eSAlex Elder 1543a2acd00eSAlex Elder /* 1544a2acd00eSAlex Elder * If an image has a non-zero parent overlap, get a reference to its 1545a2acd00eSAlex Elder * parent. 1546a2acd00eSAlex Elder * 1547a2acd00eSAlex Elder * Returns true if the rbd device has a parent with a non-zero 1548a2acd00eSAlex Elder * overlap and a reference for it was successfully taken, or 1549a2acd00eSAlex Elder * false otherwise. 1550a2acd00eSAlex Elder */ 1551a2acd00eSAlex Elder static bool rbd_dev_parent_get(struct rbd_device *rbd_dev) 1552a2acd00eSAlex Elder { 1553ae43e9d0SIlya Dryomov int counter = 0; 1554a2acd00eSAlex Elder 1555a2acd00eSAlex Elder if (!rbd_dev->parent_spec) 1556a2acd00eSAlex Elder return false; 1557a2acd00eSAlex Elder 1558ae43e9d0SIlya Dryomov if (rbd_dev->parent_overlap) 1559a2acd00eSAlex Elder counter = atomic_inc_return_safe(&rbd_dev->parent_ref); 1560a2acd00eSAlex Elder 1561a2acd00eSAlex Elder if (counter < 0) 15629584d508SIlya Dryomov rbd_warn(rbd_dev, "parent reference overflow"); 1563a2acd00eSAlex Elder 1564ae43e9d0SIlya Dryomov return counter > 0; 1565a2acd00eSAlex Elder } 1566a2acd00eSAlex Elder 156759e542c8SIlya Dryomov static void rbd_img_request_init(struct rbd_img_request *img_request, 1568cc344fa1SAlex Elder struct rbd_device *rbd_dev, 1569a52cc685SIlya Dryomov enum obj_operation_type op_type) 1570bf0d5f50SAlex Elder { 157159e542c8SIlya Dryomov memset(img_request, 0, sizeof(*img_request)); 1572bf0d5f50SAlex Elder 1573bf0d5f50SAlex Elder img_request->rbd_dev = rbd_dev; 15749bb0248dSIlya Dryomov img_request->op_type = op_type; 1575a0c5895bSIlya Dryomov 1576e1fddc8fSIlya Dryomov INIT_LIST_HEAD(&img_request->lock_item); 157743df3d35SIlya Dryomov INIT_LIST_HEAD(&img_request->object_extents); 15780192ce2eSIlya Dryomov mutex_init(&img_request->state_mutex); 1579bf0d5f50SAlex Elder } 1580bf0d5f50SAlex Elder 1581a52cc685SIlya Dryomov static void rbd_img_capture_header(struct rbd_img_request *img_req) 1582a52cc685SIlya Dryomov { 1583a52cc685SIlya Dryomov struct rbd_device *rbd_dev = img_req->rbd_dev; 1584a52cc685SIlya Dryomov 1585a52cc685SIlya Dryomov lockdep_assert_held(&rbd_dev->header_rwsem); 1586a52cc685SIlya Dryomov 1587a52cc685SIlya Dryomov if (rbd_img_is_write(img_req)) 1588a52cc685SIlya Dryomov img_req->snapc = ceph_get_snap_context(rbd_dev->header.snapc); 1589a52cc685SIlya Dryomov else 1590a52cc685SIlya Dryomov img_req->snap_id = rbd_dev->spec->snap_id; 1591a52cc685SIlya Dryomov 1592a52cc685SIlya Dryomov if (rbd_dev_parent_get(rbd_dev)) 1593a52cc685SIlya Dryomov img_request_layered_set(img_req); 1594a52cc685SIlya Dryomov } 1595a52cc685SIlya Dryomov 1596679a97d2SHannes Reinecke static void rbd_img_request_destroy(struct rbd_img_request *img_request) 1597bf0d5f50SAlex Elder { 1598bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 1599bf0d5f50SAlex Elder struct rbd_obj_request *next_obj_request; 1600bf0d5f50SAlex Elder 160137206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 160237206ee5SAlex Elder 1603e1fddc8fSIlya Dryomov WARN_ON(!list_empty(&img_request->lock_item)); 1604bf0d5f50SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) 1605bf0d5f50SAlex Elder rbd_img_obj_request_del(img_request, obj_request); 1606bf0d5f50SAlex Elder 160778b42a87SIlya Dryomov if (img_request_layered_test(img_request)) 1608a2acd00eSAlex Elder rbd_dev_parent_put(img_request->rbd_dev); 1609a2acd00eSAlex Elder 16109bb0248dSIlya Dryomov if (rbd_img_is_write(img_request)) 1611812164f8SAlex Elder ceph_put_snap_context(img_request->snapc); 1612bf0d5f50SAlex Elder 161359e542c8SIlya Dryomov if (test_bit(IMG_REQ_CHILD, &img_request->flags)) 16141c2a9dfeSAlex Elder kmem_cache_free(rbd_img_request_cache, img_request); 1615bf0d5f50SAlex Elder } 1616bf0d5f50SAlex Elder 161722e8bd51SIlya Dryomov #define BITS_PER_OBJ 2 161822e8bd51SIlya Dryomov #define OBJS_PER_BYTE (BITS_PER_BYTE / BITS_PER_OBJ) 161922e8bd51SIlya Dryomov #define OBJ_MASK ((1 << BITS_PER_OBJ) - 1) 162022e8bd51SIlya Dryomov 162122e8bd51SIlya Dryomov static void __rbd_object_map_index(struct rbd_device *rbd_dev, u64 objno, 162222e8bd51SIlya Dryomov u64 *index, u8 *shift) 162322e8bd51SIlya Dryomov { 162422e8bd51SIlya Dryomov u32 off; 162522e8bd51SIlya Dryomov 162622e8bd51SIlya Dryomov rbd_assert(objno < rbd_dev->object_map_size); 162722e8bd51SIlya Dryomov *index = div_u64_rem(objno, OBJS_PER_BYTE, &off); 162822e8bd51SIlya Dryomov *shift = (OBJS_PER_BYTE - off - 1) * BITS_PER_OBJ; 162922e8bd51SIlya Dryomov } 163022e8bd51SIlya Dryomov 163122e8bd51SIlya Dryomov static u8 __rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno) 163222e8bd51SIlya Dryomov { 163322e8bd51SIlya Dryomov u64 index; 163422e8bd51SIlya Dryomov u8 shift; 163522e8bd51SIlya Dryomov 163622e8bd51SIlya Dryomov lockdep_assert_held(&rbd_dev->object_map_lock); 163722e8bd51SIlya Dryomov __rbd_object_map_index(rbd_dev, objno, &index, &shift); 163822e8bd51SIlya Dryomov return (rbd_dev->object_map[index] >> shift) & OBJ_MASK; 163922e8bd51SIlya Dryomov } 164022e8bd51SIlya Dryomov 164122e8bd51SIlya Dryomov static void __rbd_object_map_set(struct rbd_device *rbd_dev, u64 objno, u8 val) 164222e8bd51SIlya Dryomov { 164322e8bd51SIlya Dryomov u64 index; 164422e8bd51SIlya Dryomov u8 shift; 164522e8bd51SIlya Dryomov u8 *p; 164622e8bd51SIlya Dryomov 164722e8bd51SIlya Dryomov lockdep_assert_held(&rbd_dev->object_map_lock); 164822e8bd51SIlya Dryomov rbd_assert(!(val & ~OBJ_MASK)); 164922e8bd51SIlya Dryomov 165022e8bd51SIlya Dryomov __rbd_object_map_index(rbd_dev, objno, &index, &shift); 165122e8bd51SIlya Dryomov p = &rbd_dev->object_map[index]; 165222e8bd51SIlya Dryomov *p = (*p & ~(OBJ_MASK << shift)) | (val << shift); 165322e8bd51SIlya Dryomov } 165422e8bd51SIlya Dryomov 165522e8bd51SIlya Dryomov static u8 rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno) 165622e8bd51SIlya Dryomov { 165722e8bd51SIlya Dryomov u8 state; 165822e8bd51SIlya Dryomov 165922e8bd51SIlya Dryomov spin_lock(&rbd_dev->object_map_lock); 166022e8bd51SIlya Dryomov state = __rbd_object_map_get(rbd_dev, objno); 166122e8bd51SIlya Dryomov spin_unlock(&rbd_dev->object_map_lock); 166222e8bd51SIlya Dryomov return state; 166322e8bd51SIlya Dryomov } 166422e8bd51SIlya Dryomov 166522e8bd51SIlya Dryomov static bool use_object_map(struct rbd_device *rbd_dev) 166622e8bd51SIlya Dryomov { 16673fe69921SIlya Dryomov /* 16683fe69921SIlya Dryomov * An image mapped read-only can't use the object map -- it isn't 16693fe69921SIlya Dryomov * loaded because the header lock isn't acquired. Someone else can 16703fe69921SIlya Dryomov * write to the image and update the object map behind our back. 16713fe69921SIlya Dryomov * 16723fe69921SIlya Dryomov * A snapshot can't be written to, so using the object map is always 16733fe69921SIlya Dryomov * safe. 16743fe69921SIlya Dryomov */ 16753fe69921SIlya Dryomov if (!rbd_is_snap(rbd_dev) && rbd_is_ro(rbd_dev)) 16763fe69921SIlya Dryomov return false; 16773fe69921SIlya Dryomov 167822e8bd51SIlya Dryomov return ((rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) && 167922e8bd51SIlya Dryomov !(rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID)); 168022e8bd51SIlya Dryomov } 168122e8bd51SIlya Dryomov 168222e8bd51SIlya Dryomov static bool rbd_object_map_may_exist(struct rbd_device *rbd_dev, u64 objno) 168322e8bd51SIlya Dryomov { 168422e8bd51SIlya Dryomov u8 state; 168522e8bd51SIlya Dryomov 168622e8bd51SIlya Dryomov /* fall back to default logic if object map is disabled or invalid */ 168722e8bd51SIlya Dryomov if (!use_object_map(rbd_dev)) 168822e8bd51SIlya Dryomov return true; 168922e8bd51SIlya Dryomov 169022e8bd51SIlya Dryomov state = rbd_object_map_get(rbd_dev, objno); 169122e8bd51SIlya Dryomov return state != OBJECT_NONEXISTENT; 169222e8bd51SIlya Dryomov } 169322e8bd51SIlya Dryomov 169422e8bd51SIlya Dryomov static void rbd_object_map_name(struct rbd_device *rbd_dev, u64 snap_id, 169522e8bd51SIlya Dryomov struct ceph_object_id *oid) 169622e8bd51SIlya Dryomov { 169722e8bd51SIlya Dryomov if (snap_id == CEPH_NOSNAP) 169822e8bd51SIlya Dryomov ceph_oid_printf(oid, "%s%s", RBD_OBJECT_MAP_PREFIX, 169922e8bd51SIlya Dryomov rbd_dev->spec->image_id); 170022e8bd51SIlya Dryomov else 170122e8bd51SIlya Dryomov ceph_oid_printf(oid, "%s%s.%016llx", RBD_OBJECT_MAP_PREFIX, 170222e8bd51SIlya Dryomov rbd_dev->spec->image_id, snap_id); 170322e8bd51SIlya Dryomov } 170422e8bd51SIlya Dryomov 170522e8bd51SIlya Dryomov static int rbd_object_map_lock(struct rbd_device *rbd_dev) 170622e8bd51SIlya Dryomov { 170722e8bd51SIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 170822e8bd51SIlya Dryomov CEPH_DEFINE_OID_ONSTACK(oid); 170922e8bd51SIlya Dryomov u8 lock_type; 171022e8bd51SIlya Dryomov char *lock_tag; 171122e8bd51SIlya Dryomov struct ceph_locker *lockers; 171222e8bd51SIlya Dryomov u32 num_lockers; 171322e8bd51SIlya Dryomov bool broke_lock = false; 171422e8bd51SIlya Dryomov int ret; 171522e8bd51SIlya Dryomov 171622e8bd51SIlya Dryomov rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid); 171722e8bd51SIlya Dryomov 171822e8bd51SIlya Dryomov again: 171922e8bd51SIlya Dryomov ret = ceph_cls_lock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME, 172022e8bd51SIlya Dryomov CEPH_CLS_LOCK_EXCLUSIVE, "", "", "", 0); 172122e8bd51SIlya Dryomov if (ret != -EBUSY || broke_lock) { 172222e8bd51SIlya Dryomov if (ret == -EEXIST) 172322e8bd51SIlya Dryomov ret = 0; /* already locked by myself */ 172422e8bd51SIlya Dryomov if (ret) 172522e8bd51SIlya Dryomov rbd_warn(rbd_dev, "failed to lock object map: %d", ret); 172622e8bd51SIlya Dryomov return ret; 172722e8bd51SIlya Dryomov } 172822e8bd51SIlya Dryomov 172922e8bd51SIlya Dryomov ret = ceph_cls_lock_info(osdc, &oid, &rbd_dev->header_oloc, 173022e8bd51SIlya Dryomov RBD_LOCK_NAME, &lock_type, &lock_tag, 173122e8bd51SIlya Dryomov &lockers, &num_lockers); 173222e8bd51SIlya Dryomov if (ret) { 173322e8bd51SIlya Dryomov if (ret == -ENOENT) 173422e8bd51SIlya Dryomov goto again; 173522e8bd51SIlya Dryomov 173622e8bd51SIlya Dryomov rbd_warn(rbd_dev, "failed to get object map lockers: %d", ret); 173722e8bd51SIlya Dryomov return ret; 173822e8bd51SIlya Dryomov } 173922e8bd51SIlya Dryomov 174022e8bd51SIlya Dryomov kfree(lock_tag); 174122e8bd51SIlya Dryomov if (num_lockers == 0) 174222e8bd51SIlya Dryomov goto again; 174322e8bd51SIlya Dryomov 174422e8bd51SIlya Dryomov rbd_warn(rbd_dev, "breaking object map lock owned by %s%llu", 174522e8bd51SIlya Dryomov ENTITY_NAME(lockers[0].id.name)); 174622e8bd51SIlya Dryomov 174722e8bd51SIlya Dryomov ret = ceph_cls_break_lock(osdc, &oid, &rbd_dev->header_oloc, 174822e8bd51SIlya Dryomov RBD_LOCK_NAME, lockers[0].id.cookie, 174922e8bd51SIlya Dryomov &lockers[0].id.name); 175022e8bd51SIlya Dryomov ceph_free_lockers(lockers, num_lockers); 175122e8bd51SIlya Dryomov if (ret) { 175222e8bd51SIlya Dryomov if (ret == -ENOENT) 175322e8bd51SIlya Dryomov goto again; 175422e8bd51SIlya Dryomov 175522e8bd51SIlya Dryomov rbd_warn(rbd_dev, "failed to break object map lock: %d", ret); 175622e8bd51SIlya Dryomov return ret; 175722e8bd51SIlya Dryomov } 175822e8bd51SIlya Dryomov 175922e8bd51SIlya Dryomov broke_lock = true; 176022e8bd51SIlya Dryomov goto again; 176122e8bd51SIlya Dryomov } 176222e8bd51SIlya Dryomov 176322e8bd51SIlya Dryomov static void rbd_object_map_unlock(struct rbd_device *rbd_dev) 176422e8bd51SIlya Dryomov { 176522e8bd51SIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 176622e8bd51SIlya Dryomov CEPH_DEFINE_OID_ONSTACK(oid); 176722e8bd51SIlya Dryomov int ret; 176822e8bd51SIlya Dryomov 176922e8bd51SIlya Dryomov rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid); 177022e8bd51SIlya Dryomov 177122e8bd51SIlya Dryomov ret = ceph_cls_unlock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME, 177222e8bd51SIlya Dryomov ""); 177322e8bd51SIlya Dryomov if (ret && ret != -ENOENT) 177422e8bd51SIlya Dryomov rbd_warn(rbd_dev, "failed to unlock object map: %d", ret); 177522e8bd51SIlya Dryomov } 177622e8bd51SIlya Dryomov 177722e8bd51SIlya Dryomov static int decode_object_map_header(void **p, void *end, u64 *object_map_size) 177822e8bd51SIlya Dryomov { 177922e8bd51SIlya Dryomov u8 struct_v; 178022e8bd51SIlya Dryomov u32 struct_len; 178122e8bd51SIlya Dryomov u32 header_len; 178222e8bd51SIlya Dryomov void *header_end; 178322e8bd51SIlya Dryomov int ret; 178422e8bd51SIlya Dryomov 178522e8bd51SIlya Dryomov ceph_decode_32_safe(p, end, header_len, e_inval); 178622e8bd51SIlya Dryomov header_end = *p + header_len; 178722e8bd51SIlya Dryomov 178822e8bd51SIlya Dryomov ret = ceph_start_decoding(p, end, 1, "BitVector header", &struct_v, 178922e8bd51SIlya Dryomov &struct_len); 179022e8bd51SIlya Dryomov if (ret) 179122e8bd51SIlya Dryomov return ret; 179222e8bd51SIlya Dryomov 179322e8bd51SIlya Dryomov ceph_decode_64_safe(p, end, *object_map_size, e_inval); 179422e8bd51SIlya Dryomov 179522e8bd51SIlya Dryomov *p = header_end; 179622e8bd51SIlya Dryomov return 0; 179722e8bd51SIlya Dryomov 179822e8bd51SIlya Dryomov e_inval: 179922e8bd51SIlya Dryomov return -EINVAL; 180022e8bd51SIlya Dryomov } 180122e8bd51SIlya Dryomov 180222e8bd51SIlya Dryomov static int __rbd_object_map_load(struct rbd_device *rbd_dev) 180322e8bd51SIlya Dryomov { 180422e8bd51SIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 180522e8bd51SIlya Dryomov CEPH_DEFINE_OID_ONSTACK(oid); 180622e8bd51SIlya Dryomov struct page **pages; 180722e8bd51SIlya Dryomov void *p, *end; 180822e8bd51SIlya Dryomov size_t reply_len; 180922e8bd51SIlya Dryomov u64 num_objects; 181022e8bd51SIlya Dryomov u64 object_map_bytes; 181122e8bd51SIlya Dryomov u64 object_map_size; 181222e8bd51SIlya Dryomov int num_pages; 181322e8bd51SIlya Dryomov int ret; 181422e8bd51SIlya Dryomov 181522e8bd51SIlya Dryomov rbd_assert(!rbd_dev->object_map && !rbd_dev->object_map_size); 181622e8bd51SIlya Dryomov 181722e8bd51SIlya Dryomov num_objects = ceph_get_num_objects(&rbd_dev->layout, 181822e8bd51SIlya Dryomov rbd_dev->mapping.size); 181922e8bd51SIlya Dryomov object_map_bytes = DIV_ROUND_UP_ULL(num_objects * BITS_PER_OBJ, 182022e8bd51SIlya Dryomov BITS_PER_BYTE); 182122e8bd51SIlya Dryomov num_pages = calc_pages_for(0, object_map_bytes) + 1; 182222e8bd51SIlya Dryomov pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); 182322e8bd51SIlya Dryomov if (IS_ERR(pages)) 182422e8bd51SIlya Dryomov return PTR_ERR(pages); 182522e8bd51SIlya Dryomov 182622e8bd51SIlya Dryomov reply_len = num_pages * PAGE_SIZE; 182722e8bd51SIlya Dryomov rbd_object_map_name(rbd_dev, rbd_dev->spec->snap_id, &oid); 182822e8bd51SIlya Dryomov ret = ceph_osdc_call(osdc, &oid, &rbd_dev->header_oloc, 182922e8bd51SIlya Dryomov "rbd", "object_map_load", CEPH_OSD_FLAG_READ, 183022e8bd51SIlya Dryomov NULL, 0, pages, &reply_len); 183122e8bd51SIlya Dryomov if (ret) 183222e8bd51SIlya Dryomov goto out; 183322e8bd51SIlya Dryomov 183422e8bd51SIlya Dryomov p = page_address(pages[0]); 183522e8bd51SIlya Dryomov end = p + min(reply_len, (size_t)PAGE_SIZE); 183622e8bd51SIlya Dryomov ret = decode_object_map_header(&p, end, &object_map_size); 183722e8bd51SIlya Dryomov if (ret) 183822e8bd51SIlya Dryomov goto out; 183922e8bd51SIlya Dryomov 184022e8bd51SIlya Dryomov if (object_map_size != num_objects) { 184122e8bd51SIlya Dryomov rbd_warn(rbd_dev, "object map size mismatch: %llu vs %llu", 184222e8bd51SIlya Dryomov object_map_size, num_objects); 184322e8bd51SIlya Dryomov ret = -EINVAL; 184422e8bd51SIlya Dryomov goto out; 184522e8bd51SIlya Dryomov } 184622e8bd51SIlya Dryomov 184722e8bd51SIlya Dryomov if (offset_in_page(p) + object_map_bytes > reply_len) { 184822e8bd51SIlya Dryomov ret = -EINVAL; 184922e8bd51SIlya Dryomov goto out; 185022e8bd51SIlya Dryomov } 185122e8bd51SIlya Dryomov 185222e8bd51SIlya Dryomov rbd_dev->object_map = kvmalloc(object_map_bytes, GFP_KERNEL); 185322e8bd51SIlya Dryomov if (!rbd_dev->object_map) { 185422e8bd51SIlya Dryomov ret = -ENOMEM; 185522e8bd51SIlya Dryomov goto out; 185622e8bd51SIlya Dryomov } 185722e8bd51SIlya Dryomov 185822e8bd51SIlya Dryomov rbd_dev->object_map_size = object_map_size; 185922e8bd51SIlya Dryomov ceph_copy_from_page_vector(pages, rbd_dev->object_map, 186022e8bd51SIlya Dryomov offset_in_page(p), object_map_bytes); 186122e8bd51SIlya Dryomov 186222e8bd51SIlya Dryomov out: 186322e8bd51SIlya Dryomov ceph_release_page_vector(pages, num_pages); 186422e8bd51SIlya Dryomov return ret; 186522e8bd51SIlya Dryomov } 186622e8bd51SIlya Dryomov 186722e8bd51SIlya Dryomov static void rbd_object_map_free(struct rbd_device *rbd_dev) 186822e8bd51SIlya Dryomov { 186922e8bd51SIlya Dryomov kvfree(rbd_dev->object_map); 187022e8bd51SIlya Dryomov rbd_dev->object_map = NULL; 187122e8bd51SIlya Dryomov rbd_dev->object_map_size = 0; 187222e8bd51SIlya Dryomov } 187322e8bd51SIlya Dryomov 187422e8bd51SIlya Dryomov static int rbd_object_map_load(struct rbd_device *rbd_dev) 187522e8bd51SIlya Dryomov { 187622e8bd51SIlya Dryomov int ret; 187722e8bd51SIlya Dryomov 187822e8bd51SIlya Dryomov ret = __rbd_object_map_load(rbd_dev); 187922e8bd51SIlya Dryomov if (ret) 188022e8bd51SIlya Dryomov return ret; 188122e8bd51SIlya Dryomov 188222e8bd51SIlya Dryomov ret = rbd_dev_v2_get_flags(rbd_dev); 188322e8bd51SIlya Dryomov if (ret) { 188422e8bd51SIlya Dryomov rbd_object_map_free(rbd_dev); 188522e8bd51SIlya Dryomov return ret; 188622e8bd51SIlya Dryomov } 188722e8bd51SIlya Dryomov 188822e8bd51SIlya Dryomov if (rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID) 188922e8bd51SIlya Dryomov rbd_warn(rbd_dev, "object map is invalid"); 189022e8bd51SIlya Dryomov 189122e8bd51SIlya Dryomov return 0; 189222e8bd51SIlya Dryomov } 189322e8bd51SIlya Dryomov 189422e8bd51SIlya Dryomov static int rbd_object_map_open(struct rbd_device *rbd_dev) 189522e8bd51SIlya Dryomov { 189622e8bd51SIlya Dryomov int ret; 189722e8bd51SIlya Dryomov 189822e8bd51SIlya Dryomov ret = rbd_object_map_lock(rbd_dev); 189922e8bd51SIlya Dryomov if (ret) 190022e8bd51SIlya Dryomov return ret; 190122e8bd51SIlya Dryomov 190222e8bd51SIlya Dryomov ret = rbd_object_map_load(rbd_dev); 190322e8bd51SIlya Dryomov if (ret) { 190422e8bd51SIlya Dryomov rbd_object_map_unlock(rbd_dev); 190522e8bd51SIlya Dryomov return ret; 190622e8bd51SIlya Dryomov } 190722e8bd51SIlya Dryomov 190822e8bd51SIlya Dryomov return 0; 190922e8bd51SIlya Dryomov } 191022e8bd51SIlya Dryomov 191122e8bd51SIlya Dryomov static void rbd_object_map_close(struct rbd_device *rbd_dev) 191222e8bd51SIlya Dryomov { 191322e8bd51SIlya Dryomov rbd_object_map_free(rbd_dev); 191422e8bd51SIlya Dryomov rbd_object_map_unlock(rbd_dev); 191522e8bd51SIlya Dryomov } 191622e8bd51SIlya Dryomov 191722e8bd51SIlya Dryomov /* 191822e8bd51SIlya Dryomov * This function needs snap_id (or more precisely just something to 191922e8bd51SIlya Dryomov * distinguish between HEAD and snapshot object maps), new_state and 192022e8bd51SIlya Dryomov * current_state that were passed to rbd_object_map_update(). 192122e8bd51SIlya Dryomov * 192222e8bd51SIlya Dryomov * To avoid allocating and stashing a context we piggyback on the OSD 192322e8bd51SIlya Dryomov * request. A HEAD update has two ops (assert_locked). For new_state 192422e8bd51SIlya Dryomov * and current_state we decode our own object_map_update op, encoded in 192522e8bd51SIlya Dryomov * rbd_cls_object_map_update(). 192622e8bd51SIlya Dryomov */ 192722e8bd51SIlya Dryomov static int rbd_object_map_update_finish(struct rbd_obj_request *obj_req, 192822e8bd51SIlya Dryomov struct ceph_osd_request *osd_req) 192922e8bd51SIlya Dryomov { 193022e8bd51SIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 193122e8bd51SIlya Dryomov struct ceph_osd_data *osd_data; 193222e8bd51SIlya Dryomov u64 objno; 19333f649ab7SKees Cook u8 state, new_state, current_state; 193422e8bd51SIlya Dryomov bool has_current_state; 193522e8bd51SIlya Dryomov void *p; 193622e8bd51SIlya Dryomov 193722e8bd51SIlya Dryomov if (osd_req->r_result) 193822e8bd51SIlya Dryomov return osd_req->r_result; 193922e8bd51SIlya Dryomov 194022e8bd51SIlya Dryomov /* 194122e8bd51SIlya Dryomov * Nothing to do for a snapshot object map. 194222e8bd51SIlya Dryomov */ 194322e8bd51SIlya Dryomov if (osd_req->r_num_ops == 1) 194422e8bd51SIlya Dryomov return 0; 194522e8bd51SIlya Dryomov 194622e8bd51SIlya Dryomov /* 194722e8bd51SIlya Dryomov * Update in-memory HEAD object map. 194822e8bd51SIlya Dryomov */ 194922e8bd51SIlya Dryomov rbd_assert(osd_req->r_num_ops == 2); 195022e8bd51SIlya Dryomov osd_data = osd_req_op_data(osd_req, 1, cls, request_data); 195122e8bd51SIlya Dryomov rbd_assert(osd_data->type == CEPH_OSD_DATA_TYPE_PAGES); 195222e8bd51SIlya Dryomov 195322e8bd51SIlya Dryomov p = page_address(osd_data->pages[0]); 195422e8bd51SIlya Dryomov objno = ceph_decode_64(&p); 195522e8bd51SIlya Dryomov rbd_assert(objno == obj_req->ex.oe_objno); 195622e8bd51SIlya Dryomov rbd_assert(ceph_decode_64(&p) == objno + 1); 195722e8bd51SIlya Dryomov new_state = ceph_decode_8(&p); 195822e8bd51SIlya Dryomov has_current_state = ceph_decode_8(&p); 195922e8bd51SIlya Dryomov if (has_current_state) 196022e8bd51SIlya Dryomov current_state = ceph_decode_8(&p); 196122e8bd51SIlya Dryomov 196222e8bd51SIlya Dryomov spin_lock(&rbd_dev->object_map_lock); 196322e8bd51SIlya Dryomov state = __rbd_object_map_get(rbd_dev, objno); 196422e8bd51SIlya Dryomov if (!has_current_state || current_state == state || 196522e8bd51SIlya Dryomov (current_state == OBJECT_EXISTS && state == OBJECT_EXISTS_CLEAN)) 196622e8bd51SIlya Dryomov __rbd_object_map_set(rbd_dev, objno, new_state); 196722e8bd51SIlya Dryomov spin_unlock(&rbd_dev->object_map_lock); 196822e8bd51SIlya Dryomov 196922e8bd51SIlya Dryomov return 0; 197022e8bd51SIlya Dryomov } 197122e8bd51SIlya Dryomov 197222e8bd51SIlya Dryomov static void rbd_object_map_callback(struct ceph_osd_request *osd_req) 197322e8bd51SIlya Dryomov { 197422e8bd51SIlya Dryomov struct rbd_obj_request *obj_req = osd_req->r_priv; 197522e8bd51SIlya Dryomov int result; 197622e8bd51SIlya Dryomov 197722e8bd51SIlya Dryomov dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req, 197822e8bd51SIlya Dryomov osd_req->r_result, obj_req); 197922e8bd51SIlya Dryomov 198022e8bd51SIlya Dryomov result = rbd_object_map_update_finish(obj_req, osd_req); 198122e8bd51SIlya Dryomov rbd_obj_handle_request(obj_req, result); 198222e8bd51SIlya Dryomov } 198322e8bd51SIlya Dryomov 198422e8bd51SIlya Dryomov static bool update_needed(struct rbd_device *rbd_dev, u64 objno, u8 new_state) 198522e8bd51SIlya Dryomov { 198622e8bd51SIlya Dryomov u8 state = rbd_object_map_get(rbd_dev, objno); 198722e8bd51SIlya Dryomov 198822e8bd51SIlya Dryomov if (state == new_state || 198922e8bd51SIlya Dryomov (new_state == OBJECT_PENDING && state == OBJECT_NONEXISTENT) || 199022e8bd51SIlya Dryomov (new_state == OBJECT_NONEXISTENT && state != OBJECT_PENDING)) 199122e8bd51SIlya Dryomov return false; 199222e8bd51SIlya Dryomov 199322e8bd51SIlya Dryomov return true; 199422e8bd51SIlya Dryomov } 199522e8bd51SIlya Dryomov 199622e8bd51SIlya Dryomov static int rbd_cls_object_map_update(struct ceph_osd_request *req, 199722e8bd51SIlya Dryomov int which, u64 objno, u8 new_state, 199822e8bd51SIlya Dryomov const u8 *current_state) 199922e8bd51SIlya Dryomov { 200022e8bd51SIlya Dryomov struct page **pages; 200122e8bd51SIlya Dryomov void *p, *start; 200222e8bd51SIlya Dryomov int ret; 200322e8bd51SIlya Dryomov 200422e8bd51SIlya Dryomov ret = osd_req_op_cls_init(req, which, "rbd", "object_map_update"); 200522e8bd51SIlya Dryomov if (ret) 200622e8bd51SIlya Dryomov return ret; 200722e8bd51SIlya Dryomov 200822e8bd51SIlya Dryomov pages = ceph_alloc_page_vector(1, GFP_NOIO); 200922e8bd51SIlya Dryomov if (IS_ERR(pages)) 201022e8bd51SIlya Dryomov return PTR_ERR(pages); 201122e8bd51SIlya Dryomov 201222e8bd51SIlya Dryomov p = start = page_address(pages[0]); 201322e8bd51SIlya Dryomov ceph_encode_64(&p, objno); 201422e8bd51SIlya Dryomov ceph_encode_64(&p, objno + 1); 201522e8bd51SIlya Dryomov ceph_encode_8(&p, new_state); 201622e8bd51SIlya Dryomov if (current_state) { 201722e8bd51SIlya Dryomov ceph_encode_8(&p, 1); 201822e8bd51SIlya Dryomov ceph_encode_8(&p, *current_state); 201922e8bd51SIlya Dryomov } else { 202022e8bd51SIlya Dryomov ceph_encode_8(&p, 0); 202122e8bd51SIlya Dryomov } 202222e8bd51SIlya Dryomov 202322e8bd51SIlya Dryomov osd_req_op_cls_request_data_pages(req, which, pages, p - start, 0, 202422e8bd51SIlya Dryomov false, true); 202522e8bd51SIlya Dryomov return 0; 202622e8bd51SIlya Dryomov } 202722e8bd51SIlya Dryomov 202822e8bd51SIlya Dryomov /* 202922e8bd51SIlya Dryomov * Return: 203022e8bd51SIlya Dryomov * 0 - object map update sent 203122e8bd51SIlya Dryomov * 1 - object map update isn't needed 203222e8bd51SIlya Dryomov * <0 - error 203322e8bd51SIlya Dryomov */ 203422e8bd51SIlya Dryomov static int rbd_object_map_update(struct rbd_obj_request *obj_req, u64 snap_id, 203522e8bd51SIlya Dryomov u8 new_state, const u8 *current_state) 203622e8bd51SIlya Dryomov { 203722e8bd51SIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 203822e8bd51SIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 203922e8bd51SIlya Dryomov struct ceph_osd_request *req; 204022e8bd51SIlya Dryomov int num_ops = 1; 204122e8bd51SIlya Dryomov int which = 0; 204222e8bd51SIlya Dryomov int ret; 204322e8bd51SIlya Dryomov 204422e8bd51SIlya Dryomov if (snap_id == CEPH_NOSNAP) { 204522e8bd51SIlya Dryomov if (!update_needed(rbd_dev, obj_req->ex.oe_objno, new_state)) 204622e8bd51SIlya Dryomov return 1; 204722e8bd51SIlya Dryomov 204822e8bd51SIlya Dryomov num_ops++; /* assert_locked */ 204922e8bd51SIlya Dryomov } 205022e8bd51SIlya Dryomov 205122e8bd51SIlya Dryomov req = ceph_osdc_alloc_request(osdc, NULL, num_ops, false, GFP_NOIO); 205222e8bd51SIlya Dryomov if (!req) 205322e8bd51SIlya Dryomov return -ENOMEM; 205422e8bd51SIlya Dryomov 205522e8bd51SIlya Dryomov list_add_tail(&req->r_private_item, &obj_req->osd_reqs); 205622e8bd51SIlya Dryomov req->r_callback = rbd_object_map_callback; 205722e8bd51SIlya Dryomov req->r_priv = obj_req; 205822e8bd51SIlya Dryomov 205922e8bd51SIlya Dryomov rbd_object_map_name(rbd_dev, snap_id, &req->r_base_oid); 206022e8bd51SIlya Dryomov ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc); 206122e8bd51SIlya Dryomov req->r_flags = CEPH_OSD_FLAG_WRITE; 206222e8bd51SIlya Dryomov ktime_get_real_ts64(&req->r_mtime); 206322e8bd51SIlya Dryomov 206422e8bd51SIlya Dryomov if (snap_id == CEPH_NOSNAP) { 206522e8bd51SIlya Dryomov /* 206622e8bd51SIlya Dryomov * Protect against possible race conditions during lock 206722e8bd51SIlya Dryomov * ownership transitions. 206822e8bd51SIlya Dryomov */ 206922e8bd51SIlya Dryomov ret = ceph_cls_assert_locked(req, which++, RBD_LOCK_NAME, 207022e8bd51SIlya Dryomov CEPH_CLS_LOCK_EXCLUSIVE, "", ""); 207122e8bd51SIlya Dryomov if (ret) 207222e8bd51SIlya Dryomov return ret; 207322e8bd51SIlya Dryomov } 207422e8bd51SIlya Dryomov 207522e8bd51SIlya Dryomov ret = rbd_cls_object_map_update(req, which, obj_req->ex.oe_objno, 207622e8bd51SIlya Dryomov new_state, current_state); 207722e8bd51SIlya Dryomov if (ret) 207822e8bd51SIlya Dryomov return ret; 207922e8bd51SIlya Dryomov 208022e8bd51SIlya Dryomov ret = ceph_osdc_alloc_messages(req, GFP_NOIO); 208122e8bd51SIlya Dryomov if (ret) 208222e8bd51SIlya Dryomov return ret; 208322e8bd51SIlya Dryomov 208422e8bd51SIlya Dryomov ceph_osdc_start_request(osdc, req, false); 208522e8bd51SIlya Dryomov return 0; 208622e8bd51SIlya Dryomov } 208722e8bd51SIlya Dryomov 208886bd7998SIlya Dryomov static void prune_extents(struct ceph_file_extent *img_extents, 208986bd7998SIlya Dryomov u32 *num_img_extents, u64 overlap) 2090e93f3152SAlex Elder { 209186bd7998SIlya Dryomov u32 cnt = *num_img_extents; 2092e93f3152SAlex Elder 209386bd7998SIlya Dryomov /* drop extents completely beyond the overlap */ 209486bd7998SIlya Dryomov while (cnt && img_extents[cnt - 1].fe_off >= overlap) 209586bd7998SIlya Dryomov cnt--; 2096e93f3152SAlex Elder 209786bd7998SIlya Dryomov if (cnt) { 209886bd7998SIlya Dryomov struct ceph_file_extent *ex = &img_extents[cnt - 1]; 2099e93f3152SAlex Elder 210086bd7998SIlya Dryomov /* trim final overlapping extent */ 210186bd7998SIlya Dryomov if (ex->fe_off + ex->fe_len > overlap) 210286bd7998SIlya Dryomov ex->fe_len = overlap - ex->fe_off; 2103e93f3152SAlex Elder } 2104e93f3152SAlex Elder 210586bd7998SIlya Dryomov *num_img_extents = cnt; 210686bd7998SIlya Dryomov } 210786bd7998SIlya Dryomov 210886bd7998SIlya Dryomov /* 210986bd7998SIlya Dryomov * Determine the byte range(s) covered by either just the object extent 211086bd7998SIlya Dryomov * or the entire object in the parent image. 211186bd7998SIlya Dryomov */ 211286bd7998SIlya Dryomov static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req, 211386bd7998SIlya Dryomov bool entire) 2114e93f3152SAlex Elder { 211586bd7998SIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 2116c5b5ef6cSAlex Elder int ret; 2117c5b5ef6cSAlex Elder 211886bd7998SIlya Dryomov if (!rbd_dev->parent_overlap) 211986bd7998SIlya Dryomov return 0; 212086bd7998SIlya Dryomov 212186bd7998SIlya Dryomov ret = ceph_extent_to_file(&rbd_dev->layout, obj_req->ex.oe_objno, 212286bd7998SIlya Dryomov entire ? 0 : obj_req->ex.oe_off, 212386bd7998SIlya Dryomov entire ? rbd_dev->layout.object_size : 212486bd7998SIlya Dryomov obj_req->ex.oe_len, 212586bd7998SIlya Dryomov &obj_req->img_extents, 212686bd7998SIlya Dryomov &obj_req->num_img_extents); 212786bd7998SIlya Dryomov if (ret) 212886bd7998SIlya Dryomov return ret; 212986bd7998SIlya Dryomov 213086bd7998SIlya Dryomov prune_extents(obj_req->img_extents, &obj_req->num_img_extents, 213186bd7998SIlya Dryomov rbd_dev->parent_overlap); 213286bd7998SIlya Dryomov return 0; 213386bd7998SIlya Dryomov } 213486bd7998SIlya Dryomov 2135bcbab1dbSIlya Dryomov static void rbd_osd_setup_data(struct ceph_osd_request *osd_req, int which) 21363da691bfSIlya Dryomov { 2137bcbab1dbSIlya Dryomov struct rbd_obj_request *obj_req = osd_req->r_priv; 2138bcbab1dbSIlya Dryomov 2139ecc633caSIlya Dryomov switch (obj_req->img_request->data_type) { 21403da691bfSIlya Dryomov case OBJ_REQUEST_BIO: 2141bcbab1dbSIlya Dryomov osd_req_op_extent_osd_data_bio(osd_req, which, 21423da691bfSIlya Dryomov &obj_req->bio_pos, 214343df3d35SIlya Dryomov obj_req->ex.oe_len); 21443da691bfSIlya Dryomov break; 21453da691bfSIlya Dryomov case OBJ_REQUEST_BVECS: 2146afb97888SIlya Dryomov case OBJ_REQUEST_OWN_BVECS: 21473da691bfSIlya Dryomov rbd_assert(obj_req->bvec_pos.iter.bi_size == 214843df3d35SIlya Dryomov obj_req->ex.oe_len); 2149afb97888SIlya Dryomov rbd_assert(obj_req->bvec_idx == obj_req->bvec_count); 2150bcbab1dbSIlya Dryomov osd_req_op_extent_osd_data_bvec_pos(osd_req, which, 21513da691bfSIlya Dryomov &obj_req->bvec_pos); 21523da691bfSIlya Dryomov break; 21533da691bfSIlya Dryomov default: 215416809372SArnd Bergmann BUG(); 21553da691bfSIlya Dryomov } 21563da691bfSIlya Dryomov } 21573da691bfSIlya Dryomov 2158bcbab1dbSIlya Dryomov static int rbd_osd_setup_stat(struct ceph_osd_request *osd_req, int which) 21593da691bfSIlya Dryomov { 21603da691bfSIlya Dryomov struct page **pages; 21613da691bfSIlya Dryomov 2162c5b5ef6cSAlex Elder /* 2163c5b5ef6cSAlex Elder * The response data for a STAT call consists of: 2164c5b5ef6cSAlex Elder * le64 length; 2165c5b5ef6cSAlex Elder * struct { 2166c5b5ef6cSAlex Elder * le32 tv_sec; 2167c5b5ef6cSAlex Elder * le32 tv_nsec; 2168c5b5ef6cSAlex Elder * } mtime; 2169c5b5ef6cSAlex Elder */ 21703da691bfSIlya Dryomov pages = ceph_alloc_page_vector(1, GFP_NOIO); 21713da691bfSIlya Dryomov if (IS_ERR(pages)) 21723da691bfSIlya Dryomov return PTR_ERR(pages); 21733da691bfSIlya Dryomov 2174bcbab1dbSIlya Dryomov osd_req_op_init(osd_req, which, CEPH_OSD_OP_STAT, 0); 2175bcbab1dbSIlya Dryomov osd_req_op_raw_data_in_pages(osd_req, which, pages, 21763da691bfSIlya Dryomov 8 + sizeof(struct ceph_timespec), 21773da691bfSIlya Dryomov 0, false, true); 21783da691bfSIlya Dryomov return 0; 2179710214e3SIlya Dryomov } 2180c5b5ef6cSAlex Elder 2181b5ae8cbcSIlya Dryomov static int rbd_osd_setup_copyup(struct ceph_osd_request *osd_req, int which, 2182b5ae8cbcSIlya Dryomov u32 bytes) 218313488d53SIlya Dryomov { 2184b5ae8cbcSIlya Dryomov struct rbd_obj_request *obj_req = osd_req->r_priv; 2185b5ae8cbcSIlya Dryomov int ret; 2186b5ae8cbcSIlya Dryomov 2187b5ae8cbcSIlya Dryomov ret = osd_req_op_cls_init(osd_req, which, "rbd", "copyup"); 2188b5ae8cbcSIlya Dryomov if (ret) 2189b5ae8cbcSIlya Dryomov return ret; 2190b5ae8cbcSIlya Dryomov 2191b5ae8cbcSIlya Dryomov osd_req_op_cls_request_data_bvecs(osd_req, which, obj_req->copyup_bvecs, 2192b5ae8cbcSIlya Dryomov obj_req->copyup_bvec_count, bytes); 2193b5ae8cbcSIlya Dryomov return 0; 219413488d53SIlya Dryomov } 219513488d53SIlya Dryomov 2196ea9b743cSIlya Dryomov static int rbd_obj_init_read(struct rbd_obj_request *obj_req) 21973da691bfSIlya Dryomov { 2198ea9b743cSIlya Dryomov obj_req->read_state = RBD_OBJ_READ_START; 2199ea9b743cSIlya Dryomov return 0; 2200ea9b743cSIlya Dryomov } 2201ea9b743cSIlya Dryomov 2202bcbab1dbSIlya Dryomov static void __rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req, 2203bcbab1dbSIlya Dryomov int which) 22043da691bfSIlya Dryomov { 2205bcbab1dbSIlya Dryomov struct rbd_obj_request *obj_req = osd_req->r_priv; 22063da691bfSIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 22073da691bfSIlya Dryomov u16 opcode; 2208c5b5ef6cSAlex Elder 22098b5bec5cSIlya Dryomov if (!use_object_map(rbd_dev) || 22108b5bec5cSIlya Dryomov !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST)) { 2211bcbab1dbSIlya Dryomov osd_req_op_alloc_hint_init(osd_req, which++, 22123da691bfSIlya Dryomov rbd_dev->layout.object_size, 2213d3798accSIlya Dryomov rbd_dev->layout.object_size, 2214dc1dad8eSIlya Dryomov rbd_dev->opts->alloc_hint_flags); 22158b5bec5cSIlya Dryomov } 2216c5b5ef6cSAlex Elder 22173da691bfSIlya Dryomov if (rbd_obj_is_entire(obj_req)) 22183da691bfSIlya Dryomov opcode = CEPH_OSD_OP_WRITEFULL; 22193da691bfSIlya Dryomov else 22203da691bfSIlya Dryomov opcode = CEPH_OSD_OP_WRITE; 2221c5b5ef6cSAlex Elder 2222bcbab1dbSIlya Dryomov osd_req_op_extent_init(osd_req, which, opcode, 222343df3d35SIlya Dryomov obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0); 2224bcbab1dbSIlya Dryomov rbd_osd_setup_data(osd_req, which); 22253da691bfSIlya Dryomov } 22263da691bfSIlya Dryomov 2227ea9b743cSIlya Dryomov static int rbd_obj_init_write(struct rbd_obj_request *obj_req) 22283da691bfSIlya Dryomov { 22293da691bfSIlya Dryomov int ret; 22303da691bfSIlya Dryomov 223186bd7998SIlya Dryomov /* reverse map the entire object onto the parent */ 223286bd7998SIlya Dryomov ret = rbd_obj_calc_img_extents(obj_req, true); 223386bd7998SIlya Dryomov if (ret) 223486bd7998SIlya Dryomov return ret; 223586bd7998SIlya Dryomov 22360ad5d953SIlya Dryomov if (rbd_obj_copyup_enabled(obj_req)) 22370ad5d953SIlya Dryomov obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED; 22383da691bfSIlya Dryomov 223985b5e6d1SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_START; 22403da691bfSIlya Dryomov return 0; 224170d045f6SIlya Dryomov } 224270d045f6SIlya Dryomov 22436484cbe9SIlya Dryomov static u16 truncate_or_zero_opcode(struct rbd_obj_request *obj_req) 22446484cbe9SIlya Dryomov { 22456484cbe9SIlya Dryomov return rbd_obj_is_tail(obj_req) ? CEPH_OSD_OP_TRUNCATE : 22466484cbe9SIlya Dryomov CEPH_OSD_OP_ZERO; 22476484cbe9SIlya Dryomov } 22486484cbe9SIlya Dryomov 224927bbd911SIlya Dryomov static void __rbd_osd_setup_discard_ops(struct ceph_osd_request *osd_req, 225027bbd911SIlya Dryomov int which) 225127bbd911SIlya Dryomov { 225227bbd911SIlya Dryomov struct rbd_obj_request *obj_req = osd_req->r_priv; 225327bbd911SIlya Dryomov 225427bbd911SIlya Dryomov if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) { 225527bbd911SIlya Dryomov rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION); 225627bbd911SIlya Dryomov osd_req_op_init(osd_req, which, CEPH_OSD_OP_DELETE, 0); 225727bbd911SIlya Dryomov } else { 225827bbd911SIlya Dryomov osd_req_op_extent_init(osd_req, which, 225927bbd911SIlya Dryomov truncate_or_zero_opcode(obj_req), 226027bbd911SIlya Dryomov obj_req->ex.oe_off, obj_req->ex.oe_len, 226127bbd911SIlya Dryomov 0, 0); 226227bbd911SIlya Dryomov } 226327bbd911SIlya Dryomov } 226427bbd911SIlya Dryomov 2265ea9b743cSIlya Dryomov static int rbd_obj_init_discard(struct rbd_obj_request *obj_req) 22666484cbe9SIlya Dryomov { 22670c93e1b7SIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 226827bbd911SIlya Dryomov u64 off, next_off; 22696484cbe9SIlya Dryomov int ret; 22706484cbe9SIlya Dryomov 22710c93e1b7SIlya Dryomov /* 22720c93e1b7SIlya Dryomov * Align the range to alloc_size boundary and punt on discards 22730c93e1b7SIlya Dryomov * that are too small to free up any space. 22740c93e1b7SIlya Dryomov * 22750c93e1b7SIlya Dryomov * alloc_size == object_size && is_tail() is a special case for 22760c93e1b7SIlya Dryomov * filestore with filestore_punch_hole = false, needed to allow 22770c93e1b7SIlya Dryomov * truncate (in addition to delete). 22780c93e1b7SIlya Dryomov */ 22790c93e1b7SIlya Dryomov if (rbd_dev->opts->alloc_size != rbd_dev->layout.object_size || 22800c93e1b7SIlya Dryomov !rbd_obj_is_tail(obj_req)) { 228127bbd911SIlya Dryomov off = round_up(obj_req->ex.oe_off, rbd_dev->opts->alloc_size); 228227bbd911SIlya Dryomov next_off = round_down(obj_req->ex.oe_off + obj_req->ex.oe_len, 228327bbd911SIlya Dryomov rbd_dev->opts->alloc_size); 22840c93e1b7SIlya Dryomov if (off >= next_off) 22850c93e1b7SIlya Dryomov return 1; 228627bbd911SIlya Dryomov 228727bbd911SIlya Dryomov dout("%s %p %llu~%llu -> %llu~%llu\n", __func__, 228827bbd911SIlya Dryomov obj_req, obj_req->ex.oe_off, obj_req->ex.oe_len, 228927bbd911SIlya Dryomov off, next_off - off); 229027bbd911SIlya Dryomov obj_req->ex.oe_off = off; 229127bbd911SIlya Dryomov obj_req->ex.oe_len = next_off - off; 22920c93e1b7SIlya Dryomov } 22930c93e1b7SIlya Dryomov 22946484cbe9SIlya Dryomov /* reverse map the entire object onto the parent */ 22956484cbe9SIlya Dryomov ret = rbd_obj_calc_img_extents(obj_req, true); 22966484cbe9SIlya Dryomov if (ret) 22976484cbe9SIlya Dryomov return ret; 22986484cbe9SIlya Dryomov 229922e8bd51SIlya Dryomov obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT; 23000ad5d953SIlya Dryomov if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) 23010ad5d953SIlya Dryomov obj_req->flags |= RBD_OBJ_FLAG_DELETION; 23026484cbe9SIlya Dryomov 230385b5e6d1SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_START; 23046484cbe9SIlya Dryomov return 0; 23056484cbe9SIlya Dryomov } 23066484cbe9SIlya Dryomov 2307bcbab1dbSIlya Dryomov static void __rbd_osd_setup_zeroout_ops(struct ceph_osd_request *osd_req, 2308bcbab1dbSIlya Dryomov int which) 230913488d53SIlya Dryomov { 2310bcbab1dbSIlya Dryomov struct rbd_obj_request *obj_req = osd_req->r_priv; 23113da691bfSIlya Dryomov u16 opcode; 2312058aa991SIlya Dryomov 23133da691bfSIlya Dryomov if (rbd_obj_is_entire(obj_req)) { 231486bd7998SIlya Dryomov if (obj_req->num_img_extents) { 23150ad5d953SIlya Dryomov if (!(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED)) 2316bcbab1dbSIlya Dryomov osd_req_op_init(osd_req, which++, 23172bb1e56eSIlya Dryomov CEPH_OSD_OP_CREATE, 0); 23183da691bfSIlya Dryomov opcode = CEPH_OSD_OP_TRUNCATE; 23193da691bfSIlya Dryomov } else { 23200ad5d953SIlya Dryomov rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION); 2321bcbab1dbSIlya Dryomov osd_req_op_init(osd_req, which++, 23223da691bfSIlya Dryomov CEPH_OSD_OP_DELETE, 0); 23233da691bfSIlya Dryomov opcode = 0; 23243da691bfSIlya Dryomov } 23253da691bfSIlya Dryomov } else { 23266484cbe9SIlya Dryomov opcode = truncate_or_zero_opcode(obj_req); 23273da691bfSIlya Dryomov } 23283da691bfSIlya Dryomov 23293da691bfSIlya Dryomov if (opcode) 2330bcbab1dbSIlya Dryomov osd_req_op_extent_init(osd_req, which, opcode, 233143df3d35SIlya Dryomov obj_req->ex.oe_off, obj_req->ex.oe_len, 23323da691bfSIlya Dryomov 0, 0); 23333da691bfSIlya Dryomov } 23343da691bfSIlya Dryomov 2335ea9b743cSIlya Dryomov static int rbd_obj_init_zeroout(struct rbd_obj_request *obj_req) 23363da691bfSIlya Dryomov { 23373da691bfSIlya Dryomov int ret; 23383da691bfSIlya Dryomov 233986bd7998SIlya Dryomov /* reverse map the entire object onto the parent */ 234086bd7998SIlya Dryomov ret = rbd_obj_calc_img_extents(obj_req, true); 234186bd7998SIlya Dryomov if (ret) 234286bd7998SIlya Dryomov return ret; 234386bd7998SIlya Dryomov 23440ad5d953SIlya Dryomov if (rbd_obj_copyup_enabled(obj_req)) 23450ad5d953SIlya Dryomov obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED; 23460ad5d953SIlya Dryomov if (!obj_req->num_img_extents) { 234722e8bd51SIlya Dryomov obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT; 23480ad5d953SIlya Dryomov if (rbd_obj_is_entire(obj_req)) 23490ad5d953SIlya Dryomov obj_req->flags |= RBD_OBJ_FLAG_DELETION; 23503da691bfSIlya Dryomov } 23513da691bfSIlya Dryomov 235285b5e6d1SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_START; 2353980917fcSIlya Dryomov return 0; 2354b454e36dSAlex Elder } 2355b454e36dSAlex Elder 2356a086a1b8SIlya Dryomov static int count_write_ops(struct rbd_obj_request *obj_req) 2357a086a1b8SIlya Dryomov { 23588b5bec5cSIlya Dryomov struct rbd_img_request *img_req = obj_req->img_request; 23598b5bec5cSIlya Dryomov 23608b5bec5cSIlya Dryomov switch (img_req->op_type) { 2361a086a1b8SIlya Dryomov case OBJ_OP_WRITE: 23628b5bec5cSIlya Dryomov if (!use_object_map(img_req->rbd_dev) || 23638b5bec5cSIlya Dryomov !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST)) 2364a086a1b8SIlya Dryomov return 2; /* setallochint + write/writefull */ 23658b5bec5cSIlya Dryomov 23668b5bec5cSIlya Dryomov return 1; /* write/writefull */ 2367a086a1b8SIlya Dryomov case OBJ_OP_DISCARD: 2368a086a1b8SIlya Dryomov return 1; /* delete/truncate/zero */ 2369a086a1b8SIlya Dryomov case OBJ_OP_ZEROOUT: 2370a086a1b8SIlya Dryomov if (rbd_obj_is_entire(obj_req) && obj_req->num_img_extents && 2371a086a1b8SIlya Dryomov !(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED)) 2372a086a1b8SIlya Dryomov return 2; /* create + truncate */ 2373a086a1b8SIlya Dryomov 2374a086a1b8SIlya Dryomov return 1; /* delete/truncate/zero */ 2375a086a1b8SIlya Dryomov default: 2376a086a1b8SIlya Dryomov BUG(); 2377a086a1b8SIlya Dryomov } 2378a086a1b8SIlya Dryomov } 2379a086a1b8SIlya Dryomov 2380a086a1b8SIlya Dryomov static void rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req, 2381a086a1b8SIlya Dryomov int which) 2382a086a1b8SIlya Dryomov { 2383a086a1b8SIlya Dryomov struct rbd_obj_request *obj_req = osd_req->r_priv; 2384a086a1b8SIlya Dryomov 2385a086a1b8SIlya Dryomov switch (obj_req->img_request->op_type) { 2386a086a1b8SIlya Dryomov case OBJ_OP_WRITE: 2387a086a1b8SIlya Dryomov __rbd_osd_setup_write_ops(osd_req, which); 2388a086a1b8SIlya Dryomov break; 2389a086a1b8SIlya Dryomov case OBJ_OP_DISCARD: 2390a086a1b8SIlya Dryomov __rbd_osd_setup_discard_ops(osd_req, which); 2391a086a1b8SIlya Dryomov break; 2392a086a1b8SIlya Dryomov case OBJ_OP_ZEROOUT: 2393a086a1b8SIlya Dryomov __rbd_osd_setup_zeroout_ops(osd_req, which); 2394a086a1b8SIlya Dryomov break; 2395a086a1b8SIlya Dryomov default: 2396a086a1b8SIlya Dryomov BUG(); 2397a086a1b8SIlya Dryomov } 2398a086a1b8SIlya Dryomov } 2399a086a1b8SIlya Dryomov 2400b454e36dSAlex Elder /* 2401a086a1b8SIlya Dryomov * Prune the list of object requests (adjust offset and/or length, drop 2402a086a1b8SIlya Dryomov * redundant requests). Prepare object request state machines and image 2403a086a1b8SIlya Dryomov * request state machine for execution. 2404b454e36dSAlex Elder */ 24053da691bfSIlya Dryomov static int __rbd_img_fill_request(struct rbd_img_request *img_req) 24063da691bfSIlya Dryomov { 24070c93e1b7SIlya Dryomov struct rbd_obj_request *obj_req, *next_obj_req; 24083da691bfSIlya Dryomov int ret; 24093d7efd18SAlex Elder 24100c93e1b7SIlya Dryomov for_each_obj_request_safe(img_req, obj_req, next_obj_req) { 24119bb0248dSIlya Dryomov switch (img_req->op_type) { 24123da691bfSIlya Dryomov case OBJ_OP_READ: 2413ea9b743cSIlya Dryomov ret = rbd_obj_init_read(obj_req); 24143da691bfSIlya Dryomov break; 24153da691bfSIlya Dryomov case OBJ_OP_WRITE: 2416ea9b743cSIlya Dryomov ret = rbd_obj_init_write(obj_req); 24173da691bfSIlya Dryomov break; 24183da691bfSIlya Dryomov case OBJ_OP_DISCARD: 2419ea9b743cSIlya Dryomov ret = rbd_obj_init_discard(obj_req); 24203da691bfSIlya Dryomov break; 24216484cbe9SIlya Dryomov case OBJ_OP_ZEROOUT: 2422ea9b743cSIlya Dryomov ret = rbd_obj_init_zeroout(obj_req); 24236484cbe9SIlya Dryomov break; 24243da691bfSIlya Dryomov default: 242516809372SArnd Bergmann BUG(); 24263da691bfSIlya Dryomov } 24270c93e1b7SIlya Dryomov if (ret < 0) 24283da691bfSIlya Dryomov return ret; 24290c93e1b7SIlya Dryomov if (ret > 0) { 24300c93e1b7SIlya Dryomov rbd_img_obj_request_del(img_req, obj_req); 24310c93e1b7SIlya Dryomov continue; 24320c93e1b7SIlya Dryomov } 2433b454e36dSAlex Elder } 2434b454e36dSAlex Elder 24350192ce2eSIlya Dryomov img_req->state = RBD_IMG_START; 24363da691bfSIlya Dryomov return 0; 24373da691bfSIlya Dryomov } 24383da691bfSIlya Dryomov 24395a237819SIlya Dryomov union rbd_img_fill_iter { 24405a237819SIlya Dryomov struct ceph_bio_iter bio_iter; 24415a237819SIlya Dryomov struct ceph_bvec_iter bvec_iter; 24425a237819SIlya Dryomov }; 24435a237819SIlya Dryomov 24445a237819SIlya Dryomov struct rbd_img_fill_ctx { 24455a237819SIlya Dryomov enum obj_request_type pos_type; 24465a237819SIlya Dryomov union rbd_img_fill_iter *pos; 24475a237819SIlya Dryomov union rbd_img_fill_iter iter; 24485a237819SIlya Dryomov ceph_object_extent_fn_t set_pos_fn; 2449afb97888SIlya Dryomov ceph_object_extent_fn_t count_fn; 2450afb97888SIlya Dryomov ceph_object_extent_fn_t copy_fn; 24515a237819SIlya Dryomov }; 24525a237819SIlya Dryomov 24535a237819SIlya Dryomov static struct ceph_object_extent *alloc_object_extent(void *arg) 24545a237819SIlya Dryomov { 24555a237819SIlya Dryomov struct rbd_img_request *img_req = arg; 24565a237819SIlya Dryomov struct rbd_obj_request *obj_req; 24575a237819SIlya Dryomov 24585a237819SIlya Dryomov obj_req = rbd_obj_request_create(); 24595a237819SIlya Dryomov if (!obj_req) 24605a237819SIlya Dryomov return NULL; 24615a237819SIlya Dryomov 24625a237819SIlya Dryomov rbd_img_obj_request_add(img_req, obj_req); 24635a237819SIlya Dryomov return &obj_req->ex; 24645a237819SIlya Dryomov } 24655a237819SIlya Dryomov 24665a237819SIlya Dryomov /* 2467afb97888SIlya Dryomov * While su != os && sc == 1 is technically not fancy (it's the same 2468afb97888SIlya Dryomov * layout as su == os && sc == 1), we can't use the nocopy path for it 2469afb97888SIlya Dryomov * because ->set_pos_fn() should be called only once per object. 2470afb97888SIlya Dryomov * ceph_file_to_extents() invokes action_fn once per stripe unit, so 2471afb97888SIlya Dryomov * treat su != os && sc == 1 as fancy. 24725a237819SIlya Dryomov */ 2473afb97888SIlya Dryomov static bool rbd_layout_is_fancy(struct ceph_file_layout *l) 2474afb97888SIlya Dryomov { 2475afb97888SIlya Dryomov return l->stripe_unit != l->object_size; 2476afb97888SIlya Dryomov } 2477afb97888SIlya Dryomov 2478afb97888SIlya Dryomov static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req, 24795a237819SIlya Dryomov struct ceph_file_extent *img_extents, 24805a237819SIlya Dryomov u32 num_img_extents, 24815a237819SIlya Dryomov struct rbd_img_fill_ctx *fctx) 24825a237819SIlya Dryomov { 24835a237819SIlya Dryomov u32 i; 24845a237819SIlya Dryomov int ret; 24855a237819SIlya Dryomov 24865a237819SIlya Dryomov img_req->data_type = fctx->pos_type; 24875a237819SIlya Dryomov 24885a237819SIlya Dryomov /* 24895a237819SIlya Dryomov * Create object requests and set each object request's starting 24905a237819SIlya Dryomov * position in the provided bio (list) or bio_vec array. 24915a237819SIlya Dryomov */ 24925a237819SIlya Dryomov fctx->iter = *fctx->pos; 24935a237819SIlya Dryomov for (i = 0; i < num_img_extents; i++) { 24945a237819SIlya Dryomov ret = ceph_file_to_extents(&img_req->rbd_dev->layout, 24955a237819SIlya Dryomov img_extents[i].fe_off, 24965a237819SIlya Dryomov img_extents[i].fe_len, 24975a237819SIlya Dryomov &img_req->object_extents, 24985a237819SIlya Dryomov alloc_object_extent, img_req, 24995a237819SIlya Dryomov fctx->set_pos_fn, &fctx->iter); 25005a237819SIlya Dryomov if (ret) 25015a237819SIlya Dryomov return ret; 25025a237819SIlya Dryomov } 25035a237819SIlya Dryomov 25045a237819SIlya Dryomov return __rbd_img_fill_request(img_req); 25055a237819SIlya Dryomov } 25065a237819SIlya Dryomov 2507afb97888SIlya Dryomov /* 2508afb97888SIlya Dryomov * Map a list of image extents to a list of object extents, create the 2509afb97888SIlya Dryomov * corresponding object requests (normally each to a different object, 2510afb97888SIlya Dryomov * but not always) and add them to @img_req. For each object request, 2511afb97888SIlya Dryomov * set up its data descriptor to point to the corresponding chunk(s) of 2512afb97888SIlya Dryomov * @fctx->pos data buffer. 2513afb97888SIlya Dryomov * 2514afb97888SIlya Dryomov * Because ceph_file_to_extents() will merge adjacent object extents 2515afb97888SIlya Dryomov * together, each object request's data descriptor may point to multiple 2516afb97888SIlya Dryomov * different chunks of @fctx->pos data buffer. 2517afb97888SIlya Dryomov * 2518afb97888SIlya Dryomov * @fctx->pos data buffer is assumed to be large enough. 2519afb97888SIlya Dryomov */ 2520afb97888SIlya Dryomov static int rbd_img_fill_request(struct rbd_img_request *img_req, 2521afb97888SIlya Dryomov struct ceph_file_extent *img_extents, 2522afb97888SIlya Dryomov u32 num_img_extents, 2523afb97888SIlya Dryomov struct rbd_img_fill_ctx *fctx) 2524afb97888SIlya Dryomov { 2525afb97888SIlya Dryomov struct rbd_device *rbd_dev = img_req->rbd_dev; 2526afb97888SIlya Dryomov struct rbd_obj_request *obj_req; 2527afb97888SIlya Dryomov u32 i; 2528afb97888SIlya Dryomov int ret; 2529afb97888SIlya Dryomov 2530afb97888SIlya Dryomov if (fctx->pos_type == OBJ_REQUEST_NODATA || 2531afb97888SIlya Dryomov !rbd_layout_is_fancy(&rbd_dev->layout)) 2532afb97888SIlya Dryomov return rbd_img_fill_request_nocopy(img_req, img_extents, 2533afb97888SIlya Dryomov num_img_extents, fctx); 2534afb97888SIlya Dryomov 2535afb97888SIlya Dryomov img_req->data_type = OBJ_REQUEST_OWN_BVECS; 2536afb97888SIlya Dryomov 2537afb97888SIlya Dryomov /* 2538afb97888SIlya Dryomov * Create object requests and determine ->bvec_count for each object 2539afb97888SIlya Dryomov * request. Note that ->bvec_count sum over all object requests may 2540afb97888SIlya Dryomov * be greater than the number of bio_vecs in the provided bio (list) 2541afb97888SIlya Dryomov * or bio_vec array because when mapped, those bio_vecs can straddle 2542afb97888SIlya Dryomov * stripe unit boundaries. 2543afb97888SIlya Dryomov */ 2544afb97888SIlya Dryomov fctx->iter = *fctx->pos; 2545afb97888SIlya Dryomov for (i = 0; i < num_img_extents; i++) { 2546afb97888SIlya Dryomov ret = ceph_file_to_extents(&rbd_dev->layout, 2547afb97888SIlya Dryomov img_extents[i].fe_off, 2548afb97888SIlya Dryomov img_extents[i].fe_len, 2549afb97888SIlya Dryomov &img_req->object_extents, 2550afb97888SIlya Dryomov alloc_object_extent, img_req, 2551afb97888SIlya Dryomov fctx->count_fn, &fctx->iter); 2552afb97888SIlya Dryomov if (ret) 2553afb97888SIlya Dryomov return ret; 2554afb97888SIlya Dryomov } 2555afb97888SIlya Dryomov 2556afb97888SIlya Dryomov for_each_obj_request(img_req, obj_req) { 2557afb97888SIlya Dryomov obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count, 2558afb97888SIlya Dryomov sizeof(*obj_req->bvec_pos.bvecs), 2559afb97888SIlya Dryomov GFP_NOIO); 2560afb97888SIlya Dryomov if (!obj_req->bvec_pos.bvecs) 2561afb97888SIlya Dryomov return -ENOMEM; 2562afb97888SIlya Dryomov } 2563afb97888SIlya Dryomov 2564afb97888SIlya Dryomov /* 2565afb97888SIlya Dryomov * Fill in each object request's private bio_vec array, splitting and 2566afb97888SIlya Dryomov * rearranging the provided bio_vecs in stripe unit chunks as needed. 2567afb97888SIlya Dryomov */ 2568afb97888SIlya Dryomov fctx->iter = *fctx->pos; 2569afb97888SIlya Dryomov for (i = 0; i < num_img_extents; i++) { 2570afb97888SIlya Dryomov ret = ceph_iterate_extents(&rbd_dev->layout, 2571afb97888SIlya Dryomov img_extents[i].fe_off, 2572afb97888SIlya Dryomov img_extents[i].fe_len, 2573afb97888SIlya Dryomov &img_req->object_extents, 2574afb97888SIlya Dryomov fctx->copy_fn, &fctx->iter); 2575afb97888SIlya Dryomov if (ret) 2576afb97888SIlya Dryomov return ret; 2577afb97888SIlya Dryomov } 2578afb97888SIlya Dryomov 2579afb97888SIlya Dryomov return __rbd_img_fill_request(img_req); 2580afb97888SIlya Dryomov } 2581afb97888SIlya Dryomov 25825a237819SIlya Dryomov static int rbd_img_fill_nodata(struct rbd_img_request *img_req, 25835a237819SIlya Dryomov u64 off, u64 len) 25845a237819SIlya Dryomov { 25855a237819SIlya Dryomov struct ceph_file_extent ex = { off, len }; 2586a55e601bSArnd Bergmann union rbd_img_fill_iter dummy = {}; 25875a237819SIlya Dryomov struct rbd_img_fill_ctx fctx = { 25885a237819SIlya Dryomov .pos_type = OBJ_REQUEST_NODATA, 25895a237819SIlya Dryomov .pos = &dummy, 25905a237819SIlya Dryomov }; 25915a237819SIlya Dryomov 25925a237819SIlya Dryomov return rbd_img_fill_request(img_req, &ex, 1, &fctx); 25935a237819SIlya Dryomov } 25945a237819SIlya Dryomov 25955a237819SIlya Dryomov static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg) 25965a237819SIlya Dryomov { 25975a237819SIlya Dryomov struct rbd_obj_request *obj_req = 25985a237819SIlya Dryomov container_of(ex, struct rbd_obj_request, ex); 25995a237819SIlya Dryomov struct ceph_bio_iter *it = arg; 26005a237819SIlya Dryomov 26015a237819SIlya Dryomov dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes); 26025a237819SIlya Dryomov obj_req->bio_pos = *it; 26035a237819SIlya Dryomov ceph_bio_iter_advance(it, bytes); 26045a237819SIlya Dryomov } 26055a237819SIlya Dryomov 2606afb97888SIlya Dryomov static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg) 2607afb97888SIlya Dryomov { 2608afb97888SIlya Dryomov struct rbd_obj_request *obj_req = 2609afb97888SIlya Dryomov container_of(ex, struct rbd_obj_request, ex); 2610afb97888SIlya Dryomov struct ceph_bio_iter *it = arg; 2611afb97888SIlya Dryomov 2612afb97888SIlya Dryomov dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes); 2613afb97888SIlya Dryomov ceph_bio_iter_advance_step(it, bytes, ({ 2614afb97888SIlya Dryomov obj_req->bvec_count++; 2615afb97888SIlya Dryomov })); 2616afb97888SIlya Dryomov 2617afb97888SIlya Dryomov } 2618afb97888SIlya Dryomov 2619afb97888SIlya Dryomov static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg) 2620afb97888SIlya Dryomov { 2621afb97888SIlya Dryomov struct rbd_obj_request *obj_req = 2622afb97888SIlya Dryomov container_of(ex, struct rbd_obj_request, ex); 2623afb97888SIlya Dryomov struct ceph_bio_iter *it = arg; 2624afb97888SIlya Dryomov 2625afb97888SIlya Dryomov dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes); 2626afb97888SIlya Dryomov ceph_bio_iter_advance_step(it, bytes, ({ 2627afb97888SIlya Dryomov obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv; 2628afb97888SIlya Dryomov obj_req->bvec_pos.iter.bi_size += bv.bv_len; 2629afb97888SIlya Dryomov })); 2630afb97888SIlya Dryomov } 2631afb97888SIlya Dryomov 26325a237819SIlya Dryomov static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req, 26335a237819SIlya Dryomov struct ceph_file_extent *img_extents, 26345a237819SIlya Dryomov u32 num_img_extents, 26355a237819SIlya Dryomov struct ceph_bio_iter *bio_pos) 26365a237819SIlya Dryomov { 26375a237819SIlya Dryomov struct rbd_img_fill_ctx fctx = { 26385a237819SIlya Dryomov .pos_type = OBJ_REQUEST_BIO, 26395a237819SIlya Dryomov .pos = (union rbd_img_fill_iter *)bio_pos, 26405a237819SIlya Dryomov .set_pos_fn = set_bio_pos, 2641afb97888SIlya Dryomov .count_fn = count_bio_bvecs, 2642afb97888SIlya Dryomov .copy_fn = copy_bio_bvecs, 26435a237819SIlya Dryomov }; 26445a237819SIlya Dryomov 26455a237819SIlya Dryomov return rbd_img_fill_request(img_req, img_extents, num_img_extents, 26465a237819SIlya Dryomov &fctx); 26475a237819SIlya Dryomov } 26485a237819SIlya Dryomov 26495a237819SIlya Dryomov static int rbd_img_fill_from_bio(struct rbd_img_request *img_req, 26505a237819SIlya Dryomov u64 off, u64 len, struct bio *bio) 26515a237819SIlya Dryomov { 26525a237819SIlya Dryomov struct ceph_file_extent ex = { off, len }; 26535a237819SIlya Dryomov struct ceph_bio_iter it = { .bio = bio, .iter = bio->bi_iter }; 26545a237819SIlya Dryomov 26555a237819SIlya Dryomov return __rbd_img_fill_from_bio(img_req, &ex, 1, &it); 26565a237819SIlya Dryomov } 26575a237819SIlya Dryomov 26585a237819SIlya Dryomov static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg) 26595a237819SIlya Dryomov { 26605a237819SIlya Dryomov struct rbd_obj_request *obj_req = 26615a237819SIlya Dryomov container_of(ex, struct rbd_obj_request, ex); 26625a237819SIlya Dryomov struct ceph_bvec_iter *it = arg; 26635a237819SIlya Dryomov 26645a237819SIlya Dryomov obj_req->bvec_pos = *it; 26655a237819SIlya Dryomov ceph_bvec_iter_shorten(&obj_req->bvec_pos, bytes); 26665a237819SIlya Dryomov ceph_bvec_iter_advance(it, bytes); 26675a237819SIlya Dryomov } 26685a237819SIlya Dryomov 2669afb97888SIlya Dryomov static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg) 2670afb97888SIlya Dryomov { 2671afb97888SIlya Dryomov struct rbd_obj_request *obj_req = 2672afb97888SIlya Dryomov container_of(ex, struct rbd_obj_request, ex); 2673afb97888SIlya Dryomov struct ceph_bvec_iter *it = arg; 2674afb97888SIlya Dryomov 2675afb97888SIlya Dryomov ceph_bvec_iter_advance_step(it, bytes, ({ 2676afb97888SIlya Dryomov obj_req->bvec_count++; 2677afb97888SIlya Dryomov })); 2678afb97888SIlya Dryomov } 2679afb97888SIlya Dryomov 2680afb97888SIlya Dryomov static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg) 2681afb97888SIlya Dryomov { 2682afb97888SIlya Dryomov struct rbd_obj_request *obj_req = 2683afb97888SIlya Dryomov container_of(ex, struct rbd_obj_request, ex); 2684afb97888SIlya Dryomov struct ceph_bvec_iter *it = arg; 2685afb97888SIlya Dryomov 2686afb97888SIlya Dryomov ceph_bvec_iter_advance_step(it, bytes, ({ 2687afb97888SIlya Dryomov obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv; 2688afb97888SIlya Dryomov obj_req->bvec_pos.iter.bi_size += bv.bv_len; 2689afb97888SIlya Dryomov })); 2690afb97888SIlya Dryomov } 2691afb97888SIlya Dryomov 26925a237819SIlya Dryomov static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req, 26935a237819SIlya Dryomov struct ceph_file_extent *img_extents, 26945a237819SIlya Dryomov u32 num_img_extents, 26955a237819SIlya Dryomov struct ceph_bvec_iter *bvec_pos) 26965a237819SIlya Dryomov { 26975a237819SIlya Dryomov struct rbd_img_fill_ctx fctx = { 26985a237819SIlya Dryomov .pos_type = OBJ_REQUEST_BVECS, 26995a237819SIlya Dryomov .pos = (union rbd_img_fill_iter *)bvec_pos, 27005a237819SIlya Dryomov .set_pos_fn = set_bvec_pos, 2701afb97888SIlya Dryomov .count_fn = count_bvecs, 2702afb97888SIlya Dryomov .copy_fn = copy_bvecs, 27035a237819SIlya Dryomov }; 27045a237819SIlya Dryomov 27055a237819SIlya Dryomov return rbd_img_fill_request(img_req, img_extents, num_img_extents, 27065a237819SIlya Dryomov &fctx); 27075a237819SIlya Dryomov } 27085a237819SIlya Dryomov 27095a237819SIlya Dryomov static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req, 27105a237819SIlya Dryomov struct ceph_file_extent *img_extents, 27115a237819SIlya Dryomov u32 num_img_extents, 27125a237819SIlya Dryomov struct bio_vec *bvecs) 27135a237819SIlya Dryomov { 27145a237819SIlya Dryomov struct ceph_bvec_iter it = { 27155a237819SIlya Dryomov .bvecs = bvecs, 27165a237819SIlya Dryomov .iter = { .bi_size = ceph_file_extents_bytes(img_extents, 27175a237819SIlya Dryomov num_img_extents) }, 27185a237819SIlya Dryomov }; 27195a237819SIlya Dryomov 27205a237819SIlya Dryomov return __rbd_img_fill_from_bvecs(img_req, img_extents, num_img_extents, 27215a237819SIlya Dryomov &it); 27225a237819SIlya Dryomov } 27235a237819SIlya Dryomov 27240192ce2eSIlya Dryomov static void rbd_img_handle_request_work(struct work_struct *work) 2725bf0d5f50SAlex Elder { 27260192ce2eSIlya Dryomov struct rbd_img_request *img_req = 27270192ce2eSIlya Dryomov container_of(work, struct rbd_img_request, work); 2728bf0d5f50SAlex Elder 27290192ce2eSIlya Dryomov rbd_img_handle_request(img_req, img_req->work_result); 27300192ce2eSIlya Dryomov } 2731bf0d5f50SAlex Elder 27320192ce2eSIlya Dryomov static void rbd_img_schedule(struct rbd_img_request *img_req, int result) 27330192ce2eSIlya Dryomov { 27340192ce2eSIlya Dryomov INIT_WORK(&img_req->work, rbd_img_handle_request_work); 27350192ce2eSIlya Dryomov img_req->work_result = result; 27360192ce2eSIlya Dryomov queue_work(rbd_wq, &img_req->work); 2737bf0d5f50SAlex Elder } 2738bf0d5f50SAlex Elder 273922e8bd51SIlya Dryomov static bool rbd_obj_may_exist(struct rbd_obj_request *obj_req) 274022e8bd51SIlya Dryomov { 274122e8bd51SIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 274222e8bd51SIlya Dryomov 274322e8bd51SIlya Dryomov if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno)) { 274422e8bd51SIlya Dryomov obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST; 274522e8bd51SIlya Dryomov return true; 274622e8bd51SIlya Dryomov } 274722e8bd51SIlya Dryomov 274822e8bd51SIlya Dryomov dout("%s %p objno %llu assuming dne\n", __func__, obj_req, 274922e8bd51SIlya Dryomov obj_req->ex.oe_objno); 275022e8bd51SIlya Dryomov return false; 275122e8bd51SIlya Dryomov } 275222e8bd51SIlya Dryomov 275385b5e6d1SIlya Dryomov static int rbd_obj_read_object(struct rbd_obj_request *obj_req) 275485b5e6d1SIlya Dryomov { 2755a086a1b8SIlya Dryomov struct ceph_osd_request *osd_req; 2756a086a1b8SIlya Dryomov int ret; 2757a086a1b8SIlya Dryomov 2758a086a1b8SIlya Dryomov osd_req = __rbd_obj_add_osd_request(obj_req, NULL, 1); 2759a086a1b8SIlya Dryomov if (IS_ERR(osd_req)) 2760a086a1b8SIlya Dryomov return PTR_ERR(osd_req); 2761a086a1b8SIlya Dryomov 2762a086a1b8SIlya Dryomov osd_req_op_extent_init(osd_req, 0, CEPH_OSD_OP_READ, 2763a086a1b8SIlya Dryomov obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0); 2764a086a1b8SIlya Dryomov rbd_osd_setup_data(osd_req, 0); 2765a086a1b8SIlya Dryomov rbd_osd_format_read(osd_req); 2766a086a1b8SIlya Dryomov 2767a086a1b8SIlya Dryomov ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO); 2768a086a1b8SIlya Dryomov if (ret) 2769a086a1b8SIlya Dryomov return ret; 2770a086a1b8SIlya Dryomov 2771a086a1b8SIlya Dryomov rbd_osd_submit(osd_req); 277285b5e6d1SIlya Dryomov return 0; 2773bf0d5f50SAlex Elder } 2774bf0d5f50SAlex Elder 277586bd7998SIlya Dryomov static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req) 27763da691bfSIlya Dryomov { 27773da691bfSIlya Dryomov struct rbd_img_request *img_req = obj_req->img_request; 2778a52cc685SIlya Dryomov struct rbd_device *parent = img_req->rbd_dev->parent; 27793da691bfSIlya Dryomov struct rbd_img_request *child_img_req; 27803da691bfSIlya Dryomov int ret; 27813da691bfSIlya Dryomov 278259e542c8SIlya Dryomov child_img_req = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO); 27833da691bfSIlya Dryomov if (!child_img_req) 27843da691bfSIlya Dryomov return -ENOMEM; 27853da691bfSIlya Dryomov 278659e542c8SIlya Dryomov rbd_img_request_init(child_img_req, parent, OBJ_OP_READ); 2787e93aca0aSIlya Dryomov __set_bit(IMG_REQ_CHILD, &child_img_req->flags); 2788e93aca0aSIlya Dryomov child_img_req->obj_request = obj_req; 2789e93aca0aSIlya Dryomov 2790a52cc685SIlya Dryomov down_read(&parent->header_rwsem); 2791a52cc685SIlya Dryomov rbd_img_capture_header(child_img_req); 2792a52cc685SIlya Dryomov up_read(&parent->header_rwsem); 2793a52cc685SIlya Dryomov 279421ed05a8SIlya Dryomov dout("%s child_img_req %p for obj_req %p\n", __func__, child_img_req, 279521ed05a8SIlya Dryomov obj_req); 279621ed05a8SIlya Dryomov 27973da691bfSIlya Dryomov if (!rbd_img_is_write(img_req)) { 2798ecc633caSIlya Dryomov switch (img_req->data_type) { 27993da691bfSIlya Dryomov case OBJ_REQUEST_BIO: 28005a237819SIlya Dryomov ret = __rbd_img_fill_from_bio(child_img_req, 28015a237819SIlya Dryomov obj_req->img_extents, 28025a237819SIlya Dryomov obj_req->num_img_extents, 28033da691bfSIlya Dryomov &obj_req->bio_pos); 28043da691bfSIlya Dryomov break; 28053da691bfSIlya Dryomov case OBJ_REQUEST_BVECS: 2806afb97888SIlya Dryomov case OBJ_REQUEST_OWN_BVECS: 28075a237819SIlya Dryomov ret = __rbd_img_fill_from_bvecs(child_img_req, 28085a237819SIlya Dryomov obj_req->img_extents, 28095a237819SIlya Dryomov obj_req->num_img_extents, 28103da691bfSIlya Dryomov &obj_req->bvec_pos); 28113da691bfSIlya Dryomov break; 28123da691bfSIlya Dryomov default: 2813d342a15bSArnd Bergmann BUG(); 28143da691bfSIlya Dryomov } 28153da691bfSIlya Dryomov } else { 28165a237819SIlya Dryomov ret = rbd_img_fill_from_bvecs(child_img_req, 28175a237819SIlya Dryomov obj_req->img_extents, 28185a237819SIlya Dryomov obj_req->num_img_extents, 28195a237819SIlya Dryomov obj_req->copyup_bvecs); 28203da691bfSIlya Dryomov } 28213da691bfSIlya Dryomov if (ret) { 2822679a97d2SHannes Reinecke rbd_img_request_destroy(child_img_req); 2823663ae2ccSIlya Dryomov return ret; 2824bf0d5f50SAlex Elder } 2825bf0d5f50SAlex Elder 28260192ce2eSIlya Dryomov /* avoid parent chain recursion */ 28270192ce2eSIlya Dryomov rbd_img_schedule(child_img_req, 0); 28283da691bfSIlya Dryomov return 0; 28293da691bfSIlya Dryomov } 28303da691bfSIlya Dryomov 283185b5e6d1SIlya Dryomov static bool rbd_obj_advance_read(struct rbd_obj_request *obj_req, int *result) 28328b3e1a56SAlex Elder { 28333da691bfSIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 28343da691bfSIlya Dryomov int ret; 28358b3e1a56SAlex Elder 283622e8bd51SIlya Dryomov again: 2837a9b67e69SIlya Dryomov switch (obj_req->read_state) { 283885b5e6d1SIlya Dryomov case RBD_OBJ_READ_START: 283985b5e6d1SIlya Dryomov rbd_assert(!*result); 284085b5e6d1SIlya Dryomov 284122e8bd51SIlya Dryomov if (!rbd_obj_may_exist(obj_req)) { 284222e8bd51SIlya Dryomov *result = -ENOENT; 284322e8bd51SIlya Dryomov obj_req->read_state = RBD_OBJ_READ_OBJECT; 284422e8bd51SIlya Dryomov goto again; 284522e8bd51SIlya Dryomov } 284622e8bd51SIlya Dryomov 284785b5e6d1SIlya Dryomov ret = rbd_obj_read_object(obj_req); 284885b5e6d1SIlya Dryomov if (ret) { 284985b5e6d1SIlya Dryomov *result = ret; 285085b5e6d1SIlya Dryomov return true; 285185b5e6d1SIlya Dryomov } 285285b5e6d1SIlya Dryomov obj_req->read_state = RBD_OBJ_READ_OBJECT; 285385b5e6d1SIlya Dryomov return false; 2854a9b67e69SIlya Dryomov case RBD_OBJ_READ_OBJECT: 2855a9b67e69SIlya Dryomov if (*result == -ENOENT && rbd_dev->parent_overlap) { 285686bd7998SIlya Dryomov /* reverse map this object extent onto the parent */ 285786bd7998SIlya Dryomov ret = rbd_obj_calc_img_extents(obj_req, false); 285886bd7998SIlya Dryomov if (ret) { 285954ab3b24SIlya Dryomov *result = ret; 286086bd7998SIlya Dryomov return true; 286186bd7998SIlya Dryomov } 286286bd7998SIlya Dryomov if (obj_req->num_img_extents) { 286386bd7998SIlya Dryomov ret = rbd_obj_read_from_parent(obj_req); 28643da691bfSIlya Dryomov if (ret) { 286554ab3b24SIlya Dryomov *result = ret; 28663da691bfSIlya Dryomov return true; 28673da691bfSIlya Dryomov } 2868a9b67e69SIlya Dryomov obj_req->read_state = RBD_OBJ_READ_PARENT; 28693da691bfSIlya Dryomov return false; 28703da691bfSIlya Dryomov } 287186bd7998SIlya Dryomov } 287202c74fbaSAlex Elder 287302c74fbaSAlex Elder /* 28743da691bfSIlya Dryomov * -ENOENT means a hole in the image -- zero-fill the entire 28753da691bfSIlya Dryomov * length of the request. A short read also implies zero-fill 287654ab3b24SIlya Dryomov * to the end of the request. 287702c74fbaSAlex Elder */ 287854ab3b24SIlya Dryomov if (*result == -ENOENT) { 287954ab3b24SIlya Dryomov rbd_obj_zero_range(obj_req, 0, obj_req->ex.oe_len); 288054ab3b24SIlya Dryomov *result = 0; 288154ab3b24SIlya Dryomov } else if (*result >= 0) { 288254ab3b24SIlya Dryomov if (*result < obj_req->ex.oe_len) 288354ab3b24SIlya Dryomov rbd_obj_zero_range(obj_req, *result, 288454ab3b24SIlya Dryomov obj_req->ex.oe_len - *result); 288554ab3b24SIlya Dryomov else 288654ab3b24SIlya Dryomov rbd_assert(*result == obj_req->ex.oe_len); 288754ab3b24SIlya Dryomov *result = 0; 28883da691bfSIlya Dryomov } 28893da691bfSIlya Dryomov return true; 2890a9b67e69SIlya Dryomov case RBD_OBJ_READ_PARENT: 2891d435c9a7SIlya Dryomov /* 2892d435c9a7SIlya Dryomov * The parent image is read only up to the overlap -- zero-fill 2893d435c9a7SIlya Dryomov * from the overlap to the end of the request. 2894d435c9a7SIlya Dryomov */ 2895d435c9a7SIlya Dryomov if (!*result) { 2896d435c9a7SIlya Dryomov u32 obj_overlap = rbd_obj_img_extents_bytes(obj_req); 2897d435c9a7SIlya Dryomov 2898d435c9a7SIlya Dryomov if (obj_overlap < obj_req->ex.oe_len) 2899d435c9a7SIlya Dryomov rbd_obj_zero_range(obj_req, obj_overlap, 2900d435c9a7SIlya Dryomov obj_req->ex.oe_len - obj_overlap); 2901d435c9a7SIlya Dryomov } 2902a9b67e69SIlya Dryomov return true; 2903a9b67e69SIlya Dryomov default: 2904a9b67e69SIlya Dryomov BUG(); 2905a9b67e69SIlya Dryomov } 29063da691bfSIlya Dryomov } 29073da691bfSIlya Dryomov 290822e8bd51SIlya Dryomov static bool rbd_obj_write_is_noop(struct rbd_obj_request *obj_req) 290922e8bd51SIlya Dryomov { 291022e8bd51SIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 291122e8bd51SIlya Dryomov 291222e8bd51SIlya Dryomov if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno)) 291322e8bd51SIlya Dryomov obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST; 291422e8bd51SIlya Dryomov 291522e8bd51SIlya Dryomov if (!(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST) && 291622e8bd51SIlya Dryomov (obj_req->flags & RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT)) { 291722e8bd51SIlya Dryomov dout("%s %p noop for nonexistent\n", __func__, obj_req); 29183da691bfSIlya Dryomov return true; 29193da691bfSIlya Dryomov } 29203da691bfSIlya Dryomov 292122e8bd51SIlya Dryomov return false; 292222e8bd51SIlya Dryomov } 292322e8bd51SIlya Dryomov 292422e8bd51SIlya Dryomov /* 292522e8bd51SIlya Dryomov * Return: 292622e8bd51SIlya Dryomov * 0 - object map update sent 292722e8bd51SIlya Dryomov * 1 - object map update isn't needed 292822e8bd51SIlya Dryomov * <0 - error 292922e8bd51SIlya Dryomov */ 293022e8bd51SIlya Dryomov static int rbd_obj_write_pre_object_map(struct rbd_obj_request *obj_req) 293122e8bd51SIlya Dryomov { 293222e8bd51SIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 293322e8bd51SIlya Dryomov u8 new_state; 293422e8bd51SIlya Dryomov 293522e8bd51SIlya Dryomov if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)) 293622e8bd51SIlya Dryomov return 1; 293722e8bd51SIlya Dryomov 293822e8bd51SIlya Dryomov if (obj_req->flags & RBD_OBJ_FLAG_DELETION) 293922e8bd51SIlya Dryomov new_state = OBJECT_PENDING; 294022e8bd51SIlya Dryomov else 294122e8bd51SIlya Dryomov new_state = OBJECT_EXISTS; 294222e8bd51SIlya Dryomov 294322e8bd51SIlya Dryomov return rbd_object_map_update(obj_req, CEPH_NOSNAP, new_state, NULL); 294422e8bd51SIlya Dryomov } 294522e8bd51SIlya Dryomov 294685b5e6d1SIlya Dryomov static int rbd_obj_write_object(struct rbd_obj_request *obj_req) 294785b5e6d1SIlya Dryomov { 2948a086a1b8SIlya Dryomov struct ceph_osd_request *osd_req; 2949a086a1b8SIlya Dryomov int num_ops = count_write_ops(obj_req); 2950a086a1b8SIlya Dryomov int which = 0; 2951a086a1b8SIlya Dryomov int ret; 2952a086a1b8SIlya Dryomov 2953a086a1b8SIlya Dryomov if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) 2954a086a1b8SIlya Dryomov num_ops++; /* stat */ 2955a086a1b8SIlya Dryomov 2956a086a1b8SIlya Dryomov osd_req = rbd_obj_add_osd_request(obj_req, num_ops); 2957a086a1b8SIlya Dryomov if (IS_ERR(osd_req)) 2958a086a1b8SIlya Dryomov return PTR_ERR(osd_req); 2959a086a1b8SIlya Dryomov 2960a086a1b8SIlya Dryomov if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) { 2961a086a1b8SIlya Dryomov ret = rbd_osd_setup_stat(osd_req, which++); 2962a086a1b8SIlya Dryomov if (ret) 2963a086a1b8SIlya Dryomov return ret; 2964a086a1b8SIlya Dryomov } 2965a086a1b8SIlya Dryomov 2966a086a1b8SIlya Dryomov rbd_osd_setup_write_ops(osd_req, which); 2967a086a1b8SIlya Dryomov rbd_osd_format_write(osd_req); 2968a086a1b8SIlya Dryomov 2969a086a1b8SIlya Dryomov ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO); 2970a086a1b8SIlya Dryomov if (ret) 2971a086a1b8SIlya Dryomov return ret; 2972a086a1b8SIlya Dryomov 2973a086a1b8SIlya Dryomov rbd_osd_submit(osd_req); 297485b5e6d1SIlya Dryomov return 0; 297585b5e6d1SIlya Dryomov } 297685b5e6d1SIlya Dryomov 29773da691bfSIlya Dryomov /* 29783da691bfSIlya Dryomov * copyup_bvecs pages are never highmem pages 29793da691bfSIlya Dryomov */ 29803da691bfSIlya Dryomov static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes) 29813da691bfSIlya Dryomov { 29823da691bfSIlya Dryomov struct ceph_bvec_iter it = { 29833da691bfSIlya Dryomov .bvecs = bvecs, 29843da691bfSIlya Dryomov .iter = { .bi_size = bytes }, 29853da691bfSIlya Dryomov }; 29863da691bfSIlya Dryomov 29873da691bfSIlya Dryomov ceph_bvec_iter_advance_step(&it, bytes, ({ 2988cf58b537SChristoph Hellwig if (memchr_inv(bvec_virt(&bv), 0, bv.bv_len)) 29893da691bfSIlya Dryomov return false; 29903da691bfSIlya Dryomov })); 29913da691bfSIlya Dryomov return true; 29923da691bfSIlya Dryomov } 29933da691bfSIlya Dryomov 29943a482501SIlya Dryomov #define MODS_ONLY U32_MAX 29953a482501SIlya Dryomov 2996793333a3SIlya Dryomov static int rbd_obj_copyup_empty_snapc(struct rbd_obj_request *obj_req, 299789a59c1cSIlya Dryomov u32 bytes) 29983da691bfSIlya Dryomov { 2999bcbab1dbSIlya Dryomov struct ceph_osd_request *osd_req; 3000fe943d50SChengguang Xu int ret; 30013da691bfSIlya Dryomov 30023da691bfSIlya Dryomov dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes); 300389a59c1cSIlya Dryomov rbd_assert(bytes > 0 && bytes != MODS_ONLY); 30043da691bfSIlya Dryomov 3005bcbab1dbSIlya Dryomov osd_req = __rbd_obj_add_osd_request(obj_req, &rbd_empty_snapc, 1); 3006bcbab1dbSIlya Dryomov if (IS_ERR(osd_req)) 3007bcbab1dbSIlya Dryomov return PTR_ERR(osd_req); 30083da691bfSIlya Dryomov 3009b5ae8cbcSIlya Dryomov ret = rbd_osd_setup_copyup(osd_req, 0, bytes); 3010fe943d50SChengguang Xu if (ret) 3011fe943d50SChengguang Xu return ret; 3012fe943d50SChengguang Xu 3013bcbab1dbSIlya Dryomov rbd_osd_format_write(osd_req); 30143da691bfSIlya Dryomov 3015bcbab1dbSIlya Dryomov ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO); 301689a59c1cSIlya Dryomov if (ret) 301789a59c1cSIlya Dryomov return ret; 301889a59c1cSIlya Dryomov 3019a086a1b8SIlya Dryomov rbd_osd_submit(osd_req); 302089a59c1cSIlya Dryomov return 0; 302189a59c1cSIlya Dryomov } 302289a59c1cSIlya Dryomov 3023793333a3SIlya Dryomov static int rbd_obj_copyup_current_snapc(struct rbd_obj_request *obj_req, 3024793333a3SIlya Dryomov u32 bytes) 30253da691bfSIlya Dryomov { 3026bcbab1dbSIlya Dryomov struct ceph_osd_request *osd_req; 3027a086a1b8SIlya Dryomov int num_ops = count_write_ops(obj_req); 3028a086a1b8SIlya Dryomov int which = 0; 30293da691bfSIlya Dryomov int ret; 30303da691bfSIlya Dryomov 30313da691bfSIlya Dryomov dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes); 30323da691bfSIlya Dryomov 3033a086a1b8SIlya Dryomov if (bytes != MODS_ONLY) 3034a086a1b8SIlya Dryomov num_ops++; /* copyup */ 303513488d53SIlya Dryomov 3036a086a1b8SIlya Dryomov osd_req = rbd_obj_add_osd_request(obj_req, num_ops); 3037bcbab1dbSIlya Dryomov if (IS_ERR(osd_req)) 3038bcbab1dbSIlya Dryomov return PTR_ERR(osd_req); 30393da691bfSIlya Dryomov 30403a482501SIlya Dryomov if (bytes != MODS_ONLY) { 3041b5ae8cbcSIlya Dryomov ret = rbd_osd_setup_copyup(osd_req, which++, bytes); 30423da691bfSIlya Dryomov if (ret) 30433da691bfSIlya Dryomov return ret; 30443a482501SIlya Dryomov } 30453da691bfSIlya Dryomov 3046a086a1b8SIlya Dryomov rbd_osd_setup_write_ops(osd_req, which); 3047a086a1b8SIlya Dryomov rbd_osd_format_write(osd_req); 30483da691bfSIlya Dryomov 3049bcbab1dbSIlya Dryomov ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO); 30503da691bfSIlya Dryomov if (ret) 30513da691bfSIlya Dryomov return ret; 30523da691bfSIlya Dryomov 3053a086a1b8SIlya Dryomov rbd_osd_submit(osd_req); 30543da691bfSIlya Dryomov return 0; 30553da691bfSIlya Dryomov } 30563da691bfSIlya Dryomov 30577e07efb1SIlya Dryomov static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap) 30587e07efb1SIlya Dryomov { 30597e07efb1SIlya Dryomov u32 i; 30607e07efb1SIlya Dryomov 30617e07efb1SIlya Dryomov rbd_assert(!obj_req->copyup_bvecs); 30627e07efb1SIlya Dryomov obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap); 30637e07efb1SIlya Dryomov obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count, 30647e07efb1SIlya Dryomov sizeof(*obj_req->copyup_bvecs), 30657e07efb1SIlya Dryomov GFP_NOIO); 30667e07efb1SIlya Dryomov if (!obj_req->copyup_bvecs) 30677e07efb1SIlya Dryomov return -ENOMEM; 30687e07efb1SIlya Dryomov 30697e07efb1SIlya Dryomov for (i = 0; i < obj_req->copyup_bvec_count; i++) { 30707e07efb1SIlya Dryomov unsigned int len = min(obj_overlap, (u64)PAGE_SIZE); 30717e07efb1SIlya Dryomov 30727e07efb1SIlya Dryomov obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO); 30737e07efb1SIlya Dryomov if (!obj_req->copyup_bvecs[i].bv_page) 30747e07efb1SIlya Dryomov return -ENOMEM; 30757e07efb1SIlya Dryomov 30767e07efb1SIlya Dryomov obj_req->copyup_bvecs[i].bv_offset = 0; 30777e07efb1SIlya Dryomov obj_req->copyup_bvecs[i].bv_len = len; 30787e07efb1SIlya Dryomov obj_overlap -= len; 30797e07efb1SIlya Dryomov } 30807e07efb1SIlya Dryomov 30817e07efb1SIlya Dryomov rbd_assert(!obj_overlap); 30827e07efb1SIlya Dryomov return 0; 30837e07efb1SIlya Dryomov } 30847e07efb1SIlya Dryomov 30850ad5d953SIlya Dryomov /* 30860ad5d953SIlya Dryomov * The target object doesn't exist. Read the data for the entire 30870ad5d953SIlya Dryomov * target object up to the overlap point (if any) from the parent, 30880ad5d953SIlya Dryomov * so we can use it for a copyup. 30890ad5d953SIlya Dryomov */ 3090793333a3SIlya Dryomov static int rbd_obj_copyup_read_parent(struct rbd_obj_request *obj_req) 30913da691bfSIlya Dryomov { 30923da691bfSIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 30933da691bfSIlya Dryomov int ret; 30943da691bfSIlya Dryomov 309586bd7998SIlya Dryomov rbd_assert(obj_req->num_img_extents); 309686bd7998SIlya Dryomov prune_extents(obj_req->img_extents, &obj_req->num_img_extents, 309786bd7998SIlya Dryomov rbd_dev->parent_overlap); 309886bd7998SIlya Dryomov if (!obj_req->num_img_extents) { 30993da691bfSIlya Dryomov /* 31003da691bfSIlya Dryomov * The overlap has become 0 (most likely because the 31013a482501SIlya Dryomov * image has been flattened). Re-submit the original write 31023a482501SIlya Dryomov * request -- pass MODS_ONLY since the copyup isn't needed 31033a482501SIlya Dryomov * anymore. 31043da691bfSIlya Dryomov */ 3105793333a3SIlya Dryomov return rbd_obj_copyup_current_snapc(obj_req, MODS_ONLY); 31063da691bfSIlya Dryomov } 31073da691bfSIlya Dryomov 310886bd7998SIlya Dryomov ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req)); 31093da691bfSIlya Dryomov if (ret) 31103da691bfSIlya Dryomov return ret; 31113da691bfSIlya Dryomov 311286bd7998SIlya Dryomov return rbd_obj_read_from_parent(obj_req); 31133da691bfSIlya Dryomov } 31143da691bfSIlya Dryomov 311522e8bd51SIlya Dryomov static void rbd_obj_copyup_object_maps(struct rbd_obj_request *obj_req) 31163da691bfSIlya Dryomov { 311722e8bd51SIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 311822e8bd51SIlya Dryomov struct ceph_snap_context *snapc = obj_req->img_request->snapc; 311922e8bd51SIlya Dryomov u8 new_state; 312022e8bd51SIlya Dryomov u32 i; 31213da691bfSIlya Dryomov int ret; 31223da691bfSIlya Dryomov 312322e8bd51SIlya Dryomov rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending); 31243da691bfSIlya Dryomov 312522e8bd51SIlya Dryomov if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)) 312622e8bd51SIlya Dryomov return; 312789a59c1cSIlya Dryomov 312822e8bd51SIlya Dryomov if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS) 312922e8bd51SIlya Dryomov return; 31303da691bfSIlya Dryomov 313122e8bd51SIlya Dryomov for (i = 0; i < snapc->num_snaps; i++) { 313222e8bd51SIlya Dryomov if ((rbd_dev->header.features & RBD_FEATURE_FAST_DIFF) && 313322e8bd51SIlya Dryomov i + 1 < snapc->num_snaps) 313422e8bd51SIlya Dryomov new_state = OBJECT_EXISTS_CLEAN; 313522e8bd51SIlya Dryomov else 313622e8bd51SIlya Dryomov new_state = OBJECT_EXISTS; 31373da691bfSIlya Dryomov 313822e8bd51SIlya Dryomov ret = rbd_object_map_update(obj_req, snapc->snaps[i], 313922e8bd51SIlya Dryomov new_state, NULL); 314022e8bd51SIlya Dryomov if (ret < 0) { 314122e8bd51SIlya Dryomov obj_req->pending.result = ret; 314202c74fbaSAlex Elder return; 314302c74fbaSAlex Elder } 314402c74fbaSAlex Elder 314522e8bd51SIlya Dryomov rbd_assert(!ret); 314622e8bd51SIlya Dryomov obj_req->pending.num_pending++; 3147a9e8ba2cSAlex Elder } 31488b3e1a56SAlex Elder } 31498b3e1a56SAlex Elder 3150793333a3SIlya Dryomov static void rbd_obj_copyup_write_object(struct rbd_obj_request *obj_req) 31518b3e1a56SAlex Elder { 3152793333a3SIlya Dryomov u32 bytes = rbd_obj_img_extents_bytes(obj_req); 3153793333a3SIlya Dryomov int ret; 31548b3e1a56SAlex Elder 3155793333a3SIlya Dryomov rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending); 31568b3e1a56SAlex Elder 3157793333a3SIlya Dryomov /* 3158793333a3SIlya Dryomov * Only send non-zero copyup data to save some I/O and network 3159793333a3SIlya Dryomov * bandwidth -- zero copyup data is equivalent to the object not 3160793333a3SIlya Dryomov * existing. 3161793333a3SIlya Dryomov */ 3162793333a3SIlya Dryomov if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS) 3163793333a3SIlya Dryomov bytes = 0; 3164793333a3SIlya Dryomov 3165793333a3SIlya Dryomov if (obj_req->img_request->snapc->num_snaps && bytes > 0) { 3166793333a3SIlya Dryomov /* 3167793333a3SIlya Dryomov * Send a copyup request with an empty snapshot context to 3168793333a3SIlya Dryomov * deep-copyup the object through all existing snapshots. 3169793333a3SIlya Dryomov * A second request with the current snapshot context will be 3170793333a3SIlya Dryomov * sent for the actual modification. 3171793333a3SIlya Dryomov */ 3172793333a3SIlya Dryomov ret = rbd_obj_copyup_empty_snapc(obj_req, bytes); 3173793333a3SIlya Dryomov if (ret) { 3174793333a3SIlya Dryomov obj_req->pending.result = ret; 3175793333a3SIlya Dryomov return; 31767114edacSIlya Dryomov } 31778b3e1a56SAlex Elder 3178793333a3SIlya Dryomov obj_req->pending.num_pending++; 3179793333a3SIlya Dryomov bytes = MODS_ONLY; 31803da691bfSIlya Dryomov } 31818b3e1a56SAlex Elder 3182793333a3SIlya Dryomov ret = rbd_obj_copyup_current_snapc(obj_req, bytes); 3183793333a3SIlya Dryomov if (ret) { 3184793333a3SIlya Dryomov obj_req->pending.result = ret; 3185793333a3SIlya Dryomov return; 3186793333a3SIlya Dryomov } 3187793333a3SIlya Dryomov 3188793333a3SIlya Dryomov obj_req->pending.num_pending++; 3189793333a3SIlya Dryomov } 3190793333a3SIlya Dryomov 3191793333a3SIlya Dryomov static bool rbd_obj_advance_copyup(struct rbd_obj_request *obj_req, int *result) 31923da691bfSIlya Dryomov { 319322e8bd51SIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 3194793333a3SIlya Dryomov int ret; 31957114edacSIlya Dryomov 31967114edacSIlya Dryomov again: 3197793333a3SIlya Dryomov switch (obj_req->copyup_state) { 3198793333a3SIlya Dryomov case RBD_OBJ_COPYUP_START: 3199793333a3SIlya Dryomov rbd_assert(!*result); 32003da691bfSIlya Dryomov 3201793333a3SIlya Dryomov ret = rbd_obj_copyup_read_parent(obj_req); 3202793333a3SIlya Dryomov if (ret) { 3203793333a3SIlya Dryomov *result = ret; 3204793333a3SIlya Dryomov return true; 3205793333a3SIlya Dryomov } 3206793333a3SIlya Dryomov if (obj_req->num_img_extents) 3207793333a3SIlya Dryomov obj_req->copyup_state = RBD_OBJ_COPYUP_READ_PARENT; 3208793333a3SIlya Dryomov else 3209793333a3SIlya Dryomov obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT; 3210793333a3SIlya Dryomov return false; 3211793333a3SIlya Dryomov case RBD_OBJ_COPYUP_READ_PARENT: 3212793333a3SIlya Dryomov if (*result) 3213793333a3SIlya Dryomov return true; 3214793333a3SIlya Dryomov 3215793333a3SIlya Dryomov if (is_zero_bvecs(obj_req->copyup_bvecs, 3216793333a3SIlya Dryomov rbd_obj_img_extents_bytes(obj_req))) { 3217793333a3SIlya Dryomov dout("%s %p detected zeros\n", __func__, obj_req); 3218793333a3SIlya Dryomov obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ZEROS; 32197114edacSIlya Dryomov } 32207114edacSIlya Dryomov 322122e8bd51SIlya Dryomov rbd_obj_copyup_object_maps(obj_req); 322222e8bd51SIlya Dryomov if (!obj_req->pending.num_pending) { 322322e8bd51SIlya Dryomov *result = obj_req->pending.result; 322422e8bd51SIlya Dryomov obj_req->copyup_state = RBD_OBJ_COPYUP_OBJECT_MAPS; 32257114edacSIlya Dryomov goto again; 32267114edacSIlya Dryomov } 322722e8bd51SIlya Dryomov obj_req->copyup_state = __RBD_OBJ_COPYUP_OBJECT_MAPS; 322822e8bd51SIlya Dryomov return false; 322922e8bd51SIlya Dryomov case __RBD_OBJ_COPYUP_OBJECT_MAPS: 323022e8bd51SIlya Dryomov if (!pending_result_dec(&obj_req->pending, result)) 323122e8bd51SIlya Dryomov return false; 3232df561f66SGustavo A. R. Silva fallthrough; 323322e8bd51SIlya Dryomov case RBD_OBJ_COPYUP_OBJECT_MAPS: 323422e8bd51SIlya Dryomov if (*result) { 323522e8bd51SIlya Dryomov rbd_warn(rbd_dev, "snap object map update failed: %d", 323622e8bd51SIlya Dryomov *result); 323722e8bd51SIlya Dryomov return true; 323822e8bd51SIlya Dryomov } 323922e8bd51SIlya Dryomov 3240793333a3SIlya Dryomov rbd_obj_copyup_write_object(obj_req); 3241793333a3SIlya Dryomov if (!obj_req->pending.num_pending) { 3242793333a3SIlya Dryomov *result = obj_req->pending.result; 3243793333a3SIlya Dryomov obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT; 3244793333a3SIlya Dryomov goto again; 3245793333a3SIlya Dryomov } 3246793333a3SIlya Dryomov obj_req->copyup_state = __RBD_OBJ_COPYUP_WRITE_OBJECT; 3247793333a3SIlya Dryomov return false; 3248793333a3SIlya Dryomov case __RBD_OBJ_COPYUP_WRITE_OBJECT: 3249793333a3SIlya Dryomov if (!pending_result_dec(&obj_req->pending, result)) 3250793333a3SIlya Dryomov return false; 3251df561f66SGustavo A. R. Silva fallthrough; 3252793333a3SIlya Dryomov case RBD_OBJ_COPYUP_WRITE_OBJECT: 3253793333a3SIlya Dryomov return true; 3254793333a3SIlya Dryomov default: 3255793333a3SIlya Dryomov BUG(); 3256793333a3SIlya Dryomov } 3257793333a3SIlya Dryomov } 3258793333a3SIlya Dryomov 325922e8bd51SIlya Dryomov /* 326022e8bd51SIlya Dryomov * Return: 326122e8bd51SIlya Dryomov * 0 - object map update sent 326222e8bd51SIlya Dryomov * 1 - object map update isn't needed 326322e8bd51SIlya Dryomov * <0 - error 326422e8bd51SIlya Dryomov */ 326522e8bd51SIlya Dryomov static int rbd_obj_write_post_object_map(struct rbd_obj_request *obj_req) 326622e8bd51SIlya Dryomov { 326722e8bd51SIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 326822e8bd51SIlya Dryomov u8 current_state = OBJECT_PENDING; 326922e8bd51SIlya Dryomov 327022e8bd51SIlya Dryomov if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)) 327122e8bd51SIlya Dryomov return 1; 327222e8bd51SIlya Dryomov 327322e8bd51SIlya Dryomov if (!(obj_req->flags & RBD_OBJ_FLAG_DELETION)) 327422e8bd51SIlya Dryomov return 1; 327522e8bd51SIlya Dryomov 327622e8bd51SIlya Dryomov return rbd_object_map_update(obj_req, CEPH_NOSNAP, OBJECT_NONEXISTENT, 327722e8bd51SIlya Dryomov ¤t_state); 327822e8bd51SIlya Dryomov } 327922e8bd51SIlya Dryomov 328085b5e6d1SIlya Dryomov static bool rbd_obj_advance_write(struct rbd_obj_request *obj_req, int *result) 3281b8d70035SAlex Elder { 3282793333a3SIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 3283b8d70035SAlex Elder int ret; 3284b8d70035SAlex Elder 3285793333a3SIlya Dryomov again: 3286cf81b60eSAlex Elder switch (obj_req->write_state) { 328785b5e6d1SIlya Dryomov case RBD_OBJ_WRITE_START: 328885b5e6d1SIlya Dryomov rbd_assert(!*result); 328985b5e6d1SIlya Dryomov 329022e8bd51SIlya Dryomov if (rbd_obj_write_is_noop(obj_req)) 329122e8bd51SIlya Dryomov return true; 329222e8bd51SIlya Dryomov 329322e8bd51SIlya Dryomov ret = rbd_obj_write_pre_object_map(obj_req); 329422e8bd51SIlya Dryomov if (ret < 0) { 329522e8bd51SIlya Dryomov *result = ret; 329622e8bd51SIlya Dryomov return true; 329722e8bd51SIlya Dryomov } 329822e8bd51SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_PRE_OBJECT_MAP; 329922e8bd51SIlya Dryomov if (ret > 0) 330022e8bd51SIlya Dryomov goto again; 330122e8bd51SIlya Dryomov return false; 330222e8bd51SIlya Dryomov case RBD_OBJ_WRITE_PRE_OBJECT_MAP: 330322e8bd51SIlya Dryomov if (*result) { 330422e8bd51SIlya Dryomov rbd_warn(rbd_dev, "pre object map update failed: %d", 330522e8bd51SIlya Dryomov *result); 330622e8bd51SIlya Dryomov return true; 330722e8bd51SIlya Dryomov } 330885b5e6d1SIlya Dryomov ret = rbd_obj_write_object(obj_req); 330985b5e6d1SIlya Dryomov if (ret) { 331085b5e6d1SIlya Dryomov *result = ret; 331185b5e6d1SIlya Dryomov return true; 331285b5e6d1SIlya Dryomov } 331385b5e6d1SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_OBJECT; 331485b5e6d1SIlya Dryomov return false; 33150ad5d953SIlya Dryomov case RBD_OBJ_WRITE_OBJECT: 331654ab3b24SIlya Dryomov if (*result == -ENOENT) { 33170ad5d953SIlya Dryomov if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) { 3318793333a3SIlya Dryomov *result = 0; 3319793333a3SIlya Dryomov obj_req->copyup_state = RBD_OBJ_COPYUP_START; 3320793333a3SIlya Dryomov obj_req->write_state = __RBD_OBJ_WRITE_COPYUP; 3321793333a3SIlya Dryomov goto again; 3322b8d70035SAlex Elder } 33230ad5d953SIlya Dryomov /* 33240ad5d953SIlya Dryomov * On a non-existent object: 33250ad5d953SIlya Dryomov * delete - -ENOENT, truncate/zero - 0 33260ad5d953SIlya Dryomov */ 33270ad5d953SIlya Dryomov if (obj_req->flags & RBD_OBJ_FLAG_DELETION) 33280ad5d953SIlya Dryomov *result = 0; 33290ad5d953SIlya Dryomov } 3330793333a3SIlya Dryomov if (*result) 3331793333a3SIlya Dryomov return true; 3332793333a3SIlya Dryomov 3333793333a3SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_COPYUP; 3334793333a3SIlya Dryomov goto again; 3335793333a3SIlya Dryomov case __RBD_OBJ_WRITE_COPYUP: 3336793333a3SIlya Dryomov if (!rbd_obj_advance_copyup(obj_req, result)) 3337793333a3SIlya Dryomov return false; 3338df561f66SGustavo A. R. Silva fallthrough; 3339793333a3SIlya Dryomov case RBD_OBJ_WRITE_COPYUP: 334022e8bd51SIlya Dryomov if (*result) { 3341793333a3SIlya Dryomov rbd_warn(rbd_dev, "copyup failed: %d", *result); 3342b8d70035SAlex Elder return true; 334322e8bd51SIlya Dryomov } 334422e8bd51SIlya Dryomov ret = rbd_obj_write_post_object_map(obj_req); 334522e8bd51SIlya Dryomov if (ret < 0) { 334622e8bd51SIlya Dryomov *result = ret; 334722e8bd51SIlya Dryomov return true; 334822e8bd51SIlya Dryomov } 334922e8bd51SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_POST_OBJECT_MAP; 335022e8bd51SIlya Dryomov if (ret > 0) 335122e8bd51SIlya Dryomov goto again; 335222e8bd51SIlya Dryomov return false; 335322e8bd51SIlya Dryomov case RBD_OBJ_WRITE_POST_OBJECT_MAP: 335422e8bd51SIlya Dryomov if (*result) 335522e8bd51SIlya Dryomov rbd_warn(rbd_dev, "post object map update failed: %d", 335622e8bd51SIlya Dryomov *result); 335722e8bd51SIlya Dryomov return true; 3358b8d70035SAlex Elder default: 3359b8d70035SAlex Elder BUG(); 3360b8d70035SAlex Elder } 3361b8d70035SAlex Elder } 3362b8d70035SAlex Elder 3363b8d70035SAlex Elder /* 33640ad5d953SIlya Dryomov * Return true if @obj_req is completed. 3365b8d70035SAlex Elder */ 336654ab3b24SIlya Dryomov static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req, 336754ab3b24SIlya Dryomov int *result) 3368b8d70035SAlex Elder { 33690ad5d953SIlya Dryomov struct rbd_img_request *img_req = obj_req->img_request; 33700192ce2eSIlya Dryomov struct rbd_device *rbd_dev = img_req->rbd_dev; 33710ad5d953SIlya Dryomov bool done; 33720ad5d953SIlya Dryomov 337385b5e6d1SIlya Dryomov mutex_lock(&obj_req->state_mutex); 33740ad5d953SIlya Dryomov if (!rbd_img_is_write(img_req)) 337585b5e6d1SIlya Dryomov done = rbd_obj_advance_read(obj_req, result); 33760ad5d953SIlya Dryomov else 337785b5e6d1SIlya Dryomov done = rbd_obj_advance_write(obj_req, result); 337885b5e6d1SIlya Dryomov mutex_unlock(&obj_req->state_mutex); 33790ad5d953SIlya Dryomov 33800192ce2eSIlya Dryomov if (done && *result) { 33810192ce2eSIlya Dryomov rbd_assert(*result < 0); 33820192ce2eSIlya Dryomov rbd_warn(rbd_dev, "%s at objno %llu %llu~%llu result %d", 33830192ce2eSIlya Dryomov obj_op_name(img_req->op_type), obj_req->ex.oe_objno, 33840192ce2eSIlya Dryomov obj_req->ex.oe_off, obj_req->ex.oe_len, *result); 33850192ce2eSIlya Dryomov } 33860ad5d953SIlya Dryomov return done; 33879969ebc5SAlex Elder } 33889969ebc5SAlex Elder 33890192ce2eSIlya Dryomov /* 33900192ce2eSIlya Dryomov * This is open-coded in rbd_img_handle_request() to avoid parent chain 33910192ce2eSIlya Dryomov * recursion. 33920192ce2eSIlya Dryomov */ 339354ab3b24SIlya Dryomov static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result) 33949969ebc5SAlex Elder { 33950192ce2eSIlya Dryomov if (__rbd_obj_handle_request(obj_req, &result)) 33960192ce2eSIlya Dryomov rbd_img_handle_request(obj_req->img_request, result); 33977114edacSIlya Dryomov } 33987114edacSIlya Dryomov 3399e1fddc8fSIlya Dryomov static bool need_exclusive_lock(struct rbd_img_request *img_req) 3400e1fddc8fSIlya Dryomov { 3401e1fddc8fSIlya Dryomov struct rbd_device *rbd_dev = img_req->rbd_dev; 3402e1fddc8fSIlya Dryomov 3403e1fddc8fSIlya Dryomov if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) 3404e1fddc8fSIlya Dryomov return false; 3405e1fddc8fSIlya Dryomov 34063fe69921SIlya Dryomov if (rbd_is_ro(rbd_dev)) 3407e1fddc8fSIlya Dryomov return false; 3408e1fddc8fSIlya Dryomov 3409e1fddc8fSIlya Dryomov rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags)); 341022e8bd51SIlya Dryomov if (rbd_dev->opts->lock_on_read || 341122e8bd51SIlya Dryomov (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)) 3412e1fddc8fSIlya Dryomov return true; 3413e1fddc8fSIlya Dryomov 3414e1fddc8fSIlya Dryomov return rbd_img_is_write(img_req); 3415e1fddc8fSIlya Dryomov } 3416e1fddc8fSIlya Dryomov 3417637cd060SIlya Dryomov static bool rbd_lock_add_request(struct rbd_img_request *img_req) 3418e1fddc8fSIlya Dryomov { 3419e1fddc8fSIlya Dryomov struct rbd_device *rbd_dev = img_req->rbd_dev; 3420637cd060SIlya Dryomov bool locked; 3421e1fddc8fSIlya Dryomov 3422e1fddc8fSIlya Dryomov lockdep_assert_held(&rbd_dev->lock_rwsem); 3423637cd060SIlya Dryomov locked = rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED; 3424e1fddc8fSIlya Dryomov spin_lock(&rbd_dev->lock_lists_lock); 3425e1fddc8fSIlya Dryomov rbd_assert(list_empty(&img_req->lock_item)); 3426637cd060SIlya Dryomov if (!locked) 3427637cd060SIlya Dryomov list_add_tail(&img_req->lock_item, &rbd_dev->acquiring_list); 3428637cd060SIlya Dryomov else 3429e1fddc8fSIlya Dryomov list_add_tail(&img_req->lock_item, &rbd_dev->running_list); 3430e1fddc8fSIlya Dryomov spin_unlock(&rbd_dev->lock_lists_lock); 3431637cd060SIlya Dryomov return locked; 3432e1fddc8fSIlya Dryomov } 3433e1fddc8fSIlya Dryomov 3434e1fddc8fSIlya Dryomov static void rbd_lock_del_request(struct rbd_img_request *img_req) 3435e1fddc8fSIlya Dryomov { 3436e1fddc8fSIlya Dryomov struct rbd_device *rbd_dev = img_req->rbd_dev; 3437e1fddc8fSIlya Dryomov bool need_wakeup; 3438e1fddc8fSIlya Dryomov 3439e1fddc8fSIlya Dryomov lockdep_assert_held(&rbd_dev->lock_rwsem); 3440e1fddc8fSIlya Dryomov spin_lock(&rbd_dev->lock_lists_lock); 3441e1fddc8fSIlya Dryomov rbd_assert(!list_empty(&img_req->lock_item)); 3442e1fddc8fSIlya Dryomov list_del_init(&img_req->lock_item); 3443e1fddc8fSIlya Dryomov need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING && 3444e1fddc8fSIlya Dryomov list_empty(&rbd_dev->running_list)); 3445e1fddc8fSIlya Dryomov spin_unlock(&rbd_dev->lock_lists_lock); 3446e1fddc8fSIlya Dryomov if (need_wakeup) 3447e1fddc8fSIlya Dryomov complete(&rbd_dev->releasing_wait); 3448e1fddc8fSIlya Dryomov } 3449e1fddc8fSIlya Dryomov 3450637cd060SIlya Dryomov static int rbd_img_exclusive_lock(struct rbd_img_request *img_req) 3451637cd060SIlya Dryomov { 3452637cd060SIlya Dryomov struct rbd_device *rbd_dev = img_req->rbd_dev; 3453637cd060SIlya Dryomov 3454637cd060SIlya Dryomov if (!need_exclusive_lock(img_req)) 3455637cd060SIlya Dryomov return 1; 3456637cd060SIlya Dryomov 3457637cd060SIlya Dryomov if (rbd_lock_add_request(img_req)) 3458637cd060SIlya Dryomov return 1; 3459637cd060SIlya Dryomov 3460637cd060SIlya Dryomov if (rbd_dev->opts->exclusive) { 3461637cd060SIlya Dryomov WARN_ON(1); /* lock got released? */ 3462637cd060SIlya Dryomov return -EROFS; 3463637cd060SIlya Dryomov } 3464637cd060SIlya Dryomov 3465637cd060SIlya Dryomov /* 3466637cd060SIlya Dryomov * Note the use of mod_delayed_work() in rbd_acquire_lock() 3467637cd060SIlya Dryomov * and cancel_delayed_work() in wake_lock_waiters(). 3468637cd060SIlya Dryomov */ 3469637cd060SIlya Dryomov dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev); 3470637cd060SIlya Dryomov queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0); 3471637cd060SIlya Dryomov return 0; 3472637cd060SIlya Dryomov } 3473637cd060SIlya Dryomov 34740192ce2eSIlya Dryomov static void rbd_img_object_requests(struct rbd_img_request *img_req) 34750192ce2eSIlya Dryomov { 34760192ce2eSIlya Dryomov struct rbd_obj_request *obj_req; 34770192ce2eSIlya Dryomov 34780192ce2eSIlya Dryomov rbd_assert(!img_req->pending.result && !img_req->pending.num_pending); 34790192ce2eSIlya Dryomov 34800192ce2eSIlya Dryomov for_each_obj_request(img_req, obj_req) { 34810192ce2eSIlya Dryomov int result = 0; 34820192ce2eSIlya Dryomov 34830192ce2eSIlya Dryomov if (__rbd_obj_handle_request(obj_req, &result)) { 34840192ce2eSIlya Dryomov if (result) { 34850192ce2eSIlya Dryomov img_req->pending.result = result; 34860192ce2eSIlya Dryomov return; 34870192ce2eSIlya Dryomov } 34880192ce2eSIlya Dryomov } else { 34890192ce2eSIlya Dryomov img_req->pending.num_pending++; 34900192ce2eSIlya Dryomov } 34910192ce2eSIlya Dryomov } 34920192ce2eSIlya Dryomov } 34930192ce2eSIlya Dryomov 34940192ce2eSIlya Dryomov static bool rbd_img_advance(struct rbd_img_request *img_req, int *result) 34950192ce2eSIlya Dryomov { 3496637cd060SIlya Dryomov struct rbd_device *rbd_dev = img_req->rbd_dev; 3497637cd060SIlya Dryomov int ret; 3498637cd060SIlya Dryomov 34990192ce2eSIlya Dryomov again: 35000192ce2eSIlya Dryomov switch (img_req->state) { 35010192ce2eSIlya Dryomov case RBD_IMG_START: 35020192ce2eSIlya Dryomov rbd_assert(!*result); 35030192ce2eSIlya Dryomov 3504637cd060SIlya Dryomov ret = rbd_img_exclusive_lock(img_req); 3505637cd060SIlya Dryomov if (ret < 0) { 3506637cd060SIlya Dryomov *result = ret; 3507637cd060SIlya Dryomov return true; 3508637cd060SIlya Dryomov } 3509637cd060SIlya Dryomov img_req->state = RBD_IMG_EXCLUSIVE_LOCK; 3510637cd060SIlya Dryomov if (ret > 0) 3511637cd060SIlya Dryomov goto again; 3512637cd060SIlya Dryomov return false; 3513637cd060SIlya Dryomov case RBD_IMG_EXCLUSIVE_LOCK: 3514637cd060SIlya Dryomov if (*result) 3515637cd060SIlya Dryomov return true; 3516637cd060SIlya Dryomov 3517637cd060SIlya Dryomov rbd_assert(!need_exclusive_lock(img_req) || 3518637cd060SIlya Dryomov __rbd_is_lock_owner(rbd_dev)); 3519637cd060SIlya Dryomov 35200192ce2eSIlya Dryomov rbd_img_object_requests(img_req); 35210192ce2eSIlya Dryomov if (!img_req->pending.num_pending) { 35220192ce2eSIlya Dryomov *result = img_req->pending.result; 35230192ce2eSIlya Dryomov img_req->state = RBD_IMG_OBJECT_REQUESTS; 35247114edacSIlya Dryomov goto again; 35257114edacSIlya Dryomov } 35260192ce2eSIlya Dryomov img_req->state = __RBD_IMG_OBJECT_REQUESTS; 35270192ce2eSIlya Dryomov return false; 35280192ce2eSIlya Dryomov case __RBD_IMG_OBJECT_REQUESTS: 35290192ce2eSIlya Dryomov if (!pending_result_dec(&img_req->pending, result)) 35300192ce2eSIlya Dryomov return false; 3531df561f66SGustavo A. R. Silva fallthrough; 35320192ce2eSIlya Dryomov case RBD_IMG_OBJECT_REQUESTS: 35330192ce2eSIlya Dryomov return true; 35340192ce2eSIlya Dryomov default: 35350192ce2eSIlya Dryomov BUG(); 35360192ce2eSIlya Dryomov } 35370192ce2eSIlya Dryomov } 35380192ce2eSIlya Dryomov 35390192ce2eSIlya Dryomov /* 35400192ce2eSIlya Dryomov * Return true if @img_req is completed. 35410192ce2eSIlya Dryomov */ 35420192ce2eSIlya Dryomov static bool __rbd_img_handle_request(struct rbd_img_request *img_req, 35430192ce2eSIlya Dryomov int *result) 35440192ce2eSIlya Dryomov { 35450192ce2eSIlya Dryomov struct rbd_device *rbd_dev = img_req->rbd_dev; 35460192ce2eSIlya Dryomov bool done; 35470192ce2eSIlya Dryomov 3548e1fddc8fSIlya Dryomov if (need_exclusive_lock(img_req)) { 3549e1fddc8fSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 3550e1fddc8fSIlya Dryomov mutex_lock(&img_req->state_mutex); 3551e1fddc8fSIlya Dryomov done = rbd_img_advance(img_req, result); 3552e1fddc8fSIlya Dryomov if (done) 3553e1fddc8fSIlya Dryomov rbd_lock_del_request(img_req); 3554e1fddc8fSIlya Dryomov mutex_unlock(&img_req->state_mutex); 3555e1fddc8fSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3556e1fddc8fSIlya Dryomov } else { 35570192ce2eSIlya Dryomov mutex_lock(&img_req->state_mutex); 35580192ce2eSIlya Dryomov done = rbd_img_advance(img_req, result); 35590192ce2eSIlya Dryomov mutex_unlock(&img_req->state_mutex); 3560e1fddc8fSIlya Dryomov } 35610192ce2eSIlya Dryomov 35620192ce2eSIlya Dryomov if (done && *result) { 35630192ce2eSIlya Dryomov rbd_assert(*result < 0); 35640192ce2eSIlya Dryomov rbd_warn(rbd_dev, "%s%s result %d", 35650192ce2eSIlya Dryomov test_bit(IMG_REQ_CHILD, &img_req->flags) ? "child " : "", 35660192ce2eSIlya Dryomov obj_op_name(img_req->op_type), *result); 35670192ce2eSIlya Dryomov } 35680192ce2eSIlya Dryomov return done; 35690192ce2eSIlya Dryomov } 35700192ce2eSIlya Dryomov 35710192ce2eSIlya Dryomov static void rbd_img_handle_request(struct rbd_img_request *img_req, int result) 35720192ce2eSIlya Dryomov { 35730192ce2eSIlya Dryomov again: 35740192ce2eSIlya Dryomov if (!__rbd_img_handle_request(img_req, &result)) 35750192ce2eSIlya Dryomov return; 35760192ce2eSIlya Dryomov 35770192ce2eSIlya Dryomov if (test_bit(IMG_REQ_CHILD, &img_req->flags)) { 35780192ce2eSIlya Dryomov struct rbd_obj_request *obj_req = img_req->obj_request; 35790192ce2eSIlya Dryomov 3580679a97d2SHannes Reinecke rbd_img_request_destroy(img_req); 35810192ce2eSIlya Dryomov if (__rbd_obj_handle_request(obj_req, &result)) { 35820192ce2eSIlya Dryomov img_req = obj_req->img_request; 35830192ce2eSIlya Dryomov goto again; 35840192ce2eSIlya Dryomov } 35850192ce2eSIlya Dryomov } else { 358659e542c8SIlya Dryomov struct request *rq = blk_mq_rq_from_pdu(img_req); 35870192ce2eSIlya Dryomov 3588679a97d2SHannes Reinecke rbd_img_request_destroy(img_req); 35890192ce2eSIlya Dryomov blk_mq_end_request(rq, errno_to_blk_status(result)); 35900192ce2eSIlya Dryomov } 35919969ebc5SAlex Elder } 35929969ebc5SAlex Elder 3593ed95b21aSIlya Dryomov static const struct rbd_client_id rbd_empty_cid; 3594ed95b21aSIlya Dryomov 3595ed95b21aSIlya Dryomov static bool rbd_cid_equal(const struct rbd_client_id *lhs, 3596ed95b21aSIlya Dryomov const struct rbd_client_id *rhs) 3597ed95b21aSIlya Dryomov { 3598ed95b21aSIlya Dryomov return lhs->gid == rhs->gid && lhs->handle == rhs->handle; 3599ed95b21aSIlya Dryomov } 3600ed95b21aSIlya Dryomov 3601ed95b21aSIlya Dryomov static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev) 3602ed95b21aSIlya Dryomov { 3603ed95b21aSIlya Dryomov struct rbd_client_id cid; 3604ed95b21aSIlya Dryomov 3605ed95b21aSIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 3606ed95b21aSIlya Dryomov cid.gid = ceph_client_gid(rbd_dev->rbd_client->client); 3607ed95b21aSIlya Dryomov cid.handle = rbd_dev->watch_cookie; 3608ed95b21aSIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 3609ed95b21aSIlya Dryomov return cid; 3610ed95b21aSIlya Dryomov } 3611ed95b21aSIlya Dryomov 3612ed95b21aSIlya Dryomov /* 3613ed95b21aSIlya Dryomov * lock_rwsem must be held for write 3614ed95b21aSIlya Dryomov */ 3615ed95b21aSIlya Dryomov static void rbd_set_owner_cid(struct rbd_device *rbd_dev, 3616ed95b21aSIlya Dryomov const struct rbd_client_id *cid) 3617ed95b21aSIlya Dryomov { 3618ed95b21aSIlya Dryomov dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev, 3619ed95b21aSIlya Dryomov rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle, 3620ed95b21aSIlya Dryomov cid->gid, cid->handle); 3621ed95b21aSIlya Dryomov rbd_dev->owner_cid = *cid; /* struct */ 3622ed95b21aSIlya Dryomov } 3623ed95b21aSIlya Dryomov 3624ed95b21aSIlya Dryomov static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf) 3625ed95b21aSIlya Dryomov { 3626ed95b21aSIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 3627ed95b21aSIlya Dryomov sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie); 3628ed95b21aSIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 3629ed95b21aSIlya Dryomov } 3630ed95b21aSIlya Dryomov 3631edd8ca80SFlorian Margaine static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie) 3632edd8ca80SFlorian Margaine { 3633edd8ca80SFlorian Margaine struct rbd_client_id cid = rbd_get_cid(rbd_dev); 3634edd8ca80SFlorian Margaine 3635a2b1da09SIlya Dryomov rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED; 3636edd8ca80SFlorian Margaine strcpy(rbd_dev->lock_cookie, cookie); 3637edd8ca80SFlorian Margaine rbd_set_owner_cid(rbd_dev, &cid); 3638edd8ca80SFlorian Margaine queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work); 3639edd8ca80SFlorian Margaine } 3640edd8ca80SFlorian Margaine 3641ed95b21aSIlya Dryomov /* 3642ed95b21aSIlya Dryomov * lock_rwsem must be held for write 3643ed95b21aSIlya Dryomov */ 3644ed95b21aSIlya Dryomov static int rbd_lock(struct rbd_device *rbd_dev) 3645ed95b21aSIlya Dryomov { 3646ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3647ed95b21aSIlya Dryomov char cookie[32]; 3648ed95b21aSIlya Dryomov int ret; 3649ed95b21aSIlya Dryomov 3650cbbfb0ffSIlya Dryomov WARN_ON(__rbd_is_lock_owner(rbd_dev) || 3651cbbfb0ffSIlya Dryomov rbd_dev->lock_cookie[0] != '\0'); 3652ed95b21aSIlya Dryomov 3653ed95b21aSIlya Dryomov format_lock_cookie(rbd_dev, cookie); 3654ed95b21aSIlya Dryomov ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, 3655ed95b21aSIlya Dryomov RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie, 3656ed95b21aSIlya Dryomov RBD_LOCK_TAG, "", 0); 3657ed95b21aSIlya Dryomov if (ret) 3658ed95b21aSIlya Dryomov return ret; 3659ed95b21aSIlya Dryomov 3660edd8ca80SFlorian Margaine __rbd_lock(rbd_dev, cookie); 3661ed95b21aSIlya Dryomov return 0; 3662ed95b21aSIlya Dryomov } 3663ed95b21aSIlya Dryomov 3664ed95b21aSIlya Dryomov /* 3665ed95b21aSIlya Dryomov * lock_rwsem must be held for write 3666ed95b21aSIlya Dryomov */ 3667bbead745SIlya Dryomov static void rbd_unlock(struct rbd_device *rbd_dev) 3668ed95b21aSIlya Dryomov { 3669ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3670ed95b21aSIlya Dryomov int ret; 3671ed95b21aSIlya Dryomov 3672cbbfb0ffSIlya Dryomov WARN_ON(!__rbd_is_lock_owner(rbd_dev) || 3673cbbfb0ffSIlya Dryomov rbd_dev->lock_cookie[0] == '\0'); 3674ed95b21aSIlya Dryomov 3675ed95b21aSIlya Dryomov ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, 3676cbbfb0ffSIlya Dryomov RBD_LOCK_NAME, rbd_dev->lock_cookie); 3677bbead745SIlya Dryomov if (ret && ret != -ENOENT) 3678637cd060SIlya Dryomov rbd_warn(rbd_dev, "failed to unlock header: %d", ret); 3679ed95b21aSIlya Dryomov 3680bbead745SIlya Dryomov /* treat errors as the image is unlocked */ 3681bbead745SIlya Dryomov rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED; 3682cbbfb0ffSIlya Dryomov rbd_dev->lock_cookie[0] = '\0'; 3683ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &rbd_empty_cid); 3684ed95b21aSIlya Dryomov queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work); 3685ed95b21aSIlya Dryomov } 3686ed95b21aSIlya Dryomov 3687ed95b21aSIlya Dryomov static int __rbd_notify_op_lock(struct rbd_device *rbd_dev, 3688ed95b21aSIlya Dryomov enum rbd_notify_op notify_op, 3689ed95b21aSIlya Dryomov struct page ***preply_pages, 3690ed95b21aSIlya Dryomov size_t *preply_len) 3691ed95b21aSIlya Dryomov { 3692ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3693ed95b21aSIlya Dryomov struct rbd_client_id cid = rbd_get_cid(rbd_dev); 369408a79102SKyle Spiers char buf[4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN]; 369508a79102SKyle Spiers int buf_size = sizeof(buf); 3696ed95b21aSIlya Dryomov void *p = buf; 3697ed95b21aSIlya Dryomov 3698ed95b21aSIlya Dryomov dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op); 3699ed95b21aSIlya Dryomov 3700ed95b21aSIlya Dryomov /* encode *LockPayload NotifyMessage (op + ClientId) */ 3701ed95b21aSIlya Dryomov ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN); 3702ed95b21aSIlya Dryomov ceph_encode_32(&p, notify_op); 3703ed95b21aSIlya Dryomov ceph_encode_64(&p, cid.gid); 3704ed95b21aSIlya Dryomov ceph_encode_64(&p, cid.handle); 3705ed95b21aSIlya Dryomov 3706ed95b21aSIlya Dryomov return ceph_osdc_notify(osdc, &rbd_dev->header_oid, 3707ed95b21aSIlya Dryomov &rbd_dev->header_oloc, buf, buf_size, 3708ed95b21aSIlya Dryomov RBD_NOTIFY_TIMEOUT, preply_pages, preply_len); 3709ed95b21aSIlya Dryomov } 3710ed95b21aSIlya Dryomov 3711ed95b21aSIlya Dryomov static void rbd_notify_op_lock(struct rbd_device *rbd_dev, 3712ed95b21aSIlya Dryomov enum rbd_notify_op notify_op) 3713ed95b21aSIlya Dryomov { 37148ae0299aSIlya Dryomov __rbd_notify_op_lock(rbd_dev, notify_op, NULL, NULL); 3715ed95b21aSIlya Dryomov } 3716ed95b21aSIlya Dryomov 3717ed95b21aSIlya Dryomov static void rbd_notify_acquired_lock(struct work_struct *work) 3718ed95b21aSIlya Dryomov { 3719ed95b21aSIlya Dryomov struct rbd_device *rbd_dev = container_of(work, struct rbd_device, 3720ed95b21aSIlya Dryomov acquired_lock_work); 3721ed95b21aSIlya Dryomov 3722ed95b21aSIlya Dryomov rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK); 3723ed95b21aSIlya Dryomov } 3724ed95b21aSIlya Dryomov 3725ed95b21aSIlya Dryomov static void rbd_notify_released_lock(struct work_struct *work) 3726ed95b21aSIlya Dryomov { 3727ed95b21aSIlya Dryomov struct rbd_device *rbd_dev = container_of(work, struct rbd_device, 3728ed95b21aSIlya Dryomov released_lock_work); 3729ed95b21aSIlya Dryomov 3730ed95b21aSIlya Dryomov rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK); 3731ed95b21aSIlya Dryomov } 3732ed95b21aSIlya Dryomov 3733ed95b21aSIlya Dryomov static int rbd_request_lock(struct rbd_device *rbd_dev) 3734ed95b21aSIlya Dryomov { 3735ed95b21aSIlya Dryomov struct page **reply_pages; 3736ed95b21aSIlya Dryomov size_t reply_len; 3737ed95b21aSIlya Dryomov bool lock_owner_responded = false; 3738ed95b21aSIlya Dryomov int ret; 3739ed95b21aSIlya Dryomov 3740ed95b21aSIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 3741ed95b21aSIlya Dryomov 3742ed95b21aSIlya Dryomov ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK, 3743ed95b21aSIlya Dryomov &reply_pages, &reply_len); 3744ed95b21aSIlya Dryomov if (ret && ret != -ETIMEDOUT) { 3745ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "failed to request lock: %d", ret); 3746ed95b21aSIlya Dryomov goto out; 3747ed95b21aSIlya Dryomov } 3748ed95b21aSIlya Dryomov 3749ed95b21aSIlya Dryomov if (reply_len > 0 && reply_len <= PAGE_SIZE) { 3750ed95b21aSIlya Dryomov void *p = page_address(reply_pages[0]); 3751ed95b21aSIlya Dryomov void *const end = p + reply_len; 3752ed95b21aSIlya Dryomov u32 n; 3753ed95b21aSIlya Dryomov 3754ed95b21aSIlya Dryomov ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */ 3755ed95b21aSIlya Dryomov while (n--) { 3756ed95b21aSIlya Dryomov u8 struct_v; 3757ed95b21aSIlya Dryomov u32 len; 3758ed95b21aSIlya Dryomov 3759ed95b21aSIlya Dryomov ceph_decode_need(&p, end, 8 + 8, e_inval); 3760ed95b21aSIlya Dryomov p += 8 + 8; /* skip gid and cookie */ 3761ed95b21aSIlya Dryomov 3762ed95b21aSIlya Dryomov ceph_decode_32_safe(&p, end, len, e_inval); 3763ed95b21aSIlya Dryomov if (!len) 3764ed95b21aSIlya Dryomov continue; 3765ed95b21aSIlya Dryomov 3766ed95b21aSIlya Dryomov if (lock_owner_responded) { 3767ed95b21aSIlya Dryomov rbd_warn(rbd_dev, 3768ed95b21aSIlya Dryomov "duplicate lock owners detected"); 3769ed95b21aSIlya Dryomov ret = -EIO; 3770ed95b21aSIlya Dryomov goto out; 3771ed95b21aSIlya Dryomov } 3772ed95b21aSIlya Dryomov 3773ed95b21aSIlya Dryomov lock_owner_responded = true; 3774ed95b21aSIlya Dryomov ret = ceph_start_decoding(&p, end, 1, "ResponseMessage", 3775ed95b21aSIlya Dryomov &struct_v, &len); 3776ed95b21aSIlya Dryomov if (ret) { 3777ed95b21aSIlya Dryomov rbd_warn(rbd_dev, 3778ed95b21aSIlya Dryomov "failed to decode ResponseMessage: %d", 3779ed95b21aSIlya Dryomov ret); 3780ed95b21aSIlya Dryomov goto e_inval; 3781ed95b21aSIlya Dryomov } 3782ed95b21aSIlya Dryomov 3783ed95b21aSIlya Dryomov ret = ceph_decode_32(&p); 3784ed95b21aSIlya Dryomov } 3785ed95b21aSIlya Dryomov } 3786ed95b21aSIlya Dryomov 3787ed95b21aSIlya Dryomov if (!lock_owner_responded) { 3788ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "no lock owners detected"); 3789ed95b21aSIlya Dryomov ret = -ETIMEDOUT; 3790ed95b21aSIlya Dryomov } 3791ed95b21aSIlya Dryomov 3792ed95b21aSIlya Dryomov out: 3793ed95b21aSIlya Dryomov ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len)); 3794ed95b21aSIlya Dryomov return ret; 3795ed95b21aSIlya Dryomov 3796ed95b21aSIlya Dryomov e_inval: 3797ed95b21aSIlya Dryomov ret = -EINVAL; 3798ed95b21aSIlya Dryomov goto out; 3799ed95b21aSIlya Dryomov } 3800ed95b21aSIlya Dryomov 3801637cd060SIlya Dryomov /* 3802637cd060SIlya Dryomov * Either image request state machine(s) or rbd_add_acquire_lock() 3803637cd060SIlya Dryomov * (i.e. "rbd map"). 3804637cd060SIlya Dryomov */ 3805637cd060SIlya Dryomov static void wake_lock_waiters(struct rbd_device *rbd_dev, int result) 3806ed95b21aSIlya Dryomov { 3807637cd060SIlya Dryomov struct rbd_img_request *img_req; 3808637cd060SIlya Dryomov 3809637cd060SIlya Dryomov dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result); 3810d9b9c893SLinus Torvalds lockdep_assert_held_write(&rbd_dev->lock_rwsem); 3811ed95b21aSIlya Dryomov 3812ed95b21aSIlya Dryomov cancel_delayed_work(&rbd_dev->lock_dwork); 3813637cd060SIlya Dryomov if (!completion_done(&rbd_dev->acquire_wait)) { 3814637cd060SIlya Dryomov rbd_assert(list_empty(&rbd_dev->acquiring_list) && 3815637cd060SIlya Dryomov list_empty(&rbd_dev->running_list)); 3816637cd060SIlya Dryomov rbd_dev->acquire_err = result; 3817637cd060SIlya Dryomov complete_all(&rbd_dev->acquire_wait); 3818637cd060SIlya Dryomov return; 3819637cd060SIlya Dryomov } 3820637cd060SIlya Dryomov 3821637cd060SIlya Dryomov list_for_each_entry(img_req, &rbd_dev->acquiring_list, lock_item) { 3822637cd060SIlya Dryomov mutex_lock(&img_req->state_mutex); 3823637cd060SIlya Dryomov rbd_assert(img_req->state == RBD_IMG_EXCLUSIVE_LOCK); 3824637cd060SIlya Dryomov rbd_img_schedule(img_req, result); 3825637cd060SIlya Dryomov mutex_unlock(&img_req->state_mutex); 3826637cd060SIlya Dryomov } 3827637cd060SIlya Dryomov 3828637cd060SIlya Dryomov list_splice_tail_init(&rbd_dev->acquiring_list, &rbd_dev->running_list); 3829ed95b21aSIlya Dryomov } 3830ed95b21aSIlya Dryomov 3831ed95b21aSIlya Dryomov static int get_lock_owner_info(struct rbd_device *rbd_dev, 3832ed95b21aSIlya Dryomov struct ceph_locker **lockers, u32 *num_lockers) 3833ed95b21aSIlya Dryomov { 3834ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3835ed95b21aSIlya Dryomov u8 lock_type; 3836ed95b21aSIlya Dryomov char *lock_tag; 3837ed95b21aSIlya Dryomov int ret; 3838ed95b21aSIlya Dryomov 3839ed95b21aSIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 3840ed95b21aSIlya Dryomov 3841ed95b21aSIlya Dryomov ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid, 3842ed95b21aSIlya Dryomov &rbd_dev->header_oloc, RBD_LOCK_NAME, 3843ed95b21aSIlya Dryomov &lock_type, &lock_tag, lockers, num_lockers); 3844ed95b21aSIlya Dryomov if (ret) 3845ed95b21aSIlya Dryomov return ret; 3846ed95b21aSIlya Dryomov 3847ed95b21aSIlya Dryomov if (*num_lockers == 0) { 3848ed95b21aSIlya Dryomov dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev); 3849ed95b21aSIlya Dryomov goto out; 3850ed95b21aSIlya Dryomov } 3851ed95b21aSIlya Dryomov 3852ed95b21aSIlya Dryomov if (strcmp(lock_tag, RBD_LOCK_TAG)) { 3853ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "locked by external mechanism, tag %s", 3854ed95b21aSIlya Dryomov lock_tag); 3855ed95b21aSIlya Dryomov ret = -EBUSY; 3856ed95b21aSIlya Dryomov goto out; 3857ed95b21aSIlya Dryomov } 3858ed95b21aSIlya Dryomov 3859ed95b21aSIlya Dryomov if (lock_type == CEPH_CLS_LOCK_SHARED) { 3860ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "shared lock type detected"); 3861ed95b21aSIlya Dryomov ret = -EBUSY; 3862ed95b21aSIlya Dryomov goto out; 3863ed95b21aSIlya Dryomov } 3864ed95b21aSIlya Dryomov 3865ed95b21aSIlya Dryomov if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX, 3866ed95b21aSIlya Dryomov strlen(RBD_LOCK_COOKIE_PREFIX))) { 3867ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "locked by external mechanism, cookie %s", 3868ed95b21aSIlya Dryomov (*lockers)[0].id.cookie); 3869ed95b21aSIlya Dryomov ret = -EBUSY; 3870ed95b21aSIlya Dryomov goto out; 3871ed95b21aSIlya Dryomov } 3872ed95b21aSIlya Dryomov 3873ed95b21aSIlya Dryomov out: 3874ed95b21aSIlya Dryomov kfree(lock_tag); 3875ed95b21aSIlya Dryomov return ret; 3876ed95b21aSIlya Dryomov } 3877ed95b21aSIlya Dryomov 3878ed95b21aSIlya Dryomov static int find_watcher(struct rbd_device *rbd_dev, 3879ed95b21aSIlya Dryomov const struct ceph_locker *locker) 3880ed95b21aSIlya Dryomov { 3881ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3882ed95b21aSIlya Dryomov struct ceph_watch_item *watchers; 3883ed95b21aSIlya Dryomov u32 num_watchers; 3884ed95b21aSIlya Dryomov u64 cookie; 3885ed95b21aSIlya Dryomov int i; 3886ed95b21aSIlya Dryomov int ret; 3887ed95b21aSIlya Dryomov 3888ed95b21aSIlya Dryomov ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid, 3889ed95b21aSIlya Dryomov &rbd_dev->header_oloc, &watchers, 3890ed95b21aSIlya Dryomov &num_watchers); 3891ed95b21aSIlya Dryomov if (ret) 3892ed95b21aSIlya Dryomov return ret; 3893ed95b21aSIlya Dryomov 3894ed95b21aSIlya Dryomov sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie); 3895ed95b21aSIlya Dryomov for (i = 0; i < num_watchers; i++) { 3896313771e8SIlya Dryomov /* 3897313771e8SIlya Dryomov * Ignore addr->type while comparing. This mimics 3898313771e8SIlya Dryomov * entity_addr_t::get_legacy_str() + strcmp(). 3899313771e8SIlya Dryomov */ 3900313771e8SIlya Dryomov if (ceph_addr_equal_no_type(&watchers[i].addr, 3901313771e8SIlya Dryomov &locker->info.addr) && 3902ed95b21aSIlya Dryomov watchers[i].cookie == cookie) { 3903ed95b21aSIlya Dryomov struct rbd_client_id cid = { 3904ed95b21aSIlya Dryomov .gid = le64_to_cpu(watchers[i].name.num), 3905ed95b21aSIlya Dryomov .handle = cookie, 3906ed95b21aSIlya Dryomov }; 3907ed95b21aSIlya Dryomov 3908ed95b21aSIlya Dryomov dout("%s rbd_dev %p found cid %llu-%llu\n", __func__, 3909ed95b21aSIlya Dryomov rbd_dev, cid.gid, cid.handle); 3910ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &cid); 3911ed95b21aSIlya Dryomov ret = 1; 3912ed95b21aSIlya Dryomov goto out; 3913ed95b21aSIlya Dryomov } 3914ed95b21aSIlya Dryomov } 3915ed95b21aSIlya Dryomov 3916ed95b21aSIlya Dryomov dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev); 3917ed95b21aSIlya Dryomov ret = 0; 3918ed95b21aSIlya Dryomov out: 3919ed95b21aSIlya Dryomov kfree(watchers); 3920ed95b21aSIlya Dryomov return ret; 3921ed95b21aSIlya Dryomov } 3922ed95b21aSIlya Dryomov 3923ed95b21aSIlya Dryomov /* 3924ed95b21aSIlya Dryomov * lock_rwsem must be held for write 3925ed95b21aSIlya Dryomov */ 3926ed95b21aSIlya Dryomov static int rbd_try_lock(struct rbd_device *rbd_dev) 3927ed95b21aSIlya Dryomov { 3928ed95b21aSIlya Dryomov struct ceph_client *client = rbd_dev->rbd_client->client; 3929ed95b21aSIlya Dryomov struct ceph_locker *lockers; 3930ed95b21aSIlya Dryomov u32 num_lockers; 3931ed95b21aSIlya Dryomov int ret; 3932ed95b21aSIlya Dryomov 3933ed95b21aSIlya Dryomov for (;;) { 3934ed95b21aSIlya Dryomov ret = rbd_lock(rbd_dev); 3935ed95b21aSIlya Dryomov if (ret != -EBUSY) 3936ed95b21aSIlya Dryomov return ret; 3937ed95b21aSIlya Dryomov 3938ed95b21aSIlya Dryomov /* determine if the current lock holder is still alive */ 3939ed95b21aSIlya Dryomov ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers); 3940ed95b21aSIlya Dryomov if (ret) 3941ed95b21aSIlya Dryomov return ret; 3942ed95b21aSIlya Dryomov 3943ed95b21aSIlya Dryomov if (num_lockers == 0) 3944ed95b21aSIlya Dryomov goto again; 3945ed95b21aSIlya Dryomov 3946ed95b21aSIlya Dryomov ret = find_watcher(rbd_dev, lockers); 3947637cd060SIlya Dryomov if (ret) 3948637cd060SIlya Dryomov goto out; /* request lock or error */ 3949ed95b21aSIlya Dryomov 395022e8bd51SIlya Dryomov rbd_warn(rbd_dev, "breaking header lock owned by %s%llu", 3951ed95b21aSIlya Dryomov ENTITY_NAME(lockers[0].id.name)); 3952ed95b21aSIlya Dryomov 39530b98acd6SIlya Dryomov ret = ceph_monc_blocklist_add(&client->monc, 3954ed95b21aSIlya Dryomov &lockers[0].info.addr); 3955ed95b21aSIlya Dryomov if (ret) { 39560b98acd6SIlya Dryomov rbd_warn(rbd_dev, "blocklist of %s%llu failed: %d", 3957ed95b21aSIlya Dryomov ENTITY_NAME(lockers[0].id.name), ret); 3958ed95b21aSIlya Dryomov goto out; 3959ed95b21aSIlya Dryomov } 3960ed95b21aSIlya Dryomov 3961ed95b21aSIlya Dryomov ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid, 3962ed95b21aSIlya Dryomov &rbd_dev->header_oloc, RBD_LOCK_NAME, 3963ed95b21aSIlya Dryomov lockers[0].id.cookie, 3964ed95b21aSIlya Dryomov &lockers[0].id.name); 3965ed95b21aSIlya Dryomov if (ret && ret != -ENOENT) 3966ed95b21aSIlya Dryomov goto out; 3967ed95b21aSIlya Dryomov 3968ed95b21aSIlya Dryomov again: 3969ed95b21aSIlya Dryomov ceph_free_lockers(lockers, num_lockers); 3970ed95b21aSIlya Dryomov } 3971ed95b21aSIlya Dryomov 3972ed95b21aSIlya Dryomov out: 3973ed95b21aSIlya Dryomov ceph_free_lockers(lockers, num_lockers); 3974ed95b21aSIlya Dryomov return ret; 3975ed95b21aSIlya Dryomov } 3976ed95b21aSIlya Dryomov 397722e8bd51SIlya Dryomov static int rbd_post_acquire_action(struct rbd_device *rbd_dev) 3978ed95b21aSIlya Dryomov { 397922e8bd51SIlya Dryomov int ret; 398022e8bd51SIlya Dryomov 398122e8bd51SIlya Dryomov if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) { 398222e8bd51SIlya Dryomov ret = rbd_object_map_open(rbd_dev); 398322e8bd51SIlya Dryomov if (ret) 398422e8bd51SIlya Dryomov return ret; 398522e8bd51SIlya Dryomov } 398622e8bd51SIlya Dryomov 398722e8bd51SIlya Dryomov return 0; 398822e8bd51SIlya Dryomov } 398922e8bd51SIlya Dryomov 3990ed95b21aSIlya Dryomov /* 3991637cd060SIlya Dryomov * Return: 3992637cd060SIlya Dryomov * 0 - lock acquired 3993637cd060SIlya Dryomov * 1 - caller should call rbd_request_lock() 3994637cd060SIlya Dryomov * <0 - error 3995ed95b21aSIlya Dryomov */ 3996637cd060SIlya Dryomov static int rbd_try_acquire_lock(struct rbd_device *rbd_dev) 3997ed95b21aSIlya Dryomov { 3998637cd060SIlya Dryomov int ret; 3999ed95b21aSIlya Dryomov 4000ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 4001ed95b21aSIlya Dryomov dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev, 4002ed95b21aSIlya Dryomov rbd_dev->lock_state); 4003ed95b21aSIlya Dryomov if (__rbd_is_lock_owner(rbd_dev)) { 4004ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 4005637cd060SIlya Dryomov return 0; 4006ed95b21aSIlya Dryomov } 4007ed95b21aSIlya Dryomov 4008ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 4009ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 4010ed95b21aSIlya Dryomov dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev, 4011ed95b21aSIlya Dryomov rbd_dev->lock_state); 4012637cd060SIlya Dryomov if (__rbd_is_lock_owner(rbd_dev)) { 4013637cd060SIlya Dryomov up_write(&rbd_dev->lock_rwsem); 4014637cd060SIlya Dryomov return 0; 4015ed95b21aSIlya Dryomov } 4016ed95b21aSIlya Dryomov 4017637cd060SIlya Dryomov ret = rbd_try_lock(rbd_dev); 4018637cd060SIlya Dryomov if (ret < 0) { 4019637cd060SIlya Dryomov rbd_warn(rbd_dev, "failed to lock header: %d", ret); 40200b98acd6SIlya Dryomov if (ret == -EBLOCKLISTED) 4021637cd060SIlya Dryomov goto out; 4022637cd060SIlya Dryomov 4023637cd060SIlya Dryomov ret = 1; /* request lock anyway */ 4024637cd060SIlya Dryomov } 4025637cd060SIlya Dryomov if (ret > 0) { 4026ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 4027637cd060SIlya Dryomov return ret; 4028637cd060SIlya Dryomov } 4029637cd060SIlya Dryomov 4030637cd060SIlya Dryomov rbd_assert(rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED); 4031637cd060SIlya Dryomov rbd_assert(list_empty(&rbd_dev->running_list)); 4032637cd060SIlya Dryomov 403322e8bd51SIlya Dryomov ret = rbd_post_acquire_action(rbd_dev); 403422e8bd51SIlya Dryomov if (ret) { 403522e8bd51SIlya Dryomov rbd_warn(rbd_dev, "post-acquire action failed: %d", ret); 403622e8bd51SIlya Dryomov /* 403722e8bd51SIlya Dryomov * Can't stay in RBD_LOCK_STATE_LOCKED because 403822e8bd51SIlya Dryomov * rbd_lock_add_request() would let the request through, 403922e8bd51SIlya Dryomov * assuming that e.g. object map is locked and loaded. 404022e8bd51SIlya Dryomov */ 404122e8bd51SIlya Dryomov rbd_unlock(rbd_dev); 404222e8bd51SIlya Dryomov } 404322e8bd51SIlya Dryomov 4044637cd060SIlya Dryomov out: 4045637cd060SIlya Dryomov wake_lock_waiters(rbd_dev, ret); 4046637cd060SIlya Dryomov up_write(&rbd_dev->lock_rwsem); 4047637cd060SIlya Dryomov return ret; 4048ed95b21aSIlya Dryomov } 4049ed95b21aSIlya Dryomov 4050ed95b21aSIlya Dryomov static void rbd_acquire_lock(struct work_struct *work) 4051ed95b21aSIlya Dryomov { 4052ed95b21aSIlya Dryomov struct rbd_device *rbd_dev = container_of(to_delayed_work(work), 4053ed95b21aSIlya Dryomov struct rbd_device, lock_dwork); 4054637cd060SIlya Dryomov int ret; 4055ed95b21aSIlya Dryomov 4056ed95b21aSIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 4057ed95b21aSIlya Dryomov again: 4058637cd060SIlya Dryomov ret = rbd_try_acquire_lock(rbd_dev); 4059637cd060SIlya Dryomov if (ret <= 0) { 4060637cd060SIlya Dryomov dout("%s rbd_dev %p ret %d - done\n", __func__, rbd_dev, ret); 4061ed95b21aSIlya Dryomov return; 4062ed95b21aSIlya Dryomov } 4063ed95b21aSIlya Dryomov 4064ed95b21aSIlya Dryomov ret = rbd_request_lock(rbd_dev); 4065ed95b21aSIlya Dryomov if (ret == -ETIMEDOUT) { 4066ed95b21aSIlya Dryomov goto again; /* treat this as a dead client */ 4067e010dd0aSIlya Dryomov } else if (ret == -EROFS) { 4068e010dd0aSIlya Dryomov rbd_warn(rbd_dev, "peer will not release lock"); 4069637cd060SIlya Dryomov down_write(&rbd_dev->lock_rwsem); 4070637cd060SIlya Dryomov wake_lock_waiters(rbd_dev, ret); 4071637cd060SIlya Dryomov up_write(&rbd_dev->lock_rwsem); 4072ed95b21aSIlya Dryomov } else if (ret < 0) { 4073ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "error requesting lock: %d", ret); 4074ed95b21aSIlya Dryomov mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 4075ed95b21aSIlya Dryomov RBD_RETRY_DELAY); 4076ed95b21aSIlya Dryomov } else { 4077ed95b21aSIlya Dryomov /* 4078ed95b21aSIlya Dryomov * lock owner acked, but resend if we don't see them 4079ed95b21aSIlya Dryomov * release the lock 4080ed95b21aSIlya Dryomov */ 40816b0a8774SColin Ian King dout("%s rbd_dev %p requeuing lock_dwork\n", __func__, 4082ed95b21aSIlya Dryomov rbd_dev); 4083ed95b21aSIlya Dryomov mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 4084ed95b21aSIlya Dryomov msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC)); 4085ed95b21aSIlya Dryomov } 4086ed95b21aSIlya Dryomov } 4087ed95b21aSIlya Dryomov 4088a2b1da09SIlya Dryomov static bool rbd_quiesce_lock(struct rbd_device *rbd_dev) 4089ed95b21aSIlya Dryomov { 4090a2b1da09SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 4091d9b9c893SLinus Torvalds lockdep_assert_held_write(&rbd_dev->lock_rwsem); 4092a2b1da09SIlya Dryomov 4093ed95b21aSIlya Dryomov if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED) 4094ed95b21aSIlya Dryomov return false; 4095ed95b21aSIlya Dryomov 4096ed95b21aSIlya Dryomov /* 4097ed95b21aSIlya Dryomov * Ensure that all in-flight IO is flushed. 4098ed95b21aSIlya Dryomov */ 4099e1fddc8fSIlya Dryomov rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING; 4100e1fddc8fSIlya Dryomov rbd_assert(!completion_done(&rbd_dev->releasing_wait)); 4101ed9eb710SIlya Dryomov if (list_empty(&rbd_dev->running_list)) 4102ed9eb710SIlya Dryomov return true; 4103ed9eb710SIlya Dryomov 4104ed9eb710SIlya Dryomov up_write(&rbd_dev->lock_rwsem); 4105e1fddc8fSIlya Dryomov wait_for_completion(&rbd_dev->releasing_wait); 4106ed95b21aSIlya Dryomov 4107ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 4108ed95b21aSIlya Dryomov if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING) 4109ed95b21aSIlya Dryomov return false; 4110ed95b21aSIlya Dryomov 4111e1fddc8fSIlya Dryomov rbd_assert(list_empty(&rbd_dev->running_list)); 4112a2b1da09SIlya Dryomov return true; 4113a2b1da09SIlya Dryomov } 4114a2b1da09SIlya Dryomov 411522e8bd51SIlya Dryomov static void rbd_pre_release_action(struct rbd_device *rbd_dev) 411622e8bd51SIlya Dryomov { 411722e8bd51SIlya Dryomov if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) 411822e8bd51SIlya Dryomov rbd_object_map_close(rbd_dev); 411922e8bd51SIlya Dryomov } 412022e8bd51SIlya Dryomov 4121e1fddc8fSIlya Dryomov static void __rbd_release_lock(struct rbd_device *rbd_dev) 4122e1fddc8fSIlya Dryomov { 4123e1fddc8fSIlya Dryomov rbd_assert(list_empty(&rbd_dev->running_list)); 4124e1fddc8fSIlya Dryomov 412522e8bd51SIlya Dryomov rbd_pre_release_action(rbd_dev); 4126bbead745SIlya Dryomov rbd_unlock(rbd_dev); 4127e1fddc8fSIlya Dryomov } 4128e1fddc8fSIlya Dryomov 4129a2b1da09SIlya Dryomov /* 4130a2b1da09SIlya Dryomov * lock_rwsem must be held for write 4131a2b1da09SIlya Dryomov */ 4132a2b1da09SIlya Dryomov static void rbd_release_lock(struct rbd_device *rbd_dev) 4133a2b1da09SIlya Dryomov { 4134a2b1da09SIlya Dryomov if (!rbd_quiesce_lock(rbd_dev)) 4135a2b1da09SIlya Dryomov return; 4136a2b1da09SIlya Dryomov 4137e1fddc8fSIlya Dryomov __rbd_release_lock(rbd_dev); 4138a2b1da09SIlya Dryomov 4139ed95b21aSIlya Dryomov /* 4140ed95b21aSIlya Dryomov * Give others a chance to grab the lock - we would re-acquire 4141637cd060SIlya Dryomov * almost immediately if we got new IO while draining the running 4142637cd060SIlya Dryomov * list otherwise. We need to ack our own notifications, so this 4143637cd060SIlya Dryomov * lock_dwork will be requeued from rbd_handle_released_lock() by 4144637cd060SIlya Dryomov * way of maybe_kick_acquire(). 4145ed95b21aSIlya Dryomov */ 4146ed95b21aSIlya Dryomov cancel_delayed_work(&rbd_dev->lock_dwork); 4147ed95b21aSIlya Dryomov } 4148ed95b21aSIlya Dryomov 4149ed95b21aSIlya Dryomov static void rbd_release_lock_work(struct work_struct *work) 4150ed95b21aSIlya Dryomov { 4151ed95b21aSIlya Dryomov struct rbd_device *rbd_dev = container_of(work, struct rbd_device, 4152ed95b21aSIlya Dryomov unlock_work); 4153ed95b21aSIlya Dryomov 4154ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 4155ed95b21aSIlya Dryomov rbd_release_lock(rbd_dev); 4156ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 4157ed95b21aSIlya Dryomov } 4158ed95b21aSIlya Dryomov 4159637cd060SIlya Dryomov static void maybe_kick_acquire(struct rbd_device *rbd_dev) 4160637cd060SIlya Dryomov { 4161637cd060SIlya Dryomov bool have_requests; 4162637cd060SIlya Dryomov 4163637cd060SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 4164637cd060SIlya Dryomov if (__rbd_is_lock_owner(rbd_dev)) 4165637cd060SIlya Dryomov return; 4166637cd060SIlya Dryomov 4167637cd060SIlya Dryomov spin_lock(&rbd_dev->lock_lists_lock); 4168637cd060SIlya Dryomov have_requests = !list_empty(&rbd_dev->acquiring_list); 4169637cd060SIlya Dryomov spin_unlock(&rbd_dev->lock_lists_lock); 4170637cd060SIlya Dryomov if (have_requests || delayed_work_pending(&rbd_dev->lock_dwork)) { 4171637cd060SIlya Dryomov dout("%s rbd_dev %p kicking lock_dwork\n", __func__, rbd_dev); 4172637cd060SIlya Dryomov mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0); 4173637cd060SIlya Dryomov } 4174637cd060SIlya Dryomov } 4175637cd060SIlya Dryomov 4176ed95b21aSIlya Dryomov static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v, 4177ed95b21aSIlya Dryomov void **p) 4178ed95b21aSIlya Dryomov { 4179ed95b21aSIlya Dryomov struct rbd_client_id cid = { 0 }; 4180ed95b21aSIlya Dryomov 4181ed95b21aSIlya Dryomov if (struct_v >= 2) { 4182ed95b21aSIlya Dryomov cid.gid = ceph_decode_64(p); 4183ed95b21aSIlya Dryomov cid.handle = ceph_decode_64(p); 4184ed95b21aSIlya Dryomov } 4185ed95b21aSIlya Dryomov 4186ed95b21aSIlya Dryomov dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid, 4187ed95b21aSIlya Dryomov cid.handle); 4188ed95b21aSIlya Dryomov if (!rbd_cid_equal(&cid, &rbd_empty_cid)) { 4189ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 4190ed95b21aSIlya Dryomov if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) { 41918798d070SIlya Dryomov dout("%s rbd_dev %p cid %llu-%llu == owner_cid\n", 41928798d070SIlya Dryomov __func__, rbd_dev, cid.gid, cid.handle); 41938798d070SIlya Dryomov } else { 4194ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &cid); 41958798d070SIlya Dryomov } 4196ed95b21aSIlya Dryomov downgrade_write(&rbd_dev->lock_rwsem); 4197ed95b21aSIlya Dryomov } else { 4198ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 4199ed95b21aSIlya Dryomov } 4200ed95b21aSIlya Dryomov 4201637cd060SIlya Dryomov maybe_kick_acquire(rbd_dev); 4202ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 4203ed95b21aSIlya Dryomov } 4204ed95b21aSIlya Dryomov 4205ed95b21aSIlya Dryomov static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v, 4206ed95b21aSIlya Dryomov void **p) 4207ed95b21aSIlya Dryomov { 4208ed95b21aSIlya Dryomov struct rbd_client_id cid = { 0 }; 4209ed95b21aSIlya Dryomov 4210ed95b21aSIlya Dryomov if (struct_v >= 2) { 4211ed95b21aSIlya Dryomov cid.gid = ceph_decode_64(p); 4212ed95b21aSIlya Dryomov cid.handle = ceph_decode_64(p); 4213ed95b21aSIlya Dryomov } 4214ed95b21aSIlya Dryomov 4215ed95b21aSIlya Dryomov dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid, 4216ed95b21aSIlya Dryomov cid.handle); 4217ed95b21aSIlya Dryomov if (!rbd_cid_equal(&cid, &rbd_empty_cid)) { 4218ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 4219ed95b21aSIlya Dryomov if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) { 42208798d070SIlya Dryomov dout("%s rbd_dev %p cid %llu-%llu != owner_cid %llu-%llu\n", 4221ed95b21aSIlya Dryomov __func__, rbd_dev, cid.gid, cid.handle, 4222ed95b21aSIlya Dryomov rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle); 42238798d070SIlya Dryomov } else { 4224ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &rbd_empty_cid); 42258798d070SIlya Dryomov } 4226ed95b21aSIlya Dryomov downgrade_write(&rbd_dev->lock_rwsem); 4227ed95b21aSIlya Dryomov } else { 4228ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 4229ed95b21aSIlya Dryomov } 4230ed95b21aSIlya Dryomov 4231637cd060SIlya Dryomov maybe_kick_acquire(rbd_dev); 4232ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 4233ed95b21aSIlya Dryomov } 4234ed95b21aSIlya Dryomov 42353b77faa0SIlya Dryomov /* 42363b77faa0SIlya Dryomov * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no 42373b77faa0SIlya Dryomov * ResponseMessage is needed. 42383b77faa0SIlya Dryomov */ 42393b77faa0SIlya Dryomov static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v, 4240ed95b21aSIlya Dryomov void **p) 4241ed95b21aSIlya Dryomov { 4242ed95b21aSIlya Dryomov struct rbd_client_id my_cid = rbd_get_cid(rbd_dev); 4243ed95b21aSIlya Dryomov struct rbd_client_id cid = { 0 }; 42443b77faa0SIlya Dryomov int result = 1; 4245ed95b21aSIlya Dryomov 4246ed95b21aSIlya Dryomov if (struct_v >= 2) { 4247ed95b21aSIlya Dryomov cid.gid = ceph_decode_64(p); 4248ed95b21aSIlya Dryomov cid.handle = ceph_decode_64(p); 4249ed95b21aSIlya Dryomov } 4250ed95b21aSIlya Dryomov 4251ed95b21aSIlya Dryomov dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid, 4252ed95b21aSIlya Dryomov cid.handle); 4253ed95b21aSIlya Dryomov if (rbd_cid_equal(&cid, &my_cid)) 42543b77faa0SIlya Dryomov return result; 4255ed95b21aSIlya Dryomov 4256ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 42573b77faa0SIlya Dryomov if (__rbd_is_lock_owner(rbd_dev)) { 42583b77faa0SIlya Dryomov if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED && 42593b77faa0SIlya Dryomov rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid)) 42603b77faa0SIlya Dryomov goto out_unlock; 42613b77faa0SIlya Dryomov 42623b77faa0SIlya Dryomov /* 42633b77faa0SIlya Dryomov * encode ResponseMessage(0) so the peer can detect 42643b77faa0SIlya Dryomov * a missing owner 42653b77faa0SIlya Dryomov */ 42663b77faa0SIlya Dryomov result = 0; 42673b77faa0SIlya Dryomov 4268ed95b21aSIlya Dryomov if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) { 4269e010dd0aSIlya Dryomov if (!rbd_dev->opts->exclusive) { 4270e010dd0aSIlya Dryomov dout("%s rbd_dev %p queueing unlock_work\n", 4271e010dd0aSIlya Dryomov __func__, rbd_dev); 4272e010dd0aSIlya Dryomov queue_work(rbd_dev->task_wq, 4273e010dd0aSIlya Dryomov &rbd_dev->unlock_work); 4274e010dd0aSIlya Dryomov } else { 4275e010dd0aSIlya Dryomov /* refuse to release the lock */ 4276e010dd0aSIlya Dryomov result = -EROFS; 4277ed95b21aSIlya Dryomov } 4278ed95b21aSIlya Dryomov } 4279ed95b21aSIlya Dryomov } 42803b77faa0SIlya Dryomov 42813b77faa0SIlya Dryomov out_unlock: 4282ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 42833b77faa0SIlya Dryomov return result; 4284ed95b21aSIlya Dryomov } 4285ed95b21aSIlya Dryomov 4286ed95b21aSIlya Dryomov static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev, 4287ed95b21aSIlya Dryomov u64 notify_id, u64 cookie, s32 *result) 4288ed95b21aSIlya Dryomov { 4289ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 429008a79102SKyle Spiers char buf[4 + CEPH_ENCODING_START_BLK_LEN]; 429108a79102SKyle Spiers int buf_size = sizeof(buf); 4292ed95b21aSIlya Dryomov int ret; 4293ed95b21aSIlya Dryomov 4294ed95b21aSIlya Dryomov if (result) { 4295ed95b21aSIlya Dryomov void *p = buf; 4296ed95b21aSIlya Dryomov 4297ed95b21aSIlya Dryomov /* encode ResponseMessage */ 4298ed95b21aSIlya Dryomov ceph_start_encoding(&p, 1, 1, 4299ed95b21aSIlya Dryomov buf_size - CEPH_ENCODING_START_BLK_LEN); 4300ed95b21aSIlya Dryomov ceph_encode_32(&p, *result); 4301ed95b21aSIlya Dryomov } else { 4302ed95b21aSIlya Dryomov buf_size = 0; 4303ed95b21aSIlya Dryomov } 4304ed95b21aSIlya Dryomov 4305ed95b21aSIlya Dryomov ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid, 4306ed95b21aSIlya Dryomov &rbd_dev->header_oloc, notify_id, cookie, 4307ed95b21aSIlya Dryomov buf, buf_size); 4308ed95b21aSIlya Dryomov if (ret) 4309ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret); 4310ed95b21aSIlya Dryomov } 4311ed95b21aSIlya Dryomov 4312ed95b21aSIlya Dryomov static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id, 4313ed95b21aSIlya Dryomov u64 cookie) 4314ed95b21aSIlya Dryomov { 4315ed95b21aSIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 4316ed95b21aSIlya Dryomov __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL); 4317ed95b21aSIlya Dryomov } 4318ed95b21aSIlya Dryomov 4319ed95b21aSIlya Dryomov static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev, 4320ed95b21aSIlya Dryomov u64 notify_id, u64 cookie, s32 result) 4321ed95b21aSIlya Dryomov { 4322ed95b21aSIlya Dryomov dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result); 4323ed95b21aSIlya Dryomov __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result); 4324ed95b21aSIlya Dryomov } 4325922dab61SIlya Dryomov 4326922dab61SIlya Dryomov static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie, 4327922dab61SIlya Dryomov u64 notifier_id, void *data, size_t data_len) 4328bf0d5f50SAlex Elder { 4329922dab61SIlya Dryomov struct rbd_device *rbd_dev = arg; 4330ed95b21aSIlya Dryomov void *p = data; 4331ed95b21aSIlya Dryomov void *const end = p + data_len; 4332d4c2269bSIlya Dryomov u8 struct_v = 0; 4333ed95b21aSIlya Dryomov u32 len; 4334ed95b21aSIlya Dryomov u32 notify_op; 4335bf0d5f50SAlex Elder int ret; 4336bf0d5f50SAlex Elder 4337ed95b21aSIlya Dryomov dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n", 4338ed95b21aSIlya Dryomov __func__, rbd_dev, cookie, notify_id, data_len); 4339ed95b21aSIlya Dryomov if (data_len) { 4340ed95b21aSIlya Dryomov ret = ceph_start_decoding(&p, end, 1, "NotifyMessage", 4341ed95b21aSIlya Dryomov &struct_v, &len); 4342ed95b21aSIlya Dryomov if (ret) { 4343ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d", 4344ed95b21aSIlya Dryomov ret); 4345ed95b21aSIlya Dryomov return; 4346ed95b21aSIlya Dryomov } 434752bb1f9bSIlya Dryomov 4348ed95b21aSIlya Dryomov notify_op = ceph_decode_32(&p); 4349ed95b21aSIlya Dryomov } else { 4350ed95b21aSIlya Dryomov /* legacy notification for header updates */ 4351ed95b21aSIlya Dryomov notify_op = RBD_NOTIFY_OP_HEADER_UPDATE; 4352ed95b21aSIlya Dryomov len = 0; 4353ed95b21aSIlya Dryomov } 4354ed95b21aSIlya Dryomov 4355ed95b21aSIlya Dryomov dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op); 4356ed95b21aSIlya Dryomov switch (notify_op) { 4357ed95b21aSIlya Dryomov case RBD_NOTIFY_OP_ACQUIRED_LOCK: 4358ed95b21aSIlya Dryomov rbd_handle_acquired_lock(rbd_dev, struct_v, &p); 4359ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 4360ed95b21aSIlya Dryomov break; 4361ed95b21aSIlya Dryomov case RBD_NOTIFY_OP_RELEASED_LOCK: 4362ed95b21aSIlya Dryomov rbd_handle_released_lock(rbd_dev, struct_v, &p); 4363ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 4364ed95b21aSIlya Dryomov break; 4365ed95b21aSIlya Dryomov case RBD_NOTIFY_OP_REQUEST_LOCK: 43663b77faa0SIlya Dryomov ret = rbd_handle_request_lock(rbd_dev, struct_v, &p); 43673b77faa0SIlya Dryomov if (ret <= 0) 4368ed95b21aSIlya Dryomov rbd_acknowledge_notify_result(rbd_dev, notify_id, 43693b77faa0SIlya Dryomov cookie, ret); 4370ed95b21aSIlya Dryomov else 4371ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 4372ed95b21aSIlya Dryomov break; 4373ed95b21aSIlya Dryomov case RBD_NOTIFY_OP_HEADER_UPDATE: 4374e627db08SAlex Elder ret = rbd_dev_refresh(rbd_dev); 4375e627db08SAlex Elder if (ret) 43769584d508SIlya Dryomov rbd_warn(rbd_dev, "refresh failed: %d", ret); 4377bf0d5f50SAlex Elder 4378ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 4379ed95b21aSIlya Dryomov break; 4380ed95b21aSIlya Dryomov default: 4381ed95b21aSIlya Dryomov if (rbd_is_lock_owner(rbd_dev)) 4382ed95b21aSIlya Dryomov rbd_acknowledge_notify_result(rbd_dev, notify_id, 4383ed95b21aSIlya Dryomov cookie, -EOPNOTSUPP); 4384ed95b21aSIlya Dryomov else 4385ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 4386ed95b21aSIlya Dryomov break; 43879969ebc5SAlex Elder } 43889969ebc5SAlex Elder } 43899969ebc5SAlex Elder 439099d16943SIlya Dryomov static void __rbd_unregister_watch(struct rbd_device *rbd_dev); 43919969ebc5SAlex Elder 4392922dab61SIlya Dryomov static void rbd_watch_errcb(void *arg, u64 cookie, int err) 4393bb040aa0SIlya Dryomov { 4394922dab61SIlya Dryomov struct rbd_device *rbd_dev = arg; 4395bb040aa0SIlya Dryomov 4396922dab61SIlya Dryomov rbd_warn(rbd_dev, "encountered watch error: %d", err); 4397bb040aa0SIlya Dryomov 4398ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 4399ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &rbd_empty_cid); 4400ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 4401bb040aa0SIlya Dryomov 440299d16943SIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 440399d16943SIlya Dryomov if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) { 440499d16943SIlya Dryomov __rbd_unregister_watch(rbd_dev); 440599d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_ERROR; 4406bb040aa0SIlya Dryomov 440799d16943SIlya Dryomov queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0); 4408bb040aa0SIlya Dryomov } 440999d16943SIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 4410bb040aa0SIlya Dryomov } 4411bb040aa0SIlya Dryomov 4412bb040aa0SIlya Dryomov /* 441399d16943SIlya Dryomov * watch_mutex must be locked 44149969ebc5SAlex Elder */ 441599d16943SIlya Dryomov static int __rbd_register_watch(struct rbd_device *rbd_dev) 44169969ebc5SAlex Elder { 44179969ebc5SAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 4418922dab61SIlya Dryomov struct ceph_osd_linger_request *handle; 44199969ebc5SAlex Elder 4420922dab61SIlya Dryomov rbd_assert(!rbd_dev->watch_handle); 442199d16943SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 44229969ebc5SAlex Elder 4423922dab61SIlya Dryomov handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid, 4424922dab61SIlya Dryomov &rbd_dev->header_oloc, rbd_watch_cb, 4425922dab61SIlya Dryomov rbd_watch_errcb, rbd_dev); 4426922dab61SIlya Dryomov if (IS_ERR(handle)) 4427922dab61SIlya Dryomov return PTR_ERR(handle); 44289969ebc5SAlex Elder 4429922dab61SIlya Dryomov rbd_dev->watch_handle = handle; 44308eb87565SAlex Elder return 0; 44319969ebc5SAlex Elder } 44329969ebc5SAlex Elder 443399d16943SIlya Dryomov /* 443499d16943SIlya Dryomov * watch_mutex must be locked 443599d16943SIlya Dryomov */ 443699d16943SIlya Dryomov static void __rbd_unregister_watch(struct rbd_device *rbd_dev) 4437fca27065SIlya Dryomov { 4438922dab61SIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 4439922dab61SIlya Dryomov int ret; 4440b30a01f2SIlya Dryomov 444199d16943SIlya Dryomov rbd_assert(rbd_dev->watch_handle); 444299d16943SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 4443b30a01f2SIlya Dryomov 4444922dab61SIlya Dryomov ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle); 4445922dab61SIlya Dryomov if (ret) 4446922dab61SIlya Dryomov rbd_warn(rbd_dev, "failed to unwatch: %d", ret); 4447b30a01f2SIlya Dryomov 4448922dab61SIlya Dryomov rbd_dev->watch_handle = NULL; 4449c525f036SIlya Dryomov } 4450c525f036SIlya Dryomov 445199d16943SIlya Dryomov static int rbd_register_watch(struct rbd_device *rbd_dev) 4452c525f036SIlya Dryomov { 445399d16943SIlya Dryomov int ret; 4454811c6688SIlya Dryomov 445599d16943SIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 445699d16943SIlya Dryomov rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED); 445799d16943SIlya Dryomov ret = __rbd_register_watch(rbd_dev); 445899d16943SIlya Dryomov if (ret) 445999d16943SIlya Dryomov goto out; 446099d16943SIlya Dryomov 446199d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED; 446299d16943SIlya Dryomov rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id; 446399d16943SIlya Dryomov 446499d16943SIlya Dryomov out: 446599d16943SIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 446699d16943SIlya Dryomov return ret; 446799d16943SIlya Dryomov } 446899d16943SIlya Dryomov 446999d16943SIlya Dryomov static void cancel_tasks_sync(struct rbd_device *rbd_dev) 447099d16943SIlya Dryomov { 447199d16943SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 447299d16943SIlya Dryomov 4473ed95b21aSIlya Dryomov cancel_work_sync(&rbd_dev->acquired_lock_work); 4474ed95b21aSIlya Dryomov cancel_work_sync(&rbd_dev->released_lock_work); 4475ed95b21aSIlya Dryomov cancel_delayed_work_sync(&rbd_dev->lock_dwork); 4476ed95b21aSIlya Dryomov cancel_work_sync(&rbd_dev->unlock_work); 447799d16943SIlya Dryomov } 447899d16943SIlya Dryomov 44790e4e1de5SIlya Dryomov /* 44800e4e1de5SIlya Dryomov * header_rwsem must not be held to avoid a deadlock with 44810e4e1de5SIlya Dryomov * rbd_dev_refresh() when flushing notifies. 44820e4e1de5SIlya Dryomov */ 448399d16943SIlya Dryomov static void rbd_unregister_watch(struct rbd_device *rbd_dev) 448499d16943SIlya Dryomov { 448599d16943SIlya Dryomov cancel_tasks_sync(rbd_dev); 448699d16943SIlya Dryomov 448799d16943SIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 448899d16943SIlya Dryomov if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) 448999d16943SIlya Dryomov __rbd_unregister_watch(rbd_dev); 449099d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED; 449199d16943SIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 449299d16943SIlya Dryomov 449323edca86SDongsheng Yang cancel_delayed_work_sync(&rbd_dev->watch_dwork); 4494811c6688SIlya Dryomov ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc); 4495fca27065SIlya Dryomov } 4496fca27065SIlya Dryomov 449714bb211dSIlya Dryomov /* 449814bb211dSIlya Dryomov * lock_rwsem must be held for write 449914bb211dSIlya Dryomov */ 450014bb211dSIlya Dryomov static void rbd_reacquire_lock(struct rbd_device *rbd_dev) 450114bb211dSIlya Dryomov { 450214bb211dSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 450314bb211dSIlya Dryomov char cookie[32]; 450414bb211dSIlya Dryomov int ret; 450514bb211dSIlya Dryomov 4506a2b1da09SIlya Dryomov if (!rbd_quiesce_lock(rbd_dev)) 4507a2b1da09SIlya Dryomov return; 450814bb211dSIlya Dryomov 450914bb211dSIlya Dryomov format_lock_cookie(rbd_dev, cookie); 451014bb211dSIlya Dryomov ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid, 451114bb211dSIlya Dryomov &rbd_dev->header_oloc, RBD_LOCK_NAME, 451214bb211dSIlya Dryomov CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie, 451314bb211dSIlya Dryomov RBD_LOCK_TAG, cookie); 451414bb211dSIlya Dryomov if (ret) { 451514bb211dSIlya Dryomov if (ret != -EOPNOTSUPP) 451614bb211dSIlya Dryomov rbd_warn(rbd_dev, "failed to update lock cookie: %d", 451714bb211dSIlya Dryomov ret); 451814bb211dSIlya Dryomov 451914bb211dSIlya Dryomov /* 452014bb211dSIlya Dryomov * Lock cookie cannot be updated on older OSDs, so do 452114bb211dSIlya Dryomov * a manual release and queue an acquire. 452214bb211dSIlya Dryomov */ 4523e1fddc8fSIlya Dryomov __rbd_release_lock(rbd_dev); 4524a2b1da09SIlya Dryomov queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0); 452514bb211dSIlya Dryomov } else { 4526edd8ca80SFlorian Margaine __rbd_lock(rbd_dev, cookie); 4527637cd060SIlya Dryomov wake_lock_waiters(rbd_dev, 0); 452814bb211dSIlya Dryomov } 452914bb211dSIlya Dryomov } 453014bb211dSIlya Dryomov 453199d16943SIlya Dryomov static void rbd_reregister_watch(struct work_struct *work) 453299d16943SIlya Dryomov { 453399d16943SIlya Dryomov struct rbd_device *rbd_dev = container_of(to_delayed_work(work), 453499d16943SIlya Dryomov struct rbd_device, watch_dwork); 453599d16943SIlya Dryomov int ret; 453699d16943SIlya Dryomov 453799d16943SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 453899d16943SIlya Dryomov 453999d16943SIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 454087c0fdedSIlya Dryomov if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) { 454187c0fdedSIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 454214bb211dSIlya Dryomov return; 454387c0fdedSIlya Dryomov } 454499d16943SIlya Dryomov 454599d16943SIlya Dryomov ret = __rbd_register_watch(rbd_dev); 454699d16943SIlya Dryomov if (ret) { 454799d16943SIlya Dryomov rbd_warn(rbd_dev, "failed to reregister watch: %d", ret); 45480b98acd6SIlya Dryomov if (ret != -EBLOCKLISTED && ret != -ENOENT) { 454999d16943SIlya Dryomov queue_delayed_work(rbd_dev->task_wq, 455099d16943SIlya Dryomov &rbd_dev->watch_dwork, 455199d16943SIlya Dryomov RBD_RETRY_DELAY); 455287c0fdedSIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 455314bb211dSIlya Dryomov return; 455499d16943SIlya Dryomov } 455599d16943SIlya Dryomov 4556637cd060SIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 4557637cd060SIlya Dryomov down_write(&rbd_dev->lock_rwsem); 4558637cd060SIlya Dryomov wake_lock_waiters(rbd_dev, ret); 4559637cd060SIlya Dryomov up_write(&rbd_dev->lock_rwsem); 4560637cd060SIlya Dryomov return; 4561637cd060SIlya Dryomov } 4562637cd060SIlya Dryomov 456399d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED; 456499d16943SIlya Dryomov rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id; 456599d16943SIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 456699d16943SIlya Dryomov 456714bb211dSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 456814bb211dSIlya Dryomov if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) 456914bb211dSIlya Dryomov rbd_reacquire_lock(rbd_dev); 457014bb211dSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 457114bb211dSIlya Dryomov 457299d16943SIlya Dryomov ret = rbd_dev_refresh(rbd_dev); 457399d16943SIlya Dryomov if (ret) 4574f6870cc9SColin Ian King rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret); 457599d16943SIlya Dryomov } 457699d16943SIlya Dryomov 457736be9a76SAlex Elder /* 4578f40eb349SAlex Elder * Synchronous osd object method call. Returns the number of bytes 4579f40eb349SAlex Elder * returned in the outbound buffer, or a negative error code. 458036be9a76SAlex Elder */ 458136be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev, 4582ecd4a68aSIlya Dryomov struct ceph_object_id *oid, 4583ecd4a68aSIlya Dryomov struct ceph_object_locator *oloc, 458436be9a76SAlex Elder const char *method_name, 45854157976bSAlex Elder const void *outbound, 458636be9a76SAlex Elder size_t outbound_size, 45874157976bSAlex Elder void *inbound, 4588e2a58ee5SAlex Elder size_t inbound_size) 458936be9a76SAlex Elder { 4590ecd4a68aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 4591ecd4a68aSIlya Dryomov struct page *req_page = NULL; 4592ecd4a68aSIlya Dryomov struct page *reply_page; 459336be9a76SAlex Elder int ret; 459436be9a76SAlex Elder 459536be9a76SAlex Elder /* 45966010a451SAlex Elder * Method calls are ultimately read operations. The result 45976010a451SAlex Elder * should placed into the inbound buffer provided. They 45986010a451SAlex Elder * also supply outbound data--parameters for the object 45996010a451SAlex Elder * method. Currently if this is present it will be a 46006010a451SAlex Elder * snapshot id. 460136be9a76SAlex Elder */ 4602ecd4a68aSIlya Dryomov if (outbound) { 4603ecd4a68aSIlya Dryomov if (outbound_size > PAGE_SIZE) 4604ecd4a68aSIlya Dryomov return -E2BIG; 460536be9a76SAlex Elder 4606ecd4a68aSIlya Dryomov req_page = alloc_page(GFP_KERNEL); 4607ecd4a68aSIlya Dryomov if (!req_page) 4608ecd4a68aSIlya Dryomov return -ENOMEM; 460936be9a76SAlex Elder 4610ecd4a68aSIlya Dryomov memcpy(page_address(req_page), outbound, outbound_size); 461104017e29SAlex Elder } 4612430c28c3SAlex Elder 4613ecd4a68aSIlya Dryomov reply_page = alloc_page(GFP_KERNEL); 4614ecd4a68aSIlya Dryomov if (!reply_page) { 4615ecd4a68aSIlya Dryomov if (req_page) 4616ecd4a68aSIlya Dryomov __free_page(req_page); 4617ecd4a68aSIlya Dryomov return -ENOMEM; 4618ecd4a68aSIlya Dryomov } 461936be9a76SAlex Elder 4620ecd4a68aSIlya Dryomov ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name, 4621ecd4a68aSIlya Dryomov CEPH_OSD_FLAG_READ, req_page, outbound_size, 462268ada915SIlya Dryomov &reply_page, &inbound_size); 4623ecd4a68aSIlya Dryomov if (!ret) { 4624ecd4a68aSIlya Dryomov memcpy(inbound, page_address(reply_page), inbound_size); 4625ecd4a68aSIlya Dryomov ret = inbound_size; 4626ecd4a68aSIlya Dryomov } 462757385b51SAlex Elder 4628ecd4a68aSIlya Dryomov if (req_page) 4629ecd4a68aSIlya Dryomov __free_page(req_page); 4630ecd4a68aSIlya Dryomov __free_page(reply_page); 463136be9a76SAlex Elder return ret; 463236be9a76SAlex Elder } 463336be9a76SAlex Elder 46347ad18afaSChristoph Hellwig static void rbd_queue_workfn(struct work_struct *work) 4635bc1ecc65SIlya Dryomov { 463659e542c8SIlya Dryomov struct rbd_img_request *img_request = 463759e542c8SIlya Dryomov container_of(work, struct rbd_img_request, work); 463859e542c8SIlya Dryomov struct rbd_device *rbd_dev = img_request->rbd_dev; 463959e542c8SIlya Dryomov enum obj_operation_type op_type = img_request->op_type; 464059e542c8SIlya Dryomov struct request *rq = blk_mq_rq_from_pdu(img_request); 4641bc1ecc65SIlya Dryomov u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT; 4642bc1ecc65SIlya Dryomov u64 length = blk_rq_bytes(rq); 46434e752f0aSJosh Durgin u64 mapping_size; 4644bc1ecc65SIlya Dryomov int result; 4645bc1ecc65SIlya Dryomov 4646bc1ecc65SIlya Dryomov /* Ignore/skip any zero-length requests */ 4647bc1ecc65SIlya Dryomov if (!length) { 4648bc1ecc65SIlya Dryomov dout("%s: zero-length request\n", __func__); 4649bc1ecc65SIlya Dryomov result = 0; 465059e542c8SIlya Dryomov goto err_img_request; 4651bc1ecc65SIlya Dryomov } 4652bc1ecc65SIlya Dryomov 46537ad18afaSChristoph Hellwig blk_mq_start_request(rq); 46547ad18afaSChristoph Hellwig 46554e752f0aSJosh Durgin down_read(&rbd_dev->header_rwsem); 46564e752f0aSJosh Durgin mapping_size = rbd_dev->mapping.size; 4657a52cc685SIlya Dryomov rbd_img_capture_header(img_request); 46584e752f0aSJosh Durgin up_read(&rbd_dev->header_rwsem); 46594e752f0aSJosh Durgin 46604e752f0aSJosh Durgin if (offset + length > mapping_size) { 4661bc1ecc65SIlya Dryomov rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset, 46624e752f0aSJosh Durgin length, mapping_size); 4663bc1ecc65SIlya Dryomov result = -EIO; 4664a52cc685SIlya Dryomov goto err_img_request; 4665bc1ecc65SIlya Dryomov } 4666bc1ecc65SIlya Dryomov 466721ed05a8SIlya Dryomov dout("%s rbd_dev %p img_req %p %s %llu~%llu\n", __func__, rbd_dev, 466821ed05a8SIlya Dryomov img_request, obj_op_name(op_type), offset, length); 466921ed05a8SIlya Dryomov 46706484cbe9SIlya Dryomov if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT) 46715a237819SIlya Dryomov result = rbd_img_fill_nodata(img_request, offset, length); 467290e98c52SGuangliang Zhao else 46735a237819SIlya Dryomov result = rbd_img_fill_from_bio(img_request, offset, length, 467490e98c52SGuangliang Zhao rq->bio); 46750192ce2eSIlya Dryomov if (result) 4676bc1ecc65SIlya Dryomov goto err_img_request; 4677bc1ecc65SIlya Dryomov 4678e1fddc8fSIlya Dryomov rbd_img_handle_request(img_request, 0); 4679bc1ecc65SIlya Dryomov return; 4680bc1ecc65SIlya Dryomov 4681bc1ecc65SIlya Dryomov err_img_request: 4682679a97d2SHannes Reinecke rbd_img_request_destroy(img_request); 4683bc1ecc65SIlya Dryomov if (result) 4684bc1ecc65SIlya Dryomov rbd_warn(rbd_dev, "%s %llx at %llx result %d", 46856d2940c8SGuangliang Zhao obj_op_name(op_type), length, offset, result); 46862a842acaSChristoph Hellwig blk_mq_end_request(rq, errno_to_blk_status(result)); 4687bc1ecc65SIlya Dryomov } 4688bc1ecc65SIlya Dryomov 4689fc17b653SChristoph Hellwig static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx, 46907ad18afaSChristoph Hellwig const struct blk_mq_queue_data *bd) 4691bc1ecc65SIlya Dryomov { 469259e542c8SIlya Dryomov struct rbd_device *rbd_dev = hctx->queue->queuedata; 469359e542c8SIlya Dryomov struct rbd_img_request *img_req = blk_mq_rq_to_pdu(bd->rq); 469459e542c8SIlya Dryomov enum obj_operation_type op_type; 4695bc1ecc65SIlya Dryomov 469659e542c8SIlya Dryomov switch (req_op(bd->rq)) { 469759e542c8SIlya Dryomov case REQ_OP_DISCARD: 469859e542c8SIlya Dryomov op_type = OBJ_OP_DISCARD; 469959e542c8SIlya Dryomov break; 470059e542c8SIlya Dryomov case REQ_OP_WRITE_ZEROES: 470159e542c8SIlya Dryomov op_type = OBJ_OP_ZEROOUT; 470259e542c8SIlya Dryomov break; 470359e542c8SIlya Dryomov case REQ_OP_WRITE: 470459e542c8SIlya Dryomov op_type = OBJ_OP_WRITE; 470559e542c8SIlya Dryomov break; 470659e542c8SIlya Dryomov case REQ_OP_READ: 470759e542c8SIlya Dryomov op_type = OBJ_OP_READ; 470859e542c8SIlya Dryomov break; 470959e542c8SIlya Dryomov default: 471059e542c8SIlya Dryomov rbd_warn(rbd_dev, "unknown req_op %d", req_op(bd->rq)); 471159e542c8SIlya Dryomov return BLK_STS_IOERR; 471259e542c8SIlya Dryomov } 471359e542c8SIlya Dryomov 471459e542c8SIlya Dryomov rbd_img_request_init(img_req, rbd_dev, op_type); 471559e542c8SIlya Dryomov 471659e542c8SIlya Dryomov if (rbd_img_is_write(img_req)) { 471759e542c8SIlya Dryomov if (rbd_is_ro(rbd_dev)) { 471859e542c8SIlya Dryomov rbd_warn(rbd_dev, "%s on read-only mapping", 471959e542c8SIlya Dryomov obj_op_name(img_req->op_type)); 472059e542c8SIlya Dryomov return BLK_STS_IOERR; 472159e542c8SIlya Dryomov } 472259e542c8SIlya Dryomov rbd_assert(!rbd_is_snap(rbd_dev)); 472359e542c8SIlya Dryomov } 472459e542c8SIlya Dryomov 472559e542c8SIlya Dryomov INIT_WORK(&img_req->work, rbd_queue_workfn); 472659e542c8SIlya Dryomov queue_work(rbd_wq, &img_req->work); 4727fc17b653SChristoph Hellwig return BLK_STS_OK; 4728bf0d5f50SAlex Elder } 4729bf0d5f50SAlex Elder 4730602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev) 4731602adf40SYehuda Sadeh { 4732195b1956SChristoph Hellwig blk_cleanup_disk(rbd_dev->disk); 47337ad18afaSChristoph Hellwig blk_mq_free_tag_set(&rbd_dev->tag_set); 47345769ed0cSIlya Dryomov rbd_dev->disk = NULL; 4735602adf40SYehuda Sadeh } 4736602adf40SYehuda Sadeh 4737788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev, 4738fe5478e0SIlya Dryomov struct ceph_object_id *oid, 4739fe5478e0SIlya Dryomov struct ceph_object_locator *oloc, 4740fe5478e0SIlya Dryomov void *buf, int buf_len) 4741788e2df3SAlex Elder 4742788e2df3SAlex Elder { 4743fe5478e0SIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 4744fe5478e0SIlya Dryomov struct ceph_osd_request *req; 4745fe5478e0SIlya Dryomov struct page **pages; 4746fe5478e0SIlya Dryomov int num_pages = calc_pages_for(0, buf_len); 4747788e2df3SAlex Elder int ret; 4748788e2df3SAlex Elder 4749fe5478e0SIlya Dryomov req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL); 4750fe5478e0SIlya Dryomov if (!req) 4751fe5478e0SIlya Dryomov return -ENOMEM; 4752788e2df3SAlex Elder 4753fe5478e0SIlya Dryomov ceph_oid_copy(&req->r_base_oid, oid); 4754fe5478e0SIlya Dryomov ceph_oloc_copy(&req->r_base_oloc, oloc); 4755fe5478e0SIlya Dryomov req->r_flags = CEPH_OSD_FLAG_READ; 4756788e2df3SAlex Elder 4757fe5478e0SIlya Dryomov pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); 4758fe5478e0SIlya Dryomov if (IS_ERR(pages)) { 4759fe5478e0SIlya Dryomov ret = PTR_ERR(pages); 4760fe5478e0SIlya Dryomov goto out_req; 4761fe5478e0SIlya Dryomov } 47621ceae7efSAlex Elder 4763fe5478e0SIlya Dryomov osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0); 4764fe5478e0SIlya Dryomov osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false, 4765fe5478e0SIlya Dryomov true); 4766788e2df3SAlex Elder 476726f887e0SIlya Dryomov ret = ceph_osdc_alloc_messages(req, GFP_KERNEL); 476826f887e0SIlya Dryomov if (ret) 476926f887e0SIlya Dryomov goto out_req; 477026f887e0SIlya Dryomov 4771fe5478e0SIlya Dryomov ceph_osdc_start_request(osdc, req, false); 4772fe5478e0SIlya Dryomov ret = ceph_osdc_wait_request(osdc, req); 4773fe5478e0SIlya Dryomov if (ret >= 0) 4774fe5478e0SIlya Dryomov ceph_copy_from_page_vector(pages, buf, 0, ret); 4775fe5478e0SIlya Dryomov 4776fe5478e0SIlya Dryomov out_req: 4777fe5478e0SIlya Dryomov ceph_osdc_put_request(req); 4778788e2df3SAlex Elder return ret; 4779788e2df3SAlex Elder } 4780788e2df3SAlex Elder 4781602adf40SYehuda Sadeh /* 4782662518b1SAlex Elder * Read the complete header for the given rbd device. On successful 4783662518b1SAlex Elder * return, the rbd_dev->header field will contain up-to-date 4784662518b1SAlex Elder * information about the image. 47854156d998SAlex Elder */ 478699a41ebcSAlex Elder static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev) 47874156d998SAlex Elder { 47884156d998SAlex Elder struct rbd_image_header_ondisk *ondisk = NULL; 47894156d998SAlex Elder u32 snap_count = 0; 47904156d998SAlex Elder u64 names_size = 0; 47914156d998SAlex Elder u32 want_count; 47924156d998SAlex Elder int ret; 47934156d998SAlex Elder 47944156d998SAlex Elder /* 47954156d998SAlex Elder * The complete header will include an array of its 64-bit 47964156d998SAlex Elder * snapshot ids, followed by the names of those snapshots as 47974156d998SAlex Elder * a contiguous block of NUL-terminated strings. Note that 47984156d998SAlex Elder * the number of snapshots could change by the time we read 47994156d998SAlex Elder * it in, in which case we re-read it. 48004156d998SAlex Elder */ 48014156d998SAlex Elder do { 48024156d998SAlex Elder size_t size; 48034156d998SAlex Elder 48044156d998SAlex Elder kfree(ondisk); 48054156d998SAlex Elder 48064156d998SAlex Elder size = sizeof (*ondisk); 48074156d998SAlex Elder size += snap_count * sizeof (struct rbd_image_snap_ondisk); 48084156d998SAlex Elder size += names_size; 48094156d998SAlex Elder ondisk = kmalloc(size, GFP_KERNEL); 48104156d998SAlex Elder if (!ondisk) 4811662518b1SAlex Elder return -ENOMEM; 48124156d998SAlex Elder 4813fe5478e0SIlya Dryomov ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid, 4814fe5478e0SIlya Dryomov &rbd_dev->header_oloc, ondisk, size); 48154156d998SAlex Elder if (ret < 0) 4816662518b1SAlex Elder goto out; 4817c0cd10dbSAlex Elder if ((size_t)ret < size) { 48184156d998SAlex Elder ret = -ENXIO; 481906ecc6cbSAlex Elder rbd_warn(rbd_dev, "short header read (want %zd got %d)", 482006ecc6cbSAlex Elder size, ret); 4821662518b1SAlex Elder goto out; 48224156d998SAlex Elder } 48234156d998SAlex Elder if (!rbd_dev_ondisk_valid(ondisk)) { 48244156d998SAlex Elder ret = -ENXIO; 482506ecc6cbSAlex Elder rbd_warn(rbd_dev, "invalid header"); 4826662518b1SAlex Elder goto out; 48274156d998SAlex Elder } 48284156d998SAlex Elder 48294156d998SAlex Elder names_size = le64_to_cpu(ondisk->snap_names_len); 48304156d998SAlex Elder want_count = snap_count; 48314156d998SAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 48324156d998SAlex Elder } while (snap_count != want_count); 48334156d998SAlex Elder 4834662518b1SAlex Elder ret = rbd_header_from_disk(rbd_dev, ondisk); 4835662518b1SAlex Elder out: 48364156d998SAlex Elder kfree(ondisk); 48374156d998SAlex Elder 4838dfc5606dSYehuda Sadeh return ret; 4839602adf40SYehuda Sadeh } 4840602adf40SYehuda Sadeh 48419875201eSJosh Durgin static void rbd_dev_update_size(struct rbd_device *rbd_dev) 48429875201eSJosh Durgin { 48439875201eSJosh Durgin sector_t size; 48449875201eSJosh Durgin 48459875201eSJosh Durgin /* 4846811c6688SIlya Dryomov * If EXISTS is not set, rbd_dev->disk may be NULL, so don't 4847811c6688SIlya Dryomov * try to update its size. If REMOVING is set, updating size 4848811c6688SIlya Dryomov * is just useless work since the device can't be opened. 48499875201eSJosh Durgin */ 4850811c6688SIlya Dryomov if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) && 4851811c6688SIlya Dryomov !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) { 48529875201eSJosh Durgin size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE; 48539875201eSJosh Durgin dout("setting size to %llu sectors", (unsigned long long)size); 4854e864e49aSChristoph Hellwig set_capacity_and_notify(rbd_dev->disk, size); 48559875201eSJosh Durgin } 48569875201eSJosh Durgin } 48579875201eSJosh Durgin 4858cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev) 48591fe5e993SAlex Elder { 4860e627db08SAlex Elder u64 mapping_size; 48611fe5e993SAlex Elder int ret; 48621fe5e993SAlex Elder 4863cfbf6377SAlex Elder down_write(&rbd_dev->header_rwsem); 48643b5cf2a2SAlex Elder mapping_size = rbd_dev->mapping.size; 4865a720ae09SIlya Dryomov 4866a720ae09SIlya Dryomov ret = rbd_dev_header_info(rbd_dev); 486752bb1f9bSIlya Dryomov if (ret) 486873e39e4dSIlya Dryomov goto out; 486915228edeSAlex Elder 4870e8f59b59SIlya Dryomov /* 4871e8f59b59SIlya Dryomov * If there is a parent, see if it has disappeared due to the 4872e8f59b59SIlya Dryomov * mapped image getting flattened. 4873e8f59b59SIlya Dryomov */ 4874e8f59b59SIlya Dryomov if (rbd_dev->parent) { 4875e8f59b59SIlya Dryomov ret = rbd_dev_v2_parent_info(rbd_dev); 4876e8f59b59SIlya Dryomov if (ret) 487773e39e4dSIlya Dryomov goto out; 4878e8f59b59SIlya Dryomov } 4879e8f59b59SIlya Dryomov 4880686238b7SIlya Dryomov rbd_assert(!rbd_is_snap(rbd_dev)); 48815ff1108cSIlya Dryomov rbd_dev->mapping.size = rbd_dev->header.image_size; 48825ff1108cSIlya Dryomov 488373e39e4dSIlya Dryomov out: 4884cfbf6377SAlex Elder up_write(&rbd_dev->header_rwsem); 488573e39e4dSIlya Dryomov if (!ret && mapping_size != rbd_dev->mapping.size) 48869875201eSJosh Durgin rbd_dev_update_size(rbd_dev); 48871fe5e993SAlex Elder 488873e39e4dSIlya Dryomov return ret; 48891fe5e993SAlex Elder } 48901fe5e993SAlex Elder 4891f363b089SEric Biggers static const struct blk_mq_ops rbd_mq_ops = { 48927ad18afaSChristoph Hellwig .queue_rq = rbd_queue_rq, 48937ad18afaSChristoph Hellwig }; 48947ad18afaSChristoph Hellwig 4895602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev) 4896602adf40SYehuda Sadeh { 4897602adf40SYehuda Sadeh struct gendisk *disk; 4898602adf40SYehuda Sadeh struct request_queue *q; 4899420efbdfSIlya Dryomov unsigned int objset_bytes = 4900420efbdfSIlya Dryomov rbd_dev->layout.object_size * rbd_dev->layout.stripe_count; 49017ad18afaSChristoph Hellwig int err; 4902602adf40SYehuda Sadeh 49037ad18afaSChristoph Hellwig memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set)); 49047ad18afaSChristoph Hellwig rbd_dev->tag_set.ops = &rbd_mq_ops; 4905b5584180SIlya Dryomov rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth; 49067ad18afaSChristoph Hellwig rbd_dev->tag_set.numa_node = NUMA_NO_NODE; 490756d18f62SMing Lei rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; 4908f9b6b98dSHannes Reinecke rbd_dev->tag_set.nr_hw_queues = num_present_cpus(); 490959e542c8SIlya Dryomov rbd_dev->tag_set.cmd_size = sizeof(struct rbd_img_request); 49107ad18afaSChristoph Hellwig 49117ad18afaSChristoph Hellwig err = blk_mq_alloc_tag_set(&rbd_dev->tag_set); 49127ad18afaSChristoph Hellwig if (err) 4913195b1956SChristoph Hellwig return err; 4914029bcbd8SJosh Durgin 4915195b1956SChristoph Hellwig disk = blk_mq_alloc_disk(&rbd_dev->tag_set, rbd_dev); 4916195b1956SChristoph Hellwig if (IS_ERR(disk)) { 4917195b1956SChristoph Hellwig err = PTR_ERR(disk); 49187ad18afaSChristoph Hellwig goto out_tag_set; 49197ad18afaSChristoph Hellwig } 4920195b1956SChristoph Hellwig q = disk->queue; 4921195b1956SChristoph Hellwig 4922195b1956SChristoph Hellwig snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", 4923195b1956SChristoph Hellwig rbd_dev->dev_id); 4924195b1956SChristoph Hellwig disk->major = rbd_dev->major; 4925195b1956SChristoph Hellwig disk->first_minor = rbd_dev->minor; 49261ebe2e5fSChristoph Hellwig if (single_major) 4927195b1956SChristoph Hellwig disk->minors = (1 << RBD_SINGLE_MAJOR_PART_SHIFT); 49281ebe2e5fSChristoph Hellwig else 4929195b1956SChristoph Hellwig disk->minors = RBD_MINORS_PER_MAJOR; 4930195b1956SChristoph Hellwig disk->fops = &rbd_bd_ops; 49310077a500SIlya Dryomov disk->private_data = rbd_dev; 49327ad18afaSChristoph Hellwig 49338b904b5bSBart Van Assche blk_queue_flag_set(QUEUE_FLAG_NONROT, q); 4934d8a2c89cSIlya Dryomov /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */ 4935593a9e7bSAlex Elder 4936420efbdfSIlya Dryomov blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT); 49370d9fde4fSIlya Dryomov q->limits.max_sectors = queue_max_hw_sectors(q); 493821acdf45SIlya Dryomov blk_queue_max_segments(q, USHRT_MAX); 493924f1df60SIlya Dryomov blk_queue_max_segment_size(q, UINT_MAX); 494016d80c54SIlya Dryomov blk_queue_io_min(q, rbd_dev->opts->alloc_size); 494116d80c54SIlya Dryomov blk_queue_io_opt(q, rbd_dev->opts->alloc_size); 4942029bcbd8SJosh Durgin 4943d9360540SIlya Dryomov if (rbd_dev->opts->trim) { 49448b904b5bSBart Van Assche blk_queue_flag_set(QUEUE_FLAG_DISCARD, q); 494516d80c54SIlya Dryomov q->limits.discard_granularity = rbd_dev->opts->alloc_size; 4946420efbdfSIlya Dryomov blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT); 4947420efbdfSIlya Dryomov blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT); 4948d9360540SIlya Dryomov } 494990e98c52SGuangliang Zhao 4950bae818eeSRonny Hegewald if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC)) 49511cb039f3SChristoph Hellwig blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q); 4952bae818eeSRonny Hegewald 4953602adf40SYehuda Sadeh rbd_dev->disk = disk; 4954602adf40SYehuda Sadeh 4955602adf40SYehuda Sadeh return 0; 49567ad18afaSChristoph Hellwig out_tag_set: 49577ad18afaSChristoph Hellwig blk_mq_free_tag_set(&rbd_dev->tag_set); 49587ad18afaSChristoph Hellwig return err; 4959602adf40SYehuda Sadeh } 4960602adf40SYehuda Sadeh 4961dfc5606dSYehuda Sadeh /* 4962dfc5606dSYehuda Sadeh sysfs 4963dfc5606dSYehuda Sadeh */ 4964602adf40SYehuda Sadeh 4965593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev) 4966593a9e7bSAlex Elder { 4967593a9e7bSAlex Elder return container_of(dev, struct rbd_device, dev); 4968593a9e7bSAlex Elder } 4969593a9e7bSAlex Elder 4970dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev, 4971dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 4972602adf40SYehuda Sadeh { 4973593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4974dfc5606dSYehuda Sadeh 4975fc71d833SAlex Elder return sprintf(buf, "%llu\n", 4976fc71d833SAlex Elder (unsigned long long)rbd_dev->mapping.size); 4977602adf40SYehuda Sadeh } 4978602adf40SYehuda Sadeh 497934b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev, 498034b13184SAlex Elder struct device_attribute *attr, char *buf) 498134b13184SAlex Elder { 498234b13184SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 498334b13184SAlex Elder 4984fa58bcadSIlya Dryomov return sprintf(buf, "0x%016llx\n", rbd_dev->header.features); 498534b13184SAlex Elder } 498634b13184SAlex Elder 4987dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev, 4988dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 4989602adf40SYehuda Sadeh { 4990593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4991dfc5606dSYehuda Sadeh 4992fc71d833SAlex Elder if (rbd_dev->major) 4993dfc5606dSYehuda Sadeh return sprintf(buf, "%d\n", rbd_dev->major); 4994fc71d833SAlex Elder 4995fc71d833SAlex Elder return sprintf(buf, "(none)\n"); 4996dd82fff1SIlya Dryomov } 4997fc71d833SAlex Elder 4998dd82fff1SIlya Dryomov static ssize_t rbd_minor_show(struct device *dev, 4999dd82fff1SIlya Dryomov struct device_attribute *attr, char *buf) 5000dd82fff1SIlya Dryomov { 5001dd82fff1SIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 5002dd82fff1SIlya Dryomov 5003dd82fff1SIlya Dryomov return sprintf(buf, "%d\n", rbd_dev->minor); 5004dfc5606dSYehuda Sadeh } 5005dfc5606dSYehuda Sadeh 5006005a07bfSIlya Dryomov static ssize_t rbd_client_addr_show(struct device *dev, 5007005a07bfSIlya Dryomov struct device_attribute *attr, char *buf) 5008005a07bfSIlya Dryomov { 5009005a07bfSIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 5010005a07bfSIlya Dryomov struct ceph_entity_addr *client_addr = 5011005a07bfSIlya Dryomov ceph_client_addr(rbd_dev->rbd_client->client); 5012005a07bfSIlya Dryomov 5013005a07bfSIlya Dryomov return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr, 5014005a07bfSIlya Dryomov le32_to_cpu(client_addr->nonce)); 5015005a07bfSIlya Dryomov } 5016005a07bfSIlya Dryomov 5017dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev, 5018dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 5019dfc5606dSYehuda Sadeh { 5020593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 5021dfc5606dSYehuda Sadeh 50221dbb4399SAlex Elder return sprintf(buf, "client%lld\n", 5023033268a5SIlya Dryomov ceph_client_gid(rbd_dev->rbd_client->client)); 5024dfc5606dSYehuda Sadeh } 5025dfc5606dSYehuda Sadeh 5026267fb90bSMike Christie static ssize_t rbd_cluster_fsid_show(struct device *dev, 5027267fb90bSMike Christie struct device_attribute *attr, char *buf) 5028267fb90bSMike Christie { 5029267fb90bSMike Christie struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 5030267fb90bSMike Christie 5031267fb90bSMike Christie return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid); 5032267fb90bSMike Christie } 5033267fb90bSMike Christie 50340d6d1e9cSMike Christie static ssize_t rbd_config_info_show(struct device *dev, 50350d6d1e9cSMike Christie struct device_attribute *attr, char *buf) 50360d6d1e9cSMike Christie { 50370d6d1e9cSMike Christie struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 50380d6d1e9cSMike Christie 5039f44d04e6SIlya Dryomov if (!capable(CAP_SYS_ADMIN)) 5040f44d04e6SIlya Dryomov return -EPERM; 5041f44d04e6SIlya Dryomov 50420d6d1e9cSMike Christie return sprintf(buf, "%s\n", rbd_dev->config_info); 5043dfc5606dSYehuda Sadeh } 5044dfc5606dSYehuda Sadeh 5045dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev, 5046dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 5047dfc5606dSYehuda Sadeh { 5048593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 5049dfc5606dSYehuda Sadeh 50500d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->pool_name); 5051dfc5606dSYehuda Sadeh } 5052dfc5606dSYehuda Sadeh 50539bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev, 50549bb2f334SAlex Elder struct device_attribute *attr, char *buf) 50559bb2f334SAlex Elder { 50569bb2f334SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 50579bb2f334SAlex Elder 50580d7dbfceSAlex Elder return sprintf(buf, "%llu\n", 50590d7dbfceSAlex Elder (unsigned long long) rbd_dev->spec->pool_id); 50609bb2f334SAlex Elder } 50619bb2f334SAlex Elder 5062b26c047bSIlya Dryomov static ssize_t rbd_pool_ns_show(struct device *dev, 5063b26c047bSIlya Dryomov struct device_attribute *attr, char *buf) 5064b26c047bSIlya Dryomov { 5065b26c047bSIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 5066b26c047bSIlya Dryomov 5067b26c047bSIlya Dryomov return sprintf(buf, "%s\n", rbd_dev->spec->pool_ns ?: ""); 5068b26c047bSIlya Dryomov } 5069b26c047bSIlya Dryomov 5070dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev, 5071dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 5072dfc5606dSYehuda Sadeh { 5073593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 5074dfc5606dSYehuda Sadeh 5075a92ffdf8SAlex Elder if (rbd_dev->spec->image_name) 50760d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_name); 5077a92ffdf8SAlex Elder 5078a92ffdf8SAlex Elder return sprintf(buf, "(unknown)\n"); 5079dfc5606dSYehuda Sadeh } 5080dfc5606dSYehuda Sadeh 5081589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev, 5082589d30e0SAlex Elder struct device_attribute *attr, char *buf) 5083589d30e0SAlex Elder { 5084589d30e0SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 5085589d30e0SAlex Elder 50860d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_id); 5087589d30e0SAlex Elder } 5088589d30e0SAlex Elder 508934b13184SAlex Elder /* 509034b13184SAlex Elder * Shows the name of the currently-mapped snapshot (or 509134b13184SAlex Elder * RBD_SNAP_HEAD_NAME for the base image). 509234b13184SAlex Elder */ 5093dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev, 5094dfc5606dSYehuda Sadeh struct device_attribute *attr, 5095dfc5606dSYehuda Sadeh char *buf) 5096dfc5606dSYehuda Sadeh { 5097593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 5098dfc5606dSYehuda Sadeh 50990d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->snap_name); 5100dfc5606dSYehuda Sadeh } 5101dfc5606dSYehuda Sadeh 510292a58671SMike Christie static ssize_t rbd_snap_id_show(struct device *dev, 510392a58671SMike Christie struct device_attribute *attr, char *buf) 510492a58671SMike Christie { 510592a58671SMike Christie struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 510692a58671SMike Christie 510792a58671SMike Christie return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id); 510892a58671SMike Christie } 510992a58671SMike Christie 511086b00e0dSAlex Elder /* 5111ff96128fSIlya Dryomov * For a v2 image, shows the chain of parent images, separated by empty 5112ff96128fSIlya Dryomov * lines. For v1 images or if there is no parent, shows "(no parent 5113ff96128fSIlya Dryomov * image)". 511486b00e0dSAlex Elder */ 511586b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev, 511686b00e0dSAlex Elder struct device_attribute *attr, 511786b00e0dSAlex Elder char *buf) 511886b00e0dSAlex Elder { 511986b00e0dSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 5120ff96128fSIlya Dryomov ssize_t count = 0; 512186b00e0dSAlex Elder 5122ff96128fSIlya Dryomov if (!rbd_dev->parent) 512386b00e0dSAlex Elder return sprintf(buf, "(no parent image)\n"); 512486b00e0dSAlex Elder 5125ff96128fSIlya Dryomov for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) { 5126ff96128fSIlya Dryomov struct rbd_spec *spec = rbd_dev->parent_spec; 512786b00e0dSAlex Elder 5128ff96128fSIlya Dryomov count += sprintf(&buf[count], "%s" 5129ff96128fSIlya Dryomov "pool_id %llu\npool_name %s\n" 5130e92c0eafSIlya Dryomov "pool_ns %s\n" 5131ff96128fSIlya Dryomov "image_id %s\nimage_name %s\n" 5132ff96128fSIlya Dryomov "snap_id %llu\nsnap_name %s\n" 5133ff96128fSIlya Dryomov "overlap %llu\n", 5134ff96128fSIlya Dryomov !count ? "" : "\n", /* first? */ 5135ff96128fSIlya Dryomov spec->pool_id, spec->pool_name, 5136e92c0eafSIlya Dryomov spec->pool_ns ?: "", 5137ff96128fSIlya Dryomov spec->image_id, spec->image_name ?: "(unknown)", 5138ff96128fSIlya Dryomov spec->snap_id, spec->snap_name, 5139ff96128fSIlya Dryomov rbd_dev->parent_overlap); 5140ff96128fSIlya Dryomov } 514186b00e0dSAlex Elder 514286b00e0dSAlex Elder return count; 514386b00e0dSAlex Elder } 514486b00e0dSAlex Elder 5145dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev, 5146dfc5606dSYehuda Sadeh struct device_attribute *attr, 5147dfc5606dSYehuda Sadeh const char *buf, 5148dfc5606dSYehuda Sadeh size_t size) 5149dfc5606dSYehuda Sadeh { 5150593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 5151b813623aSAlex Elder int ret; 5152602adf40SYehuda Sadeh 5153f44d04e6SIlya Dryomov if (!capable(CAP_SYS_ADMIN)) 5154f44d04e6SIlya Dryomov return -EPERM; 5155f44d04e6SIlya Dryomov 5156cc4a38bdSAlex Elder ret = rbd_dev_refresh(rbd_dev); 5157e627db08SAlex Elder if (ret) 515852bb1f9bSIlya Dryomov return ret; 5159b813623aSAlex Elder 516052bb1f9bSIlya Dryomov return size; 5161dfc5606dSYehuda Sadeh } 5162602adf40SYehuda Sadeh 51635657a819SJoe Perches static DEVICE_ATTR(size, 0444, rbd_size_show, NULL); 51645657a819SJoe Perches static DEVICE_ATTR(features, 0444, rbd_features_show, NULL); 51655657a819SJoe Perches static DEVICE_ATTR(major, 0444, rbd_major_show, NULL); 51665657a819SJoe Perches static DEVICE_ATTR(minor, 0444, rbd_minor_show, NULL); 51675657a819SJoe Perches static DEVICE_ATTR(client_addr, 0444, rbd_client_addr_show, NULL); 51685657a819SJoe Perches static DEVICE_ATTR(client_id, 0444, rbd_client_id_show, NULL); 51695657a819SJoe Perches static DEVICE_ATTR(cluster_fsid, 0444, rbd_cluster_fsid_show, NULL); 51705657a819SJoe Perches static DEVICE_ATTR(config_info, 0400, rbd_config_info_show, NULL); 51715657a819SJoe Perches static DEVICE_ATTR(pool, 0444, rbd_pool_show, NULL); 51725657a819SJoe Perches static DEVICE_ATTR(pool_id, 0444, rbd_pool_id_show, NULL); 5173b26c047bSIlya Dryomov static DEVICE_ATTR(pool_ns, 0444, rbd_pool_ns_show, NULL); 51745657a819SJoe Perches static DEVICE_ATTR(name, 0444, rbd_name_show, NULL); 51755657a819SJoe Perches static DEVICE_ATTR(image_id, 0444, rbd_image_id_show, NULL); 51765657a819SJoe Perches static DEVICE_ATTR(refresh, 0200, NULL, rbd_image_refresh); 51775657a819SJoe Perches static DEVICE_ATTR(current_snap, 0444, rbd_snap_show, NULL); 51785657a819SJoe Perches static DEVICE_ATTR(snap_id, 0444, rbd_snap_id_show, NULL); 51795657a819SJoe Perches static DEVICE_ATTR(parent, 0444, rbd_parent_show, NULL); 5180dfc5606dSYehuda Sadeh 5181dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = { 5182dfc5606dSYehuda Sadeh &dev_attr_size.attr, 518334b13184SAlex Elder &dev_attr_features.attr, 5184dfc5606dSYehuda Sadeh &dev_attr_major.attr, 5185dd82fff1SIlya Dryomov &dev_attr_minor.attr, 5186005a07bfSIlya Dryomov &dev_attr_client_addr.attr, 5187dfc5606dSYehuda Sadeh &dev_attr_client_id.attr, 5188267fb90bSMike Christie &dev_attr_cluster_fsid.attr, 51890d6d1e9cSMike Christie &dev_attr_config_info.attr, 5190dfc5606dSYehuda Sadeh &dev_attr_pool.attr, 51919bb2f334SAlex Elder &dev_attr_pool_id.attr, 5192b26c047bSIlya Dryomov &dev_attr_pool_ns.attr, 5193dfc5606dSYehuda Sadeh &dev_attr_name.attr, 5194589d30e0SAlex Elder &dev_attr_image_id.attr, 5195dfc5606dSYehuda Sadeh &dev_attr_current_snap.attr, 519692a58671SMike Christie &dev_attr_snap_id.attr, 519786b00e0dSAlex Elder &dev_attr_parent.attr, 5198dfc5606dSYehuda Sadeh &dev_attr_refresh.attr, 5199dfc5606dSYehuda Sadeh NULL 5200dfc5606dSYehuda Sadeh }; 5201dfc5606dSYehuda Sadeh 5202dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = { 5203dfc5606dSYehuda Sadeh .attrs = rbd_attrs, 5204dfc5606dSYehuda Sadeh }; 5205dfc5606dSYehuda Sadeh 5206dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = { 5207dfc5606dSYehuda Sadeh &rbd_attr_group, 5208dfc5606dSYehuda Sadeh NULL 5209dfc5606dSYehuda Sadeh }; 5210dfc5606dSYehuda Sadeh 52116cac4695SIlya Dryomov static void rbd_dev_release(struct device *dev); 5212dfc5606dSYehuda Sadeh 5213b9942bc9SBhumika Goyal static const struct device_type rbd_device_type = { 5214dfc5606dSYehuda Sadeh .name = "rbd", 5215dfc5606dSYehuda Sadeh .groups = rbd_attr_groups, 52166cac4695SIlya Dryomov .release = rbd_dev_release, 5217dfc5606dSYehuda Sadeh }; 5218dfc5606dSYehuda Sadeh 52198b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec) 52208b8fb99cSAlex Elder { 52218b8fb99cSAlex Elder kref_get(&spec->kref); 52228b8fb99cSAlex Elder 52238b8fb99cSAlex Elder return spec; 52248b8fb99cSAlex Elder } 52258b8fb99cSAlex Elder 52268b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref); 52278b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec) 52288b8fb99cSAlex Elder { 52298b8fb99cSAlex Elder if (spec) 52308b8fb99cSAlex Elder kref_put(&spec->kref, rbd_spec_free); 52318b8fb99cSAlex Elder } 52328b8fb99cSAlex Elder 52338b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void) 52348b8fb99cSAlex Elder { 52358b8fb99cSAlex Elder struct rbd_spec *spec; 52368b8fb99cSAlex Elder 52378b8fb99cSAlex Elder spec = kzalloc(sizeof (*spec), GFP_KERNEL); 52388b8fb99cSAlex Elder if (!spec) 52398b8fb99cSAlex Elder return NULL; 524004077599SIlya Dryomov 524104077599SIlya Dryomov spec->pool_id = CEPH_NOPOOL; 524204077599SIlya Dryomov spec->snap_id = CEPH_NOSNAP; 52438b8fb99cSAlex Elder kref_init(&spec->kref); 52448b8fb99cSAlex Elder 52458b8fb99cSAlex Elder return spec; 52468b8fb99cSAlex Elder } 52478b8fb99cSAlex Elder 52488b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref) 52498b8fb99cSAlex Elder { 52508b8fb99cSAlex Elder struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref); 52518b8fb99cSAlex Elder 52528b8fb99cSAlex Elder kfree(spec->pool_name); 5253b26c047bSIlya Dryomov kfree(spec->pool_ns); 52548b8fb99cSAlex Elder kfree(spec->image_id); 52558b8fb99cSAlex Elder kfree(spec->image_name); 52568b8fb99cSAlex Elder kfree(spec->snap_name); 52578b8fb99cSAlex Elder kfree(spec); 52588b8fb99cSAlex Elder } 52598b8fb99cSAlex Elder 52601643dfa4SIlya Dryomov static void rbd_dev_free(struct rbd_device *rbd_dev) 5261dd5ac32dSIlya Dryomov { 526299d16943SIlya Dryomov WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED); 5263ed95b21aSIlya Dryomov WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED); 5264dd5ac32dSIlya Dryomov 5265c41d13a3SIlya Dryomov ceph_oid_destroy(&rbd_dev->header_oid); 52666b6dddbeSIlya Dryomov ceph_oloc_destroy(&rbd_dev->header_oloc); 52670d6d1e9cSMike Christie kfree(rbd_dev->config_info); 5268c41d13a3SIlya Dryomov 5269dd5ac32dSIlya Dryomov rbd_put_client(rbd_dev->rbd_client); 5270dd5ac32dSIlya Dryomov rbd_spec_put(rbd_dev->spec); 5271dd5ac32dSIlya Dryomov kfree(rbd_dev->opts); 5272dd5ac32dSIlya Dryomov kfree(rbd_dev); 52731643dfa4SIlya Dryomov } 52741643dfa4SIlya Dryomov 52751643dfa4SIlya Dryomov static void rbd_dev_release(struct device *dev) 52761643dfa4SIlya Dryomov { 52771643dfa4SIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 52781643dfa4SIlya Dryomov bool need_put = !!rbd_dev->opts; 52791643dfa4SIlya Dryomov 52801643dfa4SIlya Dryomov if (need_put) { 52811643dfa4SIlya Dryomov destroy_workqueue(rbd_dev->task_wq); 52821643dfa4SIlya Dryomov ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id); 52831643dfa4SIlya Dryomov } 52841643dfa4SIlya Dryomov 52851643dfa4SIlya Dryomov rbd_dev_free(rbd_dev); 5286dd5ac32dSIlya Dryomov 5287dd5ac32dSIlya Dryomov /* 5288dd5ac32dSIlya Dryomov * This is racy, but way better than putting module outside of 5289dd5ac32dSIlya Dryomov * the release callback. The race window is pretty small, so 5290dd5ac32dSIlya Dryomov * doing something similar to dm (dm-builtin.c) is overkill. 5291dd5ac32dSIlya Dryomov */ 5292dd5ac32dSIlya Dryomov if (need_put) 5293dd5ac32dSIlya Dryomov module_put(THIS_MODULE); 5294dd5ac32dSIlya Dryomov } 5295dd5ac32dSIlya Dryomov 52961643dfa4SIlya Dryomov static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc, 52971643dfa4SIlya Dryomov struct rbd_spec *spec) 5298c53d5893SAlex Elder { 5299c53d5893SAlex Elder struct rbd_device *rbd_dev; 5300c53d5893SAlex Elder 5301c53d5893SAlex Elder rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL); 5302c53d5893SAlex Elder if (!rbd_dev) 5303c53d5893SAlex Elder return NULL; 5304c53d5893SAlex Elder 5305c53d5893SAlex Elder spin_lock_init(&rbd_dev->lock); 5306c53d5893SAlex Elder INIT_LIST_HEAD(&rbd_dev->node); 5307c53d5893SAlex Elder init_rwsem(&rbd_dev->header_rwsem); 5308c53d5893SAlex Elder 53097e97332eSIlya Dryomov rbd_dev->header.data_pool_id = CEPH_NOPOOL; 5310c41d13a3SIlya Dryomov ceph_oid_init(&rbd_dev->header_oid); 5311431a02cdSIlya Dryomov rbd_dev->header_oloc.pool = spec->pool_id; 5312b26c047bSIlya Dryomov if (spec->pool_ns) { 5313b26c047bSIlya Dryomov WARN_ON(!*spec->pool_ns); 5314b26c047bSIlya Dryomov rbd_dev->header_oloc.pool_ns = 5315b26c047bSIlya Dryomov ceph_find_or_create_string(spec->pool_ns, 5316b26c047bSIlya Dryomov strlen(spec->pool_ns)); 5317b26c047bSIlya Dryomov } 5318c41d13a3SIlya Dryomov 531999d16943SIlya Dryomov mutex_init(&rbd_dev->watch_mutex); 532099d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED; 532199d16943SIlya Dryomov INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch); 532299d16943SIlya Dryomov 5323ed95b21aSIlya Dryomov init_rwsem(&rbd_dev->lock_rwsem); 5324ed95b21aSIlya Dryomov rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED; 5325ed95b21aSIlya Dryomov INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock); 5326ed95b21aSIlya Dryomov INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock); 5327ed95b21aSIlya Dryomov INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock); 5328ed95b21aSIlya Dryomov INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work); 5329e1fddc8fSIlya Dryomov spin_lock_init(&rbd_dev->lock_lists_lock); 5330637cd060SIlya Dryomov INIT_LIST_HEAD(&rbd_dev->acquiring_list); 5331e1fddc8fSIlya Dryomov INIT_LIST_HEAD(&rbd_dev->running_list); 5332637cd060SIlya Dryomov init_completion(&rbd_dev->acquire_wait); 5333e1fddc8fSIlya Dryomov init_completion(&rbd_dev->releasing_wait); 5334ed95b21aSIlya Dryomov 533522e8bd51SIlya Dryomov spin_lock_init(&rbd_dev->object_map_lock); 5336c53d5893SAlex Elder 5337dd5ac32dSIlya Dryomov rbd_dev->dev.bus = &rbd_bus_type; 5338dd5ac32dSIlya Dryomov rbd_dev->dev.type = &rbd_device_type; 5339dd5ac32dSIlya Dryomov rbd_dev->dev.parent = &rbd_root_dev; 5340dd5ac32dSIlya Dryomov device_initialize(&rbd_dev->dev); 5341dd5ac32dSIlya Dryomov 5342c53d5893SAlex Elder rbd_dev->rbd_client = rbdc; 5343d147543dSIlya Dryomov rbd_dev->spec = spec; 53440903e875SAlex Elder 53451643dfa4SIlya Dryomov return rbd_dev; 53461643dfa4SIlya Dryomov } 53471643dfa4SIlya Dryomov 5348dd5ac32dSIlya Dryomov /* 53491643dfa4SIlya Dryomov * Create a mapping rbd_dev. 5350dd5ac32dSIlya Dryomov */ 53511643dfa4SIlya Dryomov static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, 53521643dfa4SIlya Dryomov struct rbd_spec *spec, 53531643dfa4SIlya Dryomov struct rbd_options *opts) 53541643dfa4SIlya Dryomov { 53551643dfa4SIlya Dryomov struct rbd_device *rbd_dev; 53561643dfa4SIlya Dryomov 53571643dfa4SIlya Dryomov rbd_dev = __rbd_dev_create(rbdc, spec); 53581643dfa4SIlya Dryomov if (!rbd_dev) 53591643dfa4SIlya Dryomov return NULL; 53601643dfa4SIlya Dryomov 53611643dfa4SIlya Dryomov rbd_dev->opts = opts; 53621643dfa4SIlya Dryomov 53631643dfa4SIlya Dryomov /* get an id and fill in device name */ 53641643dfa4SIlya Dryomov rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0, 53651643dfa4SIlya Dryomov minor_to_rbd_dev_id(1 << MINORBITS), 53661643dfa4SIlya Dryomov GFP_KERNEL); 53671643dfa4SIlya Dryomov if (rbd_dev->dev_id < 0) 53681643dfa4SIlya Dryomov goto fail_rbd_dev; 53691643dfa4SIlya Dryomov 53701643dfa4SIlya Dryomov sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id); 53711643dfa4SIlya Dryomov rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM, 53721643dfa4SIlya Dryomov rbd_dev->name); 53731643dfa4SIlya Dryomov if (!rbd_dev->task_wq) 53741643dfa4SIlya Dryomov goto fail_dev_id; 53751643dfa4SIlya Dryomov 53761643dfa4SIlya Dryomov /* we have a ref from do_rbd_add() */ 5377dd5ac32dSIlya Dryomov __module_get(THIS_MODULE); 5378dd5ac32dSIlya Dryomov 53791643dfa4SIlya Dryomov dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id); 5380c53d5893SAlex Elder return rbd_dev; 53811643dfa4SIlya Dryomov 53821643dfa4SIlya Dryomov fail_dev_id: 53831643dfa4SIlya Dryomov ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id); 53841643dfa4SIlya Dryomov fail_rbd_dev: 53851643dfa4SIlya Dryomov rbd_dev_free(rbd_dev); 53861643dfa4SIlya Dryomov return NULL; 5387c53d5893SAlex Elder } 5388c53d5893SAlex Elder 5389c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev) 5390c53d5893SAlex Elder { 5391dd5ac32dSIlya Dryomov if (rbd_dev) 5392dd5ac32dSIlya Dryomov put_device(&rbd_dev->dev); 5393c53d5893SAlex Elder } 5394c53d5893SAlex Elder 5395dfc5606dSYehuda Sadeh /* 53969d475de5SAlex Elder * Get the size and object order for an image snapshot, or if 53979d475de5SAlex Elder * snap_id is CEPH_NOSNAP, gets this information for the base 53989d475de5SAlex Elder * image. 53999d475de5SAlex Elder */ 54009d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 54019d475de5SAlex Elder u8 *order, u64 *snap_size) 54029d475de5SAlex Elder { 54039d475de5SAlex Elder __le64 snapid = cpu_to_le64(snap_id); 54049d475de5SAlex Elder int ret; 54059d475de5SAlex Elder struct { 54069d475de5SAlex Elder u8 order; 54079d475de5SAlex Elder __le64 size; 54089d475de5SAlex Elder } __attribute__ ((packed)) size_buf = { 0 }; 54099d475de5SAlex Elder 5410ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 5411ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_size", 54124157976bSAlex Elder &snapid, sizeof(snapid), 5413e2a58ee5SAlex Elder &size_buf, sizeof(size_buf)); 541436be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 54159d475de5SAlex Elder if (ret < 0) 54169d475de5SAlex Elder return ret; 541757385b51SAlex Elder if (ret < sizeof (size_buf)) 541857385b51SAlex Elder return -ERANGE; 54199d475de5SAlex Elder 5420c3545579SJosh Durgin if (order) { 54219d475de5SAlex Elder *order = size_buf.order; 5422c3545579SJosh Durgin dout(" order %u", (unsigned int)*order); 5423c3545579SJosh Durgin } 54249d475de5SAlex Elder *snap_size = le64_to_cpu(size_buf.size); 54259d475de5SAlex Elder 5426c3545579SJosh Durgin dout(" snap_id 0x%016llx snap_size = %llu\n", 5427c3545579SJosh Durgin (unsigned long long)snap_id, 54289d475de5SAlex Elder (unsigned long long)*snap_size); 54299d475de5SAlex Elder 54309d475de5SAlex Elder return 0; 54319d475de5SAlex Elder } 54329d475de5SAlex Elder 54339d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev) 54349d475de5SAlex Elder { 54359d475de5SAlex Elder return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP, 54369d475de5SAlex Elder &rbd_dev->header.obj_order, 54379d475de5SAlex Elder &rbd_dev->header.image_size); 54389d475de5SAlex Elder } 54399d475de5SAlex Elder 54401e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) 54411e130199SAlex Elder { 54425435d206SDongsheng Yang size_t size; 54431e130199SAlex Elder void *reply_buf; 54441e130199SAlex Elder int ret; 54451e130199SAlex Elder void *p; 54461e130199SAlex Elder 54475435d206SDongsheng Yang /* Response will be an encoded string, which includes a length */ 54485435d206SDongsheng Yang size = sizeof(__le32) + RBD_OBJ_PREFIX_LEN_MAX; 54495435d206SDongsheng Yang reply_buf = kzalloc(size, GFP_KERNEL); 54501e130199SAlex Elder if (!reply_buf) 54511e130199SAlex Elder return -ENOMEM; 54521e130199SAlex Elder 5453ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 5454ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_object_prefix", 54555435d206SDongsheng Yang NULL, 0, reply_buf, size); 545636be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 54571e130199SAlex Elder if (ret < 0) 54581e130199SAlex Elder goto out; 54591e130199SAlex Elder 54601e130199SAlex Elder p = reply_buf; 54611e130199SAlex Elder rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, 546257385b51SAlex Elder p + ret, NULL, GFP_NOIO); 546357385b51SAlex Elder ret = 0; 54641e130199SAlex Elder 54651e130199SAlex Elder if (IS_ERR(rbd_dev->header.object_prefix)) { 54661e130199SAlex Elder ret = PTR_ERR(rbd_dev->header.object_prefix); 54671e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 54681e130199SAlex Elder } else { 54691e130199SAlex Elder dout(" object_prefix = %s\n", rbd_dev->header.object_prefix); 54701e130199SAlex Elder } 54711e130199SAlex Elder out: 54721e130199SAlex Elder kfree(reply_buf); 54731e130199SAlex Elder 54741e130199SAlex Elder return ret; 54751e130199SAlex Elder } 54761e130199SAlex Elder 5477b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 5478196e2d6dSIlya Dryomov bool read_only, u64 *snap_features) 5479b1b5402aSAlex Elder { 5480196e2d6dSIlya Dryomov struct { 5481196e2d6dSIlya Dryomov __le64 snap_id; 5482196e2d6dSIlya Dryomov u8 read_only; 5483196e2d6dSIlya Dryomov } features_in; 5484b1b5402aSAlex Elder struct { 5485b1b5402aSAlex Elder __le64 features; 5486b1b5402aSAlex Elder __le64 incompat; 54874157976bSAlex Elder } __attribute__ ((packed)) features_buf = { 0 }; 5488d3767f0fSIlya Dryomov u64 unsup; 5489b1b5402aSAlex Elder int ret; 5490b1b5402aSAlex Elder 5491196e2d6dSIlya Dryomov features_in.snap_id = cpu_to_le64(snap_id); 5492196e2d6dSIlya Dryomov features_in.read_only = read_only; 5493196e2d6dSIlya Dryomov 5494ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 5495ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_features", 5496196e2d6dSIlya Dryomov &features_in, sizeof(features_in), 5497e2a58ee5SAlex Elder &features_buf, sizeof(features_buf)); 549836be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 5499b1b5402aSAlex Elder if (ret < 0) 5500b1b5402aSAlex Elder return ret; 550157385b51SAlex Elder if (ret < sizeof (features_buf)) 550257385b51SAlex Elder return -ERANGE; 5503d889140cSAlex Elder 5504d3767f0fSIlya Dryomov unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED; 5505d3767f0fSIlya Dryomov if (unsup) { 5506d3767f0fSIlya Dryomov rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx", 5507d3767f0fSIlya Dryomov unsup); 5508b8f5c6edSAlex Elder return -ENXIO; 5509d3767f0fSIlya Dryomov } 5510d889140cSAlex Elder 5511b1b5402aSAlex Elder *snap_features = le64_to_cpu(features_buf.features); 5512b1b5402aSAlex Elder 5513b1b5402aSAlex Elder dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n", 5514b1b5402aSAlex Elder (unsigned long long)snap_id, 5515b1b5402aSAlex Elder (unsigned long long)*snap_features, 5516b1b5402aSAlex Elder (unsigned long long)le64_to_cpu(features_buf.incompat)); 5517b1b5402aSAlex Elder 5518b1b5402aSAlex Elder return 0; 5519b1b5402aSAlex Elder } 5520b1b5402aSAlex Elder 5521b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev) 5522b1b5402aSAlex Elder { 5523b1b5402aSAlex Elder return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP, 5524196e2d6dSIlya Dryomov rbd_is_ro(rbd_dev), 5525b1b5402aSAlex Elder &rbd_dev->header.features); 5526b1b5402aSAlex Elder } 5527b1b5402aSAlex Elder 552822e8bd51SIlya Dryomov /* 552922e8bd51SIlya Dryomov * These are generic image flags, but since they are used only for 553022e8bd51SIlya Dryomov * object map, store them in rbd_dev->object_map_flags. 553122e8bd51SIlya Dryomov * 553222e8bd51SIlya Dryomov * For the same reason, this function is called only on object map 553322e8bd51SIlya Dryomov * (re)load and not on header refresh. 553422e8bd51SIlya Dryomov */ 553522e8bd51SIlya Dryomov static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev) 553622e8bd51SIlya Dryomov { 553722e8bd51SIlya Dryomov __le64 snapid = cpu_to_le64(rbd_dev->spec->snap_id); 553822e8bd51SIlya Dryomov __le64 flags; 553922e8bd51SIlya Dryomov int ret; 554022e8bd51SIlya Dryomov 554122e8bd51SIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 554222e8bd51SIlya Dryomov &rbd_dev->header_oloc, "get_flags", 554322e8bd51SIlya Dryomov &snapid, sizeof(snapid), 554422e8bd51SIlya Dryomov &flags, sizeof(flags)); 554522e8bd51SIlya Dryomov if (ret < 0) 554622e8bd51SIlya Dryomov return ret; 554722e8bd51SIlya Dryomov if (ret < sizeof(flags)) 554822e8bd51SIlya Dryomov return -EBADMSG; 554922e8bd51SIlya Dryomov 555022e8bd51SIlya Dryomov rbd_dev->object_map_flags = le64_to_cpu(flags); 555122e8bd51SIlya Dryomov return 0; 555222e8bd51SIlya Dryomov } 555322e8bd51SIlya Dryomov 5554eb3b2d6bSIlya Dryomov struct parent_image_info { 5555eb3b2d6bSIlya Dryomov u64 pool_id; 5556e92c0eafSIlya Dryomov const char *pool_ns; 5557eb3b2d6bSIlya Dryomov const char *image_id; 5558eb3b2d6bSIlya Dryomov u64 snap_id; 5559eb3b2d6bSIlya Dryomov 5560e92c0eafSIlya Dryomov bool has_overlap; 5561eb3b2d6bSIlya Dryomov u64 overlap; 5562eb3b2d6bSIlya Dryomov }; 5563eb3b2d6bSIlya Dryomov 5564eb3b2d6bSIlya Dryomov /* 5565eb3b2d6bSIlya Dryomov * The caller is responsible for @pii. 5566eb3b2d6bSIlya Dryomov */ 5567e92c0eafSIlya Dryomov static int decode_parent_image_spec(void **p, void *end, 5568e92c0eafSIlya Dryomov struct parent_image_info *pii) 5569e92c0eafSIlya Dryomov { 5570e92c0eafSIlya Dryomov u8 struct_v; 5571e92c0eafSIlya Dryomov u32 struct_len; 5572e92c0eafSIlya Dryomov int ret; 5573e92c0eafSIlya Dryomov 5574e92c0eafSIlya Dryomov ret = ceph_start_decoding(p, end, 1, "ParentImageSpec", 5575e92c0eafSIlya Dryomov &struct_v, &struct_len); 5576e92c0eafSIlya Dryomov if (ret) 5577e92c0eafSIlya Dryomov return ret; 5578e92c0eafSIlya Dryomov 5579e92c0eafSIlya Dryomov ceph_decode_64_safe(p, end, pii->pool_id, e_inval); 5580e92c0eafSIlya Dryomov pii->pool_ns = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL); 5581e92c0eafSIlya Dryomov if (IS_ERR(pii->pool_ns)) { 5582e92c0eafSIlya Dryomov ret = PTR_ERR(pii->pool_ns); 5583e92c0eafSIlya Dryomov pii->pool_ns = NULL; 5584e92c0eafSIlya Dryomov return ret; 5585e92c0eafSIlya Dryomov } 5586e92c0eafSIlya Dryomov pii->image_id = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL); 5587e92c0eafSIlya Dryomov if (IS_ERR(pii->image_id)) { 5588e92c0eafSIlya Dryomov ret = PTR_ERR(pii->image_id); 5589e92c0eafSIlya Dryomov pii->image_id = NULL; 5590e92c0eafSIlya Dryomov return ret; 5591e92c0eafSIlya Dryomov } 5592e92c0eafSIlya Dryomov ceph_decode_64_safe(p, end, pii->snap_id, e_inval); 5593e92c0eafSIlya Dryomov return 0; 5594e92c0eafSIlya Dryomov 5595e92c0eafSIlya Dryomov e_inval: 5596e92c0eafSIlya Dryomov return -EINVAL; 5597e92c0eafSIlya Dryomov } 5598e92c0eafSIlya Dryomov 5599e92c0eafSIlya Dryomov static int __get_parent_info(struct rbd_device *rbd_dev, 5600e92c0eafSIlya Dryomov struct page *req_page, 5601e92c0eafSIlya Dryomov struct page *reply_page, 5602e92c0eafSIlya Dryomov struct parent_image_info *pii) 5603e92c0eafSIlya Dryomov { 5604e92c0eafSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 5605e92c0eafSIlya Dryomov size_t reply_len = PAGE_SIZE; 5606e92c0eafSIlya Dryomov void *p, *end; 5607e92c0eafSIlya Dryomov int ret; 5608e92c0eafSIlya Dryomov 5609e92c0eafSIlya Dryomov ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, 5610e92c0eafSIlya Dryomov "rbd", "parent_get", CEPH_OSD_FLAG_READ, 561168ada915SIlya Dryomov req_page, sizeof(u64), &reply_page, &reply_len); 5612e92c0eafSIlya Dryomov if (ret) 5613e92c0eafSIlya Dryomov return ret == -EOPNOTSUPP ? 1 : ret; 5614e92c0eafSIlya Dryomov 5615e92c0eafSIlya Dryomov p = page_address(reply_page); 5616e92c0eafSIlya Dryomov end = p + reply_len; 5617e92c0eafSIlya Dryomov ret = decode_parent_image_spec(&p, end, pii); 5618e92c0eafSIlya Dryomov if (ret) 5619e92c0eafSIlya Dryomov return ret; 5620e92c0eafSIlya Dryomov 5621e92c0eafSIlya Dryomov ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, 5622e92c0eafSIlya Dryomov "rbd", "parent_overlap_get", CEPH_OSD_FLAG_READ, 562368ada915SIlya Dryomov req_page, sizeof(u64), &reply_page, &reply_len); 5624e92c0eafSIlya Dryomov if (ret) 5625e92c0eafSIlya Dryomov return ret; 5626e92c0eafSIlya Dryomov 5627e92c0eafSIlya Dryomov p = page_address(reply_page); 5628e92c0eafSIlya Dryomov end = p + reply_len; 5629e92c0eafSIlya Dryomov ceph_decode_8_safe(&p, end, pii->has_overlap, e_inval); 5630e92c0eafSIlya Dryomov if (pii->has_overlap) 5631e92c0eafSIlya Dryomov ceph_decode_64_safe(&p, end, pii->overlap, e_inval); 5632e92c0eafSIlya Dryomov 5633e92c0eafSIlya Dryomov return 0; 5634e92c0eafSIlya Dryomov 5635e92c0eafSIlya Dryomov e_inval: 5636e92c0eafSIlya Dryomov return -EINVAL; 5637e92c0eafSIlya Dryomov } 5638e92c0eafSIlya Dryomov 5639e92c0eafSIlya Dryomov /* 5640e92c0eafSIlya Dryomov * The caller is responsible for @pii. 5641e92c0eafSIlya Dryomov */ 5642eb3b2d6bSIlya Dryomov static int __get_parent_info_legacy(struct rbd_device *rbd_dev, 5643eb3b2d6bSIlya Dryomov struct page *req_page, 5644eb3b2d6bSIlya Dryomov struct page *reply_page, 5645eb3b2d6bSIlya Dryomov struct parent_image_info *pii) 5646eb3b2d6bSIlya Dryomov { 5647eb3b2d6bSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 5648eb3b2d6bSIlya Dryomov size_t reply_len = PAGE_SIZE; 5649eb3b2d6bSIlya Dryomov void *p, *end; 5650eb3b2d6bSIlya Dryomov int ret; 5651eb3b2d6bSIlya Dryomov 5652eb3b2d6bSIlya Dryomov ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, 5653eb3b2d6bSIlya Dryomov "rbd", "get_parent", CEPH_OSD_FLAG_READ, 565468ada915SIlya Dryomov req_page, sizeof(u64), &reply_page, &reply_len); 5655eb3b2d6bSIlya Dryomov if (ret) 5656eb3b2d6bSIlya Dryomov return ret; 5657eb3b2d6bSIlya Dryomov 5658eb3b2d6bSIlya Dryomov p = page_address(reply_page); 5659eb3b2d6bSIlya Dryomov end = p + reply_len; 5660eb3b2d6bSIlya Dryomov ceph_decode_64_safe(&p, end, pii->pool_id, e_inval); 5661eb3b2d6bSIlya Dryomov pii->image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 5662eb3b2d6bSIlya Dryomov if (IS_ERR(pii->image_id)) { 5663eb3b2d6bSIlya Dryomov ret = PTR_ERR(pii->image_id); 5664eb3b2d6bSIlya Dryomov pii->image_id = NULL; 5665eb3b2d6bSIlya Dryomov return ret; 5666eb3b2d6bSIlya Dryomov } 5667eb3b2d6bSIlya Dryomov ceph_decode_64_safe(&p, end, pii->snap_id, e_inval); 5668e92c0eafSIlya Dryomov pii->has_overlap = true; 5669eb3b2d6bSIlya Dryomov ceph_decode_64_safe(&p, end, pii->overlap, e_inval); 5670eb3b2d6bSIlya Dryomov 5671eb3b2d6bSIlya Dryomov return 0; 5672eb3b2d6bSIlya Dryomov 5673eb3b2d6bSIlya Dryomov e_inval: 5674eb3b2d6bSIlya Dryomov return -EINVAL; 5675eb3b2d6bSIlya Dryomov } 5676eb3b2d6bSIlya Dryomov 5677eb3b2d6bSIlya Dryomov static int get_parent_info(struct rbd_device *rbd_dev, 5678eb3b2d6bSIlya Dryomov struct parent_image_info *pii) 5679eb3b2d6bSIlya Dryomov { 5680eb3b2d6bSIlya Dryomov struct page *req_page, *reply_page; 5681eb3b2d6bSIlya Dryomov void *p; 5682eb3b2d6bSIlya Dryomov int ret; 5683eb3b2d6bSIlya Dryomov 5684eb3b2d6bSIlya Dryomov req_page = alloc_page(GFP_KERNEL); 5685eb3b2d6bSIlya Dryomov if (!req_page) 5686eb3b2d6bSIlya Dryomov return -ENOMEM; 5687eb3b2d6bSIlya Dryomov 5688eb3b2d6bSIlya Dryomov reply_page = alloc_page(GFP_KERNEL); 5689eb3b2d6bSIlya Dryomov if (!reply_page) { 5690eb3b2d6bSIlya Dryomov __free_page(req_page); 5691eb3b2d6bSIlya Dryomov return -ENOMEM; 5692eb3b2d6bSIlya Dryomov } 5693eb3b2d6bSIlya Dryomov 5694eb3b2d6bSIlya Dryomov p = page_address(req_page); 5695eb3b2d6bSIlya Dryomov ceph_encode_64(&p, rbd_dev->spec->snap_id); 5696e92c0eafSIlya Dryomov ret = __get_parent_info(rbd_dev, req_page, reply_page, pii); 5697e92c0eafSIlya Dryomov if (ret > 0) 5698e92c0eafSIlya Dryomov ret = __get_parent_info_legacy(rbd_dev, req_page, reply_page, 5699e92c0eafSIlya Dryomov pii); 5700eb3b2d6bSIlya Dryomov 5701eb3b2d6bSIlya Dryomov __free_page(req_page); 5702eb3b2d6bSIlya Dryomov __free_page(reply_page); 5703eb3b2d6bSIlya Dryomov return ret; 5704eb3b2d6bSIlya Dryomov } 5705eb3b2d6bSIlya Dryomov 570686b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) 570786b00e0dSAlex Elder { 570886b00e0dSAlex Elder struct rbd_spec *parent_spec; 5709eb3b2d6bSIlya Dryomov struct parent_image_info pii = { 0 }; 571086b00e0dSAlex Elder int ret; 571186b00e0dSAlex Elder 571286b00e0dSAlex Elder parent_spec = rbd_spec_alloc(); 571386b00e0dSAlex Elder if (!parent_spec) 571486b00e0dSAlex Elder return -ENOMEM; 571586b00e0dSAlex Elder 5716eb3b2d6bSIlya Dryomov ret = get_parent_info(rbd_dev, &pii); 5717eb3b2d6bSIlya Dryomov if (ret) 571886b00e0dSAlex Elder goto out_err; 571986b00e0dSAlex Elder 5720e92c0eafSIlya Dryomov dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n", 5721e92c0eafSIlya Dryomov __func__, pii.pool_id, pii.pool_ns, pii.image_id, pii.snap_id, 5722e92c0eafSIlya Dryomov pii.has_overlap, pii.overlap); 5723eb3b2d6bSIlya Dryomov 5724e92c0eafSIlya Dryomov if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap) { 5725392a9dadSAlex Elder /* 5726392a9dadSAlex Elder * Either the parent never existed, or we have 5727392a9dadSAlex Elder * record of it but the image got flattened so it no 5728392a9dadSAlex Elder * longer has a parent. When the parent of a 5729392a9dadSAlex Elder * layered image disappears we immediately set the 5730392a9dadSAlex Elder * overlap to 0. The effect of this is that all new 5731392a9dadSAlex Elder * requests will be treated as if the image had no 5732392a9dadSAlex Elder * parent. 5733e92c0eafSIlya Dryomov * 5734e92c0eafSIlya Dryomov * If !pii.has_overlap, the parent image spec is not 5735e92c0eafSIlya Dryomov * applicable. It's there to avoid duplication in each 5736e92c0eafSIlya Dryomov * snapshot record. 5737392a9dadSAlex Elder */ 5738392a9dadSAlex Elder if (rbd_dev->parent_overlap) { 5739392a9dadSAlex Elder rbd_dev->parent_overlap = 0; 5740392a9dadSAlex Elder rbd_dev_parent_put(rbd_dev); 5741392a9dadSAlex Elder pr_info("%s: clone image has been flattened\n", 5742392a9dadSAlex Elder rbd_dev->disk->disk_name); 5743392a9dadSAlex Elder } 5744392a9dadSAlex Elder 574586b00e0dSAlex Elder goto out; /* No parent? No problem. */ 5746392a9dadSAlex Elder } 574786b00e0dSAlex Elder 57480903e875SAlex Elder /* The ceph file layout needs to fit pool id in 32 bits */ 57490903e875SAlex Elder 57500903e875SAlex Elder ret = -EIO; 5751eb3b2d6bSIlya Dryomov if (pii.pool_id > (u64)U32_MAX) { 57529584d508SIlya Dryomov rbd_warn(NULL, "parent pool id too large (%llu > %u)", 5753eb3b2d6bSIlya Dryomov (unsigned long long)pii.pool_id, U32_MAX); 575457385b51SAlex Elder goto out_err; 5755c0cd10dbSAlex Elder } 57560903e875SAlex Elder 57573b5cf2a2SAlex Elder /* 57583b5cf2a2SAlex Elder * The parent won't change (except when the clone is 57593b5cf2a2SAlex Elder * flattened, already handled that). So we only need to 57603b5cf2a2SAlex Elder * record the parent spec we have not already done so. 57613b5cf2a2SAlex Elder */ 57623b5cf2a2SAlex Elder if (!rbd_dev->parent_spec) { 5763eb3b2d6bSIlya Dryomov parent_spec->pool_id = pii.pool_id; 5764e92c0eafSIlya Dryomov if (pii.pool_ns && *pii.pool_ns) { 5765e92c0eafSIlya Dryomov parent_spec->pool_ns = pii.pool_ns; 5766e92c0eafSIlya Dryomov pii.pool_ns = NULL; 5767e92c0eafSIlya Dryomov } 5768eb3b2d6bSIlya Dryomov parent_spec->image_id = pii.image_id; 5769eb3b2d6bSIlya Dryomov pii.image_id = NULL; 5770eb3b2d6bSIlya Dryomov parent_spec->snap_id = pii.snap_id; 5771b26c047bSIlya Dryomov 577286b00e0dSAlex Elder rbd_dev->parent_spec = parent_spec; 577386b00e0dSAlex Elder parent_spec = NULL; /* rbd_dev now owns this */ 57743b5cf2a2SAlex Elder } 57753b5cf2a2SAlex Elder 57763b5cf2a2SAlex Elder /* 5777cf32bd9cSIlya Dryomov * We always update the parent overlap. If it's zero we issue 5778cf32bd9cSIlya Dryomov * a warning, as we will proceed as if there was no parent. 57793b5cf2a2SAlex Elder */ 5780eb3b2d6bSIlya Dryomov if (!pii.overlap) { 57813b5cf2a2SAlex Elder if (parent_spec) { 5782cf32bd9cSIlya Dryomov /* refresh, careful to warn just once */ 5783cf32bd9cSIlya Dryomov if (rbd_dev->parent_overlap) 5784cf32bd9cSIlya Dryomov rbd_warn(rbd_dev, 5785cf32bd9cSIlya Dryomov "clone now standalone (overlap became 0)"); 578670cf49cfSAlex Elder } else { 5787cf32bd9cSIlya Dryomov /* initial probe */ 5788cf32bd9cSIlya Dryomov rbd_warn(rbd_dev, "clone is standalone (overlap 0)"); 57893b5cf2a2SAlex Elder } 579070cf49cfSAlex Elder } 5791eb3b2d6bSIlya Dryomov rbd_dev->parent_overlap = pii.overlap; 5792cf32bd9cSIlya Dryomov 579386b00e0dSAlex Elder out: 579486b00e0dSAlex Elder ret = 0; 579586b00e0dSAlex Elder out_err: 5796e92c0eafSIlya Dryomov kfree(pii.pool_ns); 5797eb3b2d6bSIlya Dryomov kfree(pii.image_id); 579886b00e0dSAlex Elder rbd_spec_put(parent_spec); 579986b00e0dSAlex Elder return ret; 580086b00e0dSAlex Elder } 580186b00e0dSAlex Elder 5802cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) 5803cc070d59SAlex Elder { 5804cc070d59SAlex Elder struct { 5805cc070d59SAlex Elder __le64 stripe_unit; 5806cc070d59SAlex Elder __le64 stripe_count; 5807cc070d59SAlex Elder } __attribute__ ((packed)) striping_info_buf = { 0 }; 5808cc070d59SAlex Elder size_t size = sizeof (striping_info_buf); 5809cc070d59SAlex Elder void *p; 5810cc070d59SAlex Elder int ret; 5811cc070d59SAlex Elder 5812ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 5813ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_stripe_unit_count", 5814ecd4a68aSIlya Dryomov NULL, 0, &striping_info_buf, size); 5815cc070d59SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 5816cc070d59SAlex Elder if (ret < 0) 5817cc070d59SAlex Elder return ret; 5818cc070d59SAlex Elder if (ret < size) 5819cc070d59SAlex Elder return -ERANGE; 5820cc070d59SAlex Elder 5821cc070d59SAlex Elder p = &striping_info_buf; 5822b1331852SIlya Dryomov rbd_dev->header.stripe_unit = ceph_decode_64(&p); 5823b1331852SIlya Dryomov rbd_dev->header.stripe_count = ceph_decode_64(&p); 5824cc070d59SAlex Elder return 0; 5825cc070d59SAlex Elder } 5826cc070d59SAlex Elder 58277e97332eSIlya Dryomov static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev) 58287e97332eSIlya Dryomov { 58297e97332eSIlya Dryomov __le64 data_pool_id; 58307e97332eSIlya Dryomov int ret; 58317e97332eSIlya Dryomov 58327e97332eSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 58337e97332eSIlya Dryomov &rbd_dev->header_oloc, "get_data_pool", 58347e97332eSIlya Dryomov NULL, 0, &data_pool_id, sizeof(data_pool_id)); 58357e97332eSIlya Dryomov if (ret < 0) 58367e97332eSIlya Dryomov return ret; 58377e97332eSIlya Dryomov if (ret < sizeof(data_pool_id)) 58387e97332eSIlya Dryomov return -EBADMSG; 58397e97332eSIlya Dryomov 58407e97332eSIlya Dryomov rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id); 58417e97332eSIlya Dryomov WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL); 58427e97332eSIlya Dryomov return 0; 58437e97332eSIlya Dryomov } 58447e97332eSIlya Dryomov 58459e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev) 58469e15b77dSAlex Elder { 5847ecd4a68aSIlya Dryomov CEPH_DEFINE_OID_ONSTACK(oid); 58489e15b77dSAlex Elder size_t image_id_size; 58499e15b77dSAlex Elder char *image_id; 58509e15b77dSAlex Elder void *p; 58519e15b77dSAlex Elder void *end; 58529e15b77dSAlex Elder size_t size; 58539e15b77dSAlex Elder void *reply_buf = NULL; 58549e15b77dSAlex Elder size_t len = 0; 58559e15b77dSAlex Elder char *image_name = NULL; 58569e15b77dSAlex Elder int ret; 58579e15b77dSAlex Elder 58589e15b77dSAlex Elder rbd_assert(!rbd_dev->spec->image_name); 58599e15b77dSAlex Elder 586069e7a02fSAlex Elder len = strlen(rbd_dev->spec->image_id); 586169e7a02fSAlex Elder image_id_size = sizeof (__le32) + len; 58629e15b77dSAlex Elder image_id = kmalloc(image_id_size, GFP_KERNEL); 58639e15b77dSAlex Elder if (!image_id) 58649e15b77dSAlex Elder return NULL; 58659e15b77dSAlex Elder 58669e15b77dSAlex Elder p = image_id; 58674157976bSAlex Elder end = image_id + image_id_size; 586869e7a02fSAlex Elder ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len); 58699e15b77dSAlex Elder 58709e15b77dSAlex Elder size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX; 58719e15b77dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 58729e15b77dSAlex Elder if (!reply_buf) 58739e15b77dSAlex Elder goto out; 58749e15b77dSAlex Elder 5875ecd4a68aSIlya Dryomov ceph_oid_printf(&oid, "%s", RBD_DIRECTORY); 5876ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc, 5877ecd4a68aSIlya Dryomov "dir_get_name", image_id, image_id_size, 5878e2a58ee5SAlex Elder reply_buf, size); 58799e15b77dSAlex Elder if (ret < 0) 58809e15b77dSAlex Elder goto out; 58819e15b77dSAlex Elder p = reply_buf; 5882f40eb349SAlex Elder end = reply_buf + ret; 5883f40eb349SAlex Elder 58849e15b77dSAlex Elder image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL); 58859e15b77dSAlex Elder if (IS_ERR(image_name)) 58869e15b77dSAlex Elder image_name = NULL; 58879e15b77dSAlex Elder else 58889e15b77dSAlex Elder dout("%s: name is %s len is %zd\n", __func__, image_name, len); 58899e15b77dSAlex Elder out: 58909e15b77dSAlex Elder kfree(reply_buf); 58919e15b77dSAlex Elder kfree(image_id); 58929e15b77dSAlex Elder 58939e15b77dSAlex Elder return image_name; 58949e15b77dSAlex Elder } 58959e15b77dSAlex Elder 58962ad3d716SAlex Elder static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 58972ad3d716SAlex Elder { 58982ad3d716SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 58992ad3d716SAlex Elder const char *snap_name; 59002ad3d716SAlex Elder u32 which = 0; 59012ad3d716SAlex Elder 59022ad3d716SAlex Elder /* Skip over names until we find the one we are looking for */ 59032ad3d716SAlex Elder 59042ad3d716SAlex Elder snap_name = rbd_dev->header.snap_names; 59052ad3d716SAlex Elder while (which < snapc->num_snaps) { 59062ad3d716SAlex Elder if (!strcmp(name, snap_name)) 59072ad3d716SAlex Elder return snapc->snaps[which]; 59082ad3d716SAlex Elder snap_name += strlen(snap_name) + 1; 59092ad3d716SAlex Elder which++; 59102ad3d716SAlex Elder } 59112ad3d716SAlex Elder return CEPH_NOSNAP; 59122ad3d716SAlex Elder } 59132ad3d716SAlex Elder 59142ad3d716SAlex Elder static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 59152ad3d716SAlex Elder { 59162ad3d716SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 59172ad3d716SAlex Elder u32 which; 59182ad3d716SAlex Elder bool found = false; 59192ad3d716SAlex Elder u64 snap_id; 59202ad3d716SAlex Elder 59212ad3d716SAlex Elder for (which = 0; !found && which < snapc->num_snaps; which++) { 59222ad3d716SAlex Elder const char *snap_name; 59232ad3d716SAlex Elder 59242ad3d716SAlex Elder snap_id = snapc->snaps[which]; 59252ad3d716SAlex Elder snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id); 5926efadc98aSJosh Durgin if (IS_ERR(snap_name)) { 5927efadc98aSJosh Durgin /* ignore no-longer existing snapshots */ 5928efadc98aSJosh Durgin if (PTR_ERR(snap_name) == -ENOENT) 5929efadc98aSJosh Durgin continue; 5930efadc98aSJosh Durgin else 59312ad3d716SAlex Elder break; 5932efadc98aSJosh Durgin } 59332ad3d716SAlex Elder found = !strcmp(name, snap_name); 59342ad3d716SAlex Elder kfree(snap_name); 59352ad3d716SAlex Elder } 59362ad3d716SAlex Elder return found ? snap_id : CEPH_NOSNAP; 59372ad3d716SAlex Elder } 59382ad3d716SAlex Elder 59392ad3d716SAlex Elder /* 59402ad3d716SAlex Elder * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if 59412ad3d716SAlex Elder * no snapshot by that name is found, or if an error occurs. 59422ad3d716SAlex Elder */ 59432ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 59442ad3d716SAlex Elder { 59452ad3d716SAlex Elder if (rbd_dev->image_format == 1) 59462ad3d716SAlex Elder return rbd_v1_snap_id_by_name(rbd_dev, name); 59472ad3d716SAlex Elder 59482ad3d716SAlex Elder return rbd_v2_snap_id_by_name(rbd_dev, name); 59492ad3d716SAlex Elder } 59502ad3d716SAlex Elder 59519e15b77dSAlex Elder /* 595204077599SIlya Dryomov * An image being mapped will have everything but the snap id. 59539e15b77dSAlex Elder */ 595404077599SIlya Dryomov static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev) 595504077599SIlya Dryomov { 595604077599SIlya Dryomov struct rbd_spec *spec = rbd_dev->spec; 595704077599SIlya Dryomov 595804077599SIlya Dryomov rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name); 595904077599SIlya Dryomov rbd_assert(spec->image_id && spec->image_name); 596004077599SIlya Dryomov rbd_assert(spec->snap_name); 596104077599SIlya Dryomov 596204077599SIlya Dryomov if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) { 596304077599SIlya Dryomov u64 snap_id; 596404077599SIlya Dryomov 596504077599SIlya Dryomov snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name); 596604077599SIlya Dryomov if (snap_id == CEPH_NOSNAP) 596704077599SIlya Dryomov return -ENOENT; 596804077599SIlya Dryomov 596904077599SIlya Dryomov spec->snap_id = snap_id; 597004077599SIlya Dryomov } else { 597104077599SIlya Dryomov spec->snap_id = CEPH_NOSNAP; 597204077599SIlya Dryomov } 597304077599SIlya Dryomov 597404077599SIlya Dryomov return 0; 597504077599SIlya Dryomov } 597604077599SIlya Dryomov 597704077599SIlya Dryomov /* 597804077599SIlya Dryomov * A parent image will have all ids but none of the names. 597904077599SIlya Dryomov * 598004077599SIlya Dryomov * All names in an rbd spec are dynamically allocated. It's OK if we 598104077599SIlya Dryomov * can't figure out the name for an image id. 598204077599SIlya Dryomov */ 598304077599SIlya Dryomov static int rbd_spec_fill_names(struct rbd_device *rbd_dev) 59849e15b77dSAlex Elder { 59852e9f7f1cSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 59862e9f7f1cSAlex Elder struct rbd_spec *spec = rbd_dev->spec; 59872e9f7f1cSAlex Elder const char *pool_name; 59882e9f7f1cSAlex Elder const char *image_name; 59892e9f7f1cSAlex Elder const char *snap_name; 59909e15b77dSAlex Elder int ret; 59919e15b77dSAlex Elder 599204077599SIlya Dryomov rbd_assert(spec->pool_id != CEPH_NOPOOL); 599304077599SIlya Dryomov rbd_assert(spec->image_id); 599404077599SIlya Dryomov rbd_assert(spec->snap_id != CEPH_NOSNAP); 59959e15b77dSAlex Elder 59962e9f7f1cSAlex Elder /* Get the pool name; we have to make our own copy of this */ 59979e15b77dSAlex Elder 59982e9f7f1cSAlex Elder pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id); 59992e9f7f1cSAlex Elder if (!pool_name) { 60002e9f7f1cSAlex Elder rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id); 6001935dc89fSAlex Elder return -EIO; 6002935dc89fSAlex Elder } 60032e9f7f1cSAlex Elder pool_name = kstrdup(pool_name, GFP_KERNEL); 60042e9f7f1cSAlex Elder if (!pool_name) 60059e15b77dSAlex Elder return -ENOMEM; 60069e15b77dSAlex Elder 60079e15b77dSAlex Elder /* Fetch the image name; tolerate failure here */ 60089e15b77dSAlex Elder 60092e9f7f1cSAlex Elder image_name = rbd_dev_image_name(rbd_dev); 60102e9f7f1cSAlex Elder if (!image_name) 601106ecc6cbSAlex Elder rbd_warn(rbd_dev, "unable to get image name"); 60129e15b77dSAlex Elder 601304077599SIlya Dryomov /* Fetch the snapshot name */ 60149e15b77dSAlex Elder 60152e9f7f1cSAlex Elder snap_name = rbd_snap_name(rbd_dev, spec->snap_id); 6016da6a6b63SJosh Durgin if (IS_ERR(snap_name)) { 6017da6a6b63SJosh Durgin ret = PTR_ERR(snap_name); 60189e15b77dSAlex Elder goto out_err; 60192e9f7f1cSAlex Elder } 60202e9f7f1cSAlex Elder 60212e9f7f1cSAlex Elder spec->pool_name = pool_name; 60222e9f7f1cSAlex Elder spec->image_name = image_name; 60232e9f7f1cSAlex Elder spec->snap_name = snap_name; 60249e15b77dSAlex Elder 60259e15b77dSAlex Elder return 0; 602604077599SIlya Dryomov 60279e15b77dSAlex Elder out_err: 60282e9f7f1cSAlex Elder kfree(image_name); 60292e9f7f1cSAlex Elder kfree(pool_name); 60309e15b77dSAlex Elder return ret; 60319e15b77dSAlex Elder } 60329e15b77dSAlex Elder 6033cc4a38bdSAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev) 603435d489f9SAlex Elder { 603535d489f9SAlex Elder size_t size; 603635d489f9SAlex Elder int ret; 603735d489f9SAlex Elder void *reply_buf; 603835d489f9SAlex Elder void *p; 603935d489f9SAlex Elder void *end; 604035d489f9SAlex Elder u64 seq; 604135d489f9SAlex Elder u32 snap_count; 604235d489f9SAlex Elder struct ceph_snap_context *snapc; 604335d489f9SAlex Elder u32 i; 604435d489f9SAlex Elder 604535d489f9SAlex Elder /* 604635d489f9SAlex Elder * We'll need room for the seq value (maximum snapshot id), 604735d489f9SAlex Elder * snapshot count, and array of that many snapshot ids. 604835d489f9SAlex Elder * For now we have a fixed upper limit on the number we're 604935d489f9SAlex Elder * prepared to receive. 605035d489f9SAlex Elder */ 605135d489f9SAlex Elder size = sizeof (__le64) + sizeof (__le32) + 605235d489f9SAlex Elder RBD_MAX_SNAP_COUNT * sizeof (__le64); 605335d489f9SAlex Elder reply_buf = kzalloc(size, GFP_KERNEL); 605435d489f9SAlex Elder if (!reply_buf) 605535d489f9SAlex Elder return -ENOMEM; 605635d489f9SAlex Elder 6057ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 6058ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_snapcontext", 6059ecd4a68aSIlya Dryomov NULL, 0, reply_buf, size); 606036be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 606135d489f9SAlex Elder if (ret < 0) 606235d489f9SAlex Elder goto out; 606335d489f9SAlex Elder 606435d489f9SAlex Elder p = reply_buf; 606557385b51SAlex Elder end = reply_buf + ret; 606657385b51SAlex Elder ret = -ERANGE; 606735d489f9SAlex Elder ceph_decode_64_safe(&p, end, seq, out); 606835d489f9SAlex Elder ceph_decode_32_safe(&p, end, snap_count, out); 606935d489f9SAlex Elder 607035d489f9SAlex Elder /* 607135d489f9SAlex Elder * Make sure the reported number of snapshot ids wouldn't go 607235d489f9SAlex Elder * beyond the end of our buffer. But before checking that, 607335d489f9SAlex Elder * make sure the computed size of the snapshot context we 607435d489f9SAlex Elder * allocate is representable in a size_t. 607535d489f9SAlex Elder */ 607635d489f9SAlex Elder if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context)) 607735d489f9SAlex Elder / sizeof (u64)) { 607835d489f9SAlex Elder ret = -EINVAL; 607935d489f9SAlex Elder goto out; 608035d489f9SAlex Elder } 608135d489f9SAlex Elder if (!ceph_has_room(&p, end, snap_count * sizeof (__le64))) 608235d489f9SAlex Elder goto out; 6083468521c1SAlex Elder ret = 0; 608435d489f9SAlex Elder 6085812164f8SAlex Elder snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); 608635d489f9SAlex Elder if (!snapc) { 608735d489f9SAlex Elder ret = -ENOMEM; 608835d489f9SAlex Elder goto out; 608935d489f9SAlex Elder } 609035d489f9SAlex Elder snapc->seq = seq; 609135d489f9SAlex Elder for (i = 0; i < snap_count; i++) 609235d489f9SAlex Elder snapc->snaps[i] = ceph_decode_64(&p); 609335d489f9SAlex Elder 609449ece554SAlex Elder ceph_put_snap_context(rbd_dev->header.snapc); 609535d489f9SAlex Elder rbd_dev->header.snapc = snapc; 609635d489f9SAlex Elder 609735d489f9SAlex Elder dout(" snap context seq = %llu, snap_count = %u\n", 609835d489f9SAlex Elder (unsigned long long)seq, (unsigned int)snap_count); 609935d489f9SAlex Elder out: 610035d489f9SAlex Elder kfree(reply_buf); 610135d489f9SAlex Elder 610257385b51SAlex Elder return ret; 610335d489f9SAlex Elder } 610435d489f9SAlex Elder 610554cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, 610654cac61fSAlex Elder u64 snap_id) 6107b8b1e2dbSAlex Elder { 6108b8b1e2dbSAlex Elder size_t size; 6109b8b1e2dbSAlex Elder void *reply_buf; 611054cac61fSAlex Elder __le64 snapid; 6111b8b1e2dbSAlex Elder int ret; 6112b8b1e2dbSAlex Elder void *p; 6113b8b1e2dbSAlex Elder void *end; 6114b8b1e2dbSAlex Elder char *snap_name; 6115b8b1e2dbSAlex Elder 6116b8b1e2dbSAlex Elder size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN; 6117b8b1e2dbSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 6118b8b1e2dbSAlex Elder if (!reply_buf) 6119b8b1e2dbSAlex Elder return ERR_PTR(-ENOMEM); 6120b8b1e2dbSAlex Elder 612154cac61fSAlex Elder snapid = cpu_to_le64(snap_id); 6122ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 6123ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_snapshot_name", 6124ecd4a68aSIlya Dryomov &snapid, sizeof(snapid), reply_buf, size); 612536be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 6126f40eb349SAlex Elder if (ret < 0) { 6127f40eb349SAlex Elder snap_name = ERR_PTR(ret); 6128b8b1e2dbSAlex Elder goto out; 6129f40eb349SAlex Elder } 6130b8b1e2dbSAlex Elder 6131b8b1e2dbSAlex Elder p = reply_buf; 6132f40eb349SAlex Elder end = reply_buf + ret; 6133e5c35534SAlex Elder snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 6134f40eb349SAlex Elder if (IS_ERR(snap_name)) 6135b8b1e2dbSAlex Elder goto out; 6136f40eb349SAlex Elder 6137b8b1e2dbSAlex Elder dout(" snap_id 0x%016llx snap_name = %s\n", 613854cac61fSAlex Elder (unsigned long long)snap_id, snap_name); 6139b8b1e2dbSAlex Elder out: 6140b8b1e2dbSAlex Elder kfree(reply_buf); 6141b8b1e2dbSAlex Elder 6142f40eb349SAlex Elder return snap_name; 6143b8b1e2dbSAlex Elder } 6144b8b1e2dbSAlex Elder 61452df3fac7SAlex Elder static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev) 6146117973fbSAlex Elder { 61472df3fac7SAlex Elder bool first_time = rbd_dev->header.object_prefix == NULL; 6148117973fbSAlex Elder int ret; 6149117973fbSAlex Elder 61501617e40cSJosh Durgin ret = rbd_dev_v2_image_size(rbd_dev); 61511617e40cSJosh Durgin if (ret) 6152cfbf6377SAlex Elder return ret; 61531617e40cSJosh Durgin 61542df3fac7SAlex Elder if (first_time) { 61552df3fac7SAlex Elder ret = rbd_dev_v2_header_onetime(rbd_dev); 61562df3fac7SAlex Elder if (ret) 6157cfbf6377SAlex Elder return ret; 61582df3fac7SAlex Elder } 61592df3fac7SAlex Elder 6160cc4a38bdSAlex Elder ret = rbd_dev_v2_snap_context(rbd_dev); 6161d194cd1dSIlya Dryomov if (ret && first_time) { 6162d194cd1dSIlya Dryomov kfree(rbd_dev->header.object_prefix); 6163d194cd1dSIlya Dryomov rbd_dev->header.object_prefix = NULL; 6164d194cd1dSIlya Dryomov } 6165117973fbSAlex Elder 6166117973fbSAlex Elder return ret; 6167117973fbSAlex Elder } 6168117973fbSAlex Elder 6169a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev) 6170a720ae09SIlya Dryomov { 6171a720ae09SIlya Dryomov rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 6172a720ae09SIlya Dryomov 6173a720ae09SIlya Dryomov if (rbd_dev->image_format == 1) 6174a720ae09SIlya Dryomov return rbd_dev_v1_header_info(rbd_dev); 6175a720ae09SIlya Dryomov 6176a720ae09SIlya Dryomov return rbd_dev_v2_header_info(rbd_dev); 6177a720ae09SIlya Dryomov } 6178a720ae09SIlya Dryomov 61791ddbe94eSAlex Elder /* 6180e28fff26SAlex Elder * Skips over white space at *buf, and updates *buf to point to the 6181e28fff26SAlex Elder * first found non-space character (if any). Returns the length of 6182593a9e7bSAlex Elder * the token (string of non-white space characters) found. Note 6183593a9e7bSAlex Elder * that *buf must be terminated with '\0'. 6184e28fff26SAlex Elder */ 6185e28fff26SAlex Elder static inline size_t next_token(const char **buf) 6186e28fff26SAlex Elder { 6187e28fff26SAlex Elder /* 6188e28fff26SAlex Elder * These are the characters that produce nonzero for 6189e28fff26SAlex Elder * isspace() in the "C" and "POSIX" locales. 6190e28fff26SAlex Elder */ 6191435a120aSColin Ian King static const char spaces[] = " \f\n\r\t\v"; 6192e28fff26SAlex Elder 6193e28fff26SAlex Elder *buf += strspn(*buf, spaces); /* Find start of token */ 6194e28fff26SAlex Elder 6195e28fff26SAlex Elder return strcspn(*buf, spaces); /* Return token length */ 6196e28fff26SAlex Elder } 6197e28fff26SAlex Elder 6198e28fff26SAlex Elder /* 6199ea3352f4SAlex Elder * Finds the next token in *buf, dynamically allocates a buffer big 6200ea3352f4SAlex Elder * enough to hold a copy of it, and copies the token into the new 6201ea3352f4SAlex Elder * buffer. The copy is guaranteed to be terminated with '\0'. Note 6202ea3352f4SAlex Elder * that a duplicate buffer is created even for a zero-length token. 6203ea3352f4SAlex Elder * 6204ea3352f4SAlex Elder * Returns a pointer to the newly-allocated duplicate, or a null 6205ea3352f4SAlex Elder * pointer if memory for the duplicate was not available. If 6206ea3352f4SAlex Elder * the lenp argument is a non-null pointer, the length of the token 6207ea3352f4SAlex Elder * (not including the '\0') is returned in *lenp. 6208ea3352f4SAlex Elder * 6209ea3352f4SAlex Elder * If successful, the *buf pointer will be updated to point beyond 6210ea3352f4SAlex Elder * the end of the found token. 6211ea3352f4SAlex Elder * 6212ea3352f4SAlex Elder * Note: uses GFP_KERNEL for allocation. 6213ea3352f4SAlex Elder */ 6214ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp) 6215ea3352f4SAlex Elder { 6216ea3352f4SAlex Elder char *dup; 6217ea3352f4SAlex Elder size_t len; 6218ea3352f4SAlex Elder 6219ea3352f4SAlex Elder len = next_token(buf); 62204caf35f9SAlex Elder dup = kmemdup(*buf, len + 1, GFP_KERNEL); 6221ea3352f4SAlex Elder if (!dup) 6222ea3352f4SAlex Elder return NULL; 6223ea3352f4SAlex Elder *(dup + len) = '\0'; 6224ea3352f4SAlex Elder *buf += len; 6225ea3352f4SAlex Elder 6226ea3352f4SAlex Elder if (lenp) 6227ea3352f4SAlex Elder *lenp = len; 6228ea3352f4SAlex Elder 6229ea3352f4SAlex Elder return dup; 6230ea3352f4SAlex Elder } 6231ea3352f4SAlex Elder 623282995cc6SDavid Howells static int rbd_parse_param(struct fs_parameter *param, 623382995cc6SDavid Howells struct rbd_parse_opts_ctx *pctx) 623482995cc6SDavid Howells { 623582995cc6SDavid Howells struct rbd_options *opt = pctx->opts; 623682995cc6SDavid Howells struct fs_parse_result result; 62373fbb8d55SAl Viro struct p_log log = {.prefix = "rbd"}; 623882995cc6SDavid Howells int token, ret; 623982995cc6SDavid Howells 624082995cc6SDavid Howells ret = ceph_parse_param(param, pctx->copts, NULL); 624182995cc6SDavid Howells if (ret != -ENOPARAM) 624282995cc6SDavid Howells return ret; 624382995cc6SDavid Howells 6244d7167b14SAl Viro token = __fs_parse(&log, rbd_parameters, param, &result); 624582995cc6SDavid Howells dout("%s fs_parse '%s' token %d\n", __func__, param->key, token); 624682995cc6SDavid Howells if (token < 0) { 62472c3f3dc3SAl Viro if (token == -ENOPARAM) 62482c3f3dc3SAl Viro return inval_plog(&log, "Unknown parameter '%s'", 624982995cc6SDavid Howells param->key); 625082995cc6SDavid Howells return token; 625182995cc6SDavid Howells } 625282995cc6SDavid Howells 625382995cc6SDavid Howells switch (token) { 625482995cc6SDavid Howells case Opt_queue_depth: 625582995cc6SDavid Howells if (result.uint_32 < 1) 625682995cc6SDavid Howells goto out_of_range; 625782995cc6SDavid Howells opt->queue_depth = result.uint_32; 625882995cc6SDavid Howells break; 625982995cc6SDavid Howells case Opt_alloc_size: 626082995cc6SDavid Howells if (result.uint_32 < SECTOR_SIZE) 626182995cc6SDavid Howells goto out_of_range; 62622c3f3dc3SAl Viro if (!is_power_of_2(result.uint_32)) 62632c3f3dc3SAl Viro return inval_plog(&log, "alloc_size must be a power of 2"); 626482995cc6SDavid Howells opt->alloc_size = result.uint_32; 626582995cc6SDavid Howells break; 626682995cc6SDavid Howells case Opt_lock_timeout: 626782995cc6SDavid Howells /* 0 is "wait forever" (i.e. infinite timeout) */ 626882995cc6SDavid Howells if (result.uint_32 > INT_MAX / 1000) 626982995cc6SDavid Howells goto out_of_range; 627082995cc6SDavid Howells opt->lock_timeout = msecs_to_jiffies(result.uint_32 * 1000); 627182995cc6SDavid Howells break; 627282995cc6SDavid Howells case Opt_pool_ns: 627382995cc6SDavid Howells kfree(pctx->spec->pool_ns); 627482995cc6SDavid Howells pctx->spec->pool_ns = param->string; 627582995cc6SDavid Howells param->string = NULL; 627682995cc6SDavid Howells break; 6277dc1dad8eSIlya Dryomov case Opt_compression_hint: 6278dc1dad8eSIlya Dryomov switch (result.uint_32) { 6279dc1dad8eSIlya Dryomov case Opt_compression_hint_none: 6280dc1dad8eSIlya Dryomov opt->alloc_hint_flags &= 6281dc1dad8eSIlya Dryomov ~(CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE | 6282dc1dad8eSIlya Dryomov CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE); 6283dc1dad8eSIlya Dryomov break; 6284dc1dad8eSIlya Dryomov case Opt_compression_hint_compressible: 6285dc1dad8eSIlya Dryomov opt->alloc_hint_flags |= 6286dc1dad8eSIlya Dryomov CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE; 6287dc1dad8eSIlya Dryomov opt->alloc_hint_flags &= 6288dc1dad8eSIlya Dryomov ~CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE; 6289dc1dad8eSIlya Dryomov break; 6290dc1dad8eSIlya Dryomov case Opt_compression_hint_incompressible: 6291dc1dad8eSIlya Dryomov opt->alloc_hint_flags |= 6292dc1dad8eSIlya Dryomov CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE; 6293dc1dad8eSIlya Dryomov opt->alloc_hint_flags &= 6294dc1dad8eSIlya Dryomov ~CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE; 6295dc1dad8eSIlya Dryomov break; 6296dc1dad8eSIlya Dryomov default: 6297dc1dad8eSIlya Dryomov BUG(); 6298dc1dad8eSIlya Dryomov } 6299dc1dad8eSIlya Dryomov break; 630082995cc6SDavid Howells case Opt_read_only: 630182995cc6SDavid Howells opt->read_only = true; 630282995cc6SDavid Howells break; 630382995cc6SDavid Howells case Opt_read_write: 630482995cc6SDavid Howells opt->read_only = false; 630582995cc6SDavid Howells break; 630682995cc6SDavid Howells case Opt_lock_on_read: 630782995cc6SDavid Howells opt->lock_on_read = true; 630882995cc6SDavid Howells break; 630982995cc6SDavid Howells case Opt_exclusive: 631082995cc6SDavid Howells opt->exclusive = true; 631182995cc6SDavid Howells break; 631282995cc6SDavid Howells case Opt_notrim: 631382995cc6SDavid Howells opt->trim = false; 631482995cc6SDavid Howells break; 631582995cc6SDavid Howells default: 631682995cc6SDavid Howells BUG(); 631782995cc6SDavid Howells } 631882995cc6SDavid Howells 631982995cc6SDavid Howells return 0; 632082995cc6SDavid Howells 632182995cc6SDavid Howells out_of_range: 63222c3f3dc3SAl Viro return inval_plog(&log, "%s out of range", param->key); 632382995cc6SDavid Howells } 632482995cc6SDavid Howells 632582995cc6SDavid Howells /* 632682995cc6SDavid Howells * This duplicates most of generic_parse_monolithic(), untying it from 632782995cc6SDavid Howells * fs_context and skipping standard superblock and security options. 632882995cc6SDavid Howells */ 632982995cc6SDavid Howells static int rbd_parse_options(char *options, struct rbd_parse_opts_ctx *pctx) 633082995cc6SDavid Howells { 633182995cc6SDavid Howells char *key; 633282995cc6SDavid Howells int ret = 0; 633382995cc6SDavid Howells 633482995cc6SDavid Howells dout("%s '%s'\n", __func__, options); 633582995cc6SDavid Howells while ((key = strsep(&options, ",")) != NULL) { 633682995cc6SDavid Howells if (*key) { 633782995cc6SDavid Howells struct fs_parameter param = { 633882995cc6SDavid Howells .key = key, 63390f89589aSAl Viro .type = fs_value_is_flag, 634082995cc6SDavid Howells }; 634182995cc6SDavid Howells char *value = strchr(key, '='); 634282995cc6SDavid Howells size_t v_len = 0; 634382995cc6SDavid Howells 634482995cc6SDavid Howells if (value) { 634582995cc6SDavid Howells if (value == key) 634682995cc6SDavid Howells continue; 634782995cc6SDavid Howells *value++ = 0; 634882995cc6SDavid Howells v_len = strlen(value); 634982995cc6SDavid Howells param.string = kmemdup_nul(value, v_len, 635082995cc6SDavid Howells GFP_KERNEL); 635182995cc6SDavid Howells if (!param.string) 635282995cc6SDavid Howells return -ENOMEM; 63530f89589aSAl Viro param.type = fs_value_is_string; 635482995cc6SDavid Howells } 635582995cc6SDavid Howells param.size = v_len; 635682995cc6SDavid Howells 635782995cc6SDavid Howells ret = rbd_parse_param(¶m, pctx); 635882995cc6SDavid Howells kfree(param.string); 635982995cc6SDavid Howells if (ret) 636082995cc6SDavid Howells break; 636182995cc6SDavid Howells } 636282995cc6SDavid Howells } 636382995cc6SDavid Howells 636482995cc6SDavid Howells return ret; 636582995cc6SDavid Howells } 636682995cc6SDavid Howells 6367ea3352f4SAlex Elder /* 6368859c31dfSAlex Elder * Parse the options provided for an "rbd add" (i.e., rbd image 6369859c31dfSAlex Elder * mapping) request. These arrive via a write to /sys/bus/rbd/add, 6370859c31dfSAlex Elder * and the data written is passed here via a NUL-terminated buffer. 6371859c31dfSAlex Elder * Returns 0 if successful or an error code otherwise. 6372d22f76e7SAlex Elder * 6373859c31dfSAlex Elder * The information extracted from these options is recorded in 6374859c31dfSAlex Elder * the other parameters which return dynamically-allocated 6375859c31dfSAlex Elder * structures: 6376859c31dfSAlex Elder * ceph_opts 6377859c31dfSAlex Elder * The address of a pointer that will refer to a ceph options 6378859c31dfSAlex Elder * structure. Caller must release the returned pointer using 6379859c31dfSAlex Elder * ceph_destroy_options() when it is no longer needed. 6380859c31dfSAlex Elder * rbd_opts 6381859c31dfSAlex Elder * Address of an rbd options pointer. Fully initialized by 6382859c31dfSAlex Elder * this function; caller must release with kfree(). 6383859c31dfSAlex Elder * spec 6384859c31dfSAlex Elder * Address of an rbd image specification pointer. Fully 6385859c31dfSAlex Elder * initialized by this function based on parsed options. 6386859c31dfSAlex Elder * Caller must release with rbd_spec_put(). 6387859c31dfSAlex Elder * 6388859c31dfSAlex Elder * The options passed take this form: 6389859c31dfSAlex Elder * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>] 6390859c31dfSAlex Elder * where: 6391859c31dfSAlex Elder * <mon_addrs> 6392859c31dfSAlex Elder * A comma-separated list of one or more monitor addresses. 6393859c31dfSAlex Elder * A monitor address is an ip address, optionally followed 6394859c31dfSAlex Elder * by a port number (separated by a colon). 6395859c31dfSAlex Elder * I.e.: ip1[:port1][,ip2[:port2]...] 6396859c31dfSAlex Elder * <options> 6397859c31dfSAlex Elder * A comma-separated list of ceph and/or rbd options. 6398859c31dfSAlex Elder * <pool_name> 6399859c31dfSAlex Elder * The name of the rados pool containing the rbd image. 6400859c31dfSAlex Elder * <image_name> 6401859c31dfSAlex Elder * The name of the image in that pool to map. 6402859c31dfSAlex Elder * <snap_id> 6403859c31dfSAlex Elder * An optional snapshot id. If provided, the mapping will 6404859c31dfSAlex Elder * present data from the image at the time that snapshot was 6405859c31dfSAlex Elder * created. The image head is used if no snapshot id is 6406859c31dfSAlex Elder * provided. Snapshot mappings are always read-only. 6407a725f65eSAlex Elder */ 6408859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf, 6409dc79b113SAlex Elder struct ceph_options **ceph_opts, 6410859c31dfSAlex Elder struct rbd_options **opts, 6411859c31dfSAlex Elder struct rbd_spec **rbd_spec) 6412a725f65eSAlex Elder { 6413e28fff26SAlex Elder size_t len; 6414859c31dfSAlex Elder char *options; 64150ddebc0cSAlex Elder const char *mon_addrs; 6416ecb4dc22SAlex Elder char *snap_name; 64170ddebc0cSAlex Elder size_t mon_addrs_size; 641882995cc6SDavid Howells struct rbd_parse_opts_ctx pctx = { 0 }; 6419dc79b113SAlex Elder int ret; 6420e28fff26SAlex Elder 6421e28fff26SAlex Elder /* The first four tokens are required */ 6422e28fff26SAlex Elder 64237ef3214aSAlex Elder len = next_token(&buf); 64244fb5d671SAlex Elder if (!len) { 64254fb5d671SAlex Elder rbd_warn(NULL, "no monitor address(es) provided"); 64264fb5d671SAlex Elder return -EINVAL; 64274fb5d671SAlex Elder } 64280ddebc0cSAlex Elder mon_addrs = buf; 642982995cc6SDavid Howells mon_addrs_size = len; 64307ef3214aSAlex Elder buf += len; 6431a725f65eSAlex Elder 6432dc79b113SAlex Elder ret = -EINVAL; 6433f28e565aSAlex Elder options = dup_token(&buf, NULL); 6434f28e565aSAlex Elder if (!options) 6435dc79b113SAlex Elder return -ENOMEM; 64364fb5d671SAlex Elder if (!*options) { 64374fb5d671SAlex Elder rbd_warn(NULL, "no options provided"); 64384fb5d671SAlex Elder goto out_err; 64394fb5d671SAlex Elder } 6440a725f65eSAlex Elder 6441c300156bSIlya Dryomov pctx.spec = rbd_spec_alloc(); 6442c300156bSIlya Dryomov if (!pctx.spec) 6443f28e565aSAlex Elder goto out_mem; 6444859c31dfSAlex Elder 6445c300156bSIlya Dryomov pctx.spec->pool_name = dup_token(&buf, NULL); 6446c300156bSIlya Dryomov if (!pctx.spec->pool_name) 6447859c31dfSAlex Elder goto out_mem; 6448c300156bSIlya Dryomov if (!*pctx.spec->pool_name) { 64494fb5d671SAlex Elder rbd_warn(NULL, "no pool name provided"); 64504fb5d671SAlex Elder goto out_err; 64514fb5d671SAlex Elder } 6452e28fff26SAlex Elder 6453c300156bSIlya Dryomov pctx.spec->image_name = dup_token(&buf, NULL); 6454c300156bSIlya Dryomov if (!pctx.spec->image_name) 6455f28e565aSAlex Elder goto out_mem; 6456c300156bSIlya Dryomov if (!*pctx.spec->image_name) { 64574fb5d671SAlex Elder rbd_warn(NULL, "no image name provided"); 64584fb5d671SAlex Elder goto out_err; 64594fb5d671SAlex Elder } 6460e28fff26SAlex Elder 6461f28e565aSAlex Elder /* 6462f28e565aSAlex Elder * Snapshot name is optional; default is to use "-" 6463f28e565aSAlex Elder * (indicating the head/no snapshot). 6464f28e565aSAlex Elder */ 64653feeb894SAlex Elder len = next_token(&buf); 6466820a5f3eSAlex Elder if (!len) { 64673feeb894SAlex Elder buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */ 64683feeb894SAlex Elder len = sizeof (RBD_SNAP_HEAD_NAME) - 1; 6469f28e565aSAlex Elder } else if (len > RBD_MAX_SNAP_NAME_LEN) { 6470dc79b113SAlex Elder ret = -ENAMETOOLONG; 6471f28e565aSAlex Elder goto out_err; 6472849b4260SAlex Elder } 6473ecb4dc22SAlex Elder snap_name = kmemdup(buf, len + 1, GFP_KERNEL); 6474ecb4dc22SAlex Elder if (!snap_name) 6475f28e565aSAlex Elder goto out_mem; 6476ecb4dc22SAlex Elder *(snap_name + len) = '\0'; 6477c300156bSIlya Dryomov pctx.spec->snap_name = snap_name; 6478e5c35534SAlex Elder 647982995cc6SDavid Howells pctx.copts = ceph_alloc_options(); 648082995cc6SDavid Howells if (!pctx.copts) 648182995cc6SDavid Howells goto out_mem; 648282995cc6SDavid Howells 64830ddebc0cSAlex Elder /* Initialize all rbd options to the defaults */ 6484e28fff26SAlex Elder 6485c300156bSIlya Dryomov pctx.opts = kzalloc(sizeof(*pctx.opts), GFP_KERNEL); 6486c300156bSIlya Dryomov if (!pctx.opts) 64874e9afebaSAlex Elder goto out_mem; 64884e9afebaSAlex Elder 6489c300156bSIlya Dryomov pctx.opts->read_only = RBD_READ_ONLY_DEFAULT; 6490c300156bSIlya Dryomov pctx.opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT; 64910c93e1b7SIlya Dryomov pctx.opts->alloc_size = RBD_ALLOC_SIZE_DEFAULT; 6492c300156bSIlya Dryomov pctx.opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT; 6493c300156bSIlya Dryomov pctx.opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT; 6494c300156bSIlya Dryomov pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT; 6495c300156bSIlya Dryomov pctx.opts->trim = RBD_TRIM_DEFAULT; 6496d22f76e7SAlex Elder 64972d7c86a8SVenky Shankar ret = ceph_parse_mon_ips(mon_addrs, mon_addrs_size, pctx.copts, NULL, 64982d7c86a8SVenky Shankar ','); 649982995cc6SDavid Howells if (ret) 6500dc79b113SAlex Elder goto out_err; 6501859c31dfSAlex Elder 650282995cc6SDavid Howells ret = rbd_parse_options(options, &pctx); 650382995cc6SDavid Howells if (ret) 650482995cc6SDavid Howells goto out_err; 650582995cc6SDavid Howells 650682995cc6SDavid Howells *ceph_opts = pctx.copts; 6507c300156bSIlya Dryomov *opts = pctx.opts; 6508c300156bSIlya Dryomov *rbd_spec = pctx.spec; 650982995cc6SDavid Howells kfree(options); 6510dc79b113SAlex Elder return 0; 651182995cc6SDavid Howells 6512f28e565aSAlex Elder out_mem: 6513dc79b113SAlex Elder ret = -ENOMEM; 6514d22f76e7SAlex Elder out_err: 6515c300156bSIlya Dryomov kfree(pctx.opts); 651682995cc6SDavid Howells ceph_destroy_options(pctx.copts); 6517c300156bSIlya Dryomov rbd_spec_put(pctx.spec); 6518f28e565aSAlex Elder kfree(options); 6519dc79b113SAlex Elder return ret; 6520a725f65eSAlex Elder } 6521a725f65eSAlex Elder 6522e010dd0aSIlya Dryomov static void rbd_dev_image_unlock(struct rbd_device *rbd_dev) 6523e010dd0aSIlya Dryomov { 6524e010dd0aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 6525e010dd0aSIlya Dryomov if (__rbd_is_lock_owner(rbd_dev)) 6526e1fddc8fSIlya Dryomov __rbd_release_lock(rbd_dev); 6527e010dd0aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 6528e010dd0aSIlya Dryomov } 6529e010dd0aSIlya Dryomov 6530637cd060SIlya Dryomov /* 6531637cd060SIlya Dryomov * If the wait is interrupted, an error is returned even if the lock 6532637cd060SIlya Dryomov * was successfully acquired. rbd_dev_image_unlock() will release it 6533637cd060SIlya Dryomov * if needed. 6534637cd060SIlya Dryomov */ 6535e010dd0aSIlya Dryomov static int rbd_add_acquire_lock(struct rbd_device *rbd_dev) 6536e010dd0aSIlya Dryomov { 6537637cd060SIlya Dryomov long ret; 65382f18d466SIlya Dryomov 6539e010dd0aSIlya Dryomov if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) { 6540637cd060SIlya Dryomov if (!rbd_dev->opts->exclusive && !rbd_dev->opts->lock_on_read) 6541637cd060SIlya Dryomov return 0; 6542637cd060SIlya Dryomov 6543e010dd0aSIlya Dryomov rbd_warn(rbd_dev, "exclusive-lock feature is not enabled"); 6544e010dd0aSIlya Dryomov return -EINVAL; 6545e010dd0aSIlya Dryomov } 6546e010dd0aSIlya Dryomov 65473fe69921SIlya Dryomov if (rbd_is_ro(rbd_dev)) 6548637cd060SIlya Dryomov return 0; 6549637cd060SIlya Dryomov 6550637cd060SIlya Dryomov rbd_assert(!rbd_is_lock_owner(rbd_dev)); 6551637cd060SIlya Dryomov queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0); 6552637cd060SIlya Dryomov ret = wait_for_completion_killable_timeout(&rbd_dev->acquire_wait, 6553637cd060SIlya Dryomov ceph_timeout_jiffies(rbd_dev->opts->lock_timeout)); 655425e6be21SDongsheng Yang if (ret > 0) { 6555637cd060SIlya Dryomov ret = rbd_dev->acquire_err; 655625e6be21SDongsheng Yang } else { 655725e6be21SDongsheng Yang cancel_delayed_work_sync(&rbd_dev->lock_dwork); 655825e6be21SDongsheng Yang if (!ret) 6559637cd060SIlya Dryomov ret = -ETIMEDOUT; 656025e6be21SDongsheng Yang } 6561637cd060SIlya Dryomov 65622f18d466SIlya Dryomov if (ret) { 6563637cd060SIlya Dryomov rbd_warn(rbd_dev, "failed to acquire exclusive lock: %ld", ret); 6564637cd060SIlya Dryomov return ret; 6565e010dd0aSIlya Dryomov } 6566e010dd0aSIlya Dryomov 6567637cd060SIlya Dryomov /* 6568637cd060SIlya Dryomov * The lock may have been released by now, unless automatic lock 6569637cd060SIlya Dryomov * transitions are disabled. 6570637cd060SIlya Dryomov */ 6571637cd060SIlya Dryomov rbd_assert(!rbd_dev->opts->exclusive || rbd_is_lock_owner(rbd_dev)); 6572e010dd0aSIlya Dryomov return 0; 6573e010dd0aSIlya Dryomov } 6574e010dd0aSIlya Dryomov 657530ba1f02SIlya Dryomov /* 6576589d30e0SAlex Elder * An rbd format 2 image has a unique identifier, distinct from the 6577589d30e0SAlex Elder * name given to it by the user. Internally, that identifier is 6578589d30e0SAlex Elder * what's used to specify the names of objects related to the image. 6579589d30e0SAlex Elder * 6580589d30e0SAlex Elder * A special "rbd id" object is used to map an rbd image name to its 6581589d30e0SAlex Elder * id. If that object doesn't exist, then there is no v2 rbd image 6582589d30e0SAlex Elder * with the supplied name. 6583589d30e0SAlex Elder * 6584589d30e0SAlex Elder * This function will record the given rbd_dev's image_id field if 6585589d30e0SAlex Elder * it can be determined, and in that case will return 0. If any 6586589d30e0SAlex Elder * errors occur a negative errno will be returned and the rbd_dev's 6587589d30e0SAlex Elder * image_id field will be unchanged (and should be NULL). 6588589d30e0SAlex Elder */ 6589589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev) 6590589d30e0SAlex Elder { 6591589d30e0SAlex Elder int ret; 6592589d30e0SAlex Elder size_t size; 6593ecd4a68aSIlya Dryomov CEPH_DEFINE_OID_ONSTACK(oid); 6594589d30e0SAlex Elder void *response; 6595c0fba368SAlex Elder char *image_id; 65962f82ee54SAlex Elder 6597589d30e0SAlex Elder /* 65982c0d0a10SAlex Elder * When probing a parent image, the image id is already 65992c0d0a10SAlex Elder * known (and the image name likely is not). There's no 6600c0fba368SAlex Elder * need to fetch the image id again in this case. We 6601c0fba368SAlex Elder * do still need to set the image format though. 66022c0d0a10SAlex Elder */ 6603c0fba368SAlex Elder if (rbd_dev->spec->image_id) { 6604c0fba368SAlex Elder rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1; 6605c0fba368SAlex Elder 66062c0d0a10SAlex Elder return 0; 6607c0fba368SAlex Elder } 66082c0d0a10SAlex Elder 66092c0d0a10SAlex Elder /* 6610589d30e0SAlex Elder * First, see if the format 2 image id file exists, and if 6611589d30e0SAlex Elder * so, get the image's persistent id from it. 6612589d30e0SAlex Elder */ 6613ecd4a68aSIlya Dryomov ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX, 6614ecd4a68aSIlya Dryomov rbd_dev->spec->image_name); 6615ecd4a68aSIlya Dryomov if (ret) 6616ecd4a68aSIlya Dryomov return ret; 6617ecd4a68aSIlya Dryomov 6618ecd4a68aSIlya Dryomov dout("rbd id object name is %s\n", oid.name); 6619589d30e0SAlex Elder 6620589d30e0SAlex Elder /* Response will be an encoded string, which includes a length */ 6621589d30e0SAlex Elder size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; 6622589d30e0SAlex Elder response = kzalloc(size, GFP_NOIO); 6623589d30e0SAlex Elder if (!response) { 6624589d30e0SAlex Elder ret = -ENOMEM; 6625589d30e0SAlex Elder goto out; 6626589d30e0SAlex Elder } 6627589d30e0SAlex Elder 6628c0fba368SAlex Elder /* If it doesn't exist we'll assume it's a format 1 image */ 6629c0fba368SAlex Elder 6630ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc, 6631ecd4a68aSIlya Dryomov "get_id", NULL, 0, 66325435d206SDongsheng Yang response, size); 663336be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 6634c0fba368SAlex Elder if (ret == -ENOENT) { 6635c0fba368SAlex Elder image_id = kstrdup("", GFP_KERNEL); 6636c0fba368SAlex Elder ret = image_id ? 0 : -ENOMEM; 6637c0fba368SAlex Elder if (!ret) 6638c0fba368SAlex Elder rbd_dev->image_format = 1; 66397dd440c9SIlya Dryomov } else if (ret >= 0) { 6640c0fba368SAlex Elder void *p = response; 6641589d30e0SAlex Elder 6642c0fba368SAlex Elder image_id = ceph_extract_encoded_string(&p, p + ret, 6643979ed480SAlex Elder NULL, GFP_NOIO); 6644461f758aSDuan Jiong ret = PTR_ERR_OR_ZERO(image_id); 6645c0fba368SAlex Elder if (!ret) 6646c0fba368SAlex Elder rbd_dev->image_format = 2; 6647c0fba368SAlex Elder } 6648c0fba368SAlex Elder 6649c0fba368SAlex Elder if (!ret) { 6650c0fba368SAlex Elder rbd_dev->spec->image_id = image_id; 6651c0fba368SAlex Elder dout("image_id is %s\n", image_id); 6652589d30e0SAlex Elder } 6653589d30e0SAlex Elder out: 6654589d30e0SAlex Elder kfree(response); 6655ecd4a68aSIlya Dryomov ceph_oid_destroy(&oid); 6656589d30e0SAlex Elder return ret; 6657589d30e0SAlex Elder } 6658589d30e0SAlex Elder 66593abef3b3SAlex Elder /* 66603abef3b3SAlex Elder * Undo whatever state changes are made by v1 or v2 header info 66613abef3b3SAlex Elder * call. 66623abef3b3SAlex Elder */ 66636fd48b3bSAlex Elder static void rbd_dev_unprobe(struct rbd_device *rbd_dev) 66646fd48b3bSAlex Elder { 66656fd48b3bSAlex Elder struct rbd_image_header *header; 66666fd48b3bSAlex Elder 6667a2acd00eSAlex Elder rbd_dev_parent_put(rbd_dev); 666822e8bd51SIlya Dryomov rbd_object_map_free(rbd_dev); 6669da5ef6beSIlya Dryomov rbd_dev_mapping_clear(rbd_dev); 66706fd48b3bSAlex Elder 66716fd48b3bSAlex Elder /* Free dynamic fields from the header, then zero it out */ 66726fd48b3bSAlex Elder 66736fd48b3bSAlex Elder header = &rbd_dev->header; 6674812164f8SAlex Elder ceph_put_snap_context(header->snapc); 66756fd48b3bSAlex Elder kfree(header->snap_sizes); 66766fd48b3bSAlex Elder kfree(header->snap_names); 66776fd48b3bSAlex Elder kfree(header->object_prefix); 66786fd48b3bSAlex Elder memset(header, 0, sizeof (*header)); 66796fd48b3bSAlex Elder } 66806fd48b3bSAlex Elder 66812df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev) 6682a30b71b9SAlex Elder { 6683a30b71b9SAlex Elder int ret; 6684a30b71b9SAlex Elder 66851e130199SAlex Elder ret = rbd_dev_v2_object_prefix(rbd_dev); 668657385b51SAlex Elder if (ret) 66871e130199SAlex Elder goto out_err; 6688b1b5402aSAlex Elder 66892df3fac7SAlex Elder /* 66902df3fac7SAlex Elder * Get the and check features for the image. Currently the 66912df3fac7SAlex Elder * features are assumed to never change. 66922df3fac7SAlex Elder */ 6693b1b5402aSAlex Elder ret = rbd_dev_v2_features(rbd_dev); 669457385b51SAlex Elder if (ret) 6695b1b5402aSAlex Elder goto out_err; 669635d489f9SAlex Elder 6697cc070d59SAlex Elder /* If the image supports fancy striping, get its parameters */ 6698cc070d59SAlex Elder 6699cc070d59SAlex Elder if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) { 6700cc070d59SAlex Elder ret = rbd_dev_v2_striping_info(rbd_dev); 6701cc070d59SAlex Elder if (ret < 0) 6702cc070d59SAlex Elder goto out_err; 6703cc070d59SAlex Elder } 6704a30b71b9SAlex Elder 67057e97332eSIlya Dryomov if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) { 67067e97332eSIlya Dryomov ret = rbd_dev_v2_data_pool(rbd_dev); 67077e97332eSIlya Dryomov if (ret) 67087e97332eSIlya Dryomov goto out_err; 67097e97332eSIlya Dryomov } 67107e97332eSIlya Dryomov 6711263423f8SIlya Dryomov rbd_init_layout(rbd_dev); 671235152979SAlex Elder return 0; 6713263423f8SIlya Dryomov 67149d475de5SAlex Elder out_err: 6715642a2537SAlex Elder rbd_dev->header.features = 0; 67161e130199SAlex Elder kfree(rbd_dev->header.object_prefix); 67171e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 67189d475de5SAlex Elder return ret; 6719a30b71b9SAlex Elder } 6720a30b71b9SAlex Elder 67216d69bb53SIlya Dryomov /* 67226d69bb53SIlya Dryomov * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() -> 67236d69bb53SIlya Dryomov * rbd_dev_image_probe() recursion depth, which means it's also the 67246d69bb53SIlya Dryomov * length of the already discovered part of the parent chain. 67256d69bb53SIlya Dryomov */ 67266d69bb53SIlya Dryomov static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth) 672783a06263SAlex Elder { 67282f82ee54SAlex Elder struct rbd_device *parent = NULL; 6729124afba2SAlex Elder int ret; 6730124afba2SAlex Elder 6731124afba2SAlex Elder if (!rbd_dev->parent_spec) 6732124afba2SAlex Elder return 0; 6733124afba2SAlex Elder 67346d69bb53SIlya Dryomov if (++depth > RBD_MAX_PARENT_CHAIN_LEN) { 67356d69bb53SIlya Dryomov pr_info("parent chain is too long (%d)\n", depth); 67366d69bb53SIlya Dryomov ret = -EINVAL; 67376d69bb53SIlya Dryomov goto out_err; 67386d69bb53SIlya Dryomov } 67396d69bb53SIlya Dryomov 67401643dfa4SIlya Dryomov parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec); 67411f2c6651SIlya Dryomov if (!parent) { 6742124afba2SAlex Elder ret = -ENOMEM; 6743124afba2SAlex Elder goto out_err; 67441f2c6651SIlya Dryomov } 67451f2c6651SIlya Dryomov 67461f2c6651SIlya Dryomov /* 67471f2c6651SIlya Dryomov * Images related by parent/child relationships always share 67481f2c6651SIlya Dryomov * rbd_client and spec/parent_spec, so bump their refcounts. 67491f2c6651SIlya Dryomov */ 67501f2c6651SIlya Dryomov __rbd_get_client(rbd_dev->rbd_client); 67511f2c6651SIlya Dryomov rbd_spec_get(rbd_dev->parent_spec); 6752124afba2SAlex Elder 675339258aa2SIlya Dryomov __set_bit(RBD_DEV_FLAG_READONLY, &parent->flags); 675439258aa2SIlya Dryomov 67556d69bb53SIlya Dryomov ret = rbd_dev_image_probe(parent, depth); 6756124afba2SAlex Elder if (ret < 0) 6757124afba2SAlex Elder goto out_err; 67581f2c6651SIlya Dryomov 6759124afba2SAlex Elder rbd_dev->parent = parent; 6760a2acd00eSAlex Elder atomic_set(&rbd_dev->parent_ref, 1); 6761124afba2SAlex Elder return 0; 6762124afba2SAlex Elder 67631f2c6651SIlya Dryomov out_err: 67641f2c6651SIlya Dryomov rbd_dev_unparent(rbd_dev); 67651f2c6651SIlya Dryomov rbd_dev_destroy(parent); 6766124afba2SAlex Elder return ret; 6767124afba2SAlex Elder } 6768124afba2SAlex Elder 67695769ed0cSIlya Dryomov static void rbd_dev_device_release(struct rbd_device *rbd_dev) 67705769ed0cSIlya Dryomov { 67715769ed0cSIlya Dryomov clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 67725769ed0cSIlya Dryomov rbd_free_disk(rbd_dev); 67735769ed0cSIlya Dryomov if (!single_major) 67745769ed0cSIlya Dryomov unregister_blkdev(rbd_dev->major, rbd_dev->name); 67755769ed0cSIlya Dryomov } 67765769ed0cSIlya Dryomov 6777811c6688SIlya Dryomov /* 6778811c6688SIlya Dryomov * rbd_dev->header_rwsem must be locked for write and will be unlocked 6779811c6688SIlya Dryomov * upon return. 6780811c6688SIlya Dryomov */ 6781200a6a8bSAlex Elder static int rbd_dev_device_setup(struct rbd_device *rbd_dev) 6782124afba2SAlex Elder { 678383a06263SAlex Elder int ret; 678483a06263SAlex Elder 67859b60e70bSIlya Dryomov /* Record our major and minor device numbers. */ 678683a06263SAlex Elder 67879b60e70bSIlya Dryomov if (!single_major) { 678883a06263SAlex Elder ret = register_blkdev(0, rbd_dev->name); 678983a06263SAlex Elder if (ret < 0) 67901643dfa4SIlya Dryomov goto err_out_unlock; 67919b60e70bSIlya Dryomov 679283a06263SAlex Elder rbd_dev->major = ret; 6793dd82fff1SIlya Dryomov rbd_dev->minor = 0; 67949b60e70bSIlya Dryomov } else { 67959b60e70bSIlya Dryomov rbd_dev->major = rbd_major; 67969b60e70bSIlya Dryomov rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id); 67979b60e70bSIlya Dryomov } 679883a06263SAlex Elder 679983a06263SAlex Elder /* Set up the blkdev mapping. */ 680083a06263SAlex Elder 680183a06263SAlex Elder ret = rbd_init_disk(rbd_dev); 680283a06263SAlex Elder if (ret) 680383a06263SAlex Elder goto err_out_blkdev; 680483a06263SAlex Elder 6805f35a4deeSAlex Elder set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); 680639258aa2SIlya Dryomov set_disk_ro(rbd_dev->disk, rbd_is_ro(rbd_dev)); 6807f35a4deeSAlex Elder 68085769ed0cSIlya Dryomov ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id); 6809f35a4deeSAlex Elder if (ret) 6810da5ef6beSIlya Dryomov goto err_out_disk; 681183a06263SAlex Elder 6812129b79d4SAlex Elder set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 6813811c6688SIlya Dryomov up_write(&rbd_dev->header_rwsem); 68145769ed0cSIlya Dryomov return 0; 68152f82ee54SAlex Elder 681683a06263SAlex Elder err_out_disk: 681783a06263SAlex Elder rbd_free_disk(rbd_dev); 681883a06263SAlex Elder err_out_blkdev: 68199b60e70bSIlya Dryomov if (!single_major) 682083a06263SAlex Elder unregister_blkdev(rbd_dev->major, rbd_dev->name); 6821811c6688SIlya Dryomov err_out_unlock: 6822811c6688SIlya Dryomov up_write(&rbd_dev->header_rwsem); 682383a06263SAlex Elder return ret; 682483a06263SAlex Elder } 682583a06263SAlex Elder 6826332bb12dSAlex Elder static int rbd_dev_header_name(struct rbd_device *rbd_dev) 6827332bb12dSAlex Elder { 6828332bb12dSAlex Elder struct rbd_spec *spec = rbd_dev->spec; 6829c41d13a3SIlya Dryomov int ret; 6830332bb12dSAlex Elder 6831332bb12dSAlex Elder /* Record the header object name for this rbd image. */ 6832332bb12dSAlex Elder 6833332bb12dSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 6834332bb12dSAlex Elder if (rbd_dev->image_format == 1) 6835c41d13a3SIlya Dryomov ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s", 6836332bb12dSAlex Elder spec->image_name, RBD_SUFFIX); 6837332bb12dSAlex Elder else 6838c41d13a3SIlya Dryomov ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s", 6839332bb12dSAlex Elder RBD_HEADER_PREFIX, spec->image_id); 6840c41d13a3SIlya Dryomov 6841c41d13a3SIlya Dryomov return ret; 6842332bb12dSAlex Elder } 6843332bb12dSAlex Elder 6844b9ef2b88SIlya Dryomov static void rbd_print_dne(struct rbd_device *rbd_dev, bool is_snap) 6845b9ef2b88SIlya Dryomov { 6846b9ef2b88SIlya Dryomov if (!is_snap) { 6847b9ef2b88SIlya Dryomov pr_info("image %s/%s%s%s does not exist\n", 6848b9ef2b88SIlya Dryomov rbd_dev->spec->pool_name, 6849b9ef2b88SIlya Dryomov rbd_dev->spec->pool_ns ?: "", 6850b9ef2b88SIlya Dryomov rbd_dev->spec->pool_ns ? "/" : "", 6851b9ef2b88SIlya Dryomov rbd_dev->spec->image_name); 6852b9ef2b88SIlya Dryomov } else { 6853b9ef2b88SIlya Dryomov pr_info("snap %s/%s%s%s@%s does not exist\n", 6854b9ef2b88SIlya Dryomov rbd_dev->spec->pool_name, 6855b9ef2b88SIlya Dryomov rbd_dev->spec->pool_ns ?: "", 6856b9ef2b88SIlya Dryomov rbd_dev->spec->pool_ns ? "/" : "", 6857b9ef2b88SIlya Dryomov rbd_dev->spec->image_name, 6858b9ef2b88SIlya Dryomov rbd_dev->spec->snap_name); 6859b9ef2b88SIlya Dryomov } 6860b9ef2b88SIlya Dryomov } 6861b9ef2b88SIlya Dryomov 6862200a6a8bSAlex Elder static void rbd_dev_image_release(struct rbd_device *rbd_dev) 6863200a6a8bSAlex Elder { 6864b8776051SIlya Dryomov if (!rbd_is_ro(rbd_dev)) 6865fd22aef8SIlya Dryomov rbd_unregister_watch(rbd_dev); 6866952c48b0SIlya Dryomov 6867952c48b0SIlya Dryomov rbd_dev_unprobe(rbd_dev); 68686fd48b3bSAlex Elder rbd_dev->image_format = 0; 68696fd48b3bSAlex Elder kfree(rbd_dev->spec->image_id); 68706fd48b3bSAlex Elder rbd_dev->spec->image_id = NULL; 6871200a6a8bSAlex Elder } 6872200a6a8bSAlex Elder 6873a30b71b9SAlex Elder /* 6874a30b71b9SAlex Elder * Probe for the existence of the header object for the given rbd 68751f3ef788SAlex Elder * device. If this image is the one being mapped (i.e., not a 68761f3ef788SAlex Elder * parent), initiate a watch on its header object before using that 68771f3ef788SAlex Elder * object to get detailed information about the rbd image. 68780e4e1de5SIlya Dryomov * 68790e4e1de5SIlya Dryomov * On success, returns with header_rwsem held for write if called 68800e4e1de5SIlya Dryomov * with @depth == 0. 6881a30b71b9SAlex Elder */ 68826d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) 6883a30b71b9SAlex Elder { 6884b9ef2b88SIlya Dryomov bool need_watch = !rbd_is_ro(rbd_dev); 6885a30b71b9SAlex Elder int ret; 6886a30b71b9SAlex Elder 6887a30b71b9SAlex Elder /* 68883abef3b3SAlex Elder * Get the id from the image id object. Unless there's an 68893abef3b3SAlex Elder * error, rbd_dev->spec->image_id will be filled in with 68903abef3b3SAlex Elder * a dynamically-allocated string, and rbd_dev->image_format 68913abef3b3SAlex Elder * will be set to either 1 or 2. 6892a30b71b9SAlex Elder */ 6893a30b71b9SAlex Elder ret = rbd_dev_image_id(rbd_dev); 6894a30b71b9SAlex Elder if (ret) 6895c0fba368SAlex Elder return ret; 6896c0fba368SAlex Elder 6897332bb12dSAlex Elder ret = rbd_dev_header_name(rbd_dev); 6898332bb12dSAlex Elder if (ret) 6899332bb12dSAlex Elder goto err_out_format; 6900332bb12dSAlex Elder 6901b9ef2b88SIlya Dryomov if (need_watch) { 690299d16943SIlya Dryomov ret = rbd_register_watch(rbd_dev); 69031fe48023SIlya Dryomov if (ret) { 69041fe48023SIlya Dryomov if (ret == -ENOENT) 6905b9ef2b88SIlya Dryomov rbd_print_dne(rbd_dev, false); 6906c41d13a3SIlya Dryomov goto err_out_format; 69071f3ef788SAlex Elder } 69081fe48023SIlya Dryomov } 6909b644de2bSAlex Elder 69100e4e1de5SIlya Dryomov if (!depth) 69110e4e1de5SIlya Dryomov down_write(&rbd_dev->header_rwsem); 69120e4e1de5SIlya Dryomov 6913a720ae09SIlya Dryomov ret = rbd_dev_header_info(rbd_dev); 6914b9ef2b88SIlya Dryomov if (ret) { 6915b9ef2b88SIlya Dryomov if (ret == -ENOENT && !need_watch) 6916b9ef2b88SIlya Dryomov rbd_print_dne(rbd_dev, false); 6917952c48b0SIlya Dryomov goto err_out_probe; 6918b9ef2b88SIlya Dryomov } 6919a30b71b9SAlex Elder 692004077599SIlya Dryomov /* 692104077599SIlya Dryomov * If this image is the one being mapped, we have pool name and 692204077599SIlya Dryomov * id, image name and id, and snap name - need to fill snap id. 692304077599SIlya Dryomov * Otherwise this is a parent image, identified by pool, image 692404077599SIlya Dryomov * and snap ids - need to fill in names for those ids. 692504077599SIlya Dryomov */ 69266d69bb53SIlya Dryomov if (!depth) 692704077599SIlya Dryomov ret = rbd_spec_fill_snap_id(rbd_dev); 692804077599SIlya Dryomov else 692904077599SIlya Dryomov ret = rbd_spec_fill_names(rbd_dev); 69301fe48023SIlya Dryomov if (ret) { 69311fe48023SIlya Dryomov if (ret == -ENOENT) 6932b9ef2b88SIlya Dryomov rbd_print_dne(rbd_dev, true); 693333dca39fSAlex Elder goto err_out_probe; 69341fe48023SIlya Dryomov } 69359bb81c9bSAlex Elder 6936da5ef6beSIlya Dryomov ret = rbd_dev_mapping_set(rbd_dev); 6937da5ef6beSIlya Dryomov if (ret) 6938da5ef6beSIlya Dryomov goto err_out_probe; 6939da5ef6beSIlya Dryomov 6940f3c0e459SIlya Dryomov if (rbd_is_snap(rbd_dev) && 694122e8bd51SIlya Dryomov (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)) { 694222e8bd51SIlya Dryomov ret = rbd_object_map_load(rbd_dev); 694322e8bd51SIlya Dryomov if (ret) 694422e8bd51SIlya Dryomov goto err_out_probe; 694522e8bd51SIlya Dryomov } 694622e8bd51SIlya Dryomov 6947e8f59b59SIlya Dryomov if (rbd_dev->header.features & RBD_FEATURE_LAYERING) { 6948e8f59b59SIlya Dryomov ret = rbd_dev_v2_parent_info(rbd_dev); 6949e8f59b59SIlya Dryomov if (ret) 6950e8f59b59SIlya Dryomov goto err_out_probe; 6951e8f59b59SIlya Dryomov } 6952e8f59b59SIlya Dryomov 69536d69bb53SIlya Dryomov ret = rbd_dev_probe_parent(rbd_dev, depth); 695430d60ba2SAlex Elder if (ret) 695530d60ba2SAlex Elder goto err_out_probe; 695683a06263SAlex Elder 695730d60ba2SAlex Elder dout("discovered format %u image, header name is %s\n", 6958c41d13a3SIlya Dryomov rbd_dev->image_format, rbd_dev->header_oid.name); 695930d60ba2SAlex Elder return 0; 6960e8f59b59SIlya Dryomov 69616fd48b3bSAlex Elder err_out_probe: 69620e4e1de5SIlya Dryomov if (!depth) 69630e4e1de5SIlya Dryomov up_write(&rbd_dev->header_rwsem); 6964b9ef2b88SIlya Dryomov if (need_watch) 696599d16943SIlya Dryomov rbd_unregister_watch(rbd_dev); 6966952c48b0SIlya Dryomov rbd_dev_unprobe(rbd_dev); 6967332bb12dSAlex Elder err_out_format: 6968332bb12dSAlex Elder rbd_dev->image_format = 0; 69695655c4d9SAlex Elder kfree(rbd_dev->spec->image_id); 69705655c4d9SAlex Elder rbd_dev->spec->image_id = NULL; 69715655c4d9SAlex Elder return ret; 697283a06263SAlex Elder } 697383a06263SAlex Elder 69749b60e70bSIlya Dryomov static ssize_t do_rbd_add(struct bus_type *bus, 697559c2be1eSYehuda Sadeh const char *buf, 697659c2be1eSYehuda Sadeh size_t count) 6977602adf40SYehuda Sadeh { 6978cb8627c7SAlex Elder struct rbd_device *rbd_dev = NULL; 6979dc79b113SAlex Elder struct ceph_options *ceph_opts = NULL; 69804e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL; 6981859c31dfSAlex Elder struct rbd_spec *spec = NULL; 69829d3997fdSAlex Elder struct rbd_client *rbdc; 6983b51c83c2SIlya Dryomov int rc; 6984602adf40SYehuda Sadeh 6985f44d04e6SIlya Dryomov if (!capable(CAP_SYS_ADMIN)) 6986f44d04e6SIlya Dryomov return -EPERM; 6987f44d04e6SIlya Dryomov 6988602adf40SYehuda Sadeh if (!try_module_get(THIS_MODULE)) 6989602adf40SYehuda Sadeh return -ENODEV; 6990602adf40SYehuda Sadeh 6991a725f65eSAlex Elder /* parse add command */ 6992859c31dfSAlex Elder rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec); 6993dc79b113SAlex Elder if (rc < 0) 6994dd5ac32dSIlya Dryomov goto out; 6995a725f65eSAlex Elder 69969d3997fdSAlex Elder rbdc = rbd_get_client(ceph_opts); 69979d3997fdSAlex Elder if (IS_ERR(rbdc)) { 69989d3997fdSAlex Elder rc = PTR_ERR(rbdc); 69990ddebc0cSAlex Elder goto err_out_args; 70009d3997fdSAlex Elder } 7001602adf40SYehuda Sadeh 7002602adf40SYehuda Sadeh /* pick the pool */ 7003dd435855SIlya Dryomov rc = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, spec->pool_name); 70041fe48023SIlya Dryomov if (rc < 0) { 70051fe48023SIlya Dryomov if (rc == -ENOENT) 70061fe48023SIlya Dryomov pr_info("pool %s does not exist\n", spec->pool_name); 7007602adf40SYehuda Sadeh goto err_out_client; 70081fe48023SIlya Dryomov } 7009859c31dfSAlex Elder spec->pool_id = (u64)rc; 7010859c31dfSAlex Elder 7011d147543dSIlya Dryomov rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts); 7012b51c83c2SIlya Dryomov if (!rbd_dev) { 7013b51c83c2SIlya Dryomov rc = -ENOMEM; 7014bd4ba655SAlex Elder goto err_out_client; 7015b51c83c2SIlya Dryomov } 7016c53d5893SAlex Elder rbdc = NULL; /* rbd_dev now owns this */ 7017c53d5893SAlex Elder spec = NULL; /* rbd_dev now owns this */ 7018d147543dSIlya Dryomov rbd_opts = NULL; /* rbd_dev now owns this */ 7019602adf40SYehuda Sadeh 702039258aa2SIlya Dryomov /* if we are mapping a snapshot it will be a read-only mapping */ 702139258aa2SIlya Dryomov if (rbd_dev->opts->read_only || 702239258aa2SIlya Dryomov strcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME)) 702339258aa2SIlya Dryomov __set_bit(RBD_DEV_FLAG_READONLY, &rbd_dev->flags); 702439258aa2SIlya Dryomov 70250d6d1e9cSMike Christie rbd_dev->config_info = kstrdup(buf, GFP_KERNEL); 70260d6d1e9cSMike Christie if (!rbd_dev->config_info) { 70270d6d1e9cSMike Christie rc = -ENOMEM; 70280d6d1e9cSMike Christie goto err_out_rbd_dev; 70290d6d1e9cSMike Christie } 70300d6d1e9cSMike Christie 70316d69bb53SIlya Dryomov rc = rbd_dev_image_probe(rbd_dev, 0); 70320e4e1de5SIlya Dryomov if (rc < 0) 7033c53d5893SAlex Elder goto err_out_rbd_dev; 703405fd6f6fSAlex Elder 70350c93e1b7SIlya Dryomov if (rbd_dev->opts->alloc_size > rbd_dev->layout.object_size) { 70360c93e1b7SIlya Dryomov rbd_warn(rbd_dev, "alloc_size adjusted to %u", 70370c93e1b7SIlya Dryomov rbd_dev->layout.object_size); 70380c93e1b7SIlya Dryomov rbd_dev->opts->alloc_size = rbd_dev->layout.object_size; 70390c93e1b7SIlya Dryomov } 70400c93e1b7SIlya Dryomov 7041b536f69aSAlex Elder rc = rbd_dev_device_setup(rbd_dev); 7042fd22aef8SIlya Dryomov if (rc) 70438b679ec5SIlya Dryomov goto err_out_image_probe; 70443abef3b3SAlex Elder 7045e010dd0aSIlya Dryomov rc = rbd_add_acquire_lock(rbd_dev); 7046e010dd0aSIlya Dryomov if (rc) 7047637cd060SIlya Dryomov goto err_out_image_lock; 7048b536f69aSAlex Elder 70495769ed0cSIlya Dryomov /* Everything's ready. Announce the disk to the world. */ 70505769ed0cSIlya Dryomov 70515769ed0cSIlya Dryomov rc = device_add(&rbd_dev->dev); 70525769ed0cSIlya Dryomov if (rc) 7053e010dd0aSIlya Dryomov goto err_out_image_lock; 70545769ed0cSIlya Dryomov 705527c97abcSLuis Chamberlain rc = device_add_disk(&rbd_dev->dev, rbd_dev->disk, NULL); 705627c97abcSLuis Chamberlain if (rc) 705727c97abcSLuis Chamberlain goto err_out_cleanup_disk; 70585769ed0cSIlya Dryomov 70595769ed0cSIlya Dryomov spin_lock(&rbd_dev_list_lock); 70605769ed0cSIlya Dryomov list_add_tail(&rbd_dev->node, &rbd_dev_list); 70615769ed0cSIlya Dryomov spin_unlock(&rbd_dev_list_lock); 70625769ed0cSIlya Dryomov 70635769ed0cSIlya Dryomov pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name, 70645769ed0cSIlya Dryomov (unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT, 70655769ed0cSIlya Dryomov rbd_dev->header.features); 7066dd5ac32dSIlya Dryomov rc = count; 7067dd5ac32dSIlya Dryomov out: 7068dd5ac32dSIlya Dryomov module_put(THIS_MODULE); 7069dd5ac32dSIlya Dryomov return rc; 7070b536f69aSAlex Elder 707127c97abcSLuis Chamberlain err_out_cleanup_disk: 707227c97abcSLuis Chamberlain rbd_free_disk(rbd_dev); 7073e010dd0aSIlya Dryomov err_out_image_lock: 7074e010dd0aSIlya Dryomov rbd_dev_image_unlock(rbd_dev); 70755769ed0cSIlya Dryomov rbd_dev_device_release(rbd_dev); 70768b679ec5SIlya Dryomov err_out_image_probe: 70778b679ec5SIlya Dryomov rbd_dev_image_release(rbd_dev); 7078c53d5893SAlex Elder err_out_rbd_dev: 7079c53d5893SAlex Elder rbd_dev_destroy(rbd_dev); 7080bd4ba655SAlex Elder err_out_client: 70819d3997fdSAlex Elder rbd_put_client(rbdc); 70820ddebc0cSAlex Elder err_out_args: 7083859c31dfSAlex Elder rbd_spec_put(spec); 7084d147543dSIlya Dryomov kfree(rbd_opts); 7085dd5ac32dSIlya Dryomov goto out; 7086602adf40SYehuda Sadeh } 7087602adf40SYehuda Sadeh 70887e9586baSGreg Kroah-Hartman static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count) 70899b60e70bSIlya Dryomov { 70909b60e70bSIlya Dryomov if (single_major) 70919b60e70bSIlya Dryomov return -EINVAL; 70929b60e70bSIlya Dryomov 70939b60e70bSIlya Dryomov return do_rbd_add(bus, buf, count); 70949b60e70bSIlya Dryomov } 70959b60e70bSIlya Dryomov 70967e9586baSGreg Kroah-Hartman static ssize_t add_single_major_store(struct bus_type *bus, const char *buf, 70979b60e70bSIlya Dryomov size_t count) 70989b60e70bSIlya Dryomov { 70999b60e70bSIlya Dryomov return do_rbd_add(bus, buf, count); 71009b60e70bSIlya Dryomov } 71019b60e70bSIlya Dryomov 710205a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev) 710305a46afdSAlex Elder { 7104ad945fc1SAlex Elder while (rbd_dev->parent) { 710505a46afdSAlex Elder struct rbd_device *first = rbd_dev; 710605a46afdSAlex Elder struct rbd_device *second = first->parent; 710705a46afdSAlex Elder struct rbd_device *third; 710805a46afdSAlex Elder 710905a46afdSAlex Elder /* 711005a46afdSAlex Elder * Follow to the parent with no grandparent and 711105a46afdSAlex Elder * remove it. 711205a46afdSAlex Elder */ 711305a46afdSAlex Elder while (second && (third = second->parent)) { 711405a46afdSAlex Elder first = second; 711505a46afdSAlex Elder second = third; 711605a46afdSAlex Elder } 7117ad945fc1SAlex Elder rbd_assert(second); 71188ad42cd0SAlex Elder rbd_dev_image_release(second); 71198b679ec5SIlya Dryomov rbd_dev_destroy(second); 7120ad945fc1SAlex Elder first->parent = NULL; 7121ad945fc1SAlex Elder first->parent_overlap = 0; 7122ad945fc1SAlex Elder 7123ad945fc1SAlex Elder rbd_assert(first->parent_spec); 712405a46afdSAlex Elder rbd_spec_put(first->parent_spec); 712505a46afdSAlex Elder first->parent_spec = NULL; 712605a46afdSAlex Elder } 712705a46afdSAlex Elder } 712805a46afdSAlex Elder 71299b60e70bSIlya Dryomov static ssize_t do_rbd_remove(struct bus_type *bus, 7130602adf40SYehuda Sadeh const char *buf, 7131602adf40SYehuda Sadeh size_t count) 7132602adf40SYehuda Sadeh { 7133602adf40SYehuda Sadeh struct rbd_device *rbd_dev = NULL; 7134751cc0e3SAlex Elder struct list_head *tmp; 7135751cc0e3SAlex Elder int dev_id; 71360276dca6SMike Christie char opt_buf[6]; 71370276dca6SMike Christie bool force = false; 71380d8189e1SAlex Elder int ret; 7139602adf40SYehuda Sadeh 7140f44d04e6SIlya Dryomov if (!capable(CAP_SYS_ADMIN)) 7141f44d04e6SIlya Dryomov return -EPERM; 7142f44d04e6SIlya Dryomov 71430276dca6SMike Christie dev_id = -1; 71440276dca6SMike Christie opt_buf[0] = '\0'; 71450276dca6SMike Christie sscanf(buf, "%d %5s", &dev_id, opt_buf); 71460276dca6SMike Christie if (dev_id < 0) { 71470276dca6SMike Christie pr_err("dev_id out of range\n"); 7148602adf40SYehuda Sadeh return -EINVAL; 71490276dca6SMike Christie } 71500276dca6SMike Christie if (opt_buf[0] != '\0') { 71510276dca6SMike Christie if (!strcmp(opt_buf, "force")) { 71520276dca6SMike Christie force = true; 71530276dca6SMike Christie } else { 71540276dca6SMike Christie pr_err("bad remove option at '%s'\n", opt_buf); 71550276dca6SMike Christie return -EINVAL; 71560276dca6SMike Christie } 71570276dca6SMike Christie } 7158602adf40SYehuda Sadeh 7159602adf40SYehuda Sadeh ret = -ENOENT; 7160751cc0e3SAlex Elder spin_lock(&rbd_dev_list_lock); 7161751cc0e3SAlex Elder list_for_each(tmp, &rbd_dev_list) { 7162751cc0e3SAlex Elder rbd_dev = list_entry(tmp, struct rbd_device, node); 7163751cc0e3SAlex Elder if (rbd_dev->dev_id == dev_id) { 7164751cc0e3SAlex Elder ret = 0; 7165751cc0e3SAlex Elder break; 7166602adf40SYehuda Sadeh } 7167751cc0e3SAlex Elder } 7168751cc0e3SAlex Elder if (!ret) { 7169a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 71700276dca6SMike Christie if (rbd_dev->open_count && !force) 717142382b70SAlex Elder ret = -EBUSY; 717285f5a4d6SIlya Dryomov else if (test_and_set_bit(RBD_DEV_FLAG_REMOVING, 717385f5a4d6SIlya Dryomov &rbd_dev->flags)) 717485f5a4d6SIlya Dryomov ret = -EINPROGRESS; 7175a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 7176751cc0e3SAlex Elder } 7177751cc0e3SAlex Elder spin_unlock(&rbd_dev_list_lock); 717885f5a4d6SIlya Dryomov if (ret) 71791ba0f1e7SAlex Elder return ret; 7180751cc0e3SAlex Elder 71810276dca6SMike Christie if (force) { 71820276dca6SMike Christie /* 71830276dca6SMike Christie * Prevent new IO from being queued and wait for existing 71840276dca6SMike Christie * IO to complete/fail. 71850276dca6SMike Christie */ 71860276dca6SMike Christie blk_mq_freeze_queue(rbd_dev->disk->queue); 71877a5428dcSChristoph Hellwig blk_mark_disk_dead(rbd_dev->disk); 71880276dca6SMike Christie } 71890276dca6SMike Christie 71905769ed0cSIlya Dryomov del_gendisk(rbd_dev->disk); 71915769ed0cSIlya Dryomov spin_lock(&rbd_dev_list_lock); 71925769ed0cSIlya Dryomov list_del_init(&rbd_dev->node); 71935769ed0cSIlya Dryomov spin_unlock(&rbd_dev_list_lock); 71945769ed0cSIlya Dryomov device_del(&rbd_dev->dev); 7195fca27065SIlya Dryomov 7196e010dd0aSIlya Dryomov rbd_dev_image_unlock(rbd_dev); 7197dd5ac32dSIlya Dryomov rbd_dev_device_release(rbd_dev); 71988ad42cd0SAlex Elder rbd_dev_image_release(rbd_dev); 71998b679ec5SIlya Dryomov rbd_dev_destroy(rbd_dev); 72001ba0f1e7SAlex Elder return count; 7201602adf40SYehuda Sadeh } 7202602adf40SYehuda Sadeh 72037e9586baSGreg Kroah-Hartman static ssize_t remove_store(struct bus_type *bus, const char *buf, size_t count) 72049b60e70bSIlya Dryomov { 72059b60e70bSIlya Dryomov if (single_major) 72069b60e70bSIlya Dryomov return -EINVAL; 72079b60e70bSIlya Dryomov 72089b60e70bSIlya Dryomov return do_rbd_remove(bus, buf, count); 72099b60e70bSIlya Dryomov } 72109b60e70bSIlya Dryomov 72117e9586baSGreg Kroah-Hartman static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf, 72129b60e70bSIlya Dryomov size_t count) 72139b60e70bSIlya Dryomov { 72149b60e70bSIlya Dryomov return do_rbd_remove(bus, buf, count); 72159b60e70bSIlya Dryomov } 72169b60e70bSIlya Dryomov 7217602adf40SYehuda Sadeh /* 7218602adf40SYehuda Sadeh * create control files in sysfs 7219dfc5606dSYehuda Sadeh * /sys/bus/rbd/... 7220602adf40SYehuda Sadeh */ 72217d8dc534SChengguang Xu static int __init rbd_sysfs_init(void) 7222602adf40SYehuda Sadeh { 7223dfc5606dSYehuda Sadeh int ret; 7224602adf40SYehuda Sadeh 7225fed4c143SAlex Elder ret = device_register(&rbd_root_dev); 7226dfc5606dSYehuda Sadeh if (ret < 0) 7227dfc5606dSYehuda Sadeh return ret; 7228602adf40SYehuda Sadeh 7229fed4c143SAlex Elder ret = bus_register(&rbd_bus_type); 7230fed4c143SAlex Elder if (ret < 0) 7231fed4c143SAlex Elder device_unregister(&rbd_root_dev); 7232602adf40SYehuda Sadeh 7233602adf40SYehuda Sadeh return ret; 7234602adf40SYehuda Sadeh } 7235602adf40SYehuda Sadeh 72367d8dc534SChengguang Xu static void __exit rbd_sysfs_cleanup(void) 7237602adf40SYehuda Sadeh { 7238dfc5606dSYehuda Sadeh bus_unregister(&rbd_bus_type); 7239fed4c143SAlex Elder device_unregister(&rbd_root_dev); 7240602adf40SYehuda Sadeh } 7241602adf40SYehuda Sadeh 72427d8dc534SChengguang Xu static int __init rbd_slab_init(void) 72431c2a9dfeSAlex Elder { 72441c2a9dfeSAlex Elder rbd_assert(!rbd_img_request_cache); 724503d94406SGeliang Tang rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0); 7246868311b1SAlex Elder if (!rbd_img_request_cache) 7247868311b1SAlex Elder return -ENOMEM; 7248868311b1SAlex Elder 7249868311b1SAlex Elder rbd_assert(!rbd_obj_request_cache); 725003d94406SGeliang Tang rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0); 725178c2a44aSAlex Elder if (!rbd_obj_request_cache) 725278c2a44aSAlex Elder goto out_err; 725378c2a44aSAlex Elder 72541c2a9dfeSAlex Elder return 0; 72551c2a9dfeSAlex Elder 72566c696d85SIlya Dryomov out_err: 7257868311b1SAlex Elder kmem_cache_destroy(rbd_img_request_cache); 7258868311b1SAlex Elder rbd_img_request_cache = NULL; 72591c2a9dfeSAlex Elder return -ENOMEM; 72601c2a9dfeSAlex Elder } 72611c2a9dfeSAlex Elder 72621c2a9dfeSAlex Elder static void rbd_slab_exit(void) 72631c2a9dfeSAlex Elder { 7264868311b1SAlex Elder rbd_assert(rbd_obj_request_cache); 7265868311b1SAlex Elder kmem_cache_destroy(rbd_obj_request_cache); 7266868311b1SAlex Elder rbd_obj_request_cache = NULL; 7267868311b1SAlex Elder 72681c2a9dfeSAlex Elder rbd_assert(rbd_img_request_cache); 72691c2a9dfeSAlex Elder kmem_cache_destroy(rbd_img_request_cache); 72701c2a9dfeSAlex Elder rbd_img_request_cache = NULL; 72711c2a9dfeSAlex Elder } 72721c2a9dfeSAlex Elder 7273cc344fa1SAlex Elder static int __init rbd_init(void) 7274602adf40SYehuda Sadeh { 7275602adf40SYehuda Sadeh int rc; 7276602adf40SYehuda Sadeh 72771e32d34cSAlex Elder if (!libceph_compatible(NULL)) { 72781e32d34cSAlex Elder rbd_warn(NULL, "libceph incompatibility (quitting)"); 72791e32d34cSAlex Elder return -EINVAL; 72801e32d34cSAlex Elder } 7281e1b4d96dSIlya Dryomov 72821c2a9dfeSAlex Elder rc = rbd_slab_init(); 7283602adf40SYehuda Sadeh if (rc) 7284602adf40SYehuda Sadeh return rc; 7285e1b4d96dSIlya Dryomov 7286f5ee37bdSIlya Dryomov /* 7287f5ee37bdSIlya Dryomov * The number of active work items is limited by the number of 7288f77303bdSIlya Dryomov * rbd devices * queue depth, so leave @max_active at default. 7289f5ee37bdSIlya Dryomov */ 7290f5ee37bdSIlya Dryomov rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0); 7291f5ee37bdSIlya Dryomov if (!rbd_wq) { 7292f5ee37bdSIlya Dryomov rc = -ENOMEM; 7293f5ee37bdSIlya Dryomov goto err_out_slab; 7294f5ee37bdSIlya Dryomov } 7295f5ee37bdSIlya Dryomov 72969b60e70bSIlya Dryomov if (single_major) { 72979b60e70bSIlya Dryomov rbd_major = register_blkdev(0, RBD_DRV_NAME); 72989b60e70bSIlya Dryomov if (rbd_major < 0) { 72999b60e70bSIlya Dryomov rc = rbd_major; 7300f5ee37bdSIlya Dryomov goto err_out_wq; 73019b60e70bSIlya Dryomov } 73029b60e70bSIlya Dryomov } 73039b60e70bSIlya Dryomov 73041c2a9dfeSAlex Elder rc = rbd_sysfs_init(); 73051c2a9dfeSAlex Elder if (rc) 73069b60e70bSIlya Dryomov goto err_out_blkdev; 73071c2a9dfeSAlex Elder 73089b60e70bSIlya Dryomov if (single_major) 73099b60e70bSIlya Dryomov pr_info("loaded (major %d)\n", rbd_major); 73109b60e70bSIlya Dryomov else 7311e1b4d96dSIlya Dryomov pr_info("loaded\n"); 73129b60e70bSIlya Dryomov 7313e1b4d96dSIlya Dryomov return 0; 7314e1b4d96dSIlya Dryomov 73159b60e70bSIlya Dryomov err_out_blkdev: 73169b60e70bSIlya Dryomov if (single_major) 73179b60e70bSIlya Dryomov unregister_blkdev(rbd_major, RBD_DRV_NAME); 7318f5ee37bdSIlya Dryomov err_out_wq: 7319f5ee37bdSIlya Dryomov destroy_workqueue(rbd_wq); 7320e1b4d96dSIlya Dryomov err_out_slab: 7321e1b4d96dSIlya Dryomov rbd_slab_exit(); 73221c2a9dfeSAlex Elder return rc; 7323602adf40SYehuda Sadeh } 7324602adf40SYehuda Sadeh 7325cc344fa1SAlex Elder static void __exit rbd_exit(void) 7326602adf40SYehuda Sadeh { 7327ffe312cfSIlya Dryomov ida_destroy(&rbd_dev_id_ida); 7328602adf40SYehuda Sadeh rbd_sysfs_cleanup(); 73299b60e70bSIlya Dryomov if (single_major) 73309b60e70bSIlya Dryomov unregister_blkdev(rbd_major, RBD_DRV_NAME); 7331f5ee37bdSIlya Dryomov destroy_workqueue(rbd_wq); 73321c2a9dfeSAlex Elder rbd_slab_exit(); 7333602adf40SYehuda Sadeh } 7334602adf40SYehuda Sadeh 7335602adf40SYehuda Sadeh module_init(rbd_init); 7336602adf40SYehuda Sadeh module_exit(rbd_exit); 7337602adf40SYehuda Sadeh 7338d552c619SAlex Elder MODULE_AUTHOR("Alex Elder <elder@inktank.com>"); 7339602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); 7340602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); 7341602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */ 7342602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>"); 7343602adf40SYehuda Sadeh 734490da258bSIlya Dryomov MODULE_DESCRIPTION("RADOS Block Device (RBD) driver"); 7345602adf40SYehuda Sadeh MODULE_LICENSE("GPL"); 7346