1e2a58ee5SAlex Elder 2602adf40SYehuda Sadeh /* 3602adf40SYehuda Sadeh rbd.c -- Export ceph rados objects as a Linux block device 4602adf40SYehuda Sadeh 5602adf40SYehuda Sadeh 6602adf40SYehuda Sadeh based on drivers/block/osdblk.c: 7602adf40SYehuda Sadeh 8602adf40SYehuda Sadeh Copyright 2009 Red Hat, Inc. 9602adf40SYehuda Sadeh 10602adf40SYehuda Sadeh This program is free software; you can redistribute it and/or modify 11602adf40SYehuda Sadeh it under the terms of the GNU General Public License as published by 12602adf40SYehuda Sadeh the Free Software Foundation. 13602adf40SYehuda Sadeh 14602adf40SYehuda Sadeh This program is distributed in the hope that it will be useful, 15602adf40SYehuda Sadeh but WITHOUT ANY WARRANTY; without even the implied warranty of 16602adf40SYehuda Sadeh MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17602adf40SYehuda Sadeh GNU General Public License for more details. 18602adf40SYehuda Sadeh 19602adf40SYehuda Sadeh You should have received a copy of the GNU General Public License 20602adf40SYehuda Sadeh along with this program; see the file COPYING. If not, write to 21602adf40SYehuda Sadeh the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 22602adf40SYehuda Sadeh 23602adf40SYehuda Sadeh 24602adf40SYehuda Sadeh 25dfc5606dSYehuda Sadeh For usage instructions, please refer to: 26602adf40SYehuda Sadeh 27dfc5606dSYehuda Sadeh Documentation/ABI/testing/sysfs-bus-rbd 28602adf40SYehuda Sadeh 29602adf40SYehuda Sadeh */ 30602adf40SYehuda Sadeh 31602adf40SYehuda Sadeh #include <linux/ceph/libceph.h> 32602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h> 33602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h> 34ed95b21aSIlya Dryomov #include <linux/ceph/cls_lock_client.h> 3543df3d35SIlya Dryomov #include <linux/ceph/striper.h> 36602adf40SYehuda Sadeh #include <linux/ceph/decode.h> 3759c2be1eSYehuda Sadeh #include <linux/parser.h> 3830d1cff8SAlex Elder #include <linux/bsearch.h> 39602adf40SYehuda Sadeh 40602adf40SYehuda Sadeh #include <linux/kernel.h> 41602adf40SYehuda Sadeh #include <linux/device.h> 42602adf40SYehuda Sadeh #include <linux/module.h> 437ad18afaSChristoph Hellwig #include <linux/blk-mq.h> 44602adf40SYehuda Sadeh #include <linux/fs.h> 45602adf40SYehuda Sadeh #include <linux/blkdev.h> 461c2a9dfeSAlex Elder #include <linux/slab.h> 47f8a22fc2SIlya Dryomov #include <linux/idr.h> 48bc1ecc65SIlya Dryomov #include <linux/workqueue.h> 49602adf40SYehuda Sadeh 50602adf40SYehuda Sadeh #include "rbd_types.h" 51602adf40SYehuda Sadeh 52aafb230eSAlex Elder #define RBD_DEBUG /* Activate rbd_assert() calls */ 53aafb230eSAlex Elder 54593a9e7bSAlex Elder /* 55a2acd00eSAlex Elder * Increment the given counter and return its updated value. 56a2acd00eSAlex Elder * If the counter is already 0 it will not be incremented. 57a2acd00eSAlex Elder * If the counter is already at its maximum value returns 58a2acd00eSAlex Elder * -EINVAL without updating it. 59a2acd00eSAlex Elder */ 60a2acd00eSAlex Elder static int atomic_inc_return_safe(atomic_t *v) 61a2acd00eSAlex Elder { 62a2acd00eSAlex Elder unsigned int counter; 63a2acd00eSAlex Elder 64bfc18e38SMark Rutland counter = (unsigned int)atomic_fetch_add_unless(v, 1, 0); 65a2acd00eSAlex Elder if (counter <= (unsigned int)INT_MAX) 66a2acd00eSAlex Elder return (int)counter; 67a2acd00eSAlex Elder 68a2acd00eSAlex Elder atomic_dec(v); 69a2acd00eSAlex Elder 70a2acd00eSAlex Elder return -EINVAL; 71a2acd00eSAlex Elder } 72a2acd00eSAlex Elder 73a2acd00eSAlex Elder /* Decrement the counter. Return the resulting value, or -EINVAL */ 74a2acd00eSAlex Elder static int atomic_dec_return_safe(atomic_t *v) 75a2acd00eSAlex Elder { 76a2acd00eSAlex Elder int counter; 77a2acd00eSAlex Elder 78a2acd00eSAlex Elder counter = atomic_dec_return(v); 79a2acd00eSAlex Elder if (counter >= 0) 80a2acd00eSAlex Elder return counter; 81a2acd00eSAlex Elder 82a2acd00eSAlex Elder atomic_inc(v); 83a2acd00eSAlex Elder 84a2acd00eSAlex Elder return -EINVAL; 85a2acd00eSAlex Elder } 86a2acd00eSAlex Elder 87f0f8cef5SAlex Elder #define RBD_DRV_NAME "rbd" 88602adf40SYehuda Sadeh 897e513d43SIlya Dryomov #define RBD_MINORS_PER_MAJOR 256 907e513d43SIlya Dryomov #define RBD_SINGLE_MAJOR_PART_SHIFT 4 91602adf40SYehuda Sadeh 926d69bb53SIlya Dryomov #define RBD_MAX_PARENT_CHAIN_LEN 16 936d69bb53SIlya Dryomov 94d4b125e9SAlex Elder #define RBD_SNAP_DEV_NAME_PREFIX "snap_" 95d4b125e9SAlex Elder #define RBD_MAX_SNAP_NAME_LEN \ 96d4b125e9SAlex Elder (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1)) 97d4b125e9SAlex Elder 9835d489f9SAlex Elder #define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ 99602adf40SYehuda Sadeh 100602adf40SYehuda Sadeh #define RBD_SNAP_HEAD_NAME "-" 101602adf40SYehuda Sadeh 1029682fc6dSAlex Elder #define BAD_SNAP_INDEX U32_MAX /* invalid index into snap array */ 1039682fc6dSAlex Elder 1049e15b77dSAlex Elder /* This allows a single page to hold an image name sent by OSD */ 1059e15b77dSAlex Elder #define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1) 106589d30e0SAlex Elder #define RBD_IMAGE_ID_LEN_MAX 64 1079e15b77dSAlex Elder 1081e130199SAlex Elder #define RBD_OBJ_PREFIX_LEN_MAX 64 109589d30e0SAlex Elder 110ed95b21aSIlya Dryomov #define RBD_NOTIFY_TIMEOUT 5 /* seconds */ 11199d16943SIlya Dryomov #define RBD_RETRY_DELAY msecs_to_jiffies(1000) 11299d16943SIlya Dryomov 113d889140cSAlex Elder /* Feature bits */ 114d889140cSAlex Elder 1158767b293SIlya Dryomov #define RBD_FEATURE_LAYERING (1ULL<<0) 1168767b293SIlya Dryomov #define RBD_FEATURE_STRIPINGV2 (1ULL<<1) 1178767b293SIlya Dryomov #define RBD_FEATURE_EXCLUSIVE_LOCK (1ULL<<2) 11822e8bd51SIlya Dryomov #define RBD_FEATURE_OBJECT_MAP (1ULL<<3) 11922e8bd51SIlya Dryomov #define RBD_FEATURE_FAST_DIFF (1ULL<<4) 120b9f6d447SIlya Dryomov #define RBD_FEATURE_DEEP_FLATTEN (1ULL<<5) 1218767b293SIlya Dryomov #define RBD_FEATURE_DATA_POOL (1ULL<<7) 122e573427aSIlya Dryomov #define RBD_FEATURE_OPERATIONS (1ULL<<8) 1238767b293SIlya Dryomov 124ed95b21aSIlya Dryomov #define RBD_FEATURES_ALL (RBD_FEATURE_LAYERING | \ 125ed95b21aSIlya Dryomov RBD_FEATURE_STRIPINGV2 | \ 1267e97332eSIlya Dryomov RBD_FEATURE_EXCLUSIVE_LOCK | \ 12722e8bd51SIlya Dryomov RBD_FEATURE_OBJECT_MAP | \ 12822e8bd51SIlya Dryomov RBD_FEATURE_FAST_DIFF | \ 129b9f6d447SIlya Dryomov RBD_FEATURE_DEEP_FLATTEN | \ 130e573427aSIlya Dryomov RBD_FEATURE_DATA_POOL | \ 131e573427aSIlya Dryomov RBD_FEATURE_OPERATIONS) 132d889140cSAlex Elder 133d889140cSAlex Elder /* Features supported by this (client software) implementation. */ 134d889140cSAlex Elder 135770eba6eSAlex Elder #define RBD_FEATURES_SUPPORTED (RBD_FEATURES_ALL) 136d889140cSAlex Elder 13781a89793SAlex Elder /* 13881a89793SAlex Elder * An RBD device name will be "rbd#", where the "rbd" comes from 13981a89793SAlex Elder * RBD_DRV_NAME above, and # is a unique integer identifier. 14081a89793SAlex Elder */ 141602adf40SYehuda Sadeh #define DEV_NAME_LEN 32 142602adf40SYehuda Sadeh 143602adf40SYehuda Sadeh /* 144602adf40SYehuda Sadeh * block device image metadata (in-memory version) 145602adf40SYehuda Sadeh */ 146602adf40SYehuda Sadeh struct rbd_image_header { 147f35a4deeSAlex Elder /* These six fields never change for a given rbd image */ 148849b4260SAlex Elder char *object_prefix; 149602adf40SYehuda Sadeh __u8 obj_order; 150f35a4deeSAlex Elder u64 stripe_unit; 151f35a4deeSAlex Elder u64 stripe_count; 1527e97332eSIlya Dryomov s64 data_pool_id; 153f35a4deeSAlex Elder u64 features; /* Might be changeable someday? */ 154602adf40SYehuda Sadeh 155f84344f3SAlex Elder /* The remaining fields need to be updated occasionally */ 156f84344f3SAlex Elder u64 image_size; 157f84344f3SAlex Elder struct ceph_snap_context *snapc; 158f35a4deeSAlex Elder char *snap_names; /* format 1 only */ 159f35a4deeSAlex Elder u64 *snap_sizes; /* format 1 only */ 16059c2be1eSYehuda Sadeh }; 16159c2be1eSYehuda Sadeh 1620d7dbfceSAlex Elder /* 1630d7dbfceSAlex Elder * An rbd image specification. 1640d7dbfceSAlex Elder * 1650d7dbfceSAlex Elder * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely 166c66c6e0cSAlex Elder * identify an image. Each rbd_dev structure includes a pointer to 167c66c6e0cSAlex Elder * an rbd_spec structure that encapsulates this identity. 168c66c6e0cSAlex Elder * 169c66c6e0cSAlex Elder * Each of the id's in an rbd_spec has an associated name. For a 170c66c6e0cSAlex Elder * user-mapped image, the names are supplied and the id's associated 171c66c6e0cSAlex Elder * with them are looked up. For a layered image, a parent image is 172c66c6e0cSAlex Elder * defined by the tuple, and the names are looked up. 173c66c6e0cSAlex Elder * 174c66c6e0cSAlex Elder * An rbd_dev structure contains a parent_spec pointer which is 175c66c6e0cSAlex Elder * non-null if the image it represents is a child in a layered 176c66c6e0cSAlex Elder * image. This pointer will refer to the rbd_spec structure used 177c66c6e0cSAlex Elder * by the parent rbd_dev for its own identity (i.e., the structure 178c66c6e0cSAlex Elder * is shared between the parent and child). 179c66c6e0cSAlex Elder * 180c66c6e0cSAlex Elder * Since these structures are populated once, during the discovery 181c66c6e0cSAlex Elder * phase of image construction, they are effectively immutable so 182c66c6e0cSAlex Elder * we make no effort to synchronize access to them. 183c66c6e0cSAlex Elder * 184c66c6e0cSAlex Elder * Note that code herein does not assume the image name is known (it 185c66c6e0cSAlex Elder * could be a null pointer). 1860d7dbfceSAlex Elder */ 1870d7dbfceSAlex Elder struct rbd_spec { 1880d7dbfceSAlex Elder u64 pool_id; 189ecb4dc22SAlex Elder const char *pool_name; 190b26c047bSIlya Dryomov const char *pool_ns; /* NULL if default, never "" */ 1910d7dbfceSAlex Elder 192ecb4dc22SAlex Elder const char *image_id; 193ecb4dc22SAlex Elder const char *image_name; 1940d7dbfceSAlex Elder 1950d7dbfceSAlex Elder u64 snap_id; 196ecb4dc22SAlex Elder const char *snap_name; 1970d7dbfceSAlex Elder 1980d7dbfceSAlex Elder struct kref kref; 1990d7dbfceSAlex Elder }; 2000d7dbfceSAlex Elder 201602adf40SYehuda Sadeh /* 202f0f8cef5SAlex Elder * an instance of the client. multiple devices may share an rbd client. 203602adf40SYehuda Sadeh */ 204602adf40SYehuda Sadeh struct rbd_client { 205602adf40SYehuda Sadeh struct ceph_client *client; 206602adf40SYehuda Sadeh struct kref kref; 207602adf40SYehuda Sadeh struct list_head node; 208602adf40SYehuda Sadeh }; 209602adf40SYehuda Sadeh 2100192ce2eSIlya Dryomov struct pending_result { 2110192ce2eSIlya Dryomov int result; /* first nonzero result */ 2120192ce2eSIlya Dryomov int num_pending; 2130192ce2eSIlya Dryomov }; 2140192ce2eSIlya Dryomov 215bf0d5f50SAlex Elder struct rbd_img_request; 216bf0d5f50SAlex Elder 2179969ebc5SAlex Elder enum obj_request_type { 218a1fbb5e7SIlya Dryomov OBJ_REQUEST_NODATA = 1, 2195359a17dSIlya Dryomov OBJ_REQUEST_BIO, /* pointer into provided bio (list) */ 2207e07efb1SIlya Dryomov OBJ_REQUEST_BVECS, /* pointer into provided bio_vec array */ 221afb97888SIlya Dryomov OBJ_REQUEST_OWN_BVECS, /* private bio_vec array, doesn't own pages */ 2229969ebc5SAlex Elder }; 223bf0d5f50SAlex Elder 2246d2940c8SGuangliang Zhao enum obj_operation_type { 225a1fbb5e7SIlya Dryomov OBJ_OP_READ = 1, 2266d2940c8SGuangliang Zhao OBJ_OP_WRITE, 22790e98c52SGuangliang Zhao OBJ_OP_DISCARD, 2286484cbe9SIlya Dryomov OBJ_OP_ZEROOUT, 2296d2940c8SGuangliang Zhao }; 2306d2940c8SGuangliang Zhao 2310ad5d953SIlya Dryomov #define RBD_OBJ_FLAG_DELETION (1U << 0) 2320ad5d953SIlya Dryomov #define RBD_OBJ_FLAG_COPYUP_ENABLED (1U << 1) 233793333a3SIlya Dryomov #define RBD_OBJ_FLAG_COPYUP_ZEROS (1U << 2) 23422e8bd51SIlya Dryomov #define RBD_OBJ_FLAG_MAY_EXIST (1U << 3) 23522e8bd51SIlya Dryomov #define RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT (1U << 4) 2360ad5d953SIlya Dryomov 237a9b67e69SIlya Dryomov enum rbd_obj_read_state { 23885b5e6d1SIlya Dryomov RBD_OBJ_READ_START = 1, 23985b5e6d1SIlya Dryomov RBD_OBJ_READ_OBJECT, 240a9b67e69SIlya Dryomov RBD_OBJ_READ_PARENT, 241a9b67e69SIlya Dryomov }; 242a9b67e69SIlya Dryomov 2433da691bfSIlya Dryomov /* 2443da691bfSIlya Dryomov * Writes go through the following state machine to deal with 2453da691bfSIlya Dryomov * layering: 2463da691bfSIlya Dryomov * 24789a59c1cSIlya Dryomov * . . . . . RBD_OBJ_WRITE_GUARD. . . . . . . . . . . . . . 24889a59c1cSIlya Dryomov * . | . 24989a59c1cSIlya Dryomov * . v . 25089a59c1cSIlya Dryomov * . RBD_OBJ_WRITE_READ_FROM_PARENT. . . . 25189a59c1cSIlya Dryomov * . | . . 25289a59c1cSIlya Dryomov * . v v (deep-copyup . 25389a59c1cSIlya Dryomov * (image . RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC . not needed) . 25489a59c1cSIlya Dryomov * flattened) v | . . 25589a59c1cSIlya Dryomov * . v . . 25689a59c1cSIlya Dryomov * . . . .RBD_OBJ_WRITE_COPYUP_OPS. . . . . (copyup . 25789a59c1cSIlya Dryomov * | not needed) v 25889a59c1cSIlya Dryomov * v . 25989a59c1cSIlya Dryomov * done . . . . . . . . . . . . . . . . . . 2603da691bfSIlya Dryomov * ^ 2613da691bfSIlya Dryomov * | 2623da691bfSIlya Dryomov * RBD_OBJ_WRITE_FLAT 2633da691bfSIlya Dryomov * 2643da691bfSIlya Dryomov * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether 26589a59c1cSIlya Dryomov * assert_exists guard is needed or not (in some cases it's not needed 26689a59c1cSIlya Dryomov * even if there is a parent). 2673da691bfSIlya Dryomov */ 2683da691bfSIlya Dryomov enum rbd_obj_write_state { 26985b5e6d1SIlya Dryomov RBD_OBJ_WRITE_START = 1, 27022e8bd51SIlya Dryomov RBD_OBJ_WRITE_PRE_OBJECT_MAP, 27185b5e6d1SIlya Dryomov RBD_OBJ_WRITE_OBJECT, 272793333a3SIlya Dryomov __RBD_OBJ_WRITE_COPYUP, 273793333a3SIlya Dryomov RBD_OBJ_WRITE_COPYUP, 27422e8bd51SIlya Dryomov RBD_OBJ_WRITE_POST_OBJECT_MAP, 275793333a3SIlya Dryomov }; 276793333a3SIlya Dryomov 277793333a3SIlya Dryomov enum rbd_obj_copyup_state { 278793333a3SIlya Dryomov RBD_OBJ_COPYUP_START = 1, 279793333a3SIlya Dryomov RBD_OBJ_COPYUP_READ_PARENT, 28022e8bd51SIlya Dryomov __RBD_OBJ_COPYUP_OBJECT_MAPS, 28122e8bd51SIlya Dryomov RBD_OBJ_COPYUP_OBJECT_MAPS, 282793333a3SIlya Dryomov __RBD_OBJ_COPYUP_WRITE_OBJECT, 283793333a3SIlya Dryomov RBD_OBJ_COPYUP_WRITE_OBJECT, 284926f9b3fSAlex Elder }; 285926f9b3fSAlex Elder 286bf0d5f50SAlex Elder struct rbd_obj_request { 28743df3d35SIlya Dryomov struct ceph_object_extent ex; 2880ad5d953SIlya Dryomov unsigned int flags; /* RBD_OBJ_FLAG_* */ 289c5b5ef6cSAlex Elder union { 290a9b67e69SIlya Dryomov enum rbd_obj_read_state read_state; /* for reads */ 2913da691bfSIlya Dryomov enum rbd_obj_write_state write_state; /* for writes */ 2923da691bfSIlya Dryomov }; 293bf0d5f50SAlex Elder 294bf0d5f50SAlex Elder struct rbd_img_request *img_request; 29586bd7998SIlya Dryomov struct ceph_file_extent *img_extents; 29686bd7998SIlya Dryomov u32 num_img_extents; 297bf0d5f50SAlex Elder 298788e2df3SAlex Elder union { 2995359a17dSIlya Dryomov struct ceph_bio_iter bio_pos; 300788e2df3SAlex Elder struct { 3017e07efb1SIlya Dryomov struct ceph_bvec_iter bvec_pos; 3027e07efb1SIlya Dryomov u32 bvec_count; 303afb97888SIlya Dryomov u32 bvec_idx; 304788e2df3SAlex Elder }; 305788e2df3SAlex Elder }; 306793333a3SIlya Dryomov 307793333a3SIlya Dryomov enum rbd_obj_copyup_state copyup_state; 3087e07efb1SIlya Dryomov struct bio_vec *copyup_bvecs; 3097e07efb1SIlya Dryomov u32 copyup_bvec_count; 310bf0d5f50SAlex Elder 311bcbab1dbSIlya Dryomov struct list_head osd_reqs; /* w/ r_private_item */ 312bf0d5f50SAlex Elder 31385b5e6d1SIlya Dryomov struct mutex state_mutex; 314793333a3SIlya Dryomov struct pending_result pending; 315bf0d5f50SAlex Elder struct kref kref; 316bf0d5f50SAlex Elder }; 317bf0d5f50SAlex Elder 3180c425248SAlex Elder enum img_req_flags { 3199849e986SAlex Elder IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */ 320d0b2e944SAlex Elder IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */ 3210c425248SAlex Elder }; 3220c425248SAlex Elder 3230192ce2eSIlya Dryomov enum rbd_img_state { 3240192ce2eSIlya Dryomov RBD_IMG_START = 1, 325637cd060SIlya Dryomov RBD_IMG_EXCLUSIVE_LOCK, 3260192ce2eSIlya Dryomov __RBD_IMG_OBJECT_REQUESTS, 3270192ce2eSIlya Dryomov RBD_IMG_OBJECT_REQUESTS, 3280192ce2eSIlya Dryomov }; 3290192ce2eSIlya Dryomov 330bf0d5f50SAlex Elder struct rbd_img_request { 331bf0d5f50SAlex Elder struct rbd_device *rbd_dev; 3329bb0248dSIlya Dryomov enum obj_operation_type op_type; 333ecc633caSIlya Dryomov enum obj_request_type data_type; 3340c425248SAlex Elder unsigned long flags; 3350192ce2eSIlya Dryomov enum rbd_img_state state; 336bf0d5f50SAlex Elder union { 337bf0d5f50SAlex Elder u64 snap_id; /* for reads */ 3389849e986SAlex Elder struct ceph_snap_context *snapc; /* for writes */ 3399849e986SAlex Elder }; 3409849e986SAlex Elder union { 3419849e986SAlex Elder struct request *rq; /* block request */ 3429849e986SAlex Elder struct rbd_obj_request *obj_request; /* obj req initiator */ 343bf0d5f50SAlex Elder }; 344bf0d5f50SAlex Elder 345e1fddc8fSIlya Dryomov struct list_head lock_item; 34643df3d35SIlya Dryomov struct list_head object_extents; /* obj_req.ex structs */ 347bf0d5f50SAlex Elder 3480192ce2eSIlya Dryomov struct mutex state_mutex; 3490192ce2eSIlya Dryomov struct pending_result pending; 3500192ce2eSIlya Dryomov struct work_struct work; 3510192ce2eSIlya Dryomov int work_result; 352bf0d5f50SAlex Elder struct kref kref; 353bf0d5f50SAlex Elder }; 354bf0d5f50SAlex Elder 355bf0d5f50SAlex Elder #define for_each_obj_request(ireq, oreq) \ 35643df3d35SIlya Dryomov list_for_each_entry(oreq, &(ireq)->object_extents, ex.oe_item) 357bf0d5f50SAlex Elder #define for_each_obj_request_safe(ireq, oreq, n) \ 35843df3d35SIlya Dryomov list_for_each_entry_safe(oreq, n, &(ireq)->object_extents, ex.oe_item) 359bf0d5f50SAlex Elder 36099d16943SIlya Dryomov enum rbd_watch_state { 36199d16943SIlya Dryomov RBD_WATCH_STATE_UNREGISTERED, 36299d16943SIlya Dryomov RBD_WATCH_STATE_REGISTERED, 36399d16943SIlya Dryomov RBD_WATCH_STATE_ERROR, 36499d16943SIlya Dryomov }; 36599d16943SIlya Dryomov 366ed95b21aSIlya Dryomov enum rbd_lock_state { 367ed95b21aSIlya Dryomov RBD_LOCK_STATE_UNLOCKED, 368ed95b21aSIlya Dryomov RBD_LOCK_STATE_LOCKED, 369ed95b21aSIlya Dryomov RBD_LOCK_STATE_RELEASING, 370ed95b21aSIlya Dryomov }; 371ed95b21aSIlya Dryomov 372ed95b21aSIlya Dryomov /* WatchNotify::ClientId */ 373ed95b21aSIlya Dryomov struct rbd_client_id { 374ed95b21aSIlya Dryomov u64 gid; 375ed95b21aSIlya Dryomov u64 handle; 376ed95b21aSIlya Dryomov }; 377ed95b21aSIlya Dryomov 378f84344f3SAlex Elder struct rbd_mapping { 37999c1f08fSAlex Elder u64 size; 38034b13184SAlex Elder u64 features; 381f84344f3SAlex Elder }; 382f84344f3SAlex Elder 383602adf40SYehuda Sadeh /* 384602adf40SYehuda Sadeh * a single device 385602adf40SYehuda Sadeh */ 386602adf40SYehuda Sadeh struct rbd_device { 387de71a297SAlex Elder int dev_id; /* blkdev unique id */ 388602adf40SYehuda Sadeh 389602adf40SYehuda Sadeh int major; /* blkdev assigned major */ 390dd82fff1SIlya Dryomov int minor; 391602adf40SYehuda Sadeh struct gendisk *disk; /* blkdev's gendisk and rq */ 392602adf40SYehuda Sadeh 393a30b71b9SAlex Elder u32 image_format; /* Either 1 or 2 */ 394602adf40SYehuda Sadeh struct rbd_client *rbd_client; 395602adf40SYehuda Sadeh 396602adf40SYehuda Sadeh char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ 397602adf40SYehuda Sadeh 398b82d167bSAlex Elder spinlock_t lock; /* queue, flags, open_count */ 399602adf40SYehuda Sadeh 400602adf40SYehuda Sadeh struct rbd_image_header header; 401b82d167bSAlex Elder unsigned long flags; /* possibly lock protected */ 4020d7dbfceSAlex Elder struct rbd_spec *spec; 403d147543dSIlya Dryomov struct rbd_options *opts; 4040d6d1e9cSMike Christie char *config_info; /* add{,_single_major} string */ 405602adf40SYehuda Sadeh 406c41d13a3SIlya Dryomov struct ceph_object_id header_oid; 407922dab61SIlya Dryomov struct ceph_object_locator header_oloc; 408971f839aSAlex Elder 4091643dfa4SIlya Dryomov struct ceph_file_layout layout; /* used for all rbd requests */ 4100903e875SAlex Elder 41199d16943SIlya Dryomov struct mutex watch_mutex; 41299d16943SIlya Dryomov enum rbd_watch_state watch_state; 413922dab61SIlya Dryomov struct ceph_osd_linger_request *watch_handle; 41499d16943SIlya Dryomov u64 watch_cookie; 41599d16943SIlya Dryomov struct delayed_work watch_dwork; 41659c2be1eSYehuda Sadeh 417ed95b21aSIlya Dryomov struct rw_semaphore lock_rwsem; 418ed95b21aSIlya Dryomov enum rbd_lock_state lock_state; 419cbbfb0ffSIlya Dryomov char lock_cookie[32]; 420ed95b21aSIlya Dryomov struct rbd_client_id owner_cid; 421ed95b21aSIlya Dryomov struct work_struct acquired_lock_work; 422ed95b21aSIlya Dryomov struct work_struct released_lock_work; 423ed95b21aSIlya Dryomov struct delayed_work lock_dwork; 424ed95b21aSIlya Dryomov struct work_struct unlock_work; 425e1fddc8fSIlya Dryomov spinlock_t lock_lists_lock; 426637cd060SIlya Dryomov struct list_head acquiring_list; 427e1fddc8fSIlya Dryomov struct list_head running_list; 428637cd060SIlya Dryomov struct completion acquire_wait; 429637cd060SIlya Dryomov int acquire_err; 430e1fddc8fSIlya Dryomov struct completion releasing_wait; 431ed95b21aSIlya Dryomov 43222e8bd51SIlya Dryomov spinlock_t object_map_lock; 43322e8bd51SIlya Dryomov u8 *object_map; 43422e8bd51SIlya Dryomov u64 object_map_size; /* in objects */ 43522e8bd51SIlya Dryomov u64 object_map_flags; 436602adf40SYehuda Sadeh 4371643dfa4SIlya Dryomov struct workqueue_struct *task_wq; 438602adf40SYehuda Sadeh 43986b00e0dSAlex Elder struct rbd_spec *parent_spec; 44086b00e0dSAlex Elder u64 parent_overlap; 441a2acd00eSAlex Elder atomic_t parent_ref; 4422f82ee54SAlex Elder struct rbd_device *parent; 44386b00e0dSAlex Elder 4447ad18afaSChristoph Hellwig /* Block layer tags. */ 4457ad18afaSChristoph Hellwig struct blk_mq_tag_set tag_set; 4467ad18afaSChristoph Hellwig 447c666601aSJosh Durgin /* protects updating the header */ 448c666601aSJosh Durgin struct rw_semaphore header_rwsem; 449f84344f3SAlex Elder 450f84344f3SAlex Elder struct rbd_mapping mapping; 451602adf40SYehuda Sadeh 452602adf40SYehuda Sadeh struct list_head node; 453dfc5606dSYehuda Sadeh 454dfc5606dSYehuda Sadeh /* sysfs related */ 455dfc5606dSYehuda Sadeh struct device dev; 456b82d167bSAlex Elder unsigned long open_count; /* protected by lock */ 457dfc5606dSYehuda Sadeh }; 458dfc5606dSYehuda Sadeh 459b82d167bSAlex Elder /* 46087c0fdedSIlya Dryomov * Flag bits for rbd_dev->flags: 46187c0fdedSIlya Dryomov * - REMOVING (which is coupled with rbd_dev->open_count) is protected 46287c0fdedSIlya Dryomov * by rbd_dev->lock 463b82d167bSAlex Elder */ 4646d292906SAlex Elder enum rbd_dev_flags { 4656d292906SAlex Elder RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */ 466b82d167bSAlex Elder RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */ 4676d292906SAlex Elder }; 4686d292906SAlex Elder 469cfbf6377SAlex Elder static DEFINE_MUTEX(client_mutex); /* Serialize client creation */ 470e124a82fSAlex Elder 471602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list); /* devices */ 472e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock); 473e124a82fSAlex Elder 474602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list); /* clients */ 475432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock); 476602adf40SYehuda Sadeh 47778c2a44aSAlex Elder /* Slab caches for frequently-allocated structures */ 47878c2a44aSAlex Elder 4791c2a9dfeSAlex Elder static struct kmem_cache *rbd_img_request_cache; 480868311b1SAlex Elder static struct kmem_cache *rbd_obj_request_cache; 4811c2a9dfeSAlex Elder 4829b60e70bSIlya Dryomov static int rbd_major; 483f8a22fc2SIlya Dryomov static DEFINE_IDA(rbd_dev_id_ida); 484f8a22fc2SIlya Dryomov 485f5ee37bdSIlya Dryomov static struct workqueue_struct *rbd_wq; 486f5ee37bdSIlya Dryomov 48789a59c1cSIlya Dryomov static struct ceph_snap_context rbd_empty_snapc = { 48889a59c1cSIlya Dryomov .nref = REFCOUNT_INIT(1), 48989a59c1cSIlya Dryomov }; 49089a59c1cSIlya Dryomov 4919b60e70bSIlya Dryomov /* 4923cfa3b16SIlya Dryomov * single-major requires >= 0.75 version of userspace rbd utility. 4939b60e70bSIlya Dryomov */ 4943cfa3b16SIlya Dryomov static bool single_major = true; 4955657a819SJoe Perches module_param(single_major, bool, 0444); 4963cfa3b16SIlya Dryomov MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)"); 4979b60e70bSIlya Dryomov 4987e9586baSGreg Kroah-Hartman static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count); 4997e9586baSGreg Kroah-Hartman static ssize_t remove_store(struct bus_type *bus, const char *buf, 500f0f8cef5SAlex Elder size_t count); 5017e9586baSGreg Kroah-Hartman static ssize_t add_single_major_store(struct bus_type *bus, const char *buf, 502f0f8cef5SAlex Elder size_t count); 5037e9586baSGreg Kroah-Hartman static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf, 5049b60e70bSIlya Dryomov size_t count); 5056d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth); 506f0f8cef5SAlex Elder 5079b60e70bSIlya Dryomov static int rbd_dev_id_to_minor(int dev_id) 5089b60e70bSIlya Dryomov { 5097e513d43SIlya Dryomov return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT; 5109b60e70bSIlya Dryomov } 5119b60e70bSIlya Dryomov 5129b60e70bSIlya Dryomov static int minor_to_rbd_dev_id(int minor) 5139b60e70bSIlya Dryomov { 5147e513d43SIlya Dryomov return minor >> RBD_SINGLE_MAJOR_PART_SHIFT; 5159b60e70bSIlya Dryomov } 5169b60e70bSIlya Dryomov 517f3c0e459SIlya Dryomov static bool rbd_is_snap(struct rbd_device *rbd_dev) 518f3c0e459SIlya Dryomov { 519f3c0e459SIlya Dryomov return rbd_dev->spec->snap_id != CEPH_NOSNAP; 520f3c0e459SIlya Dryomov } 521f3c0e459SIlya Dryomov 522ed95b21aSIlya Dryomov static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev) 523ed95b21aSIlya Dryomov { 524637cd060SIlya Dryomov lockdep_assert_held(&rbd_dev->lock_rwsem); 525637cd060SIlya Dryomov 526ed95b21aSIlya Dryomov return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED || 527ed95b21aSIlya Dryomov rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING; 528ed95b21aSIlya Dryomov } 529ed95b21aSIlya Dryomov 530ed95b21aSIlya Dryomov static bool rbd_is_lock_owner(struct rbd_device *rbd_dev) 531ed95b21aSIlya Dryomov { 532ed95b21aSIlya Dryomov bool is_lock_owner; 533ed95b21aSIlya Dryomov 534ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 535ed95b21aSIlya Dryomov is_lock_owner = __rbd_is_lock_owner(rbd_dev); 536ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 537ed95b21aSIlya Dryomov return is_lock_owner; 538ed95b21aSIlya Dryomov } 539ed95b21aSIlya Dryomov 5407e9586baSGreg Kroah-Hartman static ssize_t supported_features_show(struct bus_type *bus, char *buf) 5418767b293SIlya Dryomov { 5428767b293SIlya Dryomov return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED); 5438767b293SIlya Dryomov } 5448767b293SIlya Dryomov 5457e9586baSGreg Kroah-Hartman static BUS_ATTR_WO(add); 5467e9586baSGreg Kroah-Hartman static BUS_ATTR_WO(remove); 5477e9586baSGreg Kroah-Hartman static BUS_ATTR_WO(add_single_major); 5487e9586baSGreg Kroah-Hartman static BUS_ATTR_WO(remove_single_major); 5497e9586baSGreg Kroah-Hartman static BUS_ATTR_RO(supported_features); 550b15a21ddSGreg Kroah-Hartman 551b15a21ddSGreg Kroah-Hartman static struct attribute *rbd_bus_attrs[] = { 552b15a21ddSGreg Kroah-Hartman &bus_attr_add.attr, 553b15a21ddSGreg Kroah-Hartman &bus_attr_remove.attr, 5549b60e70bSIlya Dryomov &bus_attr_add_single_major.attr, 5559b60e70bSIlya Dryomov &bus_attr_remove_single_major.attr, 5568767b293SIlya Dryomov &bus_attr_supported_features.attr, 557b15a21ddSGreg Kroah-Hartman NULL, 558f0f8cef5SAlex Elder }; 55992c76dc0SIlya Dryomov 56092c76dc0SIlya Dryomov static umode_t rbd_bus_is_visible(struct kobject *kobj, 56192c76dc0SIlya Dryomov struct attribute *attr, int index) 56292c76dc0SIlya Dryomov { 5639b60e70bSIlya Dryomov if (!single_major && 5649b60e70bSIlya Dryomov (attr == &bus_attr_add_single_major.attr || 5659b60e70bSIlya Dryomov attr == &bus_attr_remove_single_major.attr)) 5669b60e70bSIlya Dryomov return 0; 5679b60e70bSIlya Dryomov 56892c76dc0SIlya Dryomov return attr->mode; 56992c76dc0SIlya Dryomov } 57092c76dc0SIlya Dryomov 57192c76dc0SIlya Dryomov static const struct attribute_group rbd_bus_group = { 57292c76dc0SIlya Dryomov .attrs = rbd_bus_attrs, 57392c76dc0SIlya Dryomov .is_visible = rbd_bus_is_visible, 57492c76dc0SIlya Dryomov }; 57592c76dc0SIlya Dryomov __ATTRIBUTE_GROUPS(rbd_bus); 576f0f8cef5SAlex Elder 577f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = { 578f0f8cef5SAlex Elder .name = "rbd", 579b15a21ddSGreg Kroah-Hartman .bus_groups = rbd_bus_groups, 580f0f8cef5SAlex Elder }; 581f0f8cef5SAlex Elder 582f0f8cef5SAlex Elder static void rbd_root_dev_release(struct device *dev) 583f0f8cef5SAlex Elder { 584f0f8cef5SAlex Elder } 585f0f8cef5SAlex Elder 586f0f8cef5SAlex Elder static struct device rbd_root_dev = { 587f0f8cef5SAlex Elder .init_name = "rbd", 588f0f8cef5SAlex Elder .release = rbd_root_dev_release, 589f0f8cef5SAlex Elder }; 590f0f8cef5SAlex Elder 59106ecc6cbSAlex Elder static __printf(2, 3) 59206ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...) 59306ecc6cbSAlex Elder { 59406ecc6cbSAlex Elder struct va_format vaf; 59506ecc6cbSAlex Elder va_list args; 59606ecc6cbSAlex Elder 59706ecc6cbSAlex Elder va_start(args, fmt); 59806ecc6cbSAlex Elder vaf.fmt = fmt; 59906ecc6cbSAlex Elder vaf.va = &args; 60006ecc6cbSAlex Elder 60106ecc6cbSAlex Elder if (!rbd_dev) 60206ecc6cbSAlex Elder printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf); 60306ecc6cbSAlex Elder else if (rbd_dev->disk) 60406ecc6cbSAlex Elder printk(KERN_WARNING "%s: %s: %pV\n", 60506ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf); 60606ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_name) 60706ecc6cbSAlex Elder printk(KERN_WARNING "%s: image %s: %pV\n", 60806ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf); 60906ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_id) 61006ecc6cbSAlex Elder printk(KERN_WARNING "%s: id %s: %pV\n", 61106ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf); 61206ecc6cbSAlex Elder else /* punt */ 61306ecc6cbSAlex Elder printk(KERN_WARNING "%s: rbd_dev %p: %pV\n", 61406ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev, &vaf); 61506ecc6cbSAlex Elder va_end(args); 61606ecc6cbSAlex Elder } 61706ecc6cbSAlex Elder 618aafb230eSAlex Elder #ifdef RBD_DEBUG 619aafb230eSAlex Elder #define rbd_assert(expr) \ 620aafb230eSAlex Elder if (unlikely(!(expr))) { \ 621aafb230eSAlex Elder printk(KERN_ERR "\nAssertion failure in %s() " \ 622aafb230eSAlex Elder "at line %d:\n\n" \ 623aafb230eSAlex Elder "\trbd_assert(%s);\n\n", \ 624aafb230eSAlex Elder __func__, __LINE__, #expr); \ 625aafb230eSAlex Elder BUG(); \ 626aafb230eSAlex Elder } 627aafb230eSAlex Elder #else /* !RBD_DEBUG */ 628aafb230eSAlex Elder # define rbd_assert(expr) ((void) 0) 629aafb230eSAlex Elder #endif /* !RBD_DEBUG */ 630dfc5606dSYehuda Sadeh 63105a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev); 6328b3e1a56SAlex Elder 633cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev); 6342df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev); 635a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev); 636e8f59b59SIlya Dryomov static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev); 63754cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, 63854cac61fSAlex Elder u64 snap_id); 6392ad3d716SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 6402ad3d716SAlex Elder u8 *order, u64 *snap_size); 6412ad3d716SAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 6422ad3d716SAlex Elder u64 *snap_features); 64322e8bd51SIlya Dryomov static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev); 64459c2be1eSYehuda Sadeh 64554ab3b24SIlya Dryomov static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result); 6460192ce2eSIlya Dryomov static void rbd_img_handle_request(struct rbd_img_request *img_req, int result); 6470192ce2eSIlya Dryomov 6480192ce2eSIlya Dryomov /* 6490192ce2eSIlya Dryomov * Return true if nothing else is pending. 6500192ce2eSIlya Dryomov */ 6510192ce2eSIlya Dryomov static bool pending_result_dec(struct pending_result *pending, int *result) 6520192ce2eSIlya Dryomov { 6530192ce2eSIlya Dryomov rbd_assert(pending->num_pending > 0); 6540192ce2eSIlya Dryomov 6550192ce2eSIlya Dryomov if (*result && !pending->result) 6560192ce2eSIlya Dryomov pending->result = *result; 6570192ce2eSIlya Dryomov if (--pending->num_pending) 6580192ce2eSIlya Dryomov return false; 6590192ce2eSIlya Dryomov 6600192ce2eSIlya Dryomov *result = pending->result; 6610192ce2eSIlya Dryomov return true; 6620192ce2eSIlya Dryomov } 663602adf40SYehuda Sadeh 664602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode) 665602adf40SYehuda Sadeh { 666f0f8cef5SAlex Elder struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 667b82d167bSAlex Elder bool removing = false; 668602adf40SYehuda Sadeh 669a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 670b82d167bSAlex Elder if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) 671b82d167bSAlex Elder removing = true; 672b82d167bSAlex Elder else 673b82d167bSAlex Elder rbd_dev->open_count++; 674a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 675b82d167bSAlex Elder if (removing) 676b82d167bSAlex Elder return -ENOENT; 677b82d167bSAlex Elder 678c3e946ceSAlex Elder (void) get_device(&rbd_dev->dev); 679340c7a2bSAlex Elder 680602adf40SYehuda Sadeh return 0; 681602adf40SYehuda Sadeh } 682602adf40SYehuda Sadeh 683db2a144bSAl Viro static void rbd_release(struct gendisk *disk, fmode_t mode) 684dfc5606dSYehuda Sadeh { 685dfc5606dSYehuda Sadeh struct rbd_device *rbd_dev = disk->private_data; 686b82d167bSAlex Elder unsigned long open_count_before; 687b82d167bSAlex Elder 688a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 689b82d167bSAlex Elder open_count_before = rbd_dev->open_count--; 690a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 691b82d167bSAlex Elder rbd_assert(open_count_before > 0); 692dfc5606dSYehuda Sadeh 693c3e946ceSAlex Elder put_device(&rbd_dev->dev); 694dfc5606dSYehuda Sadeh } 695dfc5606dSYehuda Sadeh 696131fd9f6SGuangliang Zhao static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg) 697131fd9f6SGuangliang Zhao { 6981de797bbSIlya Dryomov int ro; 699131fd9f6SGuangliang Zhao 7001de797bbSIlya Dryomov if (get_user(ro, (int __user *)arg)) 701131fd9f6SGuangliang Zhao return -EFAULT; 702131fd9f6SGuangliang Zhao 7031de797bbSIlya Dryomov /* Snapshots can't be marked read-write */ 704f3c0e459SIlya Dryomov if (rbd_is_snap(rbd_dev) && !ro) 705131fd9f6SGuangliang Zhao return -EROFS; 706131fd9f6SGuangliang Zhao 7071de797bbSIlya Dryomov /* Let blkdev_roset() handle it */ 7081de797bbSIlya Dryomov return -ENOTTY; 709131fd9f6SGuangliang Zhao } 710131fd9f6SGuangliang Zhao 711131fd9f6SGuangliang Zhao static int rbd_ioctl(struct block_device *bdev, fmode_t mode, 712131fd9f6SGuangliang Zhao unsigned int cmd, unsigned long arg) 713131fd9f6SGuangliang Zhao { 714131fd9f6SGuangliang Zhao struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 7151de797bbSIlya Dryomov int ret; 716131fd9f6SGuangliang Zhao 717131fd9f6SGuangliang Zhao switch (cmd) { 718131fd9f6SGuangliang Zhao case BLKROSET: 719131fd9f6SGuangliang Zhao ret = rbd_ioctl_set_ro(rbd_dev, arg); 720131fd9f6SGuangliang Zhao break; 721131fd9f6SGuangliang Zhao default: 722131fd9f6SGuangliang Zhao ret = -ENOTTY; 723131fd9f6SGuangliang Zhao } 724131fd9f6SGuangliang Zhao 725131fd9f6SGuangliang Zhao return ret; 726131fd9f6SGuangliang Zhao } 727131fd9f6SGuangliang Zhao 728131fd9f6SGuangliang Zhao #ifdef CONFIG_COMPAT 729131fd9f6SGuangliang Zhao static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode, 730131fd9f6SGuangliang Zhao unsigned int cmd, unsigned long arg) 731131fd9f6SGuangliang Zhao { 732131fd9f6SGuangliang Zhao return rbd_ioctl(bdev, mode, cmd, arg); 733131fd9f6SGuangliang Zhao } 734131fd9f6SGuangliang Zhao #endif /* CONFIG_COMPAT */ 735131fd9f6SGuangliang Zhao 736602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = { 737602adf40SYehuda Sadeh .owner = THIS_MODULE, 738602adf40SYehuda Sadeh .open = rbd_open, 739dfc5606dSYehuda Sadeh .release = rbd_release, 740131fd9f6SGuangliang Zhao .ioctl = rbd_ioctl, 741131fd9f6SGuangliang Zhao #ifdef CONFIG_COMPAT 742131fd9f6SGuangliang Zhao .compat_ioctl = rbd_compat_ioctl, 743131fd9f6SGuangliang Zhao #endif 744602adf40SYehuda Sadeh }; 745602adf40SYehuda Sadeh 746602adf40SYehuda Sadeh /* 7477262cfcaSAlex Elder * Initialize an rbd client instance. Success or not, this function 748cfbf6377SAlex Elder * consumes ceph_opts. Caller holds client_mutex. 749602adf40SYehuda Sadeh */ 750f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) 751602adf40SYehuda Sadeh { 752602adf40SYehuda Sadeh struct rbd_client *rbdc; 753602adf40SYehuda Sadeh int ret = -ENOMEM; 754602adf40SYehuda Sadeh 75537206ee5SAlex Elder dout("%s:\n", __func__); 756602adf40SYehuda Sadeh rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); 757602adf40SYehuda Sadeh if (!rbdc) 758602adf40SYehuda Sadeh goto out_opt; 759602adf40SYehuda Sadeh 760602adf40SYehuda Sadeh kref_init(&rbdc->kref); 761602adf40SYehuda Sadeh INIT_LIST_HEAD(&rbdc->node); 762602adf40SYehuda Sadeh 76374da4a0fSIlya Dryomov rbdc->client = ceph_create_client(ceph_opts, rbdc); 764602adf40SYehuda Sadeh if (IS_ERR(rbdc->client)) 76508f75463SAlex Elder goto out_rbdc; 76643ae4701SAlex Elder ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */ 767602adf40SYehuda Sadeh 768602adf40SYehuda Sadeh ret = ceph_open_session(rbdc->client); 769602adf40SYehuda Sadeh if (ret < 0) 77008f75463SAlex Elder goto out_client; 771602adf40SYehuda Sadeh 772432b8587SAlex Elder spin_lock(&rbd_client_list_lock); 773602adf40SYehuda Sadeh list_add_tail(&rbdc->node, &rbd_client_list); 774432b8587SAlex Elder spin_unlock(&rbd_client_list_lock); 775602adf40SYehuda Sadeh 77637206ee5SAlex Elder dout("%s: rbdc %p\n", __func__, rbdc); 777bc534d86SAlex Elder 778602adf40SYehuda Sadeh return rbdc; 77908f75463SAlex Elder out_client: 780602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 78108f75463SAlex Elder out_rbdc: 782602adf40SYehuda Sadeh kfree(rbdc); 783602adf40SYehuda Sadeh out_opt: 78443ae4701SAlex Elder if (ceph_opts) 78543ae4701SAlex Elder ceph_destroy_options(ceph_opts); 78637206ee5SAlex Elder dout("%s: error %d\n", __func__, ret); 78737206ee5SAlex Elder 78828f259b7SVasiliy Kulikov return ERR_PTR(ret); 789602adf40SYehuda Sadeh } 790602adf40SYehuda Sadeh 7912f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc) 7922f82ee54SAlex Elder { 7932f82ee54SAlex Elder kref_get(&rbdc->kref); 7942f82ee54SAlex Elder 7952f82ee54SAlex Elder return rbdc; 7962f82ee54SAlex Elder } 7972f82ee54SAlex Elder 798602adf40SYehuda Sadeh /* 7991f7ba331SAlex Elder * Find a ceph client with specific addr and configuration. If 8001f7ba331SAlex Elder * found, bump its reference count. 801602adf40SYehuda Sadeh */ 8021f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) 803602adf40SYehuda Sadeh { 804602adf40SYehuda Sadeh struct rbd_client *client_node; 8051f7ba331SAlex Elder bool found = false; 806602adf40SYehuda Sadeh 80743ae4701SAlex Elder if (ceph_opts->flags & CEPH_OPT_NOSHARE) 808602adf40SYehuda Sadeh return NULL; 809602adf40SYehuda Sadeh 8101f7ba331SAlex Elder spin_lock(&rbd_client_list_lock); 8111f7ba331SAlex Elder list_for_each_entry(client_node, &rbd_client_list, node) { 8121f7ba331SAlex Elder if (!ceph_compare_options(ceph_opts, client_node->client)) { 8132f82ee54SAlex Elder __rbd_get_client(client_node); 8142f82ee54SAlex Elder 8151f7ba331SAlex Elder found = true; 8161f7ba331SAlex Elder break; 8171f7ba331SAlex Elder } 8181f7ba331SAlex Elder } 8191f7ba331SAlex Elder spin_unlock(&rbd_client_list_lock); 8201f7ba331SAlex Elder 8211f7ba331SAlex Elder return found ? client_node : NULL; 822602adf40SYehuda Sadeh } 823602adf40SYehuda Sadeh 824602adf40SYehuda Sadeh /* 825210c104cSIlya Dryomov * (Per device) rbd map options 82659c2be1eSYehuda Sadeh */ 82759c2be1eSYehuda Sadeh enum { 828b5584180SIlya Dryomov Opt_queue_depth, 8290c93e1b7SIlya Dryomov Opt_alloc_size, 83034f55d0bSDongsheng Yang Opt_lock_timeout, 83159c2be1eSYehuda Sadeh Opt_last_int, 83259c2be1eSYehuda Sadeh /* int args above */ 833b26c047bSIlya Dryomov Opt_pool_ns, 83459c2be1eSYehuda Sadeh Opt_last_string, 83559c2be1eSYehuda Sadeh /* string args above */ 836cc0538b6SAlex Elder Opt_read_only, 837cc0538b6SAlex Elder Opt_read_write, 83880de1912SIlya Dryomov Opt_lock_on_read, 839e010dd0aSIlya Dryomov Opt_exclusive, 840d9360540SIlya Dryomov Opt_notrim, 841210c104cSIlya Dryomov Opt_err 84259c2be1eSYehuda Sadeh }; 84359c2be1eSYehuda Sadeh 84443ae4701SAlex Elder static match_table_t rbd_opts_tokens = { 845b5584180SIlya Dryomov {Opt_queue_depth, "queue_depth=%d"}, 8460c93e1b7SIlya Dryomov {Opt_alloc_size, "alloc_size=%d"}, 84734f55d0bSDongsheng Yang {Opt_lock_timeout, "lock_timeout=%d"}, 84859c2be1eSYehuda Sadeh /* int args above */ 849b26c047bSIlya Dryomov {Opt_pool_ns, "_pool_ns=%s"}, 85059c2be1eSYehuda Sadeh /* string args above */ 851be466c1cSAlex Elder {Opt_read_only, "read_only"}, 852cc0538b6SAlex Elder {Opt_read_only, "ro"}, /* Alternate spelling */ 853cc0538b6SAlex Elder {Opt_read_write, "read_write"}, 854cc0538b6SAlex Elder {Opt_read_write, "rw"}, /* Alternate spelling */ 85580de1912SIlya Dryomov {Opt_lock_on_read, "lock_on_read"}, 856e010dd0aSIlya Dryomov {Opt_exclusive, "exclusive"}, 857d9360540SIlya Dryomov {Opt_notrim, "notrim"}, 858210c104cSIlya Dryomov {Opt_err, NULL} 85959c2be1eSYehuda Sadeh }; 86059c2be1eSYehuda Sadeh 86198571b5aSAlex Elder struct rbd_options { 862b5584180SIlya Dryomov int queue_depth; 8630c93e1b7SIlya Dryomov int alloc_size; 86434f55d0bSDongsheng Yang unsigned long lock_timeout; 86598571b5aSAlex Elder bool read_only; 86680de1912SIlya Dryomov bool lock_on_read; 867e010dd0aSIlya Dryomov bool exclusive; 868d9360540SIlya Dryomov bool trim; 86998571b5aSAlex Elder }; 87098571b5aSAlex Elder 871b5584180SIlya Dryomov #define RBD_QUEUE_DEPTH_DEFAULT BLKDEV_MAX_RQ 8720c93e1b7SIlya Dryomov #define RBD_ALLOC_SIZE_DEFAULT (64 * 1024) 87334f55d0bSDongsheng Yang #define RBD_LOCK_TIMEOUT_DEFAULT 0 /* no timeout */ 87498571b5aSAlex Elder #define RBD_READ_ONLY_DEFAULT false 87580de1912SIlya Dryomov #define RBD_LOCK_ON_READ_DEFAULT false 876e010dd0aSIlya Dryomov #define RBD_EXCLUSIVE_DEFAULT false 877d9360540SIlya Dryomov #define RBD_TRIM_DEFAULT true 87898571b5aSAlex Elder 879c300156bSIlya Dryomov struct parse_rbd_opts_ctx { 880c300156bSIlya Dryomov struct rbd_spec *spec; 881c300156bSIlya Dryomov struct rbd_options *opts; 882c300156bSIlya Dryomov }; 883c300156bSIlya Dryomov 88459c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private) 88559c2be1eSYehuda Sadeh { 886c300156bSIlya Dryomov struct parse_rbd_opts_ctx *pctx = private; 88759c2be1eSYehuda Sadeh substring_t argstr[MAX_OPT_ARGS]; 88859c2be1eSYehuda Sadeh int token, intval, ret; 88959c2be1eSYehuda Sadeh 89043ae4701SAlex Elder token = match_token(c, rbd_opts_tokens, argstr); 89159c2be1eSYehuda Sadeh if (token < Opt_last_int) { 89259c2be1eSYehuda Sadeh ret = match_int(&argstr[0], &intval); 89359c2be1eSYehuda Sadeh if (ret < 0) { 8942f56b6baSIlya Dryomov pr_err("bad option arg (not int) at '%s'\n", c); 89559c2be1eSYehuda Sadeh return ret; 89659c2be1eSYehuda Sadeh } 89759c2be1eSYehuda Sadeh dout("got int token %d val %d\n", token, intval); 89859c2be1eSYehuda Sadeh } else if (token > Opt_last_int && token < Opt_last_string) { 899210c104cSIlya Dryomov dout("got string token %d val %s\n", token, argstr[0].from); 90059c2be1eSYehuda Sadeh } else { 90159c2be1eSYehuda Sadeh dout("got token %d\n", token); 90259c2be1eSYehuda Sadeh } 90359c2be1eSYehuda Sadeh 90459c2be1eSYehuda Sadeh switch (token) { 905b5584180SIlya Dryomov case Opt_queue_depth: 906b5584180SIlya Dryomov if (intval < 1) { 907b5584180SIlya Dryomov pr_err("queue_depth out of range\n"); 908b5584180SIlya Dryomov return -EINVAL; 909b5584180SIlya Dryomov } 910c300156bSIlya Dryomov pctx->opts->queue_depth = intval; 911b5584180SIlya Dryomov break; 9120c93e1b7SIlya Dryomov case Opt_alloc_size: 91316d80c54SIlya Dryomov if (intval < SECTOR_SIZE) { 9140c93e1b7SIlya Dryomov pr_err("alloc_size out of range\n"); 9150c93e1b7SIlya Dryomov return -EINVAL; 9160c93e1b7SIlya Dryomov } 9170c93e1b7SIlya Dryomov if (!is_power_of_2(intval)) { 9180c93e1b7SIlya Dryomov pr_err("alloc_size must be a power of 2\n"); 9190c93e1b7SIlya Dryomov return -EINVAL; 9200c93e1b7SIlya Dryomov } 9210c93e1b7SIlya Dryomov pctx->opts->alloc_size = intval; 9220c93e1b7SIlya Dryomov break; 92334f55d0bSDongsheng Yang case Opt_lock_timeout: 92434f55d0bSDongsheng Yang /* 0 is "wait forever" (i.e. infinite timeout) */ 92534f55d0bSDongsheng Yang if (intval < 0 || intval > INT_MAX / 1000) { 92634f55d0bSDongsheng Yang pr_err("lock_timeout out of range\n"); 92734f55d0bSDongsheng Yang return -EINVAL; 92834f55d0bSDongsheng Yang } 929c300156bSIlya Dryomov pctx->opts->lock_timeout = msecs_to_jiffies(intval * 1000); 93034f55d0bSDongsheng Yang break; 931b26c047bSIlya Dryomov case Opt_pool_ns: 932b26c047bSIlya Dryomov kfree(pctx->spec->pool_ns); 933b26c047bSIlya Dryomov pctx->spec->pool_ns = match_strdup(argstr); 934b26c047bSIlya Dryomov if (!pctx->spec->pool_ns) 935b26c047bSIlya Dryomov return -ENOMEM; 93659c2be1eSYehuda Sadeh break; 937cc0538b6SAlex Elder case Opt_read_only: 938c300156bSIlya Dryomov pctx->opts->read_only = true; 939cc0538b6SAlex Elder break; 940cc0538b6SAlex Elder case Opt_read_write: 941c300156bSIlya Dryomov pctx->opts->read_only = false; 942cc0538b6SAlex Elder break; 94380de1912SIlya Dryomov case Opt_lock_on_read: 944c300156bSIlya Dryomov pctx->opts->lock_on_read = true; 94580de1912SIlya Dryomov break; 946e010dd0aSIlya Dryomov case Opt_exclusive: 947c300156bSIlya Dryomov pctx->opts->exclusive = true; 948e010dd0aSIlya Dryomov break; 949d9360540SIlya Dryomov case Opt_notrim: 950c300156bSIlya Dryomov pctx->opts->trim = false; 951d9360540SIlya Dryomov break; 95259c2be1eSYehuda Sadeh default: 953210c104cSIlya Dryomov /* libceph prints "bad option" msg */ 954210c104cSIlya Dryomov return -EINVAL; 95559c2be1eSYehuda Sadeh } 956210c104cSIlya Dryomov 95759c2be1eSYehuda Sadeh return 0; 95859c2be1eSYehuda Sadeh } 95959c2be1eSYehuda Sadeh 9606d2940c8SGuangliang Zhao static char* obj_op_name(enum obj_operation_type op_type) 9616d2940c8SGuangliang Zhao { 9626d2940c8SGuangliang Zhao switch (op_type) { 9636d2940c8SGuangliang Zhao case OBJ_OP_READ: 9646d2940c8SGuangliang Zhao return "read"; 9656d2940c8SGuangliang Zhao case OBJ_OP_WRITE: 9666d2940c8SGuangliang Zhao return "write"; 96790e98c52SGuangliang Zhao case OBJ_OP_DISCARD: 96890e98c52SGuangliang Zhao return "discard"; 9696484cbe9SIlya Dryomov case OBJ_OP_ZEROOUT: 9706484cbe9SIlya Dryomov return "zeroout"; 9716d2940c8SGuangliang Zhao default: 9726d2940c8SGuangliang Zhao return "???"; 9736d2940c8SGuangliang Zhao } 9746d2940c8SGuangliang Zhao } 9756d2940c8SGuangliang Zhao 97659c2be1eSYehuda Sadeh /* 977602adf40SYehuda Sadeh * Destroy ceph client 978d23a4b3fSAlex Elder * 979432b8587SAlex Elder * Caller must hold rbd_client_list_lock. 980602adf40SYehuda Sadeh */ 981602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref) 982602adf40SYehuda Sadeh { 983602adf40SYehuda Sadeh struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); 984602adf40SYehuda Sadeh 98537206ee5SAlex Elder dout("%s: rbdc %p\n", __func__, rbdc); 986cd9d9f5dSAlex Elder spin_lock(&rbd_client_list_lock); 987602adf40SYehuda Sadeh list_del(&rbdc->node); 988cd9d9f5dSAlex Elder spin_unlock(&rbd_client_list_lock); 989602adf40SYehuda Sadeh 990602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 991602adf40SYehuda Sadeh kfree(rbdc); 992602adf40SYehuda Sadeh } 993602adf40SYehuda Sadeh 994602adf40SYehuda Sadeh /* 995602adf40SYehuda Sadeh * Drop reference to ceph client node. If it's not referenced anymore, release 996602adf40SYehuda Sadeh * it. 997602adf40SYehuda Sadeh */ 9989d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc) 999602adf40SYehuda Sadeh { 1000c53d5893SAlex Elder if (rbdc) 10019d3997fdSAlex Elder kref_put(&rbdc->kref, rbd_client_release); 1002602adf40SYehuda Sadeh } 1003602adf40SYehuda Sadeh 10045feb0d8dSIlya Dryomov /* 10055feb0d8dSIlya Dryomov * Get a ceph client with specific addr and configuration, if one does 10065feb0d8dSIlya Dryomov * not exist create it. Either way, ceph_opts is consumed by this 10075feb0d8dSIlya Dryomov * function. 10085feb0d8dSIlya Dryomov */ 10095feb0d8dSIlya Dryomov static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) 10105feb0d8dSIlya Dryomov { 10115feb0d8dSIlya Dryomov struct rbd_client *rbdc; 1012dd435855SIlya Dryomov int ret; 10135feb0d8dSIlya Dryomov 1014a32e4143SIlya Dryomov mutex_lock(&client_mutex); 10155feb0d8dSIlya Dryomov rbdc = rbd_client_find(ceph_opts); 1016dd435855SIlya Dryomov if (rbdc) { 10175feb0d8dSIlya Dryomov ceph_destroy_options(ceph_opts); 1018dd435855SIlya Dryomov 1019dd435855SIlya Dryomov /* 1020dd435855SIlya Dryomov * Using an existing client. Make sure ->pg_pools is up to 1021dd435855SIlya Dryomov * date before we look up the pool id in do_rbd_add(). 1022dd435855SIlya Dryomov */ 10239d4a227fSIlya Dryomov ret = ceph_wait_for_latest_osdmap(rbdc->client, 10249d4a227fSIlya Dryomov rbdc->client->options->mount_timeout); 1025dd435855SIlya Dryomov if (ret) { 1026dd435855SIlya Dryomov rbd_warn(NULL, "failed to get latest osdmap: %d", ret); 1027dd435855SIlya Dryomov rbd_put_client(rbdc); 1028dd435855SIlya Dryomov rbdc = ERR_PTR(ret); 1029dd435855SIlya Dryomov } 1030dd435855SIlya Dryomov } else { 10315feb0d8dSIlya Dryomov rbdc = rbd_client_create(ceph_opts); 1032dd435855SIlya Dryomov } 10335feb0d8dSIlya Dryomov mutex_unlock(&client_mutex); 10345feb0d8dSIlya Dryomov 10355feb0d8dSIlya Dryomov return rbdc; 10365feb0d8dSIlya Dryomov } 10375feb0d8dSIlya Dryomov 1038a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format) 1039a30b71b9SAlex Elder { 1040a30b71b9SAlex Elder return image_format == 1 || image_format == 2; 1041a30b71b9SAlex Elder } 1042a30b71b9SAlex Elder 10438e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) 10448e94af8eSAlex Elder { 1045103a150fSAlex Elder size_t size; 1046103a150fSAlex Elder u32 snap_count; 1047103a150fSAlex Elder 1048103a150fSAlex Elder /* The header has to start with the magic rbd header text */ 1049103a150fSAlex Elder if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT))) 1050103a150fSAlex Elder return false; 1051103a150fSAlex Elder 1052db2388b6SAlex Elder /* The bio layer requires at least sector-sized I/O */ 1053db2388b6SAlex Elder 1054db2388b6SAlex Elder if (ondisk->options.order < SECTOR_SHIFT) 1055db2388b6SAlex Elder return false; 1056db2388b6SAlex Elder 1057db2388b6SAlex Elder /* If we use u64 in a few spots we may be able to loosen this */ 1058db2388b6SAlex Elder 1059db2388b6SAlex Elder if (ondisk->options.order > 8 * sizeof (int) - 1) 1060db2388b6SAlex Elder return false; 1061db2388b6SAlex Elder 1062103a150fSAlex Elder /* 1063103a150fSAlex Elder * The size of a snapshot header has to fit in a size_t, and 1064103a150fSAlex Elder * that limits the number of snapshots. 1065103a150fSAlex Elder */ 1066103a150fSAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 1067103a150fSAlex Elder size = SIZE_MAX - sizeof (struct ceph_snap_context); 1068103a150fSAlex Elder if (snap_count > size / sizeof (__le64)) 1069103a150fSAlex Elder return false; 1070103a150fSAlex Elder 1071103a150fSAlex Elder /* 1072103a150fSAlex Elder * Not only that, but the size of the entire the snapshot 1073103a150fSAlex Elder * header must also be representable in a size_t. 1074103a150fSAlex Elder */ 1075103a150fSAlex Elder size -= snap_count * sizeof (__le64); 1076103a150fSAlex Elder if ((u64) size < le64_to_cpu(ondisk->snap_names_len)) 1077103a150fSAlex Elder return false; 1078103a150fSAlex Elder 1079103a150fSAlex Elder return true; 10808e94af8eSAlex Elder } 10818e94af8eSAlex Elder 1082602adf40SYehuda Sadeh /* 10835bc3fb17SIlya Dryomov * returns the size of an object in the image 10845bc3fb17SIlya Dryomov */ 10855bc3fb17SIlya Dryomov static u32 rbd_obj_bytes(struct rbd_image_header *header) 10865bc3fb17SIlya Dryomov { 10875bc3fb17SIlya Dryomov return 1U << header->obj_order; 10885bc3fb17SIlya Dryomov } 10895bc3fb17SIlya Dryomov 1090263423f8SIlya Dryomov static void rbd_init_layout(struct rbd_device *rbd_dev) 1091263423f8SIlya Dryomov { 1092263423f8SIlya Dryomov if (rbd_dev->header.stripe_unit == 0 || 1093263423f8SIlya Dryomov rbd_dev->header.stripe_count == 0) { 1094263423f8SIlya Dryomov rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header); 1095263423f8SIlya Dryomov rbd_dev->header.stripe_count = 1; 1096263423f8SIlya Dryomov } 1097263423f8SIlya Dryomov 1098263423f8SIlya Dryomov rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit; 1099263423f8SIlya Dryomov rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count; 1100263423f8SIlya Dryomov rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header); 11017e97332eSIlya Dryomov rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ? 11027e97332eSIlya Dryomov rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id; 1103263423f8SIlya Dryomov RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL); 1104263423f8SIlya Dryomov } 1105263423f8SIlya Dryomov 11065bc3fb17SIlya Dryomov /* 1107bb23e37aSAlex Elder * Fill an rbd image header with information from the given format 1 1108bb23e37aSAlex Elder * on-disk header. 1109602adf40SYehuda Sadeh */ 1110662518b1SAlex Elder static int rbd_header_from_disk(struct rbd_device *rbd_dev, 11114156d998SAlex Elder struct rbd_image_header_ondisk *ondisk) 1112602adf40SYehuda Sadeh { 1113662518b1SAlex Elder struct rbd_image_header *header = &rbd_dev->header; 1114bb23e37aSAlex Elder bool first_time = header->object_prefix == NULL; 1115bb23e37aSAlex Elder struct ceph_snap_context *snapc; 1116bb23e37aSAlex Elder char *object_prefix = NULL; 1117bb23e37aSAlex Elder char *snap_names = NULL; 1118bb23e37aSAlex Elder u64 *snap_sizes = NULL; 1119ccece235SAlex Elder u32 snap_count; 1120bb23e37aSAlex Elder int ret = -ENOMEM; 1121621901d6SAlex Elder u32 i; 1122602adf40SYehuda Sadeh 1123bb23e37aSAlex Elder /* Allocate this now to avoid having to handle failure below */ 1124103a150fSAlex Elder 1125bb23e37aSAlex Elder if (first_time) { 1126848d796cSIlya Dryomov object_prefix = kstrndup(ondisk->object_prefix, 1127848d796cSIlya Dryomov sizeof(ondisk->object_prefix), 1128848d796cSIlya Dryomov GFP_KERNEL); 1129bb23e37aSAlex Elder if (!object_prefix) 1130602adf40SYehuda Sadeh return -ENOMEM; 1131bb23e37aSAlex Elder } 113200f1f36fSAlex Elder 1133bb23e37aSAlex Elder /* Allocate the snapshot context and fill it in */ 1134d2bb24e5SAlex Elder 1135602adf40SYehuda Sadeh snap_count = le32_to_cpu(ondisk->snap_count); 1136bb23e37aSAlex Elder snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); 1137bb23e37aSAlex Elder if (!snapc) 1138bb23e37aSAlex Elder goto out_err; 1139bb23e37aSAlex Elder snapc->seq = le64_to_cpu(ondisk->snap_seq); 1140602adf40SYehuda Sadeh if (snap_count) { 1141bb23e37aSAlex Elder struct rbd_image_snap_ondisk *snaps; 1142f785cc1dSAlex Elder u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len); 1143f785cc1dSAlex Elder 1144bb23e37aSAlex Elder /* We'll keep a copy of the snapshot names... */ 1145621901d6SAlex Elder 1146f785cc1dSAlex Elder if (snap_names_len > (u64)SIZE_MAX) 1147bb23e37aSAlex Elder goto out_2big; 1148bb23e37aSAlex Elder snap_names = kmalloc(snap_names_len, GFP_KERNEL); 1149bb23e37aSAlex Elder if (!snap_names) 1150602adf40SYehuda Sadeh goto out_err; 1151bb23e37aSAlex Elder 1152bb23e37aSAlex Elder /* ...as well as the array of their sizes. */ 115388a25a5fSMarkus Elfring snap_sizes = kmalloc_array(snap_count, 115488a25a5fSMarkus Elfring sizeof(*header->snap_sizes), 115588a25a5fSMarkus Elfring GFP_KERNEL); 1156bb23e37aSAlex Elder if (!snap_sizes) 1157bb23e37aSAlex Elder goto out_err; 1158bb23e37aSAlex Elder 1159f785cc1dSAlex Elder /* 1160bb23e37aSAlex Elder * Copy the names, and fill in each snapshot's id 1161bb23e37aSAlex Elder * and size. 1162bb23e37aSAlex Elder * 116399a41ebcSAlex Elder * Note that rbd_dev_v1_header_info() guarantees the 1164bb23e37aSAlex Elder * ondisk buffer we're working with has 1165f785cc1dSAlex Elder * snap_names_len bytes beyond the end of the 1166f785cc1dSAlex Elder * snapshot id array, this memcpy() is safe. 1167f785cc1dSAlex Elder */ 1168bb23e37aSAlex Elder memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len); 1169bb23e37aSAlex Elder snaps = ondisk->snaps; 1170bb23e37aSAlex Elder for (i = 0; i < snap_count; i++) { 1171bb23e37aSAlex Elder snapc->snaps[i] = le64_to_cpu(snaps[i].id); 1172bb23e37aSAlex Elder snap_sizes[i] = le64_to_cpu(snaps[i].image_size); 1173bb23e37aSAlex Elder } 1174602adf40SYehuda Sadeh } 1175849b4260SAlex Elder 1176bb23e37aSAlex Elder /* We won't fail any more, fill in the header */ 1177bb23e37aSAlex Elder 1178bb23e37aSAlex Elder if (first_time) { 1179bb23e37aSAlex Elder header->object_prefix = object_prefix; 1180602adf40SYehuda Sadeh header->obj_order = ondisk->options.order; 1181263423f8SIlya Dryomov rbd_init_layout(rbd_dev); 1182662518b1SAlex Elder } else { 1183662518b1SAlex Elder ceph_put_snap_context(header->snapc); 1184662518b1SAlex Elder kfree(header->snap_names); 1185662518b1SAlex Elder kfree(header->snap_sizes); 1186bb23e37aSAlex Elder } 11876a52325fSAlex Elder 1188bb23e37aSAlex Elder /* The remaining fields always get updated (when we refresh) */ 1189621901d6SAlex Elder 1190f84344f3SAlex Elder header->image_size = le64_to_cpu(ondisk->image_size); 1191bb23e37aSAlex Elder header->snapc = snapc; 1192bb23e37aSAlex Elder header->snap_names = snap_names; 1193bb23e37aSAlex Elder header->snap_sizes = snap_sizes; 1194468521c1SAlex Elder 1195602adf40SYehuda Sadeh return 0; 1196bb23e37aSAlex Elder out_2big: 1197bb23e37aSAlex Elder ret = -EIO; 11986a52325fSAlex Elder out_err: 1199bb23e37aSAlex Elder kfree(snap_sizes); 1200bb23e37aSAlex Elder kfree(snap_names); 1201bb23e37aSAlex Elder ceph_put_snap_context(snapc); 1202bb23e37aSAlex Elder kfree(object_prefix); 1203ccece235SAlex Elder 1204bb23e37aSAlex Elder return ret; 1205602adf40SYehuda Sadeh } 1206602adf40SYehuda Sadeh 12079682fc6dSAlex Elder static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which) 12089682fc6dSAlex Elder { 12099682fc6dSAlex Elder const char *snap_name; 12109682fc6dSAlex Elder 12119682fc6dSAlex Elder rbd_assert(which < rbd_dev->header.snapc->num_snaps); 12129682fc6dSAlex Elder 12139682fc6dSAlex Elder /* Skip over names until we find the one we are looking for */ 12149682fc6dSAlex Elder 12159682fc6dSAlex Elder snap_name = rbd_dev->header.snap_names; 12169682fc6dSAlex Elder while (which--) 12179682fc6dSAlex Elder snap_name += strlen(snap_name) + 1; 12189682fc6dSAlex Elder 12199682fc6dSAlex Elder return kstrdup(snap_name, GFP_KERNEL); 12209682fc6dSAlex Elder } 12219682fc6dSAlex Elder 122230d1cff8SAlex Elder /* 122330d1cff8SAlex Elder * Snapshot id comparison function for use with qsort()/bsearch(). 122430d1cff8SAlex Elder * Note that result is for snapshots in *descending* order. 122530d1cff8SAlex Elder */ 122630d1cff8SAlex Elder static int snapid_compare_reverse(const void *s1, const void *s2) 122730d1cff8SAlex Elder { 122830d1cff8SAlex Elder u64 snap_id1 = *(u64 *)s1; 122930d1cff8SAlex Elder u64 snap_id2 = *(u64 *)s2; 123030d1cff8SAlex Elder 123130d1cff8SAlex Elder if (snap_id1 < snap_id2) 123230d1cff8SAlex Elder return 1; 123330d1cff8SAlex Elder return snap_id1 == snap_id2 ? 0 : -1; 123430d1cff8SAlex Elder } 123530d1cff8SAlex Elder 123630d1cff8SAlex Elder /* 123730d1cff8SAlex Elder * Search a snapshot context to see if the given snapshot id is 123830d1cff8SAlex Elder * present. 123930d1cff8SAlex Elder * 124030d1cff8SAlex Elder * Returns the position of the snapshot id in the array if it's found, 124130d1cff8SAlex Elder * or BAD_SNAP_INDEX otherwise. 124230d1cff8SAlex Elder * 124330d1cff8SAlex Elder * Note: The snapshot array is in kept sorted (by the osd) in 124430d1cff8SAlex Elder * reverse order, highest snapshot id first. 124530d1cff8SAlex Elder */ 12469682fc6dSAlex Elder static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id) 12479682fc6dSAlex Elder { 12489682fc6dSAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 124930d1cff8SAlex Elder u64 *found; 12509682fc6dSAlex Elder 125130d1cff8SAlex Elder found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps, 125230d1cff8SAlex Elder sizeof (snap_id), snapid_compare_reverse); 12539682fc6dSAlex Elder 125430d1cff8SAlex Elder return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX; 12559682fc6dSAlex Elder } 12569682fc6dSAlex Elder 12572ad3d716SAlex Elder static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, 12582ad3d716SAlex Elder u64 snap_id) 125954cac61fSAlex Elder { 126054cac61fSAlex Elder u32 which; 1261da6a6b63SJosh Durgin const char *snap_name; 126254cac61fSAlex Elder 126354cac61fSAlex Elder which = rbd_dev_snap_index(rbd_dev, snap_id); 126454cac61fSAlex Elder if (which == BAD_SNAP_INDEX) 1265da6a6b63SJosh Durgin return ERR_PTR(-ENOENT); 126654cac61fSAlex Elder 1267da6a6b63SJosh Durgin snap_name = _rbd_dev_v1_snap_name(rbd_dev, which); 1268da6a6b63SJosh Durgin return snap_name ? snap_name : ERR_PTR(-ENOMEM); 126954cac61fSAlex Elder } 127054cac61fSAlex Elder 12719e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id) 12729e15b77dSAlex Elder { 12739e15b77dSAlex Elder if (snap_id == CEPH_NOSNAP) 12749e15b77dSAlex Elder return RBD_SNAP_HEAD_NAME; 12759e15b77dSAlex Elder 127654cac61fSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 127754cac61fSAlex Elder if (rbd_dev->image_format == 1) 127854cac61fSAlex Elder return rbd_dev_v1_snap_name(rbd_dev, snap_id); 12799e15b77dSAlex Elder 128054cac61fSAlex Elder return rbd_dev_v2_snap_name(rbd_dev, snap_id); 12819e15b77dSAlex Elder } 12829e15b77dSAlex Elder 12832ad3d716SAlex Elder static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 12842ad3d716SAlex Elder u64 *snap_size) 1285602adf40SYehuda Sadeh { 12862ad3d716SAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 12872ad3d716SAlex Elder if (snap_id == CEPH_NOSNAP) { 12882ad3d716SAlex Elder *snap_size = rbd_dev->header.image_size; 12892ad3d716SAlex Elder } else if (rbd_dev->image_format == 1) { 12902ad3d716SAlex Elder u32 which; 129100f1f36fSAlex Elder 12922ad3d716SAlex Elder which = rbd_dev_snap_index(rbd_dev, snap_id); 12932ad3d716SAlex Elder if (which == BAD_SNAP_INDEX) 12942ad3d716SAlex Elder return -ENOENT; 129500f1f36fSAlex Elder 12962ad3d716SAlex Elder *snap_size = rbd_dev->header.snap_sizes[which]; 12972ad3d716SAlex Elder } else { 12982ad3d716SAlex Elder u64 size = 0; 12992ad3d716SAlex Elder int ret; 13002ad3d716SAlex Elder 13012ad3d716SAlex Elder ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size); 13022ad3d716SAlex Elder if (ret) 13032ad3d716SAlex Elder return ret; 13042ad3d716SAlex Elder 13052ad3d716SAlex Elder *snap_size = size; 13062ad3d716SAlex Elder } 13072ad3d716SAlex Elder return 0; 13082ad3d716SAlex Elder } 13092ad3d716SAlex Elder 13102ad3d716SAlex Elder static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 13112ad3d716SAlex Elder u64 *snap_features) 13122ad3d716SAlex Elder { 13132ad3d716SAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 13142ad3d716SAlex Elder if (snap_id == CEPH_NOSNAP) { 13152ad3d716SAlex Elder *snap_features = rbd_dev->header.features; 13162ad3d716SAlex Elder } else if (rbd_dev->image_format == 1) { 13172ad3d716SAlex Elder *snap_features = 0; /* No features for format 1 */ 13182ad3d716SAlex Elder } else { 13192ad3d716SAlex Elder u64 features = 0; 13202ad3d716SAlex Elder int ret; 13212ad3d716SAlex Elder 13222ad3d716SAlex Elder ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features); 13232ad3d716SAlex Elder if (ret) 13242ad3d716SAlex Elder return ret; 13252ad3d716SAlex Elder 13262ad3d716SAlex Elder *snap_features = features; 13272ad3d716SAlex Elder } 13282ad3d716SAlex Elder return 0; 132900f1f36fSAlex Elder } 1330602adf40SYehuda Sadeh 1331d1cf5788SAlex Elder static int rbd_dev_mapping_set(struct rbd_device *rbd_dev) 1332602adf40SYehuda Sadeh { 13338f4b7d98SAlex Elder u64 snap_id = rbd_dev->spec->snap_id; 13342ad3d716SAlex Elder u64 size = 0; 13352ad3d716SAlex Elder u64 features = 0; 13362ad3d716SAlex Elder int ret; 13378b0241f8SAlex Elder 13382ad3d716SAlex Elder ret = rbd_snap_size(rbd_dev, snap_id, &size); 13392ad3d716SAlex Elder if (ret) 13402ad3d716SAlex Elder return ret; 13412ad3d716SAlex Elder ret = rbd_snap_features(rbd_dev, snap_id, &features); 13422ad3d716SAlex Elder if (ret) 13432ad3d716SAlex Elder return ret; 13442ad3d716SAlex Elder 13452ad3d716SAlex Elder rbd_dev->mapping.size = size; 13462ad3d716SAlex Elder rbd_dev->mapping.features = features; 13472ad3d716SAlex Elder 13488b0241f8SAlex Elder return 0; 1349602adf40SYehuda Sadeh } 1350602adf40SYehuda Sadeh 1351d1cf5788SAlex Elder static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev) 1352d1cf5788SAlex Elder { 1353d1cf5788SAlex Elder rbd_dev->mapping.size = 0; 1354d1cf5788SAlex Elder rbd_dev->mapping.features = 0; 1355200a6a8bSAlex Elder } 1356200a6a8bSAlex Elder 13575359a17dSIlya Dryomov static void zero_bvec(struct bio_vec *bv) 135865ccfe21SAlex Elder { 1359602adf40SYehuda Sadeh void *buf; 13605359a17dSIlya Dryomov unsigned long flags; 1361602adf40SYehuda Sadeh 13625359a17dSIlya Dryomov buf = bvec_kmap_irq(bv, &flags); 13635359a17dSIlya Dryomov memset(buf, 0, bv->bv_len); 13645359a17dSIlya Dryomov flush_dcache_page(bv->bv_page); 136585b5aaa6SDan Carpenter bvec_kunmap_irq(buf, &flags); 1366602adf40SYehuda Sadeh } 1367602adf40SYehuda Sadeh 13685359a17dSIlya Dryomov static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes) 1369b9434c5bSAlex Elder { 13705359a17dSIlya Dryomov struct ceph_bio_iter it = *bio_pos; 1371b9434c5bSAlex Elder 13725359a17dSIlya Dryomov ceph_bio_iter_advance(&it, off); 13735359a17dSIlya Dryomov ceph_bio_iter_advance_step(&it, bytes, ({ 13745359a17dSIlya Dryomov zero_bvec(&bv); 13755359a17dSIlya Dryomov })); 1376b9434c5bSAlex Elder } 1377b9434c5bSAlex Elder 13787e07efb1SIlya Dryomov static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes) 1379602adf40SYehuda Sadeh { 13807e07efb1SIlya Dryomov struct ceph_bvec_iter it = *bvec_pos; 1381602adf40SYehuda Sadeh 13827e07efb1SIlya Dryomov ceph_bvec_iter_advance(&it, off); 13837e07efb1SIlya Dryomov ceph_bvec_iter_advance_step(&it, bytes, ({ 13847e07efb1SIlya Dryomov zero_bvec(&bv); 13857e07efb1SIlya Dryomov })); 1386602adf40SYehuda Sadeh } 1387602adf40SYehuda Sadeh 1388f7760dadSAlex Elder /* 13893da691bfSIlya Dryomov * Zero a range in @obj_req data buffer defined by a bio (list) or 1390afb97888SIlya Dryomov * (private) bio_vec array. 1391f7760dadSAlex Elder * 13923da691bfSIlya Dryomov * @off is relative to the start of the data buffer. 1393f7760dadSAlex Elder */ 13943da691bfSIlya Dryomov static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off, 13953da691bfSIlya Dryomov u32 bytes) 1396f7760dadSAlex Elder { 139754ab3b24SIlya Dryomov dout("%s %p data buf %u~%u\n", __func__, obj_req, off, bytes); 139854ab3b24SIlya Dryomov 1399ecc633caSIlya Dryomov switch (obj_req->img_request->data_type) { 14003da691bfSIlya Dryomov case OBJ_REQUEST_BIO: 14013da691bfSIlya Dryomov zero_bios(&obj_req->bio_pos, off, bytes); 14023da691bfSIlya Dryomov break; 14033da691bfSIlya Dryomov case OBJ_REQUEST_BVECS: 1404afb97888SIlya Dryomov case OBJ_REQUEST_OWN_BVECS: 14053da691bfSIlya Dryomov zero_bvecs(&obj_req->bvec_pos, off, bytes); 14063da691bfSIlya Dryomov break; 14073da691bfSIlya Dryomov default: 140816809372SArnd Bergmann BUG(); 1409f5400b7aSAlex Elder } 1410bf0d5f50SAlex Elder } 1411bf0d5f50SAlex Elder 1412bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref); 1413bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request) 1414bf0d5f50SAlex Elder { 1415bf0d5f50SAlex Elder rbd_assert(obj_request != NULL); 141637206ee5SAlex Elder dout("%s: obj %p (was %d)\n", __func__, obj_request, 14172c935bc5SPeter Zijlstra kref_read(&obj_request->kref)); 1418bf0d5f50SAlex Elder kref_put(&obj_request->kref, rbd_obj_request_destroy); 1419bf0d5f50SAlex Elder } 1420bf0d5f50SAlex Elder 1421bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref); 1422bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request) 1423bf0d5f50SAlex Elder { 1424bf0d5f50SAlex Elder rbd_assert(img_request != NULL); 142537206ee5SAlex Elder dout("%s: img %p (was %d)\n", __func__, img_request, 14262c935bc5SPeter Zijlstra kref_read(&img_request->kref)); 1427bf0d5f50SAlex Elder kref_put(&img_request->kref, rbd_img_request_destroy); 1428bf0d5f50SAlex Elder } 1429bf0d5f50SAlex Elder 1430bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, 1431bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1432bf0d5f50SAlex Elder { 143325dcf954SAlex Elder rbd_assert(obj_request->img_request == NULL); 143425dcf954SAlex Elder 1435b155e86cSAlex Elder /* Image request now owns object's original reference */ 1436bf0d5f50SAlex Elder obj_request->img_request = img_request; 143715961b44SIlya Dryomov dout("%s: img %p obj %p\n", __func__, img_request, obj_request); 1438bf0d5f50SAlex Elder } 1439bf0d5f50SAlex Elder 1440bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request, 1441bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1442bf0d5f50SAlex Elder { 144315961b44SIlya Dryomov dout("%s: img %p obj %p\n", __func__, img_request, obj_request); 144443df3d35SIlya Dryomov list_del(&obj_request->ex.oe_item); 1445bf0d5f50SAlex Elder rbd_assert(obj_request->img_request == img_request); 1446bf0d5f50SAlex Elder rbd_obj_request_put(obj_request); 1447bf0d5f50SAlex Elder } 1448bf0d5f50SAlex Elder 1449a086a1b8SIlya Dryomov static void rbd_osd_submit(struct ceph_osd_request *osd_req) 1450bf0d5f50SAlex Elder { 1451a086a1b8SIlya Dryomov struct rbd_obj_request *obj_req = osd_req->r_priv; 1452980917fcSIlya Dryomov 1453a086a1b8SIlya Dryomov dout("%s osd_req %p for obj_req %p objno %llu %llu~%llu\n", 1454a086a1b8SIlya Dryomov __func__, osd_req, obj_req, obj_req->ex.oe_objno, 1455a086a1b8SIlya Dryomov obj_req->ex.oe_off, obj_req->ex.oe_len); 1456980917fcSIlya Dryomov ceph_osdc_start_request(osd_req->r_osdc, osd_req, false); 1457bf0d5f50SAlex Elder } 1458bf0d5f50SAlex Elder 14590c425248SAlex Elder /* 14600c425248SAlex Elder * The default/initial value for all image request flags is 0. Each 14610c425248SAlex Elder * is conditionally set to 1 at image request initialization time 14620c425248SAlex Elder * and currently never change thereafter. 14630c425248SAlex Elder */ 1464d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request) 1465d0b2e944SAlex Elder { 1466d0b2e944SAlex Elder set_bit(IMG_REQ_LAYERED, &img_request->flags); 1467d0b2e944SAlex Elder smp_mb(); 1468d0b2e944SAlex Elder } 1469d0b2e944SAlex Elder 1470a2acd00eSAlex Elder static void img_request_layered_clear(struct rbd_img_request *img_request) 1471a2acd00eSAlex Elder { 1472a2acd00eSAlex Elder clear_bit(IMG_REQ_LAYERED, &img_request->flags); 1473a2acd00eSAlex Elder smp_mb(); 1474a2acd00eSAlex Elder } 1475a2acd00eSAlex Elder 1476d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request) 1477d0b2e944SAlex Elder { 1478d0b2e944SAlex Elder smp_mb(); 1479d0b2e944SAlex Elder return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0; 1480d0b2e944SAlex Elder } 1481d0b2e944SAlex Elder 14823da691bfSIlya Dryomov static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req) 14833b434a2aSJosh Durgin { 14843da691bfSIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 14853da691bfSIlya Dryomov 148643df3d35SIlya Dryomov return !obj_req->ex.oe_off && 148743df3d35SIlya Dryomov obj_req->ex.oe_len == rbd_dev->layout.object_size; 14883b434a2aSJosh Durgin } 14893b434a2aSJosh Durgin 14903da691bfSIlya Dryomov static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req) 14916e2a4505SAlex Elder { 14923da691bfSIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 1493b9434c5bSAlex Elder 149443df3d35SIlya Dryomov return obj_req->ex.oe_off + obj_req->ex.oe_len == 14953da691bfSIlya Dryomov rbd_dev->layout.object_size; 14966e2a4505SAlex Elder } 14976e2a4505SAlex Elder 149813488d53SIlya Dryomov /* 149913488d53SIlya Dryomov * Must be called after rbd_obj_calc_img_extents(). 150013488d53SIlya Dryomov */ 150113488d53SIlya Dryomov static bool rbd_obj_copyup_enabled(struct rbd_obj_request *obj_req) 150213488d53SIlya Dryomov { 150313488d53SIlya Dryomov if (!obj_req->num_img_extents || 15049b17eb2cSIlya Dryomov (rbd_obj_is_entire(obj_req) && 15059b17eb2cSIlya Dryomov !obj_req->img_request->snapc->num_snaps)) 150613488d53SIlya Dryomov return false; 150713488d53SIlya Dryomov 150813488d53SIlya Dryomov return true; 150913488d53SIlya Dryomov } 151013488d53SIlya Dryomov 151186bd7998SIlya Dryomov static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req) 1512bf0d5f50SAlex Elder { 151386bd7998SIlya Dryomov return ceph_file_extents_bytes(obj_req->img_extents, 151486bd7998SIlya Dryomov obj_req->num_img_extents); 1515bf0d5f50SAlex Elder } 1516bf0d5f50SAlex Elder 15173da691bfSIlya Dryomov static bool rbd_img_is_write(struct rbd_img_request *img_req) 15180dcc685eSIlya Dryomov { 15199bb0248dSIlya Dryomov switch (img_req->op_type) { 15203da691bfSIlya Dryomov case OBJ_OP_READ: 15213da691bfSIlya Dryomov return false; 15223da691bfSIlya Dryomov case OBJ_OP_WRITE: 15233da691bfSIlya Dryomov case OBJ_OP_DISCARD: 15246484cbe9SIlya Dryomov case OBJ_OP_ZEROOUT: 15253da691bfSIlya Dryomov return true; 15263da691bfSIlya Dryomov default: 1527c6244b3bSArnd Bergmann BUG(); 15280dcc685eSIlya Dryomov } 15290dcc685eSIlya Dryomov } 15300dcc685eSIlya Dryomov 153185e084feSIlya Dryomov static void rbd_osd_req_callback(struct ceph_osd_request *osd_req) 1532bf0d5f50SAlex Elder { 15333da691bfSIlya Dryomov struct rbd_obj_request *obj_req = osd_req->r_priv; 153454ab3b24SIlya Dryomov int result; 1535bf0d5f50SAlex Elder 15363da691bfSIlya Dryomov dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req, 15373da691bfSIlya Dryomov osd_req->r_result, obj_req); 1538bf0d5f50SAlex Elder 1539c47f9371SAlex Elder /* 15403da691bfSIlya Dryomov * Writes aren't allowed to return a data payload. In some 15413da691bfSIlya Dryomov * guarded write cases (e.g. stat + zero on an empty object) 15423da691bfSIlya Dryomov * a stat response makes it through, but we don't care. 1543c47f9371SAlex Elder */ 154454ab3b24SIlya Dryomov if (osd_req->r_result > 0 && rbd_img_is_write(obj_req->img_request)) 154554ab3b24SIlya Dryomov result = 0; 154654ab3b24SIlya Dryomov else 154754ab3b24SIlya Dryomov result = osd_req->r_result; 15480ccd5926SIlya Dryomov 154954ab3b24SIlya Dryomov rbd_obj_handle_request(obj_req, result); 1550bf0d5f50SAlex Elder } 1551bf0d5f50SAlex Elder 1552bcbab1dbSIlya Dryomov static void rbd_osd_format_read(struct ceph_osd_request *osd_req) 1553430c28c3SAlex Elder { 1554bcbab1dbSIlya Dryomov struct rbd_obj_request *obj_request = osd_req->r_priv; 1555430c28c3SAlex Elder 1556a162b308SIlya Dryomov osd_req->r_flags = CEPH_OSD_FLAG_READ; 15577c84883aSIlya Dryomov osd_req->r_snapid = obj_request->img_request->snap_id; 15589d4df01fSAlex Elder } 15599d4df01fSAlex Elder 1560bcbab1dbSIlya Dryomov static void rbd_osd_format_write(struct ceph_osd_request *osd_req) 15619d4df01fSAlex Elder { 1562bcbab1dbSIlya Dryomov struct rbd_obj_request *obj_request = osd_req->r_priv; 15639d4df01fSAlex Elder 1564a162b308SIlya Dryomov osd_req->r_flags = CEPH_OSD_FLAG_WRITE; 1565fac02ddfSArnd Bergmann ktime_get_real_ts64(&osd_req->r_mtime); 156643df3d35SIlya Dryomov osd_req->r_data_offset = obj_request->ex.oe_off; 1567430c28c3SAlex Elder } 1568430c28c3SAlex Elder 1569bc81207eSIlya Dryomov static struct ceph_osd_request * 1570bcbab1dbSIlya Dryomov __rbd_obj_add_osd_request(struct rbd_obj_request *obj_req, 1571bcbab1dbSIlya Dryomov struct ceph_snap_context *snapc, int num_ops) 1572bc81207eSIlya Dryomov { 1573e28eded5SIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 1574bc81207eSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 1575bc81207eSIlya Dryomov struct ceph_osd_request *req; 1576a90bb0c1SIlya Dryomov const char *name_format = rbd_dev->image_format == 1 ? 1577a90bb0c1SIlya Dryomov RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT; 1578bcbab1dbSIlya Dryomov int ret; 1579bc81207eSIlya Dryomov 1580e28eded5SIlya Dryomov req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO); 1581bc81207eSIlya Dryomov if (!req) 1582bcbab1dbSIlya Dryomov return ERR_PTR(-ENOMEM); 1583bc81207eSIlya Dryomov 1584bcbab1dbSIlya Dryomov list_add_tail(&req->r_private_item, &obj_req->osd_reqs); 1585bc81207eSIlya Dryomov req->r_callback = rbd_osd_req_callback; 1586a162b308SIlya Dryomov req->r_priv = obj_req; 1587bc81207eSIlya Dryomov 1588b26c047bSIlya Dryomov /* 1589b26c047bSIlya Dryomov * Data objects may be stored in a separate pool, but always in 1590b26c047bSIlya Dryomov * the same namespace in that pool as the header in its pool. 1591b26c047bSIlya Dryomov */ 1592b26c047bSIlya Dryomov ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc); 1593bc81207eSIlya Dryomov req->r_base_oloc.pool = rbd_dev->layout.pool_id; 1594b26c047bSIlya Dryomov 1595bcbab1dbSIlya Dryomov ret = ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format, 1596bcbab1dbSIlya Dryomov rbd_dev->header.object_prefix, 1597bcbab1dbSIlya Dryomov obj_req->ex.oe_objno); 1598bcbab1dbSIlya Dryomov if (ret) 1599bcbab1dbSIlya Dryomov return ERR_PTR(ret); 1600bc81207eSIlya Dryomov 1601bc81207eSIlya Dryomov return req; 1602bc81207eSIlya Dryomov } 1603bc81207eSIlya Dryomov 1604e28eded5SIlya Dryomov static struct ceph_osd_request * 1605bcbab1dbSIlya Dryomov rbd_obj_add_osd_request(struct rbd_obj_request *obj_req, int num_ops) 1606e28eded5SIlya Dryomov { 1607bcbab1dbSIlya Dryomov return __rbd_obj_add_osd_request(obj_req, obj_req->img_request->snapc, 1608e28eded5SIlya Dryomov num_ops); 1609e28eded5SIlya Dryomov } 1610e28eded5SIlya Dryomov 1611ecc633caSIlya Dryomov static struct rbd_obj_request *rbd_obj_request_create(void) 1612bf0d5f50SAlex Elder { 1613bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 1614bf0d5f50SAlex Elder 16155a60e876SIlya Dryomov obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO); 16166c696d85SIlya Dryomov if (!obj_request) 1617f907ad55SAlex Elder return NULL; 1618f907ad55SAlex Elder 161943df3d35SIlya Dryomov ceph_object_extent_init(&obj_request->ex); 1620bcbab1dbSIlya Dryomov INIT_LIST_HEAD(&obj_request->osd_reqs); 162185b5e6d1SIlya Dryomov mutex_init(&obj_request->state_mutex); 1622bf0d5f50SAlex Elder kref_init(&obj_request->kref); 1623bf0d5f50SAlex Elder 162467e2b652SIlya Dryomov dout("%s %p\n", __func__, obj_request); 1625bf0d5f50SAlex Elder return obj_request; 1626bf0d5f50SAlex Elder } 1627bf0d5f50SAlex Elder 1628bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref) 1629bf0d5f50SAlex Elder { 1630bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 1631bcbab1dbSIlya Dryomov struct ceph_osd_request *osd_req; 16327e07efb1SIlya Dryomov u32 i; 1633bf0d5f50SAlex Elder 1634bf0d5f50SAlex Elder obj_request = container_of(kref, struct rbd_obj_request, kref); 1635bf0d5f50SAlex Elder 163637206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request); 163737206ee5SAlex Elder 1638bcbab1dbSIlya Dryomov while (!list_empty(&obj_request->osd_reqs)) { 1639bcbab1dbSIlya Dryomov osd_req = list_first_entry(&obj_request->osd_reqs, 1640bcbab1dbSIlya Dryomov struct ceph_osd_request, r_private_item); 1641bcbab1dbSIlya Dryomov list_del_init(&osd_req->r_private_item); 1642bcbab1dbSIlya Dryomov ceph_osdc_put_request(osd_req); 1643bcbab1dbSIlya Dryomov } 1644bf0d5f50SAlex Elder 1645ecc633caSIlya Dryomov switch (obj_request->img_request->data_type) { 16469969ebc5SAlex Elder case OBJ_REQUEST_NODATA: 1647bf0d5f50SAlex Elder case OBJ_REQUEST_BIO: 16487e07efb1SIlya Dryomov case OBJ_REQUEST_BVECS: 16495359a17dSIlya Dryomov break; /* Nothing to do */ 1650afb97888SIlya Dryomov case OBJ_REQUEST_OWN_BVECS: 1651afb97888SIlya Dryomov kfree(obj_request->bvec_pos.bvecs); 1652bf0d5f50SAlex Elder break; 16537e07efb1SIlya Dryomov default: 165416809372SArnd Bergmann BUG(); 1655bf0d5f50SAlex Elder } 1656bf0d5f50SAlex Elder 165786bd7998SIlya Dryomov kfree(obj_request->img_extents); 16587e07efb1SIlya Dryomov if (obj_request->copyup_bvecs) { 16597e07efb1SIlya Dryomov for (i = 0; i < obj_request->copyup_bvec_count; i++) { 16607e07efb1SIlya Dryomov if (obj_request->copyup_bvecs[i].bv_page) 16617e07efb1SIlya Dryomov __free_page(obj_request->copyup_bvecs[i].bv_page); 16627e07efb1SIlya Dryomov } 16637e07efb1SIlya Dryomov kfree(obj_request->copyup_bvecs); 1664bf0d5f50SAlex Elder } 1665bf0d5f50SAlex Elder 1666868311b1SAlex Elder kmem_cache_free(rbd_obj_request_cache, obj_request); 1667bf0d5f50SAlex Elder } 1668bf0d5f50SAlex Elder 1669fb65d228SAlex Elder /* It's OK to call this for a device with no parent */ 1670fb65d228SAlex Elder 1671fb65d228SAlex Elder static void rbd_spec_put(struct rbd_spec *spec); 1672fb65d228SAlex Elder static void rbd_dev_unparent(struct rbd_device *rbd_dev) 1673fb65d228SAlex Elder { 1674fb65d228SAlex Elder rbd_dev_remove_parent(rbd_dev); 1675fb65d228SAlex Elder rbd_spec_put(rbd_dev->parent_spec); 1676fb65d228SAlex Elder rbd_dev->parent_spec = NULL; 1677fb65d228SAlex Elder rbd_dev->parent_overlap = 0; 1678fb65d228SAlex Elder } 1679fb65d228SAlex Elder 1680bf0d5f50SAlex Elder /* 1681a2acd00eSAlex Elder * Parent image reference counting is used to determine when an 1682a2acd00eSAlex Elder * image's parent fields can be safely torn down--after there are no 1683a2acd00eSAlex Elder * more in-flight requests to the parent image. When the last 1684a2acd00eSAlex Elder * reference is dropped, cleaning them up is safe. 1685a2acd00eSAlex Elder */ 1686a2acd00eSAlex Elder static void rbd_dev_parent_put(struct rbd_device *rbd_dev) 1687a2acd00eSAlex Elder { 1688a2acd00eSAlex Elder int counter; 1689a2acd00eSAlex Elder 1690a2acd00eSAlex Elder if (!rbd_dev->parent_spec) 1691a2acd00eSAlex Elder return; 1692a2acd00eSAlex Elder 1693a2acd00eSAlex Elder counter = atomic_dec_return_safe(&rbd_dev->parent_ref); 1694a2acd00eSAlex Elder if (counter > 0) 1695a2acd00eSAlex Elder return; 1696a2acd00eSAlex Elder 1697a2acd00eSAlex Elder /* Last reference; clean up parent data structures */ 1698a2acd00eSAlex Elder 1699a2acd00eSAlex Elder if (!counter) 1700a2acd00eSAlex Elder rbd_dev_unparent(rbd_dev); 1701a2acd00eSAlex Elder else 17029584d508SIlya Dryomov rbd_warn(rbd_dev, "parent reference underflow"); 1703a2acd00eSAlex Elder } 1704a2acd00eSAlex Elder 1705a2acd00eSAlex Elder /* 1706a2acd00eSAlex Elder * If an image has a non-zero parent overlap, get a reference to its 1707a2acd00eSAlex Elder * parent. 1708a2acd00eSAlex Elder * 1709a2acd00eSAlex Elder * Returns true if the rbd device has a parent with a non-zero 1710a2acd00eSAlex Elder * overlap and a reference for it was successfully taken, or 1711a2acd00eSAlex Elder * false otherwise. 1712a2acd00eSAlex Elder */ 1713a2acd00eSAlex Elder static bool rbd_dev_parent_get(struct rbd_device *rbd_dev) 1714a2acd00eSAlex Elder { 1715ae43e9d0SIlya Dryomov int counter = 0; 1716a2acd00eSAlex Elder 1717a2acd00eSAlex Elder if (!rbd_dev->parent_spec) 1718a2acd00eSAlex Elder return false; 1719a2acd00eSAlex Elder 1720ae43e9d0SIlya Dryomov down_read(&rbd_dev->header_rwsem); 1721ae43e9d0SIlya Dryomov if (rbd_dev->parent_overlap) 1722a2acd00eSAlex Elder counter = atomic_inc_return_safe(&rbd_dev->parent_ref); 1723ae43e9d0SIlya Dryomov up_read(&rbd_dev->header_rwsem); 1724a2acd00eSAlex Elder 1725a2acd00eSAlex Elder if (counter < 0) 17269584d508SIlya Dryomov rbd_warn(rbd_dev, "parent reference overflow"); 1727a2acd00eSAlex Elder 1728ae43e9d0SIlya Dryomov return counter > 0; 1729a2acd00eSAlex Elder } 1730a2acd00eSAlex Elder 1731bf0d5f50SAlex Elder /* 1732bf0d5f50SAlex Elder * Caller is responsible for filling in the list of object requests 1733bf0d5f50SAlex Elder * that comprises the image request, and the Linux request pointer 1734bf0d5f50SAlex Elder * (if there is one). 1735bf0d5f50SAlex Elder */ 1736cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create( 1737cc344fa1SAlex Elder struct rbd_device *rbd_dev, 17386d2940c8SGuangliang Zhao enum obj_operation_type op_type, 17394e752f0aSJosh Durgin struct ceph_snap_context *snapc) 1740bf0d5f50SAlex Elder { 1741bf0d5f50SAlex Elder struct rbd_img_request *img_request; 1742bf0d5f50SAlex Elder 1743a0c5895bSIlya Dryomov img_request = kmem_cache_zalloc(rbd_img_request_cache, GFP_NOIO); 1744bf0d5f50SAlex Elder if (!img_request) 1745bf0d5f50SAlex Elder return NULL; 1746bf0d5f50SAlex Elder 1747bf0d5f50SAlex Elder img_request->rbd_dev = rbd_dev; 17489bb0248dSIlya Dryomov img_request->op_type = op_type; 17499bb0248dSIlya Dryomov if (!rbd_img_is_write(img_request)) 1750bf0d5f50SAlex Elder img_request->snap_id = rbd_dev->spec->snap_id; 17519bb0248dSIlya Dryomov else 17529bb0248dSIlya Dryomov img_request->snapc = snapc; 17539bb0248dSIlya Dryomov 1754a2acd00eSAlex Elder if (rbd_dev_parent_get(rbd_dev)) 1755d0b2e944SAlex Elder img_request_layered_set(img_request); 1756a0c5895bSIlya Dryomov 1757e1fddc8fSIlya Dryomov INIT_LIST_HEAD(&img_request->lock_item); 175843df3d35SIlya Dryomov INIT_LIST_HEAD(&img_request->object_extents); 17590192ce2eSIlya Dryomov mutex_init(&img_request->state_mutex); 1760bf0d5f50SAlex Elder kref_init(&img_request->kref); 1761bf0d5f50SAlex Elder 1762bf0d5f50SAlex Elder return img_request; 1763bf0d5f50SAlex Elder } 1764bf0d5f50SAlex Elder 1765bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref) 1766bf0d5f50SAlex Elder { 1767bf0d5f50SAlex Elder struct rbd_img_request *img_request; 1768bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 1769bf0d5f50SAlex Elder struct rbd_obj_request *next_obj_request; 1770bf0d5f50SAlex Elder 1771bf0d5f50SAlex Elder img_request = container_of(kref, struct rbd_img_request, kref); 1772bf0d5f50SAlex Elder 177337206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 177437206ee5SAlex Elder 1775e1fddc8fSIlya Dryomov WARN_ON(!list_empty(&img_request->lock_item)); 1776bf0d5f50SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) 1777bf0d5f50SAlex Elder rbd_img_obj_request_del(img_request, obj_request); 1778bf0d5f50SAlex Elder 1779a2acd00eSAlex Elder if (img_request_layered_test(img_request)) { 1780a2acd00eSAlex Elder img_request_layered_clear(img_request); 1781a2acd00eSAlex Elder rbd_dev_parent_put(img_request->rbd_dev); 1782a2acd00eSAlex Elder } 1783a2acd00eSAlex Elder 17849bb0248dSIlya Dryomov if (rbd_img_is_write(img_request)) 1785812164f8SAlex Elder ceph_put_snap_context(img_request->snapc); 1786bf0d5f50SAlex Elder 17871c2a9dfeSAlex Elder kmem_cache_free(rbd_img_request_cache, img_request); 1788bf0d5f50SAlex Elder } 1789bf0d5f50SAlex Elder 179022e8bd51SIlya Dryomov #define BITS_PER_OBJ 2 179122e8bd51SIlya Dryomov #define OBJS_PER_BYTE (BITS_PER_BYTE / BITS_PER_OBJ) 179222e8bd51SIlya Dryomov #define OBJ_MASK ((1 << BITS_PER_OBJ) - 1) 179322e8bd51SIlya Dryomov 179422e8bd51SIlya Dryomov static void __rbd_object_map_index(struct rbd_device *rbd_dev, u64 objno, 179522e8bd51SIlya Dryomov u64 *index, u8 *shift) 179622e8bd51SIlya Dryomov { 179722e8bd51SIlya Dryomov u32 off; 179822e8bd51SIlya Dryomov 179922e8bd51SIlya Dryomov rbd_assert(objno < rbd_dev->object_map_size); 180022e8bd51SIlya Dryomov *index = div_u64_rem(objno, OBJS_PER_BYTE, &off); 180122e8bd51SIlya Dryomov *shift = (OBJS_PER_BYTE - off - 1) * BITS_PER_OBJ; 180222e8bd51SIlya Dryomov } 180322e8bd51SIlya Dryomov 180422e8bd51SIlya Dryomov static u8 __rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno) 180522e8bd51SIlya Dryomov { 180622e8bd51SIlya Dryomov u64 index; 180722e8bd51SIlya Dryomov u8 shift; 180822e8bd51SIlya Dryomov 180922e8bd51SIlya Dryomov lockdep_assert_held(&rbd_dev->object_map_lock); 181022e8bd51SIlya Dryomov __rbd_object_map_index(rbd_dev, objno, &index, &shift); 181122e8bd51SIlya Dryomov return (rbd_dev->object_map[index] >> shift) & OBJ_MASK; 181222e8bd51SIlya Dryomov } 181322e8bd51SIlya Dryomov 181422e8bd51SIlya Dryomov static void __rbd_object_map_set(struct rbd_device *rbd_dev, u64 objno, u8 val) 181522e8bd51SIlya Dryomov { 181622e8bd51SIlya Dryomov u64 index; 181722e8bd51SIlya Dryomov u8 shift; 181822e8bd51SIlya Dryomov u8 *p; 181922e8bd51SIlya Dryomov 182022e8bd51SIlya Dryomov lockdep_assert_held(&rbd_dev->object_map_lock); 182122e8bd51SIlya Dryomov rbd_assert(!(val & ~OBJ_MASK)); 182222e8bd51SIlya Dryomov 182322e8bd51SIlya Dryomov __rbd_object_map_index(rbd_dev, objno, &index, &shift); 182422e8bd51SIlya Dryomov p = &rbd_dev->object_map[index]; 182522e8bd51SIlya Dryomov *p = (*p & ~(OBJ_MASK << shift)) | (val << shift); 182622e8bd51SIlya Dryomov } 182722e8bd51SIlya Dryomov 182822e8bd51SIlya Dryomov static u8 rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno) 182922e8bd51SIlya Dryomov { 183022e8bd51SIlya Dryomov u8 state; 183122e8bd51SIlya Dryomov 183222e8bd51SIlya Dryomov spin_lock(&rbd_dev->object_map_lock); 183322e8bd51SIlya Dryomov state = __rbd_object_map_get(rbd_dev, objno); 183422e8bd51SIlya Dryomov spin_unlock(&rbd_dev->object_map_lock); 183522e8bd51SIlya Dryomov return state; 183622e8bd51SIlya Dryomov } 183722e8bd51SIlya Dryomov 183822e8bd51SIlya Dryomov static bool use_object_map(struct rbd_device *rbd_dev) 183922e8bd51SIlya Dryomov { 184022e8bd51SIlya Dryomov return ((rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) && 184122e8bd51SIlya Dryomov !(rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID)); 184222e8bd51SIlya Dryomov } 184322e8bd51SIlya Dryomov 184422e8bd51SIlya Dryomov static bool rbd_object_map_may_exist(struct rbd_device *rbd_dev, u64 objno) 184522e8bd51SIlya Dryomov { 184622e8bd51SIlya Dryomov u8 state; 184722e8bd51SIlya Dryomov 184822e8bd51SIlya Dryomov /* fall back to default logic if object map is disabled or invalid */ 184922e8bd51SIlya Dryomov if (!use_object_map(rbd_dev)) 185022e8bd51SIlya Dryomov return true; 185122e8bd51SIlya Dryomov 185222e8bd51SIlya Dryomov state = rbd_object_map_get(rbd_dev, objno); 185322e8bd51SIlya Dryomov return state != OBJECT_NONEXISTENT; 185422e8bd51SIlya Dryomov } 185522e8bd51SIlya Dryomov 185622e8bd51SIlya Dryomov static void rbd_object_map_name(struct rbd_device *rbd_dev, u64 snap_id, 185722e8bd51SIlya Dryomov struct ceph_object_id *oid) 185822e8bd51SIlya Dryomov { 185922e8bd51SIlya Dryomov if (snap_id == CEPH_NOSNAP) 186022e8bd51SIlya Dryomov ceph_oid_printf(oid, "%s%s", RBD_OBJECT_MAP_PREFIX, 186122e8bd51SIlya Dryomov rbd_dev->spec->image_id); 186222e8bd51SIlya Dryomov else 186322e8bd51SIlya Dryomov ceph_oid_printf(oid, "%s%s.%016llx", RBD_OBJECT_MAP_PREFIX, 186422e8bd51SIlya Dryomov rbd_dev->spec->image_id, snap_id); 186522e8bd51SIlya Dryomov } 186622e8bd51SIlya Dryomov 186722e8bd51SIlya Dryomov static int rbd_object_map_lock(struct rbd_device *rbd_dev) 186822e8bd51SIlya Dryomov { 186922e8bd51SIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 187022e8bd51SIlya Dryomov CEPH_DEFINE_OID_ONSTACK(oid); 187122e8bd51SIlya Dryomov u8 lock_type; 187222e8bd51SIlya Dryomov char *lock_tag; 187322e8bd51SIlya Dryomov struct ceph_locker *lockers; 187422e8bd51SIlya Dryomov u32 num_lockers; 187522e8bd51SIlya Dryomov bool broke_lock = false; 187622e8bd51SIlya Dryomov int ret; 187722e8bd51SIlya Dryomov 187822e8bd51SIlya Dryomov rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid); 187922e8bd51SIlya Dryomov 188022e8bd51SIlya Dryomov again: 188122e8bd51SIlya Dryomov ret = ceph_cls_lock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME, 188222e8bd51SIlya Dryomov CEPH_CLS_LOCK_EXCLUSIVE, "", "", "", 0); 188322e8bd51SIlya Dryomov if (ret != -EBUSY || broke_lock) { 188422e8bd51SIlya Dryomov if (ret == -EEXIST) 188522e8bd51SIlya Dryomov ret = 0; /* already locked by myself */ 188622e8bd51SIlya Dryomov if (ret) 188722e8bd51SIlya Dryomov rbd_warn(rbd_dev, "failed to lock object map: %d", ret); 188822e8bd51SIlya Dryomov return ret; 188922e8bd51SIlya Dryomov } 189022e8bd51SIlya Dryomov 189122e8bd51SIlya Dryomov ret = ceph_cls_lock_info(osdc, &oid, &rbd_dev->header_oloc, 189222e8bd51SIlya Dryomov RBD_LOCK_NAME, &lock_type, &lock_tag, 189322e8bd51SIlya Dryomov &lockers, &num_lockers); 189422e8bd51SIlya Dryomov if (ret) { 189522e8bd51SIlya Dryomov if (ret == -ENOENT) 189622e8bd51SIlya Dryomov goto again; 189722e8bd51SIlya Dryomov 189822e8bd51SIlya Dryomov rbd_warn(rbd_dev, "failed to get object map lockers: %d", ret); 189922e8bd51SIlya Dryomov return ret; 190022e8bd51SIlya Dryomov } 190122e8bd51SIlya Dryomov 190222e8bd51SIlya Dryomov kfree(lock_tag); 190322e8bd51SIlya Dryomov if (num_lockers == 0) 190422e8bd51SIlya Dryomov goto again; 190522e8bd51SIlya Dryomov 190622e8bd51SIlya Dryomov rbd_warn(rbd_dev, "breaking object map lock owned by %s%llu", 190722e8bd51SIlya Dryomov ENTITY_NAME(lockers[0].id.name)); 190822e8bd51SIlya Dryomov 190922e8bd51SIlya Dryomov ret = ceph_cls_break_lock(osdc, &oid, &rbd_dev->header_oloc, 191022e8bd51SIlya Dryomov RBD_LOCK_NAME, lockers[0].id.cookie, 191122e8bd51SIlya Dryomov &lockers[0].id.name); 191222e8bd51SIlya Dryomov ceph_free_lockers(lockers, num_lockers); 191322e8bd51SIlya Dryomov if (ret) { 191422e8bd51SIlya Dryomov if (ret == -ENOENT) 191522e8bd51SIlya Dryomov goto again; 191622e8bd51SIlya Dryomov 191722e8bd51SIlya Dryomov rbd_warn(rbd_dev, "failed to break object map lock: %d", ret); 191822e8bd51SIlya Dryomov return ret; 191922e8bd51SIlya Dryomov } 192022e8bd51SIlya Dryomov 192122e8bd51SIlya Dryomov broke_lock = true; 192222e8bd51SIlya Dryomov goto again; 192322e8bd51SIlya Dryomov } 192422e8bd51SIlya Dryomov 192522e8bd51SIlya Dryomov static void rbd_object_map_unlock(struct rbd_device *rbd_dev) 192622e8bd51SIlya Dryomov { 192722e8bd51SIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 192822e8bd51SIlya Dryomov CEPH_DEFINE_OID_ONSTACK(oid); 192922e8bd51SIlya Dryomov int ret; 193022e8bd51SIlya Dryomov 193122e8bd51SIlya Dryomov rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid); 193222e8bd51SIlya Dryomov 193322e8bd51SIlya Dryomov ret = ceph_cls_unlock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME, 193422e8bd51SIlya Dryomov ""); 193522e8bd51SIlya Dryomov if (ret && ret != -ENOENT) 193622e8bd51SIlya Dryomov rbd_warn(rbd_dev, "failed to unlock object map: %d", ret); 193722e8bd51SIlya Dryomov } 193822e8bd51SIlya Dryomov 193922e8bd51SIlya Dryomov static int decode_object_map_header(void **p, void *end, u64 *object_map_size) 194022e8bd51SIlya Dryomov { 194122e8bd51SIlya Dryomov u8 struct_v; 194222e8bd51SIlya Dryomov u32 struct_len; 194322e8bd51SIlya Dryomov u32 header_len; 194422e8bd51SIlya Dryomov void *header_end; 194522e8bd51SIlya Dryomov int ret; 194622e8bd51SIlya Dryomov 194722e8bd51SIlya Dryomov ceph_decode_32_safe(p, end, header_len, e_inval); 194822e8bd51SIlya Dryomov header_end = *p + header_len; 194922e8bd51SIlya Dryomov 195022e8bd51SIlya Dryomov ret = ceph_start_decoding(p, end, 1, "BitVector header", &struct_v, 195122e8bd51SIlya Dryomov &struct_len); 195222e8bd51SIlya Dryomov if (ret) 195322e8bd51SIlya Dryomov return ret; 195422e8bd51SIlya Dryomov 195522e8bd51SIlya Dryomov ceph_decode_64_safe(p, end, *object_map_size, e_inval); 195622e8bd51SIlya Dryomov 195722e8bd51SIlya Dryomov *p = header_end; 195822e8bd51SIlya Dryomov return 0; 195922e8bd51SIlya Dryomov 196022e8bd51SIlya Dryomov e_inval: 196122e8bd51SIlya Dryomov return -EINVAL; 196222e8bd51SIlya Dryomov } 196322e8bd51SIlya Dryomov 196422e8bd51SIlya Dryomov static int __rbd_object_map_load(struct rbd_device *rbd_dev) 196522e8bd51SIlya Dryomov { 196622e8bd51SIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 196722e8bd51SIlya Dryomov CEPH_DEFINE_OID_ONSTACK(oid); 196822e8bd51SIlya Dryomov struct page **pages; 196922e8bd51SIlya Dryomov void *p, *end; 197022e8bd51SIlya Dryomov size_t reply_len; 197122e8bd51SIlya Dryomov u64 num_objects; 197222e8bd51SIlya Dryomov u64 object_map_bytes; 197322e8bd51SIlya Dryomov u64 object_map_size; 197422e8bd51SIlya Dryomov int num_pages; 197522e8bd51SIlya Dryomov int ret; 197622e8bd51SIlya Dryomov 197722e8bd51SIlya Dryomov rbd_assert(!rbd_dev->object_map && !rbd_dev->object_map_size); 197822e8bd51SIlya Dryomov 197922e8bd51SIlya Dryomov num_objects = ceph_get_num_objects(&rbd_dev->layout, 198022e8bd51SIlya Dryomov rbd_dev->mapping.size); 198122e8bd51SIlya Dryomov object_map_bytes = DIV_ROUND_UP_ULL(num_objects * BITS_PER_OBJ, 198222e8bd51SIlya Dryomov BITS_PER_BYTE); 198322e8bd51SIlya Dryomov num_pages = calc_pages_for(0, object_map_bytes) + 1; 198422e8bd51SIlya Dryomov pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); 198522e8bd51SIlya Dryomov if (IS_ERR(pages)) 198622e8bd51SIlya Dryomov return PTR_ERR(pages); 198722e8bd51SIlya Dryomov 198822e8bd51SIlya Dryomov reply_len = num_pages * PAGE_SIZE; 198922e8bd51SIlya Dryomov rbd_object_map_name(rbd_dev, rbd_dev->spec->snap_id, &oid); 199022e8bd51SIlya Dryomov ret = ceph_osdc_call(osdc, &oid, &rbd_dev->header_oloc, 199122e8bd51SIlya Dryomov "rbd", "object_map_load", CEPH_OSD_FLAG_READ, 199222e8bd51SIlya Dryomov NULL, 0, pages, &reply_len); 199322e8bd51SIlya Dryomov if (ret) 199422e8bd51SIlya Dryomov goto out; 199522e8bd51SIlya Dryomov 199622e8bd51SIlya Dryomov p = page_address(pages[0]); 199722e8bd51SIlya Dryomov end = p + min(reply_len, (size_t)PAGE_SIZE); 199822e8bd51SIlya Dryomov ret = decode_object_map_header(&p, end, &object_map_size); 199922e8bd51SIlya Dryomov if (ret) 200022e8bd51SIlya Dryomov goto out; 200122e8bd51SIlya Dryomov 200222e8bd51SIlya Dryomov if (object_map_size != num_objects) { 200322e8bd51SIlya Dryomov rbd_warn(rbd_dev, "object map size mismatch: %llu vs %llu", 200422e8bd51SIlya Dryomov object_map_size, num_objects); 200522e8bd51SIlya Dryomov ret = -EINVAL; 200622e8bd51SIlya Dryomov goto out; 200722e8bd51SIlya Dryomov } 200822e8bd51SIlya Dryomov 200922e8bd51SIlya Dryomov if (offset_in_page(p) + object_map_bytes > reply_len) { 201022e8bd51SIlya Dryomov ret = -EINVAL; 201122e8bd51SIlya Dryomov goto out; 201222e8bd51SIlya Dryomov } 201322e8bd51SIlya Dryomov 201422e8bd51SIlya Dryomov rbd_dev->object_map = kvmalloc(object_map_bytes, GFP_KERNEL); 201522e8bd51SIlya Dryomov if (!rbd_dev->object_map) { 201622e8bd51SIlya Dryomov ret = -ENOMEM; 201722e8bd51SIlya Dryomov goto out; 201822e8bd51SIlya Dryomov } 201922e8bd51SIlya Dryomov 202022e8bd51SIlya Dryomov rbd_dev->object_map_size = object_map_size; 202122e8bd51SIlya Dryomov ceph_copy_from_page_vector(pages, rbd_dev->object_map, 202222e8bd51SIlya Dryomov offset_in_page(p), object_map_bytes); 202322e8bd51SIlya Dryomov 202422e8bd51SIlya Dryomov out: 202522e8bd51SIlya Dryomov ceph_release_page_vector(pages, num_pages); 202622e8bd51SIlya Dryomov return ret; 202722e8bd51SIlya Dryomov } 202822e8bd51SIlya Dryomov 202922e8bd51SIlya Dryomov static void rbd_object_map_free(struct rbd_device *rbd_dev) 203022e8bd51SIlya Dryomov { 203122e8bd51SIlya Dryomov kvfree(rbd_dev->object_map); 203222e8bd51SIlya Dryomov rbd_dev->object_map = NULL; 203322e8bd51SIlya Dryomov rbd_dev->object_map_size = 0; 203422e8bd51SIlya Dryomov } 203522e8bd51SIlya Dryomov 203622e8bd51SIlya Dryomov static int rbd_object_map_load(struct rbd_device *rbd_dev) 203722e8bd51SIlya Dryomov { 203822e8bd51SIlya Dryomov int ret; 203922e8bd51SIlya Dryomov 204022e8bd51SIlya Dryomov ret = __rbd_object_map_load(rbd_dev); 204122e8bd51SIlya Dryomov if (ret) 204222e8bd51SIlya Dryomov return ret; 204322e8bd51SIlya Dryomov 204422e8bd51SIlya Dryomov ret = rbd_dev_v2_get_flags(rbd_dev); 204522e8bd51SIlya Dryomov if (ret) { 204622e8bd51SIlya Dryomov rbd_object_map_free(rbd_dev); 204722e8bd51SIlya Dryomov return ret; 204822e8bd51SIlya Dryomov } 204922e8bd51SIlya Dryomov 205022e8bd51SIlya Dryomov if (rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID) 205122e8bd51SIlya Dryomov rbd_warn(rbd_dev, "object map is invalid"); 205222e8bd51SIlya Dryomov 205322e8bd51SIlya Dryomov return 0; 205422e8bd51SIlya Dryomov } 205522e8bd51SIlya Dryomov 205622e8bd51SIlya Dryomov static int rbd_object_map_open(struct rbd_device *rbd_dev) 205722e8bd51SIlya Dryomov { 205822e8bd51SIlya Dryomov int ret; 205922e8bd51SIlya Dryomov 206022e8bd51SIlya Dryomov ret = rbd_object_map_lock(rbd_dev); 206122e8bd51SIlya Dryomov if (ret) 206222e8bd51SIlya Dryomov return ret; 206322e8bd51SIlya Dryomov 206422e8bd51SIlya Dryomov ret = rbd_object_map_load(rbd_dev); 206522e8bd51SIlya Dryomov if (ret) { 206622e8bd51SIlya Dryomov rbd_object_map_unlock(rbd_dev); 206722e8bd51SIlya Dryomov return ret; 206822e8bd51SIlya Dryomov } 206922e8bd51SIlya Dryomov 207022e8bd51SIlya Dryomov return 0; 207122e8bd51SIlya Dryomov } 207222e8bd51SIlya Dryomov 207322e8bd51SIlya Dryomov static void rbd_object_map_close(struct rbd_device *rbd_dev) 207422e8bd51SIlya Dryomov { 207522e8bd51SIlya Dryomov rbd_object_map_free(rbd_dev); 207622e8bd51SIlya Dryomov rbd_object_map_unlock(rbd_dev); 207722e8bd51SIlya Dryomov } 207822e8bd51SIlya Dryomov 207922e8bd51SIlya Dryomov /* 208022e8bd51SIlya Dryomov * This function needs snap_id (or more precisely just something to 208122e8bd51SIlya Dryomov * distinguish between HEAD and snapshot object maps), new_state and 208222e8bd51SIlya Dryomov * current_state that were passed to rbd_object_map_update(). 208322e8bd51SIlya Dryomov * 208422e8bd51SIlya Dryomov * To avoid allocating and stashing a context we piggyback on the OSD 208522e8bd51SIlya Dryomov * request. A HEAD update has two ops (assert_locked). For new_state 208622e8bd51SIlya Dryomov * and current_state we decode our own object_map_update op, encoded in 208722e8bd51SIlya Dryomov * rbd_cls_object_map_update(). 208822e8bd51SIlya Dryomov */ 208922e8bd51SIlya Dryomov static int rbd_object_map_update_finish(struct rbd_obj_request *obj_req, 209022e8bd51SIlya Dryomov struct ceph_osd_request *osd_req) 209122e8bd51SIlya Dryomov { 209222e8bd51SIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 209322e8bd51SIlya Dryomov struct ceph_osd_data *osd_data; 209422e8bd51SIlya Dryomov u64 objno; 2095633739b2SIlya Dryomov u8 state, new_state, uninitialized_var(current_state); 209622e8bd51SIlya Dryomov bool has_current_state; 209722e8bd51SIlya Dryomov void *p; 209822e8bd51SIlya Dryomov 209922e8bd51SIlya Dryomov if (osd_req->r_result) 210022e8bd51SIlya Dryomov return osd_req->r_result; 210122e8bd51SIlya Dryomov 210222e8bd51SIlya Dryomov /* 210322e8bd51SIlya Dryomov * Nothing to do for a snapshot object map. 210422e8bd51SIlya Dryomov */ 210522e8bd51SIlya Dryomov if (osd_req->r_num_ops == 1) 210622e8bd51SIlya Dryomov return 0; 210722e8bd51SIlya Dryomov 210822e8bd51SIlya Dryomov /* 210922e8bd51SIlya Dryomov * Update in-memory HEAD object map. 211022e8bd51SIlya Dryomov */ 211122e8bd51SIlya Dryomov rbd_assert(osd_req->r_num_ops == 2); 211222e8bd51SIlya Dryomov osd_data = osd_req_op_data(osd_req, 1, cls, request_data); 211322e8bd51SIlya Dryomov rbd_assert(osd_data->type == CEPH_OSD_DATA_TYPE_PAGES); 211422e8bd51SIlya Dryomov 211522e8bd51SIlya Dryomov p = page_address(osd_data->pages[0]); 211622e8bd51SIlya Dryomov objno = ceph_decode_64(&p); 211722e8bd51SIlya Dryomov rbd_assert(objno == obj_req->ex.oe_objno); 211822e8bd51SIlya Dryomov rbd_assert(ceph_decode_64(&p) == objno + 1); 211922e8bd51SIlya Dryomov new_state = ceph_decode_8(&p); 212022e8bd51SIlya Dryomov has_current_state = ceph_decode_8(&p); 212122e8bd51SIlya Dryomov if (has_current_state) 212222e8bd51SIlya Dryomov current_state = ceph_decode_8(&p); 212322e8bd51SIlya Dryomov 212422e8bd51SIlya Dryomov spin_lock(&rbd_dev->object_map_lock); 212522e8bd51SIlya Dryomov state = __rbd_object_map_get(rbd_dev, objno); 212622e8bd51SIlya Dryomov if (!has_current_state || current_state == state || 212722e8bd51SIlya Dryomov (current_state == OBJECT_EXISTS && state == OBJECT_EXISTS_CLEAN)) 212822e8bd51SIlya Dryomov __rbd_object_map_set(rbd_dev, objno, new_state); 212922e8bd51SIlya Dryomov spin_unlock(&rbd_dev->object_map_lock); 213022e8bd51SIlya Dryomov 213122e8bd51SIlya Dryomov return 0; 213222e8bd51SIlya Dryomov } 213322e8bd51SIlya Dryomov 213422e8bd51SIlya Dryomov static void rbd_object_map_callback(struct ceph_osd_request *osd_req) 213522e8bd51SIlya Dryomov { 213622e8bd51SIlya Dryomov struct rbd_obj_request *obj_req = osd_req->r_priv; 213722e8bd51SIlya Dryomov int result; 213822e8bd51SIlya Dryomov 213922e8bd51SIlya Dryomov dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req, 214022e8bd51SIlya Dryomov osd_req->r_result, obj_req); 214122e8bd51SIlya Dryomov 214222e8bd51SIlya Dryomov result = rbd_object_map_update_finish(obj_req, osd_req); 214322e8bd51SIlya Dryomov rbd_obj_handle_request(obj_req, result); 214422e8bd51SIlya Dryomov } 214522e8bd51SIlya Dryomov 214622e8bd51SIlya Dryomov static bool update_needed(struct rbd_device *rbd_dev, u64 objno, u8 new_state) 214722e8bd51SIlya Dryomov { 214822e8bd51SIlya Dryomov u8 state = rbd_object_map_get(rbd_dev, objno); 214922e8bd51SIlya Dryomov 215022e8bd51SIlya Dryomov if (state == new_state || 215122e8bd51SIlya Dryomov (new_state == OBJECT_PENDING && state == OBJECT_NONEXISTENT) || 215222e8bd51SIlya Dryomov (new_state == OBJECT_NONEXISTENT && state != OBJECT_PENDING)) 215322e8bd51SIlya Dryomov return false; 215422e8bd51SIlya Dryomov 215522e8bd51SIlya Dryomov return true; 215622e8bd51SIlya Dryomov } 215722e8bd51SIlya Dryomov 215822e8bd51SIlya Dryomov static int rbd_cls_object_map_update(struct ceph_osd_request *req, 215922e8bd51SIlya Dryomov int which, u64 objno, u8 new_state, 216022e8bd51SIlya Dryomov const u8 *current_state) 216122e8bd51SIlya Dryomov { 216222e8bd51SIlya Dryomov struct page **pages; 216322e8bd51SIlya Dryomov void *p, *start; 216422e8bd51SIlya Dryomov int ret; 216522e8bd51SIlya Dryomov 216622e8bd51SIlya Dryomov ret = osd_req_op_cls_init(req, which, "rbd", "object_map_update"); 216722e8bd51SIlya Dryomov if (ret) 216822e8bd51SIlya Dryomov return ret; 216922e8bd51SIlya Dryomov 217022e8bd51SIlya Dryomov pages = ceph_alloc_page_vector(1, GFP_NOIO); 217122e8bd51SIlya Dryomov if (IS_ERR(pages)) 217222e8bd51SIlya Dryomov return PTR_ERR(pages); 217322e8bd51SIlya Dryomov 217422e8bd51SIlya Dryomov p = start = page_address(pages[0]); 217522e8bd51SIlya Dryomov ceph_encode_64(&p, objno); 217622e8bd51SIlya Dryomov ceph_encode_64(&p, objno + 1); 217722e8bd51SIlya Dryomov ceph_encode_8(&p, new_state); 217822e8bd51SIlya Dryomov if (current_state) { 217922e8bd51SIlya Dryomov ceph_encode_8(&p, 1); 218022e8bd51SIlya Dryomov ceph_encode_8(&p, *current_state); 218122e8bd51SIlya Dryomov } else { 218222e8bd51SIlya Dryomov ceph_encode_8(&p, 0); 218322e8bd51SIlya Dryomov } 218422e8bd51SIlya Dryomov 218522e8bd51SIlya Dryomov osd_req_op_cls_request_data_pages(req, which, pages, p - start, 0, 218622e8bd51SIlya Dryomov false, true); 218722e8bd51SIlya Dryomov return 0; 218822e8bd51SIlya Dryomov } 218922e8bd51SIlya Dryomov 219022e8bd51SIlya Dryomov /* 219122e8bd51SIlya Dryomov * Return: 219222e8bd51SIlya Dryomov * 0 - object map update sent 219322e8bd51SIlya Dryomov * 1 - object map update isn't needed 219422e8bd51SIlya Dryomov * <0 - error 219522e8bd51SIlya Dryomov */ 219622e8bd51SIlya Dryomov static int rbd_object_map_update(struct rbd_obj_request *obj_req, u64 snap_id, 219722e8bd51SIlya Dryomov u8 new_state, const u8 *current_state) 219822e8bd51SIlya Dryomov { 219922e8bd51SIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 220022e8bd51SIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 220122e8bd51SIlya Dryomov struct ceph_osd_request *req; 220222e8bd51SIlya Dryomov int num_ops = 1; 220322e8bd51SIlya Dryomov int which = 0; 220422e8bd51SIlya Dryomov int ret; 220522e8bd51SIlya Dryomov 220622e8bd51SIlya Dryomov if (snap_id == CEPH_NOSNAP) { 220722e8bd51SIlya Dryomov if (!update_needed(rbd_dev, obj_req->ex.oe_objno, new_state)) 220822e8bd51SIlya Dryomov return 1; 220922e8bd51SIlya Dryomov 221022e8bd51SIlya Dryomov num_ops++; /* assert_locked */ 221122e8bd51SIlya Dryomov } 221222e8bd51SIlya Dryomov 221322e8bd51SIlya Dryomov req = ceph_osdc_alloc_request(osdc, NULL, num_ops, false, GFP_NOIO); 221422e8bd51SIlya Dryomov if (!req) 221522e8bd51SIlya Dryomov return -ENOMEM; 221622e8bd51SIlya Dryomov 221722e8bd51SIlya Dryomov list_add_tail(&req->r_private_item, &obj_req->osd_reqs); 221822e8bd51SIlya Dryomov req->r_callback = rbd_object_map_callback; 221922e8bd51SIlya Dryomov req->r_priv = obj_req; 222022e8bd51SIlya Dryomov 222122e8bd51SIlya Dryomov rbd_object_map_name(rbd_dev, snap_id, &req->r_base_oid); 222222e8bd51SIlya Dryomov ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc); 222322e8bd51SIlya Dryomov req->r_flags = CEPH_OSD_FLAG_WRITE; 222422e8bd51SIlya Dryomov ktime_get_real_ts64(&req->r_mtime); 222522e8bd51SIlya Dryomov 222622e8bd51SIlya Dryomov if (snap_id == CEPH_NOSNAP) { 222722e8bd51SIlya Dryomov /* 222822e8bd51SIlya Dryomov * Protect against possible race conditions during lock 222922e8bd51SIlya Dryomov * ownership transitions. 223022e8bd51SIlya Dryomov */ 223122e8bd51SIlya Dryomov ret = ceph_cls_assert_locked(req, which++, RBD_LOCK_NAME, 223222e8bd51SIlya Dryomov CEPH_CLS_LOCK_EXCLUSIVE, "", ""); 223322e8bd51SIlya Dryomov if (ret) 223422e8bd51SIlya Dryomov return ret; 223522e8bd51SIlya Dryomov } 223622e8bd51SIlya Dryomov 223722e8bd51SIlya Dryomov ret = rbd_cls_object_map_update(req, which, obj_req->ex.oe_objno, 223822e8bd51SIlya Dryomov new_state, current_state); 223922e8bd51SIlya Dryomov if (ret) 224022e8bd51SIlya Dryomov return ret; 224122e8bd51SIlya Dryomov 224222e8bd51SIlya Dryomov ret = ceph_osdc_alloc_messages(req, GFP_NOIO); 224322e8bd51SIlya Dryomov if (ret) 224422e8bd51SIlya Dryomov return ret; 224522e8bd51SIlya Dryomov 224622e8bd51SIlya Dryomov ceph_osdc_start_request(osdc, req, false); 224722e8bd51SIlya Dryomov return 0; 224822e8bd51SIlya Dryomov } 224922e8bd51SIlya Dryomov 225086bd7998SIlya Dryomov static void prune_extents(struct ceph_file_extent *img_extents, 225186bd7998SIlya Dryomov u32 *num_img_extents, u64 overlap) 2252e93f3152SAlex Elder { 225386bd7998SIlya Dryomov u32 cnt = *num_img_extents; 2254e93f3152SAlex Elder 225586bd7998SIlya Dryomov /* drop extents completely beyond the overlap */ 225686bd7998SIlya Dryomov while (cnt && img_extents[cnt - 1].fe_off >= overlap) 225786bd7998SIlya Dryomov cnt--; 2258e93f3152SAlex Elder 225986bd7998SIlya Dryomov if (cnt) { 226086bd7998SIlya Dryomov struct ceph_file_extent *ex = &img_extents[cnt - 1]; 2261e93f3152SAlex Elder 226286bd7998SIlya Dryomov /* trim final overlapping extent */ 226386bd7998SIlya Dryomov if (ex->fe_off + ex->fe_len > overlap) 226486bd7998SIlya Dryomov ex->fe_len = overlap - ex->fe_off; 2265e93f3152SAlex Elder } 2266e93f3152SAlex Elder 226786bd7998SIlya Dryomov *num_img_extents = cnt; 226886bd7998SIlya Dryomov } 226986bd7998SIlya Dryomov 227086bd7998SIlya Dryomov /* 227186bd7998SIlya Dryomov * Determine the byte range(s) covered by either just the object extent 227286bd7998SIlya Dryomov * or the entire object in the parent image. 227386bd7998SIlya Dryomov */ 227486bd7998SIlya Dryomov static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req, 227586bd7998SIlya Dryomov bool entire) 2276e93f3152SAlex Elder { 227786bd7998SIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 2278c5b5ef6cSAlex Elder int ret; 2279c5b5ef6cSAlex Elder 228086bd7998SIlya Dryomov if (!rbd_dev->parent_overlap) 228186bd7998SIlya Dryomov return 0; 228286bd7998SIlya Dryomov 228386bd7998SIlya Dryomov ret = ceph_extent_to_file(&rbd_dev->layout, obj_req->ex.oe_objno, 228486bd7998SIlya Dryomov entire ? 0 : obj_req->ex.oe_off, 228586bd7998SIlya Dryomov entire ? rbd_dev->layout.object_size : 228686bd7998SIlya Dryomov obj_req->ex.oe_len, 228786bd7998SIlya Dryomov &obj_req->img_extents, 228886bd7998SIlya Dryomov &obj_req->num_img_extents); 228986bd7998SIlya Dryomov if (ret) 229086bd7998SIlya Dryomov return ret; 229186bd7998SIlya Dryomov 229286bd7998SIlya Dryomov prune_extents(obj_req->img_extents, &obj_req->num_img_extents, 229386bd7998SIlya Dryomov rbd_dev->parent_overlap); 229486bd7998SIlya Dryomov return 0; 229586bd7998SIlya Dryomov } 229686bd7998SIlya Dryomov 2297bcbab1dbSIlya Dryomov static void rbd_osd_setup_data(struct ceph_osd_request *osd_req, int which) 22983da691bfSIlya Dryomov { 2299bcbab1dbSIlya Dryomov struct rbd_obj_request *obj_req = osd_req->r_priv; 2300bcbab1dbSIlya Dryomov 2301ecc633caSIlya Dryomov switch (obj_req->img_request->data_type) { 23023da691bfSIlya Dryomov case OBJ_REQUEST_BIO: 2303bcbab1dbSIlya Dryomov osd_req_op_extent_osd_data_bio(osd_req, which, 23043da691bfSIlya Dryomov &obj_req->bio_pos, 230543df3d35SIlya Dryomov obj_req->ex.oe_len); 23063da691bfSIlya Dryomov break; 23073da691bfSIlya Dryomov case OBJ_REQUEST_BVECS: 2308afb97888SIlya Dryomov case OBJ_REQUEST_OWN_BVECS: 23093da691bfSIlya Dryomov rbd_assert(obj_req->bvec_pos.iter.bi_size == 231043df3d35SIlya Dryomov obj_req->ex.oe_len); 2311afb97888SIlya Dryomov rbd_assert(obj_req->bvec_idx == obj_req->bvec_count); 2312bcbab1dbSIlya Dryomov osd_req_op_extent_osd_data_bvec_pos(osd_req, which, 23133da691bfSIlya Dryomov &obj_req->bvec_pos); 23143da691bfSIlya Dryomov break; 23153da691bfSIlya Dryomov default: 231616809372SArnd Bergmann BUG(); 23173da691bfSIlya Dryomov } 23183da691bfSIlya Dryomov } 23193da691bfSIlya Dryomov 2320bcbab1dbSIlya Dryomov static int rbd_osd_setup_stat(struct ceph_osd_request *osd_req, int which) 23213da691bfSIlya Dryomov { 23223da691bfSIlya Dryomov struct page **pages; 23233da691bfSIlya Dryomov 2324c5b5ef6cSAlex Elder /* 2325c5b5ef6cSAlex Elder * The response data for a STAT call consists of: 2326c5b5ef6cSAlex Elder * le64 length; 2327c5b5ef6cSAlex Elder * struct { 2328c5b5ef6cSAlex Elder * le32 tv_sec; 2329c5b5ef6cSAlex Elder * le32 tv_nsec; 2330c5b5ef6cSAlex Elder * } mtime; 2331c5b5ef6cSAlex Elder */ 23323da691bfSIlya Dryomov pages = ceph_alloc_page_vector(1, GFP_NOIO); 23333da691bfSIlya Dryomov if (IS_ERR(pages)) 23343da691bfSIlya Dryomov return PTR_ERR(pages); 23353da691bfSIlya Dryomov 2336bcbab1dbSIlya Dryomov osd_req_op_init(osd_req, which, CEPH_OSD_OP_STAT, 0); 2337bcbab1dbSIlya Dryomov osd_req_op_raw_data_in_pages(osd_req, which, pages, 23383da691bfSIlya Dryomov 8 + sizeof(struct ceph_timespec), 23393da691bfSIlya Dryomov 0, false, true); 23403da691bfSIlya Dryomov return 0; 2341710214e3SIlya Dryomov } 2342c5b5ef6cSAlex Elder 2343b5ae8cbcSIlya Dryomov static int rbd_osd_setup_copyup(struct ceph_osd_request *osd_req, int which, 2344b5ae8cbcSIlya Dryomov u32 bytes) 234513488d53SIlya Dryomov { 2346b5ae8cbcSIlya Dryomov struct rbd_obj_request *obj_req = osd_req->r_priv; 2347b5ae8cbcSIlya Dryomov int ret; 2348b5ae8cbcSIlya Dryomov 2349b5ae8cbcSIlya Dryomov ret = osd_req_op_cls_init(osd_req, which, "rbd", "copyup"); 2350b5ae8cbcSIlya Dryomov if (ret) 2351b5ae8cbcSIlya Dryomov return ret; 2352b5ae8cbcSIlya Dryomov 2353b5ae8cbcSIlya Dryomov osd_req_op_cls_request_data_bvecs(osd_req, which, obj_req->copyup_bvecs, 2354b5ae8cbcSIlya Dryomov obj_req->copyup_bvec_count, bytes); 2355b5ae8cbcSIlya Dryomov return 0; 235613488d53SIlya Dryomov } 235713488d53SIlya Dryomov 2358ea9b743cSIlya Dryomov static int rbd_obj_init_read(struct rbd_obj_request *obj_req) 23593da691bfSIlya Dryomov { 2360ea9b743cSIlya Dryomov obj_req->read_state = RBD_OBJ_READ_START; 2361ea9b743cSIlya Dryomov return 0; 2362ea9b743cSIlya Dryomov } 2363ea9b743cSIlya Dryomov 2364bcbab1dbSIlya Dryomov static void __rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req, 2365bcbab1dbSIlya Dryomov int which) 23663da691bfSIlya Dryomov { 2367bcbab1dbSIlya Dryomov struct rbd_obj_request *obj_req = osd_req->r_priv; 23683da691bfSIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 23693da691bfSIlya Dryomov u16 opcode; 2370c5b5ef6cSAlex Elder 23718b5bec5cSIlya Dryomov if (!use_object_map(rbd_dev) || 23728b5bec5cSIlya Dryomov !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST)) { 2373bcbab1dbSIlya Dryomov osd_req_op_alloc_hint_init(osd_req, which++, 23743da691bfSIlya Dryomov rbd_dev->layout.object_size, 23753da691bfSIlya Dryomov rbd_dev->layout.object_size); 23768b5bec5cSIlya Dryomov } 2377c5b5ef6cSAlex Elder 23783da691bfSIlya Dryomov if (rbd_obj_is_entire(obj_req)) 23793da691bfSIlya Dryomov opcode = CEPH_OSD_OP_WRITEFULL; 23803da691bfSIlya Dryomov else 23813da691bfSIlya Dryomov opcode = CEPH_OSD_OP_WRITE; 2382c5b5ef6cSAlex Elder 2383bcbab1dbSIlya Dryomov osd_req_op_extent_init(osd_req, which, opcode, 238443df3d35SIlya Dryomov obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0); 2385bcbab1dbSIlya Dryomov rbd_osd_setup_data(osd_req, which); 23863da691bfSIlya Dryomov } 23873da691bfSIlya Dryomov 2388ea9b743cSIlya Dryomov static int rbd_obj_init_write(struct rbd_obj_request *obj_req) 23893da691bfSIlya Dryomov { 23903da691bfSIlya Dryomov int ret; 23913da691bfSIlya Dryomov 239286bd7998SIlya Dryomov /* reverse map the entire object onto the parent */ 239386bd7998SIlya Dryomov ret = rbd_obj_calc_img_extents(obj_req, true); 239486bd7998SIlya Dryomov if (ret) 239586bd7998SIlya Dryomov return ret; 239686bd7998SIlya Dryomov 23970ad5d953SIlya Dryomov if (rbd_obj_copyup_enabled(obj_req)) 23980ad5d953SIlya Dryomov obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED; 23993da691bfSIlya Dryomov 240085b5e6d1SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_START; 24013da691bfSIlya Dryomov return 0; 240270d045f6SIlya Dryomov } 240370d045f6SIlya Dryomov 24046484cbe9SIlya Dryomov static u16 truncate_or_zero_opcode(struct rbd_obj_request *obj_req) 24056484cbe9SIlya Dryomov { 24066484cbe9SIlya Dryomov return rbd_obj_is_tail(obj_req) ? CEPH_OSD_OP_TRUNCATE : 24076484cbe9SIlya Dryomov CEPH_OSD_OP_ZERO; 24086484cbe9SIlya Dryomov } 24096484cbe9SIlya Dryomov 241027bbd911SIlya Dryomov static void __rbd_osd_setup_discard_ops(struct ceph_osd_request *osd_req, 241127bbd911SIlya Dryomov int which) 241227bbd911SIlya Dryomov { 241327bbd911SIlya Dryomov struct rbd_obj_request *obj_req = osd_req->r_priv; 241427bbd911SIlya Dryomov 241527bbd911SIlya Dryomov if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) { 241627bbd911SIlya Dryomov rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION); 241727bbd911SIlya Dryomov osd_req_op_init(osd_req, which, CEPH_OSD_OP_DELETE, 0); 241827bbd911SIlya Dryomov } else { 241927bbd911SIlya Dryomov osd_req_op_extent_init(osd_req, which, 242027bbd911SIlya Dryomov truncate_or_zero_opcode(obj_req), 242127bbd911SIlya Dryomov obj_req->ex.oe_off, obj_req->ex.oe_len, 242227bbd911SIlya Dryomov 0, 0); 242327bbd911SIlya Dryomov } 242427bbd911SIlya Dryomov } 242527bbd911SIlya Dryomov 2426ea9b743cSIlya Dryomov static int rbd_obj_init_discard(struct rbd_obj_request *obj_req) 24276484cbe9SIlya Dryomov { 24280c93e1b7SIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 242927bbd911SIlya Dryomov u64 off, next_off; 24306484cbe9SIlya Dryomov int ret; 24316484cbe9SIlya Dryomov 24320c93e1b7SIlya Dryomov /* 24330c93e1b7SIlya Dryomov * Align the range to alloc_size boundary and punt on discards 24340c93e1b7SIlya Dryomov * that are too small to free up any space. 24350c93e1b7SIlya Dryomov * 24360c93e1b7SIlya Dryomov * alloc_size == object_size && is_tail() is a special case for 24370c93e1b7SIlya Dryomov * filestore with filestore_punch_hole = false, needed to allow 24380c93e1b7SIlya Dryomov * truncate (in addition to delete). 24390c93e1b7SIlya Dryomov */ 24400c93e1b7SIlya Dryomov if (rbd_dev->opts->alloc_size != rbd_dev->layout.object_size || 24410c93e1b7SIlya Dryomov !rbd_obj_is_tail(obj_req)) { 244227bbd911SIlya Dryomov off = round_up(obj_req->ex.oe_off, rbd_dev->opts->alloc_size); 244327bbd911SIlya Dryomov next_off = round_down(obj_req->ex.oe_off + obj_req->ex.oe_len, 244427bbd911SIlya Dryomov rbd_dev->opts->alloc_size); 24450c93e1b7SIlya Dryomov if (off >= next_off) 24460c93e1b7SIlya Dryomov return 1; 244727bbd911SIlya Dryomov 244827bbd911SIlya Dryomov dout("%s %p %llu~%llu -> %llu~%llu\n", __func__, 244927bbd911SIlya Dryomov obj_req, obj_req->ex.oe_off, obj_req->ex.oe_len, 245027bbd911SIlya Dryomov off, next_off - off); 245127bbd911SIlya Dryomov obj_req->ex.oe_off = off; 245227bbd911SIlya Dryomov obj_req->ex.oe_len = next_off - off; 24530c93e1b7SIlya Dryomov } 24540c93e1b7SIlya Dryomov 24556484cbe9SIlya Dryomov /* reverse map the entire object onto the parent */ 24566484cbe9SIlya Dryomov ret = rbd_obj_calc_img_extents(obj_req, true); 24576484cbe9SIlya Dryomov if (ret) 24586484cbe9SIlya Dryomov return ret; 24596484cbe9SIlya Dryomov 246022e8bd51SIlya Dryomov obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT; 24610ad5d953SIlya Dryomov if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) 24620ad5d953SIlya Dryomov obj_req->flags |= RBD_OBJ_FLAG_DELETION; 24636484cbe9SIlya Dryomov 246485b5e6d1SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_START; 24656484cbe9SIlya Dryomov return 0; 24666484cbe9SIlya Dryomov } 24676484cbe9SIlya Dryomov 2468bcbab1dbSIlya Dryomov static void __rbd_osd_setup_zeroout_ops(struct ceph_osd_request *osd_req, 2469bcbab1dbSIlya Dryomov int which) 247013488d53SIlya Dryomov { 2471bcbab1dbSIlya Dryomov struct rbd_obj_request *obj_req = osd_req->r_priv; 24723da691bfSIlya Dryomov u16 opcode; 2473058aa991SIlya Dryomov 24743da691bfSIlya Dryomov if (rbd_obj_is_entire(obj_req)) { 247586bd7998SIlya Dryomov if (obj_req->num_img_extents) { 24760ad5d953SIlya Dryomov if (!(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED)) 2477bcbab1dbSIlya Dryomov osd_req_op_init(osd_req, which++, 24782bb1e56eSIlya Dryomov CEPH_OSD_OP_CREATE, 0); 24793da691bfSIlya Dryomov opcode = CEPH_OSD_OP_TRUNCATE; 24803da691bfSIlya Dryomov } else { 24810ad5d953SIlya Dryomov rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION); 2482bcbab1dbSIlya Dryomov osd_req_op_init(osd_req, which++, 24833da691bfSIlya Dryomov CEPH_OSD_OP_DELETE, 0); 24843da691bfSIlya Dryomov opcode = 0; 24853da691bfSIlya Dryomov } 24863da691bfSIlya Dryomov } else { 24876484cbe9SIlya Dryomov opcode = truncate_or_zero_opcode(obj_req); 24883da691bfSIlya Dryomov } 24893da691bfSIlya Dryomov 24903da691bfSIlya Dryomov if (opcode) 2491bcbab1dbSIlya Dryomov osd_req_op_extent_init(osd_req, which, opcode, 249243df3d35SIlya Dryomov obj_req->ex.oe_off, obj_req->ex.oe_len, 24933da691bfSIlya Dryomov 0, 0); 24943da691bfSIlya Dryomov } 24953da691bfSIlya Dryomov 2496ea9b743cSIlya Dryomov static int rbd_obj_init_zeroout(struct rbd_obj_request *obj_req) 24973da691bfSIlya Dryomov { 24983da691bfSIlya Dryomov int ret; 24993da691bfSIlya Dryomov 250086bd7998SIlya Dryomov /* reverse map the entire object onto the parent */ 250186bd7998SIlya Dryomov ret = rbd_obj_calc_img_extents(obj_req, true); 250286bd7998SIlya Dryomov if (ret) 250386bd7998SIlya Dryomov return ret; 250486bd7998SIlya Dryomov 25050ad5d953SIlya Dryomov if (rbd_obj_copyup_enabled(obj_req)) 25060ad5d953SIlya Dryomov obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED; 25070ad5d953SIlya Dryomov if (!obj_req->num_img_extents) { 250822e8bd51SIlya Dryomov obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT; 25090ad5d953SIlya Dryomov if (rbd_obj_is_entire(obj_req)) 25100ad5d953SIlya Dryomov obj_req->flags |= RBD_OBJ_FLAG_DELETION; 25113da691bfSIlya Dryomov } 25123da691bfSIlya Dryomov 251385b5e6d1SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_START; 2514980917fcSIlya Dryomov return 0; 2515b454e36dSAlex Elder } 2516b454e36dSAlex Elder 2517a086a1b8SIlya Dryomov static int count_write_ops(struct rbd_obj_request *obj_req) 2518a086a1b8SIlya Dryomov { 25198b5bec5cSIlya Dryomov struct rbd_img_request *img_req = obj_req->img_request; 25208b5bec5cSIlya Dryomov 25218b5bec5cSIlya Dryomov switch (img_req->op_type) { 2522a086a1b8SIlya Dryomov case OBJ_OP_WRITE: 25238b5bec5cSIlya Dryomov if (!use_object_map(img_req->rbd_dev) || 25248b5bec5cSIlya Dryomov !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST)) 2525a086a1b8SIlya Dryomov return 2; /* setallochint + write/writefull */ 25268b5bec5cSIlya Dryomov 25278b5bec5cSIlya Dryomov return 1; /* write/writefull */ 2528a086a1b8SIlya Dryomov case OBJ_OP_DISCARD: 2529a086a1b8SIlya Dryomov return 1; /* delete/truncate/zero */ 2530a086a1b8SIlya Dryomov case OBJ_OP_ZEROOUT: 2531a086a1b8SIlya Dryomov if (rbd_obj_is_entire(obj_req) && obj_req->num_img_extents && 2532a086a1b8SIlya Dryomov !(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED)) 2533a086a1b8SIlya Dryomov return 2; /* create + truncate */ 2534a086a1b8SIlya Dryomov 2535a086a1b8SIlya Dryomov return 1; /* delete/truncate/zero */ 2536a086a1b8SIlya Dryomov default: 2537a086a1b8SIlya Dryomov BUG(); 2538a086a1b8SIlya Dryomov } 2539a086a1b8SIlya Dryomov } 2540a086a1b8SIlya Dryomov 2541a086a1b8SIlya Dryomov static void rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req, 2542a086a1b8SIlya Dryomov int which) 2543a086a1b8SIlya Dryomov { 2544a086a1b8SIlya Dryomov struct rbd_obj_request *obj_req = osd_req->r_priv; 2545a086a1b8SIlya Dryomov 2546a086a1b8SIlya Dryomov switch (obj_req->img_request->op_type) { 2547a086a1b8SIlya Dryomov case OBJ_OP_WRITE: 2548a086a1b8SIlya Dryomov __rbd_osd_setup_write_ops(osd_req, which); 2549a086a1b8SIlya Dryomov break; 2550a086a1b8SIlya Dryomov case OBJ_OP_DISCARD: 2551a086a1b8SIlya Dryomov __rbd_osd_setup_discard_ops(osd_req, which); 2552a086a1b8SIlya Dryomov break; 2553a086a1b8SIlya Dryomov case OBJ_OP_ZEROOUT: 2554a086a1b8SIlya Dryomov __rbd_osd_setup_zeroout_ops(osd_req, which); 2555a086a1b8SIlya Dryomov break; 2556a086a1b8SIlya Dryomov default: 2557a086a1b8SIlya Dryomov BUG(); 2558a086a1b8SIlya Dryomov } 2559a086a1b8SIlya Dryomov } 2560a086a1b8SIlya Dryomov 2561b454e36dSAlex Elder /* 2562a086a1b8SIlya Dryomov * Prune the list of object requests (adjust offset and/or length, drop 2563a086a1b8SIlya Dryomov * redundant requests). Prepare object request state machines and image 2564a086a1b8SIlya Dryomov * request state machine for execution. 2565b454e36dSAlex Elder */ 25663da691bfSIlya Dryomov static int __rbd_img_fill_request(struct rbd_img_request *img_req) 25673da691bfSIlya Dryomov { 25680c93e1b7SIlya Dryomov struct rbd_obj_request *obj_req, *next_obj_req; 25693da691bfSIlya Dryomov int ret; 25703d7efd18SAlex Elder 25710c93e1b7SIlya Dryomov for_each_obj_request_safe(img_req, obj_req, next_obj_req) { 25729bb0248dSIlya Dryomov switch (img_req->op_type) { 25733da691bfSIlya Dryomov case OBJ_OP_READ: 2574ea9b743cSIlya Dryomov ret = rbd_obj_init_read(obj_req); 25753da691bfSIlya Dryomov break; 25763da691bfSIlya Dryomov case OBJ_OP_WRITE: 2577ea9b743cSIlya Dryomov ret = rbd_obj_init_write(obj_req); 25783da691bfSIlya Dryomov break; 25793da691bfSIlya Dryomov case OBJ_OP_DISCARD: 2580ea9b743cSIlya Dryomov ret = rbd_obj_init_discard(obj_req); 25813da691bfSIlya Dryomov break; 25826484cbe9SIlya Dryomov case OBJ_OP_ZEROOUT: 2583ea9b743cSIlya Dryomov ret = rbd_obj_init_zeroout(obj_req); 25846484cbe9SIlya Dryomov break; 25853da691bfSIlya Dryomov default: 258616809372SArnd Bergmann BUG(); 25873da691bfSIlya Dryomov } 25880c93e1b7SIlya Dryomov if (ret < 0) 25893da691bfSIlya Dryomov return ret; 25900c93e1b7SIlya Dryomov if (ret > 0) { 25910c93e1b7SIlya Dryomov rbd_img_obj_request_del(img_req, obj_req); 25920c93e1b7SIlya Dryomov continue; 25930c93e1b7SIlya Dryomov } 2594b454e36dSAlex Elder } 2595b454e36dSAlex Elder 25960192ce2eSIlya Dryomov img_req->state = RBD_IMG_START; 25973da691bfSIlya Dryomov return 0; 25983da691bfSIlya Dryomov } 25993da691bfSIlya Dryomov 26005a237819SIlya Dryomov union rbd_img_fill_iter { 26015a237819SIlya Dryomov struct ceph_bio_iter bio_iter; 26025a237819SIlya Dryomov struct ceph_bvec_iter bvec_iter; 26035a237819SIlya Dryomov }; 26045a237819SIlya Dryomov 26055a237819SIlya Dryomov struct rbd_img_fill_ctx { 26065a237819SIlya Dryomov enum obj_request_type pos_type; 26075a237819SIlya Dryomov union rbd_img_fill_iter *pos; 26085a237819SIlya Dryomov union rbd_img_fill_iter iter; 26095a237819SIlya Dryomov ceph_object_extent_fn_t set_pos_fn; 2610afb97888SIlya Dryomov ceph_object_extent_fn_t count_fn; 2611afb97888SIlya Dryomov ceph_object_extent_fn_t copy_fn; 26125a237819SIlya Dryomov }; 26135a237819SIlya Dryomov 26145a237819SIlya Dryomov static struct ceph_object_extent *alloc_object_extent(void *arg) 26155a237819SIlya Dryomov { 26165a237819SIlya Dryomov struct rbd_img_request *img_req = arg; 26175a237819SIlya Dryomov struct rbd_obj_request *obj_req; 26185a237819SIlya Dryomov 26195a237819SIlya Dryomov obj_req = rbd_obj_request_create(); 26205a237819SIlya Dryomov if (!obj_req) 26215a237819SIlya Dryomov return NULL; 26225a237819SIlya Dryomov 26235a237819SIlya Dryomov rbd_img_obj_request_add(img_req, obj_req); 26245a237819SIlya Dryomov return &obj_req->ex; 26255a237819SIlya Dryomov } 26265a237819SIlya Dryomov 26275a237819SIlya Dryomov /* 2628afb97888SIlya Dryomov * While su != os && sc == 1 is technically not fancy (it's the same 2629afb97888SIlya Dryomov * layout as su == os && sc == 1), we can't use the nocopy path for it 2630afb97888SIlya Dryomov * because ->set_pos_fn() should be called only once per object. 2631afb97888SIlya Dryomov * ceph_file_to_extents() invokes action_fn once per stripe unit, so 2632afb97888SIlya Dryomov * treat su != os && sc == 1 as fancy. 26335a237819SIlya Dryomov */ 2634afb97888SIlya Dryomov static bool rbd_layout_is_fancy(struct ceph_file_layout *l) 2635afb97888SIlya Dryomov { 2636afb97888SIlya Dryomov return l->stripe_unit != l->object_size; 2637afb97888SIlya Dryomov } 2638afb97888SIlya Dryomov 2639afb97888SIlya Dryomov static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req, 26405a237819SIlya Dryomov struct ceph_file_extent *img_extents, 26415a237819SIlya Dryomov u32 num_img_extents, 26425a237819SIlya Dryomov struct rbd_img_fill_ctx *fctx) 26435a237819SIlya Dryomov { 26445a237819SIlya Dryomov u32 i; 26455a237819SIlya Dryomov int ret; 26465a237819SIlya Dryomov 26475a237819SIlya Dryomov img_req->data_type = fctx->pos_type; 26485a237819SIlya Dryomov 26495a237819SIlya Dryomov /* 26505a237819SIlya Dryomov * Create object requests and set each object request's starting 26515a237819SIlya Dryomov * position in the provided bio (list) or bio_vec array. 26525a237819SIlya Dryomov */ 26535a237819SIlya Dryomov fctx->iter = *fctx->pos; 26545a237819SIlya Dryomov for (i = 0; i < num_img_extents; i++) { 26555a237819SIlya Dryomov ret = ceph_file_to_extents(&img_req->rbd_dev->layout, 26565a237819SIlya Dryomov img_extents[i].fe_off, 26575a237819SIlya Dryomov img_extents[i].fe_len, 26585a237819SIlya Dryomov &img_req->object_extents, 26595a237819SIlya Dryomov alloc_object_extent, img_req, 26605a237819SIlya Dryomov fctx->set_pos_fn, &fctx->iter); 26615a237819SIlya Dryomov if (ret) 26625a237819SIlya Dryomov return ret; 26635a237819SIlya Dryomov } 26645a237819SIlya Dryomov 26655a237819SIlya Dryomov return __rbd_img_fill_request(img_req); 26665a237819SIlya Dryomov } 26675a237819SIlya Dryomov 2668afb97888SIlya Dryomov /* 2669afb97888SIlya Dryomov * Map a list of image extents to a list of object extents, create the 2670afb97888SIlya Dryomov * corresponding object requests (normally each to a different object, 2671afb97888SIlya Dryomov * but not always) and add them to @img_req. For each object request, 2672afb97888SIlya Dryomov * set up its data descriptor to point to the corresponding chunk(s) of 2673afb97888SIlya Dryomov * @fctx->pos data buffer. 2674afb97888SIlya Dryomov * 2675afb97888SIlya Dryomov * Because ceph_file_to_extents() will merge adjacent object extents 2676afb97888SIlya Dryomov * together, each object request's data descriptor may point to multiple 2677afb97888SIlya Dryomov * different chunks of @fctx->pos data buffer. 2678afb97888SIlya Dryomov * 2679afb97888SIlya Dryomov * @fctx->pos data buffer is assumed to be large enough. 2680afb97888SIlya Dryomov */ 2681afb97888SIlya Dryomov static int rbd_img_fill_request(struct rbd_img_request *img_req, 2682afb97888SIlya Dryomov struct ceph_file_extent *img_extents, 2683afb97888SIlya Dryomov u32 num_img_extents, 2684afb97888SIlya Dryomov struct rbd_img_fill_ctx *fctx) 2685afb97888SIlya Dryomov { 2686afb97888SIlya Dryomov struct rbd_device *rbd_dev = img_req->rbd_dev; 2687afb97888SIlya Dryomov struct rbd_obj_request *obj_req; 2688afb97888SIlya Dryomov u32 i; 2689afb97888SIlya Dryomov int ret; 2690afb97888SIlya Dryomov 2691afb97888SIlya Dryomov if (fctx->pos_type == OBJ_REQUEST_NODATA || 2692afb97888SIlya Dryomov !rbd_layout_is_fancy(&rbd_dev->layout)) 2693afb97888SIlya Dryomov return rbd_img_fill_request_nocopy(img_req, img_extents, 2694afb97888SIlya Dryomov num_img_extents, fctx); 2695afb97888SIlya Dryomov 2696afb97888SIlya Dryomov img_req->data_type = OBJ_REQUEST_OWN_BVECS; 2697afb97888SIlya Dryomov 2698afb97888SIlya Dryomov /* 2699afb97888SIlya Dryomov * Create object requests and determine ->bvec_count for each object 2700afb97888SIlya Dryomov * request. Note that ->bvec_count sum over all object requests may 2701afb97888SIlya Dryomov * be greater than the number of bio_vecs in the provided bio (list) 2702afb97888SIlya Dryomov * or bio_vec array because when mapped, those bio_vecs can straddle 2703afb97888SIlya Dryomov * stripe unit boundaries. 2704afb97888SIlya Dryomov */ 2705afb97888SIlya Dryomov fctx->iter = *fctx->pos; 2706afb97888SIlya Dryomov for (i = 0; i < num_img_extents; i++) { 2707afb97888SIlya Dryomov ret = ceph_file_to_extents(&rbd_dev->layout, 2708afb97888SIlya Dryomov img_extents[i].fe_off, 2709afb97888SIlya Dryomov img_extents[i].fe_len, 2710afb97888SIlya Dryomov &img_req->object_extents, 2711afb97888SIlya Dryomov alloc_object_extent, img_req, 2712afb97888SIlya Dryomov fctx->count_fn, &fctx->iter); 2713afb97888SIlya Dryomov if (ret) 2714afb97888SIlya Dryomov return ret; 2715afb97888SIlya Dryomov } 2716afb97888SIlya Dryomov 2717afb97888SIlya Dryomov for_each_obj_request(img_req, obj_req) { 2718afb97888SIlya Dryomov obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count, 2719afb97888SIlya Dryomov sizeof(*obj_req->bvec_pos.bvecs), 2720afb97888SIlya Dryomov GFP_NOIO); 2721afb97888SIlya Dryomov if (!obj_req->bvec_pos.bvecs) 2722afb97888SIlya Dryomov return -ENOMEM; 2723afb97888SIlya Dryomov } 2724afb97888SIlya Dryomov 2725afb97888SIlya Dryomov /* 2726afb97888SIlya Dryomov * Fill in each object request's private bio_vec array, splitting and 2727afb97888SIlya Dryomov * rearranging the provided bio_vecs in stripe unit chunks as needed. 2728afb97888SIlya Dryomov */ 2729afb97888SIlya Dryomov fctx->iter = *fctx->pos; 2730afb97888SIlya Dryomov for (i = 0; i < num_img_extents; i++) { 2731afb97888SIlya Dryomov ret = ceph_iterate_extents(&rbd_dev->layout, 2732afb97888SIlya Dryomov img_extents[i].fe_off, 2733afb97888SIlya Dryomov img_extents[i].fe_len, 2734afb97888SIlya Dryomov &img_req->object_extents, 2735afb97888SIlya Dryomov fctx->copy_fn, &fctx->iter); 2736afb97888SIlya Dryomov if (ret) 2737afb97888SIlya Dryomov return ret; 2738afb97888SIlya Dryomov } 2739afb97888SIlya Dryomov 2740afb97888SIlya Dryomov return __rbd_img_fill_request(img_req); 2741afb97888SIlya Dryomov } 2742afb97888SIlya Dryomov 27435a237819SIlya Dryomov static int rbd_img_fill_nodata(struct rbd_img_request *img_req, 27445a237819SIlya Dryomov u64 off, u64 len) 27455a237819SIlya Dryomov { 27465a237819SIlya Dryomov struct ceph_file_extent ex = { off, len }; 27475a237819SIlya Dryomov union rbd_img_fill_iter dummy; 27485a237819SIlya Dryomov struct rbd_img_fill_ctx fctx = { 27495a237819SIlya Dryomov .pos_type = OBJ_REQUEST_NODATA, 27505a237819SIlya Dryomov .pos = &dummy, 27515a237819SIlya Dryomov }; 27525a237819SIlya Dryomov 27535a237819SIlya Dryomov return rbd_img_fill_request(img_req, &ex, 1, &fctx); 27545a237819SIlya Dryomov } 27555a237819SIlya Dryomov 27565a237819SIlya Dryomov static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg) 27575a237819SIlya Dryomov { 27585a237819SIlya Dryomov struct rbd_obj_request *obj_req = 27595a237819SIlya Dryomov container_of(ex, struct rbd_obj_request, ex); 27605a237819SIlya Dryomov struct ceph_bio_iter *it = arg; 27615a237819SIlya Dryomov 27625a237819SIlya Dryomov dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes); 27635a237819SIlya Dryomov obj_req->bio_pos = *it; 27645a237819SIlya Dryomov ceph_bio_iter_advance(it, bytes); 27655a237819SIlya Dryomov } 27665a237819SIlya Dryomov 2767afb97888SIlya Dryomov static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg) 2768afb97888SIlya Dryomov { 2769afb97888SIlya Dryomov struct rbd_obj_request *obj_req = 2770afb97888SIlya Dryomov container_of(ex, struct rbd_obj_request, ex); 2771afb97888SIlya Dryomov struct ceph_bio_iter *it = arg; 2772afb97888SIlya Dryomov 2773afb97888SIlya Dryomov dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes); 2774afb97888SIlya Dryomov ceph_bio_iter_advance_step(it, bytes, ({ 2775afb97888SIlya Dryomov obj_req->bvec_count++; 2776afb97888SIlya Dryomov })); 2777afb97888SIlya Dryomov 2778afb97888SIlya Dryomov } 2779afb97888SIlya Dryomov 2780afb97888SIlya Dryomov static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg) 2781afb97888SIlya Dryomov { 2782afb97888SIlya Dryomov struct rbd_obj_request *obj_req = 2783afb97888SIlya Dryomov container_of(ex, struct rbd_obj_request, ex); 2784afb97888SIlya Dryomov struct ceph_bio_iter *it = arg; 2785afb97888SIlya Dryomov 2786afb97888SIlya Dryomov dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes); 2787afb97888SIlya Dryomov ceph_bio_iter_advance_step(it, bytes, ({ 2788afb97888SIlya Dryomov obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv; 2789afb97888SIlya Dryomov obj_req->bvec_pos.iter.bi_size += bv.bv_len; 2790afb97888SIlya Dryomov })); 2791afb97888SIlya Dryomov } 2792afb97888SIlya Dryomov 27935a237819SIlya Dryomov static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req, 27945a237819SIlya Dryomov struct ceph_file_extent *img_extents, 27955a237819SIlya Dryomov u32 num_img_extents, 27965a237819SIlya Dryomov struct ceph_bio_iter *bio_pos) 27975a237819SIlya Dryomov { 27985a237819SIlya Dryomov struct rbd_img_fill_ctx fctx = { 27995a237819SIlya Dryomov .pos_type = OBJ_REQUEST_BIO, 28005a237819SIlya Dryomov .pos = (union rbd_img_fill_iter *)bio_pos, 28015a237819SIlya Dryomov .set_pos_fn = set_bio_pos, 2802afb97888SIlya Dryomov .count_fn = count_bio_bvecs, 2803afb97888SIlya Dryomov .copy_fn = copy_bio_bvecs, 28045a237819SIlya Dryomov }; 28055a237819SIlya Dryomov 28065a237819SIlya Dryomov return rbd_img_fill_request(img_req, img_extents, num_img_extents, 28075a237819SIlya Dryomov &fctx); 28085a237819SIlya Dryomov } 28095a237819SIlya Dryomov 28105a237819SIlya Dryomov static int rbd_img_fill_from_bio(struct rbd_img_request *img_req, 28115a237819SIlya Dryomov u64 off, u64 len, struct bio *bio) 28125a237819SIlya Dryomov { 28135a237819SIlya Dryomov struct ceph_file_extent ex = { off, len }; 28145a237819SIlya Dryomov struct ceph_bio_iter it = { .bio = bio, .iter = bio->bi_iter }; 28155a237819SIlya Dryomov 28165a237819SIlya Dryomov return __rbd_img_fill_from_bio(img_req, &ex, 1, &it); 28175a237819SIlya Dryomov } 28185a237819SIlya Dryomov 28195a237819SIlya Dryomov static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg) 28205a237819SIlya Dryomov { 28215a237819SIlya Dryomov struct rbd_obj_request *obj_req = 28225a237819SIlya Dryomov container_of(ex, struct rbd_obj_request, ex); 28235a237819SIlya Dryomov struct ceph_bvec_iter *it = arg; 28245a237819SIlya Dryomov 28255a237819SIlya Dryomov obj_req->bvec_pos = *it; 28265a237819SIlya Dryomov ceph_bvec_iter_shorten(&obj_req->bvec_pos, bytes); 28275a237819SIlya Dryomov ceph_bvec_iter_advance(it, bytes); 28285a237819SIlya Dryomov } 28295a237819SIlya Dryomov 2830afb97888SIlya Dryomov static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg) 2831afb97888SIlya Dryomov { 2832afb97888SIlya Dryomov struct rbd_obj_request *obj_req = 2833afb97888SIlya Dryomov container_of(ex, struct rbd_obj_request, ex); 2834afb97888SIlya Dryomov struct ceph_bvec_iter *it = arg; 2835afb97888SIlya Dryomov 2836afb97888SIlya Dryomov ceph_bvec_iter_advance_step(it, bytes, ({ 2837afb97888SIlya Dryomov obj_req->bvec_count++; 2838afb97888SIlya Dryomov })); 2839afb97888SIlya Dryomov } 2840afb97888SIlya Dryomov 2841afb97888SIlya Dryomov static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg) 2842afb97888SIlya Dryomov { 2843afb97888SIlya Dryomov struct rbd_obj_request *obj_req = 2844afb97888SIlya Dryomov container_of(ex, struct rbd_obj_request, ex); 2845afb97888SIlya Dryomov struct ceph_bvec_iter *it = arg; 2846afb97888SIlya Dryomov 2847afb97888SIlya Dryomov ceph_bvec_iter_advance_step(it, bytes, ({ 2848afb97888SIlya Dryomov obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv; 2849afb97888SIlya Dryomov obj_req->bvec_pos.iter.bi_size += bv.bv_len; 2850afb97888SIlya Dryomov })); 2851afb97888SIlya Dryomov } 2852afb97888SIlya Dryomov 28535a237819SIlya Dryomov static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req, 28545a237819SIlya Dryomov struct ceph_file_extent *img_extents, 28555a237819SIlya Dryomov u32 num_img_extents, 28565a237819SIlya Dryomov struct ceph_bvec_iter *bvec_pos) 28575a237819SIlya Dryomov { 28585a237819SIlya Dryomov struct rbd_img_fill_ctx fctx = { 28595a237819SIlya Dryomov .pos_type = OBJ_REQUEST_BVECS, 28605a237819SIlya Dryomov .pos = (union rbd_img_fill_iter *)bvec_pos, 28615a237819SIlya Dryomov .set_pos_fn = set_bvec_pos, 2862afb97888SIlya Dryomov .count_fn = count_bvecs, 2863afb97888SIlya Dryomov .copy_fn = copy_bvecs, 28645a237819SIlya Dryomov }; 28655a237819SIlya Dryomov 28665a237819SIlya Dryomov return rbd_img_fill_request(img_req, img_extents, num_img_extents, 28675a237819SIlya Dryomov &fctx); 28685a237819SIlya Dryomov } 28695a237819SIlya Dryomov 28705a237819SIlya Dryomov static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req, 28715a237819SIlya Dryomov struct ceph_file_extent *img_extents, 28725a237819SIlya Dryomov u32 num_img_extents, 28735a237819SIlya Dryomov struct bio_vec *bvecs) 28745a237819SIlya Dryomov { 28755a237819SIlya Dryomov struct ceph_bvec_iter it = { 28765a237819SIlya Dryomov .bvecs = bvecs, 28775a237819SIlya Dryomov .iter = { .bi_size = ceph_file_extents_bytes(img_extents, 28785a237819SIlya Dryomov num_img_extents) }, 28795a237819SIlya Dryomov }; 28805a237819SIlya Dryomov 28815a237819SIlya Dryomov return __rbd_img_fill_from_bvecs(img_req, img_extents, num_img_extents, 28825a237819SIlya Dryomov &it); 28835a237819SIlya Dryomov } 28845a237819SIlya Dryomov 28850192ce2eSIlya Dryomov static void rbd_img_handle_request_work(struct work_struct *work) 2886bf0d5f50SAlex Elder { 28870192ce2eSIlya Dryomov struct rbd_img_request *img_req = 28880192ce2eSIlya Dryomov container_of(work, struct rbd_img_request, work); 2889bf0d5f50SAlex Elder 28900192ce2eSIlya Dryomov rbd_img_handle_request(img_req, img_req->work_result); 28910192ce2eSIlya Dryomov } 2892bf0d5f50SAlex Elder 28930192ce2eSIlya Dryomov static void rbd_img_schedule(struct rbd_img_request *img_req, int result) 28940192ce2eSIlya Dryomov { 28950192ce2eSIlya Dryomov INIT_WORK(&img_req->work, rbd_img_handle_request_work); 28960192ce2eSIlya Dryomov img_req->work_result = result; 28970192ce2eSIlya Dryomov queue_work(rbd_wq, &img_req->work); 2898bf0d5f50SAlex Elder } 2899bf0d5f50SAlex Elder 290022e8bd51SIlya Dryomov static bool rbd_obj_may_exist(struct rbd_obj_request *obj_req) 290122e8bd51SIlya Dryomov { 290222e8bd51SIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 290322e8bd51SIlya Dryomov 290422e8bd51SIlya Dryomov if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno)) { 290522e8bd51SIlya Dryomov obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST; 290622e8bd51SIlya Dryomov return true; 290722e8bd51SIlya Dryomov } 290822e8bd51SIlya Dryomov 290922e8bd51SIlya Dryomov dout("%s %p objno %llu assuming dne\n", __func__, obj_req, 291022e8bd51SIlya Dryomov obj_req->ex.oe_objno); 291122e8bd51SIlya Dryomov return false; 291222e8bd51SIlya Dryomov } 291322e8bd51SIlya Dryomov 291485b5e6d1SIlya Dryomov static int rbd_obj_read_object(struct rbd_obj_request *obj_req) 291585b5e6d1SIlya Dryomov { 2916a086a1b8SIlya Dryomov struct ceph_osd_request *osd_req; 2917a086a1b8SIlya Dryomov int ret; 2918a086a1b8SIlya Dryomov 2919a086a1b8SIlya Dryomov osd_req = __rbd_obj_add_osd_request(obj_req, NULL, 1); 2920a086a1b8SIlya Dryomov if (IS_ERR(osd_req)) 2921a086a1b8SIlya Dryomov return PTR_ERR(osd_req); 2922a086a1b8SIlya Dryomov 2923a086a1b8SIlya Dryomov osd_req_op_extent_init(osd_req, 0, CEPH_OSD_OP_READ, 2924a086a1b8SIlya Dryomov obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0); 2925a086a1b8SIlya Dryomov rbd_osd_setup_data(osd_req, 0); 2926a086a1b8SIlya Dryomov rbd_osd_format_read(osd_req); 2927a086a1b8SIlya Dryomov 2928a086a1b8SIlya Dryomov ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO); 2929a086a1b8SIlya Dryomov if (ret) 2930a086a1b8SIlya Dryomov return ret; 2931a086a1b8SIlya Dryomov 2932a086a1b8SIlya Dryomov rbd_osd_submit(osd_req); 293385b5e6d1SIlya Dryomov return 0; 2934bf0d5f50SAlex Elder } 2935bf0d5f50SAlex Elder 293686bd7998SIlya Dryomov static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req) 29373da691bfSIlya Dryomov { 29383da691bfSIlya Dryomov struct rbd_img_request *img_req = obj_req->img_request; 29393da691bfSIlya Dryomov struct rbd_img_request *child_img_req; 29403da691bfSIlya Dryomov int ret; 29413da691bfSIlya Dryomov 2942e93aca0aSIlya Dryomov child_img_req = rbd_img_request_create(img_req->rbd_dev->parent, 2943e93aca0aSIlya Dryomov OBJ_OP_READ, NULL); 29443da691bfSIlya Dryomov if (!child_img_req) 29453da691bfSIlya Dryomov return -ENOMEM; 29463da691bfSIlya Dryomov 2947e93aca0aSIlya Dryomov __set_bit(IMG_REQ_CHILD, &child_img_req->flags); 2948e93aca0aSIlya Dryomov child_img_req->obj_request = obj_req; 2949e93aca0aSIlya Dryomov 295021ed05a8SIlya Dryomov dout("%s child_img_req %p for obj_req %p\n", __func__, child_img_req, 295121ed05a8SIlya Dryomov obj_req); 295221ed05a8SIlya Dryomov 29533da691bfSIlya Dryomov if (!rbd_img_is_write(img_req)) { 2954ecc633caSIlya Dryomov switch (img_req->data_type) { 29553da691bfSIlya Dryomov case OBJ_REQUEST_BIO: 29565a237819SIlya Dryomov ret = __rbd_img_fill_from_bio(child_img_req, 29575a237819SIlya Dryomov obj_req->img_extents, 29585a237819SIlya Dryomov obj_req->num_img_extents, 29593da691bfSIlya Dryomov &obj_req->bio_pos); 29603da691bfSIlya Dryomov break; 29613da691bfSIlya Dryomov case OBJ_REQUEST_BVECS: 2962afb97888SIlya Dryomov case OBJ_REQUEST_OWN_BVECS: 29635a237819SIlya Dryomov ret = __rbd_img_fill_from_bvecs(child_img_req, 29645a237819SIlya Dryomov obj_req->img_extents, 29655a237819SIlya Dryomov obj_req->num_img_extents, 29663da691bfSIlya Dryomov &obj_req->bvec_pos); 29673da691bfSIlya Dryomov break; 29683da691bfSIlya Dryomov default: 2969d342a15bSArnd Bergmann BUG(); 29703da691bfSIlya Dryomov } 29713da691bfSIlya Dryomov } else { 29725a237819SIlya Dryomov ret = rbd_img_fill_from_bvecs(child_img_req, 29735a237819SIlya Dryomov obj_req->img_extents, 29745a237819SIlya Dryomov obj_req->num_img_extents, 29755a237819SIlya Dryomov obj_req->copyup_bvecs); 29763da691bfSIlya Dryomov } 29773da691bfSIlya Dryomov if (ret) { 29783da691bfSIlya Dryomov rbd_img_request_put(child_img_req); 2979663ae2ccSIlya Dryomov return ret; 2980bf0d5f50SAlex Elder } 2981bf0d5f50SAlex Elder 29820192ce2eSIlya Dryomov /* avoid parent chain recursion */ 29830192ce2eSIlya Dryomov rbd_img_schedule(child_img_req, 0); 29843da691bfSIlya Dryomov return 0; 29853da691bfSIlya Dryomov } 29863da691bfSIlya Dryomov 298785b5e6d1SIlya Dryomov static bool rbd_obj_advance_read(struct rbd_obj_request *obj_req, int *result) 29888b3e1a56SAlex Elder { 29893da691bfSIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 29903da691bfSIlya Dryomov int ret; 29918b3e1a56SAlex Elder 299222e8bd51SIlya Dryomov again: 2993a9b67e69SIlya Dryomov switch (obj_req->read_state) { 299485b5e6d1SIlya Dryomov case RBD_OBJ_READ_START: 299585b5e6d1SIlya Dryomov rbd_assert(!*result); 299685b5e6d1SIlya Dryomov 299722e8bd51SIlya Dryomov if (!rbd_obj_may_exist(obj_req)) { 299822e8bd51SIlya Dryomov *result = -ENOENT; 299922e8bd51SIlya Dryomov obj_req->read_state = RBD_OBJ_READ_OBJECT; 300022e8bd51SIlya Dryomov goto again; 300122e8bd51SIlya Dryomov } 300222e8bd51SIlya Dryomov 300385b5e6d1SIlya Dryomov ret = rbd_obj_read_object(obj_req); 300485b5e6d1SIlya Dryomov if (ret) { 300585b5e6d1SIlya Dryomov *result = ret; 300685b5e6d1SIlya Dryomov return true; 300785b5e6d1SIlya Dryomov } 300885b5e6d1SIlya Dryomov obj_req->read_state = RBD_OBJ_READ_OBJECT; 300985b5e6d1SIlya Dryomov return false; 3010a9b67e69SIlya Dryomov case RBD_OBJ_READ_OBJECT: 3011a9b67e69SIlya Dryomov if (*result == -ENOENT && rbd_dev->parent_overlap) { 301286bd7998SIlya Dryomov /* reverse map this object extent onto the parent */ 301386bd7998SIlya Dryomov ret = rbd_obj_calc_img_extents(obj_req, false); 301486bd7998SIlya Dryomov if (ret) { 301554ab3b24SIlya Dryomov *result = ret; 301686bd7998SIlya Dryomov return true; 301786bd7998SIlya Dryomov } 301886bd7998SIlya Dryomov if (obj_req->num_img_extents) { 301986bd7998SIlya Dryomov ret = rbd_obj_read_from_parent(obj_req); 30203da691bfSIlya Dryomov if (ret) { 302154ab3b24SIlya Dryomov *result = ret; 30223da691bfSIlya Dryomov return true; 30233da691bfSIlya Dryomov } 3024a9b67e69SIlya Dryomov obj_req->read_state = RBD_OBJ_READ_PARENT; 30253da691bfSIlya Dryomov return false; 30263da691bfSIlya Dryomov } 302786bd7998SIlya Dryomov } 302802c74fbaSAlex Elder 302902c74fbaSAlex Elder /* 30303da691bfSIlya Dryomov * -ENOENT means a hole in the image -- zero-fill the entire 30313da691bfSIlya Dryomov * length of the request. A short read also implies zero-fill 303254ab3b24SIlya Dryomov * to the end of the request. 303302c74fbaSAlex Elder */ 303454ab3b24SIlya Dryomov if (*result == -ENOENT) { 303554ab3b24SIlya Dryomov rbd_obj_zero_range(obj_req, 0, obj_req->ex.oe_len); 303654ab3b24SIlya Dryomov *result = 0; 303754ab3b24SIlya Dryomov } else if (*result >= 0) { 303854ab3b24SIlya Dryomov if (*result < obj_req->ex.oe_len) 303954ab3b24SIlya Dryomov rbd_obj_zero_range(obj_req, *result, 304054ab3b24SIlya Dryomov obj_req->ex.oe_len - *result); 304154ab3b24SIlya Dryomov else 304254ab3b24SIlya Dryomov rbd_assert(*result == obj_req->ex.oe_len); 304354ab3b24SIlya Dryomov *result = 0; 30443da691bfSIlya Dryomov } 30453da691bfSIlya Dryomov return true; 3046a9b67e69SIlya Dryomov case RBD_OBJ_READ_PARENT: 3047d435c9a7SIlya Dryomov /* 3048d435c9a7SIlya Dryomov * The parent image is read only up to the overlap -- zero-fill 3049d435c9a7SIlya Dryomov * from the overlap to the end of the request. 3050d435c9a7SIlya Dryomov */ 3051d435c9a7SIlya Dryomov if (!*result) { 3052d435c9a7SIlya Dryomov u32 obj_overlap = rbd_obj_img_extents_bytes(obj_req); 3053d435c9a7SIlya Dryomov 3054d435c9a7SIlya Dryomov if (obj_overlap < obj_req->ex.oe_len) 3055d435c9a7SIlya Dryomov rbd_obj_zero_range(obj_req, obj_overlap, 3056d435c9a7SIlya Dryomov obj_req->ex.oe_len - obj_overlap); 3057d435c9a7SIlya Dryomov } 3058a9b67e69SIlya Dryomov return true; 3059a9b67e69SIlya Dryomov default: 3060a9b67e69SIlya Dryomov BUG(); 3061a9b67e69SIlya Dryomov } 30623da691bfSIlya Dryomov } 30633da691bfSIlya Dryomov 306422e8bd51SIlya Dryomov static bool rbd_obj_write_is_noop(struct rbd_obj_request *obj_req) 306522e8bd51SIlya Dryomov { 306622e8bd51SIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 306722e8bd51SIlya Dryomov 306822e8bd51SIlya Dryomov if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno)) 306922e8bd51SIlya Dryomov obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST; 307022e8bd51SIlya Dryomov 307122e8bd51SIlya Dryomov if (!(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST) && 307222e8bd51SIlya Dryomov (obj_req->flags & RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT)) { 307322e8bd51SIlya Dryomov dout("%s %p noop for nonexistent\n", __func__, obj_req); 30743da691bfSIlya Dryomov return true; 30753da691bfSIlya Dryomov } 30763da691bfSIlya Dryomov 307722e8bd51SIlya Dryomov return false; 307822e8bd51SIlya Dryomov } 307922e8bd51SIlya Dryomov 308022e8bd51SIlya Dryomov /* 308122e8bd51SIlya Dryomov * Return: 308222e8bd51SIlya Dryomov * 0 - object map update sent 308322e8bd51SIlya Dryomov * 1 - object map update isn't needed 308422e8bd51SIlya Dryomov * <0 - error 308522e8bd51SIlya Dryomov */ 308622e8bd51SIlya Dryomov static int rbd_obj_write_pre_object_map(struct rbd_obj_request *obj_req) 308722e8bd51SIlya Dryomov { 308822e8bd51SIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 308922e8bd51SIlya Dryomov u8 new_state; 309022e8bd51SIlya Dryomov 309122e8bd51SIlya Dryomov if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)) 309222e8bd51SIlya Dryomov return 1; 309322e8bd51SIlya Dryomov 309422e8bd51SIlya Dryomov if (obj_req->flags & RBD_OBJ_FLAG_DELETION) 309522e8bd51SIlya Dryomov new_state = OBJECT_PENDING; 309622e8bd51SIlya Dryomov else 309722e8bd51SIlya Dryomov new_state = OBJECT_EXISTS; 309822e8bd51SIlya Dryomov 309922e8bd51SIlya Dryomov return rbd_object_map_update(obj_req, CEPH_NOSNAP, new_state, NULL); 310022e8bd51SIlya Dryomov } 310122e8bd51SIlya Dryomov 310285b5e6d1SIlya Dryomov static int rbd_obj_write_object(struct rbd_obj_request *obj_req) 310385b5e6d1SIlya Dryomov { 3104a086a1b8SIlya Dryomov struct ceph_osd_request *osd_req; 3105a086a1b8SIlya Dryomov int num_ops = count_write_ops(obj_req); 3106a086a1b8SIlya Dryomov int which = 0; 3107a086a1b8SIlya Dryomov int ret; 3108a086a1b8SIlya Dryomov 3109a086a1b8SIlya Dryomov if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) 3110a086a1b8SIlya Dryomov num_ops++; /* stat */ 3111a086a1b8SIlya Dryomov 3112a086a1b8SIlya Dryomov osd_req = rbd_obj_add_osd_request(obj_req, num_ops); 3113a086a1b8SIlya Dryomov if (IS_ERR(osd_req)) 3114a086a1b8SIlya Dryomov return PTR_ERR(osd_req); 3115a086a1b8SIlya Dryomov 3116a086a1b8SIlya Dryomov if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) { 3117a086a1b8SIlya Dryomov ret = rbd_osd_setup_stat(osd_req, which++); 3118a086a1b8SIlya Dryomov if (ret) 3119a086a1b8SIlya Dryomov return ret; 3120a086a1b8SIlya Dryomov } 3121a086a1b8SIlya Dryomov 3122a086a1b8SIlya Dryomov rbd_osd_setup_write_ops(osd_req, which); 3123a086a1b8SIlya Dryomov rbd_osd_format_write(osd_req); 3124a086a1b8SIlya Dryomov 3125a086a1b8SIlya Dryomov ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO); 3126a086a1b8SIlya Dryomov if (ret) 3127a086a1b8SIlya Dryomov return ret; 3128a086a1b8SIlya Dryomov 3129a086a1b8SIlya Dryomov rbd_osd_submit(osd_req); 313085b5e6d1SIlya Dryomov return 0; 313185b5e6d1SIlya Dryomov } 313285b5e6d1SIlya Dryomov 31333da691bfSIlya Dryomov /* 31343da691bfSIlya Dryomov * copyup_bvecs pages are never highmem pages 31353da691bfSIlya Dryomov */ 31363da691bfSIlya Dryomov static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes) 31373da691bfSIlya Dryomov { 31383da691bfSIlya Dryomov struct ceph_bvec_iter it = { 31393da691bfSIlya Dryomov .bvecs = bvecs, 31403da691bfSIlya Dryomov .iter = { .bi_size = bytes }, 31413da691bfSIlya Dryomov }; 31423da691bfSIlya Dryomov 31433da691bfSIlya Dryomov ceph_bvec_iter_advance_step(&it, bytes, ({ 31443da691bfSIlya Dryomov if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0, 31453da691bfSIlya Dryomov bv.bv_len)) 31463da691bfSIlya Dryomov return false; 31473da691bfSIlya Dryomov })); 31483da691bfSIlya Dryomov return true; 31493da691bfSIlya Dryomov } 31503da691bfSIlya Dryomov 31513a482501SIlya Dryomov #define MODS_ONLY U32_MAX 31523a482501SIlya Dryomov 3153793333a3SIlya Dryomov static int rbd_obj_copyup_empty_snapc(struct rbd_obj_request *obj_req, 315489a59c1cSIlya Dryomov u32 bytes) 31553da691bfSIlya Dryomov { 3156bcbab1dbSIlya Dryomov struct ceph_osd_request *osd_req; 3157fe943d50SChengguang Xu int ret; 31583da691bfSIlya Dryomov 31593da691bfSIlya Dryomov dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes); 316089a59c1cSIlya Dryomov rbd_assert(bytes > 0 && bytes != MODS_ONLY); 31613da691bfSIlya Dryomov 3162bcbab1dbSIlya Dryomov osd_req = __rbd_obj_add_osd_request(obj_req, &rbd_empty_snapc, 1); 3163bcbab1dbSIlya Dryomov if (IS_ERR(osd_req)) 3164bcbab1dbSIlya Dryomov return PTR_ERR(osd_req); 31653da691bfSIlya Dryomov 3166b5ae8cbcSIlya Dryomov ret = rbd_osd_setup_copyup(osd_req, 0, bytes); 3167fe943d50SChengguang Xu if (ret) 3168fe943d50SChengguang Xu return ret; 3169fe943d50SChengguang Xu 3170bcbab1dbSIlya Dryomov rbd_osd_format_write(osd_req); 31713da691bfSIlya Dryomov 3172bcbab1dbSIlya Dryomov ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO); 317389a59c1cSIlya Dryomov if (ret) 317489a59c1cSIlya Dryomov return ret; 317589a59c1cSIlya Dryomov 3176a086a1b8SIlya Dryomov rbd_osd_submit(osd_req); 317789a59c1cSIlya Dryomov return 0; 317889a59c1cSIlya Dryomov } 317989a59c1cSIlya Dryomov 3180793333a3SIlya Dryomov static int rbd_obj_copyup_current_snapc(struct rbd_obj_request *obj_req, 3181793333a3SIlya Dryomov u32 bytes) 31823da691bfSIlya Dryomov { 3183bcbab1dbSIlya Dryomov struct ceph_osd_request *osd_req; 3184a086a1b8SIlya Dryomov int num_ops = count_write_ops(obj_req); 3185a086a1b8SIlya Dryomov int which = 0; 31863da691bfSIlya Dryomov int ret; 31873da691bfSIlya Dryomov 31883da691bfSIlya Dryomov dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes); 31893da691bfSIlya Dryomov 3190a086a1b8SIlya Dryomov if (bytes != MODS_ONLY) 3191a086a1b8SIlya Dryomov num_ops++; /* copyup */ 319213488d53SIlya Dryomov 3193a086a1b8SIlya Dryomov osd_req = rbd_obj_add_osd_request(obj_req, num_ops); 3194bcbab1dbSIlya Dryomov if (IS_ERR(osd_req)) 3195bcbab1dbSIlya Dryomov return PTR_ERR(osd_req); 31963da691bfSIlya Dryomov 31973a482501SIlya Dryomov if (bytes != MODS_ONLY) { 3198b5ae8cbcSIlya Dryomov ret = rbd_osd_setup_copyup(osd_req, which++, bytes); 31993da691bfSIlya Dryomov if (ret) 32003da691bfSIlya Dryomov return ret; 32013a482501SIlya Dryomov } 32023da691bfSIlya Dryomov 3203a086a1b8SIlya Dryomov rbd_osd_setup_write_ops(osd_req, which); 3204a086a1b8SIlya Dryomov rbd_osd_format_write(osd_req); 32053da691bfSIlya Dryomov 3206bcbab1dbSIlya Dryomov ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO); 32073da691bfSIlya Dryomov if (ret) 32083da691bfSIlya Dryomov return ret; 32093da691bfSIlya Dryomov 3210a086a1b8SIlya Dryomov rbd_osd_submit(osd_req); 32113da691bfSIlya Dryomov return 0; 32123da691bfSIlya Dryomov } 32133da691bfSIlya Dryomov 32147e07efb1SIlya Dryomov static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap) 32157e07efb1SIlya Dryomov { 32167e07efb1SIlya Dryomov u32 i; 32177e07efb1SIlya Dryomov 32187e07efb1SIlya Dryomov rbd_assert(!obj_req->copyup_bvecs); 32197e07efb1SIlya Dryomov obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap); 32207e07efb1SIlya Dryomov obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count, 32217e07efb1SIlya Dryomov sizeof(*obj_req->copyup_bvecs), 32227e07efb1SIlya Dryomov GFP_NOIO); 32237e07efb1SIlya Dryomov if (!obj_req->copyup_bvecs) 32247e07efb1SIlya Dryomov return -ENOMEM; 32257e07efb1SIlya Dryomov 32267e07efb1SIlya Dryomov for (i = 0; i < obj_req->copyup_bvec_count; i++) { 32277e07efb1SIlya Dryomov unsigned int len = min(obj_overlap, (u64)PAGE_SIZE); 32287e07efb1SIlya Dryomov 32297e07efb1SIlya Dryomov obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO); 32307e07efb1SIlya Dryomov if (!obj_req->copyup_bvecs[i].bv_page) 32317e07efb1SIlya Dryomov return -ENOMEM; 32327e07efb1SIlya Dryomov 32337e07efb1SIlya Dryomov obj_req->copyup_bvecs[i].bv_offset = 0; 32347e07efb1SIlya Dryomov obj_req->copyup_bvecs[i].bv_len = len; 32357e07efb1SIlya Dryomov obj_overlap -= len; 32367e07efb1SIlya Dryomov } 32377e07efb1SIlya Dryomov 32387e07efb1SIlya Dryomov rbd_assert(!obj_overlap); 32397e07efb1SIlya Dryomov return 0; 32407e07efb1SIlya Dryomov } 32417e07efb1SIlya Dryomov 32420ad5d953SIlya Dryomov /* 32430ad5d953SIlya Dryomov * The target object doesn't exist. Read the data for the entire 32440ad5d953SIlya Dryomov * target object up to the overlap point (if any) from the parent, 32450ad5d953SIlya Dryomov * so we can use it for a copyup. 32460ad5d953SIlya Dryomov */ 3247793333a3SIlya Dryomov static int rbd_obj_copyup_read_parent(struct rbd_obj_request *obj_req) 32483da691bfSIlya Dryomov { 32493da691bfSIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 32503da691bfSIlya Dryomov int ret; 32513da691bfSIlya Dryomov 325286bd7998SIlya Dryomov rbd_assert(obj_req->num_img_extents); 325386bd7998SIlya Dryomov prune_extents(obj_req->img_extents, &obj_req->num_img_extents, 325486bd7998SIlya Dryomov rbd_dev->parent_overlap); 325586bd7998SIlya Dryomov if (!obj_req->num_img_extents) { 32563da691bfSIlya Dryomov /* 32573da691bfSIlya Dryomov * The overlap has become 0 (most likely because the 32583a482501SIlya Dryomov * image has been flattened). Re-submit the original write 32593a482501SIlya Dryomov * request -- pass MODS_ONLY since the copyup isn't needed 32603a482501SIlya Dryomov * anymore. 32613da691bfSIlya Dryomov */ 3262793333a3SIlya Dryomov return rbd_obj_copyup_current_snapc(obj_req, MODS_ONLY); 32633da691bfSIlya Dryomov } 32643da691bfSIlya Dryomov 326586bd7998SIlya Dryomov ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req)); 32663da691bfSIlya Dryomov if (ret) 32673da691bfSIlya Dryomov return ret; 32683da691bfSIlya Dryomov 326986bd7998SIlya Dryomov return rbd_obj_read_from_parent(obj_req); 32703da691bfSIlya Dryomov } 32713da691bfSIlya Dryomov 327222e8bd51SIlya Dryomov static void rbd_obj_copyup_object_maps(struct rbd_obj_request *obj_req) 32733da691bfSIlya Dryomov { 327422e8bd51SIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 327522e8bd51SIlya Dryomov struct ceph_snap_context *snapc = obj_req->img_request->snapc; 327622e8bd51SIlya Dryomov u8 new_state; 327722e8bd51SIlya Dryomov u32 i; 32783da691bfSIlya Dryomov int ret; 32793da691bfSIlya Dryomov 328022e8bd51SIlya Dryomov rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending); 32813da691bfSIlya Dryomov 328222e8bd51SIlya Dryomov if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)) 328322e8bd51SIlya Dryomov return; 328489a59c1cSIlya Dryomov 328522e8bd51SIlya Dryomov if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS) 328622e8bd51SIlya Dryomov return; 32873da691bfSIlya Dryomov 328822e8bd51SIlya Dryomov for (i = 0; i < snapc->num_snaps; i++) { 328922e8bd51SIlya Dryomov if ((rbd_dev->header.features & RBD_FEATURE_FAST_DIFF) && 329022e8bd51SIlya Dryomov i + 1 < snapc->num_snaps) 329122e8bd51SIlya Dryomov new_state = OBJECT_EXISTS_CLEAN; 329222e8bd51SIlya Dryomov else 329322e8bd51SIlya Dryomov new_state = OBJECT_EXISTS; 32943da691bfSIlya Dryomov 329522e8bd51SIlya Dryomov ret = rbd_object_map_update(obj_req, snapc->snaps[i], 329622e8bd51SIlya Dryomov new_state, NULL); 329722e8bd51SIlya Dryomov if (ret < 0) { 329822e8bd51SIlya Dryomov obj_req->pending.result = ret; 329902c74fbaSAlex Elder return; 330002c74fbaSAlex Elder } 330102c74fbaSAlex Elder 330222e8bd51SIlya Dryomov rbd_assert(!ret); 330322e8bd51SIlya Dryomov obj_req->pending.num_pending++; 3304a9e8ba2cSAlex Elder } 33058b3e1a56SAlex Elder } 33068b3e1a56SAlex Elder 3307793333a3SIlya Dryomov static void rbd_obj_copyup_write_object(struct rbd_obj_request *obj_req) 33088b3e1a56SAlex Elder { 3309793333a3SIlya Dryomov u32 bytes = rbd_obj_img_extents_bytes(obj_req); 3310793333a3SIlya Dryomov int ret; 33118b3e1a56SAlex Elder 3312793333a3SIlya Dryomov rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending); 33138b3e1a56SAlex Elder 3314793333a3SIlya Dryomov /* 3315793333a3SIlya Dryomov * Only send non-zero copyup data to save some I/O and network 3316793333a3SIlya Dryomov * bandwidth -- zero copyup data is equivalent to the object not 3317793333a3SIlya Dryomov * existing. 3318793333a3SIlya Dryomov */ 3319793333a3SIlya Dryomov if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS) 3320793333a3SIlya Dryomov bytes = 0; 3321793333a3SIlya Dryomov 3322793333a3SIlya Dryomov if (obj_req->img_request->snapc->num_snaps && bytes > 0) { 3323793333a3SIlya Dryomov /* 3324793333a3SIlya Dryomov * Send a copyup request with an empty snapshot context to 3325793333a3SIlya Dryomov * deep-copyup the object through all existing snapshots. 3326793333a3SIlya Dryomov * A second request with the current snapshot context will be 3327793333a3SIlya Dryomov * sent for the actual modification. 3328793333a3SIlya Dryomov */ 3329793333a3SIlya Dryomov ret = rbd_obj_copyup_empty_snapc(obj_req, bytes); 3330793333a3SIlya Dryomov if (ret) { 3331793333a3SIlya Dryomov obj_req->pending.result = ret; 3332793333a3SIlya Dryomov return; 33337114edacSIlya Dryomov } 33348b3e1a56SAlex Elder 3335793333a3SIlya Dryomov obj_req->pending.num_pending++; 3336793333a3SIlya Dryomov bytes = MODS_ONLY; 33373da691bfSIlya Dryomov } 33388b3e1a56SAlex Elder 3339793333a3SIlya Dryomov ret = rbd_obj_copyup_current_snapc(obj_req, bytes); 3340793333a3SIlya Dryomov if (ret) { 3341793333a3SIlya Dryomov obj_req->pending.result = ret; 3342793333a3SIlya Dryomov return; 3343793333a3SIlya Dryomov } 3344793333a3SIlya Dryomov 3345793333a3SIlya Dryomov obj_req->pending.num_pending++; 3346793333a3SIlya Dryomov } 3347793333a3SIlya Dryomov 3348793333a3SIlya Dryomov static bool rbd_obj_advance_copyup(struct rbd_obj_request *obj_req, int *result) 33493da691bfSIlya Dryomov { 335022e8bd51SIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 3351793333a3SIlya Dryomov int ret; 33527114edacSIlya Dryomov 33537114edacSIlya Dryomov again: 3354793333a3SIlya Dryomov switch (obj_req->copyup_state) { 3355793333a3SIlya Dryomov case RBD_OBJ_COPYUP_START: 3356793333a3SIlya Dryomov rbd_assert(!*result); 33573da691bfSIlya Dryomov 3358793333a3SIlya Dryomov ret = rbd_obj_copyup_read_parent(obj_req); 3359793333a3SIlya Dryomov if (ret) { 3360793333a3SIlya Dryomov *result = ret; 3361793333a3SIlya Dryomov return true; 3362793333a3SIlya Dryomov } 3363793333a3SIlya Dryomov if (obj_req->num_img_extents) 3364793333a3SIlya Dryomov obj_req->copyup_state = RBD_OBJ_COPYUP_READ_PARENT; 3365793333a3SIlya Dryomov else 3366793333a3SIlya Dryomov obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT; 3367793333a3SIlya Dryomov return false; 3368793333a3SIlya Dryomov case RBD_OBJ_COPYUP_READ_PARENT: 3369793333a3SIlya Dryomov if (*result) 3370793333a3SIlya Dryomov return true; 3371793333a3SIlya Dryomov 3372793333a3SIlya Dryomov if (is_zero_bvecs(obj_req->copyup_bvecs, 3373793333a3SIlya Dryomov rbd_obj_img_extents_bytes(obj_req))) { 3374793333a3SIlya Dryomov dout("%s %p detected zeros\n", __func__, obj_req); 3375793333a3SIlya Dryomov obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ZEROS; 33767114edacSIlya Dryomov } 33777114edacSIlya Dryomov 337822e8bd51SIlya Dryomov rbd_obj_copyup_object_maps(obj_req); 337922e8bd51SIlya Dryomov if (!obj_req->pending.num_pending) { 338022e8bd51SIlya Dryomov *result = obj_req->pending.result; 338122e8bd51SIlya Dryomov obj_req->copyup_state = RBD_OBJ_COPYUP_OBJECT_MAPS; 33827114edacSIlya Dryomov goto again; 33837114edacSIlya Dryomov } 338422e8bd51SIlya Dryomov obj_req->copyup_state = __RBD_OBJ_COPYUP_OBJECT_MAPS; 338522e8bd51SIlya Dryomov return false; 338622e8bd51SIlya Dryomov case __RBD_OBJ_COPYUP_OBJECT_MAPS: 338722e8bd51SIlya Dryomov if (!pending_result_dec(&obj_req->pending, result)) 338822e8bd51SIlya Dryomov return false; 338922e8bd51SIlya Dryomov /* fall through */ 339022e8bd51SIlya Dryomov case RBD_OBJ_COPYUP_OBJECT_MAPS: 339122e8bd51SIlya Dryomov if (*result) { 339222e8bd51SIlya Dryomov rbd_warn(rbd_dev, "snap object map update failed: %d", 339322e8bd51SIlya Dryomov *result); 339422e8bd51SIlya Dryomov return true; 339522e8bd51SIlya Dryomov } 339622e8bd51SIlya Dryomov 3397793333a3SIlya Dryomov rbd_obj_copyup_write_object(obj_req); 3398793333a3SIlya Dryomov if (!obj_req->pending.num_pending) { 3399793333a3SIlya Dryomov *result = obj_req->pending.result; 3400793333a3SIlya Dryomov obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT; 3401793333a3SIlya Dryomov goto again; 3402793333a3SIlya Dryomov } 3403793333a3SIlya Dryomov obj_req->copyup_state = __RBD_OBJ_COPYUP_WRITE_OBJECT; 3404793333a3SIlya Dryomov return false; 3405793333a3SIlya Dryomov case __RBD_OBJ_COPYUP_WRITE_OBJECT: 3406793333a3SIlya Dryomov if (!pending_result_dec(&obj_req->pending, result)) 3407793333a3SIlya Dryomov return false; 3408793333a3SIlya Dryomov /* fall through */ 3409793333a3SIlya Dryomov case RBD_OBJ_COPYUP_WRITE_OBJECT: 3410793333a3SIlya Dryomov return true; 3411793333a3SIlya Dryomov default: 3412793333a3SIlya Dryomov BUG(); 3413793333a3SIlya Dryomov } 3414793333a3SIlya Dryomov } 3415793333a3SIlya Dryomov 341622e8bd51SIlya Dryomov /* 341722e8bd51SIlya Dryomov * Return: 341822e8bd51SIlya Dryomov * 0 - object map update sent 341922e8bd51SIlya Dryomov * 1 - object map update isn't needed 342022e8bd51SIlya Dryomov * <0 - error 342122e8bd51SIlya Dryomov */ 342222e8bd51SIlya Dryomov static int rbd_obj_write_post_object_map(struct rbd_obj_request *obj_req) 342322e8bd51SIlya Dryomov { 342422e8bd51SIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 342522e8bd51SIlya Dryomov u8 current_state = OBJECT_PENDING; 342622e8bd51SIlya Dryomov 342722e8bd51SIlya Dryomov if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)) 342822e8bd51SIlya Dryomov return 1; 342922e8bd51SIlya Dryomov 343022e8bd51SIlya Dryomov if (!(obj_req->flags & RBD_OBJ_FLAG_DELETION)) 343122e8bd51SIlya Dryomov return 1; 343222e8bd51SIlya Dryomov 343322e8bd51SIlya Dryomov return rbd_object_map_update(obj_req, CEPH_NOSNAP, OBJECT_NONEXISTENT, 343422e8bd51SIlya Dryomov ¤t_state); 343522e8bd51SIlya Dryomov } 343622e8bd51SIlya Dryomov 343785b5e6d1SIlya Dryomov static bool rbd_obj_advance_write(struct rbd_obj_request *obj_req, int *result) 3438b8d70035SAlex Elder { 3439793333a3SIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 3440b8d70035SAlex Elder int ret; 3441b8d70035SAlex Elder 3442793333a3SIlya Dryomov again: 3443cf81b60eSAlex Elder switch (obj_req->write_state) { 344485b5e6d1SIlya Dryomov case RBD_OBJ_WRITE_START: 344585b5e6d1SIlya Dryomov rbd_assert(!*result); 344685b5e6d1SIlya Dryomov 344722e8bd51SIlya Dryomov if (rbd_obj_write_is_noop(obj_req)) 344822e8bd51SIlya Dryomov return true; 344922e8bd51SIlya Dryomov 345022e8bd51SIlya Dryomov ret = rbd_obj_write_pre_object_map(obj_req); 345122e8bd51SIlya Dryomov if (ret < 0) { 345222e8bd51SIlya Dryomov *result = ret; 345322e8bd51SIlya Dryomov return true; 345422e8bd51SIlya Dryomov } 345522e8bd51SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_PRE_OBJECT_MAP; 345622e8bd51SIlya Dryomov if (ret > 0) 345722e8bd51SIlya Dryomov goto again; 345822e8bd51SIlya Dryomov return false; 345922e8bd51SIlya Dryomov case RBD_OBJ_WRITE_PRE_OBJECT_MAP: 346022e8bd51SIlya Dryomov if (*result) { 346122e8bd51SIlya Dryomov rbd_warn(rbd_dev, "pre object map update failed: %d", 346222e8bd51SIlya Dryomov *result); 346322e8bd51SIlya Dryomov return true; 346422e8bd51SIlya Dryomov } 346585b5e6d1SIlya Dryomov ret = rbd_obj_write_object(obj_req); 346685b5e6d1SIlya Dryomov if (ret) { 346785b5e6d1SIlya Dryomov *result = ret; 346885b5e6d1SIlya Dryomov return true; 346985b5e6d1SIlya Dryomov } 347085b5e6d1SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_OBJECT; 347185b5e6d1SIlya Dryomov return false; 34720ad5d953SIlya Dryomov case RBD_OBJ_WRITE_OBJECT: 347354ab3b24SIlya Dryomov if (*result == -ENOENT) { 34740ad5d953SIlya Dryomov if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) { 3475793333a3SIlya Dryomov *result = 0; 3476793333a3SIlya Dryomov obj_req->copyup_state = RBD_OBJ_COPYUP_START; 3477793333a3SIlya Dryomov obj_req->write_state = __RBD_OBJ_WRITE_COPYUP; 3478793333a3SIlya Dryomov goto again; 3479b8d70035SAlex Elder } 34800ad5d953SIlya Dryomov /* 34810ad5d953SIlya Dryomov * On a non-existent object: 34820ad5d953SIlya Dryomov * delete - -ENOENT, truncate/zero - 0 34830ad5d953SIlya Dryomov */ 34840ad5d953SIlya Dryomov if (obj_req->flags & RBD_OBJ_FLAG_DELETION) 34850ad5d953SIlya Dryomov *result = 0; 34860ad5d953SIlya Dryomov } 3487793333a3SIlya Dryomov if (*result) 3488793333a3SIlya Dryomov return true; 3489793333a3SIlya Dryomov 3490793333a3SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_COPYUP; 3491793333a3SIlya Dryomov goto again; 3492793333a3SIlya Dryomov case __RBD_OBJ_WRITE_COPYUP: 3493793333a3SIlya Dryomov if (!rbd_obj_advance_copyup(obj_req, result)) 3494793333a3SIlya Dryomov return false; 34959969ebc5SAlex Elder /* fall through */ 3496793333a3SIlya Dryomov case RBD_OBJ_WRITE_COPYUP: 349722e8bd51SIlya Dryomov if (*result) { 3498793333a3SIlya Dryomov rbd_warn(rbd_dev, "copyup failed: %d", *result); 3499bf0d5f50SAlex Elder return true; 350022e8bd51SIlya Dryomov } 350122e8bd51SIlya Dryomov ret = rbd_obj_write_post_object_map(obj_req); 350222e8bd51SIlya Dryomov if (ret < 0) { 350322e8bd51SIlya Dryomov *result = ret; 350422e8bd51SIlya Dryomov return true; 350522e8bd51SIlya Dryomov } 350622e8bd51SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_POST_OBJECT_MAP; 350722e8bd51SIlya Dryomov if (ret > 0) 350822e8bd51SIlya Dryomov goto again; 350922e8bd51SIlya Dryomov return false; 351022e8bd51SIlya Dryomov case RBD_OBJ_WRITE_POST_OBJECT_MAP: 351122e8bd51SIlya Dryomov if (*result) 351222e8bd51SIlya Dryomov rbd_warn(rbd_dev, "post object map update failed: %d", 351322e8bd51SIlya Dryomov *result); 351422e8bd51SIlya Dryomov return true; 3515bf0d5f50SAlex Elder default: 3516bf0d5f50SAlex Elder BUG(); 3517bf0d5f50SAlex Elder } 3518bf0d5f50SAlex Elder } 3519bf0d5f50SAlex Elder 3520bf0d5f50SAlex Elder /* 35210ad5d953SIlya Dryomov * Return true if @obj_req is completed. 3522bf0d5f50SAlex Elder */ 352354ab3b24SIlya Dryomov static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req, 352454ab3b24SIlya Dryomov int *result) 3525bf0d5f50SAlex Elder { 35260ad5d953SIlya Dryomov struct rbd_img_request *img_req = obj_req->img_request; 35270192ce2eSIlya Dryomov struct rbd_device *rbd_dev = img_req->rbd_dev; 35280ad5d953SIlya Dryomov bool done; 35290ad5d953SIlya Dryomov 353085b5e6d1SIlya Dryomov mutex_lock(&obj_req->state_mutex); 35310ad5d953SIlya Dryomov if (!rbd_img_is_write(img_req)) 353285b5e6d1SIlya Dryomov done = rbd_obj_advance_read(obj_req, result); 35330ad5d953SIlya Dryomov else 353485b5e6d1SIlya Dryomov done = rbd_obj_advance_write(obj_req, result); 353585b5e6d1SIlya Dryomov mutex_unlock(&obj_req->state_mutex); 35360ad5d953SIlya Dryomov 35370192ce2eSIlya Dryomov if (done && *result) { 35380192ce2eSIlya Dryomov rbd_assert(*result < 0); 35390192ce2eSIlya Dryomov rbd_warn(rbd_dev, "%s at objno %llu %llu~%llu result %d", 35400192ce2eSIlya Dryomov obj_op_name(img_req->op_type), obj_req->ex.oe_objno, 35410192ce2eSIlya Dryomov obj_req->ex.oe_off, obj_req->ex.oe_len, *result); 35420192ce2eSIlya Dryomov } 35430ad5d953SIlya Dryomov return done; 35449969ebc5SAlex Elder } 35459969ebc5SAlex Elder 35460192ce2eSIlya Dryomov /* 35470192ce2eSIlya Dryomov * This is open-coded in rbd_img_handle_request() to avoid parent chain 35480192ce2eSIlya Dryomov * recursion. 35490192ce2eSIlya Dryomov */ 355054ab3b24SIlya Dryomov static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result) 35519969ebc5SAlex Elder { 35520192ce2eSIlya Dryomov if (__rbd_obj_handle_request(obj_req, &result)) 35530192ce2eSIlya Dryomov rbd_img_handle_request(obj_req->img_request, result); 35547114edacSIlya Dryomov } 35557114edacSIlya Dryomov 3556e1fddc8fSIlya Dryomov static bool need_exclusive_lock(struct rbd_img_request *img_req) 3557e1fddc8fSIlya Dryomov { 3558e1fddc8fSIlya Dryomov struct rbd_device *rbd_dev = img_req->rbd_dev; 3559e1fddc8fSIlya Dryomov 3560e1fddc8fSIlya Dryomov if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) 3561e1fddc8fSIlya Dryomov return false; 3562e1fddc8fSIlya Dryomov 3563f3c0e459SIlya Dryomov if (rbd_is_snap(rbd_dev)) 3564e1fddc8fSIlya Dryomov return false; 3565e1fddc8fSIlya Dryomov 3566e1fddc8fSIlya Dryomov rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags)); 356722e8bd51SIlya Dryomov if (rbd_dev->opts->lock_on_read || 356822e8bd51SIlya Dryomov (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)) 3569e1fddc8fSIlya Dryomov return true; 3570e1fddc8fSIlya Dryomov 3571e1fddc8fSIlya Dryomov return rbd_img_is_write(img_req); 3572e1fddc8fSIlya Dryomov } 3573e1fddc8fSIlya Dryomov 3574637cd060SIlya Dryomov static bool rbd_lock_add_request(struct rbd_img_request *img_req) 3575e1fddc8fSIlya Dryomov { 3576e1fddc8fSIlya Dryomov struct rbd_device *rbd_dev = img_req->rbd_dev; 3577637cd060SIlya Dryomov bool locked; 3578e1fddc8fSIlya Dryomov 3579e1fddc8fSIlya Dryomov lockdep_assert_held(&rbd_dev->lock_rwsem); 3580637cd060SIlya Dryomov locked = rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED; 3581e1fddc8fSIlya Dryomov spin_lock(&rbd_dev->lock_lists_lock); 3582e1fddc8fSIlya Dryomov rbd_assert(list_empty(&img_req->lock_item)); 3583637cd060SIlya Dryomov if (!locked) 3584637cd060SIlya Dryomov list_add_tail(&img_req->lock_item, &rbd_dev->acquiring_list); 3585637cd060SIlya Dryomov else 3586e1fddc8fSIlya Dryomov list_add_tail(&img_req->lock_item, &rbd_dev->running_list); 3587e1fddc8fSIlya Dryomov spin_unlock(&rbd_dev->lock_lists_lock); 3588637cd060SIlya Dryomov return locked; 3589e1fddc8fSIlya Dryomov } 3590e1fddc8fSIlya Dryomov 3591e1fddc8fSIlya Dryomov static void rbd_lock_del_request(struct rbd_img_request *img_req) 3592e1fddc8fSIlya Dryomov { 3593e1fddc8fSIlya Dryomov struct rbd_device *rbd_dev = img_req->rbd_dev; 3594e1fddc8fSIlya Dryomov bool need_wakeup; 3595e1fddc8fSIlya Dryomov 3596e1fddc8fSIlya Dryomov lockdep_assert_held(&rbd_dev->lock_rwsem); 3597e1fddc8fSIlya Dryomov spin_lock(&rbd_dev->lock_lists_lock); 3598e1fddc8fSIlya Dryomov rbd_assert(!list_empty(&img_req->lock_item)); 3599e1fddc8fSIlya Dryomov list_del_init(&img_req->lock_item); 3600e1fddc8fSIlya Dryomov need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING && 3601e1fddc8fSIlya Dryomov list_empty(&rbd_dev->running_list)); 3602e1fddc8fSIlya Dryomov spin_unlock(&rbd_dev->lock_lists_lock); 3603e1fddc8fSIlya Dryomov if (need_wakeup) 3604e1fddc8fSIlya Dryomov complete(&rbd_dev->releasing_wait); 3605e1fddc8fSIlya Dryomov } 3606e1fddc8fSIlya Dryomov 3607637cd060SIlya Dryomov static int rbd_img_exclusive_lock(struct rbd_img_request *img_req) 3608637cd060SIlya Dryomov { 3609637cd060SIlya Dryomov struct rbd_device *rbd_dev = img_req->rbd_dev; 3610637cd060SIlya Dryomov 3611637cd060SIlya Dryomov if (!need_exclusive_lock(img_req)) 3612637cd060SIlya Dryomov return 1; 3613637cd060SIlya Dryomov 3614637cd060SIlya Dryomov if (rbd_lock_add_request(img_req)) 3615637cd060SIlya Dryomov return 1; 3616637cd060SIlya Dryomov 3617637cd060SIlya Dryomov if (rbd_dev->opts->exclusive) { 3618637cd060SIlya Dryomov WARN_ON(1); /* lock got released? */ 3619637cd060SIlya Dryomov return -EROFS; 3620637cd060SIlya Dryomov } 3621637cd060SIlya Dryomov 3622637cd060SIlya Dryomov /* 3623637cd060SIlya Dryomov * Note the use of mod_delayed_work() in rbd_acquire_lock() 3624637cd060SIlya Dryomov * and cancel_delayed_work() in wake_lock_waiters(). 3625637cd060SIlya Dryomov */ 3626637cd060SIlya Dryomov dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev); 3627637cd060SIlya Dryomov queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0); 3628637cd060SIlya Dryomov return 0; 3629637cd060SIlya Dryomov } 3630637cd060SIlya Dryomov 36310192ce2eSIlya Dryomov static void rbd_img_object_requests(struct rbd_img_request *img_req) 36320192ce2eSIlya Dryomov { 36330192ce2eSIlya Dryomov struct rbd_obj_request *obj_req; 36340192ce2eSIlya Dryomov 36350192ce2eSIlya Dryomov rbd_assert(!img_req->pending.result && !img_req->pending.num_pending); 36360192ce2eSIlya Dryomov 36370192ce2eSIlya Dryomov for_each_obj_request(img_req, obj_req) { 36380192ce2eSIlya Dryomov int result = 0; 36390192ce2eSIlya Dryomov 36400192ce2eSIlya Dryomov if (__rbd_obj_handle_request(obj_req, &result)) { 36410192ce2eSIlya Dryomov if (result) { 36420192ce2eSIlya Dryomov img_req->pending.result = result; 36430192ce2eSIlya Dryomov return; 36440192ce2eSIlya Dryomov } 36450192ce2eSIlya Dryomov } else { 36460192ce2eSIlya Dryomov img_req->pending.num_pending++; 36470192ce2eSIlya Dryomov } 36480192ce2eSIlya Dryomov } 36490192ce2eSIlya Dryomov } 36500192ce2eSIlya Dryomov 36510192ce2eSIlya Dryomov static bool rbd_img_advance(struct rbd_img_request *img_req, int *result) 36520192ce2eSIlya Dryomov { 3653637cd060SIlya Dryomov struct rbd_device *rbd_dev = img_req->rbd_dev; 3654637cd060SIlya Dryomov int ret; 3655637cd060SIlya Dryomov 36560192ce2eSIlya Dryomov again: 36570192ce2eSIlya Dryomov switch (img_req->state) { 36580192ce2eSIlya Dryomov case RBD_IMG_START: 36590192ce2eSIlya Dryomov rbd_assert(!*result); 36600192ce2eSIlya Dryomov 3661637cd060SIlya Dryomov ret = rbd_img_exclusive_lock(img_req); 3662637cd060SIlya Dryomov if (ret < 0) { 3663637cd060SIlya Dryomov *result = ret; 3664637cd060SIlya Dryomov return true; 3665637cd060SIlya Dryomov } 3666637cd060SIlya Dryomov img_req->state = RBD_IMG_EXCLUSIVE_LOCK; 3667637cd060SIlya Dryomov if (ret > 0) 3668637cd060SIlya Dryomov goto again; 3669637cd060SIlya Dryomov return false; 3670637cd060SIlya Dryomov case RBD_IMG_EXCLUSIVE_LOCK: 3671637cd060SIlya Dryomov if (*result) 3672637cd060SIlya Dryomov return true; 3673637cd060SIlya Dryomov 3674637cd060SIlya Dryomov rbd_assert(!need_exclusive_lock(img_req) || 3675637cd060SIlya Dryomov __rbd_is_lock_owner(rbd_dev)); 3676637cd060SIlya Dryomov 36770192ce2eSIlya Dryomov rbd_img_object_requests(img_req); 36780192ce2eSIlya Dryomov if (!img_req->pending.num_pending) { 36790192ce2eSIlya Dryomov *result = img_req->pending.result; 36800192ce2eSIlya Dryomov img_req->state = RBD_IMG_OBJECT_REQUESTS; 36817114edacSIlya Dryomov goto again; 36827114edacSIlya Dryomov } 36830192ce2eSIlya Dryomov img_req->state = __RBD_IMG_OBJECT_REQUESTS; 36840192ce2eSIlya Dryomov return false; 36850192ce2eSIlya Dryomov case __RBD_IMG_OBJECT_REQUESTS: 36860192ce2eSIlya Dryomov if (!pending_result_dec(&img_req->pending, result)) 36870192ce2eSIlya Dryomov return false; 36880192ce2eSIlya Dryomov /* fall through */ 36890192ce2eSIlya Dryomov case RBD_IMG_OBJECT_REQUESTS: 36900192ce2eSIlya Dryomov return true; 36910192ce2eSIlya Dryomov default: 36920192ce2eSIlya Dryomov BUG(); 36930192ce2eSIlya Dryomov } 36940192ce2eSIlya Dryomov } 36950192ce2eSIlya Dryomov 36960192ce2eSIlya Dryomov /* 36970192ce2eSIlya Dryomov * Return true if @img_req is completed. 36980192ce2eSIlya Dryomov */ 36990192ce2eSIlya Dryomov static bool __rbd_img_handle_request(struct rbd_img_request *img_req, 37000192ce2eSIlya Dryomov int *result) 37010192ce2eSIlya Dryomov { 37020192ce2eSIlya Dryomov struct rbd_device *rbd_dev = img_req->rbd_dev; 37030192ce2eSIlya Dryomov bool done; 37040192ce2eSIlya Dryomov 3705e1fddc8fSIlya Dryomov if (need_exclusive_lock(img_req)) { 3706e1fddc8fSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 3707e1fddc8fSIlya Dryomov mutex_lock(&img_req->state_mutex); 3708e1fddc8fSIlya Dryomov done = rbd_img_advance(img_req, result); 3709e1fddc8fSIlya Dryomov if (done) 3710e1fddc8fSIlya Dryomov rbd_lock_del_request(img_req); 3711e1fddc8fSIlya Dryomov mutex_unlock(&img_req->state_mutex); 3712e1fddc8fSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3713e1fddc8fSIlya Dryomov } else { 37140192ce2eSIlya Dryomov mutex_lock(&img_req->state_mutex); 37150192ce2eSIlya Dryomov done = rbd_img_advance(img_req, result); 37160192ce2eSIlya Dryomov mutex_unlock(&img_req->state_mutex); 3717e1fddc8fSIlya Dryomov } 37180192ce2eSIlya Dryomov 37190192ce2eSIlya Dryomov if (done && *result) { 37200192ce2eSIlya Dryomov rbd_assert(*result < 0); 37210192ce2eSIlya Dryomov rbd_warn(rbd_dev, "%s%s result %d", 37220192ce2eSIlya Dryomov test_bit(IMG_REQ_CHILD, &img_req->flags) ? "child " : "", 37230192ce2eSIlya Dryomov obj_op_name(img_req->op_type), *result); 37240192ce2eSIlya Dryomov } 37250192ce2eSIlya Dryomov return done; 37260192ce2eSIlya Dryomov } 37270192ce2eSIlya Dryomov 37280192ce2eSIlya Dryomov static void rbd_img_handle_request(struct rbd_img_request *img_req, int result) 37290192ce2eSIlya Dryomov { 37300192ce2eSIlya Dryomov again: 37310192ce2eSIlya Dryomov if (!__rbd_img_handle_request(img_req, &result)) 37320192ce2eSIlya Dryomov return; 37330192ce2eSIlya Dryomov 37340192ce2eSIlya Dryomov if (test_bit(IMG_REQ_CHILD, &img_req->flags)) { 37350192ce2eSIlya Dryomov struct rbd_obj_request *obj_req = img_req->obj_request; 37360192ce2eSIlya Dryomov 37370192ce2eSIlya Dryomov rbd_img_request_put(img_req); 37380192ce2eSIlya Dryomov if (__rbd_obj_handle_request(obj_req, &result)) { 37390192ce2eSIlya Dryomov img_req = obj_req->img_request; 37400192ce2eSIlya Dryomov goto again; 37410192ce2eSIlya Dryomov } 37420192ce2eSIlya Dryomov } else { 37430192ce2eSIlya Dryomov struct request *rq = img_req->rq; 37440192ce2eSIlya Dryomov 37450192ce2eSIlya Dryomov rbd_img_request_put(img_req); 37460192ce2eSIlya Dryomov blk_mq_end_request(rq, errno_to_blk_status(result)); 37470192ce2eSIlya Dryomov } 37489969ebc5SAlex Elder } 37499969ebc5SAlex Elder 3750ed95b21aSIlya Dryomov static const struct rbd_client_id rbd_empty_cid; 3751ed95b21aSIlya Dryomov 3752ed95b21aSIlya Dryomov static bool rbd_cid_equal(const struct rbd_client_id *lhs, 3753ed95b21aSIlya Dryomov const struct rbd_client_id *rhs) 3754ed95b21aSIlya Dryomov { 3755ed95b21aSIlya Dryomov return lhs->gid == rhs->gid && lhs->handle == rhs->handle; 3756ed95b21aSIlya Dryomov } 3757ed95b21aSIlya Dryomov 3758ed95b21aSIlya Dryomov static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev) 3759ed95b21aSIlya Dryomov { 3760ed95b21aSIlya Dryomov struct rbd_client_id cid; 3761ed95b21aSIlya Dryomov 3762ed95b21aSIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 3763ed95b21aSIlya Dryomov cid.gid = ceph_client_gid(rbd_dev->rbd_client->client); 3764ed95b21aSIlya Dryomov cid.handle = rbd_dev->watch_cookie; 3765ed95b21aSIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 3766ed95b21aSIlya Dryomov return cid; 3767ed95b21aSIlya Dryomov } 3768ed95b21aSIlya Dryomov 3769ed95b21aSIlya Dryomov /* 3770ed95b21aSIlya Dryomov * lock_rwsem must be held for write 3771ed95b21aSIlya Dryomov */ 3772ed95b21aSIlya Dryomov static void rbd_set_owner_cid(struct rbd_device *rbd_dev, 3773ed95b21aSIlya Dryomov const struct rbd_client_id *cid) 3774ed95b21aSIlya Dryomov { 3775ed95b21aSIlya Dryomov dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev, 3776ed95b21aSIlya Dryomov rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle, 3777ed95b21aSIlya Dryomov cid->gid, cid->handle); 3778ed95b21aSIlya Dryomov rbd_dev->owner_cid = *cid; /* struct */ 3779ed95b21aSIlya Dryomov } 3780ed95b21aSIlya Dryomov 3781ed95b21aSIlya Dryomov static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf) 3782ed95b21aSIlya Dryomov { 3783ed95b21aSIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 3784ed95b21aSIlya Dryomov sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie); 3785ed95b21aSIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 3786ed95b21aSIlya Dryomov } 3787ed95b21aSIlya Dryomov 3788edd8ca80SFlorian Margaine static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie) 3789edd8ca80SFlorian Margaine { 3790edd8ca80SFlorian Margaine struct rbd_client_id cid = rbd_get_cid(rbd_dev); 3791edd8ca80SFlorian Margaine 3792a2b1da09SIlya Dryomov rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED; 3793edd8ca80SFlorian Margaine strcpy(rbd_dev->lock_cookie, cookie); 3794edd8ca80SFlorian Margaine rbd_set_owner_cid(rbd_dev, &cid); 3795edd8ca80SFlorian Margaine queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work); 3796edd8ca80SFlorian Margaine } 3797edd8ca80SFlorian Margaine 3798ed95b21aSIlya Dryomov /* 3799ed95b21aSIlya Dryomov * lock_rwsem must be held for write 3800ed95b21aSIlya Dryomov */ 3801ed95b21aSIlya Dryomov static int rbd_lock(struct rbd_device *rbd_dev) 3802ed95b21aSIlya Dryomov { 3803ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3804ed95b21aSIlya Dryomov char cookie[32]; 3805ed95b21aSIlya Dryomov int ret; 3806ed95b21aSIlya Dryomov 3807cbbfb0ffSIlya Dryomov WARN_ON(__rbd_is_lock_owner(rbd_dev) || 3808cbbfb0ffSIlya Dryomov rbd_dev->lock_cookie[0] != '\0'); 3809ed95b21aSIlya Dryomov 3810ed95b21aSIlya Dryomov format_lock_cookie(rbd_dev, cookie); 3811ed95b21aSIlya Dryomov ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, 3812ed95b21aSIlya Dryomov RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie, 3813ed95b21aSIlya Dryomov RBD_LOCK_TAG, "", 0); 3814ed95b21aSIlya Dryomov if (ret) 3815ed95b21aSIlya Dryomov return ret; 3816ed95b21aSIlya Dryomov 3817edd8ca80SFlorian Margaine __rbd_lock(rbd_dev, cookie); 3818ed95b21aSIlya Dryomov return 0; 3819ed95b21aSIlya Dryomov } 3820ed95b21aSIlya Dryomov 3821ed95b21aSIlya Dryomov /* 3822ed95b21aSIlya Dryomov * lock_rwsem must be held for write 3823ed95b21aSIlya Dryomov */ 3824bbead745SIlya Dryomov static void rbd_unlock(struct rbd_device *rbd_dev) 3825ed95b21aSIlya Dryomov { 3826ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3827ed95b21aSIlya Dryomov int ret; 3828ed95b21aSIlya Dryomov 3829cbbfb0ffSIlya Dryomov WARN_ON(!__rbd_is_lock_owner(rbd_dev) || 3830cbbfb0ffSIlya Dryomov rbd_dev->lock_cookie[0] == '\0'); 3831ed95b21aSIlya Dryomov 3832ed95b21aSIlya Dryomov ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, 3833cbbfb0ffSIlya Dryomov RBD_LOCK_NAME, rbd_dev->lock_cookie); 3834bbead745SIlya Dryomov if (ret && ret != -ENOENT) 3835637cd060SIlya Dryomov rbd_warn(rbd_dev, "failed to unlock header: %d", ret); 3836ed95b21aSIlya Dryomov 3837bbead745SIlya Dryomov /* treat errors as the image is unlocked */ 3838bbead745SIlya Dryomov rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED; 3839cbbfb0ffSIlya Dryomov rbd_dev->lock_cookie[0] = '\0'; 3840ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &rbd_empty_cid); 3841ed95b21aSIlya Dryomov queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work); 3842ed95b21aSIlya Dryomov } 3843ed95b21aSIlya Dryomov 3844ed95b21aSIlya Dryomov static int __rbd_notify_op_lock(struct rbd_device *rbd_dev, 3845ed95b21aSIlya Dryomov enum rbd_notify_op notify_op, 3846ed95b21aSIlya Dryomov struct page ***preply_pages, 3847ed95b21aSIlya Dryomov size_t *preply_len) 3848ed95b21aSIlya Dryomov { 3849ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3850ed95b21aSIlya Dryomov struct rbd_client_id cid = rbd_get_cid(rbd_dev); 385108a79102SKyle Spiers char buf[4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN]; 385208a79102SKyle Spiers int buf_size = sizeof(buf); 3853ed95b21aSIlya Dryomov void *p = buf; 3854ed95b21aSIlya Dryomov 3855ed95b21aSIlya Dryomov dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op); 3856ed95b21aSIlya Dryomov 3857ed95b21aSIlya Dryomov /* encode *LockPayload NotifyMessage (op + ClientId) */ 3858ed95b21aSIlya Dryomov ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN); 3859ed95b21aSIlya Dryomov ceph_encode_32(&p, notify_op); 3860ed95b21aSIlya Dryomov ceph_encode_64(&p, cid.gid); 3861ed95b21aSIlya Dryomov ceph_encode_64(&p, cid.handle); 3862ed95b21aSIlya Dryomov 3863ed95b21aSIlya Dryomov return ceph_osdc_notify(osdc, &rbd_dev->header_oid, 3864ed95b21aSIlya Dryomov &rbd_dev->header_oloc, buf, buf_size, 3865ed95b21aSIlya Dryomov RBD_NOTIFY_TIMEOUT, preply_pages, preply_len); 3866ed95b21aSIlya Dryomov } 3867ed95b21aSIlya Dryomov 3868ed95b21aSIlya Dryomov static void rbd_notify_op_lock(struct rbd_device *rbd_dev, 3869ed95b21aSIlya Dryomov enum rbd_notify_op notify_op) 3870ed95b21aSIlya Dryomov { 3871ed95b21aSIlya Dryomov struct page **reply_pages; 3872ed95b21aSIlya Dryomov size_t reply_len; 3873ed95b21aSIlya Dryomov 3874ed95b21aSIlya Dryomov __rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len); 3875ed95b21aSIlya Dryomov ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len)); 3876ed95b21aSIlya Dryomov } 3877ed95b21aSIlya Dryomov 3878ed95b21aSIlya Dryomov static void rbd_notify_acquired_lock(struct work_struct *work) 3879ed95b21aSIlya Dryomov { 3880ed95b21aSIlya Dryomov struct rbd_device *rbd_dev = container_of(work, struct rbd_device, 3881ed95b21aSIlya Dryomov acquired_lock_work); 3882ed95b21aSIlya Dryomov 3883ed95b21aSIlya Dryomov rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK); 3884ed95b21aSIlya Dryomov } 3885ed95b21aSIlya Dryomov 3886ed95b21aSIlya Dryomov static void rbd_notify_released_lock(struct work_struct *work) 3887ed95b21aSIlya Dryomov { 3888ed95b21aSIlya Dryomov struct rbd_device *rbd_dev = container_of(work, struct rbd_device, 3889ed95b21aSIlya Dryomov released_lock_work); 3890ed95b21aSIlya Dryomov 3891ed95b21aSIlya Dryomov rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK); 3892ed95b21aSIlya Dryomov } 3893ed95b21aSIlya Dryomov 3894ed95b21aSIlya Dryomov static int rbd_request_lock(struct rbd_device *rbd_dev) 3895ed95b21aSIlya Dryomov { 3896ed95b21aSIlya Dryomov struct page **reply_pages; 3897ed95b21aSIlya Dryomov size_t reply_len; 3898ed95b21aSIlya Dryomov bool lock_owner_responded = false; 3899ed95b21aSIlya Dryomov int ret; 3900ed95b21aSIlya Dryomov 3901ed95b21aSIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 3902ed95b21aSIlya Dryomov 3903ed95b21aSIlya Dryomov ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK, 3904ed95b21aSIlya Dryomov &reply_pages, &reply_len); 3905ed95b21aSIlya Dryomov if (ret && ret != -ETIMEDOUT) { 3906ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "failed to request lock: %d", ret); 3907ed95b21aSIlya Dryomov goto out; 3908ed95b21aSIlya Dryomov } 3909ed95b21aSIlya Dryomov 3910ed95b21aSIlya Dryomov if (reply_len > 0 && reply_len <= PAGE_SIZE) { 3911ed95b21aSIlya Dryomov void *p = page_address(reply_pages[0]); 3912ed95b21aSIlya Dryomov void *const end = p + reply_len; 3913ed95b21aSIlya Dryomov u32 n; 3914ed95b21aSIlya Dryomov 3915ed95b21aSIlya Dryomov ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */ 3916ed95b21aSIlya Dryomov while (n--) { 3917ed95b21aSIlya Dryomov u8 struct_v; 3918ed95b21aSIlya Dryomov u32 len; 3919ed95b21aSIlya Dryomov 3920ed95b21aSIlya Dryomov ceph_decode_need(&p, end, 8 + 8, e_inval); 3921ed95b21aSIlya Dryomov p += 8 + 8; /* skip gid and cookie */ 3922ed95b21aSIlya Dryomov 3923ed95b21aSIlya Dryomov ceph_decode_32_safe(&p, end, len, e_inval); 3924ed95b21aSIlya Dryomov if (!len) 3925ed95b21aSIlya Dryomov continue; 3926ed95b21aSIlya Dryomov 3927ed95b21aSIlya Dryomov if (lock_owner_responded) { 3928ed95b21aSIlya Dryomov rbd_warn(rbd_dev, 3929ed95b21aSIlya Dryomov "duplicate lock owners detected"); 3930ed95b21aSIlya Dryomov ret = -EIO; 3931ed95b21aSIlya Dryomov goto out; 3932ed95b21aSIlya Dryomov } 3933ed95b21aSIlya Dryomov 3934ed95b21aSIlya Dryomov lock_owner_responded = true; 3935ed95b21aSIlya Dryomov ret = ceph_start_decoding(&p, end, 1, "ResponseMessage", 3936ed95b21aSIlya Dryomov &struct_v, &len); 3937ed95b21aSIlya Dryomov if (ret) { 3938ed95b21aSIlya Dryomov rbd_warn(rbd_dev, 3939ed95b21aSIlya Dryomov "failed to decode ResponseMessage: %d", 3940ed95b21aSIlya Dryomov ret); 3941ed95b21aSIlya Dryomov goto e_inval; 3942ed95b21aSIlya Dryomov } 3943ed95b21aSIlya Dryomov 3944ed95b21aSIlya Dryomov ret = ceph_decode_32(&p); 3945ed95b21aSIlya Dryomov } 3946ed95b21aSIlya Dryomov } 3947ed95b21aSIlya Dryomov 3948ed95b21aSIlya Dryomov if (!lock_owner_responded) { 3949ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "no lock owners detected"); 3950ed95b21aSIlya Dryomov ret = -ETIMEDOUT; 3951ed95b21aSIlya Dryomov } 3952ed95b21aSIlya Dryomov 3953ed95b21aSIlya Dryomov out: 3954ed95b21aSIlya Dryomov ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len)); 3955ed95b21aSIlya Dryomov return ret; 3956ed95b21aSIlya Dryomov 3957ed95b21aSIlya Dryomov e_inval: 3958ed95b21aSIlya Dryomov ret = -EINVAL; 3959ed95b21aSIlya Dryomov goto out; 3960ed95b21aSIlya Dryomov } 3961ed95b21aSIlya Dryomov 3962637cd060SIlya Dryomov /* 3963637cd060SIlya Dryomov * Either image request state machine(s) or rbd_add_acquire_lock() 3964637cd060SIlya Dryomov * (i.e. "rbd map"). 3965637cd060SIlya Dryomov */ 3966637cd060SIlya Dryomov static void wake_lock_waiters(struct rbd_device *rbd_dev, int result) 3967ed95b21aSIlya Dryomov { 3968637cd060SIlya Dryomov struct rbd_img_request *img_req; 3969637cd060SIlya Dryomov 3970637cd060SIlya Dryomov dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result); 3971d9b9c893SLinus Torvalds lockdep_assert_held_write(&rbd_dev->lock_rwsem); 3972ed95b21aSIlya Dryomov 3973ed95b21aSIlya Dryomov cancel_delayed_work(&rbd_dev->lock_dwork); 3974637cd060SIlya Dryomov if (!completion_done(&rbd_dev->acquire_wait)) { 3975637cd060SIlya Dryomov rbd_assert(list_empty(&rbd_dev->acquiring_list) && 3976637cd060SIlya Dryomov list_empty(&rbd_dev->running_list)); 3977637cd060SIlya Dryomov rbd_dev->acquire_err = result; 3978637cd060SIlya Dryomov complete_all(&rbd_dev->acquire_wait); 3979637cd060SIlya Dryomov return; 3980637cd060SIlya Dryomov } 3981637cd060SIlya Dryomov 3982637cd060SIlya Dryomov list_for_each_entry(img_req, &rbd_dev->acquiring_list, lock_item) { 3983637cd060SIlya Dryomov mutex_lock(&img_req->state_mutex); 3984637cd060SIlya Dryomov rbd_assert(img_req->state == RBD_IMG_EXCLUSIVE_LOCK); 3985637cd060SIlya Dryomov rbd_img_schedule(img_req, result); 3986637cd060SIlya Dryomov mutex_unlock(&img_req->state_mutex); 3987637cd060SIlya Dryomov } 3988637cd060SIlya Dryomov 3989637cd060SIlya Dryomov list_splice_tail_init(&rbd_dev->acquiring_list, &rbd_dev->running_list); 3990ed95b21aSIlya Dryomov } 3991ed95b21aSIlya Dryomov 3992ed95b21aSIlya Dryomov static int get_lock_owner_info(struct rbd_device *rbd_dev, 3993ed95b21aSIlya Dryomov struct ceph_locker **lockers, u32 *num_lockers) 3994ed95b21aSIlya Dryomov { 3995ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3996ed95b21aSIlya Dryomov u8 lock_type; 3997ed95b21aSIlya Dryomov char *lock_tag; 3998ed95b21aSIlya Dryomov int ret; 3999ed95b21aSIlya Dryomov 4000ed95b21aSIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 4001ed95b21aSIlya Dryomov 4002ed95b21aSIlya Dryomov ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid, 4003ed95b21aSIlya Dryomov &rbd_dev->header_oloc, RBD_LOCK_NAME, 4004ed95b21aSIlya Dryomov &lock_type, &lock_tag, lockers, num_lockers); 4005ed95b21aSIlya Dryomov if (ret) 4006ed95b21aSIlya Dryomov return ret; 4007ed95b21aSIlya Dryomov 4008ed95b21aSIlya Dryomov if (*num_lockers == 0) { 4009ed95b21aSIlya Dryomov dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev); 4010ed95b21aSIlya Dryomov goto out; 4011ed95b21aSIlya Dryomov } 4012ed95b21aSIlya Dryomov 4013ed95b21aSIlya Dryomov if (strcmp(lock_tag, RBD_LOCK_TAG)) { 4014ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "locked by external mechanism, tag %s", 4015ed95b21aSIlya Dryomov lock_tag); 4016ed95b21aSIlya Dryomov ret = -EBUSY; 4017ed95b21aSIlya Dryomov goto out; 4018ed95b21aSIlya Dryomov } 4019ed95b21aSIlya Dryomov 4020ed95b21aSIlya Dryomov if (lock_type == CEPH_CLS_LOCK_SHARED) { 4021ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "shared lock type detected"); 4022ed95b21aSIlya Dryomov ret = -EBUSY; 4023ed95b21aSIlya Dryomov goto out; 4024ed95b21aSIlya Dryomov } 4025ed95b21aSIlya Dryomov 4026ed95b21aSIlya Dryomov if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX, 4027ed95b21aSIlya Dryomov strlen(RBD_LOCK_COOKIE_PREFIX))) { 4028ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "locked by external mechanism, cookie %s", 4029ed95b21aSIlya Dryomov (*lockers)[0].id.cookie); 4030ed95b21aSIlya Dryomov ret = -EBUSY; 4031ed95b21aSIlya Dryomov goto out; 4032ed95b21aSIlya Dryomov } 4033ed95b21aSIlya Dryomov 4034ed95b21aSIlya Dryomov out: 4035ed95b21aSIlya Dryomov kfree(lock_tag); 4036ed95b21aSIlya Dryomov return ret; 4037ed95b21aSIlya Dryomov } 4038ed95b21aSIlya Dryomov 4039ed95b21aSIlya Dryomov static int find_watcher(struct rbd_device *rbd_dev, 4040ed95b21aSIlya Dryomov const struct ceph_locker *locker) 4041ed95b21aSIlya Dryomov { 4042ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 4043ed95b21aSIlya Dryomov struct ceph_watch_item *watchers; 4044ed95b21aSIlya Dryomov u32 num_watchers; 4045ed95b21aSIlya Dryomov u64 cookie; 4046ed95b21aSIlya Dryomov int i; 4047ed95b21aSIlya Dryomov int ret; 4048ed95b21aSIlya Dryomov 4049ed95b21aSIlya Dryomov ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid, 4050ed95b21aSIlya Dryomov &rbd_dev->header_oloc, &watchers, 4051ed95b21aSIlya Dryomov &num_watchers); 4052ed95b21aSIlya Dryomov if (ret) 4053ed95b21aSIlya Dryomov return ret; 4054ed95b21aSIlya Dryomov 4055ed95b21aSIlya Dryomov sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie); 4056ed95b21aSIlya Dryomov for (i = 0; i < num_watchers; i++) { 4057ed95b21aSIlya Dryomov if (!memcmp(&watchers[i].addr, &locker->info.addr, 4058ed95b21aSIlya Dryomov sizeof(locker->info.addr)) && 4059ed95b21aSIlya Dryomov watchers[i].cookie == cookie) { 4060ed95b21aSIlya Dryomov struct rbd_client_id cid = { 4061ed95b21aSIlya Dryomov .gid = le64_to_cpu(watchers[i].name.num), 4062ed95b21aSIlya Dryomov .handle = cookie, 4063ed95b21aSIlya Dryomov }; 4064ed95b21aSIlya Dryomov 4065ed95b21aSIlya Dryomov dout("%s rbd_dev %p found cid %llu-%llu\n", __func__, 4066ed95b21aSIlya Dryomov rbd_dev, cid.gid, cid.handle); 4067ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &cid); 4068ed95b21aSIlya Dryomov ret = 1; 4069ed95b21aSIlya Dryomov goto out; 4070ed95b21aSIlya Dryomov } 4071ed95b21aSIlya Dryomov } 4072ed95b21aSIlya Dryomov 4073ed95b21aSIlya Dryomov dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev); 4074ed95b21aSIlya Dryomov ret = 0; 4075ed95b21aSIlya Dryomov out: 4076ed95b21aSIlya Dryomov kfree(watchers); 4077ed95b21aSIlya Dryomov return ret; 4078ed95b21aSIlya Dryomov } 4079ed95b21aSIlya Dryomov 4080ed95b21aSIlya Dryomov /* 4081ed95b21aSIlya Dryomov * lock_rwsem must be held for write 4082ed95b21aSIlya Dryomov */ 4083ed95b21aSIlya Dryomov static int rbd_try_lock(struct rbd_device *rbd_dev) 4084ed95b21aSIlya Dryomov { 4085ed95b21aSIlya Dryomov struct ceph_client *client = rbd_dev->rbd_client->client; 4086ed95b21aSIlya Dryomov struct ceph_locker *lockers; 4087ed95b21aSIlya Dryomov u32 num_lockers; 4088ed95b21aSIlya Dryomov int ret; 4089ed95b21aSIlya Dryomov 4090ed95b21aSIlya Dryomov for (;;) { 4091ed95b21aSIlya Dryomov ret = rbd_lock(rbd_dev); 4092ed95b21aSIlya Dryomov if (ret != -EBUSY) 4093ed95b21aSIlya Dryomov return ret; 4094ed95b21aSIlya Dryomov 4095ed95b21aSIlya Dryomov /* determine if the current lock holder is still alive */ 4096ed95b21aSIlya Dryomov ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers); 4097ed95b21aSIlya Dryomov if (ret) 4098ed95b21aSIlya Dryomov return ret; 4099ed95b21aSIlya Dryomov 4100ed95b21aSIlya Dryomov if (num_lockers == 0) 4101ed95b21aSIlya Dryomov goto again; 4102ed95b21aSIlya Dryomov 4103ed95b21aSIlya Dryomov ret = find_watcher(rbd_dev, lockers); 4104637cd060SIlya Dryomov if (ret) 4105637cd060SIlya Dryomov goto out; /* request lock or error */ 4106ed95b21aSIlya Dryomov 410722e8bd51SIlya Dryomov rbd_warn(rbd_dev, "breaking header lock owned by %s%llu", 4108ed95b21aSIlya Dryomov ENTITY_NAME(lockers[0].id.name)); 4109ed95b21aSIlya Dryomov 4110ed95b21aSIlya Dryomov ret = ceph_monc_blacklist_add(&client->monc, 4111ed95b21aSIlya Dryomov &lockers[0].info.addr); 4112ed95b21aSIlya Dryomov if (ret) { 4113ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d", 4114ed95b21aSIlya Dryomov ENTITY_NAME(lockers[0].id.name), ret); 4115ed95b21aSIlya Dryomov goto out; 4116ed95b21aSIlya Dryomov } 4117ed95b21aSIlya Dryomov 4118ed95b21aSIlya Dryomov ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid, 4119ed95b21aSIlya Dryomov &rbd_dev->header_oloc, RBD_LOCK_NAME, 4120ed95b21aSIlya Dryomov lockers[0].id.cookie, 4121ed95b21aSIlya Dryomov &lockers[0].id.name); 4122ed95b21aSIlya Dryomov if (ret && ret != -ENOENT) 4123ed95b21aSIlya Dryomov goto out; 4124ed95b21aSIlya Dryomov 4125ed95b21aSIlya Dryomov again: 4126ed95b21aSIlya Dryomov ceph_free_lockers(lockers, num_lockers); 4127ed95b21aSIlya Dryomov } 4128ed95b21aSIlya Dryomov 4129ed95b21aSIlya Dryomov out: 4130ed95b21aSIlya Dryomov ceph_free_lockers(lockers, num_lockers); 4131ed95b21aSIlya Dryomov return ret; 4132ed95b21aSIlya Dryomov } 4133ed95b21aSIlya Dryomov 413422e8bd51SIlya Dryomov static int rbd_post_acquire_action(struct rbd_device *rbd_dev) 4135ed95b21aSIlya Dryomov { 413622e8bd51SIlya Dryomov int ret; 413722e8bd51SIlya Dryomov 413822e8bd51SIlya Dryomov if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) { 413922e8bd51SIlya Dryomov ret = rbd_object_map_open(rbd_dev); 414022e8bd51SIlya Dryomov if (ret) 414122e8bd51SIlya Dryomov return ret; 414222e8bd51SIlya Dryomov } 414322e8bd51SIlya Dryomov 414422e8bd51SIlya Dryomov return 0; 414522e8bd51SIlya Dryomov } 414622e8bd51SIlya Dryomov 4147ed95b21aSIlya Dryomov /* 4148637cd060SIlya Dryomov * Return: 4149637cd060SIlya Dryomov * 0 - lock acquired 4150637cd060SIlya Dryomov * 1 - caller should call rbd_request_lock() 4151637cd060SIlya Dryomov * <0 - error 4152ed95b21aSIlya Dryomov */ 4153637cd060SIlya Dryomov static int rbd_try_acquire_lock(struct rbd_device *rbd_dev) 4154ed95b21aSIlya Dryomov { 4155637cd060SIlya Dryomov int ret; 4156ed95b21aSIlya Dryomov 4157ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 4158ed95b21aSIlya Dryomov dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev, 4159ed95b21aSIlya Dryomov rbd_dev->lock_state); 4160ed95b21aSIlya Dryomov if (__rbd_is_lock_owner(rbd_dev)) { 4161ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 4162637cd060SIlya Dryomov return 0; 4163ed95b21aSIlya Dryomov } 4164ed95b21aSIlya Dryomov 4165ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 4166ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 4167ed95b21aSIlya Dryomov dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev, 4168ed95b21aSIlya Dryomov rbd_dev->lock_state); 4169637cd060SIlya Dryomov if (__rbd_is_lock_owner(rbd_dev)) { 4170637cd060SIlya Dryomov up_write(&rbd_dev->lock_rwsem); 4171637cd060SIlya Dryomov return 0; 4172ed95b21aSIlya Dryomov } 4173ed95b21aSIlya Dryomov 4174637cd060SIlya Dryomov ret = rbd_try_lock(rbd_dev); 4175637cd060SIlya Dryomov if (ret < 0) { 4176637cd060SIlya Dryomov rbd_warn(rbd_dev, "failed to lock header: %d", ret); 4177637cd060SIlya Dryomov if (ret == -EBLACKLISTED) 4178637cd060SIlya Dryomov goto out; 4179637cd060SIlya Dryomov 4180637cd060SIlya Dryomov ret = 1; /* request lock anyway */ 4181637cd060SIlya Dryomov } 4182637cd060SIlya Dryomov if (ret > 0) { 4183ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 4184637cd060SIlya Dryomov return ret; 4185637cd060SIlya Dryomov } 4186637cd060SIlya Dryomov 4187637cd060SIlya Dryomov rbd_assert(rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED); 4188637cd060SIlya Dryomov rbd_assert(list_empty(&rbd_dev->running_list)); 4189637cd060SIlya Dryomov 419022e8bd51SIlya Dryomov ret = rbd_post_acquire_action(rbd_dev); 419122e8bd51SIlya Dryomov if (ret) { 419222e8bd51SIlya Dryomov rbd_warn(rbd_dev, "post-acquire action failed: %d", ret); 419322e8bd51SIlya Dryomov /* 419422e8bd51SIlya Dryomov * Can't stay in RBD_LOCK_STATE_LOCKED because 419522e8bd51SIlya Dryomov * rbd_lock_add_request() would let the request through, 419622e8bd51SIlya Dryomov * assuming that e.g. object map is locked and loaded. 419722e8bd51SIlya Dryomov */ 419822e8bd51SIlya Dryomov rbd_unlock(rbd_dev); 419922e8bd51SIlya Dryomov } 420022e8bd51SIlya Dryomov 4201637cd060SIlya Dryomov out: 4202637cd060SIlya Dryomov wake_lock_waiters(rbd_dev, ret); 4203637cd060SIlya Dryomov up_write(&rbd_dev->lock_rwsem); 4204637cd060SIlya Dryomov return ret; 4205ed95b21aSIlya Dryomov } 4206ed95b21aSIlya Dryomov 4207ed95b21aSIlya Dryomov static void rbd_acquire_lock(struct work_struct *work) 4208ed95b21aSIlya Dryomov { 4209ed95b21aSIlya Dryomov struct rbd_device *rbd_dev = container_of(to_delayed_work(work), 4210ed95b21aSIlya Dryomov struct rbd_device, lock_dwork); 4211637cd060SIlya Dryomov int ret; 4212ed95b21aSIlya Dryomov 4213ed95b21aSIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 4214ed95b21aSIlya Dryomov again: 4215637cd060SIlya Dryomov ret = rbd_try_acquire_lock(rbd_dev); 4216637cd060SIlya Dryomov if (ret <= 0) { 4217637cd060SIlya Dryomov dout("%s rbd_dev %p ret %d - done\n", __func__, rbd_dev, ret); 4218ed95b21aSIlya Dryomov return; 4219ed95b21aSIlya Dryomov } 4220ed95b21aSIlya Dryomov 4221ed95b21aSIlya Dryomov ret = rbd_request_lock(rbd_dev); 4222ed95b21aSIlya Dryomov if (ret == -ETIMEDOUT) { 4223ed95b21aSIlya Dryomov goto again; /* treat this as a dead client */ 4224e010dd0aSIlya Dryomov } else if (ret == -EROFS) { 4225e010dd0aSIlya Dryomov rbd_warn(rbd_dev, "peer will not release lock"); 4226637cd060SIlya Dryomov down_write(&rbd_dev->lock_rwsem); 4227637cd060SIlya Dryomov wake_lock_waiters(rbd_dev, ret); 4228637cd060SIlya Dryomov up_write(&rbd_dev->lock_rwsem); 4229ed95b21aSIlya Dryomov } else if (ret < 0) { 4230ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "error requesting lock: %d", ret); 4231ed95b21aSIlya Dryomov mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 4232ed95b21aSIlya Dryomov RBD_RETRY_DELAY); 4233ed95b21aSIlya Dryomov } else { 4234ed95b21aSIlya Dryomov /* 4235ed95b21aSIlya Dryomov * lock owner acked, but resend if we don't see them 4236ed95b21aSIlya Dryomov * release the lock 4237ed95b21aSIlya Dryomov */ 42386b0a8774SColin Ian King dout("%s rbd_dev %p requeuing lock_dwork\n", __func__, 4239ed95b21aSIlya Dryomov rbd_dev); 4240ed95b21aSIlya Dryomov mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 4241ed95b21aSIlya Dryomov msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC)); 4242ed95b21aSIlya Dryomov } 4243ed95b21aSIlya Dryomov } 4244ed95b21aSIlya Dryomov 4245a2b1da09SIlya Dryomov static bool rbd_quiesce_lock(struct rbd_device *rbd_dev) 4246ed95b21aSIlya Dryomov { 4247e1fddc8fSIlya Dryomov bool need_wait; 4248e1fddc8fSIlya Dryomov 4249a2b1da09SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 4250d9b9c893SLinus Torvalds lockdep_assert_held_write(&rbd_dev->lock_rwsem); 4251a2b1da09SIlya Dryomov 4252ed95b21aSIlya Dryomov if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED) 4253ed95b21aSIlya Dryomov return false; 4254ed95b21aSIlya Dryomov 4255ed95b21aSIlya Dryomov /* 4256ed95b21aSIlya Dryomov * Ensure that all in-flight IO is flushed. 4257ed95b21aSIlya Dryomov */ 4258e1fddc8fSIlya Dryomov rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING; 4259e1fddc8fSIlya Dryomov rbd_assert(!completion_done(&rbd_dev->releasing_wait)); 4260e1fddc8fSIlya Dryomov need_wait = !list_empty(&rbd_dev->running_list); 4261e1fddc8fSIlya Dryomov downgrade_write(&rbd_dev->lock_rwsem); 4262e1fddc8fSIlya Dryomov if (need_wait) 4263e1fddc8fSIlya Dryomov wait_for_completion(&rbd_dev->releasing_wait); 4264ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 4265ed95b21aSIlya Dryomov 4266ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 4267ed95b21aSIlya Dryomov if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING) 4268ed95b21aSIlya Dryomov return false; 4269ed95b21aSIlya Dryomov 4270e1fddc8fSIlya Dryomov rbd_assert(list_empty(&rbd_dev->running_list)); 4271a2b1da09SIlya Dryomov return true; 4272a2b1da09SIlya Dryomov } 4273a2b1da09SIlya Dryomov 427422e8bd51SIlya Dryomov static void rbd_pre_release_action(struct rbd_device *rbd_dev) 427522e8bd51SIlya Dryomov { 427622e8bd51SIlya Dryomov if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) 427722e8bd51SIlya Dryomov rbd_object_map_close(rbd_dev); 427822e8bd51SIlya Dryomov } 427922e8bd51SIlya Dryomov 4280e1fddc8fSIlya Dryomov static void __rbd_release_lock(struct rbd_device *rbd_dev) 4281e1fddc8fSIlya Dryomov { 4282e1fddc8fSIlya Dryomov rbd_assert(list_empty(&rbd_dev->running_list)); 4283e1fddc8fSIlya Dryomov 428422e8bd51SIlya Dryomov rbd_pre_release_action(rbd_dev); 4285bbead745SIlya Dryomov rbd_unlock(rbd_dev); 4286e1fddc8fSIlya Dryomov } 4287e1fddc8fSIlya Dryomov 4288a2b1da09SIlya Dryomov /* 4289a2b1da09SIlya Dryomov * lock_rwsem must be held for write 4290a2b1da09SIlya Dryomov */ 4291a2b1da09SIlya Dryomov static void rbd_release_lock(struct rbd_device *rbd_dev) 4292a2b1da09SIlya Dryomov { 4293a2b1da09SIlya Dryomov if (!rbd_quiesce_lock(rbd_dev)) 4294a2b1da09SIlya Dryomov return; 4295a2b1da09SIlya Dryomov 4296e1fddc8fSIlya Dryomov __rbd_release_lock(rbd_dev); 4297a2b1da09SIlya Dryomov 4298ed95b21aSIlya Dryomov /* 4299ed95b21aSIlya Dryomov * Give others a chance to grab the lock - we would re-acquire 4300637cd060SIlya Dryomov * almost immediately if we got new IO while draining the running 4301637cd060SIlya Dryomov * list otherwise. We need to ack our own notifications, so this 4302637cd060SIlya Dryomov * lock_dwork will be requeued from rbd_handle_released_lock() by 4303637cd060SIlya Dryomov * way of maybe_kick_acquire(). 4304ed95b21aSIlya Dryomov */ 4305ed95b21aSIlya Dryomov cancel_delayed_work(&rbd_dev->lock_dwork); 4306ed95b21aSIlya Dryomov } 4307ed95b21aSIlya Dryomov 4308ed95b21aSIlya Dryomov static void rbd_release_lock_work(struct work_struct *work) 4309ed95b21aSIlya Dryomov { 4310ed95b21aSIlya Dryomov struct rbd_device *rbd_dev = container_of(work, struct rbd_device, 4311ed95b21aSIlya Dryomov unlock_work); 4312ed95b21aSIlya Dryomov 4313ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 4314ed95b21aSIlya Dryomov rbd_release_lock(rbd_dev); 4315ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 4316ed95b21aSIlya Dryomov } 4317ed95b21aSIlya Dryomov 4318637cd060SIlya Dryomov static void maybe_kick_acquire(struct rbd_device *rbd_dev) 4319637cd060SIlya Dryomov { 4320637cd060SIlya Dryomov bool have_requests; 4321637cd060SIlya Dryomov 4322637cd060SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 4323637cd060SIlya Dryomov if (__rbd_is_lock_owner(rbd_dev)) 4324637cd060SIlya Dryomov return; 4325637cd060SIlya Dryomov 4326637cd060SIlya Dryomov spin_lock(&rbd_dev->lock_lists_lock); 4327637cd060SIlya Dryomov have_requests = !list_empty(&rbd_dev->acquiring_list); 4328637cd060SIlya Dryomov spin_unlock(&rbd_dev->lock_lists_lock); 4329637cd060SIlya Dryomov if (have_requests || delayed_work_pending(&rbd_dev->lock_dwork)) { 4330637cd060SIlya Dryomov dout("%s rbd_dev %p kicking lock_dwork\n", __func__, rbd_dev); 4331637cd060SIlya Dryomov mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0); 4332637cd060SIlya Dryomov } 4333637cd060SIlya Dryomov } 4334637cd060SIlya Dryomov 4335ed95b21aSIlya Dryomov static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v, 4336ed95b21aSIlya Dryomov void **p) 4337ed95b21aSIlya Dryomov { 4338ed95b21aSIlya Dryomov struct rbd_client_id cid = { 0 }; 4339ed95b21aSIlya Dryomov 4340ed95b21aSIlya Dryomov if (struct_v >= 2) { 4341ed95b21aSIlya Dryomov cid.gid = ceph_decode_64(p); 4342ed95b21aSIlya Dryomov cid.handle = ceph_decode_64(p); 4343ed95b21aSIlya Dryomov } 4344ed95b21aSIlya Dryomov 4345ed95b21aSIlya Dryomov dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid, 4346ed95b21aSIlya Dryomov cid.handle); 4347ed95b21aSIlya Dryomov if (!rbd_cid_equal(&cid, &rbd_empty_cid)) { 4348ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 4349ed95b21aSIlya Dryomov if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) { 4350ed95b21aSIlya Dryomov /* 4351ed95b21aSIlya Dryomov * we already know that the remote client is 4352ed95b21aSIlya Dryomov * the owner 4353ed95b21aSIlya Dryomov */ 4354ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 4355ed95b21aSIlya Dryomov return; 4356ed95b21aSIlya Dryomov } 4357ed95b21aSIlya Dryomov 4358ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &cid); 4359ed95b21aSIlya Dryomov downgrade_write(&rbd_dev->lock_rwsem); 4360ed95b21aSIlya Dryomov } else { 4361ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 4362ed95b21aSIlya Dryomov } 4363ed95b21aSIlya Dryomov 4364637cd060SIlya Dryomov maybe_kick_acquire(rbd_dev); 4365ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 4366ed95b21aSIlya Dryomov } 4367ed95b21aSIlya Dryomov 4368ed95b21aSIlya Dryomov static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v, 4369ed95b21aSIlya Dryomov void **p) 4370ed95b21aSIlya Dryomov { 4371ed95b21aSIlya Dryomov struct rbd_client_id cid = { 0 }; 4372ed95b21aSIlya Dryomov 4373ed95b21aSIlya Dryomov if (struct_v >= 2) { 4374ed95b21aSIlya Dryomov cid.gid = ceph_decode_64(p); 4375ed95b21aSIlya Dryomov cid.handle = ceph_decode_64(p); 4376ed95b21aSIlya Dryomov } 4377ed95b21aSIlya Dryomov 4378ed95b21aSIlya Dryomov dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid, 4379ed95b21aSIlya Dryomov cid.handle); 4380ed95b21aSIlya Dryomov if (!rbd_cid_equal(&cid, &rbd_empty_cid)) { 4381ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 4382ed95b21aSIlya Dryomov if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) { 4383ed95b21aSIlya Dryomov dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n", 4384ed95b21aSIlya Dryomov __func__, rbd_dev, cid.gid, cid.handle, 4385ed95b21aSIlya Dryomov rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle); 4386ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 4387ed95b21aSIlya Dryomov return; 4388ed95b21aSIlya Dryomov } 4389ed95b21aSIlya Dryomov 4390ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &rbd_empty_cid); 4391ed95b21aSIlya Dryomov downgrade_write(&rbd_dev->lock_rwsem); 4392ed95b21aSIlya Dryomov } else { 4393ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 4394ed95b21aSIlya Dryomov } 4395ed95b21aSIlya Dryomov 4396637cd060SIlya Dryomov maybe_kick_acquire(rbd_dev); 4397ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 4398ed95b21aSIlya Dryomov } 4399ed95b21aSIlya Dryomov 44003b77faa0SIlya Dryomov /* 44013b77faa0SIlya Dryomov * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no 44023b77faa0SIlya Dryomov * ResponseMessage is needed. 44033b77faa0SIlya Dryomov */ 44043b77faa0SIlya Dryomov static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v, 4405ed95b21aSIlya Dryomov void **p) 4406ed95b21aSIlya Dryomov { 4407ed95b21aSIlya Dryomov struct rbd_client_id my_cid = rbd_get_cid(rbd_dev); 4408ed95b21aSIlya Dryomov struct rbd_client_id cid = { 0 }; 44093b77faa0SIlya Dryomov int result = 1; 4410ed95b21aSIlya Dryomov 4411ed95b21aSIlya Dryomov if (struct_v >= 2) { 4412ed95b21aSIlya Dryomov cid.gid = ceph_decode_64(p); 4413ed95b21aSIlya Dryomov cid.handle = ceph_decode_64(p); 4414ed95b21aSIlya Dryomov } 4415ed95b21aSIlya Dryomov 4416ed95b21aSIlya Dryomov dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid, 4417ed95b21aSIlya Dryomov cid.handle); 4418ed95b21aSIlya Dryomov if (rbd_cid_equal(&cid, &my_cid)) 44193b77faa0SIlya Dryomov return result; 4420ed95b21aSIlya Dryomov 4421ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 44223b77faa0SIlya Dryomov if (__rbd_is_lock_owner(rbd_dev)) { 44233b77faa0SIlya Dryomov if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED && 44243b77faa0SIlya Dryomov rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid)) 44253b77faa0SIlya Dryomov goto out_unlock; 44263b77faa0SIlya Dryomov 44273b77faa0SIlya Dryomov /* 44283b77faa0SIlya Dryomov * encode ResponseMessage(0) so the peer can detect 44293b77faa0SIlya Dryomov * a missing owner 44303b77faa0SIlya Dryomov */ 44313b77faa0SIlya Dryomov result = 0; 44323b77faa0SIlya Dryomov 4433ed95b21aSIlya Dryomov if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) { 4434e010dd0aSIlya Dryomov if (!rbd_dev->opts->exclusive) { 4435e010dd0aSIlya Dryomov dout("%s rbd_dev %p queueing unlock_work\n", 4436e010dd0aSIlya Dryomov __func__, rbd_dev); 4437e010dd0aSIlya Dryomov queue_work(rbd_dev->task_wq, 4438e010dd0aSIlya Dryomov &rbd_dev->unlock_work); 4439e010dd0aSIlya Dryomov } else { 4440e010dd0aSIlya Dryomov /* refuse to release the lock */ 4441e010dd0aSIlya Dryomov result = -EROFS; 4442ed95b21aSIlya Dryomov } 4443ed95b21aSIlya Dryomov } 4444ed95b21aSIlya Dryomov } 44453b77faa0SIlya Dryomov 44463b77faa0SIlya Dryomov out_unlock: 4447ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 44483b77faa0SIlya Dryomov return result; 4449ed95b21aSIlya Dryomov } 4450ed95b21aSIlya Dryomov 4451ed95b21aSIlya Dryomov static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev, 4452ed95b21aSIlya Dryomov u64 notify_id, u64 cookie, s32 *result) 4453ed95b21aSIlya Dryomov { 4454ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 445508a79102SKyle Spiers char buf[4 + CEPH_ENCODING_START_BLK_LEN]; 445608a79102SKyle Spiers int buf_size = sizeof(buf); 4457ed95b21aSIlya Dryomov int ret; 4458ed95b21aSIlya Dryomov 4459ed95b21aSIlya Dryomov if (result) { 4460ed95b21aSIlya Dryomov void *p = buf; 4461ed95b21aSIlya Dryomov 4462ed95b21aSIlya Dryomov /* encode ResponseMessage */ 4463ed95b21aSIlya Dryomov ceph_start_encoding(&p, 1, 1, 4464ed95b21aSIlya Dryomov buf_size - CEPH_ENCODING_START_BLK_LEN); 4465ed95b21aSIlya Dryomov ceph_encode_32(&p, *result); 4466ed95b21aSIlya Dryomov } else { 4467ed95b21aSIlya Dryomov buf_size = 0; 4468ed95b21aSIlya Dryomov } 4469ed95b21aSIlya Dryomov 4470ed95b21aSIlya Dryomov ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid, 4471ed95b21aSIlya Dryomov &rbd_dev->header_oloc, notify_id, cookie, 4472ed95b21aSIlya Dryomov buf, buf_size); 4473ed95b21aSIlya Dryomov if (ret) 4474ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret); 4475ed95b21aSIlya Dryomov } 4476ed95b21aSIlya Dryomov 4477ed95b21aSIlya Dryomov static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id, 4478ed95b21aSIlya Dryomov u64 cookie) 4479ed95b21aSIlya Dryomov { 4480ed95b21aSIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 4481ed95b21aSIlya Dryomov __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL); 4482ed95b21aSIlya Dryomov } 4483ed95b21aSIlya Dryomov 4484ed95b21aSIlya Dryomov static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev, 4485ed95b21aSIlya Dryomov u64 notify_id, u64 cookie, s32 result) 4486ed95b21aSIlya Dryomov { 4487ed95b21aSIlya Dryomov dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result); 4488ed95b21aSIlya Dryomov __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result); 4489ed95b21aSIlya Dryomov } 4490922dab61SIlya Dryomov 4491922dab61SIlya Dryomov static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie, 4492922dab61SIlya Dryomov u64 notifier_id, void *data, size_t data_len) 4493bf0d5f50SAlex Elder { 4494922dab61SIlya Dryomov struct rbd_device *rbd_dev = arg; 4495ed95b21aSIlya Dryomov void *p = data; 4496ed95b21aSIlya Dryomov void *const end = p + data_len; 4497d4c2269bSIlya Dryomov u8 struct_v = 0; 4498ed95b21aSIlya Dryomov u32 len; 4499ed95b21aSIlya Dryomov u32 notify_op; 4500bf0d5f50SAlex Elder int ret; 4501bf0d5f50SAlex Elder 4502ed95b21aSIlya Dryomov dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n", 4503ed95b21aSIlya Dryomov __func__, rbd_dev, cookie, notify_id, data_len); 4504ed95b21aSIlya Dryomov if (data_len) { 4505ed95b21aSIlya Dryomov ret = ceph_start_decoding(&p, end, 1, "NotifyMessage", 4506ed95b21aSIlya Dryomov &struct_v, &len); 4507ed95b21aSIlya Dryomov if (ret) { 4508ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d", 4509ed95b21aSIlya Dryomov ret); 4510ed95b21aSIlya Dryomov return; 4511ed95b21aSIlya Dryomov } 451252bb1f9bSIlya Dryomov 4513ed95b21aSIlya Dryomov notify_op = ceph_decode_32(&p); 4514ed95b21aSIlya Dryomov } else { 4515ed95b21aSIlya Dryomov /* legacy notification for header updates */ 4516ed95b21aSIlya Dryomov notify_op = RBD_NOTIFY_OP_HEADER_UPDATE; 4517ed95b21aSIlya Dryomov len = 0; 4518ed95b21aSIlya Dryomov } 4519ed95b21aSIlya Dryomov 4520ed95b21aSIlya Dryomov dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op); 4521ed95b21aSIlya Dryomov switch (notify_op) { 4522ed95b21aSIlya Dryomov case RBD_NOTIFY_OP_ACQUIRED_LOCK: 4523ed95b21aSIlya Dryomov rbd_handle_acquired_lock(rbd_dev, struct_v, &p); 4524ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 4525ed95b21aSIlya Dryomov break; 4526ed95b21aSIlya Dryomov case RBD_NOTIFY_OP_RELEASED_LOCK: 4527ed95b21aSIlya Dryomov rbd_handle_released_lock(rbd_dev, struct_v, &p); 4528ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 4529ed95b21aSIlya Dryomov break; 4530ed95b21aSIlya Dryomov case RBD_NOTIFY_OP_REQUEST_LOCK: 45313b77faa0SIlya Dryomov ret = rbd_handle_request_lock(rbd_dev, struct_v, &p); 45323b77faa0SIlya Dryomov if (ret <= 0) 4533ed95b21aSIlya Dryomov rbd_acknowledge_notify_result(rbd_dev, notify_id, 45343b77faa0SIlya Dryomov cookie, ret); 4535ed95b21aSIlya Dryomov else 4536ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 4537ed95b21aSIlya Dryomov break; 4538ed95b21aSIlya Dryomov case RBD_NOTIFY_OP_HEADER_UPDATE: 4539e627db08SAlex Elder ret = rbd_dev_refresh(rbd_dev); 4540e627db08SAlex Elder if (ret) 45419584d508SIlya Dryomov rbd_warn(rbd_dev, "refresh failed: %d", ret); 4542bf0d5f50SAlex Elder 4543ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 4544ed95b21aSIlya Dryomov break; 4545ed95b21aSIlya Dryomov default: 4546ed95b21aSIlya Dryomov if (rbd_is_lock_owner(rbd_dev)) 4547ed95b21aSIlya Dryomov rbd_acknowledge_notify_result(rbd_dev, notify_id, 4548ed95b21aSIlya Dryomov cookie, -EOPNOTSUPP); 4549ed95b21aSIlya Dryomov else 4550ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 4551ed95b21aSIlya Dryomov break; 45529969ebc5SAlex Elder } 45539969ebc5SAlex Elder } 45549969ebc5SAlex Elder 455599d16943SIlya Dryomov static void __rbd_unregister_watch(struct rbd_device *rbd_dev); 45569969ebc5SAlex Elder 4557922dab61SIlya Dryomov static void rbd_watch_errcb(void *arg, u64 cookie, int err) 4558bb040aa0SIlya Dryomov { 4559922dab61SIlya Dryomov struct rbd_device *rbd_dev = arg; 4560bb040aa0SIlya Dryomov 4561922dab61SIlya Dryomov rbd_warn(rbd_dev, "encountered watch error: %d", err); 4562bb040aa0SIlya Dryomov 4563ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 4564ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &rbd_empty_cid); 4565ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 4566bb040aa0SIlya Dryomov 456799d16943SIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 456899d16943SIlya Dryomov if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) { 456999d16943SIlya Dryomov __rbd_unregister_watch(rbd_dev); 457099d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_ERROR; 4571bb040aa0SIlya Dryomov 457299d16943SIlya Dryomov queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0); 4573bb040aa0SIlya Dryomov } 457499d16943SIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 4575bb040aa0SIlya Dryomov } 4576bb040aa0SIlya Dryomov 4577bb040aa0SIlya Dryomov /* 457899d16943SIlya Dryomov * watch_mutex must be locked 45799969ebc5SAlex Elder */ 458099d16943SIlya Dryomov static int __rbd_register_watch(struct rbd_device *rbd_dev) 45819969ebc5SAlex Elder { 45829969ebc5SAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 4583922dab61SIlya Dryomov struct ceph_osd_linger_request *handle; 45849969ebc5SAlex Elder 4585922dab61SIlya Dryomov rbd_assert(!rbd_dev->watch_handle); 458699d16943SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 45879969ebc5SAlex Elder 4588922dab61SIlya Dryomov handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid, 4589922dab61SIlya Dryomov &rbd_dev->header_oloc, rbd_watch_cb, 4590922dab61SIlya Dryomov rbd_watch_errcb, rbd_dev); 4591922dab61SIlya Dryomov if (IS_ERR(handle)) 4592922dab61SIlya Dryomov return PTR_ERR(handle); 45939969ebc5SAlex Elder 4594922dab61SIlya Dryomov rbd_dev->watch_handle = handle; 45958eb87565SAlex Elder return 0; 45969969ebc5SAlex Elder } 45979969ebc5SAlex Elder 459899d16943SIlya Dryomov /* 459999d16943SIlya Dryomov * watch_mutex must be locked 460099d16943SIlya Dryomov */ 460199d16943SIlya Dryomov static void __rbd_unregister_watch(struct rbd_device *rbd_dev) 4602fca27065SIlya Dryomov { 4603922dab61SIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 4604922dab61SIlya Dryomov int ret; 4605b30a01f2SIlya Dryomov 460699d16943SIlya Dryomov rbd_assert(rbd_dev->watch_handle); 460799d16943SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 4608b30a01f2SIlya Dryomov 4609922dab61SIlya Dryomov ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle); 4610922dab61SIlya Dryomov if (ret) 4611922dab61SIlya Dryomov rbd_warn(rbd_dev, "failed to unwatch: %d", ret); 4612b30a01f2SIlya Dryomov 4613922dab61SIlya Dryomov rbd_dev->watch_handle = NULL; 4614c525f036SIlya Dryomov } 4615c525f036SIlya Dryomov 461699d16943SIlya Dryomov static int rbd_register_watch(struct rbd_device *rbd_dev) 4617c525f036SIlya Dryomov { 461899d16943SIlya Dryomov int ret; 4619811c6688SIlya Dryomov 462099d16943SIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 462199d16943SIlya Dryomov rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED); 462299d16943SIlya Dryomov ret = __rbd_register_watch(rbd_dev); 462399d16943SIlya Dryomov if (ret) 462499d16943SIlya Dryomov goto out; 462599d16943SIlya Dryomov 462699d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED; 462799d16943SIlya Dryomov rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id; 462899d16943SIlya Dryomov 462999d16943SIlya Dryomov out: 463099d16943SIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 463199d16943SIlya Dryomov return ret; 463299d16943SIlya Dryomov } 463399d16943SIlya Dryomov 463499d16943SIlya Dryomov static void cancel_tasks_sync(struct rbd_device *rbd_dev) 463599d16943SIlya Dryomov { 463699d16943SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 463799d16943SIlya Dryomov 4638ed95b21aSIlya Dryomov cancel_work_sync(&rbd_dev->acquired_lock_work); 4639ed95b21aSIlya Dryomov cancel_work_sync(&rbd_dev->released_lock_work); 4640ed95b21aSIlya Dryomov cancel_delayed_work_sync(&rbd_dev->lock_dwork); 4641ed95b21aSIlya Dryomov cancel_work_sync(&rbd_dev->unlock_work); 464299d16943SIlya Dryomov } 464399d16943SIlya Dryomov 464499d16943SIlya Dryomov static void rbd_unregister_watch(struct rbd_device *rbd_dev) 464599d16943SIlya Dryomov { 464699d16943SIlya Dryomov cancel_tasks_sync(rbd_dev); 464799d16943SIlya Dryomov 464899d16943SIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 464999d16943SIlya Dryomov if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) 465099d16943SIlya Dryomov __rbd_unregister_watch(rbd_dev); 465199d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED; 465299d16943SIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 465399d16943SIlya Dryomov 465423edca86SDongsheng Yang cancel_delayed_work_sync(&rbd_dev->watch_dwork); 4655811c6688SIlya Dryomov ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc); 4656fca27065SIlya Dryomov } 4657fca27065SIlya Dryomov 465814bb211dSIlya Dryomov /* 465914bb211dSIlya Dryomov * lock_rwsem must be held for write 466014bb211dSIlya Dryomov */ 466114bb211dSIlya Dryomov static void rbd_reacquire_lock(struct rbd_device *rbd_dev) 466214bb211dSIlya Dryomov { 466314bb211dSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 466414bb211dSIlya Dryomov char cookie[32]; 466514bb211dSIlya Dryomov int ret; 466614bb211dSIlya Dryomov 4667a2b1da09SIlya Dryomov if (!rbd_quiesce_lock(rbd_dev)) 4668a2b1da09SIlya Dryomov return; 466914bb211dSIlya Dryomov 467014bb211dSIlya Dryomov format_lock_cookie(rbd_dev, cookie); 467114bb211dSIlya Dryomov ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid, 467214bb211dSIlya Dryomov &rbd_dev->header_oloc, RBD_LOCK_NAME, 467314bb211dSIlya Dryomov CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie, 467414bb211dSIlya Dryomov RBD_LOCK_TAG, cookie); 467514bb211dSIlya Dryomov if (ret) { 467614bb211dSIlya Dryomov if (ret != -EOPNOTSUPP) 467714bb211dSIlya Dryomov rbd_warn(rbd_dev, "failed to update lock cookie: %d", 467814bb211dSIlya Dryomov ret); 467914bb211dSIlya Dryomov 468014bb211dSIlya Dryomov /* 468114bb211dSIlya Dryomov * Lock cookie cannot be updated on older OSDs, so do 468214bb211dSIlya Dryomov * a manual release and queue an acquire. 468314bb211dSIlya Dryomov */ 4684e1fddc8fSIlya Dryomov __rbd_release_lock(rbd_dev); 4685a2b1da09SIlya Dryomov queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0); 468614bb211dSIlya Dryomov } else { 4687edd8ca80SFlorian Margaine __rbd_lock(rbd_dev, cookie); 4688637cd060SIlya Dryomov wake_lock_waiters(rbd_dev, 0); 468914bb211dSIlya Dryomov } 469014bb211dSIlya Dryomov } 469114bb211dSIlya Dryomov 469299d16943SIlya Dryomov static void rbd_reregister_watch(struct work_struct *work) 469399d16943SIlya Dryomov { 469499d16943SIlya Dryomov struct rbd_device *rbd_dev = container_of(to_delayed_work(work), 469599d16943SIlya Dryomov struct rbd_device, watch_dwork); 469699d16943SIlya Dryomov int ret; 469799d16943SIlya Dryomov 469899d16943SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 469999d16943SIlya Dryomov 470099d16943SIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 470187c0fdedSIlya Dryomov if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) { 470287c0fdedSIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 470314bb211dSIlya Dryomov return; 470487c0fdedSIlya Dryomov } 470599d16943SIlya Dryomov 470699d16943SIlya Dryomov ret = __rbd_register_watch(rbd_dev); 470799d16943SIlya Dryomov if (ret) { 470899d16943SIlya Dryomov rbd_warn(rbd_dev, "failed to reregister watch: %d", ret); 4709637cd060SIlya Dryomov if (ret != -EBLACKLISTED && ret != -ENOENT) { 471099d16943SIlya Dryomov queue_delayed_work(rbd_dev->task_wq, 471199d16943SIlya Dryomov &rbd_dev->watch_dwork, 471299d16943SIlya Dryomov RBD_RETRY_DELAY); 471387c0fdedSIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 471414bb211dSIlya Dryomov return; 471599d16943SIlya Dryomov } 471699d16943SIlya Dryomov 4717637cd060SIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 4718637cd060SIlya Dryomov down_write(&rbd_dev->lock_rwsem); 4719637cd060SIlya Dryomov wake_lock_waiters(rbd_dev, ret); 4720637cd060SIlya Dryomov up_write(&rbd_dev->lock_rwsem); 4721637cd060SIlya Dryomov return; 4722637cd060SIlya Dryomov } 4723637cd060SIlya Dryomov 472499d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED; 472599d16943SIlya Dryomov rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id; 472699d16943SIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 472799d16943SIlya Dryomov 472814bb211dSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 472914bb211dSIlya Dryomov if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) 473014bb211dSIlya Dryomov rbd_reacquire_lock(rbd_dev); 473114bb211dSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 473214bb211dSIlya Dryomov 473399d16943SIlya Dryomov ret = rbd_dev_refresh(rbd_dev); 473499d16943SIlya Dryomov if (ret) 4735f6870cc9SColin Ian King rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret); 473699d16943SIlya Dryomov } 473799d16943SIlya Dryomov 473836be9a76SAlex Elder /* 4739f40eb349SAlex Elder * Synchronous osd object method call. Returns the number of bytes 4740f40eb349SAlex Elder * returned in the outbound buffer, or a negative error code. 474136be9a76SAlex Elder */ 474236be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev, 4743ecd4a68aSIlya Dryomov struct ceph_object_id *oid, 4744ecd4a68aSIlya Dryomov struct ceph_object_locator *oloc, 474536be9a76SAlex Elder const char *method_name, 47464157976bSAlex Elder const void *outbound, 474736be9a76SAlex Elder size_t outbound_size, 47484157976bSAlex Elder void *inbound, 4749e2a58ee5SAlex Elder size_t inbound_size) 475036be9a76SAlex Elder { 4751ecd4a68aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 4752ecd4a68aSIlya Dryomov struct page *req_page = NULL; 4753ecd4a68aSIlya Dryomov struct page *reply_page; 475436be9a76SAlex Elder int ret; 475536be9a76SAlex Elder 475636be9a76SAlex Elder /* 47576010a451SAlex Elder * Method calls are ultimately read operations. The result 47586010a451SAlex Elder * should placed into the inbound buffer provided. They 47596010a451SAlex Elder * also supply outbound data--parameters for the object 47606010a451SAlex Elder * method. Currently if this is present it will be a 47616010a451SAlex Elder * snapshot id. 476236be9a76SAlex Elder */ 4763ecd4a68aSIlya Dryomov if (outbound) { 4764ecd4a68aSIlya Dryomov if (outbound_size > PAGE_SIZE) 4765ecd4a68aSIlya Dryomov return -E2BIG; 476636be9a76SAlex Elder 4767ecd4a68aSIlya Dryomov req_page = alloc_page(GFP_KERNEL); 4768ecd4a68aSIlya Dryomov if (!req_page) 4769ecd4a68aSIlya Dryomov return -ENOMEM; 477036be9a76SAlex Elder 4771ecd4a68aSIlya Dryomov memcpy(page_address(req_page), outbound, outbound_size); 477204017e29SAlex Elder } 4773430c28c3SAlex Elder 4774ecd4a68aSIlya Dryomov reply_page = alloc_page(GFP_KERNEL); 4775ecd4a68aSIlya Dryomov if (!reply_page) { 4776ecd4a68aSIlya Dryomov if (req_page) 4777ecd4a68aSIlya Dryomov __free_page(req_page); 4778ecd4a68aSIlya Dryomov return -ENOMEM; 4779ecd4a68aSIlya Dryomov } 478036be9a76SAlex Elder 4781ecd4a68aSIlya Dryomov ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name, 4782ecd4a68aSIlya Dryomov CEPH_OSD_FLAG_READ, req_page, outbound_size, 478368ada915SIlya Dryomov &reply_page, &inbound_size); 4784ecd4a68aSIlya Dryomov if (!ret) { 4785ecd4a68aSIlya Dryomov memcpy(inbound, page_address(reply_page), inbound_size); 4786ecd4a68aSIlya Dryomov ret = inbound_size; 4787ecd4a68aSIlya Dryomov } 478857385b51SAlex Elder 4789ecd4a68aSIlya Dryomov if (req_page) 4790ecd4a68aSIlya Dryomov __free_page(req_page); 4791ecd4a68aSIlya Dryomov __free_page(reply_page); 479236be9a76SAlex Elder return ret; 479336be9a76SAlex Elder } 479436be9a76SAlex Elder 47957ad18afaSChristoph Hellwig static void rbd_queue_workfn(struct work_struct *work) 4796bc1ecc65SIlya Dryomov { 47977ad18afaSChristoph Hellwig struct request *rq = blk_mq_rq_from_pdu(work); 47987ad18afaSChristoph Hellwig struct rbd_device *rbd_dev = rq->q->queuedata; 4799bc1ecc65SIlya Dryomov struct rbd_img_request *img_request; 48004e752f0aSJosh Durgin struct ceph_snap_context *snapc = NULL; 4801bc1ecc65SIlya Dryomov u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT; 4802bc1ecc65SIlya Dryomov u64 length = blk_rq_bytes(rq); 48036d2940c8SGuangliang Zhao enum obj_operation_type op_type; 48044e752f0aSJosh Durgin u64 mapping_size; 4805bc1ecc65SIlya Dryomov int result; 4806bc1ecc65SIlya Dryomov 4807aebf526bSChristoph Hellwig switch (req_op(rq)) { 4808aebf526bSChristoph Hellwig case REQ_OP_DISCARD: 4809aebf526bSChristoph Hellwig op_type = OBJ_OP_DISCARD; 4810aebf526bSChristoph Hellwig break; 48116484cbe9SIlya Dryomov case REQ_OP_WRITE_ZEROES: 48126484cbe9SIlya Dryomov op_type = OBJ_OP_ZEROOUT; 48136484cbe9SIlya Dryomov break; 4814aebf526bSChristoph Hellwig case REQ_OP_WRITE: 4815aebf526bSChristoph Hellwig op_type = OBJ_OP_WRITE; 4816aebf526bSChristoph Hellwig break; 4817aebf526bSChristoph Hellwig case REQ_OP_READ: 4818aebf526bSChristoph Hellwig op_type = OBJ_OP_READ; 4819aebf526bSChristoph Hellwig break; 4820aebf526bSChristoph Hellwig default: 4821aebf526bSChristoph Hellwig dout("%s: non-fs request type %d\n", __func__, req_op(rq)); 48227ad18afaSChristoph Hellwig result = -EIO; 48237ad18afaSChristoph Hellwig goto err; 48247ad18afaSChristoph Hellwig } 48257ad18afaSChristoph Hellwig 4826bc1ecc65SIlya Dryomov /* Ignore/skip any zero-length requests */ 4827bc1ecc65SIlya Dryomov 4828bc1ecc65SIlya Dryomov if (!length) { 4829bc1ecc65SIlya Dryomov dout("%s: zero-length request\n", __func__); 4830bc1ecc65SIlya Dryomov result = 0; 4831bc1ecc65SIlya Dryomov goto err_rq; 4832bc1ecc65SIlya Dryomov } 4833bc1ecc65SIlya Dryomov 4834f3c0e459SIlya Dryomov if (op_type != OBJ_OP_READ && rbd_is_snap(rbd_dev)) { 4835b91a7bdcSIlya Dryomov rbd_warn(rbd_dev, "%s on read-only snapshot", 4836b91a7bdcSIlya Dryomov obj_op_name(op_type)); 4837b91a7bdcSIlya Dryomov result = -EIO; 4838b91a7bdcSIlya Dryomov goto err; 4839b91a7bdcSIlya Dryomov } 4840bc1ecc65SIlya Dryomov 4841bc1ecc65SIlya Dryomov /* 4842bc1ecc65SIlya Dryomov * Quit early if the mapped snapshot no longer exists. It's 4843bc1ecc65SIlya Dryomov * still possible the snapshot will have disappeared by the 4844bc1ecc65SIlya Dryomov * time our request arrives at the osd, but there's no sense in 4845bc1ecc65SIlya Dryomov * sending it if we already know. 4846bc1ecc65SIlya Dryomov */ 4847bc1ecc65SIlya Dryomov if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) { 4848bc1ecc65SIlya Dryomov dout("request for non-existent snapshot"); 4849f3c0e459SIlya Dryomov rbd_assert(rbd_is_snap(rbd_dev)); 4850bc1ecc65SIlya Dryomov result = -ENXIO; 4851bc1ecc65SIlya Dryomov goto err_rq; 4852bc1ecc65SIlya Dryomov } 4853bc1ecc65SIlya Dryomov 4854bc1ecc65SIlya Dryomov if (offset && length > U64_MAX - offset + 1) { 4855bc1ecc65SIlya Dryomov rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset, 4856bc1ecc65SIlya Dryomov length); 4857bc1ecc65SIlya Dryomov result = -EINVAL; 4858bc1ecc65SIlya Dryomov goto err_rq; /* Shouldn't happen */ 4859bc1ecc65SIlya Dryomov } 4860bc1ecc65SIlya Dryomov 48617ad18afaSChristoph Hellwig blk_mq_start_request(rq); 48627ad18afaSChristoph Hellwig 48634e752f0aSJosh Durgin down_read(&rbd_dev->header_rwsem); 48644e752f0aSJosh Durgin mapping_size = rbd_dev->mapping.size; 48656d2940c8SGuangliang Zhao if (op_type != OBJ_OP_READ) { 48664e752f0aSJosh Durgin snapc = rbd_dev->header.snapc; 48674e752f0aSJosh Durgin ceph_get_snap_context(snapc); 48684e752f0aSJosh Durgin } 48694e752f0aSJosh Durgin up_read(&rbd_dev->header_rwsem); 48704e752f0aSJosh Durgin 48714e752f0aSJosh Durgin if (offset + length > mapping_size) { 4872bc1ecc65SIlya Dryomov rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset, 48734e752f0aSJosh Durgin length, mapping_size); 4874bc1ecc65SIlya Dryomov result = -EIO; 4875bc1ecc65SIlya Dryomov goto err_rq; 4876bc1ecc65SIlya Dryomov } 4877bc1ecc65SIlya Dryomov 4878dfd9875fSIlya Dryomov img_request = rbd_img_request_create(rbd_dev, op_type, snapc); 4879bc1ecc65SIlya Dryomov if (!img_request) { 4880bc1ecc65SIlya Dryomov result = -ENOMEM; 4881637cd060SIlya Dryomov goto err_rq; 4882bc1ecc65SIlya Dryomov } 4883bc1ecc65SIlya Dryomov img_request->rq = rq; 488470b16db8SIlya Dryomov snapc = NULL; /* img_request consumes a ref */ 4885bc1ecc65SIlya Dryomov 488621ed05a8SIlya Dryomov dout("%s rbd_dev %p img_req %p %s %llu~%llu\n", __func__, rbd_dev, 488721ed05a8SIlya Dryomov img_request, obj_op_name(op_type), offset, length); 488821ed05a8SIlya Dryomov 48896484cbe9SIlya Dryomov if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT) 48905a237819SIlya Dryomov result = rbd_img_fill_nodata(img_request, offset, length); 489190e98c52SGuangliang Zhao else 48925a237819SIlya Dryomov result = rbd_img_fill_from_bio(img_request, offset, length, 489390e98c52SGuangliang Zhao rq->bio); 48940192ce2eSIlya Dryomov if (result) 4895bc1ecc65SIlya Dryomov goto err_img_request; 4896bc1ecc65SIlya Dryomov 4897e1fddc8fSIlya Dryomov rbd_img_handle_request(img_request, 0); 4898bc1ecc65SIlya Dryomov return; 4899bc1ecc65SIlya Dryomov 4900bc1ecc65SIlya Dryomov err_img_request: 4901bc1ecc65SIlya Dryomov rbd_img_request_put(img_request); 4902bc1ecc65SIlya Dryomov err_rq: 4903bc1ecc65SIlya Dryomov if (result) 4904bc1ecc65SIlya Dryomov rbd_warn(rbd_dev, "%s %llx at %llx result %d", 49056d2940c8SGuangliang Zhao obj_op_name(op_type), length, offset, result); 49064e752f0aSJosh Durgin ceph_put_snap_context(snapc); 49077ad18afaSChristoph Hellwig err: 49082a842acaSChristoph Hellwig blk_mq_end_request(rq, errno_to_blk_status(result)); 4909bc1ecc65SIlya Dryomov } 4910bc1ecc65SIlya Dryomov 4911fc17b653SChristoph Hellwig static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx, 49127ad18afaSChristoph Hellwig const struct blk_mq_queue_data *bd) 4913bc1ecc65SIlya Dryomov { 49147ad18afaSChristoph Hellwig struct request *rq = bd->rq; 49157ad18afaSChristoph Hellwig struct work_struct *work = blk_mq_rq_to_pdu(rq); 4916bc1ecc65SIlya Dryomov 49177ad18afaSChristoph Hellwig queue_work(rbd_wq, work); 4918fc17b653SChristoph Hellwig return BLK_STS_OK; 4919bf0d5f50SAlex Elder } 4920bf0d5f50SAlex Elder 4921602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev) 4922602adf40SYehuda Sadeh { 49235769ed0cSIlya Dryomov blk_cleanup_queue(rbd_dev->disk->queue); 49247ad18afaSChristoph Hellwig blk_mq_free_tag_set(&rbd_dev->tag_set); 49255769ed0cSIlya Dryomov put_disk(rbd_dev->disk); 49265769ed0cSIlya Dryomov rbd_dev->disk = NULL; 4927602adf40SYehuda Sadeh } 4928602adf40SYehuda Sadeh 4929788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev, 4930fe5478e0SIlya Dryomov struct ceph_object_id *oid, 4931fe5478e0SIlya Dryomov struct ceph_object_locator *oloc, 4932fe5478e0SIlya Dryomov void *buf, int buf_len) 4933788e2df3SAlex Elder 4934788e2df3SAlex Elder { 4935fe5478e0SIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 4936fe5478e0SIlya Dryomov struct ceph_osd_request *req; 4937fe5478e0SIlya Dryomov struct page **pages; 4938fe5478e0SIlya Dryomov int num_pages = calc_pages_for(0, buf_len); 4939788e2df3SAlex Elder int ret; 4940788e2df3SAlex Elder 4941fe5478e0SIlya Dryomov req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL); 4942fe5478e0SIlya Dryomov if (!req) 4943fe5478e0SIlya Dryomov return -ENOMEM; 4944788e2df3SAlex Elder 4945fe5478e0SIlya Dryomov ceph_oid_copy(&req->r_base_oid, oid); 4946fe5478e0SIlya Dryomov ceph_oloc_copy(&req->r_base_oloc, oloc); 4947fe5478e0SIlya Dryomov req->r_flags = CEPH_OSD_FLAG_READ; 4948788e2df3SAlex Elder 4949fe5478e0SIlya Dryomov pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); 4950fe5478e0SIlya Dryomov if (IS_ERR(pages)) { 4951fe5478e0SIlya Dryomov ret = PTR_ERR(pages); 4952fe5478e0SIlya Dryomov goto out_req; 4953fe5478e0SIlya Dryomov } 49541ceae7efSAlex Elder 4955fe5478e0SIlya Dryomov osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0); 4956fe5478e0SIlya Dryomov osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false, 4957fe5478e0SIlya Dryomov true); 4958788e2df3SAlex Elder 495926f887e0SIlya Dryomov ret = ceph_osdc_alloc_messages(req, GFP_KERNEL); 496026f887e0SIlya Dryomov if (ret) 496126f887e0SIlya Dryomov goto out_req; 496226f887e0SIlya Dryomov 4963fe5478e0SIlya Dryomov ceph_osdc_start_request(osdc, req, false); 4964fe5478e0SIlya Dryomov ret = ceph_osdc_wait_request(osdc, req); 4965fe5478e0SIlya Dryomov if (ret >= 0) 4966fe5478e0SIlya Dryomov ceph_copy_from_page_vector(pages, buf, 0, ret); 4967fe5478e0SIlya Dryomov 4968fe5478e0SIlya Dryomov out_req: 4969fe5478e0SIlya Dryomov ceph_osdc_put_request(req); 4970788e2df3SAlex Elder return ret; 4971788e2df3SAlex Elder } 4972788e2df3SAlex Elder 4973602adf40SYehuda Sadeh /* 4974662518b1SAlex Elder * Read the complete header for the given rbd device. On successful 4975662518b1SAlex Elder * return, the rbd_dev->header field will contain up-to-date 4976662518b1SAlex Elder * information about the image. 49774156d998SAlex Elder */ 497899a41ebcSAlex Elder static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev) 49794156d998SAlex Elder { 49804156d998SAlex Elder struct rbd_image_header_ondisk *ondisk = NULL; 49814156d998SAlex Elder u32 snap_count = 0; 49824156d998SAlex Elder u64 names_size = 0; 49834156d998SAlex Elder u32 want_count; 49844156d998SAlex Elder int ret; 49854156d998SAlex Elder 49864156d998SAlex Elder /* 49874156d998SAlex Elder * The complete header will include an array of its 64-bit 49884156d998SAlex Elder * snapshot ids, followed by the names of those snapshots as 49894156d998SAlex Elder * a contiguous block of NUL-terminated strings. Note that 49904156d998SAlex Elder * the number of snapshots could change by the time we read 49914156d998SAlex Elder * it in, in which case we re-read it. 49924156d998SAlex Elder */ 49934156d998SAlex Elder do { 49944156d998SAlex Elder size_t size; 49954156d998SAlex Elder 49964156d998SAlex Elder kfree(ondisk); 49974156d998SAlex Elder 49984156d998SAlex Elder size = sizeof (*ondisk); 49994156d998SAlex Elder size += snap_count * sizeof (struct rbd_image_snap_ondisk); 50004156d998SAlex Elder size += names_size; 50014156d998SAlex Elder ondisk = kmalloc(size, GFP_KERNEL); 50024156d998SAlex Elder if (!ondisk) 5003662518b1SAlex Elder return -ENOMEM; 50044156d998SAlex Elder 5005fe5478e0SIlya Dryomov ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid, 5006fe5478e0SIlya Dryomov &rbd_dev->header_oloc, ondisk, size); 50074156d998SAlex Elder if (ret < 0) 5008662518b1SAlex Elder goto out; 5009c0cd10dbSAlex Elder if ((size_t)ret < size) { 50104156d998SAlex Elder ret = -ENXIO; 501106ecc6cbSAlex Elder rbd_warn(rbd_dev, "short header read (want %zd got %d)", 501206ecc6cbSAlex Elder size, ret); 5013662518b1SAlex Elder goto out; 50144156d998SAlex Elder } 50154156d998SAlex Elder if (!rbd_dev_ondisk_valid(ondisk)) { 50164156d998SAlex Elder ret = -ENXIO; 501706ecc6cbSAlex Elder rbd_warn(rbd_dev, "invalid header"); 5018662518b1SAlex Elder goto out; 50194156d998SAlex Elder } 50204156d998SAlex Elder 50214156d998SAlex Elder names_size = le64_to_cpu(ondisk->snap_names_len); 50224156d998SAlex Elder want_count = snap_count; 50234156d998SAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 50244156d998SAlex Elder } while (snap_count != want_count); 50254156d998SAlex Elder 5026662518b1SAlex Elder ret = rbd_header_from_disk(rbd_dev, ondisk); 5027662518b1SAlex Elder out: 50284156d998SAlex Elder kfree(ondisk); 50294156d998SAlex Elder 5030dfc5606dSYehuda Sadeh return ret; 5031602adf40SYehuda Sadeh } 5032602adf40SYehuda Sadeh 503315228edeSAlex Elder /* 503415228edeSAlex Elder * Clear the rbd device's EXISTS flag if the snapshot it's mapped to 503515228edeSAlex Elder * has disappeared from the (just updated) snapshot context. 503615228edeSAlex Elder */ 503715228edeSAlex Elder static void rbd_exists_validate(struct rbd_device *rbd_dev) 503815228edeSAlex Elder { 503915228edeSAlex Elder u64 snap_id; 504015228edeSAlex Elder 504115228edeSAlex Elder if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) 504215228edeSAlex Elder return; 504315228edeSAlex Elder 504415228edeSAlex Elder snap_id = rbd_dev->spec->snap_id; 504515228edeSAlex Elder if (snap_id == CEPH_NOSNAP) 504615228edeSAlex Elder return; 504715228edeSAlex Elder 504815228edeSAlex Elder if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX) 504915228edeSAlex Elder clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 505015228edeSAlex Elder } 505115228edeSAlex Elder 50529875201eSJosh Durgin static void rbd_dev_update_size(struct rbd_device *rbd_dev) 50539875201eSJosh Durgin { 50549875201eSJosh Durgin sector_t size; 50559875201eSJosh Durgin 50569875201eSJosh Durgin /* 5057811c6688SIlya Dryomov * If EXISTS is not set, rbd_dev->disk may be NULL, so don't 5058811c6688SIlya Dryomov * try to update its size. If REMOVING is set, updating size 5059811c6688SIlya Dryomov * is just useless work since the device can't be opened. 50609875201eSJosh Durgin */ 5061811c6688SIlya Dryomov if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) && 5062811c6688SIlya Dryomov !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) { 50639875201eSJosh Durgin size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE; 50649875201eSJosh Durgin dout("setting size to %llu sectors", (unsigned long long)size); 50659875201eSJosh Durgin set_capacity(rbd_dev->disk, size); 50669875201eSJosh Durgin revalidate_disk(rbd_dev->disk); 50679875201eSJosh Durgin } 50689875201eSJosh Durgin } 50699875201eSJosh Durgin 5070cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev) 50711fe5e993SAlex Elder { 5072e627db08SAlex Elder u64 mapping_size; 50731fe5e993SAlex Elder int ret; 50741fe5e993SAlex Elder 5075cfbf6377SAlex Elder down_write(&rbd_dev->header_rwsem); 50763b5cf2a2SAlex Elder mapping_size = rbd_dev->mapping.size; 5077a720ae09SIlya Dryomov 5078a720ae09SIlya Dryomov ret = rbd_dev_header_info(rbd_dev); 507952bb1f9bSIlya Dryomov if (ret) 508073e39e4dSIlya Dryomov goto out; 508115228edeSAlex Elder 5082e8f59b59SIlya Dryomov /* 5083e8f59b59SIlya Dryomov * If there is a parent, see if it has disappeared due to the 5084e8f59b59SIlya Dryomov * mapped image getting flattened. 5085e8f59b59SIlya Dryomov */ 5086e8f59b59SIlya Dryomov if (rbd_dev->parent) { 5087e8f59b59SIlya Dryomov ret = rbd_dev_v2_parent_info(rbd_dev); 5088e8f59b59SIlya Dryomov if (ret) 508973e39e4dSIlya Dryomov goto out; 5090e8f59b59SIlya Dryomov } 5091e8f59b59SIlya Dryomov 5092f3c0e459SIlya Dryomov if (!rbd_is_snap(rbd_dev)) { 50935ff1108cSIlya Dryomov rbd_dev->mapping.size = rbd_dev->header.image_size; 50945ff1108cSIlya Dryomov } else { 50955ff1108cSIlya Dryomov /* validate mapped snapshot's EXISTS flag */ 509615228edeSAlex Elder rbd_exists_validate(rbd_dev); 50975ff1108cSIlya Dryomov } 50985ff1108cSIlya Dryomov 509973e39e4dSIlya Dryomov out: 5100cfbf6377SAlex Elder up_write(&rbd_dev->header_rwsem); 510173e39e4dSIlya Dryomov if (!ret && mapping_size != rbd_dev->mapping.size) 51029875201eSJosh Durgin rbd_dev_update_size(rbd_dev); 51031fe5e993SAlex Elder 510473e39e4dSIlya Dryomov return ret; 51051fe5e993SAlex Elder } 51061fe5e993SAlex Elder 5107d6296d39SChristoph Hellwig static int rbd_init_request(struct blk_mq_tag_set *set, struct request *rq, 5108d6296d39SChristoph Hellwig unsigned int hctx_idx, unsigned int numa_node) 51097ad18afaSChristoph Hellwig { 51107ad18afaSChristoph Hellwig struct work_struct *work = blk_mq_rq_to_pdu(rq); 51117ad18afaSChristoph Hellwig 51127ad18afaSChristoph Hellwig INIT_WORK(work, rbd_queue_workfn); 51137ad18afaSChristoph Hellwig return 0; 51147ad18afaSChristoph Hellwig } 51157ad18afaSChristoph Hellwig 5116f363b089SEric Biggers static const struct blk_mq_ops rbd_mq_ops = { 51177ad18afaSChristoph Hellwig .queue_rq = rbd_queue_rq, 51187ad18afaSChristoph Hellwig .init_request = rbd_init_request, 51197ad18afaSChristoph Hellwig }; 51207ad18afaSChristoph Hellwig 5121602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev) 5122602adf40SYehuda Sadeh { 5123602adf40SYehuda Sadeh struct gendisk *disk; 5124602adf40SYehuda Sadeh struct request_queue *q; 5125420efbdfSIlya Dryomov unsigned int objset_bytes = 5126420efbdfSIlya Dryomov rbd_dev->layout.object_size * rbd_dev->layout.stripe_count; 51277ad18afaSChristoph Hellwig int err; 5128602adf40SYehuda Sadeh 5129602adf40SYehuda Sadeh /* create gendisk info */ 51307e513d43SIlya Dryomov disk = alloc_disk(single_major ? 51317e513d43SIlya Dryomov (1 << RBD_SINGLE_MAJOR_PART_SHIFT) : 51327e513d43SIlya Dryomov RBD_MINORS_PER_MAJOR); 5133602adf40SYehuda Sadeh if (!disk) 51341fcdb8aaSAlex Elder return -ENOMEM; 5135602adf40SYehuda Sadeh 5136f0f8cef5SAlex Elder snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", 5137de71a297SAlex Elder rbd_dev->dev_id); 5138602adf40SYehuda Sadeh disk->major = rbd_dev->major; 5139dd82fff1SIlya Dryomov disk->first_minor = rbd_dev->minor; 51407e513d43SIlya Dryomov if (single_major) 51417e513d43SIlya Dryomov disk->flags |= GENHD_FL_EXT_DEVT; 5142602adf40SYehuda Sadeh disk->fops = &rbd_bd_ops; 5143602adf40SYehuda Sadeh disk->private_data = rbd_dev; 5144602adf40SYehuda Sadeh 51457ad18afaSChristoph Hellwig memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set)); 51467ad18afaSChristoph Hellwig rbd_dev->tag_set.ops = &rbd_mq_ops; 5147b5584180SIlya Dryomov rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth; 51487ad18afaSChristoph Hellwig rbd_dev->tag_set.numa_node = NUMA_NO_NODE; 514956d18f62SMing Lei rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; 51507ad18afaSChristoph Hellwig rbd_dev->tag_set.nr_hw_queues = 1; 51517ad18afaSChristoph Hellwig rbd_dev->tag_set.cmd_size = sizeof(struct work_struct); 51527ad18afaSChristoph Hellwig 51537ad18afaSChristoph Hellwig err = blk_mq_alloc_tag_set(&rbd_dev->tag_set); 51547ad18afaSChristoph Hellwig if (err) 5155602adf40SYehuda Sadeh goto out_disk; 5156029bcbd8SJosh Durgin 51577ad18afaSChristoph Hellwig q = blk_mq_init_queue(&rbd_dev->tag_set); 51587ad18afaSChristoph Hellwig if (IS_ERR(q)) { 51597ad18afaSChristoph Hellwig err = PTR_ERR(q); 51607ad18afaSChristoph Hellwig goto out_tag_set; 51617ad18afaSChristoph Hellwig } 51627ad18afaSChristoph Hellwig 51638b904b5bSBart Van Assche blk_queue_flag_set(QUEUE_FLAG_NONROT, q); 5164d8a2c89cSIlya Dryomov /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */ 5165593a9e7bSAlex Elder 5166420efbdfSIlya Dryomov blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT); 51670d9fde4fSIlya Dryomov q->limits.max_sectors = queue_max_hw_sectors(q); 516821acdf45SIlya Dryomov blk_queue_max_segments(q, USHRT_MAX); 516924f1df60SIlya Dryomov blk_queue_max_segment_size(q, UINT_MAX); 517016d80c54SIlya Dryomov blk_queue_io_min(q, rbd_dev->opts->alloc_size); 517116d80c54SIlya Dryomov blk_queue_io_opt(q, rbd_dev->opts->alloc_size); 5172029bcbd8SJosh Durgin 5173d9360540SIlya Dryomov if (rbd_dev->opts->trim) { 51748b904b5bSBart Van Assche blk_queue_flag_set(QUEUE_FLAG_DISCARD, q); 517516d80c54SIlya Dryomov q->limits.discard_granularity = rbd_dev->opts->alloc_size; 5176420efbdfSIlya Dryomov blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT); 5177420efbdfSIlya Dryomov blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT); 5178d9360540SIlya Dryomov } 517990e98c52SGuangliang Zhao 5180bae818eeSRonny Hegewald if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC)) 5181dc3b17ccSJan Kara q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES; 5182bae818eeSRonny Hegewald 51835769ed0cSIlya Dryomov /* 51845769ed0cSIlya Dryomov * disk_release() expects a queue ref from add_disk() and will 51855769ed0cSIlya Dryomov * put it. Hold an extra ref until add_disk() is called. 51865769ed0cSIlya Dryomov */ 51875769ed0cSIlya Dryomov WARN_ON(!blk_get_queue(q)); 5188602adf40SYehuda Sadeh disk->queue = q; 5189602adf40SYehuda Sadeh q->queuedata = rbd_dev; 5190602adf40SYehuda Sadeh 5191602adf40SYehuda Sadeh rbd_dev->disk = disk; 5192602adf40SYehuda Sadeh 5193602adf40SYehuda Sadeh return 0; 51947ad18afaSChristoph Hellwig out_tag_set: 51957ad18afaSChristoph Hellwig blk_mq_free_tag_set(&rbd_dev->tag_set); 5196602adf40SYehuda Sadeh out_disk: 5197602adf40SYehuda Sadeh put_disk(disk); 51987ad18afaSChristoph Hellwig return err; 5199602adf40SYehuda Sadeh } 5200602adf40SYehuda Sadeh 5201dfc5606dSYehuda Sadeh /* 5202dfc5606dSYehuda Sadeh sysfs 5203dfc5606dSYehuda Sadeh */ 5204602adf40SYehuda Sadeh 5205593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev) 5206593a9e7bSAlex Elder { 5207593a9e7bSAlex Elder return container_of(dev, struct rbd_device, dev); 5208593a9e7bSAlex Elder } 5209593a9e7bSAlex Elder 5210dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev, 5211dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 5212602adf40SYehuda Sadeh { 5213593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 5214dfc5606dSYehuda Sadeh 5215fc71d833SAlex Elder return sprintf(buf, "%llu\n", 5216fc71d833SAlex Elder (unsigned long long)rbd_dev->mapping.size); 5217602adf40SYehuda Sadeh } 5218602adf40SYehuda Sadeh 521934b13184SAlex Elder /* 522034b13184SAlex Elder * Note this shows the features for whatever's mapped, which is not 522134b13184SAlex Elder * necessarily the base image. 522234b13184SAlex Elder */ 522334b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev, 522434b13184SAlex Elder struct device_attribute *attr, char *buf) 522534b13184SAlex Elder { 522634b13184SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 522734b13184SAlex Elder 522834b13184SAlex Elder return sprintf(buf, "0x%016llx\n", 522934b13184SAlex Elder (unsigned long long)rbd_dev->mapping.features); 523034b13184SAlex Elder } 523134b13184SAlex Elder 5232dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev, 5233dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 5234602adf40SYehuda Sadeh { 5235593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 5236dfc5606dSYehuda Sadeh 5237fc71d833SAlex Elder if (rbd_dev->major) 5238dfc5606dSYehuda Sadeh return sprintf(buf, "%d\n", rbd_dev->major); 5239fc71d833SAlex Elder 5240fc71d833SAlex Elder return sprintf(buf, "(none)\n"); 5241dd82fff1SIlya Dryomov } 5242fc71d833SAlex Elder 5243dd82fff1SIlya Dryomov static ssize_t rbd_minor_show(struct device *dev, 5244dd82fff1SIlya Dryomov struct device_attribute *attr, char *buf) 5245dd82fff1SIlya Dryomov { 5246dd82fff1SIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 5247dd82fff1SIlya Dryomov 5248dd82fff1SIlya Dryomov return sprintf(buf, "%d\n", rbd_dev->minor); 5249dfc5606dSYehuda Sadeh } 5250dfc5606dSYehuda Sadeh 5251005a07bfSIlya Dryomov static ssize_t rbd_client_addr_show(struct device *dev, 5252005a07bfSIlya Dryomov struct device_attribute *attr, char *buf) 5253005a07bfSIlya Dryomov { 5254005a07bfSIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 5255005a07bfSIlya Dryomov struct ceph_entity_addr *client_addr = 5256005a07bfSIlya Dryomov ceph_client_addr(rbd_dev->rbd_client->client); 5257005a07bfSIlya Dryomov 5258005a07bfSIlya Dryomov return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr, 5259005a07bfSIlya Dryomov le32_to_cpu(client_addr->nonce)); 5260005a07bfSIlya Dryomov } 5261005a07bfSIlya Dryomov 5262dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev, 5263dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 5264dfc5606dSYehuda Sadeh { 5265593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 5266dfc5606dSYehuda Sadeh 52671dbb4399SAlex Elder return sprintf(buf, "client%lld\n", 5268033268a5SIlya Dryomov ceph_client_gid(rbd_dev->rbd_client->client)); 5269dfc5606dSYehuda Sadeh } 5270dfc5606dSYehuda Sadeh 5271267fb90bSMike Christie static ssize_t rbd_cluster_fsid_show(struct device *dev, 5272267fb90bSMike Christie struct device_attribute *attr, char *buf) 5273267fb90bSMike Christie { 5274267fb90bSMike Christie struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 5275267fb90bSMike Christie 5276267fb90bSMike Christie return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid); 5277267fb90bSMike Christie } 5278267fb90bSMike Christie 52790d6d1e9cSMike Christie static ssize_t rbd_config_info_show(struct device *dev, 52800d6d1e9cSMike Christie struct device_attribute *attr, char *buf) 52810d6d1e9cSMike Christie { 52820d6d1e9cSMike Christie struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 52830d6d1e9cSMike Christie 52840d6d1e9cSMike Christie return sprintf(buf, "%s\n", rbd_dev->config_info); 5285dfc5606dSYehuda Sadeh } 5286dfc5606dSYehuda Sadeh 5287dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev, 5288dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 5289dfc5606dSYehuda Sadeh { 5290593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 5291dfc5606dSYehuda Sadeh 52920d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->pool_name); 5293dfc5606dSYehuda Sadeh } 5294dfc5606dSYehuda Sadeh 52959bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev, 52969bb2f334SAlex Elder struct device_attribute *attr, char *buf) 52979bb2f334SAlex Elder { 52989bb2f334SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 52999bb2f334SAlex Elder 53000d7dbfceSAlex Elder return sprintf(buf, "%llu\n", 53010d7dbfceSAlex Elder (unsigned long long) rbd_dev->spec->pool_id); 53029bb2f334SAlex Elder } 53039bb2f334SAlex Elder 5304b26c047bSIlya Dryomov static ssize_t rbd_pool_ns_show(struct device *dev, 5305b26c047bSIlya Dryomov struct device_attribute *attr, char *buf) 5306b26c047bSIlya Dryomov { 5307b26c047bSIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 5308b26c047bSIlya Dryomov 5309b26c047bSIlya Dryomov return sprintf(buf, "%s\n", rbd_dev->spec->pool_ns ?: ""); 5310b26c047bSIlya Dryomov } 5311b26c047bSIlya Dryomov 5312dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev, 5313dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 5314dfc5606dSYehuda Sadeh { 5315593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 5316dfc5606dSYehuda Sadeh 5317a92ffdf8SAlex Elder if (rbd_dev->spec->image_name) 53180d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_name); 5319a92ffdf8SAlex Elder 5320a92ffdf8SAlex Elder return sprintf(buf, "(unknown)\n"); 5321dfc5606dSYehuda Sadeh } 5322dfc5606dSYehuda Sadeh 5323589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev, 5324589d30e0SAlex Elder struct device_attribute *attr, char *buf) 5325589d30e0SAlex Elder { 5326589d30e0SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 5327589d30e0SAlex Elder 53280d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_id); 5329589d30e0SAlex Elder } 5330589d30e0SAlex Elder 533134b13184SAlex Elder /* 533234b13184SAlex Elder * Shows the name of the currently-mapped snapshot (or 533334b13184SAlex Elder * RBD_SNAP_HEAD_NAME for the base image). 533434b13184SAlex Elder */ 5335dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev, 5336dfc5606dSYehuda Sadeh struct device_attribute *attr, 5337dfc5606dSYehuda Sadeh char *buf) 5338dfc5606dSYehuda Sadeh { 5339593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 5340dfc5606dSYehuda Sadeh 53410d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->snap_name); 5342dfc5606dSYehuda Sadeh } 5343dfc5606dSYehuda Sadeh 534492a58671SMike Christie static ssize_t rbd_snap_id_show(struct device *dev, 534592a58671SMike Christie struct device_attribute *attr, char *buf) 534692a58671SMike Christie { 534792a58671SMike Christie struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 534892a58671SMike Christie 534992a58671SMike Christie return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id); 535092a58671SMike Christie } 535192a58671SMike Christie 535286b00e0dSAlex Elder /* 5353ff96128fSIlya Dryomov * For a v2 image, shows the chain of parent images, separated by empty 5354ff96128fSIlya Dryomov * lines. For v1 images or if there is no parent, shows "(no parent 5355ff96128fSIlya Dryomov * image)". 535686b00e0dSAlex Elder */ 535786b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev, 535886b00e0dSAlex Elder struct device_attribute *attr, 535986b00e0dSAlex Elder char *buf) 536086b00e0dSAlex Elder { 536186b00e0dSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 5362ff96128fSIlya Dryomov ssize_t count = 0; 536386b00e0dSAlex Elder 5364ff96128fSIlya Dryomov if (!rbd_dev->parent) 536586b00e0dSAlex Elder return sprintf(buf, "(no parent image)\n"); 536686b00e0dSAlex Elder 5367ff96128fSIlya Dryomov for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) { 5368ff96128fSIlya Dryomov struct rbd_spec *spec = rbd_dev->parent_spec; 536986b00e0dSAlex Elder 5370ff96128fSIlya Dryomov count += sprintf(&buf[count], "%s" 5371ff96128fSIlya Dryomov "pool_id %llu\npool_name %s\n" 5372e92c0eafSIlya Dryomov "pool_ns %s\n" 5373ff96128fSIlya Dryomov "image_id %s\nimage_name %s\n" 5374ff96128fSIlya Dryomov "snap_id %llu\nsnap_name %s\n" 5375ff96128fSIlya Dryomov "overlap %llu\n", 5376ff96128fSIlya Dryomov !count ? "" : "\n", /* first? */ 5377ff96128fSIlya Dryomov spec->pool_id, spec->pool_name, 5378e92c0eafSIlya Dryomov spec->pool_ns ?: "", 5379ff96128fSIlya Dryomov spec->image_id, spec->image_name ?: "(unknown)", 5380ff96128fSIlya Dryomov spec->snap_id, spec->snap_name, 5381ff96128fSIlya Dryomov rbd_dev->parent_overlap); 5382ff96128fSIlya Dryomov } 538386b00e0dSAlex Elder 538486b00e0dSAlex Elder return count; 538586b00e0dSAlex Elder } 538686b00e0dSAlex Elder 5387dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev, 5388dfc5606dSYehuda Sadeh struct device_attribute *attr, 5389dfc5606dSYehuda Sadeh const char *buf, 5390dfc5606dSYehuda Sadeh size_t size) 5391dfc5606dSYehuda Sadeh { 5392593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 5393b813623aSAlex Elder int ret; 5394602adf40SYehuda Sadeh 5395cc4a38bdSAlex Elder ret = rbd_dev_refresh(rbd_dev); 5396e627db08SAlex Elder if (ret) 539752bb1f9bSIlya Dryomov return ret; 5398b813623aSAlex Elder 539952bb1f9bSIlya Dryomov return size; 5400dfc5606dSYehuda Sadeh } 5401602adf40SYehuda Sadeh 54025657a819SJoe Perches static DEVICE_ATTR(size, 0444, rbd_size_show, NULL); 54035657a819SJoe Perches static DEVICE_ATTR(features, 0444, rbd_features_show, NULL); 54045657a819SJoe Perches static DEVICE_ATTR(major, 0444, rbd_major_show, NULL); 54055657a819SJoe Perches static DEVICE_ATTR(minor, 0444, rbd_minor_show, NULL); 54065657a819SJoe Perches static DEVICE_ATTR(client_addr, 0444, rbd_client_addr_show, NULL); 54075657a819SJoe Perches static DEVICE_ATTR(client_id, 0444, rbd_client_id_show, NULL); 54085657a819SJoe Perches static DEVICE_ATTR(cluster_fsid, 0444, rbd_cluster_fsid_show, NULL); 54095657a819SJoe Perches static DEVICE_ATTR(config_info, 0400, rbd_config_info_show, NULL); 54105657a819SJoe Perches static DEVICE_ATTR(pool, 0444, rbd_pool_show, NULL); 54115657a819SJoe Perches static DEVICE_ATTR(pool_id, 0444, rbd_pool_id_show, NULL); 5412b26c047bSIlya Dryomov static DEVICE_ATTR(pool_ns, 0444, rbd_pool_ns_show, NULL); 54135657a819SJoe Perches static DEVICE_ATTR(name, 0444, rbd_name_show, NULL); 54145657a819SJoe Perches static DEVICE_ATTR(image_id, 0444, rbd_image_id_show, NULL); 54155657a819SJoe Perches static DEVICE_ATTR(refresh, 0200, NULL, rbd_image_refresh); 54165657a819SJoe Perches static DEVICE_ATTR(current_snap, 0444, rbd_snap_show, NULL); 54175657a819SJoe Perches static DEVICE_ATTR(snap_id, 0444, rbd_snap_id_show, NULL); 54185657a819SJoe Perches static DEVICE_ATTR(parent, 0444, rbd_parent_show, NULL); 5419dfc5606dSYehuda Sadeh 5420dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = { 5421dfc5606dSYehuda Sadeh &dev_attr_size.attr, 542234b13184SAlex Elder &dev_attr_features.attr, 5423dfc5606dSYehuda Sadeh &dev_attr_major.attr, 5424dd82fff1SIlya Dryomov &dev_attr_minor.attr, 5425005a07bfSIlya Dryomov &dev_attr_client_addr.attr, 5426dfc5606dSYehuda Sadeh &dev_attr_client_id.attr, 5427267fb90bSMike Christie &dev_attr_cluster_fsid.attr, 54280d6d1e9cSMike Christie &dev_attr_config_info.attr, 5429dfc5606dSYehuda Sadeh &dev_attr_pool.attr, 54309bb2f334SAlex Elder &dev_attr_pool_id.attr, 5431b26c047bSIlya Dryomov &dev_attr_pool_ns.attr, 5432dfc5606dSYehuda Sadeh &dev_attr_name.attr, 5433589d30e0SAlex Elder &dev_attr_image_id.attr, 5434dfc5606dSYehuda Sadeh &dev_attr_current_snap.attr, 543592a58671SMike Christie &dev_attr_snap_id.attr, 543686b00e0dSAlex Elder &dev_attr_parent.attr, 5437dfc5606dSYehuda Sadeh &dev_attr_refresh.attr, 5438dfc5606dSYehuda Sadeh NULL 5439dfc5606dSYehuda Sadeh }; 5440dfc5606dSYehuda Sadeh 5441dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = { 5442dfc5606dSYehuda Sadeh .attrs = rbd_attrs, 5443dfc5606dSYehuda Sadeh }; 5444dfc5606dSYehuda Sadeh 5445dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = { 5446dfc5606dSYehuda Sadeh &rbd_attr_group, 5447dfc5606dSYehuda Sadeh NULL 5448dfc5606dSYehuda Sadeh }; 5449dfc5606dSYehuda Sadeh 54506cac4695SIlya Dryomov static void rbd_dev_release(struct device *dev); 5451dfc5606dSYehuda Sadeh 5452b9942bc9SBhumika Goyal static const struct device_type rbd_device_type = { 5453dfc5606dSYehuda Sadeh .name = "rbd", 5454dfc5606dSYehuda Sadeh .groups = rbd_attr_groups, 54556cac4695SIlya Dryomov .release = rbd_dev_release, 5456dfc5606dSYehuda Sadeh }; 5457dfc5606dSYehuda Sadeh 54588b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec) 54598b8fb99cSAlex Elder { 54608b8fb99cSAlex Elder kref_get(&spec->kref); 54618b8fb99cSAlex Elder 54628b8fb99cSAlex Elder return spec; 54638b8fb99cSAlex Elder } 54648b8fb99cSAlex Elder 54658b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref); 54668b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec) 54678b8fb99cSAlex Elder { 54688b8fb99cSAlex Elder if (spec) 54698b8fb99cSAlex Elder kref_put(&spec->kref, rbd_spec_free); 54708b8fb99cSAlex Elder } 54718b8fb99cSAlex Elder 54728b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void) 54738b8fb99cSAlex Elder { 54748b8fb99cSAlex Elder struct rbd_spec *spec; 54758b8fb99cSAlex Elder 54768b8fb99cSAlex Elder spec = kzalloc(sizeof (*spec), GFP_KERNEL); 54778b8fb99cSAlex Elder if (!spec) 54788b8fb99cSAlex Elder return NULL; 547904077599SIlya Dryomov 548004077599SIlya Dryomov spec->pool_id = CEPH_NOPOOL; 548104077599SIlya Dryomov spec->snap_id = CEPH_NOSNAP; 54828b8fb99cSAlex Elder kref_init(&spec->kref); 54838b8fb99cSAlex Elder 54848b8fb99cSAlex Elder return spec; 54858b8fb99cSAlex Elder } 54868b8fb99cSAlex Elder 54878b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref) 54888b8fb99cSAlex Elder { 54898b8fb99cSAlex Elder struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref); 54908b8fb99cSAlex Elder 54918b8fb99cSAlex Elder kfree(spec->pool_name); 5492b26c047bSIlya Dryomov kfree(spec->pool_ns); 54938b8fb99cSAlex Elder kfree(spec->image_id); 54948b8fb99cSAlex Elder kfree(spec->image_name); 54958b8fb99cSAlex Elder kfree(spec->snap_name); 54968b8fb99cSAlex Elder kfree(spec); 54978b8fb99cSAlex Elder } 54988b8fb99cSAlex Elder 54991643dfa4SIlya Dryomov static void rbd_dev_free(struct rbd_device *rbd_dev) 5500dd5ac32dSIlya Dryomov { 550199d16943SIlya Dryomov WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED); 5502ed95b21aSIlya Dryomov WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED); 5503dd5ac32dSIlya Dryomov 5504c41d13a3SIlya Dryomov ceph_oid_destroy(&rbd_dev->header_oid); 55056b6dddbeSIlya Dryomov ceph_oloc_destroy(&rbd_dev->header_oloc); 55060d6d1e9cSMike Christie kfree(rbd_dev->config_info); 5507c41d13a3SIlya Dryomov 5508dd5ac32dSIlya Dryomov rbd_put_client(rbd_dev->rbd_client); 5509dd5ac32dSIlya Dryomov rbd_spec_put(rbd_dev->spec); 5510dd5ac32dSIlya Dryomov kfree(rbd_dev->opts); 5511dd5ac32dSIlya Dryomov kfree(rbd_dev); 55121643dfa4SIlya Dryomov } 55131643dfa4SIlya Dryomov 55141643dfa4SIlya Dryomov static void rbd_dev_release(struct device *dev) 55151643dfa4SIlya Dryomov { 55161643dfa4SIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 55171643dfa4SIlya Dryomov bool need_put = !!rbd_dev->opts; 55181643dfa4SIlya Dryomov 55191643dfa4SIlya Dryomov if (need_put) { 55201643dfa4SIlya Dryomov destroy_workqueue(rbd_dev->task_wq); 55211643dfa4SIlya Dryomov ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id); 55221643dfa4SIlya Dryomov } 55231643dfa4SIlya Dryomov 55241643dfa4SIlya Dryomov rbd_dev_free(rbd_dev); 5525dd5ac32dSIlya Dryomov 5526dd5ac32dSIlya Dryomov /* 5527dd5ac32dSIlya Dryomov * This is racy, but way better than putting module outside of 5528dd5ac32dSIlya Dryomov * the release callback. The race window is pretty small, so 5529dd5ac32dSIlya Dryomov * doing something similar to dm (dm-builtin.c) is overkill. 5530dd5ac32dSIlya Dryomov */ 5531dd5ac32dSIlya Dryomov if (need_put) 5532dd5ac32dSIlya Dryomov module_put(THIS_MODULE); 5533dd5ac32dSIlya Dryomov } 5534dd5ac32dSIlya Dryomov 55351643dfa4SIlya Dryomov static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc, 55361643dfa4SIlya Dryomov struct rbd_spec *spec) 5537c53d5893SAlex Elder { 5538c53d5893SAlex Elder struct rbd_device *rbd_dev; 5539c53d5893SAlex Elder 5540c53d5893SAlex Elder rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL); 5541c53d5893SAlex Elder if (!rbd_dev) 5542c53d5893SAlex Elder return NULL; 5543c53d5893SAlex Elder 5544c53d5893SAlex Elder spin_lock_init(&rbd_dev->lock); 5545c53d5893SAlex Elder INIT_LIST_HEAD(&rbd_dev->node); 5546c53d5893SAlex Elder init_rwsem(&rbd_dev->header_rwsem); 5547c53d5893SAlex Elder 55487e97332eSIlya Dryomov rbd_dev->header.data_pool_id = CEPH_NOPOOL; 5549c41d13a3SIlya Dryomov ceph_oid_init(&rbd_dev->header_oid); 5550431a02cdSIlya Dryomov rbd_dev->header_oloc.pool = spec->pool_id; 5551b26c047bSIlya Dryomov if (spec->pool_ns) { 5552b26c047bSIlya Dryomov WARN_ON(!*spec->pool_ns); 5553b26c047bSIlya Dryomov rbd_dev->header_oloc.pool_ns = 5554b26c047bSIlya Dryomov ceph_find_or_create_string(spec->pool_ns, 5555b26c047bSIlya Dryomov strlen(spec->pool_ns)); 5556b26c047bSIlya Dryomov } 5557c41d13a3SIlya Dryomov 555899d16943SIlya Dryomov mutex_init(&rbd_dev->watch_mutex); 555999d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED; 556099d16943SIlya Dryomov INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch); 556199d16943SIlya Dryomov 5562ed95b21aSIlya Dryomov init_rwsem(&rbd_dev->lock_rwsem); 5563ed95b21aSIlya Dryomov rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED; 5564ed95b21aSIlya Dryomov INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock); 5565ed95b21aSIlya Dryomov INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock); 5566ed95b21aSIlya Dryomov INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock); 5567ed95b21aSIlya Dryomov INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work); 5568e1fddc8fSIlya Dryomov spin_lock_init(&rbd_dev->lock_lists_lock); 5569637cd060SIlya Dryomov INIT_LIST_HEAD(&rbd_dev->acquiring_list); 5570e1fddc8fSIlya Dryomov INIT_LIST_HEAD(&rbd_dev->running_list); 5571637cd060SIlya Dryomov init_completion(&rbd_dev->acquire_wait); 5572e1fddc8fSIlya Dryomov init_completion(&rbd_dev->releasing_wait); 5573ed95b21aSIlya Dryomov 557422e8bd51SIlya Dryomov spin_lock_init(&rbd_dev->object_map_lock); 5575c53d5893SAlex Elder 5576dd5ac32dSIlya Dryomov rbd_dev->dev.bus = &rbd_bus_type; 5577dd5ac32dSIlya Dryomov rbd_dev->dev.type = &rbd_device_type; 5578dd5ac32dSIlya Dryomov rbd_dev->dev.parent = &rbd_root_dev; 5579dd5ac32dSIlya Dryomov device_initialize(&rbd_dev->dev); 5580dd5ac32dSIlya Dryomov 5581c53d5893SAlex Elder rbd_dev->rbd_client = rbdc; 5582d147543dSIlya Dryomov rbd_dev->spec = spec; 55830903e875SAlex Elder 55841643dfa4SIlya Dryomov return rbd_dev; 55851643dfa4SIlya Dryomov } 55861643dfa4SIlya Dryomov 5587dd5ac32dSIlya Dryomov /* 55881643dfa4SIlya Dryomov * Create a mapping rbd_dev. 5589dd5ac32dSIlya Dryomov */ 55901643dfa4SIlya Dryomov static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, 55911643dfa4SIlya Dryomov struct rbd_spec *spec, 55921643dfa4SIlya Dryomov struct rbd_options *opts) 55931643dfa4SIlya Dryomov { 55941643dfa4SIlya Dryomov struct rbd_device *rbd_dev; 55951643dfa4SIlya Dryomov 55961643dfa4SIlya Dryomov rbd_dev = __rbd_dev_create(rbdc, spec); 55971643dfa4SIlya Dryomov if (!rbd_dev) 55981643dfa4SIlya Dryomov return NULL; 55991643dfa4SIlya Dryomov 56001643dfa4SIlya Dryomov rbd_dev->opts = opts; 56011643dfa4SIlya Dryomov 56021643dfa4SIlya Dryomov /* get an id and fill in device name */ 56031643dfa4SIlya Dryomov rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0, 56041643dfa4SIlya Dryomov minor_to_rbd_dev_id(1 << MINORBITS), 56051643dfa4SIlya Dryomov GFP_KERNEL); 56061643dfa4SIlya Dryomov if (rbd_dev->dev_id < 0) 56071643dfa4SIlya Dryomov goto fail_rbd_dev; 56081643dfa4SIlya Dryomov 56091643dfa4SIlya Dryomov sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id); 56101643dfa4SIlya Dryomov rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM, 56111643dfa4SIlya Dryomov rbd_dev->name); 56121643dfa4SIlya Dryomov if (!rbd_dev->task_wq) 56131643dfa4SIlya Dryomov goto fail_dev_id; 56141643dfa4SIlya Dryomov 56151643dfa4SIlya Dryomov /* we have a ref from do_rbd_add() */ 5616dd5ac32dSIlya Dryomov __module_get(THIS_MODULE); 5617dd5ac32dSIlya Dryomov 56181643dfa4SIlya Dryomov dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id); 5619c53d5893SAlex Elder return rbd_dev; 56201643dfa4SIlya Dryomov 56211643dfa4SIlya Dryomov fail_dev_id: 56221643dfa4SIlya Dryomov ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id); 56231643dfa4SIlya Dryomov fail_rbd_dev: 56241643dfa4SIlya Dryomov rbd_dev_free(rbd_dev); 56251643dfa4SIlya Dryomov return NULL; 5626c53d5893SAlex Elder } 5627c53d5893SAlex Elder 5628c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev) 5629c53d5893SAlex Elder { 5630dd5ac32dSIlya Dryomov if (rbd_dev) 5631dd5ac32dSIlya Dryomov put_device(&rbd_dev->dev); 5632c53d5893SAlex Elder } 5633c53d5893SAlex Elder 5634dfc5606dSYehuda Sadeh /* 56359d475de5SAlex Elder * Get the size and object order for an image snapshot, or if 56369d475de5SAlex Elder * snap_id is CEPH_NOSNAP, gets this information for the base 56379d475de5SAlex Elder * image. 56389d475de5SAlex Elder */ 56399d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 56409d475de5SAlex Elder u8 *order, u64 *snap_size) 56419d475de5SAlex Elder { 56429d475de5SAlex Elder __le64 snapid = cpu_to_le64(snap_id); 56439d475de5SAlex Elder int ret; 56449d475de5SAlex Elder struct { 56459d475de5SAlex Elder u8 order; 56469d475de5SAlex Elder __le64 size; 56479d475de5SAlex Elder } __attribute__ ((packed)) size_buf = { 0 }; 56489d475de5SAlex Elder 5649ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 5650ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_size", 56514157976bSAlex Elder &snapid, sizeof(snapid), 5652e2a58ee5SAlex Elder &size_buf, sizeof(size_buf)); 565336be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 56549d475de5SAlex Elder if (ret < 0) 56559d475de5SAlex Elder return ret; 565657385b51SAlex Elder if (ret < sizeof (size_buf)) 565757385b51SAlex Elder return -ERANGE; 56589d475de5SAlex Elder 5659c3545579SJosh Durgin if (order) { 56609d475de5SAlex Elder *order = size_buf.order; 5661c3545579SJosh Durgin dout(" order %u", (unsigned int)*order); 5662c3545579SJosh Durgin } 56639d475de5SAlex Elder *snap_size = le64_to_cpu(size_buf.size); 56649d475de5SAlex Elder 5665c3545579SJosh Durgin dout(" snap_id 0x%016llx snap_size = %llu\n", 5666c3545579SJosh Durgin (unsigned long long)snap_id, 56679d475de5SAlex Elder (unsigned long long)*snap_size); 56689d475de5SAlex Elder 56699d475de5SAlex Elder return 0; 56709d475de5SAlex Elder } 56719d475de5SAlex Elder 56729d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev) 56739d475de5SAlex Elder { 56749d475de5SAlex Elder return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP, 56759d475de5SAlex Elder &rbd_dev->header.obj_order, 56769d475de5SAlex Elder &rbd_dev->header.image_size); 56779d475de5SAlex Elder } 56789d475de5SAlex Elder 56791e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) 56801e130199SAlex Elder { 56815435d206SDongsheng Yang size_t size; 56821e130199SAlex Elder void *reply_buf; 56831e130199SAlex Elder int ret; 56841e130199SAlex Elder void *p; 56851e130199SAlex Elder 56865435d206SDongsheng Yang /* Response will be an encoded string, which includes a length */ 56875435d206SDongsheng Yang size = sizeof(__le32) + RBD_OBJ_PREFIX_LEN_MAX; 56885435d206SDongsheng Yang reply_buf = kzalloc(size, GFP_KERNEL); 56891e130199SAlex Elder if (!reply_buf) 56901e130199SAlex Elder return -ENOMEM; 56911e130199SAlex Elder 5692ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 5693ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_object_prefix", 56945435d206SDongsheng Yang NULL, 0, reply_buf, size); 569536be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 56961e130199SAlex Elder if (ret < 0) 56971e130199SAlex Elder goto out; 56981e130199SAlex Elder 56991e130199SAlex Elder p = reply_buf; 57001e130199SAlex Elder rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, 570157385b51SAlex Elder p + ret, NULL, GFP_NOIO); 570257385b51SAlex Elder ret = 0; 57031e130199SAlex Elder 57041e130199SAlex Elder if (IS_ERR(rbd_dev->header.object_prefix)) { 57051e130199SAlex Elder ret = PTR_ERR(rbd_dev->header.object_prefix); 57061e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 57071e130199SAlex Elder } else { 57081e130199SAlex Elder dout(" object_prefix = %s\n", rbd_dev->header.object_prefix); 57091e130199SAlex Elder } 57101e130199SAlex Elder out: 57111e130199SAlex Elder kfree(reply_buf); 57121e130199SAlex Elder 57131e130199SAlex Elder return ret; 57141e130199SAlex Elder } 57151e130199SAlex Elder 5716b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 5717b1b5402aSAlex Elder u64 *snap_features) 5718b1b5402aSAlex Elder { 5719b1b5402aSAlex Elder __le64 snapid = cpu_to_le64(snap_id); 5720b1b5402aSAlex Elder struct { 5721b1b5402aSAlex Elder __le64 features; 5722b1b5402aSAlex Elder __le64 incompat; 57234157976bSAlex Elder } __attribute__ ((packed)) features_buf = { 0 }; 5724d3767f0fSIlya Dryomov u64 unsup; 5725b1b5402aSAlex Elder int ret; 5726b1b5402aSAlex Elder 5727ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 5728ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_features", 57294157976bSAlex Elder &snapid, sizeof(snapid), 5730e2a58ee5SAlex Elder &features_buf, sizeof(features_buf)); 573136be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 5732b1b5402aSAlex Elder if (ret < 0) 5733b1b5402aSAlex Elder return ret; 573457385b51SAlex Elder if (ret < sizeof (features_buf)) 573557385b51SAlex Elder return -ERANGE; 5736d889140cSAlex Elder 5737d3767f0fSIlya Dryomov unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED; 5738d3767f0fSIlya Dryomov if (unsup) { 5739d3767f0fSIlya Dryomov rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx", 5740d3767f0fSIlya Dryomov unsup); 5741b8f5c6edSAlex Elder return -ENXIO; 5742d3767f0fSIlya Dryomov } 5743d889140cSAlex Elder 5744b1b5402aSAlex Elder *snap_features = le64_to_cpu(features_buf.features); 5745b1b5402aSAlex Elder 5746b1b5402aSAlex Elder dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n", 5747b1b5402aSAlex Elder (unsigned long long)snap_id, 5748b1b5402aSAlex Elder (unsigned long long)*snap_features, 5749b1b5402aSAlex Elder (unsigned long long)le64_to_cpu(features_buf.incompat)); 5750b1b5402aSAlex Elder 5751b1b5402aSAlex Elder return 0; 5752b1b5402aSAlex Elder } 5753b1b5402aSAlex Elder 5754b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev) 5755b1b5402aSAlex Elder { 5756b1b5402aSAlex Elder return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP, 5757b1b5402aSAlex Elder &rbd_dev->header.features); 5758b1b5402aSAlex Elder } 5759b1b5402aSAlex Elder 576022e8bd51SIlya Dryomov /* 576122e8bd51SIlya Dryomov * These are generic image flags, but since they are used only for 576222e8bd51SIlya Dryomov * object map, store them in rbd_dev->object_map_flags. 576322e8bd51SIlya Dryomov * 576422e8bd51SIlya Dryomov * For the same reason, this function is called only on object map 576522e8bd51SIlya Dryomov * (re)load and not on header refresh. 576622e8bd51SIlya Dryomov */ 576722e8bd51SIlya Dryomov static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev) 576822e8bd51SIlya Dryomov { 576922e8bd51SIlya Dryomov __le64 snapid = cpu_to_le64(rbd_dev->spec->snap_id); 577022e8bd51SIlya Dryomov __le64 flags; 577122e8bd51SIlya Dryomov int ret; 577222e8bd51SIlya Dryomov 577322e8bd51SIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 577422e8bd51SIlya Dryomov &rbd_dev->header_oloc, "get_flags", 577522e8bd51SIlya Dryomov &snapid, sizeof(snapid), 577622e8bd51SIlya Dryomov &flags, sizeof(flags)); 577722e8bd51SIlya Dryomov if (ret < 0) 577822e8bd51SIlya Dryomov return ret; 577922e8bd51SIlya Dryomov if (ret < sizeof(flags)) 578022e8bd51SIlya Dryomov return -EBADMSG; 578122e8bd51SIlya Dryomov 578222e8bd51SIlya Dryomov rbd_dev->object_map_flags = le64_to_cpu(flags); 578322e8bd51SIlya Dryomov return 0; 578422e8bd51SIlya Dryomov } 578522e8bd51SIlya Dryomov 5786eb3b2d6bSIlya Dryomov struct parent_image_info { 5787eb3b2d6bSIlya Dryomov u64 pool_id; 5788e92c0eafSIlya Dryomov const char *pool_ns; 5789eb3b2d6bSIlya Dryomov const char *image_id; 5790eb3b2d6bSIlya Dryomov u64 snap_id; 5791eb3b2d6bSIlya Dryomov 5792e92c0eafSIlya Dryomov bool has_overlap; 5793eb3b2d6bSIlya Dryomov u64 overlap; 5794eb3b2d6bSIlya Dryomov }; 5795eb3b2d6bSIlya Dryomov 5796eb3b2d6bSIlya Dryomov /* 5797eb3b2d6bSIlya Dryomov * The caller is responsible for @pii. 5798eb3b2d6bSIlya Dryomov */ 5799e92c0eafSIlya Dryomov static int decode_parent_image_spec(void **p, void *end, 5800e92c0eafSIlya Dryomov struct parent_image_info *pii) 5801e92c0eafSIlya Dryomov { 5802e92c0eafSIlya Dryomov u8 struct_v; 5803e92c0eafSIlya Dryomov u32 struct_len; 5804e92c0eafSIlya Dryomov int ret; 5805e92c0eafSIlya Dryomov 5806e92c0eafSIlya Dryomov ret = ceph_start_decoding(p, end, 1, "ParentImageSpec", 5807e92c0eafSIlya Dryomov &struct_v, &struct_len); 5808e92c0eafSIlya Dryomov if (ret) 5809e92c0eafSIlya Dryomov return ret; 5810e92c0eafSIlya Dryomov 5811e92c0eafSIlya Dryomov ceph_decode_64_safe(p, end, pii->pool_id, e_inval); 5812e92c0eafSIlya Dryomov pii->pool_ns = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL); 5813e92c0eafSIlya Dryomov if (IS_ERR(pii->pool_ns)) { 5814e92c0eafSIlya Dryomov ret = PTR_ERR(pii->pool_ns); 5815e92c0eafSIlya Dryomov pii->pool_ns = NULL; 5816e92c0eafSIlya Dryomov return ret; 5817e92c0eafSIlya Dryomov } 5818e92c0eafSIlya Dryomov pii->image_id = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL); 5819e92c0eafSIlya Dryomov if (IS_ERR(pii->image_id)) { 5820e92c0eafSIlya Dryomov ret = PTR_ERR(pii->image_id); 5821e92c0eafSIlya Dryomov pii->image_id = NULL; 5822e92c0eafSIlya Dryomov return ret; 5823e92c0eafSIlya Dryomov } 5824e92c0eafSIlya Dryomov ceph_decode_64_safe(p, end, pii->snap_id, e_inval); 5825e92c0eafSIlya Dryomov return 0; 5826e92c0eafSIlya Dryomov 5827e92c0eafSIlya Dryomov e_inval: 5828e92c0eafSIlya Dryomov return -EINVAL; 5829e92c0eafSIlya Dryomov } 5830e92c0eafSIlya Dryomov 5831e92c0eafSIlya Dryomov static int __get_parent_info(struct rbd_device *rbd_dev, 5832e92c0eafSIlya Dryomov struct page *req_page, 5833e92c0eafSIlya Dryomov struct page *reply_page, 5834e92c0eafSIlya Dryomov struct parent_image_info *pii) 5835e92c0eafSIlya Dryomov { 5836e92c0eafSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 5837e92c0eafSIlya Dryomov size_t reply_len = PAGE_SIZE; 5838e92c0eafSIlya Dryomov void *p, *end; 5839e92c0eafSIlya Dryomov int ret; 5840e92c0eafSIlya Dryomov 5841e92c0eafSIlya Dryomov ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, 5842e92c0eafSIlya Dryomov "rbd", "parent_get", CEPH_OSD_FLAG_READ, 584368ada915SIlya Dryomov req_page, sizeof(u64), &reply_page, &reply_len); 5844e92c0eafSIlya Dryomov if (ret) 5845e92c0eafSIlya Dryomov return ret == -EOPNOTSUPP ? 1 : ret; 5846e92c0eafSIlya Dryomov 5847e92c0eafSIlya Dryomov p = page_address(reply_page); 5848e92c0eafSIlya Dryomov end = p + reply_len; 5849e92c0eafSIlya Dryomov ret = decode_parent_image_spec(&p, end, pii); 5850e92c0eafSIlya Dryomov if (ret) 5851e92c0eafSIlya Dryomov return ret; 5852e92c0eafSIlya Dryomov 5853e92c0eafSIlya Dryomov ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, 5854e92c0eafSIlya Dryomov "rbd", "parent_overlap_get", CEPH_OSD_FLAG_READ, 585568ada915SIlya Dryomov req_page, sizeof(u64), &reply_page, &reply_len); 5856e92c0eafSIlya Dryomov if (ret) 5857e92c0eafSIlya Dryomov return ret; 5858e92c0eafSIlya Dryomov 5859e92c0eafSIlya Dryomov p = page_address(reply_page); 5860e92c0eafSIlya Dryomov end = p + reply_len; 5861e92c0eafSIlya Dryomov ceph_decode_8_safe(&p, end, pii->has_overlap, e_inval); 5862e92c0eafSIlya Dryomov if (pii->has_overlap) 5863e92c0eafSIlya Dryomov ceph_decode_64_safe(&p, end, pii->overlap, e_inval); 5864e92c0eafSIlya Dryomov 5865e92c0eafSIlya Dryomov return 0; 5866e92c0eafSIlya Dryomov 5867e92c0eafSIlya Dryomov e_inval: 5868e92c0eafSIlya Dryomov return -EINVAL; 5869e92c0eafSIlya Dryomov } 5870e92c0eafSIlya Dryomov 5871e92c0eafSIlya Dryomov /* 5872e92c0eafSIlya Dryomov * The caller is responsible for @pii. 5873e92c0eafSIlya Dryomov */ 5874eb3b2d6bSIlya Dryomov static int __get_parent_info_legacy(struct rbd_device *rbd_dev, 5875eb3b2d6bSIlya Dryomov struct page *req_page, 5876eb3b2d6bSIlya Dryomov struct page *reply_page, 5877eb3b2d6bSIlya Dryomov struct parent_image_info *pii) 5878eb3b2d6bSIlya Dryomov { 5879eb3b2d6bSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 5880eb3b2d6bSIlya Dryomov size_t reply_len = PAGE_SIZE; 5881eb3b2d6bSIlya Dryomov void *p, *end; 5882eb3b2d6bSIlya Dryomov int ret; 5883eb3b2d6bSIlya Dryomov 5884eb3b2d6bSIlya Dryomov ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, 5885eb3b2d6bSIlya Dryomov "rbd", "get_parent", CEPH_OSD_FLAG_READ, 588668ada915SIlya Dryomov req_page, sizeof(u64), &reply_page, &reply_len); 5887eb3b2d6bSIlya Dryomov if (ret) 5888eb3b2d6bSIlya Dryomov return ret; 5889eb3b2d6bSIlya Dryomov 5890eb3b2d6bSIlya Dryomov p = page_address(reply_page); 5891eb3b2d6bSIlya Dryomov end = p + reply_len; 5892eb3b2d6bSIlya Dryomov ceph_decode_64_safe(&p, end, pii->pool_id, e_inval); 5893eb3b2d6bSIlya Dryomov pii->image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 5894eb3b2d6bSIlya Dryomov if (IS_ERR(pii->image_id)) { 5895eb3b2d6bSIlya Dryomov ret = PTR_ERR(pii->image_id); 5896eb3b2d6bSIlya Dryomov pii->image_id = NULL; 5897eb3b2d6bSIlya Dryomov return ret; 5898eb3b2d6bSIlya Dryomov } 5899eb3b2d6bSIlya Dryomov ceph_decode_64_safe(&p, end, pii->snap_id, e_inval); 5900e92c0eafSIlya Dryomov pii->has_overlap = true; 5901eb3b2d6bSIlya Dryomov ceph_decode_64_safe(&p, end, pii->overlap, e_inval); 5902eb3b2d6bSIlya Dryomov 5903eb3b2d6bSIlya Dryomov return 0; 5904eb3b2d6bSIlya Dryomov 5905eb3b2d6bSIlya Dryomov e_inval: 5906eb3b2d6bSIlya Dryomov return -EINVAL; 5907eb3b2d6bSIlya Dryomov } 5908eb3b2d6bSIlya Dryomov 5909eb3b2d6bSIlya Dryomov static int get_parent_info(struct rbd_device *rbd_dev, 5910eb3b2d6bSIlya Dryomov struct parent_image_info *pii) 5911eb3b2d6bSIlya Dryomov { 5912eb3b2d6bSIlya Dryomov struct page *req_page, *reply_page; 5913eb3b2d6bSIlya Dryomov void *p; 5914eb3b2d6bSIlya Dryomov int ret; 5915eb3b2d6bSIlya Dryomov 5916eb3b2d6bSIlya Dryomov req_page = alloc_page(GFP_KERNEL); 5917eb3b2d6bSIlya Dryomov if (!req_page) 5918eb3b2d6bSIlya Dryomov return -ENOMEM; 5919eb3b2d6bSIlya Dryomov 5920eb3b2d6bSIlya Dryomov reply_page = alloc_page(GFP_KERNEL); 5921eb3b2d6bSIlya Dryomov if (!reply_page) { 5922eb3b2d6bSIlya Dryomov __free_page(req_page); 5923eb3b2d6bSIlya Dryomov return -ENOMEM; 5924eb3b2d6bSIlya Dryomov } 5925eb3b2d6bSIlya Dryomov 5926eb3b2d6bSIlya Dryomov p = page_address(req_page); 5927eb3b2d6bSIlya Dryomov ceph_encode_64(&p, rbd_dev->spec->snap_id); 5928e92c0eafSIlya Dryomov ret = __get_parent_info(rbd_dev, req_page, reply_page, pii); 5929e92c0eafSIlya Dryomov if (ret > 0) 5930e92c0eafSIlya Dryomov ret = __get_parent_info_legacy(rbd_dev, req_page, reply_page, 5931e92c0eafSIlya Dryomov pii); 5932eb3b2d6bSIlya Dryomov 5933eb3b2d6bSIlya Dryomov __free_page(req_page); 5934eb3b2d6bSIlya Dryomov __free_page(reply_page); 5935eb3b2d6bSIlya Dryomov return ret; 5936eb3b2d6bSIlya Dryomov } 5937eb3b2d6bSIlya Dryomov 593886b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) 593986b00e0dSAlex Elder { 594086b00e0dSAlex Elder struct rbd_spec *parent_spec; 5941eb3b2d6bSIlya Dryomov struct parent_image_info pii = { 0 }; 594286b00e0dSAlex Elder int ret; 594386b00e0dSAlex Elder 594486b00e0dSAlex Elder parent_spec = rbd_spec_alloc(); 594586b00e0dSAlex Elder if (!parent_spec) 594686b00e0dSAlex Elder return -ENOMEM; 594786b00e0dSAlex Elder 5948eb3b2d6bSIlya Dryomov ret = get_parent_info(rbd_dev, &pii); 5949eb3b2d6bSIlya Dryomov if (ret) 595086b00e0dSAlex Elder goto out_err; 595186b00e0dSAlex Elder 5952e92c0eafSIlya Dryomov dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n", 5953e92c0eafSIlya Dryomov __func__, pii.pool_id, pii.pool_ns, pii.image_id, pii.snap_id, 5954e92c0eafSIlya Dryomov pii.has_overlap, pii.overlap); 5955eb3b2d6bSIlya Dryomov 5956e92c0eafSIlya Dryomov if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap) { 5957392a9dadSAlex Elder /* 5958392a9dadSAlex Elder * Either the parent never existed, or we have 5959392a9dadSAlex Elder * record of it but the image got flattened so it no 5960392a9dadSAlex Elder * longer has a parent. When the parent of a 5961392a9dadSAlex Elder * layered image disappears we immediately set the 5962392a9dadSAlex Elder * overlap to 0. The effect of this is that all new 5963392a9dadSAlex Elder * requests will be treated as if the image had no 5964392a9dadSAlex Elder * parent. 5965e92c0eafSIlya Dryomov * 5966e92c0eafSIlya Dryomov * If !pii.has_overlap, the parent image spec is not 5967e92c0eafSIlya Dryomov * applicable. It's there to avoid duplication in each 5968e92c0eafSIlya Dryomov * snapshot record. 5969392a9dadSAlex Elder */ 5970392a9dadSAlex Elder if (rbd_dev->parent_overlap) { 5971392a9dadSAlex Elder rbd_dev->parent_overlap = 0; 5972392a9dadSAlex Elder rbd_dev_parent_put(rbd_dev); 5973392a9dadSAlex Elder pr_info("%s: clone image has been flattened\n", 5974392a9dadSAlex Elder rbd_dev->disk->disk_name); 5975392a9dadSAlex Elder } 5976392a9dadSAlex Elder 597786b00e0dSAlex Elder goto out; /* No parent? No problem. */ 5978392a9dadSAlex Elder } 597986b00e0dSAlex Elder 59800903e875SAlex Elder /* The ceph file layout needs to fit pool id in 32 bits */ 59810903e875SAlex Elder 59820903e875SAlex Elder ret = -EIO; 5983eb3b2d6bSIlya Dryomov if (pii.pool_id > (u64)U32_MAX) { 59849584d508SIlya Dryomov rbd_warn(NULL, "parent pool id too large (%llu > %u)", 5985eb3b2d6bSIlya Dryomov (unsigned long long)pii.pool_id, U32_MAX); 598657385b51SAlex Elder goto out_err; 5987c0cd10dbSAlex Elder } 59880903e875SAlex Elder 59893b5cf2a2SAlex Elder /* 59903b5cf2a2SAlex Elder * The parent won't change (except when the clone is 59913b5cf2a2SAlex Elder * flattened, already handled that). So we only need to 59923b5cf2a2SAlex Elder * record the parent spec we have not already done so. 59933b5cf2a2SAlex Elder */ 59943b5cf2a2SAlex Elder if (!rbd_dev->parent_spec) { 5995eb3b2d6bSIlya Dryomov parent_spec->pool_id = pii.pool_id; 5996e92c0eafSIlya Dryomov if (pii.pool_ns && *pii.pool_ns) { 5997e92c0eafSIlya Dryomov parent_spec->pool_ns = pii.pool_ns; 5998e92c0eafSIlya Dryomov pii.pool_ns = NULL; 5999e92c0eafSIlya Dryomov } 6000eb3b2d6bSIlya Dryomov parent_spec->image_id = pii.image_id; 6001eb3b2d6bSIlya Dryomov pii.image_id = NULL; 6002eb3b2d6bSIlya Dryomov parent_spec->snap_id = pii.snap_id; 6003b26c047bSIlya Dryomov 600486b00e0dSAlex Elder rbd_dev->parent_spec = parent_spec; 600586b00e0dSAlex Elder parent_spec = NULL; /* rbd_dev now owns this */ 60063b5cf2a2SAlex Elder } 60073b5cf2a2SAlex Elder 60083b5cf2a2SAlex Elder /* 6009cf32bd9cSIlya Dryomov * We always update the parent overlap. If it's zero we issue 6010cf32bd9cSIlya Dryomov * a warning, as we will proceed as if there was no parent. 60113b5cf2a2SAlex Elder */ 6012eb3b2d6bSIlya Dryomov if (!pii.overlap) { 60133b5cf2a2SAlex Elder if (parent_spec) { 6014cf32bd9cSIlya Dryomov /* refresh, careful to warn just once */ 6015cf32bd9cSIlya Dryomov if (rbd_dev->parent_overlap) 6016cf32bd9cSIlya Dryomov rbd_warn(rbd_dev, 6017cf32bd9cSIlya Dryomov "clone now standalone (overlap became 0)"); 601870cf49cfSAlex Elder } else { 6019cf32bd9cSIlya Dryomov /* initial probe */ 6020cf32bd9cSIlya Dryomov rbd_warn(rbd_dev, "clone is standalone (overlap 0)"); 60213b5cf2a2SAlex Elder } 602270cf49cfSAlex Elder } 6023eb3b2d6bSIlya Dryomov rbd_dev->parent_overlap = pii.overlap; 6024cf32bd9cSIlya Dryomov 602586b00e0dSAlex Elder out: 602686b00e0dSAlex Elder ret = 0; 602786b00e0dSAlex Elder out_err: 6028e92c0eafSIlya Dryomov kfree(pii.pool_ns); 6029eb3b2d6bSIlya Dryomov kfree(pii.image_id); 603086b00e0dSAlex Elder rbd_spec_put(parent_spec); 603186b00e0dSAlex Elder return ret; 603286b00e0dSAlex Elder } 603386b00e0dSAlex Elder 6034cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) 6035cc070d59SAlex Elder { 6036cc070d59SAlex Elder struct { 6037cc070d59SAlex Elder __le64 stripe_unit; 6038cc070d59SAlex Elder __le64 stripe_count; 6039cc070d59SAlex Elder } __attribute__ ((packed)) striping_info_buf = { 0 }; 6040cc070d59SAlex Elder size_t size = sizeof (striping_info_buf); 6041cc070d59SAlex Elder void *p; 6042cc070d59SAlex Elder int ret; 6043cc070d59SAlex Elder 6044ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 6045ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_stripe_unit_count", 6046ecd4a68aSIlya Dryomov NULL, 0, &striping_info_buf, size); 6047cc070d59SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 6048cc070d59SAlex Elder if (ret < 0) 6049cc070d59SAlex Elder return ret; 6050cc070d59SAlex Elder if (ret < size) 6051cc070d59SAlex Elder return -ERANGE; 6052cc070d59SAlex Elder 6053cc070d59SAlex Elder p = &striping_info_buf; 6054b1331852SIlya Dryomov rbd_dev->header.stripe_unit = ceph_decode_64(&p); 6055b1331852SIlya Dryomov rbd_dev->header.stripe_count = ceph_decode_64(&p); 6056cc070d59SAlex Elder return 0; 6057cc070d59SAlex Elder } 6058cc070d59SAlex Elder 60597e97332eSIlya Dryomov static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev) 60607e97332eSIlya Dryomov { 60617e97332eSIlya Dryomov __le64 data_pool_id; 60627e97332eSIlya Dryomov int ret; 60637e97332eSIlya Dryomov 60647e97332eSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 60657e97332eSIlya Dryomov &rbd_dev->header_oloc, "get_data_pool", 60667e97332eSIlya Dryomov NULL, 0, &data_pool_id, sizeof(data_pool_id)); 60677e97332eSIlya Dryomov if (ret < 0) 60687e97332eSIlya Dryomov return ret; 60697e97332eSIlya Dryomov if (ret < sizeof(data_pool_id)) 60707e97332eSIlya Dryomov return -EBADMSG; 60717e97332eSIlya Dryomov 60727e97332eSIlya Dryomov rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id); 60737e97332eSIlya Dryomov WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL); 60747e97332eSIlya Dryomov return 0; 60757e97332eSIlya Dryomov } 60767e97332eSIlya Dryomov 60779e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev) 60789e15b77dSAlex Elder { 6079ecd4a68aSIlya Dryomov CEPH_DEFINE_OID_ONSTACK(oid); 60809e15b77dSAlex Elder size_t image_id_size; 60819e15b77dSAlex Elder char *image_id; 60829e15b77dSAlex Elder void *p; 60839e15b77dSAlex Elder void *end; 60849e15b77dSAlex Elder size_t size; 60859e15b77dSAlex Elder void *reply_buf = NULL; 60869e15b77dSAlex Elder size_t len = 0; 60879e15b77dSAlex Elder char *image_name = NULL; 60889e15b77dSAlex Elder int ret; 60899e15b77dSAlex Elder 60909e15b77dSAlex Elder rbd_assert(!rbd_dev->spec->image_name); 60919e15b77dSAlex Elder 609269e7a02fSAlex Elder len = strlen(rbd_dev->spec->image_id); 609369e7a02fSAlex Elder image_id_size = sizeof (__le32) + len; 60949e15b77dSAlex Elder image_id = kmalloc(image_id_size, GFP_KERNEL); 60959e15b77dSAlex Elder if (!image_id) 60969e15b77dSAlex Elder return NULL; 60979e15b77dSAlex Elder 60989e15b77dSAlex Elder p = image_id; 60994157976bSAlex Elder end = image_id + image_id_size; 610069e7a02fSAlex Elder ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len); 61019e15b77dSAlex Elder 61029e15b77dSAlex Elder size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX; 61039e15b77dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 61049e15b77dSAlex Elder if (!reply_buf) 61059e15b77dSAlex Elder goto out; 61069e15b77dSAlex Elder 6107ecd4a68aSIlya Dryomov ceph_oid_printf(&oid, "%s", RBD_DIRECTORY); 6108ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc, 6109ecd4a68aSIlya Dryomov "dir_get_name", image_id, image_id_size, 6110e2a58ee5SAlex Elder reply_buf, size); 61119e15b77dSAlex Elder if (ret < 0) 61129e15b77dSAlex Elder goto out; 61139e15b77dSAlex Elder p = reply_buf; 6114f40eb349SAlex Elder end = reply_buf + ret; 6115f40eb349SAlex Elder 61169e15b77dSAlex Elder image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL); 61179e15b77dSAlex Elder if (IS_ERR(image_name)) 61189e15b77dSAlex Elder image_name = NULL; 61199e15b77dSAlex Elder else 61209e15b77dSAlex Elder dout("%s: name is %s len is %zd\n", __func__, image_name, len); 61219e15b77dSAlex Elder out: 61229e15b77dSAlex Elder kfree(reply_buf); 61239e15b77dSAlex Elder kfree(image_id); 61249e15b77dSAlex Elder 61259e15b77dSAlex Elder return image_name; 61269e15b77dSAlex Elder } 61279e15b77dSAlex Elder 61282ad3d716SAlex Elder static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 61292ad3d716SAlex Elder { 61302ad3d716SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 61312ad3d716SAlex Elder const char *snap_name; 61322ad3d716SAlex Elder u32 which = 0; 61332ad3d716SAlex Elder 61342ad3d716SAlex Elder /* Skip over names until we find the one we are looking for */ 61352ad3d716SAlex Elder 61362ad3d716SAlex Elder snap_name = rbd_dev->header.snap_names; 61372ad3d716SAlex Elder while (which < snapc->num_snaps) { 61382ad3d716SAlex Elder if (!strcmp(name, snap_name)) 61392ad3d716SAlex Elder return snapc->snaps[which]; 61402ad3d716SAlex Elder snap_name += strlen(snap_name) + 1; 61412ad3d716SAlex Elder which++; 61422ad3d716SAlex Elder } 61432ad3d716SAlex Elder return CEPH_NOSNAP; 61442ad3d716SAlex Elder } 61452ad3d716SAlex Elder 61462ad3d716SAlex Elder static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 61472ad3d716SAlex Elder { 61482ad3d716SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 61492ad3d716SAlex Elder u32 which; 61502ad3d716SAlex Elder bool found = false; 61512ad3d716SAlex Elder u64 snap_id; 61522ad3d716SAlex Elder 61532ad3d716SAlex Elder for (which = 0; !found && which < snapc->num_snaps; which++) { 61542ad3d716SAlex Elder const char *snap_name; 61552ad3d716SAlex Elder 61562ad3d716SAlex Elder snap_id = snapc->snaps[which]; 61572ad3d716SAlex Elder snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id); 6158efadc98aSJosh Durgin if (IS_ERR(snap_name)) { 6159efadc98aSJosh Durgin /* ignore no-longer existing snapshots */ 6160efadc98aSJosh Durgin if (PTR_ERR(snap_name) == -ENOENT) 6161efadc98aSJosh Durgin continue; 6162efadc98aSJosh Durgin else 61632ad3d716SAlex Elder break; 6164efadc98aSJosh Durgin } 61652ad3d716SAlex Elder found = !strcmp(name, snap_name); 61662ad3d716SAlex Elder kfree(snap_name); 61672ad3d716SAlex Elder } 61682ad3d716SAlex Elder return found ? snap_id : CEPH_NOSNAP; 61692ad3d716SAlex Elder } 61702ad3d716SAlex Elder 61712ad3d716SAlex Elder /* 61722ad3d716SAlex Elder * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if 61732ad3d716SAlex Elder * no snapshot by that name is found, or if an error occurs. 61742ad3d716SAlex Elder */ 61752ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 61762ad3d716SAlex Elder { 61772ad3d716SAlex Elder if (rbd_dev->image_format == 1) 61782ad3d716SAlex Elder return rbd_v1_snap_id_by_name(rbd_dev, name); 61792ad3d716SAlex Elder 61802ad3d716SAlex Elder return rbd_v2_snap_id_by_name(rbd_dev, name); 61812ad3d716SAlex Elder } 61822ad3d716SAlex Elder 61839e15b77dSAlex Elder /* 618404077599SIlya Dryomov * An image being mapped will have everything but the snap id. 61859e15b77dSAlex Elder */ 618604077599SIlya Dryomov static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev) 618704077599SIlya Dryomov { 618804077599SIlya Dryomov struct rbd_spec *spec = rbd_dev->spec; 618904077599SIlya Dryomov 619004077599SIlya Dryomov rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name); 619104077599SIlya Dryomov rbd_assert(spec->image_id && spec->image_name); 619204077599SIlya Dryomov rbd_assert(spec->snap_name); 619304077599SIlya Dryomov 619404077599SIlya Dryomov if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) { 619504077599SIlya Dryomov u64 snap_id; 619604077599SIlya Dryomov 619704077599SIlya Dryomov snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name); 619804077599SIlya Dryomov if (snap_id == CEPH_NOSNAP) 619904077599SIlya Dryomov return -ENOENT; 620004077599SIlya Dryomov 620104077599SIlya Dryomov spec->snap_id = snap_id; 620204077599SIlya Dryomov } else { 620304077599SIlya Dryomov spec->snap_id = CEPH_NOSNAP; 620404077599SIlya Dryomov } 620504077599SIlya Dryomov 620604077599SIlya Dryomov return 0; 620704077599SIlya Dryomov } 620804077599SIlya Dryomov 620904077599SIlya Dryomov /* 621004077599SIlya Dryomov * A parent image will have all ids but none of the names. 621104077599SIlya Dryomov * 621204077599SIlya Dryomov * All names in an rbd spec are dynamically allocated. It's OK if we 621304077599SIlya Dryomov * can't figure out the name for an image id. 621404077599SIlya Dryomov */ 621504077599SIlya Dryomov static int rbd_spec_fill_names(struct rbd_device *rbd_dev) 62169e15b77dSAlex Elder { 62172e9f7f1cSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 62182e9f7f1cSAlex Elder struct rbd_spec *spec = rbd_dev->spec; 62192e9f7f1cSAlex Elder const char *pool_name; 62202e9f7f1cSAlex Elder const char *image_name; 62212e9f7f1cSAlex Elder const char *snap_name; 62229e15b77dSAlex Elder int ret; 62239e15b77dSAlex Elder 622404077599SIlya Dryomov rbd_assert(spec->pool_id != CEPH_NOPOOL); 622504077599SIlya Dryomov rbd_assert(spec->image_id); 622604077599SIlya Dryomov rbd_assert(spec->snap_id != CEPH_NOSNAP); 62279e15b77dSAlex Elder 62282e9f7f1cSAlex Elder /* Get the pool name; we have to make our own copy of this */ 62299e15b77dSAlex Elder 62302e9f7f1cSAlex Elder pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id); 62312e9f7f1cSAlex Elder if (!pool_name) { 62322e9f7f1cSAlex Elder rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id); 6233935dc89fSAlex Elder return -EIO; 6234935dc89fSAlex Elder } 62352e9f7f1cSAlex Elder pool_name = kstrdup(pool_name, GFP_KERNEL); 62362e9f7f1cSAlex Elder if (!pool_name) 62379e15b77dSAlex Elder return -ENOMEM; 62389e15b77dSAlex Elder 62399e15b77dSAlex Elder /* Fetch the image name; tolerate failure here */ 62409e15b77dSAlex Elder 62412e9f7f1cSAlex Elder image_name = rbd_dev_image_name(rbd_dev); 62422e9f7f1cSAlex Elder if (!image_name) 624306ecc6cbSAlex Elder rbd_warn(rbd_dev, "unable to get image name"); 62449e15b77dSAlex Elder 624504077599SIlya Dryomov /* Fetch the snapshot name */ 62469e15b77dSAlex Elder 62472e9f7f1cSAlex Elder snap_name = rbd_snap_name(rbd_dev, spec->snap_id); 6248da6a6b63SJosh Durgin if (IS_ERR(snap_name)) { 6249da6a6b63SJosh Durgin ret = PTR_ERR(snap_name); 62509e15b77dSAlex Elder goto out_err; 62512e9f7f1cSAlex Elder } 62522e9f7f1cSAlex Elder 62532e9f7f1cSAlex Elder spec->pool_name = pool_name; 62542e9f7f1cSAlex Elder spec->image_name = image_name; 62552e9f7f1cSAlex Elder spec->snap_name = snap_name; 62569e15b77dSAlex Elder 62579e15b77dSAlex Elder return 0; 625804077599SIlya Dryomov 62599e15b77dSAlex Elder out_err: 62602e9f7f1cSAlex Elder kfree(image_name); 62612e9f7f1cSAlex Elder kfree(pool_name); 62629e15b77dSAlex Elder return ret; 62639e15b77dSAlex Elder } 62649e15b77dSAlex Elder 6265cc4a38bdSAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev) 626635d489f9SAlex Elder { 626735d489f9SAlex Elder size_t size; 626835d489f9SAlex Elder int ret; 626935d489f9SAlex Elder void *reply_buf; 627035d489f9SAlex Elder void *p; 627135d489f9SAlex Elder void *end; 627235d489f9SAlex Elder u64 seq; 627335d489f9SAlex Elder u32 snap_count; 627435d489f9SAlex Elder struct ceph_snap_context *snapc; 627535d489f9SAlex Elder u32 i; 627635d489f9SAlex Elder 627735d489f9SAlex Elder /* 627835d489f9SAlex Elder * We'll need room for the seq value (maximum snapshot id), 627935d489f9SAlex Elder * snapshot count, and array of that many snapshot ids. 628035d489f9SAlex Elder * For now we have a fixed upper limit on the number we're 628135d489f9SAlex Elder * prepared to receive. 628235d489f9SAlex Elder */ 628335d489f9SAlex Elder size = sizeof (__le64) + sizeof (__le32) + 628435d489f9SAlex Elder RBD_MAX_SNAP_COUNT * sizeof (__le64); 628535d489f9SAlex Elder reply_buf = kzalloc(size, GFP_KERNEL); 628635d489f9SAlex Elder if (!reply_buf) 628735d489f9SAlex Elder return -ENOMEM; 628835d489f9SAlex Elder 6289ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 6290ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_snapcontext", 6291ecd4a68aSIlya Dryomov NULL, 0, reply_buf, size); 629236be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 629335d489f9SAlex Elder if (ret < 0) 629435d489f9SAlex Elder goto out; 629535d489f9SAlex Elder 629635d489f9SAlex Elder p = reply_buf; 629757385b51SAlex Elder end = reply_buf + ret; 629857385b51SAlex Elder ret = -ERANGE; 629935d489f9SAlex Elder ceph_decode_64_safe(&p, end, seq, out); 630035d489f9SAlex Elder ceph_decode_32_safe(&p, end, snap_count, out); 630135d489f9SAlex Elder 630235d489f9SAlex Elder /* 630335d489f9SAlex Elder * Make sure the reported number of snapshot ids wouldn't go 630435d489f9SAlex Elder * beyond the end of our buffer. But before checking that, 630535d489f9SAlex Elder * make sure the computed size of the snapshot context we 630635d489f9SAlex Elder * allocate is representable in a size_t. 630735d489f9SAlex Elder */ 630835d489f9SAlex Elder if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context)) 630935d489f9SAlex Elder / sizeof (u64)) { 631035d489f9SAlex Elder ret = -EINVAL; 631135d489f9SAlex Elder goto out; 631235d489f9SAlex Elder } 631335d489f9SAlex Elder if (!ceph_has_room(&p, end, snap_count * sizeof (__le64))) 631435d489f9SAlex Elder goto out; 6315468521c1SAlex Elder ret = 0; 631635d489f9SAlex Elder 6317812164f8SAlex Elder snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); 631835d489f9SAlex Elder if (!snapc) { 631935d489f9SAlex Elder ret = -ENOMEM; 632035d489f9SAlex Elder goto out; 632135d489f9SAlex Elder } 632235d489f9SAlex Elder snapc->seq = seq; 632335d489f9SAlex Elder for (i = 0; i < snap_count; i++) 632435d489f9SAlex Elder snapc->snaps[i] = ceph_decode_64(&p); 632535d489f9SAlex Elder 632649ece554SAlex Elder ceph_put_snap_context(rbd_dev->header.snapc); 632735d489f9SAlex Elder rbd_dev->header.snapc = snapc; 632835d489f9SAlex Elder 632935d489f9SAlex Elder dout(" snap context seq = %llu, snap_count = %u\n", 633035d489f9SAlex Elder (unsigned long long)seq, (unsigned int)snap_count); 633135d489f9SAlex Elder out: 633235d489f9SAlex Elder kfree(reply_buf); 633335d489f9SAlex Elder 633457385b51SAlex Elder return ret; 633535d489f9SAlex Elder } 633635d489f9SAlex Elder 633754cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, 633854cac61fSAlex Elder u64 snap_id) 6339b8b1e2dbSAlex Elder { 6340b8b1e2dbSAlex Elder size_t size; 6341b8b1e2dbSAlex Elder void *reply_buf; 634254cac61fSAlex Elder __le64 snapid; 6343b8b1e2dbSAlex Elder int ret; 6344b8b1e2dbSAlex Elder void *p; 6345b8b1e2dbSAlex Elder void *end; 6346b8b1e2dbSAlex Elder char *snap_name; 6347b8b1e2dbSAlex Elder 6348b8b1e2dbSAlex Elder size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN; 6349b8b1e2dbSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 6350b8b1e2dbSAlex Elder if (!reply_buf) 6351b8b1e2dbSAlex Elder return ERR_PTR(-ENOMEM); 6352b8b1e2dbSAlex Elder 635354cac61fSAlex Elder snapid = cpu_to_le64(snap_id); 6354ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 6355ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_snapshot_name", 6356ecd4a68aSIlya Dryomov &snapid, sizeof(snapid), reply_buf, size); 635736be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 6358f40eb349SAlex Elder if (ret < 0) { 6359f40eb349SAlex Elder snap_name = ERR_PTR(ret); 6360b8b1e2dbSAlex Elder goto out; 6361f40eb349SAlex Elder } 6362b8b1e2dbSAlex Elder 6363b8b1e2dbSAlex Elder p = reply_buf; 6364f40eb349SAlex Elder end = reply_buf + ret; 6365e5c35534SAlex Elder snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 6366f40eb349SAlex Elder if (IS_ERR(snap_name)) 6367b8b1e2dbSAlex Elder goto out; 6368f40eb349SAlex Elder 6369b8b1e2dbSAlex Elder dout(" snap_id 0x%016llx snap_name = %s\n", 637054cac61fSAlex Elder (unsigned long long)snap_id, snap_name); 6371b8b1e2dbSAlex Elder out: 6372b8b1e2dbSAlex Elder kfree(reply_buf); 6373b8b1e2dbSAlex Elder 6374f40eb349SAlex Elder return snap_name; 6375b8b1e2dbSAlex Elder } 6376b8b1e2dbSAlex Elder 63772df3fac7SAlex Elder static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev) 6378117973fbSAlex Elder { 63792df3fac7SAlex Elder bool first_time = rbd_dev->header.object_prefix == NULL; 6380117973fbSAlex Elder int ret; 6381117973fbSAlex Elder 63821617e40cSJosh Durgin ret = rbd_dev_v2_image_size(rbd_dev); 63831617e40cSJosh Durgin if (ret) 6384cfbf6377SAlex Elder return ret; 63851617e40cSJosh Durgin 63862df3fac7SAlex Elder if (first_time) { 63872df3fac7SAlex Elder ret = rbd_dev_v2_header_onetime(rbd_dev); 63882df3fac7SAlex Elder if (ret) 6389cfbf6377SAlex Elder return ret; 63902df3fac7SAlex Elder } 63912df3fac7SAlex Elder 6392cc4a38bdSAlex Elder ret = rbd_dev_v2_snap_context(rbd_dev); 6393d194cd1dSIlya Dryomov if (ret && first_time) { 6394d194cd1dSIlya Dryomov kfree(rbd_dev->header.object_prefix); 6395d194cd1dSIlya Dryomov rbd_dev->header.object_prefix = NULL; 6396d194cd1dSIlya Dryomov } 6397117973fbSAlex Elder 6398117973fbSAlex Elder return ret; 6399117973fbSAlex Elder } 6400117973fbSAlex Elder 6401a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev) 6402a720ae09SIlya Dryomov { 6403a720ae09SIlya Dryomov rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 6404a720ae09SIlya Dryomov 6405a720ae09SIlya Dryomov if (rbd_dev->image_format == 1) 6406a720ae09SIlya Dryomov return rbd_dev_v1_header_info(rbd_dev); 6407a720ae09SIlya Dryomov 6408a720ae09SIlya Dryomov return rbd_dev_v2_header_info(rbd_dev); 6409a720ae09SIlya Dryomov } 6410a720ae09SIlya Dryomov 64111ddbe94eSAlex Elder /* 6412e28fff26SAlex Elder * Skips over white space at *buf, and updates *buf to point to the 6413e28fff26SAlex Elder * first found non-space character (if any). Returns the length of 6414593a9e7bSAlex Elder * the token (string of non-white space characters) found. Note 6415593a9e7bSAlex Elder * that *buf must be terminated with '\0'. 6416e28fff26SAlex Elder */ 6417e28fff26SAlex Elder static inline size_t next_token(const char **buf) 6418e28fff26SAlex Elder { 6419e28fff26SAlex Elder /* 6420e28fff26SAlex Elder * These are the characters that produce nonzero for 6421e28fff26SAlex Elder * isspace() in the "C" and "POSIX" locales. 6422e28fff26SAlex Elder */ 6423e28fff26SAlex Elder const char *spaces = " \f\n\r\t\v"; 6424e28fff26SAlex Elder 6425e28fff26SAlex Elder *buf += strspn(*buf, spaces); /* Find start of token */ 6426e28fff26SAlex Elder 6427e28fff26SAlex Elder return strcspn(*buf, spaces); /* Return token length */ 6428e28fff26SAlex Elder } 6429e28fff26SAlex Elder 6430e28fff26SAlex Elder /* 6431ea3352f4SAlex Elder * Finds the next token in *buf, dynamically allocates a buffer big 6432ea3352f4SAlex Elder * enough to hold a copy of it, and copies the token into the new 6433ea3352f4SAlex Elder * buffer. The copy is guaranteed to be terminated with '\0'. Note 6434ea3352f4SAlex Elder * that a duplicate buffer is created even for a zero-length token. 6435ea3352f4SAlex Elder * 6436ea3352f4SAlex Elder * Returns a pointer to the newly-allocated duplicate, or a null 6437ea3352f4SAlex Elder * pointer if memory for the duplicate was not available. If 6438ea3352f4SAlex Elder * the lenp argument is a non-null pointer, the length of the token 6439ea3352f4SAlex Elder * (not including the '\0') is returned in *lenp. 6440ea3352f4SAlex Elder * 6441ea3352f4SAlex Elder * If successful, the *buf pointer will be updated to point beyond 6442ea3352f4SAlex Elder * the end of the found token. 6443ea3352f4SAlex Elder * 6444ea3352f4SAlex Elder * Note: uses GFP_KERNEL for allocation. 6445ea3352f4SAlex Elder */ 6446ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp) 6447ea3352f4SAlex Elder { 6448ea3352f4SAlex Elder char *dup; 6449ea3352f4SAlex Elder size_t len; 6450ea3352f4SAlex Elder 6451ea3352f4SAlex Elder len = next_token(buf); 64524caf35f9SAlex Elder dup = kmemdup(*buf, len + 1, GFP_KERNEL); 6453ea3352f4SAlex Elder if (!dup) 6454ea3352f4SAlex Elder return NULL; 6455ea3352f4SAlex Elder *(dup + len) = '\0'; 6456ea3352f4SAlex Elder *buf += len; 6457ea3352f4SAlex Elder 6458ea3352f4SAlex Elder if (lenp) 6459ea3352f4SAlex Elder *lenp = len; 6460ea3352f4SAlex Elder 6461ea3352f4SAlex Elder return dup; 6462ea3352f4SAlex Elder } 6463ea3352f4SAlex Elder 6464ea3352f4SAlex Elder /* 6465859c31dfSAlex Elder * Parse the options provided for an "rbd add" (i.e., rbd image 6466859c31dfSAlex Elder * mapping) request. These arrive via a write to /sys/bus/rbd/add, 6467859c31dfSAlex Elder * and the data written is passed here via a NUL-terminated buffer. 6468859c31dfSAlex Elder * Returns 0 if successful or an error code otherwise. 6469d22f76e7SAlex Elder * 6470859c31dfSAlex Elder * The information extracted from these options is recorded in 6471859c31dfSAlex Elder * the other parameters which return dynamically-allocated 6472859c31dfSAlex Elder * structures: 6473859c31dfSAlex Elder * ceph_opts 6474859c31dfSAlex Elder * The address of a pointer that will refer to a ceph options 6475859c31dfSAlex Elder * structure. Caller must release the returned pointer using 6476859c31dfSAlex Elder * ceph_destroy_options() when it is no longer needed. 6477859c31dfSAlex Elder * rbd_opts 6478859c31dfSAlex Elder * Address of an rbd options pointer. Fully initialized by 6479859c31dfSAlex Elder * this function; caller must release with kfree(). 6480859c31dfSAlex Elder * spec 6481859c31dfSAlex Elder * Address of an rbd image specification pointer. Fully 6482859c31dfSAlex Elder * initialized by this function based on parsed options. 6483859c31dfSAlex Elder * Caller must release with rbd_spec_put(). 6484859c31dfSAlex Elder * 6485859c31dfSAlex Elder * The options passed take this form: 6486859c31dfSAlex Elder * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>] 6487859c31dfSAlex Elder * where: 6488859c31dfSAlex Elder * <mon_addrs> 6489859c31dfSAlex Elder * A comma-separated list of one or more monitor addresses. 6490859c31dfSAlex Elder * A monitor address is an ip address, optionally followed 6491859c31dfSAlex Elder * by a port number (separated by a colon). 6492859c31dfSAlex Elder * I.e.: ip1[:port1][,ip2[:port2]...] 6493859c31dfSAlex Elder * <options> 6494859c31dfSAlex Elder * A comma-separated list of ceph and/or rbd options. 6495859c31dfSAlex Elder * <pool_name> 6496859c31dfSAlex Elder * The name of the rados pool containing the rbd image. 6497859c31dfSAlex Elder * <image_name> 6498859c31dfSAlex Elder * The name of the image in that pool to map. 6499859c31dfSAlex Elder * <snap_id> 6500859c31dfSAlex Elder * An optional snapshot id. If provided, the mapping will 6501859c31dfSAlex Elder * present data from the image at the time that snapshot was 6502859c31dfSAlex Elder * created. The image head is used if no snapshot id is 6503859c31dfSAlex Elder * provided. Snapshot mappings are always read-only. 6504a725f65eSAlex Elder */ 6505859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf, 6506dc79b113SAlex Elder struct ceph_options **ceph_opts, 6507859c31dfSAlex Elder struct rbd_options **opts, 6508859c31dfSAlex Elder struct rbd_spec **rbd_spec) 6509a725f65eSAlex Elder { 6510e28fff26SAlex Elder size_t len; 6511859c31dfSAlex Elder char *options; 65120ddebc0cSAlex Elder const char *mon_addrs; 6513ecb4dc22SAlex Elder char *snap_name; 65140ddebc0cSAlex Elder size_t mon_addrs_size; 6515c300156bSIlya Dryomov struct parse_rbd_opts_ctx pctx = { 0 }; 6516859c31dfSAlex Elder struct ceph_options *copts; 6517dc79b113SAlex Elder int ret; 6518e28fff26SAlex Elder 6519e28fff26SAlex Elder /* The first four tokens are required */ 6520e28fff26SAlex Elder 65217ef3214aSAlex Elder len = next_token(&buf); 65224fb5d671SAlex Elder if (!len) { 65234fb5d671SAlex Elder rbd_warn(NULL, "no monitor address(es) provided"); 65244fb5d671SAlex Elder return -EINVAL; 65254fb5d671SAlex Elder } 65260ddebc0cSAlex Elder mon_addrs = buf; 6527f28e565aSAlex Elder mon_addrs_size = len + 1; 65287ef3214aSAlex Elder buf += len; 6529a725f65eSAlex Elder 6530dc79b113SAlex Elder ret = -EINVAL; 6531f28e565aSAlex Elder options = dup_token(&buf, NULL); 6532f28e565aSAlex Elder if (!options) 6533dc79b113SAlex Elder return -ENOMEM; 65344fb5d671SAlex Elder if (!*options) { 65354fb5d671SAlex Elder rbd_warn(NULL, "no options provided"); 65364fb5d671SAlex Elder goto out_err; 65374fb5d671SAlex Elder } 6538a725f65eSAlex Elder 6539c300156bSIlya Dryomov pctx.spec = rbd_spec_alloc(); 6540c300156bSIlya Dryomov if (!pctx.spec) 6541f28e565aSAlex Elder goto out_mem; 6542859c31dfSAlex Elder 6543c300156bSIlya Dryomov pctx.spec->pool_name = dup_token(&buf, NULL); 6544c300156bSIlya Dryomov if (!pctx.spec->pool_name) 6545859c31dfSAlex Elder goto out_mem; 6546c300156bSIlya Dryomov if (!*pctx.spec->pool_name) { 65474fb5d671SAlex Elder rbd_warn(NULL, "no pool name provided"); 65484fb5d671SAlex Elder goto out_err; 65494fb5d671SAlex Elder } 6550e28fff26SAlex Elder 6551c300156bSIlya Dryomov pctx.spec->image_name = dup_token(&buf, NULL); 6552c300156bSIlya Dryomov if (!pctx.spec->image_name) 6553f28e565aSAlex Elder goto out_mem; 6554c300156bSIlya Dryomov if (!*pctx.spec->image_name) { 65554fb5d671SAlex Elder rbd_warn(NULL, "no image name provided"); 65564fb5d671SAlex Elder goto out_err; 65574fb5d671SAlex Elder } 6558e28fff26SAlex Elder 6559f28e565aSAlex Elder /* 6560f28e565aSAlex Elder * Snapshot name is optional; default is to use "-" 6561f28e565aSAlex Elder * (indicating the head/no snapshot). 6562f28e565aSAlex Elder */ 65633feeb894SAlex Elder len = next_token(&buf); 6564820a5f3eSAlex Elder if (!len) { 65653feeb894SAlex Elder buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */ 65663feeb894SAlex Elder len = sizeof (RBD_SNAP_HEAD_NAME) - 1; 6567f28e565aSAlex Elder } else if (len > RBD_MAX_SNAP_NAME_LEN) { 6568dc79b113SAlex Elder ret = -ENAMETOOLONG; 6569f28e565aSAlex Elder goto out_err; 6570849b4260SAlex Elder } 6571ecb4dc22SAlex Elder snap_name = kmemdup(buf, len + 1, GFP_KERNEL); 6572ecb4dc22SAlex Elder if (!snap_name) 6573f28e565aSAlex Elder goto out_mem; 6574ecb4dc22SAlex Elder *(snap_name + len) = '\0'; 6575c300156bSIlya Dryomov pctx.spec->snap_name = snap_name; 6576e5c35534SAlex Elder 65770ddebc0cSAlex Elder /* Initialize all rbd options to the defaults */ 6578e28fff26SAlex Elder 6579c300156bSIlya Dryomov pctx.opts = kzalloc(sizeof(*pctx.opts), GFP_KERNEL); 6580c300156bSIlya Dryomov if (!pctx.opts) 65814e9afebaSAlex Elder goto out_mem; 65824e9afebaSAlex Elder 6583c300156bSIlya Dryomov pctx.opts->read_only = RBD_READ_ONLY_DEFAULT; 6584c300156bSIlya Dryomov pctx.opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT; 65850c93e1b7SIlya Dryomov pctx.opts->alloc_size = RBD_ALLOC_SIZE_DEFAULT; 6586c300156bSIlya Dryomov pctx.opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT; 6587c300156bSIlya Dryomov pctx.opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT; 6588c300156bSIlya Dryomov pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT; 6589c300156bSIlya Dryomov pctx.opts->trim = RBD_TRIM_DEFAULT; 6590d22f76e7SAlex Elder 6591859c31dfSAlex Elder copts = ceph_parse_options(options, mon_addrs, 65920ddebc0cSAlex Elder mon_addrs + mon_addrs_size - 1, 6593c300156bSIlya Dryomov parse_rbd_opts_token, &pctx); 6594859c31dfSAlex Elder if (IS_ERR(copts)) { 6595859c31dfSAlex Elder ret = PTR_ERR(copts); 6596dc79b113SAlex Elder goto out_err; 6597dc79b113SAlex Elder } 6598859c31dfSAlex Elder kfree(options); 6599859c31dfSAlex Elder 6600859c31dfSAlex Elder *ceph_opts = copts; 6601c300156bSIlya Dryomov *opts = pctx.opts; 6602c300156bSIlya Dryomov *rbd_spec = pctx.spec; 66030ddebc0cSAlex Elder 6604dc79b113SAlex Elder return 0; 6605f28e565aSAlex Elder out_mem: 6606dc79b113SAlex Elder ret = -ENOMEM; 6607d22f76e7SAlex Elder out_err: 6608c300156bSIlya Dryomov kfree(pctx.opts); 6609c300156bSIlya Dryomov rbd_spec_put(pctx.spec); 6610f28e565aSAlex Elder kfree(options); 6611d22f76e7SAlex Elder 6612dc79b113SAlex Elder return ret; 6613a725f65eSAlex Elder } 6614a725f65eSAlex Elder 6615e010dd0aSIlya Dryomov static void rbd_dev_image_unlock(struct rbd_device *rbd_dev) 6616e010dd0aSIlya Dryomov { 6617e010dd0aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 6618e010dd0aSIlya Dryomov if (__rbd_is_lock_owner(rbd_dev)) 6619e1fddc8fSIlya Dryomov __rbd_release_lock(rbd_dev); 6620e010dd0aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 6621e010dd0aSIlya Dryomov } 6622e010dd0aSIlya Dryomov 6623637cd060SIlya Dryomov /* 6624637cd060SIlya Dryomov * If the wait is interrupted, an error is returned even if the lock 6625637cd060SIlya Dryomov * was successfully acquired. rbd_dev_image_unlock() will release it 6626637cd060SIlya Dryomov * if needed. 6627637cd060SIlya Dryomov */ 6628e010dd0aSIlya Dryomov static int rbd_add_acquire_lock(struct rbd_device *rbd_dev) 6629e010dd0aSIlya Dryomov { 6630637cd060SIlya Dryomov long ret; 66312f18d466SIlya Dryomov 6632e010dd0aSIlya Dryomov if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) { 6633637cd060SIlya Dryomov if (!rbd_dev->opts->exclusive && !rbd_dev->opts->lock_on_read) 6634637cd060SIlya Dryomov return 0; 6635637cd060SIlya Dryomov 6636e010dd0aSIlya Dryomov rbd_warn(rbd_dev, "exclusive-lock feature is not enabled"); 6637e010dd0aSIlya Dryomov return -EINVAL; 6638e010dd0aSIlya Dryomov } 6639e010dd0aSIlya Dryomov 6640f3c0e459SIlya Dryomov if (rbd_is_snap(rbd_dev)) 6641637cd060SIlya Dryomov return 0; 6642637cd060SIlya Dryomov 6643637cd060SIlya Dryomov rbd_assert(!rbd_is_lock_owner(rbd_dev)); 6644637cd060SIlya Dryomov queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0); 6645637cd060SIlya Dryomov ret = wait_for_completion_killable_timeout(&rbd_dev->acquire_wait, 6646637cd060SIlya Dryomov ceph_timeout_jiffies(rbd_dev->opts->lock_timeout)); 664725e6be21SDongsheng Yang if (ret > 0) { 6648637cd060SIlya Dryomov ret = rbd_dev->acquire_err; 664925e6be21SDongsheng Yang } else { 665025e6be21SDongsheng Yang cancel_delayed_work_sync(&rbd_dev->lock_dwork); 665125e6be21SDongsheng Yang if (!ret) 6652637cd060SIlya Dryomov ret = -ETIMEDOUT; 665325e6be21SDongsheng Yang } 6654637cd060SIlya Dryomov 66552f18d466SIlya Dryomov if (ret) { 6656637cd060SIlya Dryomov rbd_warn(rbd_dev, "failed to acquire exclusive lock: %ld", ret); 6657637cd060SIlya Dryomov return ret; 6658e010dd0aSIlya Dryomov } 6659e010dd0aSIlya Dryomov 6660637cd060SIlya Dryomov /* 6661637cd060SIlya Dryomov * The lock may have been released by now, unless automatic lock 6662637cd060SIlya Dryomov * transitions are disabled. 6663637cd060SIlya Dryomov */ 6664637cd060SIlya Dryomov rbd_assert(!rbd_dev->opts->exclusive || rbd_is_lock_owner(rbd_dev)); 6665e010dd0aSIlya Dryomov return 0; 6666e010dd0aSIlya Dryomov } 6667e010dd0aSIlya Dryomov 666830ba1f02SIlya Dryomov /* 6669589d30e0SAlex Elder * An rbd format 2 image has a unique identifier, distinct from the 6670589d30e0SAlex Elder * name given to it by the user. Internally, that identifier is 6671589d30e0SAlex Elder * what's used to specify the names of objects related to the image. 6672589d30e0SAlex Elder * 6673589d30e0SAlex Elder * A special "rbd id" object is used to map an rbd image name to its 6674589d30e0SAlex Elder * id. If that object doesn't exist, then there is no v2 rbd image 6675589d30e0SAlex Elder * with the supplied name. 6676589d30e0SAlex Elder * 6677589d30e0SAlex Elder * This function will record the given rbd_dev's image_id field if 6678589d30e0SAlex Elder * it can be determined, and in that case will return 0. If any 6679589d30e0SAlex Elder * errors occur a negative errno will be returned and the rbd_dev's 6680589d30e0SAlex Elder * image_id field will be unchanged (and should be NULL). 6681589d30e0SAlex Elder */ 6682589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev) 6683589d30e0SAlex Elder { 6684589d30e0SAlex Elder int ret; 6685589d30e0SAlex Elder size_t size; 6686ecd4a68aSIlya Dryomov CEPH_DEFINE_OID_ONSTACK(oid); 6687589d30e0SAlex Elder void *response; 6688c0fba368SAlex Elder char *image_id; 66892f82ee54SAlex Elder 6690589d30e0SAlex Elder /* 66912c0d0a10SAlex Elder * When probing a parent image, the image id is already 66922c0d0a10SAlex Elder * known (and the image name likely is not). There's no 6693c0fba368SAlex Elder * need to fetch the image id again in this case. We 6694c0fba368SAlex Elder * do still need to set the image format though. 66952c0d0a10SAlex Elder */ 6696c0fba368SAlex Elder if (rbd_dev->spec->image_id) { 6697c0fba368SAlex Elder rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1; 6698c0fba368SAlex Elder 66992c0d0a10SAlex Elder return 0; 6700c0fba368SAlex Elder } 67012c0d0a10SAlex Elder 67022c0d0a10SAlex Elder /* 6703589d30e0SAlex Elder * First, see if the format 2 image id file exists, and if 6704589d30e0SAlex Elder * so, get the image's persistent id from it. 6705589d30e0SAlex Elder */ 6706ecd4a68aSIlya Dryomov ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX, 6707ecd4a68aSIlya Dryomov rbd_dev->spec->image_name); 6708ecd4a68aSIlya Dryomov if (ret) 6709ecd4a68aSIlya Dryomov return ret; 6710ecd4a68aSIlya Dryomov 6711ecd4a68aSIlya Dryomov dout("rbd id object name is %s\n", oid.name); 6712589d30e0SAlex Elder 6713589d30e0SAlex Elder /* Response will be an encoded string, which includes a length */ 6714589d30e0SAlex Elder size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; 6715589d30e0SAlex Elder response = kzalloc(size, GFP_NOIO); 6716589d30e0SAlex Elder if (!response) { 6717589d30e0SAlex Elder ret = -ENOMEM; 6718589d30e0SAlex Elder goto out; 6719589d30e0SAlex Elder } 6720589d30e0SAlex Elder 6721c0fba368SAlex Elder /* If it doesn't exist we'll assume it's a format 1 image */ 6722c0fba368SAlex Elder 6723ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc, 6724ecd4a68aSIlya Dryomov "get_id", NULL, 0, 67255435d206SDongsheng Yang response, size); 672636be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 6727c0fba368SAlex Elder if (ret == -ENOENT) { 6728c0fba368SAlex Elder image_id = kstrdup("", GFP_KERNEL); 6729c0fba368SAlex Elder ret = image_id ? 0 : -ENOMEM; 6730c0fba368SAlex Elder if (!ret) 6731c0fba368SAlex Elder rbd_dev->image_format = 1; 67327dd440c9SIlya Dryomov } else if (ret >= 0) { 6733c0fba368SAlex Elder void *p = response; 6734589d30e0SAlex Elder 6735c0fba368SAlex Elder image_id = ceph_extract_encoded_string(&p, p + ret, 6736979ed480SAlex Elder NULL, GFP_NOIO); 6737461f758aSDuan Jiong ret = PTR_ERR_OR_ZERO(image_id); 6738c0fba368SAlex Elder if (!ret) 6739c0fba368SAlex Elder rbd_dev->image_format = 2; 6740c0fba368SAlex Elder } 6741c0fba368SAlex Elder 6742c0fba368SAlex Elder if (!ret) { 6743c0fba368SAlex Elder rbd_dev->spec->image_id = image_id; 6744c0fba368SAlex Elder dout("image_id is %s\n", image_id); 6745589d30e0SAlex Elder } 6746589d30e0SAlex Elder out: 6747589d30e0SAlex Elder kfree(response); 6748ecd4a68aSIlya Dryomov ceph_oid_destroy(&oid); 6749589d30e0SAlex Elder return ret; 6750589d30e0SAlex Elder } 6751589d30e0SAlex Elder 67523abef3b3SAlex Elder /* 67533abef3b3SAlex Elder * Undo whatever state changes are made by v1 or v2 header info 67543abef3b3SAlex Elder * call. 67553abef3b3SAlex Elder */ 67566fd48b3bSAlex Elder static void rbd_dev_unprobe(struct rbd_device *rbd_dev) 67576fd48b3bSAlex Elder { 67586fd48b3bSAlex Elder struct rbd_image_header *header; 67596fd48b3bSAlex Elder 6760a2acd00eSAlex Elder rbd_dev_parent_put(rbd_dev); 676122e8bd51SIlya Dryomov rbd_object_map_free(rbd_dev); 6762da5ef6beSIlya Dryomov rbd_dev_mapping_clear(rbd_dev); 67636fd48b3bSAlex Elder 67646fd48b3bSAlex Elder /* Free dynamic fields from the header, then zero it out */ 67656fd48b3bSAlex Elder 67666fd48b3bSAlex Elder header = &rbd_dev->header; 6767812164f8SAlex Elder ceph_put_snap_context(header->snapc); 67686fd48b3bSAlex Elder kfree(header->snap_sizes); 67696fd48b3bSAlex Elder kfree(header->snap_names); 67706fd48b3bSAlex Elder kfree(header->object_prefix); 67716fd48b3bSAlex Elder memset(header, 0, sizeof (*header)); 67726fd48b3bSAlex Elder } 67736fd48b3bSAlex Elder 67742df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev) 6775a30b71b9SAlex Elder { 6776a30b71b9SAlex Elder int ret; 6777a30b71b9SAlex Elder 67781e130199SAlex Elder ret = rbd_dev_v2_object_prefix(rbd_dev); 677957385b51SAlex Elder if (ret) 67801e130199SAlex Elder goto out_err; 6781b1b5402aSAlex Elder 67822df3fac7SAlex Elder /* 67832df3fac7SAlex Elder * Get the and check features for the image. Currently the 67842df3fac7SAlex Elder * features are assumed to never change. 67852df3fac7SAlex Elder */ 6786b1b5402aSAlex Elder ret = rbd_dev_v2_features(rbd_dev); 678757385b51SAlex Elder if (ret) 6788b1b5402aSAlex Elder goto out_err; 678935d489f9SAlex Elder 6790cc070d59SAlex Elder /* If the image supports fancy striping, get its parameters */ 6791cc070d59SAlex Elder 6792cc070d59SAlex Elder if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) { 6793cc070d59SAlex Elder ret = rbd_dev_v2_striping_info(rbd_dev); 6794cc070d59SAlex Elder if (ret < 0) 6795cc070d59SAlex Elder goto out_err; 6796cc070d59SAlex Elder } 6797a30b71b9SAlex Elder 67987e97332eSIlya Dryomov if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) { 67997e97332eSIlya Dryomov ret = rbd_dev_v2_data_pool(rbd_dev); 68007e97332eSIlya Dryomov if (ret) 68017e97332eSIlya Dryomov goto out_err; 68027e97332eSIlya Dryomov } 68037e97332eSIlya Dryomov 6804263423f8SIlya Dryomov rbd_init_layout(rbd_dev); 680535152979SAlex Elder return 0; 6806263423f8SIlya Dryomov 68079d475de5SAlex Elder out_err: 6808642a2537SAlex Elder rbd_dev->header.features = 0; 68091e130199SAlex Elder kfree(rbd_dev->header.object_prefix); 68101e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 68119d475de5SAlex Elder return ret; 6812a30b71b9SAlex Elder } 6813a30b71b9SAlex Elder 68146d69bb53SIlya Dryomov /* 68156d69bb53SIlya Dryomov * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() -> 68166d69bb53SIlya Dryomov * rbd_dev_image_probe() recursion depth, which means it's also the 68176d69bb53SIlya Dryomov * length of the already discovered part of the parent chain. 68186d69bb53SIlya Dryomov */ 68196d69bb53SIlya Dryomov static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth) 682083a06263SAlex Elder { 68212f82ee54SAlex Elder struct rbd_device *parent = NULL; 6822124afba2SAlex Elder int ret; 6823124afba2SAlex Elder 6824124afba2SAlex Elder if (!rbd_dev->parent_spec) 6825124afba2SAlex Elder return 0; 6826124afba2SAlex Elder 68276d69bb53SIlya Dryomov if (++depth > RBD_MAX_PARENT_CHAIN_LEN) { 68286d69bb53SIlya Dryomov pr_info("parent chain is too long (%d)\n", depth); 68296d69bb53SIlya Dryomov ret = -EINVAL; 68306d69bb53SIlya Dryomov goto out_err; 68316d69bb53SIlya Dryomov } 68326d69bb53SIlya Dryomov 68331643dfa4SIlya Dryomov parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec); 68341f2c6651SIlya Dryomov if (!parent) { 6835124afba2SAlex Elder ret = -ENOMEM; 6836124afba2SAlex Elder goto out_err; 68371f2c6651SIlya Dryomov } 68381f2c6651SIlya Dryomov 68391f2c6651SIlya Dryomov /* 68401f2c6651SIlya Dryomov * Images related by parent/child relationships always share 68411f2c6651SIlya Dryomov * rbd_client and spec/parent_spec, so bump their refcounts. 68421f2c6651SIlya Dryomov */ 68431f2c6651SIlya Dryomov __rbd_get_client(rbd_dev->rbd_client); 68441f2c6651SIlya Dryomov rbd_spec_get(rbd_dev->parent_spec); 6845124afba2SAlex Elder 68466d69bb53SIlya Dryomov ret = rbd_dev_image_probe(parent, depth); 6847124afba2SAlex Elder if (ret < 0) 6848124afba2SAlex Elder goto out_err; 68491f2c6651SIlya Dryomov 6850124afba2SAlex Elder rbd_dev->parent = parent; 6851a2acd00eSAlex Elder atomic_set(&rbd_dev->parent_ref, 1); 6852124afba2SAlex Elder return 0; 6853124afba2SAlex Elder 68541f2c6651SIlya Dryomov out_err: 68551f2c6651SIlya Dryomov rbd_dev_unparent(rbd_dev); 68561f2c6651SIlya Dryomov rbd_dev_destroy(parent); 6857124afba2SAlex Elder return ret; 6858124afba2SAlex Elder } 6859124afba2SAlex Elder 68605769ed0cSIlya Dryomov static void rbd_dev_device_release(struct rbd_device *rbd_dev) 68615769ed0cSIlya Dryomov { 68625769ed0cSIlya Dryomov clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 68635769ed0cSIlya Dryomov rbd_free_disk(rbd_dev); 68645769ed0cSIlya Dryomov if (!single_major) 68655769ed0cSIlya Dryomov unregister_blkdev(rbd_dev->major, rbd_dev->name); 68665769ed0cSIlya Dryomov } 68675769ed0cSIlya Dryomov 6868811c6688SIlya Dryomov /* 6869811c6688SIlya Dryomov * rbd_dev->header_rwsem must be locked for write and will be unlocked 6870811c6688SIlya Dryomov * upon return. 6871811c6688SIlya Dryomov */ 6872200a6a8bSAlex Elder static int rbd_dev_device_setup(struct rbd_device *rbd_dev) 6873124afba2SAlex Elder { 687483a06263SAlex Elder int ret; 687583a06263SAlex Elder 68769b60e70bSIlya Dryomov /* Record our major and minor device numbers. */ 687783a06263SAlex Elder 68789b60e70bSIlya Dryomov if (!single_major) { 687983a06263SAlex Elder ret = register_blkdev(0, rbd_dev->name); 688083a06263SAlex Elder if (ret < 0) 68811643dfa4SIlya Dryomov goto err_out_unlock; 68829b60e70bSIlya Dryomov 688383a06263SAlex Elder rbd_dev->major = ret; 6884dd82fff1SIlya Dryomov rbd_dev->minor = 0; 68859b60e70bSIlya Dryomov } else { 68869b60e70bSIlya Dryomov rbd_dev->major = rbd_major; 68879b60e70bSIlya Dryomov rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id); 68889b60e70bSIlya Dryomov } 688983a06263SAlex Elder 689083a06263SAlex Elder /* Set up the blkdev mapping. */ 689183a06263SAlex Elder 689283a06263SAlex Elder ret = rbd_init_disk(rbd_dev); 689383a06263SAlex Elder if (ret) 689483a06263SAlex Elder goto err_out_blkdev; 689583a06263SAlex Elder 6896f35a4deeSAlex Elder set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); 68979568c93eSIlya Dryomov set_disk_ro(rbd_dev->disk, rbd_dev->opts->read_only); 6898f35a4deeSAlex Elder 68995769ed0cSIlya Dryomov ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id); 6900f35a4deeSAlex Elder if (ret) 6901da5ef6beSIlya Dryomov goto err_out_disk; 690283a06263SAlex Elder 6903129b79d4SAlex Elder set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 6904811c6688SIlya Dryomov up_write(&rbd_dev->header_rwsem); 69055769ed0cSIlya Dryomov return 0; 69062f82ee54SAlex Elder 690783a06263SAlex Elder err_out_disk: 690883a06263SAlex Elder rbd_free_disk(rbd_dev); 690983a06263SAlex Elder err_out_blkdev: 69109b60e70bSIlya Dryomov if (!single_major) 691183a06263SAlex Elder unregister_blkdev(rbd_dev->major, rbd_dev->name); 6912811c6688SIlya Dryomov err_out_unlock: 6913811c6688SIlya Dryomov up_write(&rbd_dev->header_rwsem); 691483a06263SAlex Elder return ret; 691583a06263SAlex Elder } 691683a06263SAlex Elder 6917332bb12dSAlex Elder static int rbd_dev_header_name(struct rbd_device *rbd_dev) 6918332bb12dSAlex Elder { 6919332bb12dSAlex Elder struct rbd_spec *spec = rbd_dev->spec; 6920c41d13a3SIlya Dryomov int ret; 6921332bb12dSAlex Elder 6922332bb12dSAlex Elder /* Record the header object name for this rbd image. */ 6923332bb12dSAlex Elder 6924332bb12dSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 6925332bb12dSAlex Elder if (rbd_dev->image_format == 1) 6926c41d13a3SIlya Dryomov ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s", 6927332bb12dSAlex Elder spec->image_name, RBD_SUFFIX); 6928332bb12dSAlex Elder else 6929c41d13a3SIlya Dryomov ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s", 6930332bb12dSAlex Elder RBD_HEADER_PREFIX, spec->image_id); 6931c41d13a3SIlya Dryomov 6932c41d13a3SIlya Dryomov return ret; 6933332bb12dSAlex Elder } 6934332bb12dSAlex Elder 6935200a6a8bSAlex Elder static void rbd_dev_image_release(struct rbd_device *rbd_dev) 6936200a6a8bSAlex Elder { 69376fd48b3bSAlex Elder rbd_dev_unprobe(rbd_dev); 6938fd22aef8SIlya Dryomov if (rbd_dev->opts) 6939fd22aef8SIlya Dryomov rbd_unregister_watch(rbd_dev); 69406fd48b3bSAlex Elder rbd_dev->image_format = 0; 69416fd48b3bSAlex Elder kfree(rbd_dev->spec->image_id); 69426fd48b3bSAlex Elder rbd_dev->spec->image_id = NULL; 6943200a6a8bSAlex Elder } 6944200a6a8bSAlex Elder 6945a30b71b9SAlex Elder /* 6946a30b71b9SAlex Elder * Probe for the existence of the header object for the given rbd 69471f3ef788SAlex Elder * device. If this image is the one being mapped (i.e., not a 69481f3ef788SAlex Elder * parent), initiate a watch on its header object before using that 69491f3ef788SAlex Elder * object to get detailed information about the rbd image. 6950a30b71b9SAlex Elder */ 69516d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) 6952a30b71b9SAlex Elder { 6953a30b71b9SAlex Elder int ret; 6954a30b71b9SAlex Elder 6955a30b71b9SAlex Elder /* 69563abef3b3SAlex Elder * Get the id from the image id object. Unless there's an 69573abef3b3SAlex Elder * error, rbd_dev->spec->image_id will be filled in with 69583abef3b3SAlex Elder * a dynamically-allocated string, and rbd_dev->image_format 69593abef3b3SAlex Elder * will be set to either 1 or 2. 6960a30b71b9SAlex Elder */ 6961a30b71b9SAlex Elder ret = rbd_dev_image_id(rbd_dev); 6962a30b71b9SAlex Elder if (ret) 6963c0fba368SAlex Elder return ret; 6964c0fba368SAlex Elder 6965332bb12dSAlex Elder ret = rbd_dev_header_name(rbd_dev); 6966332bb12dSAlex Elder if (ret) 6967332bb12dSAlex Elder goto err_out_format; 6968332bb12dSAlex Elder 69696d69bb53SIlya Dryomov if (!depth) { 697099d16943SIlya Dryomov ret = rbd_register_watch(rbd_dev); 69711fe48023SIlya Dryomov if (ret) { 69721fe48023SIlya Dryomov if (ret == -ENOENT) 6973b26c047bSIlya Dryomov pr_info("image %s/%s%s%s does not exist\n", 69741fe48023SIlya Dryomov rbd_dev->spec->pool_name, 6975b26c047bSIlya Dryomov rbd_dev->spec->pool_ns ?: "", 6976b26c047bSIlya Dryomov rbd_dev->spec->pool_ns ? "/" : "", 69771fe48023SIlya Dryomov rbd_dev->spec->image_name); 6978c41d13a3SIlya Dryomov goto err_out_format; 69791f3ef788SAlex Elder } 69801fe48023SIlya Dryomov } 6981b644de2bSAlex Elder 6982a720ae09SIlya Dryomov ret = rbd_dev_header_info(rbd_dev); 69835655c4d9SAlex Elder if (ret) 6984b644de2bSAlex Elder goto err_out_watch; 6985a30b71b9SAlex Elder 698604077599SIlya Dryomov /* 698704077599SIlya Dryomov * If this image is the one being mapped, we have pool name and 698804077599SIlya Dryomov * id, image name and id, and snap name - need to fill snap id. 698904077599SIlya Dryomov * Otherwise this is a parent image, identified by pool, image 699004077599SIlya Dryomov * and snap ids - need to fill in names for those ids. 699104077599SIlya Dryomov */ 69926d69bb53SIlya Dryomov if (!depth) 699304077599SIlya Dryomov ret = rbd_spec_fill_snap_id(rbd_dev); 699404077599SIlya Dryomov else 699504077599SIlya Dryomov ret = rbd_spec_fill_names(rbd_dev); 69961fe48023SIlya Dryomov if (ret) { 69971fe48023SIlya Dryomov if (ret == -ENOENT) 6998b26c047bSIlya Dryomov pr_info("snap %s/%s%s%s@%s does not exist\n", 69991fe48023SIlya Dryomov rbd_dev->spec->pool_name, 7000b26c047bSIlya Dryomov rbd_dev->spec->pool_ns ?: "", 7001b26c047bSIlya Dryomov rbd_dev->spec->pool_ns ? "/" : "", 70021fe48023SIlya Dryomov rbd_dev->spec->image_name, 70031fe48023SIlya Dryomov rbd_dev->spec->snap_name); 700433dca39fSAlex Elder goto err_out_probe; 70051fe48023SIlya Dryomov } 70069bb81c9bSAlex Elder 7007da5ef6beSIlya Dryomov ret = rbd_dev_mapping_set(rbd_dev); 7008da5ef6beSIlya Dryomov if (ret) 7009da5ef6beSIlya Dryomov goto err_out_probe; 7010da5ef6beSIlya Dryomov 7011f3c0e459SIlya Dryomov if (rbd_is_snap(rbd_dev) && 701222e8bd51SIlya Dryomov (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)) { 701322e8bd51SIlya Dryomov ret = rbd_object_map_load(rbd_dev); 701422e8bd51SIlya Dryomov if (ret) 701522e8bd51SIlya Dryomov goto err_out_probe; 701622e8bd51SIlya Dryomov } 701722e8bd51SIlya Dryomov 7018e8f59b59SIlya Dryomov if (rbd_dev->header.features & RBD_FEATURE_LAYERING) { 7019e8f59b59SIlya Dryomov ret = rbd_dev_v2_parent_info(rbd_dev); 7020e8f59b59SIlya Dryomov if (ret) 7021e8f59b59SIlya Dryomov goto err_out_probe; 7022e8f59b59SIlya Dryomov } 7023e8f59b59SIlya Dryomov 70246d69bb53SIlya Dryomov ret = rbd_dev_probe_parent(rbd_dev, depth); 702530d60ba2SAlex Elder if (ret) 702630d60ba2SAlex Elder goto err_out_probe; 702783a06263SAlex Elder 702830d60ba2SAlex Elder dout("discovered format %u image, header name is %s\n", 7029c41d13a3SIlya Dryomov rbd_dev->image_format, rbd_dev->header_oid.name); 703030d60ba2SAlex Elder return 0; 7031e8f59b59SIlya Dryomov 70326fd48b3bSAlex Elder err_out_probe: 70336fd48b3bSAlex Elder rbd_dev_unprobe(rbd_dev); 7034b644de2bSAlex Elder err_out_watch: 70356d69bb53SIlya Dryomov if (!depth) 703699d16943SIlya Dryomov rbd_unregister_watch(rbd_dev); 7037332bb12dSAlex Elder err_out_format: 7038332bb12dSAlex Elder rbd_dev->image_format = 0; 70395655c4d9SAlex Elder kfree(rbd_dev->spec->image_id); 70405655c4d9SAlex Elder rbd_dev->spec->image_id = NULL; 70415655c4d9SAlex Elder return ret; 704283a06263SAlex Elder } 704383a06263SAlex Elder 70449b60e70bSIlya Dryomov static ssize_t do_rbd_add(struct bus_type *bus, 704559c2be1eSYehuda Sadeh const char *buf, 704659c2be1eSYehuda Sadeh size_t count) 7047602adf40SYehuda Sadeh { 7048cb8627c7SAlex Elder struct rbd_device *rbd_dev = NULL; 7049dc79b113SAlex Elder struct ceph_options *ceph_opts = NULL; 70504e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL; 7051859c31dfSAlex Elder struct rbd_spec *spec = NULL; 70529d3997fdSAlex Elder struct rbd_client *rbdc; 7053b51c83c2SIlya Dryomov int rc; 7054602adf40SYehuda Sadeh 7055602adf40SYehuda Sadeh if (!try_module_get(THIS_MODULE)) 7056602adf40SYehuda Sadeh return -ENODEV; 7057602adf40SYehuda Sadeh 7058a725f65eSAlex Elder /* parse add command */ 7059859c31dfSAlex Elder rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec); 7060dc79b113SAlex Elder if (rc < 0) 7061dd5ac32dSIlya Dryomov goto out; 7062a725f65eSAlex Elder 70639d3997fdSAlex Elder rbdc = rbd_get_client(ceph_opts); 70649d3997fdSAlex Elder if (IS_ERR(rbdc)) { 70659d3997fdSAlex Elder rc = PTR_ERR(rbdc); 70660ddebc0cSAlex Elder goto err_out_args; 70679d3997fdSAlex Elder } 7068602adf40SYehuda Sadeh 7069602adf40SYehuda Sadeh /* pick the pool */ 7070dd435855SIlya Dryomov rc = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, spec->pool_name); 70711fe48023SIlya Dryomov if (rc < 0) { 70721fe48023SIlya Dryomov if (rc == -ENOENT) 70731fe48023SIlya Dryomov pr_info("pool %s does not exist\n", spec->pool_name); 7074602adf40SYehuda Sadeh goto err_out_client; 70751fe48023SIlya Dryomov } 7076859c31dfSAlex Elder spec->pool_id = (u64)rc; 7077859c31dfSAlex Elder 7078d147543dSIlya Dryomov rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts); 7079b51c83c2SIlya Dryomov if (!rbd_dev) { 7080b51c83c2SIlya Dryomov rc = -ENOMEM; 7081bd4ba655SAlex Elder goto err_out_client; 7082b51c83c2SIlya Dryomov } 7083c53d5893SAlex Elder rbdc = NULL; /* rbd_dev now owns this */ 7084c53d5893SAlex Elder spec = NULL; /* rbd_dev now owns this */ 7085d147543dSIlya Dryomov rbd_opts = NULL; /* rbd_dev now owns this */ 7086602adf40SYehuda Sadeh 70870d6d1e9cSMike Christie rbd_dev->config_info = kstrdup(buf, GFP_KERNEL); 70880d6d1e9cSMike Christie if (!rbd_dev->config_info) { 70890d6d1e9cSMike Christie rc = -ENOMEM; 70900d6d1e9cSMike Christie goto err_out_rbd_dev; 70910d6d1e9cSMike Christie } 70920d6d1e9cSMike Christie 7093811c6688SIlya Dryomov down_write(&rbd_dev->header_rwsem); 70946d69bb53SIlya Dryomov rc = rbd_dev_image_probe(rbd_dev, 0); 70950d6d1e9cSMike Christie if (rc < 0) { 70960d6d1e9cSMike Christie up_write(&rbd_dev->header_rwsem); 7097c53d5893SAlex Elder goto err_out_rbd_dev; 70980d6d1e9cSMike Christie } 709905fd6f6fSAlex Elder 71007ce4eef7SAlex Elder /* If we are mapping a snapshot it must be marked read-only */ 7101f3c0e459SIlya Dryomov if (rbd_is_snap(rbd_dev)) 71029568c93eSIlya Dryomov rbd_dev->opts->read_only = true; 71037ce4eef7SAlex Elder 71040c93e1b7SIlya Dryomov if (rbd_dev->opts->alloc_size > rbd_dev->layout.object_size) { 71050c93e1b7SIlya Dryomov rbd_warn(rbd_dev, "alloc_size adjusted to %u", 71060c93e1b7SIlya Dryomov rbd_dev->layout.object_size); 71070c93e1b7SIlya Dryomov rbd_dev->opts->alloc_size = rbd_dev->layout.object_size; 71080c93e1b7SIlya Dryomov } 71090c93e1b7SIlya Dryomov 7110b536f69aSAlex Elder rc = rbd_dev_device_setup(rbd_dev); 7111fd22aef8SIlya Dryomov if (rc) 71128b679ec5SIlya Dryomov goto err_out_image_probe; 71133abef3b3SAlex Elder 7114e010dd0aSIlya Dryomov rc = rbd_add_acquire_lock(rbd_dev); 7115e010dd0aSIlya Dryomov if (rc) 7116637cd060SIlya Dryomov goto err_out_image_lock; 7117b536f69aSAlex Elder 71185769ed0cSIlya Dryomov /* Everything's ready. Announce the disk to the world. */ 71195769ed0cSIlya Dryomov 71205769ed0cSIlya Dryomov rc = device_add(&rbd_dev->dev); 71215769ed0cSIlya Dryomov if (rc) 7122e010dd0aSIlya Dryomov goto err_out_image_lock; 71235769ed0cSIlya Dryomov 71245769ed0cSIlya Dryomov add_disk(rbd_dev->disk); 71255769ed0cSIlya Dryomov /* see rbd_init_disk() */ 71265769ed0cSIlya Dryomov blk_put_queue(rbd_dev->disk->queue); 71275769ed0cSIlya Dryomov 71285769ed0cSIlya Dryomov spin_lock(&rbd_dev_list_lock); 71295769ed0cSIlya Dryomov list_add_tail(&rbd_dev->node, &rbd_dev_list); 71305769ed0cSIlya Dryomov spin_unlock(&rbd_dev_list_lock); 71315769ed0cSIlya Dryomov 71325769ed0cSIlya Dryomov pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name, 71335769ed0cSIlya Dryomov (unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT, 71345769ed0cSIlya Dryomov rbd_dev->header.features); 7135dd5ac32dSIlya Dryomov rc = count; 7136dd5ac32dSIlya Dryomov out: 7137dd5ac32dSIlya Dryomov module_put(THIS_MODULE); 7138dd5ac32dSIlya Dryomov return rc; 7139b536f69aSAlex Elder 7140e010dd0aSIlya Dryomov err_out_image_lock: 7141e010dd0aSIlya Dryomov rbd_dev_image_unlock(rbd_dev); 71425769ed0cSIlya Dryomov rbd_dev_device_release(rbd_dev); 71438b679ec5SIlya Dryomov err_out_image_probe: 71448b679ec5SIlya Dryomov rbd_dev_image_release(rbd_dev); 7145c53d5893SAlex Elder err_out_rbd_dev: 7146c53d5893SAlex Elder rbd_dev_destroy(rbd_dev); 7147bd4ba655SAlex Elder err_out_client: 71489d3997fdSAlex Elder rbd_put_client(rbdc); 71490ddebc0cSAlex Elder err_out_args: 7150859c31dfSAlex Elder rbd_spec_put(spec); 7151d147543dSIlya Dryomov kfree(rbd_opts); 7152dd5ac32dSIlya Dryomov goto out; 7153602adf40SYehuda Sadeh } 7154602adf40SYehuda Sadeh 71557e9586baSGreg Kroah-Hartman static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count) 71569b60e70bSIlya Dryomov { 71579b60e70bSIlya Dryomov if (single_major) 71589b60e70bSIlya Dryomov return -EINVAL; 71599b60e70bSIlya Dryomov 71609b60e70bSIlya Dryomov return do_rbd_add(bus, buf, count); 71619b60e70bSIlya Dryomov } 71629b60e70bSIlya Dryomov 71637e9586baSGreg Kroah-Hartman static ssize_t add_single_major_store(struct bus_type *bus, const char *buf, 71649b60e70bSIlya Dryomov size_t count) 71659b60e70bSIlya Dryomov { 71669b60e70bSIlya Dryomov return do_rbd_add(bus, buf, count); 71679b60e70bSIlya Dryomov } 71689b60e70bSIlya Dryomov 716905a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev) 717005a46afdSAlex Elder { 7171ad945fc1SAlex Elder while (rbd_dev->parent) { 717205a46afdSAlex Elder struct rbd_device *first = rbd_dev; 717305a46afdSAlex Elder struct rbd_device *second = first->parent; 717405a46afdSAlex Elder struct rbd_device *third; 717505a46afdSAlex Elder 717605a46afdSAlex Elder /* 717705a46afdSAlex Elder * Follow to the parent with no grandparent and 717805a46afdSAlex Elder * remove it. 717905a46afdSAlex Elder */ 718005a46afdSAlex Elder while (second && (third = second->parent)) { 718105a46afdSAlex Elder first = second; 718205a46afdSAlex Elder second = third; 718305a46afdSAlex Elder } 7184ad945fc1SAlex Elder rbd_assert(second); 71858ad42cd0SAlex Elder rbd_dev_image_release(second); 71868b679ec5SIlya Dryomov rbd_dev_destroy(second); 7187ad945fc1SAlex Elder first->parent = NULL; 7188ad945fc1SAlex Elder first->parent_overlap = 0; 7189ad945fc1SAlex Elder 7190ad945fc1SAlex Elder rbd_assert(first->parent_spec); 719105a46afdSAlex Elder rbd_spec_put(first->parent_spec); 719205a46afdSAlex Elder first->parent_spec = NULL; 719305a46afdSAlex Elder } 719405a46afdSAlex Elder } 719505a46afdSAlex Elder 71969b60e70bSIlya Dryomov static ssize_t do_rbd_remove(struct bus_type *bus, 7197602adf40SYehuda Sadeh const char *buf, 7198602adf40SYehuda Sadeh size_t count) 7199602adf40SYehuda Sadeh { 7200602adf40SYehuda Sadeh struct rbd_device *rbd_dev = NULL; 7201751cc0e3SAlex Elder struct list_head *tmp; 7202751cc0e3SAlex Elder int dev_id; 72030276dca6SMike Christie char opt_buf[6]; 72040276dca6SMike Christie bool force = false; 72050d8189e1SAlex Elder int ret; 7206602adf40SYehuda Sadeh 72070276dca6SMike Christie dev_id = -1; 72080276dca6SMike Christie opt_buf[0] = '\0'; 72090276dca6SMike Christie sscanf(buf, "%d %5s", &dev_id, opt_buf); 72100276dca6SMike Christie if (dev_id < 0) { 72110276dca6SMike Christie pr_err("dev_id out of range\n"); 7212602adf40SYehuda Sadeh return -EINVAL; 72130276dca6SMike Christie } 72140276dca6SMike Christie if (opt_buf[0] != '\0') { 72150276dca6SMike Christie if (!strcmp(opt_buf, "force")) { 72160276dca6SMike Christie force = true; 72170276dca6SMike Christie } else { 72180276dca6SMike Christie pr_err("bad remove option at '%s'\n", opt_buf); 72190276dca6SMike Christie return -EINVAL; 72200276dca6SMike Christie } 72210276dca6SMike Christie } 7222602adf40SYehuda Sadeh 7223602adf40SYehuda Sadeh ret = -ENOENT; 7224751cc0e3SAlex Elder spin_lock(&rbd_dev_list_lock); 7225751cc0e3SAlex Elder list_for_each(tmp, &rbd_dev_list) { 7226751cc0e3SAlex Elder rbd_dev = list_entry(tmp, struct rbd_device, node); 7227751cc0e3SAlex Elder if (rbd_dev->dev_id == dev_id) { 7228751cc0e3SAlex Elder ret = 0; 7229751cc0e3SAlex Elder break; 7230602adf40SYehuda Sadeh } 7231751cc0e3SAlex Elder } 7232751cc0e3SAlex Elder if (!ret) { 7233a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 72340276dca6SMike Christie if (rbd_dev->open_count && !force) 723542382b70SAlex Elder ret = -EBUSY; 723685f5a4d6SIlya Dryomov else if (test_and_set_bit(RBD_DEV_FLAG_REMOVING, 723785f5a4d6SIlya Dryomov &rbd_dev->flags)) 723885f5a4d6SIlya Dryomov ret = -EINPROGRESS; 7239a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 7240751cc0e3SAlex Elder } 7241751cc0e3SAlex Elder spin_unlock(&rbd_dev_list_lock); 724285f5a4d6SIlya Dryomov if (ret) 72431ba0f1e7SAlex Elder return ret; 7244751cc0e3SAlex Elder 72450276dca6SMike Christie if (force) { 72460276dca6SMike Christie /* 72470276dca6SMike Christie * Prevent new IO from being queued and wait for existing 72480276dca6SMike Christie * IO to complete/fail. 72490276dca6SMike Christie */ 72500276dca6SMike Christie blk_mq_freeze_queue(rbd_dev->disk->queue); 72510276dca6SMike Christie blk_set_queue_dying(rbd_dev->disk->queue); 72520276dca6SMike Christie } 72530276dca6SMike Christie 72545769ed0cSIlya Dryomov del_gendisk(rbd_dev->disk); 72555769ed0cSIlya Dryomov spin_lock(&rbd_dev_list_lock); 72565769ed0cSIlya Dryomov list_del_init(&rbd_dev->node); 72575769ed0cSIlya Dryomov spin_unlock(&rbd_dev_list_lock); 72585769ed0cSIlya Dryomov device_del(&rbd_dev->dev); 7259fca27065SIlya Dryomov 7260e010dd0aSIlya Dryomov rbd_dev_image_unlock(rbd_dev); 7261dd5ac32dSIlya Dryomov rbd_dev_device_release(rbd_dev); 72628ad42cd0SAlex Elder rbd_dev_image_release(rbd_dev); 72638b679ec5SIlya Dryomov rbd_dev_destroy(rbd_dev); 72641ba0f1e7SAlex Elder return count; 7265602adf40SYehuda Sadeh } 7266602adf40SYehuda Sadeh 72677e9586baSGreg Kroah-Hartman static ssize_t remove_store(struct bus_type *bus, const char *buf, size_t count) 72689b60e70bSIlya Dryomov { 72699b60e70bSIlya Dryomov if (single_major) 72709b60e70bSIlya Dryomov return -EINVAL; 72719b60e70bSIlya Dryomov 72729b60e70bSIlya Dryomov return do_rbd_remove(bus, buf, count); 72739b60e70bSIlya Dryomov } 72749b60e70bSIlya Dryomov 72757e9586baSGreg Kroah-Hartman static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf, 72769b60e70bSIlya Dryomov size_t count) 72779b60e70bSIlya Dryomov { 72789b60e70bSIlya Dryomov return do_rbd_remove(bus, buf, count); 72799b60e70bSIlya Dryomov } 72809b60e70bSIlya Dryomov 7281602adf40SYehuda Sadeh /* 7282602adf40SYehuda Sadeh * create control files in sysfs 7283dfc5606dSYehuda Sadeh * /sys/bus/rbd/... 7284602adf40SYehuda Sadeh */ 72857d8dc534SChengguang Xu static int __init rbd_sysfs_init(void) 7286602adf40SYehuda Sadeh { 7287dfc5606dSYehuda Sadeh int ret; 7288602adf40SYehuda Sadeh 7289fed4c143SAlex Elder ret = device_register(&rbd_root_dev); 7290dfc5606dSYehuda Sadeh if (ret < 0) 7291dfc5606dSYehuda Sadeh return ret; 7292602adf40SYehuda Sadeh 7293fed4c143SAlex Elder ret = bus_register(&rbd_bus_type); 7294fed4c143SAlex Elder if (ret < 0) 7295fed4c143SAlex Elder device_unregister(&rbd_root_dev); 7296602adf40SYehuda Sadeh 7297602adf40SYehuda Sadeh return ret; 7298602adf40SYehuda Sadeh } 7299602adf40SYehuda Sadeh 73007d8dc534SChengguang Xu static void __exit rbd_sysfs_cleanup(void) 7301602adf40SYehuda Sadeh { 7302dfc5606dSYehuda Sadeh bus_unregister(&rbd_bus_type); 7303fed4c143SAlex Elder device_unregister(&rbd_root_dev); 7304602adf40SYehuda Sadeh } 7305602adf40SYehuda Sadeh 73067d8dc534SChengguang Xu static int __init rbd_slab_init(void) 73071c2a9dfeSAlex Elder { 73081c2a9dfeSAlex Elder rbd_assert(!rbd_img_request_cache); 730903d94406SGeliang Tang rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0); 7310868311b1SAlex Elder if (!rbd_img_request_cache) 7311868311b1SAlex Elder return -ENOMEM; 7312868311b1SAlex Elder 7313868311b1SAlex Elder rbd_assert(!rbd_obj_request_cache); 731403d94406SGeliang Tang rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0); 731578c2a44aSAlex Elder if (!rbd_obj_request_cache) 731678c2a44aSAlex Elder goto out_err; 731778c2a44aSAlex Elder 73181c2a9dfeSAlex Elder return 0; 73191c2a9dfeSAlex Elder 73206c696d85SIlya Dryomov out_err: 7321868311b1SAlex Elder kmem_cache_destroy(rbd_img_request_cache); 7322868311b1SAlex Elder rbd_img_request_cache = NULL; 73231c2a9dfeSAlex Elder return -ENOMEM; 73241c2a9dfeSAlex Elder } 73251c2a9dfeSAlex Elder 73261c2a9dfeSAlex Elder static void rbd_slab_exit(void) 73271c2a9dfeSAlex Elder { 7328868311b1SAlex Elder rbd_assert(rbd_obj_request_cache); 7329868311b1SAlex Elder kmem_cache_destroy(rbd_obj_request_cache); 7330868311b1SAlex Elder rbd_obj_request_cache = NULL; 7331868311b1SAlex Elder 73321c2a9dfeSAlex Elder rbd_assert(rbd_img_request_cache); 73331c2a9dfeSAlex Elder kmem_cache_destroy(rbd_img_request_cache); 73341c2a9dfeSAlex Elder rbd_img_request_cache = NULL; 73351c2a9dfeSAlex Elder } 73361c2a9dfeSAlex Elder 7337cc344fa1SAlex Elder static int __init rbd_init(void) 7338602adf40SYehuda Sadeh { 7339602adf40SYehuda Sadeh int rc; 7340602adf40SYehuda Sadeh 73411e32d34cSAlex Elder if (!libceph_compatible(NULL)) { 73421e32d34cSAlex Elder rbd_warn(NULL, "libceph incompatibility (quitting)"); 73431e32d34cSAlex Elder return -EINVAL; 73441e32d34cSAlex Elder } 7345e1b4d96dSIlya Dryomov 73461c2a9dfeSAlex Elder rc = rbd_slab_init(); 7347602adf40SYehuda Sadeh if (rc) 7348602adf40SYehuda Sadeh return rc; 7349e1b4d96dSIlya Dryomov 7350f5ee37bdSIlya Dryomov /* 7351f5ee37bdSIlya Dryomov * The number of active work items is limited by the number of 7352f77303bdSIlya Dryomov * rbd devices * queue depth, so leave @max_active at default. 7353f5ee37bdSIlya Dryomov */ 7354f5ee37bdSIlya Dryomov rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0); 7355f5ee37bdSIlya Dryomov if (!rbd_wq) { 7356f5ee37bdSIlya Dryomov rc = -ENOMEM; 7357f5ee37bdSIlya Dryomov goto err_out_slab; 7358f5ee37bdSIlya Dryomov } 7359f5ee37bdSIlya Dryomov 73609b60e70bSIlya Dryomov if (single_major) { 73619b60e70bSIlya Dryomov rbd_major = register_blkdev(0, RBD_DRV_NAME); 73629b60e70bSIlya Dryomov if (rbd_major < 0) { 73639b60e70bSIlya Dryomov rc = rbd_major; 7364f5ee37bdSIlya Dryomov goto err_out_wq; 73659b60e70bSIlya Dryomov } 73669b60e70bSIlya Dryomov } 73679b60e70bSIlya Dryomov 73681c2a9dfeSAlex Elder rc = rbd_sysfs_init(); 73691c2a9dfeSAlex Elder if (rc) 73709b60e70bSIlya Dryomov goto err_out_blkdev; 73711c2a9dfeSAlex Elder 73729b60e70bSIlya Dryomov if (single_major) 73739b60e70bSIlya Dryomov pr_info("loaded (major %d)\n", rbd_major); 73749b60e70bSIlya Dryomov else 7375e1b4d96dSIlya Dryomov pr_info("loaded\n"); 73769b60e70bSIlya Dryomov 7377e1b4d96dSIlya Dryomov return 0; 7378e1b4d96dSIlya Dryomov 73799b60e70bSIlya Dryomov err_out_blkdev: 73809b60e70bSIlya Dryomov if (single_major) 73819b60e70bSIlya Dryomov unregister_blkdev(rbd_major, RBD_DRV_NAME); 7382f5ee37bdSIlya Dryomov err_out_wq: 7383f5ee37bdSIlya Dryomov destroy_workqueue(rbd_wq); 7384e1b4d96dSIlya Dryomov err_out_slab: 7385e1b4d96dSIlya Dryomov rbd_slab_exit(); 73861c2a9dfeSAlex Elder return rc; 7387602adf40SYehuda Sadeh } 7388602adf40SYehuda Sadeh 7389cc344fa1SAlex Elder static void __exit rbd_exit(void) 7390602adf40SYehuda Sadeh { 7391ffe312cfSIlya Dryomov ida_destroy(&rbd_dev_id_ida); 7392602adf40SYehuda Sadeh rbd_sysfs_cleanup(); 73939b60e70bSIlya Dryomov if (single_major) 73949b60e70bSIlya Dryomov unregister_blkdev(rbd_major, RBD_DRV_NAME); 7395f5ee37bdSIlya Dryomov destroy_workqueue(rbd_wq); 73961c2a9dfeSAlex Elder rbd_slab_exit(); 7397602adf40SYehuda Sadeh } 7398602adf40SYehuda Sadeh 7399602adf40SYehuda Sadeh module_init(rbd_init); 7400602adf40SYehuda Sadeh module_exit(rbd_exit); 7401602adf40SYehuda Sadeh 7402d552c619SAlex Elder MODULE_AUTHOR("Alex Elder <elder@inktank.com>"); 7403602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); 7404602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); 7405602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */ 7406602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>"); 7407602adf40SYehuda Sadeh 740890da258bSIlya Dryomov MODULE_DESCRIPTION("RADOS Block Device (RBD) driver"); 7409602adf40SYehuda Sadeh MODULE_LICENSE("GPL"); 7410