1e2a58ee5SAlex Elder 2602adf40SYehuda Sadeh /* 3602adf40SYehuda Sadeh rbd.c -- Export ceph rados objects as a Linux block device 4602adf40SYehuda Sadeh 5602adf40SYehuda Sadeh 6602adf40SYehuda Sadeh based on drivers/block/osdblk.c: 7602adf40SYehuda Sadeh 8602adf40SYehuda Sadeh Copyright 2009 Red Hat, Inc. 9602adf40SYehuda Sadeh 10602adf40SYehuda Sadeh This program is free software; you can redistribute it and/or modify 11602adf40SYehuda Sadeh it under the terms of the GNU General Public License as published by 12602adf40SYehuda Sadeh the Free Software Foundation. 13602adf40SYehuda Sadeh 14602adf40SYehuda Sadeh This program is distributed in the hope that it will be useful, 15602adf40SYehuda Sadeh but WITHOUT ANY WARRANTY; without even the implied warranty of 16602adf40SYehuda Sadeh MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17602adf40SYehuda Sadeh GNU General Public License for more details. 18602adf40SYehuda Sadeh 19602adf40SYehuda Sadeh You should have received a copy of the GNU General Public License 20602adf40SYehuda Sadeh along with this program; see the file COPYING. If not, write to 21602adf40SYehuda Sadeh the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 22602adf40SYehuda Sadeh 23602adf40SYehuda Sadeh 24602adf40SYehuda Sadeh 25dfc5606dSYehuda Sadeh For usage instructions, please refer to: 26602adf40SYehuda Sadeh 27dfc5606dSYehuda Sadeh Documentation/ABI/testing/sysfs-bus-rbd 28602adf40SYehuda Sadeh 29602adf40SYehuda Sadeh */ 30602adf40SYehuda Sadeh 31602adf40SYehuda Sadeh #include <linux/ceph/libceph.h> 32602adf40SYehuda Sadeh #include <linux/ceph/osd_client.h> 33602adf40SYehuda Sadeh #include <linux/ceph/mon_client.h> 34ed95b21aSIlya Dryomov #include <linux/ceph/cls_lock_client.h> 3543df3d35SIlya Dryomov #include <linux/ceph/striper.h> 36602adf40SYehuda Sadeh #include <linux/ceph/decode.h> 3759c2be1eSYehuda Sadeh #include <linux/parser.h> 3830d1cff8SAlex Elder #include <linux/bsearch.h> 39602adf40SYehuda Sadeh 40602adf40SYehuda Sadeh #include <linux/kernel.h> 41602adf40SYehuda Sadeh #include <linux/device.h> 42602adf40SYehuda Sadeh #include <linux/module.h> 437ad18afaSChristoph Hellwig #include <linux/blk-mq.h> 44602adf40SYehuda Sadeh #include <linux/fs.h> 45602adf40SYehuda Sadeh #include <linux/blkdev.h> 461c2a9dfeSAlex Elder #include <linux/slab.h> 47f8a22fc2SIlya Dryomov #include <linux/idr.h> 48bc1ecc65SIlya Dryomov #include <linux/workqueue.h> 49602adf40SYehuda Sadeh 50602adf40SYehuda Sadeh #include "rbd_types.h" 51602adf40SYehuda Sadeh 52aafb230eSAlex Elder #define RBD_DEBUG /* Activate rbd_assert() calls */ 53aafb230eSAlex Elder 54593a9e7bSAlex Elder /* 55a2acd00eSAlex Elder * Increment the given counter and return its updated value. 56a2acd00eSAlex Elder * If the counter is already 0 it will not be incremented. 57a2acd00eSAlex Elder * If the counter is already at its maximum value returns 58a2acd00eSAlex Elder * -EINVAL without updating it. 59a2acd00eSAlex Elder */ 60a2acd00eSAlex Elder static int atomic_inc_return_safe(atomic_t *v) 61a2acd00eSAlex Elder { 62a2acd00eSAlex Elder unsigned int counter; 63a2acd00eSAlex Elder 64bfc18e38SMark Rutland counter = (unsigned int)atomic_fetch_add_unless(v, 1, 0); 65a2acd00eSAlex Elder if (counter <= (unsigned int)INT_MAX) 66a2acd00eSAlex Elder return (int)counter; 67a2acd00eSAlex Elder 68a2acd00eSAlex Elder atomic_dec(v); 69a2acd00eSAlex Elder 70a2acd00eSAlex Elder return -EINVAL; 71a2acd00eSAlex Elder } 72a2acd00eSAlex Elder 73a2acd00eSAlex Elder /* Decrement the counter. Return the resulting value, or -EINVAL */ 74a2acd00eSAlex Elder static int atomic_dec_return_safe(atomic_t *v) 75a2acd00eSAlex Elder { 76a2acd00eSAlex Elder int counter; 77a2acd00eSAlex Elder 78a2acd00eSAlex Elder counter = atomic_dec_return(v); 79a2acd00eSAlex Elder if (counter >= 0) 80a2acd00eSAlex Elder return counter; 81a2acd00eSAlex Elder 82a2acd00eSAlex Elder atomic_inc(v); 83a2acd00eSAlex Elder 84a2acd00eSAlex Elder return -EINVAL; 85a2acd00eSAlex Elder } 86a2acd00eSAlex Elder 87f0f8cef5SAlex Elder #define RBD_DRV_NAME "rbd" 88602adf40SYehuda Sadeh 897e513d43SIlya Dryomov #define RBD_MINORS_PER_MAJOR 256 907e513d43SIlya Dryomov #define RBD_SINGLE_MAJOR_PART_SHIFT 4 91602adf40SYehuda Sadeh 926d69bb53SIlya Dryomov #define RBD_MAX_PARENT_CHAIN_LEN 16 936d69bb53SIlya Dryomov 94d4b125e9SAlex Elder #define RBD_SNAP_DEV_NAME_PREFIX "snap_" 95d4b125e9SAlex Elder #define RBD_MAX_SNAP_NAME_LEN \ 96d4b125e9SAlex Elder (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1)) 97d4b125e9SAlex Elder 9835d489f9SAlex Elder #define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ 99602adf40SYehuda Sadeh 100602adf40SYehuda Sadeh #define RBD_SNAP_HEAD_NAME "-" 101602adf40SYehuda Sadeh 1029682fc6dSAlex Elder #define BAD_SNAP_INDEX U32_MAX /* invalid index into snap array */ 1039682fc6dSAlex Elder 1049e15b77dSAlex Elder /* This allows a single page to hold an image name sent by OSD */ 1059e15b77dSAlex Elder #define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1) 106589d30e0SAlex Elder #define RBD_IMAGE_ID_LEN_MAX 64 1079e15b77dSAlex Elder 1081e130199SAlex Elder #define RBD_OBJ_PREFIX_LEN_MAX 64 109589d30e0SAlex Elder 110ed95b21aSIlya Dryomov #define RBD_NOTIFY_TIMEOUT 5 /* seconds */ 11199d16943SIlya Dryomov #define RBD_RETRY_DELAY msecs_to_jiffies(1000) 11299d16943SIlya Dryomov 113d889140cSAlex Elder /* Feature bits */ 114d889140cSAlex Elder 1158767b293SIlya Dryomov #define RBD_FEATURE_LAYERING (1ULL<<0) 1168767b293SIlya Dryomov #define RBD_FEATURE_STRIPINGV2 (1ULL<<1) 1178767b293SIlya Dryomov #define RBD_FEATURE_EXCLUSIVE_LOCK (1ULL<<2) 1188767b293SIlya Dryomov #define RBD_FEATURE_DATA_POOL (1ULL<<7) 119e573427aSIlya Dryomov #define RBD_FEATURE_OPERATIONS (1ULL<<8) 1208767b293SIlya Dryomov 121ed95b21aSIlya Dryomov #define RBD_FEATURES_ALL (RBD_FEATURE_LAYERING | \ 122ed95b21aSIlya Dryomov RBD_FEATURE_STRIPINGV2 | \ 1237e97332eSIlya Dryomov RBD_FEATURE_EXCLUSIVE_LOCK | \ 124e573427aSIlya Dryomov RBD_FEATURE_DATA_POOL | \ 125e573427aSIlya Dryomov RBD_FEATURE_OPERATIONS) 126d889140cSAlex Elder 127d889140cSAlex Elder /* Features supported by this (client software) implementation. */ 128d889140cSAlex Elder 129770eba6eSAlex Elder #define RBD_FEATURES_SUPPORTED (RBD_FEATURES_ALL) 130d889140cSAlex Elder 13181a89793SAlex Elder /* 13281a89793SAlex Elder * An RBD device name will be "rbd#", where the "rbd" comes from 13381a89793SAlex Elder * RBD_DRV_NAME above, and # is a unique integer identifier. 13481a89793SAlex Elder */ 135602adf40SYehuda Sadeh #define DEV_NAME_LEN 32 136602adf40SYehuda Sadeh 137602adf40SYehuda Sadeh /* 138602adf40SYehuda Sadeh * block device image metadata (in-memory version) 139602adf40SYehuda Sadeh */ 140602adf40SYehuda Sadeh struct rbd_image_header { 141f35a4deeSAlex Elder /* These six fields never change for a given rbd image */ 142849b4260SAlex Elder char *object_prefix; 143602adf40SYehuda Sadeh __u8 obj_order; 144f35a4deeSAlex Elder u64 stripe_unit; 145f35a4deeSAlex Elder u64 stripe_count; 1467e97332eSIlya Dryomov s64 data_pool_id; 147f35a4deeSAlex Elder u64 features; /* Might be changeable someday? */ 148602adf40SYehuda Sadeh 149f84344f3SAlex Elder /* The remaining fields need to be updated occasionally */ 150f84344f3SAlex Elder u64 image_size; 151f84344f3SAlex Elder struct ceph_snap_context *snapc; 152f35a4deeSAlex Elder char *snap_names; /* format 1 only */ 153f35a4deeSAlex Elder u64 *snap_sizes; /* format 1 only */ 15459c2be1eSYehuda Sadeh }; 15559c2be1eSYehuda Sadeh 1560d7dbfceSAlex Elder /* 1570d7dbfceSAlex Elder * An rbd image specification. 1580d7dbfceSAlex Elder * 1590d7dbfceSAlex Elder * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely 160c66c6e0cSAlex Elder * identify an image. Each rbd_dev structure includes a pointer to 161c66c6e0cSAlex Elder * an rbd_spec structure that encapsulates this identity. 162c66c6e0cSAlex Elder * 163c66c6e0cSAlex Elder * Each of the id's in an rbd_spec has an associated name. For a 164c66c6e0cSAlex Elder * user-mapped image, the names are supplied and the id's associated 165c66c6e0cSAlex Elder * with them are looked up. For a layered image, a parent image is 166c66c6e0cSAlex Elder * defined by the tuple, and the names are looked up. 167c66c6e0cSAlex Elder * 168c66c6e0cSAlex Elder * An rbd_dev structure contains a parent_spec pointer which is 169c66c6e0cSAlex Elder * non-null if the image it represents is a child in a layered 170c66c6e0cSAlex Elder * image. This pointer will refer to the rbd_spec structure used 171c66c6e0cSAlex Elder * by the parent rbd_dev for its own identity (i.e., the structure 172c66c6e0cSAlex Elder * is shared between the parent and child). 173c66c6e0cSAlex Elder * 174c66c6e0cSAlex Elder * Since these structures are populated once, during the discovery 175c66c6e0cSAlex Elder * phase of image construction, they are effectively immutable so 176c66c6e0cSAlex Elder * we make no effort to synchronize access to them. 177c66c6e0cSAlex Elder * 178c66c6e0cSAlex Elder * Note that code herein does not assume the image name is known (it 179c66c6e0cSAlex Elder * could be a null pointer). 1800d7dbfceSAlex Elder */ 1810d7dbfceSAlex Elder struct rbd_spec { 1820d7dbfceSAlex Elder u64 pool_id; 183ecb4dc22SAlex Elder const char *pool_name; 184b26c047bSIlya Dryomov const char *pool_ns; /* NULL if default, never "" */ 1850d7dbfceSAlex Elder 186ecb4dc22SAlex Elder const char *image_id; 187ecb4dc22SAlex Elder const char *image_name; 1880d7dbfceSAlex Elder 1890d7dbfceSAlex Elder u64 snap_id; 190ecb4dc22SAlex Elder const char *snap_name; 1910d7dbfceSAlex Elder 1920d7dbfceSAlex Elder struct kref kref; 1930d7dbfceSAlex Elder }; 1940d7dbfceSAlex Elder 195602adf40SYehuda Sadeh /* 196f0f8cef5SAlex Elder * an instance of the client. multiple devices may share an rbd client. 197602adf40SYehuda Sadeh */ 198602adf40SYehuda Sadeh struct rbd_client { 199602adf40SYehuda Sadeh struct ceph_client *client; 200602adf40SYehuda Sadeh struct kref kref; 201602adf40SYehuda Sadeh struct list_head node; 202602adf40SYehuda Sadeh }; 203602adf40SYehuda Sadeh 204bf0d5f50SAlex Elder struct rbd_img_request; 205bf0d5f50SAlex Elder 2069969ebc5SAlex Elder enum obj_request_type { 207a1fbb5e7SIlya Dryomov OBJ_REQUEST_NODATA = 1, 2085359a17dSIlya Dryomov OBJ_REQUEST_BIO, /* pointer into provided bio (list) */ 2097e07efb1SIlya Dryomov OBJ_REQUEST_BVECS, /* pointer into provided bio_vec array */ 210afb97888SIlya Dryomov OBJ_REQUEST_OWN_BVECS, /* private bio_vec array, doesn't own pages */ 2119969ebc5SAlex Elder }; 212bf0d5f50SAlex Elder 2136d2940c8SGuangliang Zhao enum obj_operation_type { 214a1fbb5e7SIlya Dryomov OBJ_OP_READ = 1, 2156d2940c8SGuangliang Zhao OBJ_OP_WRITE, 21690e98c52SGuangliang Zhao OBJ_OP_DISCARD, 2176484cbe9SIlya Dryomov OBJ_OP_ZEROOUT, 2186d2940c8SGuangliang Zhao }; 2196d2940c8SGuangliang Zhao 2203da691bfSIlya Dryomov /* 2213da691bfSIlya Dryomov * Writes go through the following state machine to deal with 2223da691bfSIlya Dryomov * layering: 2233da691bfSIlya Dryomov * 2243da691bfSIlya Dryomov * need copyup 2253da691bfSIlya Dryomov * RBD_OBJ_WRITE_GUARD ---------------> RBD_OBJ_WRITE_COPYUP 2263da691bfSIlya Dryomov * | ^ | 2273da691bfSIlya Dryomov * v \------------------------------/ 2283da691bfSIlya Dryomov * done 2293da691bfSIlya Dryomov * ^ 2303da691bfSIlya Dryomov * | 2313da691bfSIlya Dryomov * RBD_OBJ_WRITE_FLAT 2323da691bfSIlya Dryomov * 2333da691bfSIlya Dryomov * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether 2343da691bfSIlya Dryomov * there is a parent or not. 2353da691bfSIlya Dryomov */ 2363da691bfSIlya Dryomov enum rbd_obj_write_state { 2373da691bfSIlya Dryomov RBD_OBJ_WRITE_FLAT = 1, 2383da691bfSIlya Dryomov RBD_OBJ_WRITE_GUARD, 2393a482501SIlya Dryomov RBD_OBJ_WRITE_READ_FROM_PARENT, 2403a482501SIlya Dryomov RBD_OBJ_WRITE_COPYUP_OPS, 241926f9b3fSAlex Elder }; 242926f9b3fSAlex Elder 243bf0d5f50SAlex Elder struct rbd_obj_request { 24443df3d35SIlya Dryomov struct ceph_object_extent ex; 245c5b5ef6cSAlex Elder union { 2463da691bfSIlya Dryomov bool tried_parent; /* for reads */ 2473da691bfSIlya Dryomov enum rbd_obj_write_state write_state; /* for writes */ 2483da691bfSIlya Dryomov }; 249bf0d5f50SAlex Elder 250bf0d5f50SAlex Elder struct rbd_img_request *img_request; 25186bd7998SIlya Dryomov struct ceph_file_extent *img_extents; 25286bd7998SIlya Dryomov u32 num_img_extents; 253bf0d5f50SAlex Elder 254788e2df3SAlex Elder union { 2555359a17dSIlya Dryomov struct ceph_bio_iter bio_pos; 256788e2df3SAlex Elder struct { 2577e07efb1SIlya Dryomov struct ceph_bvec_iter bvec_pos; 2587e07efb1SIlya Dryomov u32 bvec_count; 259afb97888SIlya Dryomov u32 bvec_idx; 260788e2df3SAlex Elder }; 261788e2df3SAlex Elder }; 2627e07efb1SIlya Dryomov struct bio_vec *copyup_bvecs; 2637e07efb1SIlya Dryomov u32 copyup_bvec_count; 264bf0d5f50SAlex Elder 265bf0d5f50SAlex Elder struct ceph_osd_request *osd_req; 266bf0d5f50SAlex Elder 267bf0d5f50SAlex Elder u64 xferred; /* bytes transferred */ 2681b83bef2SSage Weil int result; 269bf0d5f50SAlex Elder 270bf0d5f50SAlex Elder struct kref kref; 271bf0d5f50SAlex Elder }; 272bf0d5f50SAlex Elder 2730c425248SAlex Elder enum img_req_flags { 2749849e986SAlex Elder IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */ 275d0b2e944SAlex Elder IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */ 2760c425248SAlex Elder }; 2770c425248SAlex Elder 278bf0d5f50SAlex Elder struct rbd_img_request { 279bf0d5f50SAlex Elder struct rbd_device *rbd_dev; 2809bb0248dSIlya Dryomov enum obj_operation_type op_type; 281ecc633caSIlya Dryomov enum obj_request_type data_type; 2820c425248SAlex Elder unsigned long flags; 283bf0d5f50SAlex Elder union { 284bf0d5f50SAlex Elder u64 snap_id; /* for reads */ 2859849e986SAlex Elder struct ceph_snap_context *snapc; /* for writes */ 2869849e986SAlex Elder }; 2879849e986SAlex Elder union { 2889849e986SAlex Elder struct request *rq; /* block request */ 2899849e986SAlex Elder struct rbd_obj_request *obj_request; /* obj req initiator */ 290bf0d5f50SAlex Elder }; 29115961b44SIlya Dryomov spinlock_t completion_lock; 29255f27e09SAlex Elder u64 xferred;/* aggregate bytes transferred */ 293a5a337d4SAlex Elder int result; /* first nonzero obj_request result */ 294bf0d5f50SAlex Elder 29543df3d35SIlya Dryomov struct list_head object_extents; /* obj_req.ex structs */ 2967114edacSIlya Dryomov u32 pending_count; 297bf0d5f50SAlex Elder 298bf0d5f50SAlex Elder struct kref kref; 299bf0d5f50SAlex Elder }; 300bf0d5f50SAlex Elder 301bf0d5f50SAlex Elder #define for_each_obj_request(ireq, oreq) \ 30243df3d35SIlya Dryomov list_for_each_entry(oreq, &(ireq)->object_extents, ex.oe_item) 303bf0d5f50SAlex Elder #define for_each_obj_request_safe(ireq, oreq, n) \ 30443df3d35SIlya Dryomov list_for_each_entry_safe(oreq, n, &(ireq)->object_extents, ex.oe_item) 305bf0d5f50SAlex Elder 30699d16943SIlya Dryomov enum rbd_watch_state { 30799d16943SIlya Dryomov RBD_WATCH_STATE_UNREGISTERED, 30899d16943SIlya Dryomov RBD_WATCH_STATE_REGISTERED, 30999d16943SIlya Dryomov RBD_WATCH_STATE_ERROR, 31099d16943SIlya Dryomov }; 31199d16943SIlya Dryomov 312ed95b21aSIlya Dryomov enum rbd_lock_state { 313ed95b21aSIlya Dryomov RBD_LOCK_STATE_UNLOCKED, 314ed95b21aSIlya Dryomov RBD_LOCK_STATE_LOCKED, 315ed95b21aSIlya Dryomov RBD_LOCK_STATE_RELEASING, 316ed95b21aSIlya Dryomov }; 317ed95b21aSIlya Dryomov 318ed95b21aSIlya Dryomov /* WatchNotify::ClientId */ 319ed95b21aSIlya Dryomov struct rbd_client_id { 320ed95b21aSIlya Dryomov u64 gid; 321ed95b21aSIlya Dryomov u64 handle; 322ed95b21aSIlya Dryomov }; 323ed95b21aSIlya Dryomov 324f84344f3SAlex Elder struct rbd_mapping { 32599c1f08fSAlex Elder u64 size; 32634b13184SAlex Elder u64 features; 327f84344f3SAlex Elder }; 328f84344f3SAlex Elder 329602adf40SYehuda Sadeh /* 330602adf40SYehuda Sadeh * a single device 331602adf40SYehuda Sadeh */ 332602adf40SYehuda Sadeh struct rbd_device { 333de71a297SAlex Elder int dev_id; /* blkdev unique id */ 334602adf40SYehuda Sadeh 335602adf40SYehuda Sadeh int major; /* blkdev assigned major */ 336dd82fff1SIlya Dryomov int minor; 337602adf40SYehuda Sadeh struct gendisk *disk; /* blkdev's gendisk and rq */ 338602adf40SYehuda Sadeh 339a30b71b9SAlex Elder u32 image_format; /* Either 1 or 2 */ 340602adf40SYehuda Sadeh struct rbd_client *rbd_client; 341602adf40SYehuda Sadeh 342602adf40SYehuda Sadeh char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ 343602adf40SYehuda Sadeh 344b82d167bSAlex Elder spinlock_t lock; /* queue, flags, open_count */ 345602adf40SYehuda Sadeh 346602adf40SYehuda Sadeh struct rbd_image_header header; 347b82d167bSAlex Elder unsigned long flags; /* possibly lock protected */ 3480d7dbfceSAlex Elder struct rbd_spec *spec; 349d147543dSIlya Dryomov struct rbd_options *opts; 3500d6d1e9cSMike Christie char *config_info; /* add{,_single_major} string */ 351602adf40SYehuda Sadeh 352c41d13a3SIlya Dryomov struct ceph_object_id header_oid; 353922dab61SIlya Dryomov struct ceph_object_locator header_oloc; 354971f839aSAlex Elder 3551643dfa4SIlya Dryomov struct ceph_file_layout layout; /* used for all rbd requests */ 3560903e875SAlex Elder 35799d16943SIlya Dryomov struct mutex watch_mutex; 35899d16943SIlya Dryomov enum rbd_watch_state watch_state; 359922dab61SIlya Dryomov struct ceph_osd_linger_request *watch_handle; 36099d16943SIlya Dryomov u64 watch_cookie; 36199d16943SIlya Dryomov struct delayed_work watch_dwork; 36259c2be1eSYehuda Sadeh 363ed95b21aSIlya Dryomov struct rw_semaphore lock_rwsem; 364ed95b21aSIlya Dryomov enum rbd_lock_state lock_state; 365cbbfb0ffSIlya Dryomov char lock_cookie[32]; 366ed95b21aSIlya Dryomov struct rbd_client_id owner_cid; 367ed95b21aSIlya Dryomov struct work_struct acquired_lock_work; 368ed95b21aSIlya Dryomov struct work_struct released_lock_work; 369ed95b21aSIlya Dryomov struct delayed_work lock_dwork; 370ed95b21aSIlya Dryomov struct work_struct unlock_work; 371ed95b21aSIlya Dryomov wait_queue_head_t lock_waitq; 372ed95b21aSIlya Dryomov 3731643dfa4SIlya Dryomov struct workqueue_struct *task_wq; 374602adf40SYehuda Sadeh 37586b00e0dSAlex Elder struct rbd_spec *parent_spec; 37686b00e0dSAlex Elder u64 parent_overlap; 377a2acd00eSAlex Elder atomic_t parent_ref; 3782f82ee54SAlex Elder struct rbd_device *parent; 37986b00e0dSAlex Elder 3807ad18afaSChristoph Hellwig /* Block layer tags. */ 3817ad18afaSChristoph Hellwig struct blk_mq_tag_set tag_set; 3827ad18afaSChristoph Hellwig 383c666601aSJosh Durgin /* protects updating the header */ 384c666601aSJosh Durgin struct rw_semaphore header_rwsem; 385f84344f3SAlex Elder 386f84344f3SAlex Elder struct rbd_mapping mapping; 387602adf40SYehuda Sadeh 388602adf40SYehuda Sadeh struct list_head node; 389dfc5606dSYehuda Sadeh 390dfc5606dSYehuda Sadeh /* sysfs related */ 391dfc5606dSYehuda Sadeh struct device dev; 392b82d167bSAlex Elder unsigned long open_count; /* protected by lock */ 393dfc5606dSYehuda Sadeh }; 394dfc5606dSYehuda Sadeh 395b82d167bSAlex Elder /* 39687c0fdedSIlya Dryomov * Flag bits for rbd_dev->flags: 39787c0fdedSIlya Dryomov * - REMOVING (which is coupled with rbd_dev->open_count) is protected 39887c0fdedSIlya Dryomov * by rbd_dev->lock 39987c0fdedSIlya Dryomov * - BLACKLISTED is protected by rbd_dev->lock_rwsem 400b82d167bSAlex Elder */ 4016d292906SAlex Elder enum rbd_dev_flags { 4026d292906SAlex Elder RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */ 403b82d167bSAlex Elder RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */ 40487c0fdedSIlya Dryomov RBD_DEV_FLAG_BLACKLISTED, /* our ceph_client is blacklisted */ 4056d292906SAlex Elder }; 4066d292906SAlex Elder 407cfbf6377SAlex Elder static DEFINE_MUTEX(client_mutex); /* Serialize client creation */ 408e124a82fSAlex Elder 409602adf40SYehuda Sadeh static LIST_HEAD(rbd_dev_list); /* devices */ 410e124a82fSAlex Elder static DEFINE_SPINLOCK(rbd_dev_list_lock); 411e124a82fSAlex Elder 412602adf40SYehuda Sadeh static LIST_HEAD(rbd_client_list); /* clients */ 413432b8587SAlex Elder static DEFINE_SPINLOCK(rbd_client_list_lock); 414602adf40SYehuda Sadeh 41578c2a44aSAlex Elder /* Slab caches for frequently-allocated structures */ 41678c2a44aSAlex Elder 4171c2a9dfeSAlex Elder static struct kmem_cache *rbd_img_request_cache; 418868311b1SAlex Elder static struct kmem_cache *rbd_obj_request_cache; 4191c2a9dfeSAlex Elder 4209b60e70bSIlya Dryomov static int rbd_major; 421f8a22fc2SIlya Dryomov static DEFINE_IDA(rbd_dev_id_ida); 422f8a22fc2SIlya Dryomov 423f5ee37bdSIlya Dryomov static struct workqueue_struct *rbd_wq; 424f5ee37bdSIlya Dryomov 4259b60e70bSIlya Dryomov /* 4263cfa3b16SIlya Dryomov * single-major requires >= 0.75 version of userspace rbd utility. 4279b60e70bSIlya Dryomov */ 4283cfa3b16SIlya Dryomov static bool single_major = true; 4295657a819SJoe Perches module_param(single_major, bool, 0444); 4303cfa3b16SIlya Dryomov MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)"); 4319b60e70bSIlya Dryomov 432f0f8cef5SAlex Elder static ssize_t rbd_add(struct bus_type *bus, const char *buf, 433f0f8cef5SAlex Elder size_t count); 434f0f8cef5SAlex Elder static ssize_t rbd_remove(struct bus_type *bus, const char *buf, 435f0f8cef5SAlex Elder size_t count); 4369b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf, 4379b60e70bSIlya Dryomov size_t count); 4389b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf, 4399b60e70bSIlya Dryomov size_t count); 4406d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth); 441f0f8cef5SAlex Elder 4429b60e70bSIlya Dryomov static int rbd_dev_id_to_minor(int dev_id) 4439b60e70bSIlya Dryomov { 4447e513d43SIlya Dryomov return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT; 4459b60e70bSIlya Dryomov } 4469b60e70bSIlya Dryomov 4479b60e70bSIlya Dryomov static int minor_to_rbd_dev_id(int minor) 4489b60e70bSIlya Dryomov { 4497e513d43SIlya Dryomov return minor >> RBD_SINGLE_MAJOR_PART_SHIFT; 4509b60e70bSIlya Dryomov } 4519b60e70bSIlya Dryomov 452ed95b21aSIlya Dryomov static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev) 453ed95b21aSIlya Dryomov { 454ed95b21aSIlya Dryomov return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED || 455ed95b21aSIlya Dryomov rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING; 456ed95b21aSIlya Dryomov } 457ed95b21aSIlya Dryomov 458ed95b21aSIlya Dryomov static bool rbd_is_lock_owner(struct rbd_device *rbd_dev) 459ed95b21aSIlya Dryomov { 460ed95b21aSIlya Dryomov bool is_lock_owner; 461ed95b21aSIlya Dryomov 462ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 463ed95b21aSIlya Dryomov is_lock_owner = __rbd_is_lock_owner(rbd_dev); 464ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 465ed95b21aSIlya Dryomov return is_lock_owner; 466ed95b21aSIlya Dryomov } 467ed95b21aSIlya Dryomov 4688767b293SIlya Dryomov static ssize_t rbd_supported_features_show(struct bus_type *bus, char *buf) 4698767b293SIlya Dryomov { 4708767b293SIlya Dryomov return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED); 4718767b293SIlya Dryomov } 4728767b293SIlya Dryomov 4735657a819SJoe Perches static BUS_ATTR(add, 0200, NULL, rbd_add); 4745657a819SJoe Perches static BUS_ATTR(remove, 0200, NULL, rbd_remove); 4755657a819SJoe Perches static BUS_ATTR(add_single_major, 0200, NULL, rbd_add_single_major); 4765657a819SJoe Perches static BUS_ATTR(remove_single_major, 0200, NULL, rbd_remove_single_major); 4775657a819SJoe Perches static BUS_ATTR(supported_features, 0444, rbd_supported_features_show, NULL); 478b15a21ddSGreg Kroah-Hartman 479b15a21ddSGreg Kroah-Hartman static struct attribute *rbd_bus_attrs[] = { 480b15a21ddSGreg Kroah-Hartman &bus_attr_add.attr, 481b15a21ddSGreg Kroah-Hartman &bus_attr_remove.attr, 4829b60e70bSIlya Dryomov &bus_attr_add_single_major.attr, 4839b60e70bSIlya Dryomov &bus_attr_remove_single_major.attr, 4848767b293SIlya Dryomov &bus_attr_supported_features.attr, 485b15a21ddSGreg Kroah-Hartman NULL, 486f0f8cef5SAlex Elder }; 48792c76dc0SIlya Dryomov 48892c76dc0SIlya Dryomov static umode_t rbd_bus_is_visible(struct kobject *kobj, 48992c76dc0SIlya Dryomov struct attribute *attr, int index) 49092c76dc0SIlya Dryomov { 4919b60e70bSIlya Dryomov if (!single_major && 4929b60e70bSIlya Dryomov (attr == &bus_attr_add_single_major.attr || 4939b60e70bSIlya Dryomov attr == &bus_attr_remove_single_major.attr)) 4949b60e70bSIlya Dryomov return 0; 4959b60e70bSIlya Dryomov 49692c76dc0SIlya Dryomov return attr->mode; 49792c76dc0SIlya Dryomov } 49892c76dc0SIlya Dryomov 49992c76dc0SIlya Dryomov static const struct attribute_group rbd_bus_group = { 50092c76dc0SIlya Dryomov .attrs = rbd_bus_attrs, 50192c76dc0SIlya Dryomov .is_visible = rbd_bus_is_visible, 50292c76dc0SIlya Dryomov }; 50392c76dc0SIlya Dryomov __ATTRIBUTE_GROUPS(rbd_bus); 504f0f8cef5SAlex Elder 505f0f8cef5SAlex Elder static struct bus_type rbd_bus_type = { 506f0f8cef5SAlex Elder .name = "rbd", 507b15a21ddSGreg Kroah-Hartman .bus_groups = rbd_bus_groups, 508f0f8cef5SAlex Elder }; 509f0f8cef5SAlex Elder 510f0f8cef5SAlex Elder static void rbd_root_dev_release(struct device *dev) 511f0f8cef5SAlex Elder { 512f0f8cef5SAlex Elder } 513f0f8cef5SAlex Elder 514f0f8cef5SAlex Elder static struct device rbd_root_dev = { 515f0f8cef5SAlex Elder .init_name = "rbd", 516f0f8cef5SAlex Elder .release = rbd_root_dev_release, 517f0f8cef5SAlex Elder }; 518f0f8cef5SAlex Elder 51906ecc6cbSAlex Elder static __printf(2, 3) 52006ecc6cbSAlex Elder void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...) 52106ecc6cbSAlex Elder { 52206ecc6cbSAlex Elder struct va_format vaf; 52306ecc6cbSAlex Elder va_list args; 52406ecc6cbSAlex Elder 52506ecc6cbSAlex Elder va_start(args, fmt); 52606ecc6cbSAlex Elder vaf.fmt = fmt; 52706ecc6cbSAlex Elder vaf.va = &args; 52806ecc6cbSAlex Elder 52906ecc6cbSAlex Elder if (!rbd_dev) 53006ecc6cbSAlex Elder printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf); 53106ecc6cbSAlex Elder else if (rbd_dev->disk) 53206ecc6cbSAlex Elder printk(KERN_WARNING "%s: %s: %pV\n", 53306ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf); 53406ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_name) 53506ecc6cbSAlex Elder printk(KERN_WARNING "%s: image %s: %pV\n", 53606ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf); 53706ecc6cbSAlex Elder else if (rbd_dev->spec && rbd_dev->spec->image_id) 53806ecc6cbSAlex Elder printk(KERN_WARNING "%s: id %s: %pV\n", 53906ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf); 54006ecc6cbSAlex Elder else /* punt */ 54106ecc6cbSAlex Elder printk(KERN_WARNING "%s: rbd_dev %p: %pV\n", 54206ecc6cbSAlex Elder RBD_DRV_NAME, rbd_dev, &vaf); 54306ecc6cbSAlex Elder va_end(args); 54406ecc6cbSAlex Elder } 54506ecc6cbSAlex Elder 546aafb230eSAlex Elder #ifdef RBD_DEBUG 547aafb230eSAlex Elder #define rbd_assert(expr) \ 548aafb230eSAlex Elder if (unlikely(!(expr))) { \ 549aafb230eSAlex Elder printk(KERN_ERR "\nAssertion failure in %s() " \ 550aafb230eSAlex Elder "at line %d:\n\n" \ 551aafb230eSAlex Elder "\trbd_assert(%s);\n\n", \ 552aafb230eSAlex Elder __func__, __LINE__, #expr); \ 553aafb230eSAlex Elder BUG(); \ 554aafb230eSAlex Elder } 555aafb230eSAlex Elder #else /* !RBD_DEBUG */ 556aafb230eSAlex Elder # define rbd_assert(expr) ((void) 0) 557aafb230eSAlex Elder #endif /* !RBD_DEBUG */ 558dfc5606dSYehuda Sadeh 55905a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev); 5608b3e1a56SAlex Elder 561cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev); 5622df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev); 563a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev); 564e8f59b59SIlya Dryomov static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev); 56554cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, 56654cac61fSAlex Elder u64 snap_id); 5672ad3d716SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 5682ad3d716SAlex Elder u8 *order, u64 *snap_size); 5692ad3d716SAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 5702ad3d716SAlex Elder u64 *snap_features); 57159c2be1eSYehuda Sadeh 572602adf40SYehuda Sadeh static int rbd_open(struct block_device *bdev, fmode_t mode) 573602adf40SYehuda Sadeh { 574f0f8cef5SAlex Elder struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 575b82d167bSAlex Elder bool removing = false; 576602adf40SYehuda Sadeh 577a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 578b82d167bSAlex Elder if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) 579b82d167bSAlex Elder removing = true; 580b82d167bSAlex Elder else 581b82d167bSAlex Elder rbd_dev->open_count++; 582a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 583b82d167bSAlex Elder if (removing) 584b82d167bSAlex Elder return -ENOENT; 585b82d167bSAlex Elder 586c3e946ceSAlex Elder (void) get_device(&rbd_dev->dev); 587340c7a2bSAlex Elder 588602adf40SYehuda Sadeh return 0; 589602adf40SYehuda Sadeh } 590602adf40SYehuda Sadeh 591db2a144bSAl Viro static void rbd_release(struct gendisk *disk, fmode_t mode) 592dfc5606dSYehuda Sadeh { 593dfc5606dSYehuda Sadeh struct rbd_device *rbd_dev = disk->private_data; 594b82d167bSAlex Elder unsigned long open_count_before; 595b82d167bSAlex Elder 596a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 597b82d167bSAlex Elder open_count_before = rbd_dev->open_count--; 598a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 599b82d167bSAlex Elder rbd_assert(open_count_before > 0); 600dfc5606dSYehuda Sadeh 601c3e946ceSAlex Elder put_device(&rbd_dev->dev); 602dfc5606dSYehuda Sadeh } 603dfc5606dSYehuda Sadeh 604131fd9f6SGuangliang Zhao static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg) 605131fd9f6SGuangliang Zhao { 6061de797bbSIlya Dryomov int ro; 607131fd9f6SGuangliang Zhao 6081de797bbSIlya Dryomov if (get_user(ro, (int __user *)arg)) 609131fd9f6SGuangliang Zhao return -EFAULT; 610131fd9f6SGuangliang Zhao 6111de797bbSIlya Dryomov /* Snapshots can't be marked read-write */ 612131fd9f6SGuangliang Zhao if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro) 613131fd9f6SGuangliang Zhao return -EROFS; 614131fd9f6SGuangliang Zhao 6151de797bbSIlya Dryomov /* Let blkdev_roset() handle it */ 6161de797bbSIlya Dryomov return -ENOTTY; 617131fd9f6SGuangliang Zhao } 618131fd9f6SGuangliang Zhao 619131fd9f6SGuangliang Zhao static int rbd_ioctl(struct block_device *bdev, fmode_t mode, 620131fd9f6SGuangliang Zhao unsigned int cmd, unsigned long arg) 621131fd9f6SGuangliang Zhao { 622131fd9f6SGuangliang Zhao struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 6231de797bbSIlya Dryomov int ret; 624131fd9f6SGuangliang Zhao 625131fd9f6SGuangliang Zhao switch (cmd) { 626131fd9f6SGuangliang Zhao case BLKROSET: 627131fd9f6SGuangliang Zhao ret = rbd_ioctl_set_ro(rbd_dev, arg); 628131fd9f6SGuangliang Zhao break; 629131fd9f6SGuangliang Zhao default: 630131fd9f6SGuangliang Zhao ret = -ENOTTY; 631131fd9f6SGuangliang Zhao } 632131fd9f6SGuangliang Zhao 633131fd9f6SGuangliang Zhao return ret; 634131fd9f6SGuangliang Zhao } 635131fd9f6SGuangliang Zhao 636131fd9f6SGuangliang Zhao #ifdef CONFIG_COMPAT 637131fd9f6SGuangliang Zhao static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode, 638131fd9f6SGuangliang Zhao unsigned int cmd, unsigned long arg) 639131fd9f6SGuangliang Zhao { 640131fd9f6SGuangliang Zhao return rbd_ioctl(bdev, mode, cmd, arg); 641131fd9f6SGuangliang Zhao } 642131fd9f6SGuangliang Zhao #endif /* CONFIG_COMPAT */ 643131fd9f6SGuangliang Zhao 644602adf40SYehuda Sadeh static const struct block_device_operations rbd_bd_ops = { 645602adf40SYehuda Sadeh .owner = THIS_MODULE, 646602adf40SYehuda Sadeh .open = rbd_open, 647dfc5606dSYehuda Sadeh .release = rbd_release, 648131fd9f6SGuangliang Zhao .ioctl = rbd_ioctl, 649131fd9f6SGuangliang Zhao #ifdef CONFIG_COMPAT 650131fd9f6SGuangliang Zhao .compat_ioctl = rbd_compat_ioctl, 651131fd9f6SGuangliang Zhao #endif 652602adf40SYehuda Sadeh }; 653602adf40SYehuda Sadeh 654602adf40SYehuda Sadeh /* 6557262cfcaSAlex Elder * Initialize an rbd client instance. Success or not, this function 656cfbf6377SAlex Elder * consumes ceph_opts. Caller holds client_mutex. 657602adf40SYehuda Sadeh */ 658f8c38929SAlex Elder static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) 659602adf40SYehuda Sadeh { 660602adf40SYehuda Sadeh struct rbd_client *rbdc; 661602adf40SYehuda Sadeh int ret = -ENOMEM; 662602adf40SYehuda Sadeh 66337206ee5SAlex Elder dout("%s:\n", __func__); 664602adf40SYehuda Sadeh rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); 665602adf40SYehuda Sadeh if (!rbdc) 666602adf40SYehuda Sadeh goto out_opt; 667602adf40SYehuda Sadeh 668602adf40SYehuda Sadeh kref_init(&rbdc->kref); 669602adf40SYehuda Sadeh INIT_LIST_HEAD(&rbdc->node); 670602adf40SYehuda Sadeh 67174da4a0fSIlya Dryomov rbdc->client = ceph_create_client(ceph_opts, rbdc); 672602adf40SYehuda Sadeh if (IS_ERR(rbdc->client)) 67308f75463SAlex Elder goto out_rbdc; 67443ae4701SAlex Elder ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */ 675602adf40SYehuda Sadeh 676602adf40SYehuda Sadeh ret = ceph_open_session(rbdc->client); 677602adf40SYehuda Sadeh if (ret < 0) 67808f75463SAlex Elder goto out_client; 679602adf40SYehuda Sadeh 680432b8587SAlex Elder spin_lock(&rbd_client_list_lock); 681602adf40SYehuda Sadeh list_add_tail(&rbdc->node, &rbd_client_list); 682432b8587SAlex Elder spin_unlock(&rbd_client_list_lock); 683602adf40SYehuda Sadeh 68437206ee5SAlex Elder dout("%s: rbdc %p\n", __func__, rbdc); 685bc534d86SAlex Elder 686602adf40SYehuda Sadeh return rbdc; 68708f75463SAlex Elder out_client: 688602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 68908f75463SAlex Elder out_rbdc: 690602adf40SYehuda Sadeh kfree(rbdc); 691602adf40SYehuda Sadeh out_opt: 69243ae4701SAlex Elder if (ceph_opts) 69343ae4701SAlex Elder ceph_destroy_options(ceph_opts); 69437206ee5SAlex Elder dout("%s: error %d\n", __func__, ret); 69537206ee5SAlex Elder 69628f259b7SVasiliy Kulikov return ERR_PTR(ret); 697602adf40SYehuda Sadeh } 698602adf40SYehuda Sadeh 6992f82ee54SAlex Elder static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc) 7002f82ee54SAlex Elder { 7012f82ee54SAlex Elder kref_get(&rbdc->kref); 7022f82ee54SAlex Elder 7032f82ee54SAlex Elder return rbdc; 7042f82ee54SAlex Elder } 7052f82ee54SAlex Elder 706602adf40SYehuda Sadeh /* 7071f7ba331SAlex Elder * Find a ceph client with specific addr and configuration. If 7081f7ba331SAlex Elder * found, bump its reference count. 709602adf40SYehuda Sadeh */ 7101f7ba331SAlex Elder static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) 711602adf40SYehuda Sadeh { 712602adf40SYehuda Sadeh struct rbd_client *client_node; 7131f7ba331SAlex Elder bool found = false; 714602adf40SYehuda Sadeh 71543ae4701SAlex Elder if (ceph_opts->flags & CEPH_OPT_NOSHARE) 716602adf40SYehuda Sadeh return NULL; 717602adf40SYehuda Sadeh 7181f7ba331SAlex Elder spin_lock(&rbd_client_list_lock); 7191f7ba331SAlex Elder list_for_each_entry(client_node, &rbd_client_list, node) { 7201f7ba331SAlex Elder if (!ceph_compare_options(ceph_opts, client_node->client)) { 7212f82ee54SAlex Elder __rbd_get_client(client_node); 7222f82ee54SAlex Elder 7231f7ba331SAlex Elder found = true; 7241f7ba331SAlex Elder break; 7251f7ba331SAlex Elder } 7261f7ba331SAlex Elder } 7271f7ba331SAlex Elder spin_unlock(&rbd_client_list_lock); 7281f7ba331SAlex Elder 7291f7ba331SAlex Elder return found ? client_node : NULL; 730602adf40SYehuda Sadeh } 731602adf40SYehuda Sadeh 732602adf40SYehuda Sadeh /* 733210c104cSIlya Dryomov * (Per device) rbd map options 73459c2be1eSYehuda Sadeh */ 73559c2be1eSYehuda Sadeh enum { 736b5584180SIlya Dryomov Opt_queue_depth, 7370c93e1b7SIlya Dryomov Opt_alloc_size, 73834f55d0bSDongsheng Yang Opt_lock_timeout, 73959c2be1eSYehuda Sadeh Opt_last_int, 74059c2be1eSYehuda Sadeh /* int args above */ 741b26c047bSIlya Dryomov Opt_pool_ns, 74259c2be1eSYehuda Sadeh Opt_last_string, 74359c2be1eSYehuda Sadeh /* string args above */ 744cc0538b6SAlex Elder Opt_read_only, 745cc0538b6SAlex Elder Opt_read_write, 74680de1912SIlya Dryomov Opt_lock_on_read, 747e010dd0aSIlya Dryomov Opt_exclusive, 748d9360540SIlya Dryomov Opt_notrim, 749210c104cSIlya Dryomov Opt_err 75059c2be1eSYehuda Sadeh }; 75159c2be1eSYehuda Sadeh 75243ae4701SAlex Elder static match_table_t rbd_opts_tokens = { 753b5584180SIlya Dryomov {Opt_queue_depth, "queue_depth=%d"}, 7540c93e1b7SIlya Dryomov {Opt_alloc_size, "alloc_size=%d"}, 75534f55d0bSDongsheng Yang {Opt_lock_timeout, "lock_timeout=%d"}, 75659c2be1eSYehuda Sadeh /* int args above */ 757b26c047bSIlya Dryomov {Opt_pool_ns, "_pool_ns=%s"}, 75859c2be1eSYehuda Sadeh /* string args above */ 759be466c1cSAlex Elder {Opt_read_only, "read_only"}, 760cc0538b6SAlex Elder {Opt_read_only, "ro"}, /* Alternate spelling */ 761cc0538b6SAlex Elder {Opt_read_write, "read_write"}, 762cc0538b6SAlex Elder {Opt_read_write, "rw"}, /* Alternate spelling */ 76380de1912SIlya Dryomov {Opt_lock_on_read, "lock_on_read"}, 764e010dd0aSIlya Dryomov {Opt_exclusive, "exclusive"}, 765d9360540SIlya Dryomov {Opt_notrim, "notrim"}, 766210c104cSIlya Dryomov {Opt_err, NULL} 76759c2be1eSYehuda Sadeh }; 76859c2be1eSYehuda Sadeh 76998571b5aSAlex Elder struct rbd_options { 770b5584180SIlya Dryomov int queue_depth; 7710c93e1b7SIlya Dryomov int alloc_size; 77234f55d0bSDongsheng Yang unsigned long lock_timeout; 77398571b5aSAlex Elder bool read_only; 77480de1912SIlya Dryomov bool lock_on_read; 775e010dd0aSIlya Dryomov bool exclusive; 776d9360540SIlya Dryomov bool trim; 77798571b5aSAlex Elder }; 77898571b5aSAlex Elder 779b5584180SIlya Dryomov #define RBD_QUEUE_DEPTH_DEFAULT BLKDEV_MAX_RQ 7800c93e1b7SIlya Dryomov #define RBD_ALLOC_SIZE_DEFAULT (64 * 1024) 78134f55d0bSDongsheng Yang #define RBD_LOCK_TIMEOUT_DEFAULT 0 /* no timeout */ 78298571b5aSAlex Elder #define RBD_READ_ONLY_DEFAULT false 78380de1912SIlya Dryomov #define RBD_LOCK_ON_READ_DEFAULT false 784e010dd0aSIlya Dryomov #define RBD_EXCLUSIVE_DEFAULT false 785d9360540SIlya Dryomov #define RBD_TRIM_DEFAULT true 78698571b5aSAlex Elder 787c300156bSIlya Dryomov struct parse_rbd_opts_ctx { 788c300156bSIlya Dryomov struct rbd_spec *spec; 789c300156bSIlya Dryomov struct rbd_options *opts; 790c300156bSIlya Dryomov }; 791c300156bSIlya Dryomov 79259c2be1eSYehuda Sadeh static int parse_rbd_opts_token(char *c, void *private) 79359c2be1eSYehuda Sadeh { 794c300156bSIlya Dryomov struct parse_rbd_opts_ctx *pctx = private; 79559c2be1eSYehuda Sadeh substring_t argstr[MAX_OPT_ARGS]; 79659c2be1eSYehuda Sadeh int token, intval, ret; 79759c2be1eSYehuda Sadeh 79843ae4701SAlex Elder token = match_token(c, rbd_opts_tokens, argstr); 79959c2be1eSYehuda Sadeh if (token < Opt_last_int) { 80059c2be1eSYehuda Sadeh ret = match_int(&argstr[0], &intval); 80159c2be1eSYehuda Sadeh if (ret < 0) { 8022f56b6baSIlya Dryomov pr_err("bad option arg (not int) at '%s'\n", c); 80359c2be1eSYehuda Sadeh return ret; 80459c2be1eSYehuda Sadeh } 80559c2be1eSYehuda Sadeh dout("got int token %d val %d\n", token, intval); 80659c2be1eSYehuda Sadeh } else if (token > Opt_last_int && token < Opt_last_string) { 807210c104cSIlya Dryomov dout("got string token %d val %s\n", token, argstr[0].from); 80859c2be1eSYehuda Sadeh } else { 80959c2be1eSYehuda Sadeh dout("got token %d\n", token); 81059c2be1eSYehuda Sadeh } 81159c2be1eSYehuda Sadeh 81259c2be1eSYehuda Sadeh switch (token) { 813b5584180SIlya Dryomov case Opt_queue_depth: 814b5584180SIlya Dryomov if (intval < 1) { 815b5584180SIlya Dryomov pr_err("queue_depth out of range\n"); 816b5584180SIlya Dryomov return -EINVAL; 817b5584180SIlya Dryomov } 818c300156bSIlya Dryomov pctx->opts->queue_depth = intval; 819b5584180SIlya Dryomov break; 8200c93e1b7SIlya Dryomov case Opt_alloc_size: 8210c93e1b7SIlya Dryomov if (intval < 1) { 8220c93e1b7SIlya Dryomov pr_err("alloc_size out of range\n"); 8230c93e1b7SIlya Dryomov return -EINVAL; 8240c93e1b7SIlya Dryomov } 8250c93e1b7SIlya Dryomov if (!is_power_of_2(intval)) { 8260c93e1b7SIlya Dryomov pr_err("alloc_size must be a power of 2\n"); 8270c93e1b7SIlya Dryomov return -EINVAL; 8280c93e1b7SIlya Dryomov } 8290c93e1b7SIlya Dryomov pctx->opts->alloc_size = intval; 8300c93e1b7SIlya Dryomov break; 83134f55d0bSDongsheng Yang case Opt_lock_timeout: 83234f55d0bSDongsheng Yang /* 0 is "wait forever" (i.e. infinite timeout) */ 83334f55d0bSDongsheng Yang if (intval < 0 || intval > INT_MAX / 1000) { 83434f55d0bSDongsheng Yang pr_err("lock_timeout out of range\n"); 83534f55d0bSDongsheng Yang return -EINVAL; 83634f55d0bSDongsheng Yang } 837c300156bSIlya Dryomov pctx->opts->lock_timeout = msecs_to_jiffies(intval * 1000); 83834f55d0bSDongsheng Yang break; 839b26c047bSIlya Dryomov case Opt_pool_ns: 840b26c047bSIlya Dryomov kfree(pctx->spec->pool_ns); 841b26c047bSIlya Dryomov pctx->spec->pool_ns = match_strdup(argstr); 842b26c047bSIlya Dryomov if (!pctx->spec->pool_ns) 843b26c047bSIlya Dryomov return -ENOMEM; 84459c2be1eSYehuda Sadeh break; 845cc0538b6SAlex Elder case Opt_read_only: 846c300156bSIlya Dryomov pctx->opts->read_only = true; 847cc0538b6SAlex Elder break; 848cc0538b6SAlex Elder case Opt_read_write: 849c300156bSIlya Dryomov pctx->opts->read_only = false; 850cc0538b6SAlex Elder break; 85180de1912SIlya Dryomov case Opt_lock_on_read: 852c300156bSIlya Dryomov pctx->opts->lock_on_read = true; 85380de1912SIlya Dryomov break; 854e010dd0aSIlya Dryomov case Opt_exclusive: 855c300156bSIlya Dryomov pctx->opts->exclusive = true; 856e010dd0aSIlya Dryomov break; 857d9360540SIlya Dryomov case Opt_notrim: 858c300156bSIlya Dryomov pctx->opts->trim = false; 859d9360540SIlya Dryomov break; 86059c2be1eSYehuda Sadeh default: 861210c104cSIlya Dryomov /* libceph prints "bad option" msg */ 862210c104cSIlya Dryomov return -EINVAL; 86359c2be1eSYehuda Sadeh } 864210c104cSIlya Dryomov 86559c2be1eSYehuda Sadeh return 0; 86659c2be1eSYehuda Sadeh } 86759c2be1eSYehuda Sadeh 8686d2940c8SGuangliang Zhao static char* obj_op_name(enum obj_operation_type op_type) 8696d2940c8SGuangliang Zhao { 8706d2940c8SGuangliang Zhao switch (op_type) { 8716d2940c8SGuangliang Zhao case OBJ_OP_READ: 8726d2940c8SGuangliang Zhao return "read"; 8736d2940c8SGuangliang Zhao case OBJ_OP_WRITE: 8746d2940c8SGuangliang Zhao return "write"; 87590e98c52SGuangliang Zhao case OBJ_OP_DISCARD: 87690e98c52SGuangliang Zhao return "discard"; 8776484cbe9SIlya Dryomov case OBJ_OP_ZEROOUT: 8786484cbe9SIlya Dryomov return "zeroout"; 8796d2940c8SGuangliang Zhao default: 8806d2940c8SGuangliang Zhao return "???"; 8816d2940c8SGuangliang Zhao } 8826d2940c8SGuangliang Zhao } 8836d2940c8SGuangliang Zhao 88459c2be1eSYehuda Sadeh /* 885602adf40SYehuda Sadeh * Destroy ceph client 886d23a4b3fSAlex Elder * 887432b8587SAlex Elder * Caller must hold rbd_client_list_lock. 888602adf40SYehuda Sadeh */ 889602adf40SYehuda Sadeh static void rbd_client_release(struct kref *kref) 890602adf40SYehuda Sadeh { 891602adf40SYehuda Sadeh struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); 892602adf40SYehuda Sadeh 89337206ee5SAlex Elder dout("%s: rbdc %p\n", __func__, rbdc); 894cd9d9f5dSAlex Elder spin_lock(&rbd_client_list_lock); 895602adf40SYehuda Sadeh list_del(&rbdc->node); 896cd9d9f5dSAlex Elder spin_unlock(&rbd_client_list_lock); 897602adf40SYehuda Sadeh 898602adf40SYehuda Sadeh ceph_destroy_client(rbdc->client); 899602adf40SYehuda Sadeh kfree(rbdc); 900602adf40SYehuda Sadeh } 901602adf40SYehuda Sadeh 902602adf40SYehuda Sadeh /* 903602adf40SYehuda Sadeh * Drop reference to ceph client node. If it's not referenced anymore, release 904602adf40SYehuda Sadeh * it. 905602adf40SYehuda Sadeh */ 9069d3997fdSAlex Elder static void rbd_put_client(struct rbd_client *rbdc) 907602adf40SYehuda Sadeh { 908c53d5893SAlex Elder if (rbdc) 9099d3997fdSAlex Elder kref_put(&rbdc->kref, rbd_client_release); 910602adf40SYehuda Sadeh } 911602adf40SYehuda Sadeh 912dd435855SIlya Dryomov static int wait_for_latest_osdmap(struct ceph_client *client) 913dd435855SIlya Dryomov { 914dd435855SIlya Dryomov u64 newest_epoch; 915dd435855SIlya Dryomov int ret; 916dd435855SIlya Dryomov 917dd435855SIlya Dryomov ret = ceph_monc_get_version(&client->monc, "osdmap", &newest_epoch); 918dd435855SIlya Dryomov if (ret) 919dd435855SIlya Dryomov return ret; 920dd435855SIlya Dryomov 921dd435855SIlya Dryomov if (client->osdc.osdmap->epoch >= newest_epoch) 922dd435855SIlya Dryomov return 0; 923dd435855SIlya Dryomov 924dd435855SIlya Dryomov ceph_osdc_maybe_request_map(&client->osdc); 925dd435855SIlya Dryomov return ceph_monc_wait_osdmap(&client->monc, newest_epoch, 926dd435855SIlya Dryomov client->options->mount_timeout); 927dd435855SIlya Dryomov } 928dd435855SIlya Dryomov 9295feb0d8dSIlya Dryomov /* 9305feb0d8dSIlya Dryomov * Get a ceph client with specific addr and configuration, if one does 9315feb0d8dSIlya Dryomov * not exist create it. Either way, ceph_opts is consumed by this 9325feb0d8dSIlya Dryomov * function. 9335feb0d8dSIlya Dryomov */ 9345feb0d8dSIlya Dryomov static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) 9355feb0d8dSIlya Dryomov { 9365feb0d8dSIlya Dryomov struct rbd_client *rbdc; 937dd435855SIlya Dryomov int ret; 9385feb0d8dSIlya Dryomov 9395feb0d8dSIlya Dryomov mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING); 9405feb0d8dSIlya Dryomov rbdc = rbd_client_find(ceph_opts); 941dd435855SIlya Dryomov if (rbdc) { 9425feb0d8dSIlya Dryomov ceph_destroy_options(ceph_opts); 943dd435855SIlya Dryomov 944dd435855SIlya Dryomov /* 945dd435855SIlya Dryomov * Using an existing client. Make sure ->pg_pools is up to 946dd435855SIlya Dryomov * date before we look up the pool id in do_rbd_add(). 947dd435855SIlya Dryomov */ 948dd435855SIlya Dryomov ret = wait_for_latest_osdmap(rbdc->client); 949dd435855SIlya Dryomov if (ret) { 950dd435855SIlya Dryomov rbd_warn(NULL, "failed to get latest osdmap: %d", ret); 951dd435855SIlya Dryomov rbd_put_client(rbdc); 952dd435855SIlya Dryomov rbdc = ERR_PTR(ret); 953dd435855SIlya Dryomov } 954dd435855SIlya Dryomov } else { 9555feb0d8dSIlya Dryomov rbdc = rbd_client_create(ceph_opts); 956dd435855SIlya Dryomov } 9575feb0d8dSIlya Dryomov mutex_unlock(&client_mutex); 9585feb0d8dSIlya Dryomov 9595feb0d8dSIlya Dryomov return rbdc; 9605feb0d8dSIlya Dryomov } 9615feb0d8dSIlya Dryomov 962a30b71b9SAlex Elder static bool rbd_image_format_valid(u32 image_format) 963a30b71b9SAlex Elder { 964a30b71b9SAlex Elder return image_format == 1 || image_format == 2; 965a30b71b9SAlex Elder } 966a30b71b9SAlex Elder 9678e94af8eSAlex Elder static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) 9688e94af8eSAlex Elder { 969103a150fSAlex Elder size_t size; 970103a150fSAlex Elder u32 snap_count; 971103a150fSAlex Elder 972103a150fSAlex Elder /* The header has to start with the magic rbd header text */ 973103a150fSAlex Elder if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT))) 974103a150fSAlex Elder return false; 975103a150fSAlex Elder 976db2388b6SAlex Elder /* The bio layer requires at least sector-sized I/O */ 977db2388b6SAlex Elder 978db2388b6SAlex Elder if (ondisk->options.order < SECTOR_SHIFT) 979db2388b6SAlex Elder return false; 980db2388b6SAlex Elder 981db2388b6SAlex Elder /* If we use u64 in a few spots we may be able to loosen this */ 982db2388b6SAlex Elder 983db2388b6SAlex Elder if (ondisk->options.order > 8 * sizeof (int) - 1) 984db2388b6SAlex Elder return false; 985db2388b6SAlex Elder 986103a150fSAlex Elder /* 987103a150fSAlex Elder * The size of a snapshot header has to fit in a size_t, and 988103a150fSAlex Elder * that limits the number of snapshots. 989103a150fSAlex Elder */ 990103a150fSAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 991103a150fSAlex Elder size = SIZE_MAX - sizeof (struct ceph_snap_context); 992103a150fSAlex Elder if (snap_count > size / sizeof (__le64)) 993103a150fSAlex Elder return false; 994103a150fSAlex Elder 995103a150fSAlex Elder /* 996103a150fSAlex Elder * Not only that, but the size of the entire the snapshot 997103a150fSAlex Elder * header must also be representable in a size_t. 998103a150fSAlex Elder */ 999103a150fSAlex Elder size -= snap_count * sizeof (__le64); 1000103a150fSAlex Elder if ((u64) size < le64_to_cpu(ondisk->snap_names_len)) 1001103a150fSAlex Elder return false; 1002103a150fSAlex Elder 1003103a150fSAlex Elder return true; 10048e94af8eSAlex Elder } 10058e94af8eSAlex Elder 1006602adf40SYehuda Sadeh /* 10075bc3fb17SIlya Dryomov * returns the size of an object in the image 10085bc3fb17SIlya Dryomov */ 10095bc3fb17SIlya Dryomov static u32 rbd_obj_bytes(struct rbd_image_header *header) 10105bc3fb17SIlya Dryomov { 10115bc3fb17SIlya Dryomov return 1U << header->obj_order; 10125bc3fb17SIlya Dryomov } 10135bc3fb17SIlya Dryomov 1014263423f8SIlya Dryomov static void rbd_init_layout(struct rbd_device *rbd_dev) 1015263423f8SIlya Dryomov { 1016263423f8SIlya Dryomov if (rbd_dev->header.stripe_unit == 0 || 1017263423f8SIlya Dryomov rbd_dev->header.stripe_count == 0) { 1018263423f8SIlya Dryomov rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header); 1019263423f8SIlya Dryomov rbd_dev->header.stripe_count = 1; 1020263423f8SIlya Dryomov } 1021263423f8SIlya Dryomov 1022263423f8SIlya Dryomov rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit; 1023263423f8SIlya Dryomov rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count; 1024263423f8SIlya Dryomov rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header); 10257e97332eSIlya Dryomov rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ? 10267e97332eSIlya Dryomov rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id; 1027263423f8SIlya Dryomov RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL); 1028263423f8SIlya Dryomov } 1029263423f8SIlya Dryomov 10305bc3fb17SIlya Dryomov /* 1031bb23e37aSAlex Elder * Fill an rbd image header with information from the given format 1 1032bb23e37aSAlex Elder * on-disk header. 1033602adf40SYehuda Sadeh */ 1034662518b1SAlex Elder static int rbd_header_from_disk(struct rbd_device *rbd_dev, 10354156d998SAlex Elder struct rbd_image_header_ondisk *ondisk) 1036602adf40SYehuda Sadeh { 1037662518b1SAlex Elder struct rbd_image_header *header = &rbd_dev->header; 1038bb23e37aSAlex Elder bool first_time = header->object_prefix == NULL; 1039bb23e37aSAlex Elder struct ceph_snap_context *snapc; 1040bb23e37aSAlex Elder char *object_prefix = NULL; 1041bb23e37aSAlex Elder char *snap_names = NULL; 1042bb23e37aSAlex Elder u64 *snap_sizes = NULL; 1043ccece235SAlex Elder u32 snap_count; 1044bb23e37aSAlex Elder int ret = -ENOMEM; 1045621901d6SAlex Elder u32 i; 1046602adf40SYehuda Sadeh 1047bb23e37aSAlex Elder /* Allocate this now to avoid having to handle failure below */ 1048103a150fSAlex Elder 1049bb23e37aSAlex Elder if (first_time) { 1050848d796cSIlya Dryomov object_prefix = kstrndup(ondisk->object_prefix, 1051848d796cSIlya Dryomov sizeof(ondisk->object_prefix), 1052848d796cSIlya Dryomov GFP_KERNEL); 1053bb23e37aSAlex Elder if (!object_prefix) 1054602adf40SYehuda Sadeh return -ENOMEM; 1055bb23e37aSAlex Elder } 105600f1f36fSAlex Elder 1057bb23e37aSAlex Elder /* Allocate the snapshot context and fill it in */ 1058d2bb24e5SAlex Elder 1059602adf40SYehuda Sadeh snap_count = le32_to_cpu(ondisk->snap_count); 1060bb23e37aSAlex Elder snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); 1061bb23e37aSAlex Elder if (!snapc) 1062bb23e37aSAlex Elder goto out_err; 1063bb23e37aSAlex Elder snapc->seq = le64_to_cpu(ondisk->snap_seq); 1064602adf40SYehuda Sadeh if (snap_count) { 1065bb23e37aSAlex Elder struct rbd_image_snap_ondisk *snaps; 1066f785cc1dSAlex Elder u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len); 1067f785cc1dSAlex Elder 1068bb23e37aSAlex Elder /* We'll keep a copy of the snapshot names... */ 1069621901d6SAlex Elder 1070f785cc1dSAlex Elder if (snap_names_len > (u64)SIZE_MAX) 1071bb23e37aSAlex Elder goto out_2big; 1072bb23e37aSAlex Elder snap_names = kmalloc(snap_names_len, GFP_KERNEL); 1073bb23e37aSAlex Elder if (!snap_names) 1074602adf40SYehuda Sadeh goto out_err; 1075bb23e37aSAlex Elder 1076bb23e37aSAlex Elder /* ...as well as the array of their sizes. */ 107788a25a5fSMarkus Elfring snap_sizes = kmalloc_array(snap_count, 107888a25a5fSMarkus Elfring sizeof(*header->snap_sizes), 107988a25a5fSMarkus Elfring GFP_KERNEL); 1080bb23e37aSAlex Elder if (!snap_sizes) 1081bb23e37aSAlex Elder goto out_err; 1082bb23e37aSAlex Elder 1083f785cc1dSAlex Elder /* 1084bb23e37aSAlex Elder * Copy the names, and fill in each snapshot's id 1085bb23e37aSAlex Elder * and size. 1086bb23e37aSAlex Elder * 108799a41ebcSAlex Elder * Note that rbd_dev_v1_header_info() guarantees the 1088bb23e37aSAlex Elder * ondisk buffer we're working with has 1089f785cc1dSAlex Elder * snap_names_len bytes beyond the end of the 1090f785cc1dSAlex Elder * snapshot id array, this memcpy() is safe. 1091f785cc1dSAlex Elder */ 1092bb23e37aSAlex Elder memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len); 1093bb23e37aSAlex Elder snaps = ondisk->snaps; 1094bb23e37aSAlex Elder for (i = 0; i < snap_count; i++) { 1095bb23e37aSAlex Elder snapc->snaps[i] = le64_to_cpu(snaps[i].id); 1096bb23e37aSAlex Elder snap_sizes[i] = le64_to_cpu(snaps[i].image_size); 1097bb23e37aSAlex Elder } 1098602adf40SYehuda Sadeh } 1099849b4260SAlex Elder 1100bb23e37aSAlex Elder /* We won't fail any more, fill in the header */ 1101bb23e37aSAlex Elder 1102bb23e37aSAlex Elder if (first_time) { 1103bb23e37aSAlex Elder header->object_prefix = object_prefix; 1104602adf40SYehuda Sadeh header->obj_order = ondisk->options.order; 1105263423f8SIlya Dryomov rbd_init_layout(rbd_dev); 1106662518b1SAlex Elder } else { 1107662518b1SAlex Elder ceph_put_snap_context(header->snapc); 1108662518b1SAlex Elder kfree(header->snap_names); 1109662518b1SAlex Elder kfree(header->snap_sizes); 1110bb23e37aSAlex Elder } 11116a52325fSAlex Elder 1112bb23e37aSAlex Elder /* The remaining fields always get updated (when we refresh) */ 1113621901d6SAlex Elder 1114f84344f3SAlex Elder header->image_size = le64_to_cpu(ondisk->image_size); 1115bb23e37aSAlex Elder header->snapc = snapc; 1116bb23e37aSAlex Elder header->snap_names = snap_names; 1117bb23e37aSAlex Elder header->snap_sizes = snap_sizes; 1118468521c1SAlex Elder 1119602adf40SYehuda Sadeh return 0; 1120bb23e37aSAlex Elder out_2big: 1121bb23e37aSAlex Elder ret = -EIO; 11226a52325fSAlex Elder out_err: 1123bb23e37aSAlex Elder kfree(snap_sizes); 1124bb23e37aSAlex Elder kfree(snap_names); 1125bb23e37aSAlex Elder ceph_put_snap_context(snapc); 1126bb23e37aSAlex Elder kfree(object_prefix); 1127ccece235SAlex Elder 1128bb23e37aSAlex Elder return ret; 1129602adf40SYehuda Sadeh } 1130602adf40SYehuda Sadeh 11319682fc6dSAlex Elder static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which) 11329682fc6dSAlex Elder { 11339682fc6dSAlex Elder const char *snap_name; 11349682fc6dSAlex Elder 11359682fc6dSAlex Elder rbd_assert(which < rbd_dev->header.snapc->num_snaps); 11369682fc6dSAlex Elder 11379682fc6dSAlex Elder /* Skip over names until we find the one we are looking for */ 11389682fc6dSAlex Elder 11399682fc6dSAlex Elder snap_name = rbd_dev->header.snap_names; 11409682fc6dSAlex Elder while (which--) 11419682fc6dSAlex Elder snap_name += strlen(snap_name) + 1; 11429682fc6dSAlex Elder 11439682fc6dSAlex Elder return kstrdup(snap_name, GFP_KERNEL); 11449682fc6dSAlex Elder } 11459682fc6dSAlex Elder 114630d1cff8SAlex Elder /* 114730d1cff8SAlex Elder * Snapshot id comparison function for use with qsort()/bsearch(). 114830d1cff8SAlex Elder * Note that result is for snapshots in *descending* order. 114930d1cff8SAlex Elder */ 115030d1cff8SAlex Elder static int snapid_compare_reverse(const void *s1, const void *s2) 115130d1cff8SAlex Elder { 115230d1cff8SAlex Elder u64 snap_id1 = *(u64 *)s1; 115330d1cff8SAlex Elder u64 snap_id2 = *(u64 *)s2; 115430d1cff8SAlex Elder 115530d1cff8SAlex Elder if (snap_id1 < snap_id2) 115630d1cff8SAlex Elder return 1; 115730d1cff8SAlex Elder return snap_id1 == snap_id2 ? 0 : -1; 115830d1cff8SAlex Elder } 115930d1cff8SAlex Elder 116030d1cff8SAlex Elder /* 116130d1cff8SAlex Elder * Search a snapshot context to see if the given snapshot id is 116230d1cff8SAlex Elder * present. 116330d1cff8SAlex Elder * 116430d1cff8SAlex Elder * Returns the position of the snapshot id in the array if it's found, 116530d1cff8SAlex Elder * or BAD_SNAP_INDEX otherwise. 116630d1cff8SAlex Elder * 116730d1cff8SAlex Elder * Note: The snapshot array is in kept sorted (by the osd) in 116830d1cff8SAlex Elder * reverse order, highest snapshot id first. 116930d1cff8SAlex Elder */ 11709682fc6dSAlex Elder static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id) 11719682fc6dSAlex Elder { 11729682fc6dSAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 117330d1cff8SAlex Elder u64 *found; 11749682fc6dSAlex Elder 117530d1cff8SAlex Elder found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps, 117630d1cff8SAlex Elder sizeof (snap_id), snapid_compare_reverse); 11779682fc6dSAlex Elder 117830d1cff8SAlex Elder return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX; 11799682fc6dSAlex Elder } 11809682fc6dSAlex Elder 11812ad3d716SAlex Elder static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, 11822ad3d716SAlex Elder u64 snap_id) 118354cac61fSAlex Elder { 118454cac61fSAlex Elder u32 which; 1185da6a6b63SJosh Durgin const char *snap_name; 118654cac61fSAlex Elder 118754cac61fSAlex Elder which = rbd_dev_snap_index(rbd_dev, snap_id); 118854cac61fSAlex Elder if (which == BAD_SNAP_INDEX) 1189da6a6b63SJosh Durgin return ERR_PTR(-ENOENT); 119054cac61fSAlex Elder 1191da6a6b63SJosh Durgin snap_name = _rbd_dev_v1_snap_name(rbd_dev, which); 1192da6a6b63SJosh Durgin return snap_name ? snap_name : ERR_PTR(-ENOMEM); 119354cac61fSAlex Elder } 119454cac61fSAlex Elder 11959e15b77dSAlex Elder static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id) 11969e15b77dSAlex Elder { 11979e15b77dSAlex Elder if (snap_id == CEPH_NOSNAP) 11989e15b77dSAlex Elder return RBD_SNAP_HEAD_NAME; 11999e15b77dSAlex Elder 120054cac61fSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 120154cac61fSAlex Elder if (rbd_dev->image_format == 1) 120254cac61fSAlex Elder return rbd_dev_v1_snap_name(rbd_dev, snap_id); 12039e15b77dSAlex Elder 120454cac61fSAlex Elder return rbd_dev_v2_snap_name(rbd_dev, snap_id); 12059e15b77dSAlex Elder } 12069e15b77dSAlex Elder 12072ad3d716SAlex Elder static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 12082ad3d716SAlex Elder u64 *snap_size) 1209602adf40SYehuda Sadeh { 12102ad3d716SAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 12112ad3d716SAlex Elder if (snap_id == CEPH_NOSNAP) { 12122ad3d716SAlex Elder *snap_size = rbd_dev->header.image_size; 12132ad3d716SAlex Elder } else if (rbd_dev->image_format == 1) { 12142ad3d716SAlex Elder u32 which; 121500f1f36fSAlex Elder 12162ad3d716SAlex Elder which = rbd_dev_snap_index(rbd_dev, snap_id); 12172ad3d716SAlex Elder if (which == BAD_SNAP_INDEX) 12182ad3d716SAlex Elder return -ENOENT; 121900f1f36fSAlex Elder 12202ad3d716SAlex Elder *snap_size = rbd_dev->header.snap_sizes[which]; 12212ad3d716SAlex Elder } else { 12222ad3d716SAlex Elder u64 size = 0; 12232ad3d716SAlex Elder int ret; 12242ad3d716SAlex Elder 12252ad3d716SAlex Elder ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size); 12262ad3d716SAlex Elder if (ret) 12272ad3d716SAlex Elder return ret; 12282ad3d716SAlex Elder 12292ad3d716SAlex Elder *snap_size = size; 12302ad3d716SAlex Elder } 12312ad3d716SAlex Elder return 0; 12322ad3d716SAlex Elder } 12332ad3d716SAlex Elder 12342ad3d716SAlex Elder static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 12352ad3d716SAlex Elder u64 *snap_features) 12362ad3d716SAlex Elder { 12372ad3d716SAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 12382ad3d716SAlex Elder if (snap_id == CEPH_NOSNAP) { 12392ad3d716SAlex Elder *snap_features = rbd_dev->header.features; 12402ad3d716SAlex Elder } else if (rbd_dev->image_format == 1) { 12412ad3d716SAlex Elder *snap_features = 0; /* No features for format 1 */ 12422ad3d716SAlex Elder } else { 12432ad3d716SAlex Elder u64 features = 0; 12442ad3d716SAlex Elder int ret; 12452ad3d716SAlex Elder 12462ad3d716SAlex Elder ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features); 12472ad3d716SAlex Elder if (ret) 12482ad3d716SAlex Elder return ret; 12492ad3d716SAlex Elder 12502ad3d716SAlex Elder *snap_features = features; 12512ad3d716SAlex Elder } 12522ad3d716SAlex Elder return 0; 125300f1f36fSAlex Elder } 1254602adf40SYehuda Sadeh 1255d1cf5788SAlex Elder static int rbd_dev_mapping_set(struct rbd_device *rbd_dev) 1256602adf40SYehuda Sadeh { 12578f4b7d98SAlex Elder u64 snap_id = rbd_dev->spec->snap_id; 12582ad3d716SAlex Elder u64 size = 0; 12592ad3d716SAlex Elder u64 features = 0; 12602ad3d716SAlex Elder int ret; 12618b0241f8SAlex Elder 12622ad3d716SAlex Elder ret = rbd_snap_size(rbd_dev, snap_id, &size); 12632ad3d716SAlex Elder if (ret) 12642ad3d716SAlex Elder return ret; 12652ad3d716SAlex Elder ret = rbd_snap_features(rbd_dev, snap_id, &features); 12662ad3d716SAlex Elder if (ret) 12672ad3d716SAlex Elder return ret; 12682ad3d716SAlex Elder 12692ad3d716SAlex Elder rbd_dev->mapping.size = size; 12702ad3d716SAlex Elder rbd_dev->mapping.features = features; 12712ad3d716SAlex Elder 12728b0241f8SAlex Elder return 0; 1273602adf40SYehuda Sadeh } 1274602adf40SYehuda Sadeh 1275d1cf5788SAlex Elder static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev) 1276d1cf5788SAlex Elder { 1277d1cf5788SAlex Elder rbd_dev->mapping.size = 0; 1278d1cf5788SAlex Elder rbd_dev->mapping.features = 0; 1279200a6a8bSAlex Elder } 1280200a6a8bSAlex Elder 12815359a17dSIlya Dryomov static void zero_bvec(struct bio_vec *bv) 128265ccfe21SAlex Elder { 1283602adf40SYehuda Sadeh void *buf; 12845359a17dSIlya Dryomov unsigned long flags; 1285602adf40SYehuda Sadeh 12865359a17dSIlya Dryomov buf = bvec_kmap_irq(bv, &flags); 12875359a17dSIlya Dryomov memset(buf, 0, bv->bv_len); 12885359a17dSIlya Dryomov flush_dcache_page(bv->bv_page); 128985b5aaa6SDan Carpenter bvec_kunmap_irq(buf, &flags); 1290602adf40SYehuda Sadeh } 1291602adf40SYehuda Sadeh 12925359a17dSIlya Dryomov static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes) 1293b9434c5bSAlex Elder { 12945359a17dSIlya Dryomov struct ceph_bio_iter it = *bio_pos; 1295b9434c5bSAlex Elder 12965359a17dSIlya Dryomov ceph_bio_iter_advance(&it, off); 12975359a17dSIlya Dryomov ceph_bio_iter_advance_step(&it, bytes, ({ 12985359a17dSIlya Dryomov zero_bvec(&bv); 12995359a17dSIlya Dryomov })); 1300b9434c5bSAlex Elder } 1301b9434c5bSAlex Elder 13027e07efb1SIlya Dryomov static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes) 1303602adf40SYehuda Sadeh { 13047e07efb1SIlya Dryomov struct ceph_bvec_iter it = *bvec_pos; 1305602adf40SYehuda Sadeh 13067e07efb1SIlya Dryomov ceph_bvec_iter_advance(&it, off); 13077e07efb1SIlya Dryomov ceph_bvec_iter_advance_step(&it, bytes, ({ 13087e07efb1SIlya Dryomov zero_bvec(&bv); 13097e07efb1SIlya Dryomov })); 1310602adf40SYehuda Sadeh } 1311602adf40SYehuda Sadeh 1312f7760dadSAlex Elder /* 13133da691bfSIlya Dryomov * Zero a range in @obj_req data buffer defined by a bio (list) or 1314afb97888SIlya Dryomov * (private) bio_vec array. 1315f7760dadSAlex Elder * 13163da691bfSIlya Dryomov * @off is relative to the start of the data buffer. 1317f7760dadSAlex Elder */ 13183da691bfSIlya Dryomov static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off, 13193da691bfSIlya Dryomov u32 bytes) 1320f7760dadSAlex Elder { 1321ecc633caSIlya Dryomov switch (obj_req->img_request->data_type) { 13223da691bfSIlya Dryomov case OBJ_REQUEST_BIO: 13233da691bfSIlya Dryomov zero_bios(&obj_req->bio_pos, off, bytes); 13243da691bfSIlya Dryomov break; 13253da691bfSIlya Dryomov case OBJ_REQUEST_BVECS: 1326afb97888SIlya Dryomov case OBJ_REQUEST_OWN_BVECS: 13273da691bfSIlya Dryomov zero_bvecs(&obj_req->bvec_pos, off, bytes); 13283da691bfSIlya Dryomov break; 13293da691bfSIlya Dryomov default: 13303da691bfSIlya Dryomov rbd_assert(0); 1331f5400b7aSAlex Elder } 1332bf0d5f50SAlex Elder } 1333bf0d5f50SAlex Elder 1334bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref); 1335bf0d5f50SAlex Elder static void rbd_obj_request_put(struct rbd_obj_request *obj_request) 1336bf0d5f50SAlex Elder { 1337bf0d5f50SAlex Elder rbd_assert(obj_request != NULL); 133837206ee5SAlex Elder dout("%s: obj %p (was %d)\n", __func__, obj_request, 13392c935bc5SPeter Zijlstra kref_read(&obj_request->kref)); 1340bf0d5f50SAlex Elder kref_put(&obj_request->kref, rbd_obj_request_destroy); 1341bf0d5f50SAlex Elder } 1342bf0d5f50SAlex Elder 13430f2d5be7SAlex Elder static void rbd_img_request_get(struct rbd_img_request *img_request) 13440f2d5be7SAlex Elder { 13450f2d5be7SAlex Elder dout("%s: img %p (was %d)\n", __func__, img_request, 13462c935bc5SPeter Zijlstra kref_read(&img_request->kref)); 13470f2d5be7SAlex Elder kref_get(&img_request->kref); 13480f2d5be7SAlex Elder } 13490f2d5be7SAlex Elder 1350bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref); 1351bf0d5f50SAlex Elder static void rbd_img_request_put(struct rbd_img_request *img_request) 1352bf0d5f50SAlex Elder { 1353bf0d5f50SAlex Elder rbd_assert(img_request != NULL); 135437206ee5SAlex Elder dout("%s: img %p (was %d)\n", __func__, img_request, 13552c935bc5SPeter Zijlstra kref_read(&img_request->kref)); 1356bf0d5f50SAlex Elder kref_put(&img_request->kref, rbd_img_request_destroy); 1357bf0d5f50SAlex Elder } 1358bf0d5f50SAlex Elder 1359bf0d5f50SAlex Elder static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, 1360bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1361bf0d5f50SAlex Elder { 136225dcf954SAlex Elder rbd_assert(obj_request->img_request == NULL); 136325dcf954SAlex Elder 1364b155e86cSAlex Elder /* Image request now owns object's original reference */ 1365bf0d5f50SAlex Elder obj_request->img_request = img_request; 13667114edacSIlya Dryomov img_request->pending_count++; 136715961b44SIlya Dryomov dout("%s: img %p obj %p\n", __func__, img_request, obj_request); 1368bf0d5f50SAlex Elder } 1369bf0d5f50SAlex Elder 1370bf0d5f50SAlex Elder static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request, 1371bf0d5f50SAlex Elder struct rbd_obj_request *obj_request) 1372bf0d5f50SAlex Elder { 137315961b44SIlya Dryomov dout("%s: img %p obj %p\n", __func__, img_request, obj_request); 137443df3d35SIlya Dryomov list_del(&obj_request->ex.oe_item); 1375bf0d5f50SAlex Elder rbd_assert(obj_request->img_request == img_request); 1376bf0d5f50SAlex Elder rbd_obj_request_put(obj_request); 1377bf0d5f50SAlex Elder } 1378bf0d5f50SAlex Elder 1379980917fcSIlya Dryomov static void rbd_obj_request_submit(struct rbd_obj_request *obj_request) 1380bf0d5f50SAlex Elder { 1381980917fcSIlya Dryomov struct ceph_osd_request *osd_req = obj_request->osd_req; 1382980917fcSIlya Dryomov 1383a90bb0c1SIlya Dryomov dout("%s %p object_no %016llx %llu~%llu osd_req %p\n", __func__, 138443df3d35SIlya Dryomov obj_request, obj_request->ex.oe_objno, obj_request->ex.oe_off, 138543df3d35SIlya Dryomov obj_request->ex.oe_len, osd_req); 1386980917fcSIlya Dryomov ceph_osdc_start_request(osd_req->r_osdc, osd_req, false); 1387bf0d5f50SAlex Elder } 1388bf0d5f50SAlex Elder 13890c425248SAlex Elder /* 13900c425248SAlex Elder * The default/initial value for all image request flags is 0. Each 13910c425248SAlex Elder * is conditionally set to 1 at image request initialization time 13920c425248SAlex Elder * and currently never change thereafter. 13930c425248SAlex Elder */ 1394d0b2e944SAlex Elder static void img_request_layered_set(struct rbd_img_request *img_request) 1395d0b2e944SAlex Elder { 1396d0b2e944SAlex Elder set_bit(IMG_REQ_LAYERED, &img_request->flags); 1397d0b2e944SAlex Elder smp_mb(); 1398d0b2e944SAlex Elder } 1399d0b2e944SAlex Elder 1400a2acd00eSAlex Elder static void img_request_layered_clear(struct rbd_img_request *img_request) 1401a2acd00eSAlex Elder { 1402a2acd00eSAlex Elder clear_bit(IMG_REQ_LAYERED, &img_request->flags); 1403a2acd00eSAlex Elder smp_mb(); 1404a2acd00eSAlex Elder } 1405a2acd00eSAlex Elder 1406d0b2e944SAlex Elder static bool img_request_layered_test(struct rbd_img_request *img_request) 1407d0b2e944SAlex Elder { 1408d0b2e944SAlex Elder smp_mb(); 1409d0b2e944SAlex Elder return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0; 1410d0b2e944SAlex Elder } 1411d0b2e944SAlex Elder 14123da691bfSIlya Dryomov static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req) 14133b434a2aSJosh Durgin { 14143da691bfSIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 14153da691bfSIlya Dryomov 141643df3d35SIlya Dryomov return !obj_req->ex.oe_off && 141743df3d35SIlya Dryomov obj_req->ex.oe_len == rbd_dev->layout.object_size; 14183b434a2aSJosh Durgin } 14193b434a2aSJosh Durgin 14203da691bfSIlya Dryomov static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req) 14216e2a4505SAlex Elder { 14223da691bfSIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 1423b9434c5bSAlex Elder 142443df3d35SIlya Dryomov return obj_req->ex.oe_off + obj_req->ex.oe_len == 14253da691bfSIlya Dryomov rbd_dev->layout.object_size; 14266e2a4505SAlex Elder } 14276e2a4505SAlex Elder 142813488d53SIlya Dryomov /* 142913488d53SIlya Dryomov * Must be called after rbd_obj_calc_img_extents(). 143013488d53SIlya Dryomov */ 143113488d53SIlya Dryomov static bool rbd_obj_copyup_enabled(struct rbd_obj_request *obj_req) 143213488d53SIlya Dryomov { 143313488d53SIlya Dryomov if (!obj_req->num_img_extents || 143413488d53SIlya Dryomov rbd_obj_is_entire(obj_req)) 143513488d53SIlya Dryomov return false; 143613488d53SIlya Dryomov 143713488d53SIlya Dryomov return true; 143813488d53SIlya Dryomov } 143913488d53SIlya Dryomov 144086bd7998SIlya Dryomov static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req) 1441bf0d5f50SAlex Elder { 144286bd7998SIlya Dryomov return ceph_file_extents_bytes(obj_req->img_extents, 144386bd7998SIlya Dryomov obj_req->num_img_extents); 1444bf0d5f50SAlex Elder } 1445bf0d5f50SAlex Elder 14463da691bfSIlya Dryomov static bool rbd_img_is_write(struct rbd_img_request *img_req) 14470dcc685eSIlya Dryomov { 14489bb0248dSIlya Dryomov switch (img_req->op_type) { 14493da691bfSIlya Dryomov case OBJ_OP_READ: 14503da691bfSIlya Dryomov return false; 14513da691bfSIlya Dryomov case OBJ_OP_WRITE: 14523da691bfSIlya Dryomov case OBJ_OP_DISCARD: 14536484cbe9SIlya Dryomov case OBJ_OP_ZEROOUT: 14543da691bfSIlya Dryomov return true; 14553da691bfSIlya Dryomov default: 1456c6244b3bSArnd Bergmann BUG(); 14570dcc685eSIlya Dryomov } 14580dcc685eSIlya Dryomov } 14590dcc685eSIlya Dryomov 14603da691bfSIlya Dryomov static void rbd_obj_handle_request(struct rbd_obj_request *obj_req); 14612761713dSIlya Dryomov 146285e084feSIlya Dryomov static void rbd_osd_req_callback(struct ceph_osd_request *osd_req) 1463bf0d5f50SAlex Elder { 14643da691bfSIlya Dryomov struct rbd_obj_request *obj_req = osd_req->r_priv; 1465bf0d5f50SAlex Elder 14663da691bfSIlya Dryomov dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req, 14673da691bfSIlya Dryomov osd_req->r_result, obj_req); 14683da691bfSIlya Dryomov rbd_assert(osd_req == obj_req->osd_req); 1469bf0d5f50SAlex Elder 14703da691bfSIlya Dryomov obj_req->result = osd_req->r_result < 0 ? osd_req->r_result : 0; 14713da691bfSIlya Dryomov if (!obj_req->result && !rbd_img_is_write(obj_req->img_request)) 14723da691bfSIlya Dryomov obj_req->xferred = osd_req->r_result; 14733da691bfSIlya Dryomov else 1474c47f9371SAlex Elder /* 14753da691bfSIlya Dryomov * Writes aren't allowed to return a data payload. In some 14763da691bfSIlya Dryomov * guarded write cases (e.g. stat + zero on an empty object) 14773da691bfSIlya Dryomov * a stat response makes it through, but we don't care. 1478c47f9371SAlex Elder */ 14793da691bfSIlya Dryomov obj_req->xferred = 0; 14800ccd5926SIlya Dryomov 14813da691bfSIlya Dryomov rbd_obj_handle_request(obj_req); 1482bf0d5f50SAlex Elder } 1483bf0d5f50SAlex Elder 14849d4df01fSAlex Elder static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request) 1485430c28c3SAlex Elder { 14868c042b0dSAlex Elder struct ceph_osd_request *osd_req = obj_request->osd_req; 1487430c28c3SAlex Elder 1488a162b308SIlya Dryomov osd_req->r_flags = CEPH_OSD_FLAG_READ; 14897c84883aSIlya Dryomov osd_req->r_snapid = obj_request->img_request->snap_id; 14909d4df01fSAlex Elder } 14919d4df01fSAlex Elder 14929d4df01fSAlex Elder static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) 14939d4df01fSAlex Elder { 14949d4df01fSAlex Elder struct ceph_osd_request *osd_req = obj_request->osd_req; 14959d4df01fSAlex Elder 1496a162b308SIlya Dryomov osd_req->r_flags = CEPH_OSD_FLAG_WRITE; 1497fac02ddfSArnd Bergmann ktime_get_real_ts64(&osd_req->r_mtime); 149843df3d35SIlya Dryomov osd_req->r_data_offset = obj_request->ex.oe_off; 1499430c28c3SAlex Elder } 1500430c28c3SAlex Elder 1501bc81207eSIlya Dryomov static struct ceph_osd_request * 1502e28eded5SIlya Dryomov __rbd_osd_req_create(struct rbd_obj_request *obj_req, 1503e28eded5SIlya Dryomov struct ceph_snap_context *snapc, unsigned int num_ops) 1504bc81207eSIlya Dryomov { 1505e28eded5SIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 1506bc81207eSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 1507bc81207eSIlya Dryomov struct ceph_osd_request *req; 1508a90bb0c1SIlya Dryomov const char *name_format = rbd_dev->image_format == 1 ? 1509a90bb0c1SIlya Dryomov RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT; 1510bc81207eSIlya Dryomov 1511e28eded5SIlya Dryomov req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO); 1512bc81207eSIlya Dryomov if (!req) 1513bc81207eSIlya Dryomov return NULL; 1514bc81207eSIlya Dryomov 1515bc81207eSIlya Dryomov req->r_callback = rbd_osd_req_callback; 1516a162b308SIlya Dryomov req->r_priv = obj_req; 1517bc81207eSIlya Dryomov 1518b26c047bSIlya Dryomov /* 1519b26c047bSIlya Dryomov * Data objects may be stored in a separate pool, but always in 1520b26c047bSIlya Dryomov * the same namespace in that pool as the header in its pool. 1521b26c047bSIlya Dryomov */ 1522b26c047bSIlya Dryomov ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc); 1523bc81207eSIlya Dryomov req->r_base_oloc.pool = rbd_dev->layout.pool_id; 1524b26c047bSIlya Dryomov 1525a90bb0c1SIlya Dryomov if (ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format, 152643df3d35SIlya Dryomov rbd_dev->header.object_prefix, obj_req->ex.oe_objno)) 1527bc81207eSIlya Dryomov goto err_req; 1528bc81207eSIlya Dryomov 1529bc81207eSIlya Dryomov return req; 1530bc81207eSIlya Dryomov 1531bc81207eSIlya Dryomov err_req: 1532bc81207eSIlya Dryomov ceph_osdc_put_request(req); 1533bc81207eSIlya Dryomov return NULL; 1534bc81207eSIlya Dryomov } 1535bc81207eSIlya Dryomov 1536e28eded5SIlya Dryomov static struct ceph_osd_request * 1537e28eded5SIlya Dryomov rbd_osd_req_create(struct rbd_obj_request *obj_req, unsigned int num_ops) 1538e28eded5SIlya Dryomov { 1539e28eded5SIlya Dryomov return __rbd_osd_req_create(obj_req, obj_req->img_request->snapc, 1540e28eded5SIlya Dryomov num_ops); 1541e28eded5SIlya Dryomov } 1542e28eded5SIlya Dryomov 1543bf0d5f50SAlex Elder static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req) 1544bf0d5f50SAlex Elder { 1545bf0d5f50SAlex Elder ceph_osdc_put_request(osd_req); 1546bf0d5f50SAlex Elder } 1547bf0d5f50SAlex Elder 1548ecc633caSIlya Dryomov static struct rbd_obj_request *rbd_obj_request_create(void) 1549bf0d5f50SAlex Elder { 1550bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 1551bf0d5f50SAlex Elder 15525a60e876SIlya Dryomov obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO); 15536c696d85SIlya Dryomov if (!obj_request) 1554f907ad55SAlex Elder return NULL; 1555f907ad55SAlex Elder 155643df3d35SIlya Dryomov ceph_object_extent_init(&obj_request->ex); 1557bf0d5f50SAlex Elder kref_init(&obj_request->kref); 1558bf0d5f50SAlex Elder 155967e2b652SIlya Dryomov dout("%s %p\n", __func__, obj_request); 1560bf0d5f50SAlex Elder return obj_request; 1561bf0d5f50SAlex Elder } 1562bf0d5f50SAlex Elder 1563bf0d5f50SAlex Elder static void rbd_obj_request_destroy(struct kref *kref) 1564bf0d5f50SAlex Elder { 1565bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 15667e07efb1SIlya Dryomov u32 i; 1567bf0d5f50SAlex Elder 1568bf0d5f50SAlex Elder obj_request = container_of(kref, struct rbd_obj_request, kref); 1569bf0d5f50SAlex Elder 157037206ee5SAlex Elder dout("%s: obj %p\n", __func__, obj_request); 157137206ee5SAlex Elder 1572bf0d5f50SAlex Elder if (obj_request->osd_req) 1573bf0d5f50SAlex Elder rbd_osd_req_destroy(obj_request->osd_req); 1574bf0d5f50SAlex Elder 1575ecc633caSIlya Dryomov switch (obj_request->img_request->data_type) { 15769969ebc5SAlex Elder case OBJ_REQUEST_NODATA: 1577bf0d5f50SAlex Elder case OBJ_REQUEST_BIO: 15787e07efb1SIlya Dryomov case OBJ_REQUEST_BVECS: 15795359a17dSIlya Dryomov break; /* Nothing to do */ 1580afb97888SIlya Dryomov case OBJ_REQUEST_OWN_BVECS: 1581afb97888SIlya Dryomov kfree(obj_request->bvec_pos.bvecs); 1582bf0d5f50SAlex Elder break; 15837e07efb1SIlya Dryomov default: 15847e07efb1SIlya Dryomov rbd_assert(0); 1585bf0d5f50SAlex Elder } 1586bf0d5f50SAlex Elder 158786bd7998SIlya Dryomov kfree(obj_request->img_extents); 15887e07efb1SIlya Dryomov if (obj_request->copyup_bvecs) { 15897e07efb1SIlya Dryomov for (i = 0; i < obj_request->copyup_bvec_count; i++) { 15907e07efb1SIlya Dryomov if (obj_request->copyup_bvecs[i].bv_page) 15917e07efb1SIlya Dryomov __free_page(obj_request->copyup_bvecs[i].bv_page); 15927e07efb1SIlya Dryomov } 15937e07efb1SIlya Dryomov kfree(obj_request->copyup_bvecs); 1594bf0d5f50SAlex Elder } 1595bf0d5f50SAlex Elder 1596868311b1SAlex Elder kmem_cache_free(rbd_obj_request_cache, obj_request); 1597bf0d5f50SAlex Elder } 1598bf0d5f50SAlex Elder 1599fb65d228SAlex Elder /* It's OK to call this for a device with no parent */ 1600fb65d228SAlex Elder 1601fb65d228SAlex Elder static void rbd_spec_put(struct rbd_spec *spec); 1602fb65d228SAlex Elder static void rbd_dev_unparent(struct rbd_device *rbd_dev) 1603fb65d228SAlex Elder { 1604fb65d228SAlex Elder rbd_dev_remove_parent(rbd_dev); 1605fb65d228SAlex Elder rbd_spec_put(rbd_dev->parent_spec); 1606fb65d228SAlex Elder rbd_dev->parent_spec = NULL; 1607fb65d228SAlex Elder rbd_dev->parent_overlap = 0; 1608fb65d228SAlex Elder } 1609fb65d228SAlex Elder 1610bf0d5f50SAlex Elder /* 1611a2acd00eSAlex Elder * Parent image reference counting is used to determine when an 1612a2acd00eSAlex Elder * image's parent fields can be safely torn down--after there are no 1613a2acd00eSAlex Elder * more in-flight requests to the parent image. When the last 1614a2acd00eSAlex Elder * reference is dropped, cleaning them up is safe. 1615a2acd00eSAlex Elder */ 1616a2acd00eSAlex Elder static void rbd_dev_parent_put(struct rbd_device *rbd_dev) 1617a2acd00eSAlex Elder { 1618a2acd00eSAlex Elder int counter; 1619a2acd00eSAlex Elder 1620a2acd00eSAlex Elder if (!rbd_dev->parent_spec) 1621a2acd00eSAlex Elder return; 1622a2acd00eSAlex Elder 1623a2acd00eSAlex Elder counter = atomic_dec_return_safe(&rbd_dev->parent_ref); 1624a2acd00eSAlex Elder if (counter > 0) 1625a2acd00eSAlex Elder return; 1626a2acd00eSAlex Elder 1627a2acd00eSAlex Elder /* Last reference; clean up parent data structures */ 1628a2acd00eSAlex Elder 1629a2acd00eSAlex Elder if (!counter) 1630a2acd00eSAlex Elder rbd_dev_unparent(rbd_dev); 1631a2acd00eSAlex Elder else 16329584d508SIlya Dryomov rbd_warn(rbd_dev, "parent reference underflow"); 1633a2acd00eSAlex Elder } 1634a2acd00eSAlex Elder 1635a2acd00eSAlex Elder /* 1636a2acd00eSAlex Elder * If an image has a non-zero parent overlap, get a reference to its 1637a2acd00eSAlex Elder * parent. 1638a2acd00eSAlex Elder * 1639a2acd00eSAlex Elder * Returns true if the rbd device has a parent with a non-zero 1640a2acd00eSAlex Elder * overlap and a reference for it was successfully taken, or 1641a2acd00eSAlex Elder * false otherwise. 1642a2acd00eSAlex Elder */ 1643a2acd00eSAlex Elder static bool rbd_dev_parent_get(struct rbd_device *rbd_dev) 1644a2acd00eSAlex Elder { 1645ae43e9d0SIlya Dryomov int counter = 0; 1646a2acd00eSAlex Elder 1647a2acd00eSAlex Elder if (!rbd_dev->parent_spec) 1648a2acd00eSAlex Elder return false; 1649a2acd00eSAlex Elder 1650ae43e9d0SIlya Dryomov down_read(&rbd_dev->header_rwsem); 1651ae43e9d0SIlya Dryomov if (rbd_dev->parent_overlap) 1652a2acd00eSAlex Elder counter = atomic_inc_return_safe(&rbd_dev->parent_ref); 1653ae43e9d0SIlya Dryomov up_read(&rbd_dev->header_rwsem); 1654a2acd00eSAlex Elder 1655a2acd00eSAlex Elder if (counter < 0) 16569584d508SIlya Dryomov rbd_warn(rbd_dev, "parent reference overflow"); 1657a2acd00eSAlex Elder 1658ae43e9d0SIlya Dryomov return counter > 0; 1659a2acd00eSAlex Elder } 1660a2acd00eSAlex Elder 1661bf0d5f50SAlex Elder /* 1662bf0d5f50SAlex Elder * Caller is responsible for filling in the list of object requests 1663bf0d5f50SAlex Elder * that comprises the image request, and the Linux request pointer 1664bf0d5f50SAlex Elder * (if there is one). 1665bf0d5f50SAlex Elder */ 1666cc344fa1SAlex Elder static struct rbd_img_request *rbd_img_request_create( 1667cc344fa1SAlex Elder struct rbd_device *rbd_dev, 16686d2940c8SGuangliang Zhao enum obj_operation_type op_type, 16694e752f0aSJosh Durgin struct ceph_snap_context *snapc) 1670bf0d5f50SAlex Elder { 1671bf0d5f50SAlex Elder struct rbd_img_request *img_request; 1672bf0d5f50SAlex Elder 1673a0c5895bSIlya Dryomov img_request = kmem_cache_zalloc(rbd_img_request_cache, GFP_NOIO); 1674bf0d5f50SAlex Elder if (!img_request) 1675bf0d5f50SAlex Elder return NULL; 1676bf0d5f50SAlex Elder 1677bf0d5f50SAlex Elder img_request->rbd_dev = rbd_dev; 16789bb0248dSIlya Dryomov img_request->op_type = op_type; 16799bb0248dSIlya Dryomov if (!rbd_img_is_write(img_request)) 1680bf0d5f50SAlex Elder img_request->snap_id = rbd_dev->spec->snap_id; 16819bb0248dSIlya Dryomov else 16829bb0248dSIlya Dryomov img_request->snapc = snapc; 16839bb0248dSIlya Dryomov 1684a2acd00eSAlex Elder if (rbd_dev_parent_get(rbd_dev)) 1685d0b2e944SAlex Elder img_request_layered_set(img_request); 1686a0c5895bSIlya Dryomov 1687bf0d5f50SAlex Elder spin_lock_init(&img_request->completion_lock); 168843df3d35SIlya Dryomov INIT_LIST_HEAD(&img_request->object_extents); 1689bf0d5f50SAlex Elder kref_init(&img_request->kref); 1690bf0d5f50SAlex Elder 1691dfd9875fSIlya Dryomov dout("%s: rbd_dev %p %s -> img %p\n", __func__, rbd_dev, 1692dfd9875fSIlya Dryomov obj_op_name(op_type), img_request); 1693bf0d5f50SAlex Elder return img_request; 1694bf0d5f50SAlex Elder } 1695bf0d5f50SAlex Elder 1696bf0d5f50SAlex Elder static void rbd_img_request_destroy(struct kref *kref) 1697bf0d5f50SAlex Elder { 1698bf0d5f50SAlex Elder struct rbd_img_request *img_request; 1699bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 1700bf0d5f50SAlex Elder struct rbd_obj_request *next_obj_request; 1701bf0d5f50SAlex Elder 1702bf0d5f50SAlex Elder img_request = container_of(kref, struct rbd_img_request, kref); 1703bf0d5f50SAlex Elder 170437206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 170537206ee5SAlex Elder 1706bf0d5f50SAlex Elder for_each_obj_request_safe(img_request, obj_request, next_obj_request) 1707bf0d5f50SAlex Elder rbd_img_obj_request_del(img_request, obj_request); 1708bf0d5f50SAlex Elder 1709a2acd00eSAlex Elder if (img_request_layered_test(img_request)) { 1710a2acd00eSAlex Elder img_request_layered_clear(img_request); 1711a2acd00eSAlex Elder rbd_dev_parent_put(img_request->rbd_dev); 1712a2acd00eSAlex Elder } 1713a2acd00eSAlex Elder 17149bb0248dSIlya Dryomov if (rbd_img_is_write(img_request)) 1715812164f8SAlex Elder ceph_put_snap_context(img_request->snapc); 1716bf0d5f50SAlex Elder 17171c2a9dfeSAlex Elder kmem_cache_free(rbd_img_request_cache, img_request); 1718bf0d5f50SAlex Elder } 1719bf0d5f50SAlex Elder 172086bd7998SIlya Dryomov static void prune_extents(struct ceph_file_extent *img_extents, 172186bd7998SIlya Dryomov u32 *num_img_extents, u64 overlap) 1722e93f3152SAlex Elder { 172386bd7998SIlya Dryomov u32 cnt = *num_img_extents; 1724e93f3152SAlex Elder 172586bd7998SIlya Dryomov /* drop extents completely beyond the overlap */ 172686bd7998SIlya Dryomov while (cnt && img_extents[cnt - 1].fe_off >= overlap) 172786bd7998SIlya Dryomov cnt--; 1728e93f3152SAlex Elder 172986bd7998SIlya Dryomov if (cnt) { 173086bd7998SIlya Dryomov struct ceph_file_extent *ex = &img_extents[cnt - 1]; 1731e93f3152SAlex Elder 173286bd7998SIlya Dryomov /* trim final overlapping extent */ 173386bd7998SIlya Dryomov if (ex->fe_off + ex->fe_len > overlap) 173486bd7998SIlya Dryomov ex->fe_len = overlap - ex->fe_off; 1735e93f3152SAlex Elder } 1736e93f3152SAlex Elder 173786bd7998SIlya Dryomov *num_img_extents = cnt; 173886bd7998SIlya Dryomov } 173986bd7998SIlya Dryomov 174086bd7998SIlya Dryomov /* 174186bd7998SIlya Dryomov * Determine the byte range(s) covered by either just the object extent 174286bd7998SIlya Dryomov * or the entire object in the parent image. 174386bd7998SIlya Dryomov */ 174486bd7998SIlya Dryomov static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req, 174586bd7998SIlya Dryomov bool entire) 1746e93f3152SAlex Elder { 174786bd7998SIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 1748c5b5ef6cSAlex Elder int ret; 1749c5b5ef6cSAlex Elder 175086bd7998SIlya Dryomov if (!rbd_dev->parent_overlap) 175186bd7998SIlya Dryomov return 0; 175286bd7998SIlya Dryomov 175386bd7998SIlya Dryomov ret = ceph_extent_to_file(&rbd_dev->layout, obj_req->ex.oe_objno, 175486bd7998SIlya Dryomov entire ? 0 : obj_req->ex.oe_off, 175586bd7998SIlya Dryomov entire ? rbd_dev->layout.object_size : 175686bd7998SIlya Dryomov obj_req->ex.oe_len, 175786bd7998SIlya Dryomov &obj_req->img_extents, 175886bd7998SIlya Dryomov &obj_req->num_img_extents); 175986bd7998SIlya Dryomov if (ret) 176086bd7998SIlya Dryomov return ret; 176186bd7998SIlya Dryomov 176286bd7998SIlya Dryomov prune_extents(obj_req->img_extents, &obj_req->num_img_extents, 176386bd7998SIlya Dryomov rbd_dev->parent_overlap); 176486bd7998SIlya Dryomov return 0; 176586bd7998SIlya Dryomov } 176686bd7998SIlya Dryomov 17673da691bfSIlya Dryomov static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which) 17683da691bfSIlya Dryomov { 1769ecc633caSIlya Dryomov switch (obj_req->img_request->data_type) { 17703da691bfSIlya Dryomov case OBJ_REQUEST_BIO: 17713da691bfSIlya Dryomov osd_req_op_extent_osd_data_bio(obj_req->osd_req, which, 17723da691bfSIlya Dryomov &obj_req->bio_pos, 177343df3d35SIlya Dryomov obj_req->ex.oe_len); 17743da691bfSIlya Dryomov break; 17753da691bfSIlya Dryomov case OBJ_REQUEST_BVECS: 1776afb97888SIlya Dryomov case OBJ_REQUEST_OWN_BVECS: 17773da691bfSIlya Dryomov rbd_assert(obj_req->bvec_pos.iter.bi_size == 177843df3d35SIlya Dryomov obj_req->ex.oe_len); 1779afb97888SIlya Dryomov rbd_assert(obj_req->bvec_idx == obj_req->bvec_count); 17803da691bfSIlya Dryomov osd_req_op_extent_osd_data_bvec_pos(obj_req->osd_req, which, 17813da691bfSIlya Dryomov &obj_req->bvec_pos); 17823da691bfSIlya Dryomov break; 17833da691bfSIlya Dryomov default: 17843da691bfSIlya Dryomov rbd_assert(0); 17853da691bfSIlya Dryomov } 17863da691bfSIlya Dryomov } 17873da691bfSIlya Dryomov 17883da691bfSIlya Dryomov static int rbd_obj_setup_read(struct rbd_obj_request *obj_req) 17893da691bfSIlya Dryomov { 1790e28eded5SIlya Dryomov obj_req->osd_req = __rbd_osd_req_create(obj_req, NULL, 1); 17913da691bfSIlya Dryomov if (!obj_req->osd_req) 1792710214e3SIlya Dryomov return -ENOMEM; 1793710214e3SIlya Dryomov 17943da691bfSIlya Dryomov osd_req_op_extent_init(obj_req->osd_req, 0, CEPH_OSD_OP_READ, 179543df3d35SIlya Dryomov obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0); 17963da691bfSIlya Dryomov rbd_osd_req_setup_data(obj_req, 0); 1797a90bb0c1SIlya Dryomov 17983da691bfSIlya Dryomov rbd_osd_req_format_read(obj_req); 17993da691bfSIlya Dryomov return 0; 1800710214e3SIlya Dryomov } 1801710214e3SIlya Dryomov 18023da691bfSIlya Dryomov static int __rbd_obj_setup_stat(struct rbd_obj_request *obj_req, 18033da691bfSIlya Dryomov unsigned int which) 18043da691bfSIlya Dryomov { 18053da691bfSIlya Dryomov struct page **pages; 18063da691bfSIlya Dryomov 1807c5b5ef6cSAlex Elder /* 1808c5b5ef6cSAlex Elder * The response data for a STAT call consists of: 1809c5b5ef6cSAlex Elder * le64 length; 1810c5b5ef6cSAlex Elder * struct { 1811c5b5ef6cSAlex Elder * le32 tv_sec; 1812c5b5ef6cSAlex Elder * le32 tv_nsec; 1813c5b5ef6cSAlex Elder * } mtime; 1814c5b5ef6cSAlex Elder */ 18153da691bfSIlya Dryomov pages = ceph_alloc_page_vector(1, GFP_NOIO); 18163da691bfSIlya Dryomov if (IS_ERR(pages)) 18173da691bfSIlya Dryomov return PTR_ERR(pages); 18183da691bfSIlya Dryomov 18193da691bfSIlya Dryomov osd_req_op_init(obj_req->osd_req, which, CEPH_OSD_OP_STAT, 0); 18203da691bfSIlya Dryomov osd_req_op_raw_data_in_pages(obj_req->osd_req, which, pages, 18213da691bfSIlya Dryomov 8 + sizeof(struct ceph_timespec), 18223da691bfSIlya Dryomov 0, false, true); 18233da691bfSIlya Dryomov return 0; 1824710214e3SIlya Dryomov } 1825c5b5ef6cSAlex Elder 182613488d53SIlya Dryomov static int count_write_ops(struct rbd_obj_request *obj_req) 182713488d53SIlya Dryomov { 182813488d53SIlya Dryomov return 2; /* setallochint + write/writefull */ 182913488d53SIlya Dryomov } 183013488d53SIlya Dryomov 18313da691bfSIlya Dryomov static void __rbd_obj_setup_write(struct rbd_obj_request *obj_req, 18323da691bfSIlya Dryomov unsigned int which) 18333da691bfSIlya Dryomov { 18343da691bfSIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 18353da691bfSIlya Dryomov u16 opcode; 1836c5b5ef6cSAlex Elder 18373da691bfSIlya Dryomov osd_req_op_alloc_hint_init(obj_req->osd_req, which++, 18383da691bfSIlya Dryomov rbd_dev->layout.object_size, 18393da691bfSIlya Dryomov rbd_dev->layout.object_size); 1840c5b5ef6cSAlex Elder 18413da691bfSIlya Dryomov if (rbd_obj_is_entire(obj_req)) 18423da691bfSIlya Dryomov opcode = CEPH_OSD_OP_WRITEFULL; 18433da691bfSIlya Dryomov else 18443da691bfSIlya Dryomov opcode = CEPH_OSD_OP_WRITE; 1845c5b5ef6cSAlex Elder 18463da691bfSIlya Dryomov osd_req_op_extent_init(obj_req->osd_req, which, opcode, 184743df3d35SIlya Dryomov obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0); 18483da691bfSIlya Dryomov rbd_osd_req_setup_data(obj_req, which++); 18493da691bfSIlya Dryomov 18503da691bfSIlya Dryomov rbd_assert(which == obj_req->osd_req->r_num_ops); 18513da691bfSIlya Dryomov rbd_osd_req_format_write(obj_req); 18523da691bfSIlya Dryomov } 18533da691bfSIlya Dryomov 18543da691bfSIlya Dryomov static int rbd_obj_setup_write(struct rbd_obj_request *obj_req) 18553da691bfSIlya Dryomov { 18563da691bfSIlya Dryomov unsigned int num_osd_ops, which = 0; 185713488d53SIlya Dryomov bool need_guard; 18583da691bfSIlya Dryomov int ret; 18593da691bfSIlya Dryomov 186086bd7998SIlya Dryomov /* reverse map the entire object onto the parent */ 186186bd7998SIlya Dryomov ret = rbd_obj_calc_img_extents(obj_req, true); 186286bd7998SIlya Dryomov if (ret) 186386bd7998SIlya Dryomov return ret; 186486bd7998SIlya Dryomov 186513488d53SIlya Dryomov need_guard = rbd_obj_copyup_enabled(obj_req); 186613488d53SIlya Dryomov num_osd_ops = need_guard + count_write_ops(obj_req); 18673da691bfSIlya Dryomov 1868a162b308SIlya Dryomov obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops); 18693da691bfSIlya Dryomov if (!obj_req->osd_req) 18703da691bfSIlya Dryomov return -ENOMEM; 18713da691bfSIlya Dryomov 187213488d53SIlya Dryomov if (need_guard) { 18733da691bfSIlya Dryomov ret = __rbd_obj_setup_stat(obj_req, which++); 18743da691bfSIlya Dryomov if (ret) 1875c5b5ef6cSAlex Elder return ret; 187613488d53SIlya Dryomov 187713488d53SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_GUARD; 187813488d53SIlya Dryomov } else { 187913488d53SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_FLAT; 1880c5b5ef6cSAlex Elder } 1881c5b5ef6cSAlex Elder 18823da691bfSIlya Dryomov __rbd_obj_setup_write(obj_req, which); 18833da691bfSIlya Dryomov return 0; 188470d045f6SIlya Dryomov } 188570d045f6SIlya Dryomov 18866484cbe9SIlya Dryomov static u16 truncate_or_zero_opcode(struct rbd_obj_request *obj_req) 18876484cbe9SIlya Dryomov { 18886484cbe9SIlya Dryomov return rbd_obj_is_tail(obj_req) ? CEPH_OSD_OP_TRUNCATE : 18896484cbe9SIlya Dryomov CEPH_OSD_OP_ZERO; 18906484cbe9SIlya Dryomov } 18916484cbe9SIlya Dryomov 18926484cbe9SIlya Dryomov static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req) 18936484cbe9SIlya Dryomov { 18940c93e1b7SIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 18950c93e1b7SIlya Dryomov u64 off = obj_req->ex.oe_off; 18960c93e1b7SIlya Dryomov u64 next_off = obj_req->ex.oe_off + obj_req->ex.oe_len; 18976484cbe9SIlya Dryomov int ret; 18986484cbe9SIlya Dryomov 18990c93e1b7SIlya Dryomov /* 19000c93e1b7SIlya Dryomov * Align the range to alloc_size boundary and punt on discards 19010c93e1b7SIlya Dryomov * that are too small to free up any space. 19020c93e1b7SIlya Dryomov * 19030c93e1b7SIlya Dryomov * alloc_size == object_size && is_tail() is a special case for 19040c93e1b7SIlya Dryomov * filestore with filestore_punch_hole = false, needed to allow 19050c93e1b7SIlya Dryomov * truncate (in addition to delete). 19060c93e1b7SIlya Dryomov */ 19070c93e1b7SIlya Dryomov if (rbd_dev->opts->alloc_size != rbd_dev->layout.object_size || 19080c93e1b7SIlya Dryomov !rbd_obj_is_tail(obj_req)) { 19090c93e1b7SIlya Dryomov off = round_up(off, rbd_dev->opts->alloc_size); 19100c93e1b7SIlya Dryomov next_off = round_down(next_off, rbd_dev->opts->alloc_size); 19110c93e1b7SIlya Dryomov if (off >= next_off) 19120c93e1b7SIlya Dryomov return 1; 19130c93e1b7SIlya Dryomov } 19140c93e1b7SIlya Dryomov 19156484cbe9SIlya Dryomov /* reverse map the entire object onto the parent */ 19166484cbe9SIlya Dryomov ret = rbd_obj_calc_img_extents(obj_req, true); 19176484cbe9SIlya Dryomov if (ret) 19186484cbe9SIlya Dryomov return ret; 19196484cbe9SIlya Dryomov 19206484cbe9SIlya Dryomov obj_req->osd_req = rbd_osd_req_create(obj_req, 1); 19216484cbe9SIlya Dryomov if (!obj_req->osd_req) 19226484cbe9SIlya Dryomov return -ENOMEM; 19236484cbe9SIlya Dryomov 19246484cbe9SIlya Dryomov if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) { 19256484cbe9SIlya Dryomov osd_req_op_init(obj_req->osd_req, 0, CEPH_OSD_OP_DELETE, 0); 19266484cbe9SIlya Dryomov } else { 19270c93e1b7SIlya Dryomov dout("%s %p %llu~%llu -> %llu~%llu\n", __func__, 19280c93e1b7SIlya Dryomov obj_req, obj_req->ex.oe_off, obj_req->ex.oe_len, 19290c93e1b7SIlya Dryomov off, next_off - off); 19306484cbe9SIlya Dryomov osd_req_op_extent_init(obj_req->osd_req, 0, 19316484cbe9SIlya Dryomov truncate_or_zero_opcode(obj_req), 19320c93e1b7SIlya Dryomov off, next_off - off, 0, 0); 19336484cbe9SIlya Dryomov } 19346484cbe9SIlya Dryomov 19356484cbe9SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_FLAT; 19366484cbe9SIlya Dryomov rbd_osd_req_format_write(obj_req); 19376484cbe9SIlya Dryomov return 0; 19386484cbe9SIlya Dryomov } 19396484cbe9SIlya Dryomov 194013488d53SIlya Dryomov static int count_zeroout_ops(struct rbd_obj_request *obj_req) 194113488d53SIlya Dryomov { 194213488d53SIlya Dryomov int num_osd_ops; 194313488d53SIlya Dryomov 194413488d53SIlya Dryomov if (rbd_obj_is_entire(obj_req) && obj_req->num_img_extents) 194513488d53SIlya Dryomov num_osd_ops = 2; /* create + truncate */ 194613488d53SIlya Dryomov else 194713488d53SIlya Dryomov num_osd_ops = 1; /* delete/truncate/zero */ 194813488d53SIlya Dryomov 194913488d53SIlya Dryomov return num_osd_ops; 195013488d53SIlya Dryomov } 195113488d53SIlya Dryomov 19526484cbe9SIlya Dryomov static void __rbd_obj_setup_zeroout(struct rbd_obj_request *obj_req, 19533da691bfSIlya Dryomov unsigned int which) 195470d045f6SIlya Dryomov { 19553da691bfSIlya Dryomov u16 opcode; 1956058aa991SIlya Dryomov 19573da691bfSIlya Dryomov if (rbd_obj_is_entire(obj_req)) { 195886bd7998SIlya Dryomov if (obj_req->num_img_extents) { 19592bb1e56eSIlya Dryomov osd_req_op_init(obj_req->osd_req, which++, 19602bb1e56eSIlya Dryomov CEPH_OSD_OP_CREATE, 0); 19613da691bfSIlya Dryomov opcode = CEPH_OSD_OP_TRUNCATE; 19623da691bfSIlya Dryomov } else { 19633da691bfSIlya Dryomov osd_req_op_init(obj_req->osd_req, which++, 19643da691bfSIlya Dryomov CEPH_OSD_OP_DELETE, 0); 19653da691bfSIlya Dryomov opcode = 0; 19663da691bfSIlya Dryomov } 19673da691bfSIlya Dryomov } else { 19686484cbe9SIlya Dryomov opcode = truncate_or_zero_opcode(obj_req); 19693da691bfSIlya Dryomov } 19703da691bfSIlya Dryomov 19713da691bfSIlya Dryomov if (opcode) 19723da691bfSIlya Dryomov osd_req_op_extent_init(obj_req->osd_req, which++, opcode, 197343df3d35SIlya Dryomov obj_req->ex.oe_off, obj_req->ex.oe_len, 19743da691bfSIlya Dryomov 0, 0); 19753da691bfSIlya Dryomov 19763da691bfSIlya Dryomov rbd_assert(which == obj_req->osd_req->r_num_ops); 19773da691bfSIlya Dryomov rbd_osd_req_format_write(obj_req); 19783da691bfSIlya Dryomov } 19793da691bfSIlya Dryomov 19806484cbe9SIlya Dryomov static int rbd_obj_setup_zeroout(struct rbd_obj_request *obj_req) 19813da691bfSIlya Dryomov { 19823da691bfSIlya Dryomov unsigned int num_osd_ops, which = 0; 198313488d53SIlya Dryomov bool need_guard; 19843da691bfSIlya Dryomov int ret; 19853da691bfSIlya Dryomov 198686bd7998SIlya Dryomov /* reverse map the entire object onto the parent */ 198786bd7998SIlya Dryomov ret = rbd_obj_calc_img_extents(obj_req, true); 198886bd7998SIlya Dryomov if (ret) 198986bd7998SIlya Dryomov return ret; 199086bd7998SIlya Dryomov 199113488d53SIlya Dryomov need_guard = rbd_obj_copyup_enabled(obj_req); 199213488d53SIlya Dryomov num_osd_ops = need_guard + count_zeroout_ops(obj_req); 19933da691bfSIlya Dryomov 1994a162b308SIlya Dryomov obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops); 19953da691bfSIlya Dryomov if (!obj_req->osd_req) 19963da691bfSIlya Dryomov return -ENOMEM; 19973da691bfSIlya Dryomov 199813488d53SIlya Dryomov if (need_guard) { 19993da691bfSIlya Dryomov ret = __rbd_obj_setup_stat(obj_req, which++); 20003da691bfSIlya Dryomov if (ret) 20013da691bfSIlya Dryomov return ret; 200213488d53SIlya Dryomov 200313488d53SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_GUARD; 200413488d53SIlya Dryomov } else { 200513488d53SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_FLAT; 20063da691bfSIlya Dryomov } 20073da691bfSIlya Dryomov 20086484cbe9SIlya Dryomov __rbd_obj_setup_zeroout(obj_req, which); 2009980917fcSIlya Dryomov return 0; 2010b454e36dSAlex Elder } 2011b454e36dSAlex Elder 2012b454e36dSAlex Elder /* 20133da691bfSIlya Dryomov * For each object request in @img_req, allocate an OSD request, add 20143da691bfSIlya Dryomov * individual OSD ops and prepare them for submission. The number of 20153da691bfSIlya Dryomov * OSD ops depends on op_type and the overlap point (if any). 2016b454e36dSAlex Elder */ 20173da691bfSIlya Dryomov static int __rbd_img_fill_request(struct rbd_img_request *img_req) 20183da691bfSIlya Dryomov { 20190c93e1b7SIlya Dryomov struct rbd_obj_request *obj_req, *next_obj_req; 20203da691bfSIlya Dryomov int ret; 20213d7efd18SAlex Elder 20220c93e1b7SIlya Dryomov for_each_obj_request_safe(img_req, obj_req, next_obj_req) { 20239bb0248dSIlya Dryomov switch (img_req->op_type) { 20243da691bfSIlya Dryomov case OBJ_OP_READ: 20253da691bfSIlya Dryomov ret = rbd_obj_setup_read(obj_req); 20263da691bfSIlya Dryomov break; 20273da691bfSIlya Dryomov case OBJ_OP_WRITE: 20283da691bfSIlya Dryomov ret = rbd_obj_setup_write(obj_req); 20293da691bfSIlya Dryomov break; 20303da691bfSIlya Dryomov case OBJ_OP_DISCARD: 20313da691bfSIlya Dryomov ret = rbd_obj_setup_discard(obj_req); 20323da691bfSIlya Dryomov break; 20336484cbe9SIlya Dryomov case OBJ_OP_ZEROOUT: 20346484cbe9SIlya Dryomov ret = rbd_obj_setup_zeroout(obj_req); 20356484cbe9SIlya Dryomov break; 20363da691bfSIlya Dryomov default: 20373da691bfSIlya Dryomov rbd_assert(0); 20383da691bfSIlya Dryomov } 20390c93e1b7SIlya Dryomov if (ret < 0) 20403da691bfSIlya Dryomov return ret; 20410c93e1b7SIlya Dryomov if (ret > 0) { 20420c93e1b7SIlya Dryomov img_req->xferred += obj_req->ex.oe_len; 20430c93e1b7SIlya Dryomov img_req->pending_count--; 20440c93e1b7SIlya Dryomov rbd_img_obj_request_del(img_req, obj_req); 20450c93e1b7SIlya Dryomov continue; 20460c93e1b7SIlya Dryomov } 204726f887e0SIlya Dryomov 204826f887e0SIlya Dryomov ret = ceph_osdc_alloc_messages(obj_req->osd_req, GFP_NOIO); 204926f887e0SIlya Dryomov if (ret) 205026f887e0SIlya Dryomov return ret; 2051b454e36dSAlex Elder } 2052b454e36dSAlex Elder 20533da691bfSIlya Dryomov return 0; 20543da691bfSIlya Dryomov } 20553da691bfSIlya Dryomov 20565a237819SIlya Dryomov union rbd_img_fill_iter { 20575a237819SIlya Dryomov struct ceph_bio_iter bio_iter; 20585a237819SIlya Dryomov struct ceph_bvec_iter bvec_iter; 20595a237819SIlya Dryomov }; 20605a237819SIlya Dryomov 20615a237819SIlya Dryomov struct rbd_img_fill_ctx { 20625a237819SIlya Dryomov enum obj_request_type pos_type; 20635a237819SIlya Dryomov union rbd_img_fill_iter *pos; 20645a237819SIlya Dryomov union rbd_img_fill_iter iter; 20655a237819SIlya Dryomov ceph_object_extent_fn_t set_pos_fn; 2066afb97888SIlya Dryomov ceph_object_extent_fn_t count_fn; 2067afb97888SIlya Dryomov ceph_object_extent_fn_t copy_fn; 20685a237819SIlya Dryomov }; 20695a237819SIlya Dryomov 20705a237819SIlya Dryomov static struct ceph_object_extent *alloc_object_extent(void *arg) 20715a237819SIlya Dryomov { 20725a237819SIlya Dryomov struct rbd_img_request *img_req = arg; 20735a237819SIlya Dryomov struct rbd_obj_request *obj_req; 20745a237819SIlya Dryomov 20755a237819SIlya Dryomov obj_req = rbd_obj_request_create(); 20765a237819SIlya Dryomov if (!obj_req) 20775a237819SIlya Dryomov return NULL; 20785a237819SIlya Dryomov 20795a237819SIlya Dryomov rbd_img_obj_request_add(img_req, obj_req); 20805a237819SIlya Dryomov return &obj_req->ex; 20815a237819SIlya Dryomov } 20825a237819SIlya Dryomov 20835a237819SIlya Dryomov /* 2084afb97888SIlya Dryomov * While su != os && sc == 1 is technically not fancy (it's the same 2085afb97888SIlya Dryomov * layout as su == os && sc == 1), we can't use the nocopy path for it 2086afb97888SIlya Dryomov * because ->set_pos_fn() should be called only once per object. 2087afb97888SIlya Dryomov * ceph_file_to_extents() invokes action_fn once per stripe unit, so 2088afb97888SIlya Dryomov * treat su != os && sc == 1 as fancy. 20895a237819SIlya Dryomov */ 2090afb97888SIlya Dryomov static bool rbd_layout_is_fancy(struct ceph_file_layout *l) 2091afb97888SIlya Dryomov { 2092afb97888SIlya Dryomov return l->stripe_unit != l->object_size; 2093afb97888SIlya Dryomov } 2094afb97888SIlya Dryomov 2095afb97888SIlya Dryomov static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req, 20965a237819SIlya Dryomov struct ceph_file_extent *img_extents, 20975a237819SIlya Dryomov u32 num_img_extents, 20985a237819SIlya Dryomov struct rbd_img_fill_ctx *fctx) 20995a237819SIlya Dryomov { 21005a237819SIlya Dryomov u32 i; 21015a237819SIlya Dryomov int ret; 21025a237819SIlya Dryomov 21035a237819SIlya Dryomov img_req->data_type = fctx->pos_type; 21045a237819SIlya Dryomov 21055a237819SIlya Dryomov /* 21065a237819SIlya Dryomov * Create object requests and set each object request's starting 21075a237819SIlya Dryomov * position in the provided bio (list) or bio_vec array. 21085a237819SIlya Dryomov */ 21095a237819SIlya Dryomov fctx->iter = *fctx->pos; 21105a237819SIlya Dryomov for (i = 0; i < num_img_extents; i++) { 21115a237819SIlya Dryomov ret = ceph_file_to_extents(&img_req->rbd_dev->layout, 21125a237819SIlya Dryomov img_extents[i].fe_off, 21135a237819SIlya Dryomov img_extents[i].fe_len, 21145a237819SIlya Dryomov &img_req->object_extents, 21155a237819SIlya Dryomov alloc_object_extent, img_req, 21165a237819SIlya Dryomov fctx->set_pos_fn, &fctx->iter); 21175a237819SIlya Dryomov if (ret) 21185a237819SIlya Dryomov return ret; 21195a237819SIlya Dryomov } 21205a237819SIlya Dryomov 21215a237819SIlya Dryomov return __rbd_img_fill_request(img_req); 21225a237819SIlya Dryomov } 21235a237819SIlya Dryomov 2124afb97888SIlya Dryomov /* 2125afb97888SIlya Dryomov * Map a list of image extents to a list of object extents, create the 2126afb97888SIlya Dryomov * corresponding object requests (normally each to a different object, 2127afb97888SIlya Dryomov * but not always) and add them to @img_req. For each object request, 2128afb97888SIlya Dryomov * set up its data descriptor to point to the corresponding chunk(s) of 2129afb97888SIlya Dryomov * @fctx->pos data buffer. 2130afb97888SIlya Dryomov * 2131afb97888SIlya Dryomov * Because ceph_file_to_extents() will merge adjacent object extents 2132afb97888SIlya Dryomov * together, each object request's data descriptor may point to multiple 2133afb97888SIlya Dryomov * different chunks of @fctx->pos data buffer. 2134afb97888SIlya Dryomov * 2135afb97888SIlya Dryomov * @fctx->pos data buffer is assumed to be large enough. 2136afb97888SIlya Dryomov */ 2137afb97888SIlya Dryomov static int rbd_img_fill_request(struct rbd_img_request *img_req, 2138afb97888SIlya Dryomov struct ceph_file_extent *img_extents, 2139afb97888SIlya Dryomov u32 num_img_extents, 2140afb97888SIlya Dryomov struct rbd_img_fill_ctx *fctx) 2141afb97888SIlya Dryomov { 2142afb97888SIlya Dryomov struct rbd_device *rbd_dev = img_req->rbd_dev; 2143afb97888SIlya Dryomov struct rbd_obj_request *obj_req; 2144afb97888SIlya Dryomov u32 i; 2145afb97888SIlya Dryomov int ret; 2146afb97888SIlya Dryomov 2147afb97888SIlya Dryomov if (fctx->pos_type == OBJ_REQUEST_NODATA || 2148afb97888SIlya Dryomov !rbd_layout_is_fancy(&rbd_dev->layout)) 2149afb97888SIlya Dryomov return rbd_img_fill_request_nocopy(img_req, img_extents, 2150afb97888SIlya Dryomov num_img_extents, fctx); 2151afb97888SIlya Dryomov 2152afb97888SIlya Dryomov img_req->data_type = OBJ_REQUEST_OWN_BVECS; 2153afb97888SIlya Dryomov 2154afb97888SIlya Dryomov /* 2155afb97888SIlya Dryomov * Create object requests and determine ->bvec_count for each object 2156afb97888SIlya Dryomov * request. Note that ->bvec_count sum over all object requests may 2157afb97888SIlya Dryomov * be greater than the number of bio_vecs in the provided bio (list) 2158afb97888SIlya Dryomov * or bio_vec array because when mapped, those bio_vecs can straddle 2159afb97888SIlya Dryomov * stripe unit boundaries. 2160afb97888SIlya Dryomov */ 2161afb97888SIlya Dryomov fctx->iter = *fctx->pos; 2162afb97888SIlya Dryomov for (i = 0; i < num_img_extents; i++) { 2163afb97888SIlya Dryomov ret = ceph_file_to_extents(&rbd_dev->layout, 2164afb97888SIlya Dryomov img_extents[i].fe_off, 2165afb97888SIlya Dryomov img_extents[i].fe_len, 2166afb97888SIlya Dryomov &img_req->object_extents, 2167afb97888SIlya Dryomov alloc_object_extent, img_req, 2168afb97888SIlya Dryomov fctx->count_fn, &fctx->iter); 2169afb97888SIlya Dryomov if (ret) 2170afb97888SIlya Dryomov return ret; 2171afb97888SIlya Dryomov } 2172afb97888SIlya Dryomov 2173afb97888SIlya Dryomov for_each_obj_request(img_req, obj_req) { 2174afb97888SIlya Dryomov obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count, 2175afb97888SIlya Dryomov sizeof(*obj_req->bvec_pos.bvecs), 2176afb97888SIlya Dryomov GFP_NOIO); 2177afb97888SIlya Dryomov if (!obj_req->bvec_pos.bvecs) 2178afb97888SIlya Dryomov return -ENOMEM; 2179afb97888SIlya Dryomov } 2180afb97888SIlya Dryomov 2181afb97888SIlya Dryomov /* 2182afb97888SIlya Dryomov * Fill in each object request's private bio_vec array, splitting and 2183afb97888SIlya Dryomov * rearranging the provided bio_vecs in stripe unit chunks as needed. 2184afb97888SIlya Dryomov */ 2185afb97888SIlya Dryomov fctx->iter = *fctx->pos; 2186afb97888SIlya Dryomov for (i = 0; i < num_img_extents; i++) { 2187afb97888SIlya Dryomov ret = ceph_iterate_extents(&rbd_dev->layout, 2188afb97888SIlya Dryomov img_extents[i].fe_off, 2189afb97888SIlya Dryomov img_extents[i].fe_len, 2190afb97888SIlya Dryomov &img_req->object_extents, 2191afb97888SIlya Dryomov fctx->copy_fn, &fctx->iter); 2192afb97888SIlya Dryomov if (ret) 2193afb97888SIlya Dryomov return ret; 2194afb97888SIlya Dryomov } 2195afb97888SIlya Dryomov 2196afb97888SIlya Dryomov return __rbd_img_fill_request(img_req); 2197afb97888SIlya Dryomov } 2198afb97888SIlya Dryomov 21995a237819SIlya Dryomov static int rbd_img_fill_nodata(struct rbd_img_request *img_req, 22005a237819SIlya Dryomov u64 off, u64 len) 22015a237819SIlya Dryomov { 22025a237819SIlya Dryomov struct ceph_file_extent ex = { off, len }; 22035a237819SIlya Dryomov union rbd_img_fill_iter dummy; 22045a237819SIlya Dryomov struct rbd_img_fill_ctx fctx = { 22055a237819SIlya Dryomov .pos_type = OBJ_REQUEST_NODATA, 22065a237819SIlya Dryomov .pos = &dummy, 22075a237819SIlya Dryomov }; 22085a237819SIlya Dryomov 22095a237819SIlya Dryomov return rbd_img_fill_request(img_req, &ex, 1, &fctx); 22105a237819SIlya Dryomov } 22115a237819SIlya Dryomov 22125a237819SIlya Dryomov static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg) 22135a237819SIlya Dryomov { 22145a237819SIlya Dryomov struct rbd_obj_request *obj_req = 22155a237819SIlya Dryomov container_of(ex, struct rbd_obj_request, ex); 22165a237819SIlya Dryomov struct ceph_bio_iter *it = arg; 22175a237819SIlya Dryomov 22185a237819SIlya Dryomov dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes); 22195a237819SIlya Dryomov obj_req->bio_pos = *it; 22205a237819SIlya Dryomov ceph_bio_iter_advance(it, bytes); 22215a237819SIlya Dryomov } 22225a237819SIlya Dryomov 2223afb97888SIlya Dryomov static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg) 2224afb97888SIlya Dryomov { 2225afb97888SIlya Dryomov struct rbd_obj_request *obj_req = 2226afb97888SIlya Dryomov container_of(ex, struct rbd_obj_request, ex); 2227afb97888SIlya Dryomov struct ceph_bio_iter *it = arg; 2228afb97888SIlya Dryomov 2229afb97888SIlya Dryomov dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes); 2230afb97888SIlya Dryomov ceph_bio_iter_advance_step(it, bytes, ({ 2231afb97888SIlya Dryomov obj_req->bvec_count++; 2232afb97888SIlya Dryomov })); 2233afb97888SIlya Dryomov 2234afb97888SIlya Dryomov } 2235afb97888SIlya Dryomov 2236afb97888SIlya Dryomov static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg) 2237afb97888SIlya Dryomov { 2238afb97888SIlya Dryomov struct rbd_obj_request *obj_req = 2239afb97888SIlya Dryomov container_of(ex, struct rbd_obj_request, ex); 2240afb97888SIlya Dryomov struct ceph_bio_iter *it = arg; 2241afb97888SIlya Dryomov 2242afb97888SIlya Dryomov dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes); 2243afb97888SIlya Dryomov ceph_bio_iter_advance_step(it, bytes, ({ 2244afb97888SIlya Dryomov obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv; 2245afb97888SIlya Dryomov obj_req->bvec_pos.iter.bi_size += bv.bv_len; 2246afb97888SIlya Dryomov })); 2247afb97888SIlya Dryomov } 2248afb97888SIlya Dryomov 22495a237819SIlya Dryomov static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req, 22505a237819SIlya Dryomov struct ceph_file_extent *img_extents, 22515a237819SIlya Dryomov u32 num_img_extents, 22525a237819SIlya Dryomov struct ceph_bio_iter *bio_pos) 22535a237819SIlya Dryomov { 22545a237819SIlya Dryomov struct rbd_img_fill_ctx fctx = { 22555a237819SIlya Dryomov .pos_type = OBJ_REQUEST_BIO, 22565a237819SIlya Dryomov .pos = (union rbd_img_fill_iter *)bio_pos, 22575a237819SIlya Dryomov .set_pos_fn = set_bio_pos, 2258afb97888SIlya Dryomov .count_fn = count_bio_bvecs, 2259afb97888SIlya Dryomov .copy_fn = copy_bio_bvecs, 22605a237819SIlya Dryomov }; 22615a237819SIlya Dryomov 22625a237819SIlya Dryomov return rbd_img_fill_request(img_req, img_extents, num_img_extents, 22635a237819SIlya Dryomov &fctx); 22645a237819SIlya Dryomov } 22655a237819SIlya Dryomov 22665a237819SIlya Dryomov static int rbd_img_fill_from_bio(struct rbd_img_request *img_req, 22675a237819SIlya Dryomov u64 off, u64 len, struct bio *bio) 22685a237819SIlya Dryomov { 22695a237819SIlya Dryomov struct ceph_file_extent ex = { off, len }; 22705a237819SIlya Dryomov struct ceph_bio_iter it = { .bio = bio, .iter = bio->bi_iter }; 22715a237819SIlya Dryomov 22725a237819SIlya Dryomov return __rbd_img_fill_from_bio(img_req, &ex, 1, &it); 22735a237819SIlya Dryomov } 22745a237819SIlya Dryomov 22755a237819SIlya Dryomov static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg) 22765a237819SIlya Dryomov { 22775a237819SIlya Dryomov struct rbd_obj_request *obj_req = 22785a237819SIlya Dryomov container_of(ex, struct rbd_obj_request, ex); 22795a237819SIlya Dryomov struct ceph_bvec_iter *it = arg; 22805a237819SIlya Dryomov 22815a237819SIlya Dryomov obj_req->bvec_pos = *it; 22825a237819SIlya Dryomov ceph_bvec_iter_shorten(&obj_req->bvec_pos, bytes); 22835a237819SIlya Dryomov ceph_bvec_iter_advance(it, bytes); 22845a237819SIlya Dryomov } 22855a237819SIlya Dryomov 2286afb97888SIlya Dryomov static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg) 2287afb97888SIlya Dryomov { 2288afb97888SIlya Dryomov struct rbd_obj_request *obj_req = 2289afb97888SIlya Dryomov container_of(ex, struct rbd_obj_request, ex); 2290afb97888SIlya Dryomov struct ceph_bvec_iter *it = arg; 2291afb97888SIlya Dryomov 2292afb97888SIlya Dryomov ceph_bvec_iter_advance_step(it, bytes, ({ 2293afb97888SIlya Dryomov obj_req->bvec_count++; 2294afb97888SIlya Dryomov })); 2295afb97888SIlya Dryomov } 2296afb97888SIlya Dryomov 2297afb97888SIlya Dryomov static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg) 2298afb97888SIlya Dryomov { 2299afb97888SIlya Dryomov struct rbd_obj_request *obj_req = 2300afb97888SIlya Dryomov container_of(ex, struct rbd_obj_request, ex); 2301afb97888SIlya Dryomov struct ceph_bvec_iter *it = arg; 2302afb97888SIlya Dryomov 2303afb97888SIlya Dryomov ceph_bvec_iter_advance_step(it, bytes, ({ 2304afb97888SIlya Dryomov obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv; 2305afb97888SIlya Dryomov obj_req->bvec_pos.iter.bi_size += bv.bv_len; 2306afb97888SIlya Dryomov })); 2307afb97888SIlya Dryomov } 2308afb97888SIlya Dryomov 23095a237819SIlya Dryomov static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req, 23105a237819SIlya Dryomov struct ceph_file_extent *img_extents, 23115a237819SIlya Dryomov u32 num_img_extents, 23125a237819SIlya Dryomov struct ceph_bvec_iter *bvec_pos) 23135a237819SIlya Dryomov { 23145a237819SIlya Dryomov struct rbd_img_fill_ctx fctx = { 23155a237819SIlya Dryomov .pos_type = OBJ_REQUEST_BVECS, 23165a237819SIlya Dryomov .pos = (union rbd_img_fill_iter *)bvec_pos, 23175a237819SIlya Dryomov .set_pos_fn = set_bvec_pos, 2318afb97888SIlya Dryomov .count_fn = count_bvecs, 2319afb97888SIlya Dryomov .copy_fn = copy_bvecs, 23205a237819SIlya Dryomov }; 23215a237819SIlya Dryomov 23225a237819SIlya Dryomov return rbd_img_fill_request(img_req, img_extents, num_img_extents, 23235a237819SIlya Dryomov &fctx); 23245a237819SIlya Dryomov } 23255a237819SIlya Dryomov 23265a237819SIlya Dryomov static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req, 23275a237819SIlya Dryomov struct ceph_file_extent *img_extents, 23285a237819SIlya Dryomov u32 num_img_extents, 23295a237819SIlya Dryomov struct bio_vec *bvecs) 23305a237819SIlya Dryomov { 23315a237819SIlya Dryomov struct ceph_bvec_iter it = { 23325a237819SIlya Dryomov .bvecs = bvecs, 23335a237819SIlya Dryomov .iter = { .bi_size = ceph_file_extents_bytes(img_extents, 23345a237819SIlya Dryomov num_img_extents) }, 23355a237819SIlya Dryomov }; 23365a237819SIlya Dryomov 23375a237819SIlya Dryomov return __rbd_img_fill_from_bvecs(img_req, img_extents, num_img_extents, 23385a237819SIlya Dryomov &it); 23395a237819SIlya Dryomov } 23405a237819SIlya Dryomov 2341efbd1a11SIlya Dryomov static void rbd_img_request_submit(struct rbd_img_request *img_request) 2342bf0d5f50SAlex Elder { 2343bf0d5f50SAlex Elder struct rbd_obj_request *obj_request; 2344bf0d5f50SAlex Elder 234537206ee5SAlex Elder dout("%s: img %p\n", __func__, img_request); 2346bf0d5f50SAlex Elder 2347663ae2ccSIlya Dryomov rbd_img_request_get(img_request); 2348efbd1a11SIlya Dryomov for_each_obj_request(img_request, obj_request) 23493da691bfSIlya Dryomov rbd_obj_request_submit(obj_request); 2350bf0d5f50SAlex Elder 2351663ae2ccSIlya Dryomov rbd_img_request_put(img_request); 2352bf0d5f50SAlex Elder } 2353bf0d5f50SAlex Elder 235486bd7998SIlya Dryomov static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req) 23553da691bfSIlya Dryomov { 23563da691bfSIlya Dryomov struct rbd_img_request *img_req = obj_req->img_request; 23573da691bfSIlya Dryomov struct rbd_img_request *child_img_req; 23583da691bfSIlya Dryomov int ret; 23593da691bfSIlya Dryomov 2360e93aca0aSIlya Dryomov child_img_req = rbd_img_request_create(img_req->rbd_dev->parent, 2361e93aca0aSIlya Dryomov OBJ_OP_READ, NULL); 23623da691bfSIlya Dryomov if (!child_img_req) 23633da691bfSIlya Dryomov return -ENOMEM; 23643da691bfSIlya Dryomov 2365e93aca0aSIlya Dryomov __set_bit(IMG_REQ_CHILD, &child_img_req->flags); 2366e93aca0aSIlya Dryomov child_img_req->obj_request = obj_req; 2367e93aca0aSIlya Dryomov 23683da691bfSIlya Dryomov if (!rbd_img_is_write(img_req)) { 2369ecc633caSIlya Dryomov switch (img_req->data_type) { 23703da691bfSIlya Dryomov case OBJ_REQUEST_BIO: 23715a237819SIlya Dryomov ret = __rbd_img_fill_from_bio(child_img_req, 23725a237819SIlya Dryomov obj_req->img_extents, 23735a237819SIlya Dryomov obj_req->num_img_extents, 23743da691bfSIlya Dryomov &obj_req->bio_pos); 23753da691bfSIlya Dryomov break; 23763da691bfSIlya Dryomov case OBJ_REQUEST_BVECS: 2377afb97888SIlya Dryomov case OBJ_REQUEST_OWN_BVECS: 23785a237819SIlya Dryomov ret = __rbd_img_fill_from_bvecs(child_img_req, 23795a237819SIlya Dryomov obj_req->img_extents, 23805a237819SIlya Dryomov obj_req->num_img_extents, 23813da691bfSIlya Dryomov &obj_req->bvec_pos); 23823da691bfSIlya Dryomov break; 23833da691bfSIlya Dryomov default: 23843da691bfSIlya Dryomov rbd_assert(0); 23853da691bfSIlya Dryomov } 23863da691bfSIlya Dryomov } else { 23875a237819SIlya Dryomov ret = rbd_img_fill_from_bvecs(child_img_req, 23885a237819SIlya Dryomov obj_req->img_extents, 23895a237819SIlya Dryomov obj_req->num_img_extents, 23905a237819SIlya Dryomov obj_req->copyup_bvecs); 23913da691bfSIlya Dryomov } 23923da691bfSIlya Dryomov if (ret) { 23933da691bfSIlya Dryomov rbd_img_request_put(child_img_req); 2394663ae2ccSIlya Dryomov return ret; 2395bf0d5f50SAlex Elder } 2396bf0d5f50SAlex Elder 23973da691bfSIlya Dryomov rbd_img_request_submit(child_img_req); 23983da691bfSIlya Dryomov return 0; 23993da691bfSIlya Dryomov } 24003da691bfSIlya Dryomov 24013da691bfSIlya Dryomov static bool rbd_obj_handle_read(struct rbd_obj_request *obj_req) 24028b3e1a56SAlex Elder { 24033da691bfSIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 24043da691bfSIlya Dryomov int ret; 24058b3e1a56SAlex Elder 24063da691bfSIlya Dryomov if (obj_req->result == -ENOENT && 240786bd7998SIlya Dryomov rbd_dev->parent_overlap && !obj_req->tried_parent) { 240886bd7998SIlya Dryomov /* reverse map this object extent onto the parent */ 240986bd7998SIlya Dryomov ret = rbd_obj_calc_img_extents(obj_req, false); 241086bd7998SIlya Dryomov if (ret) { 241186bd7998SIlya Dryomov obj_req->result = ret; 241286bd7998SIlya Dryomov return true; 241386bd7998SIlya Dryomov } 24148b3e1a56SAlex Elder 241586bd7998SIlya Dryomov if (obj_req->num_img_extents) { 24163da691bfSIlya Dryomov obj_req->tried_parent = true; 241786bd7998SIlya Dryomov ret = rbd_obj_read_from_parent(obj_req); 24183da691bfSIlya Dryomov if (ret) { 24193da691bfSIlya Dryomov obj_req->result = ret; 24203da691bfSIlya Dryomov return true; 24213da691bfSIlya Dryomov } 24223da691bfSIlya Dryomov return false; 24233da691bfSIlya Dryomov } 242486bd7998SIlya Dryomov } 242502c74fbaSAlex Elder 242602c74fbaSAlex Elder /* 24273da691bfSIlya Dryomov * -ENOENT means a hole in the image -- zero-fill the entire 24283da691bfSIlya Dryomov * length of the request. A short read also implies zero-fill 24293da691bfSIlya Dryomov * to the end of the request. In both cases we update xferred 24303da691bfSIlya Dryomov * count to indicate the whole request was satisfied. 243102c74fbaSAlex Elder */ 24323da691bfSIlya Dryomov if (obj_req->result == -ENOENT || 243343df3d35SIlya Dryomov (!obj_req->result && obj_req->xferred < obj_req->ex.oe_len)) { 24343da691bfSIlya Dryomov rbd_assert(!obj_req->xferred || !obj_req->result); 24353da691bfSIlya Dryomov rbd_obj_zero_range(obj_req, obj_req->xferred, 243643df3d35SIlya Dryomov obj_req->ex.oe_len - obj_req->xferred); 24373da691bfSIlya Dryomov obj_req->result = 0; 243843df3d35SIlya Dryomov obj_req->xferred = obj_req->ex.oe_len; 24393da691bfSIlya Dryomov } 24403da691bfSIlya Dryomov 24413da691bfSIlya Dryomov return true; 24423da691bfSIlya Dryomov } 24433da691bfSIlya Dryomov 24443da691bfSIlya Dryomov /* 24453da691bfSIlya Dryomov * copyup_bvecs pages are never highmem pages 24463da691bfSIlya Dryomov */ 24473da691bfSIlya Dryomov static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes) 24483da691bfSIlya Dryomov { 24493da691bfSIlya Dryomov struct ceph_bvec_iter it = { 24503da691bfSIlya Dryomov .bvecs = bvecs, 24513da691bfSIlya Dryomov .iter = { .bi_size = bytes }, 24523da691bfSIlya Dryomov }; 24533da691bfSIlya Dryomov 24543da691bfSIlya Dryomov ceph_bvec_iter_advance_step(&it, bytes, ({ 24553da691bfSIlya Dryomov if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0, 24563da691bfSIlya Dryomov bv.bv_len)) 24573da691bfSIlya Dryomov return false; 24583da691bfSIlya Dryomov })); 24593da691bfSIlya Dryomov return true; 24603da691bfSIlya Dryomov } 24613da691bfSIlya Dryomov 24623a482501SIlya Dryomov #define MODS_ONLY U32_MAX 24633a482501SIlya Dryomov 24643a482501SIlya Dryomov static int rbd_obj_issue_copyup_ops(struct rbd_obj_request *obj_req, u32 bytes) 24653da691bfSIlya Dryomov { 246613488d53SIlya Dryomov struct rbd_img_request *img_req = obj_req->img_request; 24673a482501SIlya Dryomov unsigned int num_osd_ops = (bytes != MODS_ONLY); 24683a482501SIlya Dryomov unsigned int which = 0; 2469fe943d50SChengguang Xu int ret; 24703da691bfSIlya Dryomov 24713da691bfSIlya Dryomov dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes); 24723da691bfSIlya Dryomov rbd_assert(obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_STAT); 24733da691bfSIlya Dryomov rbd_osd_req_destroy(obj_req->osd_req); 24743da691bfSIlya Dryomov 247513488d53SIlya Dryomov switch (img_req->op_type) { 247613488d53SIlya Dryomov case OBJ_OP_WRITE: 247713488d53SIlya Dryomov num_osd_ops += count_write_ops(obj_req); 247813488d53SIlya Dryomov break; 247913488d53SIlya Dryomov case OBJ_OP_ZEROOUT: 248013488d53SIlya Dryomov num_osd_ops += count_zeroout_ops(obj_req); 248113488d53SIlya Dryomov break; 248213488d53SIlya Dryomov default: 248313488d53SIlya Dryomov rbd_assert(0); 248413488d53SIlya Dryomov } 248513488d53SIlya Dryomov 2486a162b308SIlya Dryomov obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops); 24873da691bfSIlya Dryomov if (!obj_req->osd_req) 24883da691bfSIlya Dryomov return -ENOMEM; 24893da691bfSIlya Dryomov 24903a482501SIlya Dryomov if (bytes != MODS_ONLY) { 24913a482501SIlya Dryomov ret = osd_req_op_cls_init(obj_req->osd_req, which, "rbd", 24923a482501SIlya Dryomov "copyup"); 2493fe943d50SChengguang Xu if (ret) 2494fe943d50SChengguang Xu return ret; 2495fe943d50SChengguang Xu 24963a482501SIlya Dryomov osd_req_op_cls_request_data_bvecs(obj_req->osd_req, which++, 24970010f705SIlya Dryomov obj_req->copyup_bvecs, 24980010f705SIlya Dryomov obj_req->copyup_bvec_count, 24990010f705SIlya Dryomov bytes); 25003a482501SIlya Dryomov } 25013da691bfSIlya Dryomov 250213488d53SIlya Dryomov switch (img_req->op_type) { 25033da691bfSIlya Dryomov case OBJ_OP_WRITE: 25043a482501SIlya Dryomov __rbd_obj_setup_write(obj_req, which); 25053da691bfSIlya Dryomov break; 25066484cbe9SIlya Dryomov case OBJ_OP_ZEROOUT: 25073da691bfSIlya Dryomov rbd_assert(!rbd_obj_is_entire(obj_req)); 25083a482501SIlya Dryomov __rbd_obj_setup_zeroout(obj_req, which); 25093da691bfSIlya Dryomov break; 25103da691bfSIlya Dryomov default: 25113da691bfSIlya Dryomov rbd_assert(0); 25123da691bfSIlya Dryomov } 25133da691bfSIlya Dryomov 251426f887e0SIlya Dryomov ret = ceph_osdc_alloc_messages(obj_req->osd_req, GFP_NOIO); 251526f887e0SIlya Dryomov if (ret) 251626f887e0SIlya Dryomov return ret; 251726f887e0SIlya Dryomov 25183da691bfSIlya Dryomov rbd_obj_request_submit(obj_req); 25193da691bfSIlya Dryomov return 0; 25203da691bfSIlya Dryomov } 25213da691bfSIlya Dryomov 25223a482501SIlya Dryomov static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes) 25233a482501SIlya Dryomov { 25243a482501SIlya Dryomov /* 25253a482501SIlya Dryomov * Only send non-zero copyup data to save some I/O and network 25263a482501SIlya Dryomov * bandwidth -- zero copyup data is equivalent to the object not 25273a482501SIlya Dryomov * existing. 25283a482501SIlya Dryomov */ 25293a482501SIlya Dryomov if (is_zero_bvecs(obj_req->copyup_bvecs, bytes)) { 25303a482501SIlya Dryomov dout("%s obj_req %p detected zeroes\n", __func__, obj_req); 25313a482501SIlya Dryomov bytes = 0; 25323a482501SIlya Dryomov } 25333a482501SIlya Dryomov 25343a482501SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_COPYUP_OPS; 25353a482501SIlya Dryomov return rbd_obj_issue_copyup_ops(obj_req, bytes); 25363a482501SIlya Dryomov } 25373a482501SIlya Dryomov 25387e07efb1SIlya Dryomov static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap) 25397e07efb1SIlya Dryomov { 25407e07efb1SIlya Dryomov u32 i; 25417e07efb1SIlya Dryomov 25427e07efb1SIlya Dryomov rbd_assert(!obj_req->copyup_bvecs); 25437e07efb1SIlya Dryomov obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap); 25447e07efb1SIlya Dryomov obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count, 25457e07efb1SIlya Dryomov sizeof(*obj_req->copyup_bvecs), 25467e07efb1SIlya Dryomov GFP_NOIO); 25477e07efb1SIlya Dryomov if (!obj_req->copyup_bvecs) 25487e07efb1SIlya Dryomov return -ENOMEM; 25497e07efb1SIlya Dryomov 25507e07efb1SIlya Dryomov for (i = 0; i < obj_req->copyup_bvec_count; i++) { 25517e07efb1SIlya Dryomov unsigned int len = min(obj_overlap, (u64)PAGE_SIZE); 25527e07efb1SIlya Dryomov 25537e07efb1SIlya Dryomov obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO); 25547e07efb1SIlya Dryomov if (!obj_req->copyup_bvecs[i].bv_page) 25557e07efb1SIlya Dryomov return -ENOMEM; 25567e07efb1SIlya Dryomov 25577e07efb1SIlya Dryomov obj_req->copyup_bvecs[i].bv_offset = 0; 25587e07efb1SIlya Dryomov obj_req->copyup_bvecs[i].bv_len = len; 25597e07efb1SIlya Dryomov obj_overlap -= len; 25607e07efb1SIlya Dryomov } 25617e07efb1SIlya Dryomov 25627e07efb1SIlya Dryomov rbd_assert(!obj_overlap); 25637e07efb1SIlya Dryomov return 0; 25647e07efb1SIlya Dryomov } 25657e07efb1SIlya Dryomov 25663da691bfSIlya Dryomov static int rbd_obj_handle_write_guard(struct rbd_obj_request *obj_req) 25673da691bfSIlya Dryomov { 25683da691bfSIlya Dryomov struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 25693da691bfSIlya Dryomov int ret; 25703da691bfSIlya Dryomov 257186bd7998SIlya Dryomov rbd_assert(obj_req->num_img_extents); 257286bd7998SIlya Dryomov prune_extents(obj_req->img_extents, &obj_req->num_img_extents, 257386bd7998SIlya Dryomov rbd_dev->parent_overlap); 257486bd7998SIlya Dryomov if (!obj_req->num_img_extents) { 25753da691bfSIlya Dryomov /* 25763da691bfSIlya Dryomov * The overlap has become 0 (most likely because the 25773a482501SIlya Dryomov * image has been flattened). Re-submit the original write 25783a482501SIlya Dryomov * request -- pass MODS_ONLY since the copyup isn't needed 25793a482501SIlya Dryomov * anymore. 25803da691bfSIlya Dryomov */ 25813a482501SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_COPYUP_OPS; 25823a482501SIlya Dryomov return rbd_obj_issue_copyup_ops(obj_req, MODS_ONLY); 25833da691bfSIlya Dryomov } 25843da691bfSIlya Dryomov 258586bd7998SIlya Dryomov ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req)); 25863da691bfSIlya Dryomov if (ret) 25873da691bfSIlya Dryomov return ret; 25883da691bfSIlya Dryomov 25893a482501SIlya Dryomov obj_req->write_state = RBD_OBJ_WRITE_READ_FROM_PARENT; 259086bd7998SIlya Dryomov return rbd_obj_read_from_parent(obj_req); 25913da691bfSIlya Dryomov } 25923da691bfSIlya Dryomov 25933da691bfSIlya Dryomov static bool rbd_obj_handle_write(struct rbd_obj_request *obj_req) 25943da691bfSIlya Dryomov { 25953da691bfSIlya Dryomov int ret; 25963da691bfSIlya Dryomov 25973da691bfSIlya Dryomov switch (obj_req->write_state) { 25983da691bfSIlya Dryomov case RBD_OBJ_WRITE_GUARD: 25993da691bfSIlya Dryomov rbd_assert(!obj_req->xferred); 26003da691bfSIlya Dryomov if (obj_req->result == -ENOENT) { 26013da691bfSIlya Dryomov /* 26023da691bfSIlya Dryomov * The target object doesn't exist. Read the data for 26033da691bfSIlya Dryomov * the entire target object up to the overlap point (if 26043da691bfSIlya Dryomov * any) from the parent, so we can use it for a copyup. 26053da691bfSIlya Dryomov */ 26063da691bfSIlya Dryomov ret = rbd_obj_handle_write_guard(obj_req); 26073da691bfSIlya Dryomov if (ret) { 26083da691bfSIlya Dryomov obj_req->result = ret; 26093da691bfSIlya Dryomov return true; 26103da691bfSIlya Dryomov } 26113da691bfSIlya Dryomov return false; 26123da691bfSIlya Dryomov } 26133da691bfSIlya Dryomov /* fall through */ 26143da691bfSIlya Dryomov case RBD_OBJ_WRITE_FLAT: 26153a482501SIlya Dryomov case RBD_OBJ_WRITE_COPYUP_OPS: 26163da691bfSIlya Dryomov if (!obj_req->result) 26173da691bfSIlya Dryomov /* 26183da691bfSIlya Dryomov * There is no such thing as a successful short 26193da691bfSIlya Dryomov * write -- indicate the whole request was satisfied. 26203da691bfSIlya Dryomov */ 262143df3d35SIlya Dryomov obj_req->xferred = obj_req->ex.oe_len; 26223da691bfSIlya Dryomov return true; 26233a482501SIlya Dryomov case RBD_OBJ_WRITE_READ_FROM_PARENT: 26243da691bfSIlya Dryomov if (obj_req->result) 26253a482501SIlya Dryomov return true; 26263da691bfSIlya Dryomov 26273da691bfSIlya Dryomov rbd_assert(obj_req->xferred); 26283da691bfSIlya Dryomov ret = rbd_obj_issue_copyup(obj_req, obj_req->xferred); 26293da691bfSIlya Dryomov if (ret) { 26303da691bfSIlya Dryomov obj_req->result = ret; 2631356889c4SIlya Dryomov obj_req->xferred = 0; 26323da691bfSIlya Dryomov return true; 26333da691bfSIlya Dryomov } 26343da691bfSIlya Dryomov return false; 26353da691bfSIlya Dryomov default: 2636c6244b3bSArnd Bergmann BUG(); 26373da691bfSIlya Dryomov } 26383da691bfSIlya Dryomov } 26393da691bfSIlya Dryomov 26403da691bfSIlya Dryomov /* 26413da691bfSIlya Dryomov * Returns true if @obj_req is completed, or false otherwise. 26423da691bfSIlya Dryomov */ 26433da691bfSIlya Dryomov static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req) 26443da691bfSIlya Dryomov { 26459bb0248dSIlya Dryomov switch (obj_req->img_request->op_type) { 26463da691bfSIlya Dryomov case OBJ_OP_READ: 26473da691bfSIlya Dryomov return rbd_obj_handle_read(obj_req); 26483da691bfSIlya Dryomov case OBJ_OP_WRITE: 26493da691bfSIlya Dryomov return rbd_obj_handle_write(obj_req); 26503da691bfSIlya Dryomov case OBJ_OP_DISCARD: 26516484cbe9SIlya Dryomov case OBJ_OP_ZEROOUT: 26523da691bfSIlya Dryomov if (rbd_obj_handle_write(obj_req)) { 26533da691bfSIlya Dryomov /* 26543da691bfSIlya Dryomov * Hide -ENOENT from delete/truncate/zero -- discarding 26553da691bfSIlya Dryomov * a non-existent object is not a problem. 26563da691bfSIlya Dryomov */ 26573da691bfSIlya Dryomov if (obj_req->result == -ENOENT) { 26583da691bfSIlya Dryomov obj_req->result = 0; 265943df3d35SIlya Dryomov obj_req->xferred = obj_req->ex.oe_len; 26603da691bfSIlya Dryomov } 26613da691bfSIlya Dryomov return true; 26623da691bfSIlya Dryomov } 26633da691bfSIlya Dryomov return false; 26643da691bfSIlya Dryomov default: 2665c6244b3bSArnd Bergmann BUG(); 26663da691bfSIlya Dryomov } 26673da691bfSIlya Dryomov } 26683da691bfSIlya Dryomov 26697114edacSIlya Dryomov static void rbd_obj_end_request(struct rbd_obj_request *obj_req) 26707114edacSIlya Dryomov { 26717114edacSIlya Dryomov struct rbd_img_request *img_req = obj_req->img_request; 26727114edacSIlya Dryomov 26737114edacSIlya Dryomov rbd_assert((!obj_req->result && 267443df3d35SIlya Dryomov obj_req->xferred == obj_req->ex.oe_len) || 26757114edacSIlya Dryomov (obj_req->result < 0 && !obj_req->xferred)); 26767114edacSIlya Dryomov if (!obj_req->result) { 26777114edacSIlya Dryomov img_req->xferred += obj_req->xferred; 267802c74fbaSAlex Elder return; 267902c74fbaSAlex Elder } 268002c74fbaSAlex Elder 26817114edacSIlya Dryomov rbd_warn(img_req->rbd_dev, 26827114edacSIlya Dryomov "%s at objno %llu %llu~%llu result %d xferred %llu", 268343df3d35SIlya Dryomov obj_op_name(img_req->op_type), obj_req->ex.oe_objno, 268443df3d35SIlya Dryomov obj_req->ex.oe_off, obj_req->ex.oe_len, obj_req->result, 26857114edacSIlya Dryomov obj_req->xferred); 26867114edacSIlya Dryomov if (!img_req->result) { 26877114edacSIlya Dryomov img_req->result = obj_req->result; 26887114edacSIlya Dryomov img_req->xferred = 0; 2689a9e8ba2cSAlex Elder } 26908b3e1a56SAlex Elder } 26918b3e1a56SAlex Elder 26923da691bfSIlya Dryomov static void rbd_img_end_child_request(struct rbd_img_request *img_req) 26938b3e1a56SAlex Elder { 26943da691bfSIlya Dryomov struct rbd_obj_request *obj_req = img_req->obj_request; 26958b3e1a56SAlex Elder 26963da691bfSIlya Dryomov rbd_assert(test_bit(IMG_REQ_CHILD, &img_req->flags)); 269786bd7998SIlya Dryomov rbd_assert((!img_req->result && 269886bd7998SIlya Dryomov img_req->xferred == rbd_obj_img_extents_bytes(obj_req)) || 269986bd7998SIlya Dryomov (img_req->result < 0 && !img_req->xferred)); 27008b3e1a56SAlex Elder 27013da691bfSIlya Dryomov obj_req->result = img_req->result; 27023da691bfSIlya Dryomov obj_req->xferred = img_req->xferred; 27033da691bfSIlya Dryomov rbd_img_request_put(img_req); 27047114edacSIlya Dryomov } 27058b3e1a56SAlex Elder 27067114edacSIlya Dryomov static void rbd_img_end_request(struct rbd_img_request *img_req) 27077114edacSIlya Dryomov { 27087114edacSIlya Dryomov rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags)); 27097114edacSIlya Dryomov rbd_assert((!img_req->result && 27107114edacSIlya Dryomov img_req->xferred == blk_rq_bytes(img_req->rq)) || 27117114edacSIlya Dryomov (img_req->result < 0 && !img_req->xferred)); 27128b3e1a56SAlex Elder 27137114edacSIlya Dryomov blk_mq_end_request(img_req->rq, 27147114edacSIlya Dryomov errno_to_blk_status(img_req->result)); 27157114edacSIlya Dryomov rbd_img_request_put(img_req); 27163da691bfSIlya Dryomov } 27178b3e1a56SAlex Elder 27183da691bfSIlya Dryomov static void rbd_obj_handle_request(struct rbd_obj_request *obj_req) 27193da691bfSIlya Dryomov { 27207114edacSIlya Dryomov struct rbd_img_request *img_req; 27217114edacSIlya Dryomov 27227114edacSIlya Dryomov again: 27233da691bfSIlya Dryomov if (!__rbd_obj_handle_request(obj_req)) 27248b3e1a56SAlex Elder return; 27253da691bfSIlya Dryomov 27267114edacSIlya Dryomov img_req = obj_req->img_request; 27277114edacSIlya Dryomov spin_lock(&img_req->completion_lock); 27287114edacSIlya Dryomov rbd_obj_end_request(obj_req); 27297114edacSIlya Dryomov rbd_assert(img_req->pending_count); 27307114edacSIlya Dryomov if (--img_req->pending_count) { 27317114edacSIlya Dryomov spin_unlock(&img_req->completion_lock); 27327114edacSIlya Dryomov return; 27337114edacSIlya Dryomov } 27347114edacSIlya Dryomov 27357114edacSIlya Dryomov spin_unlock(&img_req->completion_lock); 27367114edacSIlya Dryomov if (test_bit(IMG_REQ_CHILD, &img_req->flags)) { 27377114edacSIlya Dryomov obj_req = img_req->obj_request; 27387114edacSIlya Dryomov rbd_img_end_child_request(img_req); 27397114edacSIlya Dryomov goto again; 27407114edacSIlya Dryomov } 27417114edacSIlya Dryomov rbd_img_end_request(img_req); 27428b3e1a56SAlex Elder } 27438b3e1a56SAlex Elder 2744ed95b21aSIlya Dryomov static const struct rbd_client_id rbd_empty_cid; 2745ed95b21aSIlya Dryomov 2746ed95b21aSIlya Dryomov static bool rbd_cid_equal(const struct rbd_client_id *lhs, 2747ed95b21aSIlya Dryomov const struct rbd_client_id *rhs) 2748ed95b21aSIlya Dryomov { 2749ed95b21aSIlya Dryomov return lhs->gid == rhs->gid && lhs->handle == rhs->handle; 2750ed95b21aSIlya Dryomov } 2751ed95b21aSIlya Dryomov 2752ed95b21aSIlya Dryomov static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev) 2753ed95b21aSIlya Dryomov { 2754ed95b21aSIlya Dryomov struct rbd_client_id cid; 2755ed95b21aSIlya Dryomov 2756ed95b21aSIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 2757ed95b21aSIlya Dryomov cid.gid = ceph_client_gid(rbd_dev->rbd_client->client); 2758ed95b21aSIlya Dryomov cid.handle = rbd_dev->watch_cookie; 2759ed95b21aSIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 2760ed95b21aSIlya Dryomov return cid; 2761ed95b21aSIlya Dryomov } 2762ed95b21aSIlya Dryomov 2763ed95b21aSIlya Dryomov /* 2764ed95b21aSIlya Dryomov * lock_rwsem must be held for write 2765ed95b21aSIlya Dryomov */ 2766ed95b21aSIlya Dryomov static void rbd_set_owner_cid(struct rbd_device *rbd_dev, 2767ed95b21aSIlya Dryomov const struct rbd_client_id *cid) 2768ed95b21aSIlya Dryomov { 2769ed95b21aSIlya Dryomov dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev, 2770ed95b21aSIlya Dryomov rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle, 2771ed95b21aSIlya Dryomov cid->gid, cid->handle); 2772ed95b21aSIlya Dryomov rbd_dev->owner_cid = *cid; /* struct */ 2773ed95b21aSIlya Dryomov } 2774ed95b21aSIlya Dryomov 2775ed95b21aSIlya Dryomov static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf) 2776ed95b21aSIlya Dryomov { 2777ed95b21aSIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 2778ed95b21aSIlya Dryomov sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie); 2779ed95b21aSIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 2780ed95b21aSIlya Dryomov } 2781ed95b21aSIlya Dryomov 2782edd8ca80SFlorian Margaine static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie) 2783edd8ca80SFlorian Margaine { 2784edd8ca80SFlorian Margaine struct rbd_client_id cid = rbd_get_cid(rbd_dev); 2785edd8ca80SFlorian Margaine 2786edd8ca80SFlorian Margaine strcpy(rbd_dev->lock_cookie, cookie); 2787edd8ca80SFlorian Margaine rbd_set_owner_cid(rbd_dev, &cid); 2788edd8ca80SFlorian Margaine queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work); 2789edd8ca80SFlorian Margaine } 2790edd8ca80SFlorian Margaine 2791ed95b21aSIlya Dryomov /* 2792ed95b21aSIlya Dryomov * lock_rwsem must be held for write 2793ed95b21aSIlya Dryomov */ 2794ed95b21aSIlya Dryomov static int rbd_lock(struct rbd_device *rbd_dev) 2795ed95b21aSIlya Dryomov { 2796ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 2797ed95b21aSIlya Dryomov char cookie[32]; 2798ed95b21aSIlya Dryomov int ret; 2799ed95b21aSIlya Dryomov 2800cbbfb0ffSIlya Dryomov WARN_ON(__rbd_is_lock_owner(rbd_dev) || 2801cbbfb0ffSIlya Dryomov rbd_dev->lock_cookie[0] != '\0'); 2802ed95b21aSIlya Dryomov 2803ed95b21aSIlya Dryomov format_lock_cookie(rbd_dev, cookie); 2804ed95b21aSIlya Dryomov ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, 2805ed95b21aSIlya Dryomov RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie, 2806ed95b21aSIlya Dryomov RBD_LOCK_TAG, "", 0); 2807ed95b21aSIlya Dryomov if (ret) 2808ed95b21aSIlya Dryomov return ret; 2809ed95b21aSIlya Dryomov 2810ed95b21aSIlya Dryomov rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED; 2811edd8ca80SFlorian Margaine __rbd_lock(rbd_dev, cookie); 2812ed95b21aSIlya Dryomov return 0; 2813ed95b21aSIlya Dryomov } 2814ed95b21aSIlya Dryomov 2815ed95b21aSIlya Dryomov /* 2816ed95b21aSIlya Dryomov * lock_rwsem must be held for write 2817ed95b21aSIlya Dryomov */ 2818bbead745SIlya Dryomov static void rbd_unlock(struct rbd_device *rbd_dev) 2819ed95b21aSIlya Dryomov { 2820ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 2821ed95b21aSIlya Dryomov int ret; 2822ed95b21aSIlya Dryomov 2823cbbfb0ffSIlya Dryomov WARN_ON(!__rbd_is_lock_owner(rbd_dev) || 2824cbbfb0ffSIlya Dryomov rbd_dev->lock_cookie[0] == '\0'); 2825ed95b21aSIlya Dryomov 2826ed95b21aSIlya Dryomov ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, 2827cbbfb0ffSIlya Dryomov RBD_LOCK_NAME, rbd_dev->lock_cookie); 2828bbead745SIlya Dryomov if (ret && ret != -ENOENT) 2829bbead745SIlya Dryomov rbd_warn(rbd_dev, "failed to unlock: %d", ret); 2830ed95b21aSIlya Dryomov 2831bbead745SIlya Dryomov /* treat errors as the image is unlocked */ 2832bbead745SIlya Dryomov rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED; 2833cbbfb0ffSIlya Dryomov rbd_dev->lock_cookie[0] = '\0'; 2834ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &rbd_empty_cid); 2835ed95b21aSIlya Dryomov queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work); 2836ed95b21aSIlya Dryomov } 2837ed95b21aSIlya Dryomov 2838ed95b21aSIlya Dryomov static int __rbd_notify_op_lock(struct rbd_device *rbd_dev, 2839ed95b21aSIlya Dryomov enum rbd_notify_op notify_op, 2840ed95b21aSIlya Dryomov struct page ***preply_pages, 2841ed95b21aSIlya Dryomov size_t *preply_len) 2842ed95b21aSIlya Dryomov { 2843ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 2844ed95b21aSIlya Dryomov struct rbd_client_id cid = rbd_get_cid(rbd_dev); 284508a79102SKyle Spiers char buf[4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN]; 284608a79102SKyle Spiers int buf_size = sizeof(buf); 2847ed95b21aSIlya Dryomov void *p = buf; 2848ed95b21aSIlya Dryomov 2849ed95b21aSIlya Dryomov dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op); 2850ed95b21aSIlya Dryomov 2851ed95b21aSIlya Dryomov /* encode *LockPayload NotifyMessage (op + ClientId) */ 2852ed95b21aSIlya Dryomov ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN); 2853ed95b21aSIlya Dryomov ceph_encode_32(&p, notify_op); 2854ed95b21aSIlya Dryomov ceph_encode_64(&p, cid.gid); 2855ed95b21aSIlya Dryomov ceph_encode_64(&p, cid.handle); 2856ed95b21aSIlya Dryomov 2857ed95b21aSIlya Dryomov return ceph_osdc_notify(osdc, &rbd_dev->header_oid, 2858ed95b21aSIlya Dryomov &rbd_dev->header_oloc, buf, buf_size, 2859ed95b21aSIlya Dryomov RBD_NOTIFY_TIMEOUT, preply_pages, preply_len); 2860ed95b21aSIlya Dryomov } 2861ed95b21aSIlya Dryomov 2862ed95b21aSIlya Dryomov static void rbd_notify_op_lock(struct rbd_device *rbd_dev, 2863ed95b21aSIlya Dryomov enum rbd_notify_op notify_op) 2864ed95b21aSIlya Dryomov { 2865ed95b21aSIlya Dryomov struct page **reply_pages; 2866ed95b21aSIlya Dryomov size_t reply_len; 2867ed95b21aSIlya Dryomov 2868ed95b21aSIlya Dryomov __rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len); 2869ed95b21aSIlya Dryomov ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len)); 2870ed95b21aSIlya Dryomov } 2871ed95b21aSIlya Dryomov 2872ed95b21aSIlya Dryomov static void rbd_notify_acquired_lock(struct work_struct *work) 2873ed95b21aSIlya Dryomov { 2874ed95b21aSIlya Dryomov struct rbd_device *rbd_dev = container_of(work, struct rbd_device, 2875ed95b21aSIlya Dryomov acquired_lock_work); 2876ed95b21aSIlya Dryomov 2877ed95b21aSIlya Dryomov rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK); 2878ed95b21aSIlya Dryomov } 2879ed95b21aSIlya Dryomov 2880ed95b21aSIlya Dryomov static void rbd_notify_released_lock(struct work_struct *work) 2881ed95b21aSIlya Dryomov { 2882ed95b21aSIlya Dryomov struct rbd_device *rbd_dev = container_of(work, struct rbd_device, 2883ed95b21aSIlya Dryomov released_lock_work); 2884ed95b21aSIlya Dryomov 2885ed95b21aSIlya Dryomov rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK); 2886ed95b21aSIlya Dryomov } 2887ed95b21aSIlya Dryomov 2888ed95b21aSIlya Dryomov static int rbd_request_lock(struct rbd_device *rbd_dev) 2889ed95b21aSIlya Dryomov { 2890ed95b21aSIlya Dryomov struct page **reply_pages; 2891ed95b21aSIlya Dryomov size_t reply_len; 2892ed95b21aSIlya Dryomov bool lock_owner_responded = false; 2893ed95b21aSIlya Dryomov int ret; 2894ed95b21aSIlya Dryomov 2895ed95b21aSIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 2896ed95b21aSIlya Dryomov 2897ed95b21aSIlya Dryomov ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK, 2898ed95b21aSIlya Dryomov &reply_pages, &reply_len); 2899ed95b21aSIlya Dryomov if (ret && ret != -ETIMEDOUT) { 2900ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "failed to request lock: %d", ret); 2901ed95b21aSIlya Dryomov goto out; 2902ed95b21aSIlya Dryomov } 2903ed95b21aSIlya Dryomov 2904ed95b21aSIlya Dryomov if (reply_len > 0 && reply_len <= PAGE_SIZE) { 2905ed95b21aSIlya Dryomov void *p = page_address(reply_pages[0]); 2906ed95b21aSIlya Dryomov void *const end = p + reply_len; 2907ed95b21aSIlya Dryomov u32 n; 2908ed95b21aSIlya Dryomov 2909ed95b21aSIlya Dryomov ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */ 2910ed95b21aSIlya Dryomov while (n--) { 2911ed95b21aSIlya Dryomov u8 struct_v; 2912ed95b21aSIlya Dryomov u32 len; 2913ed95b21aSIlya Dryomov 2914ed95b21aSIlya Dryomov ceph_decode_need(&p, end, 8 + 8, e_inval); 2915ed95b21aSIlya Dryomov p += 8 + 8; /* skip gid and cookie */ 2916ed95b21aSIlya Dryomov 2917ed95b21aSIlya Dryomov ceph_decode_32_safe(&p, end, len, e_inval); 2918ed95b21aSIlya Dryomov if (!len) 2919ed95b21aSIlya Dryomov continue; 2920ed95b21aSIlya Dryomov 2921ed95b21aSIlya Dryomov if (lock_owner_responded) { 2922ed95b21aSIlya Dryomov rbd_warn(rbd_dev, 2923ed95b21aSIlya Dryomov "duplicate lock owners detected"); 2924ed95b21aSIlya Dryomov ret = -EIO; 2925ed95b21aSIlya Dryomov goto out; 2926ed95b21aSIlya Dryomov } 2927ed95b21aSIlya Dryomov 2928ed95b21aSIlya Dryomov lock_owner_responded = true; 2929ed95b21aSIlya Dryomov ret = ceph_start_decoding(&p, end, 1, "ResponseMessage", 2930ed95b21aSIlya Dryomov &struct_v, &len); 2931ed95b21aSIlya Dryomov if (ret) { 2932ed95b21aSIlya Dryomov rbd_warn(rbd_dev, 2933ed95b21aSIlya Dryomov "failed to decode ResponseMessage: %d", 2934ed95b21aSIlya Dryomov ret); 2935ed95b21aSIlya Dryomov goto e_inval; 2936ed95b21aSIlya Dryomov } 2937ed95b21aSIlya Dryomov 2938ed95b21aSIlya Dryomov ret = ceph_decode_32(&p); 2939ed95b21aSIlya Dryomov } 2940ed95b21aSIlya Dryomov } 2941ed95b21aSIlya Dryomov 2942ed95b21aSIlya Dryomov if (!lock_owner_responded) { 2943ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "no lock owners detected"); 2944ed95b21aSIlya Dryomov ret = -ETIMEDOUT; 2945ed95b21aSIlya Dryomov } 2946ed95b21aSIlya Dryomov 2947ed95b21aSIlya Dryomov out: 2948ed95b21aSIlya Dryomov ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len)); 2949ed95b21aSIlya Dryomov return ret; 2950ed95b21aSIlya Dryomov 2951ed95b21aSIlya Dryomov e_inval: 2952ed95b21aSIlya Dryomov ret = -EINVAL; 2953ed95b21aSIlya Dryomov goto out; 2954ed95b21aSIlya Dryomov } 2955ed95b21aSIlya Dryomov 2956ed95b21aSIlya Dryomov static void wake_requests(struct rbd_device *rbd_dev, bool wake_all) 2957ed95b21aSIlya Dryomov { 2958ed95b21aSIlya Dryomov dout("%s rbd_dev %p wake_all %d\n", __func__, rbd_dev, wake_all); 2959ed95b21aSIlya Dryomov 2960ed95b21aSIlya Dryomov cancel_delayed_work(&rbd_dev->lock_dwork); 2961ed95b21aSIlya Dryomov if (wake_all) 2962ed95b21aSIlya Dryomov wake_up_all(&rbd_dev->lock_waitq); 2963ed95b21aSIlya Dryomov else 2964ed95b21aSIlya Dryomov wake_up(&rbd_dev->lock_waitq); 2965ed95b21aSIlya Dryomov } 2966ed95b21aSIlya Dryomov 2967ed95b21aSIlya Dryomov static int get_lock_owner_info(struct rbd_device *rbd_dev, 2968ed95b21aSIlya Dryomov struct ceph_locker **lockers, u32 *num_lockers) 2969ed95b21aSIlya Dryomov { 2970ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 2971ed95b21aSIlya Dryomov u8 lock_type; 2972ed95b21aSIlya Dryomov char *lock_tag; 2973ed95b21aSIlya Dryomov int ret; 2974ed95b21aSIlya Dryomov 2975ed95b21aSIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 2976ed95b21aSIlya Dryomov 2977ed95b21aSIlya Dryomov ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid, 2978ed95b21aSIlya Dryomov &rbd_dev->header_oloc, RBD_LOCK_NAME, 2979ed95b21aSIlya Dryomov &lock_type, &lock_tag, lockers, num_lockers); 2980ed95b21aSIlya Dryomov if (ret) 2981ed95b21aSIlya Dryomov return ret; 2982ed95b21aSIlya Dryomov 2983ed95b21aSIlya Dryomov if (*num_lockers == 0) { 2984ed95b21aSIlya Dryomov dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev); 2985ed95b21aSIlya Dryomov goto out; 2986ed95b21aSIlya Dryomov } 2987ed95b21aSIlya Dryomov 2988ed95b21aSIlya Dryomov if (strcmp(lock_tag, RBD_LOCK_TAG)) { 2989ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "locked by external mechanism, tag %s", 2990ed95b21aSIlya Dryomov lock_tag); 2991ed95b21aSIlya Dryomov ret = -EBUSY; 2992ed95b21aSIlya Dryomov goto out; 2993ed95b21aSIlya Dryomov } 2994ed95b21aSIlya Dryomov 2995ed95b21aSIlya Dryomov if (lock_type == CEPH_CLS_LOCK_SHARED) { 2996ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "shared lock type detected"); 2997ed95b21aSIlya Dryomov ret = -EBUSY; 2998ed95b21aSIlya Dryomov goto out; 2999ed95b21aSIlya Dryomov } 3000ed95b21aSIlya Dryomov 3001ed95b21aSIlya Dryomov if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX, 3002ed95b21aSIlya Dryomov strlen(RBD_LOCK_COOKIE_PREFIX))) { 3003ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "locked by external mechanism, cookie %s", 3004ed95b21aSIlya Dryomov (*lockers)[0].id.cookie); 3005ed95b21aSIlya Dryomov ret = -EBUSY; 3006ed95b21aSIlya Dryomov goto out; 3007ed95b21aSIlya Dryomov } 3008ed95b21aSIlya Dryomov 3009ed95b21aSIlya Dryomov out: 3010ed95b21aSIlya Dryomov kfree(lock_tag); 3011ed95b21aSIlya Dryomov return ret; 3012ed95b21aSIlya Dryomov } 3013ed95b21aSIlya Dryomov 3014ed95b21aSIlya Dryomov static int find_watcher(struct rbd_device *rbd_dev, 3015ed95b21aSIlya Dryomov const struct ceph_locker *locker) 3016ed95b21aSIlya Dryomov { 3017ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3018ed95b21aSIlya Dryomov struct ceph_watch_item *watchers; 3019ed95b21aSIlya Dryomov u32 num_watchers; 3020ed95b21aSIlya Dryomov u64 cookie; 3021ed95b21aSIlya Dryomov int i; 3022ed95b21aSIlya Dryomov int ret; 3023ed95b21aSIlya Dryomov 3024ed95b21aSIlya Dryomov ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid, 3025ed95b21aSIlya Dryomov &rbd_dev->header_oloc, &watchers, 3026ed95b21aSIlya Dryomov &num_watchers); 3027ed95b21aSIlya Dryomov if (ret) 3028ed95b21aSIlya Dryomov return ret; 3029ed95b21aSIlya Dryomov 3030ed95b21aSIlya Dryomov sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie); 3031ed95b21aSIlya Dryomov for (i = 0; i < num_watchers; i++) { 3032ed95b21aSIlya Dryomov if (!memcmp(&watchers[i].addr, &locker->info.addr, 3033ed95b21aSIlya Dryomov sizeof(locker->info.addr)) && 3034ed95b21aSIlya Dryomov watchers[i].cookie == cookie) { 3035ed95b21aSIlya Dryomov struct rbd_client_id cid = { 3036ed95b21aSIlya Dryomov .gid = le64_to_cpu(watchers[i].name.num), 3037ed95b21aSIlya Dryomov .handle = cookie, 3038ed95b21aSIlya Dryomov }; 3039ed95b21aSIlya Dryomov 3040ed95b21aSIlya Dryomov dout("%s rbd_dev %p found cid %llu-%llu\n", __func__, 3041ed95b21aSIlya Dryomov rbd_dev, cid.gid, cid.handle); 3042ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &cid); 3043ed95b21aSIlya Dryomov ret = 1; 3044ed95b21aSIlya Dryomov goto out; 3045ed95b21aSIlya Dryomov } 3046ed95b21aSIlya Dryomov } 3047ed95b21aSIlya Dryomov 3048ed95b21aSIlya Dryomov dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev); 3049ed95b21aSIlya Dryomov ret = 0; 3050ed95b21aSIlya Dryomov out: 3051ed95b21aSIlya Dryomov kfree(watchers); 3052ed95b21aSIlya Dryomov return ret; 3053ed95b21aSIlya Dryomov } 3054ed95b21aSIlya Dryomov 3055ed95b21aSIlya Dryomov /* 3056ed95b21aSIlya Dryomov * lock_rwsem must be held for write 3057ed95b21aSIlya Dryomov */ 3058ed95b21aSIlya Dryomov static int rbd_try_lock(struct rbd_device *rbd_dev) 3059ed95b21aSIlya Dryomov { 3060ed95b21aSIlya Dryomov struct ceph_client *client = rbd_dev->rbd_client->client; 3061ed95b21aSIlya Dryomov struct ceph_locker *lockers; 3062ed95b21aSIlya Dryomov u32 num_lockers; 3063ed95b21aSIlya Dryomov int ret; 3064ed95b21aSIlya Dryomov 3065ed95b21aSIlya Dryomov for (;;) { 3066ed95b21aSIlya Dryomov ret = rbd_lock(rbd_dev); 3067ed95b21aSIlya Dryomov if (ret != -EBUSY) 3068ed95b21aSIlya Dryomov return ret; 3069ed95b21aSIlya Dryomov 3070ed95b21aSIlya Dryomov /* determine if the current lock holder is still alive */ 3071ed95b21aSIlya Dryomov ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers); 3072ed95b21aSIlya Dryomov if (ret) 3073ed95b21aSIlya Dryomov return ret; 3074ed95b21aSIlya Dryomov 3075ed95b21aSIlya Dryomov if (num_lockers == 0) 3076ed95b21aSIlya Dryomov goto again; 3077ed95b21aSIlya Dryomov 3078ed95b21aSIlya Dryomov ret = find_watcher(rbd_dev, lockers); 3079ed95b21aSIlya Dryomov if (ret) { 3080ed95b21aSIlya Dryomov if (ret > 0) 3081ed95b21aSIlya Dryomov ret = 0; /* have to request lock */ 3082ed95b21aSIlya Dryomov goto out; 3083ed95b21aSIlya Dryomov } 3084ed95b21aSIlya Dryomov 3085ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "%s%llu seems dead, breaking lock", 3086ed95b21aSIlya Dryomov ENTITY_NAME(lockers[0].id.name)); 3087ed95b21aSIlya Dryomov 3088ed95b21aSIlya Dryomov ret = ceph_monc_blacklist_add(&client->monc, 3089ed95b21aSIlya Dryomov &lockers[0].info.addr); 3090ed95b21aSIlya Dryomov if (ret) { 3091ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d", 3092ed95b21aSIlya Dryomov ENTITY_NAME(lockers[0].id.name), ret); 3093ed95b21aSIlya Dryomov goto out; 3094ed95b21aSIlya Dryomov } 3095ed95b21aSIlya Dryomov 3096ed95b21aSIlya Dryomov ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid, 3097ed95b21aSIlya Dryomov &rbd_dev->header_oloc, RBD_LOCK_NAME, 3098ed95b21aSIlya Dryomov lockers[0].id.cookie, 3099ed95b21aSIlya Dryomov &lockers[0].id.name); 3100ed95b21aSIlya Dryomov if (ret && ret != -ENOENT) 3101ed95b21aSIlya Dryomov goto out; 3102ed95b21aSIlya Dryomov 3103ed95b21aSIlya Dryomov again: 3104ed95b21aSIlya Dryomov ceph_free_lockers(lockers, num_lockers); 3105ed95b21aSIlya Dryomov } 3106ed95b21aSIlya Dryomov 3107ed95b21aSIlya Dryomov out: 3108ed95b21aSIlya Dryomov ceph_free_lockers(lockers, num_lockers); 3109ed95b21aSIlya Dryomov return ret; 3110ed95b21aSIlya Dryomov } 3111ed95b21aSIlya Dryomov 3112ed95b21aSIlya Dryomov /* 3113ed95b21aSIlya Dryomov * ret is set only if lock_state is RBD_LOCK_STATE_UNLOCKED 3114ed95b21aSIlya Dryomov */ 3115ed95b21aSIlya Dryomov static enum rbd_lock_state rbd_try_acquire_lock(struct rbd_device *rbd_dev, 3116ed95b21aSIlya Dryomov int *pret) 3117ed95b21aSIlya Dryomov { 3118ed95b21aSIlya Dryomov enum rbd_lock_state lock_state; 3119ed95b21aSIlya Dryomov 3120ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 3121ed95b21aSIlya Dryomov dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev, 3122ed95b21aSIlya Dryomov rbd_dev->lock_state); 3123ed95b21aSIlya Dryomov if (__rbd_is_lock_owner(rbd_dev)) { 3124ed95b21aSIlya Dryomov lock_state = rbd_dev->lock_state; 3125ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3126ed95b21aSIlya Dryomov return lock_state; 3127ed95b21aSIlya Dryomov } 3128ed95b21aSIlya Dryomov 3129ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3130ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3131ed95b21aSIlya Dryomov dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev, 3132ed95b21aSIlya Dryomov rbd_dev->lock_state); 3133ed95b21aSIlya Dryomov if (!__rbd_is_lock_owner(rbd_dev)) { 3134ed95b21aSIlya Dryomov *pret = rbd_try_lock(rbd_dev); 3135ed95b21aSIlya Dryomov if (*pret) 3136ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "failed to acquire lock: %d", *pret); 3137ed95b21aSIlya Dryomov } 3138ed95b21aSIlya Dryomov 3139ed95b21aSIlya Dryomov lock_state = rbd_dev->lock_state; 3140ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 3141ed95b21aSIlya Dryomov return lock_state; 3142ed95b21aSIlya Dryomov } 3143ed95b21aSIlya Dryomov 3144ed95b21aSIlya Dryomov static void rbd_acquire_lock(struct work_struct *work) 3145ed95b21aSIlya Dryomov { 3146ed95b21aSIlya Dryomov struct rbd_device *rbd_dev = container_of(to_delayed_work(work), 3147ed95b21aSIlya Dryomov struct rbd_device, lock_dwork); 3148ed95b21aSIlya Dryomov enum rbd_lock_state lock_state; 314937f13252SKefeng Wang int ret = 0; 3150ed95b21aSIlya Dryomov 3151ed95b21aSIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 3152ed95b21aSIlya Dryomov again: 3153ed95b21aSIlya Dryomov lock_state = rbd_try_acquire_lock(rbd_dev, &ret); 3154ed95b21aSIlya Dryomov if (lock_state != RBD_LOCK_STATE_UNLOCKED || ret == -EBLACKLISTED) { 3155ed95b21aSIlya Dryomov if (lock_state == RBD_LOCK_STATE_LOCKED) 3156ed95b21aSIlya Dryomov wake_requests(rbd_dev, true); 3157ed95b21aSIlya Dryomov dout("%s rbd_dev %p lock_state %d ret %d - done\n", __func__, 3158ed95b21aSIlya Dryomov rbd_dev, lock_state, ret); 3159ed95b21aSIlya Dryomov return; 3160ed95b21aSIlya Dryomov } 3161ed95b21aSIlya Dryomov 3162ed95b21aSIlya Dryomov ret = rbd_request_lock(rbd_dev); 3163ed95b21aSIlya Dryomov if (ret == -ETIMEDOUT) { 3164ed95b21aSIlya Dryomov goto again; /* treat this as a dead client */ 3165e010dd0aSIlya Dryomov } else if (ret == -EROFS) { 3166e010dd0aSIlya Dryomov rbd_warn(rbd_dev, "peer will not release lock"); 3167e010dd0aSIlya Dryomov /* 3168e010dd0aSIlya Dryomov * If this is rbd_add_acquire_lock(), we want to fail 3169e010dd0aSIlya Dryomov * immediately -- reuse BLACKLISTED flag. Otherwise we 3170e010dd0aSIlya Dryomov * want to block. 3171e010dd0aSIlya Dryomov */ 3172e010dd0aSIlya Dryomov if (!(rbd_dev->disk->flags & GENHD_FL_UP)) { 3173e010dd0aSIlya Dryomov set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags); 3174e010dd0aSIlya Dryomov /* wake "rbd map --exclusive" process */ 3175e010dd0aSIlya Dryomov wake_requests(rbd_dev, false); 3176e010dd0aSIlya Dryomov } 3177ed95b21aSIlya Dryomov } else if (ret < 0) { 3178ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "error requesting lock: %d", ret); 3179ed95b21aSIlya Dryomov mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 3180ed95b21aSIlya Dryomov RBD_RETRY_DELAY); 3181ed95b21aSIlya Dryomov } else { 3182ed95b21aSIlya Dryomov /* 3183ed95b21aSIlya Dryomov * lock owner acked, but resend if we don't see them 3184ed95b21aSIlya Dryomov * release the lock 3185ed95b21aSIlya Dryomov */ 3186ed95b21aSIlya Dryomov dout("%s rbd_dev %p requeueing lock_dwork\n", __func__, 3187ed95b21aSIlya Dryomov rbd_dev); 3188ed95b21aSIlya Dryomov mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 3189ed95b21aSIlya Dryomov msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC)); 3190ed95b21aSIlya Dryomov } 3191ed95b21aSIlya Dryomov } 3192ed95b21aSIlya Dryomov 3193ed95b21aSIlya Dryomov /* 3194ed95b21aSIlya Dryomov * lock_rwsem must be held for write 3195ed95b21aSIlya Dryomov */ 3196ed95b21aSIlya Dryomov static bool rbd_release_lock(struct rbd_device *rbd_dev) 3197ed95b21aSIlya Dryomov { 3198ed95b21aSIlya Dryomov dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev, 3199ed95b21aSIlya Dryomov rbd_dev->lock_state); 3200ed95b21aSIlya Dryomov if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED) 3201ed95b21aSIlya Dryomov return false; 3202ed95b21aSIlya Dryomov 3203ed95b21aSIlya Dryomov rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING; 3204ed95b21aSIlya Dryomov downgrade_write(&rbd_dev->lock_rwsem); 3205ed95b21aSIlya Dryomov /* 3206ed95b21aSIlya Dryomov * Ensure that all in-flight IO is flushed. 3207ed95b21aSIlya Dryomov * 3208ed95b21aSIlya Dryomov * FIXME: ceph_osdc_sync() flushes the entire OSD client, which 3209ed95b21aSIlya Dryomov * may be shared with other devices. 3210ed95b21aSIlya Dryomov */ 3211ed95b21aSIlya Dryomov ceph_osdc_sync(&rbd_dev->rbd_client->client->osdc); 3212ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3213ed95b21aSIlya Dryomov 3214ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3215ed95b21aSIlya Dryomov dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev, 3216ed95b21aSIlya Dryomov rbd_dev->lock_state); 3217ed95b21aSIlya Dryomov if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING) 3218ed95b21aSIlya Dryomov return false; 3219ed95b21aSIlya Dryomov 3220bbead745SIlya Dryomov rbd_unlock(rbd_dev); 3221ed95b21aSIlya Dryomov /* 3222ed95b21aSIlya Dryomov * Give others a chance to grab the lock - we would re-acquire 3223ed95b21aSIlya Dryomov * almost immediately if we got new IO during ceph_osdc_sync() 3224ed95b21aSIlya Dryomov * otherwise. We need to ack our own notifications, so this 3225ed95b21aSIlya Dryomov * lock_dwork will be requeued from rbd_wait_state_locked() 3226ed95b21aSIlya Dryomov * after wake_requests() in rbd_handle_released_lock(). 3227ed95b21aSIlya Dryomov */ 3228ed95b21aSIlya Dryomov cancel_delayed_work(&rbd_dev->lock_dwork); 3229ed95b21aSIlya Dryomov return true; 3230ed95b21aSIlya Dryomov } 3231ed95b21aSIlya Dryomov 3232ed95b21aSIlya Dryomov static void rbd_release_lock_work(struct work_struct *work) 3233ed95b21aSIlya Dryomov { 3234ed95b21aSIlya Dryomov struct rbd_device *rbd_dev = container_of(work, struct rbd_device, 3235ed95b21aSIlya Dryomov unlock_work); 3236ed95b21aSIlya Dryomov 3237ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3238ed95b21aSIlya Dryomov rbd_release_lock(rbd_dev); 3239ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 3240ed95b21aSIlya Dryomov } 3241ed95b21aSIlya Dryomov 3242ed95b21aSIlya Dryomov static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v, 3243ed95b21aSIlya Dryomov void **p) 3244ed95b21aSIlya Dryomov { 3245ed95b21aSIlya Dryomov struct rbd_client_id cid = { 0 }; 3246ed95b21aSIlya Dryomov 3247ed95b21aSIlya Dryomov if (struct_v >= 2) { 3248ed95b21aSIlya Dryomov cid.gid = ceph_decode_64(p); 3249ed95b21aSIlya Dryomov cid.handle = ceph_decode_64(p); 3250ed95b21aSIlya Dryomov } 3251ed95b21aSIlya Dryomov 3252ed95b21aSIlya Dryomov dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid, 3253ed95b21aSIlya Dryomov cid.handle); 3254ed95b21aSIlya Dryomov if (!rbd_cid_equal(&cid, &rbd_empty_cid)) { 3255ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3256ed95b21aSIlya Dryomov if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) { 3257ed95b21aSIlya Dryomov /* 3258ed95b21aSIlya Dryomov * we already know that the remote client is 3259ed95b21aSIlya Dryomov * the owner 3260ed95b21aSIlya Dryomov */ 3261ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 3262ed95b21aSIlya Dryomov return; 3263ed95b21aSIlya Dryomov } 3264ed95b21aSIlya Dryomov 3265ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &cid); 3266ed95b21aSIlya Dryomov downgrade_write(&rbd_dev->lock_rwsem); 3267ed95b21aSIlya Dryomov } else { 3268ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 3269ed95b21aSIlya Dryomov } 3270ed95b21aSIlya Dryomov 3271ed95b21aSIlya Dryomov if (!__rbd_is_lock_owner(rbd_dev)) 3272ed95b21aSIlya Dryomov wake_requests(rbd_dev, false); 3273ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3274ed95b21aSIlya Dryomov } 3275ed95b21aSIlya Dryomov 3276ed95b21aSIlya Dryomov static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v, 3277ed95b21aSIlya Dryomov void **p) 3278ed95b21aSIlya Dryomov { 3279ed95b21aSIlya Dryomov struct rbd_client_id cid = { 0 }; 3280ed95b21aSIlya Dryomov 3281ed95b21aSIlya Dryomov if (struct_v >= 2) { 3282ed95b21aSIlya Dryomov cid.gid = ceph_decode_64(p); 3283ed95b21aSIlya Dryomov cid.handle = ceph_decode_64(p); 3284ed95b21aSIlya Dryomov } 3285ed95b21aSIlya Dryomov 3286ed95b21aSIlya Dryomov dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid, 3287ed95b21aSIlya Dryomov cid.handle); 3288ed95b21aSIlya Dryomov if (!rbd_cid_equal(&cid, &rbd_empty_cid)) { 3289ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3290ed95b21aSIlya Dryomov if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) { 3291ed95b21aSIlya Dryomov dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n", 3292ed95b21aSIlya Dryomov __func__, rbd_dev, cid.gid, cid.handle, 3293ed95b21aSIlya Dryomov rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle); 3294ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 3295ed95b21aSIlya Dryomov return; 3296ed95b21aSIlya Dryomov } 3297ed95b21aSIlya Dryomov 3298ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &rbd_empty_cid); 3299ed95b21aSIlya Dryomov downgrade_write(&rbd_dev->lock_rwsem); 3300ed95b21aSIlya Dryomov } else { 3301ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 3302ed95b21aSIlya Dryomov } 3303ed95b21aSIlya Dryomov 3304ed95b21aSIlya Dryomov if (!__rbd_is_lock_owner(rbd_dev)) 3305ed95b21aSIlya Dryomov wake_requests(rbd_dev, false); 3306ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3307ed95b21aSIlya Dryomov } 3308ed95b21aSIlya Dryomov 33093b77faa0SIlya Dryomov /* 33103b77faa0SIlya Dryomov * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no 33113b77faa0SIlya Dryomov * ResponseMessage is needed. 33123b77faa0SIlya Dryomov */ 33133b77faa0SIlya Dryomov static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v, 3314ed95b21aSIlya Dryomov void **p) 3315ed95b21aSIlya Dryomov { 3316ed95b21aSIlya Dryomov struct rbd_client_id my_cid = rbd_get_cid(rbd_dev); 3317ed95b21aSIlya Dryomov struct rbd_client_id cid = { 0 }; 33183b77faa0SIlya Dryomov int result = 1; 3319ed95b21aSIlya Dryomov 3320ed95b21aSIlya Dryomov if (struct_v >= 2) { 3321ed95b21aSIlya Dryomov cid.gid = ceph_decode_64(p); 3322ed95b21aSIlya Dryomov cid.handle = ceph_decode_64(p); 3323ed95b21aSIlya Dryomov } 3324ed95b21aSIlya Dryomov 3325ed95b21aSIlya Dryomov dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid, 3326ed95b21aSIlya Dryomov cid.handle); 3327ed95b21aSIlya Dryomov if (rbd_cid_equal(&cid, &my_cid)) 33283b77faa0SIlya Dryomov return result; 3329ed95b21aSIlya Dryomov 3330ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 33313b77faa0SIlya Dryomov if (__rbd_is_lock_owner(rbd_dev)) { 33323b77faa0SIlya Dryomov if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED && 33333b77faa0SIlya Dryomov rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid)) 33343b77faa0SIlya Dryomov goto out_unlock; 33353b77faa0SIlya Dryomov 33363b77faa0SIlya Dryomov /* 33373b77faa0SIlya Dryomov * encode ResponseMessage(0) so the peer can detect 33383b77faa0SIlya Dryomov * a missing owner 33393b77faa0SIlya Dryomov */ 33403b77faa0SIlya Dryomov result = 0; 33413b77faa0SIlya Dryomov 3342ed95b21aSIlya Dryomov if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) { 3343e010dd0aSIlya Dryomov if (!rbd_dev->opts->exclusive) { 3344e010dd0aSIlya Dryomov dout("%s rbd_dev %p queueing unlock_work\n", 3345e010dd0aSIlya Dryomov __func__, rbd_dev); 3346e010dd0aSIlya Dryomov queue_work(rbd_dev->task_wq, 3347e010dd0aSIlya Dryomov &rbd_dev->unlock_work); 3348e010dd0aSIlya Dryomov } else { 3349e010dd0aSIlya Dryomov /* refuse to release the lock */ 3350e010dd0aSIlya Dryomov result = -EROFS; 3351ed95b21aSIlya Dryomov } 3352ed95b21aSIlya Dryomov } 3353ed95b21aSIlya Dryomov } 33543b77faa0SIlya Dryomov 33553b77faa0SIlya Dryomov out_unlock: 3356ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 33573b77faa0SIlya Dryomov return result; 3358ed95b21aSIlya Dryomov } 3359ed95b21aSIlya Dryomov 3360ed95b21aSIlya Dryomov static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev, 3361ed95b21aSIlya Dryomov u64 notify_id, u64 cookie, s32 *result) 3362ed95b21aSIlya Dryomov { 3363ed95b21aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 336408a79102SKyle Spiers char buf[4 + CEPH_ENCODING_START_BLK_LEN]; 336508a79102SKyle Spiers int buf_size = sizeof(buf); 3366ed95b21aSIlya Dryomov int ret; 3367ed95b21aSIlya Dryomov 3368ed95b21aSIlya Dryomov if (result) { 3369ed95b21aSIlya Dryomov void *p = buf; 3370ed95b21aSIlya Dryomov 3371ed95b21aSIlya Dryomov /* encode ResponseMessage */ 3372ed95b21aSIlya Dryomov ceph_start_encoding(&p, 1, 1, 3373ed95b21aSIlya Dryomov buf_size - CEPH_ENCODING_START_BLK_LEN); 3374ed95b21aSIlya Dryomov ceph_encode_32(&p, *result); 3375ed95b21aSIlya Dryomov } else { 3376ed95b21aSIlya Dryomov buf_size = 0; 3377ed95b21aSIlya Dryomov } 3378ed95b21aSIlya Dryomov 3379ed95b21aSIlya Dryomov ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid, 3380ed95b21aSIlya Dryomov &rbd_dev->header_oloc, notify_id, cookie, 3381ed95b21aSIlya Dryomov buf, buf_size); 3382ed95b21aSIlya Dryomov if (ret) 3383ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret); 3384ed95b21aSIlya Dryomov } 3385ed95b21aSIlya Dryomov 3386ed95b21aSIlya Dryomov static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id, 3387ed95b21aSIlya Dryomov u64 cookie) 3388ed95b21aSIlya Dryomov { 3389ed95b21aSIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 3390ed95b21aSIlya Dryomov __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL); 3391ed95b21aSIlya Dryomov } 3392ed95b21aSIlya Dryomov 3393ed95b21aSIlya Dryomov static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev, 3394ed95b21aSIlya Dryomov u64 notify_id, u64 cookie, s32 result) 3395ed95b21aSIlya Dryomov { 3396ed95b21aSIlya Dryomov dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result); 3397ed95b21aSIlya Dryomov __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result); 3398ed95b21aSIlya Dryomov } 3399922dab61SIlya Dryomov 3400922dab61SIlya Dryomov static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie, 3401922dab61SIlya Dryomov u64 notifier_id, void *data, size_t data_len) 3402b8d70035SAlex Elder { 3403922dab61SIlya Dryomov struct rbd_device *rbd_dev = arg; 3404ed95b21aSIlya Dryomov void *p = data; 3405ed95b21aSIlya Dryomov void *const end = p + data_len; 3406d4c2269bSIlya Dryomov u8 struct_v = 0; 3407ed95b21aSIlya Dryomov u32 len; 3408ed95b21aSIlya Dryomov u32 notify_op; 3409b8d70035SAlex Elder int ret; 3410b8d70035SAlex Elder 3411ed95b21aSIlya Dryomov dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n", 3412ed95b21aSIlya Dryomov __func__, rbd_dev, cookie, notify_id, data_len); 3413ed95b21aSIlya Dryomov if (data_len) { 3414ed95b21aSIlya Dryomov ret = ceph_start_decoding(&p, end, 1, "NotifyMessage", 3415ed95b21aSIlya Dryomov &struct_v, &len); 3416ed95b21aSIlya Dryomov if (ret) { 3417ed95b21aSIlya Dryomov rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d", 3418ed95b21aSIlya Dryomov ret); 3419ed95b21aSIlya Dryomov return; 3420ed95b21aSIlya Dryomov } 342152bb1f9bSIlya Dryomov 3422ed95b21aSIlya Dryomov notify_op = ceph_decode_32(&p); 3423ed95b21aSIlya Dryomov } else { 3424ed95b21aSIlya Dryomov /* legacy notification for header updates */ 3425ed95b21aSIlya Dryomov notify_op = RBD_NOTIFY_OP_HEADER_UPDATE; 3426ed95b21aSIlya Dryomov len = 0; 3427ed95b21aSIlya Dryomov } 3428ed95b21aSIlya Dryomov 3429ed95b21aSIlya Dryomov dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op); 3430ed95b21aSIlya Dryomov switch (notify_op) { 3431ed95b21aSIlya Dryomov case RBD_NOTIFY_OP_ACQUIRED_LOCK: 3432ed95b21aSIlya Dryomov rbd_handle_acquired_lock(rbd_dev, struct_v, &p); 3433ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 3434ed95b21aSIlya Dryomov break; 3435ed95b21aSIlya Dryomov case RBD_NOTIFY_OP_RELEASED_LOCK: 3436ed95b21aSIlya Dryomov rbd_handle_released_lock(rbd_dev, struct_v, &p); 3437ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 3438ed95b21aSIlya Dryomov break; 3439ed95b21aSIlya Dryomov case RBD_NOTIFY_OP_REQUEST_LOCK: 34403b77faa0SIlya Dryomov ret = rbd_handle_request_lock(rbd_dev, struct_v, &p); 34413b77faa0SIlya Dryomov if (ret <= 0) 3442ed95b21aSIlya Dryomov rbd_acknowledge_notify_result(rbd_dev, notify_id, 34433b77faa0SIlya Dryomov cookie, ret); 3444ed95b21aSIlya Dryomov else 3445ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 3446ed95b21aSIlya Dryomov break; 3447ed95b21aSIlya Dryomov case RBD_NOTIFY_OP_HEADER_UPDATE: 3448e627db08SAlex Elder ret = rbd_dev_refresh(rbd_dev); 3449e627db08SAlex Elder if (ret) 34509584d508SIlya Dryomov rbd_warn(rbd_dev, "refresh failed: %d", ret); 3451b8d70035SAlex Elder 3452ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 3453ed95b21aSIlya Dryomov break; 3454ed95b21aSIlya Dryomov default: 3455ed95b21aSIlya Dryomov if (rbd_is_lock_owner(rbd_dev)) 3456ed95b21aSIlya Dryomov rbd_acknowledge_notify_result(rbd_dev, notify_id, 3457ed95b21aSIlya Dryomov cookie, -EOPNOTSUPP); 3458ed95b21aSIlya Dryomov else 3459ed95b21aSIlya Dryomov rbd_acknowledge_notify(rbd_dev, notify_id, cookie); 3460ed95b21aSIlya Dryomov break; 3461b8d70035SAlex Elder } 3462b8d70035SAlex Elder } 3463b8d70035SAlex Elder 346499d16943SIlya Dryomov static void __rbd_unregister_watch(struct rbd_device *rbd_dev); 34659969ebc5SAlex Elder 3466922dab61SIlya Dryomov static void rbd_watch_errcb(void *arg, u64 cookie, int err) 3467bb040aa0SIlya Dryomov { 3468922dab61SIlya Dryomov struct rbd_device *rbd_dev = arg; 3469bb040aa0SIlya Dryomov 3470922dab61SIlya Dryomov rbd_warn(rbd_dev, "encountered watch error: %d", err); 3471bb040aa0SIlya Dryomov 3472ed95b21aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 3473ed95b21aSIlya Dryomov rbd_set_owner_cid(rbd_dev, &rbd_empty_cid); 3474ed95b21aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 3475bb040aa0SIlya Dryomov 347699d16943SIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 347799d16943SIlya Dryomov if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) { 347899d16943SIlya Dryomov __rbd_unregister_watch(rbd_dev); 347999d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_ERROR; 3480bb040aa0SIlya Dryomov 348199d16943SIlya Dryomov queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0); 3482bb040aa0SIlya Dryomov } 348399d16943SIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 3484bb040aa0SIlya Dryomov } 3485bb040aa0SIlya Dryomov 3486bb040aa0SIlya Dryomov /* 348799d16943SIlya Dryomov * watch_mutex must be locked 34889969ebc5SAlex Elder */ 348999d16943SIlya Dryomov static int __rbd_register_watch(struct rbd_device *rbd_dev) 34909969ebc5SAlex Elder { 34919969ebc5SAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3492922dab61SIlya Dryomov struct ceph_osd_linger_request *handle; 34939969ebc5SAlex Elder 3494922dab61SIlya Dryomov rbd_assert(!rbd_dev->watch_handle); 349599d16943SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 34969969ebc5SAlex Elder 3497922dab61SIlya Dryomov handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid, 3498922dab61SIlya Dryomov &rbd_dev->header_oloc, rbd_watch_cb, 3499922dab61SIlya Dryomov rbd_watch_errcb, rbd_dev); 3500922dab61SIlya Dryomov if (IS_ERR(handle)) 3501922dab61SIlya Dryomov return PTR_ERR(handle); 35029969ebc5SAlex Elder 3503922dab61SIlya Dryomov rbd_dev->watch_handle = handle; 35048eb87565SAlex Elder return 0; 35059969ebc5SAlex Elder } 35069969ebc5SAlex Elder 350799d16943SIlya Dryomov /* 350899d16943SIlya Dryomov * watch_mutex must be locked 350999d16943SIlya Dryomov */ 351099d16943SIlya Dryomov static void __rbd_unregister_watch(struct rbd_device *rbd_dev) 3511fca27065SIlya Dryomov { 3512922dab61SIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3513922dab61SIlya Dryomov int ret; 3514b30a01f2SIlya Dryomov 351599d16943SIlya Dryomov rbd_assert(rbd_dev->watch_handle); 351699d16943SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 3517b30a01f2SIlya Dryomov 3518922dab61SIlya Dryomov ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle); 3519922dab61SIlya Dryomov if (ret) 3520922dab61SIlya Dryomov rbd_warn(rbd_dev, "failed to unwatch: %d", ret); 3521b30a01f2SIlya Dryomov 3522922dab61SIlya Dryomov rbd_dev->watch_handle = NULL; 3523c525f036SIlya Dryomov } 3524c525f036SIlya Dryomov 352599d16943SIlya Dryomov static int rbd_register_watch(struct rbd_device *rbd_dev) 3526c525f036SIlya Dryomov { 352799d16943SIlya Dryomov int ret; 3528811c6688SIlya Dryomov 352999d16943SIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 353099d16943SIlya Dryomov rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED); 353199d16943SIlya Dryomov ret = __rbd_register_watch(rbd_dev); 353299d16943SIlya Dryomov if (ret) 353399d16943SIlya Dryomov goto out; 353499d16943SIlya Dryomov 353599d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED; 353699d16943SIlya Dryomov rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id; 353799d16943SIlya Dryomov 353899d16943SIlya Dryomov out: 353999d16943SIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 354099d16943SIlya Dryomov return ret; 354199d16943SIlya Dryomov } 354299d16943SIlya Dryomov 354399d16943SIlya Dryomov static void cancel_tasks_sync(struct rbd_device *rbd_dev) 354499d16943SIlya Dryomov { 354599d16943SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 354699d16943SIlya Dryomov 3547ed95b21aSIlya Dryomov cancel_work_sync(&rbd_dev->acquired_lock_work); 3548ed95b21aSIlya Dryomov cancel_work_sync(&rbd_dev->released_lock_work); 3549ed95b21aSIlya Dryomov cancel_delayed_work_sync(&rbd_dev->lock_dwork); 3550ed95b21aSIlya Dryomov cancel_work_sync(&rbd_dev->unlock_work); 355199d16943SIlya Dryomov } 355299d16943SIlya Dryomov 355399d16943SIlya Dryomov static void rbd_unregister_watch(struct rbd_device *rbd_dev) 355499d16943SIlya Dryomov { 3555ed95b21aSIlya Dryomov WARN_ON(waitqueue_active(&rbd_dev->lock_waitq)); 355699d16943SIlya Dryomov cancel_tasks_sync(rbd_dev); 355799d16943SIlya Dryomov 355899d16943SIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 355999d16943SIlya Dryomov if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) 356099d16943SIlya Dryomov __rbd_unregister_watch(rbd_dev); 356199d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED; 356299d16943SIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 356399d16943SIlya Dryomov 356423edca86SDongsheng Yang cancel_delayed_work_sync(&rbd_dev->watch_dwork); 3565811c6688SIlya Dryomov ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc); 3566fca27065SIlya Dryomov } 3567fca27065SIlya Dryomov 356814bb211dSIlya Dryomov /* 356914bb211dSIlya Dryomov * lock_rwsem must be held for write 357014bb211dSIlya Dryomov */ 357114bb211dSIlya Dryomov static void rbd_reacquire_lock(struct rbd_device *rbd_dev) 357214bb211dSIlya Dryomov { 357314bb211dSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 357414bb211dSIlya Dryomov char cookie[32]; 357514bb211dSIlya Dryomov int ret; 357614bb211dSIlya Dryomov 357714bb211dSIlya Dryomov WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED); 357814bb211dSIlya Dryomov 357914bb211dSIlya Dryomov format_lock_cookie(rbd_dev, cookie); 358014bb211dSIlya Dryomov ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid, 358114bb211dSIlya Dryomov &rbd_dev->header_oloc, RBD_LOCK_NAME, 358214bb211dSIlya Dryomov CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie, 358314bb211dSIlya Dryomov RBD_LOCK_TAG, cookie); 358414bb211dSIlya Dryomov if (ret) { 358514bb211dSIlya Dryomov if (ret != -EOPNOTSUPP) 358614bb211dSIlya Dryomov rbd_warn(rbd_dev, "failed to update lock cookie: %d", 358714bb211dSIlya Dryomov ret); 358814bb211dSIlya Dryomov 358914bb211dSIlya Dryomov /* 359014bb211dSIlya Dryomov * Lock cookie cannot be updated on older OSDs, so do 359114bb211dSIlya Dryomov * a manual release and queue an acquire. 359214bb211dSIlya Dryomov */ 359314bb211dSIlya Dryomov if (rbd_release_lock(rbd_dev)) 359414bb211dSIlya Dryomov queue_delayed_work(rbd_dev->task_wq, 359514bb211dSIlya Dryomov &rbd_dev->lock_dwork, 0); 359614bb211dSIlya Dryomov } else { 3597edd8ca80SFlorian Margaine __rbd_lock(rbd_dev, cookie); 359814bb211dSIlya Dryomov } 359914bb211dSIlya Dryomov } 360014bb211dSIlya Dryomov 360199d16943SIlya Dryomov static void rbd_reregister_watch(struct work_struct *work) 360299d16943SIlya Dryomov { 360399d16943SIlya Dryomov struct rbd_device *rbd_dev = container_of(to_delayed_work(work), 360499d16943SIlya Dryomov struct rbd_device, watch_dwork); 360599d16943SIlya Dryomov int ret; 360699d16943SIlya Dryomov 360799d16943SIlya Dryomov dout("%s rbd_dev %p\n", __func__, rbd_dev); 360899d16943SIlya Dryomov 360999d16943SIlya Dryomov mutex_lock(&rbd_dev->watch_mutex); 361087c0fdedSIlya Dryomov if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) { 361187c0fdedSIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 361214bb211dSIlya Dryomov return; 361387c0fdedSIlya Dryomov } 361499d16943SIlya Dryomov 361599d16943SIlya Dryomov ret = __rbd_register_watch(rbd_dev); 361699d16943SIlya Dryomov if (ret) { 361799d16943SIlya Dryomov rbd_warn(rbd_dev, "failed to reregister watch: %d", ret); 36184d73644bSIlya Dryomov if (ret == -EBLACKLISTED || ret == -ENOENT) { 361987c0fdedSIlya Dryomov set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags); 362014bb211dSIlya Dryomov wake_requests(rbd_dev, true); 362187c0fdedSIlya Dryomov } else { 362299d16943SIlya Dryomov queue_delayed_work(rbd_dev->task_wq, 362399d16943SIlya Dryomov &rbd_dev->watch_dwork, 362499d16943SIlya Dryomov RBD_RETRY_DELAY); 362587c0fdedSIlya Dryomov } 362687c0fdedSIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 362714bb211dSIlya Dryomov return; 362899d16943SIlya Dryomov } 362999d16943SIlya Dryomov 363099d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED; 363199d16943SIlya Dryomov rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id; 363299d16943SIlya Dryomov mutex_unlock(&rbd_dev->watch_mutex); 363399d16943SIlya Dryomov 363414bb211dSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 363514bb211dSIlya Dryomov if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) 363614bb211dSIlya Dryomov rbd_reacquire_lock(rbd_dev); 363714bb211dSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 363814bb211dSIlya Dryomov 363999d16943SIlya Dryomov ret = rbd_dev_refresh(rbd_dev); 364099d16943SIlya Dryomov if (ret) 3641f6870cc9SColin Ian King rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret); 364299d16943SIlya Dryomov } 364399d16943SIlya Dryomov 364436be9a76SAlex Elder /* 3645f40eb349SAlex Elder * Synchronous osd object method call. Returns the number of bytes 3646f40eb349SAlex Elder * returned in the outbound buffer, or a negative error code. 364736be9a76SAlex Elder */ 364836be9a76SAlex Elder static int rbd_obj_method_sync(struct rbd_device *rbd_dev, 3649ecd4a68aSIlya Dryomov struct ceph_object_id *oid, 3650ecd4a68aSIlya Dryomov struct ceph_object_locator *oloc, 365136be9a76SAlex Elder const char *method_name, 36524157976bSAlex Elder const void *outbound, 365336be9a76SAlex Elder size_t outbound_size, 36544157976bSAlex Elder void *inbound, 3655e2a58ee5SAlex Elder size_t inbound_size) 365636be9a76SAlex Elder { 3657ecd4a68aSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3658ecd4a68aSIlya Dryomov struct page *req_page = NULL; 3659ecd4a68aSIlya Dryomov struct page *reply_page; 366036be9a76SAlex Elder int ret; 366136be9a76SAlex Elder 366236be9a76SAlex Elder /* 36636010a451SAlex Elder * Method calls are ultimately read operations. The result 36646010a451SAlex Elder * should placed into the inbound buffer provided. They 36656010a451SAlex Elder * also supply outbound data--parameters for the object 36666010a451SAlex Elder * method. Currently if this is present it will be a 36676010a451SAlex Elder * snapshot id. 366836be9a76SAlex Elder */ 3669ecd4a68aSIlya Dryomov if (outbound) { 3670ecd4a68aSIlya Dryomov if (outbound_size > PAGE_SIZE) 3671ecd4a68aSIlya Dryomov return -E2BIG; 367236be9a76SAlex Elder 3673ecd4a68aSIlya Dryomov req_page = alloc_page(GFP_KERNEL); 3674ecd4a68aSIlya Dryomov if (!req_page) 3675ecd4a68aSIlya Dryomov return -ENOMEM; 367636be9a76SAlex Elder 3677ecd4a68aSIlya Dryomov memcpy(page_address(req_page), outbound, outbound_size); 367804017e29SAlex Elder } 3679430c28c3SAlex Elder 3680ecd4a68aSIlya Dryomov reply_page = alloc_page(GFP_KERNEL); 3681ecd4a68aSIlya Dryomov if (!reply_page) { 3682ecd4a68aSIlya Dryomov if (req_page) 3683ecd4a68aSIlya Dryomov __free_page(req_page); 3684ecd4a68aSIlya Dryomov return -ENOMEM; 3685ecd4a68aSIlya Dryomov } 368636be9a76SAlex Elder 3687ecd4a68aSIlya Dryomov ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name, 3688ecd4a68aSIlya Dryomov CEPH_OSD_FLAG_READ, req_page, outbound_size, 3689ecd4a68aSIlya Dryomov reply_page, &inbound_size); 3690ecd4a68aSIlya Dryomov if (!ret) { 3691ecd4a68aSIlya Dryomov memcpy(inbound, page_address(reply_page), inbound_size); 3692ecd4a68aSIlya Dryomov ret = inbound_size; 3693ecd4a68aSIlya Dryomov } 369457385b51SAlex Elder 3695ecd4a68aSIlya Dryomov if (req_page) 3696ecd4a68aSIlya Dryomov __free_page(req_page); 3697ecd4a68aSIlya Dryomov __free_page(reply_page); 369836be9a76SAlex Elder return ret; 369936be9a76SAlex Elder } 370036be9a76SAlex Elder 3701ed95b21aSIlya Dryomov /* 3702ed95b21aSIlya Dryomov * lock_rwsem must be held for read 3703ed95b21aSIlya Dryomov */ 37042f18d466SIlya Dryomov static int rbd_wait_state_locked(struct rbd_device *rbd_dev, bool may_acquire) 3705ed95b21aSIlya Dryomov { 3706ed95b21aSIlya Dryomov DEFINE_WAIT(wait); 370734f55d0bSDongsheng Yang unsigned long timeout; 37082f18d466SIlya Dryomov int ret = 0; 37092f18d466SIlya Dryomov 37102f18d466SIlya Dryomov if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) 37112f18d466SIlya Dryomov return -EBLACKLISTED; 37122f18d466SIlya Dryomov 37132f18d466SIlya Dryomov if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) 37142f18d466SIlya Dryomov return 0; 37152f18d466SIlya Dryomov 37162f18d466SIlya Dryomov if (!may_acquire) { 37172f18d466SIlya Dryomov rbd_warn(rbd_dev, "exclusive lock required"); 37182f18d466SIlya Dryomov return -EROFS; 37192f18d466SIlya Dryomov } 3720ed95b21aSIlya Dryomov 3721ed95b21aSIlya Dryomov do { 3722ed95b21aSIlya Dryomov /* 3723ed95b21aSIlya Dryomov * Note the use of mod_delayed_work() in rbd_acquire_lock() 3724ed95b21aSIlya Dryomov * and cancel_delayed_work() in wake_requests(). 3725ed95b21aSIlya Dryomov */ 3726ed95b21aSIlya Dryomov dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev); 3727ed95b21aSIlya Dryomov queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0); 3728ed95b21aSIlya Dryomov prepare_to_wait_exclusive(&rbd_dev->lock_waitq, &wait, 3729ed95b21aSIlya Dryomov TASK_UNINTERRUPTIBLE); 3730ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 373134f55d0bSDongsheng Yang timeout = schedule_timeout(ceph_timeout_jiffies( 373234f55d0bSDongsheng Yang rbd_dev->opts->lock_timeout)); 3733ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 37342f18d466SIlya Dryomov if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) { 37352f18d466SIlya Dryomov ret = -EBLACKLISTED; 37362f18d466SIlya Dryomov break; 37372f18d466SIlya Dryomov } 373834f55d0bSDongsheng Yang if (!timeout) { 373934f55d0bSDongsheng Yang rbd_warn(rbd_dev, "timed out waiting for lock"); 374034f55d0bSDongsheng Yang ret = -ETIMEDOUT; 374134f55d0bSDongsheng Yang break; 374234f55d0bSDongsheng Yang } 37432f18d466SIlya Dryomov } while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED); 374487c0fdedSIlya Dryomov 3745ed95b21aSIlya Dryomov finish_wait(&rbd_dev->lock_waitq, &wait); 37462f18d466SIlya Dryomov return ret; 3747ed95b21aSIlya Dryomov } 3748ed95b21aSIlya Dryomov 37497ad18afaSChristoph Hellwig static void rbd_queue_workfn(struct work_struct *work) 3750bc1ecc65SIlya Dryomov { 37517ad18afaSChristoph Hellwig struct request *rq = blk_mq_rq_from_pdu(work); 37527ad18afaSChristoph Hellwig struct rbd_device *rbd_dev = rq->q->queuedata; 3753bc1ecc65SIlya Dryomov struct rbd_img_request *img_request; 37544e752f0aSJosh Durgin struct ceph_snap_context *snapc = NULL; 3755bc1ecc65SIlya Dryomov u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT; 3756bc1ecc65SIlya Dryomov u64 length = blk_rq_bytes(rq); 37576d2940c8SGuangliang Zhao enum obj_operation_type op_type; 37584e752f0aSJosh Durgin u64 mapping_size; 375980de1912SIlya Dryomov bool must_be_locked; 3760bc1ecc65SIlya Dryomov int result; 3761bc1ecc65SIlya Dryomov 3762aebf526bSChristoph Hellwig switch (req_op(rq)) { 3763aebf526bSChristoph Hellwig case REQ_OP_DISCARD: 3764aebf526bSChristoph Hellwig op_type = OBJ_OP_DISCARD; 3765aebf526bSChristoph Hellwig break; 37666484cbe9SIlya Dryomov case REQ_OP_WRITE_ZEROES: 37676484cbe9SIlya Dryomov op_type = OBJ_OP_ZEROOUT; 37686484cbe9SIlya Dryomov break; 3769aebf526bSChristoph Hellwig case REQ_OP_WRITE: 3770aebf526bSChristoph Hellwig op_type = OBJ_OP_WRITE; 3771aebf526bSChristoph Hellwig break; 3772aebf526bSChristoph Hellwig case REQ_OP_READ: 3773aebf526bSChristoph Hellwig op_type = OBJ_OP_READ; 3774aebf526bSChristoph Hellwig break; 3775aebf526bSChristoph Hellwig default: 3776aebf526bSChristoph Hellwig dout("%s: non-fs request type %d\n", __func__, req_op(rq)); 37777ad18afaSChristoph Hellwig result = -EIO; 37787ad18afaSChristoph Hellwig goto err; 37797ad18afaSChristoph Hellwig } 37807ad18afaSChristoph Hellwig 3781bc1ecc65SIlya Dryomov /* Ignore/skip any zero-length requests */ 3782bc1ecc65SIlya Dryomov 3783bc1ecc65SIlya Dryomov if (!length) { 3784bc1ecc65SIlya Dryomov dout("%s: zero-length request\n", __func__); 3785bc1ecc65SIlya Dryomov result = 0; 3786bc1ecc65SIlya Dryomov goto err_rq; 3787bc1ecc65SIlya Dryomov } 3788bc1ecc65SIlya Dryomov 37899568c93eSIlya Dryomov rbd_assert(op_type == OBJ_OP_READ || 37909568c93eSIlya Dryomov rbd_dev->spec->snap_id == CEPH_NOSNAP); 3791bc1ecc65SIlya Dryomov 3792bc1ecc65SIlya Dryomov /* 3793bc1ecc65SIlya Dryomov * Quit early if the mapped snapshot no longer exists. It's 3794bc1ecc65SIlya Dryomov * still possible the snapshot will have disappeared by the 3795bc1ecc65SIlya Dryomov * time our request arrives at the osd, but there's no sense in 3796bc1ecc65SIlya Dryomov * sending it if we already know. 3797bc1ecc65SIlya Dryomov */ 3798bc1ecc65SIlya Dryomov if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) { 3799bc1ecc65SIlya Dryomov dout("request for non-existent snapshot"); 3800bc1ecc65SIlya Dryomov rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); 3801bc1ecc65SIlya Dryomov result = -ENXIO; 3802bc1ecc65SIlya Dryomov goto err_rq; 3803bc1ecc65SIlya Dryomov } 3804bc1ecc65SIlya Dryomov 3805bc1ecc65SIlya Dryomov if (offset && length > U64_MAX - offset + 1) { 3806bc1ecc65SIlya Dryomov rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset, 3807bc1ecc65SIlya Dryomov length); 3808bc1ecc65SIlya Dryomov result = -EINVAL; 3809bc1ecc65SIlya Dryomov goto err_rq; /* Shouldn't happen */ 3810bc1ecc65SIlya Dryomov } 3811bc1ecc65SIlya Dryomov 38127ad18afaSChristoph Hellwig blk_mq_start_request(rq); 38137ad18afaSChristoph Hellwig 38144e752f0aSJosh Durgin down_read(&rbd_dev->header_rwsem); 38154e752f0aSJosh Durgin mapping_size = rbd_dev->mapping.size; 38166d2940c8SGuangliang Zhao if (op_type != OBJ_OP_READ) { 38174e752f0aSJosh Durgin snapc = rbd_dev->header.snapc; 38184e752f0aSJosh Durgin ceph_get_snap_context(snapc); 38194e752f0aSJosh Durgin } 38204e752f0aSJosh Durgin up_read(&rbd_dev->header_rwsem); 38214e752f0aSJosh Durgin 38224e752f0aSJosh Durgin if (offset + length > mapping_size) { 3823bc1ecc65SIlya Dryomov rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset, 38244e752f0aSJosh Durgin length, mapping_size); 3825bc1ecc65SIlya Dryomov result = -EIO; 3826bc1ecc65SIlya Dryomov goto err_rq; 3827bc1ecc65SIlya Dryomov } 3828bc1ecc65SIlya Dryomov 3829f9bebd58SIlya Dryomov must_be_locked = 3830f9bebd58SIlya Dryomov (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) && 3831f9bebd58SIlya Dryomov (op_type != OBJ_OP_READ || rbd_dev->opts->lock_on_read); 3832ed95b21aSIlya Dryomov if (must_be_locked) { 3833ed95b21aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 38342f18d466SIlya Dryomov result = rbd_wait_state_locked(rbd_dev, 38352f18d466SIlya Dryomov !rbd_dev->opts->exclusive); 38362f18d466SIlya Dryomov if (result) 3837e010dd0aSIlya Dryomov goto err_unlock; 3838e010dd0aSIlya Dryomov } 3839ed95b21aSIlya Dryomov 3840dfd9875fSIlya Dryomov img_request = rbd_img_request_create(rbd_dev, op_type, snapc); 3841bc1ecc65SIlya Dryomov if (!img_request) { 3842bc1ecc65SIlya Dryomov result = -ENOMEM; 3843ed95b21aSIlya Dryomov goto err_unlock; 3844bc1ecc65SIlya Dryomov } 3845bc1ecc65SIlya Dryomov img_request->rq = rq; 384670b16db8SIlya Dryomov snapc = NULL; /* img_request consumes a ref */ 3847bc1ecc65SIlya Dryomov 38486484cbe9SIlya Dryomov if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT) 38495a237819SIlya Dryomov result = rbd_img_fill_nodata(img_request, offset, length); 385090e98c52SGuangliang Zhao else 38515a237819SIlya Dryomov result = rbd_img_fill_from_bio(img_request, offset, length, 385290e98c52SGuangliang Zhao rq->bio); 38530c93e1b7SIlya Dryomov if (result || !img_request->pending_count) 3854bc1ecc65SIlya Dryomov goto err_img_request; 3855bc1ecc65SIlya Dryomov 3856efbd1a11SIlya Dryomov rbd_img_request_submit(img_request); 3857ed95b21aSIlya Dryomov if (must_be_locked) 3858ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3859bc1ecc65SIlya Dryomov return; 3860bc1ecc65SIlya Dryomov 3861bc1ecc65SIlya Dryomov err_img_request: 3862bc1ecc65SIlya Dryomov rbd_img_request_put(img_request); 3863ed95b21aSIlya Dryomov err_unlock: 3864ed95b21aSIlya Dryomov if (must_be_locked) 3865ed95b21aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 3866bc1ecc65SIlya Dryomov err_rq: 3867bc1ecc65SIlya Dryomov if (result) 3868bc1ecc65SIlya Dryomov rbd_warn(rbd_dev, "%s %llx at %llx result %d", 38696d2940c8SGuangliang Zhao obj_op_name(op_type), length, offset, result); 38704e752f0aSJosh Durgin ceph_put_snap_context(snapc); 38717ad18afaSChristoph Hellwig err: 38722a842acaSChristoph Hellwig blk_mq_end_request(rq, errno_to_blk_status(result)); 3873bc1ecc65SIlya Dryomov } 3874bc1ecc65SIlya Dryomov 3875fc17b653SChristoph Hellwig static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx, 38767ad18afaSChristoph Hellwig const struct blk_mq_queue_data *bd) 3877bc1ecc65SIlya Dryomov { 38787ad18afaSChristoph Hellwig struct request *rq = bd->rq; 38797ad18afaSChristoph Hellwig struct work_struct *work = blk_mq_rq_to_pdu(rq); 3880bc1ecc65SIlya Dryomov 38817ad18afaSChristoph Hellwig queue_work(rbd_wq, work); 3882fc17b653SChristoph Hellwig return BLK_STS_OK; 3883bf0d5f50SAlex Elder } 3884bf0d5f50SAlex Elder 3885602adf40SYehuda Sadeh static void rbd_free_disk(struct rbd_device *rbd_dev) 3886602adf40SYehuda Sadeh { 38875769ed0cSIlya Dryomov blk_cleanup_queue(rbd_dev->disk->queue); 38887ad18afaSChristoph Hellwig blk_mq_free_tag_set(&rbd_dev->tag_set); 38895769ed0cSIlya Dryomov put_disk(rbd_dev->disk); 38905769ed0cSIlya Dryomov rbd_dev->disk = NULL; 3891602adf40SYehuda Sadeh } 3892602adf40SYehuda Sadeh 3893788e2df3SAlex Elder static int rbd_obj_read_sync(struct rbd_device *rbd_dev, 3894fe5478e0SIlya Dryomov struct ceph_object_id *oid, 3895fe5478e0SIlya Dryomov struct ceph_object_locator *oloc, 3896fe5478e0SIlya Dryomov void *buf, int buf_len) 3897788e2df3SAlex Elder 3898788e2df3SAlex Elder { 3899fe5478e0SIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3900fe5478e0SIlya Dryomov struct ceph_osd_request *req; 3901fe5478e0SIlya Dryomov struct page **pages; 3902fe5478e0SIlya Dryomov int num_pages = calc_pages_for(0, buf_len); 3903788e2df3SAlex Elder int ret; 3904788e2df3SAlex Elder 3905fe5478e0SIlya Dryomov req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL); 3906fe5478e0SIlya Dryomov if (!req) 3907fe5478e0SIlya Dryomov return -ENOMEM; 3908788e2df3SAlex Elder 3909fe5478e0SIlya Dryomov ceph_oid_copy(&req->r_base_oid, oid); 3910fe5478e0SIlya Dryomov ceph_oloc_copy(&req->r_base_oloc, oloc); 3911fe5478e0SIlya Dryomov req->r_flags = CEPH_OSD_FLAG_READ; 3912788e2df3SAlex Elder 3913fe5478e0SIlya Dryomov pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); 3914fe5478e0SIlya Dryomov if (IS_ERR(pages)) { 3915fe5478e0SIlya Dryomov ret = PTR_ERR(pages); 3916fe5478e0SIlya Dryomov goto out_req; 3917fe5478e0SIlya Dryomov } 39181ceae7efSAlex Elder 3919fe5478e0SIlya Dryomov osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0); 3920fe5478e0SIlya Dryomov osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false, 3921fe5478e0SIlya Dryomov true); 3922788e2df3SAlex Elder 392326f887e0SIlya Dryomov ret = ceph_osdc_alloc_messages(req, GFP_KERNEL); 392426f887e0SIlya Dryomov if (ret) 392526f887e0SIlya Dryomov goto out_req; 392626f887e0SIlya Dryomov 3927fe5478e0SIlya Dryomov ceph_osdc_start_request(osdc, req, false); 3928fe5478e0SIlya Dryomov ret = ceph_osdc_wait_request(osdc, req); 3929fe5478e0SIlya Dryomov if (ret >= 0) 3930fe5478e0SIlya Dryomov ceph_copy_from_page_vector(pages, buf, 0, ret); 3931fe5478e0SIlya Dryomov 3932fe5478e0SIlya Dryomov out_req: 3933fe5478e0SIlya Dryomov ceph_osdc_put_request(req); 3934788e2df3SAlex Elder return ret; 3935788e2df3SAlex Elder } 3936788e2df3SAlex Elder 3937602adf40SYehuda Sadeh /* 3938662518b1SAlex Elder * Read the complete header for the given rbd device. On successful 3939662518b1SAlex Elder * return, the rbd_dev->header field will contain up-to-date 3940662518b1SAlex Elder * information about the image. 39414156d998SAlex Elder */ 394299a41ebcSAlex Elder static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev) 39434156d998SAlex Elder { 39444156d998SAlex Elder struct rbd_image_header_ondisk *ondisk = NULL; 39454156d998SAlex Elder u32 snap_count = 0; 39464156d998SAlex Elder u64 names_size = 0; 39474156d998SAlex Elder u32 want_count; 39484156d998SAlex Elder int ret; 39494156d998SAlex Elder 39504156d998SAlex Elder /* 39514156d998SAlex Elder * The complete header will include an array of its 64-bit 39524156d998SAlex Elder * snapshot ids, followed by the names of those snapshots as 39534156d998SAlex Elder * a contiguous block of NUL-terminated strings. Note that 39544156d998SAlex Elder * the number of snapshots could change by the time we read 39554156d998SAlex Elder * it in, in which case we re-read it. 39564156d998SAlex Elder */ 39574156d998SAlex Elder do { 39584156d998SAlex Elder size_t size; 39594156d998SAlex Elder 39604156d998SAlex Elder kfree(ondisk); 39614156d998SAlex Elder 39624156d998SAlex Elder size = sizeof (*ondisk); 39634156d998SAlex Elder size += snap_count * sizeof (struct rbd_image_snap_ondisk); 39644156d998SAlex Elder size += names_size; 39654156d998SAlex Elder ondisk = kmalloc(size, GFP_KERNEL); 39664156d998SAlex Elder if (!ondisk) 3967662518b1SAlex Elder return -ENOMEM; 39684156d998SAlex Elder 3969fe5478e0SIlya Dryomov ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid, 3970fe5478e0SIlya Dryomov &rbd_dev->header_oloc, ondisk, size); 39714156d998SAlex Elder if (ret < 0) 3972662518b1SAlex Elder goto out; 3973c0cd10dbSAlex Elder if ((size_t)ret < size) { 39744156d998SAlex Elder ret = -ENXIO; 397506ecc6cbSAlex Elder rbd_warn(rbd_dev, "short header read (want %zd got %d)", 397606ecc6cbSAlex Elder size, ret); 3977662518b1SAlex Elder goto out; 39784156d998SAlex Elder } 39794156d998SAlex Elder if (!rbd_dev_ondisk_valid(ondisk)) { 39804156d998SAlex Elder ret = -ENXIO; 398106ecc6cbSAlex Elder rbd_warn(rbd_dev, "invalid header"); 3982662518b1SAlex Elder goto out; 39834156d998SAlex Elder } 39844156d998SAlex Elder 39854156d998SAlex Elder names_size = le64_to_cpu(ondisk->snap_names_len); 39864156d998SAlex Elder want_count = snap_count; 39874156d998SAlex Elder snap_count = le32_to_cpu(ondisk->snap_count); 39884156d998SAlex Elder } while (snap_count != want_count); 39894156d998SAlex Elder 3990662518b1SAlex Elder ret = rbd_header_from_disk(rbd_dev, ondisk); 3991662518b1SAlex Elder out: 39924156d998SAlex Elder kfree(ondisk); 39934156d998SAlex Elder 3994dfc5606dSYehuda Sadeh return ret; 3995602adf40SYehuda Sadeh } 3996602adf40SYehuda Sadeh 399715228edeSAlex Elder /* 399815228edeSAlex Elder * Clear the rbd device's EXISTS flag if the snapshot it's mapped to 399915228edeSAlex Elder * has disappeared from the (just updated) snapshot context. 400015228edeSAlex Elder */ 400115228edeSAlex Elder static void rbd_exists_validate(struct rbd_device *rbd_dev) 400215228edeSAlex Elder { 400315228edeSAlex Elder u64 snap_id; 400415228edeSAlex Elder 400515228edeSAlex Elder if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) 400615228edeSAlex Elder return; 400715228edeSAlex Elder 400815228edeSAlex Elder snap_id = rbd_dev->spec->snap_id; 400915228edeSAlex Elder if (snap_id == CEPH_NOSNAP) 401015228edeSAlex Elder return; 401115228edeSAlex Elder 401215228edeSAlex Elder if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX) 401315228edeSAlex Elder clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 401415228edeSAlex Elder } 401515228edeSAlex Elder 40169875201eSJosh Durgin static void rbd_dev_update_size(struct rbd_device *rbd_dev) 40179875201eSJosh Durgin { 40189875201eSJosh Durgin sector_t size; 40199875201eSJosh Durgin 40209875201eSJosh Durgin /* 4021811c6688SIlya Dryomov * If EXISTS is not set, rbd_dev->disk may be NULL, so don't 4022811c6688SIlya Dryomov * try to update its size. If REMOVING is set, updating size 4023811c6688SIlya Dryomov * is just useless work since the device can't be opened. 40249875201eSJosh Durgin */ 4025811c6688SIlya Dryomov if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) && 4026811c6688SIlya Dryomov !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) { 40279875201eSJosh Durgin size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE; 40289875201eSJosh Durgin dout("setting size to %llu sectors", (unsigned long long)size); 40299875201eSJosh Durgin set_capacity(rbd_dev->disk, size); 40309875201eSJosh Durgin revalidate_disk(rbd_dev->disk); 40319875201eSJosh Durgin } 40329875201eSJosh Durgin } 40339875201eSJosh Durgin 4034cc4a38bdSAlex Elder static int rbd_dev_refresh(struct rbd_device *rbd_dev) 40351fe5e993SAlex Elder { 4036e627db08SAlex Elder u64 mapping_size; 40371fe5e993SAlex Elder int ret; 40381fe5e993SAlex Elder 4039cfbf6377SAlex Elder down_write(&rbd_dev->header_rwsem); 40403b5cf2a2SAlex Elder mapping_size = rbd_dev->mapping.size; 4041a720ae09SIlya Dryomov 4042a720ae09SIlya Dryomov ret = rbd_dev_header_info(rbd_dev); 404352bb1f9bSIlya Dryomov if (ret) 404473e39e4dSIlya Dryomov goto out; 404515228edeSAlex Elder 4046e8f59b59SIlya Dryomov /* 4047e8f59b59SIlya Dryomov * If there is a parent, see if it has disappeared due to the 4048e8f59b59SIlya Dryomov * mapped image getting flattened. 4049e8f59b59SIlya Dryomov */ 4050e8f59b59SIlya Dryomov if (rbd_dev->parent) { 4051e8f59b59SIlya Dryomov ret = rbd_dev_v2_parent_info(rbd_dev); 4052e8f59b59SIlya Dryomov if (ret) 405373e39e4dSIlya Dryomov goto out; 4054e8f59b59SIlya Dryomov } 4055e8f59b59SIlya Dryomov 40565ff1108cSIlya Dryomov if (rbd_dev->spec->snap_id == CEPH_NOSNAP) { 40575ff1108cSIlya Dryomov rbd_dev->mapping.size = rbd_dev->header.image_size; 40585ff1108cSIlya Dryomov } else { 40595ff1108cSIlya Dryomov /* validate mapped snapshot's EXISTS flag */ 406015228edeSAlex Elder rbd_exists_validate(rbd_dev); 40615ff1108cSIlya Dryomov } 40625ff1108cSIlya Dryomov 406373e39e4dSIlya Dryomov out: 4064cfbf6377SAlex Elder up_write(&rbd_dev->header_rwsem); 406573e39e4dSIlya Dryomov if (!ret && mapping_size != rbd_dev->mapping.size) 40669875201eSJosh Durgin rbd_dev_update_size(rbd_dev); 40671fe5e993SAlex Elder 406873e39e4dSIlya Dryomov return ret; 40691fe5e993SAlex Elder } 40701fe5e993SAlex Elder 4071d6296d39SChristoph Hellwig static int rbd_init_request(struct blk_mq_tag_set *set, struct request *rq, 4072d6296d39SChristoph Hellwig unsigned int hctx_idx, unsigned int numa_node) 40737ad18afaSChristoph Hellwig { 40747ad18afaSChristoph Hellwig struct work_struct *work = blk_mq_rq_to_pdu(rq); 40757ad18afaSChristoph Hellwig 40767ad18afaSChristoph Hellwig INIT_WORK(work, rbd_queue_workfn); 40777ad18afaSChristoph Hellwig return 0; 40787ad18afaSChristoph Hellwig } 40797ad18afaSChristoph Hellwig 4080f363b089SEric Biggers static const struct blk_mq_ops rbd_mq_ops = { 40817ad18afaSChristoph Hellwig .queue_rq = rbd_queue_rq, 40827ad18afaSChristoph Hellwig .init_request = rbd_init_request, 40837ad18afaSChristoph Hellwig }; 40847ad18afaSChristoph Hellwig 4085602adf40SYehuda Sadeh static int rbd_init_disk(struct rbd_device *rbd_dev) 4086602adf40SYehuda Sadeh { 4087602adf40SYehuda Sadeh struct gendisk *disk; 4088602adf40SYehuda Sadeh struct request_queue *q; 4089420efbdfSIlya Dryomov unsigned int objset_bytes = 4090420efbdfSIlya Dryomov rbd_dev->layout.object_size * rbd_dev->layout.stripe_count; 40917ad18afaSChristoph Hellwig int err; 4092602adf40SYehuda Sadeh 4093602adf40SYehuda Sadeh /* create gendisk info */ 40947e513d43SIlya Dryomov disk = alloc_disk(single_major ? 40957e513d43SIlya Dryomov (1 << RBD_SINGLE_MAJOR_PART_SHIFT) : 40967e513d43SIlya Dryomov RBD_MINORS_PER_MAJOR); 4097602adf40SYehuda Sadeh if (!disk) 40981fcdb8aaSAlex Elder return -ENOMEM; 4099602adf40SYehuda Sadeh 4100f0f8cef5SAlex Elder snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", 4101de71a297SAlex Elder rbd_dev->dev_id); 4102602adf40SYehuda Sadeh disk->major = rbd_dev->major; 4103dd82fff1SIlya Dryomov disk->first_minor = rbd_dev->minor; 41047e513d43SIlya Dryomov if (single_major) 41057e513d43SIlya Dryomov disk->flags |= GENHD_FL_EXT_DEVT; 4106602adf40SYehuda Sadeh disk->fops = &rbd_bd_ops; 4107602adf40SYehuda Sadeh disk->private_data = rbd_dev; 4108602adf40SYehuda Sadeh 41097ad18afaSChristoph Hellwig memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set)); 41107ad18afaSChristoph Hellwig rbd_dev->tag_set.ops = &rbd_mq_ops; 4111b5584180SIlya Dryomov rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth; 41127ad18afaSChristoph Hellwig rbd_dev->tag_set.numa_node = NUMA_NO_NODE; 4113b5584180SIlya Dryomov rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; 41147ad18afaSChristoph Hellwig rbd_dev->tag_set.nr_hw_queues = 1; 41157ad18afaSChristoph Hellwig rbd_dev->tag_set.cmd_size = sizeof(struct work_struct); 41167ad18afaSChristoph Hellwig 41177ad18afaSChristoph Hellwig err = blk_mq_alloc_tag_set(&rbd_dev->tag_set); 41187ad18afaSChristoph Hellwig if (err) 4119602adf40SYehuda Sadeh goto out_disk; 4120029bcbd8SJosh Durgin 41217ad18afaSChristoph Hellwig q = blk_mq_init_queue(&rbd_dev->tag_set); 41227ad18afaSChristoph Hellwig if (IS_ERR(q)) { 41237ad18afaSChristoph Hellwig err = PTR_ERR(q); 41247ad18afaSChristoph Hellwig goto out_tag_set; 41257ad18afaSChristoph Hellwig } 41267ad18afaSChristoph Hellwig 41278b904b5bSBart Van Assche blk_queue_flag_set(QUEUE_FLAG_NONROT, q); 4128d8a2c89cSIlya Dryomov /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */ 4129593a9e7bSAlex Elder 4130420efbdfSIlya Dryomov blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT); 41310d9fde4fSIlya Dryomov q->limits.max_sectors = queue_max_hw_sectors(q); 413221acdf45SIlya Dryomov blk_queue_max_segments(q, USHRT_MAX); 413324f1df60SIlya Dryomov blk_queue_max_segment_size(q, UINT_MAX); 4134420efbdfSIlya Dryomov blk_queue_io_min(q, objset_bytes); 4135420efbdfSIlya Dryomov blk_queue_io_opt(q, objset_bytes); 4136029bcbd8SJosh Durgin 4137d9360540SIlya Dryomov if (rbd_dev->opts->trim) { 41388b904b5bSBart Van Assche blk_queue_flag_set(QUEUE_FLAG_DISCARD, q); 4139420efbdfSIlya Dryomov q->limits.discard_granularity = objset_bytes; 4140420efbdfSIlya Dryomov blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT); 4141420efbdfSIlya Dryomov blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT); 4142d9360540SIlya Dryomov } 414390e98c52SGuangliang Zhao 4144bae818eeSRonny Hegewald if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC)) 4145dc3b17ccSJan Kara q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES; 4146bae818eeSRonny Hegewald 41475769ed0cSIlya Dryomov /* 41485769ed0cSIlya Dryomov * disk_release() expects a queue ref from add_disk() and will 41495769ed0cSIlya Dryomov * put it. Hold an extra ref until add_disk() is called. 41505769ed0cSIlya Dryomov */ 41515769ed0cSIlya Dryomov WARN_ON(!blk_get_queue(q)); 4152602adf40SYehuda Sadeh disk->queue = q; 4153602adf40SYehuda Sadeh q->queuedata = rbd_dev; 4154602adf40SYehuda Sadeh 4155602adf40SYehuda Sadeh rbd_dev->disk = disk; 4156602adf40SYehuda Sadeh 4157602adf40SYehuda Sadeh return 0; 41587ad18afaSChristoph Hellwig out_tag_set: 41597ad18afaSChristoph Hellwig blk_mq_free_tag_set(&rbd_dev->tag_set); 4160602adf40SYehuda Sadeh out_disk: 4161602adf40SYehuda Sadeh put_disk(disk); 41627ad18afaSChristoph Hellwig return err; 4163602adf40SYehuda Sadeh } 4164602adf40SYehuda Sadeh 4165dfc5606dSYehuda Sadeh /* 4166dfc5606dSYehuda Sadeh sysfs 4167dfc5606dSYehuda Sadeh */ 4168602adf40SYehuda Sadeh 4169593a9e7bSAlex Elder static struct rbd_device *dev_to_rbd_dev(struct device *dev) 4170593a9e7bSAlex Elder { 4171593a9e7bSAlex Elder return container_of(dev, struct rbd_device, dev); 4172593a9e7bSAlex Elder } 4173593a9e7bSAlex Elder 4174dfc5606dSYehuda Sadeh static ssize_t rbd_size_show(struct device *dev, 4175dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 4176602adf40SYehuda Sadeh { 4177593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4178dfc5606dSYehuda Sadeh 4179fc71d833SAlex Elder return sprintf(buf, "%llu\n", 4180fc71d833SAlex Elder (unsigned long long)rbd_dev->mapping.size); 4181602adf40SYehuda Sadeh } 4182602adf40SYehuda Sadeh 418334b13184SAlex Elder /* 418434b13184SAlex Elder * Note this shows the features for whatever's mapped, which is not 418534b13184SAlex Elder * necessarily the base image. 418634b13184SAlex Elder */ 418734b13184SAlex Elder static ssize_t rbd_features_show(struct device *dev, 418834b13184SAlex Elder struct device_attribute *attr, char *buf) 418934b13184SAlex Elder { 419034b13184SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 419134b13184SAlex Elder 419234b13184SAlex Elder return sprintf(buf, "0x%016llx\n", 419334b13184SAlex Elder (unsigned long long)rbd_dev->mapping.features); 419434b13184SAlex Elder } 419534b13184SAlex Elder 4196dfc5606dSYehuda Sadeh static ssize_t rbd_major_show(struct device *dev, 4197dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 4198602adf40SYehuda Sadeh { 4199593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4200dfc5606dSYehuda Sadeh 4201fc71d833SAlex Elder if (rbd_dev->major) 4202dfc5606dSYehuda Sadeh return sprintf(buf, "%d\n", rbd_dev->major); 4203fc71d833SAlex Elder 4204fc71d833SAlex Elder return sprintf(buf, "(none)\n"); 4205dd82fff1SIlya Dryomov } 4206fc71d833SAlex Elder 4207dd82fff1SIlya Dryomov static ssize_t rbd_minor_show(struct device *dev, 4208dd82fff1SIlya Dryomov struct device_attribute *attr, char *buf) 4209dd82fff1SIlya Dryomov { 4210dd82fff1SIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4211dd82fff1SIlya Dryomov 4212dd82fff1SIlya Dryomov return sprintf(buf, "%d\n", rbd_dev->minor); 4213dfc5606dSYehuda Sadeh } 4214dfc5606dSYehuda Sadeh 4215005a07bfSIlya Dryomov static ssize_t rbd_client_addr_show(struct device *dev, 4216005a07bfSIlya Dryomov struct device_attribute *attr, char *buf) 4217005a07bfSIlya Dryomov { 4218005a07bfSIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4219005a07bfSIlya Dryomov struct ceph_entity_addr *client_addr = 4220005a07bfSIlya Dryomov ceph_client_addr(rbd_dev->rbd_client->client); 4221005a07bfSIlya Dryomov 4222005a07bfSIlya Dryomov return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr, 4223005a07bfSIlya Dryomov le32_to_cpu(client_addr->nonce)); 4224005a07bfSIlya Dryomov } 4225005a07bfSIlya Dryomov 4226dfc5606dSYehuda Sadeh static ssize_t rbd_client_id_show(struct device *dev, 4227dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 4228dfc5606dSYehuda Sadeh { 4229593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4230dfc5606dSYehuda Sadeh 42311dbb4399SAlex Elder return sprintf(buf, "client%lld\n", 4232033268a5SIlya Dryomov ceph_client_gid(rbd_dev->rbd_client->client)); 4233dfc5606dSYehuda Sadeh } 4234dfc5606dSYehuda Sadeh 4235267fb90bSMike Christie static ssize_t rbd_cluster_fsid_show(struct device *dev, 4236267fb90bSMike Christie struct device_attribute *attr, char *buf) 4237267fb90bSMike Christie { 4238267fb90bSMike Christie struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4239267fb90bSMike Christie 4240267fb90bSMike Christie return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid); 4241267fb90bSMike Christie } 4242267fb90bSMike Christie 42430d6d1e9cSMike Christie static ssize_t rbd_config_info_show(struct device *dev, 42440d6d1e9cSMike Christie struct device_attribute *attr, char *buf) 42450d6d1e9cSMike Christie { 42460d6d1e9cSMike Christie struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 42470d6d1e9cSMike Christie 42480d6d1e9cSMike Christie return sprintf(buf, "%s\n", rbd_dev->config_info); 4249dfc5606dSYehuda Sadeh } 4250dfc5606dSYehuda Sadeh 4251dfc5606dSYehuda Sadeh static ssize_t rbd_pool_show(struct device *dev, 4252dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 4253dfc5606dSYehuda Sadeh { 4254593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4255dfc5606dSYehuda Sadeh 42560d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->pool_name); 4257dfc5606dSYehuda Sadeh } 4258dfc5606dSYehuda Sadeh 42599bb2f334SAlex Elder static ssize_t rbd_pool_id_show(struct device *dev, 42609bb2f334SAlex Elder struct device_attribute *attr, char *buf) 42619bb2f334SAlex Elder { 42629bb2f334SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 42639bb2f334SAlex Elder 42640d7dbfceSAlex Elder return sprintf(buf, "%llu\n", 42650d7dbfceSAlex Elder (unsigned long long) rbd_dev->spec->pool_id); 42669bb2f334SAlex Elder } 42679bb2f334SAlex Elder 4268b26c047bSIlya Dryomov static ssize_t rbd_pool_ns_show(struct device *dev, 4269b26c047bSIlya Dryomov struct device_attribute *attr, char *buf) 4270b26c047bSIlya Dryomov { 4271b26c047bSIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4272b26c047bSIlya Dryomov 4273b26c047bSIlya Dryomov return sprintf(buf, "%s\n", rbd_dev->spec->pool_ns ?: ""); 4274b26c047bSIlya Dryomov } 4275b26c047bSIlya Dryomov 4276dfc5606dSYehuda Sadeh static ssize_t rbd_name_show(struct device *dev, 4277dfc5606dSYehuda Sadeh struct device_attribute *attr, char *buf) 4278dfc5606dSYehuda Sadeh { 4279593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4280dfc5606dSYehuda Sadeh 4281a92ffdf8SAlex Elder if (rbd_dev->spec->image_name) 42820d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_name); 4283a92ffdf8SAlex Elder 4284a92ffdf8SAlex Elder return sprintf(buf, "(unknown)\n"); 4285dfc5606dSYehuda Sadeh } 4286dfc5606dSYehuda Sadeh 4287589d30e0SAlex Elder static ssize_t rbd_image_id_show(struct device *dev, 4288589d30e0SAlex Elder struct device_attribute *attr, char *buf) 4289589d30e0SAlex Elder { 4290589d30e0SAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4291589d30e0SAlex Elder 42920d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->image_id); 4293589d30e0SAlex Elder } 4294589d30e0SAlex Elder 429534b13184SAlex Elder /* 429634b13184SAlex Elder * Shows the name of the currently-mapped snapshot (or 429734b13184SAlex Elder * RBD_SNAP_HEAD_NAME for the base image). 429834b13184SAlex Elder */ 4299dfc5606dSYehuda Sadeh static ssize_t rbd_snap_show(struct device *dev, 4300dfc5606dSYehuda Sadeh struct device_attribute *attr, 4301dfc5606dSYehuda Sadeh char *buf) 4302dfc5606dSYehuda Sadeh { 4303593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4304dfc5606dSYehuda Sadeh 43050d7dbfceSAlex Elder return sprintf(buf, "%s\n", rbd_dev->spec->snap_name); 4306dfc5606dSYehuda Sadeh } 4307dfc5606dSYehuda Sadeh 430892a58671SMike Christie static ssize_t rbd_snap_id_show(struct device *dev, 430992a58671SMike Christie struct device_attribute *attr, char *buf) 431092a58671SMike Christie { 431192a58671SMike Christie struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 431292a58671SMike Christie 431392a58671SMike Christie return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id); 431492a58671SMike Christie } 431592a58671SMike Christie 431686b00e0dSAlex Elder /* 4317ff96128fSIlya Dryomov * For a v2 image, shows the chain of parent images, separated by empty 4318ff96128fSIlya Dryomov * lines. For v1 images or if there is no parent, shows "(no parent 4319ff96128fSIlya Dryomov * image)". 432086b00e0dSAlex Elder */ 432186b00e0dSAlex Elder static ssize_t rbd_parent_show(struct device *dev, 432286b00e0dSAlex Elder struct device_attribute *attr, 432386b00e0dSAlex Elder char *buf) 432486b00e0dSAlex Elder { 432586b00e0dSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4326ff96128fSIlya Dryomov ssize_t count = 0; 432786b00e0dSAlex Elder 4328ff96128fSIlya Dryomov if (!rbd_dev->parent) 432986b00e0dSAlex Elder return sprintf(buf, "(no parent image)\n"); 433086b00e0dSAlex Elder 4331ff96128fSIlya Dryomov for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) { 4332ff96128fSIlya Dryomov struct rbd_spec *spec = rbd_dev->parent_spec; 433386b00e0dSAlex Elder 4334ff96128fSIlya Dryomov count += sprintf(&buf[count], "%s" 4335ff96128fSIlya Dryomov "pool_id %llu\npool_name %s\n" 4336e92c0eafSIlya Dryomov "pool_ns %s\n" 4337ff96128fSIlya Dryomov "image_id %s\nimage_name %s\n" 4338ff96128fSIlya Dryomov "snap_id %llu\nsnap_name %s\n" 4339ff96128fSIlya Dryomov "overlap %llu\n", 4340ff96128fSIlya Dryomov !count ? "" : "\n", /* first? */ 4341ff96128fSIlya Dryomov spec->pool_id, spec->pool_name, 4342e92c0eafSIlya Dryomov spec->pool_ns ?: "", 4343ff96128fSIlya Dryomov spec->image_id, spec->image_name ?: "(unknown)", 4344ff96128fSIlya Dryomov spec->snap_id, spec->snap_name, 4345ff96128fSIlya Dryomov rbd_dev->parent_overlap); 4346ff96128fSIlya Dryomov } 434786b00e0dSAlex Elder 434886b00e0dSAlex Elder return count; 434986b00e0dSAlex Elder } 435086b00e0dSAlex Elder 4351dfc5606dSYehuda Sadeh static ssize_t rbd_image_refresh(struct device *dev, 4352dfc5606dSYehuda Sadeh struct device_attribute *attr, 4353dfc5606dSYehuda Sadeh const char *buf, 4354dfc5606dSYehuda Sadeh size_t size) 4355dfc5606dSYehuda Sadeh { 4356593a9e7bSAlex Elder struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 4357b813623aSAlex Elder int ret; 4358602adf40SYehuda Sadeh 4359cc4a38bdSAlex Elder ret = rbd_dev_refresh(rbd_dev); 4360e627db08SAlex Elder if (ret) 436152bb1f9bSIlya Dryomov return ret; 4362b813623aSAlex Elder 436352bb1f9bSIlya Dryomov return size; 4364dfc5606dSYehuda Sadeh } 4365602adf40SYehuda Sadeh 43665657a819SJoe Perches static DEVICE_ATTR(size, 0444, rbd_size_show, NULL); 43675657a819SJoe Perches static DEVICE_ATTR(features, 0444, rbd_features_show, NULL); 43685657a819SJoe Perches static DEVICE_ATTR(major, 0444, rbd_major_show, NULL); 43695657a819SJoe Perches static DEVICE_ATTR(minor, 0444, rbd_minor_show, NULL); 43705657a819SJoe Perches static DEVICE_ATTR(client_addr, 0444, rbd_client_addr_show, NULL); 43715657a819SJoe Perches static DEVICE_ATTR(client_id, 0444, rbd_client_id_show, NULL); 43725657a819SJoe Perches static DEVICE_ATTR(cluster_fsid, 0444, rbd_cluster_fsid_show, NULL); 43735657a819SJoe Perches static DEVICE_ATTR(config_info, 0400, rbd_config_info_show, NULL); 43745657a819SJoe Perches static DEVICE_ATTR(pool, 0444, rbd_pool_show, NULL); 43755657a819SJoe Perches static DEVICE_ATTR(pool_id, 0444, rbd_pool_id_show, NULL); 4376b26c047bSIlya Dryomov static DEVICE_ATTR(pool_ns, 0444, rbd_pool_ns_show, NULL); 43775657a819SJoe Perches static DEVICE_ATTR(name, 0444, rbd_name_show, NULL); 43785657a819SJoe Perches static DEVICE_ATTR(image_id, 0444, rbd_image_id_show, NULL); 43795657a819SJoe Perches static DEVICE_ATTR(refresh, 0200, NULL, rbd_image_refresh); 43805657a819SJoe Perches static DEVICE_ATTR(current_snap, 0444, rbd_snap_show, NULL); 43815657a819SJoe Perches static DEVICE_ATTR(snap_id, 0444, rbd_snap_id_show, NULL); 43825657a819SJoe Perches static DEVICE_ATTR(parent, 0444, rbd_parent_show, NULL); 4383dfc5606dSYehuda Sadeh 4384dfc5606dSYehuda Sadeh static struct attribute *rbd_attrs[] = { 4385dfc5606dSYehuda Sadeh &dev_attr_size.attr, 438634b13184SAlex Elder &dev_attr_features.attr, 4387dfc5606dSYehuda Sadeh &dev_attr_major.attr, 4388dd82fff1SIlya Dryomov &dev_attr_minor.attr, 4389005a07bfSIlya Dryomov &dev_attr_client_addr.attr, 4390dfc5606dSYehuda Sadeh &dev_attr_client_id.attr, 4391267fb90bSMike Christie &dev_attr_cluster_fsid.attr, 43920d6d1e9cSMike Christie &dev_attr_config_info.attr, 4393dfc5606dSYehuda Sadeh &dev_attr_pool.attr, 43949bb2f334SAlex Elder &dev_attr_pool_id.attr, 4395b26c047bSIlya Dryomov &dev_attr_pool_ns.attr, 4396dfc5606dSYehuda Sadeh &dev_attr_name.attr, 4397589d30e0SAlex Elder &dev_attr_image_id.attr, 4398dfc5606dSYehuda Sadeh &dev_attr_current_snap.attr, 439992a58671SMike Christie &dev_attr_snap_id.attr, 440086b00e0dSAlex Elder &dev_attr_parent.attr, 4401dfc5606dSYehuda Sadeh &dev_attr_refresh.attr, 4402dfc5606dSYehuda Sadeh NULL 4403dfc5606dSYehuda Sadeh }; 4404dfc5606dSYehuda Sadeh 4405dfc5606dSYehuda Sadeh static struct attribute_group rbd_attr_group = { 4406dfc5606dSYehuda Sadeh .attrs = rbd_attrs, 4407dfc5606dSYehuda Sadeh }; 4408dfc5606dSYehuda Sadeh 4409dfc5606dSYehuda Sadeh static const struct attribute_group *rbd_attr_groups[] = { 4410dfc5606dSYehuda Sadeh &rbd_attr_group, 4411dfc5606dSYehuda Sadeh NULL 4412dfc5606dSYehuda Sadeh }; 4413dfc5606dSYehuda Sadeh 44146cac4695SIlya Dryomov static void rbd_dev_release(struct device *dev); 4415dfc5606dSYehuda Sadeh 4416b9942bc9SBhumika Goyal static const struct device_type rbd_device_type = { 4417dfc5606dSYehuda Sadeh .name = "rbd", 4418dfc5606dSYehuda Sadeh .groups = rbd_attr_groups, 44196cac4695SIlya Dryomov .release = rbd_dev_release, 4420dfc5606dSYehuda Sadeh }; 4421dfc5606dSYehuda Sadeh 44228b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec) 44238b8fb99cSAlex Elder { 44248b8fb99cSAlex Elder kref_get(&spec->kref); 44258b8fb99cSAlex Elder 44268b8fb99cSAlex Elder return spec; 44278b8fb99cSAlex Elder } 44288b8fb99cSAlex Elder 44298b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref); 44308b8fb99cSAlex Elder static void rbd_spec_put(struct rbd_spec *spec) 44318b8fb99cSAlex Elder { 44328b8fb99cSAlex Elder if (spec) 44338b8fb99cSAlex Elder kref_put(&spec->kref, rbd_spec_free); 44348b8fb99cSAlex Elder } 44358b8fb99cSAlex Elder 44368b8fb99cSAlex Elder static struct rbd_spec *rbd_spec_alloc(void) 44378b8fb99cSAlex Elder { 44388b8fb99cSAlex Elder struct rbd_spec *spec; 44398b8fb99cSAlex Elder 44408b8fb99cSAlex Elder spec = kzalloc(sizeof (*spec), GFP_KERNEL); 44418b8fb99cSAlex Elder if (!spec) 44428b8fb99cSAlex Elder return NULL; 444304077599SIlya Dryomov 444404077599SIlya Dryomov spec->pool_id = CEPH_NOPOOL; 444504077599SIlya Dryomov spec->snap_id = CEPH_NOSNAP; 44468b8fb99cSAlex Elder kref_init(&spec->kref); 44478b8fb99cSAlex Elder 44488b8fb99cSAlex Elder return spec; 44498b8fb99cSAlex Elder } 44508b8fb99cSAlex Elder 44518b8fb99cSAlex Elder static void rbd_spec_free(struct kref *kref) 44528b8fb99cSAlex Elder { 44538b8fb99cSAlex Elder struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref); 44548b8fb99cSAlex Elder 44558b8fb99cSAlex Elder kfree(spec->pool_name); 4456b26c047bSIlya Dryomov kfree(spec->pool_ns); 44578b8fb99cSAlex Elder kfree(spec->image_id); 44588b8fb99cSAlex Elder kfree(spec->image_name); 44598b8fb99cSAlex Elder kfree(spec->snap_name); 44608b8fb99cSAlex Elder kfree(spec); 44618b8fb99cSAlex Elder } 44628b8fb99cSAlex Elder 44631643dfa4SIlya Dryomov static void rbd_dev_free(struct rbd_device *rbd_dev) 4464dd5ac32dSIlya Dryomov { 446599d16943SIlya Dryomov WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED); 4466ed95b21aSIlya Dryomov WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED); 4467dd5ac32dSIlya Dryomov 4468c41d13a3SIlya Dryomov ceph_oid_destroy(&rbd_dev->header_oid); 44696b6dddbeSIlya Dryomov ceph_oloc_destroy(&rbd_dev->header_oloc); 44700d6d1e9cSMike Christie kfree(rbd_dev->config_info); 4471c41d13a3SIlya Dryomov 4472dd5ac32dSIlya Dryomov rbd_put_client(rbd_dev->rbd_client); 4473dd5ac32dSIlya Dryomov rbd_spec_put(rbd_dev->spec); 4474dd5ac32dSIlya Dryomov kfree(rbd_dev->opts); 4475dd5ac32dSIlya Dryomov kfree(rbd_dev); 44761643dfa4SIlya Dryomov } 44771643dfa4SIlya Dryomov 44781643dfa4SIlya Dryomov static void rbd_dev_release(struct device *dev) 44791643dfa4SIlya Dryomov { 44801643dfa4SIlya Dryomov struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); 44811643dfa4SIlya Dryomov bool need_put = !!rbd_dev->opts; 44821643dfa4SIlya Dryomov 44831643dfa4SIlya Dryomov if (need_put) { 44841643dfa4SIlya Dryomov destroy_workqueue(rbd_dev->task_wq); 44851643dfa4SIlya Dryomov ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id); 44861643dfa4SIlya Dryomov } 44871643dfa4SIlya Dryomov 44881643dfa4SIlya Dryomov rbd_dev_free(rbd_dev); 4489dd5ac32dSIlya Dryomov 4490dd5ac32dSIlya Dryomov /* 4491dd5ac32dSIlya Dryomov * This is racy, but way better than putting module outside of 4492dd5ac32dSIlya Dryomov * the release callback. The race window is pretty small, so 4493dd5ac32dSIlya Dryomov * doing something similar to dm (dm-builtin.c) is overkill. 4494dd5ac32dSIlya Dryomov */ 4495dd5ac32dSIlya Dryomov if (need_put) 4496dd5ac32dSIlya Dryomov module_put(THIS_MODULE); 4497dd5ac32dSIlya Dryomov } 4498dd5ac32dSIlya Dryomov 44991643dfa4SIlya Dryomov static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc, 45001643dfa4SIlya Dryomov struct rbd_spec *spec) 4501c53d5893SAlex Elder { 4502c53d5893SAlex Elder struct rbd_device *rbd_dev; 4503c53d5893SAlex Elder 4504c53d5893SAlex Elder rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL); 4505c53d5893SAlex Elder if (!rbd_dev) 4506c53d5893SAlex Elder return NULL; 4507c53d5893SAlex Elder 4508c53d5893SAlex Elder spin_lock_init(&rbd_dev->lock); 4509c53d5893SAlex Elder INIT_LIST_HEAD(&rbd_dev->node); 4510c53d5893SAlex Elder init_rwsem(&rbd_dev->header_rwsem); 4511c53d5893SAlex Elder 45127e97332eSIlya Dryomov rbd_dev->header.data_pool_id = CEPH_NOPOOL; 4513c41d13a3SIlya Dryomov ceph_oid_init(&rbd_dev->header_oid); 4514431a02cdSIlya Dryomov rbd_dev->header_oloc.pool = spec->pool_id; 4515b26c047bSIlya Dryomov if (spec->pool_ns) { 4516b26c047bSIlya Dryomov WARN_ON(!*spec->pool_ns); 4517b26c047bSIlya Dryomov rbd_dev->header_oloc.pool_ns = 4518b26c047bSIlya Dryomov ceph_find_or_create_string(spec->pool_ns, 4519b26c047bSIlya Dryomov strlen(spec->pool_ns)); 4520b26c047bSIlya Dryomov } 4521c41d13a3SIlya Dryomov 452299d16943SIlya Dryomov mutex_init(&rbd_dev->watch_mutex); 452399d16943SIlya Dryomov rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED; 452499d16943SIlya Dryomov INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch); 452599d16943SIlya Dryomov 4526ed95b21aSIlya Dryomov init_rwsem(&rbd_dev->lock_rwsem); 4527ed95b21aSIlya Dryomov rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED; 4528ed95b21aSIlya Dryomov INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock); 4529ed95b21aSIlya Dryomov INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock); 4530ed95b21aSIlya Dryomov INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock); 4531ed95b21aSIlya Dryomov INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work); 4532ed95b21aSIlya Dryomov init_waitqueue_head(&rbd_dev->lock_waitq); 4533ed95b21aSIlya Dryomov 4534dd5ac32dSIlya Dryomov rbd_dev->dev.bus = &rbd_bus_type; 4535dd5ac32dSIlya Dryomov rbd_dev->dev.type = &rbd_device_type; 4536dd5ac32dSIlya Dryomov rbd_dev->dev.parent = &rbd_root_dev; 4537dd5ac32dSIlya Dryomov device_initialize(&rbd_dev->dev); 4538dd5ac32dSIlya Dryomov 4539c53d5893SAlex Elder rbd_dev->rbd_client = rbdc; 4540d147543dSIlya Dryomov rbd_dev->spec = spec; 45410903e875SAlex Elder 45421643dfa4SIlya Dryomov return rbd_dev; 45431643dfa4SIlya Dryomov } 45441643dfa4SIlya Dryomov 4545dd5ac32dSIlya Dryomov /* 45461643dfa4SIlya Dryomov * Create a mapping rbd_dev. 4547dd5ac32dSIlya Dryomov */ 45481643dfa4SIlya Dryomov static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, 45491643dfa4SIlya Dryomov struct rbd_spec *spec, 45501643dfa4SIlya Dryomov struct rbd_options *opts) 45511643dfa4SIlya Dryomov { 45521643dfa4SIlya Dryomov struct rbd_device *rbd_dev; 45531643dfa4SIlya Dryomov 45541643dfa4SIlya Dryomov rbd_dev = __rbd_dev_create(rbdc, spec); 45551643dfa4SIlya Dryomov if (!rbd_dev) 45561643dfa4SIlya Dryomov return NULL; 45571643dfa4SIlya Dryomov 45581643dfa4SIlya Dryomov rbd_dev->opts = opts; 45591643dfa4SIlya Dryomov 45601643dfa4SIlya Dryomov /* get an id and fill in device name */ 45611643dfa4SIlya Dryomov rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0, 45621643dfa4SIlya Dryomov minor_to_rbd_dev_id(1 << MINORBITS), 45631643dfa4SIlya Dryomov GFP_KERNEL); 45641643dfa4SIlya Dryomov if (rbd_dev->dev_id < 0) 45651643dfa4SIlya Dryomov goto fail_rbd_dev; 45661643dfa4SIlya Dryomov 45671643dfa4SIlya Dryomov sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id); 45681643dfa4SIlya Dryomov rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM, 45691643dfa4SIlya Dryomov rbd_dev->name); 45701643dfa4SIlya Dryomov if (!rbd_dev->task_wq) 45711643dfa4SIlya Dryomov goto fail_dev_id; 45721643dfa4SIlya Dryomov 45731643dfa4SIlya Dryomov /* we have a ref from do_rbd_add() */ 4574dd5ac32dSIlya Dryomov __module_get(THIS_MODULE); 4575dd5ac32dSIlya Dryomov 45761643dfa4SIlya Dryomov dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id); 4577c53d5893SAlex Elder return rbd_dev; 45781643dfa4SIlya Dryomov 45791643dfa4SIlya Dryomov fail_dev_id: 45801643dfa4SIlya Dryomov ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id); 45811643dfa4SIlya Dryomov fail_rbd_dev: 45821643dfa4SIlya Dryomov rbd_dev_free(rbd_dev); 45831643dfa4SIlya Dryomov return NULL; 4584c53d5893SAlex Elder } 4585c53d5893SAlex Elder 4586c53d5893SAlex Elder static void rbd_dev_destroy(struct rbd_device *rbd_dev) 4587c53d5893SAlex Elder { 4588dd5ac32dSIlya Dryomov if (rbd_dev) 4589dd5ac32dSIlya Dryomov put_device(&rbd_dev->dev); 4590c53d5893SAlex Elder } 4591c53d5893SAlex Elder 4592dfc5606dSYehuda Sadeh /* 45939d475de5SAlex Elder * Get the size and object order for an image snapshot, or if 45949d475de5SAlex Elder * snap_id is CEPH_NOSNAP, gets this information for the base 45959d475de5SAlex Elder * image. 45969d475de5SAlex Elder */ 45979d475de5SAlex Elder static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, 45989d475de5SAlex Elder u8 *order, u64 *snap_size) 45999d475de5SAlex Elder { 46009d475de5SAlex Elder __le64 snapid = cpu_to_le64(snap_id); 46019d475de5SAlex Elder int ret; 46029d475de5SAlex Elder struct { 46039d475de5SAlex Elder u8 order; 46049d475de5SAlex Elder __le64 size; 46059d475de5SAlex Elder } __attribute__ ((packed)) size_buf = { 0 }; 46069d475de5SAlex Elder 4607ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 4608ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_size", 46094157976bSAlex Elder &snapid, sizeof(snapid), 4610e2a58ee5SAlex Elder &size_buf, sizeof(size_buf)); 461136be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 46129d475de5SAlex Elder if (ret < 0) 46139d475de5SAlex Elder return ret; 461457385b51SAlex Elder if (ret < sizeof (size_buf)) 461557385b51SAlex Elder return -ERANGE; 46169d475de5SAlex Elder 4617c3545579SJosh Durgin if (order) { 46189d475de5SAlex Elder *order = size_buf.order; 4619c3545579SJosh Durgin dout(" order %u", (unsigned int)*order); 4620c3545579SJosh Durgin } 46219d475de5SAlex Elder *snap_size = le64_to_cpu(size_buf.size); 46229d475de5SAlex Elder 4623c3545579SJosh Durgin dout(" snap_id 0x%016llx snap_size = %llu\n", 4624c3545579SJosh Durgin (unsigned long long)snap_id, 46259d475de5SAlex Elder (unsigned long long)*snap_size); 46269d475de5SAlex Elder 46279d475de5SAlex Elder return 0; 46289d475de5SAlex Elder } 46299d475de5SAlex Elder 46309d475de5SAlex Elder static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev) 46319d475de5SAlex Elder { 46329d475de5SAlex Elder return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP, 46339d475de5SAlex Elder &rbd_dev->header.obj_order, 46349d475de5SAlex Elder &rbd_dev->header.image_size); 46359d475de5SAlex Elder } 46369d475de5SAlex Elder 46371e130199SAlex Elder static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) 46381e130199SAlex Elder { 46391e130199SAlex Elder void *reply_buf; 46401e130199SAlex Elder int ret; 46411e130199SAlex Elder void *p; 46421e130199SAlex Elder 46431e130199SAlex Elder reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL); 46441e130199SAlex Elder if (!reply_buf) 46451e130199SAlex Elder return -ENOMEM; 46461e130199SAlex Elder 4647ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 4648ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_object_prefix", 4649ecd4a68aSIlya Dryomov NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX); 465036be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 46511e130199SAlex Elder if (ret < 0) 46521e130199SAlex Elder goto out; 46531e130199SAlex Elder 46541e130199SAlex Elder p = reply_buf; 46551e130199SAlex Elder rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, 465657385b51SAlex Elder p + ret, NULL, GFP_NOIO); 465757385b51SAlex Elder ret = 0; 46581e130199SAlex Elder 46591e130199SAlex Elder if (IS_ERR(rbd_dev->header.object_prefix)) { 46601e130199SAlex Elder ret = PTR_ERR(rbd_dev->header.object_prefix); 46611e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 46621e130199SAlex Elder } else { 46631e130199SAlex Elder dout(" object_prefix = %s\n", rbd_dev->header.object_prefix); 46641e130199SAlex Elder } 46651e130199SAlex Elder out: 46661e130199SAlex Elder kfree(reply_buf); 46671e130199SAlex Elder 46681e130199SAlex Elder return ret; 46691e130199SAlex Elder } 46701e130199SAlex Elder 4671b1b5402aSAlex Elder static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, 4672b1b5402aSAlex Elder u64 *snap_features) 4673b1b5402aSAlex Elder { 4674b1b5402aSAlex Elder __le64 snapid = cpu_to_le64(snap_id); 4675b1b5402aSAlex Elder struct { 4676b1b5402aSAlex Elder __le64 features; 4677b1b5402aSAlex Elder __le64 incompat; 46784157976bSAlex Elder } __attribute__ ((packed)) features_buf = { 0 }; 4679d3767f0fSIlya Dryomov u64 unsup; 4680b1b5402aSAlex Elder int ret; 4681b1b5402aSAlex Elder 4682ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 4683ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_features", 46844157976bSAlex Elder &snapid, sizeof(snapid), 4685e2a58ee5SAlex Elder &features_buf, sizeof(features_buf)); 468636be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 4687b1b5402aSAlex Elder if (ret < 0) 4688b1b5402aSAlex Elder return ret; 468957385b51SAlex Elder if (ret < sizeof (features_buf)) 469057385b51SAlex Elder return -ERANGE; 4691d889140cSAlex Elder 4692d3767f0fSIlya Dryomov unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED; 4693d3767f0fSIlya Dryomov if (unsup) { 4694d3767f0fSIlya Dryomov rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx", 4695d3767f0fSIlya Dryomov unsup); 4696b8f5c6edSAlex Elder return -ENXIO; 4697d3767f0fSIlya Dryomov } 4698d889140cSAlex Elder 4699b1b5402aSAlex Elder *snap_features = le64_to_cpu(features_buf.features); 4700b1b5402aSAlex Elder 4701b1b5402aSAlex Elder dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n", 4702b1b5402aSAlex Elder (unsigned long long)snap_id, 4703b1b5402aSAlex Elder (unsigned long long)*snap_features, 4704b1b5402aSAlex Elder (unsigned long long)le64_to_cpu(features_buf.incompat)); 4705b1b5402aSAlex Elder 4706b1b5402aSAlex Elder return 0; 4707b1b5402aSAlex Elder } 4708b1b5402aSAlex Elder 4709b1b5402aSAlex Elder static int rbd_dev_v2_features(struct rbd_device *rbd_dev) 4710b1b5402aSAlex Elder { 4711b1b5402aSAlex Elder return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP, 4712b1b5402aSAlex Elder &rbd_dev->header.features); 4713b1b5402aSAlex Elder } 4714b1b5402aSAlex Elder 4715eb3b2d6bSIlya Dryomov struct parent_image_info { 4716eb3b2d6bSIlya Dryomov u64 pool_id; 4717e92c0eafSIlya Dryomov const char *pool_ns; 4718eb3b2d6bSIlya Dryomov const char *image_id; 4719eb3b2d6bSIlya Dryomov u64 snap_id; 4720eb3b2d6bSIlya Dryomov 4721e92c0eafSIlya Dryomov bool has_overlap; 4722eb3b2d6bSIlya Dryomov u64 overlap; 4723eb3b2d6bSIlya Dryomov }; 4724eb3b2d6bSIlya Dryomov 4725eb3b2d6bSIlya Dryomov /* 4726eb3b2d6bSIlya Dryomov * The caller is responsible for @pii. 4727eb3b2d6bSIlya Dryomov */ 4728e92c0eafSIlya Dryomov static int decode_parent_image_spec(void **p, void *end, 4729e92c0eafSIlya Dryomov struct parent_image_info *pii) 4730e92c0eafSIlya Dryomov { 4731e92c0eafSIlya Dryomov u8 struct_v; 4732e92c0eafSIlya Dryomov u32 struct_len; 4733e92c0eafSIlya Dryomov int ret; 4734e92c0eafSIlya Dryomov 4735e92c0eafSIlya Dryomov ret = ceph_start_decoding(p, end, 1, "ParentImageSpec", 4736e92c0eafSIlya Dryomov &struct_v, &struct_len); 4737e92c0eafSIlya Dryomov if (ret) 4738e92c0eafSIlya Dryomov return ret; 4739e92c0eafSIlya Dryomov 4740e92c0eafSIlya Dryomov ceph_decode_64_safe(p, end, pii->pool_id, e_inval); 4741e92c0eafSIlya Dryomov pii->pool_ns = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL); 4742e92c0eafSIlya Dryomov if (IS_ERR(pii->pool_ns)) { 4743e92c0eafSIlya Dryomov ret = PTR_ERR(pii->pool_ns); 4744e92c0eafSIlya Dryomov pii->pool_ns = NULL; 4745e92c0eafSIlya Dryomov return ret; 4746e92c0eafSIlya Dryomov } 4747e92c0eafSIlya Dryomov pii->image_id = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL); 4748e92c0eafSIlya Dryomov if (IS_ERR(pii->image_id)) { 4749e92c0eafSIlya Dryomov ret = PTR_ERR(pii->image_id); 4750e92c0eafSIlya Dryomov pii->image_id = NULL; 4751e92c0eafSIlya Dryomov return ret; 4752e92c0eafSIlya Dryomov } 4753e92c0eafSIlya Dryomov ceph_decode_64_safe(p, end, pii->snap_id, e_inval); 4754e92c0eafSIlya Dryomov return 0; 4755e92c0eafSIlya Dryomov 4756e92c0eafSIlya Dryomov e_inval: 4757e92c0eafSIlya Dryomov return -EINVAL; 4758e92c0eafSIlya Dryomov } 4759e92c0eafSIlya Dryomov 4760e92c0eafSIlya Dryomov static int __get_parent_info(struct rbd_device *rbd_dev, 4761e92c0eafSIlya Dryomov struct page *req_page, 4762e92c0eafSIlya Dryomov struct page *reply_page, 4763e92c0eafSIlya Dryomov struct parent_image_info *pii) 4764e92c0eafSIlya Dryomov { 4765e92c0eafSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 4766e92c0eafSIlya Dryomov size_t reply_len = PAGE_SIZE; 4767e92c0eafSIlya Dryomov void *p, *end; 4768e92c0eafSIlya Dryomov int ret; 4769e92c0eafSIlya Dryomov 4770e92c0eafSIlya Dryomov ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, 4771e92c0eafSIlya Dryomov "rbd", "parent_get", CEPH_OSD_FLAG_READ, 4772e92c0eafSIlya Dryomov req_page, sizeof(u64), reply_page, &reply_len); 4773e92c0eafSIlya Dryomov if (ret) 4774e92c0eafSIlya Dryomov return ret == -EOPNOTSUPP ? 1 : ret; 4775e92c0eafSIlya Dryomov 4776e92c0eafSIlya Dryomov p = page_address(reply_page); 4777e92c0eafSIlya Dryomov end = p + reply_len; 4778e92c0eafSIlya Dryomov ret = decode_parent_image_spec(&p, end, pii); 4779e92c0eafSIlya Dryomov if (ret) 4780e92c0eafSIlya Dryomov return ret; 4781e92c0eafSIlya Dryomov 4782e92c0eafSIlya Dryomov ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, 4783e92c0eafSIlya Dryomov "rbd", "parent_overlap_get", CEPH_OSD_FLAG_READ, 4784e92c0eafSIlya Dryomov req_page, sizeof(u64), reply_page, &reply_len); 4785e92c0eafSIlya Dryomov if (ret) 4786e92c0eafSIlya Dryomov return ret; 4787e92c0eafSIlya Dryomov 4788e92c0eafSIlya Dryomov p = page_address(reply_page); 4789e92c0eafSIlya Dryomov end = p + reply_len; 4790e92c0eafSIlya Dryomov ceph_decode_8_safe(&p, end, pii->has_overlap, e_inval); 4791e92c0eafSIlya Dryomov if (pii->has_overlap) 4792e92c0eafSIlya Dryomov ceph_decode_64_safe(&p, end, pii->overlap, e_inval); 4793e92c0eafSIlya Dryomov 4794e92c0eafSIlya Dryomov return 0; 4795e92c0eafSIlya Dryomov 4796e92c0eafSIlya Dryomov e_inval: 4797e92c0eafSIlya Dryomov return -EINVAL; 4798e92c0eafSIlya Dryomov } 4799e92c0eafSIlya Dryomov 4800e92c0eafSIlya Dryomov /* 4801e92c0eafSIlya Dryomov * The caller is responsible for @pii. 4802e92c0eafSIlya Dryomov */ 4803eb3b2d6bSIlya Dryomov static int __get_parent_info_legacy(struct rbd_device *rbd_dev, 4804eb3b2d6bSIlya Dryomov struct page *req_page, 4805eb3b2d6bSIlya Dryomov struct page *reply_page, 4806eb3b2d6bSIlya Dryomov struct parent_image_info *pii) 4807eb3b2d6bSIlya Dryomov { 4808eb3b2d6bSIlya Dryomov struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 4809eb3b2d6bSIlya Dryomov size_t reply_len = PAGE_SIZE; 4810eb3b2d6bSIlya Dryomov void *p, *end; 4811eb3b2d6bSIlya Dryomov int ret; 4812eb3b2d6bSIlya Dryomov 4813eb3b2d6bSIlya Dryomov ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc, 4814eb3b2d6bSIlya Dryomov "rbd", "get_parent", CEPH_OSD_FLAG_READ, 4815eb3b2d6bSIlya Dryomov req_page, sizeof(u64), reply_page, &reply_len); 4816eb3b2d6bSIlya Dryomov if (ret) 4817eb3b2d6bSIlya Dryomov return ret; 4818eb3b2d6bSIlya Dryomov 4819eb3b2d6bSIlya Dryomov p = page_address(reply_page); 4820eb3b2d6bSIlya Dryomov end = p + reply_len; 4821eb3b2d6bSIlya Dryomov ceph_decode_64_safe(&p, end, pii->pool_id, e_inval); 4822eb3b2d6bSIlya Dryomov pii->image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 4823eb3b2d6bSIlya Dryomov if (IS_ERR(pii->image_id)) { 4824eb3b2d6bSIlya Dryomov ret = PTR_ERR(pii->image_id); 4825eb3b2d6bSIlya Dryomov pii->image_id = NULL; 4826eb3b2d6bSIlya Dryomov return ret; 4827eb3b2d6bSIlya Dryomov } 4828eb3b2d6bSIlya Dryomov ceph_decode_64_safe(&p, end, pii->snap_id, e_inval); 4829e92c0eafSIlya Dryomov pii->has_overlap = true; 4830eb3b2d6bSIlya Dryomov ceph_decode_64_safe(&p, end, pii->overlap, e_inval); 4831eb3b2d6bSIlya Dryomov 4832eb3b2d6bSIlya Dryomov return 0; 4833eb3b2d6bSIlya Dryomov 4834eb3b2d6bSIlya Dryomov e_inval: 4835eb3b2d6bSIlya Dryomov return -EINVAL; 4836eb3b2d6bSIlya Dryomov } 4837eb3b2d6bSIlya Dryomov 4838eb3b2d6bSIlya Dryomov static int get_parent_info(struct rbd_device *rbd_dev, 4839eb3b2d6bSIlya Dryomov struct parent_image_info *pii) 4840eb3b2d6bSIlya Dryomov { 4841eb3b2d6bSIlya Dryomov struct page *req_page, *reply_page; 4842eb3b2d6bSIlya Dryomov void *p; 4843eb3b2d6bSIlya Dryomov int ret; 4844eb3b2d6bSIlya Dryomov 4845eb3b2d6bSIlya Dryomov req_page = alloc_page(GFP_KERNEL); 4846eb3b2d6bSIlya Dryomov if (!req_page) 4847eb3b2d6bSIlya Dryomov return -ENOMEM; 4848eb3b2d6bSIlya Dryomov 4849eb3b2d6bSIlya Dryomov reply_page = alloc_page(GFP_KERNEL); 4850eb3b2d6bSIlya Dryomov if (!reply_page) { 4851eb3b2d6bSIlya Dryomov __free_page(req_page); 4852eb3b2d6bSIlya Dryomov return -ENOMEM; 4853eb3b2d6bSIlya Dryomov } 4854eb3b2d6bSIlya Dryomov 4855eb3b2d6bSIlya Dryomov p = page_address(req_page); 4856eb3b2d6bSIlya Dryomov ceph_encode_64(&p, rbd_dev->spec->snap_id); 4857e92c0eafSIlya Dryomov ret = __get_parent_info(rbd_dev, req_page, reply_page, pii); 4858e92c0eafSIlya Dryomov if (ret > 0) 4859e92c0eafSIlya Dryomov ret = __get_parent_info_legacy(rbd_dev, req_page, reply_page, 4860e92c0eafSIlya Dryomov pii); 4861eb3b2d6bSIlya Dryomov 4862eb3b2d6bSIlya Dryomov __free_page(req_page); 4863eb3b2d6bSIlya Dryomov __free_page(reply_page); 4864eb3b2d6bSIlya Dryomov return ret; 4865eb3b2d6bSIlya Dryomov } 4866eb3b2d6bSIlya Dryomov 486786b00e0dSAlex Elder static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) 486886b00e0dSAlex Elder { 486986b00e0dSAlex Elder struct rbd_spec *parent_spec; 4870eb3b2d6bSIlya Dryomov struct parent_image_info pii = { 0 }; 487186b00e0dSAlex Elder int ret; 487286b00e0dSAlex Elder 487386b00e0dSAlex Elder parent_spec = rbd_spec_alloc(); 487486b00e0dSAlex Elder if (!parent_spec) 487586b00e0dSAlex Elder return -ENOMEM; 487686b00e0dSAlex Elder 4877eb3b2d6bSIlya Dryomov ret = get_parent_info(rbd_dev, &pii); 4878eb3b2d6bSIlya Dryomov if (ret) 487986b00e0dSAlex Elder goto out_err; 488086b00e0dSAlex Elder 4881e92c0eafSIlya Dryomov dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n", 4882e92c0eafSIlya Dryomov __func__, pii.pool_id, pii.pool_ns, pii.image_id, pii.snap_id, 4883e92c0eafSIlya Dryomov pii.has_overlap, pii.overlap); 4884eb3b2d6bSIlya Dryomov 4885e92c0eafSIlya Dryomov if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap) { 4886392a9dadSAlex Elder /* 4887392a9dadSAlex Elder * Either the parent never existed, or we have 4888392a9dadSAlex Elder * record of it but the image got flattened so it no 4889392a9dadSAlex Elder * longer has a parent. When the parent of a 4890392a9dadSAlex Elder * layered image disappears we immediately set the 4891392a9dadSAlex Elder * overlap to 0. The effect of this is that all new 4892392a9dadSAlex Elder * requests will be treated as if the image had no 4893392a9dadSAlex Elder * parent. 4894e92c0eafSIlya Dryomov * 4895e92c0eafSIlya Dryomov * If !pii.has_overlap, the parent image spec is not 4896e92c0eafSIlya Dryomov * applicable. It's there to avoid duplication in each 4897e92c0eafSIlya Dryomov * snapshot record. 4898392a9dadSAlex Elder */ 4899392a9dadSAlex Elder if (rbd_dev->parent_overlap) { 4900392a9dadSAlex Elder rbd_dev->parent_overlap = 0; 4901392a9dadSAlex Elder rbd_dev_parent_put(rbd_dev); 4902392a9dadSAlex Elder pr_info("%s: clone image has been flattened\n", 4903392a9dadSAlex Elder rbd_dev->disk->disk_name); 4904392a9dadSAlex Elder } 4905392a9dadSAlex Elder 490686b00e0dSAlex Elder goto out; /* No parent? No problem. */ 4907392a9dadSAlex Elder } 490886b00e0dSAlex Elder 49090903e875SAlex Elder /* The ceph file layout needs to fit pool id in 32 bits */ 49100903e875SAlex Elder 49110903e875SAlex Elder ret = -EIO; 4912eb3b2d6bSIlya Dryomov if (pii.pool_id > (u64)U32_MAX) { 49139584d508SIlya Dryomov rbd_warn(NULL, "parent pool id too large (%llu > %u)", 4914eb3b2d6bSIlya Dryomov (unsigned long long)pii.pool_id, U32_MAX); 491557385b51SAlex Elder goto out_err; 4916c0cd10dbSAlex Elder } 49170903e875SAlex Elder 49183b5cf2a2SAlex Elder /* 49193b5cf2a2SAlex Elder * The parent won't change (except when the clone is 49203b5cf2a2SAlex Elder * flattened, already handled that). So we only need to 49213b5cf2a2SAlex Elder * record the parent spec we have not already done so. 49223b5cf2a2SAlex Elder */ 49233b5cf2a2SAlex Elder if (!rbd_dev->parent_spec) { 4924eb3b2d6bSIlya Dryomov parent_spec->pool_id = pii.pool_id; 4925e92c0eafSIlya Dryomov if (pii.pool_ns && *pii.pool_ns) { 4926e92c0eafSIlya Dryomov parent_spec->pool_ns = pii.pool_ns; 4927e92c0eafSIlya Dryomov pii.pool_ns = NULL; 4928e92c0eafSIlya Dryomov } 4929eb3b2d6bSIlya Dryomov parent_spec->image_id = pii.image_id; 4930eb3b2d6bSIlya Dryomov pii.image_id = NULL; 4931eb3b2d6bSIlya Dryomov parent_spec->snap_id = pii.snap_id; 4932b26c047bSIlya Dryomov 493386b00e0dSAlex Elder rbd_dev->parent_spec = parent_spec; 493486b00e0dSAlex Elder parent_spec = NULL; /* rbd_dev now owns this */ 49353b5cf2a2SAlex Elder } 49363b5cf2a2SAlex Elder 49373b5cf2a2SAlex Elder /* 4938cf32bd9cSIlya Dryomov * We always update the parent overlap. If it's zero we issue 4939cf32bd9cSIlya Dryomov * a warning, as we will proceed as if there was no parent. 49403b5cf2a2SAlex Elder */ 4941eb3b2d6bSIlya Dryomov if (!pii.overlap) { 49423b5cf2a2SAlex Elder if (parent_spec) { 4943cf32bd9cSIlya Dryomov /* refresh, careful to warn just once */ 4944cf32bd9cSIlya Dryomov if (rbd_dev->parent_overlap) 4945cf32bd9cSIlya Dryomov rbd_warn(rbd_dev, 4946cf32bd9cSIlya Dryomov "clone now standalone (overlap became 0)"); 494770cf49cfSAlex Elder } else { 4948cf32bd9cSIlya Dryomov /* initial probe */ 4949cf32bd9cSIlya Dryomov rbd_warn(rbd_dev, "clone is standalone (overlap 0)"); 49503b5cf2a2SAlex Elder } 495170cf49cfSAlex Elder } 4952eb3b2d6bSIlya Dryomov rbd_dev->parent_overlap = pii.overlap; 4953cf32bd9cSIlya Dryomov 495486b00e0dSAlex Elder out: 495586b00e0dSAlex Elder ret = 0; 495686b00e0dSAlex Elder out_err: 4957e92c0eafSIlya Dryomov kfree(pii.pool_ns); 4958eb3b2d6bSIlya Dryomov kfree(pii.image_id); 495986b00e0dSAlex Elder rbd_spec_put(parent_spec); 496086b00e0dSAlex Elder return ret; 496186b00e0dSAlex Elder } 496286b00e0dSAlex Elder 4963cc070d59SAlex Elder static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) 4964cc070d59SAlex Elder { 4965cc070d59SAlex Elder struct { 4966cc070d59SAlex Elder __le64 stripe_unit; 4967cc070d59SAlex Elder __le64 stripe_count; 4968cc070d59SAlex Elder } __attribute__ ((packed)) striping_info_buf = { 0 }; 4969cc070d59SAlex Elder size_t size = sizeof (striping_info_buf); 4970cc070d59SAlex Elder void *p; 4971cc070d59SAlex Elder int ret; 4972cc070d59SAlex Elder 4973ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 4974ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_stripe_unit_count", 4975ecd4a68aSIlya Dryomov NULL, 0, &striping_info_buf, size); 4976cc070d59SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 4977cc070d59SAlex Elder if (ret < 0) 4978cc070d59SAlex Elder return ret; 4979cc070d59SAlex Elder if (ret < size) 4980cc070d59SAlex Elder return -ERANGE; 4981cc070d59SAlex Elder 4982cc070d59SAlex Elder p = &striping_info_buf; 4983b1331852SIlya Dryomov rbd_dev->header.stripe_unit = ceph_decode_64(&p); 4984b1331852SIlya Dryomov rbd_dev->header.stripe_count = ceph_decode_64(&p); 4985cc070d59SAlex Elder return 0; 4986cc070d59SAlex Elder } 4987cc070d59SAlex Elder 49887e97332eSIlya Dryomov static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev) 49897e97332eSIlya Dryomov { 49907e97332eSIlya Dryomov __le64 data_pool_id; 49917e97332eSIlya Dryomov int ret; 49927e97332eSIlya Dryomov 49937e97332eSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 49947e97332eSIlya Dryomov &rbd_dev->header_oloc, "get_data_pool", 49957e97332eSIlya Dryomov NULL, 0, &data_pool_id, sizeof(data_pool_id)); 49967e97332eSIlya Dryomov if (ret < 0) 49977e97332eSIlya Dryomov return ret; 49987e97332eSIlya Dryomov if (ret < sizeof(data_pool_id)) 49997e97332eSIlya Dryomov return -EBADMSG; 50007e97332eSIlya Dryomov 50017e97332eSIlya Dryomov rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id); 50027e97332eSIlya Dryomov WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL); 50037e97332eSIlya Dryomov return 0; 50047e97332eSIlya Dryomov } 50057e97332eSIlya Dryomov 50069e15b77dSAlex Elder static char *rbd_dev_image_name(struct rbd_device *rbd_dev) 50079e15b77dSAlex Elder { 5008ecd4a68aSIlya Dryomov CEPH_DEFINE_OID_ONSTACK(oid); 50099e15b77dSAlex Elder size_t image_id_size; 50109e15b77dSAlex Elder char *image_id; 50119e15b77dSAlex Elder void *p; 50129e15b77dSAlex Elder void *end; 50139e15b77dSAlex Elder size_t size; 50149e15b77dSAlex Elder void *reply_buf = NULL; 50159e15b77dSAlex Elder size_t len = 0; 50169e15b77dSAlex Elder char *image_name = NULL; 50179e15b77dSAlex Elder int ret; 50189e15b77dSAlex Elder 50199e15b77dSAlex Elder rbd_assert(!rbd_dev->spec->image_name); 50209e15b77dSAlex Elder 502169e7a02fSAlex Elder len = strlen(rbd_dev->spec->image_id); 502269e7a02fSAlex Elder image_id_size = sizeof (__le32) + len; 50239e15b77dSAlex Elder image_id = kmalloc(image_id_size, GFP_KERNEL); 50249e15b77dSAlex Elder if (!image_id) 50259e15b77dSAlex Elder return NULL; 50269e15b77dSAlex Elder 50279e15b77dSAlex Elder p = image_id; 50284157976bSAlex Elder end = image_id + image_id_size; 502969e7a02fSAlex Elder ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len); 50309e15b77dSAlex Elder 50319e15b77dSAlex Elder size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX; 50329e15b77dSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 50339e15b77dSAlex Elder if (!reply_buf) 50349e15b77dSAlex Elder goto out; 50359e15b77dSAlex Elder 5036ecd4a68aSIlya Dryomov ceph_oid_printf(&oid, "%s", RBD_DIRECTORY); 5037ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc, 5038ecd4a68aSIlya Dryomov "dir_get_name", image_id, image_id_size, 5039e2a58ee5SAlex Elder reply_buf, size); 50409e15b77dSAlex Elder if (ret < 0) 50419e15b77dSAlex Elder goto out; 50429e15b77dSAlex Elder p = reply_buf; 5043f40eb349SAlex Elder end = reply_buf + ret; 5044f40eb349SAlex Elder 50459e15b77dSAlex Elder image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL); 50469e15b77dSAlex Elder if (IS_ERR(image_name)) 50479e15b77dSAlex Elder image_name = NULL; 50489e15b77dSAlex Elder else 50499e15b77dSAlex Elder dout("%s: name is %s len is %zd\n", __func__, image_name, len); 50509e15b77dSAlex Elder out: 50519e15b77dSAlex Elder kfree(reply_buf); 50529e15b77dSAlex Elder kfree(image_id); 50539e15b77dSAlex Elder 50549e15b77dSAlex Elder return image_name; 50559e15b77dSAlex Elder } 50569e15b77dSAlex Elder 50572ad3d716SAlex Elder static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 50582ad3d716SAlex Elder { 50592ad3d716SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 50602ad3d716SAlex Elder const char *snap_name; 50612ad3d716SAlex Elder u32 which = 0; 50622ad3d716SAlex Elder 50632ad3d716SAlex Elder /* Skip over names until we find the one we are looking for */ 50642ad3d716SAlex Elder 50652ad3d716SAlex Elder snap_name = rbd_dev->header.snap_names; 50662ad3d716SAlex Elder while (which < snapc->num_snaps) { 50672ad3d716SAlex Elder if (!strcmp(name, snap_name)) 50682ad3d716SAlex Elder return snapc->snaps[which]; 50692ad3d716SAlex Elder snap_name += strlen(snap_name) + 1; 50702ad3d716SAlex Elder which++; 50712ad3d716SAlex Elder } 50722ad3d716SAlex Elder return CEPH_NOSNAP; 50732ad3d716SAlex Elder } 50742ad3d716SAlex Elder 50752ad3d716SAlex Elder static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 50762ad3d716SAlex Elder { 50772ad3d716SAlex Elder struct ceph_snap_context *snapc = rbd_dev->header.snapc; 50782ad3d716SAlex Elder u32 which; 50792ad3d716SAlex Elder bool found = false; 50802ad3d716SAlex Elder u64 snap_id; 50812ad3d716SAlex Elder 50822ad3d716SAlex Elder for (which = 0; !found && which < snapc->num_snaps; which++) { 50832ad3d716SAlex Elder const char *snap_name; 50842ad3d716SAlex Elder 50852ad3d716SAlex Elder snap_id = snapc->snaps[which]; 50862ad3d716SAlex Elder snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id); 5087efadc98aSJosh Durgin if (IS_ERR(snap_name)) { 5088efadc98aSJosh Durgin /* ignore no-longer existing snapshots */ 5089efadc98aSJosh Durgin if (PTR_ERR(snap_name) == -ENOENT) 5090efadc98aSJosh Durgin continue; 5091efadc98aSJosh Durgin else 50922ad3d716SAlex Elder break; 5093efadc98aSJosh Durgin } 50942ad3d716SAlex Elder found = !strcmp(name, snap_name); 50952ad3d716SAlex Elder kfree(snap_name); 50962ad3d716SAlex Elder } 50972ad3d716SAlex Elder return found ? snap_id : CEPH_NOSNAP; 50982ad3d716SAlex Elder } 50992ad3d716SAlex Elder 51002ad3d716SAlex Elder /* 51012ad3d716SAlex Elder * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if 51022ad3d716SAlex Elder * no snapshot by that name is found, or if an error occurs. 51032ad3d716SAlex Elder */ 51042ad3d716SAlex Elder static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) 51052ad3d716SAlex Elder { 51062ad3d716SAlex Elder if (rbd_dev->image_format == 1) 51072ad3d716SAlex Elder return rbd_v1_snap_id_by_name(rbd_dev, name); 51082ad3d716SAlex Elder 51092ad3d716SAlex Elder return rbd_v2_snap_id_by_name(rbd_dev, name); 51102ad3d716SAlex Elder } 51112ad3d716SAlex Elder 51129e15b77dSAlex Elder /* 511304077599SIlya Dryomov * An image being mapped will have everything but the snap id. 51149e15b77dSAlex Elder */ 511504077599SIlya Dryomov static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev) 511604077599SIlya Dryomov { 511704077599SIlya Dryomov struct rbd_spec *spec = rbd_dev->spec; 511804077599SIlya Dryomov 511904077599SIlya Dryomov rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name); 512004077599SIlya Dryomov rbd_assert(spec->image_id && spec->image_name); 512104077599SIlya Dryomov rbd_assert(spec->snap_name); 512204077599SIlya Dryomov 512304077599SIlya Dryomov if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) { 512404077599SIlya Dryomov u64 snap_id; 512504077599SIlya Dryomov 512604077599SIlya Dryomov snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name); 512704077599SIlya Dryomov if (snap_id == CEPH_NOSNAP) 512804077599SIlya Dryomov return -ENOENT; 512904077599SIlya Dryomov 513004077599SIlya Dryomov spec->snap_id = snap_id; 513104077599SIlya Dryomov } else { 513204077599SIlya Dryomov spec->snap_id = CEPH_NOSNAP; 513304077599SIlya Dryomov } 513404077599SIlya Dryomov 513504077599SIlya Dryomov return 0; 513604077599SIlya Dryomov } 513704077599SIlya Dryomov 513804077599SIlya Dryomov /* 513904077599SIlya Dryomov * A parent image will have all ids but none of the names. 514004077599SIlya Dryomov * 514104077599SIlya Dryomov * All names in an rbd spec are dynamically allocated. It's OK if we 514204077599SIlya Dryomov * can't figure out the name for an image id. 514304077599SIlya Dryomov */ 514404077599SIlya Dryomov static int rbd_spec_fill_names(struct rbd_device *rbd_dev) 51459e15b77dSAlex Elder { 51462e9f7f1cSAlex Elder struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 51472e9f7f1cSAlex Elder struct rbd_spec *spec = rbd_dev->spec; 51482e9f7f1cSAlex Elder const char *pool_name; 51492e9f7f1cSAlex Elder const char *image_name; 51502e9f7f1cSAlex Elder const char *snap_name; 51519e15b77dSAlex Elder int ret; 51529e15b77dSAlex Elder 515304077599SIlya Dryomov rbd_assert(spec->pool_id != CEPH_NOPOOL); 515404077599SIlya Dryomov rbd_assert(spec->image_id); 515504077599SIlya Dryomov rbd_assert(spec->snap_id != CEPH_NOSNAP); 51569e15b77dSAlex Elder 51572e9f7f1cSAlex Elder /* Get the pool name; we have to make our own copy of this */ 51589e15b77dSAlex Elder 51592e9f7f1cSAlex Elder pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id); 51602e9f7f1cSAlex Elder if (!pool_name) { 51612e9f7f1cSAlex Elder rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id); 5162935dc89fSAlex Elder return -EIO; 5163935dc89fSAlex Elder } 51642e9f7f1cSAlex Elder pool_name = kstrdup(pool_name, GFP_KERNEL); 51652e9f7f1cSAlex Elder if (!pool_name) 51669e15b77dSAlex Elder return -ENOMEM; 51679e15b77dSAlex Elder 51689e15b77dSAlex Elder /* Fetch the image name; tolerate failure here */ 51699e15b77dSAlex Elder 51702e9f7f1cSAlex Elder image_name = rbd_dev_image_name(rbd_dev); 51712e9f7f1cSAlex Elder if (!image_name) 517206ecc6cbSAlex Elder rbd_warn(rbd_dev, "unable to get image name"); 51739e15b77dSAlex Elder 517404077599SIlya Dryomov /* Fetch the snapshot name */ 51759e15b77dSAlex Elder 51762e9f7f1cSAlex Elder snap_name = rbd_snap_name(rbd_dev, spec->snap_id); 5177da6a6b63SJosh Durgin if (IS_ERR(snap_name)) { 5178da6a6b63SJosh Durgin ret = PTR_ERR(snap_name); 51799e15b77dSAlex Elder goto out_err; 51802e9f7f1cSAlex Elder } 51812e9f7f1cSAlex Elder 51822e9f7f1cSAlex Elder spec->pool_name = pool_name; 51832e9f7f1cSAlex Elder spec->image_name = image_name; 51842e9f7f1cSAlex Elder spec->snap_name = snap_name; 51859e15b77dSAlex Elder 51869e15b77dSAlex Elder return 0; 518704077599SIlya Dryomov 51889e15b77dSAlex Elder out_err: 51892e9f7f1cSAlex Elder kfree(image_name); 51902e9f7f1cSAlex Elder kfree(pool_name); 51919e15b77dSAlex Elder return ret; 51929e15b77dSAlex Elder } 51939e15b77dSAlex Elder 5194cc4a38bdSAlex Elder static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev) 519535d489f9SAlex Elder { 519635d489f9SAlex Elder size_t size; 519735d489f9SAlex Elder int ret; 519835d489f9SAlex Elder void *reply_buf; 519935d489f9SAlex Elder void *p; 520035d489f9SAlex Elder void *end; 520135d489f9SAlex Elder u64 seq; 520235d489f9SAlex Elder u32 snap_count; 520335d489f9SAlex Elder struct ceph_snap_context *snapc; 520435d489f9SAlex Elder u32 i; 520535d489f9SAlex Elder 520635d489f9SAlex Elder /* 520735d489f9SAlex Elder * We'll need room for the seq value (maximum snapshot id), 520835d489f9SAlex Elder * snapshot count, and array of that many snapshot ids. 520935d489f9SAlex Elder * For now we have a fixed upper limit on the number we're 521035d489f9SAlex Elder * prepared to receive. 521135d489f9SAlex Elder */ 521235d489f9SAlex Elder size = sizeof (__le64) + sizeof (__le32) + 521335d489f9SAlex Elder RBD_MAX_SNAP_COUNT * sizeof (__le64); 521435d489f9SAlex Elder reply_buf = kzalloc(size, GFP_KERNEL); 521535d489f9SAlex Elder if (!reply_buf) 521635d489f9SAlex Elder return -ENOMEM; 521735d489f9SAlex Elder 5218ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 5219ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_snapcontext", 5220ecd4a68aSIlya Dryomov NULL, 0, reply_buf, size); 522136be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 522235d489f9SAlex Elder if (ret < 0) 522335d489f9SAlex Elder goto out; 522435d489f9SAlex Elder 522535d489f9SAlex Elder p = reply_buf; 522657385b51SAlex Elder end = reply_buf + ret; 522757385b51SAlex Elder ret = -ERANGE; 522835d489f9SAlex Elder ceph_decode_64_safe(&p, end, seq, out); 522935d489f9SAlex Elder ceph_decode_32_safe(&p, end, snap_count, out); 523035d489f9SAlex Elder 523135d489f9SAlex Elder /* 523235d489f9SAlex Elder * Make sure the reported number of snapshot ids wouldn't go 523335d489f9SAlex Elder * beyond the end of our buffer. But before checking that, 523435d489f9SAlex Elder * make sure the computed size of the snapshot context we 523535d489f9SAlex Elder * allocate is representable in a size_t. 523635d489f9SAlex Elder */ 523735d489f9SAlex Elder if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context)) 523835d489f9SAlex Elder / sizeof (u64)) { 523935d489f9SAlex Elder ret = -EINVAL; 524035d489f9SAlex Elder goto out; 524135d489f9SAlex Elder } 524235d489f9SAlex Elder if (!ceph_has_room(&p, end, snap_count * sizeof (__le64))) 524335d489f9SAlex Elder goto out; 5244468521c1SAlex Elder ret = 0; 524535d489f9SAlex Elder 5246812164f8SAlex Elder snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); 524735d489f9SAlex Elder if (!snapc) { 524835d489f9SAlex Elder ret = -ENOMEM; 524935d489f9SAlex Elder goto out; 525035d489f9SAlex Elder } 525135d489f9SAlex Elder snapc->seq = seq; 525235d489f9SAlex Elder for (i = 0; i < snap_count; i++) 525335d489f9SAlex Elder snapc->snaps[i] = ceph_decode_64(&p); 525435d489f9SAlex Elder 525549ece554SAlex Elder ceph_put_snap_context(rbd_dev->header.snapc); 525635d489f9SAlex Elder rbd_dev->header.snapc = snapc; 525735d489f9SAlex Elder 525835d489f9SAlex Elder dout(" snap context seq = %llu, snap_count = %u\n", 525935d489f9SAlex Elder (unsigned long long)seq, (unsigned int)snap_count); 526035d489f9SAlex Elder out: 526135d489f9SAlex Elder kfree(reply_buf); 526235d489f9SAlex Elder 526357385b51SAlex Elder return ret; 526435d489f9SAlex Elder } 526535d489f9SAlex Elder 526654cac61fSAlex Elder static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, 526754cac61fSAlex Elder u64 snap_id) 5268b8b1e2dbSAlex Elder { 5269b8b1e2dbSAlex Elder size_t size; 5270b8b1e2dbSAlex Elder void *reply_buf; 527154cac61fSAlex Elder __le64 snapid; 5272b8b1e2dbSAlex Elder int ret; 5273b8b1e2dbSAlex Elder void *p; 5274b8b1e2dbSAlex Elder void *end; 5275b8b1e2dbSAlex Elder char *snap_name; 5276b8b1e2dbSAlex Elder 5277b8b1e2dbSAlex Elder size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN; 5278b8b1e2dbSAlex Elder reply_buf = kmalloc(size, GFP_KERNEL); 5279b8b1e2dbSAlex Elder if (!reply_buf) 5280b8b1e2dbSAlex Elder return ERR_PTR(-ENOMEM); 5281b8b1e2dbSAlex Elder 528254cac61fSAlex Elder snapid = cpu_to_le64(snap_id); 5283ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 5284ecd4a68aSIlya Dryomov &rbd_dev->header_oloc, "get_snapshot_name", 5285ecd4a68aSIlya Dryomov &snapid, sizeof(snapid), reply_buf, size); 528636be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 5287f40eb349SAlex Elder if (ret < 0) { 5288f40eb349SAlex Elder snap_name = ERR_PTR(ret); 5289b8b1e2dbSAlex Elder goto out; 5290f40eb349SAlex Elder } 5291b8b1e2dbSAlex Elder 5292b8b1e2dbSAlex Elder p = reply_buf; 5293f40eb349SAlex Elder end = reply_buf + ret; 5294e5c35534SAlex Elder snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); 5295f40eb349SAlex Elder if (IS_ERR(snap_name)) 5296b8b1e2dbSAlex Elder goto out; 5297f40eb349SAlex Elder 5298b8b1e2dbSAlex Elder dout(" snap_id 0x%016llx snap_name = %s\n", 529954cac61fSAlex Elder (unsigned long long)snap_id, snap_name); 5300b8b1e2dbSAlex Elder out: 5301b8b1e2dbSAlex Elder kfree(reply_buf); 5302b8b1e2dbSAlex Elder 5303f40eb349SAlex Elder return snap_name; 5304b8b1e2dbSAlex Elder } 5305b8b1e2dbSAlex Elder 53062df3fac7SAlex Elder static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev) 5307117973fbSAlex Elder { 53082df3fac7SAlex Elder bool first_time = rbd_dev->header.object_prefix == NULL; 5309117973fbSAlex Elder int ret; 5310117973fbSAlex Elder 53111617e40cSJosh Durgin ret = rbd_dev_v2_image_size(rbd_dev); 53121617e40cSJosh Durgin if (ret) 5313cfbf6377SAlex Elder return ret; 53141617e40cSJosh Durgin 53152df3fac7SAlex Elder if (first_time) { 53162df3fac7SAlex Elder ret = rbd_dev_v2_header_onetime(rbd_dev); 53172df3fac7SAlex Elder if (ret) 5318cfbf6377SAlex Elder return ret; 53192df3fac7SAlex Elder } 53202df3fac7SAlex Elder 5321cc4a38bdSAlex Elder ret = rbd_dev_v2_snap_context(rbd_dev); 5322d194cd1dSIlya Dryomov if (ret && first_time) { 5323d194cd1dSIlya Dryomov kfree(rbd_dev->header.object_prefix); 5324d194cd1dSIlya Dryomov rbd_dev->header.object_prefix = NULL; 5325d194cd1dSIlya Dryomov } 5326117973fbSAlex Elder 5327117973fbSAlex Elder return ret; 5328117973fbSAlex Elder } 5329117973fbSAlex Elder 5330a720ae09SIlya Dryomov static int rbd_dev_header_info(struct rbd_device *rbd_dev) 5331a720ae09SIlya Dryomov { 5332a720ae09SIlya Dryomov rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 5333a720ae09SIlya Dryomov 5334a720ae09SIlya Dryomov if (rbd_dev->image_format == 1) 5335a720ae09SIlya Dryomov return rbd_dev_v1_header_info(rbd_dev); 5336a720ae09SIlya Dryomov 5337a720ae09SIlya Dryomov return rbd_dev_v2_header_info(rbd_dev); 5338a720ae09SIlya Dryomov } 5339a720ae09SIlya Dryomov 53401ddbe94eSAlex Elder /* 5341e28fff26SAlex Elder * Skips over white space at *buf, and updates *buf to point to the 5342e28fff26SAlex Elder * first found non-space character (if any). Returns the length of 5343593a9e7bSAlex Elder * the token (string of non-white space characters) found. Note 5344593a9e7bSAlex Elder * that *buf must be terminated with '\0'. 5345e28fff26SAlex Elder */ 5346e28fff26SAlex Elder static inline size_t next_token(const char **buf) 5347e28fff26SAlex Elder { 5348e28fff26SAlex Elder /* 5349e28fff26SAlex Elder * These are the characters that produce nonzero for 5350e28fff26SAlex Elder * isspace() in the "C" and "POSIX" locales. 5351e28fff26SAlex Elder */ 5352e28fff26SAlex Elder const char *spaces = " \f\n\r\t\v"; 5353e28fff26SAlex Elder 5354e28fff26SAlex Elder *buf += strspn(*buf, spaces); /* Find start of token */ 5355e28fff26SAlex Elder 5356e28fff26SAlex Elder return strcspn(*buf, spaces); /* Return token length */ 5357e28fff26SAlex Elder } 5358e28fff26SAlex Elder 5359e28fff26SAlex Elder /* 5360ea3352f4SAlex Elder * Finds the next token in *buf, dynamically allocates a buffer big 5361ea3352f4SAlex Elder * enough to hold a copy of it, and copies the token into the new 5362ea3352f4SAlex Elder * buffer. The copy is guaranteed to be terminated with '\0'. Note 5363ea3352f4SAlex Elder * that a duplicate buffer is created even for a zero-length token. 5364ea3352f4SAlex Elder * 5365ea3352f4SAlex Elder * Returns a pointer to the newly-allocated duplicate, or a null 5366ea3352f4SAlex Elder * pointer if memory for the duplicate was not available. If 5367ea3352f4SAlex Elder * the lenp argument is a non-null pointer, the length of the token 5368ea3352f4SAlex Elder * (not including the '\0') is returned in *lenp. 5369ea3352f4SAlex Elder * 5370ea3352f4SAlex Elder * If successful, the *buf pointer will be updated to point beyond 5371ea3352f4SAlex Elder * the end of the found token. 5372ea3352f4SAlex Elder * 5373ea3352f4SAlex Elder * Note: uses GFP_KERNEL for allocation. 5374ea3352f4SAlex Elder */ 5375ea3352f4SAlex Elder static inline char *dup_token(const char **buf, size_t *lenp) 5376ea3352f4SAlex Elder { 5377ea3352f4SAlex Elder char *dup; 5378ea3352f4SAlex Elder size_t len; 5379ea3352f4SAlex Elder 5380ea3352f4SAlex Elder len = next_token(buf); 53814caf35f9SAlex Elder dup = kmemdup(*buf, len + 1, GFP_KERNEL); 5382ea3352f4SAlex Elder if (!dup) 5383ea3352f4SAlex Elder return NULL; 5384ea3352f4SAlex Elder *(dup + len) = '\0'; 5385ea3352f4SAlex Elder *buf += len; 5386ea3352f4SAlex Elder 5387ea3352f4SAlex Elder if (lenp) 5388ea3352f4SAlex Elder *lenp = len; 5389ea3352f4SAlex Elder 5390ea3352f4SAlex Elder return dup; 5391ea3352f4SAlex Elder } 5392ea3352f4SAlex Elder 5393ea3352f4SAlex Elder /* 5394859c31dfSAlex Elder * Parse the options provided for an "rbd add" (i.e., rbd image 5395859c31dfSAlex Elder * mapping) request. These arrive via a write to /sys/bus/rbd/add, 5396859c31dfSAlex Elder * and the data written is passed here via a NUL-terminated buffer. 5397859c31dfSAlex Elder * Returns 0 if successful or an error code otherwise. 5398d22f76e7SAlex Elder * 5399859c31dfSAlex Elder * The information extracted from these options is recorded in 5400859c31dfSAlex Elder * the other parameters which return dynamically-allocated 5401859c31dfSAlex Elder * structures: 5402859c31dfSAlex Elder * ceph_opts 5403859c31dfSAlex Elder * The address of a pointer that will refer to a ceph options 5404859c31dfSAlex Elder * structure. Caller must release the returned pointer using 5405859c31dfSAlex Elder * ceph_destroy_options() when it is no longer needed. 5406859c31dfSAlex Elder * rbd_opts 5407859c31dfSAlex Elder * Address of an rbd options pointer. Fully initialized by 5408859c31dfSAlex Elder * this function; caller must release with kfree(). 5409859c31dfSAlex Elder * spec 5410859c31dfSAlex Elder * Address of an rbd image specification pointer. Fully 5411859c31dfSAlex Elder * initialized by this function based on parsed options. 5412859c31dfSAlex Elder * Caller must release with rbd_spec_put(). 5413859c31dfSAlex Elder * 5414859c31dfSAlex Elder * The options passed take this form: 5415859c31dfSAlex Elder * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>] 5416859c31dfSAlex Elder * where: 5417859c31dfSAlex Elder * <mon_addrs> 5418859c31dfSAlex Elder * A comma-separated list of one or more monitor addresses. 5419859c31dfSAlex Elder * A monitor address is an ip address, optionally followed 5420859c31dfSAlex Elder * by a port number (separated by a colon). 5421859c31dfSAlex Elder * I.e.: ip1[:port1][,ip2[:port2]...] 5422859c31dfSAlex Elder * <options> 5423859c31dfSAlex Elder * A comma-separated list of ceph and/or rbd options. 5424859c31dfSAlex Elder * <pool_name> 5425859c31dfSAlex Elder * The name of the rados pool containing the rbd image. 5426859c31dfSAlex Elder * <image_name> 5427859c31dfSAlex Elder * The name of the image in that pool to map. 5428859c31dfSAlex Elder * <snap_id> 5429859c31dfSAlex Elder * An optional snapshot id. If provided, the mapping will 5430859c31dfSAlex Elder * present data from the image at the time that snapshot was 5431859c31dfSAlex Elder * created. The image head is used if no snapshot id is 5432859c31dfSAlex Elder * provided. Snapshot mappings are always read-only. 5433a725f65eSAlex Elder */ 5434859c31dfSAlex Elder static int rbd_add_parse_args(const char *buf, 5435dc79b113SAlex Elder struct ceph_options **ceph_opts, 5436859c31dfSAlex Elder struct rbd_options **opts, 5437859c31dfSAlex Elder struct rbd_spec **rbd_spec) 5438a725f65eSAlex Elder { 5439e28fff26SAlex Elder size_t len; 5440859c31dfSAlex Elder char *options; 54410ddebc0cSAlex Elder const char *mon_addrs; 5442ecb4dc22SAlex Elder char *snap_name; 54430ddebc0cSAlex Elder size_t mon_addrs_size; 5444c300156bSIlya Dryomov struct parse_rbd_opts_ctx pctx = { 0 }; 5445859c31dfSAlex Elder struct ceph_options *copts; 5446dc79b113SAlex Elder int ret; 5447e28fff26SAlex Elder 5448e28fff26SAlex Elder /* The first four tokens are required */ 5449e28fff26SAlex Elder 54507ef3214aSAlex Elder len = next_token(&buf); 54514fb5d671SAlex Elder if (!len) { 54524fb5d671SAlex Elder rbd_warn(NULL, "no monitor address(es) provided"); 54534fb5d671SAlex Elder return -EINVAL; 54544fb5d671SAlex Elder } 54550ddebc0cSAlex Elder mon_addrs = buf; 5456f28e565aSAlex Elder mon_addrs_size = len + 1; 54577ef3214aSAlex Elder buf += len; 5458a725f65eSAlex Elder 5459dc79b113SAlex Elder ret = -EINVAL; 5460f28e565aSAlex Elder options = dup_token(&buf, NULL); 5461f28e565aSAlex Elder if (!options) 5462dc79b113SAlex Elder return -ENOMEM; 54634fb5d671SAlex Elder if (!*options) { 54644fb5d671SAlex Elder rbd_warn(NULL, "no options provided"); 54654fb5d671SAlex Elder goto out_err; 54664fb5d671SAlex Elder } 5467a725f65eSAlex Elder 5468c300156bSIlya Dryomov pctx.spec = rbd_spec_alloc(); 5469c300156bSIlya Dryomov if (!pctx.spec) 5470f28e565aSAlex Elder goto out_mem; 5471859c31dfSAlex Elder 5472c300156bSIlya Dryomov pctx.spec->pool_name = dup_token(&buf, NULL); 5473c300156bSIlya Dryomov if (!pctx.spec->pool_name) 5474859c31dfSAlex Elder goto out_mem; 5475c300156bSIlya Dryomov if (!*pctx.spec->pool_name) { 54764fb5d671SAlex Elder rbd_warn(NULL, "no pool name provided"); 54774fb5d671SAlex Elder goto out_err; 54784fb5d671SAlex Elder } 5479e28fff26SAlex Elder 5480c300156bSIlya Dryomov pctx.spec->image_name = dup_token(&buf, NULL); 5481c300156bSIlya Dryomov if (!pctx.spec->image_name) 5482f28e565aSAlex Elder goto out_mem; 5483c300156bSIlya Dryomov if (!*pctx.spec->image_name) { 54844fb5d671SAlex Elder rbd_warn(NULL, "no image name provided"); 54854fb5d671SAlex Elder goto out_err; 54864fb5d671SAlex Elder } 5487e28fff26SAlex Elder 5488f28e565aSAlex Elder /* 5489f28e565aSAlex Elder * Snapshot name is optional; default is to use "-" 5490f28e565aSAlex Elder * (indicating the head/no snapshot). 5491f28e565aSAlex Elder */ 54923feeb894SAlex Elder len = next_token(&buf); 5493820a5f3eSAlex Elder if (!len) { 54943feeb894SAlex Elder buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */ 54953feeb894SAlex Elder len = sizeof (RBD_SNAP_HEAD_NAME) - 1; 5496f28e565aSAlex Elder } else if (len > RBD_MAX_SNAP_NAME_LEN) { 5497dc79b113SAlex Elder ret = -ENAMETOOLONG; 5498f28e565aSAlex Elder goto out_err; 5499849b4260SAlex Elder } 5500ecb4dc22SAlex Elder snap_name = kmemdup(buf, len + 1, GFP_KERNEL); 5501ecb4dc22SAlex Elder if (!snap_name) 5502f28e565aSAlex Elder goto out_mem; 5503ecb4dc22SAlex Elder *(snap_name + len) = '\0'; 5504c300156bSIlya Dryomov pctx.spec->snap_name = snap_name; 5505e5c35534SAlex Elder 55060ddebc0cSAlex Elder /* Initialize all rbd options to the defaults */ 5507e28fff26SAlex Elder 5508c300156bSIlya Dryomov pctx.opts = kzalloc(sizeof(*pctx.opts), GFP_KERNEL); 5509c300156bSIlya Dryomov if (!pctx.opts) 55104e9afebaSAlex Elder goto out_mem; 55114e9afebaSAlex Elder 5512c300156bSIlya Dryomov pctx.opts->read_only = RBD_READ_ONLY_DEFAULT; 5513c300156bSIlya Dryomov pctx.opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT; 55140c93e1b7SIlya Dryomov pctx.opts->alloc_size = RBD_ALLOC_SIZE_DEFAULT; 5515c300156bSIlya Dryomov pctx.opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT; 5516c300156bSIlya Dryomov pctx.opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT; 5517c300156bSIlya Dryomov pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT; 5518c300156bSIlya Dryomov pctx.opts->trim = RBD_TRIM_DEFAULT; 5519d22f76e7SAlex Elder 5520859c31dfSAlex Elder copts = ceph_parse_options(options, mon_addrs, 55210ddebc0cSAlex Elder mon_addrs + mon_addrs_size - 1, 5522c300156bSIlya Dryomov parse_rbd_opts_token, &pctx); 5523859c31dfSAlex Elder if (IS_ERR(copts)) { 5524859c31dfSAlex Elder ret = PTR_ERR(copts); 5525dc79b113SAlex Elder goto out_err; 5526dc79b113SAlex Elder } 5527859c31dfSAlex Elder kfree(options); 5528859c31dfSAlex Elder 5529859c31dfSAlex Elder *ceph_opts = copts; 5530c300156bSIlya Dryomov *opts = pctx.opts; 5531c300156bSIlya Dryomov *rbd_spec = pctx.spec; 55320ddebc0cSAlex Elder 5533dc79b113SAlex Elder return 0; 5534f28e565aSAlex Elder out_mem: 5535dc79b113SAlex Elder ret = -ENOMEM; 5536d22f76e7SAlex Elder out_err: 5537c300156bSIlya Dryomov kfree(pctx.opts); 5538c300156bSIlya Dryomov rbd_spec_put(pctx.spec); 5539f28e565aSAlex Elder kfree(options); 5540d22f76e7SAlex Elder 5541dc79b113SAlex Elder return ret; 5542a725f65eSAlex Elder } 5543a725f65eSAlex Elder 5544e010dd0aSIlya Dryomov static void rbd_dev_image_unlock(struct rbd_device *rbd_dev) 5545e010dd0aSIlya Dryomov { 5546e010dd0aSIlya Dryomov down_write(&rbd_dev->lock_rwsem); 5547e010dd0aSIlya Dryomov if (__rbd_is_lock_owner(rbd_dev)) 5548e010dd0aSIlya Dryomov rbd_unlock(rbd_dev); 5549e010dd0aSIlya Dryomov up_write(&rbd_dev->lock_rwsem); 5550e010dd0aSIlya Dryomov } 5551e010dd0aSIlya Dryomov 5552e010dd0aSIlya Dryomov static int rbd_add_acquire_lock(struct rbd_device *rbd_dev) 5553e010dd0aSIlya Dryomov { 55542f18d466SIlya Dryomov int ret; 55552f18d466SIlya Dryomov 5556e010dd0aSIlya Dryomov if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) { 5557e010dd0aSIlya Dryomov rbd_warn(rbd_dev, "exclusive-lock feature is not enabled"); 5558e010dd0aSIlya Dryomov return -EINVAL; 5559e010dd0aSIlya Dryomov } 5560e010dd0aSIlya Dryomov 5561e010dd0aSIlya Dryomov /* FIXME: "rbd map --exclusive" should be in interruptible */ 5562e010dd0aSIlya Dryomov down_read(&rbd_dev->lock_rwsem); 55632f18d466SIlya Dryomov ret = rbd_wait_state_locked(rbd_dev, true); 5564e010dd0aSIlya Dryomov up_read(&rbd_dev->lock_rwsem); 55652f18d466SIlya Dryomov if (ret) { 5566e010dd0aSIlya Dryomov rbd_warn(rbd_dev, "failed to acquire exclusive lock"); 5567e010dd0aSIlya Dryomov return -EROFS; 5568e010dd0aSIlya Dryomov } 5569e010dd0aSIlya Dryomov 5570e010dd0aSIlya Dryomov return 0; 5571e010dd0aSIlya Dryomov } 5572e010dd0aSIlya Dryomov 557330ba1f02SIlya Dryomov /* 5574589d30e0SAlex Elder * An rbd format 2 image has a unique identifier, distinct from the 5575589d30e0SAlex Elder * name given to it by the user. Internally, that identifier is 5576589d30e0SAlex Elder * what's used to specify the names of objects related to the image. 5577589d30e0SAlex Elder * 5578589d30e0SAlex Elder * A special "rbd id" object is used to map an rbd image name to its 5579589d30e0SAlex Elder * id. If that object doesn't exist, then there is no v2 rbd image 5580589d30e0SAlex Elder * with the supplied name. 5581589d30e0SAlex Elder * 5582589d30e0SAlex Elder * This function will record the given rbd_dev's image_id field if 5583589d30e0SAlex Elder * it can be determined, and in that case will return 0. If any 5584589d30e0SAlex Elder * errors occur a negative errno will be returned and the rbd_dev's 5585589d30e0SAlex Elder * image_id field will be unchanged (and should be NULL). 5586589d30e0SAlex Elder */ 5587589d30e0SAlex Elder static int rbd_dev_image_id(struct rbd_device *rbd_dev) 5588589d30e0SAlex Elder { 5589589d30e0SAlex Elder int ret; 5590589d30e0SAlex Elder size_t size; 5591ecd4a68aSIlya Dryomov CEPH_DEFINE_OID_ONSTACK(oid); 5592589d30e0SAlex Elder void *response; 5593c0fba368SAlex Elder char *image_id; 55942f82ee54SAlex Elder 5595589d30e0SAlex Elder /* 55962c0d0a10SAlex Elder * When probing a parent image, the image id is already 55972c0d0a10SAlex Elder * known (and the image name likely is not). There's no 5598c0fba368SAlex Elder * need to fetch the image id again in this case. We 5599c0fba368SAlex Elder * do still need to set the image format though. 56002c0d0a10SAlex Elder */ 5601c0fba368SAlex Elder if (rbd_dev->spec->image_id) { 5602c0fba368SAlex Elder rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1; 5603c0fba368SAlex Elder 56042c0d0a10SAlex Elder return 0; 5605c0fba368SAlex Elder } 56062c0d0a10SAlex Elder 56072c0d0a10SAlex Elder /* 5608589d30e0SAlex Elder * First, see if the format 2 image id file exists, and if 5609589d30e0SAlex Elder * so, get the image's persistent id from it. 5610589d30e0SAlex Elder */ 5611ecd4a68aSIlya Dryomov ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX, 5612ecd4a68aSIlya Dryomov rbd_dev->spec->image_name); 5613ecd4a68aSIlya Dryomov if (ret) 5614ecd4a68aSIlya Dryomov return ret; 5615ecd4a68aSIlya Dryomov 5616ecd4a68aSIlya Dryomov dout("rbd id object name is %s\n", oid.name); 5617589d30e0SAlex Elder 5618589d30e0SAlex Elder /* Response will be an encoded string, which includes a length */ 5619589d30e0SAlex Elder 5620589d30e0SAlex Elder size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; 5621589d30e0SAlex Elder response = kzalloc(size, GFP_NOIO); 5622589d30e0SAlex Elder if (!response) { 5623589d30e0SAlex Elder ret = -ENOMEM; 5624589d30e0SAlex Elder goto out; 5625589d30e0SAlex Elder } 5626589d30e0SAlex Elder 5627c0fba368SAlex Elder /* If it doesn't exist we'll assume it's a format 1 image */ 5628c0fba368SAlex Elder 5629ecd4a68aSIlya Dryomov ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc, 5630ecd4a68aSIlya Dryomov "get_id", NULL, 0, 5631e2a58ee5SAlex Elder response, RBD_IMAGE_ID_LEN_MAX); 563236be9a76SAlex Elder dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 5633c0fba368SAlex Elder if (ret == -ENOENT) { 5634c0fba368SAlex Elder image_id = kstrdup("", GFP_KERNEL); 5635c0fba368SAlex Elder ret = image_id ? 0 : -ENOMEM; 5636c0fba368SAlex Elder if (!ret) 5637c0fba368SAlex Elder rbd_dev->image_format = 1; 56387dd440c9SIlya Dryomov } else if (ret >= 0) { 5639c0fba368SAlex Elder void *p = response; 5640589d30e0SAlex Elder 5641c0fba368SAlex Elder image_id = ceph_extract_encoded_string(&p, p + ret, 5642979ed480SAlex Elder NULL, GFP_NOIO); 5643461f758aSDuan Jiong ret = PTR_ERR_OR_ZERO(image_id); 5644c0fba368SAlex Elder if (!ret) 5645c0fba368SAlex Elder rbd_dev->image_format = 2; 5646c0fba368SAlex Elder } 5647c0fba368SAlex Elder 5648c0fba368SAlex Elder if (!ret) { 5649c0fba368SAlex Elder rbd_dev->spec->image_id = image_id; 5650c0fba368SAlex Elder dout("image_id is %s\n", image_id); 5651589d30e0SAlex Elder } 5652589d30e0SAlex Elder out: 5653589d30e0SAlex Elder kfree(response); 5654ecd4a68aSIlya Dryomov ceph_oid_destroy(&oid); 5655589d30e0SAlex Elder return ret; 5656589d30e0SAlex Elder } 5657589d30e0SAlex Elder 56583abef3b3SAlex Elder /* 56593abef3b3SAlex Elder * Undo whatever state changes are made by v1 or v2 header info 56603abef3b3SAlex Elder * call. 56613abef3b3SAlex Elder */ 56626fd48b3bSAlex Elder static void rbd_dev_unprobe(struct rbd_device *rbd_dev) 56636fd48b3bSAlex Elder { 56646fd48b3bSAlex Elder struct rbd_image_header *header; 56656fd48b3bSAlex Elder 5666a2acd00eSAlex Elder rbd_dev_parent_put(rbd_dev); 56676fd48b3bSAlex Elder 56686fd48b3bSAlex Elder /* Free dynamic fields from the header, then zero it out */ 56696fd48b3bSAlex Elder 56706fd48b3bSAlex Elder header = &rbd_dev->header; 5671812164f8SAlex Elder ceph_put_snap_context(header->snapc); 56726fd48b3bSAlex Elder kfree(header->snap_sizes); 56736fd48b3bSAlex Elder kfree(header->snap_names); 56746fd48b3bSAlex Elder kfree(header->object_prefix); 56756fd48b3bSAlex Elder memset(header, 0, sizeof (*header)); 56766fd48b3bSAlex Elder } 56776fd48b3bSAlex Elder 56782df3fac7SAlex Elder static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev) 5679a30b71b9SAlex Elder { 5680a30b71b9SAlex Elder int ret; 5681a30b71b9SAlex Elder 56821e130199SAlex Elder ret = rbd_dev_v2_object_prefix(rbd_dev); 568357385b51SAlex Elder if (ret) 56841e130199SAlex Elder goto out_err; 5685b1b5402aSAlex Elder 56862df3fac7SAlex Elder /* 56872df3fac7SAlex Elder * Get the and check features for the image. Currently the 56882df3fac7SAlex Elder * features are assumed to never change. 56892df3fac7SAlex Elder */ 5690b1b5402aSAlex Elder ret = rbd_dev_v2_features(rbd_dev); 569157385b51SAlex Elder if (ret) 5692b1b5402aSAlex Elder goto out_err; 569335d489f9SAlex Elder 5694cc070d59SAlex Elder /* If the image supports fancy striping, get its parameters */ 5695cc070d59SAlex Elder 5696cc070d59SAlex Elder if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) { 5697cc070d59SAlex Elder ret = rbd_dev_v2_striping_info(rbd_dev); 5698cc070d59SAlex Elder if (ret < 0) 5699cc070d59SAlex Elder goto out_err; 5700cc070d59SAlex Elder } 5701a30b71b9SAlex Elder 57027e97332eSIlya Dryomov if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) { 57037e97332eSIlya Dryomov ret = rbd_dev_v2_data_pool(rbd_dev); 57047e97332eSIlya Dryomov if (ret) 57057e97332eSIlya Dryomov goto out_err; 57067e97332eSIlya Dryomov } 57077e97332eSIlya Dryomov 5708263423f8SIlya Dryomov rbd_init_layout(rbd_dev); 570935152979SAlex Elder return 0; 5710263423f8SIlya Dryomov 57119d475de5SAlex Elder out_err: 5712642a2537SAlex Elder rbd_dev->header.features = 0; 57131e130199SAlex Elder kfree(rbd_dev->header.object_prefix); 57141e130199SAlex Elder rbd_dev->header.object_prefix = NULL; 57159d475de5SAlex Elder return ret; 5716a30b71b9SAlex Elder } 5717a30b71b9SAlex Elder 57186d69bb53SIlya Dryomov /* 57196d69bb53SIlya Dryomov * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() -> 57206d69bb53SIlya Dryomov * rbd_dev_image_probe() recursion depth, which means it's also the 57216d69bb53SIlya Dryomov * length of the already discovered part of the parent chain. 57226d69bb53SIlya Dryomov */ 57236d69bb53SIlya Dryomov static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth) 572483a06263SAlex Elder { 57252f82ee54SAlex Elder struct rbd_device *parent = NULL; 5726124afba2SAlex Elder int ret; 5727124afba2SAlex Elder 5728124afba2SAlex Elder if (!rbd_dev->parent_spec) 5729124afba2SAlex Elder return 0; 5730124afba2SAlex Elder 57316d69bb53SIlya Dryomov if (++depth > RBD_MAX_PARENT_CHAIN_LEN) { 57326d69bb53SIlya Dryomov pr_info("parent chain is too long (%d)\n", depth); 57336d69bb53SIlya Dryomov ret = -EINVAL; 57346d69bb53SIlya Dryomov goto out_err; 57356d69bb53SIlya Dryomov } 57366d69bb53SIlya Dryomov 57371643dfa4SIlya Dryomov parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec); 57381f2c6651SIlya Dryomov if (!parent) { 5739124afba2SAlex Elder ret = -ENOMEM; 5740124afba2SAlex Elder goto out_err; 57411f2c6651SIlya Dryomov } 57421f2c6651SIlya Dryomov 57431f2c6651SIlya Dryomov /* 57441f2c6651SIlya Dryomov * Images related by parent/child relationships always share 57451f2c6651SIlya Dryomov * rbd_client and spec/parent_spec, so bump their refcounts. 57461f2c6651SIlya Dryomov */ 57471f2c6651SIlya Dryomov __rbd_get_client(rbd_dev->rbd_client); 57481f2c6651SIlya Dryomov rbd_spec_get(rbd_dev->parent_spec); 5749124afba2SAlex Elder 57506d69bb53SIlya Dryomov ret = rbd_dev_image_probe(parent, depth); 5751124afba2SAlex Elder if (ret < 0) 5752124afba2SAlex Elder goto out_err; 57531f2c6651SIlya Dryomov 5754124afba2SAlex Elder rbd_dev->parent = parent; 5755a2acd00eSAlex Elder atomic_set(&rbd_dev->parent_ref, 1); 5756124afba2SAlex Elder return 0; 5757124afba2SAlex Elder 57581f2c6651SIlya Dryomov out_err: 57591f2c6651SIlya Dryomov rbd_dev_unparent(rbd_dev); 57601f2c6651SIlya Dryomov rbd_dev_destroy(parent); 5761124afba2SAlex Elder return ret; 5762124afba2SAlex Elder } 5763124afba2SAlex Elder 57645769ed0cSIlya Dryomov static void rbd_dev_device_release(struct rbd_device *rbd_dev) 57655769ed0cSIlya Dryomov { 57665769ed0cSIlya Dryomov clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 57675769ed0cSIlya Dryomov rbd_dev_mapping_clear(rbd_dev); 57685769ed0cSIlya Dryomov rbd_free_disk(rbd_dev); 57695769ed0cSIlya Dryomov if (!single_major) 57705769ed0cSIlya Dryomov unregister_blkdev(rbd_dev->major, rbd_dev->name); 57715769ed0cSIlya Dryomov } 57725769ed0cSIlya Dryomov 5773811c6688SIlya Dryomov /* 5774811c6688SIlya Dryomov * rbd_dev->header_rwsem must be locked for write and will be unlocked 5775811c6688SIlya Dryomov * upon return. 5776811c6688SIlya Dryomov */ 5777200a6a8bSAlex Elder static int rbd_dev_device_setup(struct rbd_device *rbd_dev) 5778124afba2SAlex Elder { 577983a06263SAlex Elder int ret; 578083a06263SAlex Elder 57819b60e70bSIlya Dryomov /* Record our major and minor device numbers. */ 578283a06263SAlex Elder 57839b60e70bSIlya Dryomov if (!single_major) { 578483a06263SAlex Elder ret = register_blkdev(0, rbd_dev->name); 578583a06263SAlex Elder if (ret < 0) 57861643dfa4SIlya Dryomov goto err_out_unlock; 57879b60e70bSIlya Dryomov 578883a06263SAlex Elder rbd_dev->major = ret; 5789dd82fff1SIlya Dryomov rbd_dev->minor = 0; 57909b60e70bSIlya Dryomov } else { 57919b60e70bSIlya Dryomov rbd_dev->major = rbd_major; 57929b60e70bSIlya Dryomov rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id); 57939b60e70bSIlya Dryomov } 579483a06263SAlex Elder 579583a06263SAlex Elder /* Set up the blkdev mapping. */ 579683a06263SAlex Elder 579783a06263SAlex Elder ret = rbd_init_disk(rbd_dev); 579883a06263SAlex Elder if (ret) 579983a06263SAlex Elder goto err_out_blkdev; 580083a06263SAlex Elder 5801f35a4deeSAlex Elder ret = rbd_dev_mapping_set(rbd_dev); 580283a06263SAlex Elder if (ret) 580383a06263SAlex Elder goto err_out_disk; 5804bc1ecc65SIlya Dryomov 5805f35a4deeSAlex Elder set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); 58069568c93eSIlya Dryomov set_disk_ro(rbd_dev->disk, rbd_dev->opts->read_only); 5807f35a4deeSAlex Elder 58085769ed0cSIlya Dryomov ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id); 5809f35a4deeSAlex Elder if (ret) 5810f5ee37bdSIlya Dryomov goto err_out_mapping; 581183a06263SAlex Elder 5812129b79d4SAlex Elder set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); 5813811c6688SIlya Dryomov up_write(&rbd_dev->header_rwsem); 58145769ed0cSIlya Dryomov return 0; 58152f82ee54SAlex Elder 5816f35a4deeSAlex Elder err_out_mapping: 5817f35a4deeSAlex Elder rbd_dev_mapping_clear(rbd_dev); 581883a06263SAlex Elder err_out_disk: 581983a06263SAlex Elder rbd_free_disk(rbd_dev); 582083a06263SAlex Elder err_out_blkdev: 58219b60e70bSIlya Dryomov if (!single_major) 582283a06263SAlex Elder unregister_blkdev(rbd_dev->major, rbd_dev->name); 5823811c6688SIlya Dryomov err_out_unlock: 5824811c6688SIlya Dryomov up_write(&rbd_dev->header_rwsem); 582583a06263SAlex Elder return ret; 582683a06263SAlex Elder } 582783a06263SAlex Elder 5828332bb12dSAlex Elder static int rbd_dev_header_name(struct rbd_device *rbd_dev) 5829332bb12dSAlex Elder { 5830332bb12dSAlex Elder struct rbd_spec *spec = rbd_dev->spec; 5831c41d13a3SIlya Dryomov int ret; 5832332bb12dSAlex Elder 5833332bb12dSAlex Elder /* Record the header object name for this rbd image. */ 5834332bb12dSAlex Elder 5835332bb12dSAlex Elder rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); 5836332bb12dSAlex Elder if (rbd_dev->image_format == 1) 5837c41d13a3SIlya Dryomov ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s", 5838332bb12dSAlex Elder spec->image_name, RBD_SUFFIX); 5839332bb12dSAlex Elder else 5840c41d13a3SIlya Dryomov ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s", 5841332bb12dSAlex Elder RBD_HEADER_PREFIX, spec->image_id); 5842c41d13a3SIlya Dryomov 5843c41d13a3SIlya Dryomov return ret; 5844332bb12dSAlex Elder } 5845332bb12dSAlex Elder 5846200a6a8bSAlex Elder static void rbd_dev_image_release(struct rbd_device *rbd_dev) 5847200a6a8bSAlex Elder { 58486fd48b3bSAlex Elder rbd_dev_unprobe(rbd_dev); 5849fd22aef8SIlya Dryomov if (rbd_dev->opts) 5850fd22aef8SIlya Dryomov rbd_unregister_watch(rbd_dev); 58516fd48b3bSAlex Elder rbd_dev->image_format = 0; 58526fd48b3bSAlex Elder kfree(rbd_dev->spec->image_id); 58536fd48b3bSAlex Elder rbd_dev->spec->image_id = NULL; 5854200a6a8bSAlex Elder } 5855200a6a8bSAlex Elder 5856a30b71b9SAlex Elder /* 5857a30b71b9SAlex Elder * Probe for the existence of the header object for the given rbd 58581f3ef788SAlex Elder * device. If this image is the one being mapped (i.e., not a 58591f3ef788SAlex Elder * parent), initiate a watch on its header object before using that 58601f3ef788SAlex Elder * object to get detailed information about the rbd image. 5861a30b71b9SAlex Elder */ 58626d69bb53SIlya Dryomov static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) 5863a30b71b9SAlex Elder { 5864a30b71b9SAlex Elder int ret; 5865a30b71b9SAlex Elder 5866a30b71b9SAlex Elder /* 58673abef3b3SAlex Elder * Get the id from the image id object. Unless there's an 58683abef3b3SAlex Elder * error, rbd_dev->spec->image_id will be filled in with 58693abef3b3SAlex Elder * a dynamically-allocated string, and rbd_dev->image_format 58703abef3b3SAlex Elder * will be set to either 1 or 2. 5871a30b71b9SAlex Elder */ 5872a30b71b9SAlex Elder ret = rbd_dev_image_id(rbd_dev); 5873a30b71b9SAlex Elder if (ret) 5874c0fba368SAlex Elder return ret; 5875c0fba368SAlex Elder 5876332bb12dSAlex Elder ret = rbd_dev_header_name(rbd_dev); 5877332bb12dSAlex Elder if (ret) 5878332bb12dSAlex Elder goto err_out_format; 5879332bb12dSAlex Elder 58806d69bb53SIlya Dryomov if (!depth) { 588199d16943SIlya Dryomov ret = rbd_register_watch(rbd_dev); 58821fe48023SIlya Dryomov if (ret) { 58831fe48023SIlya Dryomov if (ret == -ENOENT) 5884b26c047bSIlya Dryomov pr_info("image %s/%s%s%s does not exist\n", 58851fe48023SIlya Dryomov rbd_dev->spec->pool_name, 5886b26c047bSIlya Dryomov rbd_dev->spec->pool_ns ?: "", 5887b26c047bSIlya Dryomov rbd_dev->spec->pool_ns ? "/" : "", 58881fe48023SIlya Dryomov rbd_dev->spec->image_name); 5889c41d13a3SIlya Dryomov goto err_out_format; 58901f3ef788SAlex Elder } 58911fe48023SIlya Dryomov } 5892b644de2bSAlex Elder 5893a720ae09SIlya Dryomov ret = rbd_dev_header_info(rbd_dev); 58945655c4d9SAlex Elder if (ret) 5895b644de2bSAlex Elder goto err_out_watch; 5896a30b71b9SAlex Elder 589704077599SIlya Dryomov /* 589804077599SIlya Dryomov * If this image is the one being mapped, we have pool name and 589904077599SIlya Dryomov * id, image name and id, and snap name - need to fill snap id. 590004077599SIlya Dryomov * Otherwise this is a parent image, identified by pool, image 590104077599SIlya Dryomov * and snap ids - need to fill in names for those ids. 590204077599SIlya Dryomov */ 59036d69bb53SIlya Dryomov if (!depth) 590404077599SIlya Dryomov ret = rbd_spec_fill_snap_id(rbd_dev); 590504077599SIlya Dryomov else 590604077599SIlya Dryomov ret = rbd_spec_fill_names(rbd_dev); 59071fe48023SIlya Dryomov if (ret) { 59081fe48023SIlya Dryomov if (ret == -ENOENT) 5909b26c047bSIlya Dryomov pr_info("snap %s/%s%s%s@%s does not exist\n", 59101fe48023SIlya Dryomov rbd_dev->spec->pool_name, 5911b26c047bSIlya Dryomov rbd_dev->spec->pool_ns ?: "", 5912b26c047bSIlya Dryomov rbd_dev->spec->pool_ns ? "/" : "", 59131fe48023SIlya Dryomov rbd_dev->spec->image_name, 59141fe48023SIlya Dryomov rbd_dev->spec->snap_name); 591533dca39fSAlex Elder goto err_out_probe; 59161fe48023SIlya Dryomov } 59179bb81c9bSAlex Elder 5918e8f59b59SIlya Dryomov if (rbd_dev->header.features & RBD_FEATURE_LAYERING) { 5919e8f59b59SIlya Dryomov ret = rbd_dev_v2_parent_info(rbd_dev); 5920e8f59b59SIlya Dryomov if (ret) 5921e8f59b59SIlya Dryomov goto err_out_probe; 5922e8f59b59SIlya Dryomov } 5923e8f59b59SIlya Dryomov 59246d69bb53SIlya Dryomov ret = rbd_dev_probe_parent(rbd_dev, depth); 592530d60ba2SAlex Elder if (ret) 592630d60ba2SAlex Elder goto err_out_probe; 592783a06263SAlex Elder 592830d60ba2SAlex Elder dout("discovered format %u image, header name is %s\n", 5929c41d13a3SIlya Dryomov rbd_dev->image_format, rbd_dev->header_oid.name); 593030d60ba2SAlex Elder return 0; 5931e8f59b59SIlya Dryomov 59326fd48b3bSAlex Elder err_out_probe: 59336fd48b3bSAlex Elder rbd_dev_unprobe(rbd_dev); 5934b644de2bSAlex Elder err_out_watch: 59356d69bb53SIlya Dryomov if (!depth) 593699d16943SIlya Dryomov rbd_unregister_watch(rbd_dev); 5937332bb12dSAlex Elder err_out_format: 5938332bb12dSAlex Elder rbd_dev->image_format = 0; 59395655c4d9SAlex Elder kfree(rbd_dev->spec->image_id); 59405655c4d9SAlex Elder rbd_dev->spec->image_id = NULL; 59415655c4d9SAlex Elder return ret; 594283a06263SAlex Elder } 594383a06263SAlex Elder 59449b60e70bSIlya Dryomov static ssize_t do_rbd_add(struct bus_type *bus, 594559c2be1eSYehuda Sadeh const char *buf, 594659c2be1eSYehuda Sadeh size_t count) 5947602adf40SYehuda Sadeh { 5948cb8627c7SAlex Elder struct rbd_device *rbd_dev = NULL; 5949dc79b113SAlex Elder struct ceph_options *ceph_opts = NULL; 59504e9afebaSAlex Elder struct rbd_options *rbd_opts = NULL; 5951859c31dfSAlex Elder struct rbd_spec *spec = NULL; 59529d3997fdSAlex Elder struct rbd_client *rbdc; 5953b51c83c2SIlya Dryomov int rc; 5954602adf40SYehuda Sadeh 5955602adf40SYehuda Sadeh if (!try_module_get(THIS_MODULE)) 5956602adf40SYehuda Sadeh return -ENODEV; 5957602adf40SYehuda Sadeh 5958a725f65eSAlex Elder /* parse add command */ 5959859c31dfSAlex Elder rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec); 5960dc79b113SAlex Elder if (rc < 0) 5961dd5ac32dSIlya Dryomov goto out; 5962a725f65eSAlex Elder 59639d3997fdSAlex Elder rbdc = rbd_get_client(ceph_opts); 59649d3997fdSAlex Elder if (IS_ERR(rbdc)) { 59659d3997fdSAlex Elder rc = PTR_ERR(rbdc); 59660ddebc0cSAlex Elder goto err_out_args; 59679d3997fdSAlex Elder } 5968602adf40SYehuda Sadeh 5969602adf40SYehuda Sadeh /* pick the pool */ 5970dd435855SIlya Dryomov rc = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, spec->pool_name); 59711fe48023SIlya Dryomov if (rc < 0) { 59721fe48023SIlya Dryomov if (rc == -ENOENT) 59731fe48023SIlya Dryomov pr_info("pool %s does not exist\n", spec->pool_name); 5974602adf40SYehuda Sadeh goto err_out_client; 59751fe48023SIlya Dryomov } 5976859c31dfSAlex Elder spec->pool_id = (u64)rc; 5977859c31dfSAlex Elder 5978d147543dSIlya Dryomov rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts); 5979b51c83c2SIlya Dryomov if (!rbd_dev) { 5980b51c83c2SIlya Dryomov rc = -ENOMEM; 5981bd4ba655SAlex Elder goto err_out_client; 5982b51c83c2SIlya Dryomov } 5983c53d5893SAlex Elder rbdc = NULL; /* rbd_dev now owns this */ 5984c53d5893SAlex Elder spec = NULL; /* rbd_dev now owns this */ 5985d147543dSIlya Dryomov rbd_opts = NULL; /* rbd_dev now owns this */ 5986602adf40SYehuda Sadeh 59870d6d1e9cSMike Christie rbd_dev->config_info = kstrdup(buf, GFP_KERNEL); 59880d6d1e9cSMike Christie if (!rbd_dev->config_info) { 59890d6d1e9cSMike Christie rc = -ENOMEM; 59900d6d1e9cSMike Christie goto err_out_rbd_dev; 59910d6d1e9cSMike Christie } 59920d6d1e9cSMike Christie 5993811c6688SIlya Dryomov down_write(&rbd_dev->header_rwsem); 59946d69bb53SIlya Dryomov rc = rbd_dev_image_probe(rbd_dev, 0); 59950d6d1e9cSMike Christie if (rc < 0) { 59960d6d1e9cSMike Christie up_write(&rbd_dev->header_rwsem); 5997c53d5893SAlex Elder goto err_out_rbd_dev; 59980d6d1e9cSMike Christie } 599905fd6f6fSAlex Elder 60007ce4eef7SAlex Elder /* If we are mapping a snapshot it must be marked read-only */ 60017ce4eef7SAlex Elder if (rbd_dev->spec->snap_id != CEPH_NOSNAP) 60029568c93eSIlya Dryomov rbd_dev->opts->read_only = true; 60037ce4eef7SAlex Elder 60040c93e1b7SIlya Dryomov if (rbd_dev->opts->alloc_size > rbd_dev->layout.object_size) { 60050c93e1b7SIlya Dryomov rbd_warn(rbd_dev, "alloc_size adjusted to %u", 60060c93e1b7SIlya Dryomov rbd_dev->layout.object_size); 60070c93e1b7SIlya Dryomov rbd_dev->opts->alloc_size = rbd_dev->layout.object_size; 60080c93e1b7SIlya Dryomov } 60090c93e1b7SIlya Dryomov 6010b536f69aSAlex Elder rc = rbd_dev_device_setup(rbd_dev); 6011fd22aef8SIlya Dryomov if (rc) 60128b679ec5SIlya Dryomov goto err_out_image_probe; 60133abef3b3SAlex Elder 6014e010dd0aSIlya Dryomov if (rbd_dev->opts->exclusive) { 6015e010dd0aSIlya Dryomov rc = rbd_add_acquire_lock(rbd_dev); 6016e010dd0aSIlya Dryomov if (rc) 6017e010dd0aSIlya Dryomov goto err_out_device_setup; 6018b536f69aSAlex Elder } 6019b536f69aSAlex Elder 60205769ed0cSIlya Dryomov /* Everything's ready. Announce the disk to the world. */ 60215769ed0cSIlya Dryomov 60225769ed0cSIlya Dryomov rc = device_add(&rbd_dev->dev); 60235769ed0cSIlya Dryomov if (rc) 6024e010dd0aSIlya Dryomov goto err_out_image_lock; 60255769ed0cSIlya Dryomov 60265769ed0cSIlya Dryomov add_disk(rbd_dev->disk); 60275769ed0cSIlya Dryomov /* see rbd_init_disk() */ 60285769ed0cSIlya Dryomov blk_put_queue(rbd_dev->disk->queue); 60295769ed0cSIlya Dryomov 60305769ed0cSIlya Dryomov spin_lock(&rbd_dev_list_lock); 60315769ed0cSIlya Dryomov list_add_tail(&rbd_dev->node, &rbd_dev_list); 60325769ed0cSIlya Dryomov spin_unlock(&rbd_dev_list_lock); 60335769ed0cSIlya Dryomov 60345769ed0cSIlya Dryomov pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name, 60355769ed0cSIlya Dryomov (unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT, 60365769ed0cSIlya Dryomov rbd_dev->header.features); 6037dd5ac32dSIlya Dryomov rc = count; 6038dd5ac32dSIlya Dryomov out: 6039dd5ac32dSIlya Dryomov module_put(THIS_MODULE); 6040dd5ac32dSIlya Dryomov return rc; 6041b536f69aSAlex Elder 6042e010dd0aSIlya Dryomov err_out_image_lock: 6043e010dd0aSIlya Dryomov rbd_dev_image_unlock(rbd_dev); 60445769ed0cSIlya Dryomov err_out_device_setup: 60455769ed0cSIlya Dryomov rbd_dev_device_release(rbd_dev); 60468b679ec5SIlya Dryomov err_out_image_probe: 60478b679ec5SIlya Dryomov rbd_dev_image_release(rbd_dev); 6048c53d5893SAlex Elder err_out_rbd_dev: 6049c53d5893SAlex Elder rbd_dev_destroy(rbd_dev); 6050bd4ba655SAlex Elder err_out_client: 60519d3997fdSAlex Elder rbd_put_client(rbdc); 60520ddebc0cSAlex Elder err_out_args: 6053859c31dfSAlex Elder rbd_spec_put(spec); 6054d147543dSIlya Dryomov kfree(rbd_opts); 6055dd5ac32dSIlya Dryomov goto out; 6056602adf40SYehuda Sadeh } 6057602adf40SYehuda Sadeh 60589b60e70bSIlya Dryomov static ssize_t rbd_add(struct bus_type *bus, 60599b60e70bSIlya Dryomov const char *buf, 60609b60e70bSIlya Dryomov size_t count) 60619b60e70bSIlya Dryomov { 60629b60e70bSIlya Dryomov if (single_major) 60639b60e70bSIlya Dryomov return -EINVAL; 60649b60e70bSIlya Dryomov 60659b60e70bSIlya Dryomov return do_rbd_add(bus, buf, count); 60669b60e70bSIlya Dryomov } 60679b60e70bSIlya Dryomov 60689b60e70bSIlya Dryomov static ssize_t rbd_add_single_major(struct bus_type *bus, 60699b60e70bSIlya Dryomov const char *buf, 60709b60e70bSIlya Dryomov size_t count) 60719b60e70bSIlya Dryomov { 60729b60e70bSIlya Dryomov return do_rbd_add(bus, buf, count); 60739b60e70bSIlya Dryomov } 60749b60e70bSIlya Dryomov 607505a46afdSAlex Elder static void rbd_dev_remove_parent(struct rbd_device *rbd_dev) 607605a46afdSAlex Elder { 6077ad945fc1SAlex Elder while (rbd_dev->parent) { 607805a46afdSAlex Elder struct rbd_device *first = rbd_dev; 607905a46afdSAlex Elder struct rbd_device *second = first->parent; 608005a46afdSAlex Elder struct rbd_device *third; 608105a46afdSAlex Elder 608205a46afdSAlex Elder /* 608305a46afdSAlex Elder * Follow to the parent with no grandparent and 608405a46afdSAlex Elder * remove it. 608505a46afdSAlex Elder */ 608605a46afdSAlex Elder while (second && (third = second->parent)) { 608705a46afdSAlex Elder first = second; 608805a46afdSAlex Elder second = third; 608905a46afdSAlex Elder } 6090ad945fc1SAlex Elder rbd_assert(second); 60918ad42cd0SAlex Elder rbd_dev_image_release(second); 60928b679ec5SIlya Dryomov rbd_dev_destroy(second); 6093ad945fc1SAlex Elder first->parent = NULL; 6094ad945fc1SAlex Elder first->parent_overlap = 0; 6095ad945fc1SAlex Elder 6096ad945fc1SAlex Elder rbd_assert(first->parent_spec); 609705a46afdSAlex Elder rbd_spec_put(first->parent_spec); 609805a46afdSAlex Elder first->parent_spec = NULL; 609905a46afdSAlex Elder } 610005a46afdSAlex Elder } 610105a46afdSAlex Elder 61029b60e70bSIlya Dryomov static ssize_t do_rbd_remove(struct bus_type *bus, 6103602adf40SYehuda Sadeh const char *buf, 6104602adf40SYehuda Sadeh size_t count) 6105602adf40SYehuda Sadeh { 6106602adf40SYehuda Sadeh struct rbd_device *rbd_dev = NULL; 6107751cc0e3SAlex Elder struct list_head *tmp; 6108751cc0e3SAlex Elder int dev_id; 61090276dca6SMike Christie char opt_buf[6]; 61100276dca6SMike Christie bool force = false; 61110d8189e1SAlex Elder int ret; 6112602adf40SYehuda Sadeh 61130276dca6SMike Christie dev_id = -1; 61140276dca6SMike Christie opt_buf[0] = '\0'; 61150276dca6SMike Christie sscanf(buf, "%d %5s", &dev_id, opt_buf); 61160276dca6SMike Christie if (dev_id < 0) { 61170276dca6SMike Christie pr_err("dev_id out of range\n"); 6118602adf40SYehuda Sadeh return -EINVAL; 61190276dca6SMike Christie } 61200276dca6SMike Christie if (opt_buf[0] != '\0') { 61210276dca6SMike Christie if (!strcmp(opt_buf, "force")) { 61220276dca6SMike Christie force = true; 61230276dca6SMike Christie } else { 61240276dca6SMike Christie pr_err("bad remove option at '%s'\n", opt_buf); 61250276dca6SMike Christie return -EINVAL; 61260276dca6SMike Christie } 61270276dca6SMike Christie } 6128602adf40SYehuda Sadeh 6129602adf40SYehuda Sadeh ret = -ENOENT; 6130751cc0e3SAlex Elder spin_lock(&rbd_dev_list_lock); 6131751cc0e3SAlex Elder list_for_each(tmp, &rbd_dev_list) { 6132751cc0e3SAlex Elder rbd_dev = list_entry(tmp, struct rbd_device, node); 6133751cc0e3SAlex Elder if (rbd_dev->dev_id == dev_id) { 6134751cc0e3SAlex Elder ret = 0; 6135751cc0e3SAlex Elder break; 6136602adf40SYehuda Sadeh } 6137751cc0e3SAlex Elder } 6138751cc0e3SAlex Elder if (!ret) { 6139a14ea269SAlex Elder spin_lock_irq(&rbd_dev->lock); 61400276dca6SMike Christie if (rbd_dev->open_count && !force) 614142382b70SAlex Elder ret = -EBUSY; 614285f5a4d6SIlya Dryomov else if (test_and_set_bit(RBD_DEV_FLAG_REMOVING, 614385f5a4d6SIlya Dryomov &rbd_dev->flags)) 614485f5a4d6SIlya Dryomov ret = -EINPROGRESS; 6145a14ea269SAlex Elder spin_unlock_irq(&rbd_dev->lock); 6146751cc0e3SAlex Elder } 6147751cc0e3SAlex Elder spin_unlock(&rbd_dev_list_lock); 614885f5a4d6SIlya Dryomov if (ret) 61491ba0f1e7SAlex Elder return ret; 6150751cc0e3SAlex Elder 61510276dca6SMike Christie if (force) { 61520276dca6SMike Christie /* 61530276dca6SMike Christie * Prevent new IO from being queued and wait for existing 61540276dca6SMike Christie * IO to complete/fail. 61550276dca6SMike Christie */ 61560276dca6SMike Christie blk_mq_freeze_queue(rbd_dev->disk->queue); 61570276dca6SMike Christie blk_set_queue_dying(rbd_dev->disk->queue); 61580276dca6SMike Christie } 61590276dca6SMike Christie 61605769ed0cSIlya Dryomov del_gendisk(rbd_dev->disk); 61615769ed0cSIlya Dryomov spin_lock(&rbd_dev_list_lock); 61625769ed0cSIlya Dryomov list_del_init(&rbd_dev->node); 61635769ed0cSIlya Dryomov spin_unlock(&rbd_dev_list_lock); 61645769ed0cSIlya Dryomov device_del(&rbd_dev->dev); 6165fca27065SIlya Dryomov 6166e010dd0aSIlya Dryomov rbd_dev_image_unlock(rbd_dev); 6167dd5ac32dSIlya Dryomov rbd_dev_device_release(rbd_dev); 61688ad42cd0SAlex Elder rbd_dev_image_release(rbd_dev); 61698b679ec5SIlya Dryomov rbd_dev_destroy(rbd_dev); 61701ba0f1e7SAlex Elder return count; 6171602adf40SYehuda Sadeh } 6172602adf40SYehuda Sadeh 61739b60e70bSIlya Dryomov static ssize_t rbd_remove(struct bus_type *bus, 61749b60e70bSIlya Dryomov const char *buf, 61759b60e70bSIlya Dryomov size_t count) 61769b60e70bSIlya Dryomov { 61779b60e70bSIlya Dryomov if (single_major) 61789b60e70bSIlya Dryomov return -EINVAL; 61799b60e70bSIlya Dryomov 61809b60e70bSIlya Dryomov return do_rbd_remove(bus, buf, count); 61819b60e70bSIlya Dryomov } 61829b60e70bSIlya Dryomov 61839b60e70bSIlya Dryomov static ssize_t rbd_remove_single_major(struct bus_type *bus, 61849b60e70bSIlya Dryomov const char *buf, 61859b60e70bSIlya Dryomov size_t count) 61869b60e70bSIlya Dryomov { 61879b60e70bSIlya Dryomov return do_rbd_remove(bus, buf, count); 61889b60e70bSIlya Dryomov } 61899b60e70bSIlya Dryomov 6190602adf40SYehuda Sadeh /* 6191602adf40SYehuda Sadeh * create control files in sysfs 6192dfc5606dSYehuda Sadeh * /sys/bus/rbd/... 6193602adf40SYehuda Sadeh */ 61947d8dc534SChengguang Xu static int __init rbd_sysfs_init(void) 6195602adf40SYehuda Sadeh { 6196dfc5606dSYehuda Sadeh int ret; 6197602adf40SYehuda Sadeh 6198fed4c143SAlex Elder ret = device_register(&rbd_root_dev); 6199dfc5606dSYehuda Sadeh if (ret < 0) 6200dfc5606dSYehuda Sadeh return ret; 6201602adf40SYehuda Sadeh 6202fed4c143SAlex Elder ret = bus_register(&rbd_bus_type); 6203fed4c143SAlex Elder if (ret < 0) 6204fed4c143SAlex Elder device_unregister(&rbd_root_dev); 6205602adf40SYehuda Sadeh 6206602adf40SYehuda Sadeh return ret; 6207602adf40SYehuda Sadeh } 6208602adf40SYehuda Sadeh 62097d8dc534SChengguang Xu static void __exit rbd_sysfs_cleanup(void) 6210602adf40SYehuda Sadeh { 6211dfc5606dSYehuda Sadeh bus_unregister(&rbd_bus_type); 6212fed4c143SAlex Elder device_unregister(&rbd_root_dev); 6213602adf40SYehuda Sadeh } 6214602adf40SYehuda Sadeh 62157d8dc534SChengguang Xu static int __init rbd_slab_init(void) 62161c2a9dfeSAlex Elder { 62171c2a9dfeSAlex Elder rbd_assert(!rbd_img_request_cache); 621803d94406SGeliang Tang rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0); 6219868311b1SAlex Elder if (!rbd_img_request_cache) 6220868311b1SAlex Elder return -ENOMEM; 6221868311b1SAlex Elder 6222868311b1SAlex Elder rbd_assert(!rbd_obj_request_cache); 622303d94406SGeliang Tang rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0); 622478c2a44aSAlex Elder if (!rbd_obj_request_cache) 622578c2a44aSAlex Elder goto out_err; 622678c2a44aSAlex Elder 62271c2a9dfeSAlex Elder return 0; 62281c2a9dfeSAlex Elder 62296c696d85SIlya Dryomov out_err: 6230868311b1SAlex Elder kmem_cache_destroy(rbd_img_request_cache); 6231868311b1SAlex Elder rbd_img_request_cache = NULL; 62321c2a9dfeSAlex Elder return -ENOMEM; 62331c2a9dfeSAlex Elder } 62341c2a9dfeSAlex Elder 62351c2a9dfeSAlex Elder static void rbd_slab_exit(void) 62361c2a9dfeSAlex Elder { 6237868311b1SAlex Elder rbd_assert(rbd_obj_request_cache); 6238868311b1SAlex Elder kmem_cache_destroy(rbd_obj_request_cache); 6239868311b1SAlex Elder rbd_obj_request_cache = NULL; 6240868311b1SAlex Elder 62411c2a9dfeSAlex Elder rbd_assert(rbd_img_request_cache); 62421c2a9dfeSAlex Elder kmem_cache_destroy(rbd_img_request_cache); 62431c2a9dfeSAlex Elder rbd_img_request_cache = NULL; 62441c2a9dfeSAlex Elder } 62451c2a9dfeSAlex Elder 6246cc344fa1SAlex Elder static int __init rbd_init(void) 6247602adf40SYehuda Sadeh { 6248602adf40SYehuda Sadeh int rc; 6249602adf40SYehuda Sadeh 62501e32d34cSAlex Elder if (!libceph_compatible(NULL)) { 62511e32d34cSAlex Elder rbd_warn(NULL, "libceph incompatibility (quitting)"); 62521e32d34cSAlex Elder return -EINVAL; 62531e32d34cSAlex Elder } 6254e1b4d96dSIlya Dryomov 62551c2a9dfeSAlex Elder rc = rbd_slab_init(); 6256602adf40SYehuda Sadeh if (rc) 6257602adf40SYehuda Sadeh return rc; 6258e1b4d96dSIlya Dryomov 6259f5ee37bdSIlya Dryomov /* 6260f5ee37bdSIlya Dryomov * The number of active work items is limited by the number of 6261f77303bdSIlya Dryomov * rbd devices * queue depth, so leave @max_active at default. 6262f5ee37bdSIlya Dryomov */ 6263f5ee37bdSIlya Dryomov rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0); 6264f5ee37bdSIlya Dryomov if (!rbd_wq) { 6265f5ee37bdSIlya Dryomov rc = -ENOMEM; 6266f5ee37bdSIlya Dryomov goto err_out_slab; 6267f5ee37bdSIlya Dryomov } 6268f5ee37bdSIlya Dryomov 62699b60e70bSIlya Dryomov if (single_major) { 62709b60e70bSIlya Dryomov rbd_major = register_blkdev(0, RBD_DRV_NAME); 62719b60e70bSIlya Dryomov if (rbd_major < 0) { 62729b60e70bSIlya Dryomov rc = rbd_major; 6273f5ee37bdSIlya Dryomov goto err_out_wq; 62749b60e70bSIlya Dryomov } 62759b60e70bSIlya Dryomov } 62769b60e70bSIlya Dryomov 62771c2a9dfeSAlex Elder rc = rbd_sysfs_init(); 62781c2a9dfeSAlex Elder if (rc) 62799b60e70bSIlya Dryomov goto err_out_blkdev; 62801c2a9dfeSAlex Elder 62819b60e70bSIlya Dryomov if (single_major) 62829b60e70bSIlya Dryomov pr_info("loaded (major %d)\n", rbd_major); 62839b60e70bSIlya Dryomov else 6284e1b4d96dSIlya Dryomov pr_info("loaded\n"); 62859b60e70bSIlya Dryomov 6286e1b4d96dSIlya Dryomov return 0; 6287e1b4d96dSIlya Dryomov 62889b60e70bSIlya Dryomov err_out_blkdev: 62899b60e70bSIlya Dryomov if (single_major) 62909b60e70bSIlya Dryomov unregister_blkdev(rbd_major, RBD_DRV_NAME); 6291f5ee37bdSIlya Dryomov err_out_wq: 6292f5ee37bdSIlya Dryomov destroy_workqueue(rbd_wq); 6293e1b4d96dSIlya Dryomov err_out_slab: 6294e1b4d96dSIlya Dryomov rbd_slab_exit(); 62951c2a9dfeSAlex Elder return rc; 6296602adf40SYehuda Sadeh } 6297602adf40SYehuda Sadeh 6298cc344fa1SAlex Elder static void __exit rbd_exit(void) 6299602adf40SYehuda Sadeh { 6300ffe312cfSIlya Dryomov ida_destroy(&rbd_dev_id_ida); 6301602adf40SYehuda Sadeh rbd_sysfs_cleanup(); 63029b60e70bSIlya Dryomov if (single_major) 63039b60e70bSIlya Dryomov unregister_blkdev(rbd_major, RBD_DRV_NAME); 6304f5ee37bdSIlya Dryomov destroy_workqueue(rbd_wq); 63051c2a9dfeSAlex Elder rbd_slab_exit(); 6306602adf40SYehuda Sadeh } 6307602adf40SYehuda Sadeh 6308602adf40SYehuda Sadeh module_init(rbd_init); 6309602adf40SYehuda Sadeh module_exit(rbd_exit); 6310602adf40SYehuda Sadeh 6311d552c619SAlex Elder MODULE_AUTHOR("Alex Elder <elder@inktank.com>"); 6312602adf40SYehuda Sadeh MODULE_AUTHOR("Sage Weil <sage@newdream.net>"); 6313602adf40SYehuda Sadeh MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>"); 6314602adf40SYehuda Sadeh /* following authorship retained from original osdblk.c */ 6315602adf40SYehuda Sadeh MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>"); 6316602adf40SYehuda Sadeh 631790da258bSIlya Dryomov MODULE_DESCRIPTION("RADOS Block Device (RBD) driver"); 6318602adf40SYehuda Sadeh MODULE_LICENSE("GPL"); 6319