1*f27aaf4bSChristian Brunner /* 2*f27aaf4bSChristian Brunner * QEMU Block driver for RADOS (Ceph) 3*f27aaf4bSChristian Brunner * 4*f27aaf4bSChristian Brunner * Copyright (C) 2010 Christian Brunner <chb@muc.de> 5*f27aaf4bSChristian Brunner * 6*f27aaf4bSChristian Brunner * This work is licensed under the terms of the GNU GPL, version 2. See 7*f27aaf4bSChristian Brunner * the COPYING file in the top-level directory. 8*f27aaf4bSChristian Brunner * 9*f27aaf4bSChristian Brunner */ 10*f27aaf4bSChristian Brunner 11*f27aaf4bSChristian Brunner #include "qemu-common.h" 12*f27aaf4bSChristian Brunner #include "qemu-error.h" 13*f27aaf4bSChristian Brunner 14*f27aaf4bSChristian Brunner #include "rbd_types.h" 15*f27aaf4bSChristian Brunner #include "block_int.h" 16*f27aaf4bSChristian Brunner 17*f27aaf4bSChristian Brunner #include <rados/librados.h> 18*f27aaf4bSChristian Brunner 19*f27aaf4bSChristian Brunner 20*f27aaf4bSChristian Brunner 21*f27aaf4bSChristian Brunner /* 22*f27aaf4bSChristian Brunner * When specifying the image filename use: 23*f27aaf4bSChristian Brunner * 24*f27aaf4bSChristian Brunner * rbd:poolname/devicename 25*f27aaf4bSChristian Brunner * 26*f27aaf4bSChristian Brunner * poolname must be the name of an existing rados pool 27*f27aaf4bSChristian Brunner * 28*f27aaf4bSChristian Brunner * devicename is the basename for all objects used to 29*f27aaf4bSChristian Brunner * emulate the raw device. 30*f27aaf4bSChristian Brunner * 31*f27aaf4bSChristian Brunner * Metadata information (image size, ...) is stored in an 32*f27aaf4bSChristian Brunner * object with the name "devicename.rbd". 33*f27aaf4bSChristian Brunner * 34*f27aaf4bSChristian Brunner * The raw device is split into 4MB sized objects by default. 35*f27aaf4bSChristian Brunner * The sequencenumber is encoded in a 12 byte long hex-string, 36*f27aaf4bSChristian Brunner * and is attached to the devicename, separated by a dot. 37*f27aaf4bSChristian Brunner * e.g. "devicename.1234567890ab" 38*f27aaf4bSChristian Brunner * 39*f27aaf4bSChristian Brunner */ 40*f27aaf4bSChristian Brunner 41*f27aaf4bSChristian Brunner #define OBJ_MAX_SIZE (1UL << OBJ_DEFAULT_OBJ_ORDER) 42*f27aaf4bSChristian Brunner 43*f27aaf4bSChristian Brunner typedef struct RBDAIOCB { 44*f27aaf4bSChristian Brunner BlockDriverAIOCB common; 45*f27aaf4bSChristian Brunner QEMUBH *bh; 46*f27aaf4bSChristian Brunner int ret; 47*f27aaf4bSChristian Brunner QEMUIOVector *qiov; 48*f27aaf4bSChristian Brunner char *bounce; 49*f27aaf4bSChristian Brunner int write; 50*f27aaf4bSChristian Brunner int64_t sector_num; 51*f27aaf4bSChristian Brunner int aiocnt; 52*f27aaf4bSChristian Brunner int error; 53*f27aaf4bSChristian Brunner struct BDRVRBDState *s; 54*f27aaf4bSChristian Brunner int cancelled; 55*f27aaf4bSChristian Brunner } RBDAIOCB; 56*f27aaf4bSChristian Brunner 57*f27aaf4bSChristian Brunner typedef struct RADOSCB { 58*f27aaf4bSChristian Brunner int rcbid; 59*f27aaf4bSChristian Brunner RBDAIOCB *acb; 60*f27aaf4bSChristian Brunner struct BDRVRBDState *s; 61*f27aaf4bSChristian Brunner int done; 62*f27aaf4bSChristian Brunner int64_t segsize; 63*f27aaf4bSChristian Brunner char *buf; 64*f27aaf4bSChristian Brunner int ret; 65*f27aaf4bSChristian Brunner } RADOSCB; 66*f27aaf4bSChristian Brunner 67*f27aaf4bSChristian Brunner #define RBD_FD_READ 0 68*f27aaf4bSChristian Brunner #define RBD_FD_WRITE 1 69*f27aaf4bSChristian Brunner 70*f27aaf4bSChristian Brunner typedef struct BDRVRBDState { 71*f27aaf4bSChristian Brunner int fds[2]; 72*f27aaf4bSChristian Brunner rados_pool_t pool; 73*f27aaf4bSChristian Brunner rados_pool_t header_pool; 74*f27aaf4bSChristian Brunner char name[RBD_MAX_OBJ_NAME_SIZE]; 75*f27aaf4bSChristian Brunner char block_name[RBD_MAX_BLOCK_NAME_SIZE]; 76*f27aaf4bSChristian Brunner uint64_t size; 77*f27aaf4bSChristian Brunner uint64_t objsize; 78*f27aaf4bSChristian Brunner int qemu_aio_count; 79*f27aaf4bSChristian Brunner int event_reader_pos; 80*f27aaf4bSChristian Brunner RADOSCB *event_rcb; 81*f27aaf4bSChristian Brunner } BDRVRBDState; 82*f27aaf4bSChristian Brunner 83*f27aaf4bSChristian Brunner typedef struct rbd_obj_header_ondisk RbdHeader1; 84*f27aaf4bSChristian Brunner 85*f27aaf4bSChristian Brunner static void rbd_aio_bh_cb(void *opaque); 86*f27aaf4bSChristian Brunner 87*f27aaf4bSChristian Brunner static int rbd_next_tok(char *dst, int dst_len, 88*f27aaf4bSChristian Brunner char *src, char delim, 89*f27aaf4bSChristian Brunner const char *name, 90*f27aaf4bSChristian Brunner char **p) 91*f27aaf4bSChristian Brunner { 92*f27aaf4bSChristian Brunner int l; 93*f27aaf4bSChristian Brunner char *end; 94*f27aaf4bSChristian Brunner 95*f27aaf4bSChristian Brunner *p = NULL; 96*f27aaf4bSChristian Brunner 97*f27aaf4bSChristian Brunner if (delim != '\0') { 98*f27aaf4bSChristian Brunner end = strchr(src, delim); 99*f27aaf4bSChristian Brunner if (end) { 100*f27aaf4bSChristian Brunner *p = end + 1; 101*f27aaf4bSChristian Brunner *end = '\0'; 102*f27aaf4bSChristian Brunner } 103*f27aaf4bSChristian Brunner } 104*f27aaf4bSChristian Brunner l = strlen(src); 105*f27aaf4bSChristian Brunner if (l >= dst_len) { 106*f27aaf4bSChristian Brunner error_report("%s too long", name); 107*f27aaf4bSChristian Brunner return -EINVAL; 108*f27aaf4bSChristian Brunner } else if (l == 0) { 109*f27aaf4bSChristian Brunner error_report("%s too short", name); 110*f27aaf4bSChristian Brunner return -EINVAL; 111*f27aaf4bSChristian Brunner } 112*f27aaf4bSChristian Brunner 113*f27aaf4bSChristian Brunner pstrcpy(dst, dst_len, src); 114*f27aaf4bSChristian Brunner 115*f27aaf4bSChristian Brunner return 0; 116*f27aaf4bSChristian Brunner } 117*f27aaf4bSChristian Brunner 118*f27aaf4bSChristian Brunner static int rbd_parsename(const char *filename, 119*f27aaf4bSChristian Brunner char *pool, int pool_len, 120*f27aaf4bSChristian Brunner char *snap, int snap_len, 121*f27aaf4bSChristian Brunner char *name, int name_len) 122*f27aaf4bSChristian Brunner { 123*f27aaf4bSChristian Brunner const char *start; 124*f27aaf4bSChristian Brunner char *p, *buf; 125*f27aaf4bSChristian Brunner int ret; 126*f27aaf4bSChristian Brunner 127*f27aaf4bSChristian Brunner if (!strstart(filename, "rbd:", &start)) { 128*f27aaf4bSChristian Brunner return -EINVAL; 129*f27aaf4bSChristian Brunner } 130*f27aaf4bSChristian Brunner 131*f27aaf4bSChristian Brunner buf = qemu_strdup(start); 132*f27aaf4bSChristian Brunner p = buf; 133*f27aaf4bSChristian Brunner 134*f27aaf4bSChristian Brunner ret = rbd_next_tok(pool, pool_len, p, '/', "pool name", &p); 135*f27aaf4bSChristian Brunner if (ret < 0 || !p) { 136*f27aaf4bSChristian Brunner ret = -EINVAL; 137*f27aaf4bSChristian Brunner goto done; 138*f27aaf4bSChristian Brunner } 139*f27aaf4bSChristian Brunner ret = rbd_next_tok(name, name_len, p, '@', "object name", &p); 140*f27aaf4bSChristian Brunner if (ret < 0) { 141*f27aaf4bSChristian Brunner goto done; 142*f27aaf4bSChristian Brunner } 143*f27aaf4bSChristian Brunner if (!p) { 144*f27aaf4bSChristian Brunner *snap = '\0'; 145*f27aaf4bSChristian Brunner goto done; 146*f27aaf4bSChristian Brunner } 147*f27aaf4bSChristian Brunner 148*f27aaf4bSChristian Brunner ret = rbd_next_tok(snap, snap_len, p, '\0', "snap name", &p); 149*f27aaf4bSChristian Brunner 150*f27aaf4bSChristian Brunner done: 151*f27aaf4bSChristian Brunner qemu_free(buf); 152*f27aaf4bSChristian Brunner return ret; 153*f27aaf4bSChristian Brunner } 154*f27aaf4bSChristian Brunner 155*f27aaf4bSChristian Brunner static int create_tmap_op(uint8_t op, const char *name, char **tmap_desc) 156*f27aaf4bSChristian Brunner { 157*f27aaf4bSChristian Brunner uint32_t len = strlen(name); 158*f27aaf4bSChristian Brunner uint32_t len_le = cpu_to_le32(len); 159*f27aaf4bSChristian Brunner /* total_len = encoding op + name + empty buffer */ 160*f27aaf4bSChristian Brunner uint32_t total_len = 1 + (sizeof(uint32_t) + len) + sizeof(uint32_t); 161*f27aaf4bSChristian Brunner uint8_t *desc = NULL; 162*f27aaf4bSChristian Brunner 163*f27aaf4bSChristian Brunner desc = qemu_malloc(total_len); 164*f27aaf4bSChristian Brunner 165*f27aaf4bSChristian Brunner *tmap_desc = (char *)desc; 166*f27aaf4bSChristian Brunner 167*f27aaf4bSChristian Brunner *desc = op; 168*f27aaf4bSChristian Brunner desc++; 169*f27aaf4bSChristian Brunner memcpy(desc, &len_le, sizeof(len_le)); 170*f27aaf4bSChristian Brunner desc += sizeof(len_le); 171*f27aaf4bSChristian Brunner memcpy(desc, name, len); 172*f27aaf4bSChristian Brunner desc += len; 173*f27aaf4bSChristian Brunner len = 0; /* no need for endian conversion for 0 */ 174*f27aaf4bSChristian Brunner memcpy(desc, &len, sizeof(len)); 175*f27aaf4bSChristian Brunner desc += sizeof(len); 176*f27aaf4bSChristian Brunner 177*f27aaf4bSChristian Brunner return (char *)desc - *tmap_desc; 178*f27aaf4bSChristian Brunner } 179*f27aaf4bSChristian Brunner 180*f27aaf4bSChristian Brunner static void free_tmap_op(char *tmap_desc) 181*f27aaf4bSChristian Brunner { 182*f27aaf4bSChristian Brunner qemu_free(tmap_desc); 183*f27aaf4bSChristian Brunner } 184*f27aaf4bSChristian Brunner 185*f27aaf4bSChristian Brunner static int rbd_register_image(rados_pool_t pool, const char *name) 186*f27aaf4bSChristian Brunner { 187*f27aaf4bSChristian Brunner char *tmap_desc; 188*f27aaf4bSChristian Brunner const char *dir = RBD_DIRECTORY; 189*f27aaf4bSChristian Brunner int ret; 190*f27aaf4bSChristian Brunner 191*f27aaf4bSChristian Brunner ret = create_tmap_op(CEPH_OSD_TMAP_SET, name, &tmap_desc); 192*f27aaf4bSChristian Brunner if (ret < 0) { 193*f27aaf4bSChristian Brunner return ret; 194*f27aaf4bSChristian Brunner } 195*f27aaf4bSChristian Brunner 196*f27aaf4bSChristian Brunner ret = rados_tmap_update(pool, dir, tmap_desc, ret); 197*f27aaf4bSChristian Brunner free_tmap_op(tmap_desc); 198*f27aaf4bSChristian Brunner 199*f27aaf4bSChristian Brunner return ret; 200*f27aaf4bSChristian Brunner } 201*f27aaf4bSChristian Brunner 202*f27aaf4bSChristian Brunner static int touch_rbd_info(rados_pool_t pool, const char *info_oid) 203*f27aaf4bSChristian Brunner { 204*f27aaf4bSChristian Brunner int r = rados_write(pool, info_oid, 0, NULL, 0); 205*f27aaf4bSChristian Brunner if (r < 0) { 206*f27aaf4bSChristian Brunner return r; 207*f27aaf4bSChristian Brunner } 208*f27aaf4bSChristian Brunner return 0; 209*f27aaf4bSChristian Brunner } 210*f27aaf4bSChristian Brunner 211*f27aaf4bSChristian Brunner static int rbd_assign_bid(rados_pool_t pool, uint64_t *id) 212*f27aaf4bSChristian Brunner { 213*f27aaf4bSChristian Brunner uint64_t out[1]; 214*f27aaf4bSChristian Brunner const char *info_oid = RBD_INFO; 215*f27aaf4bSChristian Brunner 216*f27aaf4bSChristian Brunner *id = 0; 217*f27aaf4bSChristian Brunner 218*f27aaf4bSChristian Brunner int r = touch_rbd_info(pool, info_oid); 219*f27aaf4bSChristian Brunner if (r < 0) { 220*f27aaf4bSChristian Brunner return r; 221*f27aaf4bSChristian Brunner } 222*f27aaf4bSChristian Brunner 223*f27aaf4bSChristian Brunner r = rados_exec(pool, info_oid, "rbd", "assign_bid", NULL, 224*f27aaf4bSChristian Brunner 0, (char *)out, sizeof(out)); 225*f27aaf4bSChristian Brunner if (r < 0) { 226*f27aaf4bSChristian Brunner return r; 227*f27aaf4bSChristian Brunner } 228*f27aaf4bSChristian Brunner 229*f27aaf4bSChristian Brunner le64_to_cpus(out); 230*f27aaf4bSChristian Brunner *id = out[0]; 231*f27aaf4bSChristian Brunner 232*f27aaf4bSChristian Brunner return 0; 233*f27aaf4bSChristian Brunner } 234*f27aaf4bSChristian Brunner 235*f27aaf4bSChristian Brunner static int rbd_create(const char *filename, QEMUOptionParameter *options) 236*f27aaf4bSChristian Brunner { 237*f27aaf4bSChristian Brunner int64_t bytes = 0; 238*f27aaf4bSChristian Brunner int64_t objsize; 239*f27aaf4bSChristian Brunner uint64_t size; 240*f27aaf4bSChristian Brunner time_t mtime; 241*f27aaf4bSChristian Brunner uint8_t obj_order = RBD_DEFAULT_OBJ_ORDER; 242*f27aaf4bSChristian Brunner char pool[RBD_MAX_SEG_NAME_SIZE]; 243*f27aaf4bSChristian Brunner char n[RBD_MAX_SEG_NAME_SIZE]; 244*f27aaf4bSChristian Brunner char name[RBD_MAX_OBJ_NAME_SIZE]; 245*f27aaf4bSChristian Brunner char snap_buf[RBD_MAX_SEG_NAME_SIZE]; 246*f27aaf4bSChristian Brunner char *snap = NULL; 247*f27aaf4bSChristian Brunner RbdHeader1 header; 248*f27aaf4bSChristian Brunner rados_pool_t p; 249*f27aaf4bSChristian Brunner uint64_t bid; 250*f27aaf4bSChristian Brunner uint32_t hi, lo; 251*f27aaf4bSChristian Brunner int ret; 252*f27aaf4bSChristian Brunner 253*f27aaf4bSChristian Brunner if (rbd_parsename(filename, 254*f27aaf4bSChristian Brunner pool, sizeof(pool), 255*f27aaf4bSChristian Brunner snap_buf, sizeof(snap_buf), 256*f27aaf4bSChristian Brunner name, sizeof(name)) < 0) { 257*f27aaf4bSChristian Brunner return -EINVAL; 258*f27aaf4bSChristian Brunner } 259*f27aaf4bSChristian Brunner if (snap_buf[0] != '\0') { 260*f27aaf4bSChristian Brunner snap = snap_buf; 261*f27aaf4bSChristian Brunner } 262*f27aaf4bSChristian Brunner 263*f27aaf4bSChristian Brunner snprintf(n, sizeof(n), "%s%s", name, RBD_SUFFIX); 264*f27aaf4bSChristian Brunner 265*f27aaf4bSChristian Brunner /* Read out options */ 266*f27aaf4bSChristian Brunner while (options && options->name) { 267*f27aaf4bSChristian Brunner if (!strcmp(options->name, BLOCK_OPT_SIZE)) { 268*f27aaf4bSChristian Brunner bytes = options->value.n; 269*f27aaf4bSChristian Brunner } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) { 270*f27aaf4bSChristian Brunner if (options->value.n) { 271*f27aaf4bSChristian Brunner objsize = options->value.n; 272*f27aaf4bSChristian Brunner if ((objsize - 1) & objsize) { /* not a power of 2? */ 273*f27aaf4bSChristian Brunner error_report("obj size needs to be power of 2"); 274*f27aaf4bSChristian Brunner return -EINVAL; 275*f27aaf4bSChristian Brunner } 276*f27aaf4bSChristian Brunner if (objsize < 4096) { 277*f27aaf4bSChristian Brunner error_report("obj size too small"); 278*f27aaf4bSChristian Brunner return -EINVAL; 279*f27aaf4bSChristian Brunner } 280*f27aaf4bSChristian Brunner obj_order = ffs(objsize) - 1; 281*f27aaf4bSChristian Brunner } 282*f27aaf4bSChristian Brunner } 283*f27aaf4bSChristian Brunner options++; 284*f27aaf4bSChristian Brunner } 285*f27aaf4bSChristian Brunner 286*f27aaf4bSChristian Brunner memset(&header, 0, sizeof(header)); 287*f27aaf4bSChristian Brunner pstrcpy(header.text, sizeof(header.text), RBD_HEADER_TEXT); 288*f27aaf4bSChristian Brunner pstrcpy(header.signature, sizeof(header.signature), RBD_HEADER_SIGNATURE); 289*f27aaf4bSChristian Brunner pstrcpy(header.version, sizeof(header.version), RBD_HEADER_VERSION); 290*f27aaf4bSChristian Brunner header.image_size = cpu_to_le64(bytes); 291*f27aaf4bSChristian Brunner header.options.order = obj_order; 292*f27aaf4bSChristian Brunner header.options.crypt_type = RBD_CRYPT_NONE; 293*f27aaf4bSChristian Brunner header.options.comp_type = RBD_COMP_NONE; 294*f27aaf4bSChristian Brunner header.snap_seq = 0; 295*f27aaf4bSChristian Brunner header.snap_count = 0; 296*f27aaf4bSChristian Brunner 297*f27aaf4bSChristian Brunner if (rados_initialize(0, NULL) < 0) { 298*f27aaf4bSChristian Brunner error_report("error initializing"); 299*f27aaf4bSChristian Brunner return -EIO; 300*f27aaf4bSChristian Brunner } 301*f27aaf4bSChristian Brunner 302*f27aaf4bSChristian Brunner if (rados_open_pool(pool, &p)) { 303*f27aaf4bSChristian Brunner error_report("error opening pool %s", pool); 304*f27aaf4bSChristian Brunner rados_deinitialize(); 305*f27aaf4bSChristian Brunner return -EIO; 306*f27aaf4bSChristian Brunner } 307*f27aaf4bSChristian Brunner 308*f27aaf4bSChristian Brunner /* check for existing rbd header file */ 309*f27aaf4bSChristian Brunner ret = rados_stat(p, n, &size, &mtime); 310*f27aaf4bSChristian Brunner if (ret == 0) { 311*f27aaf4bSChristian Brunner ret=-EEXIST; 312*f27aaf4bSChristian Brunner goto done; 313*f27aaf4bSChristian Brunner } 314*f27aaf4bSChristian Brunner 315*f27aaf4bSChristian Brunner ret = rbd_assign_bid(p, &bid); 316*f27aaf4bSChristian Brunner if (ret < 0) { 317*f27aaf4bSChristian Brunner error_report("failed assigning block id"); 318*f27aaf4bSChristian Brunner rados_deinitialize(); 319*f27aaf4bSChristian Brunner return -EIO; 320*f27aaf4bSChristian Brunner } 321*f27aaf4bSChristian Brunner hi = bid >> 32; 322*f27aaf4bSChristian Brunner lo = bid & 0xFFFFFFFF; 323*f27aaf4bSChristian Brunner snprintf(header.block_name, sizeof(header.block_name), "rb.%x.%x", hi, lo); 324*f27aaf4bSChristian Brunner 325*f27aaf4bSChristian Brunner /* create header file */ 326*f27aaf4bSChristian Brunner ret = rados_write(p, n, 0, (const char *)&header, sizeof(header)); 327*f27aaf4bSChristian Brunner if (ret < 0) { 328*f27aaf4bSChristian Brunner goto done; 329*f27aaf4bSChristian Brunner } 330*f27aaf4bSChristian Brunner 331*f27aaf4bSChristian Brunner ret = rbd_register_image(p, name); 332*f27aaf4bSChristian Brunner done: 333*f27aaf4bSChristian Brunner rados_close_pool(p); 334*f27aaf4bSChristian Brunner rados_deinitialize(); 335*f27aaf4bSChristian Brunner 336*f27aaf4bSChristian Brunner return ret; 337*f27aaf4bSChristian Brunner } 338*f27aaf4bSChristian Brunner 339*f27aaf4bSChristian Brunner /* 340*f27aaf4bSChristian Brunner * This aio completion is being called from rbd_aio_event_reader() and 341*f27aaf4bSChristian Brunner * runs in qemu context. It schedules a bh, but just in case the aio 342*f27aaf4bSChristian Brunner * was not cancelled before. 343*f27aaf4bSChristian Brunner */ 344*f27aaf4bSChristian Brunner static void rbd_complete_aio(RADOSCB *rcb) 345*f27aaf4bSChristian Brunner { 346*f27aaf4bSChristian Brunner RBDAIOCB *acb = rcb->acb; 347*f27aaf4bSChristian Brunner int64_t r; 348*f27aaf4bSChristian Brunner 349*f27aaf4bSChristian Brunner acb->aiocnt--; 350*f27aaf4bSChristian Brunner 351*f27aaf4bSChristian Brunner if (acb->cancelled) { 352*f27aaf4bSChristian Brunner if (!acb->aiocnt) { 353*f27aaf4bSChristian Brunner qemu_vfree(acb->bounce); 354*f27aaf4bSChristian Brunner qemu_aio_release(acb); 355*f27aaf4bSChristian Brunner } 356*f27aaf4bSChristian Brunner goto done; 357*f27aaf4bSChristian Brunner } 358*f27aaf4bSChristian Brunner 359*f27aaf4bSChristian Brunner r = rcb->ret; 360*f27aaf4bSChristian Brunner 361*f27aaf4bSChristian Brunner if (acb->write) { 362*f27aaf4bSChristian Brunner if (r < 0) { 363*f27aaf4bSChristian Brunner acb->ret = r; 364*f27aaf4bSChristian Brunner acb->error = 1; 365*f27aaf4bSChristian Brunner } else if (!acb->error) { 366*f27aaf4bSChristian Brunner acb->ret += rcb->segsize; 367*f27aaf4bSChristian Brunner } 368*f27aaf4bSChristian Brunner } else { 369*f27aaf4bSChristian Brunner if (r == -ENOENT) { 370*f27aaf4bSChristian Brunner memset(rcb->buf, 0, rcb->segsize); 371*f27aaf4bSChristian Brunner if (!acb->error) { 372*f27aaf4bSChristian Brunner acb->ret += rcb->segsize; 373*f27aaf4bSChristian Brunner } 374*f27aaf4bSChristian Brunner } else if (r < 0) { 375*f27aaf4bSChristian Brunner memset(rcb->buf, 0, rcb->segsize); 376*f27aaf4bSChristian Brunner acb->ret = r; 377*f27aaf4bSChristian Brunner acb->error = 1; 378*f27aaf4bSChristian Brunner } else if (r < rcb->segsize) { 379*f27aaf4bSChristian Brunner memset(rcb->buf + r, 0, rcb->segsize - r); 380*f27aaf4bSChristian Brunner if (!acb->error) { 381*f27aaf4bSChristian Brunner acb->ret += rcb->segsize; 382*f27aaf4bSChristian Brunner } 383*f27aaf4bSChristian Brunner } else if (!acb->error) { 384*f27aaf4bSChristian Brunner acb->ret += r; 385*f27aaf4bSChristian Brunner } 386*f27aaf4bSChristian Brunner } 387*f27aaf4bSChristian Brunner /* Note that acb->bh can be NULL in case where the aio was cancelled */ 388*f27aaf4bSChristian Brunner if (!acb->aiocnt) { 389*f27aaf4bSChristian Brunner acb->bh = qemu_bh_new(rbd_aio_bh_cb, acb); 390*f27aaf4bSChristian Brunner qemu_bh_schedule(acb->bh); 391*f27aaf4bSChristian Brunner } 392*f27aaf4bSChristian Brunner done: 393*f27aaf4bSChristian Brunner qemu_free(rcb); 394*f27aaf4bSChristian Brunner } 395*f27aaf4bSChristian Brunner 396*f27aaf4bSChristian Brunner /* 397*f27aaf4bSChristian Brunner * aio fd read handler. It runs in the qemu context and calls the 398*f27aaf4bSChristian Brunner * completion handling of completed rados aio operations. 399*f27aaf4bSChristian Brunner */ 400*f27aaf4bSChristian Brunner static void rbd_aio_event_reader(void *opaque) 401*f27aaf4bSChristian Brunner { 402*f27aaf4bSChristian Brunner BDRVRBDState *s = opaque; 403*f27aaf4bSChristian Brunner 404*f27aaf4bSChristian Brunner ssize_t ret; 405*f27aaf4bSChristian Brunner 406*f27aaf4bSChristian Brunner do { 407*f27aaf4bSChristian Brunner char *p = (char *)&s->event_rcb; 408*f27aaf4bSChristian Brunner 409*f27aaf4bSChristian Brunner /* now read the rcb pointer that was sent from a non qemu thread */ 410*f27aaf4bSChristian Brunner if ((ret = read(s->fds[RBD_FD_READ], p + s->event_reader_pos, 411*f27aaf4bSChristian Brunner sizeof(s->event_rcb) - s->event_reader_pos)) > 0) { 412*f27aaf4bSChristian Brunner if (ret > 0) { 413*f27aaf4bSChristian Brunner s->event_reader_pos += ret; 414*f27aaf4bSChristian Brunner if (s->event_reader_pos == sizeof(s->event_rcb)) { 415*f27aaf4bSChristian Brunner s->event_reader_pos = 0; 416*f27aaf4bSChristian Brunner rbd_complete_aio(s->event_rcb); 417*f27aaf4bSChristian Brunner s->qemu_aio_count --; 418*f27aaf4bSChristian Brunner } 419*f27aaf4bSChristian Brunner } 420*f27aaf4bSChristian Brunner } 421*f27aaf4bSChristian Brunner } while (ret < 0 && errno == EINTR); 422*f27aaf4bSChristian Brunner } 423*f27aaf4bSChristian Brunner 424*f27aaf4bSChristian Brunner static int rbd_aio_flush_cb(void *opaque) 425*f27aaf4bSChristian Brunner { 426*f27aaf4bSChristian Brunner BDRVRBDState *s = opaque; 427*f27aaf4bSChristian Brunner 428*f27aaf4bSChristian Brunner return (s->qemu_aio_count > 0); 429*f27aaf4bSChristian Brunner } 430*f27aaf4bSChristian Brunner 431*f27aaf4bSChristian Brunner 432*f27aaf4bSChristian Brunner static int rbd_set_snapc(rados_pool_t pool, const char *snap, RbdHeader1 *header) 433*f27aaf4bSChristian Brunner { 434*f27aaf4bSChristian Brunner uint32_t snap_count = le32_to_cpu(header->snap_count); 435*f27aaf4bSChristian Brunner rados_snap_t *snaps = NULL; 436*f27aaf4bSChristian Brunner rados_snap_t seq; 437*f27aaf4bSChristian Brunner uint32_t i; 438*f27aaf4bSChristian Brunner uint64_t snap_names_len = le64_to_cpu(header->snap_names_len); 439*f27aaf4bSChristian Brunner int r; 440*f27aaf4bSChristian Brunner rados_snap_t snapid = 0; 441*f27aaf4bSChristian Brunner 442*f27aaf4bSChristian Brunner if (snap_count) { 443*f27aaf4bSChristian Brunner const char *header_snap = (const char *)&header->snaps[snap_count]; 444*f27aaf4bSChristian Brunner const char *end = header_snap + snap_names_len; 445*f27aaf4bSChristian Brunner snaps = qemu_malloc(sizeof(rados_snap_t) * header->snap_count); 446*f27aaf4bSChristian Brunner 447*f27aaf4bSChristian Brunner for (i=0; i < snap_count; i++) { 448*f27aaf4bSChristian Brunner snaps[i] = le64_to_cpu(header->snaps[i].id); 449*f27aaf4bSChristian Brunner 450*f27aaf4bSChristian Brunner if (snap && strcmp(snap, header_snap) == 0) { 451*f27aaf4bSChristian Brunner snapid = snaps[i]; 452*f27aaf4bSChristian Brunner } 453*f27aaf4bSChristian Brunner 454*f27aaf4bSChristian Brunner header_snap += strlen(header_snap) + 1; 455*f27aaf4bSChristian Brunner if (header_snap > end) { 456*f27aaf4bSChristian Brunner error_report("bad header, snapshot list broken"); 457*f27aaf4bSChristian Brunner } 458*f27aaf4bSChristian Brunner } 459*f27aaf4bSChristian Brunner } 460*f27aaf4bSChristian Brunner 461*f27aaf4bSChristian Brunner if (snap && !snapid) { 462*f27aaf4bSChristian Brunner error_report("snapshot not found"); 463*f27aaf4bSChristian Brunner qemu_free(snaps); 464*f27aaf4bSChristian Brunner return -ENOENT; 465*f27aaf4bSChristian Brunner } 466*f27aaf4bSChristian Brunner seq = le32_to_cpu(header->snap_seq); 467*f27aaf4bSChristian Brunner 468*f27aaf4bSChristian Brunner r = rados_set_snap_context(pool, seq, snaps, snap_count); 469*f27aaf4bSChristian Brunner 470*f27aaf4bSChristian Brunner rados_set_snap(pool, snapid); 471*f27aaf4bSChristian Brunner 472*f27aaf4bSChristian Brunner qemu_free(snaps); 473*f27aaf4bSChristian Brunner 474*f27aaf4bSChristian Brunner return r; 475*f27aaf4bSChristian Brunner } 476*f27aaf4bSChristian Brunner 477*f27aaf4bSChristian Brunner #define BUF_READ_START_LEN 4096 478*f27aaf4bSChristian Brunner 479*f27aaf4bSChristian Brunner static int rbd_read_header(BDRVRBDState *s, char **hbuf) 480*f27aaf4bSChristian Brunner { 481*f27aaf4bSChristian Brunner char *buf = NULL; 482*f27aaf4bSChristian Brunner char n[RBD_MAX_SEG_NAME_SIZE]; 483*f27aaf4bSChristian Brunner uint64_t len = BUF_READ_START_LEN; 484*f27aaf4bSChristian Brunner int r; 485*f27aaf4bSChristian Brunner 486*f27aaf4bSChristian Brunner snprintf(n, sizeof(n), "%s%s", s->name, RBD_SUFFIX); 487*f27aaf4bSChristian Brunner 488*f27aaf4bSChristian Brunner buf = qemu_malloc(len); 489*f27aaf4bSChristian Brunner 490*f27aaf4bSChristian Brunner r = rados_read(s->header_pool, n, 0, buf, len); 491*f27aaf4bSChristian Brunner if (r < 0) { 492*f27aaf4bSChristian Brunner goto failed; 493*f27aaf4bSChristian Brunner } 494*f27aaf4bSChristian Brunner 495*f27aaf4bSChristian Brunner if (r < len) { 496*f27aaf4bSChristian Brunner goto done; 497*f27aaf4bSChristian Brunner } 498*f27aaf4bSChristian Brunner 499*f27aaf4bSChristian Brunner qemu_free(buf); 500*f27aaf4bSChristian Brunner buf = qemu_malloc(len); 501*f27aaf4bSChristian Brunner 502*f27aaf4bSChristian Brunner r = rados_stat(s->header_pool, n, &len, NULL); 503*f27aaf4bSChristian Brunner if (r < 0) { 504*f27aaf4bSChristian Brunner goto failed; 505*f27aaf4bSChristian Brunner } 506*f27aaf4bSChristian Brunner 507*f27aaf4bSChristian Brunner r = rados_read(s->header_pool, n, 0, buf, len); 508*f27aaf4bSChristian Brunner if (r < 0) { 509*f27aaf4bSChristian Brunner goto failed; 510*f27aaf4bSChristian Brunner } 511*f27aaf4bSChristian Brunner 512*f27aaf4bSChristian Brunner done: 513*f27aaf4bSChristian Brunner *hbuf = buf; 514*f27aaf4bSChristian Brunner return 0; 515*f27aaf4bSChristian Brunner 516*f27aaf4bSChristian Brunner failed: 517*f27aaf4bSChristian Brunner qemu_free(buf); 518*f27aaf4bSChristian Brunner return r; 519*f27aaf4bSChristian Brunner } 520*f27aaf4bSChristian Brunner 521*f27aaf4bSChristian Brunner static int rbd_open(BlockDriverState *bs, const char *filename, int flags) 522*f27aaf4bSChristian Brunner { 523*f27aaf4bSChristian Brunner BDRVRBDState *s = bs->opaque; 524*f27aaf4bSChristian Brunner RbdHeader1 *header; 525*f27aaf4bSChristian Brunner char pool[RBD_MAX_SEG_NAME_SIZE]; 526*f27aaf4bSChristian Brunner char snap_buf[RBD_MAX_SEG_NAME_SIZE]; 527*f27aaf4bSChristian Brunner char *snap = NULL; 528*f27aaf4bSChristian Brunner char *hbuf = NULL; 529*f27aaf4bSChristian Brunner int r; 530*f27aaf4bSChristian Brunner 531*f27aaf4bSChristian Brunner if (rbd_parsename(filename, pool, sizeof(pool), 532*f27aaf4bSChristian Brunner snap_buf, sizeof(snap_buf), 533*f27aaf4bSChristian Brunner s->name, sizeof(s->name)) < 0) { 534*f27aaf4bSChristian Brunner return -EINVAL; 535*f27aaf4bSChristian Brunner } 536*f27aaf4bSChristian Brunner if (snap_buf[0] != '\0') { 537*f27aaf4bSChristian Brunner snap = snap_buf; 538*f27aaf4bSChristian Brunner } 539*f27aaf4bSChristian Brunner 540*f27aaf4bSChristian Brunner if ((r = rados_initialize(0, NULL)) < 0) { 541*f27aaf4bSChristian Brunner error_report("error initializing"); 542*f27aaf4bSChristian Brunner return r; 543*f27aaf4bSChristian Brunner } 544*f27aaf4bSChristian Brunner 545*f27aaf4bSChristian Brunner if ((r = rados_open_pool(pool, &s->pool))) { 546*f27aaf4bSChristian Brunner error_report("error opening pool %s", pool); 547*f27aaf4bSChristian Brunner rados_deinitialize(); 548*f27aaf4bSChristian Brunner return r; 549*f27aaf4bSChristian Brunner } 550*f27aaf4bSChristian Brunner 551*f27aaf4bSChristian Brunner if ((r = rados_open_pool(pool, &s->header_pool))) { 552*f27aaf4bSChristian Brunner error_report("error opening pool %s", pool); 553*f27aaf4bSChristian Brunner rados_deinitialize(); 554*f27aaf4bSChristian Brunner return r; 555*f27aaf4bSChristian Brunner } 556*f27aaf4bSChristian Brunner 557*f27aaf4bSChristian Brunner if ((r = rbd_read_header(s, &hbuf)) < 0) { 558*f27aaf4bSChristian Brunner error_report("error reading header from %s", s->name); 559*f27aaf4bSChristian Brunner goto failed; 560*f27aaf4bSChristian Brunner } 561*f27aaf4bSChristian Brunner 562*f27aaf4bSChristian Brunner if (memcmp(hbuf + 64, RBD_HEADER_SIGNATURE, 4)) { 563*f27aaf4bSChristian Brunner error_report("Invalid header signature"); 564*f27aaf4bSChristian Brunner r = -EMEDIUMTYPE; 565*f27aaf4bSChristian Brunner goto failed; 566*f27aaf4bSChristian Brunner } 567*f27aaf4bSChristian Brunner 568*f27aaf4bSChristian Brunner if (memcmp(hbuf + 68, RBD_HEADER_VERSION, 8)) { 569*f27aaf4bSChristian Brunner error_report("Unknown image version"); 570*f27aaf4bSChristian Brunner r = -EMEDIUMTYPE; 571*f27aaf4bSChristian Brunner goto failed; 572*f27aaf4bSChristian Brunner } 573*f27aaf4bSChristian Brunner 574*f27aaf4bSChristian Brunner header = (RbdHeader1 *) hbuf; 575*f27aaf4bSChristian Brunner s->size = le64_to_cpu(header->image_size); 576*f27aaf4bSChristian Brunner s->objsize = 1ULL << header->options.order; 577*f27aaf4bSChristian Brunner memcpy(s->block_name, header->block_name, sizeof(header->block_name)); 578*f27aaf4bSChristian Brunner 579*f27aaf4bSChristian Brunner r = rbd_set_snapc(s->pool, snap, header); 580*f27aaf4bSChristian Brunner if (r < 0) { 581*f27aaf4bSChristian Brunner error_report("failed setting snap context: %s", strerror(-r)); 582*f27aaf4bSChristian Brunner goto failed; 583*f27aaf4bSChristian Brunner } 584*f27aaf4bSChristian Brunner 585*f27aaf4bSChristian Brunner bs->read_only = (snap != NULL); 586*f27aaf4bSChristian Brunner 587*f27aaf4bSChristian Brunner s->event_reader_pos = 0; 588*f27aaf4bSChristian Brunner r = qemu_pipe(s->fds); 589*f27aaf4bSChristian Brunner if (r < 0) { 590*f27aaf4bSChristian Brunner error_report("error opening eventfd"); 591*f27aaf4bSChristian Brunner goto failed; 592*f27aaf4bSChristian Brunner } 593*f27aaf4bSChristian Brunner fcntl(s->fds[0], F_SETFL, O_NONBLOCK); 594*f27aaf4bSChristian Brunner fcntl(s->fds[1], F_SETFL, O_NONBLOCK); 595*f27aaf4bSChristian Brunner qemu_aio_set_fd_handler(s->fds[RBD_FD_READ], rbd_aio_event_reader, NULL, 596*f27aaf4bSChristian Brunner rbd_aio_flush_cb, NULL, s); 597*f27aaf4bSChristian Brunner 598*f27aaf4bSChristian Brunner qemu_free(hbuf); 599*f27aaf4bSChristian Brunner 600*f27aaf4bSChristian Brunner return 0; 601*f27aaf4bSChristian Brunner 602*f27aaf4bSChristian Brunner failed: 603*f27aaf4bSChristian Brunner qemu_free(hbuf); 604*f27aaf4bSChristian Brunner 605*f27aaf4bSChristian Brunner rados_close_pool(s->header_pool); 606*f27aaf4bSChristian Brunner rados_close_pool(s->pool); 607*f27aaf4bSChristian Brunner rados_deinitialize(); 608*f27aaf4bSChristian Brunner return r; 609*f27aaf4bSChristian Brunner } 610*f27aaf4bSChristian Brunner 611*f27aaf4bSChristian Brunner static void rbd_close(BlockDriverState *bs) 612*f27aaf4bSChristian Brunner { 613*f27aaf4bSChristian Brunner BDRVRBDState *s = bs->opaque; 614*f27aaf4bSChristian Brunner 615*f27aaf4bSChristian Brunner close(s->fds[0]); 616*f27aaf4bSChristian Brunner close(s->fds[1]); 617*f27aaf4bSChristian Brunner qemu_aio_set_fd_handler(s->fds[RBD_FD_READ], NULL , NULL, NULL, NULL, 618*f27aaf4bSChristian Brunner NULL); 619*f27aaf4bSChristian Brunner 620*f27aaf4bSChristian Brunner rados_close_pool(s->header_pool); 621*f27aaf4bSChristian Brunner rados_close_pool(s->pool); 622*f27aaf4bSChristian Brunner rados_deinitialize(); 623*f27aaf4bSChristian Brunner } 624*f27aaf4bSChristian Brunner 625*f27aaf4bSChristian Brunner /* 626*f27aaf4bSChristian Brunner * Cancel aio. Since we don't reference acb in a non qemu threads, 627*f27aaf4bSChristian Brunner * it is safe to access it here. 628*f27aaf4bSChristian Brunner */ 629*f27aaf4bSChristian Brunner static void rbd_aio_cancel(BlockDriverAIOCB *blockacb) 630*f27aaf4bSChristian Brunner { 631*f27aaf4bSChristian Brunner RBDAIOCB *acb = (RBDAIOCB *) blockacb; 632*f27aaf4bSChristian Brunner acb->cancelled = 1; 633*f27aaf4bSChristian Brunner } 634*f27aaf4bSChristian Brunner 635*f27aaf4bSChristian Brunner static AIOPool rbd_aio_pool = { 636*f27aaf4bSChristian Brunner .aiocb_size = sizeof(RBDAIOCB), 637*f27aaf4bSChristian Brunner .cancel = rbd_aio_cancel, 638*f27aaf4bSChristian Brunner }; 639*f27aaf4bSChristian Brunner 640*f27aaf4bSChristian Brunner /* 641*f27aaf4bSChristian Brunner * This is the callback function for rados_aio_read and _write 642*f27aaf4bSChristian Brunner * 643*f27aaf4bSChristian Brunner * Note: this function is being called from a non qemu thread so 644*f27aaf4bSChristian Brunner * we need to be careful about what we do here. Generally we only 645*f27aaf4bSChristian Brunner * write to the block notification pipe, and do the rest of the 646*f27aaf4bSChristian Brunner * io completion handling from rbd_aio_event_reader() which 647*f27aaf4bSChristian Brunner * runs in a qemu context. 648*f27aaf4bSChristian Brunner */ 649*f27aaf4bSChristian Brunner static void rbd_finish_aiocb(rados_completion_t c, RADOSCB *rcb) 650*f27aaf4bSChristian Brunner { 651*f27aaf4bSChristian Brunner int ret; 652*f27aaf4bSChristian Brunner rcb->ret = rados_aio_get_return_value(c); 653*f27aaf4bSChristian Brunner rados_aio_release(c); 654*f27aaf4bSChristian Brunner while (1) { 655*f27aaf4bSChristian Brunner fd_set wfd; 656*f27aaf4bSChristian Brunner int fd = rcb->s->fds[RBD_FD_WRITE]; 657*f27aaf4bSChristian Brunner 658*f27aaf4bSChristian Brunner /* send the rcb pointer to the qemu thread that is responsible 659*f27aaf4bSChristian Brunner for the aio completion. Must do it in a qemu thread context */ 660*f27aaf4bSChristian Brunner ret = write(fd, (void *)&rcb, sizeof(rcb)); 661*f27aaf4bSChristian Brunner if (ret >= 0) { 662*f27aaf4bSChristian Brunner break; 663*f27aaf4bSChristian Brunner } 664*f27aaf4bSChristian Brunner if (errno == EINTR) { 665*f27aaf4bSChristian Brunner continue; 666*f27aaf4bSChristian Brunner } 667*f27aaf4bSChristian Brunner if (errno != EAGAIN) { 668*f27aaf4bSChristian Brunner break; 669*f27aaf4bSChristian Brunner } 670*f27aaf4bSChristian Brunner 671*f27aaf4bSChristian Brunner FD_ZERO(&wfd); 672*f27aaf4bSChristian Brunner FD_SET(fd, &wfd); 673*f27aaf4bSChristian Brunner do { 674*f27aaf4bSChristian Brunner ret = select(fd + 1, NULL, &wfd, NULL, NULL); 675*f27aaf4bSChristian Brunner } while (ret < 0 && errno == EINTR); 676*f27aaf4bSChristian Brunner } 677*f27aaf4bSChristian Brunner 678*f27aaf4bSChristian Brunner if (ret < 0) { 679*f27aaf4bSChristian Brunner error_report("failed writing to acb->s->fds\n"); 680*f27aaf4bSChristian Brunner qemu_free(rcb); 681*f27aaf4bSChristian Brunner } 682*f27aaf4bSChristian Brunner } 683*f27aaf4bSChristian Brunner 684*f27aaf4bSChristian Brunner /* Callback when all queued rados_aio requests are complete */ 685*f27aaf4bSChristian Brunner 686*f27aaf4bSChristian Brunner static void rbd_aio_bh_cb(void *opaque) 687*f27aaf4bSChristian Brunner { 688*f27aaf4bSChristian Brunner RBDAIOCB *acb = opaque; 689*f27aaf4bSChristian Brunner 690*f27aaf4bSChristian Brunner if (!acb->write) { 691*f27aaf4bSChristian Brunner qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size); 692*f27aaf4bSChristian Brunner } 693*f27aaf4bSChristian Brunner qemu_vfree(acb->bounce); 694*f27aaf4bSChristian Brunner acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret)); 695*f27aaf4bSChristian Brunner qemu_bh_delete(acb->bh); 696*f27aaf4bSChristian Brunner acb->bh = NULL; 697*f27aaf4bSChristian Brunner 698*f27aaf4bSChristian Brunner qemu_aio_release(acb); 699*f27aaf4bSChristian Brunner } 700*f27aaf4bSChristian Brunner 701*f27aaf4bSChristian Brunner static BlockDriverAIOCB *rbd_aio_rw_vector(BlockDriverState *bs, 702*f27aaf4bSChristian Brunner int64_t sector_num, 703*f27aaf4bSChristian Brunner QEMUIOVector *qiov, 704*f27aaf4bSChristian Brunner int nb_sectors, 705*f27aaf4bSChristian Brunner BlockDriverCompletionFunc *cb, 706*f27aaf4bSChristian Brunner void *opaque, int write) 707*f27aaf4bSChristian Brunner { 708*f27aaf4bSChristian Brunner RBDAIOCB *acb; 709*f27aaf4bSChristian Brunner RADOSCB *rcb; 710*f27aaf4bSChristian Brunner rados_completion_t c; 711*f27aaf4bSChristian Brunner char n[RBD_MAX_SEG_NAME_SIZE]; 712*f27aaf4bSChristian Brunner int64_t segnr, segoffs, segsize, last_segnr; 713*f27aaf4bSChristian Brunner int64_t off, size; 714*f27aaf4bSChristian Brunner char *buf; 715*f27aaf4bSChristian Brunner 716*f27aaf4bSChristian Brunner BDRVRBDState *s = bs->opaque; 717*f27aaf4bSChristian Brunner 718*f27aaf4bSChristian Brunner acb = qemu_aio_get(&rbd_aio_pool, bs, cb, opaque); 719*f27aaf4bSChristian Brunner acb->write = write; 720*f27aaf4bSChristian Brunner acb->qiov = qiov; 721*f27aaf4bSChristian Brunner acb->bounce = qemu_blockalign(bs, qiov->size); 722*f27aaf4bSChristian Brunner acb->aiocnt = 0; 723*f27aaf4bSChristian Brunner acb->ret = 0; 724*f27aaf4bSChristian Brunner acb->error = 0; 725*f27aaf4bSChristian Brunner acb->s = s; 726*f27aaf4bSChristian Brunner acb->cancelled = 0; 727*f27aaf4bSChristian Brunner acb->bh = NULL; 728*f27aaf4bSChristian Brunner 729*f27aaf4bSChristian Brunner if (write) { 730*f27aaf4bSChristian Brunner qemu_iovec_to_buffer(acb->qiov, acb->bounce); 731*f27aaf4bSChristian Brunner } 732*f27aaf4bSChristian Brunner 733*f27aaf4bSChristian Brunner buf = acb->bounce; 734*f27aaf4bSChristian Brunner 735*f27aaf4bSChristian Brunner off = sector_num * BDRV_SECTOR_SIZE; 736*f27aaf4bSChristian Brunner size = nb_sectors * BDRV_SECTOR_SIZE; 737*f27aaf4bSChristian Brunner segnr = off / s->objsize; 738*f27aaf4bSChristian Brunner segoffs = off % s->objsize; 739*f27aaf4bSChristian Brunner segsize = s->objsize - segoffs; 740*f27aaf4bSChristian Brunner 741*f27aaf4bSChristian Brunner last_segnr = ((off + size - 1) / s->objsize); 742*f27aaf4bSChristian Brunner acb->aiocnt = (last_segnr - segnr) + 1; 743*f27aaf4bSChristian Brunner 744*f27aaf4bSChristian Brunner s->qemu_aio_count += acb->aiocnt; /* All the RADOSCB */ 745*f27aaf4bSChristian Brunner 746*f27aaf4bSChristian Brunner while (size > 0) { 747*f27aaf4bSChristian Brunner if (size < segsize) { 748*f27aaf4bSChristian Brunner segsize = size; 749*f27aaf4bSChristian Brunner } 750*f27aaf4bSChristian Brunner 751*f27aaf4bSChristian Brunner snprintf(n, sizeof(n), "%s.%012" PRIx64, s->block_name, 752*f27aaf4bSChristian Brunner segnr); 753*f27aaf4bSChristian Brunner 754*f27aaf4bSChristian Brunner rcb = qemu_malloc(sizeof(RADOSCB)); 755*f27aaf4bSChristian Brunner rcb->done = 0; 756*f27aaf4bSChristian Brunner rcb->acb = acb; 757*f27aaf4bSChristian Brunner rcb->segsize = segsize; 758*f27aaf4bSChristian Brunner rcb->buf = buf; 759*f27aaf4bSChristian Brunner rcb->s = acb->s; 760*f27aaf4bSChristian Brunner 761*f27aaf4bSChristian Brunner if (write) { 762*f27aaf4bSChristian Brunner rados_aio_create_completion(rcb, NULL, 763*f27aaf4bSChristian Brunner (rados_callback_t) rbd_finish_aiocb, 764*f27aaf4bSChristian Brunner &c); 765*f27aaf4bSChristian Brunner rados_aio_write(s->pool, n, segoffs, buf, segsize, c); 766*f27aaf4bSChristian Brunner } else { 767*f27aaf4bSChristian Brunner rados_aio_create_completion(rcb, 768*f27aaf4bSChristian Brunner (rados_callback_t) rbd_finish_aiocb, 769*f27aaf4bSChristian Brunner NULL, &c); 770*f27aaf4bSChristian Brunner rados_aio_read(s->pool, n, segoffs, buf, segsize, c); 771*f27aaf4bSChristian Brunner } 772*f27aaf4bSChristian Brunner 773*f27aaf4bSChristian Brunner buf += segsize; 774*f27aaf4bSChristian Brunner size -= segsize; 775*f27aaf4bSChristian Brunner segoffs = 0; 776*f27aaf4bSChristian Brunner segsize = s->objsize; 777*f27aaf4bSChristian Brunner segnr++; 778*f27aaf4bSChristian Brunner } 779*f27aaf4bSChristian Brunner 780*f27aaf4bSChristian Brunner return &acb->common; 781*f27aaf4bSChristian Brunner } 782*f27aaf4bSChristian Brunner 783*f27aaf4bSChristian Brunner static BlockDriverAIOCB *rbd_aio_readv(BlockDriverState * bs, 784*f27aaf4bSChristian Brunner int64_t sector_num, QEMUIOVector * qiov, 785*f27aaf4bSChristian Brunner int nb_sectors, 786*f27aaf4bSChristian Brunner BlockDriverCompletionFunc * cb, 787*f27aaf4bSChristian Brunner void *opaque) 788*f27aaf4bSChristian Brunner { 789*f27aaf4bSChristian Brunner return rbd_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); 790*f27aaf4bSChristian Brunner } 791*f27aaf4bSChristian Brunner 792*f27aaf4bSChristian Brunner static BlockDriverAIOCB *rbd_aio_writev(BlockDriverState * bs, 793*f27aaf4bSChristian Brunner int64_t sector_num, QEMUIOVector * qiov, 794*f27aaf4bSChristian Brunner int nb_sectors, 795*f27aaf4bSChristian Brunner BlockDriverCompletionFunc * cb, 796*f27aaf4bSChristian Brunner void *opaque) 797*f27aaf4bSChristian Brunner { 798*f27aaf4bSChristian Brunner return rbd_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); 799*f27aaf4bSChristian Brunner } 800*f27aaf4bSChristian Brunner 801*f27aaf4bSChristian Brunner static int rbd_getinfo(BlockDriverState * bs, BlockDriverInfo * bdi) 802*f27aaf4bSChristian Brunner { 803*f27aaf4bSChristian Brunner BDRVRBDState *s = bs->opaque; 804*f27aaf4bSChristian Brunner bdi->cluster_size = s->objsize; 805*f27aaf4bSChristian Brunner return 0; 806*f27aaf4bSChristian Brunner } 807*f27aaf4bSChristian Brunner 808*f27aaf4bSChristian Brunner static int64_t rbd_getlength(BlockDriverState * bs) 809*f27aaf4bSChristian Brunner { 810*f27aaf4bSChristian Brunner BDRVRBDState *s = bs->opaque; 811*f27aaf4bSChristian Brunner 812*f27aaf4bSChristian Brunner return s->size; 813*f27aaf4bSChristian Brunner } 814*f27aaf4bSChristian Brunner 815*f27aaf4bSChristian Brunner static int rbd_snap_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) 816*f27aaf4bSChristian Brunner { 817*f27aaf4bSChristian Brunner BDRVRBDState *s = bs->opaque; 818*f27aaf4bSChristian Brunner char inbuf[512], outbuf[128]; 819*f27aaf4bSChristian Brunner uint64_t snap_id; 820*f27aaf4bSChristian Brunner int r; 821*f27aaf4bSChristian Brunner char *p = inbuf; 822*f27aaf4bSChristian Brunner char *end = inbuf + sizeof(inbuf); 823*f27aaf4bSChristian Brunner char n[RBD_MAX_SEG_NAME_SIZE]; 824*f27aaf4bSChristian Brunner char *hbuf = NULL; 825*f27aaf4bSChristian Brunner RbdHeader1 *header; 826*f27aaf4bSChristian Brunner 827*f27aaf4bSChristian Brunner if (sn_info->name[0] == '\0') { 828*f27aaf4bSChristian Brunner return -EINVAL; /* we need a name for rbd snapshots */ 829*f27aaf4bSChristian Brunner } 830*f27aaf4bSChristian Brunner 831*f27aaf4bSChristian Brunner /* 832*f27aaf4bSChristian Brunner * rbd snapshots are using the name as the user controlled unique identifier 833*f27aaf4bSChristian Brunner * we can't use the rbd snapid for that purpose, as it can't be set 834*f27aaf4bSChristian Brunner */ 835*f27aaf4bSChristian Brunner if (sn_info->id_str[0] != '\0' && 836*f27aaf4bSChristian Brunner strcmp(sn_info->id_str, sn_info->name) != 0) { 837*f27aaf4bSChristian Brunner return -EINVAL; 838*f27aaf4bSChristian Brunner } 839*f27aaf4bSChristian Brunner 840*f27aaf4bSChristian Brunner if (strlen(sn_info->name) >= sizeof(sn_info->id_str)) { 841*f27aaf4bSChristian Brunner return -ERANGE; 842*f27aaf4bSChristian Brunner } 843*f27aaf4bSChristian Brunner 844*f27aaf4bSChristian Brunner r = rados_selfmanaged_snap_create(s->header_pool, &snap_id); 845*f27aaf4bSChristian Brunner if (r < 0) { 846*f27aaf4bSChristian Brunner error_report("failed to create snap id: %s", strerror(-r)); 847*f27aaf4bSChristian Brunner return r; 848*f27aaf4bSChristian Brunner } 849*f27aaf4bSChristian Brunner 850*f27aaf4bSChristian Brunner *(uint32_t *)p = strlen(sn_info->name); 851*f27aaf4bSChristian Brunner cpu_to_le32s((uint32_t *)p); 852*f27aaf4bSChristian Brunner p += sizeof(uint32_t); 853*f27aaf4bSChristian Brunner strncpy(p, sn_info->name, end - p); 854*f27aaf4bSChristian Brunner p += strlen(p); 855*f27aaf4bSChristian Brunner if (p + sizeof(snap_id) > end) { 856*f27aaf4bSChristian Brunner error_report("invalid input parameter"); 857*f27aaf4bSChristian Brunner return -EINVAL; 858*f27aaf4bSChristian Brunner } 859*f27aaf4bSChristian Brunner 860*f27aaf4bSChristian Brunner *(uint64_t *)p = snap_id; 861*f27aaf4bSChristian Brunner cpu_to_le64s((uint64_t *)p); 862*f27aaf4bSChristian Brunner 863*f27aaf4bSChristian Brunner snprintf(n, sizeof(n), "%s%s", s->name, RBD_SUFFIX); 864*f27aaf4bSChristian Brunner 865*f27aaf4bSChristian Brunner r = rados_exec(s->header_pool, n, "rbd", "snap_add", inbuf, 866*f27aaf4bSChristian Brunner sizeof(inbuf), outbuf, sizeof(outbuf)); 867*f27aaf4bSChristian Brunner if (r < 0) { 868*f27aaf4bSChristian Brunner error_report("rbd.snap_add execution failed failed: %s", strerror(-r)); 869*f27aaf4bSChristian Brunner return r; 870*f27aaf4bSChristian Brunner } 871*f27aaf4bSChristian Brunner 872*f27aaf4bSChristian Brunner sprintf(sn_info->id_str, "%s", sn_info->name); 873*f27aaf4bSChristian Brunner 874*f27aaf4bSChristian Brunner r = rbd_read_header(s, &hbuf); 875*f27aaf4bSChristian Brunner if (r < 0) { 876*f27aaf4bSChristian Brunner error_report("failed reading header: %s", strerror(-r)); 877*f27aaf4bSChristian Brunner return r; 878*f27aaf4bSChristian Brunner } 879*f27aaf4bSChristian Brunner 880*f27aaf4bSChristian Brunner header = (RbdHeader1 *) hbuf; 881*f27aaf4bSChristian Brunner r = rbd_set_snapc(s->pool, sn_info->name, header); 882*f27aaf4bSChristian Brunner if (r < 0) { 883*f27aaf4bSChristian Brunner error_report("failed setting snap context: %s", strerror(-r)); 884*f27aaf4bSChristian Brunner goto failed; 885*f27aaf4bSChristian Brunner } 886*f27aaf4bSChristian Brunner 887*f27aaf4bSChristian Brunner return 0; 888*f27aaf4bSChristian Brunner 889*f27aaf4bSChristian Brunner failed: 890*f27aaf4bSChristian Brunner qemu_free(header); 891*f27aaf4bSChristian Brunner return r; 892*f27aaf4bSChristian Brunner } 893*f27aaf4bSChristian Brunner 894*f27aaf4bSChristian Brunner static int decode32(char **p, const char *end, uint32_t *v) 895*f27aaf4bSChristian Brunner { 896*f27aaf4bSChristian Brunner if (*p + 4 > end) { 897*f27aaf4bSChristian Brunner return -ERANGE; 898*f27aaf4bSChristian Brunner } 899*f27aaf4bSChristian Brunner 900*f27aaf4bSChristian Brunner *v = *(uint32_t *)(*p); 901*f27aaf4bSChristian Brunner le32_to_cpus(v); 902*f27aaf4bSChristian Brunner *p += 4; 903*f27aaf4bSChristian Brunner return 0; 904*f27aaf4bSChristian Brunner } 905*f27aaf4bSChristian Brunner 906*f27aaf4bSChristian Brunner static int decode64(char **p, const char *end, uint64_t *v) 907*f27aaf4bSChristian Brunner { 908*f27aaf4bSChristian Brunner if (*p + 8 > end) { 909*f27aaf4bSChristian Brunner return -ERANGE; 910*f27aaf4bSChristian Brunner } 911*f27aaf4bSChristian Brunner 912*f27aaf4bSChristian Brunner *v = *(uint64_t *)(*p); 913*f27aaf4bSChristian Brunner le64_to_cpus(v); 914*f27aaf4bSChristian Brunner *p += 8; 915*f27aaf4bSChristian Brunner return 0; 916*f27aaf4bSChristian Brunner } 917*f27aaf4bSChristian Brunner 918*f27aaf4bSChristian Brunner static int decode_str(char **p, const char *end, char **s) 919*f27aaf4bSChristian Brunner { 920*f27aaf4bSChristian Brunner uint32_t len; 921*f27aaf4bSChristian Brunner int r; 922*f27aaf4bSChristian Brunner 923*f27aaf4bSChristian Brunner if ((r = decode32(p, end, &len)) < 0) { 924*f27aaf4bSChristian Brunner return r; 925*f27aaf4bSChristian Brunner } 926*f27aaf4bSChristian Brunner 927*f27aaf4bSChristian Brunner *s = qemu_malloc(len + 1); 928*f27aaf4bSChristian Brunner memcpy(*s, *p, len); 929*f27aaf4bSChristian Brunner *p += len; 930*f27aaf4bSChristian Brunner (*s)[len] = '\0'; 931*f27aaf4bSChristian Brunner 932*f27aaf4bSChristian Brunner return len; 933*f27aaf4bSChristian Brunner } 934*f27aaf4bSChristian Brunner 935*f27aaf4bSChristian Brunner static int rbd_snap_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab) 936*f27aaf4bSChristian Brunner { 937*f27aaf4bSChristian Brunner BDRVRBDState *s = bs->opaque; 938*f27aaf4bSChristian Brunner char n[RBD_MAX_SEG_NAME_SIZE]; 939*f27aaf4bSChristian Brunner QEMUSnapshotInfo *sn_info, *sn_tab = NULL; 940*f27aaf4bSChristian Brunner RbdHeader1 *header; 941*f27aaf4bSChristian Brunner char *hbuf = NULL; 942*f27aaf4bSChristian Brunner char *outbuf = NULL, *end, *buf; 943*f27aaf4bSChristian Brunner uint64_t len; 944*f27aaf4bSChristian Brunner uint64_t snap_seq; 945*f27aaf4bSChristian Brunner uint32_t snap_count; 946*f27aaf4bSChristian Brunner int r, i; 947*f27aaf4bSChristian Brunner 948*f27aaf4bSChristian Brunner /* read header to estimate how much space we need to read the snap 949*f27aaf4bSChristian Brunner * list */ 950*f27aaf4bSChristian Brunner if ((r = rbd_read_header(s, &hbuf)) < 0) { 951*f27aaf4bSChristian Brunner goto done_err; 952*f27aaf4bSChristian Brunner } 953*f27aaf4bSChristian Brunner header = (RbdHeader1 *)hbuf; 954*f27aaf4bSChristian Brunner len = le64_to_cpu(header->snap_names_len); 955*f27aaf4bSChristian Brunner len += 1024; /* should have already been enough, but new snapshots might 956*f27aaf4bSChristian Brunner already been created since we read the header. just allocate 957*f27aaf4bSChristian Brunner a bit more, so that in most cases it'll suffice anyway */ 958*f27aaf4bSChristian Brunner qemu_free(hbuf); 959*f27aaf4bSChristian Brunner 960*f27aaf4bSChristian Brunner snprintf(n, sizeof(n), "%s%s", s->name, RBD_SUFFIX); 961*f27aaf4bSChristian Brunner while (1) { 962*f27aaf4bSChristian Brunner qemu_free(outbuf); 963*f27aaf4bSChristian Brunner outbuf = qemu_malloc(len); 964*f27aaf4bSChristian Brunner 965*f27aaf4bSChristian Brunner r = rados_exec(s->header_pool, n, "rbd", "snap_list", NULL, 0, 966*f27aaf4bSChristian Brunner outbuf, len); 967*f27aaf4bSChristian Brunner if (r < 0) { 968*f27aaf4bSChristian Brunner error_report("rbd.snap_list execution failed failed: %s", strerror(-r)); 969*f27aaf4bSChristian Brunner goto done_err; 970*f27aaf4bSChristian Brunner } 971*f27aaf4bSChristian Brunner if (r != len) { 972*f27aaf4bSChristian Brunner break; 973*f27aaf4bSChristian Brunner } 974*f27aaf4bSChristian Brunner 975*f27aaf4bSChristian Brunner /* if we're here, we probably raced with some snaps creation */ 976*f27aaf4bSChristian Brunner len *= 2; 977*f27aaf4bSChristian Brunner } 978*f27aaf4bSChristian Brunner buf = outbuf; 979*f27aaf4bSChristian Brunner end = buf + len; 980*f27aaf4bSChristian Brunner 981*f27aaf4bSChristian Brunner if ((r = decode64(&buf, end, &snap_seq)) < 0) { 982*f27aaf4bSChristian Brunner goto done_err; 983*f27aaf4bSChristian Brunner } 984*f27aaf4bSChristian Brunner if ((r = decode32(&buf, end, &snap_count)) < 0) { 985*f27aaf4bSChristian Brunner goto done_err; 986*f27aaf4bSChristian Brunner } 987*f27aaf4bSChristian Brunner 988*f27aaf4bSChristian Brunner sn_tab = qemu_mallocz(snap_count * sizeof(QEMUSnapshotInfo)); 989*f27aaf4bSChristian Brunner for (i = 0; i < snap_count; i++) { 990*f27aaf4bSChristian Brunner uint64_t id, image_size; 991*f27aaf4bSChristian Brunner char *snap_name; 992*f27aaf4bSChristian Brunner 993*f27aaf4bSChristian Brunner if ((r = decode64(&buf, end, &id)) < 0) { 994*f27aaf4bSChristian Brunner goto done_err; 995*f27aaf4bSChristian Brunner } 996*f27aaf4bSChristian Brunner if ((r = decode64(&buf, end, &image_size)) < 0) { 997*f27aaf4bSChristian Brunner goto done_err; 998*f27aaf4bSChristian Brunner } 999*f27aaf4bSChristian Brunner if ((r = decode_str(&buf, end, &snap_name)) < 0) { 1000*f27aaf4bSChristian Brunner goto done_err; 1001*f27aaf4bSChristian Brunner } 1002*f27aaf4bSChristian Brunner 1003*f27aaf4bSChristian Brunner sn_info = sn_tab + i; 1004*f27aaf4bSChristian Brunner pstrcpy(sn_info->id_str, sizeof(sn_info->id_str), snap_name); 1005*f27aaf4bSChristian Brunner pstrcpy(sn_info->name, sizeof(sn_info->name), snap_name); 1006*f27aaf4bSChristian Brunner qemu_free(snap_name); 1007*f27aaf4bSChristian Brunner 1008*f27aaf4bSChristian Brunner sn_info->vm_state_size = image_size; 1009*f27aaf4bSChristian Brunner sn_info->date_sec = 0; 1010*f27aaf4bSChristian Brunner sn_info->date_nsec = 0; 1011*f27aaf4bSChristian Brunner sn_info->vm_clock_nsec = 0; 1012*f27aaf4bSChristian Brunner } 1013*f27aaf4bSChristian Brunner *psn_tab = sn_tab; 1014*f27aaf4bSChristian Brunner qemu_free(outbuf); 1015*f27aaf4bSChristian Brunner return snap_count; 1016*f27aaf4bSChristian Brunner done_err: 1017*f27aaf4bSChristian Brunner qemu_free(sn_tab); 1018*f27aaf4bSChristian Brunner qemu_free(outbuf); 1019*f27aaf4bSChristian Brunner return r; 1020*f27aaf4bSChristian Brunner } 1021*f27aaf4bSChristian Brunner 1022*f27aaf4bSChristian Brunner static QEMUOptionParameter rbd_create_options[] = { 1023*f27aaf4bSChristian Brunner { 1024*f27aaf4bSChristian Brunner .name = BLOCK_OPT_SIZE, 1025*f27aaf4bSChristian Brunner .type = OPT_SIZE, 1026*f27aaf4bSChristian Brunner .help = "Virtual disk size" 1027*f27aaf4bSChristian Brunner }, 1028*f27aaf4bSChristian Brunner { 1029*f27aaf4bSChristian Brunner .name = BLOCK_OPT_CLUSTER_SIZE, 1030*f27aaf4bSChristian Brunner .type = OPT_SIZE, 1031*f27aaf4bSChristian Brunner .help = "RBD object size" 1032*f27aaf4bSChristian Brunner }, 1033*f27aaf4bSChristian Brunner {NULL} 1034*f27aaf4bSChristian Brunner }; 1035*f27aaf4bSChristian Brunner 1036*f27aaf4bSChristian Brunner static BlockDriver bdrv_rbd = { 1037*f27aaf4bSChristian Brunner .format_name = "rbd", 1038*f27aaf4bSChristian Brunner .instance_size = sizeof(BDRVRBDState), 1039*f27aaf4bSChristian Brunner .bdrv_file_open = rbd_open, 1040*f27aaf4bSChristian Brunner .bdrv_close = rbd_close, 1041*f27aaf4bSChristian Brunner .bdrv_create = rbd_create, 1042*f27aaf4bSChristian Brunner .bdrv_get_info = rbd_getinfo, 1043*f27aaf4bSChristian Brunner .create_options = rbd_create_options, 1044*f27aaf4bSChristian Brunner .bdrv_getlength = rbd_getlength, 1045*f27aaf4bSChristian Brunner .protocol_name = "rbd", 1046*f27aaf4bSChristian Brunner 1047*f27aaf4bSChristian Brunner .bdrv_aio_readv = rbd_aio_readv, 1048*f27aaf4bSChristian Brunner .bdrv_aio_writev = rbd_aio_writev, 1049*f27aaf4bSChristian Brunner 1050*f27aaf4bSChristian Brunner .bdrv_snapshot_create = rbd_snap_create, 1051*f27aaf4bSChristian Brunner .bdrv_snapshot_list = rbd_snap_list, 1052*f27aaf4bSChristian Brunner }; 1053*f27aaf4bSChristian Brunner 1054*f27aaf4bSChristian Brunner static void bdrv_rbd_init(void) 1055*f27aaf4bSChristian Brunner { 1056*f27aaf4bSChristian Brunner bdrv_register(&bdrv_rbd); 1057*f27aaf4bSChristian Brunner } 1058*f27aaf4bSChristian Brunner 1059*f27aaf4bSChristian Brunner block_init(bdrv_rbd_init); 1060