1 /* 2 * QEMU NVM Express Virtual Namespace 3 * 4 * Copyright (c) 2019 CNEX Labs 5 * Copyright (c) 2020 Samsung Electronics 6 * 7 * Authors: 8 * Klaus Jensen <k.jensen@samsung.com> 9 * 10 * This work is licensed under the terms of the GNU GPL, version 2. See the 11 * COPYING file in the top-level directory. 12 * 13 */ 14 15 #include "qemu/osdep.h" 16 #include "qemu/units.h" 17 #include "qemu/error-report.h" 18 #include "qapi/error.h" 19 #include "sysemu/sysemu.h" 20 #include "sysemu/block-backend.h" 21 22 #include "nvme.h" 23 #include "trace.h" 24 25 #define MIN_DISCARD_GRANULARITY (4 * KiB) 26 #define NVME_DEFAULT_ZONE_SIZE (128 * MiB) 27 28 void nvme_ns_init_format(NvmeNamespace *ns) 29 { 30 NvmeIdNs *id_ns = &ns->id_ns; 31 BlockDriverInfo bdi; 32 int npdg, nlbas, ret; 33 34 ns->lbaf = id_ns->lbaf[NVME_ID_NS_FLBAS_INDEX(id_ns->flbas)]; 35 ns->lbasz = 1 << ns->lbaf.ds; 36 37 nlbas = ns->size / (ns->lbasz + ns->lbaf.ms); 38 39 id_ns->nsze = cpu_to_le64(nlbas); 40 41 /* no thin provisioning */ 42 id_ns->ncap = id_ns->nsze; 43 id_ns->nuse = id_ns->ncap; 44 45 ns->moff = (int64_t)nlbas << ns->lbaf.ds; 46 47 npdg = ns->blkconf.discard_granularity / ns->lbasz; 48 49 ret = bdrv_get_info(blk_bs(ns->blkconf.blk), &bdi); 50 if (ret >= 0 && bdi.cluster_size > ns->blkconf.discard_granularity) { 51 npdg = bdi.cluster_size / ns->lbasz; 52 } 53 54 id_ns->npda = id_ns->npdg = npdg - 1; 55 } 56 57 static int nvme_ns_init(NvmeNamespace *ns, Error **errp) 58 { 59 NvmeIdNs *id_ns = &ns->id_ns; 60 uint8_t ds; 61 uint16_t ms; 62 int i; 63 64 ns->csi = NVME_CSI_NVM; 65 ns->status = 0x0; 66 67 ns->id_ns.dlfeat = 0x1; 68 69 /* support DULBE and I/O optimization fields */ 70 id_ns->nsfeat |= (0x4 | 0x10); 71 72 if (ns->params.shared) { 73 id_ns->nmic |= NVME_NMIC_NS_SHARED; 74 } 75 76 /* simple copy */ 77 id_ns->mssrl = cpu_to_le16(ns->params.mssrl); 78 id_ns->mcl = cpu_to_le32(ns->params.mcl); 79 id_ns->msrc = ns->params.msrc; 80 81 ds = 31 - clz32(ns->blkconf.logical_block_size); 82 ms = ns->params.ms; 83 84 if (ns->params.ms) { 85 id_ns->mc = 0x3; 86 87 if (ns->params.mset) { 88 id_ns->flbas |= 0x10; 89 } 90 91 id_ns->dpc = 0x1f; 92 id_ns->dps = ((ns->params.pil & 0x1) << 3) | ns->params.pi; 93 94 NvmeLBAF lbaf[16] = { 95 [0] = { .ds = 9 }, 96 [1] = { .ds = 9, .ms = 8 }, 97 [2] = { .ds = 9, .ms = 16 }, 98 [3] = { .ds = 9, .ms = 64 }, 99 [4] = { .ds = 12 }, 100 [5] = { .ds = 12, .ms = 8 }, 101 [6] = { .ds = 12, .ms = 16 }, 102 [7] = { .ds = 12, .ms = 64 }, 103 }; 104 105 memcpy(&id_ns->lbaf, &lbaf, sizeof(lbaf)); 106 id_ns->nlbaf = 7; 107 } else { 108 NvmeLBAF lbaf[16] = { 109 [0] = { .ds = 9 }, 110 [1] = { .ds = 12 }, 111 }; 112 113 memcpy(&id_ns->lbaf, &lbaf, sizeof(lbaf)); 114 id_ns->nlbaf = 1; 115 } 116 117 for (i = 0; i <= id_ns->nlbaf; i++) { 118 NvmeLBAF *lbaf = &id_ns->lbaf[i]; 119 if (lbaf->ds == ds) { 120 if (lbaf->ms == ms) { 121 id_ns->flbas |= i; 122 goto lbaf_found; 123 } 124 } 125 } 126 127 /* add non-standard lba format */ 128 id_ns->nlbaf++; 129 id_ns->lbaf[id_ns->nlbaf].ds = ds; 130 id_ns->lbaf[id_ns->nlbaf].ms = ms; 131 id_ns->flbas |= id_ns->nlbaf; 132 133 lbaf_found: 134 nvme_ns_init_format(ns); 135 136 return 0; 137 } 138 139 static int nvme_ns_init_blk(NvmeNamespace *ns, Error **errp) 140 { 141 bool read_only; 142 143 if (!blkconf_blocksizes(&ns->blkconf, errp)) { 144 return -1; 145 } 146 147 read_only = !blk_supports_write_perm(ns->blkconf.blk); 148 if (!blkconf_apply_backend_options(&ns->blkconf, read_only, false, errp)) { 149 return -1; 150 } 151 152 if (ns->blkconf.discard_granularity == -1) { 153 ns->blkconf.discard_granularity = 154 MAX(ns->blkconf.logical_block_size, MIN_DISCARD_GRANULARITY); 155 } 156 157 ns->size = blk_getlength(ns->blkconf.blk); 158 if (ns->size < 0) { 159 error_setg_errno(errp, -ns->size, "could not get blockdev size"); 160 return -1; 161 } 162 163 return 0; 164 } 165 166 static int nvme_ns_zoned_check_calc_geometry(NvmeNamespace *ns, Error **errp) 167 { 168 uint64_t zone_size, zone_cap; 169 170 /* Make sure that the values of ZNS properties are sane */ 171 if (ns->params.zone_size_bs) { 172 zone_size = ns->params.zone_size_bs; 173 } else { 174 zone_size = NVME_DEFAULT_ZONE_SIZE; 175 } 176 if (ns->params.zone_cap_bs) { 177 zone_cap = ns->params.zone_cap_bs; 178 } else { 179 zone_cap = zone_size; 180 } 181 if (zone_cap > zone_size) { 182 error_setg(errp, "zone capacity %"PRIu64"B exceeds " 183 "zone size %"PRIu64"B", zone_cap, zone_size); 184 return -1; 185 } 186 if (zone_size < ns->lbasz) { 187 error_setg(errp, "zone size %"PRIu64"B too small, " 188 "must be at least %zuB", zone_size, ns->lbasz); 189 return -1; 190 } 191 if (zone_cap < ns->lbasz) { 192 error_setg(errp, "zone capacity %"PRIu64"B too small, " 193 "must be at least %zuB", zone_cap, ns->lbasz); 194 return -1; 195 } 196 197 /* 198 * Save the main zone geometry values to avoid 199 * calculating them later again. 200 */ 201 ns->zone_size = zone_size / ns->lbasz; 202 ns->zone_capacity = zone_cap / ns->lbasz; 203 ns->num_zones = le64_to_cpu(ns->id_ns.nsze) / ns->zone_size; 204 205 /* Do a few more sanity checks of ZNS properties */ 206 if (!ns->num_zones) { 207 error_setg(errp, 208 "insufficient drive capacity, must be at least the size " 209 "of one zone (%"PRIu64"B)", zone_size); 210 return -1; 211 } 212 213 return 0; 214 } 215 216 static void nvme_ns_zoned_init_state(NvmeNamespace *ns) 217 { 218 uint64_t start = 0, zone_size = ns->zone_size; 219 uint64_t capacity = ns->num_zones * zone_size; 220 NvmeZone *zone; 221 int i; 222 223 ns->zone_array = g_new0(NvmeZone, ns->num_zones); 224 if (ns->params.zd_extension_size) { 225 ns->zd_extensions = g_malloc0(ns->params.zd_extension_size * 226 ns->num_zones); 227 } 228 229 QTAILQ_INIT(&ns->exp_open_zones); 230 QTAILQ_INIT(&ns->imp_open_zones); 231 QTAILQ_INIT(&ns->closed_zones); 232 QTAILQ_INIT(&ns->full_zones); 233 234 zone = ns->zone_array; 235 for (i = 0; i < ns->num_zones; i++, zone++) { 236 if (start + zone_size > capacity) { 237 zone_size = capacity - start; 238 } 239 zone->d.zt = NVME_ZONE_TYPE_SEQ_WRITE; 240 nvme_set_zone_state(zone, NVME_ZONE_STATE_EMPTY); 241 zone->d.za = 0; 242 zone->d.zcap = ns->zone_capacity; 243 zone->d.zslba = start; 244 zone->d.wp = start; 245 zone->w_ptr = start; 246 start += zone_size; 247 } 248 249 ns->zone_size_log2 = 0; 250 if (is_power_of_2(ns->zone_size)) { 251 ns->zone_size_log2 = 63 - clz64(ns->zone_size); 252 } 253 } 254 255 static void nvme_ns_init_zoned(NvmeNamespace *ns) 256 { 257 NvmeIdNsZoned *id_ns_z; 258 int i; 259 260 nvme_ns_zoned_init_state(ns); 261 262 id_ns_z = g_malloc0(sizeof(NvmeIdNsZoned)); 263 264 /* MAR/MOR are zeroes-based, FFFFFFFFFh means no limit */ 265 id_ns_z->mar = cpu_to_le32(ns->params.max_active_zones - 1); 266 id_ns_z->mor = cpu_to_le32(ns->params.max_open_zones - 1); 267 id_ns_z->zoc = 0; 268 id_ns_z->ozcs = ns->params.cross_zone_read ? 0x01 : 0x00; 269 270 for (i = 0; i <= ns->id_ns.nlbaf; i++) { 271 id_ns_z->lbafe[i].zsze = cpu_to_le64(ns->zone_size); 272 id_ns_z->lbafe[i].zdes = 273 ns->params.zd_extension_size >> 6; /* Units of 64B */ 274 } 275 276 ns->csi = NVME_CSI_ZONED; 277 ns->id_ns.nsze = cpu_to_le64(ns->num_zones * ns->zone_size); 278 ns->id_ns.ncap = ns->id_ns.nsze; 279 ns->id_ns.nuse = ns->id_ns.ncap; 280 281 /* 282 * The device uses the BDRV_BLOCK_ZERO flag to determine the "deallocated" 283 * status of logical blocks. Since the spec defines that logical blocks 284 * SHALL be deallocated when then zone is in the Empty or Offline states, 285 * we can only support DULBE if the zone size is a multiple of the 286 * calculated NPDG. 287 */ 288 if (ns->zone_size % (ns->id_ns.npdg + 1)) { 289 warn_report("the zone size (%"PRIu64" blocks) is not a multiple of " 290 "the calculated deallocation granularity (%d blocks); " 291 "DULBE support disabled", 292 ns->zone_size, ns->id_ns.npdg + 1); 293 294 ns->id_ns.nsfeat &= ~0x4; 295 } 296 297 ns->id_ns_zoned = id_ns_z; 298 } 299 300 static void nvme_clear_zone(NvmeNamespace *ns, NvmeZone *zone) 301 { 302 uint8_t state; 303 304 zone->w_ptr = zone->d.wp; 305 state = nvme_get_zone_state(zone); 306 if (zone->d.wp != zone->d.zslba || 307 (zone->d.za & NVME_ZA_ZD_EXT_VALID)) { 308 if (state != NVME_ZONE_STATE_CLOSED) { 309 trace_pci_nvme_clear_ns_close(state, zone->d.zslba); 310 nvme_set_zone_state(zone, NVME_ZONE_STATE_CLOSED); 311 } 312 nvme_aor_inc_active(ns); 313 QTAILQ_INSERT_HEAD(&ns->closed_zones, zone, entry); 314 } else { 315 trace_pci_nvme_clear_ns_reset(state, zone->d.zslba); 316 nvme_set_zone_state(zone, NVME_ZONE_STATE_EMPTY); 317 } 318 } 319 320 /* 321 * Close all the zones that are currently open. 322 */ 323 static void nvme_zoned_ns_shutdown(NvmeNamespace *ns) 324 { 325 NvmeZone *zone, *next; 326 327 QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) { 328 QTAILQ_REMOVE(&ns->closed_zones, zone, entry); 329 nvme_aor_dec_active(ns); 330 nvme_clear_zone(ns, zone); 331 } 332 QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) { 333 QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry); 334 nvme_aor_dec_open(ns); 335 nvme_aor_dec_active(ns); 336 nvme_clear_zone(ns, zone); 337 } 338 QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) { 339 QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry); 340 nvme_aor_dec_open(ns); 341 nvme_aor_dec_active(ns); 342 nvme_clear_zone(ns, zone); 343 } 344 345 assert(ns->nr_open_zones == 0); 346 } 347 348 static int nvme_ns_check_constraints(NvmeCtrl *n, NvmeNamespace *ns, 349 Error **errp) 350 { 351 if (!ns->blkconf.blk) { 352 error_setg(errp, "block backend not configured"); 353 return -1; 354 } 355 356 if (ns->params.pi && ns->params.ms < 8) { 357 error_setg(errp, "at least 8 bytes of metadata required to enable " 358 "protection information"); 359 return -1; 360 } 361 362 if (ns->params.nsid > NVME_MAX_NAMESPACES) { 363 error_setg(errp, "invalid namespace id (must be between 0 and %d)", 364 NVME_MAX_NAMESPACES); 365 return -1; 366 } 367 368 if (!n->subsys) { 369 if (ns->params.detached) { 370 error_setg(errp, "detached requires that the nvme device is " 371 "linked to an nvme-subsys device"); 372 return -1; 373 } 374 375 if (ns->params.shared) { 376 error_setg(errp, "shared requires that the nvme device is " 377 "linked to an nvme-subsys device"); 378 return -1; 379 } 380 } 381 382 if (ns->params.zoned) { 383 if (ns->params.max_active_zones) { 384 if (ns->params.max_open_zones > ns->params.max_active_zones) { 385 error_setg(errp, "max_open_zones (%u) exceeds " 386 "max_active_zones (%u)", ns->params.max_open_zones, 387 ns->params.max_active_zones); 388 return -1; 389 } 390 391 if (!ns->params.max_open_zones) { 392 ns->params.max_open_zones = ns->params.max_active_zones; 393 } 394 } 395 396 if (ns->params.zd_extension_size) { 397 if (ns->params.zd_extension_size & 0x3f) { 398 error_setg(errp, "zone descriptor extension size must be a " 399 "multiple of 64B"); 400 return -1; 401 } 402 if ((ns->params.zd_extension_size >> 6) > 0xff) { 403 error_setg(errp, 404 "zone descriptor extension size is too large"); 405 return -1; 406 } 407 } 408 } 409 410 return 0; 411 } 412 413 int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp) 414 { 415 if (nvme_ns_check_constraints(n, ns, errp)) { 416 return -1; 417 } 418 419 if (nvme_ns_init_blk(ns, errp)) { 420 return -1; 421 } 422 423 if (nvme_ns_init(ns, errp)) { 424 return -1; 425 } 426 if (ns->params.zoned) { 427 if (nvme_ns_zoned_check_calc_geometry(ns, errp) != 0) { 428 return -1; 429 } 430 nvme_ns_init_zoned(ns); 431 } 432 433 return 0; 434 } 435 436 void nvme_ns_drain(NvmeNamespace *ns) 437 { 438 blk_drain(ns->blkconf.blk); 439 } 440 441 void nvme_ns_shutdown(NvmeNamespace *ns) 442 { 443 blk_flush(ns->blkconf.blk); 444 if (ns->params.zoned) { 445 nvme_zoned_ns_shutdown(ns); 446 } 447 } 448 449 void nvme_ns_cleanup(NvmeNamespace *ns) 450 { 451 if (ns->params.zoned) { 452 g_free(ns->id_ns_zoned); 453 g_free(ns->zone_array); 454 g_free(ns->zd_extensions); 455 } 456 } 457 458 static void nvme_ns_realize(DeviceState *dev, Error **errp) 459 { 460 NvmeNamespace *ns = NVME_NS(dev); 461 BusState *s = qdev_get_parent_bus(dev); 462 NvmeCtrl *n = NVME(s->parent); 463 NvmeSubsystem *subsys = n->subsys; 464 uint32_t nsid = ns->params.nsid; 465 int i; 466 467 if (nvme_ns_setup(n, ns, errp)) { 468 return; 469 } 470 471 if (!nsid) { 472 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) { 473 if (nvme_ns(n, i) || nvme_subsys_ns(subsys, i)) { 474 continue; 475 } 476 477 nsid = ns->params.nsid = i; 478 break; 479 } 480 481 if (!nsid) { 482 error_setg(errp, "no free namespace id"); 483 return; 484 } 485 } else { 486 if (nvme_ns(n, nsid) || nvme_subsys_ns(subsys, nsid)) { 487 error_setg(errp, "namespace id '%d' already allocated", nsid); 488 return; 489 } 490 } 491 492 if (subsys) { 493 subsys->namespaces[nsid] = ns; 494 495 if (ns->params.detached) { 496 return; 497 } 498 499 if (ns->params.shared) { 500 for (i = 0; i < ARRAY_SIZE(subsys->ctrls); i++) { 501 NvmeCtrl *ctrl = subsys->ctrls[i]; 502 503 if (ctrl) { 504 nvme_attach_ns(ctrl, ns); 505 } 506 } 507 508 return; 509 } 510 } 511 512 nvme_attach_ns(n, ns); 513 } 514 515 static Property nvme_ns_props[] = { 516 DEFINE_BLOCK_PROPERTIES(NvmeNamespace, blkconf), 517 DEFINE_PROP_BOOL("detached", NvmeNamespace, params.detached, false), 518 DEFINE_PROP_BOOL("shared", NvmeNamespace, params.shared, false), 519 DEFINE_PROP_UINT32("nsid", NvmeNamespace, params.nsid, 0), 520 DEFINE_PROP_UUID("uuid", NvmeNamespace, params.uuid), 521 DEFINE_PROP_UINT16("ms", NvmeNamespace, params.ms, 0), 522 DEFINE_PROP_UINT8("mset", NvmeNamespace, params.mset, 0), 523 DEFINE_PROP_UINT8("pi", NvmeNamespace, params.pi, 0), 524 DEFINE_PROP_UINT8("pil", NvmeNamespace, params.pil, 0), 525 DEFINE_PROP_UINT16("mssrl", NvmeNamespace, params.mssrl, 128), 526 DEFINE_PROP_UINT32("mcl", NvmeNamespace, params.mcl, 128), 527 DEFINE_PROP_UINT8("msrc", NvmeNamespace, params.msrc, 127), 528 DEFINE_PROP_BOOL("zoned", NvmeNamespace, params.zoned, false), 529 DEFINE_PROP_SIZE("zoned.zone_size", NvmeNamespace, params.zone_size_bs, 530 NVME_DEFAULT_ZONE_SIZE), 531 DEFINE_PROP_SIZE("zoned.zone_capacity", NvmeNamespace, params.zone_cap_bs, 532 0), 533 DEFINE_PROP_BOOL("zoned.cross_read", NvmeNamespace, 534 params.cross_zone_read, false), 535 DEFINE_PROP_UINT32("zoned.max_active", NvmeNamespace, 536 params.max_active_zones, 0), 537 DEFINE_PROP_UINT32("zoned.max_open", NvmeNamespace, 538 params.max_open_zones, 0), 539 DEFINE_PROP_UINT32("zoned.descr_ext_size", NvmeNamespace, 540 params.zd_extension_size, 0), 541 DEFINE_PROP_END_OF_LIST(), 542 }; 543 544 static void nvme_ns_class_init(ObjectClass *oc, void *data) 545 { 546 DeviceClass *dc = DEVICE_CLASS(oc); 547 548 set_bit(DEVICE_CATEGORY_STORAGE, dc->categories); 549 550 dc->bus_type = TYPE_NVME_BUS; 551 dc->realize = nvme_ns_realize; 552 device_class_set_props(dc, nvme_ns_props); 553 dc->desc = "Virtual NVMe namespace"; 554 } 555 556 static void nvme_ns_instance_init(Object *obj) 557 { 558 NvmeNamespace *ns = NVME_NS(obj); 559 char *bootindex = g_strdup_printf("/namespace@%d,0", ns->params.nsid); 560 561 device_add_bootindex_property(obj, &ns->bootindex, "bootindex", 562 bootindex, DEVICE(obj)); 563 564 g_free(bootindex); 565 } 566 567 static const TypeInfo nvme_ns_info = { 568 .name = TYPE_NVME_NS, 569 .parent = TYPE_DEVICE, 570 .class_init = nvme_ns_class_init, 571 .instance_size = sizeof(NvmeNamespace), 572 .instance_init = nvme_ns_instance_init, 573 }; 574 575 static void nvme_ns_register_types(void) 576 { 577 type_register_static(&nvme_ns_info); 578 } 579 580 type_init(nvme_ns_register_types) 581