/*
 * QEMU NVM Express Virtual Namespace
 *
 * Copyright (c) 2019 CNEX Labs
 * Copyright (c) 2020 Samsung Electronics
 *
 * Authors:
 *  Klaus Jensen      <k.jensen@samsung.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See the
 * COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "qemu/units.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "sysemu/sysemu.h"
#include "sysemu/block-backend.h"

#include "nvme.h"
#include "trace.h"

#define MIN_DISCARD_GRANULARITY (4 * KiB)
#define NVME_DEFAULT_ZONE_SIZE  (128 * MiB)

void nvme_ns_init_format(NvmeNamespace *ns)
{
    NvmeIdNs *id_ns = &ns->id_ns;
    BlockDriverInfo bdi;
    int npdg, nlbas, ret;

    ns->lbaf = id_ns->lbaf[NVME_ID_NS_FLBAS_INDEX(id_ns->flbas)];
    ns->lbasz = 1 << ns->lbaf.ds;

    nlbas = ns->size / (ns->lbasz + ns->lbaf.ms);

    id_ns->nsze = cpu_to_le64(nlbas);

    /* no thin provisioning */
    id_ns->ncap = id_ns->nsze;
    id_ns->nuse = id_ns->ncap;

    ns->moff = (int64_t)nlbas << ns->lbaf.ds;

    npdg = ns->blkconf.discard_granularity / ns->lbasz;

    ret = bdrv_get_info(blk_bs(ns->blkconf.blk), &bdi);
    if (ret >= 0 && bdi.cluster_size > ns->blkconf.discard_granularity) {
        npdg = bdi.cluster_size / ns->lbasz;
    }

    id_ns->npda = id_ns->npdg = npdg - 1;
}

static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
{
    static uint64_t ns_count;
    NvmeIdNs *id_ns = &ns->id_ns;
    uint8_t ds;
    uint16_t ms;
    int i;

    ns->csi = NVME_CSI_NVM;
    ns->status = 0x0;

    ns->id_ns.dlfeat = 0x1;

    /* support DULBE and I/O optimization fields */
    id_ns->nsfeat |= (0x4 | 0x10);

    if (ns->params.shared) {
        id_ns->nmic |= NVME_NMIC_NS_SHARED;
    }

    /* Substitute a missing EUI-64 with an autogenerated one */
    ++ns_count;
    if (!ns->params.eui64 && ns->params.eui64_default) {
        ns->params.eui64 = ns_count + NVME_EUI64_DEFAULT;
    }

    /* simple copy */
    id_ns->mssrl = cpu_to_le16(ns->params.mssrl);
    id_ns->mcl = cpu_to_le32(ns->params.mcl);
    id_ns->msrc = ns->params.msrc;
    id_ns->eui64 = cpu_to_be64(ns->params.eui64);

    ds = 31 - clz32(ns->blkconf.logical_block_size);
    ms = ns->params.ms;

    id_ns->mc = NVME_ID_NS_MC_EXTENDED | NVME_ID_NS_MC_SEPARATE;

    if (ms && ns->params.mset) {
        id_ns->flbas |= NVME_ID_NS_FLBAS_EXTENDED;
    }

    id_ns->dpc = 0x1f;
    id_ns->dps = ns->params.pi;
    if (ns->params.pi && ns->params.pil) {
        id_ns->dps |= NVME_ID_NS_DPS_FIRST_EIGHT;
    }

    static const NvmeLBAF lbaf[16] = {
        [0] = { .ds =  9           },
        [1] = { .ds =  9, .ms =  8 },
        [2] = { .ds =  9, .ms = 16 },
        [3] = { .ds =  9, .ms = 64 },
        [4] = { .ds = 12           },
        [5] = { .ds = 12, .ms =  8 },
        [6] = { .ds = 12, .ms = 16 },
        [7] = { .ds = 12, .ms = 64 },
    };

    memcpy(&id_ns->lbaf, &lbaf, sizeof(lbaf));
    id_ns->nlbaf = 7;

    for (i = 0; i <= id_ns->nlbaf; i++) {
        NvmeLBAF *lbaf = &id_ns->lbaf[i];
        if (lbaf->ds == ds) {
            if (lbaf->ms == ms) {
                id_ns->flbas |= i;
                goto lbaf_found;
            }
        }
    }

    /* add non-standard lba format */
    id_ns->nlbaf++;
    id_ns->lbaf[id_ns->nlbaf].ds = ds;
    id_ns->lbaf[id_ns->nlbaf].ms = ms;
    id_ns->flbas |= id_ns->nlbaf;

lbaf_found:
    nvme_ns_init_format(ns);

    return 0;
}

static int nvme_ns_init_blk(NvmeNamespace *ns, Error **errp)
{
    bool read_only;

    if (!blkconf_blocksizes(&ns->blkconf, errp)) {
        return -1;
    }

    read_only = !blk_supports_write_perm(ns->blkconf.blk);
    if (!blkconf_apply_backend_options(&ns->blkconf, read_only, false, errp)) {
        return -1;
    }

    if (ns->blkconf.discard_granularity == -1) {
        ns->blkconf.discard_granularity =
            MAX(ns->blkconf.logical_block_size, MIN_DISCARD_GRANULARITY);
    }

    ns->size = blk_getlength(ns->blkconf.blk);
    if (ns->size < 0) {
        error_setg_errno(errp, -ns->size, "could not get blockdev size");
        return -1;
    }

    return 0;
}

static int nvme_ns_zoned_check_calc_geometry(NvmeNamespace *ns, Error **errp)
{
    uint64_t zone_size, zone_cap;

    /* Make sure that the values of ZNS properties are sane */
    if (ns->params.zone_size_bs) {
        zone_size = ns->params.zone_size_bs;
    } else {
        zone_size = NVME_DEFAULT_ZONE_SIZE;
    }
    if (ns->params.zone_cap_bs) {
        zone_cap = ns->params.zone_cap_bs;
    } else {
        zone_cap = zone_size;
    }
    if (zone_cap > zone_size) {
        error_setg(errp, "zone capacity %"PRIu64"B exceeds "
                   "zone size %"PRIu64"B", zone_cap, zone_size);
        return -1;
    }
    if (zone_size < ns->lbasz) {
        error_setg(errp, "zone size %"PRIu64"B too small, "
                   "must be at least %zuB", zone_size, ns->lbasz);
        return -1;
    }
    if (zone_cap < ns->lbasz) {
        error_setg(errp, "zone capacity %"PRIu64"B too small, "
                   "must be at least %zuB", zone_cap, ns->lbasz);
        return -1;
    }

    /*
     * Save the main zone geometry values to avoid
     * calculating them later again.
     */
    ns->zone_size = zone_size / ns->lbasz;
    ns->zone_capacity = zone_cap / ns->lbasz;
    ns->num_zones = le64_to_cpu(ns->id_ns.nsze) / ns->zone_size;

    /* Do a few more sanity checks of ZNS properties */
    if (!ns->num_zones) {
        error_setg(errp,
                   "insufficient drive capacity, must be at least the size "
                   "of one zone (%"PRIu64"B)", zone_size);
        return -1;
    }

    return 0;
}

static void nvme_ns_zoned_init_state(NvmeNamespace *ns)
{
    uint64_t start = 0, zone_size = ns->zone_size;
    uint64_t capacity = ns->num_zones * zone_size;
    NvmeZone *zone;
    int i;

    ns->zone_array = g_new0(NvmeZone, ns->num_zones);
    if (ns->params.zd_extension_size) {
        ns->zd_extensions = g_malloc0(ns->params.zd_extension_size *
                                      ns->num_zones);
    }

    QTAILQ_INIT(&ns->exp_open_zones);
    QTAILQ_INIT(&ns->imp_open_zones);
    QTAILQ_INIT(&ns->closed_zones);
    QTAILQ_INIT(&ns->full_zones);

    zone = ns->zone_array;
    for (i = 0; i < ns->num_zones; i++, zone++) {
        if (start + zone_size > capacity) {
            zone_size = capacity - start;
        }
        zone->d.zt = NVME_ZONE_TYPE_SEQ_WRITE;
        nvme_set_zone_state(zone, NVME_ZONE_STATE_EMPTY);
        zone->d.za = 0;
        zone->d.zcap = ns->zone_capacity;
        zone->d.zslba = start;
        zone->d.wp = start;
        zone->w_ptr = start;
        start += zone_size;
    }

    ns->zone_size_log2 = 0;
    if (is_power_of_2(ns->zone_size)) {
        ns->zone_size_log2 = 63 - clz64(ns->zone_size);
    }
}

static void nvme_ns_init_zoned(NvmeNamespace *ns)
{
    NvmeIdNsZoned *id_ns_z;
    int i;

    nvme_ns_zoned_init_state(ns);

    id_ns_z = g_malloc0(sizeof(NvmeIdNsZoned));

    /* MAR/MOR are zeroes-based, FFFFFFFFh means no limit */
    id_ns_z->mar = cpu_to_le32(ns->params.max_active_zones - 1);
    id_ns_z->mor = cpu_to_le32(ns->params.max_open_zones - 1);
    id_ns_z->zoc = 0;
    id_ns_z->ozcs = ns->params.cross_zone_read ? 0x01 : 0x00;

    for (i = 0; i <= ns->id_ns.nlbaf; i++) {
        id_ns_z->lbafe[i].zsze = cpu_to_le64(ns->zone_size);
        id_ns_z->lbafe[i].zdes =
            ns->params.zd_extension_size >> 6; /* Units of 64B */
    }

    ns->csi = NVME_CSI_ZONED;
    ns->id_ns.nsze = cpu_to_le64(ns->num_zones * ns->zone_size);
    ns->id_ns.ncap = ns->id_ns.nsze;
    ns->id_ns.nuse = ns->id_ns.ncap;

    /*
     * The device uses the BDRV_BLOCK_ZERO flag to determine the "deallocated"
     * status of logical blocks. Since the spec defines that logical blocks
     * SHALL be deallocated when the zone is in the Empty or Offline states,
     * we can only support DULBE if the zone size is a multiple of the
     * calculated NPDG.
     */
    if (ns->zone_size % (ns->id_ns.npdg + 1)) {
        warn_report("the zone size (%"PRIu64" blocks) is not a multiple of "
                    "the calculated deallocation granularity (%d blocks); "
                    "DULBE support disabled",
                    ns->zone_size, ns->id_ns.npdg + 1);

        ns->id_ns.nsfeat &= ~0x4;
    }

    ns->id_ns_zoned = id_ns_z;
}

static void nvme_clear_zone(NvmeNamespace *ns, NvmeZone *zone)
{
    uint8_t state;

    zone->w_ptr = zone->d.wp;
    state = nvme_get_zone_state(zone);
    if (zone->d.wp != zone->d.zslba ||
        (zone->d.za & NVME_ZA_ZD_EXT_VALID)) {
        if (state != NVME_ZONE_STATE_CLOSED) {
            trace_pci_nvme_clear_ns_close(state, zone->d.zslba);
            nvme_set_zone_state(zone, NVME_ZONE_STATE_CLOSED);
        }
        nvme_aor_inc_active(ns);
        QTAILQ_INSERT_HEAD(&ns->closed_zones, zone, entry);
    } else {
        trace_pci_nvme_clear_ns_reset(state, zone->d.zslba);
        nvme_set_zone_state(zone, NVME_ZONE_STATE_EMPTY);
    }
}

/*
 * Close all the zones that are currently open.
 */
static void nvme_zoned_ns_shutdown(NvmeNamespace *ns)
{
    NvmeZone *zone, *next;

    QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) {
        QTAILQ_REMOVE(&ns->closed_zones, zone, entry);
        nvme_aor_dec_active(ns);
        nvme_clear_zone(ns, zone);
    }
    QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) {
        QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
        nvme_aor_dec_open(ns);
        nvme_aor_dec_active(ns);
        nvme_clear_zone(ns, zone);
    }
    QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) {
        QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry);
        nvme_aor_dec_open(ns);
        nvme_aor_dec_active(ns);
        nvme_clear_zone(ns, zone);
    }

    assert(ns->nr_open_zones == 0);
}

static int nvme_ns_check_constraints(NvmeCtrl *n, NvmeNamespace *ns,
                                     Error **errp)
{
    if (!ns->blkconf.blk) {
        error_setg(errp, "block backend not configured");
        return -1;
    }

    if (ns->params.pi && ns->params.ms < 8) {
        error_setg(errp, "at least 8 bytes of metadata required to enable "
                   "protection information");
        return -1;
    }

    if (ns->params.nsid > NVME_MAX_NAMESPACES) {
        error_setg(errp, "invalid namespace id (must be between 0 and %d)",
                   NVME_MAX_NAMESPACES);
        return -1;
    }

    if (!n->subsys) {
        if (ns->params.detached) {
            error_setg(errp, "detached requires that the nvme device is "
                       "linked to an nvme-subsys device");
            return -1;
        }

        if (ns->params.shared) {
            error_setg(errp, "shared requires that the nvme device is "
                       "linked to an nvme-subsys device");
            return -1;
        }
    }

    if (ns->params.zoned) {
        if (ns->params.max_active_zones) {
            if (ns->params.max_open_zones > ns->params.max_active_zones) {
                error_setg(errp, "max_open_zones (%u) exceeds "
                           "max_active_zones (%u)", ns->params.max_open_zones,
                           ns->params.max_active_zones);
                return -1;
            }

            if (!ns->params.max_open_zones) {
                ns->params.max_open_zones = ns->params.max_active_zones;
            }
        }

        if (ns->params.zd_extension_size) {
            if (ns->params.zd_extension_size & 0x3f) {
                error_setg(errp, "zone descriptor extension size must be a "
                           "multiple of 64B");
                return -1;
            }
            if ((ns->params.zd_extension_size >> 6) > 0xff) {
                error_setg(errp,
                           "zone descriptor extension size is too large");
                return -1;
            }
        }
    }

    return 0;
}

int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
{
    if (nvme_ns_check_constraints(n, ns, errp)) {
        return -1;
    }

    if (nvme_ns_init_blk(ns, errp)) {
        return -1;
    }

    if (nvme_ns_init(ns, errp)) {
        return -1;
    }

    if (ns->params.zoned) {
        if (nvme_ns_zoned_check_calc_geometry(ns, errp) != 0) {
            return -1;
        }
        nvme_ns_init_zoned(ns);
    }

    return 0;
}

void nvme_ns_drain(NvmeNamespace *ns)
{
    blk_drain(ns->blkconf.blk);
}

void nvme_ns_shutdown(NvmeNamespace *ns)
{
    blk_flush(ns->blkconf.blk);
    if (ns->params.zoned) {
        nvme_zoned_ns_shutdown(ns);
    }
}

void nvme_ns_cleanup(NvmeNamespace *ns)
{
    if (ns->params.zoned) {
        g_free(ns->id_ns_zoned);
        g_free(ns->zone_array);
        g_free(ns->zd_extensions);
    }
}

static void nvme_ns_realize(DeviceState *dev, Error **errp)
{
    NvmeNamespace *ns = NVME_NS(dev);
    BusState *s = qdev_get_parent_bus(dev);
    NvmeCtrl *n = NVME(s->parent);
    NvmeSubsystem *subsys = n->subsys;
    uint32_t nsid = ns->params.nsid;
    int i;

    if (nvme_ns_setup(n, ns, errp)) {
        return;
    }

    if (!nsid) {
        for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
            if (nvme_ns(n, i) || nvme_subsys_ns(subsys, i)) {
                continue;
            }

            nsid = ns->params.nsid = i;
            break;
        }

        if (!nsid) {
            error_setg(errp, "no free namespace id");
            return;
        }
    } else {
        if (nvme_ns(n, nsid) || nvme_subsys_ns(subsys, nsid)) {
            error_setg(errp, "namespace id '%d' already allocated", nsid);
            return;
        }
    }

    if (subsys) {
        subsys->namespaces[nsid] = ns;

        if (ns->params.detached) {
            return;
        }

        if (ns->params.shared) {
            for (i = 0; i < ARRAY_SIZE(subsys->ctrls); i++) {
                NvmeCtrl *ctrl = subsys->ctrls[i];

                if (ctrl) {
                    nvme_attach_ns(ctrl, ns);
                }
            }

            return;
        }
    }

    nvme_attach_ns(n, ns);
}

static Property nvme_ns_props[] = {
    DEFINE_BLOCK_PROPERTIES(NvmeNamespace, blkconf),
    DEFINE_PROP_BOOL("detached", NvmeNamespace, params.detached, false),
    DEFINE_PROP_BOOL("shared", NvmeNamespace, params.shared, false),
    DEFINE_PROP_UINT32("nsid", NvmeNamespace, params.nsid, 0),
    DEFINE_PROP_UUID("uuid", NvmeNamespace, params.uuid),
    DEFINE_PROP_UINT64("eui64", NvmeNamespace, params.eui64, 0),
    DEFINE_PROP_UINT16("ms", NvmeNamespace, params.ms, 0),
    DEFINE_PROP_UINT8("mset", NvmeNamespace, params.mset, 0),
    DEFINE_PROP_UINT8("pi", NvmeNamespace, params.pi, 0),
    DEFINE_PROP_UINT8("pil", NvmeNamespace, params.pil, 0),
    DEFINE_PROP_UINT16("mssrl", NvmeNamespace, params.mssrl, 128),
    DEFINE_PROP_UINT32("mcl", NvmeNamespace, params.mcl, 128),
    DEFINE_PROP_UINT8("msrc", NvmeNamespace, params.msrc, 127),
    DEFINE_PROP_BOOL("zoned", NvmeNamespace, params.zoned, false),
    DEFINE_PROP_SIZE("zoned.zone_size", NvmeNamespace, params.zone_size_bs,
                     NVME_DEFAULT_ZONE_SIZE),
    DEFINE_PROP_SIZE("zoned.zone_capacity", NvmeNamespace, params.zone_cap_bs,
                     0),
    DEFINE_PROP_BOOL("zoned.cross_read", NvmeNamespace,
                     params.cross_zone_read, false),
    DEFINE_PROP_UINT32("zoned.max_active", NvmeNamespace,
                       params.max_active_zones, 0),
    DEFINE_PROP_UINT32("zoned.max_open", NvmeNamespace,
                       params.max_open_zones, 0),
    DEFINE_PROP_UINT32("zoned.descr_ext_size", NvmeNamespace,
                       params.zd_extension_size, 0),
    DEFINE_PROP_BOOL("eui64-default", NvmeNamespace, params.eui64_default,
                     true),
    DEFINE_PROP_END_OF_LIST(),
};

static void nvme_ns_class_init(ObjectClass *oc, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(oc);

    set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);

    dc->bus_type = TYPE_NVME_BUS;
    dc->realize = nvme_ns_realize;
    device_class_set_props(dc, nvme_ns_props);
    dc->desc = "Virtual NVMe namespace";
}

static void nvme_ns_instance_init(Object *obj)
{
    NvmeNamespace *ns = NVME_NS(obj);
    char *bootindex = g_strdup_printf("/namespace@%d,0", ns->params.nsid);

    device_add_bootindex_property(obj, &ns->bootindex, "bootindex",
                                  bootindex, DEVICE(obj));

    g_free(bootindex);
}

static const TypeInfo nvme_ns_info = {
    .name = TYPE_NVME_NS,
    .parent = TYPE_DEVICE,
    .class_init = nvme_ns_class_init,
    .instance_size = sizeof(NvmeNamespace),
    .instance_init = nvme_ns_instance_init,
};

static void nvme_ns_register_types(void)
{
    type_register_static(&nvme_ns_info);
}

type_init(nvme_ns_register_types)