xref: /openbmc/qemu/hw/nvme/ns.c (revision 6016b7b4)
1 /*
2  * QEMU NVM Express Virtual Namespace
3  *
4  * Copyright (c) 2019 CNEX Labs
5  * Copyright (c) 2020 Samsung Electronics
6  *
7  * Authors:
8  *  Klaus Jensen      <k.jensen@samsung.com>
9  *
10  * This work is licensed under the terms of the GNU GPL, version 2. See the
11  * COPYING file in the top-level directory.
12  *
13  */
14 
15 #include "qemu/osdep.h"
16 #include "qemu/units.h"
17 #include "qemu/error-report.h"
18 #include "qapi/error.h"
19 #include "sysemu/sysemu.h"
20 #include "sysemu/block-backend.h"
21 
22 #include "nvme.h"
23 #include "trace.h"
24 
/* Floor applied to the discard granularity when it is left unset (-1). */
#define MIN_DISCARD_GRANULARITY (4 * KiB)
/* Zone size used when the "zoned.zone_size" parameter is zero/unset. */
#define NVME_DEFAULT_ZONE_SIZE   (128 * MiB)
27 
28 void nvme_ns_init_format(NvmeNamespace *ns)
29 {
30     NvmeIdNs *id_ns = &ns->id_ns;
31     BlockDriverInfo bdi;
32     int npdg, nlbas, ret;
33 
34     ns->lbaf = id_ns->lbaf[NVME_ID_NS_FLBAS_INDEX(id_ns->flbas)];
35     ns->lbasz = 1 << ns->lbaf.ds;
36 
37     nlbas = ns->size / (ns->lbasz + ns->lbaf.ms);
38 
39     id_ns->nsze = cpu_to_le64(nlbas);
40 
41     /* no thin provisioning */
42     id_ns->ncap = id_ns->nsze;
43     id_ns->nuse = id_ns->ncap;
44 
45     ns->moff = (int64_t)nlbas << ns->lbaf.ds;
46 
47     npdg = ns->blkconf.discard_granularity / ns->lbasz;
48 
49     ret = bdrv_get_info(blk_bs(ns->blkconf.blk), &bdi);
50     if (ret >= 0 && bdi.cluster_size > ns->blkconf.discard_granularity) {
51         npdg = bdi.cluster_size / ns->lbasz;
52     }
53 
54     id_ns->npda = id_ns->npdg = npdg - 1;
55 }
56 
57 static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
58 {
59     static uint64_t ns_count;
60     NvmeIdNs *id_ns = &ns->id_ns;
61     uint8_t ds;
62     uint16_t ms;
63     int i;
64 
65     ns->csi = NVME_CSI_NVM;
66     ns->status = 0x0;
67 
68     ns->id_ns.dlfeat = 0x1;
69 
70     /* support DULBE and I/O optimization fields */
71     id_ns->nsfeat |= (0x4 | 0x10);
72 
73     if (ns->params.shared) {
74         id_ns->nmic |= NVME_NMIC_NS_SHARED;
75     }
76 
77     /* Substitute a missing EUI-64 by an autogenerated one */
78     ++ns_count;
79     if (!ns->params.eui64 && ns->params.eui64_default) {
80         ns->params.eui64 = ns_count + NVME_EUI64_DEFAULT;
81     }
82 
83     /* simple copy */
84     id_ns->mssrl = cpu_to_le16(ns->params.mssrl);
85     id_ns->mcl = cpu_to_le32(ns->params.mcl);
86     id_ns->msrc = ns->params.msrc;
87     id_ns->eui64 = cpu_to_be64(ns->params.eui64);
88 
89     ds = 31 - clz32(ns->blkconf.logical_block_size);
90     ms = ns->params.ms;
91 
92     id_ns->mc = NVME_ID_NS_MC_EXTENDED | NVME_ID_NS_MC_SEPARATE;
93 
94     if (ms && ns->params.mset) {
95         id_ns->flbas |= NVME_ID_NS_FLBAS_EXTENDED;
96     }
97 
98     id_ns->dpc = 0x1f;
99     id_ns->dps = ns->params.pi;
100     if (ns->params.pi && ns->params.pil) {
101         id_ns->dps |= NVME_ID_NS_DPS_FIRST_EIGHT;
102     }
103 
104     static const NvmeLBAF lbaf[16] = {
105         [0] = { .ds =  9           },
106         [1] = { .ds =  9, .ms =  8 },
107         [2] = { .ds =  9, .ms = 16 },
108         [3] = { .ds =  9, .ms = 64 },
109         [4] = { .ds = 12           },
110         [5] = { .ds = 12, .ms =  8 },
111         [6] = { .ds = 12, .ms = 16 },
112         [7] = { .ds = 12, .ms = 64 },
113     };
114 
115     memcpy(&id_ns->lbaf, &lbaf, sizeof(lbaf));
116     id_ns->nlbaf = 7;
117 
118     for (i = 0; i <= id_ns->nlbaf; i++) {
119         NvmeLBAF *lbaf = &id_ns->lbaf[i];
120         if (lbaf->ds == ds) {
121             if (lbaf->ms == ms) {
122                 id_ns->flbas |= i;
123                 goto lbaf_found;
124             }
125         }
126     }
127 
128     /* add non-standard lba format */
129     id_ns->nlbaf++;
130     id_ns->lbaf[id_ns->nlbaf].ds = ds;
131     id_ns->lbaf[id_ns->nlbaf].ms = ms;
132     id_ns->flbas |= id_ns->nlbaf;
133 
134 lbaf_found:
135     nvme_ns_init_format(ns);
136 
137     return 0;
138 }
139 
140 static int nvme_ns_init_blk(NvmeNamespace *ns, Error **errp)
141 {
142     bool read_only;
143 
144     if (!blkconf_blocksizes(&ns->blkconf, errp)) {
145         return -1;
146     }
147 
148     read_only = !blk_supports_write_perm(ns->blkconf.blk);
149     if (!blkconf_apply_backend_options(&ns->blkconf, read_only, false, errp)) {
150         return -1;
151     }
152 
153     if (ns->blkconf.discard_granularity == -1) {
154         ns->blkconf.discard_granularity =
155             MAX(ns->blkconf.logical_block_size, MIN_DISCARD_GRANULARITY);
156     }
157 
158     ns->size = blk_getlength(ns->blkconf.blk);
159     if (ns->size < 0) {
160         error_setg_errno(errp, -ns->size, "could not get blockdev size");
161         return -1;
162     }
163 
164     return 0;
165 }
166 
167 static int nvme_ns_zoned_check_calc_geometry(NvmeNamespace *ns, Error **errp)
168 {
169     uint64_t zone_size, zone_cap;
170 
171     /* Make sure that the values of ZNS properties are sane */
172     if (ns->params.zone_size_bs) {
173         zone_size = ns->params.zone_size_bs;
174     } else {
175         zone_size = NVME_DEFAULT_ZONE_SIZE;
176     }
177     if (ns->params.zone_cap_bs) {
178         zone_cap = ns->params.zone_cap_bs;
179     } else {
180         zone_cap = zone_size;
181     }
182     if (zone_cap > zone_size) {
183         error_setg(errp, "zone capacity %"PRIu64"B exceeds "
184                    "zone size %"PRIu64"B", zone_cap, zone_size);
185         return -1;
186     }
187     if (zone_size < ns->lbasz) {
188         error_setg(errp, "zone size %"PRIu64"B too small, "
189                    "must be at least %zuB", zone_size, ns->lbasz);
190         return -1;
191     }
192     if (zone_cap < ns->lbasz) {
193         error_setg(errp, "zone capacity %"PRIu64"B too small, "
194                    "must be at least %zuB", zone_cap, ns->lbasz);
195         return -1;
196     }
197 
198     /*
199      * Save the main zone geometry values to avoid
200      * calculating them later again.
201      */
202     ns->zone_size = zone_size / ns->lbasz;
203     ns->zone_capacity = zone_cap / ns->lbasz;
204     ns->num_zones = le64_to_cpu(ns->id_ns.nsze) / ns->zone_size;
205 
206     /* Do a few more sanity checks of ZNS properties */
207     if (!ns->num_zones) {
208         error_setg(errp,
209                    "insufficient drive capacity, must be at least the size "
210                    "of one zone (%"PRIu64"B)", zone_size);
211         return -1;
212     }
213 
214     return 0;
215 }
216 
217 static void nvme_ns_zoned_init_state(NvmeNamespace *ns)
218 {
219     uint64_t start = 0, zone_size = ns->zone_size;
220     uint64_t capacity = ns->num_zones * zone_size;
221     NvmeZone *zone;
222     int i;
223 
224     ns->zone_array = g_new0(NvmeZone, ns->num_zones);
225     if (ns->params.zd_extension_size) {
226         ns->zd_extensions = g_malloc0(ns->params.zd_extension_size *
227                                       ns->num_zones);
228     }
229 
230     QTAILQ_INIT(&ns->exp_open_zones);
231     QTAILQ_INIT(&ns->imp_open_zones);
232     QTAILQ_INIT(&ns->closed_zones);
233     QTAILQ_INIT(&ns->full_zones);
234 
235     zone = ns->zone_array;
236     for (i = 0; i < ns->num_zones; i++, zone++) {
237         if (start + zone_size > capacity) {
238             zone_size = capacity - start;
239         }
240         zone->d.zt = NVME_ZONE_TYPE_SEQ_WRITE;
241         nvme_set_zone_state(zone, NVME_ZONE_STATE_EMPTY);
242         zone->d.za = 0;
243         zone->d.zcap = ns->zone_capacity;
244         zone->d.zslba = start;
245         zone->d.wp = start;
246         zone->w_ptr = start;
247         start += zone_size;
248     }
249 
250     ns->zone_size_log2 = 0;
251     if (is_power_of_2(ns->zone_size)) {
252         ns->zone_size_log2 = 63 - clz64(ns->zone_size);
253     }
254 }
255 
256 static void nvme_ns_init_zoned(NvmeNamespace *ns)
257 {
258     NvmeIdNsZoned *id_ns_z;
259     int i;
260 
261     nvme_ns_zoned_init_state(ns);
262 
263     id_ns_z = g_malloc0(sizeof(NvmeIdNsZoned));
264 
265     /* MAR/MOR are zeroes-based, FFFFFFFFFh means no limit */
266     id_ns_z->mar = cpu_to_le32(ns->params.max_active_zones - 1);
267     id_ns_z->mor = cpu_to_le32(ns->params.max_open_zones - 1);
268     id_ns_z->zoc = 0;
269     id_ns_z->ozcs = ns->params.cross_zone_read ? 0x01 : 0x00;
270 
271     for (i = 0; i <= ns->id_ns.nlbaf; i++) {
272         id_ns_z->lbafe[i].zsze = cpu_to_le64(ns->zone_size);
273         id_ns_z->lbafe[i].zdes =
274             ns->params.zd_extension_size >> 6; /* Units of 64B */
275     }
276 
277     ns->csi = NVME_CSI_ZONED;
278     ns->id_ns.nsze = cpu_to_le64(ns->num_zones * ns->zone_size);
279     ns->id_ns.ncap = ns->id_ns.nsze;
280     ns->id_ns.nuse = ns->id_ns.ncap;
281 
282     /*
283      * The device uses the BDRV_BLOCK_ZERO flag to determine the "deallocated"
284      * status of logical blocks. Since the spec defines that logical blocks
285      * SHALL be deallocated when then zone is in the Empty or Offline states,
286      * we can only support DULBE if the zone size is a multiple of the
287      * calculated NPDG.
288      */
289     if (ns->zone_size % (ns->id_ns.npdg + 1)) {
290         warn_report("the zone size (%"PRIu64" blocks) is not a multiple of "
291                     "the calculated deallocation granularity (%d blocks); "
292                     "DULBE support disabled",
293                     ns->zone_size, ns->id_ns.npdg + 1);
294 
295         ns->id_ns.nsfeat &= ~0x4;
296     }
297 
298     ns->id_ns_zoned = id_ns_z;
299 }
300 
301 static void nvme_clear_zone(NvmeNamespace *ns, NvmeZone *zone)
302 {
303     uint8_t state;
304 
305     zone->w_ptr = zone->d.wp;
306     state = nvme_get_zone_state(zone);
307     if (zone->d.wp != zone->d.zslba ||
308         (zone->d.za & NVME_ZA_ZD_EXT_VALID)) {
309         if (state != NVME_ZONE_STATE_CLOSED) {
310             trace_pci_nvme_clear_ns_close(state, zone->d.zslba);
311             nvme_set_zone_state(zone, NVME_ZONE_STATE_CLOSED);
312         }
313         nvme_aor_inc_active(ns);
314         QTAILQ_INSERT_HEAD(&ns->closed_zones, zone, entry);
315     } else {
316         trace_pci_nvme_clear_ns_reset(state, zone->d.zslba);
317         nvme_set_zone_state(zone, NVME_ZONE_STATE_EMPTY);
318     }
319 }
320 
321 /*
322  * Close all the zones that are currently open.
323  */
324 static void nvme_zoned_ns_shutdown(NvmeNamespace *ns)
325 {
326     NvmeZone *zone, *next;
327 
328     QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) {
329         QTAILQ_REMOVE(&ns->closed_zones, zone, entry);
330         nvme_aor_dec_active(ns);
331         nvme_clear_zone(ns, zone);
332     }
333     QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) {
334         QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
335         nvme_aor_dec_open(ns);
336         nvme_aor_dec_active(ns);
337         nvme_clear_zone(ns, zone);
338     }
339     QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) {
340         QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry);
341         nvme_aor_dec_open(ns);
342         nvme_aor_dec_active(ns);
343         nvme_clear_zone(ns, zone);
344     }
345 
346     assert(ns->nr_open_zones == 0);
347 }
348 
349 static int nvme_ns_check_constraints(NvmeNamespace *ns, Error **errp)
350 {
351     if (!ns->blkconf.blk) {
352         error_setg(errp, "block backend not configured");
353         return -1;
354     }
355 
356     if (ns->params.pi && ns->params.ms < 8) {
357         error_setg(errp, "at least 8 bytes of metadata required to enable "
358                    "protection information");
359         return -1;
360     }
361 
362     if (ns->params.nsid > NVME_MAX_NAMESPACES) {
363         error_setg(errp, "invalid namespace id (must be between 0 and %d)",
364                    NVME_MAX_NAMESPACES);
365         return -1;
366     }
367 
368     if (ns->params.zoned) {
369         if (ns->params.max_active_zones) {
370             if (ns->params.max_open_zones > ns->params.max_active_zones) {
371                 error_setg(errp, "max_open_zones (%u) exceeds "
372                            "max_active_zones (%u)", ns->params.max_open_zones,
373                            ns->params.max_active_zones);
374                 return -1;
375             }
376 
377             if (!ns->params.max_open_zones) {
378                 ns->params.max_open_zones = ns->params.max_active_zones;
379             }
380         }
381 
382         if (ns->params.zd_extension_size) {
383             if (ns->params.zd_extension_size & 0x3f) {
384                 error_setg(errp, "zone descriptor extension size must be a "
385                            "multiple of 64B");
386                 return -1;
387             }
388             if ((ns->params.zd_extension_size >> 6) > 0xff) {
389                 error_setg(errp,
390                            "zone descriptor extension size is too large");
391                 return -1;
392             }
393         }
394     }
395 
396     return 0;
397 }
398 
399 int nvme_ns_setup(NvmeNamespace *ns, Error **errp)
400 {
401     if (nvme_ns_check_constraints(ns, errp)) {
402         return -1;
403     }
404 
405     if (nvme_ns_init_blk(ns, errp)) {
406         return -1;
407     }
408 
409     if (nvme_ns_init(ns, errp)) {
410         return -1;
411     }
412     if (ns->params.zoned) {
413         if (nvme_ns_zoned_check_calc_geometry(ns, errp) != 0) {
414             return -1;
415         }
416         nvme_ns_init_zoned(ns);
417     }
418 
419     return 0;
420 }
421 
/* Drain the block backend of the namespace via blk_drain(). */
void nvme_ns_drain(NvmeNamespace *ns)
{
    blk_drain(ns->blkconf.blk);
}
426 
427 void nvme_ns_shutdown(NvmeNamespace *ns)
428 {
429     blk_flush(ns->blkconf.blk);
430     if (ns->params.zoned) {
431         nvme_zoned_ns_shutdown(ns);
432     }
433 }
434 
435 void nvme_ns_cleanup(NvmeNamespace *ns)
436 {
437     if (ns->params.zoned) {
438         g_free(ns->id_ns_zoned);
439         g_free(ns->zone_array);
440         g_free(ns->zd_extensions);
441     }
442 }
443 
/*
 * Unrealize callback of the namespace device: drain the backend, run the
 * shutdown handling and finally release the namespace memory.
 */
static void nvme_ns_unrealize(DeviceState *dev)
{
    NvmeNamespace *ns = NVME_NS(dev);

    nvme_ns_drain(ns);
    nvme_ns_shutdown(ns);
    nvme_ns_cleanup(ns);
}
452 
453 static void nvme_ns_realize(DeviceState *dev, Error **errp)
454 {
455     NvmeNamespace *ns = NVME_NS(dev);
456     BusState *s = qdev_get_parent_bus(dev);
457     NvmeCtrl *n = NVME(s->parent);
458     NvmeSubsystem *subsys = n->subsys;
459     uint32_t nsid = ns->params.nsid;
460     int i;
461 
462     if (!n->subsys) {
463         if (ns->params.detached) {
464             error_setg(errp, "detached requires that the nvme device is "
465                        "linked to an nvme-subsys device");
466             return;
467         }
468     } else {
469         /*
470          * If this namespace belongs to a subsystem (through a link on the
471          * controller device), reparent the device.
472          */
473         if (!qdev_set_parent_bus(dev, &subsys->bus.parent_bus, errp)) {
474             return;
475         }
476     }
477 
478     if (nvme_ns_setup(ns, errp)) {
479         return;
480     }
481 
482     if (!nsid) {
483         for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
484             if (nvme_ns(n, i) || nvme_subsys_ns(subsys, i)) {
485                 continue;
486             }
487 
488             nsid = ns->params.nsid = i;
489             break;
490         }
491 
492         if (!nsid) {
493             error_setg(errp, "no free namespace id");
494             return;
495         }
496     } else {
497         if (nvme_ns(n, nsid) || nvme_subsys_ns(subsys, nsid)) {
498             error_setg(errp, "namespace id '%d' already allocated", nsid);
499             return;
500         }
501     }
502 
503     if (subsys) {
504         subsys->namespaces[nsid] = ns;
505 
506         if (ns->params.detached) {
507             return;
508         }
509 
510         if (ns->params.shared) {
511             for (i = 0; i < ARRAY_SIZE(subsys->ctrls); i++) {
512                 NvmeCtrl *ctrl = subsys->ctrls[i];
513 
514                 if (ctrl) {
515                     nvme_attach_ns(ctrl, ns);
516                 }
517             }
518 
519             return;
520         }
521     }
522 
523     nvme_attach_ns(n, ns);
524 }
525 
/* User-configurable properties of the namespace device and their defaults. */
static Property nvme_ns_props[] = {
    DEFINE_BLOCK_PROPERTIES(NvmeNamespace, blkconf),
    DEFINE_PROP_BOOL("detached", NvmeNamespace, params.detached, false),
    DEFINE_PROP_BOOL("shared", NvmeNamespace, params.shared, true),
    /* 0 requests automatic allocation of a free namespace id */
    DEFINE_PROP_UINT32("nsid", NvmeNamespace, params.nsid, 0),
    DEFINE_PROP_UUID("uuid", NvmeNamespace, params.uuid),
    DEFINE_PROP_UINT64("eui64", NvmeNamespace, params.eui64, 0),
    /* metadata and protection information */
    DEFINE_PROP_UINT16("ms", NvmeNamespace, params.ms, 0),
    DEFINE_PROP_UINT8("mset", NvmeNamespace, params.mset, 0),
    DEFINE_PROP_UINT8("pi", NvmeNamespace, params.pi, 0),
    DEFINE_PROP_UINT8("pil", NvmeNamespace, params.pil, 0),
    /* values copied into the identify fields of the same name */
    DEFINE_PROP_UINT16("mssrl", NvmeNamespace, params.mssrl, 128),
    DEFINE_PROP_UINT32("mcl", NvmeNamespace, params.mcl, 128),
    DEFINE_PROP_UINT8("msrc", NvmeNamespace, params.msrc, 127),
    /* zoned namespace configuration; sizes are given in bytes */
    DEFINE_PROP_BOOL("zoned", NvmeNamespace, params.zoned, false),
    DEFINE_PROP_SIZE("zoned.zone_size", NvmeNamespace, params.zone_size_bs,
                     NVME_DEFAULT_ZONE_SIZE),
    /* 0 means the zone capacity equals the zone size */
    DEFINE_PROP_SIZE("zoned.zone_capacity", NvmeNamespace, params.zone_cap_bs,
                     0),
    DEFINE_PROP_BOOL("zoned.cross_read", NvmeNamespace,
                     params.cross_zone_read, false),
    /* 0 means no limit for max_active and max_open */
    DEFINE_PROP_UINT32("zoned.max_active", NvmeNamespace,
                       params.max_active_zones, 0),
    DEFINE_PROP_UINT32("zoned.max_open", NvmeNamespace,
                       params.max_open_zones, 0),
    /* must be a multiple of 64 bytes */
    DEFINE_PROP_UINT32("zoned.descr_ext_size", NvmeNamespace,
                       params.zd_extension_size, 0),
    /* autogenerate an EUI-64 when "eui64" is left at 0 */
    DEFINE_PROP_BOOL("eui64-default", NvmeNamespace, params.eui64_default,
                     true),
    DEFINE_PROP_END_OF_LIST(),
};
557 
558 static void nvme_ns_class_init(ObjectClass *oc, void *data)
559 {
560     DeviceClass *dc = DEVICE_CLASS(oc);
561 
562     set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
563 
564     dc->bus_type = TYPE_NVME_BUS;
565     dc->realize = nvme_ns_realize;
566     dc->unrealize = nvme_ns_unrealize;
567     device_class_set_props(dc, nvme_ns_props);
568     dc->desc = "Virtual NVMe namespace";
569 }
570 
571 static void nvme_ns_instance_init(Object *obj)
572 {
573     NvmeNamespace *ns = NVME_NS(obj);
574     char *bootindex = g_strdup_printf("/namespace@%d,0", ns->params.nsid);
575 
576     device_add_bootindex_property(obj, &ns->bootindex, "bootindex",
577                                   bootindex, DEVICE(obj));
578 
579     g_free(bootindex);
580 }
581 
582 static const TypeInfo nvme_ns_info = {
583     .name = TYPE_NVME_NS,
584     .parent = TYPE_DEVICE,
585     .class_init = nvme_ns_class_init,
586     .instance_size = sizeof(NvmeNamespace),
587     .instance_init = nvme_ns_instance_init,
588 };
589 
/* Register the namespace device type; hooked up through type_init() below. */
static void nvme_ns_register_types(void)
{
    type_register_static(&nvme_ns_info);
}

type_init(nvme_ns_register_types)
596