xref: /openbmc/qemu/hw/nvme/ns.c (revision 40f23e4e)
1 /*
2  * QEMU NVM Express Virtual Namespace
3  *
4  * Copyright (c) 2019 CNEX Labs
5  * Copyright (c) 2020 Samsung Electronics
6  *
7  * Authors:
8  *  Klaus Jensen      <k.jensen@samsung.com>
9  *
10  * This work is licensed under the terms of the GNU GPL, version 2. See the
11  * COPYING file in the top-level directory.
12  *
13  */
14 
15 #include "qemu/osdep.h"
16 #include "qemu/units.h"
17 #include "qemu/error-report.h"
18 #include "qapi/error.h"
19 #include "sysemu/sysemu.h"
20 #include "sysemu/block-backend.h"
21 
22 #include "nvme.h"
23 #include "trace.h"
24 
/*
 * Floor for the discard granularity that nvme_ns_init_blk() picks when the
 * configured discard_granularity is -1.
 */
#define MIN_DISCARD_GRANULARITY (4 * KiB)
/*
 * Default of the "zoned.zone_size" property; also used as the zone size when
 * that property is 0.
 */
#define NVME_DEFAULT_ZONE_SIZE   (128 * MiB)
27 
28 void nvme_ns_init_format(NvmeNamespace *ns)
29 {
30     NvmeIdNs *id_ns = &ns->id_ns;
31     BlockDriverInfo bdi;
32     int npdg, nlbas, ret;
33 
34     ns->lbaf = id_ns->lbaf[NVME_ID_NS_FLBAS_INDEX(id_ns->flbas)];
35     ns->lbasz = 1 << ns->lbaf.ds;
36 
37     nlbas = ns->size / (ns->lbasz + ns->lbaf.ms);
38 
39     id_ns->nsze = cpu_to_le64(nlbas);
40 
41     /* no thin provisioning */
42     id_ns->ncap = id_ns->nsze;
43     id_ns->nuse = id_ns->ncap;
44 
45     ns->moff = (int64_t)nlbas << ns->lbaf.ds;
46 
47     npdg = ns->blkconf.discard_granularity / ns->lbasz;
48 
49     ret = bdrv_get_info(blk_bs(ns->blkconf.blk), &bdi);
50     if (ret >= 0 && bdi.cluster_size > ns->blkconf.discard_granularity) {
51         npdg = bdi.cluster_size / ns->lbasz;
52     }
53 
54     id_ns->npda = id_ns->npdg = npdg - 1;
55 }
56 
57 static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
58 {
59     static uint64_t ns_count;
60     NvmeIdNs *id_ns = &ns->id_ns;
61     uint8_t ds;
62     uint16_t ms;
63     int i;
64 
65     ns->csi = NVME_CSI_NVM;
66     ns->status = 0x0;
67 
68     ns->id_ns.dlfeat = 0x1;
69 
70     /* support DULBE and I/O optimization fields */
71     id_ns->nsfeat |= (0x4 | 0x10);
72 
73     if (ns->params.shared) {
74         id_ns->nmic |= NVME_NMIC_NS_SHARED;
75     }
76 
77     /* Substitute a missing EUI-64 by an autogenerated one */
78     ++ns_count;
79     if (!ns->params.eui64 && ns->params.eui64_default) {
80         ns->params.eui64 = ns_count + NVME_EUI64_DEFAULT;
81     }
82 
83     /* simple copy */
84     id_ns->mssrl = cpu_to_le16(ns->params.mssrl);
85     id_ns->mcl = cpu_to_le32(ns->params.mcl);
86     id_ns->msrc = ns->params.msrc;
87     id_ns->eui64 = cpu_to_be64(ns->params.eui64);
88 
89     ds = 31 - clz32(ns->blkconf.logical_block_size);
90     ms = ns->params.ms;
91 
92     id_ns->mc = NVME_ID_NS_MC_EXTENDED | NVME_ID_NS_MC_SEPARATE;
93 
94     if (ms && ns->params.mset) {
95         id_ns->flbas |= NVME_ID_NS_FLBAS_EXTENDED;
96     }
97 
98     id_ns->dpc = 0x1f;
99     id_ns->dps = ns->params.pi;
100     if (ns->params.pi && ns->params.pil) {
101         id_ns->dps |= NVME_ID_NS_DPS_FIRST_EIGHT;
102     }
103 
104     static const NvmeLBAF lbaf[16] = {
105         [0] = { .ds =  9           },
106         [1] = { .ds =  9, .ms =  8 },
107         [2] = { .ds =  9, .ms = 16 },
108         [3] = { .ds =  9, .ms = 64 },
109         [4] = { .ds = 12           },
110         [5] = { .ds = 12, .ms =  8 },
111         [6] = { .ds = 12, .ms = 16 },
112         [7] = { .ds = 12, .ms = 64 },
113     };
114 
115     memcpy(&id_ns->lbaf, &lbaf, sizeof(lbaf));
116     id_ns->nlbaf = 7;
117 
118     for (i = 0; i <= id_ns->nlbaf; i++) {
119         NvmeLBAF *lbaf = &id_ns->lbaf[i];
120         if (lbaf->ds == ds) {
121             if (lbaf->ms == ms) {
122                 id_ns->flbas |= i;
123                 goto lbaf_found;
124             }
125         }
126     }
127 
128     /* add non-standard lba format */
129     id_ns->nlbaf++;
130     id_ns->lbaf[id_ns->nlbaf].ds = ds;
131     id_ns->lbaf[id_ns->nlbaf].ms = ms;
132     id_ns->flbas |= id_ns->nlbaf;
133 
134 lbaf_found:
135     nvme_ns_init_format(ns);
136 
137     return 0;
138 }
139 
140 static int nvme_ns_init_blk(NvmeNamespace *ns, Error **errp)
141 {
142     bool read_only;
143 
144     if (!blkconf_blocksizes(&ns->blkconf, errp)) {
145         return -1;
146     }
147 
148     read_only = !blk_supports_write_perm(ns->blkconf.blk);
149     if (!blkconf_apply_backend_options(&ns->blkconf, read_only, false, errp)) {
150         return -1;
151     }
152 
153     if (ns->blkconf.discard_granularity == -1) {
154         ns->blkconf.discard_granularity =
155             MAX(ns->blkconf.logical_block_size, MIN_DISCARD_GRANULARITY);
156     }
157 
158     ns->size = blk_getlength(ns->blkconf.blk);
159     if (ns->size < 0) {
160         error_setg_errno(errp, -ns->size, "could not get blockdev size");
161         return -1;
162     }
163 
164     return 0;
165 }
166 
167 static int nvme_ns_zoned_check_calc_geometry(NvmeNamespace *ns, Error **errp)
168 {
169     uint64_t zone_size, zone_cap;
170 
171     /* Make sure that the values of ZNS properties are sane */
172     if (ns->params.zone_size_bs) {
173         zone_size = ns->params.zone_size_bs;
174     } else {
175         zone_size = NVME_DEFAULT_ZONE_SIZE;
176     }
177     if (ns->params.zone_cap_bs) {
178         zone_cap = ns->params.zone_cap_bs;
179     } else {
180         zone_cap = zone_size;
181     }
182     if (zone_cap > zone_size) {
183         error_setg(errp, "zone capacity %"PRIu64"B exceeds "
184                    "zone size %"PRIu64"B", zone_cap, zone_size);
185         return -1;
186     }
187     if (zone_size < ns->lbasz) {
188         error_setg(errp, "zone size %"PRIu64"B too small, "
189                    "must be at least %zuB", zone_size, ns->lbasz);
190         return -1;
191     }
192     if (zone_cap < ns->lbasz) {
193         error_setg(errp, "zone capacity %"PRIu64"B too small, "
194                    "must be at least %zuB", zone_cap, ns->lbasz);
195         return -1;
196     }
197 
198     /*
199      * Save the main zone geometry values to avoid
200      * calculating them later again.
201      */
202     ns->zone_size = zone_size / ns->lbasz;
203     ns->zone_capacity = zone_cap / ns->lbasz;
204     ns->num_zones = le64_to_cpu(ns->id_ns.nsze) / ns->zone_size;
205 
206     /* Do a few more sanity checks of ZNS properties */
207     if (!ns->num_zones) {
208         error_setg(errp,
209                    "insufficient drive capacity, must be at least the size "
210                    "of one zone (%"PRIu64"B)", zone_size);
211         return -1;
212     }
213 
214     return 0;
215 }
216 
217 static void nvme_ns_zoned_init_state(NvmeNamespace *ns)
218 {
219     uint64_t start = 0, zone_size = ns->zone_size;
220     uint64_t capacity = ns->num_zones * zone_size;
221     NvmeZone *zone;
222     int i;
223 
224     ns->zone_array = g_new0(NvmeZone, ns->num_zones);
225     if (ns->params.zd_extension_size) {
226         ns->zd_extensions = g_malloc0(ns->params.zd_extension_size *
227                                       ns->num_zones);
228     }
229 
230     QTAILQ_INIT(&ns->exp_open_zones);
231     QTAILQ_INIT(&ns->imp_open_zones);
232     QTAILQ_INIT(&ns->closed_zones);
233     QTAILQ_INIT(&ns->full_zones);
234 
235     zone = ns->zone_array;
236     for (i = 0; i < ns->num_zones; i++, zone++) {
237         if (start + zone_size > capacity) {
238             zone_size = capacity - start;
239         }
240         zone->d.zt = NVME_ZONE_TYPE_SEQ_WRITE;
241         nvme_set_zone_state(zone, NVME_ZONE_STATE_EMPTY);
242         zone->d.za = 0;
243         zone->d.zcap = ns->zone_capacity;
244         zone->d.zslba = start;
245         zone->d.wp = start;
246         zone->w_ptr = start;
247         start += zone_size;
248     }
249 
250     ns->zone_size_log2 = 0;
251     if (is_power_of_2(ns->zone_size)) {
252         ns->zone_size_log2 = 63 - clz64(ns->zone_size);
253     }
254 }
255 
256 static void nvme_ns_init_zoned(NvmeNamespace *ns)
257 {
258     NvmeIdNsZoned *id_ns_z;
259     int i;
260 
261     nvme_ns_zoned_init_state(ns);
262 
263     id_ns_z = g_malloc0(sizeof(NvmeIdNsZoned));
264 
265     /* MAR/MOR are zeroes-based, FFFFFFFFFh means no limit */
266     id_ns_z->mar = cpu_to_le32(ns->params.max_active_zones - 1);
267     id_ns_z->mor = cpu_to_le32(ns->params.max_open_zones - 1);
268     id_ns_z->zoc = 0;
269     id_ns_z->ozcs = ns->params.cross_zone_read ? 0x01 : 0x00;
270 
271     for (i = 0; i <= ns->id_ns.nlbaf; i++) {
272         id_ns_z->lbafe[i].zsze = cpu_to_le64(ns->zone_size);
273         id_ns_z->lbafe[i].zdes =
274             ns->params.zd_extension_size >> 6; /* Units of 64B */
275     }
276 
277     ns->csi = NVME_CSI_ZONED;
278     ns->id_ns.nsze = cpu_to_le64(ns->num_zones * ns->zone_size);
279     ns->id_ns.ncap = ns->id_ns.nsze;
280     ns->id_ns.nuse = ns->id_ns.ncap;
281 
282     /*
283      * The device uses the BDRV_BLOCK_ZERO flag to determine the "deallocated"
284      * status of logical blocks. Since the spec defines that logical blocks
285      * SHALL be deallocated when then zone is in the Empty or Offline states,
286      * we can only support DULBE if the zone size is a multiple of the
287      * calculated NPDG.
288      */
289     if (ns->zone_size % (ns->id_ns.npdg + 1)) {
290         warn_report("the zone size (%"PRIu64" blocks) is not a multiple of "
291                     "the calculated deallocation granularity (%d blocks); "
292                     "DULBE support disabled",
293                     ns->zone_size, ns->id_ns.npdg + 1);
294 
295         ns->id_ns.nsfeat &= ~0x4;
296     }
297 
298     ns->id_ns_zoned = id_ns_z;
299 }
300 
301 static void nvme_clear_zone(NvmeNamespace *ns, NvmeZone *zone)
302 {
303     uint8_t state;
304 
305     zone->w_ptr = zone->d.wp;
306     state = nvme_get_zone_state(zone);
307     if (zone->d.wp != zone->d.zslba ||
308         (zone->d.za & NVME_ZA_ZD_EXT_VALID)) {
309         if (state != NVME_ZONE_STATE_CLOSED) {
310             trace_pci_nvme_clear_ns_close(state, zone->d.zslba);
311             nvme_set_zone_state(zone, NVME_ZONE_STATE_CLOSED);
312         }
313         nvme_aor_inc_active(ns);
314         QTAILQ_INSERT_HEAD(&ns->closed_zones, zone, entry);
315     } else {
316         trace_pci_nvme_clear_ns_reset(state, zone->d.zslba);
317         nvme_set_zone_state(zone, NVME_ZONE_STATE_EMPTY);
318     }
319 }
320 
321 /*
322  * Close all the zones that are currently open.
323  */
324 static void nvme_zoned_ns_shutdown(NvmeNamespace *ns)
325 {
326     NvmeZone *zone, *next;
327 
328     QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) {
329         QTAILQ_REMOVE(&ns->closed_zones, zone, entry);
330         nvme_aor_dec_active(ns);
331         nvme_clear_zone(ns, zone);
332     }
333     QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) {
334         QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
335         nvme_aor_dec_open(ns);
336         nvme_aor_dec_active(ns);
337         nvme_clear_zone(ns, zone);
338     }
339     QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) {
340         QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry);
341         nvme_aor_dec_open(ns);
342         nvme_aor_dec_active(ns);
343         nvme_clear_zone(ns, zone);
344     }
345 
346     assert(ns->nr_open_zones == 0);
347 }
348 
349 static int nvme_ns_check_constraints(NvmeCtrl *n, NvmeNamespace *ns,
350                                      Error **errp)
351 {
352     if (!ns->blkconf.blk) {
353         error_setg(errp, "block backend not configured");
354         return -1;
355     }
356 
357     if (ns->params.pi && ns->params.ms < 8) {
358         error_setg(errp, "at least 8 bytes of metadata required to enable "
359                    "protection information");
360         return -1;
361     }
362 
363     if (ns->params.nsid > NVME_MAX_NAMESPACES) {
364         error_setg(errp, "invalid namespace id (must be between 0 and %d)",
365                    NVME_MAX_NAMESPACES);
366         return -1;
367     }
368 
369     if (!n->subsys) {
370         if (ns->params.detached) {
371             error_setg(errp, "detached requires that the nvme device is "
372                        "linked to an nvme-subsys device");
373             return -1;
374         }
375 
376         if (ns->params.shared) {
377             error_setg(errp, "shared requires that the nvme device is "
378                        "linked to an nvme-subsys device");
379             return -1;
380         }
381     }
382 
383     if (ns->params.zoned) {
384         if (ns->params.max_active_zones) {
385             if (ns->params.max_open_zones > ns->params.max_active_zones) {
386                 error_setg(errp, "max_open_zones (%u) exceeds "
387                            "max_active_zones (%u)", ns->params.max_open_zones,
388                            ns->params.max_active_zones);
389                 return -1;
390             }
391 
392             if (!ns->params.max_open_zones) {
393                 ns->params.max_open_zones = ns->params.max_active_zones;
394             }
395         }
396 
397         if (ns->params.zd_extension_size) {
398             if (ns->params.zd_extension_size & 0x3f) {
399                 error_setg(errp, "zone descriptor extension size must be a "
400                            "multiple of 64B");
401                 return -1;
402             }
403             if ((ns->params.zd_extension_size >> 6) > 0xff) {
404                 error_setg(errp,
405                            "zone descriptor extension size is too large");
406                 return -1;
407             }
408         }
409     }
410 
411     return 0;
412 }
413 
414 int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
415 {
416     if (nvme_ns_check_constraints(n, ns, errp)) {
417         return -1;
418     }
419 
420     if (nvme_ns_init_blk(ns, errp)) {
421         return -1;
422     }
423 
424     if (nvme_ns_init(ns, errp)) {
425         return -1;
426     }
427     if (ns->params.zoned) {
428         if (nvme_ns_zoned_check_calc_geometry(ns, errp) != 0) {
429             return -1;
430         }
431         nvme_ns_init_zoned(ns);
432     }
433 
434     return 0;
435 }
436 
/*
 * Call blk_drain() on the block backend of the namespace (presumably this
 * waits for in-flight requests to complete; see the block layer for the
 * exact semantics).
 */
void nvme_ns_drain(NvmeNamespace *ns)
{
    blk_drain(ns->blkconf.blk);
}
441 
442 void nvme_ns_shutdown(NvmeNamespace *ns)
443 {
444     blk_flush(ns->blkconf.blk);
445     if (ns->params.zoned) {
446         nvme_zoned_ns_shutdown(ns);
447     }
448 }
449 
450 void nvme_ns_cleanup(NvmeNamespace *ns)
451 {
452     if (ns->params.zoned) {
453         g_free(ns->id_ns_zoned);
454         g_free(ns->zone_array);
455         g_free(ns->zd_extensions);
456     }
457 }
458 
459 static void nvme_ns_realize(DeviceState *dev, Error **errp)
460 {
461     NvmeNamespace *ns = NVME_NS(dev);
462     BusState *s = qdev_get_parent_bus(dev);
463     NvmeCtrl *n = NVME(s->parent);
464     NvmeSubsystem *subsys = n->subsys;
465     uint32_t nsid = ns->params.nsid;
466     int i;
467 
468     if (nvme_ns_setup(n, ns, errp)) {
469         return;
470     }
471 
472     if (!nsid) {
473         for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
474             if (nvme_ns(n, i) || nvme_subsys_ns(subsys, i)) {
475                 continue;
476             }
477 
478             nsid = ns->params.nsid = i;
479             break;
480         }
481 
482         if (!nsid) {
483             error_setg(errp, "no free namespace id");
484             return;
485         }
486     } else {
487         if (nvme_ns(n, nsid) || nvme_subsys_ns(subsys, nsid)) {
488             error_setg(errp, "namespace id '%d' already allocated", nsid);
489             return;
490         }
491     }
492 
493     if (subsys) {
494         subsys->namespaces[nsid] = ns;
495 
496         if (ns->params.detached) {
497             return;
498         }
499 
500         if (ns->params.shared) {
501             for (i = 0; i < ARRAY_SIZE(subsys->ctrls); i++) {
502                 NvmeCtrl *ctrl = subsys->ctrls[i];
503 
504                 if (ctrl) {
505                     nvme_attach_ns(ctrl, ns);
506                 }
507             }
508 
509             return;
510         }
511     }
512 
513     nvme_attach_ns(n, ns);
514 }
515 
/*
 * qdev properties of the nvme-ns device. Each entry maps a property name
 * onto a field of NvmeNamespace (mostly in ns->params) and gives its
 * default. The "zoned.*" properties are only validated and used when
 * "zoned" is set.
 */
static Property nvme_ns_props[] = {
    DEFINE_BLOCK_PROPERTIES(NvmeNamespace, blkconf),
    DEFINE_PROP_BOOL("detached", NvmeNamespace, params.detached, false),
    DEFINE_PROP_BOOL("shared", NvmeNamespace, params.shared, false),
    /* 0 lets realize pick the first free namespace id */
    DEFINE_PROP_UINT32("nsid", NvmeNamespace, params.nsid, 0),
    DEFINE_PROP_UUID("uuid", NvmeNamespace, params.uuid),
    /* 0 means "not set"; see "eui64-default" below */
    DEFINE_PROP_UINT64("eui64", NvmeNamespace, params.eui64, 0),
    DEFINE_PROP_UINT16("ms", NvmeNamespace, params.ms, 0),
    DEFINE_PROP_UINT8("mset", NvmeNamespace, params.mset, 0),
    DEFINE_PROP_UINT8("pi", NvmeNamespace, params.pi, 0),
    DEFINE_PROP_UINT8("pil", NvmeNamespace, params.pil, 0),
    DEFINE_PROP_UINT16("mssrl", NvmeNamespace, params.mssrl, 128),
    DEFINE_PROP_UINT32("mcl", NvmeNamespace, params.mcl, 128),
    DEFINE_PROP_UINT8("msrc", NvmeNamespace, params.msrc, 127),
    DEFINE_PROP_BOOL("zoned", NvmeNamespace, params.zoned, false),
    /* zone size and capacity are given in bytes */
    DEFINE_PROP_SIZE("zoned.zone_size", NvmeNamespace, params.zone_size_bs,
                     NVME_DEFAULT_ZONE_SIZE),
    /* 0 makes the zone capacity equal to the zone size */
    DEFINE_PROP_SIZE("zoned.zone_capacity", NvmeNamespace, params.zone_cap_bs,
                     0),
    DEFINE_PROP_BOOL("zoned.cross_read", NvmeNamespace,
                     params.cross_zone_read, false),
    /* 0 is reported to the host as "no limit" for both zone limits */
    DEFINE_PROP_UINT32("zoned.max_active", NvmeNamespace,
                       params.max_active_zones, 0),
    DEFINE_PROP_UINT32("zoned.max_open", NvmeNamespace,
                       params.max_open_zones, 0),
    /* in bytes; must be a multiple of 64 */
    DEFINE_PROP_UINT32("zoned.descr_ext_size", NvmeNamespace,
                       params.zd_extension_size, 0),
    /* generate an EUI-64 when "eui64" is left at 0 */
    DEFINE_PROP_BOOL("eui64-default", NvmeNamespace, params.eui64_default,
                     true),
    DEFINE_PROP_END_OF_LIST(),
};
547 
548 static void nvme_ns_class_init(ObjectClass *oc, void *data)
549 {
550     DeviceClass *dc = DEVICE_CLASS(oc);
551 
552     set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
553 
554     dc->bus_type = TYPE_NVME_BUS;
555     dc->realize = nvme_ns_realize;
556     device_class_set_props(dc, nvme_ns_props);
557     dc->desc = "Virtual NVMe namespace";
558 }
559 
560 static void nvme_ns_instance_init(Object *obj)
561 {
562     NvmeNamespace *ns = NVME_NS(obj);
563     char *bootindex = g_strdup_printf("/namespace@%d,0", ns->params.nsid);
564 
565     device_add_bootindex_property(obj, &ns->bootindex, "bootindex",
566                                   bootindex, DEVICE(obj));
567 
568     g_free(bootindex);
569 }
570 
/*
 * Type description of the nvme-ns device: a TYPE_DEVICE subclass whose
 * instances are NvmeNamespace structures.
 */
static const TypeInfo nvme_ns_info = {
    .name = TYPE_NVME_NS,
    .parent = TYPE_DEVICE,
    .class_init = nvme_ns_class_init,
    .instance_size = sizeof(NvmeNamespace),
    .instance_init = nvme_ns_instance_init,
};
578 
/* Register the nvme-ns device type. */
static void nvme_ns_register_types(void)
{
    type_register_static(&nvme_ns_info);
}

/* Hand the registration function to QEMU's type_init() hook. */
type_init(nvme_ns_register_types)
585