xref: /openbmc/qemu/hw/nvme/ctrl.c (revision 248f9209edfd289e7d97fb323e5075ccd55cc157)
1  /*
2   * QEMU NVM Express Controller
3   *
4   * Copyright (c) 2012, Intel Corporation
5   *
6   * Written by Keith Busch <keith.busch@intel.com>
7   *
8   * This code is licensed under the GNU GPL v2 or later.
9   */
10  
11  /**
12   * Reference Specs: http://www.nvmexpress.org, 1.4, 1.3, 1.2, 1.1, 1.0e
13   *
14   *  https://nvmexpress.org/developers/nvme-specification/
15   *
16   *
17   * Notes on coding style
18   * ---------------------
19   * While QEMU coding style prefers lowercase hexadecimals in constants, the
20   * NVMe subsystem uses the format from the NVMe specifications in comments
21   * (i.e. an 'h' suffix instead of a '0x' prefix).
22   *
23   * Usage
24   * -----
25   * See docs/system/nvme.rst for extensive documentation.
26   *
27   * Add options:
28   *      -drive file=<file>,if=none,id=<drive_id>
29   *      -device nvme-subsys,id=<subsys_id>,nqn=<nqn_id>
30   *      -device nvme,serial=<serial>,id=<bus_name>, \
31   *              cmb_size_mb=<cmb_size_mb[optional]>, \
32   *              [pmrdev=<mem_backend_file_id>,] \
33   *              max_ioqpairs=<N[optional]>, \
34   *              aerl=<N[optional]>,aer_max_queued=<N[optional]>, \
35   *              mdts=<N[optional]>,vsl=<N[optional]>, \
36   *              zoned.zasl=<N[optional]>, \
37   *              zoned.auto_transition=<on|off[optional]>, \
38   *              sriov_max_vfs=<N[optional]> \
39   *              sriov_vq_flexible=<N[optional]> \
40   *              sriov_vi_flexible=<N[optional]> \
41   *              sriov_max_vi_per_vf=<N[optional]> \
42   *              sriov_max_vq_per_vf=<N[optional]> \
43   *              atomic.dn=<on|off[optional]>, \
44   *              atomic.awun=<N[optional]>, \
45   *              atomic.awupf=<N[optional]>, \
46   *              subsys=<subsys_id>
47   *      -device nvme-ns,drive=<drive_id>,bus=<bus_name>,nsid=<nsid>,\
48   *              zoned=<true|false[optional]>, \
49   *              subsys=<subsys_id>,shared=<true|false[optional]>, \
50   *              detached=<true|false[optional]>, \
51   *              zoned.zone_size=<N[optional]>, \
52   *              zoned.zone_capacity=<N[optional]>, \
53   *              zoned.descr_ext_size=<N[optional]>, \
54   *              zoned.max_active=<N[optional]>, \
55   *              zoned.max_open=<N[optional]>, \
56   *              zoned.cross_read=<true|false[optional]>
57   *
58   * Note that cmb_size_mb denotes the size of the CMB in MB. The CMB is assumed
59   * to be at offset 0 in BAR2 and supports only WDS, RDS and SQS for now. By
60   * default, the device will use the "v1.4 CMB scheme" - use the `legacy-cmb`
61   * parameter to always enable the CMBLOC and CMBSZ registers (v1.3 behavior).
62   *
63   * PMR emulation can be enabled by pointing the `pmrdev` parameter to a
64   * memory-backend-file object. For example:
65   * -object memory-backend-file,id=<mem_id>,share=on,mem-path=<file_path>, \
66   *  size=<size> .... -device nvme,...,pmrdev=<mem_id>
67   *
68   * The PMR will use BAR 4/5 exclusively.
69   *
70   * To place controller(s) and namespace(s) in a subsystem, provide the
71   * nvme-subsys device as shown above.
72   *
73   * nvme subsystem device parameters
74   * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
75   * - `nqn`
76   *   This parameter provides the `<nqn_id>` part of the string
77   *   `nqn.2019-08.org.qemu:<nqn_id>` which will be reported in the SUBNQN field
78   *   of subsystem controllers. Note that `<nqn_id>` should be unique per
79   *   subsystem, but this is not enforced by QEMU. If not specified, it will
80   *   default to the value of the `id` parameter (`<subsys_id>`).
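 *
 *   For example, a (hypothetical) `-device nvme-subsys,id=subsys0,nqn=mysubsys`
 *   would result in subsystem controllers reporting a SUBNQN of
 *   `nqn.2019-08.org.qemu:mysubsys`.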
81   *
82   * nvme device parameters
83   * ~~~~~~~~~~~~~~~~~~~~~~
84   * - `subsys`
85   *   Specifying this parameter attaches the controller to the subsystem and
86   *   the SUBNQN field in the controller will report the NQN of the subsystem
87   *   device. This also enables the multi-controller capability, represented by
88   *   the CMIC (Controller Multi-path I/O and Namespace Sharing Capabilities)
89   *   field of the Identify Controller data structure.
90   *
91   * - `aerl`
92   *   The Asynchronous Event Request Limit (AERL). Indicates the maximum number
93   *   of concurrently outstanding Asynchronous Event Request commands supported
94   *   by the controller. This is a 0's based value.
95   *
96   * - `aer_max_queued`
97   *   This is the maximum number of events that the device will enqueue for
98   *   completion when there are no outstanding AERs. When the maximum number of
99   *   enqueued events is reached, subsequent events will be dropped.
100   *
101   * - `mdts`
102   *   Indicates the maximum data transfer size for a command that transfers data
103   *   between host-accessible memory and the controller. The value is specified
104   *   as a power of two (2^n) and is in units of the minimum memory page size
105   *   (CAP.MPSMIN). The default value is 7 (i.e. 512 KiB).
106   *
107   * - `vsl`
108   *   Indicates the maximum data size limit for the Verify command. Like `mdts`,
109   *   this value is specified as a power of two (2^n) and is in units of the
110   *   minimum memory page size (CAP.MPSMIN). The default value is 7 (i.e. 512
111   *   KiB).
112   *
113   * - `zoned.zasl`
114   *   Indicates the maximum data transfer size for the Zone Append command. Like
115   *   `mdts`, the value is specified as a power of two (2^n) and is in units of
116   *   the minimum memory page size (CAP.MPSMIN). The default value is 0 (i.e.
117   *   defaulting to the value of `mdts`).
118   *
119   * - `zoned.auto_transition`
120   *   Indicates whether zones in the Implicitly Opened state may be
121   *   automatically transitioned to the Closed state for resource management
122   *   purposes. Defaults to 'on'.
123   *
124   * - `sriov_max_vfs`
125   *   Indicates the maximum number of PCIe virtual functions supported
126   *   by the controller. The default value is 0. Specifying a non-zero value
127   *   enables reporting of both SR-IOV and ARI capabilities by the NVMe device.
128   *   Virtual function controllers will not report SR-IOV capability.
129   *
130   *   NOTE: Single Root I/O Virtualization support is experimental.
131   *   All the related parameters may be subject to change.
132   *
133   * - `sriov_vq_flexible`
134   *   Indicates the total number of flexible queue resources assignable to all
135   *   the secondary controllers. Implicitly sets the number of the primary
136   *   controller's private resources to `(max_ioqpairs - sriov_vq_flexible)`.
137   *
138   * - `sriov_vi_flexible`
139   *   Indicates the total number of flexible interrupt resources assignable to
140   *   all the secondary controllers. Implicitly sets the number of the primary
141   *   controller's private resources to `(msix_qsize - sriov_vi_flexible)`.
142   *
143   * - `sriov_max_vi_per_vf`
144   *   Indicates the maximum number of virtual interrupt resources assignable
145   *   to a secondary controller. The default 0 resolves to
146   *   `(sriov_vi_flexible / sriov_max_vfs)`.
147   *
148   * - `sriov_max_vq_per_vf`
149   *   Indicates the maximum number of virtual queue resources assignable to
150   *   a secondary controller. The default 0 resolves to
151   *   `(sriov_vq_flexible / sriov_max_vfs)`.
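 *
 *   As an illustrative sketch (all values hypothetical), a primary controller
 *   that shares flexible resources with up to two secondary controllers could
 *   be configured as:
 *
 *      -device nvme,serial=deadbeef,max_ioqpairs=10,msix_qsize=10, \
 *              sriov_max_vfs=2,sriov_vq_flexible=4,sriov_vi_flexible=4
 *
 *   This leaves the primary controller with (10 - 4) = 6 private queue
 *   resources and (10 - 4) = 6 private interrupt resources, while each
 *   secondary controller may be assigned up to 4 / 2 = 2 flexible queue and
 *   2 flexible interrupt resources by default.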
152   *
153   * nvme namespace device parameters
154   * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
155   * - `shared`
156   *   When the parent nvme device (as defined explicitly by the 'bus' parameter
157   *   or implicitly by the most recently defined NvmeBus) is linked to an
158   *   nvme-subsys device, the namespace will be attached to all controllers in
159   *   the subsystem. If set to 'off' (the default), the namespace will remain a
160   *   private namespace and may only be attached to a single controller at a
161   *   time.
162   *
163   * - `detached`
164   *   This parameter is only valid together with the `subsys` parameter. If left
165   *   at the default value (`false/off`), the namespace will be attached to all
166   *   controllers in the NVMe subsystem at boot-up. If set to `true/on`, the
167   *   namespace will be available in the subsystem but not attached to any
168   *   controllers.
169   *
170   * Setting `zoned` to true selects the Zoned Command Set for the namespace.
171   * In this case, the following namespace properties are available to configure
172   * zoned operation (an illustrative example follows the list):
173   *     zoned.zone_size=<zone size in bytes, default: 128MiB>
174   *         The number may be followed by K, M, G as in kilo-, mega- or giga-.
175   *
176   *     zoned.zone_capacity=<zone capacity in bytes, default: zone size>
177   *         The value 0 (default) forces zone capacity to be the same as zone
178   *         size. The value of this property may not exceed zone size.
179   *
180   *     zoned.descr_ext_size=<zone descriptor extension size, default 0>
181   *         This value needs to be specified in 64B units. If it is zero,
182   *         namespace(s) will not support zone descriptor extensions.
183   *
184   *     zoned.max_active=<Maximum Active Resources (zones), default: 0>
185   *         The default value means there is no limit to the number of
186   *         concurrently active zones.
187   *
188   *     zoned.max_open=<Maximum Open Resources (zones), default: 0>
189   *         The default value means there is no limit to the number of
190   *         concurrently open zones.
191   *
192   *     zoned.cross_read=<enable RAZB, default: false>
193   *         Setting this property to true enables Read Across Zone Boundaries.
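 *
 *     As an illustrative example (all values hypothetical), a zoned namespace
 *     with 64 MiB zones, a 48 MiB zone capacity and open/active limits could
 *     be configured as:
 *
 *     -device nvme-ns,drive=<drive_id>,zoned=true, \
 *             zoned.zone_size=64M,zoned.zone_capacity=48M, \
 *             zoned.max_open=8,zoned.max_active=16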
194   */
195  
196  #include "qemu/osdep.h"
197  #include "qemu/cutils.h"
198  #include "qemu/error-report.h"
199  #include "qemu/log.h"
200  #include "qemu/units.h"
201  #include "qemu/range.h"
202  #include "qapi/error.h"
203  #include "qapi/visitor.h"
204  #include "sysemu/sysemu.h"
205  #include "sysemu/block-backend.h"
206  #include "sysemu/hostmem.h"
207  #include "hw/pci/msix.h"
208  #include "hw/pci/pcie_sriov.h"
209  #include "sysemu/spdm-socket.h"
210  #include "migration/vmstate.h"
211  
212  #include "nvme.h"
213  #include "dif.h"
214  #include "trace.h"
215  
216  #define NVME_MAX_IOQPAIRS 0xffff
217  #define NVME_DB_SIZE  4
218  #define NVME_SPEC_VER 0x00010400
219  #define NVME_CMB_BIR 2
220  #define NVME_PMR_BIR 4
221  #define NVME_TEMPERATURE 0x143
222  #define NVME_TEMPERATURE_WARNING 0x157
223  #define NVME_TEMPERATURE_CRITICAL 0x175
224  #define NVME_NUM_FW_SLOTS 1
225  #define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)
226  #define NVME_VF_RES_GRANULARITY 1
227  #define NVME_VF_OFFSET 0x1
228  #define NVME_VF_STRIDE 1
229  
230  #define NVME_GUEST_ERR(trace, fmt, ...) \
231      do { \
232          (trace_##trace)(__VA_ARGS__); \
233          qemu_log_mask(LOG_GUEST_ERROR, #trace \
234              " in %s: " fmt "\n", __func__, ## __VA_ARGS__); \
235      } while (0)
236  
237  static const bool nvme_feature_support[NVME_FID_MAX] = {
238      [NVME_ARBITRATION]              = true,
239      [NVME_POWER_MANAGEMENT]         = true,
240      [NVME_TEMPERATURE_THRESHOLD]    = true,
241      [NVME_ERROR_RECOVERY]           = true,
242      [NVME_VOLATILE_WRITE_CACHE]     = true,
243      [NVME_NUMBER_OF_QUEUES]         = true,
244      [NVME_INTERRUPT_COALESCING]     = true,
245      [NVME_INTERRUPT_VECTOR_CONF]    = true,
246      [NVME_WRITE_ATOMICITY]          = true,
247      [NVME_ASYNCHRONOUS_EVENT_CONF]  = true,
248      [NVME_TIMESTAMP]                = true,
249      [NVME_HOST_BEHAVIOR_SUPPORT]    = true,
250      [NVME_COMMAND_SET_PROFILE]      = true,
251      [NVME_FDP_MODE]                 = true,
252      [NVME_FDP_EVENTS]               = true,
253  };
254  
255  static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
256      [NVME_TEMPERATURE_THRESHOLD]    = NVME_FEAT_CAP_CHANGE,
257      [NVME_ERROR_RECOVERY]           = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
258      [NVME_VOLATILE_WRITE_CACHE]     = NVME_FEAT_CAP_CHANGE,
259      [NVME_NUMBER_OF_QUEUES]         = NVME_FEAT_CAP_CHANGE,
260      [NVME_WRITE_ATOMICITY]          = NVME_FEAT_CAP_CHANGE,
261      [NVME_ASYNCHRONOUS_EVENT_CONF]  = NVME_FEAT_CAP_CHANGE,
262      [NVME_TIMESTAMP]                = NVME_FEAT_CAP_CHANGE,
263      [NVME_HOST_BEHAVIOR_SUPPORT]    = NVME_FEAT_CAP_CHANGE,
264      [NVME_COMMAND_SET_PROFILE]      = NVME_FEAT_CAP_CHANGE,
265      [NVME_FDP_MODE]                 = NVME_FEAT_CAP_CHANGE,
266      [NVME_FDP_EVENTS]               = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
267  };
268  
269  static const uint32_t nvme_cse_acs[256] = {
270      [NVME_ADM_CMD_DELETE_SQ]        = NVME_CMD_EFF_CSUPP,
271      [NVME_ADM_CMD_CREATE_SQ]        = NVME_CMD_EFF_CSUPP,
272      [NVME_ADM_CMD_GET_LOG_PAGE]     = NVME_CMD_EFF_CSUPP,
273      [NVME_ADM_CMD_DELETE_CQ]        = NVME_CMD_EFF_CSUPP,
274      [NVME_ADM_CMD_CREATE_CQ]        = NVME_CMD_EFF_CSUPP,
275      [NVME_ADM_CMD_IDENTIFY]         = NVME_CMD_EFF_CSUPP,
276      [NVME_ADM_CMD_ABORT]            = NVME_CMD_EFF_CSUPP,
277      [NVME_ADM_CMD_SET_FEATURES]     = NVME_CMD_EFF_CSUPP,
278      [NVME_ADM_CMD_GET_FEATURES]     = NVME_CMD_EFF_CSUPP,
279      [NVME_ADM_CMD_ASYNC_EV_REQ]     = NVME_CMD_EFF_CSUPP,
280      [NVME_ADM_CMD_NS_ATTACHMENT]    = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC,
281      [NVME_ADM_CMD_VIRT_MNGMT]       = NVME_CMD_EFF_CSUPP,
282      [NVME_ADM_CMD_DBBUF_CONFIG]     = NVME_CMD_EFF_CSUPP,
283      [NVME_ADM_CMD_FORMAT_NVM]       = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
284      [NVME_ADM_CMD_DIRECTIVE_RECV]   = NVME_CMD_EFF_CSUPP,
285      [NVME_ADM_CMD_DIRECTIVE_SEND]   = NVME_CMD_EFF_CSUPP,
286  };
287  
288  static const uint32_t nvme_cse_iocs_none[256];
289  
290  static const uint32_t nvme_cse_iocs_nvm[256] = {
291      [NVME_CMD_FLUSH]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
292      [NVME_CMD_WRITE_ZEROES]         = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
293      [NVME_CMD_WRITE]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
294      [NVME_CMD_READ]                 = NVME_CMD_EFF_CSUPP,
295      [NVME_CMD_DSM]                  = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
296      [NVME_CMD_VERIFY]               = NVME_CMD_EFF_CSUPP,
297      [NVME_CMD_COPY]                 = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
298      [NVME_CMD_COMPARE]              = NVME_CMD_EFF_CSUPP,
299      [NVME_CMD_IO_MGMT_RECV]         = NVME_CMD_EFF_CSUPP,
300      [NVME_CMD_IO_MGMT_SEND]         = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
301  };
302  
303  static const uint32_t nvme_cse_iocs_zoned[256] = {
304      [NVME_CMD_FLUSH]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
305      [NVME_CMD_WRITE_ZEROES]         = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
306      [NVME_CMD_WRITE]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
307      [NVME_CMD_READ]                 = NVME_CMD_EFF_CSUPP,
308      [NVME_CMD_DSM]                  = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
309      [NVME_CMD_VERIFY]               = NVME_CMD_EFF_CSUPP,
310      [NVME_CMD_COPY]                 = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
311      [NVME_CMD_COMPARE]              = NVME_CMD_EFF_CSUPP,
312      [NVME_CMD_ZONE_APPEND]          = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
313      [NVME_CMD_ZONE_MGMT_SEND]       = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
314      [NVME_CMD_ZONE_MGMT_RECV]       = NVME_CMD_EFF_CSUPP,
315  };
316  
317  static void nvme_process_sq(void *opaque);
318  static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst);
319  static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n);
320  
321  static uint16_t nvme_sqid(NvmeRequest *req)
322  {
323      return le16_to_cpu(req->sq->sqid);
324  }
325  
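/*
 * A placement identifier packs the reclaim group into the upper `rgif` bits
 * and the placement handle into the remaining lower bits; e.g. with rgif = 4,
 * rg = 2 and ph = 3, the resulting pid is (2 << 12) | 3 = 0x2003.
 */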
326  static inline uint16_t nvme_make_pid(NvmeNamespace *ns, uint16_t rg,
327                                       uint16_t ph)
328  {
329      uint16_t rgif = ns->endgrp->fdp.rgif;
330  
331      if (!rgif) {
332          return ph;
333      }
334  
335      return (rg << (16 - rgif)) | ph;
336  }
337  
338  static inline bool nvme_ph_valid(NvmeNamespace *ns, uint16_t ph)
339  {
340      return ph < ns->fdp.nphs;
341  }
342  
343  static inline bool nvme_rg_valid(NvmeEnduranceGroup *endgrp, uint16_t rg)
344  {
345      return rg < endgrp->fdp.nrg;
346  }
347  
348  static inline uint16_t nvme_pid2ph(NvmeNamespace *ns, uint16_t pid)
349  {
350      uint16_t rgif = ns->endgrp->fdp.rgif;
351  
352      if (!rgif) {
353          return pid;
354      }
355  
356      return pid & ((1 << (15 - rgif)) - 1);
357  }
358  
359  static inline uint16_t nvme_pid2rg(NvmeNamespace *ns, uint16_t pid)
360  {
361      uint16_t rgif = ns->endgrp->fdp.rgif;
362  
363      if (!rgif) {
364          return 0;
365      }
366  
367      return pid >> (16 - rgif);
368  }
369  
370  static inline bool nvme_parse_pid(NvmeNamespace *ns, uint16_t pid,
371                                    uint16_t *ph, uint16_t *rg)
372  {
373      *rg = nvme_pid2rg(ns, pid);
374      *ph = nvme_pid2ph(ns, pid);
375  
376      return nvme_ph_valid(ns, *ph) && nvme_rg_valid(ns->endgrp, *rg);
377  }
378  
379  static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone,
380                                     NvmeZoneState state)
381  {
382      if (QTAILQ_IN_USE(zone, entry)) {
383          switch (nvme_get_zone_state(zone)) {
384          case NVME_ZONE_STATE_EXPLICITLY_OPEN:
385              QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry);
386              break;
387          case NVME_ZONE_STATE_IMPLICITLY_OPEN:
388              QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
389              break;
390          case NVME_ZONE_STATE_CLOSED:
391              QTAILQ_REMOVE(&ns->closed_zones, zone, entry);
392              break;
393          case NVME_ZONE_STATE_FULL:
394              QTAILQ_REMOVE(&ns->full_zones, zone, entry);
395          default:
396              ;
397          }
398      }
399  
400      nvme_set_zone_state(zone, state);
401  
402      switch (state) {
403      case NVME_ZONE_STATE_EXPLICITLY_OPEN:
404          QTAILQ_INSERT_TAIL(&ns->exp_open_zones, zone, entry);
405          break;
406      case NVME_ZONE_STATE_IMPLICITLY_OPEN:
407          QTAILQ_INSERT_TAIL(&ns->imp_open_zones, zone, entry);
408          break;
409      case NVME_ZONE_STATE_CLOSED:
410          QTAILQ_INSERT_TAIL(&ns->closed_zones, zone, entry);
411          break;
412      case NVME_ZONE_STATE_FULL:
413          QTAILQ_INSERT_TAIL(&ns->full_zones, zone, entry);
414      case NVME_ZONE_STATE_READ_ONLY:
415          break;
416      default:
417          zone->d.za = 0;
418      }
419  }
420  
421  static uint16_t nvme_zns_check_resources(NvmeNamespace *ns, uint32_t act,
422                                           uint32_t opn, uint32_t zrwa)
423  {
424      if (ns->params.max_active_zones != 0 &&
425          ns->nr_active_zones + act > ns->params.max_active_zones) {
426          trace_pci_nvme_err_insuff_active_res(ns->params.max_active_zones);
427          return NVME_ZONE_TOO_MANY_ACTIVE | NVME_DNR;
428      }
429  
430      if (ns->params.max_open_zones != 0 &&
431          ns->nr_open_zones + opn > ns->params.max_open_zones) {
432          trace_pci_nvme_err_insuff_open_res(ns->params.max_open_zones);
433          return NVME_ZONE_TOO_MANY_OPEN | NVME_DNR;
434      }
435  
436      if (zrwa > ns->zns.numzrwa) {
437          return NVME_NOZRWA | NVME_DNR;
438      }
439  
440      return NVME_SUCCESS;
441  }
442  
443  /*
444   * Check if we can open a zone without exceeding open/active limits.
445   * AOR stands for "Active and Open Resources" (see TP 4053 section 2.5).
446   */
447  static uint16_t nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn)
448  {
449      return nvme_zns_check_resources(ns, act, opn, 0);
450  }
451  
452  static NvmeFdpEvent *nvme_fdp_alloc_event(NvmeCtrl *n, NvmeFdpEventBuffer *ebuf)
453  {
454      NvmeFdpEvent *ret = NULL;
455      bool is_full = ebuf->next == ebuf->start && ebuf->nelems;
456  
457      ret = &ebuf->events[ebuf->next++];
458      if (unlikely(ebuf->next == NVME_FDP_MAX_EVENTS)) {
459          ebuf->next = 0;
460      }
461      if (is_full) {
462          ebuf->start = ebuf->next;
463      } else {
464          ebuf->nelems++;
465      }
466  
467      memset(ret, 0, sizeof(NvmeFdpEvent));
468      ret->timestamp = nvme_get_timestamp(n);
469  
470      return ret;
471  }
472  
473  static inline int log_event(NvmeRuHandle *ruh, uint8_t event_type)
474  {
475      return (ruh->event_filter >> nvme_fdp_evf_shifts[event_type]) & 0x1;
476  }
477  
478  static bool nvme_update_ruh(NvmeCtrl *n, NvmeNamespace *ns, uint16_t pid)
479  {
480      NvmeEnduranceGroup *endgrp = ns->endgrp;
481      NvmeRuHandle *ruh;
482      NvmeReclaimUnit *ru;
483      NvmeFdpEvent *e = NULL;
484      uint16_t ph, rg, ruhid;
485  
486      if (!nvme_parse_pid(ns, pid, &ph, &rg)) {
487          return false;
488      }
489  
490      ruhid = ns->fdp.phs[ph];
491  
492      ruh = &endgrp->fdp.ruhs[ruhid];
493      ru = &ruh->rus[rg];
494  
495      if (ru->ruamw) {
496          if (log_event(ruh, FDP_EVT_RU_NOT_FULLY_WRITTEN)) {
497              e = nvme_fdp_alloc_event(n, &endgrp->fdp.host_events);
498              e->type = FDP_EVT_RU_NOT_FULLY_WRITTEN;
499              e->flags = FDPEF_PIV | FDPEF_NSIDV | FDPEF_LV;
500              e->pid = cpu_to_le16(pid);
501              e->nsid = cpu_to_le32(ns->params.nsid);
502              e->rgid = cpu_to_le16(rg);
503              e->ruhid = cpu_to_le16(ruhid);
504          }
505  
506          /* log (eventual) GC overhead of prematurely swapping the RU */
507          nvme_fdp_stat_inc(&endgrp->fdp.mbmw, nvme_l2b(ns, ru->ruamw));
508      }
509  
510      ru->ruamw = ruh->ruamw;
511  
512      return true;
513  }
514  
515  static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
516  {
517      hwaddr hi, lo;
518  
519      if (!n->cmb.cmse) {
520          return false;
521      }
522  
523      lo = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
524      hi = lo + int128_get64(n->cmb.mem.size);
525  
526      return addr >= lo && addr < hi;
527  }
528  
529  static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
530  {
531      hwaddr base = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
532      return &n->cmb.buf[addr - base];
533  }
534  
535  static bool nvme_addr_is_pmr(NvmeCtrl *n, hwaddr addr)
536  {
537      hwaddr hi;
538  
539      if (!n->pmr.cmse) {
540          return false;
541      }
542  
543      hi = n->pmr.cba + int128_get64(n->pmr.dev->mr.size);
544  
545      return addr >= n->pmr.cba && addr < hi;
546  }
547  
548  static inline void *nvme_addr_to_pmr(NvmeCtrl *n, hwaddr addr)
549  {
550      return memory_region_get_ram_ptr(&n->pmr.dev->mr) + (addr - n->pmr.cba);
551  }
552  
553  static inline bool nvme_addr_is_iomem(NvmeCtrl *n, hwaddr addr)
554  {
555      hwaddr hi, lo;
556  
557      /*
558       * The purpose of this check is to guard against invalid "local" access to
559       * the iomem (i.e. controller registers). Thus, we check against the range
560       * covered by the 'bar0' MemoryRegion since that is currently composed of
561       * two subregions (the NVMe "MBAR" and the MSI-X table/pba). Note, however,
562       * that if the device model is ever changed to allow the CMB to be located
563       * in BAR0 as well, then this must be changed.
564       */
565      lo = n->bar0.addr;
566      hi = lo + int128_get64(n->bar0.size);
567  
568      return addr >= lo && addr < hi;
569  }
570  
571  static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
572  {
573      hwaddr hi = addr + size - 1;
574      if (hi < addr) {
575          return 1;
576      }
577  
578      if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
579          memcpy(buf, nvme_addr_to_cmb(n, addr), size);
580          return 0;
581      }
582  
583      if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
584          memcpy(buf, nvme_addr_to_pmr(n, addr), size);
585          return 0;
586      }
587  
588      return pci_dma_read(PCI_DEVICE(n), addr, buf, size);
589  }
590  
591  static int nvme_addr_write(NvmeCtrl *n, hwaddr addr, const void *buf, int size)
592  {
593      hwaddr hi = addr + size - 1;
594      if (hi < addr) {
595          return 1;
596      }
597  
598      if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
599          memcpy(nvme_addr_to_cmb(n, addr), buf, size);
600          return 0;
601      }
602  
603      if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
604          memcpy(nvme_addr_to_pmr(n, addr), buf, size);
605          return 0;
606      }
607  
608      return pci_dma_write(PCI_DEVICE(n), addr, buf, size);
609  }
610  
611  static bool nvme_nsid_valid(NvmeCtrl *n, uint32_t nsid)
612  {
613      return nsid &&
614          (nsid == NVME_NSID_BROADCAST || nsid <= NVME_MAX_NAMESPACES);
615  }
616  
617  static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
618  {
619      return sqid < n->conf_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1;
620  }
621  
622  static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
623  {
624      return cqid < n->conf_ioqpairs + 1 && n->cq[cqid] != NULL ? 0 : -1;
625  }
626  
627  static void nvme_inc_cq_tail(NvmeCQueue *cq)
628  {
629      cq->tail++;
630      if (cq->tail >= cq->size) {
631          cq->tail = 0;
632          cq->phase = !cq->phase;
633      }
634  }
635  
636  static void nvme_inc_sq_head(NvmeSQueue *sq)
637  {
638      sq->head = (sq->head + 1) % sq->size;
639  }
640  
641  static uint8_t nvme_cq_full(NvmeCQueue *cq)
642  {
643      return (cq->tail + 1) % cq->size == cq->head;
644  }
645  
646  static uint8_t nvme_sq_empty(NvmeSQueue *sq)
647  {
648      return sq->head == sq->tail;
649  }
650  
651  static void nvme_irq_check(NvmeCtrl *n)
652  {
653      PCIDevice *pci = PCI_DEVICE(n);
654      uint32_t intms = ldl_le_p(&n->bar.intms);
655  
656      if (msix_enabled(pci)) {
657          return;
658      }
659  
660      /* VFs do not implement INTx */
661      if (pci_is_vf(pci)) {
662          return;
663      }
664  
665      if (~intms & n->irq_status) {
666          pci_irq_assert(pci);
667      } else {
668          pci_irq_deassert(pci);
669      }
670  }
671  
672  static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
673  {
674      PCIDevice *pci = PCI_DEVICE(n);
675  
676      if (cq->irq_enabled) {
677          if (msix_enabled(pci)) {
678              trace_pci_nvme_irq_msix(cq->vector);
679              msix_notify(pci, cq->vector);
680          } else {
681              trace_pci_nvme_irq_pin();
682              assert(cq->vector < 32);
683              n->irq_status |= 1 << cq->vector;
684              nvme_irq_check(n);
685          }
686      } else {
687          trace_pci_nvme_irq_masked();
688      }
689  }
690  
691  static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq)
692  {
693      if (cq->irq_enabled) {
694          if (msix_enabled(PCI_DEVICE(n))) {
695              return;
696          } else {
697              assert(cq->vector < 32);
698              if (!n->cq_pending) {
699                  n->irq_status &= ~(1 << cq->vector);
700              }
701              nvme_irq_check(n);
702          }
703      }
704  }
705  
706  static void nvme_req_clear(NvmeRequest *req)
707  {
708      req->ns = NULL;
709      req->opaque = NULL;
710      req->aiocb = NULL;
711      memset(&req->cqe, 0x0, sizeof(req->cqe));
712      req->status = NVME_SUCCESS;
713  }
714  
715  static inline void nvme_sg_init(NvmeCtrl *n, NvmeSg *sg, bool dma)
716  {
717      if (dma) {
718          pci_dma_sglist_init(&sg->qsg, PCI_DEVICE(n), 0);
719          sg->flags = NVME_SG_DMA;
720      } else {
721          qemu_iovec_init(&sg->iov, 0);
722      }
723  
724      sg->flags |= NVME_SG_ALLOC;
725  }
726  
727  static inline void nvme_sg_unmap(NvmeSg *sg)
728  {
729      if (!(sg->flags & NVME_SG_ALLOC)) {
730          return;
731      }
732  
733      if (sg->flags & NVME_SG_DMA) {
734          qemu_sglist_destroy(&sg->qsg);
735      } else {
736          qemu_iovec_destroy(&sg->iov);
737      }
738  
739      memset(sg, 0x0, sizeof(*sg));
740  }
741  
742  /*
743   * When metadata is transferred as extended LBAs, the DPTR mapped into `sg`
744   * holds both data and metadata. This function splits the data and metadata
745   * into two separate QSG/IOVs.
746   */
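/*
 * For example (illustrative values), with 512 byte LBAs and 8 bytes of
 * metadata per LBA, two extended LBAs (2 * 520 bytes) are split into
 * 2 * 512 bytes of data and 2 * 8 bytes of metadata.
 */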
747  static void nvme_sg_split(NvmeSg *sg, NvmeNamespace *ns, NvmeSg *data,
748                            NvmeSg *mdata)
749  {
750      NvmeSg *dst = data;
751      uint32_t trans_len, count = ns->lbasz;
752      uint64_t offset = 0;
753      bool dma = sg->flags & NVME_SG_DMA;
754      size_t sge_len;
755      size_t sg_len = dma ? sg->qsg.size : sg->iov.size;
756      int sg_idx = 0;
757  
758      assert(sg->flags & NVME_SG_ALLOC);
759  
760      while (sg_len) {
761          sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;
762  
763          trans_len = MIN(sg_len, count);
764          trans_len = MIN(trans_len, sge_len - offset);
765  
766          if (dst) {
767              if (dma) {
768                  qemu_sglist_add(&dst->qsg, sg->qsg.sg[sg_idx].base + offset,
769                                  trans_len);
770              } else {
771                  qemu_iovec_add(&dst->iov,
772                                 sg->iov.iov[sg_idx].iov_base + offset,
773                                 trans_len);
774              }
775          }
776  
777          sg_len -= trans_len;
778          count -= trans_len;
779          offset += trans_len;
780  
781          if (count == 0) {
782              dst = (dst == data) ? mdata : data;
783              count = (dst == data) ? ns->lbasz : ns->lbaf.ms;
784          }
785  
786          if (sge_len == offset) {
787              offset = 0;
788              sg_idx++;
789          }
790      }
791  }
792  
793  static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
794                                    size_t len)
795  {
796      if (!len) {
797          return NVME_SUCCESS;
798      }
799  
800      trace_pci_nvme_map_addr_cmb(addr, len);
801  
802      if (!nvme_addr_is_cmb(n, addr) || !nvme_addr_is_cmb(n, addr + len - 1)) {
803          return NVME_DATA_TRAS_ERROR;
804      }
805  
806      qemu_iovec_add(iov, nvme_addr_to_cmb(n, addr), len);
807  
808      return NVME_SUCCESS;
809  }
810  
811  static uint16_t nvme_map_addr_pmr(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
812                                    size_t len)
813  {
814      if (!len) {
815          return NVME_SUCCESS;
816      }
817  
818      if (!nvme_addr_is_pmr(n, addr) || !nvme_addr_is_pmr(n, addr + len - 1)) {
819          return NVME_DATA_TRAS_ERROR;
820      }
821  
822      qemu_iovec_add(iov, nvme_addr_to_pmr(n, addr), len);
823  
824      return NVME_SUCCESS;
825  }
826  
827  static uint16_t nvme_map_addr(NvmeCtrl *n, NvmeSg *sg, hwaddr addr, size_t len)
828  {
829      bool cmb = false, pmr = false;
830  
831      if (!len) {
832          return NVME_SUCCESS;
833      }
834  
835      trace_pci_nvme_map_addr(addr, len);
836  
837      if (nvme_addr_is_iomem(n, addr)) {
838          return NVME_DATA_TRAS_ERROR;
839      }
840  
841      if (nvme_addr_is_cmb(n, addr)) {
842          cmb = true;
843      } else if (nvme_addr_is_pmr(n, addr)) {
844          pmr = true;
845      }
846  
847      if (cmb || pmr) {
848          if (sg->flags & NVME_SG_DMA) {
849              return NVME_INVALID_USE_OF_CMB | NVME_DNR;
850          }
851  
852          if (sg->iov.niov + 1 > IOV_MAX) {
853              goto max_mappings_exceeded;
854          }
855  
856          if (cmb) {
857              return nvme_map_addr_cmb(n, &sg->iov, addr, len);
858          } else {
859              return nvme_map_addr_pmr(n, &sg->iov, addr, len);
860          }
861      }
862  
863      if (!(sg->flags & NVME_SG_DMA)) {
864          return NVME_INVALID_USE_OF_CMB | NVME_DNR;
865      }
866  
867      if (sg->qsg.nsg + 1 > IOV_MAX) {
868          goto max_mappings_exceeded;
869      }
870  
871      qemu_sglist_add(&sg->qsg, addr, len);
872  
873      return NVME_SUCCESS;
874  
875  max_mappings_exceeded:
876      NVME_GUEST_ERR(pci_nvme_ub_too_many_mappings,
877                     "number of mappings exceed 1024");
878      return NVME_INTERNAL_DEV_ERROR | NVME_DNR;
879  }
880  
881  static inline bool nvme_addr_is_dma(NvmeCtrl *n, hwaddr addr)
882  {
883      return !(nvme_addr_is_cmb(n, addr) || nvme_addr_is_pmr(n, addr));
884  }
885  
886  static uint16_t nvme_map_prp(NvmeCtrl *n, NvmeSg *sg, uint64_t prp1,
887                               uint64_t prp2, uint32_t len)
888  {
889      hwaddr trans_len = n->page_size - (prp1 % n->page_size);
890      trans_len = MIN(len, trans_len);
891      int num_prps = (len >> n->page_bits) + 1;
892      uint16_t status;
893      int ret;
894  
895      trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps);
896  
897      nvme_sg_init(n, sg, nvme_addr_is_dma(n, prp1));
898  
899      status = nvme_map_addr(n, sg, prp1, trans_len);
900      if (status) {
901          goto unmap;
902      }
903  
904      len -= trans_len;
905      if (len) {
906          if (len > n->page_size) {
907              g_autofree uint64_t *prp_list = g_new(uint64_t, n->max_prp_ents);
908              uint32_t nents, prp_trans;
909              int i = 0;
910  
911              /*
912               * The first PRP list entry, pointed to by PRP2, may contain an
913               * offset. Hence, we need to calculate the number of entries based
914               * on that offset.
915               */
916              nents = (n->page_size - (prp2 & (n->page_size - 1))) >> 3;
917              prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
918              ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
919              if (ret) {
920                  trace_pci_nvme_err_addr_read(prp2);
921                  status = NVME_DATA_TRAS_ERROR;
922                  goto unmap;
923              }
924              while (len != 0) {
925                  uint64_t prp_ent = le64_to_cpu(prp_list[i]);
926  
927                  if (i == nents - 1 && len > n->page_size) {
928                      if (unlikely(prp_ent & (n->page_size - 1))) {
929                          trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
930                          status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
931                          goto unmap;
932                      }
933  
934                      i = 0;
935                      nents = (len + n->page_size - 1) >> n->page_bits;
936                      nents = MIN(nents, n->max_prp_ents);
937                      prp_trans = nents * sizeof(uint64_t);
938                      ret = nvme_addr_read(n, prp_ent, (void *)prp_list,
939                                           prp_trans);
940                      if (ret) {
941                          trace_pci_nvme_err_addr_read(prp_ent);
942                          status = NVME_DATA_TRAS_ERROR;
943                          goto unmap;
944                      }
945                      prp_ent = le64_to_cpu(prp_list[i]);
946                  }
947  
948                  if (unlikely(prp_ent & (n->page_size - 1))) {
949                      trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
950                      status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
951                      goto unmap;
952                  }
953  
954                  trans_len = MIN(len, n->page_size);
955                  status = nvme_map_addr(n, sg, prp_ent, trans_len);
956                  if (status) {
957                      goto unmap;
958                  }
959  
960                  len -= trans_len;
961                  i++;
962              }
963          } else {
964              if (unlikely(prp2 & (n->page_size - 1))) {
965                  trace_pci_nvme_err_invalid_prp2_align(prp2);
966                  status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
967                  goto unmap;
968              }
969              status = nvme_map_addr(n, sg, prp2, len);
970              if (status) {
971                  goto unmap;
972              }
973          }
974      }
975  
976      return NVME_SUCCESS;
977  
978  unmap:
979      nvme_sg_unmap(sg);
980      return status;
981  }
982  
983  /*
984   * Map 'nsgld' data descriptors from 'segment'. The function subtracts the
985   * number of bytes mapped from *len.
986   */
987  static uint16_t nvme_map_sgl_data(NvmeCtrl *n, NvmeSg *sg,
988                                    NvmeSglDescriptor *segment, uint64_t nsgld,
989                                    size_t *len, NvmeCmd *cmd)
990  {
991      dma_addr_t addr, trans_len;
992      uint32_t dlen;
993      uint16_t status;
994  
995      for (int i = 0; i < nsgld; i++) {
996          uint8_t type = NVME_SGL_TYPE(segment[i].type);
997  
998          switch (type) {
999          case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
1000              break;
1001          case NVME_SGL_DESCR_TYPE_SEGMENT:
1002          case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
1003              return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
1004          default:
1005              return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
1006          }
1007  
1008          dlen = le32_to_cpu(segment[i].len);
1009  
1010          if (!dlen) {
1011              continue;
1012          }
1013  
1014          if (*len == 0) {
1015              /*
1016               * All data has been mapped, but the SGL contains additional
1017               * segments and/or descriptors. The controller may ignore the rest
1018               * of the SGL if it reports support for SGLs of excess length.
1019               */
1020              uint32_t sgls = le32_to_cpu(n->id_ctrl.sgls);
1021              if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
1022                  break;
1023              }
1024  
1025              trace_pci_nvme_err_invalid_sgl_excess_length(dlen);
1026              return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
1027          }
1028  
1029          trans_len = MIN(*len, dlen);
1030  
1031          addr = le64_to_cpu(segment[i].addr);
1032  
1033          if (UINT64_MAX - addr < dlen) {
1034              return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
1035          }
1036  
1037          status = nvme_map_addr(n, sg, addr, trans_len);
1038          if (status) {
1039              return status;
1040          }
1041  
1042          *len -= trans_len;
1043      }
1044  
1045      return NVME_SUCCESS;
1046  }
1047  
1048  static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl,
1049                               size_t len, NvmeCmd *cmd)
1050  {
1051      /*
1052       * Read the segment in chunks of 256 descriptors (one 4k page) to avoid
1053       * dynamically allocating a potentially huge SGL. The spec allows the SGL
1054       * to be larger (as in number of bytes required to describe the SGL
1055       * descriptors and segment chain) than the command transfer size, so it is
1056       * not bounded by MDTS.
1057       */
1058  #define SEG_CHUNK_SIZE 256
1059  
1060      NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld;
1061      uint64_t nsgld;
1062      uint32_t seg_len;
1063      uint16_t status;
1064      hwaddr addr;
1065      int ret;
1066  
1067      sgld = &sgl;
1068      addr = le64_to_cpu(sgl.addr);
1069  
1070      trace_pci_nvme_map_sgl(NVME_SGL_TYPE(sgl.type), len);
1071  
1072      nvme_sg_init(n, sg, nvme_addr_is_dma(n, addr));
1073  
1074      /*
1075       * If the entire transfer can be described with a single data block it can
1076       * be mapped directly.
1077       */
1078      if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
1079          status = nvme_map_sgl_data(n, sg, sgld, 1, &len, cmd);
1080          if (status) {
1081              goto unmap;
1082          }
1083  
1084          goto out;
1085      }
1086  
1087      for (;;) {
1088          switch (NVME_SGL_TYPE(sgld->type)) {
1089          case NVME_SGL_DESCR_TYPE_SEGMENT:
1090          case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
1091              break;
1092          default:
1093              return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
1094          }
1095  
1096          seg_len = le32_to_cpu(sgld->len);
1097  
1098          /* check the length of the (Last) Segment descriptor */
1099          if (!seg_len || seg_len & 0xf) {
1100              return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
1101          }
1102  
1103          if (UINT64_MAX - addr < seg_len) {
1104              return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
1105          }
1106  
1107          nsgld = seg_len / sizeof(NvmeSglDescriptor);
1108  
1109          while (nsgld > SEG_CHUNK_SIZE) {
1110              if (nvme_addr_read(n, addr, segment, sizeof(segment))) {
1111                  trace_pci_nvme_err_addr_read(addr);
1112                  status = NVME_DATA_TRAS_ERROR;
1113                  goto unmap;
1114              }
1115  
1116              status = nvme_map_sgl_data(n, sg, segment, SEG_CHUNK_SIZE,
1117                                         &len, cmd);
1118              if (status) {
1119                  goto unmap;
1120              }
1121  
1122              nsgld -= SEG_CHUNK_SIZE;
1123              addr += SEG_CHUNK_SIZE * sizeof(NvmeSglDescriptor);
1124          }
1125  
1126          ret = nvme_addr_read(n, addr, segment, nsgld *
1127                               sizeof(NvmeSglDescriptor));
1128          if (ret) {
1129              trace_pci_nvme_err_addr_read(addr);
1130              status = NVME_DATA_TRAS_ERROR;
1131              goto unmap;
1132          }
1133  
1134          last_sgld = &segment[nsgld - 1];
1135  
1136          /*
1137           * If the segment ends with a Data Block, then we are done.
1138           */
1139          if (NVME_SGL_TYPE(last_sgld->type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
1140              status = nvme_map_sgl_data(n, sg, segment, nsgld, &len, cmd);
1141              if (status) {
1142                  goto unmap;
1143              }
1144  
1145              goto out;
1146          }
1147  
1148          /*
1149           * If the last descriptor was not a Data Block, then the current
1150           * segment must not be a Last Segment.
1151           */
1152          if (NVME_SGL_TYPE(sgld->type) == NVME_SGL_DESCR_TYPE_LAST_SEGMENT) {
1153              status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
1154              goto unmap;
1155          }
1156  
1157          sgld = last_sgld;
1158          addr = le64_to_cpu(sgld->addr);
1159  
1160          /*
1161           * Do not map the last descriptor; it will be a Segment or Last Segment
1162           * descriptor and is handled by the next iteration.
1163           */
1164          status = nvme_map_sgl_data(n, sg, segment, nsgld - 1, &len, cmd);
1165          if (status) {
1166              goto unmap;
1167          }
1168      }
1169  
1170  out:
1171      /* if there is any residual left in len, the SGL was too short */
1172      if (len) {
1173          status = NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
1174          goto unmap;
1175      }
1176  
1177      return NVME_SUCCESS;
1178  
1179  unmap:
1180      nvme_sg_unmap(sg);
1181      return status;
1182  }
1183  
1184  uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
1185                         NvmeCmd *cmd)
1186  {
1187      uint64_t prp1, prp2;
1188  
1189      switch (NVME_CMD_FLAGS_PSDT(cmd->flags)) {
1190      case NVME_PSDT_PRP:
1191          prp1 = le64_to_cpu(cmd->dptr.prp1);
1192          prp2 = le64_to_cpu(cmd->dptr.prp2);
1193  
1194          return nvme_map_prp(n, sg, prp1, prp2, len);
1195      case NVME_PSDT_SGL_MPTR_CONTIGUOUS:
1196      case NVME_PSDT_SGL_MPTR_SGL:
1197          return nvme_map_sgl(n, sg, cmd->dptr.sgl, len, cmd);
1198      default:
1199          return NVME_INVALID_FIELD;
1200      }
1201  }
1202  
1203  static uint16_t nvme_map_mptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
1204                                NvmeCmd *cmd)
1205  {
1206      int psdt = NVME_CMD_FLAGS_PSDT(cmd->flags);
1207      hwaddr mptr = le64_to_cpu(cmd->mptr);
1208      uint16_t status;
1209  
1210      if (psdt == NVME_PSDT_SGL_MPTR_SGL) {
1211          NvmeSglDescriptor sgl;
1212  
1213          if (nvme_addr_read(n, mptr, &sgl, sizeof(sgl))) {
1214              return NVME_DATA_TRAS_ERROR;
1215          }
1216  
1217          status = nvme_map_sgl(n, sg, sgl, len, cmd);
1218          if (status && (status & 0x7ff) == NVME_DATA_SGL_LEN_INVALID) {
1219              status = NVME_MD_SGL_LEN_INVALID | NVME_DNR;
1220          }
1221  
1222          return status;
1223      }
1224  
1225      nvme_sg_init(n, sg, nvme_addr_is_dma(n, mptr));
1226      status = nvme_map_addr(n, sg, mptr, len);
1227      if (status) {
1228          nvme_sg_unmap(sg);
1229      }
1230  
1231      return status;
1232  }
1233  
1234  static uint16_t nvme_map_data(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
1235  {
1236      NvmeNamespace *ns = req->ns;
1237      NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1238      bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
1239      bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);
1240      size_t len = nvme_l2b(ns, nlb);
1241      uint16_t status;
1242  
1243      if (nvme_ns_ext(ns) &&
1244          !(pi && pract && ns->lbaf.ms == nvme_pi_tuple_size(ns))) {
1245          NvmeSg sg;
1246  
1247          len += nvme_m2b(ns, nlb);
1248  
1249          status = nvme_map_dptr(n, &sg, len, &req->cmd);
1250          if (status) {
1251              return status;
1252          }
1253  
1254          nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
1255          nvme_sg_split(&sg, ns, &req->sg, NULL);
1256          nvme_sg_unmap(&sg);
1257  
1258          return NVME_SUCCESS;
1259      }
1260  
1261      return nvme_map_dptr(n, &req->sg, len, &req->cmd);
1262  }
1263  
1264  static uint16_t nvme_map_mdata(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
1265  {
1266      NvmeNamespace *ns = req->ns;
1267      size_t len = nvme_m2b(ns, nlb);
1268      uint16_t status;
1269  
1270      if (nvme_ns_ext(ns)) {
1271          NvmeSg sg;
1272  
1273          len += nvme_l2b(ns, nlb);
1274  
1275          status = nvme_map_dptr(n, &sg, len, &req->cmd);
1276          if (status) {
1277              return status;
1278          }
1279  
1280          nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
1281          nvme_sg_split(&sg, ns, NULL, &req->sg);
1282          nvme_sg_unmap(&sg);
1283  
1284          return NVME_SUCCESS;
1285      }
1286  
1287      return nvme_map_mptr(n, &req->sg, len, &req->cmd);
1288  }
1289  
1290  static uint16_t nvme_tx_interleaved(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr,
1291                                      uint32_t len, uint32_t bytes,
1292                                      int32_t skip_bytes, int64_t offset,
1293                                      NvmeTxDirection dir)
1294  {
1295      hwaddr addr;
1296      uint32_t trans_len, count = bytes;
1297      bool dma = sg->flags & NVME_SG_DMA;
1298      int64_t sge_len;
1299      int sg_idx = 0;
1300      int ret;
1301  
1302      assert(sg->flags & NVME_SG_ALLOC);
1303  
1304      while (len) {
1305          sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;
1306  
1307          if (sge_len - offset < 0) {
1308              offset -= sge_len;
1309              sg_idx++;
1310              continue;
1311          }
1312  
1313          if (sge_len == offset) {
1314              offset = 0;
1315              sg_idx++;
1316              continue;
1317          }
1318  
1319          trans_len = MIN(len, count);
1320          trans_len = MIN(trans_len, sge_len - offset);
1321  
1322          if (dma) {
1323              addr = sg->qsg.sg[sg_idx].base + offset;
1324          } else {
1325              addr = (hwaddr)(uintptr_t)sg->iov.iov[sg_idx].iov_base + offset;
1326          }
1327  
1328          if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1329              ret = nvme_addr_read(n, addr, ptr, trans_len);
1330          } else {
1331              ret = nvme_addr_write(n, addr, ptr, trans_len);
1332          }
1333  
1334          if (ret) {
1335              return NVME_DATA_TRAS_ERROR;
1336          }
1337  
1338          ptr += trans_len;
1339          len -= trans_len;
1340          count -= trans_len;
1341          offset += trans_len;
1342  
1343          if (count == 0) {
1344              count = bytes;
1345              offset += skip_bytes;
1346          }
1347      }
1348  
1349      return NVME_SUCCESS;
1350  }
1351  
1352  static uint16_t nvme_tx(NvmeCtrl *n, NvmeSg *sg, void *ptr, uint32_t len,
1353                          NvmeTxDirection dir)
1354  {
1355      assert(sg->flags & NVME_SG_ALLOC);
1356  
1357      if (sg->flags & NVME_SG_DMA) {
1358          const MemTxAttrs attrs = MEMTXATTRS_UNSPECIFIED;
1359          dma_addr_t residual;
1360  
1361          if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1362              dma_buf_write(ptr, len, &residual, &sg->qsg, attrs);
1363          } else {
1364              dma_buf_read(ptr, len, &residual, &sg->qsg, attrs);
1365          }
1366  
1367          if (unlikely(residual)) {
1368              trace_pci_nvme_err_invalid_dma();
1369              return NVME_INVALID_FIELD | NVME_DNR;
1370          }
1371      } else {
1372          size_t bytes;
1373  
1374          if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1375              bytes = qemu_iovec_to_buf(&sg->iov, 0, ptr, len);
1376          } else {
1377              bytes = qemu_iovec_from_buf(&sg->iov, 0, ptr, len);
1378          }
1379  
1380          if (unlikely(bytes != len)) {
1381              trace_pci_nvme_err_invalid_dma();
1382              return NVME_INVALID_FIELD | NVME_DNR;
1383          }
1384      }
1385  
1386      return NVME_SUCCESS;
1387  }
1388  
1389  static inline uint16_t nvme_c2h(NvmeCtrl *n, void *ptr, uint32_t len,
1390                                  NvmeRequest *req)
1391  {
1392      uint16_t status;
1393  
1394      status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
1395      if (status) {
1396          return status;
1397      }
1398  
1399      return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_FROM_DEVICE);
1400  }
1401  
1402  static inline uint16_t nvme_h2c(NvmeCtrl *n, void *ptr, uint32_t len,
1403                                  NvmeRequest *req)
1404  {
1405      uint16_t status;
1406  
1407      status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
1408      if (status) {
1409          return status;
1410      }
1411  
1412      return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_TO_DEVICE);
1413  }
1414  
1415  uint16_t nvme_bounce_data(NvmeCtrl *n, void *ptr, uint32_t len,
1416                            NvmeTxDirection dir, NvmeRequest *req)
1417  {
1418      NvmeNamespace *ns = req->ns;
1419      NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1420      bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
1421      bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);
1422  
1423      if (nvme_ns_ext(ns) &&
1424          !(pi && pract && ns->lbaf.ms == nvme_pi_tuple_size(ns))) {
1425          return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbasz,
1426                                     ns->lbaf.ms, 0, dir);
1427      }
1428  
1429      return nvme_tx(n, &req->sg, ptr, len, dir);
1430  }
1431  
1432  uint16_t nvme_bounce_mdata(NvmeCtrl *n, void *ptr, uint32_t len,
1433                             NvmeTxDirection dir, NvmeRequest *req)
1434  {
1435      NvmeNamespace *ns = req->ns;
1436      uint16_t status;
1437  
1438      if (nvme_ns_ext(ns)) {
1439          return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbaf.ms,
1440                                     ns->lbasz, ns->lbasz, dir);
1441      }
1442  
1443      nvme_sg_unmap(&req->sg);
1444  
1445      status = nvme_map_mptr(n, &req->sg, len, &req->cmd);
1446      if (status) {
1447          return status;
1448      }
1449  
1450      return nvme_tx(n, &req->sg, ptr, len, dir);
1451  }
1452  
1453  static inline void nvme_blk_read(BlockBackend *blk, int64_t offset,
1454                                   uint32_t align, BlockCompletionFunc *cb,
1455                                   NvmeRequest *req)
1456  {
1457      assert(req->sg.flags & NVME_SG_ALLOC);
1458  
1459      if (req->sg.flags & NVME_SG_DMA) {
1460          req->aiocb = dma_blk_read(blk, &req->sg.qsg, offset, align, cb, req);
1461      } else {
1462          req->aiocb = blk_aio_preadv(blk, offset, &req->sg.iov, 0, cb, req);
1463      }
1464  }
1465  
1466  static inline void nvme_blk_write(BlockBackend *blk, int64_t offset,
1467                                    uint32_t align, BlockCompletionFunc *cb,
1468                                    NvmeRequest *req)
1469  {
1470      assert(req->sg.flags & NVME_SG_ALLOC);
1471  
1472      if (req->sg.flags & NVME_SG_DMA) {
1473          req->aiocb = dma_blk_write(blk, &req->sg.qsg, offset, align, cb, req);
1474      } else {
1475          req->aiocb = blk_aio_pwritev(blk, offset, &req->sg.iov, 0, cb, req);
1476      }
1477  }
1478  
1479  static void nvme_update_cq_eventidx(const NvmeCQueue *cq)
1480  {
1481      trace_pci_nvme_update_cq_eventidx(cq->cqid, cq->head);
1482  
1483      stl_le_pci_dma(PCI_DEVICE(cq->ctrl), cq->ei_addr, cq->head,
1484                     MEMTXATTRS_UNSPECIFIED);
1485  }
1486  
1487  static void nvme_update_cq_head(NvmeCQueue *cq)
1488  {
1489      ldl_le_pci_dma(PCI_DEVICE(cq->ctrl), cq->db_addr, &cq->head,
1490                     MEMTXATTRS_UNSPECIFIED);
1491  
1492      trace_pci_nvme_update_cq_head(cq->cqid, cq->head);
1493  }
1494  
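      /*
       * Completion queue bottom half. Drain the per-CQ request list: sync the
       * shadow doorbell event index and head if enabled, stop when the queue
       * is full, DMA each CQE to cq->dma_addr + (tail << NVME_CQES) (setting
       * CSTS.CFS on failure), return completed requests to their submission
       * queue's free list, and assert the interrupt if the queue is non-empty.
       */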
1495  static void nvme_post_cqes(void *opaque)
1496  {
1497      NvmeCQueue *cq = opaque;
1498      NvmeCtrl *n = cq->ctrl;
1499      NvmeRequest *req, *next;
1500      bool pending = cq->head != cq->tail;
1501      int ret;
1502  
1503      QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
1504          NvmeSQueue *sq;
1505          hwaddr addr;
1506  
1507          if (n->dbbuf_enabled) {
1508              nvme_update_cq_eventidx(cq);
1509              nvme_update_cq_head(cq);
1510          }
1511  
1512          if (nvme_cq_full(cq)) {
1513              break;
1514          }
1515  
1516          sq = req->sq;
1517          req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase);
1518          req->cqe.sq_id = cpu_to_le16(sq->sqid);
1519          req->cqe.sq_head = cpu_to_le16(sq->head);
1520          addr = cq->dma_addr + (cq->tail << NVME_CQES);
1521          ret = pci_dma_write(PCI_DEVICE(n), addr, (void *)&req->cqe,
1522                              sizeof(req->cqe));
1523          if (ret) {
1524              trace_pci_nvme_err_addr_write(addr);
1525              trace_pci_nvme_err_cfs();
1526              stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
1527              break;
1528          }
1529  
1530          QTAILQ_REMOVE(&cq->req_list, req, entry);
1531  
1532          nvme_inc_cq_tail(cq);
1533          nvme_sg_unmap(&req->sg);
1534  
1535          if (QTAILQ_EMPTY(&sq->req_list) && !nvme_sq_empty(sq)) {
1536              qemu_bh_schedule(sq->bh);
1537          }
1538  
1539          QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
1540      }
1541      if (cq->tail != cq->head) {
1542          if (cq->irq_enabled && !pending) {
1543              n->cq_pending++;
1544          }
1545  
1546          nvme_irq_assert(n, cq);
1547      }
1548  }
1549  
1550  static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
1551  {
1552      assert(cq->cqid == req->sq->cqid);
1553      trace_pci_nvme_enqueue_req_completion(nvme_cid(req), cq->cqid,
1554                                            le32_to_cpu(req->cqe.result),
1555                                            le32_to_cpu(req->cqe.dw1),
1556                                            req->status);
1557  
1558      if (req->status) {
1559          trace_pci_nvme_err_req_status(nvme_cid(req), nvme_nsid(req->ns),
1560                                        req->status, req->cmd.opcode);
1561      }
1562  
1563      QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
1564      QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
1565  
1566      qemu_bh_schedule(cq->bh);
1567  }
1568  
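      /*
       * Deliver queued asynchronous events against outstanding AER commands.
       * An event type that has been posted but not yet cleared by the host
       * remains masked; masked events stay on the queue until the mask is
       * cleared.
       */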
1569  static void nvme_process_aers(void *opaque)
1570  {
1571      NvmeCtrl *n = opaque;
1572      NvmeAsyncEvent *event, *next;
1573  
1574      trace_pci_nvme_process_aers(n->aer_queued);
1575  
1576      QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) {
1577          NvmeRequest *req;
1578          NvmeAerResult *result;
1579  
1580          /* can't post cqe if there is nothing to complete */
1581          if (!n->outstanding_aers) {
1582              trace_pci_nvme_no_outstanding_aers();
1583              break;
1584          }
1585  
1586          /* ignore if masked (cqe posted, but event not cleared) */
1587          if (n->aer_mask & (1 << event->result.event_type)) {
1588              trace_pci_nvme_aer_masked(event->result.event_type, n->aer_mask);
1589              continue;
1590          }
1591  
1592          QTAILQ_REMOVE(&n->aer_queue, event, entry);
1593          n->aer_queued--;
1594  
1595          n->aer_mask |= 1 << event->result.event_type;
1596          n->outstanding_aers--;
1597  
1598          req = n->aer_reqs[n->outstanding_aers];
1599  
1600          result = (NvmeAerResult *) &req->cqe.result;
1601          result->event_type = event->result.event_type;
1602          result->event_info = event->result.event_info;
1603          result->log_page = event->result.log_page;
1604          g_free(event);
1605  
1606          trace_pci_nvme_aer_post_cqe(result->event_type, result->event_info,
1607                                      result->log_page);
1608  
1609          nvme_enqueue_req_completion(&n->admin_cq, req);
1610      }
1611  }
1612  
1613  static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type,
1614                                 uint8_t event_info, uint8_t log_page)
1615  {
1616      NvmeAsyncEvent *event;
1617  
1618      trace_pci_nvme_enqueue_event(event_type, event_info, log_page);
1619  
1620      if (n->aer_queued == n->params.aer_max_queued) {
1621          trace_pci_nvme_enqueue_event_noqueue(n->aer_queued);
1622          return;
1623      }
1624  
1625      event = g_new(NvmeAsyncEvent, 1);
1626      event->result = (NvmeAerResult) {
1627          .event_type = event_type,
1628          .event_info = event_info,
1629          .log_page   = log_page,
1630      };
1631  
1632      QTAILQ_INSERT_TAIL(&n->aer_queue, event, entry);
1633      n->aer_queued++;
1634  
1635      nvme_process_aers(n);
1636  }
1637  
1638  static void nvme_smart_event(NvmeCtrl *n, uint8_t event)
1639  {
1640      uint8_t aer_info;
1641  
1642      /* Ref SPEC <Asynchronous Event Information - SMART / Health Status> */
1643      if (!(NVME_AEC_SMART(n->features.async_config) & event)) {
1644          return;
1645      }
1646  
1647      switch (event) {
1648      case NVME_SMART_SPARE:
1649          aer_info = NVME_AER_INFO_SMART_SPARE_THRESH;
1650          break;
1651      case NVME_SMART_TEMPERATURE:
1652          aer_info = NVME_AER_INFO_SMART_TEMP_THRESH;
1653          break;
1654      case NVME_SMART_RELIABILITY:
1655      case NVME_SMART_MEDIA_READ_ONLY:
1656      case NVME_SMART_FAILED_VOLATILE_MEDIA:
1657      case NVME_SMART_PMR_UNRELIABLE:
1658          aer_info = NVME_AER_INFO_SMART_RELIABILITY;
1659          break;
1660      default:
1661          return;
1662      }
1663  
1664      nvme_enqueue_event(n, NVME_AER_TYPE_SMART, aer_info, NVME_LOG_SMART_INFO);
1665  }
1666  
1667  static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type)
1668  {
1669      NvmeAsyncEvent *event, *next;
1670  
1671      n->aer_mask &= ~(1 << event_type);
1672  
1673      QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) {
1674          if (event->result.event_type == event_type) {
1675              QTAILQ_REMOVE(&n->aer_queue, event, entry);
1676              n->aer_queued--;
1677              g_free(event);
1678          }
1679      }
1680  }
1681  
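      /*
       * MDTS is a power-of-two exponent: the maximum data transfer size
       * enforced here is page_size << mdts bytes, and an mdts of zero means
       * no limit.
       */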
1682  static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len)
1683  {
1684      uint8_t mdts = n->params.mdts;
1685  
1686      if (mdts && len > n->page_size << mdts) {
1687          trace_pci_nvme_err_mdts(len);
1688          return NVME_INVALID_FIELD | NVME_DNR;
1689      }
1690  
1691      return NVME_SUCCESS;
1692  }
1693  
1694  static inline uint16_t nvme_check_bounds(NvmeNamespace *ns, uint64_t slba,
1695                                           uint32_t nlb)
1696  {
1697      uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);
1698  
1699      if (unlikely(UINT64_MAX - slba < nlb || slba + nlb > nsze)) {
1700          trace_pci_nvme_err_invalid_lba_range(slba, nlb, nsze);
1701          return NVME_LBA_RANGE | NVME_DNR;
1702      }
1703  
1704      return NVME_SUCCESS;
1705  }
1706  
1707  static int nvme_block_status_all(NvmeNamespace *ns, uint64_t slba,
1708                                   uint32_t nlb, int flags)
1709  {
1710      BlockDriverState *bs = blk_bs(ns->blkconf.blk);
1711  
1712      int64_t pnum = 0, bytes = nvme_l2b(ns, nlb);
1713      int64_t offset = nvme_l2b(ns, slba);
1714      int ret;
1715  
1716      /*
1717       * `pnum` holds the number of bytes after offset that share the same
1718       * allocation status as the byte at offset. If `pnum` is different from
1719       * `bytes`, we should check the allocation status of the next range and
1720       * continue this until all bytes have been checked.
1721       */
1722      do {
1723          bytes -= pnum;
1724  
1725          ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL);
1726          if (ret < 0) {
1727              return ret;
1728          }
1729  
1730  
1731          trace_pci_nvme_block_status(offset, bytes, pnum, ret,
1732                                      !!(ret & BDRV_BLOCK_ZERO));
1733  
1734          if (!(ret & flags)) {
1735              return 1;
1736          }
1737  
1738          offset += pnum;
1739      } while (pnum != bytes);
1740  
1741      return 0;
1742  }
1743  
1744  static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
1745                                   uint32_t nlb)
1746  {
1747      int ret;
1748      Error *err = NULL;
1749  
1750      ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_DATA);
1751      if (ret) {
1752          if (ret < 0) {
1753              error_setg_errno(&err, -ret, "unable to get block status");
1754              error_report_err(err);
1755  
1756              return NVME_INTERNAL_DEV_ERROR;
1757          }
1758  
1759          return NVME_DULB;
1760      }
1761  
1762      return NVME_SUCCESS;
1763  }
1764  
1765  static void nvme_aio_err(NvmeRequest *req, int ret)
1766  {
1767      uint16_t status = NVME_SUCCESS;
1768      Error *local_err = NULL;
1769  
1770      switch (req->cmd.opcode) {
1771      case NVME_CMD_READ:
1772          status = NVME_UNRECOVERED_READ;
1773          break;
1774      case NVME_CMD_FLUSH:
1775      case NVME_CMD_WRITE:
1776      case NVME_CMD_WRITE_ZEROES:
1777      case NVME_CMD_ZONE_APPEND:
1778      case NVME_CMD_COPY:
1779          status = NVME_WRITE_FAULT;
1780          break;
1781      default:
1782          status = NVME_INTERNAL_DEV_ERROR;
1783          break;
1784      }
1785  
1786      if (ret == -ECANCELED) {
1787          status = NVME_CMD_ABORT_REQ;
1788      }
1789  
1790      trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), status);
1791  
1792      error_setg_errno(&local_err, -ret, "aio failed");
1793      error_report_err(local_err);
1794  
1795      /*
1796       * Set the command status code to the first encountered error but allow a
1797       * subsequent Internal Device Error to trump it.
1798       */
1799      if (req->status && status != NVME_INTERNAL_DEV_ERROR) {
1800          return;
1801      }
1802  
1803      req->status = status;
1804  }
1805  
1806  static inline uint32_t nvme_zone_idx(NvmeNamespace *ns, uint64_t slba)
1807  {
1808      return ns->zone_size_log2 > 0 ? slba >> ns->zone_size_log2 :
1809                                      slba / ns->zone_size;
1810  }
1811  
1812  static inline NvmeZone *nvme_get_zone_by_slba(NvmeNamespace *ns, uint64_t slba)
1813  {
1814      uint32_t zone_idx = nvme_zone_idx(ns, slba);
1815  
1816      if (zone_idx >= ns->num_zones) {
1817          return NULL;
1818      }
1819  
1820      return &ns->zone_array[zone_idx];
1821  }
1822  
1823  static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone)
1824  {
1825      uint64_t zslba = zone->d.zslba;
1826  
1827      switch (nvme_get_zone_state(zone)) {
1828      case NVME_ZONE_STATE_EMPTY:
1829      case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1830      case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1831      case NVME_ZONE_STATE_CLOSED:
1832          return NVME_SUCCESS;
1833      case NVME_ZONE_STATE_FULL:
1834          trace_pci_nvme_err_zone_is_full(zslba);
1835          return NVME_ZONE_FULL;
1836      case NVME_ZONE_STATE_OFFLINE:
1837          trace_pci_nvme_err_zone_is_offline(zslba);
1838          return NVME_ZONE_OFFLINE;
1839      case NVME_ZONE_STATE_READ_ONLY:
1840          trace_pci_nvme_err_zone_is_read_only(zslba);
1841          return NVME_ZONE_READ_ONLY;
1842      default:
1843          g_assert_not_reached();
1844      }
1845  
1846      return NVME_INTERNAL_DEV_ERROR;
1847  }
1848  
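      /*
       * Writes must normally start exactly at the zone write pointer. If a
       * ZRWA is associated with the zone, the write may instead land anywhere
       * between the write pointer and w_ptr + 2 * zrwas; in either case it
       * must not cross the writable zone capacity boundary.
       */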
1849  static uint16_t nvme_check_zone_write(NvmeNamespace *ns, NvmeZone *zone,
1850                                        uint64_t slba, uint32_t nlb)
1851  {
1852      uint64_t zcap = nvme_zone_wr_boundary(zone);
1853      uint16_t status;
1854  
1855      status = nvme_check_zone_state_for_write(zone);
1856      if (status) {
1857          return status;
1858      }
1859  
1860      if (zone->d.za & NVME_ZA_ZRWA_VALID) {
1861          uint64_t ezrwa = zone->w_ptr + 2 * ns->zns.zrwas;
1862  
1863          if (slba < zone->w_ptr || slba + nlb > ezrwa) {
1864              trace_pci_nvme_err_zone_invalid_write(slba, zone->w_ptr);
1865              return NVME_ZONE_INVALID_WRITE;
1866          }
1867      } else {
1868          if (unlikely(slba != zone->w_ptr)) {
1869              trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba,
1870                                                 zone->w_ptr);
1871              return NVME_ZONE_INVALID_WRITE;
1872          }
1873      }
1874  
1875      if (unlikely((slba + nlb) > zcap)) {
1876          trace_pci_nvme_err_zone_boundary(slba, nlb, zcap);
1877          return NVME_ZONE_BOUNDARY_ERROR;
1878      }
1879  
1880      return NVME_SUCCESS;
1881  }
1882  
1883  static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone)
1884  {
1885      switch (nvme_get_zone_state(zone)) {
1886      case NVME_ZONE_STATE_EMPTY:
1887      case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1888      case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1889      case NVME_ZONE_STATE_FULL:
1890      case NVME_ZONE_STATE_CLOSED:
1891      case NVME_ZONE_STATE_READ_ONLY:
1892          return NVME_SUCCESS;
1893      case NVME_ZONE_STATE_OFFLINE:
1894          trace_pci_nvme_err_zone_is_offline(zone->d.zslba);
1895          return NVME_ZONE_OFFLINE;
1896      default:
1897          g_assert_not_reached();
1898      }
1899  
1900      return NVME_INTERNAL_DEV_ERROR;
1901  }
1902  
1903  static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba,
1904                                       uint32_t nlb)
1905  {
1906      NvmeZone *zone;
1907      uint64_t bndry, end;
1908      uint16_t status;
1909  
1910      zone = nvme_get_zone_by_slba(ns, slba);
1911      assert(zone);
1912  
1913      bndry = nvme_zone_rd_boundary(ns, zone);
1914      end = slba + nlb;
1915  
1916      status = nvme_check_zone_state_for_read(zone);
1917      if (status) {
1918          ;
1919      } else if (unlikely(end > bndry)) {
1920          if (!ns->params.cross_zone_read) {
1921              status = NVME_ZONE_BOUNDARY_ERROR;
1922          } else {
1923              /*
1924               * Read across zone boundary - check that all subsequent
1925               * zones that are being read have an appropriate state.
1926               */
1927              do {
1928                  zone++;
1929                  status = nvme_check_zone_state_for_read(zone);
1930                  if (status) {
1931                      break;
1932                  }
1933              } while (end > nvme_zone_rd_boundary(ns, zone));
1934          }
1935      }
1936  
1937      return status;
1938  }
1939  
1940  static uint16_t nvme_zrm_finish(NvmeNamespace *ns, NvmeZone *zone)
1941  {
1942      switch (nvme_get_zone_state(zone)) {
1943      case NVME_ZONE_STATE_FULL:
1944          return NVME_SUCCESS;
1945  
1946      case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1947      case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1948          nvme_aor_dec_open(ns);
1949          /* fallthrough */
1950      case NVME_ZONE_STATE_CLOSED:
1951          nvme_aor_dec_active(ns);
1952  
1953          if (zone->d.za & NVME_ZA_ZRWA_VALID) {
1954              zone->d.za &= ~NVME_ZA_ZRWA_VALID;
1955              if (ns->params.numzrwa) {
1956                  ns->zns.numzrwa++;
1957              }
1958          }
1959  
1960          /* fallthrough */
1961      case NVME_ZONE_STATE_EMPTY:
1962          nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL);
1963          return NVME_SUCCESS;
1964  
1965      default:
1966          return NVME_ZONE_INVAL_TRANSITION;
1967      }
1968  }
1969  
1970  static uint16_t nvme_zrm_close(NvmeNamespace *ns, NvmeZone *zone)
1971  {
1972      switch (nvme_get_zone_state(zone)) {
1973      case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1974      case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1975          nvme_aor_dec_open(ns);
1976          nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
1977          /* fall through */
1978      case NVME_ZONE_STATE_CLOSED:
1979          return NVME_SUCCESS;
1980  
1981      default:
1982          return NVME_ZONE_INVAL_TRANSITION;
1983      }
1984  }
1985  
1986  static uint16_t nvme_zrm_reset(NvmeNamespace *ns, NvmeZone *zone)
1987  {
1988      switch (nvme_get_zone_state(zone)) {
1989      case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1990      case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1991          nvme_aor_dec_open(ns);
1992          /* fallthrough */
1993      case NVME_ZONE_STATE_CLOSED:
1994          nvme_aor_dec_active(ns);
1995  
1996          if (zone->d.za & NVME_ZA_ZRWA_VALID) {
1997              if (ns->params.numzrwa) {
1998                  ns->zns.numzrwa++;
1999              }
2000          }
2001  
2002          /* fallthrough */
2003      case NVME_ZONE_STATE_FULL:
2004          zone->w_ptr = zone->d.zslba;
2005          zone->d.wp = zone->w_ptr;
2006          nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
2007          /* fallthrough */
2008      case NVME_ZONE_STATE_EMPTY:
2009          return NVME_SUCCESS;
2010  
2011      default:
2012          return NVME_ZONE_INVAL_TRANSITION;
2013      }
2014  }
2015  
2016  static void nvme_zrm_auto_transition_zone(NvmeNamespace *ns)
2017  {
2018      NvmeZone *zone;
2019  
2020      if (ns->params.max_open_zones &&
2021          ns->nr_open_zones == ns->params.max_open_zones) {
2022          zone = QTAILQ_FIRST(&ns->imp_open_zones);
2023          if (zone) {
2024              /*
2025               * Automatically close this implicitly open zone.
2026               */
2027              QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
2028              nvme_zrm_close(ns, zone);
2029          }
2030      }
2031  }
2032  
2033  enum {
2034      NVME_ZRM_AUTO = 1 << 0,
2035      NVME_ZRM_ZRWA = 1 << 1,
2036  };
2037  
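      /*
       * Zone resource management "open". The switch intentionally falls
       * through: an EMPTY zone is first activated (checking active/open/ZRWA
       * resources), then opened implicitly or explicitly, an IMPLICITLY OPEN
       * zone is promoted to EXPLICITLY OPEN on an explicit open, and a ZRWA
       * resource is consumed last when requested.
       */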
2038  static uint16_t nvme_zrm_open_flags(NvmeCtrl *n, NvmeNamespace *ns,
2039                                      NvmeZone *zone, int flags)
2040  {
2041      int act = 0;
2042      uint16_t status;
2043  
2044      switch (nvme_get_zone_state(zone)) {
2045      case NVME_ZONE_STATE_EMPTY:
2046          act = 1;
2047  
2048          /* fallthrough */
2049  
2050      case NVME_ZONE_STATE_CLOSED:
2051          if (n->params.auto_transition_zones) {
2052              nvme_zrm_auto_transition_zone(ns);
2053          }
2054          status = nvme_zns_check_resources(ns, act, 1,
2055                                            (flags & NVME_ZRM_ZRWA) ? 1 : 0);
2056          if (status) {
2057              return status;
2058          }
2059  
2060          if (act) {
2061              nvme_aor_inc_active(ns);
2062          }
2063  
2064          nvme_aor_inc_open(ns);
2065  
2066          if (flags & NVME_ZRM_AUTO) {
2067              nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN);
2068              return NVME_SUCCESS;
2069          }
2070  
2071          /* fallthrough */
2072  
2073      case NVME_ZONE_STATE_IMPLICITLY_OPEN:
2074          if (flags & NVME_ZRM_AUTO) {
2075              return NVME_SUCCESS;
2076          }
2077  
2078          nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EXPLICITLY_OPEN);
2079  
2080          /* fallthrough */
2081  
2082      case NVME_ZONE_STATE_EXPLICITLY_OPEN:
2083          if (flags & NVME_ZRM_ZRWA) {
2084              ns->zns.numzrwa--;
2085  
2086              zone->d.za |= NVME_ZA_ZRWA_VALID;
2087          }
2088  
2089          return NVME_SUCCESS;
2090  
2091      default:
2092          return NVME_ZONE_INVAL_TRANSITION;
2093      }
2094  }
2095  
2096  static inline uint16_t nvme_zrm_auto(NvmeCtrl *n, NvmeNamespace *ns,
2097                                       NvmeZone *zone)
2098  {
2099      return nvme_zrm_open_flags(n, ns, zone, NVME_ZRM_AUTO);
2100  }
2101  
2102  static void nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
2103                                   uint32_t nlb)
2104  {
2105      zone->d.wp += nlb;
2106  
2107      if (zone->d.wp == nvme_zone_wr_boundary(zone)) {
2108          nvme_zrm_finish(ns, zone);
2109      }
2110  }
2111  
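      /*
       * An implicit ZRWA flush commits whole flush granules: the block count
       * is rounded up to a multiple of ZRWAFG before the write pointer and
       * the zone state are advanced.
       */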
2112  static void nvme_zoned_zrwa_implicit_flush(NvmeNamespace *ns, NvmeZone *zone,
2113                                             uint32_t nlbc)
2114  {
2115      uint16_t nzrwafgs = DIV_ROUND_UP(nlbc, ns->zns.zrwafg);
2116  
2117      nlbc = nzrwafgs * ns->zns.zrwafg;
2118  
2119      trace_pci_nvme_zoned_zrwa_implicit_flush(zone->d.zslba, nlbc);
2120  
2121      zone->w_ptr += nlbc;
2122  
2123      nvme_advance_zone_wp(ns, zone, nlbc);
2124  }
2125  
2126  static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req)
2127  {
2128      NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2129      NvmeZone *zone;
2130      uint64_t slba;
2131      uint32_t nlb;
2132  
2133      slba = le64_to_cpu(rw->slba);
2134      nlb = le16_to_cpu(rw->nlb) + 1;
2135      zone = nvme_get_zone_by_slba(ns, slba);
2136      assert(zone);
2137  
2138      if (zone->d.za & NVME_ZA_ZRWA_VALID) {
2139          uint64_t ezrwa = zone->w_ptr + ns->zns.zrwas - 1;
2140          uint64_t elba = slba + nlb - 1;
2141  
2142          if (elba > ezrwa) {
2143              nvme_zoned_zrwa_implicit_flush(ns, zone, elba - ezrwa);
2144          }
2145  
2146          return;
2147      }
2148  
2149      nvme_advance_zone_wp(ns, zone, nlb);
2150  }
2151  
2152  static inline bool nvme_is_write(NvmeRequest *req)
2153  {
2154      NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2155  
2156      return rw->opcode == NVME_CMD_WRITE ||
2157             rw->opcode == NVME_CMD_ZONE_APPEND ||
2158             rw->opcode == NVME_CMD_WRITE_ZEROES;
2159  }
2160  
2161  static void nvme_misc_cb(void *opaque, int ret)
2162  {
2163      NvmeRequest *req = opaque;
2164  
2165      trace_pci_nvme_misc_cb(nvme_cid(req));
2166  
2167      if (ret) {
2168          nvme_aio_err(req, ret);
2169      }
2170  
2171      nvme_enqueue_req_completion(nvme_cq(req), req);
2172  }
2173  
2174  void nvme_rw_complete_cb(void *opaque, int ret)
2175  {
2176      NvmeRequest *req = opaque;
2177      NvmeNamespace *ns = req->ns;
2178      BlockBackend *blk = ns->blkconf.blk;
2179      BlockAcctCookie *acct = &req->acct;
2180      BlockAcctStats *stats = blk_get_stats(blk);
2181  
2182      trace_pci_nvme_rw_complete_cb(nvme_cid(req), blk_name(blk));
2183  
2184      if (ret) {
2185          block_acct_failed(stats, acct);
2186          nvme_aio_err(req, ret);
2187      } else {
2188          block_acct_done(stats, acct);
2189      }
2190  
2191      if (ns->params.zoned && nvme_is_write(req)) {
2192          nvme_finalize_zoned_write(ns, req);
2193      }
2194  
2195      nvme_enqueue_req_completion(nvme_cq(req), req);
2196  }
2197  
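      /*
       * Completion of the data phase for namespaces with metadata. Write
       * Zeroes also zeroes the metadata region; otherwise, for extended LBAs
       * or when an MPTR is given, the scatter/gather list is remapped for the
       * metadata and a second block I/O is issued at the metadata offset,
       * completing through nvme_rw_complete_cb().
       */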
2198  static void nvme_rw_cb(void *opaque, int ret)
2199  {
2200      NvmeRequest *req = opaque;
2201      NvmeNamespace *ns = req->ns;
2202  
2203      BlockBackend *blk = ns->blkconf.blk;
2204  
2205      trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));
2206  
2207      if (ret) {
2208          goto out;
2209      }
2210  
2211      if (ns->lbaf.ms) {
2212          NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2213          uint64_t slba = le64_to_cpu(rw->slba);
2214          uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
2215          uint64_t offset = nvme_moff(ns, slba);
2216  
2217          if (req->cmd.opcode == NVME_CMD_WRITE_ZEROES) {
2218              size_t mlen = nvme_m2b(ns, nlb);
2219  
2220              req->aiocb = blk_aio_pwrite_zeroes(blk, offset, mlen,
2221                                                 BDRV_REQ_MAY_UNMAP,
2222                                                 nvme_rw_complete_cb, req);
2223              return;
2224          }
2225  
2226          if (nvme_ns_ext(ns) || req->cmd.mptr) {
2227              uint16_t status;
2228  
2229              nvme_sg_unmap(&req->sg);
2230              status = nvme_map_mdata(nvme_ctrl(req), nlb, req);
2231              if (status) {
2232                  ret = -EFAULT;
2233                  goto out;
2234              }
2235  
2236              if (req->cmd.opcode == NVME_CMD_READ) {
2237                  return nvme_blk_read(blk, offset, 1, nvme_rw_complete_cb, req);
2238              }
2239  
2240              return nvme_blk_write(blk, offset, 1, nvme_rw_complete_cb, req);
2241          }
2242      }
2243  
2244  out:
2245      nvme_rw_complete_cb(req, ret);
2246  }
2247  
2248  static void nvme_verify_cb(void *opaque, int ret)
2249  {
2250      NvmeBounceContext *ctx = opaque;
2251      NvmeRequest *req = ctx->req;
2252      NvmeNamespace *ns = req->ns;
2253      BlockBackend *blk = ns->blkconf.blk;
2254      BlockAcctCookie *acct = &req->acct;
2255      BlockAcctStats *stats = blk_get_stats(blk);
2256      NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2257      uint64_t slba = le64_to_cpu(rw->slba);
2258      uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2259      uint16_t apptag = le16_to_cpu(rw->apptag);
2260      uint16_t appmask = le16_to_cpu(rw->appmask);
2261      uint64_t reftag = le32_to_cpu(rw->reftag);
2262      uint64_t cdw3 = le32_to_cpu(rw->cdw3);
2263      uint16_t status;
2264  
2265      reftag |= cdw3 << 32;
2266  
2267      trace_pci_nvme_verify_cb(nvme_cid(req), prinfo, apptag, appmask, reftag);
2268  
2269      if (ret) {
2270          block_acct_failed(stats, acct);
2271          nvme_aio_err(req, ret);
2272          goto out;
2273      }
2274  
2275      block_acct_done(stats, acct);
2276  
2277      if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2278          status = nvme_dif_mangle_mdata(ns, ctx->mdata.bounce,
2279                                         ctx->mdata.iov.size, slba);
2280          if (status) {
2281              req->status = status;
2282              goto out;
2283          }
2284  
2285          req->status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
2286                                       ctx->mdata.bounce, ctx->mdata.iov.size,
2287                                       prinfo, slba, apptag, appmask, &reftag);
2288      }
2289  
2290  out:
2291      qemu_iovec_destroy(&ctx->data.iov);
2292      g_free(ctx->data.bounce);
2293  
2294      qemu_iovec_destroy(&ctx->mdata.iov);
2295      g_free(ctx->mdata.bounce);
2296  
2297      g_free(ctx);
2298  
2299      nvme_enqueue_req_completion(nvme_cq(req), req);
2300  }
2301  
2302  
2303  static void nvme_verify_mdata_in_cb(void *opaque, int ret)
2304  {
2305      NvmeBounceContext *ctx = opaque;
2306      NvmeRequest *req = ctx->req;
2307      NvmeNamespace *ns = req->ns;
2308      NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2309      uint64_t slba = le64_to_cpu(rw->slba);
2310      uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2311      size_t mlen = nvme_m2b(ns, nlb);
2312      uint64_t offset = nvme_moff(ns, slba);
2313      BlockBackend *blk = ns->blkconf.blk;
2314  
2315      trace_pci_nvme_verify_mdata_in_cb(nvme_cid(req), blk_name(blk));
2316  
2317      if (ret) {
2318          goto out;
2319      }
2320  
2321      ctx->mdata.bounce = g_malloc(mlen);
2322  
2323      qemu_iovec_reset(&ctx->mdata.iov);
2324      qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);
2325  
2326      req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
2327                                  nvme_verify_cb, ctx);
2328      return;
2329  
2330  out:
2331      nvme_verify_cb(ctx, ret);
2332  }
2333  
2334  struct nvme_compare_ctx {
2335      struct {
2336          QEMUIOVector iov;
2337          uint8_t *bounce;
2338      } data;
2339  
2340      struct {
2341          QEMUIOVector iov;
2342          uint8_t *bounce;
2343      } mdata;
2344  };
2345  
2346  static void nvme_compare_mdata_cb(void *opaque, int ret)
2347  {
2348      NvmeRequest *req = opaque;
2349      NvmeNamespace *ns = req->ns;
2350      NvmeCtrl *n = nvme_ctrl(req);
2351      NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2352      uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2353      uint16_t apptag = le16_to_cpu(rw->apptag);
2354      uint16_t appmask = le16_to_cpu(rw->appmask);
2355      uint64_t reftag = le32_to_cpu(rw->reftag);
2356      uint64_t cdw3 = le32_to_cpu(rw->cdw3);
2357      struct nvme_compare_ctx *ctx = req->opaque;
2358      g_autofree uint8_t *buf = NULL;
2359      BlockBackend *blk = ns->blkconf.blk;
2360      BlockAcctCookie *acct = &req->acct;
2361      BlockAcctStats *stats = blk_get_stats(blk);
2362      uint16_t status = NVME_SUCCESS;
2363  
2364      reftag |= cdw3 << 32;
2365  
2366      trace_pci_nvme_compare_mdata_cb(nvme_cid(req));
2367  
2368      if (ret) {
2369          block_acct_failed(stats, acct);
2370          nvme_aio_err(req, ret);
2371          goto out;
2372      }
2373  
2374      buf = g_malloc(ctx->mdata.iov.size);
2375  
2376      status = nvme_bounce_mdata(n, buf, ctx->mdata.iov.size,
2377                                 NVME_TX_DIRECTION_TO_DEVICE, req);
2378      if (status) {
2379          req->status = status;
2380          goto out;
2381      }
2382  
2383      if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2384          uint64_t slba = le64_to_cpu(rw->slba);
2385          uint8_t *bufp;
2386          uint8_t *mbufp = ctx->mdata.bounce;
2387          uint8_t *end = mbufp + ctx->mdata.iov.size;
2388          int16_t pil = 0;
2389  
2390          status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
2391                                  ctx->mdata.bounce, ctx->mdata.iov.size, prinfo,
2392                                  slba, apptag, appmask, &reftag);
2393          if (status) {
2394              req->status = status;
2395              goto out;
2396          }
2397  
2398          /*
2399           * When formatted with protection information, do not compare the DIF
2400           * tuple.
2401           */
2402          if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) {
2403              pil = ns->lbaf.ms - nvme_pi_tuple_size(ns);
2404          }
2405  
2406          for (bufp = buf; mbufp < end; bufp += ns->lbaf.ms, mbufp += ns->lbaf.ms) {
2407              if (memcmp(bufp + pil, mbufp + pil, ns->lbaf.ms - pil)) {
2408                  req->status = NVME_CMP_FAILURE | NVME_DNR;
2409                  goto out;
2410              }
2411          }
2412  
2413          goto out;
2414      }
2415  
2416      if (memcmp(buf, ctx->mdata.bounce, ctx->mdata.iov.size)) {
2417          req->status = NVME_CMP_FAILURE | NVME_DNR;
2418          goto out;
2419      }
2420  
2421      block_acct_done(stats, acct);
2422  
2423  out:
2424      qemu_iovec_destroy(&ctx->data.iov);
2425      g_free(ctx->data.bounce);
2426  
2427      qemu_iovec_destroy(&ctx->mdata.iov);
2428      g_free(ctx->mdata.bounce);
2429  
2430      g_free(ctx);
2431  
2432      nvme_enqueue_req_completion(nvme_cq(req), req);
2433  }
2434  
2435  static void nvme_compare_data_cb(void *opaque, int ret)
2436  {
2437      NvmeRequest *req = opaque;
2438      NvmeCtrl *n = nvme_ctrl(req);
2439      NvmeNamespace *ns = req->ns;
2440      BlockBackend *blk = ns->blkconf.blk;
2441      BlockAcctCookie *acct = &req->acct;
2442      BlockAcctStats *stats = blk_get_stats(blk);
2443  
2444      struct nvme_compare_ctx *ctx = req->opaque;
2445      g_autofree uint8_t *buf = NULL;
2446      uint16_t status;
2447  
2448      trace_pci_nvme_compare_data_cb(nvme_cid(req));
2449  
2450      if (ret) {
2451          block_acct_failed(stats, acct);
2452          nvme_aio_err(req, ret);
2453          goto out;
2454      }
2455  
2456      buf = g_malloc(ctx->data.iov.size);
2457  
2458      status = nvme_bounce_data(n, buf, ctx->data.iov.size,
2459                                NVME_TX_DIRECTION_TO_DEVICE, req);
2460      if (status) {
2461          req->status = status;
2462          goto out;
2463      }
2464  
2465      if (memcmp(buf, ctx->data.bounce, ctx->data.iov.size)) {
2466          req->status = NVME_CMP_FAILURE | NVME_DNR;
2467          goto out;
2468      }
2469  
2470      if (ns->lbaf.ms) {
2471          NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2472          uint64_t slba = le64_to_cpu(rw->slba);
2473          uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2474          size_t mlen = nvme_m2b(ns, nlb);
2475          uint64_t offset = nvme_moff(ns, slba);
2476  
2477          ctx->mdata.bounce = g_malloc(mlen);
2478  
2479          qemu_iovec_init(&ctx->mdata.iov, 1);
2480          qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);
2481  
2482          req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
2483                                      nvme_compare_mdata_cb, req);
2484          return;
2485      }
2486  
2487      block_acct_done(stats, acct);
2488  
2489  out:
2490      qemu_iovec_destroy(&ctx->data.iov);
2491      g_free(ctx->data.bounce);
2492      g_free(ctx);
2493  
2494      nvme_enqueue_req_completion(nvme_cq(req), req);
2495  }
2496  
2497  typedef struct NvmeDSMAIOCB {
2498      BlockAIOCB common;
2499      BlockAIOCB *aiocb;
2500      NvmeRequest *req;
2501      int ret;
2502  
2503      NvmeDsmRange *range;
2504      unsigned int nr;
2505      unsigned int idx;
2506  } NvmeDSMAIOCB;
2507  
2508  static void nvme_dsm_cancel(BlockAIOCB *aiocb)
2509  {
2510      NvmeDSMAIOCB *iocb = container_of(aiocb, NvmeDSMAIOCB, common);
2511  
2512      /* break nvme_dsm_cb loop */
2513      iocb->idx = iocb->nr;
2514      iocb->ret = -ECANCELED;
2515  
2516      if (iocb->aiocb) {
2517          blk_aio_cancel_async(iocb->aiocb);
2518          iocb->aiocb = NULL;
2519      } else {
2520          /*
2521           * We only reach this if nvme_dsm_cancel() has already been called or
2522           * the command ran to completion.
2523           */
2524          assert(iocb->idx == iocb->nr);
2525      }
2526  }
2527  
2528  static const AIOCBInfo nvme_dsm_aiocb_info = {
2529      .aiocb_size   = sizeof(NvmeDSMAIOCB),
2530      .cancel_async = nvme_dsm_cancel,
2531  };
2532  
2533  static void nvme_dsm_cb(void *opaque, int ret);
2534  
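      /*
       * Deallocate is processed as a chain of AIOs: nvme_dsm_cb() discards
       * one range at a time (skipping ranges that exceed DMRSL or fall
       * outside the namespace) and nvme_dsm_md_cb() then zeroes the
       * corresponding metadata, but only if the discard left every block in
       * the range reading as zeroes.
       */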
2535  static void nvme_dsm_md_cb(void *opaque, int ret)
2536  {
2537      NvmeDSMAIOCB *iocb = opaque;
2538      NvmeRequest *req = iocb->req;
2539      NvmeNamespace *ns = req->ns;
2540      NvmeDsmRange *range;
2541      uint64_t slba;
2542      uint32_t nlb;
2543  
2544      if (ret < 0 || iocb->ret < 0 || !ns->lbaf.ms) {
2545          goto done;
2546      }
2547  
2548      range = &iocb->range[iocb->idx - 1];
2549      slba = le64_to_cpu(range->slba);
2550      nlb = le32_to_cpu(range->nlb);
2551  
2552      /*
2553       * Check that all blocks were discarded (zeroed); otherwise we do not zero
2554       * the metadata.
2555       */
2556  
2557      ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_ZERO);
2558      if (ret) {
2559          if (ret < 0) {
2560              goto done;
2561          }
2562  
2563          nvme_dsm_cb(iocb, 0);
2564          return;
2565      }
2566  
2567      iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_moff(ns, slba),
2568                                          nvme_m2b(ns, nlb), BDRV_REQ_MAY_UNMAP,
2569                                          nvme_dsm_cb, iocb);
2570      return;
2571  
2572  done:
2573      nvme_dsm_cb(iocb, ret);
2574  }
2575  
2576  static void nvme_dsm_cb(void *opaque, int ret)
2577  {
2578      NvmeDSMAIOCB *iocb = opaque;
2579      NvmeRequest *req = iocb->req;
2580      NvmeCtrl *n = nvme_ctrl(req);
2581      NvmeNamespace *ns = req->ns;
2582      NvmeDsmRange *range;
2583      uint64_t slba;
2584      uint32_t nlb;
2585  
2586      if (iocb->ret < 0) {
2587          goto done;
2588      } else if (ret < 0) {
2589          iocb->ret = ret;
2590          goto done;
2591      }
2592  
2593  next:
2594      if (iocb->idx == iocb->nr) {
2595          goto done;
2596      }
2597  
2598      range = &iocb->range[iocb->idx++];
2599      slba = le64_to_cpu(range->slba);
2600      nlb = le32_to_cpu(range->nlb);
2601  
2602      trace_pci_nvme_dsm_deallocate(slba, nlb);
2603  
2604      if (nlb > n->dmrsl) {
2605          trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl);
2606          goto next;
2607      }
2608  
2609      if (nvme_check_bounds(ns, slba, nlb)) {
2610          trace_pci_nvme_err_invalid_lba_range(slba, nlb,
2611                                               ns->id_ns.nsze);
2612          goto next;
2613      }
2614  
2615      iocb->aiocb = blk_aio_pdiscard(ns->blkconf.blk, nvme_l2b(ns, slba),
2616                                     nvme_l2b(ns, nlb),
2617                                     nvme_dsm_md_cb, iocb);
2618      return;
2619  
2620  done:
2621      iocb->aiocb = NULL;
2622      iocb->common.cb(iocb->common.opaque, iocb->ret);
2623      g_free(iocb->range);
2624      qemu_aio_unref(iocb);
2625  }
2626  
2627  static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
2628  {
2629      NvmeNamespace *ns = req->ns;
2630      NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
2631      uint32_t attr = le32_to_cpu(dsm->attributes);
2632      uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
2633      uint16_t status = NVME_SUCCESS;
2634  
2635      trace_pci_nvme_dsm(nr, attr);
2636  
2637      if (attr & NVME_DSMGMT_AD) {
2638          NvmeDSMAIOCB *iocb = blk_aio_get(&nvme_dsm_aiocb_info, ns->blkconf.blk,
2639                                           nvme_misc_cb, req);
2640  
2641          iocb->req = req;
2642          iocb->ret = 0;
2643          iocb->range = g_new(NvmeDsmRange, nr);
2644          iocb->nr = nr;
2645          iocb->idx = 0;
2646  
2647          status = nvme_h2c(n, (uint8_t *)iocb->range, sizeof(NvmeDsmRange) * nr,
2648                            req);
2649          if (status) {
2650              g_free(iocb->range);
2651              qemu_aio_unref(iocb);
2652  
2653              return status;
2654          }
2655  
2656          req->aiocb = &iocb->common;
2657          nvme_dsm_cb(iocb, 0);
2658  
2659          return NVME_NO_COMPLETE;
2660      }
2661  
2662      return status;
2663  }
2664  
2665  static uint16_t nvme_verify(NvmeCtrl *n, NvmeRequest *req)
2666  {
2667      NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2668      NvmeNamespace *ns = req->ns;
2669      BlockBackend *blk = ns->blkconf.blk;
2670      uint64_t slba = le64_to_cpu(rw->slba);
2671      uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2672      size_t len = nvme_l2b(ns, nlb);
2673      size_t data_len = len;
2674      int64_t offset = nvme_l2b(ns, slba);
2675      uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2676      uint32_t reftag = le32_to_cpu(rw->reftag);
2677      NvmeBounceContext *ctx = NULL;
2678      uint16_t status;
2679  
2680      trace_pci_nvme_verify(nvme_cid(req), nvme_nsid(ns), slba, nlb);
2681  
2682      if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2683          status = nvme_check_prinfo(ns, prinfo, slba, reftag);
2684          if (status) {
2685              return status;
2686          }
2687  
2688          if (prinfo & NVME_PRINFO_PRACT) {
2689              return NVME_INVALID_PROT_INFO | NVME_DNR;
2690          }
2691      }
2692  
2693      if (nvme_ns_ext(ns) && !(NVME_ID_CTRL_CTRATT_MEM(n->id_ctrl.ctratt))) {
2694          data_len += nvme_m2b(ns, nlb);
2695      }
2696  
2697      if (data_len > (n->page_size << n->params.vsl)) {
2698          return NVME_INVALID_FIELD | NVME_DNR;
2699      }
2700  
2701      status = nvme_check_bounds(ns, slba, nlb);
2702      if (status) {
2703          return status;
2704      }
2705  
2706      if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2707          status = nvme_check_dulbe(ns, slba, nlb);
2708          if (status) {
2709              return status;
2710          }
2711      }
2712  
2713      ctx = g_new0(NvmeBounceContext, 1);
2714      ctx->req = req;
2715  
2716      ctx->data.bounce = g_malloc(len);
2717  
2718      qemu_iovec_init(&ctx->data.iov, 1);
2719      qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, len);
2720  
2721      block_acct_start(blk_get_stats(blk), &req->acct, ctx->data.iov.size,
2722                       BLOCK_ACCT_READ);
2723  
2724      req->aiocb = blk_aio_preadv(ns->blkconf.blk, offset, &ctx->data.iov, 0,
2725                                  nvme_verify_mdata_in_cb, ctx);
2726      return NVME_NO_COMPLETE;
2727  }
2728  
2729  typedef struct NvmeCopyAIOCB {
2730      BlockAIOCB common;
2731      BlockAIOCB *aiocb;
2732      NvmeRequest *req;
2733      NvmeCtrl *n;
2734      int ret;
2735  
2736      void *ranges;
2737      unsigned int format;
2738      int nr;
2739      int idx;
2740  
2741      uint8_t *bounce;
2742      QEMUIOVector iov;
2743      struct {
2744          BlockAcctCookie read;
2745          BlockAcctCookie write;
2746      } acct;
2747  
2748      uint64_t reftag;
2749      uint64_t slba;
2750  
2751      NvmeZone *zone;
2752      NvmeNamespace *sns;
2753      uint32_t tcl;
2754  } NvmeCopyAIOCB;
2755  
2756  static void nvme_copy_cancel(BlockAIOCB *aiocb)
2757  {
2758      NvmeCopyAIOCB *iocb = container_of(aiocb, NvmeCopyAIOCB, common);
2759  
2760      iocb->ret = -ECANCELED;
2761  
2762      if (iocb->aiocb) {
2763          blk_aio_cancel_async(iocb->aiocb);
2764          iocb->aiocb = NULL;
2765      }
2766  }
2767  
2768  static const AIOCBInfo nvme_copy_aiocb_info = {
2769      .aiocb_size   = sizeof(NvmeCopyAIOCB),
2770      .cancel_async = nvme_copy_cancel,
2771  };
2772  
2773  static void nvme_copy_done(NvmeCopyAIOCB *iocb)
2774  {
2775      NvmeRequest *req = iocb->req;
2776      NvmeNamespace *ns = req->ns;
2777      BlockAcctStats *stats = blk_get_stats(ns->blkconf.blk);
2778  
2779      if (iocb->idx != iocb->nr) {
2780          req->cqe.result = cpu_to_le32(iocb->idx);
2781      }
2782  
2783      qemu_iovec_destroy(&iocb->iov);
2784      g_free(iocb->bounce);
2785  
2786      if (iocb->ret < 0) {
2787          block_acct_failed(stats, &iocb->acct.read);
2788          block_acct_failed(stats, &iocb->acct.write);
2789      } else {
2790          block_acct_done(stats, &iocb->acct.read);
2791          block_acct_done(stats, &iocb->acct.write);
2792      }
2793  
2794      iocb->common.cb(iocb->common.opaque, iocb->ret);
2795      qemu_aio_unref(iocb);
2796  }
2797  
2798  static void nvme_do_copy(NvmeCopyAIOCB *iocb);
2799  
2800  static void nvme_copy_source_range_parse_format0_2(void *ranges,
2801                                                     int idx, uint64_t *slba,
2802                                                     uint32_t *nlb,
2803                                                     uint32_t *snsid,
2804                                                     uint16_t *apptag,
2805                                                     uint16_t *appmask,
2806                                                     uint64_t *reftag)
2807  {
2808      NvmeCopySourceRangeFormat0_2 *_ranges = ranges;
2809  
2810      if (snsid) {
2811          *snsid = le32_to_cpu(_ranges[idx].sparams);
2812      }
2813  
2814      if (slba) {
2815          *slba = le64_to_cpu(_ranges[idx].slba);
2816      }
2817  
2818      if (nlb) {
2819          *nlb = le16_to_cpu(_ranges[idx].nlb) + 1;
2820      }
2821  
2822      if (apptag) {
2823          *apptag = le16_to_cpu(_ranges[idx].apptag);
2824      }
2825  
2826      if (appmask) {
2827          *appmask = le16_to_cpu(_ranges[idx].appmask);
2828      }
2829  
2830      if (reftag) {
2831          *reftag = le32_to_cpu(_ranges[idx].reftag);
2832      }
2833  }
2834  
2835  static void nvme_copy_source_range_parse_format1_3(void *ranges, int idx,
2836                                                     uint64_t *slba,
2837                                                     uint32_t *nlb,
2838                                                     uint32_t *snsid,
2839                                                     uint16_t *apptag,
2840                                                     uint16_t *appmask,
2841                                                     uint64_t *reftag)
2842  {
2843      NvmeCopySourceRangeFormat1_3 *_ranges = ranges;
2844  
2845      if (snsid) {
2846          *snsid = le32_to_cpu(_ranges[idx].sparams);
2847      }
2848  
2849      if (slba) {
2850          *slba = le64_to_cpu(_ranges[idx].slba);
2851      }
2852  
2853      if (nlb) {
2854          *nlb = le16_to_cpu(_ranges[idx].nlb) + 1;
2855      }
2856  
2857      if (apptag) {
2858          *apptag = le16_to_cpu(_ranges[idx].apptag);
2859      }
2860  
2861      if (appmask) {
2862          *appmask = le16_to_cpu(_ranges[idx].appmask);
2863      }
2864  
2865      if (reftag) {
2866          *reftag = 0;
2867  
2868          *reftag |= (uint64_t)_ranges[idx].sr[4] << 40;
2869          *reftag |= (uint64_t)_ranges[idx].sr[5] << 32;
2870          *reftag |= (uint64_t)_ranges[idx].sr[6] << 24;
2871          *reftag |= (uint64_t)_ranges[idx].sr[7] << 16;
2872          *reftag |= (uint64_t)_ranges[idx].sr[8] << 8;
2873          *reftag |= (uint64_t)_ranges[idx].sr[9];
2874      }
2875  }
2876  
2877  static void nvme_copy_source_range_parse(void *ranges, int idx, uint8_t format,
2878                                           uint64_t *slba, uint32_t *nlb,
2879                                           uint32_t *snsid, uint16_t *apptag,
2880                                           uint16_t *appmask, uint64_t *reftag)
2881  {
2882      switch (format) {
2883      case NVME_COPY_FORMAT_0:
2884      case NVME_COPY_FORMAT_2:
2885          nvme_copy_source_range_parse_format0_2(ranges, idx, slba, nlb, snsid,
2886                                                 apptag, appmask, reftag);
2887          break;
2888  
2889      case NVME_COPY_FORMAT_1:
2890      case NVME_COPY_FORMAT_3:
2891          nvme_copy_source_range_parse_format1_3(ranges, idx, slba, nlb, snsid,
2892                                                 apptag, appmask, reftag);
2893          break;
2894  
2895      default:
2896          abort();
2897      }
2898  }
2899  
2900  static inline uint16_t nvme_check_copy_mcl(NvmeNamespace *ns,
2901                                             NvmeCopyAIOCB *iocb, uint16_t nr)
2902  {
2903      uint32_t copy_len = 0;
2904  
2905      for (int idx = 0; idx < nr; idx++) {
2906          uint32_t nlb;
2907          nvme_copy_source_range_parse(iocb->ranges, idx, iocb->format, NULL,
2908                                       &nlb, NULL, NULL, NULL, NULL);
2909          copy_len += nlb;
2910      }
2911      iocb->tcl = copy_len;
2912      if (copy_len > ns->id_ns.mcl) {
2913          return NVME_CMD_SIZE_LIMIT | NVME_DNR;
2914      }
2915  
2916      return NVME_SUCCESS;
2917  }
2918  
2919  static void nvme_copy_out_completed_cb(void *opaque, int ret)
2920  {
2921      NvmeCopyAIOCB *iocb = opaque;
2922      NvmeRequest *req = iocb->req;
2923      NvmeNamespace *dns = req->ns;
2924      uint32_t nlb;
2925  
2926      nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, NULL,
2927                                   &nlb, NULL, NULL, NULL, NULL);
2928  
2929      if (ret < 0) {
2930          iocb->ret = ret;
2931          goto out;
2932      } else if (iocb->ret < 0) {
2933          goto out;
2934      }
2935  
2936      if (dns->params.zoned) {
2937          nvme_advance_zone_wp(dns, iocb->zone, nlb);
2938      }
2939  
2940      iocb->idx++;
2941      iocb->slba += nlb;
2942  out:
2943      nvme_do_copy(iocb);
2944  }
2945  
2946  static void nvme_copy_out_cb(void *opaque, int ret)
2947  {
2948      NvmeCopyAIOCB *iocb = opaque;
2949      NvmeRequest *req = iocb->req;
2950      NvmeNamespace *dns = req->ns;
2951      uint32_t nlb;
2952      size_t mlen;
2953      uint8_t *mbounce;
2954  
2955      if (ret < 0 || iocb->ret < 0 || !dns->lbaf.ms) {
2956          goto out;
2957      }
2958  
2959      nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, NULL,
2960                                   &nlb, NULL, NULL, NULL, NULL);
2961  
2962      mlen = nvme_m2b(dns, nlb);
2963      mbounce = iocb->bounce + nvme_l2b(dns, nlb);
2964  
2965      qemu_iovec_reset(&iocb->iov);
2966      qemu_iovec_add(&iocb->iov, mbounce, mlen);
2967  
2968      iocb->aiocb = blk_aio_pwritev(dns->blkconf.blk, nvme_moff(dns, iocb->slba),
2969                                    &iocb->iov, 0, nvme_copy_out_completed_cb,
2970                                    iocb);
2971  
2972      return;
2973  
2974  out:
2975      nvme_copy_out_completed_cb(iocb, ret);
2976  }
2977  
2978  static void nvme_copy_in_completed_cb(void *opaque, int ret)
2979  {
2980      NvmeCopyAIOCB *iocb = opaque;
2981      NvmeRequest *req = iocb->req;
2982      NvmeNamespace *sns = iocb->sns;
2983      NvmeNamespace *dns = req->ns;
2984      NvmeCopyCmd *copy = NULL;
2985      uint8_t *mbounce = NULL;
2986      uint32_t nlb;
2987      uint64_t slba;
2988      uint16_t apptag, appmask;
2989      uint64_t reftag;
2990      size_t len, mlen;
2991      uint16_t status;
2992  
2993      if (ret < 0) {
2994          iocb->ret = ret;
2995          goto out;
2996      } else if (iocb->ret < 0) {
2997          goto out;
2998      }
2999  
3000      nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, &slba,
3001                                   &nlb, NULL, &apptag, &appmask, &reftag);
3002  
3003      trace_pci_nvme_copy_out(iocb->slba, nlb);
3004  
3005      len = nvme_l2b(sns, nlb);
3006  
3007      if (NVME_ID_NS_DPS_TYPE(sns->id_ns.dps)) {
3008          copy = (NvmeCopyCmd *)&req->cmd;
3009  
3010          uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
3011  
3012          mlen = nvme_m2b(sns, nlb);
3013          mbounce = iocb->bounce + nvme_l2b(sns, nlb);
3014  
3015          status = nvme_dif_mangle_mdata(sns, mbounce, mlen, slba);
3016          if (status) {
3017              goto invalid;
3018          }
3019          status = nvme_dif_check(sns, iocb->bounce, len, mbounce, mlen, prinfor,
3020                                  slba, apptag, appmask, &reftag);
3021          if (status) {
3022              goto invalid;
3023          }
3024      }
3025  
3026      if (NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) {
3027          copy = (NvmeCopyCmd *)&req->cmd;
3028          uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);
3029  
3030          mlen = nvme_m2b(dns, nlb);
3031          mbounce = iocb->bounce + nvme_l2b(dns, nlb);
3032  
3033          apptag = le16_to_cpu(copy->apptag);
3034          appmask = le16_to_cpu(copy->appmask);
3035  
3036          if (prinfow & NVME_PRINFO_PRACT) {
3037              status = nvme_check_prinfo(dns, prinfow, iocb->slba, iocb->reftag);
3038              if (status) {
3039                  goto invalid;
3040              }
3041  
3042              nvme_dif_pract_generate_dif(dns, iocb->bounce, len, mbounce, mlen,
3043                                          apptag, &iocb->reftag);
3044          } else {
3045              status = nvme_dif_check(dns, iocb->bounce, len, mbounce, mlen,
3046                                      prinfow, iocb->slba, apptag, appmask,
3047                                      &iocb->reftag);
3048              if (status) {
3049                  goto invalid;
3050              }
3051          }
3052      }
3053  
3054      status = nvme_check_bounds(dns, iocb->slba, nlb);
3055      if (status) {
3056          goto invalid;
3057      }
3058  
3059      if (dns->params.zoned) {
3060          status = nvme_check_zone_write(dns, iocb->zone, iocb->slba, nlb);
3061          if (status) {
3062              goto invalid;
3063          }
3064  
3065          if (!(iocb->zone->d.za & NVME_ZA_ZRWA_VALID)) {
3066              iocb->zone->w_ptr += nlb;
3067          }
3068      }
3069  
3070      qemu_iovec_reset(&iocb->iov);
3071      qemu_iovec_add(&iocb->iov, iocb->bounce, len);
3072  
3073      block_acct_start(blk_get_stats(dns->blkconf.blk), &iocb->acct.write, 0,
3074                       BLOCK_ACCT_WRITE);
3075  
3076      iocb->aiocb = blk_aio_pwritev(dns->blkconf.blk, nvme_l2b(dns, iocb->slba),
3077                                    &iocb->iov, 0, nvme_copy_out_cb, iocb);
3078  
3079      return;
3080  
3081  invalid:
3082      req->status = status;
3083      iocb->ret = -1;
3084  out:
3085      nvme_do_copy(iocb);
3086  }
3087  
3088  static void nvme_copy_in_cb(void *opaque, int ret)
3089  {
3090      NvmeCopyAIOCB *iocb = opaque;
3091      NvmeNamespace *sns = iocb->sns;
3092      uint64_t slba;
3093      uint32_t nlb;
3094  
3095      if (ret < 0 || iocb->ret < 0 || !sns->lbaf.ms) {
3096          goto out;
3097      }
3098  
3099      nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, &slba,
3100                                   &nlb, NULL, NULL, NULL, NULL);
3101  
3102      qemu_iovec_reset(&iocb->iov);
3103      qemu_iovec_add(&iocb->iov, iocb->bounce + nvme_l2b(sns, nlb),
3104                     nvme_m2b(sns, nlb));
3105  
3106      iocb->aiocb = blk_aio_preadv(sns->blkconf.blk, nvme_moff(sns, slba),
3107                                   &iocb->iov, 0, nvme_copy_in_completed_cb,
3108                                   iocb);
3109      return;
3110  
3111  out:
3112      nvme_copy_in_completed_cb(iocb, ret);
3113  }
3114  
3115  static inline bool nvme_csi_supports_copy(uint8_t csi)
3116  {
3117      return csi == NVME_CSI_NVM || csi == NVME_CSI_ZONED;
3118  }
3119  
3120  static inline bool nvme_copy_ns_format_match(NvmeNamespace *sns,
3121                                               NvmeNamespace *dns)
3122  {
3123      return sns->lbaf.ds == dns->lbaf.ds && sns->lbaf.ms == dns->lbaf.ms;
3124  }
3125  
3126  static bool nvme_copy_matching_ns_format(NvmeNamespace *sns, NvmeNamespace *dns,
3127                                           bool pi_enable)
3128  {
3129      if (!nvme_csi_supports_copy(sns->csi) ||
3130          !nvme_csi_supports_copy(dns->csi)) {
3131          return false;
3132      }
3133  
3134      if (!pi_enable && !nvme_copy_ns_format_match(sns, dns)) {
3135          return false;
3136      }
3137  
3138      if (pi_enable && (!nvme_copy_ns_format_match(sns, dns) ||
3139          sns->id_ns.dps != dns->id_ns.dps)) {
3140          return false;
3141      }
3142  
3143      return true;
3144  }
3145  
3146  static inline bool nvme_copy_corresp_pi_match(NvmeNamespace *sns,
3147                                                NvmeNamespace *dns)
3148  {
3149      return sns->lbaf.ms == 0 &&
3150             ((dns->lbaf.ms == 8 && dns->pif == 0) ||
3151             (dns->lbaf.ms == 16 && dns->pif == 1));
3152  }
3153  
3154  static bool nvme_copy_corresp_pi_format(NvmeNamespace *sns, NvmeNamespace *dns,
3155                                          bool sns_pi_en)
3156  {
3157      if (!nvme_csi_supports_copy(sns->csi) ||
3158          !nvme_csi_supports_copy(dns->csi)) {
3159          return false;
3160      }
3161  
3162      if (!sns_pi_en && !nvme_copy_corresp_pi_match(sns, dns)) {
3163          return false;
3164      }
3165  
3166      if (sns_pi_en && !nvme_copy_corresp_pi_match(dns, sns)) {
3167          return false;
3168      }
3169  
3170      return true;
3171  }
3172  
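      /*
       * nvme_do_copy() drives the Copy state machine one source range at a
       * time: resolve the source namespace (formats 2 and 3 carry an explicit
       * source NSID), validate protection-information compatibility between
       * source and destination, check bounds, DULBE and zone state, then read
       * the range into a bounce buffer sized for MSSRL logical blocks plus
       * metadata.  The chain of completion callbacks (nvme_copy_in_cb and
       * friends, above) verifies or generates protection information, writes
       * the data out at the destination SLBA and re-enters nvme_do_copy()
       * until iocb->idx reaches iocb->nr or an error is recorded in
       * iocb->ret.
       */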
3173  static void nvme_do_copy(NvmeCopyAIOCB *iocb)
3174  {
3175      NvmeRequest *req = iocb->req;
3176      NvmeNamespace *sns;
3177      NvmeNamespace *dns = req->ns;
3178      NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
3179      uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
3180      uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);
3181      uint64_t slba;
3182      uint32_t nlb;
3183      size_t len;
3184      uint16_t status;
3185      uint32_t dnsid = le32_to_cpu(req->cmd.nsid);
3186      uint32_t snsid = dnsid;
3187  
3188      if (iocb->ret < 0) {
3189          goto done;
3190      }
3191  
3192      if (iocb->idx == iocb->nr) {
3193          goto done;
3194      }
3195  
3196      if (iocb->format == 2 || iocb->format == 3) {
3197          nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format,
3198                                       &slba, &nlb, &snsid, NULL, NULL, NULL);
3199          if (snsid != dnsid) {
3200              if (snsid == NVME_NSID_BROADCAST ||
3201                  !nvme_nsid_valid(iocb->n, snsid)) {
3202                  status = NVME_INVALID_NSID | NVME_DNR;
3203                  goto invalid;
3204              }
3205              iocb->sns = nvme_ns(iocb->n, snsid);
3206              if (unlikely(!iocb->sns)) {
3207                  status = NVME_INVALID_FIELD | NVME_DNR;
3208                  goto invalid;
3209              }
3210          } else {
3211              if (((slba + nlb) > iocb->slba) &&
3212                  ((slba + nlb) < (iocb->slba + iocb->tcl))) {
3213                  status = NVME_CMD_OVERLAP_IO_RANGE | NVME_DNR;
3214                  goto invalid;
3215              }
3216          }
3217      } else {
3218          nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format,
3219                                       &slba, &nlb, NULL, NULL, NULL, NULL);
3220      }
3221  
3222      sns = iocb->sns;
3223      if ((snsid == dnsid) && NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) &&
3224          ((prinfor & NVME_PRINFO_PRACT) != (prinfow & NVME_PRINFO_PRACT))) {
3225          status = NVME_INVALID_FIELD | NVME_DNR;
3226          goto invalid;
3227      } else if (snsid != dnsid) {
3228          if (!NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) &&
3229              !NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) {
3230              if (!nvme_copy_matching_ns_format(sns, dns, false)) {
3231                  status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3232                  goto invalid;
3233              }
3234          }
3235          if (NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) &&
3236              NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) {
3237              if ((prinfor & NVME_PRINFO_PRACT) !=
3238                  (prinfow & NVME_PRINFO_PRACT)) {
3239                  status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3240                  goto invalid;
3241              } else {
3242                  if (!nvme_copy_matching_ns_format(sns, dns, true)) {
3243                      status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3244                      goto invalid;
3245                  }
3246              }
3247          }
3248  
3249          if (!NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) &&
3250              NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) {
3251              if (!(prinfow & NVME_PRINFO_PRACT)) {
3252                  status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3253                  goto invalid;
3254              } else {
3255                  if (!nvme_copy_corresp_pi_format(sns, dns, false)) {
3256                      status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3257                      goto invalid;
3258                  }
3259              }
3260          }
3261  
3262          if (NVME_ID_NS_DPS_TYPE(sns->id_ns.dps) &&
3263              !NVME_ID_NS_DPS_TYPE(dns->id_ns.dps)) {
3264              if (!(prinfor & NVME_PRINFO_PRACT)) {
3265                  status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3266                  goto invalid;
3267              } else {
3268                  if (!nvme_copy_corresp_pi_format(sns, dns, true)) {
3269                      status = NVME_CMD_INCOMP_NS_OR_FMT | NVME_DNR;
3270                      goto invalid;
3271                  }
3272              }
3273          }
3274      }
3275      len = nvme_l2b(sns, nlb);
3276  
3277      trace_pci_nvme_copy_source_range(slba, nlb);
3278  
3279      if (nlb > le16_to_cpu(sns->id_ns.mssrl)) {
3280          status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
3281          goto invalid;
3282      }
3283  
3284      status = nvme_check_bounds(sns, slba, nlb);
3285      if (status) {
3286          goto invalid;
3287      }
3288  
3289      if (NVME_ERR_REC_DULBE(sns->features.err_rec)) {
3290          status = nvme_check_dulbe(sns, slba, nlb);
3291          if (status) {
3292              goto invalid;
3293          }
3294      }
3295  
3296      if (sns->params.zoned) {
3297          status = nvme_check_zone_read(sns, slba, nlb);
3298          if (status) {
3299              goto invalid;
3300          }
3301      }
3302  
3303      g_free(iocb->bounce);
3304      iocb->bounce = g_malloc_n(le16_to_cpu(sns->id_ns.mssrl),
3305                                sns->lbasz + sns->lbaf.ms);
3306  
3307      qemu_iovec_reset(&iocb->iov);
3308      qemu_iovec_add(&iocb->iov, iocb->bounce, len);
3309  
3310      block_acct_start(blk_get_stats(sns->blkconf.blk), &iocb->acct.read, 0,
3311                       BLOCK_ACCT_READ);
3312  
3313      iocb->aiocb = blk_aio_preadv(sns->blkconf.blk, nvme_l2b(sns, slba),
3314                                   &iocb->iov, 0, nvme_copy_in_cb, iocb);
3315      return;
3316  
3317  invalid:
3318      req->status = status;
3319      iocb->ret = -1;
3320  done:
3321      nvme_copy_done(iocb);
3322  }
3323  
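      /*
       * Copy command entry point.  The source range descriptor format is
       * validated against the controller's OCFS (and, for formats 2 and 3,
       * the host behavior support CDFE bits), the number of ranges against
       * MSRC, and the format against the namespace PIF before the descriptor
       * list is transferred from the host.  For zoned namespaces the
       * destination zone is resolved and implicitly opened up front, the
       * total copy length is checked against MCL, and the per-range work is
       * then handed off to nvme_do_copy().
       */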
3324  static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)
3325  {
3326      NvmeNamespace *ns = req->ns;
3327      NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
3328      NvmeCopyAIOCB *iocb = blk_aio_get(&nvme_copy_aiocb_info, ns->blkconf.blk,
3329                                        nvme_misc_cb, req);
3330      uint16_t nr = copy->nr + 1;
3331      uint8_t format = copy->control[0] & 0xf;
3332      size_t len = sizeof(NvmeCopySourceRangeFormat0_2);
3333  
3334      uint16_t status;
3335  
3336      trace_pci_nvme_copy(nvme_cid(req), nvme_nsid(ns), nr, format);
3337  
3338      iocb->ranges = NULL;
3339      iocb->zone = NULL;
3340  
3341      if (!(n->id_ctrl.ocfs & (1 << format)) ||
3342          ((format == 2 || format == 3) &&
3343           !(n->features.hbs.cdfe & (1 << format)))) {
3344          trace_pci_nvme_err_copy_invalid_format(format);
3345          status = NVME_INVALID_FIELD | NVME_DNR;
3346          goto invalid;
3347      }
3348  
3349      if (nr > ns->id_ns.msrc + 1) {
3350          status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
3351          goto invalid;
3352      }
3353  
3354      if ((ns->pif == 0x0 && (format != 0x0 && format != 0x2)) ||
3355          (ns->pif != 0x0 && (format != 0x1 && format != 0x3))) {
3356          status = NVME_INVALID_FORMAT | NVME_DNR;
3357          goto invalid;
3358      }
3359  
3360      if (ns->pif) {
3361          len = sizeof(NvmeCopySourceRangeFormat1_3);
3362      }
3363  
3364      iocb->format = format;
3365      iocb->ranges = g_malloc_n(nr, len);
3366      status = nvme_h2c(n, (uint8_t *)iocb->ranges, len * nr, req);
3367      if (status) {
3368          goto invalid;
3369      }
3370  
3371      iocb->slba = le64_to_cpu(copy->sdlba);
3372  
3373      if (ns->params.zoned) {
3374          iocb->zone = nvme_get_zone_by_slba(ns, iocb->slba);
3375          if (!iocb->zone) {
3376              status = NVME_LBA_RANGE | NVME_DNR;
3377              goto invalid;
3378          }
3379  
3380          status = nvme_zrm_auto(n, ns, iocb->zone);
3381          if (status) {
3382              goto invalid;
3383          }
3384      }
3385  
3386      status = nvme_check_copy_mcl(ns, iocb, nr);
3387      if (status) {
3388          goto invalid;
3389      }
3390  
3391      iocb->req = req;
3392      iocb->ret = 0;
3393      iocb->nr = nr;
3394      iocb->idx = 0;
3395      iocb->reftag = le32_to_cpu(copy->reftag);
3396      iocb->reftag |= (uint64_t)le32_to_cpu(copy->cdw3) << 32;
3397  
3398      qemu_iovec_init(&iocb->iov, 1);
3399  
3400      req->aiocb = &iocb->common;
3401      iocb->sns = req->ns;
3402      iocb->n = n;
3403      iocb->bounce = NULL;
3404      nvme_do_copy(iocb);
3405  
3406      return NVME_NO_COMPLETE;
3407  
3408  invalid:
3409      g_free(iocb->ranges);
3410      qemu_aio_unref(iocb);
3411      return status;
3412  }
3413  
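      /*
       * Compare reads the target LBA range into a bounce buffer; the actual
       * comparison against the host data happens in nvme_compare_data_cb()
       * once the read completes.  PRACT must not be set when the namespace is
       * formatted with protection information.
       */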
3414  static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
3415  {
3416      NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3417      NvmeNamespace *ns = req->ns;
3418      BlockBackend *blk = ns->blkconf.blk;
3419      uint64_t slba = le64_to_cpu(rw->slba);
3420      uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
3421      uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
3422      size_t data_len = nvme_l2b(ns, nlb);
3423      size_t len = data_len;
3424      int64_t offset = nvme_l2b(ns, slba);
3425      struct nvme_compare_ctx *ctx = NULL;
3426      uint16_t status;
3427  
3428      trace_pci_nvme_compare(nvme_cid(req), nvme_nsid(ns), slba, nlb);
3429  
3430      if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) && (prinfo & NVME_PRINFO_PRACT)) {
3431          return NVME_INVALID_PROT_INFO | NVME_DNR;
3432      }
3433  
3434      if (nvme_ns_ext(ns)) {
3435          len += nvme_m2b(ns, nlb);
3436      }
3437  
3438      if (NVME_ID_CTRL_CTRATT_MEM(n->id_ctrl.ctratt)) {
3439          status = nvme_check_mdts(n, data_len);
3440      } else {
3441          status = nvme_check_mdts(n, len);
3442      }
3443      if (status) {
3444          return status;
3445      }
3446  
3447      status = nvme_check_bounds(ns, slba, nlb);
3448      if (status) {
3449          return status;
3450      }
3451  
3452      if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
3453          status = nvme_check_dulbe(ns, slba, nlb);
3454          if (status) {
3455              return status;
3456          }
3457      }
3458  
3459      status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
3460      if (status) {
3461          return status;
3462      }
3463  
3464      ctx = g_new(struct nvme_compare_ctx, 1);
3465      ctx->data.bounce = g_malloc(data_len);
3466  
3467      req->opaque = ctx;
3468  
3469      qemu_iovec_init(&ctx->data.iov, 1);
3470      qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, data_len);
3471  
3472      block_acct_start(blk_get_stats(blk), &req->acct, data_len,
3473                       BLOCK_ACCT_READ);
3474      req->aiocb = blk_aio_preadv(blk, offset, &ctx->data.iov, 0,
3475                                  nvme_compare_data_cb, req);
3476  
3477      return NVME_NO_COMPLETE;
3478  }
3479  
3480  typedef struct NvmeFlushAIOCB {
3481      BlockAIOCB common;
3482      BlockAIOCB *aiocb;
3483      NvmeRequest *req;
3484      int ret;
3485  
3486      NvmeNamespace *ns;
3487      uint32_t nsid;
3488      bool broadcast;
3489  } NvmeFlushAIOCB;
3490  
3491  static void nvme_flush_cancel(BlockAIOCB *acb)
3492  {
3493      NvmeFlushAIOCB *iocb = container_of(acb, NvmeFlushAIOCB, common);
3494  
3495      iocb->ret = -ECANCELED;
3496  
3497      if (iocb->aiocb) {
3498          blk_aio_cancel_async(iocb->aiocb);
3499          iocb->aiocb = NULL;
3500      }
3501  }
3502  
3503  static const AIOCBInfo nvme_flush_aiocb_info = {
3504      .aiocb_size = sizeof(NvmeFlushAIOCB),
3505      .cancel_async = nvme_flush_cancel,
3506  };
3507  
3508  static void nvme_do_flush(NvmeFlushAIOCB *iocb);
3509  
3510  static void nvme_flush_ns_cb(void *opaque, int ret)
3511  {
3512      NvmeFlushAIOCB *iocb = opaque;
3513      NvmeNamespace *ns = iocb->ns;
3514  
3515      if (ret < 0) {
3516          iocb->ret = ret;
3517          goto out;
3518      } else if (iocb->ret < 0) {
3519          goto out;
3520      }
3521  
3522      if (ns) {
3523          trace_pci_nvme_flush_ns(iocb->nsid);
3524  
3525          iocb->ns = NULL;
3526          iocb->aiocb = blk_aio_flush(ns->blkconf.blk, nvme_flush_ns_cb, iocb);
3527          return;
3528      }
3529  
3530  out:
3531      nvme_do_flush(iocb);
3532  }
3533  
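      /*
       * For a broadcast flush (NSID FFFFFFFFh) the namespaces are flushed one
       * at a time: nvme_do_flush() picks the next attached namespace after
       * iocb->nsid and nvme_flush_ns_cb() issues the blk_aio_flush(), chaining
       * back here when it completes.  For example, with namespaces 1 and 3
       * attached the sequence is flush(ns 1), flush(ns 3), then completion of
       * the original request.
       */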
3534  static void nvme_do_flush(NvmeFlushAIOCB *iocb)
3535  {
3536      NvmeRequest *req = iocb->req;
3537      NvmeCtrl *n = nvme_ctrl(req);
3538      int i;
3539  
3540      if (iocb->ret < 0) {
3541          goto done;
3542      }
3543  
3544      if (iocb->broadcast) {
3545          for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
3546              iocb->ns = nvme_ns(n, i);
3547              if (iocb->ns) {
3548                  iocb->nsid = i;
3549                  break;
3550              }
3551          }
3552      }
3553  
3554      if (!iocb->ns) {
3555          goto done;
3556      }
3557  
3558      nvme_flush_ns_cb(iocb, 0);
3559      return;
3560  
3561  done:
3562      iocb->common.cb(iocb->common.opaque, iocb->ret);
3563      qemu_aio_unref(iocb);
3564  }
3565  
3566  static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
3567  {
3568      NvmeFlushAIOCB *iocb;
3569      uint32_t nsid = le32_to_cpu(req->cmd.nsid);
3570      uint16_t status;
3571  
3572      iocb = qemu_aio_get(&nvme_flush_aiocb_info, NULL, nvme_misc_cb, req);
3573  
3574      iocb->req = req;
3575      iocb->ret = 0;
3576      iocb->ns = NULL;
3577      iocb->nsid = 0;
3578      iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
3579  
3580      if (!iocb->broadcast) {
3581          if (!nvme_nsid_valid(n, nsid)) {
3582              status = NVME_INVALID_NSID | NVME_DNR;
3583              goto out;
3584          }
3585  
3586          iocb->ns = nvme_ns(n, nsid);
3587          if (!iocb->ns) {
3588              status = NVME_INVALID_FIELD | NVME_DNR;
3589              goto out;
3590          }
3591  
3592          iocb->nsid = nsid;
3593      }
3594  
3595      req->aiocb = &iocb->common;
3596      nvme_do_flush(iocb);
3597  
3598      return NVME_NO_COMPLETE;
3599  
3600  out:
3601      qemu_aio_unref(iocb);
3602  
3603      return status;
3604  }
3605  
3606  static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
3607  {
3608      NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3609      NvmeNamespace *ns = req->ns;
3610      uint64_t slba = le64_to_cpu(rw->slba);
3611      uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
3612      uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
3613      uint64_t data_size = nvme_l2b(ns, nlb);
3614      uint64_t mapped_size = data_size;
3615      uint64_t data_offset;
3616      BlockBackend *blk = ns->blkconf.blk;
3617      uint16_t status;
3618  
3619      if (nvme_ns_ext(ns) && !(NVME_ID_CTRL_CTRATT_MEM(n->id_ctrl.ctratt))) {
3620          mapped_size += nvme_m2b(ns, nlb);
3621  
3622          if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3623              bool pract = prinfo & NVME_PRINFO_PRACT;
3624  
3625              if (pract && ns->lbaf.ms == nvme_pi_tuple_size(ns)) {
3626                  mapped_size = data_size;
3627              }
3628          }
3629      }
3630  
3631      trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, mapped_size, slba);
3632  
3633      status = nvme_check_mdts(n, mapped_size);
3634      if (status) {
3635          goto invalid;
3636      }
3637  
3638      status = nvme_check_bounds(ns, slba, nlb);
3639      if (status) {
3640          goto invalid;
3641      }
3642  
3643      if (ns->params.zoned) {
3644          status = nvme_check_zone_read(ns, slba, nlb);
3645          if (status) {
3646              trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status);
3647              goto invalid;
3648          }
3649      }
3650  
3651      if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
3652          status = nvme_check_dulbe(ns, slba, nlb);
3653          if (status) {
3654              goto invalid;
3655          }
3656      }
3657  
3658      if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3659          return nvme_dif_rw(n, req);
3660      }
3661  
3662      status = nvme_map_data(n, nlb, req);
3663      if (status) {
3664          goto invalid;
3665      }
3666  
3667      data_offset = nvme_l2b(ns, slba);
3668  
3669      block_acct_start(blk_get_stats(blk), &req->acct, data_size,
3670                       BLOCK_ACCT_READ);
3671      nvme_blk_read(blk, data_offset, BDRV_SECTOR_SIZE, nvme_rw_cb, req);
3672      return NVME_NO_COMPLETE;
3673  
3674  invalid:
3675      block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ);
3676      return status | NVME_DNR;
3677  }
3678  
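      /*
       * FDP bookkeeping for writes: resolve the placement handle and reclaim
       * group from the DSPEC field (falling back to placement handle 0,
       * reclaim group 0 when no data placement directive is given or the PID
       * is invalid), account the bytes in the host/media bytes-written
       * statistics, and consume the reclaim unit's remaining writable
       * capacity (ruamw), moving on to a fresh reclaim unit via
       * nvme_update_ruh() whenever one is filled.
       */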
3679  static void nvme_do_write_fdp(NvmeCtrl *n, NvmeRequest *req, uint64_t slba,
3680                                uint32_t nlb)
3681  {
3682      NvmeNamespace *ns = req->ns;
3683      NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3684      uint64_t data_size = nvme_l2b(ns, nlb);
3685      uint32_t dw12 = le32_to_cpu(req->cmd.cdw12);
3686      uint8_t dtype = (dw12 >> 20) & 0xf;
3687      uint16_t pid = le16_to_cpu(rw->dspec);
3688      uint16_t ph, rg, ruhid;
3689      NvmeReclaimUnit *ru;
3690  
3691      if (dtype != NVME_DIRECTIVE_DATA_PLACEMENT ||
3692          !nvme_parse_pid(ns, pid, &ph, &rg)) {
3693          ph = 0;
3694          rg = 0;
3695      }
3696  
3697      ruhid = ns->fdp.phs[ph];
3698      ru = &ns->endgrp->fdp.ruhs[ruhid].rus[rg];
3699  
3700      nvme_fdp_stat_inc(&ns->endgrp->fdp.hbmw, data_size);
3701      nvme_fdp_stat_inc(&ns->endgrp->fdp.mbmw, data_size);
3702  
3703      while (nlb) {
3704          if (nlb < ru->ruamw) {
3705              ru->ruamw -= nlb;
3706              break;
3707          }
3708  
3709          nlb -= ru->ruamw;
3710          nvme_update_ruh(n, ns, pid);
3711      }
3712  }
3713  
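      /*
       * Common implementation for Write, Write Zeroes (wrz) and Zone Append
       * (append).  For appends the SLBA is rewritten to the zone write
       * pointer, the request is checked against ZASL, and - depending on the
       * protection information type - the reference tag is remapped relative
       * to the zone start (PIREMAP).  Zoned writes advance the write pointer
       * here unless a ZRWA is associated with the zone; FDP-enabled (non-
       * zoned) namespaces instead get their reclaim unit accounting updated
       * via nvme_do_write_fdp().
       */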
3714  static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
3715                                bool wrz)
3716  {
3717      NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3718      NvmeNamespace *ns = req->ns;
3719      uint64_t slba = le64_to_cpu(rw->slba);
3720      uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
3721      uint16_t ctrl = le16_to_cpu(rw->control);
3722      uint8_t prinfo = NVME_RW_PRINFO(ctrl);
3723      uint64_t data_size = nvme_l2b(ns, nlb);
3724      uint64_t mapped_size = data_size;
3725      uint64_t data_offset;
3726      NvmeZone *zone;
3727      NvmeZonedResult *res = (NvmeZonedResult *)&req->cqe;
3728      BlockBackend *blk = ns->blkconf.blk;
3729      uint16_t status;
3730  
3731      if (nvme_ns_ext(ns) && !(NVME_ID_CTRL_CTRATT_MEM(n->id_ctrl.ctratt))) {
3732          mapped_size += nvme_m2b(ns, nlb);
3733  
3734          if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3735              bool pract = prinfo & NVME_PRINFO_PRACT;
3736  
3737              if (pract && ns->lbaf.ms == nvme_pi_tuple_size(ns)) {
3738                  mapped_size -= nvme_m2b(ns, nlb);
3739              }
3740          }
3741      }
3742  
3743      trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode),
3744                           nvme_nsid(ns), nlb, mapped_size, slba);
3745  
3746      if (!wrz) {
3747          status = nvme_check_mdts(n, mapped_size);
3748          if (status) {
3749              goto invalid;
3750          }
3751      }
3752  
3753      status = nvme_check_bounds(ns, slba, nlb);
3754      if (status) {
3755          goto invalid;
3756      }
3757  
3758      if (ns->params.zoned) {
3759          zone = nvme_get_zone_by_slba(ns, slba);
3760          assert(zone);
3761  
3762          if (append) {
3763              bool piremap = !!(ctrl & NVME_RW_PIREMAP);
3764  
3765              if (unlikely(zone->d.za & NVME_ZA_ZRWA_VALID)) {
3766                  return NVME_INVALID_ZONE_OP | NVME_DNR;
3767              }
3768  
3769              if (unlikely(slba != zone->d.zslba)) {
3770                  trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
3771                  status = NVME_INVALID_FIELD;
3772                  goto invalid;
3773              }
3774  
3775              if (n->params.zasl &&
3776                  data_size > (uint64_t)n->page_size << n->params.zasl) {
3777                  trace_pci_nvme_err_zasl(data_size);
3778                  return NVME_INVALID_FIELD | NVME_DNR;
3779              }
3780  
3781              slba = zone->w_ptr;
3782              rw->slba = cpu_to_le64(slba);
3783              res->slba = cpu_to_le64(slba);
3784  
3785              switch (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3786              case NVME_ID_NS_DPS_TYPE_1:
3787                  if (!piremap) {
3788                      return NVME_INVALID_PROT_INFO | NVME_DNR;
3789                  }
3790  
3791                  /* fallthrough */
3792  
3793              case NVME_ID_NS_DPS_TYPE_2:
3794                  if (piremap) {
3795                      uint32_t reftag = le32_to_cpu(rw->reftag);
3796                      rw->reftag = cpu_to_le32(reftag + (slba - zone->d.zslba));
3797                  }
3798  
3799                  break;
3800  
3801              case NVME_ID_NS_DPS_TYPE_3:
3802                  if (piremap) {
3803                      return NVME_INVALID_PROT_INFO | NVME_DNR;
3804                  }
3805  
3806                  break;
3807              }
3808          }
3809  
3810          status = nvme_check_zone_write(ns, zone, slba, nlb);
3811          if (status) {
3812              goto invalid;
3813          }
3814  
3815          status = nvme_zrm_auto(n, ns, zone);
3816          if (status) {
3817              goto invalid;
3818          }
3819  
3820          if (!(zone->d.za & NVME_ZA_ZRWA_VALID)) {
3821              zone->w_ptr += nlb;
3822          }
3823      } else if (ns->endgrp && ns->endgrp->fdp.enabled) {
3824          nvme_do_write_fdp(n, req, slba, nlb);
3825      }
3826  
3827      data_offset = nvme_l2b(ns, slba);
3828  
3829      if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3830          return nvme_dif_rw(n, req);
3831      }
3832  
3833      if (!wrz) {
3834          status = nvme_map_data(n, nlb, req);
3835          if (status) {
3836              goto invalid;
3837          }
3838  
3839          block_acct_start(blk_get_stats(blk), &req->acct, data_size,
3840                           BLOCK_ACCT_WRITE);
3841          nvme_blk_write(blk, data_offset, BDRV_SECTOR_SIZE, nvme_rw_cb, req);
3842      } else {
3843          req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size,
3844                                             BDRV_REQ_MAY_UNMAP, nvme_rw_cb,
3845                                             req);
3846      }
3847  
3848      return NVME_NO_COMPLETE;
3849  
3850  invalid:
3851      block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_WRITE);
3852      return status | NVME_DNR;
3853  }
3854  
3855  static inline uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
3856  {
3857      return nvme_do_write(n, req, false, false);
3858  }
3859  
3860  static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
3861  {
3862      return nvme_do_write(n, req, false, true);
3863  }
3864  
3865  static inline uint16_t nvme_zone_append(NvmeCtrl *n, NvmeRequest *req)
3866  {
3867      return nvme_do_write(n, req, true, false);
3868  }
3869  
3870  static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c,
3871                                              uint64_t *slba, uint32_t *zone_idx)
3872  {
3873      uint32_t dw10 = le32_to_cpu(c->cdw10);
3874      uint32_t dw11 = le32_to_cpu(c->cdw11);
3875  
3876      if (!ns->params.zoned) {
3877          trace_pci_nvme_err_invalid_opc(c->opcode);
3878          return NVME_INVALID_OPCODE | NVME_DNR;
3879      }
3880  
3881      *slba = ((uint64_t)dw11) << 32 | dw10;
3882      if (unlikely(*slba >= ns->id_ns.nsze)) {
3883          trace_pci_nvme_err_invalid_lba_range(*slba, 0, ns->id_ns.nsze);
3884          *slba = 0;
3885          return NVME_LBA_RANGE | NVME_DNR;
3886      }
3887  
3888      *zone_idx = nvme_zone_idx(ns, *slba);
3889      assert(*zone_idx < ns->num_zones);
3890  
3891      return NVME_SUCCESS;
3892  }
3893  
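      /*
       * Zone Management Send plumbing: an op_handler_t implements a single
       * zone action, and NvmeZoneProcessingMask selects which zone states are
       * visited when the Select All bit is set (e.g. Close with Select All
       * only touches opened zones, Finish touches opened and closed zones).
       */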
3894  typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState,
3895                                   NvmeRequest *);
3896  
3897  enum NvmeZoneProcessingMask {
3898      NVME_PROC_CURRENT_ZONE    = 0,
3899      NVME_PROC_OPENED_ZONES    = 1 << 0,
3900      NVME_PROC_CLOSED_ZONES    = 1 << 1,
3901      NVME_PROC_READ_ONLY_ZONES = 1 << 2,
3902      NVME_PROC_FULL_ZONES      = 1 << 3,
3903  };
3904  
3905  static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone,
3906                                 NvmeZoneState state, NvmeRequest *req)
3907  {
3908      NvmeZoneSendCmd *cmd = (NvmeZoneSendCmd *)&req->cmd;
3909      int flags = 0;
3910  
3911      if (cmd->zsflags & NVME_ZSFLAG_ZRWA_ALLOC) {
3912          uint16_t ozcs = le16_to_cpu(ns->id_ns_zoned->ozcs);
3913  
3914          if (!(ozcs & NVME_ID_NS_ZONED_OZCS_ZRWASUP)) {
3915              return NVME_INVALID_ZONE_OP | NVME_DNR;
3916          }
3917  
3918          if (zone->w_ptr % ns->zns.zrwafg) {
3919              return NVME_NOZRWA | NVME_DNR;
3920          }
3921  
3922          flags = NVME_ZRM_ZRWA;
3923      }
3924  
3925      return nvme_zrm_open_flags(nvme_ctrl(req), ns, zone, flags);
3926  }
3927  
3928  static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone,
3929                                  NvmeZoneState state, NvmeRequest *req)
3930  {
3931      return nvme_zrm_close(ns, zone);
3932  }
3933  
3934  static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone,
3935                                   NvmeZoneState state, NvmeRequest *req)
3936  {
3937      return nvme_zrm_finish(ns, zone);
3938  }
3939  
3940  static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone,
3941                                    NvmeZoneState state, NvmeRequest *req)
3942  {
3943      switch (state) {
3944      case NVME_ZONE_STATE_READ_ONLY:
3945          nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_OFFLINE);
3946          /* fall through */
3947      case NVME_ZONE_STATE_OFFLINE:
3948          return NVME_SUCCESS;
3949      default:
3950          return NVME_ZONE_INVAL_TRANSITION;
3951      }
3952  }
3953  
3954  static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone)
3955  {
3956      uint16_t status;
3957      uint8_t state = nvme_get_zone_state(zone);
3958  
3959      if (state == NVME_ZONE_STATE_EMPTY) {
3960          status = nvme_aor_check(ns, 1, 0);
3961          if (status) {
3962              return status;
3963          }
3964          nvme_aor_inc_active(ns);
3965          zone->d.za |= NVME_ZA_ZD_EXT_VALID;
3966          nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
3967          return NVME_SUCCESS;
3968      }
3969  
3970      return NVME_ZONE_INVAL_TRANSITION;
3971  }
3972  
3973  static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone,
3974                                      enum NvmeZoneProcessingMask proc_mask,
3975                                      op_handler_t op_hndlr, NvmeRequest *req)
3976  {
3977      uint16_t status = NVME_SUCCESS;
3978      NvmeZoneState zs = nvme_get_zone_state(zone);
3979      bool proc_zone;
3980  
3981      switch (zs) {
3982      case NVME_ZONE_STATE_IMPLICITLY_OPEN:
3983      case NVME_ZONE_STATE_EXPLICITLY_OPEN:
3984          proc_zone = proc_mask & NVME_PROC_OPENED_ZONES;
3985          break;
3986      case NVME_ZONE_STATE_CLOSED:
3987          proc_zone = proc_mask & NVME_PROC_CLOSED_ZONES;
3988          break;
3989      case NVME_ZONE_STATE_READ_ONLY:
3990          proc_zone = proc_mask & NVME_PROC_READ_ONLY_ZONES;
3991          break;
3992      case NVME_ZONE_STATE_FULL:
3993          proc_zone = proc_mask & NVME_PROC_FULL_ZONES;
3994          break;
3995      default:
3996          proc_zone = false;
3997      }
3998  
3999      if (proc_zone) {
4000          status = op_hndlr(ns, zone, zs, req);
4001      }
4002  
4003      return status;
4004  }
4005  
4006  static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone,
4007                                  enum NvmeZoneProcessingMask proc_mask,
4008                                  op_handler_t op_hndlr, NvmeRequest *req)
4009  {
4010      NvmeZone *next;
4011      uint16_t status = NVME_SUCCESS;
4012      int i;
4013  
4014      if (!proc_mask) {
4015          status = op_hndlr(ns, zone, nvme_get_zone_state(zone), req);
4016      } else {
4017          if (proc_mask & NVME_PROC_CLOSED_ZONES) {
4018              QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) {
4019                  status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
4020                                               req);
4021                  if (status && status != NVME_NO_COMPLETE) {
4022                      goto out;
4023                  }
4024              }
4025          }
4026          if (proc_mask & NVME_PROC_OPENED_ZONES) {
4027              QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) {
4028                  status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
4029                                               req);
4030                  if (status && status != NVME_NO_COMPLETE) {
4031                      goto out;
4032                  }
4033              }
4034  
4035              QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) {
4036                  status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
4037                                               req);
4038                  if (status && status != NVME_NO_COMPLETE) {
4039                      goto out;
4040                  }
4041              }
4042          }
4043          if (proc_mask & NVME_PROC_FULL_ZONES) {
4044              QTAILQ_FOREACH_SAFE(zone, &ns->full_zones, entry, next) {
4045                  status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
4046                                               req);
4047                  if (status && status != NVME_NO_COMPLETE) {
4048                      goto out;
4049                  }
4050              }
4051          }
4052  
4053          if (proc_mask & NVME_PROC_READ_ONLY_ZONES) {
4054              for (i = 0; i < ns->num_zones; i++, zone++) {
4055                  status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
4056                                               req);
4057                  if (status && status != NVME_NO_COMPLETE) {
4058                      goto out;
4059                  }
4060              }
4061          }
4062      }
4063  
4064  out:
4065      return status;
4066  }
4067  
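      /*
       * Zone reset is fully asynchronous.  nvme_zone_reset_cb() resets the
       * state machine for the current zone and then looks for the next zone
       * to process (all non-empty zones when Select All is set), issuing a
       * write-zeroes over the zone's data; nvme_zone_reset_epilogue_cb()
       * follows up with a write-zeroes over the corresponding metadata region
       * when the format carries metadata.
       */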
4068  typedef struct NvmeZoneResetAIOCB {
4069      BlockAIOCB common;
4070      BlockAIOCB *aiocb;
4071      NvmeRequest *req;
4072      int ret;
4073  
4074      bool all;
4075      int idx;
4076      NvmeZone *zone;
4077  } NvmeZoneResetAIOCB;
4078  
4079  static void nvme_zone_reset_cancel(BlockAIOCB *aiocb)
4080  {
4081      NvmeZoneResetAIOCB *iocb = container_of(aiocb, NvmeZoneResetAIOCB, common);
4082      NvmeRequest *req = iocb->req;
4083      NvmeNamespace *ns = req->ns;
4084  
4085      iocb->idx = ns->num_zones;
4086  
4087      iocb->ret = -ECANCELED;
4088  
4089      if (iocb->aiocb) {
4090          blk_aio_cancel_async(iocb->aiocb);
4091          iocb->aiocb = NULL;
4092      }
4093  }
4094  
4095  static const AIOCBInfo nvme_zone_reset_aiocb_info = {
4096      .aiocb_size = sizeof(NvmeZoneResetAIOCB),
4097      .cancel_async = nvme_zone_reset_cancel,
4098  };
4099  
4100  static void nvme_zone_reset_cb(void *opaque, int ret);
4101  
4102  static void nvme_zone_reset_epilogue_cb(void *opaque, int ret)
4103  {
4104      NvmeZoneResetAIOCB *iocb = opaque;
4105      NvmeRequest *req = iocb->req;
4106      NvmeNamespace *ns = req->ns;
4107      int64_t moff;
4108      int count;
4109  
4110      if (ret < 0 || iocb->ret < 0 || !ns->lbaf.ms) {
4111          goto out;
4112      }
4113  
4114      moff = nvme_moff(ns, iocb->zone->d.zslba);
4115      count = nvme_m2b(ns, ns->zone_size);
4116  
4117      iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, moff, count,
4118                                          BDRV_REQ_MAY_UNMAP,
4119                                          nvme_zone_reset_cb, iocb);
4120      return;
4121  
4122  out:
4123      nvme_zone_reset_cb(iocb, ret);
4124  }
4125  
4126  static void nvme_zone_reset_cb(void *opaque, int ret)
4127  {
4128      NvmeZoneResetAIOCB *iocb = opaque;
4129      NvmeRequest *req = iocb->req;
4130      NvmeNamespace *ns = req->ns;
4131  
4132      if (iocb->ret < 0) {
4133          goto done;
4134      } else if (ret < 0) {
4135          iocb->ret = ret;
4136          goto done;
4137      }
4138  
4139      if (iocb->zone) {
4140          nvme_zrm_reset(ns, iocb->zone);
4141  
4142          if (!iocb->all) {
4143              goto done;
4144          }
4145      }
4146  
4147      while (iocb->idx < ns->num_zones) {
4148          NvmeZone *zone = &ns->zone_array[iocb->idx++];
4149  
4150          switch (nvme_get_zone_state(zone)) {
4151          case NVME_ZONE_STATE_EMPTY:
4152              if (!iocb->all) {
4153                  goto done;
4154              }
4155  
4156              continue;
4157  
4158          case NVME_ZONE_STATE_EXPLICITLY_OPEN:
4159          case NVME_ZONE_STATE_IMPLICITLY_OPEN:
4160          case NVME_ZONE_STATE_CLOSED:
4161          case NVME_ZONE_STATE_FULL:
4162              iocb->zone = zone;
4163              break;
4164  
4165          default:
4166              continue;
4167          }
4168  
4169          trace_pci_nvme_zns_zone_reset(zone->d.zslba);
4170  
4171          iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk,
4172                                              nvme_l2b(ns, zone->d.zslba),
4173                                              nvme_l2b(ns, ns->zone_size),
4174                                              BDRV_REQ_MAY_UNMAP,
4175                                              nvme_zone_reset_epilogue_cb,
4176                                              iocb);
4177          return;
4178      }
4179  
4180  done:
4181      iocb->aiocb = NULL;
4182  
4183      iocb->common.cb(iocb->common.opaque, iocb->ret);
4184      qemu_aio_unref(iocb);
4185  }
4186  
4187  static uint16_t nvme_zone_mgmt_send_zrwa_flush(NvmeCtrl *n, NvmeZone *zone,
4188                                                 uint64_t elba, NvmeRequest *req)
4189  {
4190      NvmeNamespace *ns = req->ns;
4191      uint16_t ozcs = le16_to_cpu(ns->id_ns_zoned->ozcs);
4192      uint64_t wp = zone->d.wp;
4193      uint32_t nlb = elba - wp + 1;
4194      uint16_t status;
4195  
4196  
4197      if (!(ozcs & NVME_ID_NS_ZONED_OZCS_ZRWASUP)) {
4198          return NVME_INVALID_ZONE_OP | NVME_DNR;
4199      }
4200  
4201      if (!(zone->d.za & NVME_ZA_ZRWA_VALID)) {
4202          return NVME_INVALID_FIELD | NVME_DNR;
4203      }
4204  
4205      if (elba < wp || elba > wp + ns->zns.zrwas) {
4206          return NVME_ZONE_BOUNDARY_ERROR | NVME_DNR;
4207      }
4208  
4209      if (nlb % ns->zns.zrwafg) {
4210          return NVME_INVALID_FIELD | NVME_DNR;
4211      }
4212  
4213      status = nvme_zrm_auto(n, ns, zone);
4214      if (status) {
4215          return status;
4216      }
4217  
4218      zone->w_ptr += nlb;
4219  
4220      nvme_advance_zone_wp(ns, zone, nlb);
4221  
4222      return NVME_SUCCESS;
4223  }
4224  
4225  static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
4226  {
4227      NvmeZoneSendCmd *cmd = (NvmeZoneSendCmd *)&req->cmd;
4228      NvmeNamespace *ns = req->ns;
4229      NvmeZone *zone;
4230      NvmeZoneResetAIOCB *iocb;
4231      uint8_t *zd_ext;
4232      uint64_t slba = 0;
4233      uint32_t zone_idx = 0;
4234      uint16_t status;
4235      uint8_t action = cmd->zsa;
4236      bool all;
4237      enum NvmeZoneProcessingMask proc_mask = NVME_PROC_CURRENT_ZONE;
4238  
4239      all = cmd->zsflags & NVME_ZSFLAG_SELECT_ALL;
4240  
4241      req->status = NVME_SUCCESS;
4242  
4243      if (!all) {
4244          status = nvme_get_mgmt_zone_slba_idx(ns, &req->cmd, &slba, &zone_idx);
4245          if (status) {
4246              return status;
4247          }
4248      }
4249  
4250      zone = &ns->zone_array[zone_idx];
4251      if (slba != zone->d.zslba && action != NVME_ZONE_ACTION_ZRWA_FLUSH) {
4252          trace_pci_nvme_err_unaligned_zone_cmd(action, slba, zone->d.zslba);
4253          return NVME_INVALID_FIELD | NVME_DNR;
4254      }
4255  
4256      switch (action) {
4257  
4258      case NVME_ZONE_ACTION_OPEN:
4259          if (all) {
4260              proc_mask = NVME_PROC_CLOSED_ZONES;
4261          }
4262          trace_pci_nvme_open_zone(slba, zone_idx, all);
4263          status = nvme_do_zone_op(ns, zone, proc_mask, nvme_open_zone, req);
4264          break;
4265  
4266      case NVME_ZONE_ACTION_CLOSE:
4267          if (all) {
4268              proc_mask = NVME_PROC_OPENED_ZONES;
4269          }
4270          trace_pci_nvme_close_zone(slba, zone_idx, all);
4271          status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone, req);
4272          break;
4273  
4274      case NVME_ZONE_ACTION_FINISH:
4275          if (all) {
4276              proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES;
4277          }
4278          trace_pci_nvme_finish_zone(slba, zone_idx, all);
4279          status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone, req);
4280          break;
4281  
4282      case NVME_ZONE_ACTION_RESET:
4283          trace_pci_nvme_reset_zone(slba, zone_idx, all);
4284  
4285          iocb = blk_aio_get(&nvme_zone_reset_aiocb_info, ns->blkconf.blk,
4286                             nvme_misc_cb, req);
4287  
4288          iocb->req = req;
4289          iocb->ret = 0;
4290          iocb->all = all;
4291          iocb->idx = zone_idx;
4292          iocb->zone = NULL;
4293  
4294          req->aiocb = &iocb->common;
4295          nvme_zone_reset_cb(iocb, 0);
4296  
4297          return NVME_NO_COMPLETE;
4298  
4299      case NVME_ZONE_ACTION_OFFLINE:
4300          if (all) {
4301              proc_mask = NVME_PROC_READ_ONLY_ZONES;
4302          }
4303          trace_pci_nvme_offline_zone(slba, zone_idx, all);
4304          status = nvme_do_zone_op(ns, zone, proc_mask, nvme_offline_zone, req);
4305          break;
4306  
4307      case NVME_ZONE_ACTION_SET_ZD_EXT:
4308          trace_pci_nvme_set_descriptor_extension(slba, zone_idx);
4309          if (all || !ns->params.zd_extension_size) {
4310              return NVME_INVALID_FIELD | NVME_DNR;
4311          }
4312          zd_ext = nvme_get_zd_extension(ns, zone_idx);
4313          status = nvme_h2c(n, zd_ext, ns->params.zd_extension_size, req);
4314          if (status) {
4315              trace_pci_nvme_err_zd_extension_map_error(zone_idx);
4316              return status;
4317          }
4318  
4319          status = nvme_set_zd_ext(ns, zone);
4320          if (status == NVME_SUCCESS) {
4321              trace_pci_nvme_zd_extension_set(zone_idx);
4322              return status;
4323          }
4324          break;
4325  
4326      case NVME_ZONE_ACTION_ZRWA_FLUSH:
4327          if (all) {
4328              return NVME_INVALID_FIELD | NVME_DNR;
4329          }
4330  
4331          return nvme_zone_mgmt_send_zrwa_flush(n, zone, slba, req);
4332  
4333      default:
4334          trace_pci_nvme_err_invalid_mgmt_action(action);
4335          status = NVME_INVALID_FIELD;
4336      }
4337  
4338      if (status == NVME_ZONE_INVAL_TRANSITION) {
4339          trace_pci_nvme_err_invalid_zone_state_transition(action, slba,
4340                                                           zone->d.za);
4341      }
4342      if (status) {
4343          status |= NVME_DNR;
4344      }
4345  
4346      return status;
4347  }
4348  
4349  static bool nvme_zone_matches_filter(uint32_t zafs, NvmeZone *zl)
4350  {
4351      NvmeZoneState zs = nvme_get_zone_state(zl);
4352  
4353      switch (zafs) {
4354      case NVME_ZONE_REPORT_ALL:
4355          return true;
4356      case NVME_ZONE_REPORT_EMPTY:
4357          return zs == NVME_ZONE_STATE_EMPTY;
4358      case NVME_ZONE_REPORT_IMPLICITLY_OPEN:
4359          return zs == NVME_ZONE_STATE_IMPLICITLY_OPEN;
4360      case NVME_ZONE_REPORT_EXPLICITLY_OPEN:
4361          return zs == NVME_ZONE_STATE_EXPLICITLY_OPEN;
4362      case NVME_ZONE_REPORT_CLOSED:
4363          return zs == NVME_ZONE_STATE_CLOSED;
4364      case NVME_ZONE_REPORT_FULL:
4365          return zs == NVME_ZONE_STATE_FULL;
4366      case NVME_ZONE_REPORT_READ_ONLY:
4367          return zs == NVME_ZONE_STATE_READ_ONLY;
4368      case NVME_ZONE_REPORT_OFFLINE:
4369          return zs == NVME_ZONE_STATE_OFFLINE;
4370      default:
4371          return false;
4372      }
4373  }
4374  
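      /*
       * Zone Management Receive (Report Zones).  A first pass counts the
       * zones matching the requested state filter - capped at what fits in
       * the host buffer when the Partial Report bit is set - and a second
       * pass fills in the zone descriptors (plus zone descriptor extensions
       * for the extended report) until the buffer is exhausted.
       */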
4375  static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
4376  {
4377      NvmeCmd *cmd = &req->cmd;
4378      NvmeNamespace *ns = req->ns;
4379      /* cdw12 is zero-based number of dwords to return. Convert to bytes */
4380      uint32_t data_size = (le32_to_cpu(cmd->cdw12) + 1) << 2;
4381      uint32_t dw13 = le32_to_cpu(cmd->cdw13);
4382      uint32_t zone_idx, zra, zrasf, partial;
4383      uint64_t max_zones, nr_zones = 0;
4384      uint16_t status;
4385      uint64_t slba;
4386      NvmeZoneDescr *z;
4387      NvmeZone *zone;
4388      NvmeZoneReportHeader *header;
4389      void *buf, *buf_p;
4390      size_t zone_entry_sz;
4391      int i;
4392  
4393      req->status = NVME_SUCCESS;
4394  
4395      status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
4396      if (status) {
4397          return status;
4398      }
4399  
4400      zra = dw13 & 0xff;
4401      if (zra != NVME_ZONE_REPORT && zra != NVME_ZONE_REPORT_EXTENDED) {
4402          return NVME_INVALID_FIELD | NVME_DNR;
4403      }
4404      if (zra == NVME_ZONE_REPORT_EXTENDED && !ns->params.zd_extension_size) {
4405          return NVME_INVALID_FIELD | NVME_DNR;
4406      }
4407  
4408      zrasf = (dw13 >> 8) & 0xff;
4409      if (zrasf > NVME_ZONE_REPORT_OFFLINE) {
4410          return NVME_INVALID_FIELD | NVME_DNR;
4411      }
4412  
4413      if (data_size < sizeof(NvmeZoneReportHeader)) {
4414          return NVME_INVALID_FIELD | NVME_DNR;
4415      }
4416  
4417      status = nvme_check_mdts(n, data_size);
4418      if (status) {
4419          return status;
4420      }
4421  
4422      partial = (dw13 >> 16) & 0x01;
4423  
4424      zone_entry_sz = sizeof(NvmeZoneDescr);
4425      if (zra == NVME_ZONE_REPORT_EXTENDED) {
4426          zone_entry_sz += ns->params.zd_extension_size;
4427      }
4428  
4429      max_zones = (data_size - sizeof(NvmeZoneReportHeader)) / zone_entry_sz;
4430      buf = g_malloc0(data_size);
4431  
4432      zone = &ns->zone_array[zone_idx];
4433      for (i = zone_idx; i < ns->num_zones; i++) {
4434          if (partial && nr_zones >= max_zones) {
4435              break;
4436          }
4437          if (nvme_zone_matches_filter(zrasf, zone++)) {
4438              nr_zones++;
4439          }
4440      }
4441      header = buf;
4442      header->nr_zones = cpu_to_le64(nr_zones);
4443  
4444      buf_p = buf + sizeof(NvmeZoneReportHeader);
4445      for (; zone_idx < ns->num_zones && max_zones > 0; zone_idx++) {
4446          zone = &ns->zone_array[zone_idx];
4447          if (nvme_zone_matches_filter(zrasf, zone)) {
4448              z = buf_p;
4449              buf_p += sizeof(NvmeZoneDescr);
4450  
4451              z->zt = zone->d.zt;
4452              z->zs = zone->d.zs;
4453              z->zcap = cpu_to_le64(zone->d.zcap);
4454              z->zslba = cpu_to_le64(zone->d.zslba);
4455              z->za = zone->d.za;
4456  
4457              if (nvme_wp_is_valid(zone)) {
4458                  z->wp = cpu_to_le64(zone->d.wp);
4459              } else {
4460                  z->wp = cpu_to_le64(~0ULL);
4461              }
4462  
4463              if (zra == NVME_ZONE_REPORT_EXTENDED) {
4464                  if (zone->d.za & NVME_ZA_ZD_EXT_VALID) {
4465                      memcpy(buf_p, nvme_get_zd_extension(ns, zone_idx),
4466                             ns->params.zd_extension_size);
4467                  }
4468                  buf_p += ns->params.zd_extension_size;
4469              }
4470  
4471              max_zones--;
4472          }
4473      }
4474  
4475      status = nvme_c2h(n, (uint8_t *)buf, data_size, req);
4476  
4477      g_free(buf);
4478  
4479      return status;
4480  }
4481  
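      /*
       * I/O Management Receive - Reclaim Unit Handle Status (FDP).  One
       * NvmeRuhStatusDescr is produced for every (placement handle, reclaim
       * group) pair of the namespace, carrying the placement identifier and
       * the reclaim unit's remaining writable capacity (ruamw); the transfer
       * is truncated to the length requested by the host.
       */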
4482  static uint16_t nvme_io_mgmt_recv_ruhs(NvmeCtrl *n, NvmeRequest *req,
4483                                         size_t len)
4484  {
4485      NvmeNamespace *ns = req->ns;
4486      NvmeEnduranceGroup *endgrp;
4487      NvmeRuhStatus *hdr;
4488      NvmeRuhStatusDescr *ruhsd;
4489      unsigned int nruhsd;
4490      uint16_t rg, ph, *ruhid;
4491      size_t trans_len;
4492      g_autofree uint8_t *buf = NULL;
4493  
4494      if (!n->subsys) {
4495          return NVME_INVALID_FIELD | NVME_DNR;
4496      }
4497  
4498      if (ns->params.nsid == 0 || ns->params.nsid == 0xffffffff) {
4499          return NVME_INVALID_NSID | NVME_DNR;
4500      }
4501  
4502      if (!n->subsys->endgrp.fdp.enabled) {
4503          return NVME_FDP_DISABLED | NVME_DNR;
4504      }
4505  
4506      endgrp = ns->endgrp;
4507  
4508      nruhsd = ns->fdp.nphs * endgrp->fdp.nrg;
4509      trans_len = sizeof(NvmeRuhStatus) + nruhsd * sizeof(NvmeRuhStatusDescr);
4510      buf = g_malloc0(trans_len);
4511  
4512      trans_len = MIN(trans_len, len);
4513  
4514      hdr = (NvmeRuhStatus *)buf;
4515      ruhsd = (NvmeRuhStatusDescr *)(buf + sizeof(NvmeRuhStatus));
4516  
4517      hdr->nruhsd = cpu_to_le16(nruhsd);
4518  
4519      ruhid = ns->fdp.phs;
4520  
4521      for (ph = 0; ph < ns->fdp.nphs; ph++, ruhid++) {
4522          NvmeRuHandle *ruh = &endgrp->fdp.ruhs[*ruhid];
4523  
4524          for (rg = 0; rg < endgrp->fdp.nrg; rg++, ruhsd++) {
4525              uint16_t pid = nvme_make_pid(ns, rg, ph);
4526  
4527              ruhsd->pid = cpu_to_le16(pid);
4528              ruhsd->ruhid = *ruhid;
4529              ruhsd->earutr = 0;
4530              ruhsd->ruamw = cpu_to_le64(ruh->rus[rg].ruamw);
4531          }
4532      }
4533  
4534      return nvme_c2h(n, buf, trans_len, req);
4535  }
4536  
4537  static uint16_t nvme_io_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
4538  {
4539      NvmeCmd *cmd = &req->cmd;
4540      uint32_t cdw10 = le32_to_cpu(cmd->cdw10);
4541      uint32_t numd = le32_to_cpu(cmd->cdw11);
4542      uint8_t mo = (cdw10 & 0xff);
4543      size_t len = (numd + 1) << 2;
4544  
4545      switch (mo) {
4546      case NVME_IOMR_MO_NOP:
4547          return 0;
4548      case NVME_IOMR_MO_RUH_STATUS:
4549          return nvme_io_mgmt_recv_ruhs(n, req, len);
4550      default:
4551          return NVME_INVALID_FIELD | NVME_DNR;
4552      };
4553  }
4554  
4555  static uint16_t nvme_io_mgmt_send_ruh_update(NvmeCtrl *n, NvmeRequest *req)
4556  {
4557      NvmeCmd *cmd = &req->cmd;
4558      NvmeNamespace *ns = req->ns;
4559      uint32_t cdw10 = le32_to_cpu(cmd->cdw10);
4560      uint16_t ret = NVME_SUCCESS;
4561      uint32_t npid = (cdw10 >> 16) + 1;
4562      unsigned int i = 0;
4563      g_autofree uint16_t *pids = NULL;
4564      uint32_t maxnpid;
4565  
4566      if (!ns->endgrp || !ns->endgrp->fdp.enabled) {
4567          return NVME_FDP_DISABLED | NVME_DNR;
4568      }
4569  
4570      maxnpid = n->subsys->endgrp.fdp.nrg * n->subsys->endgrp.fdp.nruh;
4571  
4572      if (unlikely(npid >= MIN(NVME_FDP_MAXPIDS, maxnpid))) {
4573          return NVME_INVALID_FIELD | NVME_DNR;
4574      }
4575  
4576      pids = g_new(uint16_t, npid);
4577  
4578      ret = nvme_h2c(n, pids, npid * sizeof(uint16_t), req);
4579      if (ret) {
4580          return ret;
4581      }
4582  
4583      for (; i < npid; i++) {
4584          if (!nvme_update_ruh(n, ns, pids[i])) {
4585              return NVME_INVALID_FIELD | NVME_DNR;
4586          }
4587      }
4588  
4589      return ret;
4590  }
4591  
4592  static uint16_t nvme_io_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
4593  {
4594      NvmeCmd *cmd = &req->cmd;
4595      uint32_t cdw10 = le32_to_cpu(cmd->cdw10);
4596      uint8_t mo = (cdw10 & 0xff);
4597  
4598      switch (mo) {
4599      case NVME_IOMS_MO_NOP:
4600          return 0;
4601      case NVME_IOMS_MO_RUH_UPDATE:
4602          return nvme_io_mgmt_send_ruh_update(n, req);
4603      default:
4604          return NVME_INVALID_FIELD | NVME_DNR;
4605      };
4606  }
4607  
4608  static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
4609  {
4610      NvmeNamespace *ns;
4611      uint32_t nsid = le32_to_cpu(req->cmd.nsid);
4612  
4613      trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req),
4614                            req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode));
4615  
4616      /*
4617       * In the base NVM command set, Flush may apply to all namespaces
4618       * (indicated by NSID being set to FFFFFFFFh). But if that feature is
4619       * used along with TP 4056 (Namespace Types), the semantics become murky.
4620       *
4621       * If NSID is indeed set to FFFFFFFFh, we simply cannot associate the
4622       * opcode with a specific command since we cannot determine a unique
4623       * I/O command set. Opcode 0h could mean something entirely unrelated
4624       * to flushing, with completely different semantics, in some other
4625       * command set. Does an NSID of FFFFFFFFh then mean "for all
4626       * namespaces, apply whatever command set specific command uses the
4627       * 0h opcode"? Or does it mean "for all namespaces, apply whatever
4628       * command uses the 0h opcode if, and only if, it allows NSID to be
4629       * FFFFFFFFh"?
4630       *
4631       * Anyway (and luckily), for now, we do not care about this since the
4632       * device only supports namespace types that include the NVM Flush
4633       * command (NVM and Zoned), so we always do an NVM Flush.
4634       */
4635  
4636      if (req->cmd.opcode == NVME_CMD_FLUSH) {
4637          return nvme_flush(n, req);
4638      }
4639  
4640      if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4641          return NVME_INVALID_NSID | NVME_DNR;
4642      }
4643  
4644      ns = nvme_ns(n, nsid);
4645      if (unlikely(!ns)) {
4646          return NVME_INVALID_FIELD | NVME_DNR;
4647      }
4648  
4649      if (!(ns->iocs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
4650          trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
4651          return NVME_INVALID_OPCODE | NVME_DNR;
4652      }
4653  
4654      if (ns->status) {
4655          return ns->status;
4656      }
4657  
4658      if (NVME_CMD_FLAGS_FUSE(req->cmd.flags)) {
4659          return NVME_INVALID_FIELD;
4660      }
4661  
4662      req->ns = ns;
4663  
4664      switch (req->cmd.opcode) {
4665      case NVME_CMD_WRITE_ZEROES:
4666          return nvme_write_zeroes(n, req);
4667      case NVME_CMD_ZONE_APPEND:
4668          return nvme_zone_append(n, req);
4669      case NVME_CMD_WRITE:
4670          return nvme_write(n, req);
4671      case NVME_CMD_READ:
4672          return nvme_read(n, req);
4673      case NVME_CMD_COMPARE:
4674          return nvme_compare(n, req);
4675      case NVME_CMD_DSM:
4676          return nvme_dsm(n, req);
4677      case NVME_CMD_VERIFY:
4678          return nvme_verify(n, req);
4679      case NVME_CMD_COPY:
4680          return nvme_copy(n, req);
4681      case NVME_CMD_ZONE_MGMT_SEND:
4682          return nvme_zone_mgmt_send(n, req);
4683      case NVME_CMD_ZONE_MGMT_RECV:
4684          return nvme_zone_mgmt_recv(n, req);
4685      case NVME_CMD_IO_MGMT_RECV:
4686          return nvme_io_mgmt_recv(n, req);
4687      case NVME_CMD_IO_MGMT_SEND:
4688          return nvme_io_mgmt_send(n, req);
4689      default:
4690          g_assert_not_reached();
4691      }
4692  
4693      return NVME_INVALID_OPCODE | NVME_DNR;
4694  }
4695  
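      /*
       * ioeventfd support for the queue doorbells.  The notifier for a queue
       * is registered on its doorbell register in BAR0 at offset
       * 1000h + (qid << 3) for submission queue tails and
       * 1000h + (qid << 3) + 4 for completion queue heads (e.g. CQ 3 ends up
       * at 101Ch), so that guest doorbell writes are delivered through an
       * event notifier rather than the regular MMIO write path.
       */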
4696  static void nvme_cq_notifier(EventNotifier *e)
4697  {
4698      NvmeCQueue *cq = container_of(e, NvmeCQueue, notifier);
4699      NvmeCtrl *n = cq->ctrl;
4700  
4701      if (!event_notifier_test_and_clear(e)) {
4702          return;
4703      }
4704  
4705      nvme_update_cq_head(cq);
4706  
4707      if (cq->tail == cq->head) {
4708          if (cq->irq_enabled) {
4709              n->cq_pending--;
4710          }
4711  
4712          nvme_irq_deassert(n, cq);
4713      }
4714  
4715      qemu_bh_schedule(cq->bh);
4716  }
4717  
4718  static int nvme_init_cq_ioeventfd(NvmeCQueue *cq)
4719  {
4720      NvmeCtrl *n = cq->ctrl;
4721      uint16_t offset = (cq->cqid << 3) + (1 << 2);
4722      int ret;
4723  
4724      ret = event_notifier_init(&cq->notifier, 0);
4725      if (ret < 0) {
4726          return ret;
4727      }
4728  
4729      event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
4730      memory_region_add_eventfd(&n->iomem,
4731                                0x1000 + offset, 4, false, 0, &cq->notifier);
4732  
4733      return 0;
4734  }
4735  
4736  static void nvme_sq_notifier(EventNotifier *e)
4737  {
4738      NvmeSQueue *sq = container_of(e, NvmeSQueue, notifier);
4739  
4740      if (!event_notifier_test_and_clear(e)) {
4741          return;
4742      }
4743  
4744      nvme_process_sq(sq);
4745  }
4746  
4747  static int nvme_init_sq_ioeventfd(NvmeSQueue *sq)
4748  {
4749      NvmeCtrl *n = sq->ctrl;
4750      uint16_t offset = sq->sqid << 3;
4751      int ret;
4752  
4753      ret = event_notifier_init(&sq->notifier, 0);
4754      if (ret < 0) {
4755          return ret;
4756      }
4757  
4758      event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
4759      memory_region_add_eventfd(&n->iomem,
4760                                0x1000 + offset, 4, false, 0, &sq->notifier);
4761  
4762      return 0;
4763  }
4764  
4765  static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
4766  {
4767      uint16_t offset = sq->sqid << 3;
4768  
4769      n->sq[sq->sqid] = NULL;
4770      qemu_bh_delete(sq->bh);
4771      if (sq->ioeventfd_enabled) {
4772          memory_region_del_eventfd(&n->iomem,
4773                                    0x1000 + offset, 4, false, 0, &sq->notifier);
4774          event_notifier_set_handler(&sq->notifier, NULL);
4775          event_notifier_cleanup(&sq->notifier);
4776      }
4777      g_free(sq->io_req);
4778      if (sq->sqid) {
4779          g_free(sq);
4780      }
4781  }
4782  
4783  static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req)
4784  {
4785      NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
4786      NvmeRequest *r, *next;
4787      NvmeSQueue *sq;
4788      NvmeCQueue *cq;
4789      uint16_t qid = le16_to_cpu(c->qid);
4790  
4791      if (unlikely(!qid || nvme_check_sqid(n, qid))) {
4792          trace_pci_nvme_err_invalid_del_sq(qid);
4793          return NVME_INVALID_QID | NVME_DNR;
4794      }
4795  
4796      trace_pci_nvme_del_sq(qid);
4797  
4798      sq = n->sq[qid];
4799      while (!QTAILQ_EMPTY(&sq->out_req_list)) {
4800          r = QTAILQ_FIRST(&sq->out_req_list);
4801          assert(r->aiocb);
4802          blk_aio_cancel(r->aiocb);
4803      }
4804  
4805      assert(QTAILQ_EMPTY(&sq->out_req_list));
4806  
4807      if (!nvme_check_cqid(n, sq->cqid)) {
4808          cq = n->cq[sq->cqid];
4809          QTAILQ_REMOVE(&cq->sq_list, sq, entry);
4810  
4811          nvme_post_cqes(cq);
4812          QTAILQ_FOREACH_SAFE(r, &cq->req_list, entry, next) {
4813              if (r->sq == sq) {
4814                  QTAILQ_REMOVE(&cq->req_list, r, entry);
4815                  QTAILQ_INSERT_TAIL(&sq->req_list, r, entry);
4816              }
4817          }
4818      }
4819  
4820      nvme_free_sq(sq, n);
4821      return NVME_SUCCESS;
4822  }
4823  
4824  static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
4825                           uint16_t sqid, uint16_t cqid, uint16_t size)
4826  {
4827      int i;
4828      NvmeCQueue *cq;
4829  
4830      sq->ctrl = n;
4831      sq->dma_addr = dma_addr;
4832      sq->sqid = sqid;
4833      sq->size = size;
4834      sq->cqid = cqid;
4835      sq->head = sq->tail = 0;
4836      sq->io_req = g_new0(NvmeRequest, sq->size);
4837  
4838      QTAILQ_INIT(&sq->req_list);
4839      QTAILQ_INIT(&sq->out_req_list);
4840      for (i = 0; i < sq->size; i++) {
4841          sq->io_req[i].sq = sq;
4842          QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
4843      }
4844  
4845      sq->bh = qemu_bh_new_guarded(nvme_process_sq, sq,
4846                                   &DEVICE(sq->ctrl)->mem_reentrancy_guard);
4847  
4848      if (n->dbbuf_enabled) {
4849          sq->db_addr = n->dbbuf_dbs + (sqid << 3);
4850          sq->ei_addr = n->dbbuf_eis + (sqid << 3);
4851  
4852          if (n->params.ioeventfd && sq->sqid != 0) {
4853              if (!nvme_init_sq_ioeventfd(sq)) {
4854                  sq->ioeventfd_enabled = true;
4855              }
4856          }
4857      }
4858  
4859      assert(n->cq[cqid]);
4860      cq = n->cq[cqid];
4861      QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
4862      n->sq[sqid] = sq;
4863  }
4864  
4865  static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeRequest *req)
4866  {
4867      NvmeSQueue *sq;
4868      NvmeCreateSq *c = (NvmeCreateSq *)&req->cmd;
4869  
4870      uint16_t cqid = le16_to_cpu(c->cqid);
4871      uint16_t sqid = le16_to_cpu(c->sqid);
4872      uint16_t qsize = le16_to_cpu(c->qsize);
4873      uint16_t qflags = le16_to_cpu(c->sq_flags);
4874      uint64_t prp1 = le64_to_cpu(c->prp1);
4875  
4876      trace_pci_nvme_create_sq(prp1, sqid, cqid, qsize, qflags);
4877  
4878      if (unlikely(!cqid || nvme_check_cqid(n, cqid))) {
4879          trace_pci_nvme_err_invalid_create_sq_cqid(cqid);
4880          return NVME_INVALID_CQID | NVME_DNR;
4881      }
4882      if (unlikely(!sqid || sqid > n->conf_ioqpairs || n->sq[sqid] != NULL)) {
4883          trace_pci_nvme_err_invalid_create_sq_sqid(sqid);
4884          return NVME_INVALID_QID | NVME_DNR;
4885      }
4886      if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) {
4887          trace_pci_nvme_err_invalid_create_sq_size(qsize);
4888          return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
4889      }
4890      if (unlikely(prp1 & (n->page_size - 1))) {
4891          trace_pci_nvme_err_invalid_create_sq_addr(prp1);
4892          return NVME_INVALID_PRP_OFFSET | NVME_DNR;
4893      }
4894      if (unlikely(!(NVME_SQ_FLAGS_PC(qflags)))) {
4895          trace_pci_nvme_err_invalid_create_sq_qflags(NVME_SQ_FLAGS_PC(qflags));
4896          return NVME_INVALID_FIELD | NVME_DNR;
4897      }
4898      sq = g_malloc0(sizeof(*sq));
4899      nvme_init_sq(sq, n, prp1, sqid, cqid, qsize + 1);
4900      return NVME_SUCCESS;
4901  }
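/*
 * Queue size note: QSIZE in Create I/O Submission Queue is a 0's based
 * value, which is why the queue is initialized with `qsize + 1` entries and
 * why it is compared directly against CAP.MQES (also 0's based) above.
 */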
4902  
4903  struct nvme_stats {
4904      uint64_t units_read;
4905      uint64_t units_written;
4906      uint64_t read_commands;
4907      uint64_t write_commands;
4908  };
4909  
4910  static void nvme_set_blk_stats(NvmeNamespace *ns, struct nvme_stats *stats)
4911  {
4912      BlockAcctStats *s = blk_get_stats(ns->blkconf.blk);
4913  
4914      stats->units_read += s->nr_bytes[BLOCK_ACCT_READ];
4915      stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE];
4916      stats->read_commands += s->nr_ops[BLOCK_ACCT_READ];
4917      stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE];
4918  }
4919  
4920  static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
4921                                  uint64_t off, NvmeRequest *req)
4922  {
4923      uint32_t nsid = le32_to_cpu(req->cmd.nsid);
4924      struct nvme_stats stats = { 0 };
4925      NvmeSmartLog smart = { 0 };
4926      uint32_t trans_len;
4927      NvmeNamespace *ns;
4928      time_t current_ms;
4929      uint64_t u_read, u_written;
4930  
4931      if (off >= sizeof(smart)) {
4932          return NVME_INVALID_FIELD | NVME_DNR;
4933      }
4934  
4935      if (nsid != 0xffffffff) {
4936          ns = nvme_ns(n, nsid);
4937          if (!ns) {
4938              return NVME_INVALID_NSID | NVME_DNR;
4939          }
4940          nvme_set_blk_stats(ns, &stats);
4941      } else {
4942          int i;
4943  
4944          for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4945              ns = nvme_ns(n, i);
4946              if (!ns) {
4947                  continue;
4948              }
4949              nvme_set_blk_stats(ns, &stats);
4950          }
4951      }
4952  
4953      trans_len = MIN(sizeof(smart) - off, buf_len);
4954      smart.critical_warning = n->smart_critical_warning;
4955  
4956      u_read = DIV_ROUND_UP(stats.units_read >> BDRV_SECTOR_BITS, 1000);
4957      u_written = DIV_ROUND_UP(stats.units_written >> BDRV_SECTOR_BITS, 1000);
4958  
4959      smart.data_units_read[0] = cpu_to_le64(u_read);
4960      smart.data_units_written[0] = cpu_to_le64(u_written);
4961      smart.host_read_commands[0] = cpu_to_le64(stats.read_commands);
4962      smart.host_write_commands[0] = cpu_to_le64(stats.write_commands);
4963  
4964      smart.temperature = cpu_to_le16(n->temperature);
4965  
4966      if ((n->temperature >= n->features.temp_thresh_hi) ||
4967          (n->temperature <= n->features.temp_thresh_low)) {
4968          smart.critical_warning |= NVME_SMART_TEMPERATURE;
4969      }
4970  
4971      current_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
4972      smart.power_on_hours[0] =
4973          cpu_to_le64((((current_ms - n->starttime_ms) / 1000) / 60) / 60);
4974  
4975      if (!rae) {
4976          nvme_clear_events(n, NVME_AER_TYPE_SMART);
4977      }
4978  
4979      return nvme_c2h(n, (uint8_t *) &smart + off, trans_len, req);
4980  }
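/*
 * SMART data units note: the Data Units Read/Written fields count units of
 * 1000 512-byte sectors, rounded up, so the byte counters are converted to
 * sectors (>> BDRV_SECTOR_BITS) and then divided by 1000 with DIV_ROUND_UP.
 * For example, 1 MiB of reads (2048 sectors) is reported as 3 data units.
 */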
4981  
4982  static uint16_t nvme_endgrp_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
4983                                   uint64_t off, NvmeRequest *req)
4984  {
4985      uint32_t dw11 = le32_to_cpu(req->cmd.cdw11);
4986      uint16_t endgrpid = (dw11 >> 16) & 0xffff;
4987      struct nvme_stats stats = {};
4988      NvmeEndGrpLog info = {};
4989      int i;
4990  
4991      if (!n->subsys || endgrpid != 0x1) {
4992          return NVME_INVALID_FIELD | NVME_DNR;
4993      }
4994  
4995      if (off >= sizeof(info)) {
4996          return NVME_INVALID_FIELD | NVME_DNR;
4997      }
4998  
4999      for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5000          NvmeNamespace *ns = nvme_subsys_ns(n->subsys, i);
5001          if (!ns) {
5002              continue;
5003          }
5004  
5005          nvme_set_blk_stats(ns, &stats);
5006      }
5007  
5008      info.data_units_read[0] =
5009          cpu_to_le64(DIV_ROUND_UP(stats.units_read / 1000000000, 1000000000));
5010      info.data_units_written[0] =
5011          cpu_to_le64(DIV_ROUND_UP(stats.units_written / 1000000000, 1000000000));
5012      info.media_units_written[0] =
5013          cpu_to_le64(DIV_ROUND_UP(stats.units_written / 1000000000, 1000000000));
5014  
5015      info.host_read_commands[0] = cpu_to_le64(stats.read_commands);
5016      info.host_write_commands[0] = cpu_to_le64(stats.write_commands);
5017  
5018      buf_len = MIN(sizeof(info) - off, buf_len);
5019  
5020      return nvme_c2h(n, (uint8_t *)&info + off, buf_len, req);
5021  }
5022  
5023  
5024  static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off,
5025                                   NvmeRequest *req)
5026  {
5027      uint32_t trans_len;
5028      NvmeFwSlotInfoLog fw_log = {
5029          .afi = 0x1,
5030      };
5031  
5032      if (off >= sizeof(fw_log)) {
5033          return NVME_INVALID_FIELD | NVME_DNR;
5034      }
5035  
5036      strpadcpy((char *)&fw_log.frs1, sizeof(fw_log.frs1), "1.0", ' ');
5037      trans_len = MIN(sizeof(fw_log) - off, buf_len);
5038  
5039      return nvme_c2h(n, (uint8_t *) &fw_log + off, trans_len, req);
5040  }
5041  
5042  static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
5043                                  uint64_t off, NvmeRequest *req)
5044  {
5045      uint32_t trans_len;
5046      NvmeErrorLog errlog;
5047  
5048      if (off >= sizeof(errlog)) {
5049          return NVME_INVALID_FIELD | NVME_DNR;
5050      }
5051  
5052      if (!rae) {
5053          nvme_clear_events(n, NVME_AER_TYPE_ERROR);
5054      }
5055  
5056      memset(&errlog, 0x0, sizeof(errlog));
5057      trans_len = MIN(sizeof(errlog) - off, buf_len);
5058  
5059      return nvme_c2h(n, (uint8_t *)&errlog, trans_len, req);
5060  }
5061  
5062  static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
5063                                      uint64_t off, NvmeRequest *req)
5064  {
5065      uint32_t nslist[1024];
5066      uint32_t trans_len;
5067      int i = 0;
5068      uint32_t nsid;
5069  
5070      if (off >= sizeof(nslist)) {
5071          trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(nslist));
5072          return NVME_INVALID_FIELD | NVME_DNR;
5073      }
5074  
5075      memset(nslist, 0x0, sizeof(nslist));
5076      trans_len = MIN(sizeof(nslist) - off, buf_len);
5077  
5078      while ((nsid = find_first_bit(n->changed_nsids, NVME_CHANGED_NSID_SIZE)) !=
5079              NVME_CHANGED_NSID_SIZE) {
5080          /*
5081           * If more than 1024 namespaces, the first entry in the log page should
5082           * be set to FFFFFFFFh and the others to 0 as spec.
5083           */
5084          if (i == ARRAY_SIZE(nslist)) {
5085              memset(nslist, 0x0, sizeof(nslist));
5086              nslist[0] = 0xffffffff;
5087              break;
5088          }
5089  
5090          nslist[i++] = nsid;
5091          clear_bit(nsid, n->changed_nsids);
5092      }
5093  
5094      /*
5095       * Clear all remaining changed-namespace bits if we broke out of the loop
5096       * above because more than 1024 namespaces changed.
5097       */
5098      if (nslist[0] == 0xffffffff) {
5099          bitmap_zero(n->changed_nsids, NVME_CHANGED_NSID_SIZE);
5100      }
5101  
5102      if (!rae) {
5103          nvme_clear_events(n, NVME_AER_TYPE_NOTICE);
5104      }
5105  
5106      return nvme_c2h(n, ((uint8_t *)nslist) + off, trans_len, req);
5107  }
5108  
5109  static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len,
5110                                   uint64_t off, NvmeRequest *req)
5111  {
5112      NvmeEffectsLog log = {};
5113      const uint32_t *src_iocs = NULL;
5114      uint32_t trans_len;
5115  
5116      if (off >= sizeof(log)) {
5117          trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(log));
5118          return NVME_INVALID_FIELD | NVME_DNR;
5119      }
5120  
5121      switch (NVME_CC_CSS(ldl_le_p(&n->bar.cc))) {
5122      case NVME_CC_CSS_NVM:
5123          src_iocs = nvme_cse_iocs_nvm;
5124          /* fall through */
5125      case NVME_CC_CSS_ADMIN_ONLY:
5126          break;
5127      case NVME_CC_CSS_CSI:
5128          switch (csi) {
5129          case NVME_CSI_NVM:
5130              src_iocs = nvme_cse_iocs_nvm;
5131              break;
5132          case NVME_CSI_ZONED:
5133              src_iocs = nvme_cse_iocs_zoned;
5134              break;
5135          }
5136      }
5137  
5138      memcpy(log.acs, nvme_cse_acs, sizeof(nvme_cse_acs));
5139  
5140      if (src_iocs) {
5141          memcpy(log.iocs, src_iocs, sizeof(log.iocs));
5142      }
5143  
5144      trans_len = MIN(sizeof(log) - off, buf_len);
5145  
5146      return nvme_c2h(n, ((uint8_t *)&log) + off, trans_len, req);
5147  }
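/*
 * Commands Supported and Effects note: log.acs and log.iocs are indexed by
 * opcode and each 32-bit entry is the effects dword for that opcode, with
 * bit 0 (CSUPP) indicating that the command is supported.  Which I/O command
 * set table is copied depends on CC.CSS and, when all command sets are
 * enabled, on the CSI passed in CDW14.
 */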
5148  
5149  static size_t sizeof_fdp_conf_descr(size_t nruh, size_t vss)
5150  {
5151      size_t entry_siz = sizeof(NvmeFdpDescrHdr) + nruh * sizeof(NvmeRuhDescr)
5152                         + vss;
5153      return ROUND_UP(entry_siz, 8);
5154  }
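/*
 * The descriptor is padded to a multiple of 8 bytes; e.g. a raw size of
 * 41..48 bytes rounds up to 48 and 49..56 bytes rounds up to 56.
 */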
5155  
5156  static uint16_t nvme_fdp_confs(NvmeCtrl *n, uint32_t endgrpid, uint32_t buf_len,
5157                                 uint64_t off, NvmeRequest *req)
5158  {
5159      uint32_t log_size, trans_len;
5160      g_autofree uint8_t *buf = NULL;
5161      NvmeFdpDescrHdr *hdr;
5162      NvmeRuhDescr *ruhd;
5163      NvmeEnduranceGroup *endgrp;
5164      NvmeFdpConfsHdr *log;
5165      size_t nruh, fdp_descr_size;
5166      int i;
5167  
5168      if (endgrpid != 1 || !n->subsys) {
5169          return NVME_INVALID_FIELD | NVME_DNR;
5170      }
5171  
5172      endgrp = &n->subsys->endgrp;
5173  
5174      if (endgrp->fdp.enabled) {
5175          nruh = endgrp->fdp.nruh;
5176      } else {
5177          nruh = 1;
5178      }
5179  
5180      fdp_descr_size = sizeof_fdp_conf_descr(nruh, FDPVSS);
5181      log_size = sizeof(NvmeFdpConfsHdr) + fdp_descr_size;
5182  
5183      if (off >= log_size) {
5184          return NVME_INVALID_FIELD | NVME_DNR;
5185      }
5186  
5187      trans_len = MIN(log_size - off, buf_len);
5188  
5189      buf = g_malloc0(log_size);
5190      log = (NvmeFdpConfsHdr *)buf;
5191      hdr = (NvmeFdpDescrHdr *)(log + 1);
5192      ruhd = (NvmeRuhDescr *)(buf + sizeof(*log) + sizeof(*hdr));
5193  
5194      log->num_confs = cpu_to_le16(0);
5195      log->size = cpu_to_le32(log_size);
5196  
5197      hdr->descr_size = cpu_to_le16(fdp_descr_size);
5198      if (endgrp->fdp.enabled) {
5199          hdr->fdpa = FIELD_DP8(hdr->fdpa, FDPA, VALID, 1);
5200          hdr->fdpa = FIELD_DP8(hdr->fdpa, FDPA, RGIF, endgrp->fdp.rgif);
5201          hdr->nrg = cpu_to_le16(endgrp->fdp.nrg);
5202          hdr->nruh = cpu_to_le16(endgrp->fdp.nruh);
5203          hdr->maxpids = cpu_to_le16(NVME_FDP_MAXPIDS - 1);
5204          hdr->nnss = cpu_to_le32(NVME_MAX_NAMESPACES);
5205          hdr->runs = cpu_to_le64(endgrp->fdp.runs);
5206  
5207          for (i = 0; i < nruh; i++) {
5208              ruhd->ruht = NVME_RUHT_INITIALLY_ISOLATED;
5209              ruhd++;
5210          }
5211      } else {
5212          /* 1 bit for RUH in PIF -> 2 RUHs max. */
5213          hdr->nrg = cpu_to_le16(1);
5214          hdr->nruh = cpu_to_le16(1);
5215          hdr->maxpids = cpu_to_le16(NVME_FDP_MAXPIDS - 1);
5216          hdr->nnss = cpu_to_le32(1);
5217          hdr->runs = cpu_to_le64(96 * MiB);
5218  
5219          ruhd->ruht = NVME_RUHT_INITIALLY_ISOLATED;
5220      }
5221  
5222      return nvme_c2h(n, (uint8_t *)buf + off, trans_len, req);
5223  }
5224  
5225  static uint16_t nvme_fdp_ruh_usage(NvmeCtrl *n, uint32_t endgrpid,
5226                                     uint32_t dw10, uint32_t dw12,
5227                                     uint32_t buf_len, uint64_t off,
5228                                     NvmeRequest *req)
5229  {
5230      NvmeRuHandle *ruh;
5231      NvmeRuhuLog *hdr;
5232      NvmeRuhuDescr *ruhud;
5233      NvmeEnduranceGroup *endgrp;
5234      g_autofree uint8_t *buf = NULL;
5235      uint32_t log_size, trans_len;
5236      uint16_t i;
5237  
5238      if (endgrpid != 1 || !n->subsys) {
5239          return NVME_INVALID_FIELD | NVME_DNR;
5240      }
5241  
5242      endgrp = &n->subsys->endgrp;
5243  
5244      if (!endgrp->fdp.enabled) {
5245          return NVME_FDP_DISABLED | NVME_DNR;
5246      }
5247  
5248      log_size = sizeof(NvmeRuhuLog) + endgrp->fdp.nruh * sizeof(NvmeRuhuDescr);
5249  
5250      if (off >= log_size) {
5251          return NVME_INVALID_FIELD | NVME_DNR;
5252      }
5253  
5254      trans_len = MIN(log_size - off, buf_len);
5255  
5256      buf = g_malloc0(log_size);
5257      hdr = (NvmeRuhuLog *)buf;
5258      ruhud = (NvmeRuhuDescr *)(hdr + 1);
5259  
5260      ruh = endgrp->fdp.ruhs;
5261      hdr->nruh = cpu_to_le16(endgrp->fdp.nruh);
5262  
5263      for (i = 0; i < endgrp->fdp.nruh; i++, ruhud++, ruh++) {
5264          ruhud->ruha = ruh->ruha;
5265      }
5266  
5267      return nvme_c2h(n, (uint8_t *)buf + off, trans_len, req);
5268  }
5269  
5270  static uint16_t nvme_fdp_stats(NvmeCtrl *n, uint32_t endgrpid, uint32_t buf_len,
5271                                 uint64_t off, NvmeRequest *req)
5272  {
5273      NvmeEnduranceGroup *endgrp;
5274      NvmeFdpStatsLog log = {};
5275      uint32_t trans_len;
5276  
5277      if (off >= sizeof(NvmeFdpStatsLog)) {
5278          return NVME_INVALID_FIELD | NVME_DNR;
5279      }
5280  
5281      if (endgrpid != 1 || !n->subsys) {
5282          return NVME_INVALID_FIELD | NVME_DNR;
5283      }
5284  
5285      if (!n->subsys->endgrp.fdp.enabled) {
5286          return NVME_FDP_DISABLED | NVME_DNR;
5287      }
5288  
5289      endgrp = &n->subsys->endgrp;
5290  
5291      trans_len = MIN(sizeof(log) - off, buf_len);
5292  
5293      /* the spec value is 128 bits wide; we only use the lower 64 bits */
5294      log.hbmw[0] = cpu_to_le64(endgrp->fdp.hbmw);
5295      log.mbmw[0] = cpu_to_le64(endgrp->fdp.mbmw);
5296      log.mbe[0] = cpu_to_le64(endgrp->fdp.mbe);
5297  
5298      return nvme_c2h(n, (uint8_t *)&log + off, trans_len, req);
5299  }
5300  
5301  static uint16_t nvme_fdp_events(NvmeCtrl *n, uint32_t endgrpid,
5302                                  uint32_t buf_len, uint64_t off,
5303                                  NvmeRequest *req)
5304  {
5305      NvmeEnduranceGroup *endgrp;
5306      NvmeCmd *cmd = &req->cmd;
5307      bool host_events = (cmd->cdw10 >> 8) & 0x1;
5308      uint32_t log_size, trans_len;
5309      NvmeFdpEventBuffer *ebuf;
5310      g_autofree NvmeFdpEventsLog *elog = NULL;
5311      NvmeFdpEvent *event;
5312  
5313      if (endgrpid != 1 || !n->subsys) {
5314          return NVME_INVALID_FIELD | NVME_DNR;
5315      }
5316  
5317      endgrp = &n->subsys->endgrp;
5318  
5319      if (!endgrp->fdp.enabled) {
5320          return NVME_FDP_DISABLED | NVME_DNR;
5321      }
5322  
5323      if (host_events) {
5324          ebuf = &endgrp->fdp.host_events;
5325      } else {
5326          ebuf = &endgrp->fdp.ctrl_events;
5327      }
5328  
5329      log_size = sizeof(NvmeFdpEventsLog) + ebuf->nelems * sizeof(NvmeFdpEvent);
5330  
5331      if (off >= log_size) {
5332          return NVME_INVALID_FIELD | NVME_DNR;
5333      }
5334  
5335      trans_len = MIN(log_size - off, buf_len);
5336      elog = g_malloc0(log_size);
5337      elog->num_events = cpu_to_le32(ebuf->nelems);
5338      event = (NvmeFdpEvent *)(elog + 1);
5339  
5340      if (ebuf->nelems && ebuf->start == ebuf->next) {
5341          unsigned int nelems = (NVME_FDP_MAX_EVENTS - ebuf->start);
5342          /* wrapped around: copy [start, NVME_FDP_MAX_EVENTS) and then [0, next) */
5343          memcpy(event, &ebuf->events[ebuf->start],
5344                 sizeof(NvmeFdpEvent) * nelems);
5345          memcpy(event + nelems, ebuf->events,
5346                 sizeof(NvmeFdpEvent) * ebuf->next);
5347      } else if (ebuf->start < ebuf->next) {
5348          memcpy(event, &ebuf->events[ebuf->start],
5349                 sizeof(NvmeFdpEvent) * (ebuf->next - ebuf->start));
5350      }
5351  
5352      return nvme_c2h(n, (uint8_t *)elog + off, trans_len, req);
5353  }
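/*
 * Event buffer note: the per-endurance-group event buffers are circular.
 * When the buffer is full (nelems != 0 and start == next) the stored events
 * wrap around, hence the two memcpy()s above; otherwise a single contiguous
 * copy of [start, next) suffices.  As an illustration, with a (hypothetical)
 * capacity of 8, start == next == 6 and 8 queued events, entries 6..7 are
 * copied first, followed by entries 0..5.
 */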
5354  
5355  static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
5356  {
5357      NvmeCmd *cmd = &req->cmd;
5358  
5359      uint32_t dw10 = le32_to_cpu(cmd->cdw10);
5360      uint32_t dw11 = le32_to_cpu(cmd->cdw11);
5361      uint32_t dw12 = le32_to_cpu(cmd->cdw12);
5362      uint32_t dw13 = le32_to_cpu(cmd->cdw13);
5363      uint8_t  lid = dw10 & 0xff;
5364      uint8_t  lsp = (dw10 >> 8) & 0xf;
5365      uint8_t  rae = (dw10 >> 15) & 0x1;
5366      uint8_t  csi = le32_to_cpu(cmd->cdw14) >> 24;
5367      uint32_t numdl, numdu, lspi;
5368      uint64_t off, lpol, lpou;
5369      size_t   len;
5370      uint16_t status;
5371  
5372      numdl = (dw10 >> 16);
5373      numdu = (dw11 & 0xffff);
5374      lspi = (dw11 >> 16);
5375      lpol = dw12;
5376      lpou = dw13;
5377  
5378      len = (((numdu << 16) | numdl) + 1) << 2;
5379      off = (lpou << 32ULL) | lpol;
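    /*
     * NUMD is a 0's based dword count split across NUMDL (CDW10 bits 31:16)
     * and NUMDU (CDW11 bits 15:00); LPOL/LPOU form the 64-bit byte offset.
     * For example, NUMDL = 3FFh with NUMDU = 0 requests 1024 dwords, i.e. a
     * 4096 byte transfer.
     */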
5380  
5381      if (off & 0x3) {
5382          return NVME_INVALID_FIELD | NVME_DNR;
5383      }
5384  
5385      trace_pci_nvme_get_log(nvme_cid(req), lid, lsp, rae, len, off);
5386  
5387      status = nvme_check_mdts(n, len);
5388      if (status) {
5389          return status;
5390      }
5391  
5392      switch (lid) {
5393      case NVME_LOG_ERROR_INFO:
5394          return nvme_error_info(n, rae, len, off, req);
5395      case NVME_LOG_SMART_INFO:
5396          return nvme_smart_info(n, rae, len, off, req);
5397      case NVME_LOG_FW_SLOT_INFO:
5398          return nvme_fw_log_info(n, len, off, req);
5399      case NVME_LOG_CHANGED_NSLIST:
5400          return nvme_changed_nslist(n, rae, len, off, req);
5401      case NVME_LOG_CMD_EFFECTS:
5402          return nvme_cmd_effects(n, csi, len, off, req);
5403      case NVME_LOG_ENDGRP:
5404          return nvme_endgrp_info(n, rae, len, off, req);
5405      case NVME_LOG_FDP_CONFS:
5406          return nvme_fdp_confs(n, lspi, len, off, req);
5407      case NVME_LOG_FDP_RUH_USAGE:
5408          return nvme_fdp_ruh_usage(n, lspi, dw10, dw12, len, off, req);
5409      case NVME_LOG_FDP_STATS:
5410          return nvme_fdp_stats(n, lspi, len, off, req);
5411      case NVME_LOG_FDP_EVENTS:
5412          return nvme_fdp_events(n, lspi, len, off, req);
5413      default:
5414          trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
5415          return NVME_INVALID_FIELD | NVME_DNR;
5416      }
5417  }
5418  
5419  static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
5420  {
5421      PCIDevice *pci = PCI_DEVICE(n);
5422      uint16_t offset = (cq->cqid << 3) + (1 << 2);
5423  
5424      n->cq[cq->cqid] = NULL;
5425      qemu_bh_delete(cq->bh);
5426      if (cq->ioeventfd_enabled) {
5427          memory_region_del_eventfd(&n->iomem,
5428                                    0x1000 + offset, 4, false, 0, &cq->notifier);
5429          event_notifier_set_handler(&cq->notifier, NULL);
5430          event_notifier_cleanup(&cq->notifier);
5431      }
5432      if (msix_enabled(pci) && cq->irq_enabled) {
5433          msix_vector_unuse(pci, cq->vector);
5434      }
5435      if (cq->cqid) {
5436          g_free(cq);
5437      }
5438  }
5439  
5440  static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeRequest *req)
5441  {
5442      NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
5443      NvmeCQueue *cq;
5444      uint16_t qid = le16_to_cpu(c->qid);
5445  
5446      if (unlikely(!qid || nvme_check_cqid(n, qid))) {
5447          trace_pci_nvme_err_invalid_del_cq_cqid(qid);
5448          return NVME_INVALID_CQID | NVME_DNR;
5449      }
5450  
5451      cq = n->cq[qid];
5452      if (unlikely(!QTAILQ_EMPTY(&cq->sq_list))) {
5453          trace_pci_nvme_err_invalid_del_cq_notempty(qid);
5454          return NVME_INVALID_QUEUE_DEL;
5455      }
5456  
5457      if (cq->irq_enabled && cq->tail != cq->head) {
5458          n->cq_pending--;
5459      }
5460  
5461      nvme_irq_deassert(n, cq);
5462      trace_pci_nvme_del_cq(qid);
5463      nvme_free_cq(cq, n);
5464      return NVME_SUCCESS;
5465  }
5466  
5467  static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
5468                           uint16_t cqid, uint16_t vector, uint16_t size,
5469                           uint16_t irq_enabled)
5470  {
5471      PCIDevice *pci = PCI_DEVICE(n);
5472  
5473      if (msix_enabled(pci) && irq_enabled) {
5474          msix_vector_use(pci, vector);
5475      }
5476  
5477      cq->ctrl = n;
5478      cq->cqid = cqid;
5479      cq->size = size;
5480      cq->dma_addr = dma_addr;
5481      cq->phase = 1;
5482      cq->irq_enabled = irq_enabled;
5483      cq->vector = vector;
5484      cq->head = cq->tail = 0;
5485      QTAILQ_INIT(&cq->req_list);
5486      QTAILQ_INIT(&cq->sq_list);
5487      if (n->dbbuf_enabled) {
5488          cq->db_addr = n->dbbuf_dbs + (cqid << 3) + (1 << 2);
5489          cq->ei_addr = n->dbbuf_eis + (cqid << 3) + (1 << 2);
5490  
5491          if (n->params.ioeventfd && cqid != 0) {
5492              if (!nvme_init_cq_ioeventfd(cq)) {
5493                  cq->ioeventfd_enabled = true;
5494              }
5495          }
5496      }
5497      n->cq[cqid] = cq;
5498      cq->bh = qemu_bh_new_guarded(nvme_post_cqes, cq,
5499                                   &DEVICE(cq->ctrl)->mem_reentrancy_guard);
5500  }
5501  
5502  static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req)
5503  {
5504      NvmeCQueue *cq;
5505      NvmeCreateCq *c = (NvmeCreateCq *)&req->cmd;
5506      uint16_t cqid = le16_to_cpu(c->cqid);
5507      uint16_t vector = le16_to_cpu(c->irq_vector);
5508      uint16_t qsize = le16_to_cpu(c->qsize);
5509      uint16_t qflags = le16_to_cpu(c->cq_flags);
5510      uint64_t prp1 = le64_to_cpu(c->prp1);
5511      uint32_t cc = ldq_le_p(&n->bar.cc);
5512      uint8_t iocqes = NVME_CC_IOCQES(cc);
5513      uint8_t iosqes = NVME_CC_IOSQES(cc);
5514  
5515      trace_pci_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
5516                               NVME_CQ_FLAGS_IEN(qflags) != 0);
5517  
5518      if (iosqes != NVME_SQES || iocqes != NVME_CQES) {
5519          trace_pci_nvme_err_invalid_create_cq_entry_size(iosqes, iocqes);
5520          return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
5521      }
5522  
5523      if (unlikely(!cqid || cqid > n->conf_ioqpairs || n->cq[cqid] != NULL)) {
5524          trace_pci_nvme_err_invalid_create_cq_cqid(cqid);
5525          return NVME_INVALID_QID | NVME_DNR;
5526      }
5527      if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) {
5528          trace_pci_nvme_err_invalid_create_cq_size(qsize);
5529          return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
5530      }
5531      if (unlikely(prp1 & (n->page_size - 1))) {
5532          trace_pci_nvme_err_invalid_create_cq_addr(prp1);
5533          return NVME_INVALID_PRP_OFFSET | NVME_DNR;
5534      }
5535      if (unlikely(!msix_enabled(PCI_DEVICE(n)) && vector)) {
5536          trace_pci_nvme_err_invalid_create_cq_vector(vector);
5537          return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
5538      }
5539      if (unlikely(vector >= n->conf_msix_qsize)) {
5540          trace_pci_nvme_err_invalid_create_cq_vector(vector);
5541          return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
5542      }
5543      if (unlikely(!(NVME_CQ_FLAGS_PC(qflags)))) {
5544          trace_pci_nvme_err_invalid_create_cq_qflags(NVME_CQ_FLAGS_PC(qflags));
5545          return NVME_INVALID_FIELD | NVME_DNR;
5546      }
5547  
5548      cq = g_malloc0(sizeof(*cq));
5549      nvme_init_cq(cq, n, prp1, cqid, vector, qsize + 1,
5550                   NVME_CQ_FLAGS_IEN(qflags));
5551  
5552      /*
5553       * It is only required to set qs_created when creating a completion queue;
5554       * creating a submission queue without a matching completion queue will
5555       * fail.
5556       */
5557      n->qs_created = true;
5558      return NVME_SUCCESS;
5559  }
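/*
 * Entry size note: the controller only supports the required entry sizes,
 * i.e. 64 byte submission queue entries and 16 byte completion queue
 * entries, so I/O queue creation is rejected above unless CC.IOSQES and
 * CC.IOCQES select exactly those sizes (the fields encode the entry size as
 * a power of two).
 */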
5560  
5561  static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req)
5562  {
5563      uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
5564  
5565      return nvme_c2h(n, id, sizeof(id), req);
5566  }
5567  
5568  static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
5569  {
5570      trace_pci_nvme_identify_ctrl();
5571  
5572      return nvme_c2h(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), req);
5573  }
5574  
5575  static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req)
5576  {
5577      NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5578      uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
5579      NvmeIdCtrlNvm *id_nvm = (NvmeIdCtrlNvm *)&id;
5580  
5581      trace_pci_nvme_identify_ctrl_csi(c->csi);
5582  
5583      switch (c->csi) {
5584      case NVME_CSI_NVM:
5585          id_nvm->vsl = n->params.vsl;
5586          id_nvm->dmrsl = cpu_to_le32(n->dmrsl);
5587          break;
5588  
5589      case NVME_CSI_ZONED:
5590          ((NvmeIdCtrlZoned *)&id)->zasl = n->params.zasl;
5591          break;
5592  
5593      default:
5594          return NVME_INVALID_FIELD | NVME_DNR;
5595      }
5596  
5597      return nvme_c2h(n, id, sizeof(id), req);
5598  }
5599  
5600  static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req, bool active)
5601  {
5602      NvmeNamespace *ns;
5603      NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5604      uint32_t nsid = le32_to_cpu(c->nsid);
5605  
5606      trace_pci_nvme_identify_ns(nsid);
5607  
5608      if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
5609          return NVME_INVALID_NSID | NVME_DNR;
5610      }
5611  
5612      ns = nvme_ns(n, nsid);
5613      if (unlikely(!ns)) {
5614          if (!active) {
5615              ns = nvme_subsys_ns(n->subsys, nsid);
5616              if (!ns) {
5617                  return nvme_rpt_empty_id_struct(n, req);
5618              }
5619          } else {
5620              return nvme_rpt_empty_id_struct(n, req);
5621          }
5622      }
5623  
5624      if (active || ns->csi == NVME_CSI_NVM) {
5625          return nvme_c2h(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs), req);
5626      }
5627  
5628      return NVME_INVALID_CMD_SET | NVME_DNR;
5629  }
5630  
5631  static uint16_t nvme_identify_ctrl_list(NvmeCtrl *n, NvmeRequest *req,
5632                                          bool attached)
5633  {
5634      NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5635      uint32_t nsid = le32_to_cpu(c->nsid);
5636      uint16_t min_id = le16_to_cpu(c->ctrlid);
5637      uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
5638      uint16_t *ids = &list[1];
5639      NvmeNamespace *ns;
5640      NvmeCtrl *ctrl;
5641      int cntlid, nr_ids = 0;
5642  
5643      trace_pci_nvme_identify_ctrl_list(c->cns, min_id);
5644  
5645      if (!n->subsys) {
5646          return NVME_INVALID_FIELD | NVME_DNR;
5647      }
5648  
5649      if (attached) {
5650          if (nsid == NVME_NSID_BROADCAST) {
5651              return NVME_INVALID_FIELD | NVME_DNR;
5652          }
5653  
5654          ns = nvme_subsys_ns(n->subsys, nsid);
5655          if (!ns) {
5656              return NVME_INVALID_FIELD | NVME_DNR;
5657          }
5658      }
5659  
5660      for (cntlid = min_id; cntlid < ARRAY_SIZE(n->subsys->ctrls); cntlid++) {
5661          ctrl = nvme_subsys_ctrl(n->subsys, cntlid);
5662          if (!ctrl) {
5663              continue;
5664          }
5665  
5666          if (attached && !nvme_ns(ctrl, nsid)) {
5667              continue;
5668          }
5669  
5670          ids[nr_ids++] = cntlid;
5671      }
5672  
5673      list[0] = nr_ids;
5674  
5675      return nvme_c2h(n, (uint8_t *)list, sizeof(list), req);
5676  }
5677  
5678  static uint16_t nvme_identify_pri_ctrl_cap(NvmeCtrl *n, NvmeRequest *req)
5679  {
5680      trace_pci_nvme_identify_pri_ctrl_cap(le16_to_cpu(n->pri_ctrl_cap.cntlid));
5681  
5682      return nvme_c2h(n, (uint8_t *)&n->pri_ctrl_cap,
5683                      sizeof(NvmePriCtrlCap), req);
5684  }
5685  
5686  static uint16_t nvme_identify_sec_ctrl_list(NvmeCtrl *n, NvmeRequest *req)
5687  {
5688      NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5689      uint16_t pri_ctrl_id = le16_to_cpu(n->pri_ctrl_cap.cntlid);
5690      uint16_t min_id = le16_to_cpu(c->ctrlid);
5691      uint8_t num_sec_ctrl = n->nr_sec_ctrls;
5692      NvmeSecCtrlList list = {0};
5693      uint8_t i;
5694  
5695      for (i = 0; i < num_sec_ctrl; i++) {
5696          if (n->sec_ctrl_list[i].scid >= min_id) {
5697              list.numcntl = MIN(num_sec_ctrl - i, 127);
5698              memcpy(&list.sec, n->sec_ctrl_list + i,
5699                     list.numcntl * sizeof(NvmeSecCtrlEntry));
5700              break;
5701          }
5702      }
5703  
5704      trace_pci_nvme_identify_sec_ctrl_list(pri_ctrl_id, list.numcntl);
5705  
5706      return nvme_c2h(n, (uint8_t *)&list, sizeof(list), req);
5707  }
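/*
 * The Secondary Controller List structure holds at most 127 entries, which
 * is why list.numcntl is capped at 127 above; the returned list starts at
 * the first secondary controller whose SCID is at least the CNTID given in
 * the command.
 */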
5708  
5709  static uint16_t nvme_identify_ns_ind(NvmeCtrl *n, NvmeRequest *req, bool alloc)
5710  {
5711      NvmeNamespace *ns;
5712      NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5713      uint32_t nsid = le32_to_cpu(c->nsid);
5714  
5715      trace_pci_nvme_identify_ns_ind(nsid);
5716  
5717      if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
5718          return NVME_INVALID_NSID | NVME_DNR;
5719      }
5720  
5721      ns = nvme_ns(n, nsid);
5722      if (unlikely(!ns)) {
5723          if (alloc) {
5724              ns = nvme_subsys_ns(n->subsys, nsid);
5725              if (!ns) {
5726                  return nvme_rpt_empty_id_struct(n, req);
5727              }
5728          } else {
5729              return nvme_rpt_empty_id_struct(n, req);
5730          }
5731      }
5732  
5733      return nvme_c2h(n, (uint8_t *)&ns->id_ns_ind, sizeof(NvmeIdNsInd), req);
5734  }
5735  
5736  static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
5737                                       bool active)
5738  {
5739      NvmeNamespace *ns;
5740      NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5741      uint32_t nsid = le32_to_cpu(c->nsid);
5742  
5743      trace_pci_nvme_identify_ns_csi(nsid, c->csi);
5744  
5745      if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
5746          return NVME_INVALID_NSID | NVME_DNR;
5747      }
5748  
5749      ns = nvme_ns(n, nsid);
5750      if (unlikely(!ns)) {
5751          if (!active) {
5752              ns = nvme_subsys_ns(n->subsys, nsid);
5753              if (!ns) {
5754                  return nvme_rpt_empty_id_struct(n, req);
5755              }
5756          } else {
5757              return nvme_rpt_empty_id_struct(n, req);
5758          }
5759      }
5760  
5761      if (c->csi == NVME_CSI_NVM) {
5762          return nvme_c2h(n, (uint8_t *)&ns->id_ns_nvm, sizeof(NvmeIdNsNvm),
5763                          req);
5764      } else if (c->csi == NVME_CSI_ZONED && ns->csi == NVME_CSI_ZONED) {
5765          return nvme_c2h(n, (uint8_t *)ns->id_ns_zoned, sizeof(NvmeIdNsZoned),
5766                          req);
5767      }
5768  
5769      return NVME_INVALID_FIELD | NVME_DNR;
5770  }
5771  
5772  static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req,
5773                                       bool active)
5774  {
5775      NvmeNamespace *ns;
5776      NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5777      uint32_t min_nsid = le32_to_cpu(c->nsid);
5778      uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
5779      static const int data_len = sizeof(list);
5780      uint32_t *list_ptr = (uint32_t *)list;
5781      int i, j = 0;
5782  
5783      trace_pci_nvme_identify_nslist(min_nsid);
5784  
5785      /*
5786       * Both FFFFFFFFh (NVME_NSID_BROADCAST) and FFFFFFFEh are invalid values
5787       * since the Active Namespace ID List should return namespaces with ids
5788       * *higher* than the NSID specified in the command. This is also specified
5789       * in the spec (NVM Express v1.3d, Section 5.15.4).
5790       */
5791      if (min_nsid >= NVME_NSID_BROADCAST - 1) {
5792          return NVME_INVALID_NSID | NVME_DNR;
5793      }
5794  
5795      for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5796          ns = nvme_ns(n, i);
5797          if (!ns) {
5798              if (!active) {
5799                  ns = nvme_subsys_ns(n->subsys, i);
5800                  if (!ns) {
5801                      continue;
5802                  }
5803              } else {
5804                  continue;
5805              }
5806          }
5807          if (ns->params.nsid <= min_nsid) {
5808              continue;
5809          }
5810          list_ptr[j++] = cpu_to_le32(ns->params.nsid);
5811          if (j == data_len / sizeof(uint32_t)) {
5812              break;
5813          }
5814      }
5815  
5816      return nvme_c2h(n, list, data_len, req);
5817  }
5818  
5819  static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req,
5820                                           bool active)
5821  {
5822      NvmeNamespace *ns;
5823      NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5824      uint32_t min_nsid = le32_to_cpu(c->nsid);
5825      uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
5826      static const int data_len = sizeof(list);
5827      uint32_t *list_ptr = (uint32_t *)list;
5828      int i, j = 0;
5829  
5830      trace_pci_nvme_identify_nslist_csi(min_nsid, c->csi);
5831  
5832      /*
5833       * Same as in nvme_identify_nslist(), FFFFFFFFh/FFFFFFFEh are invalid.
5834       */
5835      if (min_nsid >= NVME_NSID_BROADCAST - 1) {
5836          return NVME_INVALID_NSID | NVME_DNR;
5837      }
5838  
5839      if (c->csi != NVME_CSI_NVM && c->csi != NVME_CSI_ZONED) {
5840          return NVME_INVALID_FIELD | NVME_DNR;
5841      }
5842  
5843      for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5844          ns = nvme_ns(n, i);
5845          if (!ns) {
5846              if (!active) {
5847                  ns = nvme_subsys_ns(n->subsys, i);
5848                  if (!ns) {
5849                      continue;
5850                  }
5851              } else {
5852                  continue;
5853              }
5854          }
5855          if (ns->params.nsid <= min_nsid || c->csi != ns->csi) {
5856              continue;
5857          }
5858          list_ptr[j++] = cpu_to_le32(ns->params.nsid);
5859          if (j == data_len / sizeof(uint32_t)) {
5860              break;
5861          }
5862      }
5863  
5864      return nvme_c2h(n, list, data_len, req);
5865  }
5866  
5867  static uint16_t nvme_endurance_group_list(NvmeCtrl *n, NvmeRequest *req)
5868  {
5869      uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
5870      uint16_t *nr_ids = &list[0];
5871      uint16_t *ids = &list[1];
5872      uint16_t endgid = le32_to_cpu(req->cmd.cdw11) & 0xffff;
5873  
5874      /*
5875       * The current nvme-subsys only supports Endurance Group #1.
5876       */
5877      if (!endgid) {
5878          *nr_ids = 1;
5879          ids[0] = 1;
5880      } else {
5881          *nr_ids = 0;
5882      }
5883  
5884      return nvme_c2h(n, list, sizeof(list), req);
5885  }
5886  
5887  static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
5888  {
5889      NvmeNamespace *ns;
5890      NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5891      uint32_t nsid = le32_to_cpu(c->nsid);
5892      uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
5893      uint8_t *pos = list;
5894      struct {
5895          NvmeIdNsDescr hdr;
5896          uint8_t v[NVME_NIDL_UUID];
5897      } QEMU_PACKED uuid = {};
5898      struct {
5899          NvmeIdNsDescr hdr;
5900          uint8_t v[NVME_NIDL_NGUID];
5901      } QEMU_PACKED nguid = {};
5902      struct {
5903          NvmeIdNsDescr hdr;
5904          uint64_t v;
5905      } QEMU_PACKED eui64 = {};
5906      struct {
5907          NvmeIdNsDescr hdr;
5908          uint8_t v;
5909      } QEMU_PACKED csi = {};
5910  
5911      trace_pci_nvme_identify_ns_descr_list(nsid);
5912  
5913      if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
5914          return NVME_INVALID_NSID | NVME_DNR;
5915      }
5916  
5917      ns = nvme_ns(n, nsid);
5918      if (unlikely(!ns)) {
5919          return NVME_INVALID_FIELD | NVME_DNR;
5920      }
5921  
5922      if (!qemu_uuid_is_null(&ns->params.uuid)) {
5923          uuid.hdr.nidt = NVME_NIDT_UUID;
5924          uuid.hdr.nidl = NVME_NIDL_UUID;
5925          memcpy(uuid.v, ns->params.uuid.data, NVME_NIDL_UUID);
5926          memcpy(pos, &uuid, sizeof(uuid));
5927          pos += sizeof(uuid);
5928      }
5929  
5930      if (!nvme_nguid_is_null(&ns->params.nguid)) {
5931          nguid.hdr.nidt = NVME_NIDT_NGUID;
5932          nguid.hdr.nidl = NVME_NIDL_NGUID;
5933          memcpy(nguid.v, ns->params.nguid.data, NVME_NIDL_NGUID);
5934          memcpy(pos, &nguid, sizeof(nguid));
5935          pos += sizeof(nguid);
5936      }
5937  
5938      if (ns->params.eui64) {
5939          eui64.hdr.nidt = NVME_NIDT_EUI64;
5940          eui64.hdr.nidl = NVME_NIDL_EUI64;
5941          eui64.v = cpu_to_be64(ns->params.eui64);
5942          memcpy(pos, &eui64, sizeof(eui64));
5943          pos += sizeof(eui64);
5944      }
5945  
5946      csi.hdr.nidt = NVME_NIDT_CSI;
5947      csi.hdr.nidl = NVME_NIDL_CSI;
5948      csi.v = ns->csi;
5949      memcpy(pos, &csi, sizeof(csi));
5950      pos += sizeof(csi);
5951  
5952      return nvme_c2h(n, list, sizeof(list), req);
5953  }
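/*
 * Descriptor list layout note: each Namespace Identification Descriptor is
 * a 4 byte header (NIDT, NIDL, two reserved bytes) immediately followed by
 * NIDL bytes of value, packed back to back; since the 4 KiB buffer is zero
 * filled, the list is implicitly terminated by an all-zero descriptor.  A
 * namespace with only a UUID therefore returns a 20 byte UUID descriptor
 * followed by the 5 byte CSI descriptor.
 */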
5954  
5955  static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeRequest *req)
5956  {
5957      uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
5958      static const int data_len = sizeof(list);
5959  
5960      trace_pci_nvme_identify_cmd_set();
5961  
5962      NVME_SET_CSI(*list, NVME_CSI_NVM);
5963      NVME_SET_CSI(*list, NVME_CSI_ZONED);
5964  
5965      return nvme_c2h(n, list, data_len, req);
5966  }
5967  
5968  static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req)
5969  {
5970      NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5971  
5972      trace_pci_nvme_identify(nvme_cid(req), c->cns, le16_to_cpu(c->ctrlid),
5973                              c->csi);
5974  
5975      switch (c->cns) {
5976      case NVME_ID_CNS_NS:
5977          return nvme_identify_ns(n, req, true);
5978      case NVME_ID_CNS_NS_PRESENT:
5979          return nvme_identify_ns(n, req, false);
5980      case NVME_ID_CNS_NS_ATTACHED_CTRL_LIST:
5981          return nvme_identify_ctrl_list(n, req, true);
5982      case NVME_ID_CNS_CTRL_LIST:
5983          return nvme_identify_ctrl_list(n, req, false);
5984      case NVME_ID_CNS_PRIMARY_CTRL_CAP:
5985          return nvme_identify_pri_ctrl_cap(n, req);
5986      case NVME_ID_CNS_SECONDARY_CTRL_LIST:
5987          return nvme_identify_sec_ctrl_list(n, req);
5988      case NVME_ID_CNS_CS_NS:
5989          return nvme_identify_ns_csi(n, req, true);
5990      case NVME_ID_CNS_CS_IND_NS:
5991          return nvme_identify_ns_ind(n, req, false);
5992      case NVME_ID_CNS_CS_IND_NS_ALLOCATED:
5993          return nvme_identify_ns_ind(n, req, true);
5994      case NVME_ID_CNS_CS_NS_PRESENT:
5995          return nvme_identify_ns_csi(n, req, false);
5996      case NVME_ID_CNS_CTRL:
5997          return nvme_identify_ctrl(n, req);
5998      case NVME_ID_CNS_CS_CTRL:
5999          return nvme_identify_ctrl_csi(n, req);
6000      case NVME_ID_CNS_NS_ACTIVE_LIST:
6001          return nvme_identify_nslist(n, req, true);
6002      case NVME_ID_CNS_NS_PRESENT_LIST:
6003          return nvme_identify_nslist(n, req, false);
6004      case NVME_ID_CNS_CS_NS_ACTIVE_LIST:
6005          return nvme_identify_nslist_csi(n, req, true);
6006      case NVME_ID_CNS_ENDURANCE_GROUP_LIST:
6007          return nvme_endurance_group_list(n, req);
6008      case NVME_ID_CNS_CS_NS_PRESENT_LIST:
6009          return nvme_identify_nslist_csi(n, req, false);
6010      case NVME_ID_CNS_NS_DESCR_LIST:
6011          return nvme_identify_ns_descr_list(n, req);
6012      case NVME_ID_CNS_IO_COMMAND_SET:
6013          return nvme_identify_cmd_set(n, req);
6014      default:
6015          trace_pci_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns));
6016          return NVME_INVALID_FIELD | NVME_DNR;
6017      }
6018  }
6019  
6020  static uint16_t nvme_abort(NvmeCtrl *n, NvmeRequest *req)
6021  {
6022      uint16_t sqid = le32_to_cpu(req->cmd.cdw10) & 0xffff;
6023      uint16_t cid  = (le32_to_cpu(req->cmd.cdw10) >> 16) & 0xffff;
6024      NvmeSQueue *sq = n->sq[sqid];
6025      NvmeRequest *r, *next;
6026      int i;
6027  
6028      req->cqe.result = 1;
6029      if (nvme_check_sqid(n, sqid)) {
6030          return NVME_INVALID_FIELD | NVME_DNR;
6031      }
6032  
6033      if (sqid == 0) {
6034          for (i = 0; i < n->outstanding_aers; i++) {
6035              NvmeRequest *re = n->aer_reqs[i];
6036              if (re->cqe.cid == cid) {
6037                  memmove(n->aer_reqs + i, n->aer_reqs + i + 1,
6038                           (n->outstanding_aers - i - 1) * sizeof(NvmeRequest *));
6039                  n->outstanding_aers--;
6040                  re->status = NVME_CMD_ABORT_REQ;
6041                  req->cqe.result = 0;
6042                  nvme_enqueue_req_completion(&n->admin_cq, re);
6043                  return NVME_SUCCESS;
6044              }
6045          }
6046      }
6047  
6048      QTAILQ_FOREACH_SAFE(r, &sq->out_req_list, entry, next) {
6049          if (r->cqe.cid == cid) {
6050              if (r->aiocb) {
6051                  blk_aio_cancel_async(r->aiocb);
6052              }
6053              break;
6054          }
6055      }
6056  
6057      return NVME_SUCCESS;
6058  }
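/*
 * Abort note: bit 0 of Dword 0 in the Abort completion indicates whether
 * the command was aborted (0 = aborted, 1 = not aborted).  The result is
 * initialized to 1 above and only cleared when an outstanding Asynchronous
 * Event Request is aborted; for regular I/O commands the abort is best
 * effort: the AIO is cancelled asynchronously and the aborted command
 * completes through the normal path.
 */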
6059  
6060  static inline void nvme_set_timestamp(NvmeCtrl *n, uint64_t ts)
6061  {
6062      trace_pci_nvme_setfeat_timestamp(ts);
6063  
6064      n->host_timestamp = le64_to_cpu(ts);
6065      n->timestamp_set_qemu_clock_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
6066  }
6067  
6068  static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n)
6069  {
6070      uint64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
6071      uint64_t elapsed_time = current_time - n->timestamp_set_qemu_clock_ms;
6072  
6073      union nvme_timestamp {
6074          struct {
6075              uint64_t timestamp:48;
6076              uint64_t sync:1;
6077              uint64_t origin:3;
6078              uint64_t rsvd1:12;
6079          };
6080          uint64_t all;
6081      };
6082  
6083      union nvme_timestamp ts;
6084      ts.all = 0;
6085      ts.timestamp = n->host_timestamp + elapsed_time;
6086  
6087      /* If the host timestamp is non-zero, set the timestamp origin */
6088      ts.origin = n->host_timestamp ? 0x01 : 0x00;
6089  
6090      trace_pci_nvme_getfeat_timestamp(ts.all);
6091  
6092      return cpu_to_le64(ts.all);
6093  }
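/*
 * Timestamp note: the feature value is a 48-bit millisecond count with a
 * synch bit and a 3-bit origin field on top.  The value reported is the
 * last host-set timestamp plus the virtual-clock time elapsed since it was
 * set; e.g. a host-set value of 1700000000000 ms read back 5000 ms later
 * comes back as 1700000005000 with origin 001b.
 */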
6094  
6095  static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
6096  {
6097      uint64_t timestamp = nvme_get_timestamp(n);
6098  
6099      return nvme_c2h(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
6100  }
6101  
6102  static int nvme_get_feature_fdp(NvmeCtrl *n, uint32_t endgrpid,
6103                                  uint32_t *result)
6104  {
6105      *result = 0;
6106  
6107      if (!n->subsys || !n->subsys->endgrp.fdp.enabled) {
6108          return NVME_INVALID_FIELD | NVME_DNR;
6109      }
6110  
6111      *result = FIELD_DP16(0, FEAT_FDP, FDPE, 1);
6112      *result = FIELD_DP16(*result, FEAT_FDP, CONF_NDX, 0);
6113  
6114      return NVME_SUCCESS;
6115  }
6116  
6117  static uint16_t nvme_get_feature_fdp_events(NvmeCtrl *n, NvmeNamespace *ns,
6118                                              NvmeRequest *req, uint32_t *result)
6119  {
6120      NvmeCmd *cmd = &req->cmd;
6121      uint32_t cdw11 = le32_to_cpu(cmd->cdw11);
6122      uint16_t ph = cdw11 & 0xffff;
6123      uint8_t noet = (cdw11 >> 16) & 0xff;
6124      uint16_t ruhid, ret;
6125      uint32_t nentries = 0;
6126      uint8_t s_events_ndx = 0;
6127      size_t s_events_siz = sizeof(NvmeFdpEventDescr) * noet;
6128      g_autofree NvmeFdpEventDescr *s_events = g_malloc0(s_events_siz);
6129      NvmeRuHandle *ruh;
6130      NvmeFdpEventDescr *s_event;
6131  
6132      if (!n->subsys || !n->subsys->endgrp.fdp.enabled) {
6133          return NVME_FDP_DISABLED | NVME_DNR;
6134      }
6135  
6136      if (!nvme_ph_valid(ns, ph)) {
6137          return NVME_INVALID_FIELD | NVME_DNR;
6138      }
6139  
6140      ruhid = ns->fdp.phs[ph];
6141      ruh = &n->subsys->endgrp.fdp.ruhs[ruhid];
6142  
6143      assert(ruh);
6144  
6145      if (unlikely(noet == 0)) {
6146          return NVME_INVALID_FIELD | NVME_DNR;
6147      }
6148  
6149      for (uint8_t event_type = 0; event_type < FDP_EVT_MAX; event_type++) {
6150          uint8_t shift = nvme_fdp_evf_shifts[event_type];
6151          if (!shift && event_type) {
6152              /*
6153               * Only the first entry (event_type == 0) has a shift value of 0;
6154               * other entries with a zero shift are simply unpopulated.
6155               */
6156              continue;
6157          }
6158  
6159          nentries++;
6160  
6161          s_event = &s_events[s_events_ndx];
6162          s_event->evt = event_type;
6163          s_event->evta = (ruh->event_filter >> shift) & 0x1;
6164  
6165          /* break if all `noet` entries are filled */
6166          if ((++s_events_ndx) == noet) {
6167              break;
6168          }
6169      }
6170  
6171      ret = nvme_c2h(n, s_events, s_events_siz, req);
6172      if (ret) {
6173          return ret;
6174      }
6175  
6176      *result = nentries;
6177      return NVME_SUCCESS;
6178  }
6179  
6180  static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
6181  {
6182      NvmeCmd *cmd = &req->cmd;
6183      uint32_t dw10 = le32_to_cpu(cmd->cdw10);
6184      uint32_t dw11 = le32_to_cpu(cmd->cdw11);
6185      uint32_t nsid = le32_to_cpu(cmd->nsid);
6186      uint32_t result = 0;
6187      uint8_t fid = NVME_GETSETFEAT_FID(dw10);
6188      NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10);
6189      uint16_t iv;
6190      NvmeNamespace *ns;
6191      int i;
6192      uint16_t endgrpid = 0, ret = NVME_SUCCESS;
6193  
6194      static const uint32_t nvme_feature_default[NVME_FID_MAX] = {
6195          [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT,
6196      };
6197  
6198      trace_pci_nvme_getfeat(nvme_cid(req), nsid, fid, sel, dw11);
6199  
6200      if (!nvme_feature_support[fid]) {
6201          return NVME_INVALID_FIELD | NVME_DNR;
6202      }
6203  
6204      if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
6205          if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
6206              /*
6207               * The Reservation Notification Mask and Reservation Persistence
6208               * features require a status code of Invalid Field in Command when
6209               * NSID is FFFFFFFFh. Since the device does not support those
6210               * features we can always return Invalid Namespace or Format as we
6211               * should do for all other features.
6212               */
6213              return NVME_INVALID_NSID | NVME_DNR;
6214          }
6215  
6216          if (!nvme_ns(n, nsid)) {
6217              return NVME_INVALID_FIELD | NVME_DNR;
6218          }
6219      }
6220  
6221      switch (sel) {
6222      case NVME_GETFEAT_SELECT_CURRENT:
6223          break;
6224      case NVME_GETFEAT_SELECT_SAVED:
6225          /* no features are saveable by the controller; fallthrough */
6226      case NVME_GETFEAT_SELECT_DEFAULT:
6227          goto defaults;
6228      case NVME_GETFEAT_SELECT_CAP:
6229          result = nvme_feature_cap[fid];
6230          goto out;
6231      }
6232  
6233      switch (fid) {
6234      case NVME_TEMPERATURE_THRESHOLD:
6235          result = 0;
6236  
6237          /*
6238           * The controller only implements the Composite Temperature sensor, so
6239           * return 0 for all other sensors.
6240           */
6241          if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
6242              goto out;
6243          }
6244  
6245          switch (NVME_TEMP_THSEL(dw11)) {
6246          case NVME_TEMP_THSEL_OVER:
6247              result = n->features.temp_thresh_hi;
6248              goto out;
6249          case NVME_TEMP_THSEL_UNDER:
6250              result = n->features.temp_thresh_low;
6251              goto out;
6252          }
6253  
6254          return NVME_INVALID_FIELD | NVME_DNR;
6255      case NVME_ERROR_RECOVERY:
6256          if (!nvme_nsid_valid(n, nsid)) {
6257              return NVME_INVALID_NSID | NVME_DNR;
6258          }
6259  
6260          ns = nvme_ns(n, nsid);
6261          if (unlikely(!ns)) {
6262              return NVME_INVALID_FIELD | NVME_DNR;
6263          }
6264  
6265          result = ns->features.err_rec;
6266          goto out;
6267      case NVME_VOLATILE_WRITE_CACHE:
6268          result = 0;
6269          for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6270              ns = nvme_ns(n, i);
6271              if (!ns) {
6272                  continue;
6273              }
6274  
6275              result = blk_enable_write_cache(ns->blkconf.blk);
6276              if (result) {
6277                  break;
6278              }
6279          }
6280          trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
6281          goto out;
6282      case NVME_ASYNCHRONOUS_EVENT_CONF:
6283          result = n->features.async_config;
6284          goto out;
6285      case NVME_TIMESTAMP:
6286          return nvme_get_feature_timestamp(n, req);
6287      case NVME_HOST_BEHAVIOR_SUPPORT:
6288          return nvme_c2h(n, (uint8_t *)&n->features.hbs,
6289                          sizeof(n->features.hbs), req);
6290      case NVME_FDP_MODE:
6291          endgrpid = dw11 & 0xff;
6292  
6293          if (endgrpid != 0x1) {
6294              return NVME_INVALID_FIELD | NVME_DNR;
6295          }
6296  
6297          ret = nvme_get_feature_fdp(n, endgrpid, &result);
6298          if (ret) {
6299              return ret;
6300          }
6301          goto out;
6302      case NVME_FDP_EVENTS:
6303          if (!nvme_nsid_valid(n, nsid)) {
6304              return NVME_INVALID_NSID | NVME_DNR;
6305          }
6306  
6307          ns = nvme_ns(n, nsid);
6308          if (unlikely(!ns)) {
6309              return NVME_INVALID_FIELD | NVME_DNR;
6310          }
6311  
6312          ret = nvme_get_feature_fdp_events(n, ns, req, &result);
6313          if (ret) {
6314              return ret;
6315          }
6316          goto out;
6317      default:
6318          break;
6319      }
6320  
6321  defaults:
6322      switch (fid) {
6323      case NVME_TEMPERATURE_THRESHOLD:
6324          result = 0;
6325  
6326          if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
6327              break;
6328          }
6329  
6330          if (NVME_TEMP_THSEL(dw11) == NVME_TEMP_THSEL_OVER) {
6331              result = NVME_TEMPERATURE_WARNING;
6332          }
6333  
6334          break;
6335      case NVME_NUMBER_OF_QUEUES:
6336          result = (n->conf_ioqpairs - 1) | ((n->conf_ioqpairs - 1) << 16);
6337          trace_pci_nvme_getfeat_numq(result);
6338          break;
6339      case NVME_INTERRUPT_VECTOR_CONF:
6340          iv = dw11 & 0xffff;
6341          if (iv >= n->conf_ioqpairs + 1) {
6342              return NVME_INVALID_FIELD | NVME_DNR;
6343          }
6344  
6345          result = iv;
6346          if (iv == n->admin_cq.vector) {
6347              result |= NVME_INTVC_NOCOALESCING;
6348          }
6349          break;
6350      case NVME_FDP_MODE:
6351          endgrpid = dw11 & 0xff;
6352  
6353          if (endgrpid != 0x1) {
6354              return NVME_INVALID_FIELD | NVME_DNR;
6355          }
6356  
6357          ret = nvme_get_feature_fdp(n, endgrpid, &result);
6358          if (ret) {
6359              return ret;
6360          }
6361          break;
6362  
6363      case NVME_WRITE_ATOMICITY:
6364          result = n->dn;
6365          break;
6366      default:
6367          result = nvme_feature_default[fid];
6368          break;
6369      }
6370  
6371  out:
6372      req->cqe.result = cpu_to_le32(result);
6373      return ret;
6374  }
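/*
 * Annotation (not part of the original source): a minimal sketch of how the
 * select field steers nvme_get_feature() above. Current values are read from
 * controller/namespace state, Saved falls through to Default (nothing is
 * saveable), and Capabilities simply returns the nvme_feature_cap[] entry.
 * The bit positions below are assumed from the NVME_GETSETFEAT_FID() /
 * NVME_GETFEAT_SELECT() accessors used by the handler; the host-side
 * variable name is hypothetical.
 *
 *     cdw10 = NVME_TEMPERATURE_THRESHOLD |          // FID in bits 7:0
 *             (NVME_GETFEAT_SELECT_DEFAULT << 8);   // SEL assumed in bits 10:8
 *     // -> handler jumps to the "defaults" label and reports
 *     //    NVME_TEMPERATURE_WARNING for the over-temperature threshold.
 */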
6375  
6376  static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
6377  {
6378      uint16_t ret;
6379      uint64_t timestamp;
6380  
6381      ret = nvme_h2c(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
6382      if (ret) {
6383          return ret;
6384      }
6385  
6386      nvme_set_timestamp(n, timestamp);
6387  
6388      return NVME_SUCCESS;
6389  }
6390  
6391  static uint16_t nvme_set_feature_fdp_events(NvmeCtrl *n, NvmeNamespace *ns,
6392                                              NvmeRequest *req)
6393  {
6394      NvmeCmd *cmd = &req->cmd;
6395      uint32_t cdw11 = le32_to_cpu(cmd->cdw11);
6396      uint16_t ph = cdw11 & 0xffff;
6397      uint8_t noet = (cdw11 >> 16) & 0xff;
6398      uint16_t ret, ruhid;
6399      uint8_t enable = le32_to_cpu(cmd->cdw12) & 0x1;
6400      uint8_t event_mask = 0;
6401      unsigned int i;
6402      g_autofree uint8_t *events = g_malloc0(noet);
6403      NvmeRuHandle *ruh = NULL;
6404  
6405      assert(ns);
6406  
6407      if (!n->subsys || !n->subsys->endgrp.fdp.enabled) {
6408          return NVME_FDP_DISABLED | NVME_DNR;
6409      }
6410  
6411      if (!nvme_ph_valid(ns, ph)) {
6412          return NVME_INVALID_FIELD | NVME_DNR;
6413      }
6414  
6415      ruhid = ns->fdp.phs[ph];
6416      ruh = &n->subsys->endgrp.fdp.ruhs[ruhid];
6417  
6418      ret = nvme_h2c(n, events, noet, req);
6419      if (ret) {
6420          return ret;
6421      }
6422  
6423      for (i = 0; i < noet; i++) {
6424          event_mask |= (1 << nvme_fdp_evf_shifts[events[i]]);
6425      }
6426  
6427      if (enable) {
6428          ruh->event_filter |= event_mask;
6429      } else {
6430          ruh->event_filter = ruh->event_filter & ~event_mask;
6431      }
6432  
6433      return NVME_SUCCESS;
6434  }
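/*
 * Annotation (not part of the original source): sketch of the event-mask
 * construction in nvme_set_feature_fdp_events() above. Each event type byte
 * supplied by the host is translated to a bit position via
 * nvme_fdp_evf_shifts[] and OR'ed into a mask that is then either set on or
 * cleared from the reclaim unit handle's event filter. The example values
 * are hypothetical:
 *
 *     // host supplies noet = 2 event types, events[] = { e0, e1 }
 *     event_mask = (1 << nvme_fdp_evf_shifts[e0]) |
 *                  (1 << nvme_fdp_evf_shifts[e1]);
 *     ruh->event_filter |= event_mask;     // cdw12 bit 0 set: enable
 *     ruh->event_filter &= ~event_mask;    // cdw12 bit 0 clear: disable
 */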
6435  
6436  static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
6437  {
6438      NvmeNamespace *ns = NULL;
6439  
6440      NvmeCmd *cmd = &req->cmd;
6441      uint32_t dw10 = le32_to_cpu(cmd->cdw10);
6442      uint32_t dw11 = le32_to_cpu(cmd->cdw11);
6443      uint32_t nsid = le32_to_cpu(cmd->nsid);
6444      uint8_t fid = NVME_GETSETFEAT_FID(dw10);
6445      uint8_t save = NVME_SETFEAT_SAVE(dw10);
6446      uint16_t status;
6447      int i;
6448      NvmeIdCtrl *id = &n->id_ctrl;
6449      NvmeAtomic *atomic = &n->atomic;
6450  
6451      trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11);
6452  
6453      if (save && !(nvme_feature_cap[fid] & NVME_FEAT_CAP_SAVE)) {
6454          return NVME_FID_NOT_SAVEABLE | NVME_DNR;
6455      }
6456  
6457      if (!nvme_feature_support[fid]) {
6458          return NVME_INVALID_FIELD | NVME_DNR;
6459      }
6460  
6461      if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
6462          if (nsid != NVME_NSID_BROADCAST) {
6463              if (!nvme_nsid_valid(n, nsid)) {
6464                  return NVME_INVALID_NSID | NVME_DNR;
6465              }
6466  
6467              ns = nvme_ns(n, nsid);
6468              if (unlikely(!ns)) {
6469                  return NVME_INVALID_FIELD | NVME_DNR;
6470              }
6471          }
6472      } else if (nsid && nsid != NVME_NSID_BROADCAST) {
6473          if (!nvme_nsid_valid(n, nsid)) {
6474              return NVME_INVALID_NSID | NVME_DNR;
6475          }
6476  
6477          return NVME_FEAT_NOT_NS_SPEC | NVME_DNR;
6478      }
6479  
6480      if (!(nvme_feature_cap[fid] & NVME_FEAT_CAP_CHANGE)) {
6481          return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
6482      }
6483  
6484      switch (fid) {
6485      case NVME_TEMPERATURE_THRESHOLD:
6486          if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
6487              break;
6488          }
6489  
6490          switch (NVME_TEMP_THSEL(dw11)) {
6491          case NVME_TEMP_THSEL_OVER:
6492              n->features.temp_thresh_hi = NVME_TEMP_TMPTH(dw11);
6493              break;
6494          case NVME_TEMP_THSEL_UNDER:
6495              n->features.temp_thresh_low = NVME_TEMP_TMPTH(dw11);
6496              break;
6497          default:
6498              return NVME_INVALID_FIELD | NVME_DNR;
6499          }
6500  
6501          if ((n->temperature >= n->features.temp_thresh_hi) ||
6502              (n->temperature <= n->features.temp_thresh_low)) {
6503              nvme_smart_event(n, NVME_SMART_TEMPERATURE);
6504          }
6505  
6506          break;
6507      case NVME_ERROR_RECOVERY:
6508          if (nsid == NVME_NSID_BROADCAST) {
6509              for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6510                  ns = nvme_ns(n, i);
6511  
6512                  if (!ns) {
6513                      continue;
6514                  }
6515  
6516                  if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
6517                      ns->features.err_rec = dw11;
6518                  }
6519              }
6520  
6521              break;
6522          }
6523  
6524          assert(ns);
6525          if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat))  {
6526              ns->features.err_rec = dw11;
6527          }
6528          break;
6529      case NVME_VOLATILE_WRITE_CACHE:
6530          for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6531              ns = nvme_ns(n, i);
6532              if (!ns) {
6533                  continue;
6534              }
6535  
6536              if (!(dw11 & 0x1) && blk_enable_write_cache(ns->blkconf.blk)) {
6537                  blk_flush(ns->blkconf.blk);
6538              }
6539  
6540              blk_set_enable_write_cache(ns->blkconf.blk, dw11 & 1);
6541          }
6542  
6543          break;
6544  
6545      case NVME_NUMBER_OF_QUEUES:
6546          if (n->qs_created) {
6547              return NVME_CMD_SEQ_ERROR | NVME_DNR;
6548          }
6549  
6550          /*
6551           * NVMe v1.3, Section 5.21.1.7: FFFFh is not an allowed value for NCQR
6552           * and NSQR.
6553           */
6554          if ((dw11 & 0xffff) == 0xffff || ((dw11 >> 16) & 0xffff) == 0xffff) {
6555              return NVME_INVALID_FIELD | NVME_DNR;
6556          }
6557  
6558          trace_pci_nvme_setfeat_numq((dw11 & 0xffff) + 1,
6559                                      ((dw11 >> 16) & 0xffff) + 1,
6560                                      n->conf_ioqpairs,
6561                                      n->conf_ioqpairs);
6562          req->cqe.result = cpu_to_le32((n->conf_ioqpairs - 1) |
6563                                        ((n->conf_ioqpairs - 1) << 16));
6564          break;
6565      case NVME_ASYNCHRONOUS_EVENT_CONF:
6566          n->features.async_config = dw11;
6567          break;
6568      case NVME_TIMESTAMP:
6569          return nvme_set_feature_timestamp(n, req);
6570      case NVME_HOST_BEHAVIOR_SUPPORT:
6571          status = nvme_h2c(n, (uint8_t *)&n->features.hbs,
6572                            sizeof(n->features.hbs), req);
6573          if (status) {
6574              return status;
6575          }
6576  
6577          for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6578              ns = nvme_ns(n, i);
6579  
6580              if (!ns) {
6581                  continue;
6582              }
6583  
6584              ns->id_ns.nlbaf = ns->nlbaf - 1;
6585              if (!n->features.hbs.lbafee) {
6586                  ns->id_ns.nlbaf = MIN(ns->id_ns.nlbaf, 15);
6587              }
6588          }
6589  
6590          return status;
6591      case NVME_COMMAND_SET_PROFILE:
6592          if (dw11 & 0x1ff) {
6593              trace_pci_nvme_err_invalid_iocsci(dw11 & 0x1ff);
6594              return NVME_CMD_SET_CMB_REJECTED | NVME_DNR;
6595          }
6596          break;
6597      case NVME_FDP_MODE:
6598          /* spec: abort with cmd seq err if one or more namespaces exist in the endgrp */
6599          return NVME_CMD_SEQ_ERROR | NVME_DNR;
6600      case NVME_FDP_EVENTS:
6601          return nvme_set_feature_fdp_events(n, ns, req);
6602      case NVME_WRITE_ATOMICITY:
6603  
6604          n->dn = 0x1 & dw11;
6605  
6606          if (n->dn) {
6607              atomic->atomic_max_write_size = le16_to_cpu(id->awupf) + 1;
6608          } else {
6609              atomic->atomic_max_write_size = le16_to_cpu(id->awun) + 1;
6610          }
6611  
6612          if (atomic->atomic_max_write_size == 1) {
6613              atomic->atomic_writes = 0;
6614          } else {
6615              atomic->atomic_writes = 1;
6616          }
6617          break;
6618      default:
6619          return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
6620      }
6621      return NVME_SUCCESS;
6622  }
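/*
 * Annotation (not part of the original source): sketch of the Write
 * Atomicity Normal handling in nvme_set_feature() above. DN (cdw11 bit 0)
 * selects which 0's-based identify field bounds an atomic write, and atomic
 * tracking is only armed when that bound covers more than one logical block.
 * The sample AWUN/AWUPF values are hypothetical:
 *
 *     // id->awun = 7, id->awupf = 0 (both 0's based)
 *     // DN clear -> atomic_max_write_size = 7 + 1 = 8 LBAs, atomic_writes = 1
 *     // DN set   -> atomic_max_write_size = 0 + 1 = 1 LBA,  atomic_writes = 0
 */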
6623  
6624  static uint16_t nvme_aer(NvmeCtrl *n, NvmeRequest *req)
6625  {
6626      trace_pci_nvme_aer(nvme_cid(req));
6627  
6628      if (n->outstanding_aers > n->params.aerl) {
6629          trace_pci_nvme_aer_aerl_exceeded();
6630          return NVME_AER_LIMIT_EXCEEDED;
6631      }
6632  
6633      n->aer_reqs[n->outstanding_aers] = req;
6634      n->outstanding_aers++;
6635  
6636      if (!QTAILQ_EMPTY(&n->aer_queue)) {
6637          nvme_process_aers(n);
6638      }
6639  
6640      return NVME_NO_COMPLETE;
6641  }
6642  
6643  static void nvme_update_dmrsl(NvmeCtrl *n)
6644  {
6645      int nsid;
6646  
6647      for (nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) {
6648          NvmeNamespace *ns = nvme_ns(n, nsid);
6649          if (!ns) {
6650              continue;
6651          }
6652  
6653          n->dmrsl = MIN_NON_ZERO(n->dmrsl,
6654                                  BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
6655      }
6656  }
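/*
 * Annotation (not part of the original source): sketch of the DMRSL
 * recalculation in nvme_update_dmrsl() above. The limit is the smallest
 * non-zero per-namespace value of BDRV_REQUEST_MAX_BYTES divided by the
 * namespace's logical block size. The block sizes below are hypothetical:
 *
 *     // ns A: 512-byte LBAs  -> BDRV_REQUEST_MAX_BYTES / 512
 *     // ns B: 4096-byte LBAs -> BDRV_REQUEST_MAX_BYTES / 4096  (smaller)
 *     // n->dmrsl ends up as the namespace-B value; MIN_NON_ZERO() keeps a
 *     // previously unset (zero) dmrsl from winning the comparison.
 */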
6657  
6658  static void nvme_select_iocs_ns(NvmeCtrl *n, NvmeNamespace *ns)
6659  {
6660      uint32_t cc = ldl_le_p(&n->bar.cc);
6661  
6662      ns->iocs = nvme_cse_iocs_none;
6663      switch (ns->csi) {
6664      case NVME_CSI_NVM:
6665          if (NVME_CC_CSS(cc) != NVME_CC_CSS_ADMIN_ONLY) {
6666              ns->iocs = nvme_cse_iocs_nvm;
6667          }
6668          break;
6669      case NVME_CSI_ZONED:
6670          if (NVME_CC_CSS(cc) == NVME_CC_CSS_CSI) {
6671              ns->iocs = nvme_cse_iocs_zoned;
6672          } else if (NVME_CC_CSS(cc) == NVME_CC_CSS_NVM) {
6673              ns->iocs = nvme_cse_iocs_nvm;
6674          }
6675          break;
6676      }
6677  }
6678  
6679  static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req)
6680  {
6681      NvmeNamespace *ns;
6682      NvmeCtrl *ctrl;
6683      uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
6684      uint32_t nsid = le32_to_cpu(req->cmd.nsid);
6685      uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
6686      uint8_t sel = dw10 & 0xf;
6687      uint16_t *nr_ids = &list[0];
6688      uint16_t *ids = &list[1];
6689      uint16_t ret;
6690      int i;
6691  
6692      trace_pci_nvme_ns_attachment(nvme_cid(req), dw10 & 0xf);
6693  
6694      if (!nvme_nsid_valid(n, nsid)) {
6695          return NVME_INVALID_NSID | NVME_DNR;
6696      }
6697  
6698      ns = nvme_subsys_ns(n->subsys, nsid);
6699      if (!ns) {
6700          return NVME_INVALID_FIELD | NVME_DNR;
6701      }
6702  
6703      ret = nvme_h2c(n, (uint8_t *)list, 4096, req);
6704      if (ret) {
6705          return ret;
6706      }
6707  
6708      if (!*nr_ids) {
6709          return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
6710      }
6711  
6712      *nr_ids = MIN(*nr_ids, NVME_CONTROLLER_LIST_SIZE - 1);
6713      for (i = 0; i < *nr_ids; i++) {
6714          ctrl = nvme_subsys_ctrl(n->subsys, ids[i]);
6715          if (!ctrl) {
6716              return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
6717          }
6718  
6719          switch (sel) {
6720          case NVME_NS_ATTACHMENT_ATTACH:
6721              if (nvme_ns(ctrl, nsid)) {
6722                  return NVME_NS_ALREADY_ATTACHED | NVME_DNR;
6723              }
6724  
6725              if (ns->attached && !ns->params.shared) {
6726                  return NVME_NS_PRIVATE | NVME_DNR;
6727              }
6728  
6729              nvme_attach_ns(ctrl, ns);
6730              nvme_select_iocs_ns(ctrl, ns);
6731  
6732              break;
6733  
6734          case NVME_NS_ATTACHMENT_DETACH:
6735              if (!nvme_ns(ctrl, nsid)) {
6736                  return NVME_NS_NOT_ATTACHED | NVME_DNR;
6737              }
6738  
6739              ctrl->namespaces[nsid] = NULL;
6740              ns->attached--;
6741  
6742              nvme_update_dmrsl(ctrl);
6743  
6744              break;
6745  
6746          default:
6747              return NVME_INVALID_FIELD | NVME_DNR;
6748          }
6749  
6750          /*
6751           * Add namespace id to the changed namespace id list for event clearing
6752           * via Get Log Page command.
6753           */
6754          if (!test_and_set_bit(nsid, ctrl->changed_nsids)) {
6755              nvme_enqueue_event(ctrl, NVME_AER_TYPE_NOTICE,
6756                                 NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED,
6757                                 NVME_LOG_CHANGED_NSLIST);
6758          }
6759      }
6760  
6761      return NVME_SUCCESS;
6762  }
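/*
 * Annotation (not part of the original source): sketch of the controller
 * list consumed by nvme_ns_attachment() above. The host transfers a 4096
 * byte buffer whose first 16-bit word is the number of entries, followed by
 * the controller identifiers; the count is clamped to
 * NVME_CONTROLLER_LIST_SIZE - 1. The example contents are hypothetical:
 *
 *     uint16_t list[NVME_CONTROLLER_LIST_SIZE] = { 2, 0x0001, 0x0002 };
 *     // nr_ids = 2; the attach/detach action in cdw10[3:0] is applied to
 *     // controllers 0x1 and 0x2 in turn.
 */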
6763  
6764  typedef struct NvmeFormatAIOCB {
6765      BlockAIOCB common;
6766      BlockAIOCB *aiocb;
6767      NvmeRequest *req;
6768      int ret;
6769  
6770      NvmeNamespace *ns;
6771      uint32_t nsid;
6772      bool broadcast;
6773      int64_t offset;
6774  
6775      uint8_t lbaf;
6776      uint8_t mset;
6777      uint8_t pi;
6778      uint8_t pil;
6779  } NvmeFormatAIOCB;
6780  
6781  static void nvme_format_cancel(BlockAIOCB *aiocb)
6782  {
6783      NvmeFormatAIOCB *iocb = container_of(aiocb, NvmeFormatAIOCB, common);
6784  
6785      iocb->ret = -ECANCELED;
6786  
6787      if (iocb->aiocb) {
6788          blk_aio_cancel_async(iocb->aiocb);
6789          iocb->aiocb = NULL;
6790      }
6791  }
6792  
6793  static const AIOCBInfo nvme_format_aiocb_info = {
6794      .aiocb_size = sizeof(NvmeFormatAIOCB),
6795      .cancel_async = nvme_format_cancel,
6796  };
6797  
6798  static void nvme_format_set(NvmeNamespace *ns, uint8_t lbaf, uint8_t mset,
6799                              uint8_t pi, uint8_t pil)
6800  {
6801      uint8_t lbafl = lbaf & 0xf;
6802      uint8_t lbafu = lbaf >> 4;
6803  
6804      trace_pci_nvme_format_set(ns->params.nsid, lbaf, mset, pi, pil);
6805  
6806      ns->id_ns.dps = (pil << 3) | pi;
6807      ns->id_ns.flbas = (lbafu << 5) | (mset << 4) | lbafl;
6808  
6809      nvme_ns_init_format(ns);
6810  }
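/*
 * Annotation (not part of the original source): sketch of the FLBAS/DPS
 * packing done by nvme_format_set() above, mirroring the shifts in the
 * function body. The example field values are hypothetical:
 *
 *     // lbaf = 0x12 (upper nibble 0x1, lower nibble 0x2), mset = 1,
 *     // pi = 1 (Type 1), pil = 0
 *     flbas = (0x1 << 5) | (1 << 4) | 0x2;   // = 0x32
 *     dps   = (0 << 3) | 1;                  // = 0x01
 */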
6811  
6812  static void nvme_do_format(NvmeFormatAIOCB *iocb);
6813  
6814  static void nvme_format_ns_cb(void *opaque, int ret)
6815  {
6816      NvmeFormatAIOCB *iocb = opaque;
6817      NvmeNamespace *ns = iocb->ns;
6818      int bytes;
6819  
6820      if (iocb->ret < 0) {
6821          goto done;
6822      } else if (ret < 0) {
6823          iocb->ret = ret;
6824          goto done;
6825      }
6826  
6827      assert(ns);
6828  
6829      if (iocb->offset < ns->size) {
6830          bytes = MIN(BDRV_REQUEST_MAX_BYTES, ns->size - iocb->offset);
6831  
6832          iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, iocb->offset,
6833                                              bytes, BDRV_REQ_MAY_UNMAP,
6834                                              nvme_format_ns_cb, iocb);
6835  
6836          iocb->offset += bytes;
6837          return;
6838      }
6839  
6840      nvme_format_set(ns, iocb->lbaf, iocb->mset, iocb->pi, iocb->pil);
6841      ns->status = 0x0;
6842      iocb->ns = NULL;
6843      iocb->offset = 0;
6844  
6845  done:
6846      nvme_do_format(iocb);
6847  }
6848  
6849  static uint16_t nvme_format_check(NvmeNamespace *ns, uint8_t lbaf, uint8_t pi)
6850  {
6851      if (ns->params.zoned) {
6852          return NVME_INVALID_FORMAT | NVME_DNR;
6853      }
6854  
6855      if (lbaf > ns->id_ns.nlbaf) {
6856          return NVME_INVALID_FORMAT | NVME_DNR;
6857      }
6858  
6859      if (pi && (ns->id_ns.lbaf[lbaf].ms < nvme_pi_tuple_size(ns))) {
6860          return NVME_INVALID_FORMAT | NVME_DNR;
6861      }
6862  
6863      if (pi && pi > NVME_ID_NS_DPS_TYPE_3) {
6864          return NVME_INVALID_FIELD | NVME_DNR;
6865      }
6866  
6867      return NVME_SUCCESS;
6868  }
6869  
6870  static void nvme_do_format(NvmeFormatAIOCB *iocb)
6871  {
6872      NvmeRequest *req = iocb->req;
6873      NvmeCtrl *n = nvme_ctrl(req);
6874      uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
6875      uint8_t lbaf = dw10 & 0xf;
6876      uint8_t pi = (dw10 >> 5) & 0x7;
6877      uint16_t status;
6878      int i;
6879  
6880      if (iocb->ret < 0) {
6881          goto done;
6882      }
6883  
6884      if (iocb->broadcast) {
6885          for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
6886              iocb->ns = nvme_ns(n, i);
6887              if (iocb->ns) {
6888                  iocb->nsid = i;
6889                  break;
6890              }
6891          }
6892      }
6893  
6894      if (!iocb->ns) {
6895          goto done;
6896      }
6897  
6898      status = nvme_format_check(iocb->ns, lbaf, pi);
6899      if (status) {
6900          req->status = status;
6901          goto done;
6902      }
6903  
6904      iocb->ns->status = NVME_FORMAT_IN_PROGRESS;
6905      nvme_format_ns_cb(iocb, 0);
6906      return;
6907  
6908  done:
6909      iocb->common.cb(iocb->common.opaque, iocb->ret);
6910      qemu_aio_unref(iocb);
6911  }
6912  
6913  static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest *req)
6914  {
6915      NvmeFormatAIOCB *iocb;
6916      uint32_t nsid = le32_to_cpu(req->cmd.nsid);
6917      uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
6918      uint8_t lbaf = dw10 & 0xf;
6919      uint8_t mset = (dw10 >> 4) & 0x1;
6920      uint8_t pi = (dw10 >> 5) & 0x7;
6921      uint8_t pil = (dw10 >> 8) & 0x1;
6922      uint8_t lbafu = (dw10 >> 12) & 0x3;
6923      uint16_t status;
6924  
6925      iocb = qemu_aio_get(&nvme_format_aiocb_info, NULL, nvme_misc_cb, req);
6926  
6927      iocb->req = req;
6928      iocb->ret = 0;
6929      iocb->ns = NULL;
6930      iocb->nsid = 0;
6931      iocb->lbaf = lbaf;
6932      iocb->mset = mset;
6933      iocb->pi = pi;
6934      iocb->pil = pil;
6935      iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
6936      iocb->offset = 0;
6937  
6938      if (n->features.hbs.lbafee) {
6939          iocb->lbaf |= lbafu << 4;
6940      }
6941  
6942      if (!iocb->broadcast) {
6943          if (!nvme_nsid_valid(n, nsid)) {
6944              status = NVME_INVALID_NSID | NVME_DNR;
6945              goto out;
6946          }
6947  
6948          iocb->ns = nvme_ns(n, nsid);
6949          if (!iocb->ns) {
6950              status = NVME_INVALID_FIELD | NVME_DNR;
6951              goto out;
6952          }
6953      }
6954  
6955      req->aiocb = &iocb->common;
6956      nvme_do_format(iocb);
6957  
6958      return NVME_NO_COMPLETE;
6959  
6960  out:
6961      qemu_aio_unref(iocb);
6962  
6963      return status;
6964  }
6965  
6966  static void nvme_get_virt_res_num(NvmeCtrl *n, uint8_t rt, int *num_total,
6967                                    int *num_prim, int *num_sec)
6968  {
6969      *num_total = le32_to_cpu(rt ?
6970                               n->pri_ctrl_cap.vifrt : n->pri_ctrl_cap.vqfrt);
6971      *num_prim = le16_to_cpu(rt ?
6972                              n->pri_ctrl_cap.virfap : n->pri_ctrl_cap.vqrfap);
6973      *num_sec = le16_to_cpu(rt ? n->pri_ctrl_cap.virfa : n->pri_ctrl_cap.vqrfa);
6974  }
6975  
6976  static uint16_t nvme_assign_virt_res_to_prim(NvmeCtrl *n, NvmeRequest *req,
6977                                               uint16_t cntlid, uint8_t rt,
6978                                               int nr)
6979  {
6980      int num_total, num_prim, num_sec;
6981  
6982      if (cntlid != n->cntlid) {
6983          return NVME_INVALID_CTRL_ID | NVME_DNR;
6984      }
6985  
6986      nvme_get_virt_res_num(n, rt, &num_total, &num_prim, &num_sec);
6987  
6988      if (nr > num_total) {
6989          return NVME_INVALID_NUM_RESOURCES | NVME_DNR;
6990      }
6991  
6992      if (nr > num_total - num_sec) {
6993          return NVME_INVALID_RESOURCE_ID | NVME_DNR;
6994      }
6995  
6996      if (rt) {
6997          n->next_pri_ctrl_cap.virfap = cpu_to_le16(nr);
6998      } else {
6999          n->next_pri_ctrl_cap.vqrfap = cpu_to_le16(nr);
7000      }
7001  
7002      req->cqe.result = cpu_to_le32(nr);
7003      return req->status;
7004  }
7005  
7006  static void nvme_update_virt_res(NvmeCtrl *n, NvmeSecCtrlEntry *sctrl,
7007                                   uint8_t rt, int nr)
7008  {
7009      int prev_nr, prev_total;
7010  
7011      if (rt) {
7012          prev_nr = le16_to_cpu(sctrl->nvi);
7013          prev_total = le32_to_cpu(n->pri_ctrl_cap.virfa);
7014          sctrl->nvi = cpu_to_le16(nr);
7015          n->pri_ctrl_cap.virfa = cpu_to_le32(prev_total + nr - prev_nr);
7016      } else {
7017          prev_nr = le16_to_cpu(sctrl->nvq);
7018          prev_total = le32_to_cpu(n->pri_ctrl_cap.vqrfa);
7019          sctrl->nvq = cpu_to_le16(nr);
7020          n->pri_ctrl_cap.vqrfa = cpu_to_le32(prev_total + nr - prev_nr);
7021      }
7022  }
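/*
 * Annotation (not part of the original source): sketch of the flexible
 * resource accounting in nvme_update_virt_res() above. The primary
 * controller's "assigned to secondaries" counter (virfa/vqrfa) is adjusted
 * by the delta between the new and the previous per-secondary allocation.
 * The example numbers are hypothetical:
 *
 *     // secondary previously held nvq = 2, total vqrfa = 6
 *     // new assignment nr = 5 -> vqrfa = 6 + (5 - 2) = 9, nvq = 5
 */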
7023  
7024  static uint16_t nvme_assign_virt_res_to_sec(NvmeCtrl *n, NvmeRequest *req,
7025                                              uint16_t cntlid, uint8_t rt, int nr)
7026  {
7027      int num_total, num_prim, num_sec, num_free, diff, limit;
7028      NvmeSecCtrlEntry *sctrl;
7029  
7030      sctrl = nvme_sctrl_for_cntlid(n, cntlid);
7031      if (!sctrl) {
7032          return NVME_INVALID_CTRL_ID | NVME_DNR;
7033      }
7034  
7035      if (sctrl->scs) {
7036          return NVME_INVALID_SEC_CTRL_STATE | NVME_DNR;
7037      }
7038  
7039      limit = le16_to_cpu(rt ? n->pri_ctrl_cap.vifrsm : n->pri_ctrl_cap.vqfrsm);
7040      if (nr > limit) {
7041          return NVME_INVALID_NUM_RESOURCES | NVME_DNR;
7042      }
7043  
7044      nvme_get_virt_res_num(n, rt, &num_total, &num_prim, &num_sec);
7045      num_free = num_total - num_prim - num_sec;
7046      diff = nr - le16_to_cpu(rt ? sctrl->nvi : sctrl->nvq);
7047  
7048      if (diff > num_free) {
7049          return NVME_INVALID_RESOURCE_ID | NVME_DNR;
7050      }
7051  
7052      nvme_update_virt_res(n, sctrl, rt, nr);
7053      req->cqe.result = cpu_to_le32(nr);
7054  
7055      return req->status;
7056  }
7057  
7058  static uint16_t nvme_virt_set_state(NvmeCtrl *n, uint16_t cntlid, bool online)
7059  {
7060      PCIDevice *pci = PCI_DEVICE(n);
7061      NvmeCtrl *sn = NULL;
7062      NvmeSecCtrlEntry *sctrl;
7063      int vf_index;
7064  
7065      sctrl = nvme_sctrl_for_cntlid(n, cntlid);
7066      if (!sctrl) {
7067          return NVME_INVALID_CTRL_ID | NVME_DNR;
7068      }
7069  
7070      if (!pci_is_vf(pci)) {
7071          vf_index = le16_to_cpu(sctrl->vfn) - 1;
7072          sn = NVME(pcie_sriov_get_vf_at_index(pci, vf_index));
7073      }
7074  
7075      if (online) {
7076          if (!sctrl->nvi || (le16_to_cpu(sctrl->nvq) < 2) || !sn) {
7077              return NVME_INVALID_SEC_CTRL_STATE | NVME_DNR;
7078          }
7079  
7080          if (!sctrl->scs) {
7081              sctrl->scs = 0x1;
7082              nvme_ctrl_reset(sn, NVME_RESET_FUNCTION);
7083          }
7084      } else {
7085          nvme_update_virt_res(n, sctrl, NVME_VIRT_RES_INTERRUPT, 0);
7086          nvme_update_virt_res(n, sctrl, NVME_VIRT_RES_QUEUE, 0);
7087  
7088          if (sctrl->scs) {
7089              sctrl->scs = 0x0;
7090              if (sn) {
7091                  nvme_ctrl_reset(sn, NVME_RESET_FUNCTION);
7092              }
7093          }
7094      }
7095  
7096      return NVME_SUCCESS;
7097  }
7098  
7099  static uint16_t nvme_virt_mngmt(NvmeCtrl *n, NvmeRequest *req)
7100  {
7101      uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
7102      uint32_t dw11 = le32_to_cpu(req->cmd.cdw11);
7103      uint8_t act = dw10 & 0xf;
7104      uint8_t rt = (dw10 >> 8) & 0x7;
7105      uint16_t cntlid = (dw10 >> 16) & 0xffff;
7106      int nr = dw11 & 0xffff;
7107  
7108      trace_pci_nvme_virt_mngmt(nvme_cid(req), act, cntlid, rt ? "VI" : "VQ", nr);
7109  
7110      if (rt != NVME_VIRT_RES_QUEUE && rt != NVME_VIRT_RES_INTERRUPT) {
7111          return NVME_INVALID_RESOURCE_ID | NVME_DNR;
7112      }
7113  
7114      switch (act) {
7115      case NVME_VIRT_MNGMT_ACTION_SEC_ASSIGN:
7116          return nvme_assign_virt_res_to_sec(n, req, cntlid, rt, nr);
7117      case NVME_VIRT_MNGMT_ACTION_PRM_ALLOC:
7118          return nvme_assign_virt_res_to_prim(n, req, cntlid, rt, nr);
7119      case NVME_VIRT_MNGMT_ACTION_SEC_ONLINE:
7120          return nvme_virt_set_state(n, cntlid, true);
7121      case NVME_VIRT_MNGMT_ACTION_SEC_OFFLINE:
7122          return nvme_virt_set_state(n, cntlid, false);
7123      default:
7124          return NVME_INVALID_FIELD | NVME_DNR;
7125      }
7126  }
7127  
7128  static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
7129  {
7130      PCIDevice *pci = PCI_DEVICE(n);
7131      uint64_t dbs_addr = le64_to_cpu(req->cmd.dptr.prp1);
7132      uint64_t eis_addr = le64_to_cpu(req->cmd.dptr.prp2);
7133      int i;
7134  
7135      /* Address should be page aligned */
7136      if (dbs_addr & (n->page_size - 1) || eis_addr & (n->page_size - 1)) {
7137          return NVME_INVALID_FIELD | NVME_DNR;
7138      }
7139  
7140      /* Save shadow buffer base addr for use during queue creation */
7141      n->dbbuf_dbs = dbs_addr;
7142      n->dbbuf_eis = eis_addr;
7143      n->dbbuf_enabled = true;
7144  
7145      for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
7146          NvmeSQueue *sq = n->sq[i];
7147          NvmeCQueue *cq = n->cq[i];
7148  
7149          if (sq) {
7150              /*
7151               * CAP.DSTRD is 0, so offset of ith sq db_addr is (i<<3).
7152               * nvme_process_db() uses this hard-coded way to calculate
7153               * doorbell offsets. Be consistent with that here.
7154               */
7155              sq->db_addr = dbs_addr + (i << 3);
7156              sq->ei_addr = eis_addr + (i << 3);
7157              stl_le_pci_dma(pci, sq->db_addr, sq->tail, MEMTXATTRS_UNSPECIFIED);
7158  
7159              if (n->params.ioeventfd && sq->sqid != 0) {
7160                  if (!nvme_init_sq_ioeventfd(sq)) {
7161                      sq->ioeventfd_enabled = true;
7162                  }
7163              }
7164          }
7165  
7166          if (cq) {
7167              /* CAP.DSTRD is 0, so offset of ith cq db_addr is (i<<3)+(1<<2) */
7168              cq->db_addr = dbs_addr + (i << 3) + (1 << 2);
7169              cq->ei_addr = eis_addr + (i << 3) + (1 << 2);
7170              stl_le_pci_dma(pci, cq->db_addr, cq->head, MEMTXATTRS_UNSPECIFIED);
7171  
7172              if (n->params.ioeventfd && cq->cqid != 0) {
7173                  if (!nvme_init_cq_ioeventfd(cq)) {
7174                      cq->ioeventfd_enabled = true;
7175                  }
7176              }
7177          }
7178      }
7179  
7180      trace_pci_nvme_dbbuf_config(dbs_addr, eis_addr);
7181  
7182      return NVME_SUCCESS;
7183  }
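/*
 * Annotation (not part of the original source): sketch of the shadow
 * doorbell layout programmed by nvme_dbbuf_config() above. With CAP.DSTRD
 * equal to 0, queue pair i uses an 8-byte stride: the SQ tail doorbell lives
 * at dbs_addr + (i << 3) and the CQ head doorbell 4 bytes after it, with the
 * matching event-index slots at the same offsets from eis_addr. Example
 * offsets:
 *
 *     // i = 2: SQ tail shadow at dbs_addr + 16, CQ head shadow at
 *     //        dbs_addr + 20; event indexes at eis_addr + 16 / + 20.
 */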
7184  
7185  static uint16_t nvme_directive_send(NvmeCtrl *n, NvmeRequest *req)
7186  {
7187      return NVME_INVALID_FIELD | NVME_DNR;
7188  }
7189  
7190  static uint16_t nvme_directive_receive(NvmeCtrl *n, NvmeRequest *req)
7191  {
7192      NvmeNamespace *ns;
7193      uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
7194      uint32_t dw11 = le32_to_cpu(req->cmd.cdw11);
7195      uint32_t nsid = le32_to_cpu(req->cmd.nsid);
7196      uint8_t doper, dtype;
7197      uint32_t numd, trans_len;
7198      NvmeDirectiveIdentify id = {
7199          .supported = 1 << NVME_DIRECTIVE_IDENTIFY,
7200          .enabled = 1 << NVME_DIRECTIVE_IDENTIFY,
7201      };
7202  
7203      numd = dw10 + 1;
7204      doper = dw11 & 0xff;
7205      dtype = (dw11 >> 8) & 0xff;
7206  
7207      trans_len = MIN(sizeof(NvmeDirectiveIdentify), numd << 2);
7208  
7209      if (nsid == NVME_NSID_BROADCAST || dtype != NVME_DIRECTIVE_IDENTIFY ||
7210          doper != NVME_DIRECTIVE_RETURN_PARAMS) {
7211          return NVME_INVALID_FIELD | NVME_DNR;
7212      }
7213  
7214      ns = nvme_ns(n, nsid);
7215      if (!ns) {
7216          return NVME_INVALID_FIELD | NVME_DNR;
7217      }
7218  
7219      switch (dtype) {
7220      case NVME_DIRECTIVE_IDENTIFY:
7221          switch (doper) {
7222          case NVME_DIRECTIVE_RETURN_PARAMS:
7223              if (ns->endgrp && ns->endgrp->fdp.enabled) {
7224                  id.supported |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT;
7225                  id.enabled |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT;
7226                  id.persistent |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT;
7227              }
7228  
7229              return nvme_c2h(n, (uint8_t *)&id, trans_len, req);
7230  
7231          default:
7232              return NVME_INVALID_FIELD | NVME_DNR;
7233          }
7234  
7235      default:
7236          return NVME_INVALID_FIELD;
7237      }
7238  }
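/*
 * Annotation (not part of the original source): sketch of the transfer-size
 * computation in nvme_directive_receive() above. NUMD (cdw10) is a 0's-based
 * dword count, so the number of bytes returned is capped at both the
 * requested length and the size of the Identify directive parameters:
 *
 *     // cdw10 = 0    -> numd = 1 dword     -> trans_len = MIN(sizeof(id), 4)
 *     // cdw10 = 1023 -> numd = 1024 dwords -> trans_len = MIN(sizeof(id), 4096)
 */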
7239  
7240  static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
7241  {
7242      trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode,
7243                               nvme_adm_opc_str(req->cmd.opcode));
7244  
7245      if (!(nvme_cse_acs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
7246          trace_pci_nvme_err_invalid_admin_opc(req->cmd.opcode);
7247          return NVME_INVALID_OPCODE | NVME_DNR;
7248      }
7249  
7250      /* SGLs shall not be used for Admin commands in NVMe over PCIe */
7251      if (NVME_CMD_FLAGS_PSDT(req->cmd.flags) != NVME_PSDT_PRP) {
7252          return NVME_INVALID_FIELD | NVME_DNR;
7253      }
7254  
7255      if (NVME_CMD_FLAGS_FUSE(req->cmd.flags)) {
7256          return NVME_INVALID_FIELD;
7257      }
7258  
7259      switch (req->cmd.opcode) {
7260      case NVME_ADM_CMD_DELETE_SQ:
7261          return nvme_del_sq(n, req);
7262      case NVME_ADM_CMD_CREATE_SQ:
7263          return nvme_create_sq(n, req);
7264      case NVME_ADM_CMD_GET_LOG_PAGE:
7265          return nvme_get_log(n, req);
7266      case NVME_ADM_CMD_DELETE_CQ:
7267          return nvme_del_cq(n, req);
7268      case NVME_ADM_CMD_CREATE_CQ:
7269          return nvme_create_cq(n, req);
7270      case NVME_ADM_CMD_IDENTIFY:
7271          return nvme_identify(n, req);
7272      case NVME_ADM_CMD_ABORT:
7273          return nvme_abort(n, req);
7274      case NVME_ADM_CMD_SET_FEATURES:
7275          return nvme_set_feature(n, req);
7276      case NVME_ADM_CMD_GET_FEATURES:
7277          return nvme_get_feature(n, req);
7278      case NVME_ADM_CMD_ASYNC_EV_REQ:
7279          return nvme_aer(n, req);
7280      case NVME_ADM_CMD_NS_ATTACHMENT:
7281          return nvme_ns_attachment(n, req);
7282      case NVME_ADM_CMD_VIRT_MNGMT:
7283          return nvme_virt_mngmt(n, req);
7284      case NVME_ADM_CMD_DBBUF_CONFIG:
7285          return nvme_dbbuf_config(n, req);
7286      case NVME_ADM_CMD_FORMAT_NVM:
7287          return nvme_format(n, req);
7288      case NVME_ADM_CMD_DIRECTIVE_SEND:
7289          return nvme_directive_send(n, req);
7290      case NVME_ADM_CMD_DIRECTIVE_RECV:
7291          return nvme_directive_receive(n, req);
7292      default:
7293          g_assert_not_reached();
7294      }
7295  
7296      return NVME_INVALID_OPCODE | NVME_DNR;
7297  }
7298  
7299  static void nvme_update_sq_eventidx(const NvmeSQueue *sq)
7300  {
7301      trace_pci_nvme_update_sq_eventidx(sq->sqid, sq->tail);
7302  
7303      stl_le_pci_dma(PCI_DEVICE(sq->ctrl), sq->ei_addr, sq->tail,
7304                     MEMTXATTRS_UNSPECIFIED);
7305  }
7306  
7307  static void nvme_update_sq_tail(NvmeSQueue *sq)
7308  {
7309      ldl_le_pci_dma(PCI_DEVICE(sq->ctrl), sq->db_addr, &sq->tail,
7310                     MEMTXATTRS_UNSPECIFIED);
7311  
7312      trace_pci_nvme_update_sq_tail(sq->sqid, sq->tail);
7313  }
7314  
7315  #define NVME_ATOMIC_NO_START        0
7316  #define NVME_ATOMIC_START_ATOMIC    1
7317  #define NVME_ATOMIC_START_NONATOMIC 2
7318  
7319  static int nvme_atomic_write_check(NvmeCtrl *n, NvmeCmd *cmd,
7320      NvmeAtomic *atomic)
7321  {
7322      NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
7323      uint64_t slba = le64_to_cpu(rw->slba);
7324      uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb);
7325      uint64_t elba = slba + nlb;
7326      bool cmd_atomic_wr = true;
7327      int i;
7328  
7329      if ((cmd->opcode == NVME_CMD_READ) || ((cmd->opcode == NVME_CMD_WRITE) &&
7330          ((rw->nlb + 1) > atomic->atomic_max_write_size))) {
7331          cmd_atomic_wr = false;
7332      }
7333  
7334      /*
7335       * Walk the queues to see if there are any atomic conflicts.
7336       */
7337      for (i = 1; i < n->params.max_ioqpairs + 1; i++) {
7338          NvmeSQueue *sq;
7339          NvmeRequest *req;
7340          NvmeRwCmd *req_rw;
7341          uint64_t req_slba;
7342          uint32_t req_nlb;
7343          uint64_t req_elba;
7344  
7345          sq = n->sq[i];
7346          if (!sq) {
7347              continue;
7348          }
7349  
7350          /*
7351           * Walk all the requests on a given queue.
7352           */
7353          QTAILQ_FOREACH(req, &sq->out_req_list, entry) {
7354              req_rw = (NvmeRwCmd *)&req->cmd;
7355  
7356              if (((req_rw->opcode == NVME_CMD_WRITE) ||
7357                   (req_rw->opcode == NVME_CMD_READ)) &&
7358                  (cmd->nsid == req->ns->params.nsid)) {
7359                  req_slba = le64_to_cpu(req_rw->slba);
7360                  req_nlb = (uint32_t)le16_to_cpu(req_rw->nlb);
7361                  req_elba = req_slba + req_nlb;
7362  
7363                  if (cmd_atomic_wr) {
7364                      if ((elba >= req_slba) && (slba <= req_elba)) {
7365                          return NVME_ATOMIC_NO_START;
7366                      }
7367                  } else {
7368                      if (req->atomic_write && ((elba >= req_slba) &&
7369                          (slba <= req_elba))) {
7370                          return NVME_ATOMIC_NO_START;
7371                      }
7372                  }
7373              }
7374          }
7375      }
7376      if (cmd_atomic_wr) {
7377          return NVME_ATOMIC_START_ATOMIC;
7378      }
7379      return NVME_ATOMIC_START_NONATOMIC;
7380  }
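/*
 * Annotation (not part of the original source): sketch of the LBA-range
 * overlap test used by nvme_atomic_write_check() above. The incoming command
 * spans [slba, elba] and each outstanding read/write spans
 * [req_slba, req_elba]; the two ranges collide when
 * elba >= req_slba && slba <= req_elba. The example values are hypothetical:
 *
 *     // outstanding write: req_slba = 100, req_nlb = 7  -> req_elba = 107
 *     // incoming command:  slba = 104, nlb = 15         -> elba = 119
 *     // 119 >= 100 && 104 <= 107 -> overlap; if the atomic side is involved,
 *     // the check returns NVME_ATOMIC_NO_START and the SQ is retried later.
 */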
7381  
7382  static NvmeAtomic *nvme_get_atomic(NvmeCtrl *n, NvmeCmd *cmd)
7383  {
7384      if (n->atomic.atomic_writes) {
7385          return &n->atomic;
7386      }
7387      return NULL;
7388  }
7389  
7390  static void nvme_process_sq(void *opaque)
7391  {
7392      NvmeSQueue *sq = opaque;
7393      NvmeCtrl *n = sq->ctrl;
7394      NvmeCQueue *cq = n->cq[sq->cqid];
7395  
7396      uint16_t status;
7397      hwaddr addr;
7398      NvmeCmd cmd;
7399      NvmeRequest *req;
7400  
7401      if (n->dbbuf_enabled) {
7402          nvme_update_sq_tail(sq);
7403      }
7404  
7405      while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
7406          NvmeAtomic *atomic;
7407          bool cmd_is_atomic;
7408  
7409          addr = sq->dma_addr + (sq->head << NVME_SQES);
7410          if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) {
7411              trace_pci_nvme_err_addr_read(addr);
7412              trace_pci_nvme_err_cfs();
7413              stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
7414              break;
7415          }
7416  
7417          atomic = nvme_get_atomic(n, &cmd);
7418  
7419          cmd_is_atomic = false;
7420          if (sq->sqid && atomic) {
7421              int ret;
7422  
7423              ret = nvme_atomic_write_check(n, &cmd, atomic);
7424              switch (ret) {
7425              case NVME_ATOMIC_NO_START:
7426                  qemu_bh_schedule(sq->bh);
7427                  return;
7428              case NVME_ATOMIC_START_ATOMIC:
7429                  cmd_is_atomic = true;
7430                  break;
7431              case NVME_ATOMIC_START_NONATOMIC:
7432              default:
7433                  break;
7434              }
7435          }
7436          nvme_inc_sq_head(sq);
7437  
7438          req = QTAILQ_FIRST(&sq->req_list);
7439          QTAILQ_REMOVE(&sq->req_list, req, entry);
7440          QTAILQ_INSERT_TAIL(&sq->out_req_list, req, entry);
7441          nvme_req_clear(req);
7442          req->cqe.cid = cmd.cid;
7443          memcpy(&req->cmd, &cmd, sizeof(NvmeCmd));
7444  
7445          if (sq->sqid && atomic) {
7446              req->atomic_write = cmd_is_atomic;
7447          }
7448  
7449          status = sq->sqid ? nvme_io_cmd(n, req) :
7450              nvme_admin_cmd(n, req);
7451          if (status != NVME_NO_COMPLETE) {
7452              req->status = status;
7453              nvme_enqueue_req_completion(cq, req);
7454          }
7455  
7456          if (n->dbbuf_enabled) {
7457              nvme_update_sq_eventidx(sq);
7458              nvme_update_sq_tail(sq);
7459          }
7460      }
7461  }
7462  
7463  static void nvme_update_msixcap_ts(PCIDevice *pci_dev, uint32_t table_size)
7464  {
7465      uint8_t *config;
7466  
7467      if (!msix_present(pci_dev)) {
7468          return;
7469      }
7470  
7471      assert(table_size > 0 && table_size <= pci_dev->msix_entries_nr);
7472  
7473      config = pci_dev->config + pci_dev->msix_cap;
7474      pci_set_word_by_mask(config + PCI_MSIX_FLAGS, PCI_MSIX_FLAGS_QSIZE,
7475                           table_size - 1);
7476  }
7477  
7478  static void nvme_activate_virt_res(NvmeCtrl *n)
7479  {
7480      PCIDevice *pci_dev = PCI_DEVICE(n);
7481      NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
7482      NvmeSecCtrlEntry *sctrl;
7483  
7484      /* -1 to account for the admin queue */
7485      if (pci_is_vf(pci_dev)) {
7486          sctrl = nvme_sctrl(n);
7487          cap->vqprt = sctrl->nvq;
7488          cap->viprt = sctrl->nvi;
7489          n->conf_ioqpairs = sctrl->nvq ? le16_to_cpu(sctrl->nvq) - 1 : 0;
7490          n->conf_msix_qsize = sctrl->nvi ? le16_to_cpu(sctrl->nvi) : 1;
7491      } else {
7492          cap->vqrfap = n->next_pri_ctrl_cap.vqrfap;
7493          cap->virfap = n->next_pri_ctrl_cap.virfap;
7494          n->conf_ioqpairs = le16_to_cpu(cap->vqprt) +
7495                             le16_to_cpu(cap->vqrfap) - 1;
7496          n->conf_msix_qsize = le16_to_cpu(cap->viprt) +
7497                               le16_to_cpu(cap->virfap);
7498      }
7499  }
7500  
7501  static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst)
7502  {
7503      PCIDevice *pci_dev = PCI_DEVICE(n);
7504      NvmeSecCtrlEntry *sctrl;
7505      NvmeNamespace *ns;
7506      int i;
7507  
7508      for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
7509          ns = nvme_ns(n, i);
7510          if (!ns) {
7511              continue;
7512          }
7513  
7514          nvme_ns_drain(ns);
7515      }
7516  
7517      for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
7518          if (n->sq[i] != NULL) {
7519              nvme_free_sq(n->sq[i], n);
7520          }
7521      }
7522      for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
7523          if (n->cq[i] != NULL) {
7524              nvme_free_cq(n->cq[i], n);
7525          }
7526      }
7527  
7528      while (!QTAILQ_EMPTY(&n->aer_queue)) {
7529          NvmeAsyncEvent *event = QTAILQ_FIRST(&n->aer_queue);
7530          QTAILQ_REMOVE(&n->aer_queue, event, entry);
7531          g_free(event);
7532      }
7533  
7534      if (n->params.sriov_max_vfs) {
7535          if (!pci_is_vf(pci_dev)) {
7536              for (i = 0; i < n->nr_sec_ctrls; i++) {
7537                  sctrl = &n->sec_ctrl_list[i];
7538                  nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false);
7539              }
7540          }
7541  
7542          if (rst != NVME_RESET_CONTROLLER) {
7543              nvme_activate_virt_res(n);
7544          }
7545      }
7546  
7547      n->aer_queued = 0;
7548      n->aer_mask = 0;
7549      n->outstanding_aers = 0;
7550      n->qs_created = false;
7551  
7552      n->dn = n->params.atomic_dn; /* Set Disable Normal */
7553  
7554      nvme_update_msixcap_ts(pci_dev, n->conf_msix_qsize);
7555  
7556      if (pci_is_vf(pci_dev)) {
7557          sctrl = nvme_sctrl(n);
7558  
7559          stl_le_p(&n->bar.csts, sctrl->scs ? 0 : NVME_CSTS_FAILED);
7560      } else {
7561          stl_le_p(&n->bar.csts, 0);
7562      }
7563  
7564      stl_le_p(&n->bar.intms, 0);
7565      stl_le_p(&n->bar.intmc, 0);
7566      stl_le_p(&n->bar.cc, 0);
7567  
7568      n->dbbuf_dbs = 0;
7569      n->dbbuf_eis = 0;
7570      n->dbbuf_enabled = false;
7571  }
7572  
7573  static void nvme_ctrl_shutdown(NvmeCtrl *n)
7574  {
7575      NvmeNamespace *ns;
7576      int i;
7577  
7578      if (n->pmr.dev) {
7579          memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
7580      }
7581  
7582      for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
7583          ns = nvme_ns(n, i);
7584          if (!ns) {
7585              continue;
7586          }
7587  
7588          nvme_ns_shutdown(ns);
7589      }
7590  }
7591  
7592  static void nvme_select_iocs(NvmeCtrl *n)
7593  {
7594      NvmeNamespace *ns;
7595      int i;
7596  
7597      for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
7598          ns = nvme_ns(n, i);
7599          if (!ns) {
7600              continue;
7601          }
7602  
7603          nvme_select_iocs_ns(n, ns);
7604      }
7605  }
7606  
7607  static int nvme_start_ctrl(NvmeCtrl *n)
7608  {
7609      uint64_t cap = ldq_le_p(&n->bar.cap);
7610      uint32_t cc = ldl_le_p(&n->bar.cc);
7611      uint32_t aqa = ldl_le_p(&n->bar.aqa);
7612      uint64_t asq = ldq_le_p(&n->bar.asq);
7613      uint64_t acq = ldq_le_p(&n->bar.acq);
7614      uint32_t page_bits = NVME_CC_MPS(cc) + 12;
7615      uint32_t page_size = 1 << page_bits;
7616      NvmeSecCtrlEntry *sctrl = nvme_sctrl(n);
7617  
7618      if (pci_is_vf(PCI_DEVICE(n)) && !sctrl->scs) {
7619          trace_pci_nvme_err_startfail_virt_state(le16_to_cpu(sctrl->nvi),
7620                                                  le16_to_cpu(sctrl->nvq));
7621          return -1;
7622      }
7623      if (unlikely(n->cq[0])) {
7624          trace_pci_nvme_err_startfail_cq();
7625          return -1;
7626      }
7627      if (unlikely(n->sq[0])) {
7628          trace_pci_nvme_err_startfail_sq();
7629          return -1;
7630      }
7631      if (unlikely(asq & (page_size - 1))) {
7632          trace_pci_nvme_err_startfail_asq_misaligned(asq);
7633          return -1;
7634      }
7635      if (unlikely(acq & (page_size - 1))) {
7636          trace_pci_nvme_err_startfail_acq_misaligned(acq);
7637          return -1;
7638      }
7639      if (unlikely(!(NVME_CAP_CSS(cap) & (1 << NVME_CC_CSS(cc))))) {
7640          trace_pci_nvme_err_startfail_css(NVME_CC_CSS(cc));
7641          return -1;
7642      }
7643      if (unlikely(NVME_CC_MPS(cc) < NVME_CAP_MPSMIN(cap))) {
7644          trace_pci_nvme_err_startfail_page_too_small(
7645                      NVME_CC_MPS(cc),
7646                      NVME_CAP_MPSMIN(cap));
7647          return -1;
7648      }
7649      if (unlikely(NVME_CC_MPS(cc) >
7650                   NVME_CAP_MPSMAX(cap))) {
7651          trace_pci_nvme_err_startfail_page_too_large(
7652                      NVME_CC_MPS(cc),
7653                      NVME_CAP_MPSMAX(cap));
7654          return -1;
7655      }
7656      if (unlikely(!NVME_AQA_ASQS(aqa))) {
7657          trace_pci_nvme_err_startfail_asqent_sz_zero();
7658          return -1;
7659      }
7660      if (unlikely(!NVME_AQA_ACQS(aqa))) {
7661          trace_pci_nvme_err_startfail_acqent_sz_zero();
7662          return -1;
7663      }
7664  
7665      n->page_bits = page_bits;
7666      n->page_size = page_size;
7667      n->max_prp_ents = n->page_size / sizeof(uint64_t);
7668      nvme_init_cq(&n->admin_cq, n, acq, 0, 0, NVME_AQA_ACQS(aqa) + 1, 1);
7669      nvme_init_sq(&n->admin_sq, n, asq, 0, 0, NVME_AQA_ASQS(aqa) + 1);
7670  
7671      nvme_set_timestamp(n, 0ULL);
7672  
7673      nvme_select_iocs(n);
7674  
7675      return 0;
7676  }
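/*
 * Annotation (not part of the original source): sketch of the page-size
 * derivation checked by nvme_start_ctrl() above. CC.MPS encodes the memory
 * page size as a power of two starting at 4 KiB, and both admin queue base
 * addresses must be aligned to it:
 *
 *     // CC.MPS = 0 -> page_bits = 12 -> page_size = 4096
 *     // CC.MPS = 2 -> page_bits = 14 -> page_size = 16384
 *     // asq/acq with any of the low page_bits set fail the startup checks.
 */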
7677  
7678  static void nvme_cmb_enable_regs(NvmeCtrl *n)
7679  {
7680      uint32_t cmbloc = ldl_le_p(&n->bar.cmbloc);
7681      uint32_t cmbsz = ldl_le_p(&n->bar.cmbsz);
7682  
7683      NVME_CMBLOC_SET_CDPCILS(cmbloc, 1);
7684      NVME_CMBLOC_SET_CDPMLS(cmbloc, 1);
7685      NVME_CMBLOC_SET_BIR(cmbloc, NVME_CMB_BIR);
7686      stl_le_p(&n->bar.cmbloc, cmbloc);
7687  
7688      NVME_CMBSZ_SET_SQS(cmbsz, 1);
7689      NVME_CMBSZ_SET_CQS(cmbsz, 0);
7690      NVME_CMBSZ_SET_LISTS(cmbsz, 1);
7691      NVME_CMBSZ_SET_RDS(cmbsz, 1);
7692      NVME_CMBSZ_SET_WDS(cmbsz, 1);
7693      NVME_CMBSZ_SET_SZU(cmbsz, 2); /* MBs */
7694      NVME_CMBSZ_SET_SZ(cmbsz, n->params.cmb_size_mb);
7695      stl_le_p(&n->bar.cmbsz, cmbsz);
7696  }
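/*
 * Annotation (not part of the original source): sketch of the CMBSZ value
 * built by nvme_cmb_enable_regs() above. SZU = 2 selects a megabyte size
 * unit (per the inline comment), so the advertised CMB capacity is the
 * cmb_size_mb device parameter; SQS, LISTS, RDS and WDS are enabled while
 * CQS is left clear.
 *
 *     // cmb_size_mb = 64 -> CMBSZ.SZ = 64 with CMBSZ.SZU = 2 -> a 64 MiB CMB
 */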
7697  
7698  static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
7699                             unsigned size)
7700  {
7701      PCIDevice *pci = PCI_DEVICE(n);
7702      uint64_t cap = ldq_le_p(&n->bar.cap);
7703      uint32_t cc = ldl_le_p(&n->bar.cc);
7704      uint32_t intms = ldl_le_p(&n->bar.intms);
7705      uint32_t csts = ldl_le_p(&n->bar.csts);
7706      uint32_t pmrsts = ldl_le_p(&n->bar.pmrsts);
7707  
7708      if (unlikely(offset & (sizeof(uint32_t) - 1))) {
7709          NVME_GUEST_ERR(pci_nvme_ub_mmiowr_misaligned32,
7710                         "MMIO write not 32-bit aligned,"
7711                         " offset=0x%"PRIx64"", offset);
7712          /* should be ignored, fall through for now */
7713      }
7714  
7715      if (unlikely(size < sizeof(uint32_t))) {
7716          NVME_GUEST_ERR(pci_nvme_ub_mmiowr_toosmall,
7717                         "MMIO write smaller than 32-bits,"
7718                         " offset=0x%"PRIx64", size=%u",
7719                         offset, size);
7720          /* should be ignored, fall through for now */
7721      }
7722  
7723      switch (offset) {
7724      case NVME_REG_INTMS:
7725          if (unlikely(msix_enabled(pci))) {
7726              NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
7727                             "undefined access to interrupt mask set"
7728                             " when MSI-X is enabled");
7729              /* should be ignored, fall through for now */
7730          }
7731          intms |= data;
7732          stl_le_p(&n->bar.intms, intms);
7733          n->bar.intmc = n->bar.intms;
7734          trace_pci_nvme_mmio_intm_set(data & 0xffffffff, intms);
7735          nvme_irq_check(n);
7736          break;
7737      case NVME_REG_INTMC:
7738          if (unlikely(msix_enabled(pci))) {
7739              NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
7740                             "undefined access to interrupt mask clr"
7741                             " when MSI-X is enabled");
7742              /* should be ignored, fall through for now */
7743          }
7744          intms &= ~data;
7745          stl_le_p(&n->bar.intms, intms);
7746          n->bar.intmc = n->bar.intms;
7747          trace_pci_nvme_mmio_intm_clr(data & 0xffffffff, intms);
7748          nvme_irq_check(n);
7749          break;
7750      case NVME_REG_CC:
7751          stl_le_p(&n->bar.cc, data);
7752  
7753          trace_pci_nvme_mmio_cfg(data & 0xffffffff);
7754  
7755          if (NVME_CC_SHN(data) && !(NVME_CC_SHN(cc))) {
7756              trace_pci_nvme_mmio_shutdown_set();
7757              nvme_ctrl_shutdown(n);
7758              csts &= ~(CSTS_SHST_MASK << CSTS_SHST_SHIFT);
7759              csts |= NVME_CSTS_SHST_COMPLETE;
7760          } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(cc)) {
7761              trace_pci_nvme_mmio_shutdown_cleared();
7762              csts &= ~(CSTS_SHST_MASK << CSTS_SHST_SHIFT);
7763          }
7764  
7765          if (NVME_CC_EN(data) && !NVME_CC_EN(cc)) {
7766              if (unlikely(nvme_start_ctrl(n))) {
7767                  trace_pci_nvme_err_startfail();
7768                  csts = NVME_CSTS_FAILED;
7769              } else {
7770                  trace_pci_nvme_mmio_start_success();
7771                  csts = NVME_CSTS_READY;
7772              }
7773          } else if (!NVME_CC_EN(data) && NVME_CC_EN(cc)) {
7774              trace_pci_nvme_mmio_stopped();
7775              nvme_ctrl_reset(n, NVME_RESET_CONTROLLER);
7776  
7777              break;
7778          }
7779  
7780          stl_le_p(&n->bar.csts, csts);
7781  
7782          break;
7783      case NVME_REG_CSTS:
7784          if (data & (1 << 4)) {
7785              NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ssreset_w1c_unsupported,
7786                             "attempted to W1C CSTS.NSSRO"
7787                             " but CAP.NSSRS is zero (not supported)");
7788          } else if (data != 0) {
7789              NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ro_csts,
7790                             "attempted to set a read only bit"
7791                             " of controller status");
7792          }
7793          break;
7794      case NVME_REG_NSSR:
7795          if (data == 0x4e564d65) {
7796              trace_pci_nvme_ub_mmiowr_ssreset_unsupported();
7797          } else {
7798              /* The spec says that writes of other values have no effect */
7799              return;
7800          }
7801          break;
7802      case NVME_REG_AQA:
7803          stl_le_p(&n->bar.aqa, data);
7804          trace_pci_nvme_mmio_aqattr(data & 0xffffffff);
7805          break;
7806      case NVME_REG_ASQ:
7807          stn_le_p(&n->bar.asq, size, data);
7808          trace_pci_nvme_mmio_asqaddr(data);
7809          break;
7810      case NVME_REG_ASQ + 4:
7811          stl_le_p((uint8_t *)&n->bar.asq + 4, data);
7812          trace_pci_nvme_mmio_asqaddr_hi(data, ldq_le_p(&n->bar.asq));
7813          break;
7814      case NVME_REG_ACQ:
7815          trace_pci_nvme_mmio_acqaddr(data);
7816          stn_le_p(&n->bar.acq, size, data);
7817          break;
7818      case NVME_REG_ACQ + 4:
7819          stl_le_p((uint8_t *)&n->bar.acq + 4, data);
7820          trace_pci_nvme_mmio_acqaddr_hi(data, ldq_le_p(&n->bar.acq));
7821          break;
7822      case NVME_REG_CMBLOC:
7823          NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbloc_reserved,
7824                         "invalid write to reserved CMBLOC"
7825                         " when CMBSZ is zero, ignored");
7826          return;
7827      case NVME_REG_CMBSZ:
7828          NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbsz_readonly,
7829                         "invalid write to read only CMBSZ, ignored");
7830          return;
7831      case NVME_REG_CMBMSC:
7832          if (!NVME_CAP_CMBS(cap)) {
7833              return;
7834          }
7835  
7836          stn_le_p(&n->bar.cmbmsc, size, data);
7837          n->cmb.cmse = false;
7838  
7839          if (NVME_CMBMSC_CRE(data)) {
7840              nvme_cmb_enable_regs(n);
7841  
7842              if (NVME_CMBMSC_CMSE(data)) {
7843                  uint64_t cmbmsc = ldq_le_p(&n->bar.cmbmsc);
7844                  hwaddr cba = NVME_CMBMSC_CBA(cmbmsc) << CMBMSC_CBA_SHIFT;
7845                  if (cba + int128_get64(n->cmb.mem.size) < cba) {
7846                      uint32_t cmbsts = ldl_le_p(&n->bar.cmbsts);
7847                      NVME_CMBSTS_SET_CBAI(cmbsts, 1);
7848                      stl_le_p(&n->bar.cmbsts, cmbsts);
7849                      return;
7850                  }
7851  
7852                  n->cmb.cba = cba;
7853                  n->cmb.cmse = true;
7854              }
7855          } else {
7856              n->bar.cmbsz = 0;
7857              n->bar.cmbloc = 0;
7858          }
7859  
7860          return;
7861      case NVME_REG_CMBMSC + 4:
7862          stl_le_p((uint8_t *)&n->bar.cmbmsc + 4, data);
7863          return;
7864  
7865      case NVME_REG_PMRCAP:
7866          NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrcap_readonly,
7867                         "invalid write to PMRCAP register, ignored");
7868          return;
7869      case NVME_REG_PMRCTL:
7870          if (!NVME_CAP_PMRS(cap)) {
7871              return;
7872          }
7873  
7874          stl_le_p(&n->bar.pmrctl, data);
7875          if (NVME_PMRCTL_EN(data)) {
7876              memory_region_set_enabled(&n->pmr.dev->mr, true);
7877              pmrsts = 0;
7878          } else {
7879              memory_region_set_enabled(&n->pmr.dev->mr, false);
7880              NVME_PMRSTS_SET_NRDY(pmrsts, 1);
7881              n->pmr.cmse = false;
7882          }
7883          stl_le_p(&n->bar.pmrsts, pmrsts);
7884          return;
7885      case NVME_REG_PMRSTS:
7886          NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrsts_readonly,
7887                         "invalid write to PMRSTS register, ignored");
7888          return;
7889      case NVME_REG_PMREBS:
7890          NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrebs_readonly,
7891                         "invalid write to PMREBS register, ignored");
7892          return;
7893      case NVME_REG_PMRSWTP:
7894          NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrswtp_readonly,
7895                         "invalid write to PMRSWTP register, ignored");
7896          return;
7897      case NVME_REG_PMRMSCL:
7898          if (!NVME_CAP_PMRS(cap)) {
7899              return;
7900          }
7901  
7902          stl_le_p(&n->bar.pmrmscl, data);
7903          n->pmr.cmse = false;
7904  
7905          if (NVME_PMRMSCL_CMSE(data)) {
7906              uint64_t pmrmscu = ldl_le_p(&n->bar.pmrmscu);
7907              hwaddr cba = pmrmscu << 32 |
7908                  (NVME_PMRMSCL_CBA(data) << PMRMSCL_CBA_SHIFT);
7909              if (cba + int128_get64(n->pmr.dev->mr.size) < cba) {
7910                  NVME_PMRSTS_SET_CBAI(pmrsts, 1);
7911                  stl_le_p(&n->bar.pmrsts, pmrsts);
7912                  return;
7913              }
7914  
7915              n->pmr.cmse = true;
7916              n->pmr.cba = cba;
7917          }
7918  
7919          return;
7920      case NVME_REG_PMRMSCU:
7921          if (!NVME_CAP_PMRS(cap)) {
7922              return;
7923          }
7924  
7925          stl_le_p(&n->bar.pmrmscu, data);
7926          return;
7927      default:
7928          NVME_GUEST_ERR(pci_nvme_ub_mmiowr_invalid,
7929                         "invalid MMIO write,"
7930                         " offset=0x%"PRIx64", data=%"PRIx64"",
7931                         offset, data);
7932          break;
7933      }
7934  }
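
/*
 * Editor's note: a minimal illustrative sketch, not part of the device model.
 * The CMBMSC/PMRMSCL handlers above reject a controller base address whose
 * buffer would wrap around the 64-bit address space, using the fact that an
 * overflowing unsigned sum is smaller than either operand; on failure they
 * set CMBSTS.CBAI / PMRSTS.CBAI instead of latching the address. The helper
 * name below is hypothetical.
 */
#include <stdbool.h>
#include <stdint.h>

static bool nvme_example_cba_wraps(uint64_t cba, uint64_t size)
{
    /* unsigned overflow is well defined in C: the sum wraps modulo 2^64 */
    return cba + size < cba;
}

/*
 * Example: nvme_example_cba_wraps(0xfffffffff0000000ull, 0x20000000ull)
 * returns true, so such a write would flag Controller Base Address Invalid.
 */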
7935  
7936  static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
7937  {
7938      NvmeCtrl *n = (NvmeCtrl *)opaque;
7939      uint8_t *ptr = (uint8_t *)&n->bar;
7940  
7941      trace_pci_nvme_mmio_read(addr, size);
7942  
7943      if (unlikely(addr & (sizeof(uint32_t) - 1))) {
7944          NVME_GUEST_ERR(pci_nvme_ub_mmiord_misaligned32,
7945                         "MMIO read not 32-bit aligned,"
7946                         " offset=0x%"PRIx64"", addr);
7947          /* should RAZ, fall through for now */
7948      } else if (unlikely(size < sizeof(uint32_t))) {
7949          NVME_GUEST_ERR(pci_nvme_ub_mmiord_toosmall,
7950                         "MMIO read smaller than 32-bits,"
7951                         " offset=0x%"PRIx64"", addr);
7952          /* should RAZ, fall through for now */
7953      }
7954  
7955      if (addr > sizeof(n->bar) - size) {
7956          NVME_GUEST_ERR(pci_nvme_ub_mmiord_invalid_ofs,
7957                         "MMIO read beyond last register,"
7958                         " offset=0x%"PRIx64", returning 0", addr);
7959  
7960          return 0;
7961      }
7962  
7963      if (pci_is_vf(PCI_DEVICE(n)) && !nvme_sctrl(n)->scs &&
7964          addr != NVME_REG_CSTS) {
7965          trace_pci_nvme_err_ignored_mmio_vf_offline(addr, size);
7966          return 0;
7967      }
7968  
7969      /*
7970       * When PMRWBM bit 1 is set, a read from
7971       * PMRSTS should ensure prior writes
7972       * made it to persistent media.
7973       */
7974      if (addr == NVME_REG_PMRSTS &&
7975          (NVME_PMRCAP_PMRWBM(ldl_le_p(&n->bar.pmrcap)) & 0x02)) {
7976          memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
7977      }
7978  
7979      return ldn_le_p(ptr + addr, size);
7980  }
7981  
7982  static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
7983  {
7984      PCIDevice *pci = PCI_DEVICE(n);
7985      uint32_t qid;
7986  
7987      if (unlikely(addr & ((1 << 2) - 1))) {
7988          NVME_GUEST_ERR(pci_nvme_ub_db_wr_misaligned,
7989                         "doorbell write not 32-bit aligned,"
7990                         " offset=0x%"PRIx64", ignoring", addr);
7991          return;
7992      }
7993  
7994      if (((addr - 0x1000) >> 2) & 1) {
7995          /* Completion queue doorbell write */
7996  
7997          uint16_t new_head = val & 0xffff;
7998          NvmeCQueue *cq;
7999  
8000          qid = (addr - (0x1000 + (1 << 2))) >> 3;
8001          if (unlikely(nvme_check_cqid(n, qid))) {
8002              NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cq,
8003                             "completion queue doorbell write"
8004                             " for nonexistent queue,"
8005                             " cqid=%"PRIu32", ignoring", qid);
8006  
8007              /*
8008               * NVM Express v1.3d, Section 4.1 states: "If host software writes
8009               * an invalid value to the Submission Queue Tail Doorbell or
8010               * Completion Queue Head Doorbell register and an Asynchronous Event
8011               * Request command is outstanding, then an asynchronous event is
8012               * posted to the Admin Completion Queue with a status code of
8013               * Invalid Doorbell Write Value."
8014               *
8015               * Also note that the spec includes the "Invalid Doorbell Register"
8016               * status code, but nowhere does it specify when to use it.
8017               * However, it seems reasonable to use it here in a similar
8018               * fashion.
8019               */
8020              if (n->outstanding_aers) {
8021                  nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
8022                                     NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
8023                                     NVME_LOG_ERROR_INFO);
8024              }
8025  
8026              return;
8027          }
8028  
8029          cq = n->cq[qid];
8030          if (unlikely(new_head >= cq->size)) {
8031              NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cqhead,
8032                             "completion queue doorbell write value"
8033                             " beyond queue size, cqid=%"PRIu32","
8034                             " new_head=%"PRIu16", ignoring",
8035                             qid, new_head);
8036  
8037              if (n->outstanding_aers) {
8038                  nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
8039                                     NVME_AER_INFO_ERR_INVALID_DB_VALUE,
8040                                     NVME_LOG_ERROR_INFO);
8041              }
8042  
8043              return;
8044          }
8045  
8046          trace_pci_nvme_mmio_doorbell_cq(cq->cqid, new_head);
8047  
8048          /* schedule deferred cqe posting if queue was previously full */
8049          if (nvme_cq_full(cq)) {
8050              qemu_bh_schedule(cq->bh);
8051          }
8052  
8053          cq->head = new_head;
8054          if (!qid && n->dbbuf_enabled) {
8055              stl_le_pci_dma(pci, cq->db_addr, cq->head, MEMTXATTRS_UNSPECIFIED);
8056          }
8057  
8058          if (cq->tail == cq->head) {
8059              if (cq->irq_enabled) {
8060                  n->cq_pending--;
8061              }
8062  
8063              nvme_irq_deassert(n, cq);
8064          }
8065      } else {
8066          /* Submission queue doorbell write */
8067  
8068          uint16_t new_tail = val & 0xffff;
8069          NvmeSQueue *sq;
8070  
8071          qid = (addr - 0x1000) >> 3;
8072          if (unlikely(nvme_check_sqid(n, qid))) {
8073              NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sq,
8074                             "submission queue doorbell write"
8075                             " for nonexistent queue,"
8076                             " sqid=%"PRIu32", ignoring", qid);
8077  
8078              if (n->outstanding_aers) {
8079                  nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
8080                                     NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
8081                                     NVME_LOG_ERROR_INFO);
8082              }
8083  
8084              return;
8085          }
8086  
8087          sq = n->sq[qid];
8088          if (unlikely(new_tail >= sq->size)) {
8089              NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sqtail,
8090                             "submission queue doorbell write value"
8091                             " beyond queue size, sqid=%"PRIu32","
8092                             " new_tail=%"PRIu16", ignoring",
8093                             qid, new_tail);
8094  
8095              if (n->outstanding_aers) {
8096                  nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
8097                                     NVME_AER_INFO_ERR_INVALID_DB_VALUE,
8098                                     NVME_LOG_ERROR_INFO);
8099              }
8100  
8101              return;
8102          }
8103  
8104          trace_pci_nvme_mmio_doorbell_sq(sq->sqid, new_tail);
8105  
8106          sq->tail = new_tail;
8107          if (!qid && n->dbbuf_enabled) {
8108              /*
8109               * The spec states "the host shall also update the controller's
8110               * corresponding doorbell property to match the value of that entry
8111               * in the Shadow Doorbell buffer."
8112               *
8113               * Since this context is currently a VM trap, we can safely enforce
8114               * the requirement from the device side in case the host is
8115               * misbehaving.
8116               *
8117               * Note, we shouldn't have to do this, but various drivers,
8118               * including ones that run on Linux, are not updating Admin Queues,
8119               * so we can't trust reading it for an appropriate sq tail.
8120               */
8121              stl_le_pci_dma(pci, sq->db_addr, sq->tail, MEMTXATTRS_UNSPECIFIED);
8122          }
8123  
8124          qemu_bh_schedule(sq->bh);
8125      }
8126  }
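
/*
 * Editor's note: a minimal illustrative sketch, not part of the device model.
 * nvme_process_db() assumes CAP.DSTRD = 0, i.e. a 4-byte doorbell stride:
 * SQyTDBL sits at 0x1000 + (2 * y) * 4 and CQyHDBL at 0x1000 + (2 * y + 1) * 4,
 * which is why an odd doorbell slot selects a completion queue and the queue
 * id falls out of a shift by 3. The helper names below are hypothetical.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static void nvme_example_decode_db(uint64_t addr, bool *is_cq, uint32_t *qid)
{
    uint64_t off = addr - 0x1000;

    *is_cq = (off >> 2) & 1;                    /* odd 4-byte slot => CQ head */
    *qid = (uint32_t)((*is_cq ? off - 4 : off) >> 3);
}

static void nvme_example_decode_db_check(void)
{
    bool is_cq;
    uint32_t qid;

    nvme_example_decode_db(0x1018, &is_cq, &qid);   /* SQ3 tail doorbell */
    assert(!is_cq && qid == 3);

    nvme_example_decode_db(0x101c, &is_cq, &qid);   /* CQ3 head doorbell */
    assert(is_cq && qid == 3);
}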
8127  
8128  static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data,
8129                              unsigned size)
8130  {
8131      NvmeCtrl *n = (NvmeCtrl *)opaque;
8132  
8133      trace_pci_nvme_mmio_write(addr, data, size);
8134  
8135      if (pci_is_vf(PCI_DEVICE(n)) && !nvme_sctrl(n)->scs &&
8136          addr != NVME_REG_CSTS) {
8137          trace_pci_nvme_err_ignored_mmio_vf_offline(addr, size);
8138          return;
8139      }
8140  
8141      if (addr < sizeof(n->bar)) {
8142          nvme_write_bar(n, addr, data, size);
8143      } else {
8144          nvme_process_db(n, addr, data);
8145      }
8146  }
8147  
8148  static const MemoryRegionOps nvme_mmio_ops = {
8149      .read = nvme_mmio_read,
8150      .write = nvme_mmio_write,
8151      .endianness = DEVICE_LITTLE_ENDIAN,
8152      .impl = {
8153          .min_access_size = 2,
8154          .max_access_size = 8,
8155      },
8156  };
8157  
8158  static void nvme_cmb_write(void *opaque, hwaddr addr, uint64_t data,
8159                             unsigned size)
8160  {
8161      NvmeCtrl *n = (NvmeCtrl *)opaque;
8162      stn_le_p(&n->cmb.buf[addr], size, data);
8163  }
8164  
8165  static uint64_t nvme_cmb_read(void *opaque, hwaddr addr, unsigned size)
8166  {
8167      NvmeCtrl *n = (NvmeCtrl *)opaque;
8168      return ldn_le_p(&n->cmb.buf[addr], size);
8169  }
8170  
8171  static const MemoryRegionOps nvme_cmb_ops = {
8172      .read = nvme_cmb_read,
8173      .write = nvme_cmb_write,
8174      .endianness = DEVICE_LITTLE_ENDIAN,
8175      .impl = {
8176          .min_access_size = 1,
8177          .max_access_size = 8,
8178      },
8179  };
8180  
8181  static bool nvme_check_params(NvmeCtrl *n, Error **errp)
8182  {
8183      NvmeParams *params = &n->params;
8184  
8185      if (params->num_queues) {
8186          warn_report("num_queues is deprecated; please use max_ioqpairs "
8187                      "instead");
8188  
8189          params->max_ioqpairs = params->num_queues - 1;
8190      }
8191  
8192      if (n->namespace.blkconf.blk && n->subsys) {
8193          error_setg(errp, "subsystem support is unavailable with legacy "
8194                     "namespace ('drive' property)");
8195          return false;
8196      }
8197  
8198      if (params->max_ioqpairs < 1 ||
8199          params->max_ioqpairs > NVME_MAX_IOQPAIRS) {
8200          error_setg(errp, "max_ioqpairs must be between 1 and %d",
8201                     NVME_MAX_IOQPAIRS);
8202          return false;
8203      }
8204  
8205      if (params->msix_qsize < 1 ||
8206          params->msix_qsize > PCI_MSIX_FLAGS_QSIZE + 1) {
8207          error_setg(errp, "msix_qsize must be between 1 and %d",
8208                     PCI_MSIX_FLAGS_QSIZE + 1);
8209          return false;
8210      }
8211  
8212      if (!params->serial) {
8213          error_setg(errp, "serial property not set");
8214          return false;
8215      }
8216  
8217      if (params->mqes < 1) {
8218          error_setg(errp, "mqes property cannot be less than 1");
8219          return false;
8220      }
8221  
8222      if (n->pmr.dev) {
8223          if (params->msix_exclusive_bar) {
8224              error_setg(errp, "not enough BARs available to enable PMR");
8225              return false;
8226          }
8227  
8228          if (host_memory_backend_is_mapped(n->pmr.dev)) {
8229              error_setg(errp, "can't use already busy memdev: %s",
8230                         object_get_canonical_path_component(OBJECT(n->pmr.dev)));
8231              return false;
8232          }
8233  
8234          if (!is_power_of_2(n->pmr.dev->size)) {
8235              error_setg(errp, "pmr backend size needs to be a power of 2");
8236              return false;
8237          }
8238  
8239          host_memory_backend_set_mapped(n->pmr.dev, true);
8240      }
8241  
8242      if (n->params.zasl > n->params.mdts) {
8243          error_setg(errp, "zoned.zasl (Zone Append Size Limit) must be less "
8244                     "than or equal to mdts (Maximum Data Transfer Size)");
8245          return false;
8246      }
8247  
8248      if (!n->params.vsl) {
8249          error_setg(errp, "vsl must be non-zero");
8250          return false;
8251      }
8252  
8253      if (params->sriov_max_vfs) {
8254          if (!n->subsys) {
8255              error_setg(errp, "subsystem is required for the use of SR-IOV");
8256              return false;
8257          }
8258  
8259          if (params->cmb_size_mb) {
8260              error_setg(errp, "CMB is not supported with SR-IOV");
8261              return false;
8262          }
8263  
8264          if (n->pmr.dev) {
8265              error_setg(errp, "PMR is not supported with SR-IOV");
8266              return false;
8267          }
8268  
8269          if (!params->sriov_vq_flexible || !params->sriov_vi_flexible) {
8270              error_setg(errp, "both sriov_vq_flexible and sriov_vi_flexible"
8271                         " must be set for the use of SR-IOV");
8272              return false;
8273          }
8274  
8275          if (params->sriov_vq_flexible < params->sriov_max_vfs * 2) {
8276              error_setg(errp, "sriov_vq_flexible must be greater than or equal"
8277                         " to %d (sriov_max_vfs * 2)", params->sriov_max_vfs * 2);
8278              return false;
8279          }
8280  
8281          if (params->max_ioqpairs < params->sriov_vq_flexible + 2) {
8282              error_setg(errp, "(max_ioqpairs - sriov_vq_flexible) must be"
8283                         " greater than or equal to 2");
8284              return false;
8285          }
8286  
8287          if (params->sriov_vi_flexible < params->sriov_max_vfs) {
8288              error_setg(errp, "sriov_vi_flexible must be greater than or equal"
8289                         " to %d (sriov_max_vfs)", params->sriov_max_vfs);
8290              return false;
8291          }
8292  
8293          if (params->msix_qsize < params->sriov_vi_flexible + 1) {
8294              error_setg(errp, "(msix_qsize - sriov_vi_flexible) must be"
8295                         " greater than or equal to 1");
8296              return false;
8297          }
8298  
8299          if (params->sriov_max_vi_per_vf &&
8300              (params->sriov_max_vi_per_vf - 1) % NVME_VF_RES_GRANULARITY) {
8301              error_setg(errp, "sriov_max_vi_per_vf must meet:"
8302                         " (sriov_max_vi_per_vf - 1) %% %d == 0 and"
8303                         " sriov_max_vi_per_vf >= 1", NVME_VF_RES_GRANULARITY);
8304              return false;
8305          }
8306  
8307          if (params->sriov_max_vq_per_vf &&
8308              (params->sriov_max_vq_per_vf < 2 ||
8309               (params->sriov_max_vq_per_vf - 1) % NVME_VF_RES_GRANULARITY)) {
8310              error_setg(errp, "sriov_max_vq_per_vf must meet:"
8311                         " (sriov_max_vq_per_vf - 1) %% %d == 0 and"
8312                         " sriov_max_vq_per_vf >= 2", NVME_VF_RES_GRANULARITY);
8313              return false;
8314          }
8315      }
8316  
8317      return true;
8318  }
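
/*
 * Editor's note: a minimal illustrative sketch, not part of the device model.
 * It restates the SR-IOV arithmetic enforced by nvme_check_params(): the
 * flexible VQ pool must cover two queues per VF, the PF keeps at least two
 * IO queue pairs, the flexible VI pool must cover one interrupt vector per
 * VF, and the PF keeps at least one vector. The helper name is hypothetical.
 */
#include <stdbool.h>
#include <stdint.h>

static bool nvme_example_sriov_params_ok(uint16_t max_vfs, uint16_t max_ioqpairs,
                                         uint16_t msix_qsize,
                                         uint16_t vq_flexible,
                                         uint16_t vi_flexible)
{
    return vq_flexible >= max_vfs * 2 &&
           max_ioqpairs >= vq_flexible + 2 &&
           vi_flexible >= max_vfs &&
           msix_qsize >= vi_flexible + 1;
}

/*
 * Example: sriov_max_vfs=2, max_ioqpairs=6, msix_qsize=5, sriov_vq_flexible=4,
 * sriov_vi_flexible=2 satisfies every check above.
 */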
8319  
8320  static void nvme_init_state(NvmeCtrl *n)
8321  {
8322      NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
8323      NvmeSecCtrlEntry *list = n->sec_ctrl_list;
8324      NvmeSecCtrlEntry *sctrl;
8325      PCIDevice *pci = PCI_DEVICE(n);
8326      NvmeAtomic *atomic = &n->atomic;
8327      NvmeIdCtrl *id = &n->id_ctrl;
8328      uint8_t max_vfs;
8329      int i;
8330  
8331      if (pci_is_vf(pci)) {
8332          sctrl = nvme_sctrl(n);
8333          max_vfs = 0;
8334          n->conf_ioqpairs = sctrl->nvq ? le16_to_cpu(sctrl->nvq) - 1 : 0;
8335          n->conf_msix_qsize = sctrl->nvi ? le16_to_cpu(sctrl->nvi) : 1;
8336      } else {
8337          max_vfs = n->params.sriov_max_vfs;
8338          n->conf_ioqpairs = n->params.max_ioqpairs;
8339          n->conf_msix_qsize = n->params.msix_qsize;
8340      }
8341  
8342      n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1);
8343      n->cq = g_new0(NvmeCQueue *, n->params.max_ioqpairs + 1);
8344      n->temperature = NVME_TEMPERATURE;
8345      n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
8346      n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
8347      n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
8348      QTAILQ_INIT(&n->aer_queue);
8349  
8350      n->nr_sec_ctrls = max_vfs;
8351      for (i = 0; i < max_vfs; i++) {
8352          sctrl = &list[i];
8353          sctrl->pcid = cpu_to_le16(n->cntlid);
8354          sctrl->vfn = cpu_to_le16(i + 1);
8355      }
8356  
8357      cap->cntlid = cpu_to_le16(n->cntlid);
8358      cap->crt = NVME_CRT_VQ | NVME_CRT_VI;
8359  
8360      if (pci_is_vf(pci)) {
8361          cap->vqprt = cpu_to_le16(1 + n->conf_ioqpairs);
8362      } else {
8363          cap->vqprt = cpu_to_le16(1 + n->params.max_ioqpairs -
8364                                   n->params.sriov_vq_flexible);
8365          cap->vqfrt = cpu_to_le32(n->params.sriov_vq_flexible);
8366          cap->vqrfap = cap->vqfrt;
8367          cap->vqgran = cpu_to_le16(NVME_VF_RES_GRANULARITY);
8368          cap->vqfrsm = n->params.sriov_max_vq_per_vf ?
8369                          cpu_to_le16(n->params.sriov_max_vq_per_vf) :
8370                          cap->vqfrt / MAX(max_vfs, 1);
8371      }
8372  
8373      if (pci_is_vf(pci)) {
8374          cap->viprt = cpu_to_le16(n->conf_msix_qsize);
8375      } else {
8376          cap->viprt = cpu_to_le16(n->params.msix_qsize -
8377                                   n->params.sriov_vi_flexible);
8378          cap->vifrt = cpu_to_le32(n->params.sriov_vi_flexible);
8379          cap->virfap = cap->vifrt;
8380          cap->vigran = cpu_to_le16(NVME_VF_RES_GRANULARITY);
8381          cap->vifrsm = n->params.sriov_max_vi_per_vf ?
8382                          cpu_to_le16(n->params.sriov_max_vi_per_vf) :
8383                          cap->vifrt / MAX(max_vfs, 1);
8384      }
8385  
8386      /* Atomic Write */
8387      id->awun = cpu_to_le16(n->params.atomic_awun);
8388      id->awupf = cpu_to_le16(n->params.atomic_awupf);
8389      n->dn = n->params.atomic_dn;
8390  
8391      if (id->awun || id->awupf) {
8392          if (id->awupf > id->awun) {
8393              id->awupf = 0;
8394          }
8395  
8396          if (n->dn) {
8397              atomic->atomic_max_write_size = id->awupf + 1;
8398          } else {
8399              atomic->atomic_max_write_size = id->awun + 1;
8400          }
8401  
8402          if (atomic->atomic_max_write_size == 1) {
8403              atomic->atomic_writes = 0;
8404          } else {
8405              atomic->atomic_writes = 1;
8406          }
8407      }
8408  }
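
/*
 * Editor's note: a minimal illustrative sketch, not part of the device model.
 * AWUN and AWUPF are 0's based (a value of N means N + 1 logical blocks), so
 * the maximum atomic write size derived in nvme_init_state() is the selected
 * field plus one, and atomic handling is only armed when that size exceeds a
 * single block. The helper name is hypothetical.
 */
#include <stdbool.h>
#include <stdint.h>

static uint32_t nvme_example_atomic_max_write(uint16_t awun, uint16_t awupf,
                                              bool disable_normal)
{
    if (awupf > awun) {
        awupf = 0;      /* mirror the sanitization done during init */
    }

    return (uint32_t)(disable_normal ? awupf : awun) + 1;
}

/*
 * Example: atomic.awun=7, atomic.awupf=3, atomic.dn=off yields a maximum
 * atomic write size of 8 logical blocks.
 */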
8409  
8410  static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
8411  {
8412      uint64_t cmb_size = n->params.cmb_size_mb * MiB;
8413      uint64_t cap = ldq_le_p(&n->bar.cap);
8414  
8415      n->cmb.buf = g_malloc0(cmb_size);
8416      memory_region_init_io(&n->cmb.mem, OBJECT(n), &nvme_cmb_ops, n,
8417                            "nvme-cmb", cmb_size);
8418      pci_register_bar(pci_dev, NVME_CMB_BIR,
8419                       PCI_BASE_ADDRESS_SPACE_MEMORY |
8420                       PCI_BASE_ADDRESS_MEM_TYPE_64 |
8421                       PCI_BASE_ADDRESS_MEM_PREFETCH, &n->cmb.mem);
8422  
8423      NVME_CAP_SET_CMBS(cap, 1);
8424      stq_le_p(&n->bar.cap, cap);
8425  
8426      if (n->params.legacy_cmb) {
8427          nvme_cmb_enable_regs(n);
8428          n->cmb.cmse = true;
8429      }
8430  }
8431  
8432  static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev)
8433  {
8434      uint32_t pmrcap = ldl_le_p(&n->bar.pmrcap);
8435  
8436      NVME_PMRCAP_SET_RDS(pmrcap, 1);
8437      NVME_PMRCAP_SET_WDS(pmrcap, 1);
8438      NVME_PMRCAP_SET_BIR(pmrcap, NVME_PMR_BIR);
8439      /* Turn on bit 1 support */
8440      NVME_PMRCAP_SET_PMRWBM(pmrcap, 0x02);
8441      NVME_PMRCAP_SET_CMSS(pmrcap, 1);
8442      stl_le_p(&n->bar.pmrcap, pmrcap);
8443  
8444      pci_register_bar(pci_dev, NVME_PMR_BIR,
8445                       PCI_BASE_ADDRESS_SPACE_MEMORY |
8446                       PCI_BASE_ADDRESS_MEM_TYPE_64 |
8447                       PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmr.dev->mr);
8448  
8449      memory_region_set_enabled(&n->pmr.dev->mr, false);
8450  }
8451  
8452  static uint64_t nvme_mbar_size(unsigned total_queues, unsigned total_irqs,
8453                                 unsigned *msix_table_offset,
8454                                 unsigned *msix_pba_offset)
8455  {
8456      uint64_t bar_size, msix_table_size;
8457  
8458      bar_size = sizeof(NvmeBar) + 2 * total_queues * NVME_DB_SIZE;
8459  
8460      if (total_irqs == 0) {
8461          goto out;
8462      }
8463  
8464      bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
8465  
8466      if (msix_table_offset) {
8467          *msix_table_offset = bar_size;
8468      }
8469  
8470      msix_table_size = PCI_MSIX_ENTRY_SIZE * total_irqs;
8471      bar_size += msix_table_size;
8472      bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
8473  
8474      if (msix_pba_offset) {
8475          *msix_pba_offset = bar_size;
8476      }
8477  
8478      bar_size += QEMU_ALIGN_UP(total_irqs, 64) / 8;
8479  
8480  out:
8481      return pow2ceil(bar_size);
8482  }
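
/*
 * Editor's note: a minimal illustrative sketch, not part of the device model.
 * Standalone restatement of the sizing arithmetic in nvme_mbar_size(): the
 * doorbells follow the fixed register block, the MSI-X table and PBA are each
 * placed on a 4 KiB boundary, and the result is rounded up to a power of two.
 * NVME_EXAMPLE_REG_SIZE stands in for sizeof(NvmeBar) and is an assumption,
 * as are the helper names.
 */
#include <stdint.h>

#define NVME_EXAMPLE_REG_SIZE       0x1000ull   /* assumed register block size */
#define NVME_EXAMPLE_DB_SIZE        4           /* one 32-bit doorbell */
#define NVME_EXAMPLE_ALIGN_4K(x)    (((x) + 0xfffull) & ~0xfffull)

static uint64_t nvme_example_pow2ceil(uint64_t v)
{
    uint64_t r = 1;

    while (r < v) {
        r <<= 1;
    }
    return r;
}

static uint64_t nvme_example_mbar_size(unsigned total_queues, unsigned total_irqs)
{
    uint64_t sz = NVME_EXAMPLE_REG_SIZE + 2 * total_queues * NVME_EXAMPLE_DB_SIZE;

    if (total_irqs) {
        sz = NVME_EXAMPLE_ALIGN_4K(sz);
        sz += 16 * total_irqs;                  /* 16-byte MSI-X table entries */
        sz = NVME_EXAMPLE_ALIGN_4K(sz);
        sz += ((total_irqs + 63) / 64) * 8;     /* PBA: one bit per vector */
    }

    return nvme_example_pow2ceil(sz);
}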
8483  
8484  static void nvme_init_sriov(NvmeCtrl *n, PCIDevice *pci_dev, uint16_t offset)
8485  {
8486      uint16_t vf_dev_id = n->params.use_intel_id ?
8487                           PCI_DEVICE_ID_INTEL_NVME : PCI_DEVICE_ID_REDHAT_NVME;
8488      NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
8489      uint64_t bar_size = nvme_mbar_size(le16_to_cpu(cap->vqfrsm),
8490                                        le16_to_cpu(cap->vifrsm),
8491                                        NULL, NULL);
8492  
8493      pcie_sriov_pf_init(pci_dev, offset, "nvme", vf_dev_id,
8494                         n->params.sriov_max_vfs, n->params.sriov_max_vfs,
8495                         NVME_VF_OFFSET, NVME_VF_STRIDE);
8496  
8497      pcie_sriov_pf_init_vf_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
8498                                PCI_BASE_ADDRESS_MEM_TYPE_64, bar_size);
8499  }
8500  
8501  static int nvme_add_pm_capability(PCIDevice *pci_dev, uint8_t offset)
8502  {
8503      Error *err = NULL;
8504      int ret;
8505  
8506      ret = pci_add_capability(pci_dev, PCI_CAP_ID_PM, offset,
8507                               PCI_PM_SIZEOF, &err);
8508      if (err) {
8509          error_report_err(err);
8510          return ret;
8511      }
8512  
8513      pci_set_word(pci_dev->config + offset + PCI_PM_PMC,
8514                   PCI_PM_CAP_VER_1_2);
8515      pci_set_word(pci_dev->config + offset + PCI_PM_CTRL,
8516                   PCI_PM_CTRL_NO_SOFT_RESET);
8517      pci_set_word(pci_dev->wmask + offset + PCI_PM_CTRL,
8518                   PCI_PM_CTRL_STATE_MASK);
8519  
8520      return 0;
8521  }
8522  
8523  static bool pcie_doe_spdm_rsp(DOECap *doe_cap)
8524  {
8525      void *req = pcie_doe_get_write_mbox_ptr(doe_cap);
8526      uint32_t req_len = pcie_doe_get_obj_len(req) * 4;
8527      void *rsp = doe_cap->read_mbox;
8528      uint32_t rsp_len = SPDM_SOCKET_MAX_MESSAGE_BUFFER_SIZE;
8529  
8530      uint32_t recvd = spdm_socket_rsp(doe_cap->spdm_socket,
8531                               SPDM_SOCKET_TRANSPORT_TYPE_PCI_DOE,
8532                               req, req_len, rsp, rsp_len);
8533      doe_cap->read_mbox_len += DIV_ROUND_UP(recvd, 4);
8534  
8535      return recvd != 0;
8536  }
8537  
8538  static DOEProtocol doe_spdm_prot[] = {
8539      { PCI_VENDOR_ID_PCI_SIG, PCI_SIG_DOE_CMA, pcie_doe_spdm_rsp },
8540      { PCI_VENDOR_ID_PCI_SIG, PCI_SIG_DOE_SECURED_CMA, pcie_doe_spdm_rsp },
8541      { }
8542  };
8543  
8544  static bool nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
8545  {
8546      ERRP_GUARD();
8547      uint8_t *pci_conf = pci_dev->config;
8548      uint64_t bar_size;
8549      unsigned msix_table_offset = 0, msix_pba_offset = 0;
8550      unsigned nr_vectors;
8551      int ret;
8552  
8553      pci_conf[PCI_INTERRUPT_PIN] = pci_is_vf(pci_dev) ? 0 : 1;
8554      pci_config_set_prog_interface(pci_conf, 0x2);
8555  
8556      if (n->params.use_intel_id) {
8557          pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_INTEL);
8558          pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_INTEL_NVME);
8559      } else {
8560          pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_REDHAT);
8561          pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_REDHAT_NVME);
8562      }
8563  
8564      pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS);
8565      nvme_add_pm_capability(pci_dev, 0x60);
8566      pcie_endpoint_cap_init(pci_dev, 0x80);
8567      pcie_cap_flr_init(pci_dev);
8568      if (n->params.sriov_max_vfs) {
8569          pcie_ari_init(pci_dev, 0x100);
8570      }
8571  
8572      if (n->params.msix_exclusive_bar && !pci_is_vf(pci_dev)) {
8573          bar_size = nvme_mbar_size(n->params.max_ioqpairs + 1, 0, NULL, NULL);
8574          memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
8575                                bar_size);
8576          pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
8577                           PCI_BASE_ADDRESS_MEM_TYPE_64, &n->iomem);
8578          ret = msix_init_exclusive_bar(pci_dev, n->params.msix_qsize, 4, errp);
8579      } else {
8580          assert(n->params.msix_qsize >= 1);
8581  
8582          /* add one to max_ioqpairs to account for the admin queue pair */
8583          if (!pci_is_vf(pci_dev)) {
8584              nr_vectors = n->params.msix_qsize;
8585              bar_size = nvme_mbar_size(n->params.max_ioqpairs + 1,
8586                                        nr_vectors, &msix_table_offset,
8587                                        &msix_pba_offset);
8588          } else {
8589              NvmeCtrl *pn = NVME(pcie_sriov_get_pf(pci_dev));
8590              NvmePriCtrlCap *cap = &pn->pri_ctrl_cap;
8591  
8592              nr_vectors = le16_to_cpu(cap->vifrsm);
8593              bar_size = nvme_mbar_size(le16_to_cpu(cap->vqfrsm), nr_vectors,
8594                                        &msix_table_offset, &msix_pba_offset);
8595          }
8596  
8597          memory_region_init(&n->bar0, OBJECT(n), "nvme-bar0", bar_size);
8598          memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
8599                                msix_table_offset);
8600          memory_region_add_subregion(&n->bar0, 0, &n->iomem);
8601  
8602          if (pci_is_vf(pci_dev)) {
8603              pcie_sriov_vf_register_bar(pci_dev, 0, &n->bar0);
8604          } else {
8605              pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
8606                               PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0);
8607          }
8608  
8609          ret = msix_init(pci_dev, nr_vectors,
8610                          &n->bar0, 0, msix_table_offset,
8611                          &n->bar0, 0, msix_pba_offset, 0, errp);
8612      }
8613  
8614      if (ret == -ENOTSUP) {
8615          /* report that msix is not supported, but do not error out */
8616          warn_report_err(*errp);
8617          *errp = NULL;
8618      } else if (ret < 0) {
8619          /* propagate error to caller */
8620          return false;
8621      }
8622  
8623      nvme_update_msixcap_ts(pci_dev, n->conf_msix_qsize);
8624  
8625      pcie_cap_deverr_init(pci_dev);
8626  
8627      /* DOE Initialisation */
8628      if (pci_dev->spdm_port) {
8629          uint16_t doe_offset = n->params.sriov_max_vfs ?
8630                                    PCI_CONFIG_SPACE_SIZE + PCI_ARI_SIZEOF
8631                                    : PCI_CONFIG_SPACE_SIZE;
8632  
8633          pcie_doe_init(pci_dev, &pci_dev->doe_spdm, doe_offset,
8634                        doe_spdm_prot, true, 0);
8635  
8636          pci_dev->doe_spdm.spdm_socket = spdm_socket_connect(pci_dev->spdm_port,
8637                                                              errp);
8638  
8639          if (pci_dev->doe_spdm.spdm_socket < 0) {
8640              return false;
8641          }
8642      }
8643  
8644      if (n->params.cmb_size_mb) {
8645          nvme_init_cmb(n, pci_dev);
8646      }
8647  
8648      if (n->pmr.dev) {
8649          nvme_init_pmr(n, pci_dev);
8650      }
8651  
8652      if (!pci_is_vf(pci_dev) && n->params.sriov_max_vfs) {
8653          nvme_init_sriov(n, pci_dev, 0x120);
8654      }
8655  
8656      return true;
8657  }
8658  
8659  static void nvme_init_subnqn(NvmeCtrl *n)
8660  {
8661      NvmeSubsystem *subsys = n->subsys;
8662      NvmeIdCtrl *id = &n->id_ctrl;
8663  
8664      if (!subsys) {
8665          snprintf((char *)id->subnqn, sizeof(id->subnqn),
8666                   "nqn.2019-08.org.qemu:%s", n->params.serial);
8667      } else {
8668          pstrcpy((char *)id->subnqn, sizeof(id->subnqn), (char*)subsys->subnqn);
8669      }
8670  }
8671  
8672  static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
8673  {
8674      NvmeIdCtrl *id = &n->id_ctrl;
8675      uint8_t *pci_conf = pci_dev->config;
8676      uint64_t cap = ldq_le_p(&n->bar.cap);
8677      NvmeSecCtrlEntry *sctrl = nvme_sctrl(n);
8678      uint32_t ctratt;
8679  
8680      id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID));
8681      id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID));
8682      strpadcpy((char *)id->mn, sizeof(id->mn), "QEMU NVMe Ctrl", ' ');
8683      strpadcpy((char *)id->fr, sizeof(id->fr), QEMU_VERSION, ' ');
8684      strpadcpy((char *)id->sn, sizeof(id->sn), n->params.serial, ' ');
8685  
8686      id->cntlid = cpu_to_le16(n->cntlid);
8687  
8688      id->oaes = cpu_to_le32(NVME_OAES_NS_ATTR);
8689  
8690      ctratt = NVME_CTRATT_ELBAS;
8691      if (n->params.ctratt.mem) {
8692          ctratt |= NVME_CTRATT_MEM;
8693      }
8694  
8695      id->rab = 6;
8696  
8697      if (n->params.use_intel_id) {
8698          id->ieee[0] = 0xb3;
8699          id->ieee[1] = 0x02;
8700          id->ieee[2] = 0x00;
8701      } else {
8702          id->ieee[0] = 0x00;
8703          id->ieee[1] = 0x54;
8704          id->ieee[2] = 0x52;
8705      }
8706  
8707      id->mdts = n->params.mdts;
8708      id->ver = cpu_to_le32(NVME_SPEC_VER);
8709      id->oacs =
8710          cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT | NVME_OACS_DBBUF |
8711                      NVME_OACS_DIRECTIVES);
8712      id->cntrltype = 0x1;
8713  
8714      /*
8715       * Because the controller always completes the Abort command immediately,
8716       * there can never be more than one concurrently executing Abort command,
8717       * so this value is never used for anything. Note that there can easily be
8718       * many Abort commands in the queues, but they are not considered
8719       * "executing" until processed by nvme_abort.
8720       *
8721       * The specification recommends a value of 3 for Abort Command Limit (four
8722       * concurrently outstanding Abort commands), so let's use that, though it is
8723       * inconsequential.
8724       */
8725      id->acl = 3;
8726      id->aerl = n->params.aerl;
8727      id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO;
8728      id->lpa = NVME_LPA_NS_SMART | NVME_LPA_CSE | NVME_LPA_EXTENDED;
8729  
8730      /* recommended default value (~70 C) */
8731      id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING);
8732      id->cctemp = cpu_to_le16(NVME_TEMPERATURE_CRITICAL);
8733  
8734      id->sqes = (NVME_SQES << 4) | NVME_SQES;
8735      id->cqes = (NVME_CQES << 4) | NVME_CQES;
8736      id->nn = cpu_to_le32(NVME_MAX_NAMESPACES);
8737      id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
8738                             NVME_ONCS_FEATURES | NVME_ONCS_DSM |
8739                             NVME_ONCS_COMPARE | NVME_ONCS_COPY |
8740                             NVME_ONCS_NVMCSA | NVME_ONCS_NVMAFC);
8741  
8742      /*
8743       * NOTE: If this device ever supports a command set that does NOT use 0x0
8744       * as a Flush-equivalent operation, support for the broadcast NSID in Flush
8745       * should probably be removed.
8746       *
8747       * See comment in nvme_io_cmd.
8748       */
8749      id->vwc = NVME_VWC_NSID_BROADCAST_SUPPORT | NVME_VWC_PRESENT;
8750  
8751      id->ocfs = cpu_to_le16(NVME_OCFS_COPY_FORMAT_0 | NVME_OCFS_COPY_FORMAT_1 |
8752                              NVME_OCFS_COPY_FORMAT_2 | NVME_OCFS_COPY_FORMAT_3);
8753      id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |
8754                             NVME_CTRL_SGLS_MPTR_SGL);
8755  
8756      nvme_init_subnqn(n);
8757  
8758      id->psd[0].mp = cpu_to_le16(0x9c4);
8759      id->psd[0].enlat = cpu_to_le32(0x10);
8760      id->psd[0].exlat = cpu_to_le32(0x4);
8761  
8762      if (n->subsys) {
8763          id->cmic |= NVME_CMIC_MULTI_CTRL;
8764          ctratt |= NVME_CTRATT_ENDGRPS;
8765  
8766          id->endgidmax = cpu_to_le16(0x1);
8767  
8768          if (n->subsys->endgrp.fdp.enabled) {
8769              ctratt |= NVME_CTRATT_FDPS;
8770          }
8771      }
8772  
8773      id->ctratt = cpu_to_le32(ctratt);
8774  
8775      NVME_CAP_SET_MQES(cap, n->params.mqes);
8776      NVME_CAP_SET_CQR(cap, 1);
8777      NVME_CAP_SET_TO(cap, 0xf);
8778      NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_NVM);
8779      NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_CSI_SUPP);
8780      NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_ADMIN_ONLY);
8781      NVME_CAP_SET_MPSMAX(cap, 4);
8782      NVME_CAP_SET_CMBS(cap, n->params.cmb_size_mb ? 1 : 0);
8783      NVME_CAP_SET_PMRS(cap, n->pmr.dev ? 1 : 0);
8784      stq_le_p(&n->bar.cap, cap);
8785  
8786      stl_le_p(&n->bar.vs, NVME_SPEC_VER);
8787      n->bar.intmc = n->bar.intms = 0;
8788  
8789      if (pci_is_vf(pci_dev) && !sctrl->scs) {
8790          stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
8791      }
8792  }
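
/*
 * Editor's note: a minimal illustrative sketch, not part of the device model.
 * SQES/CQES pack the maximum supported entry size in the upper nibble and the
 * required (minimum) size in the lower nibble, both as powers of two.
 * Assuming NVME_SQES is 6 and NVME_CQES is 4 (the standard 64-byte SQ and
 * 16-byte CQ entries), nvme_init_ctrl() advertises identical minimum and
 * maximum sizes. The helper name is hypothetical.
 */
#include <stdint.h>

static void nvme_example_decode_qes(uint8_t qes, uint32_t *min_bytes,
                                    uint32_t *max_bytes)
{
    *min_bytes = 1u << (qes & 0xf);
    *max_bytes = 1u << (qes >> 4);
}

/* Example: for (6 << 4) | 6 this yields min = max = 64 bytes per SQ entry. */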
8793  
8794  static int nvme_init_subsys(NvmeCtrl *n, Error **errp)
8795  {
8796      int cntlid;
8797  
8798      if (!n->subsys) {
8799          return 0;
8800      }
8801  
8802      cntlid = nvme_subsys_register_ctrl(n, errp);
8803      if (cntlid < 0) {
8804          return -1;
8805      }
8806  
8807      n->cntlid = cntlid;
8808  
8809      return 0;
8810  }
8811  
8812  void nvme_attach_ns(NvmeCtrl *n, NvmeNamespace *ns)
8813  {
8814      uint32_t nsid = ns->params.nsid;
8815      assert(nsid && nsid <= NVME_MAX_NAMESPACES);
8816  
8817      n->namespaces[nsid] = ns;
8818      ns->attached++;
8819  
8820      n->dmrsl = MIN_NON_ZERO(n->dmrsl,
8821                              BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
8822  }
8823  
8824  static void nvme_realize(PCIDevice *pci_dev, Error **errp)
8825  {
8826      NvmeCtrl *n = NVME(pci_dev);
8827      DeviceState *dev = DEVICE(pci_dev);
8828      NvmeNamespace *ns;
8829      NvmeCtrl *pn = NVME(pcie_sriov_get_pf(pci_dev));
8830  
8831      if (pci_is_vf(pci_dev)) {
8832          /*
8833           * VFs derive settings from the parent. The PF's lifespan exceeds
8834           * that of the VFs.
8835           */
8836          memcpy(&n->params, &pn->params, sizeof(NvmeParams));
8837  
8838          /*
8839           * Duplicate the PF's serial into new string memory so the PF's 'serial'
8840           * property object is not released when a VF is removed from the system.
8841           */
8842          n->params.serial = g_strdup(pn->params.serial);
8843          n->subsys = pn->subsys;
8844  
8845          /*
8846           * Assigning this link (strong link) causes an `object_unref` later in
8847           * `object_release_link_property`. Increment the refcount to balance
8848           * this out.
8849           */
8850          object_ref(OBJECT(pn->subsys));
8851      }
8852  
8853      if (!nvme_check_params(n, errp)) {
8854          return;
8855      }
8856  
8857      qbus_init(&n->bus, sizeof(NvmeBus), TYPE_NVME_BUS, dev, dev->id);
8858  
8859      if (nvme_init_subsys(n, errp)) {
8860          return;
8861      }
8862      nvme_init_state(n);
8863      if (!nvme_init_pci(n, pci_dev, errp)) {
8864          return;
8865      }
8866      nvme_init_ctrl(n, pci_dev);
8867  
8868      /* set up a namespace if the controller drive property was given */
8869      if (n->namespace.blkconf.blk) {
8870          ns = &n->namespace;
8871          ns->params.nsid = 1;
8872  
8873          if (nvme_ns_setup(ns, errp)) {
8874              return;
8875          }
8876  
8877          nvme_attach_ns(n, ns);
8878      }
8879  }
8880  
8881  static void nvme_exit(PCIDevice *pci_dev)
8882  {
8883      NvmeCtrl *n = NVME(pci_dev);
8884      NvmeNamespace *ns;
8885      int i;
8886  
8887      nvme_ctrl_reset(n, NVME_RESET_FUNCTION);
8888  
8889      if (n->subsys) {
8890          for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
8891              ns = nvme_ns(n, i);
8892              if (ns) {
8893                  ns->attached--;
8894              }
8895          }
8896  
8897          nvme_subsys_unregister_ctrl(n->subsys, n);
8898      }
8899  
8900      g_free(n->cq);
8901      g_free(n->sq);
8902      g_free(n->aer_reqs);
8903  
8904      if (n->params.cmb_size_mb) {
8905          g_free(n->cmb.buf);
8906      }
8907  
8908      if (pci_dev->doe_spdm.spdm_socket > 0) {
8909          spdm_socket_close(pci_dev->doe_spdm.spdm_socket,
8910                            SPDM_SOCKET_TRANSPORT_TYPE_PCI_DOE);
8911      }
8912  
8913      if (n->pmr.dev) {
8914          host_memory_backend_set_mapped(n->pmr.dev, false);
8915      }
8916  
8917      if (!pci_is_vf(pci_dev) && n->params.sriov_max_vfs) {
8918          pcie_sriov_pf_exit(pci_dev);
8919      }
8920  
8921      if (n->params.msix_exclusive_bar && !pci_is_vf(pci_dev)) {
8922          msix_uninit_exclusive_bar(pci_dev);
8923      } else {
8924          msix_uninit(pci_dev, &n->bar0, &n->bar0);
8925      }
8926  
8927      memory_region_del_subregion(&n->bar0, &n->iomem);
8928  }
8929  
8930  static Property nvme_props[] = {
8931      DEFINE_BLOCK_PROPERTIES(NvmeCtrl, namespace.blkconf),
8932      DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmr.dev, TYPE_MEMORY_BACKEND,
8933                       HostMemoryBackend *),
8934      DEFINE_PROP_LINK("subsys", NvmeCtrl, subsys, TYPE_NVME_SUBSYS,
8935                       NvmeSubsystem *),
8936      DEFINE_PROP_STRING("serial", NvmeCtrl, params.serial),
8937      DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, params.cmb_size_mb, 0),
8938      DEFINE_PROP_UINT32("num_queues", NvmeCtrl, params.num_queues, 0),
8939      DEFINE_PROP_UINT32("max_ioqpairs", NvmeCtrl, params.max_ioqpairs, 64),
8940      DEFINE_PROP_UINT16("msix_qsize", NvmeCtrl, params.msix_qsize, 65),
8941      DEFINE_PROP_UINT8("aerl", NvmeCtrl, params.aerl, 3),
8942      DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
8943      DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7),
8944      DEFINE_PROP_UINT8("vsl", NvmeCtrl, params.vsl, 7),
8945      DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
8946      DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false),
8947      DEFINE_PROP_BOOL("ioeventfd", NvmeCtrl, params.ioeventfd, false),
8948      DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
8949      DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
8950                       params.auto_transition_zones, true),
8951      DEFINE_PROP_UINT16("sriov_max_vfs", NvmeCtrl, params.sriov_max_vfs, 0),
8952      DEFINE_PROP_UINT16("sriov_vq_flexible", NvmeCtrl,
8953                         params.sriov_vq_flexible, 0),
8954      DEFINE_PROP_UINT16("sriov_vi_flexible", NvmeCtrl,
8955                         params.sriov_vi_flexible, 0),
8956      DEFINE_PROP_UINT32("sriov_max_vi_per_vf", NvmeCtrl,
8957                         params.sriov_max_vi_per_vf, 0),
8958      DEFINE_PROP_UINT32("sriov_max_vq_per_vf", NvmeCtrl,
8959                         params.sriov_max_vq_per_vf, 0),
8960      DEFINE_PROP_BOOL("msix-exclusive-bar", NvmeCtrl, params.msix_exclusive_bar,
8961                       false),
8962      DEFINE_PROP_UINT16("mqes", NvmeCtrl, params.mqes, 0x7ff),
8963      DEFINE_PROP_UINT16("spdm_port", PCIDevice, spdm_port, 0),
8964      DEFINE_PROP_BOOL("ctratt.mem", NvmeCtrl, params.ctratt.mem, false),
8965      DEFINE_PROP_BOOL("atomic.dn", NvmeCtrl, params.atomic_dn, 0),
8966      DEFINE_PROP_UINT16("atomic.awun", NvmeCtrl, params.atomic_awun, 0),
8967      DEFINE_PROP_UINT16("atomic.awupf", NvmeCtrl, params.atomic_awupf, 0),
8968      DEFINE_PROP_END_OF_LIST(),
8969  };
8970  
8971  static void nvme_get_smart_warning(Object *obj, Visitor *v, const char *name,
8972                                     void *opaque, Error **errp)
8973  {
8974      NvmeCtrl *n = NVME(obj);
8975      uint8_t value = n->smart_critical_warning;
8976  
8977      visit_type_uint8(v, name, &value, errp);
8978  }
8979  
8980  static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name,
8981                                     void *opaque, Error **errp)
8982  {
8983      NvmeCtrl *n = NVME(obj);
8984      uint8_t value, old_value, cap = 0, index, event;
8985  
8986      if (!visit_type_uint8(v, name, &value, errp)) {
8987          return;
8988      }
8989  
8990      cap = NVME_SMART_SPARE | NVME_SMART_TEMPERATURE | NVME_SMART_RELIABILITY
8991            | NVME_SMART_MEDIA_READ_ONLY | NVME_SMART_FAILED_VOLATILE_MEDIA;
8992      if (NVME_CAP_PMRS(ldq_le_p(&n->bar.cap))) {
8993          cap |= NVME_SMART_PMR_UNRELIABLE;
8994      }
8995  
8996      if ((value & cap) != value) {
8997          error_setg(errp, "unsupported smart critical warning bits: 0x%x",
8998                     value & ~cap);
8999          return;
9000      }
9001  
9002      old_value = n->smart_critical_warning;
9003      n->smart_critical_warning = value;
9004  
9005      /* only inject new bits of smart critical warning */
9006      for (index = 0; index < NVME_SMART_WARN_MAX; index++) {
9007          event = 1 << index;
9008          if (value & ~old_value & event)
9009              nvme_smart_event(n, event);
9010      }
9011  }
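
/*
 * Editor's note: a minimal illustrative sketch, not part of the device model.
 * The loop above raises a SMART event only for warning bits that transition
 * from 0 to 1; value & ~old_value isolates exactly those newly set bits.
 * The helper name is hypothetical.
 */
#include <stdint.h>

static uint8_t nvme_example_new_warning_bits(uint8_t old_value, uint8_t value)
{
    return (uint8_t)(value & ~old_value);
}

/* Example: nvme_example_new_warning_bits(0x01, 0x05) == 0x04 (only bit 2). */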
9012  
9013  static void nvme_pci_reset(DeviceState *qdev)
9014  {
9015      PCIDevice *pci_dev = PCI_DEVICE(qdev);
9016      NvmeCtrl *n = NVME(pci_dev);
9017  
9018      trace_pci_nvme_pci_reset();
9019      nvme_ctrl_reset(n, NVME_RESET_FUNCTION);
9020  }
9021  
9022  static void nvme_sriov_post_write_config(PCIDevice *dev, uint16_t old_num_vfs)
9023  {
9024      NvmeCtrl *n = NVME(dev);
9025      NvmeSecCtrlEntry *sctrl;
9026      int i;
9027  
9028      for (i = pcie_sriov_num_vfs(dev); i < old_num_vfs; i++) {
9029          sctrl = &n->sec_ctrl_list[i];
9030          nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false);
9031      }
9032  }
9033  
9034  static void nvme_pci_write_config(PCIDevice *dev, uint32_t address,
9035                                    uint32_t val, int len)
9036  {
9037      uint16_t old_num_vfs = pcie_sriov_num_vfs(dev);
9038  
9039      if (pcie_find_capability(dev, PCI_EXT_CAP_ID_DOE)) {
9040          pcie_doe_write_config(&dev->doe_spdm, address, val, len);
9041      }
9042      pci_default_write_config(dev, address, val, len);
9043      pcie_cap_flr_write_config(dev, address, val, len);
9044      nvme_sriov_post_write_config(dev, old_num_vfs);
9045  }
9046  
9047  static uint32_t nvme_pci_read_config(PCIDevice *dev, uint32_t address, int len)
9048  {
9049      uint32_t val;
9050      if (dev->spdm_port && pcie_find_capability(dev, PCI_EXT_CAP_ID_DOE)) {
9051          if (pcie_doe_read_config(&dev->doe_spdm, address, len, &val)) {
9052              return val;
9053          }
9054      }
9055      return pci_default_read_config(dev, address, len);
9056  }
9057  
9058  static const VMStateDescription nvme_vmstate = {
9059      .name = "nvme",
9060      .unmigratable = 1,
9061  };
9062  
9063  static void nvme_class_init(ObjectClass *oc, void *data)
9064  {
9065      DeviceClass *dc = DEVICE_CLASS(oc);
9066      PCIDeviceClass *pc = PCI_DEVICE_CLASS(oc);
9067  
9068      pc->realize = nvme_realize;
9069      pc->config_write = nvme_pci_write_config;
9070      pc->config_read = nvme_pci_read_config;
9071      pc->exit = nvme_exit;
9072      pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
9073      pc->revision = 2;
9074  
9075      set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
9076      dc->desc = "Non-Volatile Memory Express";
9077      device_class_set_props(dc, nvme_props);
9078      dc->vmsd = &nvme_vmstate;
9079      device_class_set_legacy_reset(dc, nvme_pci_reset);
9080  }
9081  
9082  static void nvme_instance_init(Object *obj)
9083  {
9084      NvmeCtrl *n = NVME(obj);
9085  
9086      device_add_bootindex_property(obj, &n->namespace.blkconf.bootindex,
9087                                    "bootindex", "/namespace@1,0",
9088                                    DEVICE(obj));
9089  
9090      object_property_add(obj, "smart_critical_warning", "uint8",
9091                          nvme_get_smart_warning,
9092                          nvme_set_smart_warning, NULL, NULL);
9093  }
9094  
9095  static const TypeInfo nvme_info = {
9096      .name          = TYPE_NVME,
9097      .parent        = TYPE_PCI_DEVICE,
9098      .instance_size = sizeof(NvmeCtrl),
9099      .instance_init = nvme_instance_init,
9100      .class_init    = nvme_class_init,
9101      .interfaces = (InterfaceInfo[]) {
9102          { INTERFACE_PCIE_DEVICE },
9103          { }
9104      },
9105  };
9106  
9107  static const TypeInfo nvme_bus_info = {
9108      .name = TYPE_NVME_BUS,
9109      .parent = TYPE_BUS,
9110      .instance_size = sizeof(NvmeBus),
9111  };
9112  
9113  static void nvme_register_types(void)
9114  {
9115      type_register_static(&nvme_info);
9116      type_register_static(&nvme_bus_info);
9117  }
9118  
9119  type_init(nvme_register_types)
9120