hw/block/nvme: expose 'bootindex' property
1 /*
2 * QEMU NVM Express Controller
3 *
4 * Copyright (c) 2012, Intel Corporation
5 *
6 * Written by Keith Busch <keith.busch@intel.com>
7 *
8 * This code is licensed under the GNU GPL v2 or later.
9 */
10
11 /**
12 * Reference Specs: http://www.nvmexpress.org, 1.4, 1.3, 1.2, 1.1, 1.0e
13 *
14 * https://nvmexpress.org/developers/nvme-specification/
15 */
16
17 /**
18 * Usage: add options:
19 * -drive file=<file>,if=none,id=<drive_id>
20 * -device nvme-subsys,id=<subsys_id>,nqn=<nqn_id>
21 * -device nvme,serial=<serial>,id=<bus_name>, \
22 * cmb_size_mb=<cmb_size_mb[optional]>, \
23 * [pmrdev=<mem_backend_file_id>,] \
24 * max_ioqpairs=<N[optional]>, \
25 * aerl=<N[optional]>,aer_max_queued=<N[optional]>, \
26 * mdts=<N[optional]>,vsl=<N[optional]>, \
27 * zoned.zasl=<N[optional]>, \
28 * subsys=<subsys_id>
29 * -device nvme-ns,drive=<drive_id>,bus=<bus_name>,nsid=<nsid>,\
30 * zoned=<true|false[optional]>, \
31 * subsys=<subsys_id>,detached=<true|false[optional]>
32 *
33 * Note that cmb_size_mb denotes the size of the CMB in MB. The CMB is assumed to be at
34 * offset 0 in BAR2 and supports only WDS, RDS and SQS for now. By default, the
35 * device will use the "v1.4 CMB scheme" - use the `legacy-cmb` parameter to
36 * always enable the CMBLOC and CMBSZ registers (v1.3 behavior).
37 *
38 * PMR emulation can be enabled by pointing `pmrdev` at a memory-backend-file object.
39 * For example:
40 * -object memory-backend-file,id=<mem_id>,share=on,mem-path=<file_path>, \
41 * size=<size> .... -device nvme,...,pmrdev=<mem_id>
42 *
43 * The PMR will use BAR 4/5 exclusively.
44 *
45 * To place controller(s) and namespace(s) in a subsystem, provide the
46 * nvme-subsys device as shown above.
47 *
48 * nvme subsystem device parameters
49 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
50 * - `nqn`
51 * This parameter provides the `<nqn_id>` part of the string
52 * `nqn.2019-08.org.qemu:<nqn_id>` which will be reported in the SUBNQN field
53 * of subsystem controllers. Note that `<nqn_id>` should be unique per
54 * subsystem, but this is not enforced by QEMU. If not specified, it will
55 * default to the value of the `id` parameter (`<subsys_id>`).
56 *
57 * nvme device parameters
58 * ~~~~~~~~~~~~~~~~~~~~~~
59 * - `subsys`
60 * Specifying this parameter attaches the controller to the subsystem and
61 * the SUBNQN field in the controller will report the NQN of the subsystem
62 * device. This also enables the multi-controller capability, reported in
63 * the CMIC (Controller Multi-Path I/O and Namespace Sharing Capabilities)
64 * field of the Identify Controller data structure.
65 *
66 * - `aerl`
67 * The Asynchronous Event Request Limit (AERL). Indicates the maximum number
68 * of concurrently outstanding Asynchronous Event Request commands supported
69 * by the controller. This is a 0's based value.
70 *
71 * - `aer_max_queued`
72 * This is the maximum number of events that the device will enqueue for
73 * completion when there are no outstanding AERs. When the maximum number of
74 * enqueued events is reached, subsequent events will be dropped.
75 *
76 * - `mdts`
77 * Indicates the maximum data transfer size for a command that transfers data
78 * between host-accessible memory and the controller. The value is specified
79 * as a power of two (2^n) and is in units of the minimum memory page size
80 * (CAP.MPSMIN). The default value is 7 (i.e. 512 KiB).
81 *
82 * - `vsl`
83 * Indicates the maximum data size limit for the Verify command. Like `mdts`,
84 * this value is specified as a power of two (2^n) and is in units of the
85 * minimum memory page size (CAP.MPSMIN). The default value is 7 (i.e. 512
86 * KiB).
87 *
88 * - `zoned.zasl`
89 * Indicates the maximum data transfer size for the Zone Append command. Like
90 * `mdts`, the value is specified as a power of two (2^n) and is in units of
91 * the minimum memory page size (CAP.MPSMIN). The default value is 0 (i.e.
92 * defaulting to the value of `mdts`).
93 *
94 * nvme namespace device parameters
95 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
96 * - `subsys`
97 * If given, the namespace will be attached to all controllers in the
98 * subsystem. Otherwise, `bus` must be given to attach this namespace to a
99 * specific controller as a non-shared namespace.
100 *
101 * - `detached`
102 * This parameter is only valid together with the `subsys` parameter. If left
103 * at the default value (`false/off`), the namespace will be attached to all
104 * controllers in the NVMe subsystem at boot-up. If set to `true/on`, the
105 * namespace will be available in the subsystem but not attached to any
106 * controllers.
107 *
108 * Setting `zoned` to true selects the Zoned Command Set for the namespace.
109 * In this case, the following namespace properties are available to configure
110 * zoned operation:
111 * zoned.zone_size=<zone size in bytes, default: 128MiB>
112 * The number may be followed by K, M, G as in kilo-, mega- or giga-.
113 *
114 * zoned.zone_capacity=<zone capacity in bytes, default: zone size>
115 * The value 0 (default) forces zone capacity to be the same as zone
116 * size. The value of this property may not exceed zone size.
117 *
118 * zoned.descr_ext_size=<zone descriptor extension size, default 0>
119 * This value needs to be specified in 64B units. If it is zero,
120 * namespace(s) will not support zone descriptor extensions.
121 *
122 * zoned.max_active=<Maximum Active Resources (zones), default: 0>
123 * The default value means there is no limit to the number of
124 * concurrently active zones.
125 *
126 * zoned.max_open=<Maximum Open Resources (zones), default: 0>
127 * The default value means there is no limit to the number of
128 * concurrently open zones.
129 *
130 * zoned.cross_read=<enable RAZB, default: false>
131 * Setting this property to true enables Read Across Zone Boundaries.
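 *
 * Example (a hypothetical invocation; the IDs, paths and sizes below are
 * placeholders, not values mandated by this device):
 *
 *   -drive file=/path/to/zns.img,if=none,id=drv0
 *   -device nvme-subsys,id=subsys0,nqn=subsys0
 *   -device nvme,serial=deadbeef,id=nvme0,subsys=subsys0,mdts=7
 *   -device nvme-ns,drive=drv0,nsid=1,subsys=subsys0,zoned=true, \
 *      zoned.zone_size=128M,zoned.zone_capacity=96M, \
 *      zoned.max_open=16,zoned.max_active=32,zoned.cross_read=true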
132 */
133
134 #include "qemu/osdep.h"
135 #include "qemu/units.h"
136 #include "qemu/error-report.h"
137 #include "hw/block/block.h"
138 #include "hw/pci/msix.h"
139 #include "hw/pci/pci.h"
140 #include "hw/qdev-properties.h"
141 #include "migration/vmstate.h"
142 #include "sysemu/sysemu.h"
143 #include "qapi/error.h"
144 #include "qapi/visitor.h"
145 #include "sysemu/hostmem.h"
146 #include "sysemu/block-backend.h"
147 #include "exec/memory.h"
148 #include "qemu/log.h"
149 #include "qemu/module.h"
150 #include "qemu/cutils.h"
151 #include "trace.h"
152 #include "nvme.h"
153 #include "nvme-ns.h"
154 #include "nvme-dif.h"
155
156 #define NVME_MAX_IOQPAIRS 0xffff
157 #define NVME_DB_SIZE 4
158 #define NVME_SPEC_VER 0x00010400
159 #define NVME_CMB_BIR 2
160 #define NVME_PMR_BIR 4
161 #define NVME_TEMPERATURE 0x143
162 #define NVME_TEMPERATURE_WARNING 0x157
163 #define NVME_TEMPERATURE_CRITICAL 0x175
164 #define NVME_NUM_FW_SLOTS 1
165
166 #define NVME_GUEST_ERR(trace, fmt, ...) \
167 do { \
168 (trace_##trace)(__VA_ARGS__); \
169 qemu_log_mask(LOG_GUEST_ERROR, #trace \
170 " in %s: " fmt "\n", __func__, ## __VA_ARGS__); \
171 } while (0)
172
173 static const bool nvme_feature_support[NVME_FID_MAX] = {
174 [NVME_ARBITRATION] = true,
175 [NVME_POWER_MANAGEMENT] = true,
176 [NVME_TEMPERATURE_THRESHOLD] = true,
177 [NVME_ERROR_RECOVERY] = true,
178 [NVME_VOLATILE_WRITE_CACHE] = true,
179 [NVME_NUMBER_OF_QUEUES] = true,
180 [NVME_INTERRUPT_COALESCING] = true,
181 [NVME_INTERRUPT_VECTOR_CONF] = true,
182 [NVME_WRITE_ATOMICITY] = true,
183 [NVME_ASYNCHRONOUS_EVENT_CONF] = true,
184 [NVME_TIMESTAMP] = true,
185 };
186
187 static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
188 [NVME_TEMPERATURE_THRESHOLD] = NVME_FEAT_CAP_CHANGE,
189 [NVME_ERROR_RECOVERY] = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
190 [NVME_VOLATILE_WRITE_CACHE] = NVME_FEAT_CAP_CHANGE,
191 [NVME_NUMBER_OF_QUEUES] = NVME_FEAT_CAP_CHANGE,
192 [NVME_ASYNCHRONOUS_EVENT_CONF] = NVME_FEAT_CAP_CHANGE,
193 [NVME_TIMESTAMP] = NVME_FEAT_CAP_CHANGE,
194 };
195
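/*
 * Commands Supported and Effects (CSE) log entries for the Admin Command Set
 * (and, below, for the NVM and Zoned I/O command sets). CSUPP marks a command
 * as supported; NIC and LBCC flag commands that may change the namespace
 * inventory or logical block content, respectively.
 */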
196 static const uint32_t nvme_cse_acs[256] = {
197 [NVME_ADM_CMD_DELETE_SQ] = NVME_CMD_EFF_CSUPP,
198 [NVME_ADM_CMD_CREATE_SQ] = NVME_CMD_EFF_CSUPP,
199 [NVME_ADM_CMD_GET_LOG_PAGE] = NVME_CMD_EFF_CSUPP,
200 [NVME_ADM_CMD_DELETE_CQ] = NVME_CMD_EFF_CSUPP,
201 [NVME_ADM_CMD_CREATE_CQ] = NVME_CMD_EFF_CSUPP,
202 [NVME_ADM_CMD_IDENTIFY] = NVME_CMD_EFF_CSUPP,
203 [NVME_ADM_CMD_ABORT] = NVME_CMD_EFF_CSUPP,
204 [NVME_ADM_CMD_SET_FEATURES] = NVME_CMD_EFF_CSUPP,
205 [NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFF_CSUPP,
206 [NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP,
207 [NVME_ADM_CMD_NS_ATTACHMENT] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC,
208 [NVME_ADM_CMD_FORMAT_NVM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
209 };
210
211 static const uint32_t nvme_cse_iocs_none[256];
212
213 static const uint32_t nvme_cse_iocs_nvm[256] = {
214 [NVME_CMD_FLUSH] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
215 [NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
216 [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
217 [NVME_CMD_READ] = NVME_CMD_EFF_CSUPP,
218 [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
219 [NVME_CMD_VERIFY] = NVME_CMD_EFF_CSUPP,
220 [NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
221 [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP,
222 };
223
224 static const uint32_t nvme_cse_iocs_zoned[256] = {
225 [NVME_CMD_FLUSH] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
226 [NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
227 [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
228 [NVME_CMD_READ] = NVME_CMD_EFF_CSUPP,
229 [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
230 [NVME_CMD_VERIFY] = NVME_CMD_EFF_CSUPP,
231 [NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
232 [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP,
233 [NVME_CMD_ZONE_APPEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
234 [NVME_CMD_ZONE_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
235 [NVME_CMD_ZONE_MGMT_RECV] = NVME_CMD_EFF_CSUPP,
236 };
237
238 static void nvme_process_sq(void *opaque);
239
240 static uint16_t nvme_sqid(NvmeRequest *req)
241 {
242 return le16_to_cpu(req->sq->sqid);
243 }
244
245 static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone,
246 NvmeZoneState state)
247 {
248 if (QTAILQ_IN_USE(zone, entry)) {
249 switch (nvme_get_zone_state(zone)) {
250 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
251 QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry);
252 break;
253 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
254 QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
255 break;
256 case NVME_ZONE_STATE_CLOSED:
257 QTAILQ_REMOVE(&ns->closed_zones, zone, entry);
258 break;
259 case NVME_ZONE_STATE_FULL:
260 QTAILQ_REMOVE(&ns->full_zones, zone, entry);
261 default:
262 ;
263 }
264 }
265
266 nvme_set_zone_state(zone, state);
267
268 switch (state) {
269 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
270 QTAILQ_INSERT_TAIL(&ns->exp_open_zones, zone, entry);
271 break;
272 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
273 QTAILQ_INSERT_TAIL(&ns->imp_open_zones, zone, entry);
274 break;
275 case NVME_ZONE_STATE_CLOSED:
276 QTAILQ_INSERT_TAIL(&ns->closed_zones, zone, entry);
277 break;
278 case NVME_ZONE_STATE_FULL:
279 QTAILQ_INSERT_TAIL(&ns->full_zones, zone, entry);
280 case NVME_ZONE_STATE_READ_ONLY:
281 break;
282 default:
283 zone->d.za = 0;
284 }
285 }
286
287 /*
288 * Check if we can open a zone without exceeding open/active limits.
289 * AOR stands for "Active and Open Resources" (see TP 4053 section 2.5).
290 */
291 static int nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn)
292 {
293 if (ns->params.max_active_zones != 0 &&
294 ns->nr_active_zones + act > ns->params.max_active_zones) {
295 trace_pci_nvme_err_insuff_active_res(ns->params.max_active_zones);
296 return NVME_ZONE_TOO_MANY_ACTIVE | NVME_DNR;
297 }
298 if (ns->params.max_open_zones != 0 &&
299 ns->nr_open_zones + opn > ns->params.max_open_zones) {
300 trace_pci_nvme_err_insuff_open_res(ns->params.max_open_zones);
301 return NVME_ZONE_TOO_MANY_OPEN | NVME_DNR;
302 }
303
304 return NVME_SUCCESS;
305 }
306
307 static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
308 {
309 hwaddr hi, lo;
310
311 if (!n->cmb.cmse) {
312 return false;
313 }
314
315 lo = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
316 hi = lo + int128_get64(n->cmb.mem.size);
317
318 return addr >= lo && addr < hi;
319 }
320
321 static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
322 {
323 hwaddr base = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
324 return &n->cmb.buf[addr - base];
325 }
326
327 static bool nvme_addr_is_pmr(NvmeCtrl *n, hwaddr addr)
328 {
329 hwaddr hi;
330
331 if (!n->pmr.cmse) {
332 return false;
333 }
334
335 hi = n->pmr.cba + int128_get64(n->pmr.dev->mr.size);
336
337 return addr >= n->pmr.cba && addr < hi;
338 }
339
340 static inline void *nvme_addr_to_pmr(NvmeCtrl *n, hwaddr addr)
341 {
342 return memory_region_get_ram_ptr(&n->pmr.dev->mr) + (addr - n->pmr.cba);
343 }
344
345 static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
346 {
347 hwaddr hi = addr + size - 1;
348 if (hi < addr) {
349 return 1;
350 }
351
352 if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
353 memcpy(buf, nvme_addr_to_cmb(n, addr), size);
354 return 0;
355 }
356
357 if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
358 memcpy(buf, nvme_addr_to_pmr(n, addr), size);
359 return 0;
360 }
361
362 return pci_dma_read(&n->parent_obj, addr, buf, size);
363 }
364
365 static int nvme_addr_write(NvmeCtrl *n, hwaddr addr, void *buf, int size)
366 {
367 hwaddr hi = addr + size - 1;
368 if (hi < addr) {
369 return 1;
370 }
371
372 if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
373 memcpy(nvme_addr_to_cmb(n, addr), buf, size);
374 return 0;
375 }
376
377 if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
378 memcpy(nvme_addr_to_pmr(n, addr), buf, size);
379 return 0;
380 }
381
382 return pci_dma_write(&n->parent_obj, addr, buf, size);
383 }
384
385 static bool nvme_nsid_valid(NvmeCtrl *n, uint32_t nsid)
386 {
387 return nsid && (nsid == NVME_NSID_BROADCAST || nsid <= n->num_namespaces);
388 }
389
390 static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
391 {
392 return sqid < n->params.max_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1;
393 }
394
395 static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
396 {
397 return cqid < n->params.max_ioqpairs + 1 && n->cq[cqid] != NULL ? 0 : -1;
398 }
399
400 static void nvme_inc_cq_tail(NvmeCQueue *cq)
401 {
402 cq->tail++;
403 if (cq->tail >= cq->size) {
404 cq->tail = 0;
405 cq->phase = !cq->phase;
406 }
407 }
408
409 static void nvme_inc_sq_head(NvmeSQueue *sq)
410 {
411 sq->head = (sq->head + 1) % sq->size;
412 }
413
414 static uint8_t nvme_cq_full(NvmeCQueue *cq)
415 {
416 return (cq->tail + 1) % cq->size == cq->head;
417 }
418
419 static uint8_t nvme_sq_empty(NvmeSQueue *sq)
420 {
421 return sq->head == sq->tail;
422 }
423
424 static void nvme_irq_check(NvmeCtrl *n)
425 {
426 if (msix_enabled(&(n->parent_obj))) {
427 return;
428 }
429 if (~n->bar.intms & n->irq_status) {
430 pci_irq_assert(&n->parent_obj);
431 } else {
432 pci_irq_deassert(&n->parent_obj);
433 }
434 }
435
436 static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
437 {
438 if (cq->irq_enabled) {
439 if (msix_enabled(&(n->parent_obj))) {
440 trace_pci_nvme_irq_msix(cq->vector);
441 msix_notify(&(n->parent_obj), cq->vector);
442 } else {
443 trace_pci_nvme_irq_pin();
444 assert(cq->vector < 32);
445 n->irq_status |= 1 << cq->vector;
446 nvme_irq_check(n);
447 }
448 } else {
449 trace_pci_nvme_irq_masked();
450 }
451 }
452
453 static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq)
454 {
455 if (cq->irq_enabled) {
456 if (msix_enabled(&(n->parent_obj))) {
457 return;
458 } else {
459 assert(cq->vector < 32);
460 n->irq_status &= ~(1 << cq->vector);
461 nvme_irq_check(n);
462 }
463 }
464 }
465
466 static void nvme_req_clear(NvmeRequest *req)
467 {
468 req->ns = NULL;
469 req->opaque = NULL;
470 memset(&req->cqe, 0x0, sizeof(req->cqe));
471 req->status = NVME_SUCCESS;
472 }
473
474 static inline void nvme_sg_init(NvmeCtrl *n, NvmeSg *sg, bool dma)
475 {
476 if (dma) {
477 pci_dma_sglist_init(&sg->qsg, &n->parent_obj, 0);
478 sg->flags = NVME_SG_DMA;
479 } else {
480 qemu_iovec_init(&sg->iov, 0);
481 }
482
483 sg->flags |= NVME_SG_ALLOC;
484 }
485
486 static inline void nvme_sg_unmap(NvmeSg *sg)
487 {
488 if (!(sg->flags & NVME_SG_ALLOC)) {
489 return;
490 }
491
492 if (sg->flags & NVME_SG_DMA) {
493 qemu_sglist_destroy(&sg->qsg);
494 } else {
495 qemu_iovec_destroy(&sg->iov);
496 }
497
498 memset(sg, 0x0, sizeof(*sg));
499 }
500
501 /*
502 * When metadata is transferred as extended LBAs, the DPTR mapped into `sg`
503 * holds both data and metadata. This function splits the data and metadata
504 * into two separate QSG/IOVs.
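 *
 * For example, with 512 byte logical blocks and 8 bytes of metadata per
 * block, the extended LBA layout in `sg` is
 *   [512B data][8B meta][512B data][8B meta]...
 * and this function gathers the data portions into `data` and the metadata
 * portions into `mdata` (either may be NULL to discard that part).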
505 */
506 static void nvme_sg_split(NvmeSg *sg, NvmeNamespace *ns, NvmeSg *data,
507 NvmeSg *mdata)
508 {
509 NvmeSg *dst = data;
510 size_t size = nvme_lsize(ns);
511 size_t msize = nvme_msize(ns);
512 uint32_t trans_len, count = size;
513 uint64_t offset = 0;
514 bool dma = sg->flags & NVME_SG_DMA;
515 size_t sge_len;
516 size_t sg_len = dma ? sg->qsg.size : sg->iov.size;
517 int sg_idx = 0;
518
519 assert(sg->flags & NVME_SG_ALLOC);
520
521 while (sg_len) {
522 sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;
523
524 trans_len = MIN(sg_len, count);
525 trans_len = MIN(trans_len, sge_len - offset);
526
527 if (dst) {
528 if (dma) {
529 qemu_sglist_add(&dst->qsg, sg->qsg.sg[sg_idx].base + offset,
530 trans_len);
531 } else {
532 qemu_iovec_add(&dst->iov,
533 sg->iov.iov[sg_idx].iov_base + offset,
534 trans_len);
535 }
536 }
537
538 sg_len -= trans_len;
539 count -= trans_len;
540 offset += trans_len;
541
542 if (count == 0) {
543 dst = (dst == data) ? mdata : data;
544 count = (dst == data) ? size : msize;
545 }
546
547 if (sge_len == offset) {
548 offset = 0;
549 sg_idx++;
550 }
551 }
552 }
553
554 static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
555 size_t len)
556 {
557 if (!len) {
558 return NVME_SUCCESS;
559 }
560
561 trace_pci_nvme_map_addr_cmb(addr, len);
562
563 if (!nvme_addr_is_cmb(n, addr) || !nvme_addr_is_cmb(n, addr + len - 1)) {
564 return NVME_DATA_TRAS_ERROR;
565 }
566
567 qemu_iovec_add(iov, nvme_addr_to_cmb(n, addr), len);
568
569 return NVME_SUCCESS;
570 }
571
572 static uint16_t nvme_map_addr_pmr(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
573 size_t len)
574 {
575 if (!len) {
576 return NVME_SUCCESS;
577 }
578
579 if (!nvme_addr_is_pmr(n, addr) || !nvme_addr_is_pmr(n, addr + len - 1)) {
580 return NVME_DATA_TRAS_ERROR;
581 }
582
583 qemu_iovec_add(iov, nvme_addr_to_pmr(n, addr), len);
584
585 return NVME_SUCCESS;
586 }
587
588 static uint16_t nvme_map_addr(NvmeCtrl *n, NvmeSg *sg, hwaddr addr, size_t len)
589 {
590 bool cmb = false, pmr = false;
591
592 if (!len) {
593 return NVME_SUCCESS;
594 }
595
596 trace_pci_nvme_map_addr(addr, len);
597
598 if (nvme_addr_is_cmb(n, addr)) {
599 cmb = true;
600 } else if (nvme_addr_is_pmr(n, addr)) {
601 pmr = true;
602 }
603
604 if (cmb || pmr) {
605 if (sg->flags & NVME_SG_DMA) {
606 return NVME_INVALID_USE_OF_CMB | NVME_DNR;
607 }
608
609 if (cmb) {
610 return nvme_map_addr_cmb(n, &sg->iov, addr, len);
611 } else {
612 return nvme_map_addr_pmr(n, &sg->iov, addr, len);
613 }
614 }
615
616 if (!(sg->flags & NVME_SG_DMA)) {
617 return NVME_INVALID_USE_OF_CMB | NVME_DNR;
618 }
619
620 qemu_sglist_add(&sg->qsg, addr, len);
621
622 return NVME_SUCCESS;
623 }
624
625 static inline bool nvme_addr_is_dma(NvmeCtrl *n, hwaddr addr)
626 {
627 return !(nvme_addr_is_cmb(n, addr) || nvme_addr_is_pmr(n, addr));
628 }
629
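/*
 * Map a PRP1/PRP2 pair into `sg`. PRP1 points into the first memory page
 * (possibly at a non-zero offset). If the transfer extends beyond that page,
 * PRP2 is either the (page aligned) second data pointer or, when the transfer
 * spans more than two pages, a pointer to a PRP list of further page aligned
 * entries.
 */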
630 static uint16_t nvme_map_prp(NvmeCtrl *n, NvmeSg *sg, uint64_t prp1,
631 uint64_t prp2, uint32_t len)
632 {
633 hwaddr trans_len = n->page_size - (prp1 % n->page_size);
634 trans_len = MIN(len, trans_len);
635 int num_prps = (len >> n->page_bits) + 1;
636 uint16_t status;
637 int ret;
638
639 trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps);
640
641 nvme_sg_init(n, sg, nvme_addr_is_dma(n, prp1));
642
643 status = nvme_map_addr(n, sg, prp1, trans_len);
644 if (status) {
645 goto unmap;
646 }
647
648 len -= trans_len;
649 if (len) {
650 if (len > n->page_size) {
651 uint64_t prp_list[n->max_prp_ents];
652 uint32_t nents, prp_trans;
653 int i = 0;
654
655 nents = (len + n->page_size - 1) >> n->page_bits;
656 prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
657 ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
658 if (ret) {
659 trace_pci_nvme_err_addr_read(prp2);
660 status = NVME_DATA_TRAS_ERROR;
661 goto unmap;
662 }
663 while (len != 0) {
664 uint64_t prp_ent = le64_to_cpu(prp_list[i]);
665
666 if (i == n->max_prp_ents - 1 && len > n->page_size) {
667 if (unlikely(prp_ent & (n->page_size - 1))) {
668 trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
669 status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
670 goto unmap;
671 }
672
673 i = 0;
674 nents = (len + n->page_size - 1) >> n->page_bits;
675 prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
676 ret = nvme_addr_read(n, prp_ent, (void *)prp_list,
677 prp_trans);
678 if (ret) {
679 trace_pci_nvme_err_addr_read(prp_ent);
680 status = NVME_DATA_TRAS_ERROR;
681 goto unmap;
682 }
683 prp_ent = le64_to_cpu(prp_list[i]);
684 }
685
686 if (unlikely(prp_ent & (n->page_size - 1))) {
687 trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
688 status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
689 goto unmap;
690 }
691
692 trans_len = MIN(len, n->page_size);
693 status = nvme_map_addr(n, sg, prp_ent, trans_len);
694 if (status) {
695 goto unmap;
696 }
697
698 len -= trans_len;
699 i++;
700 }
701 } else {
702 if (unlikely(prp2 & (n->page_size - 1))) {
703 trace_pci_nvme_err_invalid_prp2_align(prp2);
704 status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
705 goto unmap;
706 }
707 status = nvme_map_addr(n, sg, prp2, len);
708 if (status) {
709 goto unmap;
710 }
711 }
712 }
713
714 return NVME_SUCCESS;
715
716 unmap:
717 nvme_sg_unmap(sg);
718 return status;
719 }
720
721 /*
722 * Map 'nsgld' data descriptors from 'segment'. The function subtracts the
723 * number of bytes mapped from *len.
724 */
725 static uint16_t nvme_map_sgl_data(NvmeCtrl *n, NvmeSg *sg,
726 NvmeSglDescriptor *segment, uint64_t nsgld,
727 size_t *len, NvmeCmd *cmd)
728 {
729 dma_addr_t addr, trans_len;
730 uint32_t dlen;
731 uint16_t status;
732
733 for (int i = 0; i < nsgld; i++) {
734 uint8_t type = NVME_SGL_TYPE(segment[i].type);
735
736 switch (type) {
737 case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
738 if (cmd->opcode == NVME_CMD_WRITE) {
739 continue;
740 }
741 case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
742 break;
743 case NVME_SGL_DESCR_TYPE_SEGMENT:
744 case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
745 return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
746 default:
747 return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
748 }
749
750 dlen = le32_to_cpu(segment[i].len);
751
752 if (!dlen) {
753 continue;
754 }
755
756 if (*len == 0) {
757 /*
758 * All data has been mapped, but the SGL contains additional
759 * segments and/or descriptors. The controller might accept
760 * ignoring the rest of the SGL.
761 */
762 uint32_t sgls = le32_to_cpu(n->id_ctrl.sgls);
763 if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
764 break;
765 }
766
767 trace_pci_nvme_err_invalid_sgl_excess_length(dlen);
768 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
769 }
770
771 trans_len = MIN(*len, dlen);
772
773 if (type == NVME_SGL_DESCR_TYPE_BIT_BUCKET) {
774 goto next;
775 }
776
777 addr = le64_to_cpu(segment[i].addr);
778
779 if (UINT64_MAX - addr < dlen) {
780 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
781 }
782
783 status = nvme_map_addr(n, sg, addr, trans_len);
784 if (status) {
785 return status;
786 }
787
788 next:
789 *len -= trans_len;
790 }
791
792 return NVME_SUCCESS;
793 }
794
795 static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl,
796 size_t len, NvmeCmd *cmd)
797 {
798 /*
799 * Read the segment in chunks of 256 descriptors (one 4k page) to avoid
800 * dynamically allocating a potentially huge SGL. The spec allows the SGL
801 * to be larger (as in number of bytes required to describe the SGL
802 * descriptors and segment chain) than the command transfer size, so it is
803 * not bounded by MDTS.
804 */
805 const int SEG_CHUNK_SIZE = 256;
806
807 NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld;
808 uint64_t nsgld;
809 uint32_t seg_len;
810 uint16_t status;
811 hwaddr addr;
812 int ret;
813
814 sgld = &sgl;
815 addr = le64_to_cpu(sgl.addr);
816
817 trace_pci_nvme_map_sgl(NVME_SGL_TYPE(sgl.type), len);
818
819 nvme_sg_init(n, sg, nvme_addr_is_dma(n, addr));
820
821 /*
822 * If the entire transfer can be described with a single data block it can
823 * be mapped directly.
824 */
825 if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
826 status = nvme_map_sgl_data(n, sg, sgld, 1, &len, cmd);
827 if (status) {
828 goto unmap;
829 }
830
831 goto out;
832 }
833
834 for (;;) {
835 switch (NVME_SGL_TYPE(sgld->type)) {
836 case NVME_SGL_DESCR_TYPE_SEGMENT:
837 case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
838 break;
839 default:
840 return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
841 }
842
843 seg_len = le32_to_cpu(sgld->len);
844
845 /* check the length of the (Last) Segment descriptor */
846 if ((!seg_len || seg_len & 0xf) &&
847 (NVME_SGL_TYPE(sgld->type) != NVME_SGL_DESCR_TYPE_BIT_BUCKET)) {
848 return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
849 }
850
851 if (UINT64_MAX - addr < seg_len) {
852 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
853 }
854
855 nsgld = seg_len / sizeof(NvmeSglDescriptor);
856
857 while (nsgld > SEG_CHUNK_SIZE) {
858 if (nvme_addr_read(n, addr, segment, sizeof(segment))) {
859 trace_pci_nvme_err_addr_read(addr);
860 status = NVME_DATA_TRAS_ERROR;
861 goto unmap;
862 }
863
864 status = nvme_map_sgl_data(n, sg, segment, SEG_CHUNK_SIZE,
865 &len, cmd);
866 if (status) {
867 goto unmap;
868 }
869
870 nsgld -= SEG_CHUNK_SIZE;
871 addr += SEG_CHUNK_SIZE * sizeof(NvmeSglDescriptor);
872 }
873
874 ret = nvme_addr_read(n, addr, segment, nsgld *
875 sizeof(NvmeSglDescriptor));
876 if (ret) {
877 trace_pci_nvme_err_addr_read(addr);
878 status = NVME_DATA_TRAS_ERROR;
879 goto unmap;
880 }
881
882 last_sgld = &segment[nsgld - 1];
883
884 /*
885 * If the segment ends with a Data Block or Bit Bucket Descriptor Type,
886 * then we are done.
887 */
888 switch (NVME_SGL_TYPE(last_sgld->type)) {
889 case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
890 case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
891 status = nvme_map_sgl_data(n, sg, segment, nsgld, &len, cmd);
892 if (status) {
893 goto unmap;
894 }
895
896 goto out;
897
898 default:
899 break;
900 }
901
902 /*
903 * If the last descriptor was not a Data Block or Bit Bucket, then the
904 * current segment must not be a Last Segment.
905 */
906 if (NVME_SGL_TYPE(sgld->type) == NVME_SGL_DESCR_TYPE_LAST_SEGMENT) {
907 status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
908 goto unmap;
909 }
910
911 sgld = last_sgld;
912 addr = le64_to_cpu(sgld->addr);
913
914 /*
915 * Do not map the last descriptor; it will be a Segment or Last Segment
916 * descriptor and is handled by the next iteration.
917 */
918 status = nvme_map_sgl_data(n, sg, segment, nsgld - 1, &len, cmd);
919 if (status) {
920 goto unmap;
921 }
922 }
923
924 out:
925 /* if there is any residual left in len, the SGL was too short */
926 if (len) {
927 status = NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
928 goto unmap;
929 }
930
931 return NVME_SUCCESS;
932
933 unmap:
934 nvme_sg_unmap(sg);
935 return status;
936 }
937
938 uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
939 NvmeCmd *cmd)
940 {
941 uint64_t prp1, prp2;
942
943 switch (NVME_CMD_FLAGS_PSDT(cmd->flags)) {
944 case NVME_PSDT_PRP:
945 prp1 = le64_to_cpu(cmd->dptr.prp1);
946 prp2 = le64_to_cpu(cmd->dptr.prp2);
947
948 return nvme_map_prp(n, sg, prp1, prp2, len);
949 case NVME_PSDT_SGL_MPTR_CONTIGUOUS:
950 case NVME_PSDT_SGL_MPTR_SGL:
951 return nvme_map_sgl(n, sg, cmd->dptr.sgl, len, cmd);
952 default:
953 return NVME_INVALID_FIELD;
954 }
955 }
956
957 static uint16_t nvme_map_mptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
958 NvmeCmd *cmd)
959 {
960 int psdt = NVME_CMD_FLAGS_PSDT(cmd->flags);
961 hwaddr mptr = le64_to_cpu(cmd->mptr);
962 uint16_t status;
963
964 if (psdt == NVME_PSDT_SGL_MPTR_SGL) {
965 NvmeSglDescriptor sgl;
966
967 if (nvme_addr_read(n, mptr, &sgl, sizeof(sgl))) {
968 return NVME_DATA_TRAS_ERROR;
969 }
970
971 status = nvme_map_sgl(n, sg, sgl, len, cmd);
972 if (status && (status & 0x7ff) == NVME_DATA_SGL_LEN_INVALID) {
973 status = NVME_MD_SGL_LEN_INVALID | NVME_DNR;
974 }
975
976 return status;
977 }
978
979 nvme_sg_init(n, sg, nvme_addr_is_dma(n, mptr));
980 status = nvme_map_addr(n, sg, mptr, len);
981 if (status) {
982 nvme_sg_unmap(sg);
983 }
984
985 return status;
986 }
987
988 static uint16_t nvme_map_data(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
989 {
990 NvmeNamespace *ns = req->ns;
991 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
992 uint16_t ctrl = le16_to_cpu(rw->control);
993 size_t len = nvme_l2b(ns, nlb);
994 uint16_t status;
995
996 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) &&
997 (ctrl & NVME_RW_PRINFO_PRACT && nvme_msize(ns) == 8)) {
998 goto out;
999 }
1000
1001 if (nvme_ns_ext(ns)) {
1002 NvmeSg sg;
1003
1004 len += nvme_m2b(ns, nlb);
1005
1006 status = nvme_map_dptr(n, &sg, len, &req->cmd);
1007 if (status) {
1008 return status;
1009 }
1010
1011 nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
1012 nvme_sg_split(&sg, ns, &req->sg, NULL);
1013 nvme_sg_unmap(&sg);
1014
1015 return NVME_SUCCESS;
1016 }
1017
1018 out:
1019 return nvme_map_dptr(n, &req->sg, len, &req->cmd);
1020 }
1021
1022 static uint16_t nvme_map_mdata(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
1023 {
1024 NvmeNamespace *ns = req->ns;
1025 size_t len = nvme_m2b(ns, nlb);
1026 uint16_t status;
1027
1028 if (nvme_ns_ext(ns)) {
1029 NvmeSg sg;
1030
1031 len += nvme_l2b(ns, nlb);
1032
1033 status = nvme_map_dptr(n, &sg, len, &req->cmd);
1034 if (status) {
1035 return status;
1036 }
1037
1038 nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
1039 nvme_sg_split(&sg, ns, NULL, &req->sg);
1040 nvme_sg_unmap(&sg);
1041
1042 return NVME_SUCCESS;
1043 }
1044
1045 return nvme_map_mptr(n, &req->sg, len, &req->cmd);
1046 }
1047
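/*
 * Copy to/from an interleaved (extended LBA) mapping. Starting at `offset`
 * into the scatter list, transfer `bytes`-sized chunks to/from `ptr`,
 * skipping `skip_bytes` in the mapping between chunks, until `len` bytes have
 * been transferred. This is what lets nvme_bounce_data() and
 * nvme_bounce_mdata() pick out only the data or only the metadata portion of
 * an extended LBA transfer.
 */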
1048 static uint16_t nvme_tx_interleaved(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr,
1049 uint32_t len, uint32_t bytes,
1050 int32_t skip_bytes, int64_t offset,
1051 NvmeTxDirection dir)
1052 {
1053 hwaddr addr;
1054 uint32_t trans_len, count = bytes;
1055 bool dma = sg->flags & NVME_SG_DMA;
1056 int64_t sge_len;
1057 int sg_idx = 0;
1058 int ret;
1059
1060 assert(sg->flags & NVME_SG_ALLOC);
1061
1062 while (len) {
1063 sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;
1064
1065 if (sge_len - offset < 0) {
1066 offset -= sge_len;
1067 sg_idx++;
1068 continue;
1069 }
1070
1071 if (sge_len == offset) {
1072 offset = 0;
1073 sg_idx++;
1074 continue;
1075 }
1076
1077 trans_len = MIN(len, count);
1078 trans_len = MIN(trans_len, sge_len - offset);
1079
1080 if (dma) {
1081 addr = sg->qsg.sg[sg_idx].base + offset;
1082 } else {
1083 addr = (hwaddr)(uintptr_t)sg->iov.iov[sg_idx].iov_base + offset;
1084 }
1085
1086 if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1087 ret = nvme_addr_read(n, addr, ptr, trans_len);
1088 } else {
1089 ret = nvme_addr_write(n, addr, ptr, trans_len);
1090 }
1091
1092 if (ret) {
1093 return NVME_DATA_TRAS_ERROR;
1094 }
1095
1096 ptr += trans_len;
1097 len -= trans_len;
1098 count -= trans_len;
1099 offset += trans_len;
1100
1101 if (count == 0) {
1102 count = bytes;
1103 offset += skip_bytes;
1104 }
1105 }
1106
1107 return NVME_SUCCESS;
1108 }
1109
1110 static uint16_t nvme_tx(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr, uint32_t len,
1111 NvmeTxDirection dir)
1112 {
1113 assert(sg->flags & NVME_SG_ALLOC);
1114
1115 if (sg->flags & NVME_SG_DMA) {
1116 uint64_t residual;
1117
1118 if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1119 residual = dma_buf_write(ptr, len, &sg->qsg);
1120 } else {
1121 residual = dma_buf_read(ptr, len, &sg->qsg);
1122 }
1123
1124 if (unlikely(residual)) {
1125 trace_pci_nvme_err_invalid_dma();
1126 return NVME_INVALID_FIELD | NVME_DNR;
1127 }
1128 } else {
1129 size_t bytes;
1130
1131 if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1132 bytes = qemu_iovec_to_buf(&sg->iov, 0, ptr, len);
1133 } else {
1134 bytes = qemu_iovec_from_buf(&sg->iov, 0, ptr, len);
1135 }
1136
1137 if (unlikely(bytes != len)) {
1138 trace_pci_nvme_err_invalid_dma();
1139 return NVME_INVALID_FIELD | NVME_DNR;
1140 }
1141 }
1142
1143 return NVME_SUCCESS;
1144 }
1145
1146 static inline uint16_t nvme_c2h(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
1147 NvmeRequest *req)
1148 {
1149 uint16_t status;
1150
1151 status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
1152 if (status) {
1153 return status;
1154 }
1155
1156 return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_FROM_DEVICE);
1157 }
1158
1159 static inline uint16_t nvme_h2c(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
1160 NvmeRequest *req)
1161 {
1162 uint16_t status;
1163
1164 status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
1165 if (status) {
1166 return status;
1167 }
1168
1169 return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_TO_DEVICE);
1170 }
1171
1172 uint16_t nvme_bounce_data(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
1173 NvmeTxDirection dir, NvmeRequest *req)
1174 {
1175 NvmeNamespace *ns = req->ns;
1176 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1177 uint16_t ctrl = le16_to_cpu(rw->control);
1178
1179 if (nvme_ns_ext(ns) &&
1180 !(ctrl & NVME_RW_PRINFO_PRACT && nvme_msize(ns) == 8)) {
1181 size_t lsize = nvme_lsize(ns);
1182 size_t msize = nvme_msize(ns);
1183
1184 return nvme_tx_interleaved(n, &req->sg, ptr, len, lsize, msize, 0,
1185 dir);
1186 }
1187
1188 return nvme_tx(n, &req->sg, ptr, len, dir);
1189 }
1190
1191 uint16_t nvme_bounce_mdata(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
1192 NvmeTxDirection dir, NvmeRequest *req)
1193 {
1194 NvmeNamespace *ns = req->ns;
1195 uint16_t status;
1196
1197 if (nvme_ns_ext(ns)) {
1198 size_t lsize = nvme_lsize(ns);
1199 size_t msize = nvme_msize(ns);
1200
1201 return nvme_tx_interleaved(n, &req->sg, ptr, len, msize, lsize, lsize,
1202 dir);
1203 }
1204
1205 nvme_sg_unmap(&req->sg);
1206
1207 status = nvme_map_mptr(n, &req->sg, len, &req->cmd);
1208 if (status) {
1209 return status;
1210 }
1211
1212 return nvme_tx(n, &req->sg, ptr, len, dir);
1213 }
1214
1215 static inline void nvme_blk_read(BlockBackend *blk, int64_t offset,
1216 BlockCompletionFunc *cb, NvmeRequest *req)
1217 {
1218 assert(req->sg.flags & NVME_SG_ALLOC);
1219
1220 if (req->sg.flags & NVME_SG_DMA) {
1221 req->aiocb = dma_blk_read(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE,
1222 cb, req);
1223 } else {
1224 req->aiocb = blk_aio_preadv(blk, offset, &req->sg.iov, 0, cb, req);
1225 }
1226 }
1227
1228 static inline void nvme_blk_write(BlockBackend *blk, int64_t offset,
1229 BlockCompletionFunc *cb, NvmeRequest *req)
1230 {
1231 assert(req->sg.flags & NVME_SG_ALLOC);
1232
1233 if (req->sg.flags & NVME_SG_DMA) {
1234 req->aiocb = dma_blk_write(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE,
1235 cb, req);
1236 } else {
1237 req->aiocb = blk_aio_pwritev(blk, offset, &req->sg.iov, 0, cb, req);
1238 }
1239 }
1240
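/*
 * Post completions: drain the completion queue's req_list, writing a CQE into
 * host memory for each finished request while the queue has room, and assert
 * the interrupt if the queue ends up non-empty. Invoked from the per-CQ timer
 * armed by nvme_enqueue_req_completion().
 */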
1241 static void nvme_post_cqes(void *opaque)
1242 {
1243 NvmeCQueue *cq = opaque;
1244 NvmeCtrl *n = cq->ctrl;
1245 NvmeRequest *req, *next;
1246 int ret;
1247
1248 QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
1249 NvmeSQueue *sq;
1250 hwaddr addr;
1251
1252 if (nvme_cq_full(cq)) {
1253 break;
1254 }
1255
1256 sq = req->sq;
1257 req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase);
1258 req->cqe.sq_id = cpu_to_le16(sq->sqid);
1259 req->cqe.sq_head = cpu_to_le16(sq->head);
1260 addr = cq->dma_addr + cq->tail * n->cqe_size;
1261 ret = pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe,
1262 sizeof(req->cqe));
1263 if (ret) {
1264 trace_pci_nvme_err_addr_write(addr);
1265 trace_pci_nvme_err_cfs();
1266 n->bar.csts = NVME_CSTS_FAILED;
1267 break;
1268 }
1269 QTAILQ_REMOVE(&cq->req_list, req, entry);
1270 nvme_inc_cq_tail(cq);
1271 nvme_sg_unmap(&req->sg);
1272 QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
1273 }
1274 if (cq->tail != cq->head) {
1275 nvme_irq_assert(n, cq);
1276 }
1277 }
1278
1279 static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
1280 {
1281 assert(cq->cqid == req->sq->cqid);
1282 trace_pci_nvme_enqueue_req_completion(nvme_cid(req), cq->cqid,
1283 req->status);
1284
1285 if (req->status) {
1286 trace_pci_nvme_err_req_status(nvme_cid(req), nvme_nsid(req->ns),
1287 req->status, req->cmd.opcode);
1288 }
1289
1290 QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
1291 QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
1292 timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
1293 }
1294
1295 static void nvme_process_aers(void *opaque)
1296 {
1297 NvmeCtrl *n = opaque;
1298 NvmeAsyncEvent *event, *next;
1299
1300 trace_pci_nvme_process_aers(n->aer_queued);
1301
1302 QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) {
1303 NvmeRequest *req;
1304 NvmeAerResult *result;
1305
1306 /* can't post cqe if there is nothing to complete */
1307 if (!n->outstanding_aers) {
1308 trace_pci_nvme_no_outstanding_aers();
1309 break;
1310 }
1311
1312 /* ignore if masked (cqe posted, but event not cleared) */
1313 if (n->aer_mask & (1 << event->result.event_type)) {
1314 trace_pci_nvme_aer_masked(event->result.event_type, n->aer_mask);
1315 continue;
1316 }
1317
1318 QTAILQ_REMOVE(&n->aer_queue, event, entry);
1319 n->aer_queued--;
1320
1321 n->aer_mask |= 1 << event->result.event_type;
1322 n->outstanding_aers--;
1323
1324 req = n->aer_reqs[n->outstanding_aers];
1325
1326 result = (NvmeAerResult *) &req->cqe.result;
1327 result->event_type = event->result.event_type;
1328 result->event_info = event->result.event_info;
1329 result->log_page = event->result.log_page;
1330 g_free(event);
1331
1332 trace_pci_nvme_aer_post_cqe(result->event_type, result->event_info,
1333 result->log_page);
1334
1335 nvme_enqueue_req_completion(&n->admin_cq, req);
1336 }
1337 }
1338
1339 static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type,
1340 uint8_t event_info, uint8_t log_page)
1341 {
1342 NvmeAsyncEvent *event;
1343
1344 trace_pci_nvme_enqueue_event(event_type, event_info, log_page);
1345
1346 if (n->aer_queued == n->params.aer_max_queued) {
1347 trace_pci_nvme_enqueue_event_noqueue(n->aer_queued);
1348 return;
1349 }
1350
1351 event = g_new(NvmeAsyncEvent, 1);
1352 event->result = (NvmeAerResult) {
1353 .event_type = event_type,
1354 .event_info = event_info,
1355 .log_page = log_page,
1356 };
1357
1358 QTAILQ_INSERT_TAIL(&n->aer_queue, event, entry);
1359 n->aer_queued++;
1360
1361 nvme_process_aers(n);
1362 }
1363
1364 static void nvme_smart_event(NvmeCtrl *n, uint8_t event)
1365 {
1366 uint8_t aer_info;
1367
1368 /* Ref SPEC <Asynchronous Event Information - SMART / Health Status> */
1369 if (!(NVME_AEC_SMART(n->features.async_config) & event)) {
1370 return;
1371 }
1372
1373 switch (event) {
1374 case NVME_SMART_SPARE:
1375 aer_info = NVME_AER_INFO_SMART_SPARE_THRESH;
1376 break;
1377 case NVME_SMART_TEMPERATURE:
1378 aer_info = NVME_AER_INFO_SMART_TEMP_THRESH;
1379 break;
1380 case NVME_SMART_RELIABILITY:
1381 case NVME_SMART_MEDIA_READ_ONLY:
1382 case NVME_SMART_FAILED_VOLATILE_MEDIA:
1383 case NVME_SMART_PMR_UNRELIABLE:
1384 aer_info = NVME_AER_INFO_SMART_RELIABILITY;
1385 break;
1386 default:
1387 return;
1388 }
1389
1390 nvme_enqueue_event(n, NVME_AER_TYPE_SMART, aer_info, NVME_LOG_SMART_INFO);
1391 }
1392
1393 static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type)
1394 {
1395 n->aer_mask &= ~(1 << event_type);
1396 if (!QTAILQ_EMPTY(&n->aer_queue)) {
1397 nvme_process_aers(n);
1398 }
1399 }
1400
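/*
 * Check a transfer length against MDTS. The limit is page_size << mdts; e.g.
 * with the default mdts of 7 and a 4 KiB minimum page size, transfers larger
 * than 4 KiB << 7 = 512 KiB are rejected. An mdts of 0 means no limit.
 */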
1401 static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len)
1402 {
1403 uint8_t mdts = n->params.mdts;
1404
1405 if (mdts && len > n->page_size << mdts) {
1406 trace_pci_nvme_err_mdts(len);
1407 return NVME_INVALID_FIELD | NVME_DNR;
1408 }
1409
1410 return NVME_SUCCESS;
1411 }
1412
1413 static inline uint16_t nvme_check_bounds(NvmeNamespace *ns, uint64_t slba,
1414 uint32_t nlb)
1415 {
1416 uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);
1417
1418 if (unlikely(UINT64_MAX - slba < nlb || slba + nlb > nsze)) {
1419 return NVME_LBA_RANGE | NVME_DNR;
1420 }
1421
1422 return NVME_SUCCESS;
1423 }
1424
1425 static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
1426 uint32_t nlb)
1427 {
1428 BlockDriverState *bs = blk_bs(ns->blkconf.blk);
1429
1430 int64_t pnum = 0, bytes = nvme_l2b(ns, nlb);
1431 int64_t offset = nvme_l2b(ns, slba);
1432 bool zeroed;
1433 int ret;
1434
1435 Error *local_err = NULL;
1436
1437 /*
1438 * `pnum` holds the number of bytes after offset that share the same
1439 * allocation status as the byte at offset. If `pnum` is different from
1440 * `bytes`, we should check the allocation status of the next range and
1441 * continue this until all bytes have been checked.
1442 */
1443 do {
1444 bytes -= pnum;
1445
1446 ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL);
1447 if (ret < 0) {
1448 error_setg_errno(&local_err, -ret, "unable to get block status");
1449 error_report_err(local_err);
1450
1451 return NVME_INTERNAL_DEV_ERROR;
1452 }
1453
1454 zeroed = !!(ret & BDRV_BLOCK_ZERO);
1455
1456 trace_pci_nvme_block_status(offset, bytes, pnum, ret, zeroed);
1457
1458 if (zeroed) {
1459 return NVME_DULB;
1460 }
1461
1462 offset += pnum;
1463 } while (pnum != bytes);
1464
1465 return NVME_SUCCESS;
1466 }
1467
1468 static void nvme_aio_err(NvmeRequest *req, int ret)
1469 {
1470 uint16_t status = NVME_SUCCESS;
1471 Error *local_err = NULL;
1472
1473 switch (req->cmd.opcode) {
1474 case NVME_CMD_READ:
1475 status = NVME_UNRECOVERED_READ;
1476 break;
1477 case NVME_CMD_FLUSH:
1478 case NVME_CMD_WRITE:
1479 case NVME_CMD_WRITE_ZEROES:
1480 case NVME_CMD_ZONE_APPEND:
1481 status = NVME_WRITE_FAULT;
1482 break;
1483 default:
1484 status = NVME_INTERNAL_DEV_ERROR;
1485 break;
1486 }
1487
1488 trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), status);
1489
1490 error_setg_errno(&local_err, -ret, "aio failed");
1491 error_report_err(local_err);
1492
1493 /*
1494 * Set the command status code to the first encountered error but allow a
1495 * subsequent Internal Device Error to trump it.
1496 */
1497 if (req->status && status != NVME_INTERNAL_DEV_ERROR) {
1498 return;
1499 }
1500
1501 req->status = status;
1502 }
1503
1504 static inline uint32_t nvme_zone_idx(NvmeNamespace *ns, uint64_t slba)
1505 {
1506 return ns->zone_size_log2 > 0 ? slba >> ns->zone_size_log2 :
1507 slba / ns->zone_size;
1508 }
1509
1510 static inline NvmeZone *nvme_get_zone_by_slba(NvmeNamespace *ns, uint64_t slba)
1511 {
1512 uint32_t zone_idx = nvme_zone_idx(ns, slba);
1513
1514 assert(zone_idx < ns->num_zones);
1515 return &ns->zone_array[zone_idx];
1516 }
1517
1518 static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone)
1519 {
1520 uint64_t zslba = zone->d.zslba;
1521
1522 switch (nvme_get_zone_state(zone)) {
1523 case NVME_ZONE_STATE_EMPTY:
1524 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1525 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1526 case NVME_ZONE_STATE_CLOSED:
1527 return NVME_SUCCESS;
1528 case NVME_ZONE_STATE_FULL:
1529 trace_pci_nvme_err_zone_is_full(zslba);
1530 return NVME_ZONE_FULL;
1531 case NVME_ZONE_STATE_OFFLINE:
1532 trace_pci_nvme_err_zone_is_offline(zslba);
1533 return NVME_ZONE_OFFLINE;
1534 case NVME_ZONE_STATE_READ_ONLY:
1535 trace_pci_nvme_err_zone_is_read_only(zslba);
1536 return NVME_ZONE_READ_ONLY;
1537 default:
1538 assert(false);
1539 }
1540
1541 return NVME_INTERNAL_DEV_ERROR;
1542 }
1543
1544 static uint16_t nvme_check_zone_write(NvmeNamespace *ns, NvmeZone *zone,
1545 uint64_t slba, uint32_t nlb)
1546 {
1547 uint64_t zcap = nvme_zone_wr_boundary(zone);
1548 uint16_t status;
1549
1550 status = nvme_check_zone_state_for_write(zone);
1551 if (status) {
1552 return status;
1553 }
1554
1555 if (unlikely(slba != zone->w_ptr)) {
1556 trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba, zone->w_ptr);
1557 return NVME_ZONE_INVALID_WRITE;
1558 }
1559
1560 if (unlikely((slba + nlb) > zcap)) {
1561 trace_pci_nvme_err_zone_boundary(slba, nlb, zcap);
1562 return NVME_ZONE_BOUNDARY_ERROR;
1563 }
1564
1565 return NVME_SUCCESS;
1566 }
1567
1568 static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone)
1569 {
1570 switch (nvme_get_zone_state(zone)) {
1571 case NVME_ZONE_STATE_EMPTY:
1572 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1573 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1574 case NVME_ZONE_STATE_FULL:
1575 case NVME_ZONE_STATE_CLOSED:
1576 case NVME_ZONE_STATE_READ_ONLY:
1577 return NVME_SUCCESS;
1578 case NVME_ZONE_STATE_OFFLINE:
1579 trace_pci_nvme_err_zone_is_offline(zone->d.zslba);
1580 return NVME_ZONE_OFFLINE;
1581 default:
1582 assert(false);
1583 }
1584
1585 return NVME_INTERNAL_DEV_ERROR;
1586 }
1587
1588 static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba,
1589 uint32_t nlb)
1590 {
1591 NvmeZone *zone = nvme_get_zone_by_slba(ns, slba);
1592 uint64_t bndry = nvme_zone_rd_boundary(ns, zone);
1593 uint64_t end = slba + nlb;
1594 uint16_t status;
1595
1596 status = nvme_check_zone_state_for_read(zone);
1597 if (status) {
1598 ;
1599 } else if (unlikely(end > bndry)) {
1600 if (!ns->params.cross_zone_read) {
1601 status = NVME_ZONE_BOUNDARY_ERROR;
1602 } else {
1603 /*
1604 * Read across zone boundary - check that all subsequent
1605 * zones that are being read have an appropriate state.
1606 */
1607 do {
1608 zone++;
1609 status = nvme_check_zone_state_for_read(zone);
1610 if (status) {
1611 break;
1612 }
1613 } while (end > nvme_zone_rd_boundary(ns, zone));
1614 }
1615 }
1616
1617 return status;
1618 }
1619
1620 static uint16_t nvme_zrm_finish(NvmeNamespace *ns, NvmeZone *zone)
1621 {
1622 switch (nvme_get_zone_state(zone)) {
1623 case NVME_ZONE_STATE_FULL:
1624 return NVME_SUCCESS;
1625
1626 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1627 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1628 nvme_aor_dec_open(ns);
1629 /* fallthrough */
1630 case NVME_ZONE_STATE_CLOSED:
1631 nvme_aor_dec_active(ns);
1632 /* fallthrough */
1633 case NVME_ZONE_STATE_EMPTY:
1634 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL);
1635 return NVME_SUCCESS;
1636
1637 default:
1638 return NVME_ZONE_INVAL_TRANSITION;
1639 }
1640 }
1641
1642 static uint16_t nvme_zrm_close(NvmeNamespace *ns, NvmeZone *zone)
1643 {
1644 switch (nvme_get_zone_state(zone)) {
1645 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1646 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1647 nvme_aor_dec_open(ns);
1648 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
1649 /* fall through */
1650 case NVME_ZONE_STATE_CLOSED:
1651 return NVME_SUCCESS;
1652
1653 default:
1654 return NVME_ZONE_INVAL_TRANSITION;
1655 }
1656 }
1657
1658 static void nvme_zrm_auto_transition_zone(NvmeNamespace *ns)
1659 {
1660 NvmeZone *zone;
1661
1662 if (ns->params.max_open_zones &&
1663 ns->nr_open_zones == ns->params.max_open_zones) {
1664 zone = QTAILQ_FIRST(&ns->imp_open_zones);
1665 if (zone) {
1666 /*
1667 * Automatically close this implicitly open zone.
1668 */
1669 QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
1670 nvme_zrm_close(ns, zone);
1671 }
1672 }
1673 }
1674
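/*
 * Zone resource management (ZRM): transition a zone into the implicitly or
 * explicitly open state. Opening an empty zone consumes an active and an open
 * resource, while opening a closed zone consumes an open resource only, so
 * the AOR limits are checked first; if the open limit has been reached, an
 * implicitly open zone may be auto-closed to make room.
 */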
1675 static uint16_t __nvme_zrm_open(NvmeNamespace *ns, NvmeZone *zone,
1676 bool implicit)
1677 {
1678 int act = 0;
1679 uint16_t status;
1680
1681 switch (nvme_get_zone_state(zone)) {
1682 case NVME_ZONE_STATE_EMPTY:
1683 act = 1;
1684
1685 /* fallthrough */
1686
1687 case NVME_ZONE_STATE_CLOSED:
1688 nvme_zrm_auto_transition_zone(ns);
1689 status = nvme_aor_check(ns, act, 1);
1690 if (status) {
1691 return status;
1692 }
1693
1694 if (act) {
1695 nvme_aor_inc_active(ns);
1696 }
1697
1698 nvme_aor_inc_open(ns);
1699
1700 if (implicit) {
1701 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN);
1702 return NVME_SUCCESS;
1703 }
1704
1705 /* fallthrough */
1706
1707 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1708 if (implicit) {
1709 return NVME_SUCCESS;
1710 }
1711
1712 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EXPLICITLY_OPEN);
1713
1714 /* fallthrough */
1715
1716 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1717 return NVME_SUCCESS;
1718
1719 default:
1720 return NVME_ZONE_INVAL_TRANSITION;
1721 }
1722 }
1723
1724 static inline uint16_t nvme_zrm_auto(NvmeNamespace *ns, NvmeZone *zone)
1725 {
1726 return __nvme_zrm_open(ns, zone, true);
1727 }
1728
1729 static inline uint16_t nvme_zrm_open(NvmeNamespace *ns, NvmeZone *zone)
1730 {
1731 return __nvme_zrm_open(ns, zone, false);
1732 }
1733
1734 static void __nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
1735 uint32_t nlb)
1736 {
1737 zone->d.wp += nlb;
1738
1739 if (zone->d.wp == nvme_zone_wr_boundary(zone)) {
1740 nvme_zrm_finish(ns, zone);
1741 }
1742 }
1743
1744 static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req)
1745 {
1746 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1747 NvmeZone *zone;
1748 uint64_t slba;
1749 uint32_t nlb;
1750
1751 slba = le64_to_cpu(rw->slba);
1752 nlb = le16_to_cpu(rw->nlb) + 1;
1753 zone = nvme_get_zone_by_slba(ns, slba);
1754
1755 __nvme_advance_zone_wp(ns, zone, nlb);
1756 }
1757
1758 static inline bool nvme_is_write(NvmeRequest *req)
1759 {
1760 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1761
1762 return rw->opcode == NVME_CMD_WRITE ||
1763 rw->opcode == NVME_CMD_ZONE_APPEND ||
1764 rw->opcode == NVME_CMD_WRITE_ZEROES;
1765 }
1766
1767 static void nvme_misc_cb(void *opaque, int ret)
1768 {
1769 NvmeRequest *req = opaque;
1770 NvmeNamespace *ns = req->ns;
1771
1772 BlockBackend *blk = ns->blkconf.blk;
1773 BlockAcctCookie *acct = &req->acct;
1774 BlockAcctStats *stats = blk_get_stats(blk);
1775
1776 trace_pci_nvme_misc_cb(nvme_cid(req), blk_name(blk));
1777
1778 if (ret) {
1779 block_acct_failed(stats, acct);
1780 nvme_aio_err(req, ret);
1781 } else {
1782 block_acct_done(stats, acct);
1783 }
1784
1785 nvme_enqueue_req_completion(nvme_cq(req), req);
1786 }
1787
1788 void nvme_rw_complete_cb(void *opaque, int ret)
1789 {
1790 NvmeRequest *req = opaque;
1791 NvmeNamespace *ns = req->ns;
1792 BlockBackend *blk = ns->blkconf.blk;
1793 BlockAcctCookie *acct = &req->acct;
1794 BlockAcctStats *stats = blk_get_stats(blk);
1795
1796 trace_pci_nvme_rw_complete_cb(nvme_cid(req), blk_name(blk));
1797
1798 if (ret) {
1799 block_acct_failed(stats, acct);
1800 nvme_aio_err(req, ret);
1801 } else {
1802 block_acct_done(stats, acct);
1803 }
1804
1805 if (ns->params.zoned && nvme_is_write(req)) {
1806 nvme_finalize_zoned_write(ns, req);
1807 }
1808
1809 nvme_enqueue_req_completion(nvme_cq(req), req);
1810 }
1811
1812 static void nvme_rw_cb(void *opaque, int ret)
1813 {
1814 NvmeRequest *req = opaque;
1815 NvmeNamespace *ns = req->ns;
1816
1817 BlockBackend *blk = ns->blkconf.blk;
1818
1819 trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));
1820
1821 if (ret) {
1822 goto out;
1823 }
1824
1825 if (nvme_msize(ns)) {
1826 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1827 uint64_t slba = le64_to_cpu(rw->slba);
1828 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
1829 uint64_t offset = ns->mdata_offset + nvme_m2b(ns, slba);
1830
1831 if (req->cmd.opcode == NVME_CMD_WRITE_ZEROES) {
1832 size_t mlen = nvme_m2b(ns, nlb);
1833
1834 req->aiocb = blk_aio_pwrite_zeroes(blk, offset, mlen,
1835 BDRV_REQ_MAY_UNMAP,
1836 nvme_rw_complete_cb, req);
1837 return;
1838 }
1839
1840 if (nvme_ns_ext(ns) || req->cmd.mptr) {
1841 uint16_t status;
1842
1843 nvme_sg_unmap(&req->sg);
1844 status = nvme_map_mdata(nvme_ctrl(req), nlb, req);
1845 if (status) {
1846 ret = -EFAULT;
1847 goto out;
1848 }
1849
1850 if (req->cmd.opcode == NVME_CMD_READ) {
1851 return nvme_blk_read(blk, offset, nvme_rw_complete_cb, req);
1852 }
1853
1854 return nvme_blk_write(blk, offset, nvme_rw_complete_cb, req);
1855 }
1856 }
1857
1858 out:
1859 nvme_rw_complete_cb(req, ret);
1860 }
1861
1862 struct nvme_aio_format_ctx {
1863 NvmeRequest *req;
1864 NvmeNamespace *ns;
1865
1866 /* number of outstanding write zeroes for this namespace */
1867 int *count;
1868 };
1869
1870 static void nvme_aio_format_cb(void *opaque, int ret)
1871 {
1872 struct nvme_aio_format_ctx *ctx = opaque;
1873 NvmeRequest *req = ctx->req;
1874 NvmeNamespace *ns = ctx->ns;
1875 uintptr_t *num_formats = (uintptr_t *)&req->opaque;
1876 int *count = ctx->count;
1877
1878 g_free(ctx);
1879
1880 if (ret) {
1881 nvme_aio_err(req, ret);
1882 }
1883
1884 if (--(*count)) {
1885 return;
1886 }
1887
1888 g_free(count);
1889 ns->status = 0x0;
1890
1891 if (--(*num_formats)) {
1892 return;
1893 }
1894
1895 nvme_enqueue_req_completion(nvme_cq(req), req);
1896 }
1897
1898 struct nvme_aio_flush_ctx {
1899 NvmeRequest *req;
1900 NvmeNamespace *ns;
1901 BlockAcctCookie acct;
1902 };
1903
1904 static void nvme_aio_flush_cb(void *opaque, int ret)
1905 {
1906 struct nvme_aio_flush_ctx *ctx = opaque;
1907 NvmeRequest *req = ctx->req;
1908 uintptr_t *num_flushes = (uintptr_t *)&req->opaque;
1909
1910 BlockBackend *blk = ctx->ns->blkconf.blk;
1911 BlockAcctCookie *acct = &ctx->acct;
1912 BlockAcctStats *stats = blk_get_stats(blk);
1913
1914 trace_pci_nvme_aio_flush_cb(nvme_cid(req), blk_name(blk));
1915
1916 if (!ret) {
1917 block_acct_done(stats, acct);
1918 } else {
1919 block_acct_failed(stats, acct);
1920 nvme_aio_err(req, ret);
1921 }
1922
1923 (*num_flushes)--;
1924 g_free(ctx);
1925
1926 if (*num_flushes) {
1927 return;
1928 }
1929
1930 nvme_enqueue_req_completion(nvme_cq(req), req);
1931 }
1932
1933 static void nvme_verify_cb(void *opaque, int ret)
1934 {
1935 NvmeBounceContext *ctx = opaque;
1936 NvmeRequest *req = ctx->req;
1937 NvmeNamespace *ns = req->ns;
1938 BlockBackend *blk = ns->blkconf.blk;
1939 BlockAcctCookie *acct = &req->acct;
1940 BlockAcctStats *stats = blk_get_stats(blk);
1941 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1942 uint64_t slba = le64_to_cpu(rw->slba);
1943 uint16_t ctrl = le16_to_cpu(rw->control);
1944 uint16_t apptag = le16_to_cpu(rw->apptag);
1945 uint16_t appmask = le16_to_cpu(rw->appmask);
1946 uint32_t reftag = le32_to_cpu(rw->reftag);
1947 uint16_t status;
1948
1949 trace_pci_nvme_verify_cb(nvme_cid(req), NVME_RW_PRINFO(ctrl), apptag,
1950 appmask, reftag);
1951
1952 if (ret) {
1953 block_acct_failed(stats, acct);
1954 nvme_aio_err(req, ret);
1955 goto out;
1956 }
1957
1958 block_acct_done(stats, acct);
1959
1960 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
1961 status = nvme_dif_mangle_mdata(ns, ctx->mdata.bounce,
1962 ctx->mdata.iov.size, slba);
1963 if (status) {
1964 req->status = status;
1965 goto out;
1966 }
1967
1968 req->status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
1969 ctx->mdata.bounce, ctx->mdata.iov.size,
1970 ctrl, slba, apptag, appmask, reftag);
1971 }
1972
1973 out:
1974 qemu_iovec_destroy(&ctx->data.iov);
1975 g_free(ctx->data.bounce);
1976
1977 qemu_iovec_destroy(&ctx->mdata.iov);
1978 g_free(ctx->mdata.bounce);
1979
1980 g_free(ctx);
1981
1982 nvme_enqueue_req_completion(nvme_cq(req), req);
1983 }
1984
1985
1986 static void nvme_verify_mdata_in_cb(void *opaque, int ret)
1987 {
1988 NvmeBounceContext *ctx = opaque;
1989 NvmeRequest *req = ctx->req;
1990 NvmeNamespace *ns = req->ns;
1991 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1992 uint64_t slba = le64_to_cpu(rw->slba);
1993 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
1994 size_t mlen = nvme_m2b(ns, nlb);
1995 uint64_t offset = ns->mdata_offset + nvme_m2b(ns, slba);
1996 BlockBackend *blk = ns->blkconf.blk;
1997
1998 trace_pci_nvme_verify_mdata_in_cb(nvme_cid(req), blk_name(blk));
1999
2000 if (ret) {
2001 goto out;
2002 }
2003
2004 ctx->mdata.bounce = g_malloc(mlen);
2005
2006 qemu_iovec_reset(&ctx->mdata.iov);
2007 qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);
2008
2009 req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
2010 nvme_verify_cb, ctx);
2011 return;
2012
2013 out:
2014 nvme_verify_cb(ctx, ret);
2015 }
2016
2017 static void nvme_aio_discard_cb(void *opaque, int ret)
2018 {
2019 NvmeRequest *req = opaque;
2020 uintptr_t *discards = (uintptr_t *)&req->opaque;
2021
2022 trace_pci_nvme_aio_discard_cb(nvme_cid(req));
2023
2024 if (ret) {
2025 nvme_aio_err(req, ret);
2026 }
2027
2028 (*discards)--;
2029
2030 if (*discards) {
2031 return;
2032 }
2033
2034 nvme_enqueue_req_completion(nvme_cq(req), req);
2035 }
2036
2037 struct nvme_zone_reset_ctx {
2038 NvmeRequest *req;
2039 NvmeZone *zone;
2040 };
2041
2042 static void nvme_aio_zone_reset_complete_cb(void *opaque, int ret)
2043 {
2044 struct nvme_zone_reset_ctx *ctx = opaque;
2045 NvmeRequest *req = ctx->req;
2046 NvmeNamespace *ns = req->ns;
2047 NvmeZone *zone = ctx->zone;
2048 uintptr_t *resets = (uintptr_t *)&req->opaque;
2049
2050 if (ret) {
2051 nvme_aio_err(req, ret);
2052 goto out;
2053 }
2054
2055 switch (nvme_get_zone_state(zone)) {
2056 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
2057 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
2058 nvme_aor_dec_open(ns);
2059 /* fall through */
2060 case NVME_ZONE_STATE_CLOSED:
2061 nvme_aor_dec_active(ns);
2062 /* fall through */
2063 case NVME_ZONE_STATE_FULL:
2064 zone->w_ptr = zone->d.zslba;
2065 zone->d.wp = zone->w_ptr;
2066 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
2067 /* fall through */
2068 default:
2069 break;
2070 }
2071
2072 out:
2073 g_free(ctx);
2074
2075 (*resets)--;
2076
2077 if (*resets) {
2078 return;
2079 }
2080
2081 nvme_enqueue_req_completion(nvme_cq(req), req);
2082 }
2083
2084 static void nvme_aio_zone_reset_cb(void *opaque, int ret)
2085 {
2086 struct nvme_zone_reset_ctx *ctx = opaque;
2087 NvmeRequest *req = ctx->req;
2088 NvmeNamespace *ns = req->ns;
2089 NvmeZone *zone = ctx->zone;
2090
2091 trace_pci_nvme_aio_zone_reset_cb(nvme_cid(req), zone->d.zslba);
2092
2093 if (ret) {
2094 goto out;
2095 }
2096
2097 if (nvme_msize(ns)) {
2098 int64_t offset = ns->mdata_offset + nvme_m2b(ns, zone->d.zslba);
2099
2100 blk_aio_pwrite_zeroes(ns->blkconf.blk, offset,
2101 nvme_m2b(ns, ns->zone_size), BDRV_REQ_MAY_UNMAP,
2102 nvme_aio_zone_reset_complete_cb, ctx);
2103 return;
2104 }
2105
2106 out:
2107 nvme_aio_zone_reset_complete_cb(opaque, ret);
2108 }
2109
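/*
 * Bounce buffer state for the Copy command. All source ranges are first read
 * into `bounce' (and `mbounce' for metadata) via nvme_aio_copy_in_cb(); the
 * `copies' counter is 1-initialized like the discard counter in nvme_dsm().
 * Once the last read finishes, nvme_copy_in_complete() validates the
 * destination and issues the data write, nvme_copy_cb() issues the metadata
 * write (if any) and nvme_copy_complete_cb() advances the zone write pointer
 * for zoned namespaces and completes the request.
 */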
2110 struct nvme_copy_ctx {
2111 int copies;
2112 uint8_t *bounce;
2113 uint8_t *mbounce;
2114 uint32_t nlb;
2115 NvmeCopySourceRange *ranges;
2116 };
2117
2118 struct nvme_copy_in_ctx {
2119 NvmeRequest *req;
2120 QEMUIOVector iov;
2121 NvmeCopySourceRange *range;
2122 };
2123
2124 static void nvme_copy_complete_cb(void *opaque, int ret)
2125 {
2126 NvmeRequest *req = opaque;
2127 NvmeNamespace *ns = req->ns;
2128 struct nvme_copy_ctx *ctx = req->opaque;
2129
2130 if (ret) {
2131 block_acct_failed(blk_get_stats(ns->blkconf.blk), &req->acct);
2132 nvme_aio_err(req, ret);
2133 goto out;
2134 }
2135
2136 block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct);
2137
2138 out:
2139 if (ns->params.zoned) {
2140 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
2141 uint64_t sdlba = le64_to_cpu(copy->sdlba);
2142 NvmeZone *zone = nvme_get_zone_by_slba(ns, sdlba);
2143
2144 __nvme_advance_zone_wp(ns, zone, ctx->nlb);
2145 }
2146
2147 g_free(ctx->bounce);
2148 g_free(ctx->mbounce);
2149 g_free(ctx);
2150
2151 nvme_enqueue_req_completion(nvme_cq(req), req);
2152 }
2153
2154 static void nvme_copy_cb(void *opaque, int ret)
2155 {
2156 NvmeRequest *req = opaque;
2157 NvmeNamespace *ns = req->ns;
2158 struct nvme_copy_ctx *ctx = req->opaque;
2159
2160 trace_pci_nvme_copy_cb(nvme_cid(req));
2161
2162 if (ret) {
2163 goto out;
2164 }
2165
2166 if (nvme_msize(ns)) {
2167 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
2168 uint64_t sdlba = le64_to_cpu(copy->sdlba);
2169 int64_t offset = ns->mdata_offset + nvme_m2b(ns, sdlba);
2170
2171 qemu_iovec_reset(&req->sg.iov);
2172 qemu_iovec_add(&req->sg.iov, ctx->mbounce, nvme_m2b(ns, ctx->nlb));
2173
2174 req->aiocb = blk_aio_pwritev(ns->blkconf.blk, offset, &req->sg.iov, 0,
2175 nvme_copy_complete_cb, req);
2176 return;
2177 }
2178
2179 out:
2180 nvme_copy_complete_cb(opaque, ret);
2181 }
2182
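/*
 * All source ranges have been read into the bounce buffers. Check the
 * per-range protection information (if enabled), generate or check the
 * protection information for the destination, validate the destination LBA
 * range (and zone, for zoned namespaces) and kick off the data write.
 */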
2183 static void nvme_copy_in_complete(NvmeRequest *req)
2184 {
2185 NvmeNamespace *ns = req->ns;
2186 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
2187 struct nvme_copy_ctx *ctx = req->opaque;
2188 uint64_t sdlba = le64_to_cpu(copy->sdlba);
2189 uint16_t status;
2190
2191 trace_pci_nvme_copy_in_complete(nvme_cid(req));
2192
2193 block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct);
2194
2195 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2196 uint16_t prinfor = (copy->control[0] >> 4) & 0xf;
2197 uint16_t prinfow = (copy->control[2] >> 2) & 0xf;
2198 uint16_t nr = copy->nr + 1;
2199 NvmeCopySourceRange *range;
2200 uint64_t slba;
2201 uint32_t nlb;
2202 uint16_t apptag, appmask;
2203 uint32_t reftag;
2204 uint8_t *buf = ctx->bounce, *mbuf = ctx->mbounce;
2205 size_t len, mlen;
2206 int i;
2207
2208 /*
2209          * The dif helpers expect prinfo to be laid out like the control field of
2210 * the NvmeRwCmd, so shift by 10 to fake it.
2211 */
2212 prinfor = prinfor << 10;
2213 prinfow = prinfow << 10;
2214
2215 for (i = 0; i < nr; i++) {
2216 range = &ctx->ranges[i];
2217 slba = le64_to_cpu(range->slba);
2218 nlb = le16_to_cpu(range->nlb) + 1;
2219 len = nvme_l2b(ns, nlb);
2220 mlen = nvme_m2b(ns, nlb);
2221 apptag = le16_to_cpu(range->apptag);
2222 appmask = le16_to_cpu(range->appmask);
2223 reftag = le32_to_cpu(range->reftag);
2224
2225 status = nvme_dif_check(ns, buf, len, mbuf, mlen, prinfor, slba,
2226 apptag, appmask, reftag);
2227 if (status) {
2228 goto invalid;
2229 }
2230
2231 buf += len;
2232 mbuf += mlen;
2233 }
2234
2235 apptag = le16_to_cpu(copy->apptag);
2236 appmask = le16_to_cpu(copy->appmask);
2237 reftag = le32_to_cpu(copy->reftag);
2238
2239         len = nvme_l2b(ns, ctx->nlb);
2240         mlen = nvme_m2b(ns, ctx->nlb);
2241
2242         if (prinfow & NVME_RW_PRINFO_PRACT) {
2243 status = nvme_check_prinfo(ns, prinfow, sdlba, reftag);
2244 if (status) {
2245 goto invalid;
2246 }
2247
2248 nvme_dif_pract_generate_dif(ns, ctx->bounce, len, ctx->mbounce,
2249 mlen, apptag, reftag);
2250 } else {
2251 status = nvme_dif_check(ns, ctx->bounce, len, ctx->mbounce, mlen,
2252 prinfow, sdlba, apptag, appmask, reftag);
2253 if (status) {
2254 goto invalid;
2255 }
2256 }
2257 }
2258
2259 status = nvme_check_bounds(ns, sdlba, ctx->nlb);
2260 if (status) {
2261 trace_pci_nvme_err_invalid_lba_range(sdlba, ctx->nlb, ns->id_ns.nsze);
2262 goto invalid;
2263 }
2264
2265 if (ns->params.zoned) {
2266 NvmeZone *zone = nvme_get_zone_by_slba(ns, sdlba);
2267
2268 status = nvme_check_zone_write(ns, zone, sdlba, ctx->nlb);
2269 if (status) {
2270 goto invalid;
2271 }
2272
2273 status = nvme_zrm_auto(ns, zone);
2274 if (status) {
2275 goto invalid;
2276 }
2277
2278 zone->w_ptr += ctx->nlb;
2279 }
2280
2281 qemu_iovec_init(&req->sg.iov, 1);
2282 qemu_iovec_add(&req->sg.iov, ctx->bounce, nvme_l2b(ns, ctx->nlb));
2283
2284 block_acct_start(blk_get_stats(ns->blkconf.blk), &req->acct, 0,
2285 BLOCK_ACCT_WRITE);
2286
2287 req->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_l2b(ns, sdlba),
2288 &req->sg.iov, 0, nvme_copy_cb, req);
2289
2290 return;
2291
2292 invalid:
2293 req->status = status;
2294
2295 g_free(ctx->bounce);
2296 g_free(ctx);
2297
2298 nvme_enqueue_req_completion(nvme_cq(req), req);
2299 }
2300
2301 static void nvme_aio_copy_in_cb(void *opaque, int ret)
2302 {
2303 struct nvme_copy_in_ctx *in_ctx = opaque;
2304 NvmeRequest *req = in_ctx->req;
2305 NvmeNamespace *ns = req->ns;
2306 struct nvme_copy_ctx *ctx = req->opaque;
2307
2308 qemu_iovec_destroy(&in_ctx->iov);
2309 g_free(in_ctx);
2310
2311 trace_pci_nvme_aio_copy_in_cb(nvme_cid(req));
2312
2313 if (ret) {
2314 nvme_aio_err(req, ret);
2315 }
2316
2317 ctx->copies--;
2318
2319 if (ctx->copies) {
2320 return;
2321 }
2322
2323 if (req->status) {
2324 block_acct_failed(blk_get_stats(ns->blkconf.blk), &req->acct);
2325
2326 g_free(ctx->bounce);
2327 g_free(ctx->mbounce);
2328 g_free(ctx);
2329
2330 nvme_enqueue_req_completion(nvme_cq(req), req);
2331
2332 return;
2333 }
2334
2335 nvme_copy_in_complete(req);
2336 }
2337
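/*
 * Bounce buffers for the Compare command. nvme_compare() reads the data
 * portion from the backing device; nvme_compare_data_cb() compares it
 * against the host data and, if the namespace has metadata, reads and
 * compares the metadata in nvme_compare_mdata_cb(), with special handling
 * of the DIF tuple when protection information is enabled.
 */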
2338 struct nvme_compare_ctx {
2339 struct {
2340 QEMUIOVector iov;
2341 uint8_t *bounce;
2342 } data;
2343
2344 struct {
2345 QEMUIOVector iov;
2346 uint8_t *bounce;
2347 } mdata;
2348 };
2349
2350 static void nvme_compare_mdata_cb(void *opaque, int ret)
2351 {
2352 NvmeRequest *req = opaque;
2353 NvmeNamespace *ns = req->ns;
2354 NvmeCtrl *n = nvme_ctrl(req);
2355 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2356 uint16_t ctrl = le16_to_cpu(rw->control);
2357 uint16_t apptag = le16_to_cpu(rw->apptag);
2358 uint16_t appmask = le16_to_cpu(rw->appmask);
2359 uint32_t reftag = le32_to_cpu(rw->reftag);
2360 struct nvme_compare_ctx *ctx = req->opaque;
2361 g_autofree uint8_t *buf = NULL;
2362 uint16_t status = NVME_SUCCESS;
2363
2364 trace_pci_nvme_compare_mdata_cb(nvme_cid(req));
2365
2366 buf = g_malloc(ctx->mdata.iov.size);
2367
2368 status = nvme_bounce_mdata(n, buf, ctx->mdata.iov.size,
2369 NVME_TX_DIRECTION_TO_DEVICE, req);
2370 if (status) {
2371 req->status = status;
2372 goto out;
2373 }
2374
2375 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2376 uint64_t slba = le64_to_cpu(rw->slba);
2377 uint8_t *bufp;
2378 uint8_t *mbufp = ctx->mdata.bounce;
2379 uint8_t *end = mbufp + ctx->mdata.iov.size;
2380 size_t msize = nvme_msize(ns);
2381 int16_t pil = 0;
2382
2383 status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
2384 ctx->mdata.bounce, ctx->mdata.iov.size, ctrl,
2385 slba, apptag, appmask, reftag);
2386 if (status) {
2387 req->status = status;
2388 goto out;
2389 }
2390
2391 /*
2392 * When formatted with protection information, do not compare the DIF
2393 * tuple.
2394 */
2395 if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) {
2396 pil = nvme_msize(ns) - sizeof(NvmeDifTuple);
2397 }
2398
2399 for (bufp = buf; mbufp < end; bufp += msize, mbufp += msize) {
2400 if (memcmp(bufp + pil, mbufp + pil, msize - pil)) {
2401 req->status = NVME_CMP_FAILURE;
2402 goto out;
2403 }
2404 }
2405
2406 goto out;
2407 }
2408
2409 if (memcmp(buf, ctx->mdata.bounce, ctx->mdata.iov.size)) {
2410 req->status = NVME_CMP_FAILURE;
2411 goto out;
2412 }
2413
2414 out:
2415 qemu_iovec_destroy(&ctx->data.iov);
2416 g_free(ctx->data.bounce);
2417
2418 qemu_iovec_destroy(&ctx->mdata.iov);
2419 g_free(ctx->mdata.bounce);
2420
2421 g_free(ctx);
2422
2423 nvme_enqueue_req_completion(nvme_cq(req), req);
2424 }
2425
2426 static void nvme_compare_data_cb(void *opaque, int ret)
2427 {
2428 NvmeRequest *req = opaque;
2429 NvmeCtrl *n = nvme_ctrl(req);
2430 NvmeNamespace *ns = req->ns;
2431 BlockBackend *blk = ns->blkconf.blk;
2432 BlockAcctCookie *acct = &req->acct;
2433 BlockAcctStats *stats = blk_get_stats(blk);
2434
2435 struct nvme_compare_ctx *ctx = req->opaque;
2436 g_autofree uint8_t *buf = NULL;
2437 uint16_t status;
2438
2439 trace_pci_nvme_compare_data_cb(nvme_cid(req));
2440
2441 if (ret) {
2442 block_acct_failed(stats, acct);
2443 nvme_aio_err(req, ret);
2444 goto out;
2445 }
2446
2447 buf = g_malloc(ctx->data.iov.size);
2448
2449 status = nvme_bounce_data(n, buf, ctx->data.iov.size,
2450 NVME_TX_DIRECTION_TO_DEVICE, req);
2451 if (status) {
2452 req->status = status;
2453 goto out;
2454 }
2455
2456 if (memcmp(buf, ctx->data.bounce, ctx->data.iov.size)) {
2457 req->status = NVME_CMP_FAILURE;
2458 goto out;
2459 }
2460
2461 if (nvme_msize(ns)) {
2462 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2463 uint64_t slba = le64_to_cpu(rw->slba);
2464 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2465 size_t mlen = nvme_m2b(ns, nlb);
2466 uint64_t offset = ns->mdata_offset + nvme_m2b(ns, slba);
2467
2468 ctx->mdata.bounce = g_malloc(mlen);
2469
2470 qemu_iovec_init(&ctx->mdata.iov, 1);
2471 qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);
2472
2473 req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
2474 nvme_compare_mdata_cb, req);
2475 return;
2476 }
2477
2478 block_acct_done(stats, acct);
2479
2480 out:
2481 qemu_iovec_destroy(&ctx->data.iov);
2482 g_free(ctx->data.bounce);
2483 g_free(ctx);
2484
2485 nvme_enqueue_req_completion(nvme_cq(req), req);
2486 }
2487
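/*
 * Dataset Management. Only the Deallocate (AD) attribute is acted upon; each
 * range is bounds-checked and then discarded in chunks of at most
 * BDRV_REQUEST_MAX_BYTES, with the counter in req->opaque deferring
 * completion until the last discard has finished.
 */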
2488 static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
2489 {
2490 NvmeNamespace *ns = req->ns;
2491 NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
2492
2493 uint32_t attr = le32_to_cpu(dsm->attributes);
2494 uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
2495
2496 uint16_t status = NVME_SUCCESS;
2497
2498 trace_pci_nvme_dsm(nvme_cid(req), nvme_nsid(ns), nr, attr);
2499
2500 if (attr & NVME_DSMGMT_AD) {
2501 int64_t offset;
2502 size_t len;
2503 NvmeDsmRange range[nr];
2504 uintptr_t *discards = (uintptr_t *)&req->opaque;
2505
2506 status = nvme_h2c(n, (uint8_t *)range, sizeof(range), req);
2507 if (status) {
2508 return status;
2509 }
2510
2511 /*
2512 * AIO callbacks may be called immediately, so initialize discards to 1
2513          * to make sure that the callback does not complete the request before
2514 * all discards have been issued.
2515 */
2516 *discards = 1;
2517
2518 for (int i = 0; i < nr; i++) {
2519 uint64_t slba = le64_to_cpu(range[i].slba);
2520 uint32_t nlb = le32_to_cpu(range[i].nlb);
2521
2522 if (nvme_check_bounds(ns, slba, nlb)) {
2523 trace_pci_nvme_err_invalid_lba_range(slba, nlb,
2524 ns->id_ns.nsze);
2525 continue;
2526 }
2527
2528 trace_pci_nvme_dsm_deallocate(nvme_cid(req), nvme_nsid(ns), slba,
2529 nlb);
2530
2531 if (nlb > n->dmrsl) {
2532 trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl);
2533 }
2534
2535 offset = nvme_l2b(ns, slba);
2536 len = nvme_l2b(ns, nlb);
2537
2538 while (len) {
2539 size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len);
2540
2541 (*discards)++;
2542
2543 blk_aio_pdiscard(ns->blkconf.blk, offset, bytes,
2544 nvme_aio_discard_cb, req);
2545
2546 offset += bytes;
2547 len -= bytes;
2548 }
2549 }
2550
2551 /* account for the 1-initialization */
2552 (*discards)--;
2553
2554 if (*discards) {
2555 status = NVME_NO_COMPLETE;
2556 } else {
2557 status = req->status;
2558 }
2559 }
2560
2561 return status;
2562 }
2563
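/*
 * Verify. Rejects PRACT for protected namespaces and limits the transfer to
 * 2^vsl memory pages, then reads the data range into a bounce buffer; the
 * actual checks happen in nvme_verify_mdata_in_cb() and nvme_verify_cb().
 */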
2564 static uint16_t nvme_verify(NvmeCtrl *n, NvmeRequest *req)
2565 {
2566 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2567 NvmeNamespace *ns = req->ns;
2568 BlockBackend *blk = ns->blkconf.blk;
2569 uint64_t slba = le64_to_cpu(rw->slba);
2570 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2571 size_t len = nvme_l2b(ns, nlb);
2572 int64_t offset = nvme_l2b(ns, slba);
2573 uint16_t ctrl = le16_to_cpu(rw->control);
2574 uint32_t reftag = le32_to_cpu(rw->reftag);
2575 NvmeBounceContext *ctx = NULL;
2576 uint16_t status;
2577
2578 trace_pci_nvme_verify(nvme_cid(req), nvme_nsid(ns), slba, nlb);
2579
2580 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2581 status = nvme_check_prinfo(ns, ctrl, slba, reftag);
2582 if (status) {
2583 return status;
2584 }
2585
2586 if (ctrl & NVME_RW_PRINFO_PRACT) {
2587 return NVME_INVALID_PROT_INFO | NVME_DNR;
2588 }
2589 }
2590
2591 if (len > n->page_size << n->params.vsl) {
2592 return NVME_INVALID_FIELD | NVME_DNR;
2593 }
2594
2595 status = nvme_check_bounds(ns, slba, nlb);
2596 if (status) {
2597 trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
2598 return status;
2599 }
2600
2601 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2602 status = nvme_check_dulbe(ns, slba, nlb);
2603 if (status) {
2604 return status;
2605 }
2606 }
2607
2608 ctx = g_new0(NvmeBounceContext, 1);
2609 ctx->req = req;
2610
2611 ctx->data.bounce = g_malloc(len);
2612
2613 qemu_iovec_init(&ctx->data.iov, 1);
2614 qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, len);
2615
2616 block_acct_start(blk_get_stats(blk), &req->acct, ctx->data.iov.size,
2617 BLOCK_ACCT_READ);
2618
2619 req->aiocb = blk_aio_preadv(ns->blkconf.blk, offset, &ctx->data.iov, 0,
2620 nvme_verify_mdata_in_cb, ctx);
2621 return NVME_NO_COMPLETE;
2622 }
2623
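/*
 * Copy. Validates the descriptor format and the per-range and total length
 * limits (MSSRL, MSRC and MCL), then reads every source range (data and, if
 * present, metadata) into the bounce buffers; see nvme_copy_in_complete()
 * for the write side.
 */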
2624 static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)
2625 {
2626 NvmeNamespace *ns = req->ns;
2627 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
2628
2629 uint16_t nr = copy->nr + 1;
2630 uint8_t format = copy->control[0] & 0xf;
2631
2632 /*
2633 * Shift the PRINFOR/PRINFOW values by 10 to allow reusing the
2634 * NVME_RW_PRINFO constants.
2635 */
2636 uint16_t prinfor = ((copy->control[0] >> 4) & 0xf) << 10;
2637 uint16_t prinfow = ((copy->control[2] >> 2) & 0xf) << 10;
2638
2639 uint32_t nlb = 0;
2640 uint8_t *bounce = NULL, *bouncep = NULL;
2641 uint8_t *mbounce = NULL, *mbouncep = NULL;
2642 struct nvme_copy_ctx *ctx;
2643 uint16_t status;
2644 int i;
2645
2646 trace_pci_nvme_copy(nvme_cid(req), nvme_nsid(ns), nr, format);
2647
2648 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) &&
2649 ((prinfor & NVME_RW_PRINFO_PRACT) != (prinfow & NVME_RW_PRINFO_PRACT))) {
2650 return NVME_INVALID_FIELD | NVME_DNR;
2651 }
2652
2653 if (!(n->id_ctrl.ocfs & (1 << format))) {
2654 trace_pci_nvme_err_copy_invalid_format(format);
2655 return NVME_INVALID_FIELD | NVME_DNR;
2656 }
2657
2658 if (nr > ns->id_ns.msrc + 1) {
2659 return NVME_CMD_SIZE_LIMIT | NVME_DNR;
2660 }
2661
2662 ctx = g_new(struct nvme_copy_ctx, 1);
2663 ctx->ranges = g_new(NvmeCopySourceRange, nr);
2664
2665 status = nvme_h2c(n, (uint8_t *)ctx->ranges,
2666 nr * sizeof(NvmeCopySourceRange), req);
2667 if (status) {
2668 goto out;
2669 }
2670
2671 for (i = 0; i < nr; i++) {
2672 uint64_t slba = le64_to_cpu(ctx->ranges[i].slba);
2673 uint32_t _nlb = le16_to_cpu(ctx->ranges[i].nlb) + 1;
2674
2675 if (_nlb > le16_to_cpu(ns->id_ns.mssrl)) {
2676 status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
2677 goto out;
2678 }
2679
2680 status = nvme_check_bounds(ns, slba, _nlb);
2681 if (status) {
2682 trace_pci_nvme_err_invalid_lba_range(slba, _nlb, ns->id_ns.nsze);
2683 goto out;
2684 }
2685
2686 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2687 status = nvme_check_dulbe(ns, slba, _nlb);
2688 if (status) {
2689 goto out;
2690 }
2691 }
2692
2693 if (ns->params.zoned) {
2694 status = nvme_check_zone_read(ns, slba, _nlb);
2695 if (status) {
2696 goto out;
2697 }
2698 }
2699
2700 nlb += _nlb;
2701 }
2702
2703 if (nlb > le32_to_cpu(ns->id_ns.mcl)) {
2704 status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
2705 goto out;
2706 }
2707
2708 bounce = bouncep = g_malloc(nvme_l2b(ns, nlb));
2709 if (nvme_msize(ns)) {
2710 mbounce = mbouncep = g_malloc(nvme_m2b(ns, nlb));
2711 }
2712
2713 block_acct_start(blk_get_stats(ns->blkconf.blk), &req->acct, 0,
2714 BLOCK_ACCT_READ);
2715
2716 ctx->bounce = bounce;
2717 ctx->mbounce = mbounce;
2718 ctx->nlb = nlb;
2719 ctx->copies = 1;
2720
2721 req->opaque = ctx;
2722
2723 for (i = 0; i < nr; i++) {
2724 uint64_t slba = le64_to_cpu(ctx->ranges[i].slba);
2725 uint32_t nlb = le16_to_cpu(ctx->ranges[i].nlb) + 1;
2726
2727 size_t len = nvme_l2b(ns, nlb);
2728 int64_t offset = nvme_l2b(ns, slba);
2729
2730 trace_pci_nvme_copy_source_range(slba, nlb);
2731
2732 struct nvme_copy_in_ctx *in_ctx = g_new(struct nvme_copy_in_ctx, 1);
2733 in_ctx->req = req;
2734
2735 qemu_iovec_init(&in_ctx->iov, 1);
2736 qemu_iovec_add(&in_ctx->iov, bouncep, len);
2737
2738 ctx->copies++;
2739
2740 blk_aio_preadv(ns->blkconf.blk, offset, &in_ctx->iov, 0,
2741 nvme_aio_copy_in_cb, in_ctx);
2742
2743 bouncep += len;
2744
2745 if (nvme_msize(ns)) {
2746 len = nvme_m2b(ns, nlb);
2747 offset = ns->mdata_offset + nvme_m2b(ns, slba);
2748
2749 in_ctx = g_new(struct nvme_copy_in_ctx, 1);
2750 in_ctx->req = req;
2751
2752 qemu_iovec_init(&in_ctx->iov, 1);
2753 qemu_iovec_add(&in_ctx->iov, mbouncep, len);
2754
2755 ctx->copies++;
2756
2757 blk_aio_preadv(ns->blkconf.blk, offset, &in_ctx->iov, 0,
2758 nvme_aio_copy_in_cb, in_ctx);
2759
2760 mbouncep += len;
2761 }
2762 }
2763
2764 /* account for the 1-initialization */
2765 ctx->copies--;
2766
2767 if (!ctx->copies) {
2768 nvme_copy_in_complete(req);
2769 }
2770
2771 return NVME_NO_COMPLETE;
2772
2773 out:
2774 g_free(ctx->ranges);
2775 g_free(ctx);
2776
2777 return status;
2778 }
2779
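/*
 * Compare. PRACT must be cleared for protected namespaces; the data referred
 * to by the command is read into a bounce buffer and checked against the
 * host data in nvme_compare_data_cb() and nvme_compare_mdata_cb().
 */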
2780 static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
2781 {
2782 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2783 NvmeNamespace *ns = req->ns;
2784 BlockBackend *blk = ns->blkconf.blk;
2785 uint64_t slba = le64_to_cpu(rw->slba);
2786 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2787 uint16_t ctrl = le16_to_cpu(rw->control);
2788 size_t data_len = nvme_l2b(ns, nlb);
2789 size_t len = data_len;
2790 int64_t offset = nvme_l2b(ns, slba);
2791 struct nvme_compare_ctx *ctx = NULL;
2792 uint16_t status;
2793
2794 trace_pci_nvme_compare(nvme_cid(req), nvme_nsid(ns), slba, nlb);
2795
2796 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) && (ctrl & NVME_RW_PRINFO_PRACT)) {
2797 return NVME_INVALID_PROT_INFO | NVME_DNR;
2798 }
2799
2800 if (nvme_ns_ext(ns)) {
2801 len += nvme_m2b(ns, nlb);
2802 }
2803
2804 status = nvme_check_mdts(n, len);
2805 if (status) {
2806 return status;
2807 }
2808
2809 status = nvme_check_bounds(ns, slba, nlb);
2810 if (status) {
2811 trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
2812 return status;
2813 }
2814
2815 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2816 status = nvme_check_dulbe(ns, slba, nlb);
2817 if (status) {
2818 return status;
2819 }
2820 }
2821
2822 status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
2823 if (status) {
2824 return status;
2825 }
2826
2827 ctx = g_new(struct nvme_compare_ctx, 1);
2828 ctx->data.bounce = g_malloc(data_len);
2829
2830 req->opaque = ctx;
2831
2832 qemu_iovec_init(&ctx->data.iov, 1);
2833 qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, data_len);
2834
2835 block_acct_start(blk_get_stats(blk), &req->acct, data_len,
2836 BLOCK_ACCT_READ);
2837 blk_aio_preadv(blk, offset, &ctx->data.iov, 0, nvme_compare_data_cb, req);
2838
2839 return NVME_NO_COMPLETE;
2840 }
2841
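/*
 * Flush. A flush for a specific namespace is handed straight to the block
 * layer; the broadcast NSID flushes every attached namespace, using the
 * 1-initialized counter in req->opaque to complete the request only after
 * the last flush callback has run.
 */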
2842 static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
2843 {
2844 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
2845 uintptr_t *num_flushes = (uintptr_t *)&req->opaque;
2846 uint16_t status;
2847 struct nvme_aio_flush_ctx *ctx;
2848 NvmeNamespace *ns;
2849
2850 trace_pci_nvme_flush(nvme_cid(req), nsid);
2851
2852 if (nsid != NVME_NSID_BROADCAST) {
2853 req->ns = nvme_ns(n, nsid);
2854 if (unlikely(!req->ns)) {
2855 return NVME_INVALID_FIELD | NVME_DNR;
2856 }
2857
2858 block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0,
2859 BLOCK_ACCT_FLUSH);
2860 req->aiocb = blk_aio_flush(req->ns->blkconf.blk, nvme_misc_cb, req);
2861 return NVME_NO_COMPLETE;
2862 }
2863
2864 /* 1-initialize; see comment in nvme_dsm */
2865 *num_flushes = 1;
2866
2867 for (int i = 1; i <= n->num_namespaces; i++) {
2868 ns = nvme_ns(n, i);
2869 if (!ns) {
2870 continue;
2871 }
2872
2873 ctx = g_new(struct nvme_aio_flush_ctx, 1);
2874 ctx->req = req;
2875 ctx->ns = ns;
2876
2877 (*num_flushes)++;
2878
2879 block_acct_start(blk_get_stats(ns->blkconf.blk), &ctx->acct, 0,
2880 BLOCK_ACCT_FLUSH);
2881 blk_aio_flush(ns->blkconf.blk, nvme_aio_flush_cb, ctx);
2882 }
2883
2884 /* account for the 1-initialization */
2885 (*num_flushes)--;
2886
2887 if (*num_flushes) {
2888 status = NVME_NO_COMPLETE;
2889 } else {
2890 status = req->status;
2891 }
2892
2893 return status;
2894 }
2895
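/*
 * Read. After the MDTS, bounds, zone read and DULBE checks, namespaces
 * formatted with protection information take the nvme_dif_rw() path;
 * otherwise the data pointer is mapped and the read is submitted directly
 * to the block backend.
 */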
2896 static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
2897 {
2898 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2899 NvmeNamespace *ns = req->ns;
2900 uint64_t slba = le64_to_cpu(rw->slba);
2901 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
2902 uint16_t ctrl = le16_to_cpu(rw->control);
2903 uint64_t data_size = nvme_l2b(ns, nlb);
2904 uint64_t mapped_size = data_size;
2905 uint64_t data_offset;
2906 BlockBackend *blk = ns->blkconf.blk;
2907 uint16_t status;
2908
2909 if (nvme_ns_ext(ns)) {
2910 mapped_size += nvme_m2b(ns, nlb);
2911
2912 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2913 bool pract = ctrl & NVME_RW_PRINFO_PRACT;
2914
2915 if (pract && nvme_msize(ns) == 8) {
2916 mapped_size = data_size;
2917 }
2918 }
2919 }
2920
2921 trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, mapped_size, slba);
2922
2923 status = nvme_check_mdts(n, mapped_size);
2924 if (status) {
2925 goto invalid;
2926 }
2927
2928 status = nvme_check_bounds(ns, slba, nlb);
2929 if (status) {
2930 trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
2931 goto invalid;
2932 }
2933
2934 if (ns->params.zoned) {
2935 status = nvme_check_zone_read(ns, slba, nlb);
2936 if (status) {
2937 trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status);
2938 goto invalid;
2939 }
2940 }
2941
2942 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2943 status = nvme_check_dulbe(ns, slba, nlb);
2944 if (status) {
2945 goto invalid;
2946 }
2947 }
2948
2949 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2950 return nvme_dif_rw(n, req);
2951 }
2952
2953 status = nvme_map_data(n, nlb, req);
2954 if (status) {
2955 goto invalid;
2956 }
2957
2958 data_offset = nvme_l2b(ns, slba);
2959
2960 block_acct_start(blk_get_stats(blk), &req->acct, data_size,
2961 BLOCK_ACCT_READ);
2962 nvme_blk_read(blk, data_offset, nvme_rw_cb, req);
2963 return NVME_NO_COMPLETE;
2964
2965 invalid:
2966 block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ);
2967 return status | NVME_DNR;
2968 }
2969
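/*
 * Common implementation of Write, Write Zeroes (wrz) and Zone Append
 * (append). For zoned namespaces the target zone is validated and its write
 * pointer advanced; Zone Append additionally rewrites the SLBA to the
 * current write pointer and, for protection types 1 and 2 with PIREMAP set,
 * remaps the initial reference tag accordingly.
 */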
2970 static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
2971 bool wrz)
2972 {
2973 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2974 NvmeNamespace *ns = req->ns;
2975 uint64_t slba = le64_to_cpu(rw->slba);
2976 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
2977 uint16_t ctrl = le16_to_cpu(rw->control);
2978 uint64_t data_size = nvme_l2b(ns, nlb);
2979 uint64_t mapped_size = data_size;
2980 uint64_t data_offset;
2981 NvmeZone *zone;
2982 NvmeZonedResult *res = (NvmeZonedResult *)&req->cqe;
2983 BlockBackend *blk = ns->blkconf.blk;
2984 uint16_t status;
2985
2986 if (nvme_ns_ext(ns)) {
2987 mapped_size += nvme_m2b(ns, nlb);
2988
2989 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2990 bool pract = ctrl & NVME_RW_PRINFO_PRACT;
2991
2992 if (pract && nvme_msize(ns) == 8) {
2993 mapped_size -= nvme_m2b(ns, nlb);
2994 }
2995 }
2996 }
2997
2998 trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode),
2999 nvme_nsid(ns), nlb, mapped_size, slba);
3000
3001 if (!wrz) {
3002 status = nvme_check_mdts(n, mapped_size);
3003 if (status) {
3004 goto invalid;
3005 }
3006 }
3007
3008 status = nvme_check_bounds(ns, slba, nlb);
3009 if (status) {
3010 trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
3011 goto invalid;
3012 }
3013
3014 if (ns->params.zoned) {
3015 zone = nvme_get_zone_by_slba(ns, slba);
3016
3017 if (append) {
3018 bool piremap = !!(ctrl & NVME_RW_PIREMAP);
3019
3020 if (unlikely(slba != zone->d.zslba)) {
3021 trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
3022 status = NVME_INVALID_FIELD;
3023 goto invalid;
3024 }
3025
3026 if (n->params.zasl &&
3027 data_size > (uint64_t)n->page_size << n->params.zasl) {
3028 trace_pci_nvme_err_zasl(data_size);
3029 return NVME_INVALID_FIELD | NVME_DNR;
3030 }
3031
3032 slba = zone->w_ptr;
3033 rw->slba = cpu_to_le64(slba);
3034 res->slba = cpu_to_le64(slba);
3035
3036 switch (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3037 case NVME_ID_NS_DPS_TYPE_1:
3038 if (!piremap) {
3039 return NVME_INVALID_PROT_INFO | NVME_DNR;
3040 }
3041
3042 /* fallthrough */
3043
3044 case NVME_ID_NS_DPS_TYPE_2:
3045 if (piremap) {
3046 uint32_t reftag = le32_to_cpu(rw->reftag);
3047 rw->reftag = cpu_to_le32(reftag + (slba - zone->d.zslba));
3048 }
3049
3050 break;
3051
3052 case NVME_ID_NS_DPS_TYPE_3:
3053 if (piremap) {
3054 return NVME_INVALID_PROT_INFO | NVME_DNR;
3055 }
3056
3057 break;
3058 }
3059 }
3060
3061 status = nvme_check_zone_write(ns, zone, slba, nlb);
3062 if (status) {
3063 goto invalid;
3064 }
3065
3066 status = nvme_zrm_auto(ns, zone);
3067 if (status) {
3068 goto invalid;
3069 }
3070
3071 zone->w_ptr += nlb;
3072 }
3073
3074 data_offset = nvme_l2b(ns, slba);
3075
3076 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3077 return nvme_dif_rw(n, req);
3078 }
3079
3080 if (!wrz) {
3081 status = nvme_map_data(n, nlb, req);
3082 if (status) {
3083 goto invalid;
3084 }
3085
3086 block_acct_start(blk_get_stats(blk), &req->acct, data_size,
3087 BLOCK_ACCT_WRITE);
3088 nvme_blk_write(blk, data_offset, nvme_rw_cb, req);
3089 } else {
3090 req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size,
3091 BDRV_REQ_MAY_UNMAP, nvme_rw_cb,
3092 req);
3093 }
3094
3095 return NVME_NO_COMPLETE;
3096
3097 invalid:
3098 block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_WRITE);
3099 return status | NVME_DNR;
3100 }
3101
3102 static inline uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
3103 {
3104 return nvme_do_write(n, req, false, false);
3105 }
3106
3107 static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
3108 {
3109 return nvme_do_write(n, req, false, true);
3110 }
3111
3112 static inline uint16_t nvme_zone_append(NvmeCtrl *n, NvmeRequest *req)
3113 {
3114 return nvme_do_write(n, req, true, false);
3115 }
3116
3117 static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c,
3118 uint64_t *slba, uint32_t *zone_idx)
3119 {
3120 uint32_t dw10 = le32_to_cpu(c->cdw10);
3121 uint32_t dw11 = le32_to_cpu(c->cdw11);
3122
3123 if (!ns->params.zoned) {
3124 trace_pci_nvme_err_invalid_opc(c->opcode);
3125 return NVME_INVALID_OPCODE | NVME_DNR;
3126 }
3127
3128 *slba = ((uint64_t)dw11) << 32 | dw10;
3129 if (unlikely(*slba >= ns->id_ns.nsze)) {
3130 trace_pci_nvme_err_invalid_lba_range(*slba, 0, ns->id_ns.nsze);
3131 *slba = 0;
3132 return NVME_LBA_RANGE | NVME_DNR;
3133 }
3134
3135 *zone_idx = nvme_zone_idx(ns, *slba);
3136 assert(*zone_idx < ns->num_zones);
3137
3138 return NVME_SUCCESS;
3139 }
3140
3141 typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState,
3142 NvmeRequest *);
3143
3144 enum NvmeZoneProcessingMask {
3145 NVME_PROC_CURRENT_ZONE = 0,
3146 NVME_PROC_OPENED_ZONES = 1 << 0,
3147 NVME_PROC_CLOSED_ZONES = 1 << 1,
3148 NVME_PROC_READ_ONLY_ZONES = 1 << 2,
3149 NVME_PROC_FULL_ZONES = 1 << 3,
3150 };
3151
3152 static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone,
3153 NvmeZoneState state, NvmeRequest *req)
3154 {
3155 return nvme_zrm_open(ns, zone);
3156 }
3157
3158 static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone,
3159 NvmeZoneState state, NvmeRequest *req)
3160 {
3161 return nvme_zrm_close(ns, zone);
3162 }
3163
3164 static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone,
3165 NvmeZoneState state, NvmeRequest *req)
3166 {
3167 return nvme_zrm_finish(ns, zone);
3168 }
3169
3170 static uint16_t nvme_reset_zone(NvmeNamespace *ns, NvmeZone *zone,
3171 NvmeZoneState state, NvmeRequest *req)
3172 {
3173 uintptr_t *resets = (uintptr_t *)&req->opaque;
3174 struct nvme_zone_reset_ctx *ctx;
3175
3176 switch (state) {
3177 case NVME_ZONE_STATE_EMPTY:
3178 return NVME_SUCCESS;
3179 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
3180 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
3181 case NVME_ZONE_STATE_CLOSED:
3182 case NVME_ZONE_STATE_FULL:
3183 break;
3184 default:
3185 return NVME_ZONE_INVAL_TRANSITION;
3186 }
3187
3188 /*
3189 * The zone reset aio callback needs to know the zone that is being reset
3190 * in order to transition the zone on completion.
3191 */
3192 ctx = g_new(struct nvme_zone_reset_ctx, 1);
3193 ctx->req = req;
3194 ctx->zone = zone;
3195
3196 (*resets)++;
3197
3198 blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_l2b(ns, zone->d.zslba),
3199 nvme_l2b(ns, ns->zone_size), BDRV_REQ_MAY_UNMAP,
3200 nvme_aio_zone_reset_cb, ctx);
3201
3202 return NVME_NO_COMPLETE;
3203 }
3204
3205 static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone,
3206 NvmeZoneState state, NvmeRequest *req)
3207 {
3208 switch (state) {
3209 case NVME_ZONE_STATE_READ_ONLY:
3210 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_OFFLINE);
3211 /* fall through */
3212 case NVME_ZONE_STATE_OFFLINE:
3213 return NVME_SUCCESS;
3214 default:
3215 return NVME_ZONE_INVAL_TRANSITION;
3216 }
3217 }
3218
3219 static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone)
3220 {
3221 uint16_t status;
3222 uint8_t state = nvme_get_zone_state(zone);
3223
3224 if (state == NVME_ZONE_STATE_EMPTY) {
3225 status = nvme_aor_check(ns, 1, 0);
3226 if (status) {
3227 return status;
3228 }
3229 nvme_aor_inc_active(ns);
3230 zone->d.za |= NVME_ZA_ZD_EXT_VALID;
3231 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
3232 return NVME_SUCCESS;
3233 }
3234
3235 return NVME_ZONE_INVAL_TRANSITION;
3236 }
3237
3238 static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone,
3239 enum NvmeZoneProcessingMask proc_mask,
3240 op_handler_t op_hndlr, NvmeRequest *req)
3241 {
3242 uint16_t status = NVME_SUCCESS;
3243 NvmeZoneState zs = nvme_get_zone_state(zone);
3244 bool proc_zone;
3245
3246 switch (zs) {
3247 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
3248 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
3249 proc_zone = proc_mask & NVME_PROC_OPENED_ZONES;
3250 break;
3251 case NVME_ZONE_STATE_CLOSED:
3252 proc_zone = proc_mask & NVME_PROC_CLOSED_ZONES;
3253 break;
3254 case NVME_ZONE_STATE_READ_ONLY:
3255 proc_zone = proc_mask & NVME_PROC_READ_ONLY_ZONES;
3256 break;
3257 case NVME_ZONE_STATE_FULL:
3258 proc_zone = proc_mask & NVME_PROC_FULL_ZONES;
3259 break;
3260 default:
3261 proc_zone = false;
3262 }
3263
3264 if (proc_zone) {
3265 status = op_hndlr(ns, zone, zs, req);
3266 }
3267
3268 return status;
3269 }
3270
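/*
 * Apply a zone management operation either to a single zone (an empty
 * proc_mask) or, for "Select All" requests, to every zone whose current
 * state is included in proc_mask.
 */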
3271 static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone,
3272 enum NvmeZoneProcessingMask proc_mask,
3273 op_handler_t op_hndlr, NvmeRequest *req)
3274 {
3275 NvmeZone *next;
3276 uint16_t status = NVME_SUCCESS;
3277 int i;
3278
3279 if (!proc_mask) {
3280 status = op_hndlr(ns, zone, nvme_get_zone_state(zone), req);
3281 } else {
3282 if (proc_mask & NVME_PROC_CLOSED_ZONES) {
3283 QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) {
3284 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3285 req);
3286 if (status && status != NVME_NO_COMPLETE) {
3287 goto out;
3288 }
3289 }
3290 }
3291 if (proc_mask & NVME_PROC_OPENED_ZONES) {
3292 QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) {
3293 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3294 req);
3295 if (status && status != NVME_NO_COMPLETE) {
3296 goto out;
3297 }
3298 }
3299
3300 QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) {
3301 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3302 req);
3303 if (status && status != NVME_NO_COMPLETE) {
3304 goto out;
3305 }
3306 }
3307 }
3308 if (proc_mask & NVME_PROC_FULL_ZONES) {
3309 QTAILQ_FOREACH_SAFE(zone, &ns->full_zones, entry, next) {
3310 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3311 req);
3312 if (status && status != NVME_NO_COMPLETE) {
3313 goto out;
3314 }
3315 }
3316 }
3317
3318 if (proc_mask & NVME_PROC_READ_ONLY_ZONES) {
3319 for (i = 0; i < ns->num_zones; i++, zone++) {
3320 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3321 req);
3322 if (status && status != NVME_NO_COMPLETE) {
3323 goto out;
3324 }
3325 }
3326 }
3327 }
3328
3329 out:
3330 return status;
3331 }
3332
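/*
 * Zone Management Send. The target zone is taken from the command SLBA
 * unless the Select All bit is set; each action is then dispatched through
 * nvme_do_zone_op() with a processing mask describing the zone states it
 * applies to.
 */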
3333 static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
3334 {
3335 NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
3336 NvmeNamespace *ns = req->ns;
3337 NvmeZone *zone;
3338 uintptr_t *resets;
3339 uint8_t *zd_ext;
3340 uint32_t dw13 = le32_to_cpu(cmd->cdw13);
3341 uint64_t slba = 0;
3342 uint32_t zone_idx = 0;
3343 uint16_t status;
3344 uint8_t action;
3345 bool all;
3346 enum NvmeZoneProcessingMask proc_mask = NVME_PROC_CURRENT_ZONE;
3347
3348 action = dw13 & 0xff;
3349 all = dw13 & 0x100;
3350
3351 req->status = NVME_SUCCESS;
3352
3353 if (!all) {
3354 status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
3355 if (status) {
3356 return status;
3357 }
3358 }
3359
3360 zone = &ns->zone_array[zone_idx];
3361 if (slba != zone->d.zslba) {
3362 trace_pci_nvme_err_unaligned_zone_cmd(action, slba, zone->d.zslba);
3363 return NVME_INVALID_FIELD | NVME_DNR;
3364 }
3365
3366 switch (action) {
3367
3368 case NVME_ZONE_ACTION_OPEN:
3369 if (all) {
3370 proc_mask = NVME_PROC_CLOSED_ZONES;
3371 }
3372 trace_pci_nvme_open_zone(slba, zone_idx, all);
3373 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_open_zone, req);
3374 break;
3375
3376 case NVME_ZONE_ACTION_CLOSE:
3377 if (all) {
3378 proc_mask = NVME_PROC_OPENED_ZONES;
3379 }
3380 trace_pci_nvme_close_zone(slba, zone_idx, all);
3381 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone, req);
3382 break;
3383
3384 case NVME_ZONE_ACTION_FINISH:
3385 if (all) {
3386 proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES;
3387 }
3388 trace_pci_nvme_finish_zone(slba, zone_idx, all);
3389 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone, req);
3390 break;
3391
3392 case NVME_ZONE_ACTION_RESET:
3393 resets = (uintptr_t *)&req->opaque;
3394
3395 if (all) {