block/export: Add blk_exp_close_all(_type)
[qemu.git] / nbd / server.c
1 /*
2 * Copyright (C) 2016-2018 Red Hat, Inc.
3 * Copyright (C) 2005 Anthony Liguori <anthony@codemonkey.ws>
4 *
5 * Network Block Device Server Side
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; under version 2 of the License.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "qemu/osdep.h"
21
22 #include "block/export.h"
23 #include "qapi/error.h"
24 #include "qemu/queue.h"
25 #include "trace.h"
26 #include "nbd-internal.h"
27 #include "qemu/units.h"
28
/*
 * Context IDs assigned to meta contexts negotiated via
 * NBD_OPT_SET_META_CONTEXT; echoed back in block-status replies.
 */
#define NBD_META_ID_BASE_ALLOCATION 0
#define NBD_META_ID_DIRTY_BITMAP 1

/*
 * NBD_MAX_BLOCK_STATUS_EXTENTS: 1 MiB of extents data. An empirical
 * constant. If an increase is needed, note that the NBD protocol
 * recommends no larger than 32 mb, so that the client won't consider
 * the reply as a denial of service attack.
 */
#define NBD_MAX_BLOCK_STATUS_EXTENTS (1 * MiB / 8)
39
40 static int system_errno_to_nbd_errno(int err)
41 {
42 switch (err) {
43 case 0:
44 return NBD_SUCCESS;
45 case EPERM:
46 case EROFS:
47 return NBD_EPERM;
48 case EIO:
49 return NBD_EIO;
50 case ENOMEM:
51 return NBD_ENOMEM;
52 #ifdef EDQUOT
53 case EDQUOT:
54 #endif
55 case EFBIG:
56 case ENOSPC:
57 return NBD_ENOSPC;
58 case EOVERFLOW:
59 return NBD_EOVERFLOW;
60 case ENOTSUP:
61 #if ENOTSUP != EOPNOTSUPP
62 case EOPNOTSUPP:
63 #endif
64 return NBD_ENOTSUP;
65 case ESHUTDOWN:
66 return NBD_ESHUTDOWN;
67 case EINVAL:
68 default:
69 return NBD_EINVAL;
70 }
71 }
72
/* Definitions for opaque data types */

typedef struct NBDRequestData NBDRequestData;

/* Per-request state tracked while a client request is in flight. */
struct NBDRequestData {
    QSIMPLEQ_ENTRY(NBDRequestData) entry;
    NBDClient *client;   /* client that issued the request */
    uint8_t *data;       /* payload buffer for the request, if any */
    bool complete;
};
83
/* Server-side state for one NBD export. */
struct NBDExport {
    BlockExport common;                /* generic block-export state */

    BlockBackend *blk;                 /* backend being served */
    char *name;                        /* export name clients select */
    char *description;                 /* optional free-text description */
    uint64_t size;                     /* size advertised to clients */
    uint16_t nbdflags;                 /* NBD_FLAG_* transmission flags */
    QTAILQ_HEAD(, NBDClient) clients;  /* clients attached to this export */
    QTAILQ_ENTRY(NBDExport) next;      /* link in the global exports list */

    BlockBackend *eject_notifier_blk;
    Notifier eject_notifier;

    /* Dirty bitmap exposed via the qemu:dirty-bitmap meta context, if any */
    BdrvDirtyBitmap *export_bitmap;
    char *export_bitmap_context;       /* full meta-context name for it */
};
101
/* Global list of all currently registered exports. */
static QTAILQ_HEAD(, NBDExport) exports = QTAILQ_HEAD_INITIALIZER(exports);
103
/*
 * NBDExportMetaContexts represents the list of meta contexts selected
 * by NBD_OPT_SET_META_CONTEXT for one export; also reused as scratch
 * state while answering NBD_OPT_LIST_META_CONTEXT.
 */
typedef struct NBDExportMetaContexts {
    NBDExport *exp;       /* export the contexts were negotiated against */
    bool valid; /* means that negotiation of the option finished without
                   errors */
    bool base_allocation; /* export base:allocation context (block status) */
    bool bitmap; /* export qemu:dirty-bitmap:<export bitmap name> */
} NBDExportMetaContexts;
114
/* Per-connection server state. */
struct NBDClient {
    int refcount;                      /* manual reference count */
    void (*close_fn)(NBDClient *client, bool negotiated); /* teardown hook */

    NBDExport *exp;                    /* export attached to, once negotiated */
    QCryptoTLSCreds *tlscreds;         /* TLS credentials, if configured */
    char *tlsauthz;                    /* TLS authorization ID, if any */
    QIOChannelSocket *sioc; /* The underlying data channel */
    QIOChannel *ioc; /* The current I/O channel which may differ (eg TLS) */

    Coroutine *recv_coroutine;         /* coroutine receiving requests */

    CoMutex send_lock;                 /* serializes replies on ioc */
    Coroutine *send_coroutine;

    QTAILQ_ENTRY(NBDClient) next;      /* link in the export's client list */
    int nb_requests;                   /* requests currently in flight */
    bool closing;                      /* true once shutdown has started */

    uint32_t check_align; /* If non-zero, check for aligned client requests */

    bool structured_reply;             /* structured replies negotiated */
    NBDExportMetaContexts export_meta; /* negotiated meta contexts */

    uint32_t opt; /* Current option being negotiated */
    uint32_t optlen; /* remaining length of data in ioc for the option being
                        negotiated now */
};
143
144 static void nbd_client_receive_next_request(NBDClient *client);
145
146 /* Basic flow for negotiation
147
148 Server Client
149 Negotiate
150
151 or
152
153 Server Client
154 Negotiate #1
155 Option
156 Negotiate #2
157
158 ----
159
160 followed by
161
162 Server Client
163 Request
164 Response
165 Request
166 Response
167 ...
168 ...
169 Request (type == 2)
170
171 */
172
173 static inline void set_be_option_rep(NBDOptionReply *rep, uint32_t option,
174 uint32_t type, uint32_t length)
175 {
176 stq_be_p(&rep->magic, NBD_REP_MAGIC);
177 stl_be_p(&rep->option, option);
178 stl_be_p(&rep->type, type);
179 stl_be_p(&rep->length, length);
180 }
181
182 /* Send a reply header, including length, but no payload.
183 * Return -errno on error, 0 on success. */
184 static int nbd_negotiate_send_rep_len(NBDClient *client, uint32_t type,
185 uint32_t len, Error **errp)
186 {
187 NBDOptionReply rep;
188
189 trace_nbd_negotiate_send_rep_len(client->opt, nbd_opt_lookup(client->opt),
190 type, nbd_rep_lookup(type), len);
191
192 assert(len < NBD_MAX_BUFFER_SIZE);
193
194 set_be_option_rep(&rep, client->opt, type, len);
195 return nbd_write(client->ioc, &rep, sizeof(rep), errp);
196 }
197
/* Send a reply header with default 0 length.
 * Convenience wrapper around nbd_negotiate_send_rep_len().
 * Return -errno on error, 0 on success. */
static int nbd_negotiate_send_rep(NBDClient *client, uint32_t type,
                                  Error **errp)
{
    return nbd_negotiate_send_rep_len(client, type, 0, errp);
}
205
206 /* Send an error reply.
207 * Return -errno on error, 0 on success. */
208 static int GCC_FMT_ATTR(4, 0)
209 nbd_negotiate_send_rep_verr(NBDClient *client, uint32_t type,
210 Error **errp, const char *fmt, va_list va)
211 {
212 ERRP_GUARD();
213 g_autofree char *msg = NULL;
214 int ret;
215 size_t len;
216
217 msg = g_strdup_vprintf(fmt, va);
218 len = strlen(msg);
219 assert(len < NBD_MAX_STRING_SIZE);
220 trace_nbd_negotiate_send_rep_err(msg);
221 ret = nbd_negotiate_send_rep_len(client, type, len, errp);
222 if (ret < 0) {
223 return ret;
224 }
225 if (nbd_write(client->ioc, msg, len, errp) < 0) {
226 error_prepend(errp, "write failed (error message): ");
227 return -EIO;
228 }
229
230 return 0;
231 }
232
/*
 * Return a malloc'd copy of @name suitable for use in an error reply.
 * Overly long names are truncated to 80 bytes plus "..." so a hostile
 * client cannot bloat our error messages.
 */
static char *
nbd_sanitize_name(const char *name)
{
    /* XXX Should we also try to sanitize any control characters? */
    if (strnlen(name, 80) == 80) {
        return g_strdup_printf("%.80s...", name);
    }
    return g_strdup(name);
}
245
246 /* Send an error reply.
247 * Return -errno on error, 0 on success. */
248 static int GCC_FMT_ATTR(4, 5)
249 nbd_negotiate_send_rep_err(NBDClient *client, uint32_t type,
250 Error **errp, const char *fmt, ...)
251 {
252 va_list va;
253 int ret;
254
255 va_start(va, fmt);
256 ret = nbd_negotiate_send_rep_verr(client, type, errp, fmt, va);
257 va_end(va);
258 return ret;
259 }
260
261 /* Drop remainder of the current option, and send a reply with the
262 * given error type and message. Return -errno on read or write
263 * failure; or 0 if connection is still live. */
264 static int GCC_FMT_ATTR(4, 0)
265 nbd_opt_vdrop(NBDClient *client, uint32_t type, Error **errp,
266 const char *fmt, va_list va)
267 {
268 int ret = nbd_drop(client->ioc, client->optlen, errp);
269
270 client->optlen = 0;
271 if (!ret) {
272 ret = nbd_negotiate_send_rep_verr(client, type, errp, fmt, va);
273 }
274 return ret;
275 }
276
277 static int GCC_FMT_ATTR(4, 5)
278 nbd_opt_drop(NBDClient *client, uint32_t type, Error **errp,
279 const char *fmt, ...)
280 {
281 int ret;
282 va_list va;
283
284 va_start(va, fmt);
285 ret = nbd_opt_vdrop(client, type, errp, fmt, va);
286 va_end(va);
287
288 return ret;
289 }
290
291 static int GCC_FMT_ATTR(3, 4)
292 nbd_opt_invalid(NBDClient *client, Error **errp, const char *fmt, ...)
293 {
294 int ret;
295 va_list va;
296
297 va_start(va, fmt);
298 ret = nbd_opt_vdrop(client, NBD_REP_ERR_INVALID, errp, fmt, va);
299 va_end(va);
300
301 return ret;
302 }
303
304 /* Read size bytes from the unparsed payload of the current option.
305 * Return -errno on I/O error, 0 if option was completely handled by
306 * sending a reply about inconsistent lengths, or 1 on success. */
307 static int nbd_opt_read(NBDClient *client, void *buffer, size_t size,
308 Error **errp)
309 {
310 if (size > client->optlen) {
311 return nbd_opt_invalid(client, errp,
312 "Inconsistent lengths in option %s",
313 nbd_opt_lookup(client->opt));
314 }
315 client->optlen -= size;
316 return qio_channel_read_all(client->ioc, buffer, size, errp) < 0 ? -EIO : 1;
317 }
318
319 /* Drop size bytes from the unparsed payload of the current option.
320 * Return -errno on I/O error, 0 if option was completely handled by
321 * sending a reply about inconsistent lengths, or 1 on success. */
322 static int nbd_opt_skip(NBDClient *client, size_t size, Error **errp)
323 {
324 if (size > client->optlen) {
325 return nbd_opt_invalid(client, errp,
326 "Inconsistent lengths in option %s",
327 nbd_opt_lookup(client->opt));
328 }
329 client->optlen -= size;
330 return nbd_drop(client->ioc, size, errp) < 0 ? -EIO : 1;
331 }
332
333 /* nbd_opt_read_name
334 *
335 * Read a string with the format:
336 * uint32_t len (<= NBD_MAX_STRING_SIZE)
337 * len bytes string (not 0-terminated)
338 *
339 * On success, @name will be allocated.
340 * If @length is non-null, it will be set to the actual string length.
341 *
342 * Return -errno on I/O error, 0 if option was completely handled by
343 * sending a reply about inconsistent lengths, or 1 on success.
344 */
345 static int nbd_opt_read_name(NBDClient *client, char **name, uint32_t *length,
346 Error **errp)
347 {
348 int ret;
349 uint32_t len;
350 g_autofree char *local_name = NULL;
351
352 *name = NULL;
353 ret = nbd_opt_read(client, &len, sizeof(len), errp);
354 if (ret <= 0) {
355 return ret;
356 }
357 len = cpu_to_be32(len);
358
359 if (len > NBD_MAX_STRING_SIZE) {
360 return nbd_opt_invalid(client, errp,
361 "Invalid name length: %" PRIu32, len);
362 }
363
364 local_name = g_malloc(len + 1);
365 ret = nbd_opt_read(client, local_name, len, errp);
366 if (ret <= 0) {
367 return ret;
368 }
369 local_name[len] = '\0';
370
371 if (length) {
372 *length = len;
373 }
374 *name = g_steal_pointer(&local_name);
375
376 return 1;
377 }
378
379 /* Send a single NBD_REP_SERVER reply to NBD_OPT_LIST, including payload.
380 * Return -errno on error, 0 on success. */
381 static int nbd_negotiate_send_rep_list(NBDClient *client, NBDExport *exp,
382 Error **errp)
383 {
384 ERRP_GUARD();
385 size_t name_len, desc_len;
386 uint32_t len;
387 const char *name = exp->name ? exp->name : "";
388 const char *desc = exp->description ? exp->description : "";
389 QIOChannel *ioc = client->ioc;
390 int ret;
391
392 trace_nbd_negotiate_send_rep_list(name, desc);
393 name_len = strlen(name);
394 desc_len = strlen(desc);
395 assert(name_len <= NBD_MAX_STRING_SIZE && desc_len <= NBD_MAX_STRING_SIZE);
396 len = name_len + desc_len + sizeof(len);
397 ret = nbd_negotiate_send_rep_len(client, NBD_REP_SERVER, len, errp);
398 if (ret < 0) {
399 return ret;
400 }
401
402 len = cpu_to_be32(name_len);
403 if (nbd_write(ioc, &len, sizeof(len), errp) < 0) {
404 error_prepend(errp, "write failed (name length): ");
405 return -EINVAL;
406 }
407
408 if (nbd_write(ioc, name, name_len, errp) < 0) {
409 error_prepend(errp, "write failed (name buffer): ");
410 return -EINVAL;
411 }
412
413 if (nbd_write(ioc, desc, desc_len, errp) < 0) {
414 error_prepend(errp, "write failed (description buffer): ");
415 return -EINVAL;
416 }
417
418 return 0;
419 }
420
421 /* Process the NBD_OPT_LIST command, with a potential series of replies.
422 * Return -errno on error, 0 on success. */
423 static int nbd_negotiate_handle_list(NBDClient *client, Error **errp)
424 {
425 NBDExport *exp;
426 assert(client->opt == NBD_OPT_LIST);
427
428 /* For each export, send a NBD_REP_SERVER reply. */
429 QTAILQ_FOREACH(exp, &exports, next) {
430 if (nbd_negotiate_send_rep_list(client, exp, errp)) {
431 return -EINVAL;
432 }
433 }
434 /* Finish with a NBD_REP_ACK. */
435 return nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
436 }
437
438 static void nbd_check_meta_export(NBDClient *client)
439 {
440 client->export_meta.valid &= client->exp == client->export_meta.exp;
441 }
442
/* Send a reply to NBD_OPT_EXPORT_NAME.
 * On success the client is attached to the named export and the
 * connection moves to the transmission phase.
 * Return -errno on error, 0 on success. */
static int nbd_negotiate_handle_export_name(NBDClient *client, bool no_zeroes,
                                            Error **errp)
{
    ERRP_GUARD();
    g_autofree char *name = NULL;
    char buf[NBD_REPLY_EXPORT_NAME_SIZE] = "";
    size_t len;
    int ret;
    uint16_t myflags;

    /* Client sends:
        [20 .. xx] export name (length bytes)
       Server replies:
        [ 0 .. 7] size
        [ 8 .. 9] export flags
        [10 .. 133] reserved (0) [unless no_zeroes]
    */
    trace_nbd_negotiate_handle_export_name();
    if (client->optlen > NBD_MAX_STRING_SIZE) {
        error_setg(errp, "Bad length received");
        return -EINVAL;
    }
    name = g_malloc(client->optlen + 1);
    if (nbd_read(client->ioc, name, client->optlen, "export name", errp) < 0) {
        return -EIO;
    }
    name[client->optlen] = '\0';
    client->optlen = 0;

    trace_nbd_negotiate_handle_export_name_request(name);

    client->exp = nbd_export_find(name);
    if (!client->exp) {
        /* Old-style option: no way to report the error, so the caller
         * will drop the connection. */
        error_setg(errp, "export not found");
        return -EINVAL;
    }

    /* NBD_FLAG_SEND_DF is advertised only when structured replies were
     * negotiated earlier. */
    myflags = client->exp->nbdflags;
    if (client->structured_reply) {
        myflags |= NBD_FLAG_SEND_DF;
    }
    trace_nbd_negotiate_new_style_size_flags(client->exp->size, myflags);
    stq_be_p(buf, client->exp->size);
    stw_be_p(buf + 8, myflags);
    /* Trailing zero padding is skipped if the client set NO_ZEROES. */
    len = no_zeroes ? 10 : sizeof(buf);
    ret = nbd_write(client->ioc, buf, len, errp);
    if (ret < 0) {
        error_prepend(errp, "write failed: ");
        return ret;
    }

    /* Attach the client to the export and take a reference on it. */
    QTAILQ_INSERT_TAIL(&client->exp->clients, client, next);
    blk_exp_ref(&client->exp->common);
    nbd_check_meta_export(client);

    return 0;
}
502
503 /* Send a single NBD_REP_INFO, with a buffer @buf of @length bytes.
504 * The buffer does NOT include the info type prefix.
505 * Return -errno on error, 0 if ready to send more. */
506 static int nbd_negotiate_send_info(NBDClient *client,
507 uint16_t info, uint32_t length, void *buf,
508 Error **errp)
509 {
510 int rc;
511
512 trace_nbd_negotiate_send_info(info, nbd_info_lookup(info), length);
513 rc = nbd_negotiate_send_rep_len(client, NBD_REP_INFO,
514 sizeof(info) + length, errp);
515 if (rc < 0) {
516 return rc;
517 }
518 info = cpu_to_be16(info);
519 if (nbd_write(client->ioc, &info, sizeof(info), errp) < 0) {
520 return -EIO;
521 }
522 if (nbd_write(client->ioc, buf, length, errp) < 0) {
523 return -EIO;
524 }
525 return 0;
526 }
527
528 /* nbd_reject_length: Handle any unexpected payload.
529 * @fatal requests that we quit talking to the client, even if we are able
530 * to successfully send an error reply.
531 * Return:
532 * -errno transmission error occurred or @fatal was requested, errp is set
533 * 0 error message successfully sent to client, errp is not set
534 */
535 static int nbd_reject_length(NBDClient *client, bool fatal, Error **errp)
536 {
537 int ret;
538
539 assert(client->optlen);
540 ret = nbd_opt_invalid(client, errp, "option '%s' has unexpected length",
541 nbd_opt_lookup(client->opt));
542 if (fatal && !ret) {
543 error_setg(errp, "option '%s' has unexpected length",
544 nbd_opt_lookup(client->opt));
545 return -EINVAL;
546 }
547 return ret;
548 }
549
/* Handle NBD_OPT_INFO and NBD_OPT_GO.
 * Return -errno on error, 0 if ready for next option, and 1 to move
 * into transmission phase. */
static int nbd_negotiate_handle_info(NBDClient *client, Error **errp)
{
    int rc;
    g_autofree char *name = NULL;
    NBDExport *exp;
    uint16_t requests;
    uint16_t request;
    uint32_t namelen;
    bool sendname = false;
    bool blocksize = false;
    uint32_t sizes[3];
    char buf[sizeof(uint64_t) + sizeof(uint16_t)];
    uint32_t check_align = 0;
    uint16_t myflags;

    /* Client sends:
        4 bytes: L, name length (can be 0)
        L bytes: export name
        2 bytes: N, number of requests (can be 0)
        N * 2 bytes: N requests
    */
    rc = nbd_opt_read_name(client, &name, &namelen, errp);
    if (rc <= 0) {
        return rc;
    }
    trace_nbd_negotiate_handle_export_name_request(name);

    rc = nbd_opt_read(client, &requests, sizeof(requests), errp);
    if (rc <= 0) {
        return rc;
    }
    requests = be16_to_cpu(requests);
    trace_nbd_negotiate_handle_info_requests(requests);
    while (requests--) {
        rc = nbd_opt_read(client, &request, sizeof(request), errp);
        if (rc <= 0) {
            return rc;
        }
        request = be16_to_cpu(request);
        trace_nbd_negotiate_handle_info_request(request,
                                                nbd_info_lookup(request));
        /* We care about NBD_INFO_NAME and NBD_INFO_BLOCK_SIZE;
         * everything else is either a request we don't know or
         * something we send regardless of request */
        switch (request) {
        case NBD_INFO_NAME:
            sendname = true;
            break;
        case NBD_INFO_BLOCK_SIZE:
            blocksize = true;
            break;
        }
    }
    /* Any leftover payload means the client's lengths were wrong. */
    if (client->optlen) {
        return nbd_reject_length(client, false, errp);
    }

    exp = nbd_export_find(name);
    if (!exp) {
        g_autofree char *sane_name = nbd_sanitize_name(name);

        return nbd_negotiate_send_rep_err(client, NBD_REP_ERR_UNKNOWN,
                                          errp, "export '%s' not present",
                                          sane_name);
    }

    /* Don't bother sending NBD_INFO_NAME unless client requested it */
    if (sendname) {
        rc = nbd_negotiate_send_info(client, NBD_INFO_NAME, namelen, name,
                                     errp);
        if (rc < 0) {
            return rc;
        }
    }

    /* Send NBD_INFO_DESCRIPTION only if available, regardless of
     * client request */
    if (exp->description) {
        size_t len = strlen(exp->description);

        assert(len <= NBD_MAX_STRING_SIZE);
        rc = nbd_negotiate_send_info(client, NBD_INFO_DESCRIPTION,
                                     len, exp->description, errp);
        if (rc < 0) {
            return rc;
        }
    }

    /* Send NBD_INFO_BLOCK_SIZE always, but tweak the minimum size
     * according to whether the client requested it, and according to
     * whether this is OPT_INFO or OPT_GO. */
    /* minimum - 1 for back-compat, or actual if client will obey it. */
    if (client->opt == NBD_OPT_INFO || blocksize) {
        check_align = sizes[0] = blk_get_request_alignment(exp->blk);
    } else {
        sizes[0] = 1;
    }
    assert(sizes[0] <= NBD_MAX_BUFFER_SIZE);
    /* preferred - Hard-code to 4096 for now.
     * TODO: is blk_bs(blk)->bl.opt_transfer appropriate? */
    sizes[1] = MAX(4096, sizes[0]);
    /* maximum - At most 32M, but smaller as appropriate. */
    sizes[2] = MIN(blk_get_max_transfer(exp->blk), NBD_MAX_BUFFER_SIZE);
    trace_nbd_negotiate_handle_info_block_size(sizes[0], sizes[1], sizes[2]);
    sizes[0] = cpu_to_be32(sizes[0]);
    sizes[1] = cpu_to_be32(sizes[1]);
    sizes[2] = cpu_to_be32(sizes[2]);
    rc = nbd_negotiate_send_info(client, NBD_INFO_BLOCK_SIZE,
                                 sizeof(sizes), sizes, errp);
    if (rc < 0) {
        return rc;
    }

    /* Send NBD_INFO_EXPORT always */
    myflags = exp->nbdflags;
    if (client->structured_reply) {
        myflags |= NBD_FLAG_SEND_DF;
    }
    trace_nbd_negotiate_new_style_size_flags(exp->size, myflags);
    stq_be_p(buf, exp->size);
    stw_be_p(buf + 8, myflags);
    rc = nbd_negotiate_send_info(client, NBD_INFO_EXPORT,
                                 sizeof(buf), buf, errp);
    if (rc < 0) {
        return rc;
    }

    /*
     * If the client is just asking for NBD_OPT_INFO, but forgot to
     * request block sizes in a situation that would impact
     * performance, then return an error. But for NBD_OPT_GO, we
     * tolerate all clients, regardless of alignments.
     */
    if (client->opt == NBD_OPT_INFO && !blocksize &&
        blk_get_request_alignment(exp->blk) > 1) {
        return nbd_negotiate_send_rep_err(client,
                                          NBD_REP_ERR_BLOCK_SIZE_REQD,
                                          errp,
                                          "request NBD_INFO_BLOCK_SIZE to "
                                          "use this export");
    }

    /* Final reply */
    rc = nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
    if (rc < 0) {
        return rc;
    }

    /* Only NBD_OPT_GO attaches the client to the export; NBD_OPT_INFO
     * leaves the connection in the negotiation phase. */
    if (client->opt == NBD_OPT_GO) {
        client->exp = exp;
        client->check_align = check_align;
        QTAILQ_INSERT_TAIL(&client->exp->clients, client, next);
        blk_exp_ref(&client->exp->common);
        nbd_check_meta_export(client);
        rc = 1;
    }
    return rc;
}
711
712
/* Handle NBD_OPT_STARTTLS. Return NULL to drop connection, or else the
 * new channel for all further (now-encrypted) communication. */
static QIOChannel *nbd_negotiate_handle_starttls(NBDClient *client,
                                                 Error **errp)
{
    QIOChannel *ioc;
    QIOChannelTLS *tioc;
    struct NBDTLSHandshakeData data = { 0 };

    assert(client->opt == NBD_OPT_STARTTLS);

    trace_nbd_negotiate_handle_starttls();
    ioc = client->ioc;

    /* Ack the option before starting the TLS handshake. */
    if (nbd_negotiate_send_rep(client, NBD_REP_ACK, errp) < 0) {
        return NULL;
    }

    tioc = qio_channel_tls_new_server(ioc,
                                      client->tlscreds,
                                      client->tlsauthz,
                                      errp);
    if (!tioc) {
        return NULL;
    }

    qio_channel_set_name(QIO_CHANNEL(tioc), "nbd-server-tls");
    trace_nbd_negotiate_handle_starttls_handshake();
    data.loop = g_main_loop_new(g_main_context_default(), FALSE);
    qio_channel_tls_handshake(tioc,
                              nbd_tls_handshake,
                              &data,
                              NULL,
                              NULL);

    /* The handshake may have completed synchronously; only spin the
     * main loop if the completion callback has not fired yet. */
    if (!data.complete) {
        g_main_loop_run(data.loop);
    }
    g_main_loop_unref(data.loop);
    if (data.error) {
        /* Drop the TLS wrapper; the caller drops the connection. */
        object_unref(OBJECT(tioc));
        error_propagate(errp, data.error);
        return NULL;
    }

    return QIO_CHANNEL(tioc);
}
760
761 /* nbd_negotiate_send_meta_context
762 *
763 * Send one chunk of reply to NBD_OPT_{LIST,SET}_META_CONTEXT
764 *
765 * For NBD_OPT_LIST_META_CONTEXT @context_id is ignored, 0 is used instead.
766 */
767 static int nbd_negotiate_send_meta_context(NBDClient *client,
768 const char *context,
769 uint32_t context_id,
770 Error **errp)
771 {
772 NBDOptionReplyMetaContext opt;
773 struct iovec iov[] = {
774 {.iov_base = &opt, .iov_len = sizeof(opt)},
775 {.iov_base = (void *)context, .iov_len = strlen(context)}
776 };
777
778 assert(iov[1].iov_len <= NBD_MAX_STRING_SIZE);
779 if (client->opt == NBD_OPT_LIST_META_CONTEXT) {
780 context_id = 0;
781 }
782
783 trace_nbd_negotiate_meta_query_reply(context, context_id);
784 set_be_option_rep(&opt.h, client->opt, NBD_REP_META_CONTEXT,
785 sizeof(opt) - sizeof(opt.h) + iov[1].iov_len);
786 stl_be_p(&opt.context_id, context_id);
787
788 return qio_channel_writev_all(client->ioc, iov, 2, errp) < 0 ? -EIO : 0;
789 }
790
791 /* Read strlen(@pattern) bytes, and set @match to true if they match @pattern.
792 * @match is never set to false.
793 *
794 * Return -errno on I/O error, 0 if option was completely handled by
795 * sending a reply about inconsistent lengths, or 1 on success.
796 *
797 * Note: return code = 1 doesn't mean that we've read exactly @pattern.
798 * It only means that there are no errors.
799 */
800 static int nbd_meta_pattern(NBDClient *client, const char *pattern, bool *match,
801 Error **errp)
802 {
803 int ret;
804 char *query;
805 size_t len = strlen(pattern);
806
807 assert(len);
808
809 query = g_malloc(len);
810 ret = nbd_opt_read(client, query, len, errp);
811 if (ret <= 0) {
812 g_free(query);
813 return ret;
814 }
815
816 if (strncmp(query, pattern, len) == 0) {
817 trace_nbd_negotiate_meta_query_parse(pattern);
818 *match = true;
819 } else {
820 trace_nbd_negotiate_meta_query_skip("pattern not matched");
821 }
822 g_free(query);
823
824 return 1;
825 }
826
827 /*
828 * Read @len bytes, and set @match to true if they match @pattern, or if @len
829 * is 0 and the client is performing _LIST_. @match is never set to false.
830 *
831 * Return -errno on I/O error, 0 if option was completely handled by
832 * sending a reply about inconsistent lengths, or 1 on success.
833 *
834 * Note: return code = 1 doesn't mean that we've read exactly @pattern.
835 * It only means that there are no errors.
836 */
837 static int nbd_meta_empty_or_pattern(NBDClient *client, const char *pattern,
838 uint32_t len, bool *match, Error **errp)
839 {
840 if (len == 0) {
841 if (client->opt == NBD_OPT_LIST_META_CONTEXT) {
842 *match = true;
843 }
844 trace_nbd_negotiate_meta_query_parse("empty");
845 return 1;
846 }
847
848 if (len != strlen(pattern)) {
849 trace_nbd_negotiate_meta_query_skip("different lengths");
850 return nbd_opt_skip(client, len, errp);
851 }
852
853 return nbd_meta_pattern(client, pattern, match, errp);
854 }
855
/* nbd_meta_base_query
 *
 * Handle queries to 'base' namespace. For now, only the base:allocation
 * context is available. 'len' is the amount of text remaining to be read from
 * the current name, after the 'base:' portion has been stripped.
 *
 * Return -errno on I/O error, 0 if option was completely handled by
 * sending a reply about inconsistent lengths, or 1 on success.
 */
static int nbd_meta_base_query(NBDClient *client, NBDExportMetaContexts *meta,
                               uint32_t len, Error **errp)
{
    return nbd_meta_empty_or_pattern(client, "allocation", len,
                                     &meta->base_allocation, errp);
}
871
872 /* nbd_meta_bitmap_query
873 *
874 * Handle query to 'qemu:' namespace.
875 * @len is the amount of text remaining to be read from the current name, after
876 * the 'qemu:' portion has been stripped.
877 *
878 * Return -errno on I/O error, 0 if option was completely handled by
879 * sending a reply about inconsistent lengths, or 1 on success. */
880 static int nbd_meta_qemu_query(NBDClient *client, NBDExportMetaContexts *meta,
881 uint32_t len, Error **errp)
882 {
883 bool dirty_bitmap = false;
884 size_t dirty_bitmap_len = strlen("dirty-bitmap:");
885 int ret;
886
887 if (!meta->exp->export_bitmap) {
888 trace_nbd_negotiate_meta_query_skip("no dirty-bitmap exported");
889 return nbd_opt_skip(client, len, errp);
890 }
891
892 if (len == 0) {
893 if (client->opt == NBD_OPT_LIST_META_CONTEXT) {
894 meta->bitmap = true;
895 }
896 trace_nbd_negotiate_meta_query_parse("empty");
897 return 1;
898 }
899
900 if (len < dirty_bitmap_len) {
901 trace_nbd_negotiate_meta_query_skip("not dirty-bitmap:");
902 return nbd_opt_skip(client, len, errp);
903 }
904
905 len -= dirty_bitmap_len;
906 ret = nbd_meta_pattern(client, "dirty-bitmap:", &dirty_bitmap, errp);
907 if (ret <= 0) {
908 return ret;
909 }
910 if (!dirty_bitmap) {
911 trace_nbd_negotiate_meta_query_skip("not dirty-bitmap:");
912 return nbd_opt_skip(client, len, errp);
913 }
914
915 trace_nbd_negotiate_meta_query_parse("dirty-bitmap:");
916
917 return nbd_meta_empty_or_pattern(
918 client, meta->exp->export_bitmap_context +
919 strlen("qemu:dirty_bitmap:"), len, &meta->bitmap, errp);
920 }
921
922 /* nbd_negotiate_meta_query
923 *
924 * Parse namespace name and call corresponding function to parse body of the
925 * query.
926 *
927 * The only supported namespaces are 'base' and 'qemu'.
928 *
929 * The function aims not wasting time and memory to read long unknown namespace
930 * names.
931 *
932 * Return -errno on I/O error, 0 if option was completely handled by
933 * sending a reply about inconsistent lengths, or 1 on success. */
934 static int nbd_negotiate_meta_query(NBDClient *client,
935 NBDExportMetaContexts *meta, Error **errp)
936 {
937 /*
938 * Both 'qemu' and 'base' namespaces have length = 5 including a
939 * colon. If another length namespace is later introduced, this
940 * should certainly be refactored.
941 */
942 int ret;
943 size_t ns_len = 5;
944 char ns[5];
945 uint32_t len;
946
947 ret = nbd_opt_read(client, &len, sizeof(len), errp);
948 if (ret <= 0) {
949 return ret;
950 }
951 len = cpu_to_be32(len);
952
953 if (len > NBD_MAX_STRING_SIZE) {
954 trace_nbd_negotiate_meta_query_skip("length too long");
955 return nbd_opt_skip(client, len, errp);
956 }
957 if (len < ns_len) {
958 trace_nbd_negotiate_meta_query_skip("length too short");
959 return nbd_opt_skip(client, len, errp);
960 }
961
962 len -= ns_len;
963 ret = nbd_opt_read(client, ns, ns_len, errp);
964 if (ret <= 0) {
965 return ret;
966 }
967
968 if (!strncmp(ns, "base:", ns_len)) {
969 trace_nbd_negotiate_meta_query_parse("base:");
970 return nbd_meta_base_query(client, meta, len, errp);
971 } else if (!strncmp(ns, "qemu:", ns_len)) {
972 trace_nbd_negotiate_meta_query_parse("qemu:");
973 return nbd_meta_qemu_query(client, meta, len, errp);
974 }
975
976 trace_nbd_negotiate_meta_query_skip("unknown namespace");
977 return nbd_opt_skip(client, len, errp);
978 }
979
/* nbd_negotiate_meta_queries
 * Handle NBD_OPT_LIST_META_CONTEXT and NBD_OPT_SET_META_CONTEXT
 *
 * Return -errno on I/O error, or 0 if option was completely handled. */
static int nbd_negotiate_meta_queries(NBDClient *client,
                                      NBDExportMetaContexts *meta, Error **errp)
{
    int ret;
    g_autofree char *export_name = NULL;
    NBDExportMetaContexts local_meta;
    uint32_t nb_queries;
    int i;

    /* Meta contexts are only usable with structured replies. */
    if (!client->structured_reply) {
        return nbd_opt_invalid(client, errp,
                               "request option '%s' when structured reply "
                               "is not negotiated",
                               nbd_opt_lookup(client->opt));
    }

    if (client->opt == NBD_OPT_LIST_META_CONTEXT) {
        /* Only change the caller's meta on SET. */
        meta = &local_meta;
    }

    memset(meta, 0, sizeof(*meta));

    ret = nbd_opt_read_name(client, &export_name, NULL, errp);
    if (ret <= 0) {
        return ret;
    }

    meta->exp = nbd_export_find(export_name);
    if (meta->exp == NULL) {
        g_autofree char *sane_name = nbd_sanitize_name(export_name);

        return nbd_opt_drop(client, NBD_REP_ERR_UNKNOWN, errp,
                            "export '%s' not present", sane_name);
    }

    ret = nbd_opt_read(client, &nb_queries, sizeof(nb_queries), errp);
    if (ret <= 0) {
        return ret;
    }
    /* Byte-swap from network order; cpu_to_be32 works here only because
     * a 32-bit swap is its own inverse (be32_to_cpu would be clearer). */
    nb_queries = cpu_to_be32(nb_queries);
    trace_nbd_negotiate_meta_context(nbd_opt_lookup(client->opt),
                                     export_name, nb_queries);

    if (client->opt == NBD_OPT_LIST_META_CONTEXT && !nb_queries) {
        /* enable all known contexts */
        meta->base_allocation = true;
        meta->bitmap = !!meta->exp->export_bitmap;
    } else {
        for (i = 0; i < nb_queries; ++i) {
            ret = nbd_negotiate_meta_query(client, meta, errp);
            if (ret <= 0) {
                return ret;
            }
        }
    }

    /* Reply with one NBD_REP_META_CONTEXT per matched context... */
    if (meta->base_allocation) {
        ret = nbd_negotiate_send_meta_context(client, "base:allocation",
                                              NBD_META_ID_BASE_ALLOCATION,
                                              errp);
        if (ret < 0) {
            return ret;
        }
    }

    if (meta->bitmap) {
        ret = nbd_negotiate_send_meta_context(client,
                                              meta->exp->export_bitmap_context,
                                              NBD_META_ID_DIRTY_BITMAP,
                                              errp);
        if (ret < 0) {
            return ret;
        }
    }

    /* ...then a final NBD_REP_ACK. */
    ret = nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
    if (ret == 0) {
        meta->valid = true;
    }

    return ret;
}
1067
/* nbd_negotiate_options
 * Process all NBD_OPT_* client option commands, during fixed newstyle
 * negotiation.
 * Return:
 * -errno on error, errp is set
 * 0 on successful negotiation, errp is not set
 * 1 if client sent NBD_OPT_ABORT, i.e. on valid disconnect,
 * errp is not set
 */
static int nbd_negotiate_options(NBDClient *client, Error **errp)
{
    uint32_t flags;
    bool fixedNewstyle = false;
    bool no_zeroes = false;

    /* Client sends:
       [ 0 .. 3] client flags

       Then we loop until NBD_OPT_EXPORT_NAME or NBD_OPT_GO:
       [ 0 .. 7] NBD_OPTS_MAGIC
       [ 8 .. 11] NBD option
       [12 .. 15] Data length
       ... Rest of request

       [ 0 .. 7] NBD_OPTS_MAGIC
       [ 8 .. 11] Second NBD option
       [12 .. 15] Data length
       ... Rest of request
    */

    if (nbd_read32(client->ioc, &flags, "flags", errp) < 0) {
        return -EIO;
    }
    trace_nbd_negotiate_options_flags(flags);
    if (flags & NBD_FLAG_C_FIXED_NEWSTYLE) {
        fixedNewstyle = true;
        flags &= ~NBD_FLAG_C_FIXED_NEWSTYLE;
    }
    if (flags & NBD_FLAG_C_NO_ZEROES) {
        no_zeroes = true;
        flags &= ~NBD_FLAG_C_NO_ZEROES;
    }
    /* Any remaining bit is a flag we do not understand; reject the client. */
    if (flags != 0) {
        error_setg(errp, "Unknown client flags 0x%" PRIx32 " received", flags);
        return -EINVAL;
    }

    while (1) {
        int ret;
        uint32_t option, length;
        uint64_t magic;

        if (nbd_read64(client->ioc, &magic, "opts magic", errp) < 0) {
            return -EINVAL;
        }
        trace_nbd_negotiate_options_check_magic(magic);
        if (magic != NBD_OPTS_MAGIC) {
            error_setg(errp, "Bad magic received");
            return -EINVAL;
        }

        if (nbd_read32(client->ioc, &option, "option", errp) < 0) {
            return -EINVAL;
        }
        client->opt = option;

        if (nbd_read32(client->ioc, &length, "option length", errp) < 0) {
            return -EINVAL;
        }
        /* Track the unconsumed payload length so that option helpers
         * (nbd_opt_drop() and friends) know how much is left to skip. */
        assert(!client->optlen);
        client->optlen = length;

        if (length > NBD_MAX_BUFFER_SIZE) {
            error_setg(errp, "len (%" PRIu32" ) is larger than max len (%u)",
                       length, NBD_MAX_BUFFER_SIZE);
            return -EINVAL;
        }

        trace_nbd_negotiate_options_check_option(option,
                                                 nbd_opt_lookup(option));
        /* If TLS credentials are configured but the channel is still the
         * plain socket, only NBD_OPT_STARTTLS is acceptable. */
        if (client->tlscreds &&
            client->ioc == (QIOChannel *)client->sioc) {
            QIOChannel *tioc;
            if (!fixedNewstyle) {
                error_setg(errp, "Unsupported option 0x%" PRIx32, option);
                return -EINVAL;
            }
            switch (option) {
            case NBD_OPT_STARTTLS:
                if (length) {
                    /* Unconditionally drop the connection if the client
                     * can't start a TLS negotiation correctly */
                    return nbd_reject_length(client, true, errp);
                }
                tioc = nbd_negotiate_handle_starttls(client, errp);
                if (!tioc) {
                    return -EIO;
                }
                ret = 0;
                /* Replace the plain channel with the TLS-wrapped one. */
                object_unref(OBJECT(client->ioc));
                client->ioc = QIO_CHANNEL(tioc);
                break;

            case NBD_OPT_EXPORT_NAME:
                /* No way to return an error to client, so drop connection */
                error_setg(errp, "Option 0x%x not permitted before TLS",
                           option);
                return -EINVAL;

            default:
                /* Let the client keep trying, unless they asked to
                 * quit. Always try to give an error back to the
                 * client; but when replying to OPT_ABORT, be aware
                 * that the client may hang up before receiving the
                 * error, in which case we are fine ignoring the
                 * resulting EPIPE. */
                ret = nbd_opt_drop(client, NBD_REP_ERR_TLS_REQD,
                                   option == NBD_OPT_ABORT ? NULL : errp,
                                   "Option 0x%" PRIx32
                                   " not permitted before TLS", option);
                if (option == NBD_OPT_ABORT) {
                    return 1;
                }
                break;
            }
        } else if (fixedNewstyle) {
            switch (option) {
            case NBD_OPT_LIST:
                if (length) {
                    ret = nbd_reject_length(client, false, errp);
                } else {
                    ret = nbd_negotiate_handle_list(client, errp);
                }
                break;

            case NBD_OPT_ABORT:
                /* NBD spec says we must try to reply before
                 * disconnecting, but that we must also tolerate
                 * guests that don't wait for our reply. */
                nbd_negotiate_send_rep(client, NBD_REP_ACK, NULL);
                return 1;

            case NBD_OPT_EXPORT_NAME:
                return nbd_negotiate_handle_export_name(client, no_zeroes,
                                                        errp);

            case NBD_OPT_INFO:
            case NBD_OPT_GO:
                ret = nbd_negotiate_handle_info(client, errp);
                if (ret == 1) {
                    /* ret == 1 means the transmission phase begins; only
                     * NBD_OPT_GO may end negotiation this way. */
                    assert(option == NBD_OPT_GO);
                    return 0;
                }
                break;

            case NBD_OPT_STARTTLS:
                if (length) {
                    ret = nbd_reject_length(client, false, errp);
                } else if (client->tlscreds) {
                    ret = nbd_negotiate_send_rep_err(client,
                                                     NBD_REP_ERR_INVALID, errp,
                                                     "TLS already enabled");
                } else {
                    ret = nbd_negotiate_send_rep_err(client,
                                                     NBD_REP_ERR_POLICY, errp,
                                                     "TLS not configured");
                }
                break;

            case NBD_OPT_STRUCTURED_REPLY:
                if (length) {
                    ret = nbd_reject_length(client, false, errp);
                } else if (client->structured_reply) {
                    ret = nbd_negotiate_send_rep_err(
                        client, NBD_REP_ERR_INVALID, errp,
                        "structured reply already negotiated");
                } else {
                    ret = nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
                    client->structured_reply = true;
                }
                break;

            case NBD_OPT_LIST_META_CONTEXT:
            case NBD_OPT_SET_META_CONTEXT:
                ret = nbd_negotiate_meta_queries(client, &client->export_meta,
                                                 errp);
                break;

            default:
                ret = nbd_opt_drop(client, NBD_REP_ERR_UNSUP, errp,
                                   "Unsupported option %" PRIu32 " (%s)",
                                   option, nbd_opt_lookup(option));
                break;
            }
        } else {
            /*
             * If broken new-style we should drop the connection
             * for anything except NBD_OPT_EXPORT_NAME
             */
            switch (option) {
            case NBD_OPT_EXPORT_NAME:
                return nbd_negotiate_handle_export_name(client, no_zeroes,
                                                        errp);

            default:
                error_setg(errp, "Unsupported option %" PRIu32 " (%s)",
                           option, nbd_opt_lookup(option));
                return -EINVAL;
            }
        }
        if (ret < 0) {
            return ret;
        }
    }
}
1283
/* nbd_negotiate
 * Send the fixed-newstyle greeting and run the option phase with the
 * client, then attach the channel to the negotiated export's AioContext.
 * Return:
 * -errno on error, errp is set
 * 0 on successful negotiation, errp is not set
 * 1 if client sent NBD_OPT_ABORT, i.e. on valid disconnect,
 * errp is not set
 */
static coroutine_fn int nbd_negotiate(NBDClient *client, Error **errp)
{
    ERRP_GUARD();
    char buf[NBD_OLDSTYLE_NEGOTIATE_SIZE] = "";
    int ret;

    /* Old style negotiation header, no room for options
       [ 0 .. 7] passwd ("NBDMAGIC")
       [ 8 .. 15] magic (NBD_CLIENT_MAGIC)
       [16 .. 23] size
       [24 .. 27] export flags (zero-extended)
       [28 .. 151] reserved (0)

       New style negotiation header, client can send options
       [ 0 .. 7] passwd ("NBDMAGIC")
       [ 8 .. 15] magic (NBD_OPTS_MAGIC)
       [16 .. 17] server flags (0)
       ....options sent, ending in NBD_OPT_EXPORT_NAME or NBD_OPT_GO....
    */

    /* Non-blocking I/O: this coroutine yields instead of blocking. */
    qio_channel_set_blocking(client->ioc, false, NULL);

    trace_nbd_negotiate_begin();
    memcpy(buf, "NBDMAGIC", 8);

    /* Only the first 18 bytes (new-style header) are sent. */
    stq_be_p(buf + 8, NBD_OPTS_MAGIC);
    stw_be_p(buf + 16, NBD_FLAG_FIXED_NEWSTYLE | NBD_FLAG_NO_ZEROES);

    if (nbd_write(client->ioc, buf, 18, errp) < 0) {
        error_prepend(errp, "write failed: ");
        return -EINVAL;
    }
    ret = nbd_negotiate_options(client, errp);
    if (ret != 0) {
        if (ret < 0) {
            error_prepend(errp, "option negotiation failed: ");
        }
        return ret;
    }

    /* Attach the channel to the same AioContext as the export */
    if (client->exp && client->exp->common.ctx) {
        qio_channel_attach_aio_context(client->ioc, client->exp->common.ctx);
    }

    /* All option payloads must have been consumed by now. */
    assert(!client->optlen);
    trace_nbd_negotiate_success();

    return 0;
}
1341
/*
 * Read one transmission-phase request header from @ioc into @request.
 * Returns 0 on success, negative errno on failure (errp set).
 */
static int nbd_receive_request(QIOChannel *ioc, NBDRequest *request,
                               Error **errp)
{
    uint8_t buf[NBD_REQUEST_SIZE];
    uint32_t magic;
    int ret;

    ret = nbd_read(ioc, buf, sizeof(buf), "request", errp);
    if (ret < 0) {
        return ret;
    }

    /* Request
       [ 0 .. 3] magic (NBD_REQUEST_MAGIC)
       [ 4 .. 5] flags (NBD_CMD_FLAG_FUA, ...)
       [ 6 .. 7] type (NBD_CMD_READ, ...)
       [ 8 .. 15] handle
       [16 .. 23] from
       [24 .. 27] len
    */

    /* Decode all fields before validating magic, so the trace point below
     * can show the full (possibly bogus) request. */
    magic = ldl_be_p(buf);
    request->flags = lduw_be_p(buf + 4);
    request->type = lduw_be_p(buf + 6);
    request->handle = ldq_be_p(buf + 8);
    request->from = ldq_be_p(buf + 16);
    request->len = ldl_be_p(buf + 24);

    trace_nbd_receive_request(magic, request->flags, request->type,
                              request->from, request->len);

    if (magic != NBD_REQUEST_MAGIC) {
        error_setg(errp, "invalid magic (got 0x%" PRIx32 ")", magic);
        return -EINVAL;
    }
    return 0;
}
1379
1380 #define MAX_NBD_REQUESTS 16
1381
1382 void nbd_client_get(NBDClient *client)
1383 {
1384 client->refcount++;
1385 }
1386
/*
 * Drop a reference to @client; on the last reference, tear down the
 * channel objects and detach from the export's client list.
 */
void nbd_client_put(NBDClient *client)
{
    if (--client->refcount == 0) {
        /* The last reference should be dropped by client->close,
         * which is called by client_close.
         */
        assert(client->closing);

        qio_channel_detach_aio_context(client->ioc);
        object_unref(OBJECT(client->sioc));
        object_unref(OBJECT(client->ioc));
        if (client->tlscreds) {
            object_unref(OBJECT(client->tlscreds));
        }
        g_free(client->tlsauthz);
        /* client->exp is only set once negotiation picked an export. */
        if (client->exp) {
            QTAILQ_REMOVE(&client->exp->clients, client, next);
            blk_exp_unref(&client->exp->common);
        }
        g_free(client);
    }
}
1409
1410 static void client_close(NBDClient *client, bool negotiated)
1411 {
1412 if (client->closing) {
1413 return;
1414 }
1415
1416 client->closing = true;
1417
1418 /* Force requests to finish. They will drop their own references,
1419 * then we'll close the socket and free the NBDClient.
1420 */
1421 qio_channel_shutdown(client->ioc, QIO_CHANNEL_SHUTDOWN_BOTH,
1422 NULL);
1423
1424 /* Also tell the client, so that they release their reference. */
1425 if (client->close_fn) {
1426 client->close_fn(client, negotiated);
1427 }
1428 }
1429
1430 static NBDRequestData *nbd_request_get(NBDClient *client)
1431 {
1432 NBDRequestData *req;
1433
1434 assert(client->nb_requests <= MAX_NBD_REQUESTS - 1);
1435 client->nb_requests++;
1436
1437 req = g_new0(NBDRequestData, 1);
1438 nbd_client_get(client);
1439 req->client = client;
1440 return req;
1441 }
1442
/*
 * Release @req: free its payload buffer, make room for (and kick off)
 * the next request, and drop the client reference taken at get time.
 */
static void nbd_request_put(NBDRequestData *req)
{
    NBDClient *client = req->client;

    if (req->data) {
        qemu_vfree(req->data);
    }
    g_free(req);

    client->nb_requests--;
    /* A request slot just freed up; resume receiving if we were throttled. */
    nbd_client_receive_next_request(client);

    nbd_client_put(client);
}
1457
/*
 * AioContext attach notifier for the export's BlockBackend: move every
 * connected client's channel and coroutines over to the new context.
 */
static void blk_aio_attached(AioContext *ctx, void *opaque)
{
    NBDExport *exp = opaque;
    NBDClient *client;

    trace_nbd_blk_aio_attached(exp->name, ctx);

    exp->common.ctx = ctx;

    QTAILQ_FOREACH(client, &exp->clients, next) {
        qio_channel_attach_aio_context(client->ioc, ctx);
        /* Reschedule any coroutines parked in the old context. */
        if (client->recv_coroutine) {
            aio_co_schedule(ctx, client->recv_coroutine);
        }
        if (client->send_coroutine) {
            aio_co_schedule(ctx, client->send_coroutine);
        }
    }
}
1477
1478 static void blk_aio_detach(void *opaque)
1479 {
1480 NBDExport *exp = opaque;
1481 NBDClient *client;
1482
1483 trace_nbd_blk_aio_detach(exp->name, exp->common.ctx);
1484
1485 QTAILQ_FOREACH(client, &exp->clients, next) {
1486 qio_channel_detach_aio_context(client->ioc);
1487 }
1488
1489 exp->common.ctx = NULL;
1490 }
1491
1492 static void nbd_eject_notifier(Notifier *n, void *data)
1493 {
1494 NBDExport *exp = container_of(n, NBDExport, eject_notifier);
1495
1496 blk_exp_request_shutdown(&exp->common);
1497 }
1498
/*
 * Arrange for @exp (which must be an NBD export) to shut down when the
 * medium is removed from @blk.  Takes a reference to @blk that is
 * released in nbd_export_delete().  May only be called once per export.
 */
void nbd_export_set_on_eject_blk(BlockExport *exp, BlockBackend *blk)
{
    NBDExport *nbd_exp = container_of(exp, NBDExport, common);
    assert(exp->drv == &blk_exp_nbd);
    assert(nbd_exp->eject_notifier_blk == NULL);

    blk_ref(blk);
    nbd_exp->eject_notifier_blk = blk;
    nbd_exp->eject_notifier.notify = nbd_eject_notifier;
    blk_add_remove_bs_notifier(blk, &nbd_exp->eject_notifier);
}
1510
1511 int nbd_export_new(BlockExport *blk_exp, BlockDriverState *bs,
1512 const char *name, const char *desc,
1513 const char *bitmap, bool readonly, bool shared,
1514 bool writethrough, Error **errp)
1515 {
1516 NBDExport *exp = container_of(blk_exp, NBDExport, common);
1517 AioContext *ctx;
1518 BlockBackend *blk;
1519 int64_t size;
1520 uint64_t perm;
1521 int ret;
1522
1523 size = bdrv_getlength(bs);
1524 if (size < 0) {
1525 error_setg_errno(errp, -size,
1526 "Failed to determine the NBD export's length");
1527 return size;
1528 }
1529
1530 ctx = bdrv_get_aio_context(bs);
1531 blk_exp->ctx = ctx;
1532
1533 /*
1534 * NBD exports are used for non-shared storage migration. Make sure
1535 * that BDRV_O_INACTIVE is cleared and the image is ready for write
1536 * access since the export could be available before migration handover.
1537 * ctx was acquired in the caller.
1538 */
1539 assert(name && strlen(name) <= NBD_MAX_STRING_SIZE);
1540
1541 bdrv_invalidate_cache(bs, NULL);
1542
1543 /* Don't allow resize while the NBD server is running, otherwise we don't
1544 * care what happens with the node. */
1545 perm = BLK_PERM_CONSISTENT_READ;
1546 if (!readonly) {
1547 perm |= BLK_PERM_WRITE;
1548 }
1549 blk = blk_new(ctx, perm,
1550 BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED |
1551 BLK_PERM_WRITE | BLK_PERM_GRAPH_MOD);
1552 ret = blk_insert_bs(blk, bs, errp);
1553 if (ret < 0) {
1554 goto fail;
1555 }
1556 blk_set_enable_write_cache(blk, !writethrough);
1557 blk_set_allow_aio_context_change(blk, true);
1558
1559 QTAILQ_INIT(&exp->clients);
1560 exp->blk = blk;
1561 exp->name = g_strdup(name);
1562 assert(!desc || strlen(desc) <= NBD_MAX_STRING_SIZE);
1563 exp->description = g_strdup(desc);
1564 exp->nbdflags = (NBD_FLAG_HAS_FLAGS | NBD_FLAG_SEND_FLUSH |
1565 NBD_FLAG_SEND_FUA | NBD_FLAG_SEND_CACHE);
1566 if (readonly) {
1567 exp->nbdflags |= NBD_FLAG_READ_ONLY;
1568 if (shared) {
1569 exp->nbdflags |= NBD_FLAG_CAN_MULTI_CONN;
1570 }
1571 } else {
1572 exp->nbdflags |= (NBD_FLAG_SEND_TRIM | NBD_FLAG_SEND_WRITE_ZEROES |
1573 NBD_FLAG_SEND_FAST_ZERO);
1574 }
1575 exp->size = QEMU_ALIGN_DOWN(size, BDRV_SECTOR_SIZE);
1576
1577 if (bitmap) {
1578 BdrvDirtyBitmap *bm = NULL;
1579
1580 while (bs) {
1581 bm = bdrv_find_dirty_bitmap(bs, bitmap);
1582 if (bm != NULL) {
1583 break;
1584 }
1585
1586 bs = bdrv_filter_or_cow_bs(bs);
1587 }
1588
1589 if (bm == NULL) {
1590 ret = -ENOENT;
1591 error_setg(errp, "Bitmap '%s' is not found", bitmap);
1592 goto fail;
1593 }
1594
1595 if (bdrv_dirty_bitmap_check(bm, BDRV_BITMAP_ALLOW_RO, errp)) {
1596 ret = -EINVAL;
1597 goto fail;
1598 }
1599
1600 if (readonly && bdrv_is_writable(bs) &&
1601 bdrv_dirty_bitmap_enabled(bm)) {
1602 ret = -EINVAL;
1603 error_setg(errp,
1604 "Enabled bitmap '%s' incompatible with readonly export",
1605 bitmap);
1606 goto fail;
1607 }
1608
1609 bdrv_dirty_bitmap_set_busy(bm, true);
1610 exp->export_bitmap = bm;
1611 assert(strlen(bitmap) <= BDRV_BITMAP_MAX_NAME_SIZE);
1612 exp->export_bitmap_context = g_strdup_printf("qemu:dirty-bitmap:%s",
1613 bitmap);
1614 assert(strlen(exp->export_bitmap_context) < NBD_MAX_STRING_SIZE);
1615 }
1616
1617 blk_add_aio_context_notifier(blk, blk_aio_attached, blk_aio_detach, exp);
1618
1619 blk_exp_ref(&exp->common);
1620 QTAILQ_INSERT_TAIL(&exports, exp, next);
1621
1622 return 0;
1623
1624 fail:
1625 blk_unref(blk);
1626 g_free(exp->name);
1627 g_free(exp->description);
1628 return ret;
1629 }
1630
1631 NBDExport *nbd_export_find(const char *name)
1632 {
1633 NBDExport *exp;
1634 QTAILQ_FOREACH(exp, &exports, next) {
1635 if (strcmp(name, exp->name) == 0) {
1636 return exp;
1637 }
1638 }
1639
1640 return NULL;
1641 }
1642
/* Return the AioContext the export currently runs in. */
AioContext *
nbd_export_aio_context(NBDExport *exp)
{
    return exp->common.ctx;
}
1648
/*
 * BlockExportDriver.request_shutdown implementation: disconnect all
 * clients and withdraw the export from the exports list.  Safe to call
 * more than once.
 */
static void nbd_export_request_shutdown(BlockExport *blk_exp)
{
    NBDExport *exp = container_of(blk_exp, NBDExport, common);
    NBDClient *client, *next;

    /* Keep the export alive while we walk its client list below. */
    blk_exp_ref(&exp->common);
    /*
     * TODO: Should we expand QMP NbdServerRemoveNode enum to allow a
     * close mode that stops advertising the export to new clients but
     * still permits existing clients to run to completion? Because of
     * that possibility, nbd_export_close() can be called more than
     * once on an export.
     */
    QTAILQ_FOREACH_SAFE(client, &exp->clients, next, next) {
        client_close(client, true);
    }
    /* A non-NULL name means the export is still listed; unlist it and drop
     * the reference that nbd_export_new() took for the exports list. */
    if (exp->name) {
        blk_exp_unref(&exp->common);
        g_free(exp->name);
        exp->name = NULL;
        QTAILQ_REMOVE(&exports, exp, next);
    }
    /* Drop the reference taken at the top of this function. */
    blk_exp_unref(&exp->common);
}
1673
1674 void nbd_export_remove(NBDExport *exp, NbdServerRemoveMode mode, Error **errp)
1675 {
1676 ERRP_GUARD();
1677 if (mode == NBD_SERVER_REMOVE_MODE_HARD || QTAILQ_EMPTY(&exp->clients)) {
1678 nbd_export_request_shutdown(&exp->common);
1679 return;
1680 }
1681
1682 assert(mode == NBD_SERVER_REMOVE_MODE_SAFE);
1683
1684 error_setg(errp, "export '%s' still in use", exp->name);
1685 error_append_hint(errp, "Use mode='hard' to force client disconnect\n");
1686 }
1687
/*
 * BlockExportDriver.delete implementation: free all resources owned by
 * the export.  Must only run after shutdown (no name, no clients left).
 */
static void nbd_export_delete(BlockExport *blk_exp)
{
    NBDExport *exp = container_of(blk_exp, NBDExport, common);

    assert(exp->name == NULL);
    assert(QTAILQ_EMPTY(&exp->clients));

    g_free(exp->description);
    exp->description = NULL;

    if (exp->blk) {
        if (exp->eject_notifier_blk) {
            notifier_remove(&exp->eject_notifier);
            blk_unref(exp->eject_notifier_blk);
        }
        blk_remove_aio_context_notifier(exp->blk, blk_aio_attached,
                                        blk_aio_detach, exp);
        blk_unref(exp->blk);
        exp->blk = NULL;
    }

    /* Release the dirty bitmap claimed busy in nbd_export_new(). */
    if (exp->export_bitmap) {
        bdrv_dirty_bitmap_set_busy(exp->export_bitmap, false);
        g_free(exp->export_bitmap_context);
    }
}
1714
/* Block export driver entry for NBD, registered with the export core. */
const BlockExportDriver blk_exp_nbd = {
    .type               = BLOCK_EXPORT_TYPE_NBD,
    .instance_size      = sizeof(NBDExport),
    .create             = nbd_export_create,
    .delete             = nbd_export_delete,
    .request_shutdown   = nbd_export_request_shutdown,
};
1722
/*
 * Write @niov iovec entries to the client, serialized by send_lock so
 * replies from concurrent request coroutines do not interleave.
 * Returns 0 on success, -EIO on any write failure (errp set).
 */
static int coroutine_fn nbd_co_send_iov(NBDClient *client, struct iovec *iov,
                                        unsigned niov, Error **errp)
{
    int ret;

    g_assert(qemu_in_coroutine());
    qemu_co_mutex_lock(&client->send_lock);
    /* Published so blk_aio_attached() can reschedule us on context switch. */
    client->send_coroutine = qemu_coroutine_self();

    ret = qio_channel_writev_all(client->ioc, iov, niov, errp) < 0 ? -EIO : 0;

    client->send_coroutine = NULL;
    qemu_co_mutex_unlock(&client->send_lock);

    return ret;
}
1739
1740 static inline void set_be_simple_reply(NBDSimpleReply *reply, uint64_t error,
1741 uint64_t handle)
1742 {
1743 stl_be_p(&reply->magic, NBD_SIMPLE_REPLY_MAGIC);
1744 stl_be_p(&reply->error, error);
1745 stq_be_p(&reply->handle, handle);
1746 }
1747
1748 static int nbd_co_send_simple_reply(NBDClient *client,
1749 uint64_t handle,
1750 uint32_t error,
1751 void *data,
1752 size_t len,
1753 Error **errp)
1754 {
1755 NBDSimpleReply reply;
1756 int nbd_err = system_errno_to_nbd_errno(error);
1757 struct iovec iov[] = {
1758 {.iov_base = &reply, .iov_len = sizeof(reply)},
1759 {.iov_base = data, .iov_len = len}
1760 };
1761
1762 trace_nbd_co_send_simple_reply(handle, nbd_err, nbd_err_lookup(nbd_err),
1763 len);
1764 set_be_simple_reply(&reply, nbd_err, handle);
1765
1766 return nbd_co_send_iov(client, iov, len ? 2 : 1, errp);
1767 }
1768
1769 static inline void set_be_chunk(NBDStructuredReplyChunk *chunk, uint16_t flags,
1770 uint16_t type, uint64_t handle, uint32_t length)
1771 {
1772 stl_be_p(&chunk->magic, NBD_STRUCTURED_REPLY_MAGIC);
1773 stw_be_p(&chunk->flags, flags);
1774 stw_be_p(&chunk->type, type);
1775 stq_be_p(&chunk->handle, handle);
1776 stl_be_p(&chunk->length, length);
1777 }
1778
1779 static int coroutine_fn nbd_co_send_structured_done(NBDClient *client,
1780 uint64_t handle,
1781 Error **errp)
1782 {
1783 NBDStructuredReplyChunk chunk;
1784 struct iovec iov[] = {
1785 {.iov_base = &chunk, .iov_len = sizeof(chunk)},
1786 };
1787
1788 trace_nbd_co_send_structured_done(handle);
1789 set_be_chunk(&chunk, NBD_REPLY_FLAG_DONE, NBD_REPLY_TYPE_NONE, handle, 0);
1790
1791 return nbd_co_send_iov(client, iov, 1, errp);
1792 }
1793
/*
 * Send one NBD_REPLY_TYPE_OFFSET_DATA chunk carrying @size bytes of
 * payload at export offset @offset.  @final sets NBD_REPLY_FLAG_DONE.
 */
static int coroutine_fn nbd_co_send_structured_read(NBDClient *client,
                                                    uint64_t handle,
                                                    uint64_t offset,
                                                    void *data,
                                                    size_t size,
                                                    bool final,
                                                    Error **errp)
{
    NBDStructuredReadData chunk;
    struct iovec iov[] = {
        {.iov_base = &chunk, .iov_len = sizeof(chunk)},
        {.iov_base = data, .iov_len = size}
    };

    /* The spec forbids zero-length OFFSET_DATA chunks. */
    assert(size);
    trace_nbd_co_send_structured_read(handle, offset, data, size);
    set_be_chunk(&chunk.h, final ? NBD_REPLY_FLAG_DONE : 0,
                 NBD_REPLY_TYPE_OFFSET_DATA, handle,
                 sizeof(chunk) - sizeof(chunk.h) + size);
    stq_be_p(&chunk.offset, offset);

    return nbd_co_send_iov(client, iov, 2, errp);
}
1817
/*
 * Send a final NBD_REPLY_TYPE_ERROR chunk for @handle.  @error is a
 * system errno (must be non-zero); @msg is an optional human-readable
 * string appended to the chunk.
 */
static int coroutine_fn nbd_co_send_structured_error(NBDClient *client,
                                                     uint64_t handle,
                                                     uint32_t error,
                                                     const char *msg,
                                                     Error **errp)
{
    NBDStructuredError chunk;
    int nbd_err = system_errno_to_nbd_errno(error);
    struct iovec iov[] = {
        {.iov_base = &chunk, .iov_len = sizeof(chunk)},
        {.iov_base = (char *)msg, .iov_len = msg ? strlen(msg) : 0},
    };

    /* An error chunk with NBD_SUCCESS would be a protocol violation. */
    assert(nbd_err);
    trace_nbd_co_send_structured_error(handle, nbd_err,
                                       nbd_err_lookup(nbd_err), msg ? msg : "");
    set_be_chunk(&chunk.h, NBD_REPLY_FLAG_DONE, NBD_REPLY_TYPE_ERROR, handle,
                 sizeof(chunk) - sizeof(chunk.h) + iov[1].iov_len);
    stl_be_p(&chunk.error, nbd_err);
    stw_be_p(&chunk.message_length, iov[1].iov_len);

    /* Skip the second iovec entirely when there is no message. */
    return nbd_co_send_iov(client, iov, 1 + !!iov[1].iov_len, errp);
}
1841
/* Do a sparse read and send the structured reply to the client.
 * Returns -errno if sending fails. bdrv_block_status_above() failure is
 * reported to the client, at which point this function succeeds.
 */
static int coroutine_fn nbd_co_send_sparse_read(NBDClient *client,
                                                uint64_t handle,
                                                uint64_t offset,
                                                uint8_t *data,
                                                size_t size,
                                                Error **errp)
{
    int ret = 0;
    NBDExport *exp = client->exp;
    size_t progress = 0;

    /* Walk the requested range, emitting one chunk per block-status run:
     * HOLE chunks for zero runs, OFFSET_DATA chunks for data runs. */
    while (progress < size) {
        int64_t pnum;
        int status = bdrv_block_status_above(blk_bs(exp->blk), NULL,
                                             offset + progress,
                                             size - progress, &pnum, NULL,
                                             NULL);
        bool final;

        if (status < 0) {
            char *msg = g_strdup_printf("unable to check for holes: %s",
                                        strerror(-status));

            ret = nbd_co_send_structured_error(client, handle, -status, msg,
                                               errp);
            g_free(msg);
            return ret;
        }
        assert(pnum && pnum <= size - progress);
        /* The last chunk of the reply carries NBD_REPLY_FLAG_DONE. */
        final = progress + pnum == size;
        if (status & BDRV_BLOCK_ZERO) {
            NBDStructuredReadHole chunk;
            struct iovec iov[] = {
                {.iov_base = &chunk, .iov_len = sizeof(chunk)},
            };

            trace_nbd_co_send_structured_read_hole(handle, offset + progress,
                                                   pnum);
            set_be_chunk(&chunk.h, final ? NBD_REPLY_FLAG_DONE : 0,
                         NBD_REPLY_TYPE_OFFSET_HOLE,
                         handle, sizeof(chunk) - sizeof(chunk.h));
            stq_be_p(&chunk.offset, offset + progress);
            stl_be_p(&chunk.length, pnum);
            ret = nbd_co_send_iov(client, iov, 1, errp);
        } else {
            ret = blk_pread(exp->blk, offset + progress, data + progress, pnum);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "reading from file failed");
                break;
            }
            ret = nbd_co_send_structured_read(client, handle, offset + progress,
                                              data + progress, pnum, final,
                                              errp);
        }

        if (ret < 0) {
            break;
        }
        progress += pnum;
    }
    return ret;
}
1908
/* Growable-to-a-cap list of NBD_CMD_BLOCK_STATUS extents. */
typedef struct NBDExtentArray {
    NBDExtent *extents;          /* array of nb_alloc entries */
    unsigned int nb_alloc;       /* capacity (fixed at creation) */
    unsigned int count;          /* entries in use */
    uint64_t total_length;       /* sum of all extent lengths */
    bool can_add;                /* false once full or converted */
    bool converted_to_be;        /* extents already in big-endian */
} NBDExtentArray;
1917
1918 static NBDExtentArray *nbd_extent_array_new(unsigned int nb_alloc)
1919 {
1920 NBDExtentArray *ea = g_new0(NBDExtentArray, 1);
1921
1922 ea->nb_alloc = nb_alloc;
1923 ea->extents = g_new(NBDExtent, nb_alloc);
1924 ea->can_add = true;
1925
1926 return ea;
1927 }
1928
/* Free an extent array and its backing storage. */
static void nbd_extent_array_free(NBDExtentArray *ea)
{
    g_free(ea->extents);
    g_free(ea);
}
G_DEFINE_AUTOPTR_CLEANUP_FUNC(NBDExtentArray, nbd_extent_array_free);
1935
1936 /* Further modifications of the array after conversion are abandoned */
1937 static void nbd_extent_array_convert_to_be(NBDExtentArray *ea)
1938 {
1939 int i;
1940
1941 assert(!ea->converted_to_be);
1942 ea->can_add = false;
1943 ea->converted_to_be = true;
1944
1945 for (i = 0; i < ea->count; i++) {
1946 ea->extents[i].flags = cpu_to_be32(ea->extents[i].flags);
1947 ea->extents[i].length = cpu_to_be32(ea->extents[i].length);
1948 }
1949 }
1950
1951 /*
1952 * Add extent to NBDExtentArray. If extent can't be added (no available space),
1953 * return -1.
1954 * For safety, when returning -1 for the first time, .can_add is set to false,
1955 * further call to nbd_extent_array_add() will crash.
1956 * (to avoid the situation, when after failing to add an extent (returned -1),
1957 * user miss this failure and add another extent, which is successfully added
1958 * (array is full, but new extent may be squashed into the last one), then we
1959 * have invalid array with skipped extent)
1960 */
1961 static int nbd_extent_array_add(NBDExtentArray *ea,
1962 uint32_t length, uint32_t flags)
1963 {
1964 assert(ea->can_add);
1965
1966 if (!length) {
1967 return 0;
1968 }
1969
1970 /* Extend previous extent if flags are the same */
1971 if (ea->count > 0 && flags == ea->extents[ea->count - 1].flags) {
1972 uint64_t sum = (uint64_t)length + ea->extents[ea->count - 1].length;
1973
1974 if (sum <= UINT32_MAX) {
1975 ea->extents[ea->count - 1].length = sum;
1976 ea->total_length += length;
1977 return 0;
1978 }
1979 }
1980
1981 if (ea->count >= ea->nb_alloc) {
1982 ea->can_add = false;
1983 return -1;
1984 }
1985
1986 ea->total_length += length;
1987 ea->extents[ea->count] = (NBDExtent) {.length = length, .flags = flags};
1988 ea->count++;
1989
1990 return 0;
1991 }
1992
/*
 * Fill @ea with HOLE/ZERO extents covering [offset, offset + bytes) of
 * @bs.  Stops early (still returning 0) once the array is full; returns
 * negative errno only if querying block status fails.
 */
static int blockstatus_to_extents(BlockDriverState *bs, uint64_t offset,
                                  uint64_t bytes, NBDExtentArray *ea)
{
    while (bytes) {
        uint32_t flags;
        int64_t num;
        int ret = bdrv_block_status_above(bs, NULL, offset, bytes, &num,
                                          NULL, NULL);

        if (ret < 0) {
            return ret;
        }

        /* Map block-layer status bits onto the NBD wire flags. */
        flags = (ret & BDRV_BLOCK_ALLOCATED ? 0 : NBD_STATE_HOLE) |
                (ret & BDRV_BLOCK_ZERO      ? NBD_STATE_ZERO : 0);

        if (nbd_extent_array_add(ea, num, flags) < 0) {
            /* Array full: a truncated answer is still valid for NBD. */
            return 0;
        }

        offset += num;
        bytes -= num;
    }

    return 0;
}
2019
2020 /*
2021 * nbd_co_send_extents
2022 *
2023 * @ea is converted to BE by the function
2024 * @last controls whether NBD_REPLY_FLAG_DONE is sent.
2025 */
2026 static int nbd_co_send_extents(NBDClient *client, uint64_t handle,
2027 NBDExtentArray *ea,
2028 bool last, uint32_t context_id, Error **errp)
2029 {
2030 NBDStructuredMeta chunk;
2031 struct iovec iov[] = {
2032 {.iov_base = &chunk, .iov_len = sizeof(chunk)},
2033 {.iov_base = ea->extents, .iov_len = ea->count * sizeof(ea->extents[0])}
2034 };
2035
2036 nbd_extent_array_convert_to_be(ea);
2037
2038 trace_nbd_co_send_extents(handle, ea->count, context_id, ea->total_length,
2039 last);
2040 set_be_chunk(&chunk.h, last ? NBD_REPLY_FLAG_DONE : 0,
2041 NBD_REPLY_TYPE_BLOCK_STATUS,
2042 handle, sizeof(chunk) - sizeof(chunk.h) + iov[1].iov_len);
2043 stl_be_p(&chunk.context_id, context_id);
2044
2045 return nbd_co_send_iov(client, iov, 2, errp);
2046 }
2047
2048 /* Get block status from the exported device and send it to the client */
2049 static int nbd_co_send_block_status(NBDClient *client, uint64_t handle,
2050 BlockDriverState *bs, uint64_t offset,
2051 uint32_t length, bool dont_fragment,
2052 bool last, uint32_t context_id,
2053 Error **errp)
2054 {
2055 int ret;
2056 unsigned int nb_extents = dont_fragment ? 1 : NBD_MAX_BLOCK_STATUS_EXTENTS;
2057 g_autoptr(NBDExtentArray) ea = nbd_extent_array_new(nb_extents);
2058
2059 ret = blockstatus_to_extents(bs, offset, length, ea);
2060 if (ret < 0) {
2061 return nbd_co_send_structured_error(
2062 client, handle, -ret, "can't get block status", errp);
2063 }
2064
2065 return nbd_co_send_extents(client, handle, ea, last, context_id, errp);
2066 }
2067
/* Populate @ea from a dirty bitmap. */
static void bitmap_to_extents(BdrvDirtyBitmap *bitmap,
                              uint64_t offset, uint64_t length,
                              NBDExtentArray *es)
{
    int64_t start, dirty_start, dirty_count;
    int64_t end = offset + length;
    bool full = false;

    bdrv_dirty_bitmap_lock(bitmap);

    /* Emit alternating clean/dirty extent pairs for each dirty area found;
     * stop once the array has no room left. */
    for (start = offset;
         bdrv_dirty_bitmap_next_dirty_area(bitmap, start, end, INT32_MAX,
                                           &dirty_start, &dirty_count);
         start = dirty_start + dirty_count)
    {
        if ((nbd_extent_array_add(es, dirty_start - start, 0) < 0) ||
            (nbd_extent_array_add(es, dirty_count, NBD_STATE_DIRTY) < 0))
        {
            full = true;
            break;
        }
    }

    if (!full) {
        /* last non dirty extent */
        nbd_extent_array_add(es, end - start, 0);
    }

    bdrv_dirty_bitmap_unlock(bitmap);
}
2099
2100 static int nbd_co_send_bitmap(NBDClient *client, uint64_t handle,
2101 BdrvDirtyBitmap *bitmap, uint64_t offset,
2102 uint32_t length, bool dont_fragment, bool last,
2103 uint32_t context_id, Error **errp)
2104 {
2105 unsigned int nb_extents = dont_fragment ? 1 : NBD_MAX_BLOCK_STATUS_EXTENTS;
2106 g_autoptr(NBDExtentArray) ea = nbd_extent_array_new(nb_extents);
2107
2108 bitmap_to_extents(bitmap, offset, length, ea);
2109
2110 return nbd_co_send_extents(client, handle, ea, last, context_id, errp);
2111 }
2112
/* nbd_co_receive_request
 * Collect a client request. Return 0 if request looks valid, -EIO to drop
 * connection right away, and any other negative value to report an error to
 * the client (although the caller may still need to disconnect after reporting
 * the error).
 */
static int nbd_co_receive_request(NBDRequestData *req, NBDRequest *request,
                                  Error **errp)
{
    NBDClient *client = req->client;
    int valid_flags;

    g_assert(qemu_in_coroutine());
    assert(client->recv_coroutine == qemu_coroutine_self());
    if (nbd_receive_request(client->ioc, request, errp) < 0) {
        return -EIO;
    }

    trace_nbd_co_receive_request_decode_type(request->handle, request->type,
                                             nbd_cmd_lookup(request->type));

    if (request->type != NBD_CMD_WRITE) {
        /* No payload, we are ready to read the next request. */
        req->complete = true;
    }

    if (request->type == NBD_CMD_DISC) {
        /* Special case: we're going to disconnect without a reply,
         * whether or not flags, from, or len are bogus */
        return -EIO;
    }

    /* Commands that (may) carry a data buffer: bound the size and, except
     * for CACHE (which needs no buffer), allocate it. */
    if (request->type == NBD_CMD_READ || request->type == NBD_CMD_WRITE ||
        request->type == NBD_CMD_CACHE)
    {
        if (request->len > NBD_MAX_BUFFER_SIZE) {
            error_setg(errp, "len (%" PRIu32" ) is larger than max len (%u)",
                       request->len, NBD_MAX_BUFFER_SIZE);
            return -EINVAL;
        }

        if (request->type != NBD_CMD_CACHE) {
            req->data = blk_try_blockalign(client->exp->blk, request->len);
            if (req->data == NULL) {
                error_setg(errp, "No memory");
                return -ENOMEM;
            }
        }
    }

    if (request->type == NBD_CMD_WRITE) {
        if (nbd_read(client->ioc, req->data, request->len, "CMD_WRITE data",
                     errp) < 0)
        {
            return -EIO;
        }
        req->complete = true;

        trace_nbd_co_receive_request_payload_received(request->handle,
                                                      request->len);
    }

    /* Sanity checks. */
    if (client->exp->nbdflags & NBD_FLAG_READ_ONLY &&
        (request->type == NBD_CMD_WRITE ||
         request->type == NBD_CMD_WRITE_ZEROES ||
         request->type == NBD_CMD_TRIM)) {
        error_setg(errp, "Export is read-only");
        return -EROFS;
    }
    /* Written to avoid overflow in 'from + len'. */
    if (request->from > client->exp->size ||
        request->len > client->exp->size - request->from) {
        error_setg(errp, "operation past EOF; From: %" PRIu64 ", Len: %" PRIu32
                   ", Size: %" PRIu64, request->from, request->len,
                   client->exp->size);
        return (request->type == NBD_CMD_WRITE ||
                request->type == NBD_CMD_WRITE_ZEROES) ? -ENOSPC : -EINVAL;
    }
    if (client->check_align && !QEMU_IS_ALIGNED(request->from | request->len,
                                                client->check_align)) {
        /*
         * The block layer gracefully handles unaligned requests, but
         * it's still worth tracing client non-compliance
         */
        trace_nbd_co_receive_align_compliance(nbd_cmd_lookup(request->type),
                                              request->from,
                                              request->len,
                                              client->check_align);
    }
    /* Reject any flag not valid for the given command. */
    valid_flags = NBD_CMD_FLAG_FUA;
    if (request->type == NBD_CMD_READ && client->structured_reply) {
        valid_flags |= NBD_CMD_FLAG_DF;
    } else if (request->type == NBD_CMD_WRITE_ZEROES) {
        valid_flags |= NBD_CMD_FLAG_NO_HOLE | NBD_CMD_FLAG_FAST_ZERO;
    } else if (request->type == NBD_CMD_BLOCK_STATUS) {
        valid_flags |= NBD_CMD_FLAG_REQ_ONE;
    }
    if (request->flags & ~valid_flags) {
        error_setg(errp, "unsupported flags for command %s (got 0x%x)",
                   nbd_cmd_lookup(request->type), request->flags);
        return -EINVAL;
    }

    return 0;
}
2218
2219 /* Send simple reply without a payload, or a structured error
2220 * @error_msg is ignored if @ret >= 0
2221 * Returns 0 if connection is still live, -errno on failure to talk to client
2222 */
2223 static coroutine_fn int nbd_send_generic_reply(NBDClient *client,
2224 uint64_t handle,
2225 int ret,
2226 const char *error_msg,
2227 Error **errp)
2228 {
2229 if (client->structured_reply && ret < 0) {
2230 return nbd_co_send_structured_error(client, handle, -ret, error_msg,
2231 errp);
2232 } else {
2233 return nbd_co_send_simple_reply(client, handle, ret < 0 ? -ret : 0,
2234 NULL, 0, errp);
2235 }
2236 }
2237
2238 /* Handle NBD_CMD_READ request.
2239 * Return -errno if sending fails. Other errors are reported directly to the
2240 * client as an error reply. */
2241 static coroutine_fn int nbd_do_cmd_read(NBDClient *client, NBDRequest *request,
2242 uint8_t *data, Error **errp)
2243 {
2244 int ret;
2245 NBDExport *exp = client->exp;
2246
2247 assert(request->type == NBD_CMD_READ);
2248
2249 /* XXX: NBD Protocol only documents use of FUA with WRITE */
2250 if (request->flags & NBD_CMD_FLAG_FUA) {
2251 ret = blk_co_flush(exp->blk);
2252 if (ret < 0) {
2253 return nbd_send_generic_reply(client, request->handle, ret,
2254 "flush failed", errp);
2255 }
2256 }
2257
2258 if (client->structured_reply && !(request->flags & NBD_CMD_FLAG_DF) &&
2259 request->len)
2260 {
2261 return nbd_co_send_sparse_read(client, request->handle, request->from,
2262 data, request->len, errp);
2263 }
2264
2265 ret = blk_pread(exp->blk, request->from, data, request->len);
2266 if (ret < 0) {
2267 return nbd_send_generic_reply(client, request->handle, ret,
2268 "reading from file failed", errp);
2269 }
2270
2271 if (client->structured_reply) {
2272 if (request->len) {
2273 return nbd_co_send_structured_read(client, request->handle,
2274 request->from, data,
2275 request->len, true, errp);
2276 } else {
2277 return nbd_co_send_structured_done(client, request->handle, errp);
2278 }
2279 } else {
2280 return nbd_co_send_simple_reply(client, request->handle, 0,
2281 data, request->len, errp);
2282 }
2283 }
2284
2285 /*
2286 * nbd_do_cmd_cache
2287 *
2288 * Handle NBD_CMD_CACHE request.
2289 * Return -errno if sending fails. Other errors are reported directly to the
2290 * client as an error reply.
2291 */
2292 static coroutine_fn int nbd_do_cmd_cache(NBDClient *client, NBDRequest *request,
2293 Error **errp)
2294 {
2295 int ret;
2296 NBDExport *exp = client->exp;
2297
2298 assert(request->type == NBD_CMD_CACHE);
2299
2300 ret = blk_co_preadv(exp->blk, request->from, request->len,
2301 NULL, BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH);
2302
2303 return nbd_send_generic_reply(client, request->handle, ret,
2304 "caching data failed", errp);
2305 }
2306
/* Handle NBD request.
 * Return -errno if sending fails. Other errors are reported directly to the
 * client as an error reply. */
static coroutine_fn int nbd_handle_request(NBDClient *client,
                                           NBDRequest *request,
                                           uint8_t *data, Error **errp)
{
    int ret;
    int flags;   /* accumulated BDRV_REQ_* flags for write-family commands */
    NBDExport *exp = client->exp;
    char *msg;

    switch (request->type) {
    case NBD_CMD_CACHE:
        return nbd_do_cmd_cache(client, request, errp);

    case NBD_CMD_READ:
        return nbd_do_cmd_read(client, request, data, errp);

    case NBD_CMD_WRITE:
        flags = 0;
        /* Client-requested FUA maps directly onto the block layer's flag */
        if (request->flags & NBD_CMD_FLAG_FUA) {
            flags |= BDRV_REQ_FUA;
        }
        ret = blk_pwrite(exp->blk, request->from, data, request->len, flags);
        return nbd_send_generic_reply(client, request->handle, ret,
                                      "writing to file failed", errp);

    case NBD_CMD_WRITE_ZEROES:
        flags = 0;
        if (request->flags & NBD_CMD_FLAG_FUA) {
            flags |= BDRV_REQ_FUA;
        }
        /* Unless the client insists on allocation, allow punching holes */
        if (!(request->flags & NBD_CMD_FLAG_NO_HOLE)) {
            flags |= BDRV_REQ_MAY_UNMAP;
        }
        /* Fail rather than fall back to slow writes when asked for speed */
        if (request->flags & NBD_CMD_FLAG_FAST_ZERO) {
            flags |= BDRV_REQ_NO_FALLBACK;
        }
        ret = 0;
        /* FIXME simplify this when blk_pwrite_zeroes switches to 64-bit */
        while (ret >= 0 && request->len) {
            /* Chunk to the block layer's 32-bit request limit, keeping
             * each chunk aligned to the export's advertised alignment */
            int align = client->check_align ?: 1;
            int len = MIN(request->len, QEMU_ALIGN_DOWN(BDRV_REQUEST_MAX_BYTES,
                                                        align));
            ret = blk_pwrite_zeroes(exp->blk, request->from, len, flags);
            request->len -= len;
            request->from += len;
        }
        return nbd_send_generic_reply(client, request->handle, ret,
                                      "writing to file failed", errp);

    case NBD_CMD_DISC:
        /* unreachable, thanks to special case in nbd_co_receive_request() */
        abort();

    case NBD_CMD_FLUSH:
        ret = blk_co_flush(exp->blk);
        return nbd_send_generic_reply(client, request->handle, ret,
                                      "flush failed", errp);

    case NBD_CMD_TRIM:
        ret = 0;
        /* FIXME simplify this when blk_co_pdiscard switches to 64-bit */
        while (ret >= 0 && request->len) {
            /* Same 32-bit chunking as NBD_CMD_WRITE_ZEROES above */
            int align = client->check_align ?: 1;
            int len = MIN(request->len, QEMU_ALIGN_DOWN(BDRV_REQUEST_MAX_BYTES,
                                                        align));
            ret = blk_co_pdiscard(exp->blk, request->from, len);
            request->len -= len;
            request->from += len;
        }
        /* FUA on TRIM: flush after the discards have all succeeded */
        if (ret >= 0 && request->flags & NBD_CMD_FLAG_FUA) {
            ret = blk_co_flush(exp->blk);
        }
        return nbd_send_generic_reply(client, request->handle, ret,
                                      "discard failed", errp);

    case NBD_CMD_BLOCK_STATUS:
        if (!request->len) {
            return nbd_send_generic_reply(client, request->handle, -EINVAL,
                                          "need non-zero length", errp);
        }
        /* Only answer for contexts the client negotiated during handshake */
        if (client->export_meta.valid &&
            (client->export_meta.base_allocation ||
             client->export_meta.bitmap))
        {
            bool dont_fragment = request->flags & NBD_CMD_FLAG_REQ_ONE;

            if (client->export_meta.base_allocation) {
                /* The last context sent must be flagged as final, hence
                 * "!client->export_meta.bitmap" for the last argument */
                ret = nbd_co_send_block_status(client, request->handle,
                                               blk_bs(exp->blk), request->from,
                                               request->len, dont_fragment,
                                               !client->export_meta.bitmap,
                                               NBD_META_ID_BASE_ALLOCATION,
                                               errp);
                if (ret < 0) {
                    return ret;
                }
            }

            if (client->export_meta.bitmap) {
                ret = nbd_co_send_bitmap(client, request->handle,
                                         client->exp->export_bitmap,
                                         request->from, request->len,
                                         dont_fragment,
                                         true, NBD_META_ID_DIRTY_BITMAP, errp);
                if (ret < 0) {
                    return ret;
                }
            }

            return 0;
        } else {
            return nbd_send_generic_reply(client, request->handle, -EINVAL,
                                          "CMD_BLOCK_STATUS not negotiated",
                                          errp);
        }

    default:
        msg = g_strdup_printf("invalid request type (%" PRIu32 ") received",
                              request->type);
        ret = nbd_send_generic_reply(client, request->handle, -EINVAL, msg,
                                     errp);
        g_free(msg);
        return ret;
    }
}
2435
/* Owns a reference to the NBDClient passed as opaque. */
static coroutine_fn void nbd_trip(void *opaque)
{
    NBDClient *client = opaque;
    NBDRequestData *req;
    NBDRequest request = { 0 }; /* GCC thinks it can be used uninitialized */
    int ret;
    Error *local_err = NULL;

    trace_nbd_trip();
    if (client->closing) {
        /* Drop the coroutine's reference and bail out immediately */
        nbd_client_put(client);
        return;
    }

    req = nbd_request_get(client);
    ret = nbd_co_receive_request(req, &request, &local_err);
    client->recv_coroutine = NULL;

    if (client->closing) {
        /*
         * The client may be closed when we are blocked in
         * nbd_co_receive_request()
         */
        goto done;
    }

    /* Allow another request to be received concurrently with handling
     * this one */
    nbd_client_receive_next_request(client);
    if (ret == -EIO) {
        goto disconnect;
    }

    if (ret < 0) {
        /* It wasn't -EIO, so, according to nbd_co_receive_request()
         * semantics, we should return the error to the client. */
        Error *export_err = local_err;

        local_err = NULL;
        ret = nbd_send_generic_reply(client, request.handle, -EINVAL,
                                     error_get_pretty(export_err), &local_err);
        error_free(export_err);
    } else {
        ret = nbd_handle_request(client, &request, req->data, &local_err);
    }
    if (ret < 0) {
        /* ret < 0 here means we failed to talk to the client at all */
        error_prepend(&local_err, "Failed to send reply: ");
        goto disconnect;
    }

    /* We must disconnect after NBD_CMD_WRITE if we did not
     * read the payload.
     */
    if (!req->complete) {
        error_setg(&local_err, "Request handling failed in intermediate state");
        goto disconnect;
    }

done:
    nbd_request_put(req);
    nbd_client_put(client);
    return;

disconnect:
    if (local_err) {
        error_reportf_err(local_err, "Disconnect client, due to: ");
    }
    nbd_request_put(req);
    client_close(client, true);
    nbd_client_put(client);
}
2506
2507 static void nbd_client_receive_next_request(NBDClient *client)
2508 {
2509 if (!client->recv_coroutine && client->nb_requests < MAX_NBD_REQUESTS) {
2510 nbd_client_get(client);
2511 client->recv_coroutine = qemu_coroutine_create(nbd_trip, client);
2512 aio_co_schedule(client->exp->common.ctx, client->recv_coroutine);
2513 }
2514 }
2515
2516 static coroutine_fn void nbd_co_client_start(void *opaque)
2517 {
2518 NBDClient *client = opaque;
2519 Error *local_err = NULL;
2520
2521 qemu_co_mutex_init(&client->send_lock);
2522
2523 if (nbd_negotiate(client, &local_err)) {
2524 if (local_err) {
2525 error_report_err(local_err);
2526 }
2527 client_close(client, false);
2528 return;
2529 }
2530
2531 nbd_client_receive_next_request(client);
2532 }
2533
2534 /*
2535 * Create a new client listener using the given channel @sioc.
2536 * Begin servicing it in a coroutine. When the connection closes, call
2537 * @close_fn with an indication of whether the client completed negotiation.
2538 */
2539 void nbd_client_new(QIOChannelSocket *sioc,
2540 QCryptoTLSCreds *tlscreds,
2541 const char *tlsauthz,
2542 void (*close_fn)(NBDClient *, bool))
2543 {
2544 NBDClient *client;
2545 Coroutine *co;
2546
2547 client = g_new0(NBDClient, 1);
2548 client->refcount = 1;
2549 client->tlscreds = tlscreds;
2550 if (tlscreds) {
2551 object_ref(OBJECT(client->tlscreds));
2552 }
2553 client->tlsauthz = g_strdup(tlsauthz);
2554 client->sioc = sioc;
2555 object_ref(OBJECT(client->sioc));
2556 client->ioc = QIO_CHANNEL(sioc);
2557 object_ref(OBJECT(client->ioc));
2558 client->close_fn = close_fn;
2559
2560 co = qemu_coroutine_create(nbd_co_client_start, client);
2561 qemu_coroutine_enter(co);
2562 }