Merge remote-tracking branch 'remotes/pmaydell/tags/pull-target-arm-20210624' into...
[qemu.git] / block.c
1 /*
2 * QEMU System Emulator block driver
3 *
4 * Copyright (c) 2003 Fabrice Bellard
5 * Copyright (c) 2020 Virtuozzo International GmbH.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a copy
8 * of this software and associated documentation files (the "Software"), to deal
9 * in the Software without restriction, including without limitation the rights
10 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 * copies of the Software, and to permit persons to whom the Software is
12 * furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included in
15 * all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 * THE SOFTWARE.
24 */
25
26 #include "qemu/osdep.h"
27 #include "block/trace.h"
28 #include "block/block_int.h"
29 #include "block/blockjob.h"
30 #include "block/fuse.h"
31 #include "block/nbd.h"
32 #include "block/qdict.h"
33 #include "qemu/error-report.h"
34 #include "block/module_block.h"
35 #include "qemu/main-loop.h"
36 #include "qemu/module.h"
37 #include "qapi/error.h"
38 #include "qapi/qmp/qdict.h"
39 #include "qapi/qmp/qjson.h"
40 #include "qapi/qmp/qnull.h"
41 #include "qapi/qmp/qstring.h"
42 #include "qapi/qobject-output-visitor.h"
43 #include "qapi/qapi-visit-block-core.h"
44 #include "sysemu/block-backend.h"
45 #include "qemu/notify.h"
46 #include "qemu/option.h"
47 #include "qemu/coroutine.h"
48 #include "block/qapi.h"
49 #include "qemu/timer.h"
50 #include "qemu/cutils.h"
51 #include "qemu/id.h"
52 #include "block/coroutines.h"
53
54 #ifdef CONFIG_BSD
55 #include <sys/ioctl.h>
56 #include <sys/queue.h>
57 #ifndef __DragonFly__
58 #include <sys/disk.h>
59 #endif
60 #endif
61
62 #ifdef _WIN32
63 #include <windows.h>
64 #endif
65
66 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
67
68 static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
69 QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);
70
71 static QTAILQ_HEAD(, BlockDriverState) all_bdrv_states =
72 QTAILQ_HEAD_INITIALIZER(all_bdrv_states);
73
74 static QLIST_HEAD(, BlockDriver) bdrv_drivers =
75 QLIST_HEAD_INITIALIZER(bdrv_drivers);
76
77 static BlockDriverState *bdrv_open_inherit(const char *filename,
78 const char *reference,
79 QDict *options, int flags,
80 BlockDriverState *parent,
81 const BdrvChildClass *child_class,
82 BdrvChildRole child_role,
83 Error **errp);
84
85 static void bdrv_replace_child_noperm(BdrvChild *child,
86 BlockDriverState *new_bs);
87 static int bdrv_attach_child_noperm(BlockDriverState *parent_bs,
88 BlockDriverState *child_bs,
89 const char *child_name,
90 const BdrvChildClass *child_class,
91 BdrvChildRole child_role,
92 BdrvChild **child,
93 Transaction *tran,
94 Error **errp);
95 static void bdrv_remove_filter_or_cow_child(BlockDriverState *bs,
96 Transaction *tran);
97
98 static int bdrv_reopen_prepare(BDRVReopenState *reopen_state,
99 BlockReopenQueue *queue,
100 Transaction *set_backings_tran, Error **errp);
101 static void bdrv_reopen_commit(BDRVReopenState *reopen_state);
102 static void bdrv_reopen_abort(BDRVReopenState *reopen_state);
103
104 /* If non-zero, use only whitelisted block drivers */
105 static int use_bdrv_whitelist;
106
107 #ifdef _WIN32
/*
 * Return 1 if @filename starts with an ASCII letter immediately followed by
 * ':' (for example "c:"), and 0 otherwise.
 */
static int is_windows_drive_prefix(const char *filename)
{
    const char first = filename[0];
    const int is_lower = first >= 'a' && first <= 'z';
    const int is_upper = first >= 'A' && first <= 'Z';

    if (!is_lower && !is_upper) {
        /* Not a letter: do not look at the second character at all */
        return 0;
    }
    return filename[1] == ':';
}
114
115 int is_windows_drive(const char *filename)
116 {
117 if (is_windows_drive_prefix(filename) &&
118 filename[2] == '\0')
119 return 1;
120 if (strstart(filename, "\\\\.\\", NULL) ||
121 strstart(filename, "//./", NULL))
122 return 1;
123 return 0;
124 }
125 #endif
126
127 size_t bdrv_opt_mem_align(BlockDriverState *bs)
128 {
129 if (!bs || !bs->drv) {
130 /* page size or 4k (hdd sector size) should be on the safe side */
131 return MAX(4096, qemu_real_host_page_size);
132 }
133
134 return bs->bl.opt_mem_alignment;
135 }
136
137 size_t bdrv_min_mem_align(BlockDriverState *bs)
138 {
139 if (!bs || !bs->drv) {
140 /* page size or 4k (hdd sector size) should be on the safe side */
141 return MAX(4096, qemu_real_host_page_size);
142 }
143
144 return bs->bl.min_mem_alignment;
145 }
146
/*
 * Check whether @path starts with "<protocol>:", i.e. whether a ':' appears
 * before any path separator.
 *
 * Returns 1 if so, 0 otherwise.  On Windows, drive specifications are never
 * treated as protocols and '\' also counts as a path separator.
 */
int path_has_protocol(const char *path)
{
#ifdef _WIN32
    const char *delimiters = ":/\\";
#else
    const char *delimiters = ":/";
#endif
    size_t prefix_len;

#ifdef _WIN32
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 0;
    }
#endif

    /* Length of the leading run that contains no delimiter at all */
    prefix_len = strcspn(path, delimiters);

    return path[prefix_len] == ':';
}
164
/*
 * Return 1 if @path is absolute, 0 otherwise.
 *
 * A path is absolute if it starts with '/'.  On Windows, a leading '\' and
 * anything accepted by is_windows_drive() or is_windows_drive_prefix() are
 * absolute as well.
 */
int path_is_absolute(const char *path)
{
    const char first = *path;

#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (is_windows_drive(path) || is_windows_drive_prefix(path)) {
        return 1;
    }
    if (first == '\\') {
        return 1;
    }
#endif

    return first == '/';
}
177
/*
 * Build the path of @filename relative to @base_path.
 *
 * If @filename is absolute, a duplicate of it is returned.  Otherwise the
 * result is the leading part of @base_path -- up to and including its last
 * directory separator, or up to and including the "<protocol>:" prefix if
 * that ends later -- followed by @filename.  URLs are supported.
 *
 * The returned string is newly allocated with GLib allocators.
 */
char *path_combine(const char *base_path, const char *filename)
{
    const char *keep_end = base_path;   /* end of the part of base_path kept */
    const char *last_sep;
    const char *dir_end;
    char *combined;
    int prefix_len;

    if (path_is_absolute(filename)) {
        return g_strdup(filename);
    }

    /* Never cut into a "<protocol>:" prefix */
    if (path_has_protocol(base_path)) {
        const char *colon = strchr(base_path, ':');

        if (colon) {
            keep_end = colon + 1;
        }
    }

    last_sep = strrchr(base_path, '/');
#ifdef _WIN32
    {
        const char *last_backslash = strrchr(base_path, '\\');

        if (!last_sep || last_backslash > last_sep) {
            last_sep = last_backslash;
        }
    }
#endif
    dir_end = last_sep ? last_sep + 1 : base_path;

    /* Keep whichever of the two prefixes reaches further */
    if (dir_end > keep_end) {
        keep_end = dir_end;
    }
    prefix_len = keep_end - base_path;

    combined = g_malloc(prefix_len + strlen(filename) + 1);
    memcpy(combined, base_path, prefix_len);
    strcpy(combined + prefix_len, filename);

    return combined;
}
226
227 /*
228 * Helper function for bdrv_parse_filename() implementations to remove optional
229 * protocol prefixes (especially "file:") from a filename and for putting the
230 * stripped filename into the options QDict if there is such a prefix.
231 */
232 void bdrv_parse_filename_strip_prefix(const char *filename, const char *prefix,
233 QDict *options)
234 {
235 if (strstart(filename, prefix, &filename)) {
236 /* Stripping the explicit protocol prefix may result in a protocol
237 * prefix being (wrongly) detected (if the filename contains a colon) */
238 if (path_has_protocol(filename)) {
239 GString *fat_filename;
240
241 /* This means there is some colon before the first slash; therefore,
242 * this cannot be an absolute path */
243 assert(!path_is_absolute(filename));
244
245 /* And we can thus fix the protocol detection issue by prefixing it
246 * by "./" */
247 fat_filename = g_string_new("./");
248 g_string_append(fat_filename, filename);
249
250 assert(!path_has_protocol(fat_filename->str));
251
252 qdict_put(options, "filename",
253 qstring_from_gstring(fat_filename));
254 } else {
255 /* If no protocol prefix was detected, we can use the shortened
256 * filename as-is */
257 qdict_put_str(options, "filename", filename);
258 }
259 }
260 }
261
262
263 /* Returns whether the image file is opened as read-only. Note that this can
264 * return false and writing to the image file is still not possible because the
265 * image is inactivated. */
266 bool bdrv_is_read_only(BlockDriverState *bs)
267 {
268 return bs->read_only;
269 }
270
271 int bdrv_can_set_read_only(BlockDriverState *bs, bool read_only,
272 bool ignore_allow_rdw, Error **errp)
273 {
274 /* Do not set read_only if copy_on_read is enabled */
275 if (bs->copy_on_read && read_only) {
276 error_setg(errp, "Can't set node '%s' to r/o with copy-on-read enabled",
277 bdrv_get_device_or_node_name(bs));
278 return -EINVAL;
279 }
280
281 /* Do not clear read_only if it is prohibited */
282 if (!read_only && !(bs->open_flags & BDRV_O_ALLOW_RDWR) &&
283 !ignore_allow_rdw)
284 {
285 error_setg(errp, "Node '%s' is read only",
286 bdrv_get_device_or_node_name(bs));
287 return -EPERM;
288 }
289
290 return 0;
291 }
292
293 /*
294 * Called by a driver that can only provide a read-only image.
295 *
296 * Returns 0 if the node is already read-only or it could switch the node to
297 * read-only because BDRV_O_AUTO_RDONLY is set.
298 *
299 * Returns -EACCES if the node is read-write and BDRV_O_AUTO_RDONLY is not set
300 * or bdrv_can_set_read_only() forbids making the node read-only. If @errmsg
301 * is not NULL, it is used as the error message for the Error object.
302 */
303 int bdrv_apply_auto_read_only(BlockDriverState *bs, const char *errmsg,
304 Error **errp)
305 {
306 int ret = 0;
307
308 if (!(bs->open_flags & BDRV_O_RDWR)) {
309 return 0;
310 }
311 if (!(bs->open_flags & BDRV_O_AUTO_RDONLY)) {
312 goto fail;
313 }
314
315 ret = bdrv_can_set_read_only(bs, true, false, NULL);
316 if (ret < 0) {
317 goto fail;
318 }
319
320 bs->read_only = true;
321 bs->open_flags &= ~BDRV_O_RDWR;
322
323 return 0;
324
325 fail:
326 error_setg(errp, "%s", errmsg ?: "Image is read-only");
327 return -EACCES;
328 }
329
330 /*
331 * If @backing is empty, this function returns NULL without setting
332 * @errp. In all other cases, NULL will only be returned with @errp
333 * set.
334 *
335 * Therefore, a return value of NULL without @errp set means that
336 * there is no backing file; if @errp is set, there is one but its
337 * absolute filename cannot be generated.
338 */
339 char *bdrv_get_full_backing_filename_from_filename(const char *backed,
340 const char *backing,
341 Error **errp)
342 {
343 if (backing[0] == '\0') {
344 return NULL;
345 } else if (path_has_protocol(backing) || path_is_absolute(backing)) {
346 return g_strdup(backing);
347 } else if (backed[0] == '\0' || strstart(backed, "json:", NULL)) {
348 error_setg(errp, "Cannot use relative backing file names for '%s'",
349 backed);
350 return NULL;
351 } else {
352 return path_combine(backed, backing);
353 }
354 }
355
356 /*
357 * If @filename is empty or NULL, this function returns NULL without
358 * setting @errp. In all other cases, NULL will only be returned with
359 * @errp set.
360 */
361 static char *bdrv_make_absolute_filename(BlockDriverState *relative_to,
362 const char *filename, Error **errp)
363 {
364 char *dir, *full_name;
365
366 if (!filename || filename[0] == '\0') {
367 return NULL;
368 } else if (path_has_protocol(filename) || path_is_absolute(filename)) {
369 return g_strdup(filename);
370 }
371
372 dir = bdrv_dirname(relative_to, errp);
373 if (!dir) {
374 return NULL;
375 }
376
377 full_name = g_strconcat(dir, filename, NULL);
378 g_free(dir);
379 return full_name;
380 }
381
382 char *bdrv_get_full_backing_filename(BlockDriverState *bs, Error **errp)
383 {
384 return bdrv_make_absolute_filename(bs, bs->backing_file, errp);
385 }
386
387 void bdrv_register(BlockDriver *bdrv)
388 {
389 assert(bdrv->format_name);
390 QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
391 }
392
393 BlockDriverState *bdrv_new(void)
394 {
395 BlockDriverState *bs;
396 int i;
397
398 bs = g_new0(BlockDriverState, 1);
399 QLIST_INIT(&bs->dirty_bitmaps);
400 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
401 QLIST_INIT(&bs->op_blockers[i]);
402 }
403 notifier_with_return_list_init(&bs->before_write_notifiers);
404 qemu_co_mutex_init(&bs->reqs_lock);
405 qemu_mutex_init(&bs->dirty_bitmap_mutex);
406 bs->refcnt = 1;
407 bs->aio_context = qemu_get_aio_context();
408
409 qemu_co_queue_init(&bs->flush_queue);
410
411 for (i = 0; i < bdrv_drain_all_count; i++) {
412 bdrv_drained_begin(bs);
413 }
414
415 QTAILQ_INSERT_TAIL(&all_bdrv_states, bs, bs_list);
416
417 return bs;
418 }
419
420 static BlockDriver *bdrv_do_find_format(const char *format_name)
421 {
422 BlockDriver *drv1;
423
424 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
425 if (!strcmp(drv1->format_name, format_name)) {
426 return drv1;
427 }
428 }
429
430 return NULL;
431 }
432
433 BlockDriver *bdrv_find_format(const char *format_name)
434 {
435 BlockDriver *drv1;
436 int i;
437
438 drv1 = bdrv_do_find_format(format_name);
439 if (drv1) {
440 return drv1;
441 }
442
443 /* The driver isn't registered, maybe we need to load a module */
444 for (i = 0; i < (int)ARRAY_SIZE(block_driver_modules); ++i) {
445 if (!strcmp(block_driver_modules[i].format_name, format_name)) {
446 block_module_load_one(block_driver_modules[i].library_name);
447 break;
448 }
449 }
450
451 return bdrv_do_find_format(format_name);
452 }
453
454 static int bdrv_format_is_whitelisted(const char *format_name, bool read_only)
455 {
456 static const char *whitelist_rw[] = {
457 CONFIG_BDRV_RW_WHITELIST
458 NULL
459 };
460 static const char *whitelist_ro[] = {
461 CONFIG_BDRV_RO_WHITELIST
462 NULL
463 };
464 const char **p;
465
466 if (!whitelist_rw[0] && !whitelist_ro[0]) {
467 return 1; /* no whitelist, anything goes */
468 }
469
470 for (p = whitelist_rw; *p; p++) {
471 if (!strcmp(format_name, *p)) {
472 return 1;
473 }
474 }
475 if (read_only) {
476 for (p = whitelist_ro; *p; p++) {
477 if (!strcmp(format_name, *p)) {
478 return 1;
479 }
480 }
481 }
482 return 0;
483 }
484
485 int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
486 {
487 return bdrv_format_is_whitelisted(drv->format_name, read_only);
488 }
489
490 bool bdrv_uses_whitelist(void)
491 {
492 return use_bdrv_whitelist;
493 }
494
495 typedef struct CreateCo {
496 BlockDriver *drv;
497 char *filename;
498 QemuOpts *opts;
499 int ret;
500 Error *err;
501 } CreateCo;
502
503 static void coroutine_fn bdrv_create_co_entry(void *opaque)
504 {
505 Error *local_err = NULL;
506 int ret;
507
508 CreateCo *cco = opaque;
509 assert(cco->drv);
510
511 ret = cco->drv->bdrv_co_create_opts(cco->drv,
512 cco->filename, cco->opts, &local_err);
513 error_propagate(&cco->err, local_err);
514 cco->ret = ret;
515 }
516
517 int bdrv_create(BlockDriver *drv, const char* filename,
518 QemuOpts *opts, Error **errp)
519 {
520 int ret;
521
522 Coroutine *co;
523 CreateCo cco = {
524 .drv = drv,
525 .filename = g_strdup(filename),
526 .opts = opts,
527 .ret = NOT_DONE,
528 .err = NULL,
529 };
530
531 if (!drv->bdrv_co_create_opts) {
532 error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
533 ret = -ENOTSUP;
534 goto out;
535 }
536
537 if (qemu_in_coroutine()) {
538 /* Fast-path if already in coroutine context */
539 bdrv_create_co_entry(&cco);
540 } else {
541 co = qemu_coroutine_create(bdrv_create_co_entry, &cco);
542 qemu_coroutine_enter(co);
543 while (cco.ret == NOT_DONE) {
544 aio_poll(qemu_get_aio_context(), true);
545 }
546 }
547
548 ret = cco.ret;
549 if (ret < 0) {
550 if (cco.err) {
551 error_propagate(errp, cco.err);
552 } else {
553 error_setg_errno(errp, -ret, "Could not create image");
554 }
555 }
556
557 out:
558 g_free(cco.filename);
559 return ret;
560 }
561
562 /**
563 * Helper function for bdrv_create_file_fallback(): Resize @blk to at
564 * least the given @minimum_size.
565 *
566 * On success, return @blk's actual length.
567 * Otherwise, return -errno.
568 */
569 static int64_t create_file_fallback_truncate(BlockBackend *blk,
570 int64_t minimum_size, Error **errp)
571 {
572 Error *local_err = NULL;
573 int64_t size;
574 int ret;
575
576 ret = blk_truncate(blk, minimum_size, false, PREALLOC_MODE_OFF, 0,
577 &local_err);
578 if (ret < 0 && ret != -ENOTSUP) {
579 error_propagate(errp, local_err);
580 return ret;
581 }
582
583 size = blk_getlength(blk);
584 if (size < 0) {
585 error_free(local_err);
586 error_setg_errno(errp, -size,
587 "Failed to inquire the new image file's length");
588 return size;
589 }
590
591 if (size < minimum_size) {
592 /* Need to grow the image, but we failed to do that */
593 error_propagate(errp, local_err);
594 return -ENOTSUP;
595 }
596
597 error_free(local_err);
598 local_err = NULL;
599
600 return size;
601 }
602
603 /**
604 * Helper function for bdrv_create_file_fallback(): Zero the first
605 * sector to remove any potentially pre-existing image header.
606 */
607 static int create_file_fallback_zero_first_sector(BlockBackend *blk,
608 int64_t current_size,
609 Error **errp)
610 {
611 int64_t bytes_to_clear;
612 int ret;
613
614 bytes_to_clear = MIN(current_size, BDRV_SECTOR_SIZE);
615 if (bytes_to_clear) {
616 ret = blk_pwrite_zeroes(blk, 0, bytes_to_clear, BDRV_REQ_MAY_UNMAP);
617 if (ret < 0) {
618 error_setg_errno(errp, -ret,
619 "Failed to clear the new image's first sector");
620 return ret;
621 }
622 }
623
624 return 0;
625 }
626
627 /**
628 * Simple implementation of bdrv_co_create_opts for protocol drivers
629 * which only support creation via opening a file
630 * (usually existing raw storage device)
631 */
632 int coroutine_fn bdrv_co_create_opts_simple(BlockDriver *drv,
633 const char *filename,
634 QemuOpts *opts,
635 Error **errp)
636 {
637 BlockBackend *blk;
638 QDict *options;
639 int64_t size = 0;
640 char *buf = NULL;
641 PreallocMode prealloc;
642 Error *local_err = NULL;
643 int ret;
644
645 size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0);
646 buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
647 prealloc = qapi_enum_parse(&PreallocMode_lookup, buf,
648 PREALLOC_MODE_OFF, &local_err);
649 g_free(buf);
650 if (local_err) {
651 error_propagate(errp, local_err);
652 return -EINVAL;
653 }
654
655 if (prealloc != PREALLOC_MODE_OFF) {
656 error_setg(errp, "Unsupported preallocation mode '%s'",
657 PreallocMode_str(prealloc));
658 return -ENOTSUP;
659 }
660
661 options = qdict_new();
662 qdict_put_str(options, "driver", drv->format_name);
663
664 blk = blk_new_open(filename, NULL, options,
665 BDRV_O_RDWR | BDRV_O_RESIZE, errp);
666 if (!blk) {
667 error_prepend(errp, "Protocol driver '%s' does not support image "
668 "creation, and opening the image failed: ",
669 drv->format_name);
670 return -EINVAL;
671 }
672
673 size = create_file_fallback_truncate(blk, size, errp);
674 if (size < 0) {
675 ret = size;
676 goto out;
677 }
678
679 ret = create_file_fallback_zero_first_sector(blk, size, errp);
680 if (ret < 0) {
681 goto out;
682 }
683
684 ret = 0;
685 out:
686 blk_unref(blk);
687 return ret;
688 }
689
690 int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp)
691 {
692 QemuOpts *protocol_opts;
693 BlockDriver *drv;
694 QDict *qdict;
695 int ret;
696
697 drv = bdrv_find_protocol(filename, true, errp);
698 if (drv == NULL) {
699 return -ENOENT;
700 }
701
702 if (!drv->create_opts) {
703 error_setg(errp, "Driver '%s' does not support image creation",
704 drv->format_name);
705 return -ENOTSUP;
706 }
707
708 /*
709 * 'opts' contains a QemuOptsList with a combination of format and protocol
710 * default values.
711 *
712 * The format properly removes its options, but the default values remain
713 * in 'opts->list'. So if the protocol has options with the same name
714 * (e.g. rbd has 'cluster_size' as qcow2), it will see the default values
715 * of the format, since for overlapping options, the format wins.
716 *
717 * To avoid this issue, lets convert QemuOpts to QDict, in this way we take
718 * only the set options, and then convert it back to QemuOpts, using the
719 * create_opts of the protocol. So the new QemuOpts, will contain only the
720 * protocol defaults.
721 */
722 qdict = qemu_opts_to_qdict(opts, NULL);
723 protocol_opts = qemu_opts_from_qdict(drv->create_opts, qdict, errp);
724 if (protocol_opts == NULL) {
725 ret = -EINVAL;
726 goto out;
727 }
728
729 ret = bdrv_create(drv, filename, protocol_opts, errp);
730 out:
731 qemu_opts_del(protocol_opts);
732 qobject_unref(qdict);
733 return ret;
734 }
735
736 int coroutine_fn bdrv_co_delete_file(BlockDriverState *bs, Error **errp)
737 {
738 Error *local_err = NULL;
739 int ret;
740
741 assert(bs != NULL);
742
743 if (!bs->drv) {
744 error_setg(errp, "Block node '%s' is not opened", bs->filename);
745 return -ENOMEDIUM;
746 }
747
748 if (!bs->drv->bdrv_co_delete_file) {
749 error_setg(errp, "Driver '%s' does not support image deletion",
750 bs->drv->format_name);
751 return -ENOTSUP;
752 }
753
754 ret = bs->drv->bdrv_co_delete_file(bs, &local_err);
755 if (ret < 0) {
756 error_propagate(errp, local_err);
757 }
758
759 return ret;
760 }
761
762 void coroutine_fn bdrv_co_delete_file_noerr(BlockDriverState *bs)
763 {
764 Error *local_err = NULL;
765 int ret;
766
767 if (!bs) {
768 return;
769 }
770
771 ret = bdrv_co_delete_file(bs, &local_err);
772 /*
773 * ENOTSUP will happen if the block driver doesn't support
774 * the 'bdrv_co_delete_file' interface. This is a predictable
775 * scenario and shouldn't be reported back to the user.
776 */
777 if (ret == -ENOTSUP) {
778 error_free(local_err);
779 } else if (ret < 0) {
780 error_report_err(local_err);
781 }
782 }
783
784 /**
785 * Try to get @bs's logical and physical block size.
786 * On success, store them in @bsz struct and return 0.
787 * On failure return -errno.
788 * @bs must not be empty.
789 */
790 int bdrv_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
791 {
792 BlockDriver *drv = bs->drv;
793 BlockDriverState *filtered = bdrv_filter_bs(bs);
794
795 if (drv && drv->bdrv_probe_blocksizes) {
796 return drv->bdrv_probe_blocksizes(bs, bsz);
797 } else if (filtered) {
798 return bdrv_probe_blocksizes(filtered, bsz);
799 }
800
801 return -ENOTSUP;
802 }
803
804 /**
805 * Try to get @bs's geometry (cyls, heads, sectors).
806 * On success, store them in @geo struct and return 0.
807 * On failure return -errno.
808 * @bs must not be empty.
809 */
810 int bdrv_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
811 {
812 BlockDriver *drv = bs->drv;
813 BlockDriverState *filtered = bdrv_filter_bs(bs);
814
815 if (drv && drv->bdrv_probe_geometry) {
816 return drv->bdrv_probe_geometry(bs, geo);
817 } else if (filtered) {
818 return bdrv_probe_geometry(filtered, geo);
819 }
820
821 return -ENOTSUP;
822 }
823
/*
 * Create a uniquely-named empty temporary file and store its name in
 * @filename, a buffer of @size bytes.
 *
 * On non-Windows hosts the file is created in $TMPDIR, or in /var/tmp if
 * TMPDIR is not set.  On Windows, @size must be at least MAX_PATH.
 *
 * Return 0 upon success, otherwise a negative errno value (-EOVERFLOW if
 * the name does not fit into @filename).
 */
int get_tmp_filename(char *filename, int size)
{
#ifdef _WIN32
    char temp_dir[MAX_PATH];
    /* GetTempFileName requires that its output buffer (4th param)
       have length MAX_PATH or greater.  */
    assert(size >= MAX_PATH);
    /*
     * NOTE(review): the failure value is a negated GetLastError() code,
     * which is a Win32 error code rather than an errno value.
     */
    return (GetTempPath(MAX_PATH, temp_dir)
            && GetTempFileName(temp_dir, "qem", 0, filename)
            ? 0 : -GetLastError());
#else
    int fd;
    const char *tmpdir;

    tmpdir = getenv("TMPDIR");
    if (!tmpdir) {
        tmpdir = "/var/tmp";
    }
    if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) {
        return -EOVERFLOW;
    }

    fd = mkstemp(filename);
    if (fd < 0) {
        return -errno;
    }

    if (close(fd) != 0) {
        /*
         * Capture close()'s error before calling unlink(): unlink() may
         * itself fail and overwrite errno, and we want to report why the
         * close failed.
         */
        int close_errno = errno;

        unlink(filename);
        return -close_errno;
    }

    return 0;
#endif
}
859
860 /*
861 * Detect host devices. By convention, /dev/cdrom[N] is always
862 * recognized as a host CDROM.
863 */
864 static BlockDriver *find_hdev_driver(const char *filename)
865 {
866 int score_max = 0, score;
867 BlockDriver *drv = NULL, *d;
868
869 QLIST_FOREACH(d, &bdrv_drivers, list) {
870 if (d->bdrv_probe_device) {
871 score = d->bdrv_probe_device(filename);
872 if (score > score_max) {
873 score_max = score;
874 drv = d;
875 }
876 }
877 }
878
879 return drv;
880 }
881
882 static BlockDriver *bdrv_do_find_protocol(const char *protocol)
883 {
884 BlockDriver *drv1;
885
886 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
887 if (drv1->protocol_name && !strcmp(drv1->protocol_name, protocol)) {
888 return drv1;
889 }
890 }
891
892 return NULL;
893 }
894
895 BlockDriver *bdrv_find_protocol(const char *filename,
896 bool allow_protocol_prefix,
897 Error **errp)
898 {
899 BlockDriver *drv1;
900 char protocol[128];
901 int len;
902 const char *p;
903 int i;
904
905 /* TODO Drivers without bdrv_file_open must be specified explicitly */
906
907 /*
908 * XXX(hch): we really should not let host device detection
909 * override an explicit protocol specification, but moving this
910 * later breaks access to device names with colons in them.
911 * Thanks to the brain-dead persistent naming schemes on udev-
912 * based Linux systems those actually are quite common.
913 */
914 drv1 = find_hdev_driver(filename);
915 if (drv1) {
916 return drv1;
917 }
918
919 if (!path_has_protocol(filename) || !allow_protocol_prefix) {
920 return &bdrv_file;
921 }
922
923 p = strchr(filename, ':');
924 assert(p != NULL);
925 len = p - filename;
926 if (len > sizeof(protocol) - 1)
927 len = sizeof(protocol) - 1;
928 memcpy(protocol, filename, len);
929 protocol[len] = '\0';
930
931 drv1 = bdrv_do_find_protocol(protocol);
932 if (drv1) {
933 return drv1;
934 }
935
936 for (i = 0; i < (int)ARRAY_SIZE(block_driver_modules); ++i) {
937 if (block_driver_modules[i].protocol_name &&
938 !strcmp(block_driver_modules[i].protocol_name, protocol)) {
939 block_module_load_one(block_driver_modules[i].library_name);
940 break;
941 }
942 }
943
944 drv1 = bdrv_do_find_protocol(protocol);
945 if (!drv1) {
946 error_setg(errp, "Unknown protocol '%s'", protocol);
947 }
948 return drv1;
949 }
950
951 /*
952 * Guess image format by probing its contents.
953 * This is not a good idea when your image is raw (CVE-2008-2004), but
954 * we do it anyway for backward compatibility.
955 *
956 * @buf contains the image's first @buf_size bytes.
957 * @buf_size is the buffer size in bytes (generally BLOCK_PROBE_BUF_SIZE,
958 * but can be smaller if the image file is smaller)
959 * @filename is its filename.
960 *
961 * For all block drivers, call the bdrv_probe() method to get its
962 * probing score.
963 * Return the first block driver with the highest probing score.
964 */
965 BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
966 const char *filename)
967 {
968 int score_max = 0, score;
969 BlockDriver *drv = NULL, *d;
970
971 QLIST_FOREACH(d, &bdrv_drivers, list) {
972 if (d->bdrv_probe) {
973 score = d->bdrv_probe(buf, buf_size, filename);
974 if (score > score_max) {
975 score_max = score;
976 drv = d;
977 }
978 }
979 }
980
981 return drv;
982 }
983
984 static int find_image_format(BlockBackend *file, const char *filename,
985 BlockDriver **pdrv, Error **errp)
986 {
987 BlockDriver *drv;
988 uint8_t buf[BLOCK_PROBE_BUF_SIZE];
989 int ret = 0;
990
991 /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
992 if (blk_is_sg(file) || !blk_is_inserted(file) || blk_getlength(file) == 0) {
993 *pdrv = &bdrv_raw;
994 return ret;
995 }
996
997 ret = blk_pread(file, 0, buf, sizeof(buf));
998 if (ret < 0) {
999 error_setg_errno(errp, -ret, "Could not read image for determining its "
1000 "format");
1001 *pdrv = NULL;
1002 return ret;
1003 }
1004
1005 drv = bdrv_probe_all(buf, ret, filename);
1006 if (!drv) {
1007 error_setg(errp, "Could not determine image format: No compatible "
1008 "driver found");
1009 ret = -ENOENT;
1010 }
1011 *pdrv = drv;
1012 return ret;
1013 }
1014
1015 /**
1016 * Set the current 'total_sectors' value
1017 * Return 0 on success, -errno on error.
1018 */
1019 int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
1020 {
1021 BlockDriver *drv = bs->drv;
1022
1023 if (!drv) {
1024 return -ENOMEDIUM;
1025 }
1026
1027 /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
1028 if (bdrv_is_sg(bs))
1029 return 0;
1030
1031 /* query actual device if possible, otherwise just trust the hint */
1032 if (drv->bdrv_getlength) {
1033 int64_t length = drv->bdrv_getlength(bs);
1034 if (length < 0) {
1035 return length;
1036 }
1037 hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE);
1038 }
1039
1040 bs->total_sectors = hint;
1041
1042 if (bs->total_sectors * BDRV_SECTOR_SIZE > BDRV_MAX_LENGTH) {
1043 return -EFBIG;
1044 }
1045
1046 return 0;
1047 }
1048
1049 /**
1050 * Combines a QDict of new block driver @options with any missing options taken
1051 * from @old_options, so that leaving out an option defaults to its old value.
1052 */
1053 static void bdrv_join_options(BlockDriverState *bs, QDict *options,
1054 QDict *old_options)
1055 {
1056 if (bs->drv && bs->drv->bdrv_join_options) {
1057 bs->drv->bdrv_join_options(options, old_options);
1058 } else {
1059 qdict_join(options, old_options, false);
1060 }
1061 }
1062
1063 static BlockdevDetectZeroesOptions bdrv_parse_detect_zeroes(QemuOpts *opts,
1064 int open_flags,
1065 Error **errp)
1066 {
1067 Error *local_err = NULL;
1068 char *value = qemu_opt_get_del(opts, "detect-zeroes");
1069 BlockdevDetectZeroesOptions detect_zeroes =
1070 qapi_enum_parse(&BlockdevDetectZeroesOptions_lookup, value,
1071 BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF, &local_err);
1072 g_free(value);
1073 if (local_err) {
1074 error_propagate(errp, local_err);
1075 return detect_zeroes;
1076 }
1077
1078 if (detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP &&
1079 !(open_flags & BDRV_O_UNMAP))
1080 {
1081 error_setg(errp, "setting detect-zeroes to unmap is not allowed "
1082 "without setting discard operation to unmap");
1083 }
1084
1085 return detect_zeroes;
1086 }
1087
1088 /**
1089 * Set open flags for aio engine
1090 *
1091 * Return 0 on success, -1 if the engine specified is invalid
1092 */
1093 int bdrv_parse_aio(const char *mode, int *flags)
1094 {
1095 if (!strcmp(mode, "threads")) {
1096 /* do nothing, default */
1097 } else if (!strcmp(mode, "native")) {
1098 *flags |= BDRV_O_NATIVE_AIO;
1099 #ifdef CONFIG_LINUX_IO_URING
1100 } else if (!strcmp(mode, "io_uring")) {
1101 *flags |= BDRV_O_IO_URING;
1102 #endif
1103 } else {
1104 return -1;
1105 }
1106
1107 return 0;
1108 }
1109
1110 /**
1111 * Set open flags for a given discard mode
1112 *
1113 * Return 0 on success, -1 if the discard mode was invalid.
1114 */
1115 int bdrv_parse_discard_flags(const char *mode, int *flags)
1116 {
1117 *flags &= ~BDRV_O_UNMAP;
1118
1119 if (!strcmp(mode, "off") || !strcmp(mode, "ignore")) {
1120 /* do nothing */
1121 } else if (!strcmp(mode, "on") || !strcmp(mode, "unmap")) {
1122 *flags |= BDRV_O_UNMAP;
1123 } else {
1124 return -1;
1125 }
1126
1127 return 0;
1128 }
1129
1130 /**
1131 * Set open flags for a given cache mode
1132 *
1133 * Return 0 on success, -1 if the cache mode was invalid.
1134 */
1135 int bdrv_parse_cache_mode(const char *mode, int *flags, bool *writethrough)
1136 {
1137 *flags &= ~BDRV_O_CACHE_MASK;
1138
1139 if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
1140 *writethrough = false;
1141 *flags |= BDRV_O_NOCACHE;
1142 } else if (!strcmp(mode, "directsync")) {
1143 *writethrough = true;
1144 *flags |= BDRV_O_NOCACHE;
1145 } else if (!strcmp(mode, "writeback")) {
1146 *writethrough = false;
1147 } else if (!strcmp(mode, "unsafe")) {
1148 *writethrough = false;
1149 *flags |= BDRV_O_NO_FLUSH;
1150 } else if (!strcmp(mode, "writethrough")) {
1151 *writethrough = true;
1152 } else {
1153 return -1;
1154 }
1155
1156 return 0;
1157 }
1158
1159 static char *bdrv_child_get_parent_desc(BdrvChild *c)
1160 {
1161 BlockDriverState *parent = c->opaque;
1162 return g_strdup(bdrv_get_device_or_node_name(parent));
1163 }
1164
1165 static void bdrv_child_cb_drained_begin(BdrvChild *child)
1166 {
1167 BlockDriverState *bs = child->opaque;
1168 bdrv_do_drained_begin_quiesce(bs, NULL, false);
1169 }
1170
1171 static bool bdrv_child_cb_drained_poll(BdrvChild *child)
1172 {
1173 BlockDriverState *bs = child->opaque;
1174 return bdrv_drain_poll(bs, false, NULL, false);
1175 }
1176
1177 static void bdrv_child_cb_drained_end(BdrvChild *child,
1178 int *drained_end_counter)
1179 {
1180 BlockDriverState *bs = child->opaque;
1181 bdrv_drained_end_no_poll(bs, drained_end_counter);
1182 }
1183
1184 static int bdrv_child_cb_inactivate(BdrvChild *child)
1185 {
1186 BlockDriverState *bs = child->opaque;
1187 assert(bs->open_flags & BDRV_O_INACTIVE);
1188 return 0;
1189 }
1190
1191 static bool bdrv_child_cb_can_set_aio_ctx(BdrvChild *child, AioContext *ctx,
1192 GSList **ignore, Error **errp)
1193 {
1194 BlockDriverState *bs = child->opaque;
1195 return bdrv_can_set_aio_context(bs, ctx, ignore, errp);
1196 }
1197
1198 static void bdrv_child_cb_set_aio_ctx(BdrvChild *child, AioContext *ctx,
1199 GSList **ignore)
1200 {
1201 BlockDriverState *bs = child->opaque;
1202 return bdrv_set_aio_context_ignore(bs, ctx, ignore);
1203 }
1204
1205 /*
1206 * Returns the options and flags that a temporary snapshot should get, based on
1207 * the originally requested flags (the originally requested image will have
1208 * flags like a backing file)
1209 */
1210 static void bdrv_temp_snapshot_options(int *child_flags, QDict *child_options,
1211 int parent_flags, QDict *parent_options)
1212 {
1213 *child_flags = (parent_flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
1214
1215 /* For temporary files, unconditional cache=unsafe is fine */
1216 qdict_set_default_str(child_options, BDRV_OPT_CACHE_DIRECT, "off");
1217 qdict_set_default_str(child_options, BDRV_OPT_CACHE_NO_FLUSH, "on");
1218
1219 /* Copy the read-only and discard options from the parent */
1220 qdict_copy_default(child_options, parent_options, BDRV_OPT_READ_ONLY);
1221 qdict_copy_default(child_options, parent_options, BDRV_OPT_DISCARD);
1222
1223 /* aio=native doesn't work for cache.direct=off, so disable it for the
1224 * temporary snapshot */
1225 *child_flags &= ~BDRV_O_NATIVE_AIO;
1226 }
1227
1228 static void bdrv_backing_attach(BdrvChild *c)
1229 {
1230 BlockDriverState *parent = c->opaque;
1231 BlockDriverState *backing_hd = c->bs;
1232
1233 assert(!parent->backing_blocker);
1234 error_setg(&parent->backing_blocker,
1235 "node is used as backing hd of '%s'",
1236 bdrv_get_device_or_node_name(parent));
1237
1238 bdrv_refresh_filename(backing_hd);
1239
1240 parent->open_flags &= ~BDRV_O_NO_BACKING;
1241
1242 bdrv_op_block_all(backing_hd, parent->backing_blocker);
1243 /* Otherwise we won't be able to commit or stream */
1244 bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_COMMIT_TARGET,
1245 parent->backing_blocker);
1246 bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_STREAM,
1247 parent->backing_blocker);
1248 /*
1249 * We do backup in 3 ways:
1250 * 1. drive backup
1251 * The target bs is new opened, and the source is top BDS
1252 * 2. blockdev backup
1253 * Both the source and the target are top BDSes.
1254 * 3. internal backup(used for block replication)
1255 * Both the source and the target are backing file
1256 *
1257 * In case 1 and 2, neither the source nor the target is the backing file.
1258 * In case 3, we will block the top BDS, so there is only one block job
1259 * for the top BDS and its backing chain.
1260 */
1261 bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_BACKUP_SOURCE,
1262 parent->backing_blocker);
1263 bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_BACKUP_TARGET,
1264 parent->backing_blocker);
1265 }
1266
1267 static void bdrv_backing_detach(BdrvChild *c)
1268 {
1269 BlockDriverState *parent = c->opaque;
1270
1271 assert(parent->backing_blocker);
1272 bdrv_op_unblock_all(c->bs, parent->backing_blocker);
1273 error_free(parent->backing_blocker);
1274 parent->backing_blocker = NULL;
1275 }
1276
1277 static int bdrv_backing_update_filename(BdrvChild *c, BlockDriverState *base,
1278 const char *filename, Error **errp)
1279 {
1280 BlockDriverState *parent = c->opaque;
1281 bool read_only = bdrv_is_read_only(parent);
1282 int ret;
1283
1284 if (read_only) {
1285 ret = bdrv_reopen_set_read_only(parent, false, errp);
1286 if (ret < 0) {
1287 return ret;
1288 }
1289 }
1290
1291 ret = bdrv_change_backing_file(parent, filename,
1292 base->drv ? base->drv->format_name : "",
1293 false);
1294 if (ret < 0) {
1295 error_setg_errno(errp, -ret, "Could not update backing file link");
1296 }
1297
1298 if (read_only) {
1299 bdrv_reopen_set_read_only(parent, true, NULL);
1300 }
1301
1302 return ret;
1303 }
1304
1305 /*
1306 * Returns the options and flags that a generic child of a BDS should
1307 * get, based on the given options and flags for the parent BDS.
1308 */
1309 static void bdrv_inherited_options(BdrvChildRole role, bool parent_is_format,
1310 int *child_flags, QDict *child_options,
1311 int parent_flags, QDict *parent_options)
1312 {
1313 int flags = parent_flags;
1314
1315 /*
1316 * First, decide whether to set, clear, or leave BDRV_O_PROTOCOL.
1317 * Generally, the question to answer is: Should this child be
1318 * format-probed by default?
1319 */
1320
1321 /*
1322 * Pure and non-filtered data children of non-format nodes should
1323 * be probed by default (even when the node itself has BDRV_O_PROTOCOL
1324 * set). This only affects a very limited set of drivers (namely
1325 * quorum and blkverify when this comment was written).
1326 * Force-clear BDRV_O_PROTOCOL then.
1327 */
1328 if (!parent_is_format &&
1329 (role & BDRV_CHILD_DATA) &&
1330 !(role & (BDRV_CHILD_METADATA | BDRV_CHILD_FILTERED)))
1331 {
1332 flags &= ~BDRV_O_PROTOCOL;
1333 }
1334
1335 /*
1336 * All children of format nodes (except for COW children) and all
1337 * metadata children in general should never be format-probed.
1338 * Force-set BDRV_O_PROTOCOL then.
1339 */
1340 if ((parent_is_format && !(role & BDRV_CHILD_COW)) ||
1341 (role & BDRV_CHILD_METADATA))
1342 {
1343 flags |= BDRV_O_PROTOCOL;
1344 }
1345
1346 /*
1347 * If the cache mode isn't explicitly set, inherit direct and no-flush from
1348 * the parent.
1349 */
1350 qdict_copy_default(child_options, parent_options, BDRV_OPT_CACHE_DIRECT);
1351 qdict_copy_default(child_options, parent_options, BDRV_OPT_CACHE_NO_FLUSH);
1352 qdict_copy_default(child_options, parent_options, BDRV_OPT_FORCE_SHARE);
1353
1354 if (role & BDRV_CHILD_COW) {
1355 /* backing files are opened read-only by default */
1356 qdict_set_default_str(child_options, BDRV_OPT_READ_ONLY, "on");
1357 qdict_set_default_str(child_options, BDRV_OPT_AUTO_READ_ONLY, "off");
1358 } else {
1359 /* Inherit the read-only option from the parent if it's not set */
1360 qdict_copy_default(child_options, parent_options, BDRV_OPT_READ_ONLY);
1361 qdict_copy_default(child_options, parent_options,
1362 BDRV_OPT_AUTO_READ_ONLY);
1363 }
1364
1365 /*
1366 * bdrv_co_pdiscard() respects unmap policy for the parent, so we
1367 * can default to enable it on lower layers regardless of the
1368 * parent option.
1369 */
1370 qdict_set_default_str(child_options, BDRV_OPT_DISCARD, "unmap");
1371
1372 /* Clear flags that only apply to the top layer */
1373 flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ);
1374
1375 if (role & BDRV_CHILD_METADATA) {
1376 flags &= ~BDRV_O_NO_IO;
1377 }
1378 if (role & BDRV_CHILD_COW) {
1379 flags &= ~BDRV_O_TEMPORARY;
1380 }
1381
1382 *child_flags = flags;
1383 }
1384
1385 static void bdrv_child_cb_attach(BdrvChild *child)
1386 {
1387 BlockDriverState *bs = child->opaque;
1388
1389 if (child->role & BDRV_CHILD_COW) {
1390 bdrv_backing_attach(child);
1391 }
1392
1393 bdrv_apply_subtree_drain(child, bs);
1394 }
1395
1396 static void bdrv_child_cb_detach(BdrvChild *child)
1397 {
1398 BlockDriverState *bs = child->opaque;
1399
1400 if (child->role & BDRV_CHILD_COW) {
1401 bdrv_backing_detach(child);
1402 }
1403
1404 bdrv_unapply_subtree_drain(child, bs);
1405 }
1406
1407 static int bdrv_child_cb_update_filename(BdrvChild *c, BlockDriverState *base,
1408 const char *filename, Error **errp)
1409 {
1410 if (c->role & BDRV_CHILD_COW) {
1411 return bdrv_backing_update_filename(c, base, filename, errp);
1412 }
1413 return 0;
1414 }
1415
1416 static AioContext *bdrv_child_cb_get_parent_aio_context(BdrvChild *c)
1417 {
1418 BlockDriverState *bs = c->opaque;
1419
1420 return bdrv_get_aio_context(bs);
1421 }
1422
/*
 * Child class for children whose parent is a BlockDriverState
 * (.parent_is_bds is set). The callbacks that take a BdrvChild treat its
 * ->opaque pointer as the parent BlockDriverState.
 */
const BdrvChildClass child_of_bds = {
    .parent_is_bds   = true,
    .get_parent_desc = bdrv_child_get_parent_desc,
    .inherit_options = bdrv_inherited_options,
    /* drain handling is forwarded to the parent node */
    .drained_begin   = bdrv_child_cb_drained_begin,
    .drained_poll    = bdrv_child_cb_drained_poll,
    .drained_end     = bdrv_child_cb_drained_end,
    /* attach/detach additionally manage the backing blocker of COW children */
    .attach          = bdrv_child_cb_attach,
    .detach          = bdrv_child_cb_detach,
    .inactivate      = bdrv_child_cb_inactivate,
    .can_set_aio_ctx = bdrv_child_cb_can_set_aio_ctx,
    .set_aio_ctx     = bdrv_child_cb_set_aio_ctx,
    .update_filename = bdrv_child_cb_update_filename,
    .get_parent_aio_context = bdrv_child_cb_get_parent_aio_context,
};
1438
1439 AioContext *bdrv_child_get_parent_aio_context(BdrvChild *c)
1440 {
1441 return c->klass->get_parent_aio_context(c);
1442 }
1443
1444 static int bdrv_open_flags(BlockDriverState *bs, int flags)
1445 {
1446 int open_flags = flags;
1447
1448 /*
1449 * Clear flags that are internal to the block layer before opening the
1450 * image.
1451 */
1452 open_flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_PROTOCOL);
1453
1454 return open_flags;
1455 }
1456
1457 static void update_flags_from_options(int *flags, QemuOpts *opts)
1458 {
1459 *flags &= ~(BDRV_O_CACHE_MASK | BDRV_O_RDWR | BDRV_O_AUTO_RDONLY);
1460
1461 if (qemu_opt_get_bool_del(opts, BDRV_OPT_CACHE_NO_FLUSH, false)) {
1462 *flags |= BDRV_O_NO_FLUSH;
1463 }
1464
1465 if (qemu_opt_get_bool_del(opts, BDRV_OPT_CACHE_DIRECT, false)) {
1466 *flags |= BDRV_O_NOCACHE;
1467 }
1468
1469 if (!qemu_opt_get_bool_del(opts, BDRV_OPT_READ_ONLY, false)) {
1470 *flags |= BDRV_O_RDWR;
1471 }
1472
1473 if (qemu_opt_get_bool_del(opts, BDRV_OPT_AUTO_READ_ONLY, false)) {
1474 *flags |= BDRV_O_AUTO_RDONLY;
1475 }
1476 }
1477
1478 static void update_options_from_flags(QDict *options, int flags)
1479 {
1480 if (!qdict_haskey(options, BDRV_OPT_CACHE_DIRECT)) {
1481 qdict_put_bool(options, BDRV_OPT_CACHE_DIRECT, flags & BDRV_O_NOCACHE);
1482 }
1483 if (!qdict_haskey(options, BDRV_OPT_CACHE_NO_FLUSH)) {
1484 qdict_put_bool(options, BDRV_OPT_CACHE_NO_FLUSH,
1485 flags & BDRV_O_NO_FLUSH);
1486 }
1487 if (!qdict_haskey(options, BDRV_OPT_READ_ONLY)) {
1488 qdict_put_bool(options, BDRV_OPT_READ_ONLY, !(flags & BDRV_O_RDWR));
1489 }
1490 if (!qdict_haskey(options, BDRV_OPT_AUTO_READ_ONLY)) {
1491 qdict_put_bool(options, BDRV_OPT_AUTO_READ_ONLY,
1492 flags & BDRV_O_AUTO_RDONLY);
1493 }
1494 }
1495
1496 static void bdrv_assign_node_name(BlockDriverState *bs,
1497 const char *node_name,
1498 Error **errp)
1499 {
1500 char *gen_node_name = NULL;
1501
1502 if (!node_name) {
1503 node_name = gen_node_name = id_generate(ID_BLOCK);
1504 } else if (!id_wellformed(node_name)) {
1505 /*
1506 * Check for empty string or invalid characters, but not if it is
1507 * generated (generated names use characters not available to the user)
1508 */
1509 error_setg(errp, "Invalid node-name: '%s'", node_name);
1510 return;
1511 }
1512
1513 /* takes care of avoiding namespaces collisions */
1514 if (blk_by_name(node_name)) {
1515 error_setg(errp, "node-name=%s is conflicting with a device id",
1516 node_name);
1517 goto out;
1518 }
1519
1520 /* takes care of avoiding duplicates node names */
1521 if (bdrv_find_node(node_name)) {
1522 error_setg(errp, "Duplicate nodes with node-name='%s'", node_name);
1523 goto out;
1524 }
1525
1526 /* Make sure that the node name isn't truncated */
1527 if (strlen(node_name) >= sizeof(bs->node_name)) {
1528 error_setg(errp, "Node name too long");
1529 goto out;
1530 }
1531
1532 /* copy node name into the bs and insert it into the graph list */
1533 pstrcpy(bs->node_name, sizeof(bs->node_name), node_name);
1534 QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list);
1535 out:
1536 g_free(gen_node_name);
1537 }
1538
1539 static int bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv,
1540 const char *node_name, QDict *options,
1541 int open_flags, Error **errp)
1542 {
1543 Error *local_err = NULL;
1544 int i, ret;
1545
1546 bdrv_assign_node_name(bs, node_name, &local_err);
1547 if (local_err) {
1548 error_propagate(errp, local_err);
1549 return -EINVAL;
1550 }
1551
1552 bs->drv = drv;
1553 bs->read_only = !(bs->open_flags & BDRV_O_RDWR);
1554 bs->opaque = g_malloc0(drv->instance_size);
1555
1556 if (drv->bdrv_file_open) {
1557 assert(!drv->bdrv_needs_filename || bs->filename[0]);
1558 ret = drv->bdrv_file_open(bs, options, open_flags, &local_err);
1559 } else if (drv->bdrv_open) {
1560 ret = drv->bdrv_open(bs, options, open_flags, &local_err);
1561 } else {
1562 ret = 0;
1563 }
1564
1565 if (ret < 0) {
1566 if (local_err) {
1567 error_propagate(errp, local_err);
1568 } else if (bs->filename[0]) {
1569 error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
1570 } else {
1571 error_setg_errno(errp, -ret, "Could not open image");
1572 }
1573 goto open_failed;
1574 }
1575
1576 ret = refresh_total_sectors(bs, bs->total_sectors);
1577 if (ret < 0) {
1578 error_setg_errno(errp, -ret, "Could not refresh total sector count");
1579 return ret;
1580 }
1581
1582 bdrv_refresh_limits(bs, NULL, &local_err);
1583 if (local_err) {
1584 error_propagate(errp, local_err);
1585 return -EINVAL;
1586 }
1587
1588 assert(bdrv_opt_mem_align(bs) != 0);
1589 assert(bdrv_min_mem_align(bs) != 0);
1590 assert(is_power_of_2(bs->bl.request_alignment));
1591
1592 for (i = 0; i < bs->quiesce_counter; i++) {
1593 if (drv->bdrv_co_drain_begin) {
1594 drv->bdrv_co_drain_begin(bs);
1595 }
1596 }
1597
1598 return 0;
1599 open_failed:
1600 bs->drv = NULL;
1601 if (bs->file != NULL) {
1602 bdrv_unref_child(bs, bs->file);
1603 bs->file = NULL;
1604 }
1605 g_free(bs->opaque);
1606 bs->opaque = NULL;
1607 return ret;
1608 }
1609
1610 BlockDriverState *bdrv_new_open_driver(BlockDriver *drv, const char *node_name,
1611 int flags, Error **errp)
1612 {
1613 BlockDriverState *bs;
1614 int ret;
1615
1616 bs = bdrv_new();
1617 bs->open_flags = flags;
1618 bs->explicit_options = qdict_new();
1619 bs->options = qdict_new();
1620 bs->opaque = NULL;
1621
1622 update_options_from_flags(bs->options, flags);
1623
1624 ret = bdrv_open_driver(bs, drv, node_name, bs->options, flags, errp);
1625 if (ret < 0) {
1626 qobject_unref(bs->explicit_options);
1627 bs->explicit_options = NULL;
1628 qobject_unref(bs->options);
1629 bs->options = NULL;
1630 bdrv_unref(bs);
1631 return NULL;
1632 }
1633
1634 return bs;
1635 }
1636
/*
 * Generic (driver-independent) runtime options of a block node.
 * bdrv_open_common() absorbs these from the options QDict before the
 * remaining options are handed to the driver.
 */
QemuOptsList bdrv_runtime_opts = {
    .name = "bdrv_common",
    .head = QTAILQ_HEAD_INITIALIZER(bdrv_runtime_opts.head),
    .desc = {
        {
            .name = "node-name",
            .type = QEMU_OPT_STRING,
            .help = "Node name of the block device node",
        },
        {
            .name = "driver",
            .type = QEMU_OPT_STRING,
            .help = "Block driver to use for the node",
        },
        {
            .name = BDRV_OPT_CACHE_DIRECT,
            .type = QEMU_OPT_BOOL,
            .help = "Bypass software writeback cache on the host",
        },
        {
            .name = BDRV_OPT_CACHE_NO_FLUSH,
            .type = QEMU_OPT_BOOL,
            .help = "Ignore flush requests",
        },
        {
            .name = BDRV_OPT_READ_ONLY,
            .type = QEMU_OPT_BOOL,
            .help = "Node is opened in read-only mode",
        },
        {
            .name = BDRV_OPT_AUTO_READ_ONLY,
            .type = QEMU_OPT_BOOL,
            .help = "Node can become read-only if opening read-write fails",
        },
        {
            /* kept as a string: parsed by bdrv_parse_detect_zeroes() */
            .name = "detect-zeroes",
            .type = QEMU_OPT_STRING,
            .help = "try to optimize zero writes (off, on, unmap)",
        },
        {
            /* kept as a string: parsed by bdrv_parse_discard_flags() */
            .name = BDRV_OPT_DISCARD,
            .type = QEMU_OPT_STRING,
            .help = "discard operation (ignore/off, unmap/on)",
        },
        {
            .name = BDRV_OPT_FORCE_SHARE,
            .type = QEMU_OPT_BOOL,
            .help = "always accept other writers (default: off)",
        },
        { /* end of list */ }
    },
};
1689
/*
 * Option list "simple-create-opts": an image size plus a preallocation mode
 * for which, according to the help text, only "off" is allowed.
 */
QemuOptsList bdrv_create_opts_simple = {
    .name = "simple-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(bdrv_create_opts_simple.head),
    .desc = {
        {
            .name = BLOCK_OPT_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Virtual disk size"
        },
        {
            .name = BLOCK_OPT_PREALLOC,
            .type = QEMU_OPT_STRING,
            .help = "Preallocation mode (allowed values: off)"
        },
        { /* end of list */ }
    }
};
1707
/*
 * Common part for opening disk images and files
 *
 * Absorbs the generic options listed in bdrv_runtime_opts from @options,
 * derives bs->open_flags, bs->force_share, bs->read_only, bs->detect_zeroes
 * and the filename fields from them, and finally opens the driver through
 * bdrv_open_driver() with what is left in @options.
 *
 * @bs:      node to open; bs->file must still be NULL
 * @file:    if non-NULL, the filename is taken from the node of this
 *           BlockBackend; otherwise from the "filename" entry of @options
 * @options: must be non-NULL and must not be bs->options
 *
 * Removes all processed options from *options.
 *
 * Returns 0 on success and a negative value on failure; errors are reported
 * through @errp.
 */
static int bdrv_open_common(BlockDriverState *bs, BlockBackend *file,
                            QDict *options, Error **errp)
{
    int ret, open_flags;
    const char *filename;
    const char *driver_name = NULL;
    const char *node_name = NULL;
    const char *discard;
    QemuOpts *opts;
    BlockDriver *drv;
    Error *local_err = NULL;

    assert(bs->file == NULL);
    assert(options != NULL && bs->options != options);

    opts = qemu_opts_create(&bdrv_runtime_opts, NULL, 0, &error_abort);
    if (!qemu_opts_absorb_qdict(opts, options, errp)) {
        ret = -EINVAL;
        goto fail_opts;
    }

    update_flags_from_options(&bs->open_flags, opts);

    /* The driver is expected to be known by now, hence the assertion */
    driver_name = qemu_opt_get(opts, "driver");
    drv = bdrv_find_format(driver_name);
    assert(drv != NULL);

    bs->force_share = qemu_opt_get_bool(opts, BDRV_OPT_FORCE_SHARE, false);

    if (bs->force_share && (bs->open_flags & BDRV_O_RDWR)) {
        error_setg(errp,
                   BDRV_OPT_FORCE_SHARE
                   "=on can only be used with read-only images");
        ret = -EINVAL;
        goto fail_opts;
    }

    if (file != NULL) {
        bdrv_refresh_filename(blk_bs(file));
        filename = blk_bs(file)->filename;
    } else {
        /*
         * Caution: while qdict_get_try_str() is fine, getting
         * non-string types would require more care.  When @options
         * come from -blockdev or blockdev_add, its members are typed
         * according to the QAPI schema, but when they come from
         * -drive, they're all QString.
         */
        filename = qdict_get_try_str(options, "filename");
    }

    if (drv->bdrv_needs_filename && (!filename || !filename[0])) {
        error_setg(errp, "The '%s' block driver requires a file name",
                   drv->format_name);
        ret = -EINVAL;
        goto fail_opts;
    }

    trace_bdrv_open_common(bs, filename ?: "", bs->open_flags,
                           drv->format_name);

    bs->read_only = !(bs->open_flags & BDRV_O_RDWR);

    /*
     * Whitelist check: a driver that is only whitelisted for read-only use
     * may still be used if the node can be switched to read-only.
     */
    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
        if (!bs->read_only && bdrv_is_whitelisted(drv, true)) {
            ret = bdrv_apply_auto_read_only(bs, NULL, NULL);
        } else {
            ret = -ENOTSUP;
        }
        if (ret < 0) {
            /* The condition is evaluated again to pick the right message */
            error_setg(errp,
                       !bs->read_only && bdrv_is_whitelisted(drv, true)
                       ? "Driver '%s' can only be used for read-only devices"
                       : "Driver '%s' is not whitelisted",
                       drv->format_name);
            goto fail_opts;
        }
    }

    /* bdrv_new() and bdrv_close() make it so */
    assert(qatomic_read(&bs->copy_on_read) == 0);

    if (bs->open_flags & BDRV_O_COPY_ON_READ) {
        if (!bs->read_only) {
            bdrv_enable_copy_on_read(bs);
        } else {
            error_setg(errp, "Can't use copy-on-read on read-only device");
            ret = -EINVAL;
            goto fail_opts;
        }
    }

    /* An absent discard option leaves BDRV_O_UNMAP in open_flags untouched */
    discard = qemu_opt_get(opts, BDRV_OPT_DISCARD);
    if (discard != NULL) {
        if (bdrv_parse_discard_flags(discard, &bs->open_flags) != 0) {
            error_setg(errp, "Invalid discard option");
            ret = -EINVAL;
            goto fail_opts;
        }
    }

    bs->detect_zeroes =
        bdrv_parse_detect_zeroes(opts, bs->open_flags, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        ret = -EINVAL;
        goto fail_opts;
    }

    if (filename != NULL) {
        pstrcpy(bs->filename, sizeof(bs->filename), filename);
    } else {
        bs->filename[0] = '\0';
    }
    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);

    /* Open the image, either directly or using a protocol */
    open_flags = bdrv_open_flags(bs, bs->open_flags);
    node_name = qemu_opt_get(opts, "node-name");

    /* A driver with .bdrv_file_open must not be given a @file */
    assert(!drv->bdrv_file_open || file == NULL);
    ret = bdrv_open_driver(bs, drv, node_name, options, open_flags, errp);
    if (ret < 0) {
        goto fail_opts;
    }

    qemu_opts_del(opts);
    return 0;

fail_opts:
    qemu_opts_del(opts);
    return ret;
}
1846
1847 static QDict *parse_json_filename(const char *filename, Error **errp)
1848 {
1849 QObject *options_obj;
1850 QDict *options;
1851 int ret;
1852
1853 ret = strstart(filename, "json:", &filename);
1854 assert(ret);
1855
1856 options_obj = qobject_from_json(filename, errp);
1857 if (!options_obj) {
1858 error_prepend(errp, "Could not parse the JSON options: ");
1859 return NULL;
1860 }
1861
1862 options = qobject_to(QDict, options_obj);
1863 if (!options) {
1864 qobject_unref(options_obj);
1865 error_setg(errp, "Invalid JSON object given");
1866 return NULL;
1867 }
1868
1869 qdict_flatten(options);
1870
1871 return options;
1872 }
1873
1874 static void parse_json_protocol(QDict *options, const char **pfilename,
1875 Error **errp)
1876 {
1877 QDict *json_options;
1878 Error *local_err = NULL;
1879
1880 /* Parse json: pseudo-protocol */
1881 if (!*pfilename || !g_str_has_prefix(*pfilename, "json:")) {
1882 return;
1883 }
1884
1885 json_options = parse_json_filename(*pfilename, &local_err);
1886 if (local_err) {
1887 error_propagate(errp, local_err);
1888 return;
1889 }
1890
1891 /* Options given in the filename have lower priority than options
1892 * specified directly */
1893 qdict_join(options, json_options, false);
1894 qobject_unref(json_options);
1895 *pfilename = NULL;
1896 }
1897
/*
 * Fills in default options for opening images and converts the legacy
 * filename/flags pair to option QDict entries.
 * The BDRV_O_PROTOCOL flag in *flags will be set or cleared accordingly if a
 * block driver has been specified explicitly.
 *
 * @options:  in/out; may gain the cache/read-only defaults, "filename" and
 *            "driver", and may lose "filename" again after driver-specific
 *            filename parsing
 * @filename: legacy filename, or NULL; only used for protocol-level nodes
 * @flags:    in/out; only BDRV_O_PROTOCOL is changed here
 *
 * Returns 0 on success, -ENOENT for an unknown driver name and -EINVAL for
 * all other failures; errors are reported through @errp.
 */
static int bdrv_fill_options(QDict **options, const char *filename,
                             int *flags, Error **errp)
{
    const char *drvname;
    bool protocol = *flags & BDRV_O_PROTOCOL;
    bool parse_filename = false;
    BlockDriver *drv = NULL;
    Error *local_err = NULL;

    /*
     * Caution: while qdict_get_try_str() is fine, getting non-string
     * types would require more care.  When @options come from
     * -blockdev or blockdev_add, its members are typed according to
     * the QAPI schema, but when they come from -drive, they're all
     * QString.
     */
    drvname = qdict_get_try_str(*options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        if (!drv) {
            error_setg(errp, "Unknown driver '%s'", drvname);
            return -ENOENT;
        }
        /* If the user has explicitly specified the driver, this choice should
         * override the BDRV_O_PROTOCOL flag */
        protocol = drv->bdrv_file_open;
    }

    if (protocol) {
        *flags |= BDRV_O_PROTOCOL;
    } else {
        *flags &= ~BDRV_O_PROTOCOL;
    }

    /* Translate cache options from flags into options */
    update_options_from_flags(*options, *flags);

    /* Fetch the file name from the options QDict if necessary */
    if (protocol && filename) {
        if (!qdict_haskey(*options, "filename")) {
            qdict_put_str(*options, "filename", filename);
            /* Only a legacy filename is subject to driver-specific parsing */
            parse_filename = true;
        } else {
            error_setg(errp, "Can't specify 'file' and 'filename' options at "
                             "the same time");
            return -EINVAL;
        }
    }

    /* Find the right block driver */
    /* See cautionary note on accessing @options above */
    /* From here on @filename refers to the string stored in *options */
    filename = qdict_get_try_str(*options, "filename");

    if (!drvname && protocol) {
        if (filename) {
            drv = bdrv_find_protocol(filename, parse_filename, errp);
            if (!drv) {
                return -EINVAL;
            }

            drvname = drv->format_name;
            qdict_put_str(*options, "driver", drvname);
        } else {
            error_setg(errp, "Must specify either driver or file");
            return -EINVAL;
        }
    }

    /* A protocol node always has its driver determined at this point */
    assert(drv || !protocol);

    /* Driver-specific filename parsing */
    if (drv && drv->bdrv_parse_filename && parse_filename) {
        drv->bdrv_parse_filename(filename, *options, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return -EINVAL;
        }

        /* The filename entry is kept only for drivers that require it */
        if (!drv->bdrv_needs_filename) {
            qdict_del(*options, "filename");
        }
    }

    return 0;
}
1989
/*
 * One element of a BlockReopenQueue, i.e. the reopen request for a single
 * node.
 */
typedef struct BlockReopenQueueEntry {
    /* NOTE(review): presumably set once the entry was prepared - confirm */
    bool prepared;
    /* NOTE(review): presumably set once permissions were checked - confirm */
    bool perms_checked;
    /* state.bs is the node; state.flags its flags after the reopen */
    BDRVReopenState state;
    /* linkage within the queue */
    QTAILQ_ENTRY(BlockReopenQueueEntry) entry;
} BlockReopenQueueEntry;
1996
1997 /*
1998 * Return the flags that @bs will have after the reopens in @q have
1999 * successfully completed. If @q is NULL (or @bs is not contained in @q),
2000 * return the current flags.
2001 */
2002 static int bdrv_reopen_get_flags(BlockReopenQueue *q, BlockDriverState *bs)
2003 {
2004 BlockReopenQueueEntry *entry;
2005
2006 if (q != NULL) {
2007 QTAILQ_FOREACH(entry, q, entry) {
2008 if (entry->state.bs == bs) {
2009 return entry->state.flags;
2010 }
2011 }
2012 }
2013
2014 return bs->open_flags;
2015 }
2016
2017 /* Returns whether the image file can be written to after the reopen queue @q
2018 * has been successfully applied, or right now if @q is NULL. */
2019 static bool bdrv_is_writable_after_reopen(BlockDriverState *bs,
2020 BlockReopenQueue *q)
2021 {
2022 int flags = bdrv_reopen_get_flags(q, bs);
2023
2024 return (flags & (BDRV_O_RDWR | BDRV_O_INACTIVE)) == BDRV_O_RDWR;
2025 }
2026
2027 /*
2028 * Return whether the BDS can be written to. This is not necessarily
2029 * the same as !bdrv_is_read_only(bs), as inactivated images may not
2030 * be written to but do not count as read-only images.
2031 */
2032 bool bdrv_is_writable(BlockDriverState *bs)
2033 {
2034 return bdrv_is_writable_after_reopen(bs, NULL);
2035 }
2036
2037 static char *bdrv_child_user_desc(BdrvChild *c)
2038 {
2039 if (c->klass->get_parent_desc) {
2040 return c->klass->get_parent_desc(c);
2041 }
2042
2043 return g_strdup("another user");
2044 }
2045
2046 static bool bdrv_a_allow_b(BdrvChild *a, BdrvChild *b, Error **errp)
2047 {
2048 g_autofree char *user = NULL;
2049 g_autofree char *perm_names = NULL;
2050
2051 if ((b->perm & a->shared_perm) == b->perm) {
2052 return true;
2053 }
2054
2055 perm_names = bdrv_perm_names(b->perm & ~a->shared_perm);
2056 user = bdrv_child_user_desc(a);
2057 error_setg(errp, "Conflicts with use by %s as '%s', which does not "
2058 "allow '%s' on %s",
2059 user, a->name, perm_names, bdrv_get_node_name(b->bs));
2060
2061 return false;
2062 }
2063
2064 static bool bdrv_parent_perms_conflict(BlockDriverState *bs, Error **errp)
2065 {
2066 BdrvChild *a, *b;
2067
2068 /*
2069 * During the loop we'll look at each pair twice. That's correct because
2070 * bdrv_a_allow_b() is asymmetric and we should check each pair in both
2071 * directions.
2072 */
2073 QLIST_FOREACH(a, &bs->parents, next_parent) {
2074 QLIST_FOREACH(b, &bs->parents, next_parent) {
2075 if (a == b) {
2076 continue;
2077 }
2078
2079 if (!bdrv_a_allow_b(a, b, errp)) {
2080 return true;
2081 }
2082 }
2083 }
2084
2085 return false;
2086 }
2087
2088 static void bdrv_child_perm(BlockDriverState *bs, BlockDriverState *child_bs,
2089 BdrvChild *c, BdrvChildRole role,
2090 BlockReopenQueue *reopen_queue,
2091 uint64_t parent_perm, uint64_t parent_shared,
2092 uint64_t *nperm, uint64_t *nshared)
2093 {
2094 assert(bs->drv && bs->drv->bdrv_child_perm);
2095 bs->drv->bdrv_child_perm(bs, c, role, reopen_queue,
2096 parent_perm, parent_shared,
2097 nperm, nshared);
2098 /* TODO Take force_share from reopen_queue */
2099 if (child_bs && child_bs->force_share) {
2100 *nshared = BLK_PERM_ALL;
2101 }
2102 }
2103
2104 /*
2105 * Adds the whole subtree of @bs (including @bs itself) to the @list (except for
2106 * nodes that are already in the @list, of course) so that final list is
2107 * topologically sorted. Return the result (GSList @list object is updated, so
2108 * don't use old reference after function call).
2109 *
2110 * On function start @list must be already topologically sorted and for any node
2111 * in the @list the whole subtree of the node must be in the @list as well. The
2112 * simplest way to satisfy this criteria: use only result of
2113 * bdrv_topological_dfs() or NULL as @list parameter.
2114 */
2115 static GSList *bdrv_topological_dfs(GSList *list, GHashTable *found,
2116 BlockDriverState *bs)
2117 {
2118 BdrvChild *child;
2119 g_autoptr(GHashTable) local_found = NULL;
2120
2121 if (!found) {
2122 assert(!list);
2123 found = local_found = g_hash_table_new(NULL, NULL);
2124 }
2125
2126 if (g_hash_table_contains(found, bs)) {
2127 return list;
2128 }
2129 g_hash_table_add(found, bs);
2130
2131 QLIST_FOREACH(child, &bs->children, next) {
2132 list = bdrv_topological_dfs(list, found, child->bs);
2133 }
2134
2135 return g_slist_prepend(list, bs);
2136 }
2137
/*
 * Undo record created by bdrv_child_set_perm(): remembers the permissions
 * a child had before they were changed, so that an abort can restore them.
 */
typedef struct BdrvChildSetPermState {
    BdrvChild *child;           /* child whose permissions were changed */
    uint64_t old_perm;          /* child->perm before the change */
    uint64_t old_shared_perm;   /* child->shared_perm before the change */
} BdrvChildSetPermState;
2143
2144 static void bdrv_child_set_perm_abort(void *opaque)
2145 {
2146 BdrvChildSetPermState *s = opaque;
2147
2148 s->child->perm = s->old_perm;
2149 s->child->shared_perm = s->old_shared_perm;
2150 }
2151
/*
 * Transaction action used by bdrv_child_set_perm(): an abort restores the
 * saved permissions, and the BdrvChildSetPermState is freed on clean.
 * There is nothing to do on commit.
 * NOTE(review): "pem" in the name looks like a typo for "perm".
 */
static TransactionActionDrv bdrv_child_set_pem_drv = {
    .abort = bdrv_child_set_perm_abort,
    .clean = g_free,
};
2156
2157 static void bdrv_child_set_perm(BdrvChild *c, uint64_t perm,
2158 uint64_t shared, Transaction *tran)
2159 {
2160 BdrvChildSetPermState *s = g_new(BdrvChildSetPermState, 1);
2161
2162 *s = (BdrvChildSetPermState) {
2163 .child = c,
2164 .old_perm = c->perm,
2165 .old_shared_perm = c->shared_perm,
2166 };
2167
2168 c->perm = perm;
2169 c->shared_perm = shared;
2170
2171 tran_add(tran, &bdrv_child_set_pem_drv, s);
2172 }
2173
2174 static void bdrv_drv_set_perm_commit(void *opaque)
2175 {
2176 BlockDriverState *bs = opaque;
2177 uint64_t cumulative_perms, cumulative_shared_perms;
2178
2179 if (bs->drv->bdrv_set_perm) {
2180 bdrv_get_cumulative_perm(bs, &cumulative_perms,
2181 &cumulative_shared_perms);
2182 bs->drv->bdrv_set_perm(bs, cumulative_perms, cumulative_shared_perms);
2183 }
2184 }
2185
2186 static void bdrv_drv_set_perm_abort(void *opaque)
2187 {
2188 BlockDriverState *bs = opaque;
2189
2190 if (bs->drv->bdrv_abort_perm_update) {
2191 bs->drv->bdrv_abort_perm_update(bs);
2192 }
2193 }
2194
/*
 * Transaction action registered by bdrv_drv_set_perm(). Its opaque value is
 * the BlockDriverState itself, which is why there is no .clean handler.
 */
TransactionActionDrv bdrv_drv_set_perm_drv = {
    .abort = bdrv_drv_set_perm_abort,
    .commit = bdrv_drv_set_perm_commit,
};
2199
2200 static int bdrv_drv_set_perm(BlockDriverState *bs, uint64_t perm,
2201 uint64_t shared_perm, Transaction *tran,
2202 Error **errp)
2203 {
2204 if (!bs->drv) {
2205 return 0;
2206 }
2207
2208 if (bs->drv->bdrv_check_perm) {
2209 int ret = bs->drv->bdrv_check_perm(bs, perm, shared_perm, errp);
2210 if (ret < 0) {
2211 return ret;
2212 }
2213 }
2214
2215 if (tran) {
2216 tran_add(tran, &bdrv_drv_set_perm_drv, bs);
2217 }
2218
2219 return 0;
2220 }
2221
/*
 * Undo record created by bdrv_replace_child().
 */
typedef struct BdrvReplaceChildState {
    BdrvChild *child;           /* child whose ->bs was replaced */
    /* node @child pointed to before; holds the reference moved from @child */
    BlockDriverState *old_bs;
} BdrvReplaceChildState;
2226
2227 static void bdrv_replace_child_commit(void *opaque)
2228 {
2229 BdrvReplaceChildState *s = opaque;
2230
2231 bdrv_unref(s->old_bs);
2232 }
2233
2234 static void bdrv_replace_child_abort(void *opaque)
2235 {
2236 BdrvReplaceChildState *s = opaque;
2237 BlockDriverState *new_bs = s->child->bs;
2238
2239 /* old_bs reference is transparently moved from @s to @s->child */
2240 bdrv_replace_child_noperm(s->child, s->old_bs);
2241 bdrv_unref(new_bs);
2242 }
2243
/*
 * Transaction action used by bdrv_replace_child(): commit drops the old
 * node's reference, abort switches the child back to the old node, and the
 * BdrvReplaceChildState is freed on clean.
 */
static TransactionActionDrv bdrv_replace_child_drv = {
    .commit = bdrv_replace_child_commit,
    .abort = bdrv_replace_child_abort,
    .clean = g_free,
};
2249
2250 /*
2251 * bdrv_replace_child
2252 *
2253 * Note: real unref of old_bs is done only on commit.
2254 */
2255 static void bdrv_replace_child(BdrvChild *child, BlockDriverState *new_bs,
2256 Transaction *tran)
2257 {
2258 BdrvReplaceChildState *s = g_new(BdrvReplaceChildState, 1);
2259 *s = (BdrvReplaceChildState) {
2260 .child = child,
2261 .old_bs = child->bs,
2262 };
2263 tran_add(tran, &bdrv_replace_child_drv, s);
2264
2265 if (new_bs) {
2266 bdrv_ref(new_bs);
2267 }
2268 bdrv_replace_child_noperm(child, new_bs);
2269 /* old_bs reference is transparently moved from @child to @s */
2270 }
2271
2272 /*
2273 * Refresh permissions in @bs subtree. The function is intended to be called
2274 * after some graph modification that was done without permission update.
2275 */
2276 static int bdrv_node_refresh_perm(BlockDriverState *bs, BlockReopenQueue *q,
2277 Transaction *tran, Error **errp)
2278 {
2279 BlockDriver *drv = bs->drv;
2280 BdrvChild *c;
2281 int ret;
2282 uint64_t cumulative_perms, cumulative_shared_perms;
2283
2284 bdrv_get_cumulative_perm(bs, &cumulative_perms, &cumulative_shared_perms);
2285
2286 /* Write permissions never work with read-only images */
2287 if ((cumulative_perms & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) &&
2288 !bdrv_is_writable_after_reopen(bs, q))
2289 {
2290 if (!bdrv_is_writable_after_reopen(bs, NULL)) {
2291 error_setg(errp, "Block node is read-only");
2292 } else {
2293 error_setg(errp, "Read-only block node '%s' cannot support "
2294 "read-write users", bdrv_get_node_name(bs));
2295 }
2296
2297 return -EPERM;
2298 }
2299
2300 /*
2301 * Unaligned requests will automatically be aligned to bl.request_alignment
2302 * and without RESIZE we can't extend requests to write to space beyond the
2303 * end of the image, so it's required that the image size is aligned.
2304 */
2305 if ((cumulative_perms & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) &&
2306 !(cumulative_perms & BLK_PERM_RESIZE))
2307 {
2308 if ((bs->total_sectors * BDRV_SECTOR_SIZE) % bs->bl.request_alignment) {
2309 error_setg(errp, "Cannot get 'write' permission without 'resize': "
2310 "Image size is not a multiple of request "
2311 "alignment");
2312 return -EPERM;
2313 }
2314 }
2315
2316 /* Check this node */
2317 if (!drv) {
2318 return 0;
2319 }
2320
2321 ret = bdrv_drv_set_perm(bs, cumulative_perms, cumulative_shared_perms, tran,
2322 errp);
2323 if (ret < 0) {
2324 return ret;
2325 }
2326
2327 /* Drivers that never have children can omit .bdrv_child_perm() */
2328 if (!drv->bdrv_child_perm) {
2329 assert(QLIST_EMPTY(&bs->children));
2330 return 0;
2331 }
2332
2333 /* Check all children */
2334 QLIST_FOREACH(c, &bs->children, next) {
2335 uint64_t cur_perm, cur_shared;
2336
2337 bdrv_child_perm(bs, c->bs, c, c->role, q,
2338 cumulative_perms, cumulative_shared_perms,
2339 &cur_perm, &cur_shared);
2340 bdrv_child_set_perm(c, cur_perm, cur_shared, tran);
2341 }
2342
2343 return 0;
2344 }
2345
2346 static int bdrv_list_refresh_perms(GSList *list, BlockReopenQueue *q,
2347 Transaction *tran, Error **errp)
2348 {
2349 int ret;
2350 BlockDriverState *bs;
2351
2352 for ( ; list; list = list->next) {
2353 bs = list->data;
2354
2355 if (bdrv_parent_perms_conflict(bs, errp)) {
2356 return -EINVAL;
2357 }
2358
2359 ret = bdrv_node_refresh_perm(bs, q, tran, errp);
2360 if (ret < 0) {
2361 return ret;
2362 }
2363 }
2364
2365 return 0;
2366 }
2367
2368 void bdrv_get_cumulative_perm(BlockDriverState *bs, uint64_t *perm,
2369 uint64_t *shared_perm)
2370 {
2371 BdrvChild *c;
2372 uint64_t cumulative_perms = 0;
2373 uint64_t cumulative_shared_perms = BLK_PERM_ALL;
2374
2375 QLIST_FOREACH(c, &bs->parents, next_parent) {
2376 cumulative_perms |= c->perm;
2377 cumulative_shared_perms &= c->shared_perm;
2378 }
2379
2380 *perm = cumulative_perms;
2381 *shared_perm = cumulative_shared_perms;
2382 }
2383
2384 char *bdrv_perm_names(uint64_t perm)
2385 {
2386 struct perm_name {
2387 uint64_t perm;
2388 const char *name;
2389 } permissions[] = {
2390 { BLK_PERM_CONSISTENT_READ, "consistent read" },
2391 { BLK_PERM_WRITE, "write" },
2392 { BLK_PERM_WRITE_UNCHANGED, "write unchanged" },
2393 { BLK_PERM_RESIZE, "resize" },
2394 { BLK_PERM_GRAPH_MOD, "change children" },
2395 { 0, NULL }
2396 };
2397
2398 GString *result = g_string_sized_new(30);
2399 struct perm_name *p;
2400
2401 for (p = permissions; p->name; p++) {
2402 if (perm & p->perm) {
2403 if (result->len > 0) {
2404 g_string_append(result, ", ");
2405 }
2406 g_string_append(result, p->name);
2407 }
2408 }
2409
2410 return g_string_free(result, FALSE);
2411 }
2412
2413
2414 static int bdrv_refresh_perms(BlockDriverState *bs, Error **errp)
2415 {
2416 int ret;
2417 Transaction *tran = tran_new();
2418 g_autoptr(GSList) list = bdrv_topological_dfs(NULL, NULL, bs);
2419
2420 ret = bdrv_list_refresh_perms(list, NULL, tran, errp);
2421 tran_finalize(tran, ret);
2422
2423 return ret;
2424 }
2425
2426 int bdrv_child_try_set_perm(BdrvChild *c, uint64_t perm, uint64_t shared,
2427 Error **errp)
2428 {
2429 Error *local_err = NULL;
2430 Transaction *tran = tran_new();
2431 int ret;
2432
2433 bdrv_child_set_perm(c, perm, shared, tran);
2434
2435 ret = bdrv_refresh_perms(c->bs, &local_err);
2436
2437 tran_finalize(tran, ret);
2438
2439 if (ret < 0) {
2440 if ((perm & ~c->perm) || (c->shared_perm & ~shared)) {
2441 /* tighten permissions */
2442 error_propagate(errp, local_err);
2443 } else {
2444 /*
2445 * Our caller may intend to only loosen restrictions and
2446 * does not expect this function to fail. Errors are not
2447 * fatal in such a case, so we can just hide them from our
2448 * caller.
2449 */
2450 error_free(local_err);
2451 ret = 0;
2452 }
2453 }
2454
2455 return ret;
2456 }
2457
2458 int bdrv_child_refresh_perms(BlockDriverState *bs, BdrvChild *c, Error **errp)
2459 {
2460 uint64_t parent_perms, parent_shared;
2461 uint64_t perms, shared;
2462
2463 bdrv_get_cumulative_perm(bs, &parent_perms, &parent_shared);
2464 bdrv_child_perm(bs, c->bs, c, c->role, NULL,
2465 parent_perms, parent_shared, &perms, &shared);
2466
2467 return bdrv_child_try_set_perm(c, perms, shared, errp);
2468 }
2469
2470 /*
2471 * Default implementation for .bdrv_child_perm() for block filters:
2472 * Forward CONSISTENT_READ, WRITE, WRITE_UNCHANGED, and RESIZE to the
2473 * filtered child.
2474 */
2475 static void bdrv_filter_default_perms(BlockDriverState *bs, BdrvChild *c,
2476 BdrvChildRole role,
2477 BlockReopenQueue *reopen_queue,
2478 uint64_t perm, uint64_t shared,
2479 uint64_t *nperm, uint64_t *nshared)
2480 {
2481 *nperm = perm & DEFAULT_PERM_PASSTHROUGH;
2482 *nshared = (shared & DEFAULT_PERM_PASSTHROUGH) | DEFAULT_PERM_UNCHANGED;
2483 }
2484
2485 static void bdrv_default_perms_for_cow(BlockDriverState *bs, BdrvChild *c,
2486 BdrvChildRole role,
2487 BlockReopenQueue *reopen_queue,
2488 uint64_t perm, uint64_t shared,
2489 uint64_t *nperm, uint64_t *nshared)
2490 {
2491 assert(role & BDRV_CHILD_COW);
2492
2493 /*
2494 * We want consistent read from backing files if the parent needs it.
2495 * No other operations are performed on backing files.
2496 */
2497 perm &= BLK_PERM_CONSISTENT_READ;
2498
2499 /*
2500 * If the parent can deal with changing data, we're okay with a
2501 * writable and resizable backing file.
2502 * TODO Require !(perm & BLK_PERM_CONSISTENT_READ), too?
2503 */
2504 if (shared & BLK_PERM_WRITE) {
2505 shared = BLK_PERM_WRITE | BLK_PERM_RESIZE;
2506 } else {
2507 shared = 0;
2508 }
2509
2510 shared |= BLK_PERM_CONSISTENT_READ | BLK_PERM_GRAPH_MOD |
2511 BLK_PERM_WRITE_UNCHANGED;
2512
2513 if (bs->open_flags & BDRV_O_INACTIVE) {
2514 shared |= BLK_PERM_WRITE | BLK_PERM_RESIZE;
2515 }
2516
2517 *nperm = perm;
2518 *nshared = shared;
2519 }
2520
2521 static void bdrv_default_perms_for_storage(BlockDriverState *bs, BdrvChild *c,
2522 BdrvChildRole role,
2523 BlockReopenQueue *reopen_queue,
2524 uint64_t perm, uint64_t shared,
2525 uint64_t *nperm, uint64_t *nshared)
2526 {
2527 int flags;
2528
2529 assert(role & (BDRV_CHILD_METADATA | BDRV_CHILD_DATA));
2530
2531 flags = bdrv_reopen_get_flags(reopen_queue, bs);
2532
2533 /*
2534 * Apart from the modifications below, the same permissions are
2535 * forwarded and left alone as for filters
2536 */
2537 bdrv_filter_default_perms(bs, c, role, reopen_queue,
2538 perm, shared, &perm, &shared);
2539
2540 if (role & BDRV_CHILD_METADATA) {
2541 /* Format drivers may touch metadata even if the guest doesn't write */
2542 if (bdrv_is_writable_after_reopen(bs, reopen_queue)) {
2543 perm |= BLK_PERM_WRITE | BLK_PERM_RESIZE;
2544 }
2545
2546 /*
2547 * bs->file always needs to be consistent because of the
2548 * metadata. We can never allow other users to resize or write
2549 * to it.
2550 */
2551 if (!(flags & BDRV_O_NO_IO)) {
2552 perm |= BLK_PERM_CONSISTENT_READ;
2553 }
2554 shared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
2555 }
2556
2557 if (role & BDRV_CHILD_DATA) {
2558 /*
2559 * Technically, everything in this block is a subset of the
2560 * BDRV_CHILD_METADATA path taken above, and so this could
2561 * be an "else if" branch. However, that is not obvious, and
2562 * this function is not performance critical, therefore we let
2563 * this be an independent "if".
2564 */
2565
2566 /*
2567 * We cannot allow other users to resize the file because the
2568 * format driver might have some assumptions about the size
2569 * (e.g. because it is stored in metadata, or because the file
2570 * is split into fixed-size data files).
2571 */
2572 shared &= ~BLK_PERM_RESIZE;
2573
2574 /*
2575 * WRITE_UNCHANGED often cannot be performed as such on the
2576 * data file. For example, the qcow2 driver may still need to
2577 * write copied clusters on copy-on-read.
2578 */
2579 if (perm & BLK_PERM_WRITE_UNCHANGED) {
2580 perm |= BLK_PERM_WRITE;
2581 }
2582
2583 /*
2584 * If the data file is written to, the format driver may
2585 * expect to be able to resize it by writing beyond the EOF.
2586 */
2587 if (perm & BLK_PERM_WRITE) {
2588 perm |= BLK_PERM_RESIZE;
2589 }
2590 }
2591
2592 if (bs->open_flags & BDRV_O_INACTIVE) {
2593 shared |= BLK_PERM_WRITE | BLK_PERM_RESIZE;
2594 }
2595
2596 *nperm = perm;
2597 *nshared = shared;
2598 }
2599
2600 void bdrv_default_perms(BlockDriverState *bs, BdrvChild *c,
2601 BdrvChildRole role, BlockReopenQueue *reopen_queue,
2602 uint64_t perm, uint64_t shared,
2603 uint64_t *nperm, uint64_t *nshared)
2604 {
2605 if (role & BDRV_CHILD_FILTERED) {
2606 assert(!(role & (BDRV_CHILD_DATA | BDRV_CHILD_METADATA |
2607 BDRV_CHILD_COW)));
2608 bdrv_filter_default_perms(bs, c, role, reopen_queue,
2609 perm, shared, nperm, nshared);
2610 } else if (role & BDRV_CHILD_COW) {
2611 assert(!(role & (BDRV_CHILD_DATA | BDRV_CHILD_METADATA)));
2612 bdrv_default_perms_for_cow(bs, c, role, reopen_queue,
2613 perm, shared, nperm, nshared);
2614 } else if (role & (BDRV_CHILD_METADATA | BDRV_CHILD_DATA)) {
2615 bdrv_default_perms_for_storage(bs, c, role, reopen_queue,
2616 perm, shared, nperm, nshared);
2617 } else {
2618 g_assert_not_reached();
2619 }
2620 }
2621
2622 uint64_t bdrv_qapi_perm_to_blk_perm(BlockPermission qapi_perm)
2623 {
2624 static const uint64_t permissions[] = {
2625 [BLOCK_PERMISSION_CONSISTENT_READ] = BLK_PERM_CONSISTENT_READ,
2626 [BLOCK_PERMISSION_WRITE] = BLK_PERM_WRITE,
2627 [BLOCK_PERMISSION_WRITE_UNCHANGED] = BLK_PERM_WRITE_UNCHANGED,
2628 [BLOCK_PERMISSION_RESIZE] = BLK_PERM_RESIZE,
2629 [BLOCK_PERMISSION_GRAPH_MOD] = BLK_PERM_GRAPH_MOD,
2630 };
2631
2632 QEMU_BUILD_BUG_ON(ARRAY_SIZE(permissions) != BLOCK_PERMISSION__MAX);
2633 QEMU_BUILD_BUG_ON(1UL << ARRAY_SIZE(permissions) != BLK_PERM_ALL + 1);
2634
2635 assert(qapi_perm < BLOCK_PERMISSION__MAX);
2636
2637 return permissions[qapi_perm];
2638 }
2639
2640 static void bdrv_replace_child_noperm(BdrvChild *child,
2641 BlockDriverState *new_bs)
2642 {
2643 BlockDriverState *old_bs = child->bs;
2644 int new_bs_quiesce_counter;
2645 int drain_saldo;
2646
2647 assert(!child->frozen);
2648
2649 if (old_bs && new_bs) {
2650 assert(bdrv_get_aio_context(old_bs) == bdrv_get_aio_context(new_bs));
2651 }
2652
2653 new_bs_quiesce_counter = (new_bs ? new_bs->quiesce_counter : 0);
2654 drain_saldo = new_bs_quiesce_counter - child->parent_quiesce_counter;
2655
2656 /*
2657 * If the new child node is drained but the old one was not, flush
2658 * all outstanding requests to the old child node.
2659 */
2660 while (drain_saldo > 0 && child->klass->drained_begin) {
2661 bdrv_parent_drained_begin_single(child, true);
2662 drain_saldo--;
2663 }
2664
2665 if (old_bs) {
2666 /* Detach first so that the recursive drain sections coming from @child
2667 * are already gone and we only end the drain sections that came from
2668 * elsewhere. */
2669 if (child->klass->detach) {
2670 child->klass->detach(child);
2671 }
2672 QLIST_REMOVE(child, next_parent);
2673 }
2674
2675 child->bs = new_bs;
2676
2677 if (new_bs) {
2678 QLIST_INSERT_HEAD(&new_bs->parents, child, next_parent);
2679
2680 /*
2681 * Detaching the old node may have led to the new node's
2682 * quiesce_counter having been decreased. Not a problem, we
2683 * just need to recognize this here and then invoke
2684 * drained_end appropriately more often.
2685 */
2686 assert(new_bs->quiesce_counter <= new_bs_quiesce_counter);
2687 drain_saldo += new_bs->quiesce_counter - new_bs_quiesce_counter;
2688
2689 /* Attach only after starting new drained sections, so that recursive
2690 * drain sections coming from @child don't get an extra .drained_begin
2691 * callback. */
2692 if (child->klass->attach) {
2693 child->klass->attach(child);
2694 }
2695 }
2696
2697 /*
2698 * If the old child node was drained but the new one is not, allow
2699 * requests to come in only after the new node has been attached.
2700 */
2701 while (drain_saldo < 0 && child->klass->drained_end) {
2702 bdrv_parent_drained_end_single(child);
2703 drain_saldo++;
2704 }
2705 }
2706
2707 static void bdrv_child_free(void *opaque)
2708 {
2709 BdrvChild *c = opaque;
2710
2711 g_free(c->name);
2712 g_free(c);
2713 }
2714
2715 static void bdrv_remove_empty_child(BdrvChild *child)
2716 {
2717 assert(!child->bs);
2718 QLIST_SAFE_REMOVE(child, next);
2719 bdrv_child_free(child);
2720 }
2721
2722 typedef struct BdrvAttachChildCommonState {
2723 BdrvChild **child;
2724 AioContext *old_parent_ctx;
2725 AioContext *old_child_ctx;
2726 } BdrvAttachChildCommonState;
2727
2728 static void bdrv_attach_child_common_abort(void *opaque)
2729 {
2730 BdrvAttachChildCommonState *s = opaque;
2731 BdrvChild *child = *s->child;
2732 BlockDriverState *bs = child->bs;
2733
2734 bdrv_replace_child_noperm(child, NULL);
2735
2736 if (bdrv_get_aio_context(bs) != s->old_child_ctx) {
2737 bdrv_try_set_aio_context(bs, s->old_child_ctx, &error_abort);
2738 }
2739
2740 if (bdrv_child_get_parent_aio_context(child) != s->old_parent_ctx) {
2741 GSList *ignore = g_slist_prepend(NULL, child);
2742
2743 child->klass->can_set_aio_ctx(child, s->old_parent_ctx, &ignore,
2744 &error_abort);
2745 g_slist_free(ignore);
2746 ignore = g_slist_prepend(NULL, child);
2747 child->klass->set_aio_ctx(child, s->old_parent_ctx, &ignore);
2748
2749 g_slist_free(ignore);
2750 }
2751
2752 bdrv_unref(bs);
2753 bdrv_remove_empty_child(child);
2754 *s->child = NULL;
2755 }
2756
2757 static TransactionActionDrv bdrv_attach_child_common_drv = {
2758 .abort = bdrv_attach_child_common_abort,
2759 .clean = g_free,
2760 };
2761
2762 /*
2763 * Common part of attaching bdrv child to bs or to blk or to job
2764 */
2765 static int bdrv_attach_child_common(BlockDriverState *child_bs,
2766 const char *child_name,
2767 const BdrvChildClass *child_class,
2768 BdrvChildRole child_role,
2769 uint64_t perm, uint64_t shared_perm,
2770 void *opaque, BdrvChild **child,
2771 Transaction *tran, Error **errp)
2772 {
2773 BdrvChild *new_child;
2774 AioContext *parent_ctx;
2775 AioContext *child_ctx = bdrv_get_aio_context(child_bs);
2776
2777 assert(child);
2778 assert(*child == NULL);
2779
2780 new_child = g_new(BdrvChild, 1);
2781 *new_child = (BdrvChild) {
2782 .bs = NULL,
2783 .name = g_strdup(child_name),
2784 .klass = child_class,
2785 .role = child_role,
2786 .perm = perm,
2787 .shared_perm = shared_perm,
2788 .opaque = opaque,
2789 };
2790
2791 /*
2792 * If the AioContexts don't match, first try to move the subtree of
2793 * child_bs into the AioContext of the new parent. If this doesn't work,
2794 * try moving the parent into the AioContext of child_bs instead.
2795 */
2796 parent_ctx = bdrv_child_get_parent_aio_context(new_child);
2797 if (child_ctx != parent_ctx) {
2798 Error *local_err = NULL;
2799 int ret = bdrv_try_set_aio_context(child_bs, parent_ctx, &local_err);
2800
2801 if (ret < 0 && child_class->can_set_aio_ctx) {
2802 GSList *ignore = g_slist_prepend(NULL, new_child);
2803 if (child_class->can_set_aio_ctx(new_child, child_ctx, &ignore,
2804 NULL))
2805 {
2806 error_free(local_err);
2807 ret = 0;
2808 g_slist_free(ignore);
2809 ignore = g_slist_prepend(NULL, new_child);
2810 child_class->set_aio_ctx(new_child, child_ctx, &ignore);
2811 }
2812 g_slist_free(ignore);
2813 }
2814
2815 if (ret < 0) {
2816 error_propagate(errp, local_err);
2817 bdrv_remove_empty_child(new_child);
2818 return ret;
2819 }
2820 }
2821
2822 bdrv_ref(child_bs);
2823 bdrv_replace_child_noperm(new_child, child_bs);
2824
2825 *child = new_child;
2826
2827 BdrvAttachChildCommonState *s = g_new(BdrvAttachChildCommonState, 1);
2828 *s = (BdrvAttachChildCommonState) {
2829 .child = child,
2830 .old_parent_ctx = parent_ctx,
2831 .old_child_ctx = child_ctx,
2832 };
2833 tran_add(tran, &bdrv_attach_child_common_drv, s);
2834
2835 return 0;
2836 }
2837
2838 static int bdrv_attach_child_noperm(BlockDriverState *parent_bs,
2839 BlockDriverState *child_bs,
2840 const char *child_name,
2841 const BdrvChildClass *child_class,
2842 BdrvChildRole child_role,
2843 BdrvChild **child,
2844 Transaction *tran,
2845 Error **errp)
2846 {
2847 int ret;
2848 uint64_t perm, shared_perm;
2849
2850 assert(parent_bs->drv);
2851
2852 bdrv_get_cumulative_perm(parent_bs, &perm, &shared_perm);
2853 bdrv_child_perm(parent_bs, child_bs, NULL, child_role, NULL,
2854 perm, shared_perm, &perm, &shared_perm);
2855
2856 ret = bdrv_attach_child_common(child_bs, child_name, child_class,
2857 child_role, perm, shared_perm, parent_bs,
2858 child, tran, errp);
2859 if (ret < 0) {
2860 return ret;
2861 }
2862
2863 QLIST_INSERT_HEAD(&parent_bs->children, *child, next);
2864 /*
2865 * child is removed in bdrv_attach_child_common_abort(), so don't care to
2866 * abort this change separately.
2867 */
2868
2869 return 0;
2870 }
2871
2872 static void bdrv_detach_child(BdrvChild *child)
2873 {
2874 BlockDriverState *old_bs = child->bs;
2875
2876 bdrv_replace_child_noperm(child, NULL);
2877 bdrv_remove_empty_child(child);
2878
2879 if (old_bs) {
2880 /*
2881 * Update permissions for old node. We're just taking a parent away, so
2882 * we're loosening restrictions. Errors of permission update are not
2883 * fatal in this case, ignore them.
2884 */
2885 bdrv_refresh_perms(old_bs, NULL);
2886
2887 /*
2888 * When the parent requiring a non-default AioContext is removed, the
2889 * node moves back to the main AioContext
2890 */
2891 bdrv_try_set_aio_context(old_bs, qemu_get_aio_context(), NULL);
2892 }
2893 }
2894
2895 /*
2896 * This function steals the reference to child_bs from the caller.
2897 * That reference is later dropped by bdrv_root_unref_child().
2898 *
2899 * On failure NULL is returned, errp is set and the reference to
2900 * child_bs is also dropped.
2901 *
2902 * The caller must hold the AioContext lock @child_bs, but not that of @ctx
2903 * (unless @child_bs is already in @ctx).
2904 */
2905 BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs,
2906 const char *child_name,
2907 const BdrvChildClass *child_class,
2908 BdrvChildRole child_role,
2909 uint64_t perm, uint64_t shared_perm,
2910 void *opaque, Error **errp)
2911 {
2912 int ret;
2913 BdrvChild *child = NULL;
2914 Transaction *tran = tran_new();
2915
2916 ret = bdrv_attach_child_common(child_bs, child_name, child_class,
2917 child_role, perm, shared_perm, opaque,
2918 &child, tran, errp);
2919 if (ret < 0) {
2920 bdrv_unref(child_bs);
2921 return NULL;
2922 }
2923
2924 ret = bdrv_refresh_perms(child_bs, errp);
2925 tran_finalize(tran, ret);
2926
2927 bdrv_unref(child_bs);
2928 return child;
2929 }
2930
2931 /*
2932 * This function transfers the reference to child_bs from the caller
2933 * to parent_bs. That reference is later dropped by parent_bs on
2934 * bdrv_close() or if someone calls bdrv_unref_child().
2935 *
2936 * On failure NULL is returned, errp is set and the reference to
2937 * child_bs is also dropped.
2938 *
2939 * If @parent_bs and @child_bs are in different AioContexts, the caller must
2940 * hold the AioContext lock for @child_bs, but not for @parent_bs.
2941 */
2942 BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
2943 BlockDriverState *child_bs,
2944 const char *child_name,
2945 const BdrvChildClass *child_class,
2946 BdrvChildRole child_role,
2947 Error **errp)
2948 {
2949 int ret;
2950 BdrvChild *child = NULL;
2951 Transaction *tran = tran_new();
2952
2953 ret = bdrv_attach_child_noperm(parent_bs, child_bs, child_name, child_class,
2954 child_role, &child, tran, errp);
2955 if (ret < 0) {
2956 goto out;
2957 }
2958
2959 ret = bdrv_refresh_perms(parent_bs, errp);
2960 if (ret < 0) {
2961 goto out;
2962 }
2963
2964 out:
2965 tran_finalize(tran, ret);
2966
2967 bdrv_unref(child_bs);
2968
2969 return child;
2970 }
2971
2972 /* Callers must ensure that child->frozen is false. */
2973 void bdrv_root_unref_child(BdrvChild *child)
2974 {
2975 BlockDriverState *child_bs;
2976
2977 child_bs = child->bs;
2978 bdrv_detach_child(child);
2979 bdrv_unref(child_bs);
2980 }
2981
2982 typedef struct BdrvSetInheritsFrom {
2983 BlockDriverState *bs;
2984 BlockDriverState *old_inherits_from;
2985 } BdrvSetInheritsFrom;
2986
2987 static void bdrv_set_inherits_from_abort(void *opaque)
2988 {
2989 BdrvSetInheritsFrom *s = opaque;
2990
2991 s->bs->inherits_from = s->old_inherits_from;
2992 }
2993
2994 static TransactionActionDrv bdrv_set_inherits_from_drv = {
2995 .abort = bdrv_set_inherits_from_abort,
2996 .clean = g_free,
2997 };
2998
2999 /* @tran is allowed to be NULL. In this case no rollback is possible */
3000 static void bdrv_set_inherits_from(BlockDriverState *bs,
3001 BlockDriverState *new_inherits_from,
3002 Transaction *tran)
3003 {
3004 if (tran) {
3005 BdrvSetInheritsFrom *s = g_new(BdrvSetInheritsFrom, 1);
3006
3007 *s = (BdrvSetInheritsFrom) {
3008 .bs = bs,
3009 .old_inherits_from = bs->inherits_from,
3010 };
3011
3012 tran_add(tran, &bdrv_set_inherits_from_drv, s);
3013 }
3014
3015 bs->inherits_from = new_inherits_from;
3016 }
3017
3018 /**
3019 * Clear all inherits_from pointers from children and grandchildren of
3020 * @root that point to @root, where necessary.
3021 * @tran is allowed to be NULL. In this case no rollback is possible
3022 */
3023 static void bdrv_unset_inherits_from(BlockDriverState *root, BdrvChild *child,
3024 Transaction *tran)
3025 {
3026 BdrvChild *c;
3027
3028 if (child->bs->inherits_from == root) {
3029 /*
3030 * Remove inherits_from only when the last reference between root and
3031 * child->bs goes away.
3032 */
3033 QLIST_FOREACH(c, &root->children, next) {
3034 if (c != child && c->bs == child->bs) {
3035 break;
3036 }
3037 }
3038 if (c == NULL) {
3039 bdrv_set_inherits_from(child->bs, NULL, tran);
3040 }
3041 }
3042
3043 QLIST_FOREACH(c, &child->bs->children, next) {
3044 bdrv_unset_inherits_from(root, c, tran);
3045 }
3046 }
3047
3048 /* Callers must ensure that child->frozen is false. */
3049 void bdrv_unref_child(BlockDriverState *parent, BdrvChild *child)
3050 {
3051 if (child == NULL) {
3052 return;
3053 }
3054
3055 bdrv_unset_inherits_from(parent, child, NULL);
3056 bdrv_root_unref_child(child);
3057 }
3058
3059
3060 static void bdrv_parent_cb_change_media(BlockDriverState *bs, bool load)
3061 {
3062 BdrvChild *c;
3063 QLIST_FOREACH(c, &bs->parents, next_parent) {
3064 if (c->klass->change_media) {
3065 c->klass->change_media(c, load);
3066 }
3067 }
3068 }
3069
3070 /* Return true if you can reach parent going through child->inherits_from
3071 * recursively. If parent or child are NULL, return false */
3072 static bool bdrv_inherits_from_recursive(BlockDriverState *child,
3073 BlockDriverState *parent)
3074 {
3075 while (child && child != parent) {
3076 child = child->inherits_from;
3077 }
3078
3079 return child != NULL;
3080 }
3081
3082 /*
3083 * Return the BdrvChildRole for @bs's backing child. bs->backing is
3084 * mostly used for COW backing children (role = COW), but also for
3085 * filtered children (role = FILTERED | PRIMARY).
3086 */
3087 static BdrvChildRole bdrv_backing_role(BlockDriverState *bs)
3088 {
3089 if (bs->drv && bs->drv->is_filter) {
3090 return BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY;
3091 } else {
3092 return BDRV_CHILD_COW;
3093 }
3094 }
3095
3096 /*
3097 * Sets the bs->backing link of a BDS. A new reference is created; callers
3098 * which don't need their own reference any more must call bdrv_unref().
3099 */
3100 static int bdrv_set_backing_noperm(BlockDriverState *bs,
3101 BlockDriverState *backing_hd,
3102 Transaction *tran, Error **errp)
3103 {
3104 int ret = 0;
3105 bool update_inherits_from = bdrv_chain_contains(bs, backing_hd) &&
3106 bdrv_inherits_from_recursive(backing_hd, bs);
3107
3108 if (bdrv_is_backing_chain_frozen(bs, child_bs(bs->backing), errp)) {
3109 return -EPERM;
3110 }
3111
3112 if (bs->backing) {
3113 /* Cannot be frozen, we checked that above */
3114 bdrv_unset_inherits_from(bs, bs->backing, tran);
3115 bdrv_remove_filter_or_cow_child(bs, tran);
3116 }
3117
3118 if (!backing_hd) {
3119 goto out;
3120 }
3121
3122 ret = bdrv_attach_child_noperm(bs, backing_hd, "backing",
3123 &child_of_bds, bdrv_backing_role(bs),
3124 &bs->backing, tran, errp);
3125 if (ret < 0) {
3126 return ret;
3127 }
3128
3129
3130 /*
3131 * If backing_hd was already part of bs's backing chain, and
3132 * inherits_from pointed recursively to bs then let's update it to
3133 * point directly to bs (else it will become NULL).
3134 */
3135 if (update_inherits_from) {
3136 bdrv_set_inherits_from(backing_hd, bs, tran);
3137 }
3138
3139 out:
3140 bdrv_refresh_limits(bs, tran, NULL);
3141
3142 return 0;
3143 }
3144
3145 int bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd,
3146 Error **errp)
3147 {
3148 int ret;
3149 Transaction *tran = tran_new();
3150
3151 ret = bdrv_set_backing_noperm(bs, backing_hd, tran, errp);
3152 if (ret < 0) {
3153 goto out;
3154 }
3155
3156 ret = bdrv_refresh_perms(bs, errp);
3157 out:
3158 tran_finalize(tran, ret);
3159
3160 return ret;
3161 }
3162
3163 /*
3164 * Opens the backing file for a BlockDriverState if not yet open
3165 *
3166 * bdref_key specifies the key for the image's BlockdevRef in the options QDict.
3167 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
3168 * itself, all options starting with "${bdref_key}." are considered part of the
3169 * BlockdevRef.
3170 *
3171 * TODO Can this be unified with bdrv_open_image()?
3172 */
3173 int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options,
3174 const char *bdref_key, Error **errp)
3175 {
3176 char *backing_filename = NULL;
3177 char *bdref_key_dot;
3178 const char *reference = NULL;
3179 int ret = 0;
3180 bool implicit_backing = false;
3181 BlockDriverState *backing_hd;
3182 QDict *options;
3183 QDict *tmp_parent_options = NULL;
3184 Error *local_err = NULL;
3185
3186 if (bs->backing != NULL) {
3187 goto free_exit;
3188 }
3189
3190 /* NULL means an empty set of options */
3191 if (parent_options == NULL) {
3192 tmp_parent_options = qdict_new();
3193 parent_options = tmp_parent_options;
3194 }
3195
3196 bs->open_flags &= ~BDRV_O_NO_BACKING;
3197
3198 bdref_key_dot = g_strdup_printf("%s.", bdref_key);
3199 qdict_extract_subqdict(parent_options, &options, bdref_key_dot);
3200 g_free(bdref_key_dot);
3201
3202 /*
3203 * Caution: while qdict_get_try_str() is fine, getting non-string
3204 * types would require more care. When @parent_options come from
3205 * -blockdev or blockdev_add, its members are typed according to
3206 * the QAPI schema, but when they come from -drive, they're all
3207 * QString.
3208 */
3209 reference = qdict_get_try_str(parent_options, bdref_key);
3210 if (reference || qdict_haskey(options, "file.filename")) {
3211 /* keep backing_filename NULL */
3212 } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) {
3213 qobject_unref(options);
3214 goto free_exit;
3215 } else {
3216 if (qdict_size(options) == 0) {
3217 /* If the user specifies options that do not modify the
3218 * backing file's behavior, we might still consider it the
3219 * implicit backing file. But it's easier this way, and
3220 * just specifying some of the backing BDS's options is
3221 * only possible with -drive anyway (otherwise the QAPI
3222 * schema forces the user to specify everything). */
3223 implicit_backing = !strcmp(bs->auto_backing_file, bs->backing_file);
3224 }
3225
3226 backing_filename = bdrv_get_full_backing_filename(bs, &local_err);
3227 if (local_err) {
3228 ret = -EINVAL;
3229 error_propagate(errp, local_err);
3230 qobject_unref(options);
3231 goto free_exit;
3232 }
3233 }
3234
3235 if (!bs->drv || !bs->drv->supports_backing) {
3236 ret = -EINVAL;
3237 error_setg(errp, "Driver doesn't support backing files");
3238 qobject_unref(options);
3239 goto free_exit;
3240 }
3241
3242 if (!reference &&
3243 bs->backing_format[0] != '\0' && !qdict_haskey(options, "driver")) {
3244 qdict_put_str(options, "driver", bs->backing_format);
3245 }
3246
3247 backing_hd = bdrv_open_inherit(backing_filename, reference, options, 0, bs,
3248 &child_of_bds, bdrv_backing_role(bs), errp);
3249 if (!backing_hd) {
3250 bs->open_flags |= BDRV_O_NO_BACKING;
3251 error_prepend(errp, "Could not open backing file: ");
3252 ret = -EINVAL;
3253 goto free_exit;
3254 }
3255
3256 if (implicit_backing) {
3257 bdrv_refresh_filename(backing_hd);
3258 pstrcpy(bs->auto_backing_file, sizeof(bs->auto_backing_file),
3259 backing_hd->filename);
3260 }
3261
3262 /* Hook up the backing file link; drop our reference, bs owns the
3263 * backing_hd reference now */
3264 ret = bdrv_set_backing_hd(bs, backing_hd, errp);
3265 bdrv_unref(backing_hd);
3266 if (ret < 0) {
3267 goto free_exit;
3268 }
3269
3270 qdict_del(parent_options, bdref_key);
3271
3272 free_exit:
3273 g_free(backing_filename);
3274 qobject_unref(tmp_parent_options);
3275 return ret;
3276 }
3277
3278 static BlockDriverState *
3279 bdrv_open_child_bs(const char *filename, QDict *options, const char *bdref_key,
3280 BlockDriverState *parent, const BdrvChildClass *child_class,
3281 BdrvChildRole child_role, bool allow_none, Error **errp)
3282 {
3283 BlockDriverState *bs = NULL;
3284 QDict *image_options;
3285 char *bdref_key_dot;
3286 const char *reference;
3287
3288 assert(child_class != NULL);
3289
3290 bdref_key_dot = g_strdup_printf("%s.", bdref_key);
3291 qdict_extract_subqdict(options, &image_options, bdref_key_dot);
3292 g_free(bdref_key_dot);
3293
3294 /*
3295 * Caution: while qdict_get_try_str() is fine, getting non-string
3296 * types would require more care. When @options come from
3297 * -blockdev or blockdev_add, its members are typed according to
3298 * the QAPI schema, but when they come from -drive, they're all
3299 * QString.
3300 */
3301 reference = qdict_get_try_str(options, bdref_key);
3302 if (!filename && !reference && !qdict_size(image_options)) {
3303 if (!allow_none) {
3304 error_setg(errp, "A block device must be specified for \"%s\"",
3305 bdref_key);
3306 }
3307 qobject_unref(image_options);
3308 goto done;
3309 }
3310
3311 bs = bdrv_open_inherit(filename, reference, image_options, 0,
3312 parent, child_class, child_role, errp);
3313 if (!bs) {
3314 goto done;
3315 }
3316
3317 done:
3318 qdict_del(options, bdref_key);
3319 return bs;
3320 }
3321
3322 /*
3323 * Opens a disk image whose options are given as BlockdevRef in another block
3324 * device's options.
3325 *
3326 * If allow_none is true, no image will be opened if filename is false and no
3327 * BlockdevRef is given. NULL will be returned, but errp remains unset.
3328 *
3329 * bdrev_key specifies the key for the image's BlockdevRef in the options QDict.
3330 * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
3331 * itself, all options starting with "${bdref_key}." are considered part of the
3332 * BlockdevRef.
3333 *
3334 * The BlockdevRef will be removed from the options QDict.
3335 */
3336 BdrvChild *bdrv_open_child(const char *filename,
3337 QDict *options, const char *bdref_key,
3338 BlockDriverState *parent,
3339 const BdrvChildClass *child_class,
3340 BdrvChildRole child_role,
3341 bool allow_none, Error **errp)
3342 {
3343 BlockDriverState *bs;
3344
3345 bs = bdrv_open_child_bs(filename, options, bdref_key, parent, child_class,
3346 child_role, allow_none, errp);
3347 if (bs == NULL) {
3348 return NULL;
3349 }
3350
3351 return bdrv_attach_child(parent, bs, bdref_key, child_class, child_role,
3352 errp);
3353 }
3354
3355 /*
3356 * TODO Future callers may need to specify parent/child_class in order for
3357 * option inheritance to work. Existing callers use it for the root node.
3358 */
3359 BlockDriverState *bdrv_open_blockdev_ref(BlockdevRef *ref, Error **errp)
3360 {
3361 BlockDriverState *bs = NULL;
3362 QObject *obj = NULL;
3363 QDict *qdict = NULL;
3364 const char *reference = NULL;
3365 Visitor *v = NULL;
3366
3367 if (ref->type == QTYPE_QSTRING) {
3368 reference = ref->u.reference;
3369 } else {
3370 BlockdevOptions *options = &ref->u.definition;
3371 assert(ref->type == QTYPE_QDICT);
3372
3373 v = qobject_output_visitor_new(&obj);
3374 visit_type_BlockdevOptions(v, NULL, &options, &error_abort);
3375 visit_complete(v, &obj);
3376
3377 qdict = qobject_to(QDict, obj);
3378 qdict_flatten(qdict);
3379
3380 /* bdrv_open_inherit() defaults to the values in bdrv_flags (for
3381 * compatibility with other callers) rather than what we want as the
3382 * real defaults. Apply the defaults here instead. */
3383 qdict_set_default_str(qdict, BDRV_OPT_CACHE_DIRECT, "off");
3384 qdict_set_default_str(qdict, BDRV_OPT_CACHE_NO_FLUSH, "off");
3385 qdict_set_default_str(qdict, BDRV_OPT_READ_ONLY, "off");
3386 qdict_set_default_str(qdict, BDRV_OPT_AUTO_READ_ONLY, "off");
3387
3388 }
3389
3390 bs = bdrv_open_inherit(NULL, reference, qdict, 0, NULL, NULL, 0, errp);
3391 obj = NULL;
3392 qobject_unref(obj);
3393 visit_free(v);
3394 return bs;
3395 }
3396
3397 static BlockDriverState *bdrv_append_temp_snapshot(BlockDriverState *bs,
3398 int flags,
3399 QDict *snapshot_options,
3400 Error **errp)
3401 {
3402 /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
3403 char *tmp_filename = g_malloc0(PATH_MAX + 1);
3404 int64_t total_size;
3405 QemuOpts *opts = NULL;
3406 BlockDriverState *bs_snapshot = NULL;
3407 int ret;
3408
3409 /* if snapshot, we create a temporary backing file and open it
3410 instead of opening 'filename' directly */
3411
3412 /* Get the required size from the image */
3413 total_size = bdrv_getlength(bs);
3414 if (total_size < 0) {
3415 error_setg_errno(errp, -total_size, "Could not get image size");
3416 goto out;
3417 }
3418
3419 /* Create the temporary image */
3420 ret = get_tmp_filename(tmp_filename, PATH_MAX + 1);
3421 if (ret < 0) {
3422 error_setg_errno(errp, -ret, "Could not get temporary filename");
3423 goto out;
3424 }
3425
3426 opts = qemu_opts_create(bdrv_qcow2.create_opts, NULL, 0,
3427 &error_abort);
3428 qemu_opt_set_number(opts, BLOCK_OPT_SIZE, total_size, &error_abort);
3429 ret = bdrv_create(&bdrv_qcow2, tmp_filename, opts, errp);
3430 qemu_opts_del(opts);
3431 if (ret < 0) {
3432 error_prepend(errp, "Could not create temporary overlay '%s': ",
3433 tmp_filename);
3434 goto out;
3435 }
3436
3437 /* Prepare options QDict for the temporary file */
3438 qdict_put_str(snapshot_options, "file.driver", "file");
3439 qdict_put_str(snapshot_options, "file.filename", tmp_filename);
3440 qdict_put_str(snapshot_options, "driver", "qcow2");
3441
3442 bs_snapshot = bdrv_open(NULL, NULL, snapshot_options, flags, errp);
3443 snapshot_options = NULL;
3444 if (!bs_snapshot) {
3445 goto out;
3446 }
3447
3448 ret = bdrv_append(bs_snapshot, bs, errp);
3449 if (ret < 0) {
3450 bs_snapshot = NULL;
3451 goto out;
3452 }
3453
3454 out:
3455 qobject_unref(snapshot_options);
3456 g_free(tmp_filename);
3457 return bs_snapshot;
3458 }
3459
/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 *
 * options is a QDict of options to pass to the block drivers, or NULL for an
 * empty set of options. The reference to the QDict belongs to the block layer
 * after the call (even on failure), so if the caller intends to reuse the
 * dictionary, it needs to use qobject_ref() before calling bdrv_open.
 *
 * The reference parameter may be used to specify an existing block device which
 * should be opened. If specified, neither options nor a filename may be given;
 * the existing node is looked up and returned with an additional reference.
 *
 * parent and child_class must either both be given or both be NULL. When they
 * are given, flags must be 0: the flags and options are then inherited from
 * the parent through child_class->inherit_options() for child_role.
 *
 * Returns the opened node (or, if a temporary snapshot overlay was created,
 * the overlay on top of it); on failure, returns NULL and sets errp.
 */
static BlockDriverState *bdrv_open_inherit(const char *filename,
                                           const char *reference,
                                           QDict *options, int flags,
                                           BlockDriverState *parent,
                                           const BdrvChildClass *child_class,
                                           BdrvChildRole child_role,
                                           Error **errp)
{
    int ret;
    BlockBackend *file = NULL;
    BlockDriverState *bs;
    BlockDriver *drv = NULL;
    BdrvChild *child;
    const char *drvname;
    const char *backing;
    Error *local_err = NULL;
    QDict *snapshot_options = NULL;
    int snapshot_flags = 0;

    /* Explicit flags are only accepted for nodes opened without a parent */
    assert(!child_class || !flags);
    /* parent and child_class come as a pair */
    assert(!child_class == !parent);

    if (reference) {
        /* We own @options, so drop it even though it must not be used here */
        bool options_non_empty = options ? qdict_size(options) : false;
        qobject_unref(options);

        if (filename || options_non_empty) {
            error_setg(errp, "Cannot reference an existing block device with "
                       "additional options or a new filename");
            return NULL;
        }

        bs = bdrv_lookup_bs(reference, reference, errp);
        if (!bs) {
            return NULL;
        }

        /* The caller gets a reference of its own to the existing node */
        bdrv_ref(bs);
        return bs;
    }

    bs = bdrv_new();

    /* NULL means an empty set of options */
    if (options == NULL) {
        options = qdict_new();
    }

    /* json: syntax counts as explicit options, as if in the QDict */
    parse_json_protocol(options, &filename, &local_err);
    if (local_err) {
        goto fail;
    }

    /* Copy taken before anything is inherited or filled in below */
    bs->explicit_options = qdict_clone_shallow(options);

    if (child_class) {
        bool parent_is_format;

        if (parent->drv) {
            parent_is_format = parent->drv->is_format;
        } else {
            /*
             * parent->drv is not set yet because this node is opened for
             * (potential) format probing. That means that @parent is going
             * to be a format node.
             */
            parent_is_format = true;
        }

        bs->inherits_from = parent;
        child_class->inherit_options(child_role, parent_is_format,
                                     &flags, options,
                                     parent->open_flags, parent->options);
    }

    ret = bdrv_fill_options(&options, filename, &flags, &local_err);
    if (ret < 0) {
        goto fail;
    }

    /*
     * Set the BDRV_O_RDWR and BDRV_O_ALLOW_RDWR flags.
     * Caution: getting a boolean member of @options requires care.
     * When @options come from -blockdev or blockdev_add, members are
     * typed according to the QAPI schema, but when they come from
     * -drive, they're all QString.
     */
    if (g_strcmp0(qdict_get_try_str(options, BDRV_OPT_READ_ONLY), "on") &&
        !qdict_get_try_bool(options, BDRV_OPT_READ_ONLY, false)) {
        flags |= (BDRV_O_RDWR | BDRV_O_ALLOW_RDWR);
    } else {
        flags &= ~BDRV_O_RDWR;
    }

    if (flags & BDRV_O_SNAPSHOT) {
        snapshot_options = qdict_new();
        bdrv_temp_snapshot_options(&snapshot_flags, snapshot_options,
                                   flags, options);
        /* Let bdrv_backing_options() override "read-only" */
        qdict_del(options, BDRV_OPT_READ_ONLY);
        bdrv_inherited_options(BDRV_CHILD_COW, true,
                               &flags, options, flags, options);
    }

    /*
     * bs->options takes over the QDict built so far. The local @options
     * continues as a separate shallow copy; any entries still left in it
     * after opening are reported as unknown options further down.
     */
    bs->open_flags = flags;
    bs->options = options;
    options = qdict_clone_shallow(options);

    /* Find the right image format driver */
    /* See cautionary note on accessing @options above */
    drvname = qdict_get_try_str(options, "driver");
    if (drvname) {
        drv = bdrv_find_format(drvname);
        if (!drv) {
            /* Set directly on errp; local_err stays NULL for the fail path */
            error_setg(errp, "Unknown driver: '%s'", drvname);
            goto fail;
        }
    }

    assert(drvname || !(flags & BDRV_O_PROTOCOL));

    /* See cautionary note on accessing @options above */
    backing = qdict_get_try_str(options, "backing");
    /* Both "backing": null and the deprecated "backing": "" disable backing */
    if (qobject_to(QNull, qdict_get(options, "backing")) != NULL ||
        (backing && *backing == '\0'))
    {
        if (backing) {
            warn_report("Use of \"backing\": \"\" is deprecated; "
                        "use \"backing\": null instead");
        }
        flags |= BDRV_O_NO_BACKING;
        qdict_del(bs->explicit_options, "backing");
        qdict_del(bs->options, "backing");
        qdict_del(options, "backing");
    }

    /* Open image file without format layer. This BlockBackend is only used for
     * probing, the block drivers will do their own bdrv_open_child() for the
     * same BDS, which is why we put the node name back into options. */
    if ((flags & BDRV_O_PROTOCOL) == 0) {
        BlockDriverState *file_bs;

        file_bs = bdrv_open_child_bs(filename, options, "file", bs,
                                     &child_of_bds, BDRV_CHILD_IMAGE,
                                     true, &local_err);
        if (local_err) {
            goto fail;
        }
        if (file_bs != NULL) {
            /* Not requesting BLK_PERM_CONSISTENT_READ because we're only
             * looking at the header to guess the image format. This works even
             * in cases where a guest would not see a consistent state. */
            file = blk_new(bdrv_get_aio_context(file_bs), 0, BLK_PERM_ALL);
            blk_insert_bs(file, file_bs, &local_err);
            bdrv_unref(file_bs);
            if (local_err) {
                goto fail;
            }

            /*
             * NOTE(review): file_bs is used after our reference was dropped;
             * presumably the BlockBackend keeps the node alive after a
             * successful blk_insert_bs() -- confirm.
             */
            qdict_put_str(options, "file", bdrv_get_node_name(file_bs));
        }
    }

    /* Image format probing */
    bs->probed = !drv;
    if (!drv && file) {
        ret = find_image_format(file, filename, &drv, &local_err);
        if (ret < 0) {
            goto fail;
        }
        /*
         * This option update would logically belong in bdrv_fill_options(),
         * but we first need to open bs->file for the probing to work, while
         * opening bs->file already requires the (mostly) final set of options
         * so that cache mode etc. can be inherited.
         *
         * Adding the driver later is somewhat ugly, but it's not an option
         * that would ever be inherited, so it's correct. We just need to make
         * sure to update both bs->options (which has the full effective
         * options for bs) and options (which has file.* already removed).
         */
        qdict_put_str(bs->options, "driver", drv->format_name);
        qdict_put_str(options, "driver", drv->format_name);
    } else if (!drv) {
        /* No driver given and no file to probe */
        error_setg(errp, "Must specify either driver or file");
        goto fail;
    }

    /* BDRV_O_PROTOCOL must be set iff a protocol BDS is about to be created */
    assert(!!(flags & BDRV_O_PROTOCOL) == !!drv->bdrv_file_open);
    /* file must be NULL if a protocol BDS is about to be created
     * (the inverse results in an error message from bdrv_open_common()) */
    assert(!(flags & BDRV_O_PROTOCOL) || !file);

    /* Open the image */
    ret = bdrv_open_common(bs, file, options, &local_err);
    if (ret < 0) {
        goto fail;
    }

    /* The probing BlockBackend is not needed any more */
    if (file) {
        blk_unref(file);
        file = NULL;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0) {
        ret = bdrv_open_backing_file(bs, options, "backing", &local_err);
        if (ret < 0) {
            goto close_and_fail;
        }
    }

    /* Remove all children options and references
     * from bs->options and bs->explicit_options */
    QLIST_FOREACH(child, &bs->children, next) {
        char *child_key_dot;
        child_key_dot = g_strdup_printf("%s.", child->name);
        qdict_extract_subqdict(bs->explicit_options, NULL, child_key_dot);
        qdict_extract_subqdict(bs->options, NULL, child_key_dot);
        qdict_del(bs->explicit_options, child->name);
        qdict_del(bs->options, child->name);
        g_free(child_key_dot);
    }

    /* Check if any unknown options were used */
    if (qdict_size(options) != 0) {
        /* Only the first leftover entry is named in the error message */
        const QDictEntry *entry = qdict_first(options);
        if (flags & BDRV_O_PROTOCOL) {
            error_setg(errp, "Block protocol '%s' doesn't support the option "
                       "'%s'", drv->format_name, entry->key);
        } else {
            error_setg(errp,
                       "Block format '%s' does not support the option '%s'",
                       drv->format_name, entry->key);
        }

        goto close_and_fail;
    }

    bdrv_parent_cb_change_media(bs, true);

    qobject_unref(options);
    options = NULL;

    /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
     * temporary snapshot afterwards. */
    if (snapshot_flags) {
        BlockDriverState *snapshot_bs;
        snapshot_bs = bdrv_append_temp_snapshot(bs, snapshot_flags,
                                                snapshot_options, &local_err);
        /* bdrv_append_temp_snapshot() consumed the reference */
        snapshot_options = NULL;
        if (local_err) {
            goto close_and_fail;
        }
        /* We are not going to return bs but the overlay on top of it
         * (snapshot_bs); thus, we have to drop the strong reference to bs
         * (which we obtained by calling bdrv_new()). bs will not be deleted,
         * though, because the overlay still has a reference to it. */
        bdrv_unref(bs);
        bs = snapshot_bs;
    }

    return bs;

    /*
     * Failure before bdrv_open_common() succeeded: the option QDicts are
     * released by hand and the pointers in bs are cleared before the node
     * itself is dropped (presumably so that they are not released again when
     * bs is deleted -- confirm against bdrv_unref()/bdrv_delete()).
     */
fail:
    blk_unref(file);
    qobject_unref(snapshot_options);
    qobject_unref(bs->explicit_options);
    qobject_unref(bs->options);
    qobject_unref(options);
    bs->options = NULL;
    bs->explicit_options = NULL;
    bdrv_unref(bs);
    error_propagate(errp, local_err);
    return NULL;

    /*
     * Failure after bdrv_open_common() succeeded: bs->options and
     * bs->explicit_options are left in place here; only the local QDicts are
     * released in addition to the node reference.
     */
close_and_fail:
    bdrv_unref(bs);
    qobject_unref(snapshot_options);
    qobject_unref(options);
    error_propagate(errp, local_err);
    return NULL;
}
3760
3761 BlockDriverState *bdrv_open(const char *filename, const char *reference,
3762 QDict *options, int flags, Error **errp)
3763 {
3764 return bdrv_open_inherit(filename, reference, options, flags, NULL,
3765 NULL, 0, errp);
3766 }
3767
/*
 * Return true if the NULL-terminated @list contains @str.
 * A NULL @str or a NULL @list never matches.
 */
static bool is_str_in_list(const char *str, const char *const *list)
{
    const char *const *item;

    if (!str || !list) {
        return false;
    }

    /* Walk the array up to its NULL terminator */
    for (item = list; *item != NULL; item++) {
        if (strcmp(str, *item) == 0) {
            return true;
        }
    }

    return false;
}
3781
3782 /*
3783 * Check that every option set in @bs->options is also set in
3784 * @new_opts.
3785 *
3786 * Options listed in the common_options list and in
3787 * @bs->drv->mutable_opts are skipped.
3788 *
3789 * Return 0 on success, otherwise return -EINVAL and set @errp.
3790 */
3791 static int bdrv_reset_options_allowed(BlockDriverState *bs,
3792 const QDict *new_opts, Error **errp)
3793 {
3794 const QDictEntry *e;
3795 /* These options are common to all block drivers and are handled
3796 * in bdrv_reopen_prepare() so they can be left out of @new_opts */
3797 const char *const common_options[] = {
3798 "node-name", "discard", "cache.direct", "cache.no-flush",
3799 "read-only", "auto-read-only", "detect-zeroes", NULL
3800 };
3801
3802 for (e = qdict_first(bs->options); e; e = qdict_next(bs->options, e)) {
3803 if (!qdict_haskey(new_opts, e->key) &&
3804 !is_str_in_list(e->key, common_options) &&
3805 !is_str_in_list(e->key, bs->drv->mutable_opts)) {
3806 error_setg(errp, "Option '%s' cannot be reset "
3807 "to its default value", e->key);
3808 return -EINVAL;
3809 }
3810 }
3811
3812 return 0;
3813 }
3814
3815 /*
3816 * Returns true if @child can be reached recursively from @bs
3817 */
3818 static bool bdrv_recurse_has_child(BlockDriverState *bs,
3819 BlockDriverState *child)
3820 {
3821 BdrvChild *c;
3822
3823 if (bs == child) {
3824 return true;
3825 }
3826
3827 QLIST_FOREACH(c, &bs->children, next) {
3828 if (bdrv_recurse_has_child(c->bs, child)) {
3829 return true;
3830 }
3831 }
3832
3833 return false;
3834 }
3835
/*
 * Adds a BlockDriverState to a simple queue for an atomic, transactional
 * reopen of multiple devices.
 *
 * bs_queue can either be an existing BlockReopenQueue that has had QTAILQ_INIT
 * already performed, or alternatively may be NULL, in which case a new
 * BlockReopenQueue will be created and initialized. This newly created
 * BlockReopenQueue should be passed back in for subsequent calls that are
 * intended to be of the same atomic 'set'.
 *
 * bs is the BlockDriverState to add to the reopen queue. If it is already in
 * the queue, its existing entry is updated instead of adding a second one.
 *
 * options contains the changed options for the associated bs
 * (the BlockReopenQueue takes ownership); NULL means an empty set.
 *
 * klass, role, parent_is_format, parent_options and parent_flags describe the
 * parent that bs inherits from. If parent_options is NULL, nothing is
 * inherited and the current flags of bs are used as the starting point.
 *
 * keep_old_opts selects whether options that are not given in @options are
 * retained from the current state of bs (true) or not (false).
 *
 * The children of bs that inherit from it are queued recursively.
 *
 * returns a pointer to bs_queue, which is either the newly allocated
 * bs_queue, or the existing bs_queue being used.
 *
 * bs must be drained between bdrv_reopen_queue() and bdrv_reopen_multiple().
 */
static BlockReopenQueue *bdrv_reopen_queue_child(BlockReopenQueue *bs_queue,
                                                 BlockDriverState *bs,
                                                 QDict *options,
                                                 const BdrvChildClass *klass,
                                                 BdrvChildRole role,
                                                 bool parent_is_format,
                                                 QDict *parent_options,
                                                 int parent_flags,
                                                 bool keep_old_opts)
{
    assert(bs != NULL);

    BlockReopenQueueEntry *bs_entry;
    BdrvChild *child;
    QDict *old_options, *explicit_options, *options_copy;
    int flags;
    QemuOpts *opts;

    /* Make sure that the caller remembered to use a drained section. This is
     * important to avoid graph changes between the recursive queuing here and
     * bdrv_reopen_multiple(). */
    assert(bs->quiesce_counter > 0);

    if (bs_queue == NULL) {
        bs_queue = g_new0(BlockReopenQueue, 1);
        QTAILQ_INIT(bs_queue);
    }

    if (!options) {
        options = qdict_new();
    }

    /* Check if this BlockDriverState is already in the queue */
    /* (bs_entry is relied upon to be NULL below when no entry matches) */
    QTAILQ_FOREACH(bs_entry, bs_queue, entry) {
        if (bs == bs_entry->state.bs) {
            break;
        }
    }

    /*
     * Precedence of options:
     * 1. Explicitly passed in options (highest)
     * 2. Retained from explicitly set options of bs
     * 3. Inherited from parent node
     * 4. Retained from effective options of bs
     */

    /* Old explicitly set values (don't overwrite by inherited value) */
    if (bs_entry || keep_old_opts) {
        /* An already queued entry takes precedence over the state of bs */
        old_options = qdict_clone_shallow(bs_entry ?
                                          bs_entry->state.explicit_options :
                                          bs->explicit_options);
        bdrv_join_options(bs, options, old_options);
        qobject_unref(old_options);
    }

    /* Copy taken before inherited and effective options are merged in */
    explicit_options = qdict_clone_shallow(options);

    /* Inherit from parent node */
    if (parent_options) {
        flags = 0;
        klass->inherit_options(role, parent_is_format, &flags, options,
                               parent_flags, parent_options);
    } else {
        flags = bdrv_get_flags(bs);
    }

    if (keep_old_opts) {
        /* Old values are used for options that aren't set yet */
        old_options = qdict_clone_shallow(bs->options);
        bdrv_join_options(bs, options, old_options);
        qobject_unref(old_options);
    }

    /* We have the final set of options so let's update the flags */
    /* The absorb step runs on a copy; @options itself is not passed to it */
    options_copy = qdict_clone_shallow(options);
    opts = qemu_opts_create(&bdrv_runtime_opts, NULL, 0, &error_abort);
    qemu_opts_absorb_qdict(opts, options_copy, NULL);
    update_flags_from_options(&flags, opts);
    qemu_opts_del(opts);
    qobject_unref(options_copy);

    /* bdrv_open_inherit() sets and clears some additional flags internally */
    flags &= ~BDRV_O_PROTOCOL;
    if (flags & BDRV_O_RDWR) {
        flags |= BDRV_O_ALLOW_RDWR;
    }

    if (!bs_entry) {
        bs_entry = g_new0(BlockReopenQueueEntry, 1);
        QTAILQ_INSERT_TAIL(bs_queue, bs_entry, entry);
    } else {
        /* Reusing an entry: release the QDicts that are replaced below */
        qobject_unref(bs_entry->state.options);
        qobject_unref(bs_entry->state.explicit_options);
    }

    /* The queue entry takes over options and explicit_options */
    bs_entry->state.bs = bs;
    bs_entry->state.options = options;
    bs_entry->state.explicit_options = explicit_options;
    bs_entry->state.flags = flags;

    /*
     * If keep_old_opts is false then it means that unspecified
     * options must be reset to their original value. We don't allow
     * resetting 'backing' but we need to know if the option is
     * missing in order to decide if we have to return an error.
     */
    if (!keep_old_opts) {
        bs_entry->state.backing_missing =
            !qdict_haskey(options, "backing") &&
            !qdict_haskey(options, "backing.driver");
    }

    QLIST_FOREACH(child, &bs->children, next) {
        QDict *new_child_options = NULL;
        bool child_keep_old = keep_old_opts;

        /* reopen can only change the options of block devices that were
         * implicitly created and inherited options. For other (referenced)
         * block devices, a syntax like "backing.foo" results in an error. */
        if (child->bs->inherits_from != bs) {
            continue;
        }

        /* Check if the options contain a child reference */
        if (qdict_haskey(options, child->name)) {
            const char *childref = qdict_get_try_str(options, child->name);
            /*
             * The current child must not be reopened if the child
             * reference is null or points to a different node.
             */
            if (g_strcmp0(childref, child->bs->node_name)) {
                continue;
            }
            /*
             * If the child reference points to the current child then
             * reopen it with its existing set of options (note that
             * it can still inherit new options from the parent).
             */
            child_keep_old = true;
        } else {
            /* Extract child options ("child-name.*") */
            char *child_key_dot = g_strdup_printf("%s.", child->name);
            qdict_extract_subqdict(explicit_options, NULL, child_key_dot);
            qdict_extract_subqdict(options, &new_child_options, child_key_dot);
            g_free(child_key_dot);
        }

        /* Queue the child with this node's final options/flags as parent */
        bdrv_reopen_queue_child(bs_queue, child->bs, new_child_options,
                                child->klass, child->role, bs->drv->is_format,
                                options, flags, child_keep_old);
    }

    return bs_queue;
}
4013
4014 BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
4015 BlockDriverState *bs,
4016 QDict *options, bool keep_old_opts)
4017 {
4018 return bdrv_reopen_queue_child(bs_queue, bs, options, NULL, 0, false,
4019 NULL, 0, keep_old_opts);
4020 }
4021
4022 /*
4023 * Reopen multiple BlockDriverStates atomically & transactionally.
4024 *
 * The queue passed in (bs_queue) must have been built up previously
 * via bdrv_reopen_queue().
4027 *
4028 * Reopens all BDS specified in the queue, with the appropriate
4029 * flags. All devices are prepared for reopen, and failure of any
4030 * device will cause all device changes to be abandoned, and intermediate
4031 * data cleaned up.
4032 *
4033 * If all devices prepare successfully, then the changes are committed
4034 * to all devices.
4035 *
4036 * All affected nodes must be drained between bdrv_reopen_queue() and
4037 * bdrv_reopen_multiple().
4038 */
4039 int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
4040 {
4041 int ret = -1;
4042 BlockReopenQueueEntry *bs_entry, *next;
4043 Transaction *tran = tran_new();
4044 g_autoptr(GHashTable) found = NULL;
4045 g_autoptr(GSList) refresh_list = NULL;
4046
4047 assert(bs_queue != NULL);
4048
4049 QTAILQ_FOREACH(bs_entry, bs_queue, entry) {
4050 ret = bdrv_flush(bs_entry->state.bs);
4051 if (ret < 0) {
4052 error_setg_errno(errp, -ret, "Error flushing drive");
4053 goto cleanup;
4054 }
4055 }
4056
4057 QTAILQ_FOREACH(bs_entry, bs_queue, entry) {
4058 assert(bs_entry->state.bs->quiesce_counter > 0);
4059 ret = bdrv_reopen_prepare(&bs_entry->state, bs_queue, tran, errp);
4060 if (ret < 0) {
4061 goto abort;
4062 }
4063 bs_entry->prepared = true;
4064 }
4065
4066 found = g_hash_table_new(NULL, NULL);
4067 QTAILQ_FOREACH(bs_entry, bs_queue, entry) {
4068 BDRVReopenState *state = &bs_entry->state;
4069
4070 refresh_list = bdrv_topological_dfs(refresh_list, found, state->bs);
4071 if (state->old_backing_bs) {
4072 refresh_list = bdrv_topological_dfs(refresh_list, found,
4073 state->old_backing_bs);
4074 }
4075 }
4076
4077 /*
4078