/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "config-host.h"
#include "qemu-common.h"
#include "trace.h"
#include "monitor.h"
#include "block_int.h"
#include "module.h"
#include "qjson.h"
#include "qemu-coroutine.h"
#include "qmp-commands.h"
#include "qemu-timer.h"

#ifdef CONFIG_BSD
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#ifndef __DragonFly__
#include <sys/disk.h>
#endif
#endif

#ifdef _WIN32
#include <windows.h>
#endif

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

typedef enum {
    BDRV_REQ_COPY_ON_READ = 0x1,
} BdrvRequestFlags;

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque);
static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
                                         int64_t sector_num, int nb_sectors,
                                         QEMUIOVector *iov);
static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov);
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags);
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                               int64_t sector_num,
                                               QEMUIOVector *qiov,
                                               int nb_sectors,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque,
                                               bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);

static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
        double elapsed_time, uint64_t *wait);
static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
        bool is_write, int64_t *wait);

static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
    QTAILQ_HEAD_INITIALIZER(bdrv_states);

static QLIST_HEAD(, BlockDriver) bdrv_drivers =
    QLIST_HEAD_INITIALIZER(bdrv_drivers);

/* The device to use for VM snapshots */
static BlockDriverState *bs_snapshots;

/* If non-zero, use only whitelisted block drivers */
static int use_bdrv_whitelist;

#ifdef _WIN32
static int is_windows_drive_prefix(const char *filename)
{
    return (((filename[0] >= 'a' && filename[0] <= 'z') ||
             (filename[0] >= 'A' && filename[0] <= 'Z')) &&
            filename[1] == ':');
}

int is_windows_drive(const char *filename)
{
    if (is_windows_drive_prefix(filename) &&
        filename[2] == '\0')
        return 1;
    if (strstart(filename, "\\\\.\\", NULL) ||
        strstart(filename, "//./", NULL))
        return 1;
    return 0;
}
#endif

/* throttling disk I/O limits */
void bdrv_io_limits_disable(BlockDriverState *bs)
{
    bs->io_limits_enabled = false;

    while (qemu_co_queue_next(&bs->throttled_reqs));

    if (bs->block_timer) {
        qemu_del_timer(bs->block_timer);
        qemu_free_timer(bs->block_timer);
        bs->block_timer = NULL;
    }

    bs->slice_start = 0;
    bs->slice_end = 0;
    bs->slice_time = 0;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
}

static void bdrv_block_timer(void *opaque)
{
    BlockDriverState *bs = opaque;

    qemu_co_queue_next(&bs->throttled_reqs);
}

void bdrv_io_limits_enable(BlockDriverState *bs)
{
    qemu_co_queue_init(&bs->throttled_reqs);
    bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
    bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
    bs->slice_start = qemu_get_clock_ns(vm_clock);
    bs->slice_end = bs->slice_start + bs->slice_time;
    memset(&bs->io_base, 0, sizeof(bs->io_base));
    bs->io_limits_enabled = true;
}

bool bdrv_io_limits_enabled(BlockDriverState *bs)
{
    BlockIOLimit *io_limits = &bs->io_limits;
    return io_limits->bps[BLOCK_IO_LIMIT_READ]
        || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
        || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
        || io_limits->iops[BLOCK_IO_LIMIT_READ]
        || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
        || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
}

static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                     bool is_write, int nb_sectors)
{
    int64_t wait_time = -1;

    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
        qemu_co_queue_wait(&bs->throttled_reqs);
    }

    /* Requests are serviced in FIFO order. Throttled requests queued behind
     * the current one are not dequeued until the current request has been
     * allowed through, so if the current request still exceeds the limits it
     * re-inserts itself at the head of the queue and everything behind it
     * stays queued.
     */

    while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
        qemu_mod_timer(bs->block_timer,
                       wait_time + qemu_get_clock_ns(vm_clock));
        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
    }

    qemu_co_queue_next(&bs->throttled_reqs);
}
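
/* Example (illustrative sketch, not a call sequence taken from QEMU's
 * callers): capping a drive at 1 MB/s of total throughput means filling in
 * a BlockIOLimit and handing it to bdrv_set_io_limits() further down in
 * this file, which derives io_limits_enabled via bdrv_io_limits_enabled():
 *
 *     BlockIOLimit lim;
 *     memset(&lim, 0, sizeof(lim));
 *     lim.bps[BLOCK_IO_LIMIT_TOTAL] = 1024 * 1024;
 *     bdrv_set_io_limits(bs, &lim);
 *
 * Once enabled, every read and write passes through
 * bdrv_io_limits_intercept() above before it reaches the driver.
 */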

/* check if the path starts with "<protocol>:" */
static int path_has_protocol(const char *path)
{
#ifdef _WIN32
    if (is_windows_drive(path) ||
        is_windows_drive_prefix(path)) {
        return 0;
    }
#endif

    return strchr(path, ':') != NULL;
}

int path_is_absolute(const char *path)
{
    const char *p;
#ifdef _WIN32
    /* specific case for names like: "\\.\d:" */
    if (*path == '/' || *path == '\\')
        return 1;
#endif
    p = strchr(path, ':');
    if (p)
        p++;
    else
        p = path;
#ifdef _WIN32
    return (*p == '/' || *p == '\\');
#else
    return (*p == '/');
#endif
}

/* If filename is absolute, just copy it to dest. Otherwise, build a
   path to it by considering it relative to base_path. URLs are
   supported. */
void path_combine(char *dest, int dest_size,
                  const char *base_path,
                  const char *filename)
{
    const char *p, *p1;
    int len;

    if (dest_size <= 0)
        return;
    if (path_is_absolute(filename)) {
        pstrcpy(dest, dest_size, filename);
    } else {
        p = strchr(base_path, ':');
        if (p)
            p++;
        else
            p = base_path;
        p1 = strrchr(base_path, '/');
#ifdef _WIN32
        {
            const char *p2;
            p2 = strrchr(base_path, '\\');
            if (!p1 || p2 > p1)
                p1 = p2;
        }
#endif
        if (p1)
            p1++;
        else
            p1 = base_path;
        if (p1 > p)
            p = p1;
        len = p - base_path;
        if (len > dest_size - 1)
            len = dest_size - 1;
        memcpy(dest, base_path, len);
        dest[len] = '\0';
        pstrcat(dest, dest_size, filename);
    }
}
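
/* Example: combining a base image path with the relative name of its
 * backing file, the common caller pattern (a sketch; paths illustrative):
 *
 *     char dest[PATH_MAX];
 *     path_combine(dest, sizeof(dest), "/img/disk.qcow2", "base.qcow2");
 *     // dest now holds "/img/base.qcow2"
 *     path_combine(dest, sizeof(dest), "/img/disk.qcow2", "/abs/base.raw");
 *     // absolute filenames are copied unchanged: "/abs/base.raw"
 */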

void bdrv_register(BlockDriver *bdrv)
{
    /* Block drivers without coroutine functions need emulation */
    if (!bdrv->bdrv_co_readv) {
        bdrv->bdrv_co_readv = bdrv_co_readv_em;
        bdrv->bdrv_co_writev = bdrv_co_writev_em;

        /* bdrv_co_readv_em()/bdrv_co_writev_em() work in terms of aio, so if
         * the block driver lacks aio we need to emulate that too.
         */
        if (!bdrv->bdrv_aio_readv) {
            /* add AIO emulation layer */
            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
        }
    }

    QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
}
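
/* Example: a minimal, hypothetical format driver. It only needs to supply
 * either the coroutine or the AIO callbacks; bdrv_register() fills in the
 * missing layer with the emulation functions above. Sketch only; the
 * mydrv_* helpers and MyDrvState are invented for illustration:
 *
 *     static BlockDriver bdrv_mydrv = {
 *         .format_name    = "mydrv",
 *         .instance_size  = sizeof(MyDrvState),
 *         .bdrv_open      = mydrv_open,
 *         .bdrv_co_readv  = mydrv_co_readv,
 *         .bdrv_co_writev = mydrv_co_writev,
 *     };
 *
 *     static void bdrv_mydrv_init(void)
 *     {
 *         bdrv_register(&bdrv_mydrv);
 *     }
 *     block_init(bdrv_mydrv_init);
 */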

/* create a new block device (by default it is empty) */
BlockDriverState *bdrv_new(const char *device_name)
{
    BlockDriverState *bs;

    bs = g_malloc0(sizeof(BlockDriverState));
    pstrcpy(bs->device_name, sizeof(bs->device_name), device_name);
    if (device_name[0] != '\0') {
        QTAILQ_INSERT_TAIL(&bdrv_states, bs, list);
    }
    bdrv_iostatus_disable(bs);
    return bs;
}

BlockDriver *bdrv_find_format(const char *format_name)
{
    BlockDriver *drv1;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (!strcmp(drv1->format_name, format_name)) {
            return drv1;
        }
    }
    return NULL;
}

static int bdrv_is_whitelisted(BlockDriver *drv)
{
    static const char *whitelist[] = {
        CONFIG_BDRV_WHITELIST
    };
    const char **p;

    if (!whitelist[0])
        return 1;               /* no whitelist, anything goes */

    for (p = whitelist; *p; p++) {
        if (!strcmp(drv->format_name, *p)) {
            return 1;
        }
    }
    return 0;
}

BlockDriver *bdrv_find_whitelisted_format(const char *format_name)
{
    BlockDriver *drv = bdrv_find_format(format_name);
    return drv && bdrv_is_whitelisted(drv) ? drv : NULL;
}

int bdrv_create(BlockDriver *drv, const char* filename,
    QEMUOptionParameter *options)
{
    if (!drv->bdrv_create)
        return -ENOTSUP;

    return drv->bdrv_create(filename, options);
}

int bdrv_create_file(const char* filename, QEMUOptionParameter *options)
{
    BlockDriver *drv;

    drv = bdrv_find_protocol(filename);
    if (drv == NULL) {
        return -ENOENT;
    }

    return bdrv_create(drv, filename, options);
}

#ifdef _WIN32
void get_tmp_filename(char *filename, int size)
{
    char temp_dir[MAX_PATH];

    GetTempPath(MAX_PATH, temp_dir);
    GetTempFileName(temp_dir, "qem", 0, filename);
}
#else
void get_tmp_filename(char *filename, int size)
{
    int fd;
    const char *tmpdir;
    /* XXX: race condition possible */
    tmpdir = getenv("TMPDIR");
    if (!tmpdir)
        tmpdir = "/tmp";
    snprintf(filename, size, "%s/vl.XXXXXX", tmpdir);
    fd = mkstemp(filename);
    close(fd);
}
#endif

/*
 * Detect host devices. By convention, /dev/cdrom[N] is always
 * recognized as a host CDROM.
 */
static BlockDriver *find_hdev_driver(const char *filename)
{
    int score_max = 0, score;
    BlockDriver *drv = NULL, *d;

    QLIST_FOREACH(d, &bdrv_drivers, list) {
        if (d->bdrv_probe_device) {
            score = d->bdrv_probe_device(filename);
            if (score > score_max) {
                score_max = score;
                drv = d;
            }
        }
    }

    return drv;
}

BlockDriver *bdrv_find_protocol(const char *filename)
{
    BlockDriver *drv1;
    char protocol[128];
    int len;
    const char *p;

    /* TODO Drivers without bdrv_file_open must be specified explicitly */

    /*
     * XXX(hch): we really should not let host device detection
     * override an explicit protocol specification, but moving this
     * later breaks access to device names with colons in them.
     * Thanks to the brain-dead persistent naming schemes on udev-
     * based Linux systems those actually are quite common.
     */
    drv1 = find_hdev_driver(filename);
    if (drv1) {
        return drv1;
    }

    if (!path_has_protocol(filename)) {
        return bdrv_find_format("file");
    }
    p = strchr(filename, ':');
    assert(p != NULL);
    len = p - filename;
    if (len > sizeof(protocol) - 1)
        len = sizeof(protocol) - 1;
    memcpy(protocol, filename, len);
    protocol[len] = '\0';
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->protocol_name &&
            !strcmp(drv1->protocol_name, protocol)) {
            return drv1;
        }
    }
    return NULL;
}

static int find_image_format(const char *filename, BlockDriver **pdrv)
{
    int ret, score, score_max;
    BlockDriver *drv1, *drv;
    uint8_t buf[2048];
    BlockDriverState *bs;

    ret = bdrv_file_open(&bs, filename, 0);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
    if (bs->sg || !bdrv_is_inserted(bs)) {
        bdrv_delete(bs);
        drv = bdrv_find_format("raw");
        if (!drv) {
            ret = -ENOENT;
        }
        *pdrv = drv;
        return ret;
    }

    ret = bdrv_pread(bs, 0, buf, sizeof(buf));
    bdrv_delete(bs);
    if (ret < 0) {
        *pdrv = NULL;
        return ret;
    }

    score_max = 0;
    drv = NULL;
    QLIST_FOREACH(drv1, &bdrv_drivers, list) {
        if (drv1->bdrv_probe) {
            score = drv1->bdrv_probe(buf, ret, filename);
            if (score > score_max) {
                score_max = score;
                drv = drv1;
            }
        }
    }
    if (!drv) {
        ret = -ENOENT;
    }
    *pdrv = drv;
    return ret;
}

/**
 * Set the current 'total_sectors' value
 */
static int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
{
    BlockDriver *drv = bs->drv;

    /* Do not attempt drv->bdrv_getlength() on scsi-generic devices */
    if (bs->sg)
        return 0;

    /* query actual device if possible, otherwise just trust the hint */
    if (drv->bdrv_getlength) {
        int64_t length = drv->bdrv_getlength(bs);
        if (length < 0) {
            return length;
        }
        hint = length >> BDRV_SECTOR_BITS;
    }

    bs->total_sectors = hint;
    return 0;
}

/**
 * Set open flags for a given cache mode
 *
 * Return 0 on success, -1 if the cache mode was invalid.
 */
int bdrv_parse_cache_flags(const char *mode, int *flags)
{
    *flags &= ~BDRV_O_CACHE_MASK;

    if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
        *flags |= BDRV_O_NOCACHE | BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "directsync")) {
        *flags |= BDRV_O_NOCACHE;
    } else if (!strcmp(mode, "writeback")) {
        *flags |= BDRV_O_CACHE_WB;
    } else if (!strcmp(mode, "unsafe")) {
        *flags |= BDRV_O_CACHE_WB;
        *flags |= BDRV_O_NO_FLUSH;
    } else if (!strcmp(mode, "writethrough")) {
        /* this is the default */
    } else {
        return -1;
    }

    return 0;
}
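
/* Example: a typical caller maps a -drive cache= option string onto open
 * flags (sketch):
 *
 *     int flags = 0;
 *     if (bdrv_parse_cache_flags("none", &flags) < 0) {
 *         return -EINVAL;
 *     }
 *     // flags == BDRV_O_NOCACHE | BDRV_O_CACHE_WB: bypass the host page
 *     // cache while presenting a writeback cache model to the guest
 */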

/**
 * The copy-on-read flag is actually a reference count so multiple users may
 * use the feature without worrying about clobbering its previous state.
 * Copy-on-read stays enabled until all users have called to disable it.
 */
void bdrv_enable_copy_on_read(BlockDriverState *bs)
{
    bs->copy_on_read++;
}

void bdrv_disable_copy_on_read(BlockDriverState *bs)
{
    assert(bs->copy_on_read > 0);
    bs->copy_on_read--;
}
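
/* Example: because copy_on_read is a reference count, independent users
 * compose without clobbering each other:
 *
 *     bdrv_enable_copy_on_read(bs);    // user A: copy_on_read == 1
 *     bdrv_enable_copy_on_read(bs);    // user B: copy_on_read == 2
 *     bdrv_disable_copy_on_read(bs);   // user A done: still enabled
 *     bdrv_disable_copy_on_read(bs);   // user B done: disabled again
 */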

/*
 * Common part for opening disk images and files
 */
static int bdrv_open_common(BlockDriverState *bs, const char *filename,
    int flags, BlockDriver *drv)
{
    int ret, open_flags;

    assert(drv != NULL);

    trace_bdrv_open_common(bs, filename, flags, drv->format_name);

    bs->file = NULL;
    bs->total_sectors = 0;
    bs->encrypted = 0;
    bs->valid_key = 0;
    bs->sg = 0;
    bs->open_flags = flags;
    bs->growable = 0;
    bs->buffer_alignment = 512;

    assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */
    if ((flags & BDRV_O_RDWR) && (flags & BDRV_O_COPY_ON_READ)) {
        bdrv_enable_copy_on_read(bs);
    }

    pstrcpy(bs->filename, sizeof(bs->filename), filename);
    bs->backing_file[0] = '\0';

    if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) {
        return -ENOTSUP;
    }

    bs->drv = drv;
    bs->opaque = g_malloc0(drv->instance_size);

    bs->enable_write_cache = !!(flags & BDRV_O_CACHE_WB);

    /*
     * Clear flags that are internal to the block layer before opening the
     * image.
     */
    open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

    /*
     * Snapshots should be writable.
     */
    if (bs->is_temporary) {
        open_flags |= BDRV_O_RDWR;
    }

    bs->keep_read_only = bs->read_only = !(open_flags & BDRV_O_RDWR);

    /* Open the image, either directly or using a protocol */
    if (drv->bdrv_file_open) {
        ret = drv->bdrv_file_open(bs, filename, open_flags);
    } else {
        ret = bdrv_file_open(&bs->file, filename, open_flags);
        if (ret >= 0) {
            ret = drv->bdrv_open(bs, open_flags);
        }
    }

    if (ret < 0) {
        goto free_and_fail;
    }

    ret = refresh_total_sectors(bs, bs->total_sectors);
    if (ret < 0) {
        goto free_and_fail;
    }

#ifndef _WIN32
    if (bs->is_temporary) {
        unlink(filename);
    }
#endif
    return 0;

free_and_fail:
    if (bs->file) {
        bdrv_delete(bs->file);
        bs->file = NULL;
    }
    g_free(bs->opaque);
    bs->opaque = NULL;
    bs->drv = NULL;
    return ret;
}

/*
 * Opens a file using a protocol (file, host_device, nbd, ...)
 */
int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags)
{
    BlockDriverState *bs;
    BlockDriver *drv;
    int ret;

    drv = bdrv_find_protocol(filename);
    if (!drv) {
        return -ENOENT;
    }

    bs = bdrv_new("");
    ret = bdrv_open_common(bs, filename, flags, drv);
    if (ret < 0) {
        bdrv_delete(bs);
        return ret;
    }
    bs->growable = 1;
    *pbs = bs;
    return 0;
}

/*
 * Opens a disk image (raw, qcow2, vmdk, ...)
 */
int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
              BlockDriver *drv)
{
    int ret;
    char tmp_filename[PATH_MAX];

    if (flags & BDRV_O_SNAPSHOT) {
        BlockDriverState *bs1;
        int64_t total_size;
        int is_protocol = 0;
        BlockDriver *bdrv_qcow2;
        QEMUOptionParameter *options;
        char backing_filename[PATH_MAX];

        /* if snapshot, we create a temporary backing file and open it
           instead of opening 'filename' directly */

        /* if there is a backing file, use it */
        bs1 = bdrv_new("");
        ret = bdrv_open(bs1, filename, 0, drv);
        if (ret < 0) {
            bdrv_delete(bs1);
            return ret;
        }
        total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK;

        if (bs1->drv && bs1->drv->protocol_name)
            is_protocol = 1;

        bdrv_delete(bs1);

        get_tmp_filename(tmp_filename, sizeof(tmp_filename));

        /* Real path is meaningless for protocols */
        if (is_protocol)
            snprintf(backing_filename, sizeof(backing_filename),
                     "%s", filename);
        else if (!realpath(filename, backing_filename))
            return -errno;

        bdrv_qcow2 = bdrv_find_format("qcow2");
        options = parse_option_parameters("", bdrv_qcow2->create_options, NULL);

        set_option_parameter_int(options, BLOCK_OPT_SIZE, total_size);
        set_option_parameter(options, BLOCK_OPT_BACKING_FILE, backing_filename);
        if (drv) {
            set_option_parameter(options, BLOCK_OPT_BACKING_FMT,
                drv->format_name);
        }

        ret = bdrv_create(bdrv_qcow2, tmp_filename, options);
        free_option_parameters(options);
        if (ret < 0) {
            return ret;
        }

        filename = tmp_filename;
        drv = bdrv_qcow2;
        bs->is_temporary = 1;
    }

    /* Find the right image format driver */
    if (!drv) {
        ret = find_image_format(filename, &drv);
    }

    if (!drv) {
        goto unlink_and_fail;
    }

    /* Open the image */
    ret = bdrv_open_common(bs, filename, flags, drv);
    if (ret < 0) {
        goto unlink_and_fail;
    }

    /* If there is a backing file, use it */
    if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
        char backing_filename[PATH_MAX];
        int back_flags;
        BlockDriver *back_drv = NULL;

        bs->backing_hd = bdrv_new("");

        if (path_has_protocol(bs->backing_file)) {
            pstrcpy(backing_filename, sizeof(backing_filename),
                    bs->backing_file);
        } else {
            path_combine(backing_filename, sizeof(backing_filename),
                         filename, bs->backing_file);
        }

        if (bs->backing_format[0] != '\0') {
            back_drv = bdrv_find_format(bs->backing_format);
        }

        /* backing files always opened read-only */
        back_flags =
            flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

        ret = bdrv_open(bs->backing_hd, backing_filename, back_flags, back_drv);
        if (ret < 0) {
            bdrv_close(bs);
            return ret;
        }
        if (bs->is_temporary) {
            bs->backing_hd->keep_read_only = !(flags & BDRV_O_RDWR);
        } else {
            /* base image inherits from "parent" */
            bs->backing_hd->keep_read_only = bs->keep_read_only;
        }
    }

    if (!bdrv_key_required(bs)) {
        bdrv_dev_change_media_cb(bs, true);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_enable(bs);
    }

    return 0;

unlink_and_fail:
    if (bs->is_temporary) {
        unlink(filename);
    }
    return ret;
}

void bdrv_close(BlockDriverState *bs)
{
    if (bs->drv) {
        if (bs == bs_snapshots) {
            bs_snapshots = NULL;
        }
        if (bs->backing_hd) {
            bdrv_delete(bs->backing_hd);
            bs->backing_hd = NULL;
        }
        bs->drv->bdrv_close(bs);
        g_free(bs->opaque);
#ifdef _WIN32
        if (bs->is_temporary) {
            unlink(bs->filename);
        }
#endif
        bs->opaque = NULL;
        bs->drv = NULL;
        bs->copy_on_read = 0;

        if (bs->file != NULL) {
            bdrv_close(bs->file);
        }

        bdrv_dev_change_media_cb(bs, false);
    }

    /* throttling disk I/O limits */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_disable(bs);
    }
}

void bdrv_close_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_close(bs);
    }
}

/*
 * Wait for pending requests to complete across all BlockDriverStates
 *
 * This function does not flush data to disk, use bdrv_flush_all() for that
 * after calling this function.
 */
void bdrv_drain_all(void)
{
    BlockDriverState *bs;

    qemu_aio_flush();

    /* If requests are still pending there is a bug somewhere */
    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        assert(QLIST_EMPTY(&bs->tracked_requests));
        assert(qemu_co_queue_empty(&bs->throttled_reqs));
    }
}

/* Make a BlockDriverState anonymous by removing it from the bdrv_states
   list. Also, clear device_name so it cannot be removed twice. */
void bdrv_make_anon(BlockDriverState *bs)
{
    if (bs->device_name[0] != '\0') {
        QTAILQ_REMOVE(&bdrv_states, bs, list);
    }
    bs->device_name[0] = '\0';
}

void bdrv_delete(BlockDriverState *bs)
{
    assert(!bs->dev);

    /* remove from list, if necessary */
    bdrv_make_anon(bs);

    bdrv_close(bs);
    if (bs->file != NULL) {
        bdrv_delete(bs->file);
    }

    assert(bs != bs_snapshots);
    g_free(bs);
}

int bdrv_attach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    if (bs->dev) {
        return -EBUSY;
    }
    bs->dev = dev;
    bdrv_iostatus_reset(bs);
    return 0;
}

/* TODO qdevified devices don't use this, remove when devices are qdevified */
void bdrv_attach_dev_nofail(BlockDriverState *bs, void *dev)
{
    if (bdrv_attach_dev(bs, dev) < 0) {
        abort();
    }
}

void bdrv_detach_dev(BlockDriverState *bs, void *dev)
/* TODO change to DeviceState *dev when all users are qdevified */
{
    assert(bs->dev == dev);
    bs->dev = NULL;
    bs->dev_ops = NULL;
    bs->dev_opaque = NULL;
    bs->buffer_alignment = 512;
}

/* TODO change to return DeviceState * when all users are qdevified */
void *bdrv_get_attached_dev(BlockDriverState *bs)
{
    return bs->dev;
}

void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
                      void *opaque)
{
    bs->dev_ops = ops;
    bs->dev_opaque = opaque;
    if (bdrv_dev_has_removable_media(bs) && bs == bs_snapshots) {
        bs_snapshots = NULL;
    }
}

static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load)
{
    if (bs->dev_ops && bs->dev_ops->change_media_cb) {
        bs->dev_ops->change_media_cb(bs->dev_opaque, load);
    }
}

bool bdrv_dev_has_removable_media(BlockDriverState *bs)
{
    return !bs->dev || (bs->dev_ops && bs->dev_ops->change_media_cb);
}

void bdrv_dev_eject_request(BlockDriverState *bs, bool force)
{
    if (bs->dev_ops && bs->dev_ops->eject_request_cb) {
        bs->dev_ops->eject_request_cb(bs->dev_opaque, force);
    }
}

bool bdrv_dev_is_tray_open(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_tray_open) {
        return bs->dev_ops->is_tray_open(bs->dev_opaque);
    }
    return false;
}

static void bdrv_dev_resize_cb(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->resize_cb) {
        bs->dev_ops->resize_cb(bs->dev_opaque);
    }
}

bool bdrv_dev_is_medium_locked(BlockDriverState *bs)
{
    if (bs->dev_ops && bs->dev_ops->is_medium_locked) {
        return bs->dev_ops->is_medium_locked(bs->dev_opaque);
    }
    return false;
}

/*
 * Run consistency checks on an image
 *
 * Returns 0 if the check could be completed (it doesn't mean that the image is
 * free of errors) or -errno when an internal error occurred. The results of the
 * check are stored in res.
 */
int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res)
{
    if (bs->drv->bdrv_check == NULL) {
        return -ENOTSUP;
    }

    memset(res, 0, sizeof(*res));
    return bs->drv->bdrv_check(bs, res);
}

#define COMMIT_BUF_SECTORS 2048

/* commit COW file into the raw image */
int bdrv_commit(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    BlockDriver *backing_drv;
    int64_t sector, total_sectors;
    int n, ro, open_flags;
    int ret = 0, rw_ret = 0;
    uint8_t *buf;
    char filename[1024];
    BlockDriverState *bs_rw, *bs_ro;

    if (!drv)
        return -ENOMEDIUM;

    if (!bs->backing_hd) {
        return -ENOTSUP;
    }

    if (bs->backing_hd->keep_read_only) {
        return -EACCES;
    }

    if (bdrv_in_use(bs) || bdrv_in_use(bs->backing_hd)) {
        return -EBUSY;
    }

    backing_drv = bs->backing_hd->drv;
    ro = bs->backing_hd->read_only;
    /* pstrcpy guarantees NUL termination, unlike strncpy */
    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
    open_flags = bs->backing_hd->open_flags;

    if (ro) {
        /* re-open as RW */
        bdrv_delete(bs->backing_hd);
        bs->backing_hd = NULL;
        bs_rw = bdrv_new("");
        rw_ret = bdrv_open(bs_rw, filename, open_flags | BDRV_O_RDWR,
                           backing_drv);
        if (rw_ret < 0) {
            bdrv_delete(bs_rw);
            /* try to re-open read-only */
            bs_ro = bdrv_new("");
            ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
                            backing_drv);
            if (ret < 0) {
                bdrv_delete(bs_ro);
                /* drive not functional anymore */
                bs->drv = NULL;
                return ret;
            }
            bs->backing_hd = bs_ro;
            return rw_ret;
        }
        bs->backing_hd = bs_rw;
    }

    total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
    buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);

    for (sector = 0; sector < total_sectors; sector += n) {
        if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) {

            if (bdrv_read(bs, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }

            if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) {
                ret = -EIO;
                goto ro_cleanup;
            }
        }
    }

    if (drv->bdrv_make_empty) {
        ret = drv->bdrv_make_empty(bs);
        bdrv_flush(bs);
    }

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    if (bs->backing_hd)
        bdrv_flush(bs->backing_hd);

ro_cleanup:
    g_free(buf);

    if (ro) {
        /* re-open as RO */
        bdrv_delete(bs->backing_hd);
        bs->backing_hd = NULL;
        bs_ro = bdrv_new("");
        ret = bdrv_open(bs_ro, filename, open_flags & ~BDRV_O_RDWR,
                        backing_drv);
        if (ret < 0) {
            bdrv_delete(bs_ro);
            /* drive not functional anymore */
            bs->drv = NULL;
            return ret;
        }
        bs->backing_hd = bs_ro;
        bs->backing_hd->keep_read_only = 0;
    }

    return ret;
}

void bdrv_commit_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        bdrv_commit(bs);
    }
}

struct BdrvTrackedRequest {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    bool is_write;
    QLIST_ENTRY(BdrvTrackedRequest) list;
    Coroutine *co; /* owner, used for deadlock detection */
    CoQueue wait_queue; /* coroutines blocked on this request */
};

/**
 * Remove an active request from the tracked requests list
 *
 * This function should be called when a tracked request is completing.
 */
static void tracked_request_end(BdrvTrackedRequest *req)
{
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

/**
 * Add an active request to the tracked requests list
 */
static void tracked_request_begin(BdrvTrackedRequest *req,
                                  BlockDriverState *bs,
                                  int64_t sector_num,
                                  int nb_sectors, bool is_write)
{
    *req = (BdrvTrackedRequest){
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .is_write = is_write,
        .co = qemu_coroutine_self(),
    };

    qemu_co_queue_init(&req->wait_queue);

    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
}

/**
 * Round a region to cluster boundaries
 */
static void round_to_clusters(BlockDriverState *bs,
                              int64_t sector_num, int nb_sectors,
                              int64_t *cluster_sector_num,
                              int *cluster_nb_sectors)
{
    BlockDriverInfo bdi;

    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
        *cluster_sector_num = sector_num;
        *cluster_nb_sectors = nb_sectors;
    } else {
        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
                                            nb_sectors, c);
    }
}
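
/* Worked example: with a 64 KiB cluster size, c = 65536 / 512 = 128
 * sectors, so a request for sectors [100, 150) is widened to the aligned
 * range [0, 256):
 *
 *     round_to_clusters(bs, 100, 50, &cluster_sector_num, &cluster_nb);
 *     // cluster_sector_num == 0, cluster_nb == 256
 *
 * Images whose driver reports no cluster size pass through unchanged.
 */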

static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                     int64_t sector_num, int nb_sectors) {
    /*        aaaa   bbbb */
    if (sector_num >= req->sector_num + req->nb_sectors) {
        return false;
    }
    /* bbbb   aaaa        */
    if (req->sector_num >= sector_num + nb_sectors) {
        return false;
    }
    return true;
}

static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors)
{
    BdrvTrackedRequest *req;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    bool retry;

    /* If we touch the same cluster it counts as an overlap.  This guarantees
     * that allocating writes will be serialized and not race with each other
     * for the same cluster.  For example, in copy-on-read it ensures that the
     * CoR read and write operations are atomic and guest writes cannot
     * interleave between them.
     */
    round_to_clusters(bs, sector_num, nb_sectors,
                      &cluster_sector_num, &cluster_nb_sectors);

    do {
        retry = false;
        QLIST_FOREACH(req, &bs->tracked_requests, list) {
            if (tracked_request_overlaps(req, cluster_sector_num,
                                         cluster_nb_sectors)) {
                /* Hitting this means there was a reentrant request, for
                 * example, a block driver issuing nested requests.  This must
                 * never happen since it means deadlock.
                 */
                assert(qemu_coroutine_self() != req->co);

                qemu_co_queue_wait(&req->wait_queue);
                retry = true;
                break;
            }
        }
    } while (retry);
}

/*
 * Return values:
 * 0        - success
 * -EINVAL  - backing format specified, but no file
 * -ENOSPC  - can't update the backing file because no space is left in the
 *            image file header
 * -ENOTSUP - format driver doesn't support changing the backing file
 */
int bdrv_change_backing_file(BlockDriverState *bs,
    const char *backing_file, const char *backing_fmt)
{
    BlockDriver *drv = bs->drv;

    if (drv->bdrv_change_backing_file != NULL) {
        return drv->bdrv_change_backing_file(bs, backing_file, backing_fmt);
    } else {
        return -ENOTSUP;
    }
}

static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
                                   size_t size)
{
    int64_t len;

    if (!bdrv_is_inserted(bs))
        return -ENOMEDIUM;

    if (bs->growable)
        return 0;

    len = bdrv_getlength(bs);

    if (offset < 0)
        return -EIO;

    if ((offset > len) || (len - offset < size))
        return -EIO;

    return 0;
}

static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                              int nb_sectors)
{
    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
                                   nb_sectors * BDRV_SECTOR_SIZE);
}

typedef struct RwCo {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    QEMUIOVector *qiov;
    bool is_write;
    int ret;
} RwCo;

static void coroutine_fn bdrv_rw_co_entry(void *opaque)
{
    RwCo *rwco = opaque;

    if (!rwco->is_write) {
        rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num,
                                     rwco->nb_sectors, rwco->qiov, 0);
    } else {
        rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
                                      rwco->nb_sectors, rwco->qiov);
    }
}

/*
 * Process a synchronous request using coroutines
 */
static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
                      int nb_sectors, bool is_write)
{
    QEMUIOVector qiov;
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
    };
    Coroutine *co;
    RwCo rwco = {
        .bs = bs,
        .sector_num = sector_num,
        .nb_sectors = nb_sectors,
        .qiov = &qiov,
        .is_write = is_write,
        .ret = NOT_DONE,
    };

    qemu_iovec_init_external(&qiov, &iov, 1);

    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        bdrv_rw_co_entry(&rwco);
    } else {
        co = qemu_coroutine_create(bdrv_rw_co_entry);
        qemu_coroutine_enter(co, &rwco);
        while (rwco.ret == NOT_DONE) {
            qemu_aio_wait();
        }
    }
    return rwco.ret;
}
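
/* Example: the synchronous wrappers below all funnel through bdrv_rw_co().
 * A typical caller (sketch):
 *
 *     uint8_t sector[BDRV_SECTOR_SIZE];
 *     if (bdrv_read(bs, 0, sector, 1) < 0) {
 *         return -EIO;
 *     }
 *
 * Outside coroutine context, bdrv_rw_co() spins in qemu_aio_wait() until
 * the coroutine completes and rwco.ret leaves NOT_DONE.
 */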

/* return < 0 if error. See bdrv_write() for the return codes */
int bdrv_read(BlockDriverState *bs, int64_t sector_num,
              uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false);
}

static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
                             int nb_sectors, int dirty)
{
    int64_t start, end;
    unsigned long val, idx, bit;

    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;

    for (; start <= end; start++) {
        idx = start / (sizeof(unsigned long) * 8);
        bit = start % (sizeof(unsigned long) * 8);
        val = bs->dirty_bitmap[idx];
        if (dirty) {
            if (!(val & (1UL << bit))) {
                bs->dirty_count++;
                val |= 1UL << bit;
            }
        } else {
            if (val & (1UL << bit)) {
                bs->dirty_count--;
                val &= ~(1UL << bit);
            }
        }
        bs->dirty_bitmap[idx] = val;
    }
}
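
/* Worked example, assuming BDRV_SECTORS_PER_DIRTY_CHUNK is 2048 (1 MiB
 * chunks) and 64-bit longs: marking sectors [4096, 6144) dirty touches
 * exactly chunk 2, i.e. word 0, bit 2 of the bitmap:
 *
 *     set_dirty_bitmap(bs, 4096, 2048, 1);
 *     // start = 4096 / 2048 = 2, end = (4096 + 2048 - 1) / 2048 = 2
 *     // idx = 2 / 64 = 0, bit = 2 % 64 = 2
 *     // dirty_bitmap[0] |= 1UL << 2, dirty_count++
 */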

/* Return < 0 if error. Important errors are:
  -EIO         generic I/O error (may happen for all errors)
  -ENOMEDIUM   No media inserted.
  -EINVAL      Invalid sector number or nb_sectors
  -EACCES      Trying to write a read-only device
*/
int bdrv_write(BlockDriverState *bs, int64_t sector_num,
               const uint8_t *buf, int nb_sectors)
{
    return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true);
}

int bdrv_pread(BlockDriverState *bs, int64_t offset,
               void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first read to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len);
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* read the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(buf, tmp_buf, count);
    }
    return count1;
}

int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                const void *buf, int count1)
{
    uint8_t tmp_buf[BDRV_SECTOR_SIZE];
    int len, nb_sectors, count;
    int64_t sector_num;
    int ret;

    count = count1;
    /* first write to align to sector start */
    len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1);
    if (len > count)
        len = count;
    sector_num = offset >> BDRV_SECTOR_BITS;
    if (len > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), buf, len);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        count -= len;
        if (count == 0)
            return count1;
        sector_num++;
        buf += len;
    }

    /* write the sectors "in place" */
    nb_sectors = count >> BDRV_SECTOR_BITS;
    if (nb_sectors > 0) {
        if ((ret = bdrv_write(bs, sector_num, buf, nb_sectors)) < 0)
            return ret;
        sector_num += nb_sectors;
        len = nb_sectors << BDRV_SECTOR_BITS;
        buf += len;
        count -= len;
    }

    /* add data from the last sector */
    if (count > 0) {
        if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
        memcpy(tmp_buf, buf, count);
        if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0)
            return ret;
    }
    return count1;
}

/*
 * Writes to the file and ensures that no writes are reordered across this
 * request (acts as a barrier)
 *
 * Returns 0 on success, -errno in error cases.
 */
int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
    const void *buf, int count)
{
    int ret;

    ret = bdrv_pwrite(bs, offset, buf, count);
    if (ret < 0) {
        return ret;
    }

    /* No flush needed for cache modes that use O_DSYNC */
    if ((bs->open_flags & BDRV_O_CACHE_WB) != 0) {
        bdrv_flush(bs);
    }

    return 0;
}
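
/* Example: metadata updates that must not be reordered with later writes
 * go through bdrv_pwrite_sync(). A sketch in the style of a format driver
 * updating an on-disk table entry (offsets and names invented):
 *
 *     uint64_t entry = cpu_to_be64(new_offset);
 *     ret = bdrv_pwrite_sync(bs->file, table_offset + 8 * index,
 *                            &entry, sizeof(entry));
 *
 * The explicit bdrv_flush() is only issued for writeback modes; cache
 * modes without BDRV_O_CACHE_WB already write through with O_DSYNC
 * semantics.
 */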

static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    /* Perform I/O through a temporary buffer so that users who scribble over
     * their read buffer while the operation is in progress do not end up
     * modifying the image file.  This is critical for zero-copy guest I/O
     * where anything might happen inside guest memory.
     */
    void *bounce_buffer;

    struct iovec iov;
    QEMUIOVector bounce_qiov;
    int64_t cluster_sector_num;
    int cluster_nb_sectors;
    size_t skip_bytes;
    int ret;

    /* Cover entire cluster so no additional backing file I/O is required when
     * allocating cluster in the image file.
     */
    round_to_clusters(bs, sector_num, nb_sectors,
                      &cluster_sector_num, &cluster_nb_sectors);

    trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors,
                                   cluster_sector_num, cluster_nb_sectors);

    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
    iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
    qemu_iovec_init_external(&bounce_qiov, &iov, 1);

    ret = bs->drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
                                 &bounce_qiov);
    if (ret < 0) {
        goto err;
    }

    ret = bs->drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
                                  &bounce_qiov);
    if (ret < 0) {
        /* It might be okay to ignore write errors for guest requests.  If this
         * is a deliberate copy-on-read then we don't want to ignore the error.
         * Simply report it in all cases.
         */
        goto err;
    }

    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
    qemu_iovec_from_buffer(qiov, bounce_buffer + skip_bytes,
                           nb_sectors * BDRV_SECTOR_SIZE);

err:
    qemu_vfree(bounce_buffer);
    return ret;
}

/*
 * Handle a read request in coroutine context
 */
static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
    BdrvRequestFlags flags)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!drv) {
        return -ENOMEDIUM;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* throttling disk read I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, false, nb_sectors);
    }

    if (bs->copy_on_read) {
        flags |= BDRV_REQ_COPY_ON_READ;
    }
    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight++;
    }

    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, false);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        int pnum;

        ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
        if (ret < 0) {
            goto out;
        }

        if (!ret || pnum != nb_sectors) {
            ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov);
            goto out;
        }
    }

    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);

out:
    tracked_request_end(&req);

    if (flags & BDRV_REQ_COPY_ON_READ) {
        bs->copy_on_read_in_flight--;
    }

    return ret;
}

int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0);
}

int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors);

    return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov,
                            BDRV_REQ_COPY_ON_READ);
}

/*
 * Handle a write request in coroutine context
 */
static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
{
    BlockDriver *drv = bs->drv;
    BdrvTrackedRequest req;
    int ret;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }
    if (bs->read_only) {
        return -EACCES;
    }
    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
        return -EIO;
    }

    /* throttling disk write I/O */
    if (bs->io_limits_enabled) {
        bdrv_io_limits_intercept(bs, true, nb_sectors);
    }

    if (bs->copy_on_read_in_flight) {
        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
    }

    tracked_request_begin(&req, bs, sector_num, nb_sectors, true);

    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);

    if (bs->dirty_bitmap) {
        set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
    }

    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
        bs->wr_highest_sector = sector_num + nb_sectors - 1;
    }

    tracked_request_end(&req);

    return ret;
}

int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
    int nb_sectors, QEMUIOVector *qiov)
{
    trace_bdrv_co_writev(bs, sector_num, nb_sectors);

    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov);
}

/**
 * Truncate file to 'offset' bytes (needed only for file protocols)
 */
int bdrv_truncate(BlockDriverState *bs, int64_t offset)
{
    BlockDriver *drv = bs->drv;
    int ret;
    if (!drv)
        return -ENOMEDIUM;
    if (!drv->bdrv_truncate)
        return -ENOTSUP;
    if (bs->read_only)
        return -EACCES;
    if (bdrv_in_use(bs))
        return -EBUSY;
    ret = drv->bdrv_truncate(bs, offset);
    if (ret == 0) {
        ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
        bdrv_dev_resize_cb(bs);
    }
    return ret;
}

/**
 * Length of an allocated file in bytes. Sparse files are counted by actual
 * allocated space. Return < 0 if error or unknown.
 */
int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv) {
        return -ENOMEDIUM;
    }
    if (drv->bdrv_get_allocated_file_size) {
        return drv->bdrv_get_allocated_file_size(bs);
    }
    if (bs->file) {
        return bdrv_get_allocated_file_size(bs->file);
    }
    return -ENOTSUP;
}

/**
 * Length of a file in bytes. Return < 0 if error or unknown.
 */
int64_t bdrv_getlength(BlockDriverState *bs)
{
    BlockDriver *drv = bs->drv;
    if (!drv)
        return -ENOMEDIUM;

    if (bs->growable || bdrv_dev_has_removable_media(bs)) {
        if (drv->bdrv_getlength) {
            return drv->bdrv_getlength(bs);
        }
    }
    return bs->total_sectors * BDRV_SECTOR_SIZE;
}

/* return 0 as number of sectors if no device present or error */
void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
{
    int64_t length;
    length = bdrv_getlength(bs);
    if (length < 0)
        length = 0;
    else
        length = length >> BDRV_SECTOR_BITS;
    *nb_sectors_ptr = length;
}

struct partition {
    uint8_t boot_ind;           /* 0x80 - active */
    uint8_t head;               /* starting head */
    uint8_t sector;             /* starting sector */
    uint8_t cyl;                /* starting cylinder */
    uint8_t sys_ind;            /* What partition type */
    uint8_t end_head;           /* end head */
    uint8_t end_sector;         /* end sector */
    uint8_t end_cyl;            /* end cylinder */
    uint32_t start_sect;        /* starting sector counting from 0 */
    uint32_t nr_sects;          /* nr of sectors in partition */
} QEMU_PACKED;

/* Try to guess the disk logical geometry from the MS-DOS partition table.
   Return 0 if OK, -1 if it could not be guessed. */
static int guess_disk_lchs(BlockDriverState *bs,
                           int *pcylinders, int *pheads, int *psectors)
{
    uint8_t buf[BDRV_SECTOR_SIZE];
    int ret, i, heads, sectors, cylinders;
    struct partition *p;
    uint32_t nr_sects;
    uint64_t nb_sectors;

    bdrv_get_geometry(bs, &nb_sectors);

    ret = bdrv_read(bs, 0, buf, 1);
    if (ret < 0)
        return -1;
    /* test msdos magic */
    if (buf[510] != 0x55 || buf[511] != 0xaa)
        return -1;
    for (i = 0; i < 4; i++) {
        p = ((struct partition *)(buf + 0x1be)) + i;
        nr_sects = le32_to_cpu(p->nr_sects);
        if (nr_sects && p->end_head) {
            /* We make the assumption that the partition terminates on
               a cylinder boundary */
            heads = p->end_head + 1;
            sectors = p->end_sector & 63;
            if (sectors == 0)
                continue;
            cylinders = nb_sectors / (heads * sectors);
            if (cylinders < 1 || cylinders > 16383)
                continue;
            *pheads = heads;
            *psectors = sectors;
            *pcylinders = cylinders;
#if 0
            printf("guessed geometry: LCHS=%d %d %d\n",
                   cylinders, heads, sectors);
#endif
            return 0;
        }
    }
    return -1;
}
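
/* Worked example: a 1 GiB image has 2097152 sectors. If its first
 * partition ends on head 15, sector 63, then heads = 16, sectors = 63 and
 * cylinders = 2097152 / (16 * 63) = 2080, which passes the 1..16383 sanity
 * check above, so the guessed logical geometry is LCHS = 2080/16/63.
 */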

void bdrv_guess_geometry(BlockDriverState *bs, int *pcyls, int *pheads, int *psecs)
{
    int translation, lba_detected = 0;
    int cylinders, heads, secs;
    uint64_t nb_sectors;

    /* if a geometry hint is available, use it */
    bdrv_get_geometry(bs, &nb_sectors);
    bdrv_get_geometry_hint(bs, &cylinders, &heads, &secs);
    translation = bdrv_get_translation_hint(bs);
    if (cylinders != 0) {
        *pcyls = cylinders;
        *pheads = heads;
        *psecs = secs;
    } else {
        if (guess_disk_lchs(bs, &cylinders, &heads, &secs) == 0) {
            if (heads > 16) {
                /* if heads > 16, it means that a BIOS LBA
                   translation was active, so the default
                   hardware geometry is OK */
                lba_detected = 1;
                goto default_geometry;
            } else {
                *pcyls = cylinders;
                *pheads = heads;
                *psecs = secs;
                /* disable any translation to be in sync with
                   the logical geometry */
                if (translation == BIOS_ATA_TRANSLATION_AUTO) {
                    bdrv_set_translation_hint(bs,
                                              BIOS_ATA_TRANSLATION_NONE);
                }
            }
        } else {
        default_geometry:
            /* if no geometry, use a standard physical disk geometry */
            cylinders = nb_sectors / (16 * 63);

            if (cylinders > 16383)
                cylinders = 16383;
            else if (cylinders < 2)
                cylinders = 2;
            *pcyls = cylinders;
            *pheads = 16;
            *psecs = 63;
            if ((lba_detected == 1) && (translation == BIOS_ATA_TRANSLATION_AUTO)) {
                if ((*pcyls * *pheads) <= 131072) {
                    bdrv_set_translation_hint(bs,
                                              BIOS_ATA_TRANSLATION_LARGE);
                } else {
                    bdrv_set_translation_hint(bs,
                                              BIOS_ATA_TRANSLATION_LBA);
                }
            }
        }
        bdrv_set_geometry_hint(bs, *pcyls, *pheads, *psecs);
    }
}

void bdrv_set_geometry_hint(BlockDriverState *bs,
                            int cyls, int heads, int secs)
{
    bs->cyls = cyls;
    bs->heads = heads;
    bs->secs = secs;
}

void bdrv_set_translation_hint(BlockDriverState *bs, int translation)
{
    bs->translation = translation;
}

void bdrv_get_geometry_hint(BlockDriverState *bs,
                            int *pcyls, int *pheads, int *psecs)
{
    *pcyls = bs->cyls;
    *pheads = bs->heads;
    *psecs = bs->secs;
}

/* throttling disk io limits */
void bdrv_set_io_limits(BlockDriverState *bs,
                        BlockIOLimit *io_limits)
{
    bs->io_limits = *io_limits;
    bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
}

/* Recognize floppy formats */
typedef struct FDFormat {
    FDriveType drive;
    uint8_t last_sect;
    uint8_t max_track;
    uint8_t max_head;
} FDFormat;

static const FDFormat fd_formats[] = {
    /* First entry is default format */
    /* 1.44 MB 3"1/2 floppy disks */
    { FDRIVE_DRV_144, 18, 80, 1, },
    { FDRIVE_DRV_144, 20, 80, 1, },
    { FDRIVE_DRV_144, 21, 80, 1, },
    { FDRIVE_DRV_144, 21, 82, 1, },
    { FDRIVE_DRV_144, 21, 83, 1, },
    { FDRIVE_DRV_144, 22, 80, 1, },
    { FDRIVE_DRV_144, 23, 80, 1, },
    { FDRIVE_DRV_144, 24, 80, 1, },
    /* 2.88 MB 3"1/2 floppy disks */
    { FDRIVE_DRV_288, 36, 80, 1, },
    { FDRIVE_DRV_288, 39, 80, 1, },
    { FDRIVE_DRV_288, 40, 80, 1, },
    { FDRIVE_DRV_288, 44, 80, 1, },
    { FDRIVE_DRV_288, 48, 80, 1, },
    /* 720 kB 3"1/2 floppy disks */
    { FDRIVE_DRV_144,  9, 80, 1, },
    { FDRIVE_DRV_144, 10, 80, 1, },
    { FDRIVE_DRV_144, 10, 82, 1, },
    { FDRIVE_DRV_144, 10, 83, 1, },
    { FDRIVE_DRV_144, 13, 80, 1, },
    { FDRIVE_DRV_144, 14, 80, 1, },
    /* 1.2 MB 5"1/4 floppy disks */
    { FDRIVE_DRV_120, 15, 80, 1, },
    { FDRIVE_DRV_120, 18, 80, 1, },
    { FDRIVE_DRV_120, 18, 82, 1, },
    { FDRIVE_DRV_120, 18, 83, 1, },
    { FDRIVE_DRV_120, 20, 80, 1, },
    /* 720 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  9, 80, 1, },
    { FDRIVE_DRV_120, 11, 80, 1, },
    /* 360 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  9, 40, 1, },
    { FDRIVE_DRV_120,  9, 40, 0, },
    { FDRIVE_DRV_120, 10, 41, 1, },
    { FDRIVE_DRV_120, 10, 42, 1, },
    /* 320 kB 5"1/4 floppy disks */
    { FDRIVE_DRV_120,  8, 40, 1, },
    { FDRIVE_DRV_120,  8, 40, 0, },
    /* 360 kB must match 5"1/4 better than 3"1/2... */
    { FDRIVE_DRV_144,  9, 80, 0, },
    /* end */
    { FDRIVE_DRV_NONE, -1, -1, 0, },
};

void bdrv_get_floppy_geometry_hint(BlockDriverState *bs, int *nb_heads,
                                   int *max_track, int *last_sect,
                                   FDriveType drive_in, FDriveType *drive)
{
    const FDFormat *parse;
    uint64_t nb_sectors, size;
    int i, first_match, match;

    bdrv_get_geometry_hint(bs, nb_heads, max_track, last_sect);
    if (*nb_heads != 0 && *max_track != 0 && *last_sect != 0) {
        /* User defined disk */
    } else {
        bdrv_get_geometry(bs, &nb_sectors);
        match = -1;
        first_match = -1;
        for (i = 0; ; i++) {
            parse = &fd_formats[i];
            if (parse->drive == FDRIVE_DRV_NONE) {
                break;
            }
            if (drive_in == parse->drive ||
                drive_in == FDRIVE_DRV_NONE) {
                size = (parse->max_head + 1) * parse->max_track *
                    parse->last_sect;
                if (nb_sectors == size) {
                    match = i;
                    break;
                }
                if (first_match == -1) {
                    first_match = i;
                }
            }
        }
        if (match == -1) {
            if (first_match == -1) {
                match = 1;
            } else {
                match = first_match;
            }
            parse = &fd_formats[match];
        }
        *nb_heads = parse->max_head + 1;
        *max_track = parse->max_track;
        *last_sect = parse->last_sect;
        *drive = parse->drive;
    }
}

int bdrv_get_translation_hint(BlockDriverState *bs)
{
    return bs->translation;
}

void bdrv_set_on_error(BlockDriverState *bs, BlockErrorAction on_read_error,
                       BlockErrorAction on_write_error)
{
    bs->on_read_error = on_read_error;
    bs->on_write_error = on_write_error;
}

BlockErrorAction bdrv_get_on_error(BlockDriverState *bs, int is_read)
{
    return is_read ? bs->on_read_error : bs->on_write_error;
}

int bdrv_is_read_only(BlockDriverState *bs)
{
    return bs->read_only;
}

int bdrv_is_sg(BlockDriverState *bs)
{
    return bs->sg;
}

int bdrv_enable_write_cache(BlockDriverState *bs)
{
    return bs->enable_write_cache;
}

int bdrv_is_encrypted(BlockDriverState *bs)
{
    if (bs->backing_hd && bs->backing_hd->encrypted)
        return 1;
    return bs->encrypted;
}

int bdrv_key_required(BlockDriverState *bs)
{
    BlockDriverState *backing_hd = bs->backing_hd;

    if (backing_hd && backing_hd->encrypted && !backing_hd->valid_key)
        return 1;
    return (bs->encrypted && !bs->valid_key);
}

int bdrv_set_key(BlockDriverState *bs, const char *key)
{
    int ret;
    if (bs->backing_hd && bs->backing_hd->encrypted) {
        ret = bdrv_set_key(bs->backing_hd, key);
        if (ret < 0)
            return ret;
        if (!bs->encrypted)
            return 0;
    }
    if (!bs->encrypted) {
        return -EINVAL;
    } else if (!bs->drv || !bs->drv->bdrv_set_key) {
        return -ENOMEDIUM;
    }
    ret = bs->drv->bdrv_set_key(bs, key);
    if (ret < 0) {
        bs->valid_key = 0;
    } else if (!bs->valid_key) {
        bs->valid_key = 1;
        /* call the change callback now, we skipped it on open */
        bdrv_dev_change_media_cb(bs, true);
    }
    return ret;
}

void bdrv_get_format(BlockDriverState *bs, char *buf, int buf_size)
{
    if (!bs->drv) {
        buf[0] = '\0';
    } else {
        pstrcpy(buf, buf_size, bs->drv->format_name);
    }
}

void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
                         void *opaque)
{
    BlockDriver *drv;

    QLIST_FOREACH(drv, &bdrv_drivers, list) {
        it(opaque, drv->format_name);
    }
}

BlockDriverState *bdrv_find(const char *name)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        if (!strcmp(name, bs->device_name)) {
            return bs;
        }
    }
    return NULL;
}

BlockDriverState *bdrv_next(BlockDriverState *bs)
{
    if (!bs) {
        return QTAILQ_FIRST(&bdrv_states);
    }
    return QTAILQ_NEXT(bs, list);
}

void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        it(opaque, bs);
    }
}

const char *bdrv_get_device_name(BlockDriverState *bs)
{
    return bs->device_name;
}

void bdrv_flush_all(void)
{
    BlockDriverState *bs;

    QTAILQ_FOREACH(bs, &bdrv_states, list) {
        if (!bdrv_is_read_only(bs) && bdrv_is_inserted(bs)) {
            bdrv_flush(bs);
        }
    }
}

int bdrv_has_zero_init(BlockDriverState *bs)
{
    assert(bs->drv);

    if (bs->drv->bdrv_has_zero_init) {
        return bs->drv->bdrv_has_zero_init(bs);
    }

    return 1;
}

typedef struct BdrvCoIsAllocatedData {
    BlockDriverState *bs;
    int64_t sector_num;
    int nb_sectors;
    int *pnum;
    int ret;
    bool done;
} BdrvCoIsAllocatedData;

/*
 * Returns true iff the specified sector is present in the disk image. Drivers
 * not implementing the functionality are assumed to not support backing files,
 * hence all their sectors are reported as allocated.
 *
 * If 'sector_num' is beyond the end of the disk image the return value is 0
 * and 'pnum' is set to 0.
 *
 * 'pnum' is set to the number of sectors (including and immediately following
 * the specified sector) that are known to be in the same
 * allocated/unallocated state.
 *
 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
 * beyond the end of the disk image it will be clamped.
 */
int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, int *pnum)
{
    int64_t n;

    if (sector_num >= bs->total_sectors) {
        *pnum = 0;
        return 0;
    }

    n = bs->total_sectors - sector_num;
    if (n < nb_sectors) {
        nb_sectors = n;
    }

    if (!bs->drv->bdrv_co_is_allocated) {
        *pnum = nb_sectors;
        return 1;
    }

    return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum);
}
2210
2211 /* Coroutine entry point for the synchronous bdrv_is_allocated() wrapper */
2212 static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque)
2213 {
2214 BdrvCoIsAllocatedData *data = opaque;
2215 BlockDriverState *bs = data->bs;
2216
2217 data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors,
2218 data->pnum);
2219 data->done = true;
2220 }
2221
2222 /*
2223 * Synchronous wrapper around bdrv_co_is_allocated().
2224 *
2225 * See bdrv_co_is_allocated() for details.
2226 */
2227 int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
2228 int *pnum)
2229 {
2230 Coroutine *co;
2231 BdrvCoIsAllocatedData data = {
2232 .bs = bs,
2233 .sector_num = sector_num,
2234 .nb_sectors = nb_sectors,
2235 .pnum = pnum,
2236 .done = false,
2237 };
2238
2239 co = qemu_coroutine_create(bdrv_is_allocated_co_entry);
2240 qemu_coroutine_enter(co, &data);
2241 while (!data.done) {
2242 qemu_aio_wait();
2243 }
2244 return data.ret;
2245 }
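#if 0
/*
 * Illustrative sketch (editorial addition, not part of the file): walking
 * an image with bdrv_is_allocated(). The helper name and the use of
 * printf() are assumptions made purely for illustration.
 */
static void example_dump_allocation_map(BlockDriverState *bs)
{
    int64_t sector_num = 0;
    int pnum;

    while (sector_num < bs->total_sectors) {
        /* pnum is clamped at the end of the image, see above */
        int ret = bdrv_is_allocated(bs, sector_num, 1024, &pnum);
        printf("sectors %" PRId64 "+%d: %s\n", sector_num, pnum,
               ret ? "allocated" : "unallocated");
        sector_num += pnum;
    }
}
#endif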
2246
2247 void bdrv_mon_event(const BlockDriverState *bdrv,
2248 BlockMonEventAction action, int is_read)
2249 {
2250 QObject *data;
2251 const char *action_str;
2252
2253 switch (action) {
2254 case BDRV_ACTION_REPORT:
2255 action_str = "report";
2256 break;
2257 case BDRV_ACTION_IGNORE:
2258 action_str = "ignore";
2259 break;
2260 case BDRV_ACTION_STOP:
2261 action_str = "stop";
2262 break;
2263 default:
2264 abort();
2265 }
2266
2267 data = qobject_from_jsonf("{ 'device': %s, 'action': %s, 'operation': %s }",
2268 bdrv->device_name,
2269 action_str,
2270 is_read ? "read" : "write");
2271 monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);
2272
2273 qobject_decref(data);
2274 }
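/*
 * Example (editorial addition): a write error on a hypothetical drive
 * "ide0-hd0" with werror=stop would be reported to QMP clients roughly as
 *   { "event": "BLOCK_IO_ERROR",
 *     "data": { "device": "ide0-hd0", "action": "stop",
 *               "operation": "write" } }
 */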
2275
2276 BlockInfoList *qmp_query_block(Error **errp)
2277 {
2278 BlockInfoList *head = NULL, *cur_item = NULL;
2279 BlockDriverState *bs;
2280
2281 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2282 BlockInfoList *info = g_malloc0(sizeof(*info));
2283
2284 info->value = g_malloc0(sizeof(*info->value));
2285 info->value->device = g_strdup(bs->device_name);
2286 info->value->type = g_strdup("unknown");
2287 info->value->locked = bdrv_dev_is_medium_locked(bs);
2288 info->value->removable = bdrv_dev_has_removable_media(bs);
2289
2290 if (bdrv_dev_has_removable_media(bs)) {
2291 info->value->has_tray_open = true;
2292 info->value->tray_open = bdrv_dev_is_tray_open(bs);
2293 }
2294
2295 if (bdrv_iostatus_is_enabled(bs)) {
2296 info->value->has_io_status = true;
2297 info->value->io_status = bs->iostatus;
2298 }
2299
2300 if (bs->drv) {
2301 info->value->has_inserted = true;
2302 info->value->inserted = g_malloc0(sizeof(*info->value->inserted));
2303 info->value->inserted->file = g_strdup(bs->filename);
2304 info->value->inserted->ro = bs->read_only;
2305 info->value->inserted->drv = g_strdup(bs->drv->format_name);
2306 info->value->inserted->encrypted = bs->encrypted;
2307 if (bs->backing_file[0]) {
2308 info->value->inserted->has_backing_file = true;
2309 info->value->inserted->backing_file = g_strdup(bs->backing_file);
2310 }
2311
2312 if (bs->io_limits_enabled) {
2313 info->value->inserted->bps =
2314 bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
2315 info->value->inserted->bps_rd =
2316 bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
2317 info->value->inserted->bps_wr =
2318 bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
2319 info->value->inserted->iops =
2320 bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
2321 info->value->inserted->iops_rd =
2322 bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
2323 info->value->inserted->iops_wr =
2324 bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
2325 }
2326 }
2327
2328 /* XXX: waiting for the qapi to support GSList */
2329 if (!cur_item) {
2330 head = cur_item = info;
2331 } else {
2332 cur_item->next = info;
2333 cur_item = info;
2334 }
2335 }
2336
2337 return head;
2338 }
2339
2340 /* Consider exposing this as a full-fledged QMP command */
2341 static BlockStats *qmp_query_blockstat(const BlockDriverState *bs, Error **errp)
2342 {
2343 BlockStats *s;
2344
2345 s = g_malloc0(sizeof(*s));
2346
2347 if (bs->device_name[0]) {
2348 s->has_device = true;
2349 s->device = g_strdup(bs->device_name);
2350 }
2351
2352 s->stats = g_malloc0(sizeof(*s->stats));
2353 s->stats->rd_bytes = bs->nr_bytes[BDRV_ACCT_READ];
2354 s->stats->wr_bytes = bs->nr_bytes[BDRV_ACCT_WRITE];
2355 s->stats->rd_operations = bs->nr_ops[BDRV_ACCT_READ];
2356 s->stats->wr_operations = bs->nr_ops[BDRV_ACCT_WRITE];
2357 s->stats->wr_highest_offset = bs->wr_highest_sector * BDRV_SECTOR_SIZE;
2358 s->stats->flush_operations = bs->nr_ops[BDRV_ACCT_FLUSH];
2359 s->stats->wr_total_time_ns = bs->total_time_ns[BDRV_ACCT_WRITE];
2360 s->stats->rd_total_time_ns = bs->total_time_ns[BDRV_ACCT_READ];
2361 s->stats->flush_total_time_ns = bs->total_time_ns[BDRV_ACCT_FLUSH];
2362
2363 if (bs->file) {
2364 s->has_parent = true;
2365 s->parent = qmp_query_blockstat(bs->file, NULL);
2366 }
2367
2368 return s;
2369 }
2370
2371 BlockStatsList *qmp_query_blockstats(Error **errp)
2372 {
2373 BlockStatsList *head = NULL, *cur_item = NULL;
2374 BlockDriverState *bs;
2375
2376 QTAILQ_FOREACH(bs, &bdrv_states, list) {
2377 BlockStatsList *info = g_malloc0(sizeof(*info));
2378 info->value = qmp_query_blockstat(bs, NULL);
2379
2380 /* XXX: waiting for the qapi to support GSList */
2381 if (!cur_item) {
2382 head = cur_item = info;
2383 } else {
2384 cur_item->next = info;
2385 cur_item = info;
2386 }
2387 }
2388
2389 return head;
2390 }
2391
2392 const char *bdrv_get_encrypted_filename(BlockDriverState *bs)
2393 {
2394 if (bs->backing_hd && bs->backing_hd->encrypted)
2395 return bs->backing_file;
2396 else if (bs->encrypted)
2397 return bs->filename;
2398 else
2399 return NULL;
2400 }
2401
2402 void bdrv_get_backing_filename(BlockDriverState *bs,
2403 char *filename, int filename_size)
2404 {
2405 pstrcpy(filename, filename_size, bs->backing_file);
2406 }
2407
2408 int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
2409 const uint8_t *buf, int nb_sectors)
2410 {
2411 BlockDriver *drv = bs->drv;
2412 if (!drv)
2413 return -ENOMEDIUM;
2414 if (!drv->bdrv_write_compressed)
2415 return -ENOTSUP;
2416 if (bdrv_check_request(bs, sector_num, nb_sectors))
2417 return -EIO;
2418
2419 if (bs->dirty_bitmap) {
2420 set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
2421 }
2422
2423 return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
2424 }
2425
2426 int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
2427 {
2428 BlockDriver *drv = bs->drv;
2429 if (!drv)
2430 return -ENOMEDIUM;
2431 if (!drv->bdrv_get_info)
2432 return -ENOTSUP;
2433 memset(bdi, 0, sizeof(*bdi));
2434 return drv->bdrv_get_info(bs, bdi);
2435 }
2436
2437 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2438 int64_t pos, int size)
2439 {
2440 BlockDriver *drv = bs->drv;
2441 if (!drv)
2442 return -ENOMEDIUM;
2443 if (drv->bdrv_save_vmstate)
2444 return drv->bdrv_save_vmstate(bs, buf, pos, size);
2445 if (bs->file)
2446 return bdrv_save_vmstate(bs->file, buf, pos, size);
2447 return -ENOTSUP;
2448 }
2449
2450 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2451 int64_t pos, int size)
2452 {
2453 BlockDriver *drv = bs->drv;
2454 if (!drv)
2455 return -ENOMEDIUM;
2456 if (drv->bdrv_load_vmstate)
2457 return drv->bdrv_load_vmstate(bs, buf, pos, size);
2458 if (bs->file)
2459 return bdrv_load_vmstate(bs->file, buf, pos, size);
2460 return -ENOTSUP;
2461 }
2462
2463 void bdrv_debug_event(BlockDriverState *bs, BlkDebugEvent event)
2464 {
2465 BlockDriver *drv = bs->drv;
2466
2467 if (!drv || !drv->bdrv_debug_event) {
2468 return;
2469 }
2470
2471 drv->bdrv_debug_event(bs, event);
2472 }
2474
2475 /**************************************************************/
2476 /* handling of snapshots */
2477
2478 int bdrv_can_snapshot(BlockDriverState *bs)
2479 {
2480 BlockDriver *drv = bs->drv;
2481 if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
2482 return 0;
2483 }
2484
2485 if (!drv->bdrv_snapshot_create) {
2486 if (bs->file != NULL) {
2487 return bdrv_can_snapshot(bs->file);
2488 }
2489 return 0;
2490 }
2491
2492 return 1;
2493 }
2494
2495 int bdrv_is_snapshot(BlockDriverState *bs)
2496 {
2497 return !!(bs->open_flags & BDRV_O_SNAPSHOT);
2498 }
2499
2500 BlockDriverState *bdrv_snapshots(void)
2501 {
2502 BlockDriverState *bs;
2503
2504 if (bs_snapshots) {
2505 return bs_snapshots;
2506 }
2507
2508 bs = NULL;
2509 while ((bs = bdrv_next(bs))) {
2510 if (bdrv_can_snapshot(bs)) {
2511 bs_snapshots = bs;
2512 return bs;
2513 }
2514 }
2515 return NULL;
2516 }
2517
2518 int bdrv_snapshot_create(BlockDriverState *bs,
2519 QEMUSnapshotInfo *sn_info)
2520 {
2521 BlockDriver *drv = bs->drv;
2522 if (!drv)
2523 return -ENOMEDIUM;
2524 if (drv->bdrv_snapshot_create)
2525 return drv->bdrv_snapshot_create(bs, sn_info);
2526 if (bs->file)
2527 return bdrv_snapshot_create(bs->file, sn_info);
2528 return -ENOTSUP;
2529 }
2530
2531 int bdrv_snapshot_goto(BlockDriverState *bs,
2532 const char *snapshot_id)
2533 {
2534 BlockDriver *drv = bs->drv;
2535 int ret, open_ret;
2536
2537 if (!drv)
2538 return -ENOMEDIUM;
2539 if (drv->bdrv_snapshot_goto)
2540 return drv->bdrv_snapshot_goto(bs, snapshot_id);
2541
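/*
 * The snapshot lives in the underlying file: close the format layer,
 * revert bs->file to the snapshot, then re-open the format driver on
 * top of the reverted image.
 */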
2542 if (bs->file) {
2543 drv->bdrv_close(bs);
2544 ret = bdrv_snapshot_goto(bs->file, snapshot_id);
2545 open_ret = drv->bdrv_open(bs, bs->open_flags);
2546 if (open_ret < 0) {
2547 bdrv_delete(bs->file);
2548 bs->drv = NULL;
2549 return open_ret;
2550 }
2551 return ret;
2552 }
2553
2554 return -ENOTSUP;
2555 }
2556
2557 int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
2558 {
2559 BlockDriver *drv = bs->drv;
2560 if (!drv)
2561 return -ENOMEDIUM;
2562 if (drv->bdrv_snapshot_delete)
2563 return drv->bdrv_snapshot_delete(bs, snapshot_id);
2564 if (bs->file)
2565 return bdrv_snapshot_delete(bs->file, snapshot_id);
2566 return -ENOTSUP;
2567 }
2568
2569 int bdrv_snapshot_list(BlockDriverState *bs,
2570 QEMUSnapshotInfo **psn_info)
2571 {
2572 BlockDriver *drv = bs->drv;
2573 if (!drv)
2574 return -ENOMEDIUM;
2575 if (drv->bdrv_snapshot_list)
2576 return drv->bdrv_snapshot_list(bs, psn_info);
2577 if (bs->file)
2578 return bdrv_snapshot_list(bs->file, psn_info);
2579 return -ENOTSUP;
2580 }
2581
2582 int bdrv_snapshot_load_tmp(BlockDriverState *bs,
2583 const char *snapshot_name)
2584 {
2585 BlockDriver *drv = bs->drv;
2586 if (!drv) {
2587 return -ENOMEDIUM;
2588 }
2589 if (!bs->read_only) {
2590 return -EINVAL;
2591 }
2592 if (drv->bdrv_snapshot_load_tmp) {
2593 return drv->bdrv_snapshot_load_tmp(bs, snapshot_name);
2594 }
2595 return -ENOTSUP;
2596 }
2597
2598 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
2599 const char *backing_file)
2600 {
2601 if (!bs->drv) {
2602 return NULL;
2603 }
2604
2605 if (bs->backing_hd) {
2606 if (strcmp(bs->backing_file, backing_file) == 0) {
2607 return bs->backing_hd;
2608 } else {
2609 return bdrv_find_backing_image(bs->backing_hd, backing_file);
2610 }
2611 }
2612
2613 return NULL;
2614 }
2615
2616 #define NB_SUFFIXES 4
2617
2618 char *get_human_readable_size(char *buf, int buf_size, int64_t size)
2619 {
2620 static const char suffixes[NB_SUFFIXES] = "KMGT";
2621 int64_t base;
2622 int i;
2623
2624 if (size <= 999) {
2625 snprintf(buf, buf_size, "%" PRId64, size);
2626 } else {
2627 base = 1024;
2628 for (i = 0; i < NB_SUFFIXES; i++) {
2629 if (size < (10 * base)) {
2630 snprintf(buf, buf_size, "%0.1f%c",
2631 (double)size / base,
2632 suffixes[i]);
2633 break;
2634 } else if (size < (1000 * base) || i == (NB_SUFFIXES - 1)) {
2635 snprintf(buf, buf_size, "%" PRId64 "%c",
2636 ((size + (base >> 1)) / base),
2637 suffixes[i]);
2638 break;
2639 }
2640 base = base * 1024;
2641 }
2642 }
2643 return buf;
2644 }
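/*
 * Worked examples (editorial addition): 1536 -> "1.5K" (below 10 * 1024,
 * fractional form); 524288 -> "512K" (rounded integer form); 1048576 ->
 * "1.0M" (the loop first advances to the next suffix).
 */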
2645
2646 char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
2647 {
2648 char buf1[128], date_buf[128], clock_buf[128];
2649 #ifdef _WIN32
2650 struct tm *ptm;
2651 #else
2652 struct tm tm;
2653 #endif
2654 time_t ti;
2655 int64_t secs;
2656
2657 if (!sn) {
2658 snprintf(buf, buf_size,
2659 "%-10s%-20s%7s%20s%15s",
2660 "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK");
2661 } else {
2662 ti = sn->date_sec;
2663 #ifdef _WIN32
2664 ptm = localtime(&ti);
2665 strftime(date_buf, sizeof(date_buf),
2666 "%Y-%m-%d %H:%M:%S", ptm);
2667 #else
2668 localtime_r(&ti, &tm);
2669 strftime(date_buf, sizeof(date_buf),
2670 "%Y-%m-%d %H:%M:%S", &tm);
2671 #endif
2672 secs = sn->vm_clock_nsec / 1000000000;
2673 snprintf(clock_buf, sizeof(clock_buf),
2674 "%02d:%02d:%02d.%03d",
2675 (int)(secs / 3600),
2676 (int)((secs / 60) % 60),
2677 (int)(secs % 60),
2678 (int)((sn->vm_clock_nsec / 1000000) % 1000));
2679 snprintf(buf, buf_size,
2680 "%-10s%-20s%7s%20s%15s",
2681 sn->id_str, sn->name,
2682 get_human_readable_size(buf1, sizeof(buf1), sn->vm_state_size),
2683 date_buf,
2684 clock_buf);
2685 }
2686 return buf;
2687 }
2688
2689 /**************************************************************/
2690 /* async I/Os */
2691
2692 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
2693 QEMUIOVector *qiov, int nb_sectors,
2694 BlockDriverCompletionFunc *cb, void *opaque)
2695 {
2696 trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
2697
2698 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
2699 cb, opaque, false);
2700 }
2701
2702 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
2703 QEMUIOVector *qiov, int nb_sectors,
2704 BlockDriverCompletionFunc *cb, void *opaque)
2705 {
2706 trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
2707
2708 return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors,
2709 cb, opaque, true);
2710 }
2711
2712
2713 typedef struct MultiwriteCB {
2714 int error;
2715 int num_requests;
2716 int num_callbacks;
2717 struct {
2718 BlockDriverCompletionFunc *cb;
2719 void *opaque;
2720 QEMUIOVector *free_qiov;
2721 void *free_buf;
2722 } callbacks[];
2723 } MultiwriteCB;
2724
2725 static void multiwrite_user_cb(MultiwriteCB *mcb)
2726 {
2727 int i;
2728
2729 for (i = 0; i < mcb->num_callbacks; i++) {
2730 mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
2731 if (mcb->callbacks[i].free_qiov) {
2732 qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
2733 }
2734 g_free(mcb->callbacks[i].free_qiov);
2735 qemu_vfree(mcb->callbacks[i].free_buf);
2736 }
2737 }
2738
2739 static void multiwrite_cb(void *opaque, int ret)
2740 {
2741 MultiwriteCB *mcb = opaque;
2742
2743 trace_multiwrite_cb(mcb, ret);
2744
2745 if (ret < 0 && !mcb->error) {
2746 mcb->error = ret;
2747 }
2748
2749 mcb->num_requests--;
2750 if (mcb->num_requests == 0) {
2751 multiwrite_user_cb(mcb);
2752 g_free(mcb);
2753 }
2754 }
2755
2756 static int multiwrite_req_compare(const void *a, const void *b)
2757 {
2758 const BlockRequest *req1 = a, *req2 = b;
2759
2760 /*
2761 * Note that we can't simply subtract req2->sector from req1->sector
2762 * here as that could overflow the return value.
2763 */
2764 if (req1->sector > req2->sector) {
2765 return 1;
2766 } else if (req1->sector < req2->sector) {
2767 return -1;
2768 } else {
2769 return 0;
2770 }
2771 }
2772
2773 /*
2774 * Takes a bunch of requests and tries to merge them. Returns the number of
2775 * requests that remain after merging.
2776 */
2777 static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
2778 int num_reqs, MultiwriteCB *mcb)
2779 {
2780 int i, outidx;
2781
2782 // Sort requests by start sector
2783 qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare);
2784
2785 // Check if adjacent requests touch the same clusters. If so, combine them,
2786 // filling up gaps with zero sectors.
2787 outidx = 0;
2788 for (i = 1; i < num_reqs; i++) {
2789 int merge = 0;
2790 int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors;
2791
2792 // This handles the cases that are valid for all block drivers, namely
2793 // exactly sequential writes and overlapping writes.
2794 if (reqs[i].sector <= oldreq_last) {
2795 merge = 1;
2796 }
2797
2798 // The block driver may decide that it makes sense to combine requests
2799 // even if there is a gap of some sectors between them. In this case,
2800 // the gap is filled with zeros (and is therefore only applicable to
2801 // as-yet-unused space in formats like qcow2).
2802 if (!merge && bs->drv->bdrv_merge_requests) {
2803 merge = bs->drv->bdrv_merge_requests(bs, &reqs[outidx], &reqs[i]);
2804 }
2805
2806 if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) {
2807 merge = 0;
2808 }
2809
2810 if (merge) {
2811 size_t size;
2812 QEMUIOVector *qiov = g_malloc0(sizeof(*qiov));
2813 qemu_iovec_init(qiov,
2814 reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1);
2815
2816 // Add the first request to the merged one. If the requests are
2817 // overlapping, drop the last sectors of the first request.
2818 size = (reqs[i].sector - reqs[outidx].sector) << 9;
2819 qemu_iovec_concat(qiov, reqs[outidx].qiov, size);
2820
2821 // We might need to add some zeros between the two requests
2822 if (reqs[i].sector > oldreq_last) {
2823 size_t zero_bytes = (reqs[i].sector - oldreq_last) << 9;
2824 uint8_t *buf = qemu_blockalign(bs, zero_bytes);
2825 memset(buf, 0, zero_bytes);
2826 qemu_iovec_add(qiov, buf, zero_bytes);
2827 mcb->callbacks[i].free_buf = buf;
2828 }
2829
2830 // Add the second request
2831 qemu_iovec_concat(qiov, reqs[i].qiov, reqs[i].qiov->size);
2832
2833 reqs[outidx].nb_sectors = qiov->size >> 9;
2834 reqs[outidx].qiov = qiov;
2835
2836 mcb->callbacks[i].free_qiov = reqs[outidx].qiov;
2837 } else {
2838 outidx++;
2839 reqs[outidx].sector = reqs[i].sector;
2840 reqs[outidx].nb_sectors = reqs[i].nb_sectors;
2841 reqs[outidx].qiov = reqs[i].qiov;
2842 }
2843 }
2844
2845 return outidx + 1;
2846 }
2847
2848 /*
2849 * Submit multiple AIO write requests at once.
2850 *
2851 * On success, the function returns 0 and all requests in the reqs array have
2852 * been submitted. In the error case this function returns -1, and any given
2853 * request may or may not have been submitted yet; the callback will be
2854 * invoked for some of the requests but not for others. The caller must check
2855 * the error field of each BlockRequest to know which callbacks to wait for
2856 * (if error != 0, no callback will be called for that request).
2857 *
2858 * The implementation may modify the contents of the reqs array, e.g. to merge
2859 * requests. However, the fields opaque and error are left unmodified as they
2860 * are used to signal failure for a single request to the caller.
2861 */
2862 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
2863 {
2864 MultiwriteCB *mcb;
2865 int i;
2866
2867 /* don't submit writes if we don't have a medium */
2868 if (bs->drv == NULL) {
2869 for (i = 0; i < num_reqs; i++) {
2870 reqs[i].error = -ENOMEDIUM;
2871 }
2872 return -1;
2873 }
2874
2875 if (num_reqs == 0) {
2876 return 0;
2877 }
2878
2879 // Create MultiwriteCB structure
2880 mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks));
2881 mcb->num_requests = 0;
2882 mcb->num_callbacks = num_reqs;
2883
2884 for (i = 0; i < num_reqs; i++) {
2885 mcb->callbacks[i].cb = reqs[i].cb;
2886 mcb->callbacks[i].opaque = reqs[i].opaque;
2887 }
2888
2889 // Check for mergeable requests
2890 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
2891
2892 trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
2893
2894 /* Run the aio requests. */
2895 mcb->num_requests = num_reqs;
2896 for (i = 0; i < num_reqs; i++) {
2897 bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
2898 reqs[i].nb_sectors, multiwrite_cb, mcb);
2899 }
2900
2901 return 0;
2902 }
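#if 0
/*
 * Illustrative sketch (editorial addition): how a caller hands requests to
 * bdrv_aio_multiwrite(). The qiov arguments and the callback are
 * assumptions for illustration; the two requests are exactly sequential,
 * so multiwrite_merge() combines them into one 16-sector write.
 */
static void example_multiwrite(BlockDriverState *bs,
                               QEMUIOVector *qiov0, QEMUIOVector *qiov1,
                               BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockRequest reqs[2] = {
        { .sector = 0, .nb_sectors = 8, .qiov = qiov0,
          .cb = cb, .opaque = opaque },
        { .sector = 8, .nb_sectors = 8, .qiov = qiov1,
          .cb = cb, .opaque = opaque },
    };

    if (bdrv_aio_multiwrite(bs, reqs, 2) < 0) {
        /* Per the contract above, inspect each reqs[i].error to learn
         * which callbacks will still be invoked. */
    }
}
#endif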
2903
2904 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
2905 {
2906 acb->pool->cancel(acb);
2907 }
2908
2909 /* block I/O throttling */
2910 static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
2911 bool is_write, double elapsed_time, uint64_t *wait)
2912 {
2913 uint64_t bps_limit = 0;
2914 double bytes_limit, bytes_base, bytes_res;
2915 double slice_time, wait_time;
2916
2917 if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
2918 bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
2919 } else if (bs->io_limits.bps[is_write]) {
2920 bps_limit = bs->io_limits.bps[is_write];
2921 } else {
2922 if (wait) {
2923 *wait = 0;
2924 }
2925
2926 return false;
2927 }
2928
2929 slice_time = bs->slice_end - bs->slice_start;
2930 slice_time /= (NANOSECONDS_PER_SECOND);
2931 bytes_limit = bps_limit * slice_time;
2932 bytes_base = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
2933 if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
2934 bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
2935 }
2936
2937 /* bytes_base: the number of bytes already read/written in this slice,
2938 * obtained from the accounting statistics.
2939 * bytes_res: the remaining bytes that this request needs to read/write.
2940 * (bytes_base + bytes_res) / bps_limit: used to calculate the total time
2941 * needed to finish reading/writing all of the data.
2942 */
2943 bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
2944
2945 if (bytes_base + bytes_res <= bytes_limit) {
2946 if (wait) {
2947 *wait = 0;
2948 }
2949
2950 return false;
2951 }
2952
2953 /* Calc approx time to dispatch */
2954 wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
2955
2956 /* When the runtime I/O rate exceeds the limits, bs->slice_end needs to
2957 * be extended so that the current statistics are kept until the timer
2958 * fires; the scaling factor was increased and tuned experimentally.
2959 */
2961 bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
2962 bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
2963 if (wait) {
2964 *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
2965 }
2966
2967 return true;
2968 }
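/*
 * Worked example (editorial addition): with bps_limit = 1,000,000 bytes/s
 * and a 0.1 s slice, bytes_limit is 100,000 bytes. If 90,000 bytes were
 * already transferred in this slice (bytes_base) and a 64-sector request
 * arrives (bytes_res = 32,768), the sum exceeds the limit, so the request
 * waits about (122,768 / 1,000,000) - elapsed_time seconds.
 */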
2969
2970 static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
2971 double elapsed_time, uint64_t *wait)
2972 {
2973 uint64_t iops_limit = 0;
2974 double ios_limit, ios_base;
2975 double slice_time, wait_time;
2976
2977 if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
2978 iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
2979 } else if (bs->io_limits.iops[is_write]) {
2980 iops_limit = bs->io_limits.iops[is_write];
2981 } else {
2982 if (wait) {
2983 *wait = 0;
2984 }
2985
2986 return false;
2987 }
2988
2989 slice_time = bs->slice_end - bs->slice_start;
2990 slice_time /= (NANOSECONDS_PER_SECOND);
2991 ios_limit = iops_limit * slice_time;
2992 ios_base = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
2993 if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
2994 ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
2995 }
2996
2997 if (ios_base + 1 <= ios_limit) {
2998 if (wait) {
2999 *wait = 0;
3000 }
3001
3002 return false;
3003 }
3004
3005 /* Calc approx time to dispatch */
3006 wait_time = (ios_base + 1) / iops_limit;
3007 if (wait_time > elapsed_time) {
3008 wait_time = wait_time - elapsed_time;
3009 } else {
3010 wait_time = 0;
3011 }
3012
3013 bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
3014 bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
3015 if (wait) {
3016 *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
3017 }
3018
3019 return true;
3020 }
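/*
 * Worked example (editorial addition): with iops_limit = 100 and a 0.1 s
 * slice, ios_limit is 10. Once 10 operations have been accounted in the
 * slice (ios_base), the next one waits (10 + 1) / 100 = 0.11 s minus the
 * time already elapsed in the slice.
 */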
3021
3022 static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
3023 bool is_write, int64_t *wait)
3024 {
3025 int64_t now, max_wait;
3026 uint64_t bps_wait = 0, iops_wait = 0;
3027 double elapsed_time;
3028 int bps_ret, iops_ret;
3029
3030 now = qemu_get_clock_ns(vm_clock);
3031 if ((bs->slice_start < now)
3032 && (bs->slice_end > now)) {
3033 bs->slice_end = now + bs->slice_time;
3034 } else {
3035 bs->slice_time = 5 * BLOCK_IO_SLICE_TIME;
3036 bs->slice_start = now;
3037 bs->slice_end = now + bs->slice_time;
3038
3039 bs->io_base.bytes[is_write] = bs->nr_bytes[is_write];
3040 bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];
3041
3042 bs->io_base.ios[is_write] = bs->nr_ops[is_write];
3043 bs->io_base.ios[!is_write] = bs->nr_ops[!is_write];
3044 }
3045
3046 elapsed_time = now - bs->slice_start;
3047 elapsed_time /= (NANOSECONDS_PER_SECOND);
3048
3049 bps_ret = bdrv_exceed_bps_limits(bs, nb_sectors,
3050 is_write, elapsed_time, &bps_wait);
3051 iops_ret = bdrv_exceed_iops_limits(bs, is_write,
3052 elapsed_time, &iops_wait);
3053 if (bps_ret || iops_ret) {
3054 max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
3055 if (wait) {
3056 *wait = max_wait;
3057 }
3058
3059 now = qemu_get_clock_ns(vm_clock);
3060 if (bs->slice_end < now + max_wait) {
3061 bs->slice_end = now + max_wait;
3062 }
3063
3064 return true;
3065 }
3066
3067 if (wait) {
3068 *wait = 0;
3069 }
3070
3071 return false;
3072 }
3073
3074 /**************************************************************/
3075 /* async block device emulation */
3076
3077 typedef struct BlockDriverAIOCBSync {
3078 BlockDriverAIOCB common;
3079 QEMUBH *bh;
3080 int ret;
3081 /* vector translation state */
3082 QEMUIOVector *qiov;
3083 uint8_t *bounce;
3084 int is_write;
3085 } BlockDriverAIOCBSync;
3086
3087 static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
3088 {
3089 BlockDriverAIOCBSync *acb =
3090 container_of(blockacb, BlockDriverAIOCBSync, common);
3091 qemu_bh_delete(acb->bh);
3092 acb->bh = NULL;
3093 qemu_aio_release(acb);
3094 }
3095
3096 static AIOPool bdrv_em_aio_pool = {
3097 .aiocb_size = sizeof(BlockDriverAIOCBSync),
3098 .cancel = bdrv_aio_cancel_em,
3099 };
3100
3101 static void bdrv_aio_bh_cb(void *opaque)
3102 {
3103 BlockDriverAIOCBSync *acb = opaque;
3104
3105 if (!acb->is_write)
3106 qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
3107 qemu_vfree(acb->bounce);
3108 acb->common.cb(acb->common.opaque, acb->ret);
3109 qemu_bh_delete(acb->bh);
3110 acb->bh = NULL;
3111 qemu_aio_release(acb);
3112 }
3113
3114 static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs,
3115 int64_t sector_num,
3116 QEMUIOVector *qiov,
3117 int nb_sectors,
3118 BlockDriverCompletionFunc *cb,
3119 void *opaque,
3120 int is_write)
3121 {
3123 BlockDriverAIOCBSync *acb;
3124
3125 acb = qemu_aio_get(&bdrv_em_aio_pool, bs, cb, opaque);
3126 acb->is_write = is_write;
3127 acb->qiov = qiov;
3128 acb->bounce = qemu_blockalign(bs, qiov->size);
3129 acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
3130
3131 if (is_write) {
3132 qemu_iovec_to_buffer(acb->qiov, acb->bounce);
3133 acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors);
3134 } else {
3135 acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors);
3136 }
3137
3138 qemu_bh_schedule(acb->bh);
3139
3140 return &acb->common;
3141 }
3142
3143 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
3144 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3145 BlockDriverCompletionFunc *cb, void *opaque)
3146 {
3147 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
3148 }
3149
3150 static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
3151 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
3152 BlockDriverCompletionFunc *cb, void *opaque)
3153 {
3154 return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
3155 }
3156
3157
3158 typedef struct BlockDriverAIOCBCoroutine {
3159 BlockDriverAIOCB common;
3160 BlockRequest req;
3161 bool is_write;
3162 QEMUBH *bh;
3163 } BlockDriverAIOCBCoroutine;
3164
3165 static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
3166 {
3167 qemu_aio_flush();
3168 }
3169
3170 static AIOPool bdrv_em_co_aio_pool = {
3171 .aiocb_size = sizeof(BlockDriverAIOCBCoroutine),
3172 .cancel = bdrv_aio_co_cancel_em,
3173 };
3174
3175 static void bdrv_co_em_bh(void *opaque)
3176 {
3177 BlockDriverAIOCBCoroutine *acb = opaque;
3178
3179 acb->common.cb(acb->common.opaque, acb->req.error);
3180 qemu_bh_delete(acb->bh);
3181 qemu_aio_release(acb);
3182 }
3183
3184 /* Invoke bdrv_co_do_readv/bdrv_co_do_writev */
3185 static void coroutine_fn bdrv_co_do_rw(void *opaque)
3186 {
3187 BlockDriverAIOCBCoroutine *acb = opaque;
3188 BlockDriverState *bs = acb->common.bs;
3189
3190 if (!acb->is_write) {
3191 acb->req.error = bdrv_co_do_readv(bs, acb->req.sector,
3192 acb->req.nb_sectors, acb->req.qiov, 0);
3193 } else {
3194 acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
3195 acb->req.nb_sectors, acb->req.qiov);
3196 }
3197
3198 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3199 qemu_bh_schedule(acb->bh);
3200 }
3201
3202 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
3203 int64_t sector_num,
3204 QEMUIOVector *qiov,
3205 int nb_sectors,
3206 BlockDriverCompletionFunc *cb,
3207 void *opaque,
3208 bool is_write)
3209 {
3210 Coroutine *co;
3211 BlockDriverAIOCBCoroutine *acb;
3212
3213 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3214 acb->req.sector = sector_num;
3215 acb->req.nb_sectors = nb_sectors;
3216 acb->req.qiov = qiov;
3217 acb->is_write = is_write;
3218
3219 co = qemu_coroutine_create(bdrv_co_do_rw);
3220 qemu_coroutine_enter(co, acb);
3221
3222 return &acb->common;
3223 }
3224
3225 static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
3226 {
3227 BlockDriverAIOCBCoroutine *acb = opaque;
3228 BlockDriverState *bs = acb->common.bs;
3229
3230 acb->req.error = bdrv_co_flush(bs);
3231 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3232 qemu_bh_schedule(acb->bh);
3233 }
3234
3235 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
3236 BlockDriverCompletionFunc *cb, void *opaque)
3237 {
3238 trace_bdrv_aio_flush(bs, opaque);
3239
3240 Coroutine *co;
3241 BlockDriverAIOCBCoroutine *acb;
3242
3243 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3244 co = qemu_coroutine_create(bdrv_aio_flush_co_entry);
3245 qemu_coroutine_enter(co, acb);
3246
3247 return &acb->common;
3248 }
3249
3250 static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
3251 {
3252 BlockDriverAIOCBCoroutine *acb = opaque;
3253 BlockDriverState *bs = acb->common.bs;
3254
3255 acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
3256 acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
3257 qemu_bh_schedule(acb->bh);
3258 }
3259
3260 BlockDriverAIOCB *bdrv_aio_discard(BlockDriverState *bs,
3261 int64_t sector_num, int nb_sectors,
3262 BlockDriverCompletionFunc *cb, void *opaque)
3263 {
3264 Coroutine *co;
3265 BlockDriverAIOCBCoroutine *acb;
3266
3267 trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
3268
3269 acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
3270 acb->req.sector = sector_num;
3271 acb->req.nb_sectors = nb_sectors;
3272 co = qemu_coroutine_create(bdrv_aio_discard_co_entry);
3273 qemu_coroutine_enter(co, acb);
3274
3275 return &acb->common;
3276 }
3277
3278 void bdrv_init(void)
3279 {
3280 module_call_init(MODULE_INIT_BLOCK);
3281 }
3282
3283 void bdrv_init_with_whitelist(void)
3284 {
3285 use_bdrv_whitelist = 1;
3286 bdrv_init();
3287 }
3288
3289 void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
3290 BlockDriverCompletionFunc *cb, void *opaque)
3291 {
3292 BlockDriverAIOCB *acb;
3293
3294 if (pool->free_aiocb) {
3295 acb = pool->free_aiocb;
3296 pool->free_aiocb = acb->next;
3297 } else {
3298 acb = g_malloc0(pool->aiocb_size);
3299 acb->pool = pool;
3300 }
3301 acb->bs = bs;
3302 acb->cb = cb;
3303 acb->opaque = opaque;
3304 return acb;
3305 }
3306
3307 void qemu_aio_release(void *p)
3308 {
3309 BlockDriverAIOCB *acb = (BlockDriverAIOCB *)p;
3310 AIOPool *pool = acb->pool;
3311 acb->next = pool->free_aiocb;
3312 pool->free_aiocb = acb;
3313 }
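/*
 * Note (editorial addition): AIOCBs are recycled through the per-pool free
 * list rather than returned to the allocator, so qemu_aio_get() can reuse
 * them cheaply for subsequent requests.
 */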
3314
3315 /**************************************************************/
3316 /* Coroutine block device emulation */
3317
3318 typedef struct CoroutineIOCompletion {
3319 Coroutine *coroutine;
3320 int ret;
3321 } CoroutineIOCompletion;
3322
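/*
 * Completion callback for the AIO-based emulation paths below: it stores
 * the return value and re-enters the coroutine that is suspended in
 * qemu_coroutine_yield() waiting for the I/O to finish.
 */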
3323 static void bdrv_co_io_em_complete(void *opaque, int ret)
3324 {
3325 CoroutineIOCompletion *co = opaque;
3326
3327 co->ret = ret;
3328 qemu_coroutine_enter(co->coroutine, NULL);
3329 }
3330
3331 static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
3332 int nb_sectors, QEMUIOVector *iov,
3333 bool is_write)
3334 {
3335 CoroutineIOCompletion co = {
3336 .coroutine = qemu_coroutine_self(),
3337 };
3338 BlockDriverAIOCB *acb;
3339
3340 if (is_write) {
3341 acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
3342 bdrv_co_io_em_complete, &co);
3343 } else {
3344 acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
3345 bdrv_co_io_em_complete, &co);
3346 }
3347
3348 trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb);
3349 if (!acb) {
3350 return -EIO;
3351 }
3352 qemu_coroutine_yield();
3353
3354 return co.ret;
3355 }
3356
3357 static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
3358 int64_t sector_num, int nb_sectors,
3359 QEMUIOVector *iov)
3360 {
3361 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
3362 }
3363
3364 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
3365 int64_t sector_num, int nb_sectors,
3366 QEMUIOVector *iov)
3367 {
3368 return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
3369 }
3370
3371 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
3372 {
3373 RwCo *rwco = opaque;
3374
3375 rwco->ret = bdrv_co_flush(rwco->bs);
3376 }
3377
3378 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
3379 {
3380 int ret;
3381
3382 if (!bs->drv) {
3383 return 0;
3384 }
3385
3386 /* Write back cached data to the OS even with cache=unsafe */
3387 if (bs->drv->bdrv_co_flush_to_os) {
3388 ret = bs->drv->bdrv_co_flush_to_os(bs);
3389 if (ret < 0) {
3390 return ret;
3391 }
3392 }
3393
3394 /* But don't actually force it to the disk with cache=unsafe */
3395 if (bs->open_flags & BDRV_O_NO_FLUSH) {
3396 return 0;
3397 }
3398
3399 if (bs->drv->bdrv_co_flush_to_disk) {
3400 return bs->drv->bdrv_co_flush_to_disk(bs);
3401 } else if (bs->drv->bdrv_aio_flush) {
3402 BlockDriverAIOCB *acb;
3403 CoroutineIOCompletion co = {
3404 .coroutine = qemu_coroutine_self(),
3405 };
3406
3407 acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
3408 if (acb == NULL) {
3409 return -EIO;
3410 } else {
3411 qemu_coroutine_yield();
3412 return co.ret;
3413 }
3414 } else {
3415 /*
3416 * Some block drivers always operate in either writethrough or unsafe
3417 * mode and therefore don't support bdrv_flush. Usually qemu doesn't
3418 * know how the server works (because the behaviour is hardcoded or
3419 * depends on server-side configuration), so we can't ensure that
3420 * everything is safe on disk. Returning an error doesn't work because
3421 * that would break guests even if the server operates in writethrough
3422 * mode.
3423 *
3424 * Let's hope the user knows what he's doing.
3425 */
3426 return 0;
3427 }
3428 }
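#if 0
/*
 * Illustrative sketch (editorial addition): the flush hooks a format
 * driver can provide. "exfmt" and its functions are hypothetical names.
 */
static int coroutine_fn exfmt_co_flush_to_os(BlockDriverState *bs)
{
    /* Write any metadata the format caches in memory down to bs->file.
     * This runs even with cache=unsafe; see bdrv_co_flush() above. */
    return 0;
}

static BlockDriver bdrv_exfmt = {
    .format_name         = "exfmt",
    .bdrv_co_flush_to_os = exfmt_co_flush_to_os,
    /* Without .bdrv_co_flush_to_disk, bdrv_co_flush() falls back to
     * .bdrv_aio_flush or, failing that, reports success. */
};
#endif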
3429
3430 void bdrv_invalidate_cache(BlockDriverState *bs)
3431 {
3432 if (bs->drv && bs->drv->bdrv_invalidate_cache) {
3433 bs->drv->bdrv_invalidate_cache(bs);
3434 }
3435 }
3436
3437 void bdrv_invalidate_cache_all(void)
3438 {
3439 BlockDriverState *bs;
3440
3441 QTAILQ_FOREACH(bs, &bdrv_states, list) {
3442 bdrv_invalidate_cache(bs);
3443 }
3444 }
3445
3446 int bdrv_flush(BlockDriverState *bs)
3447 {
3448 Coroutine *co;
3449 RwCo rwco = {
3450 .bs = bs,
3451 .ret = NOT_DONE,
3452 };
3453
3454 if (qemu_in_coroutine()) {
3455 /* Fast-path if already in coroutine context */
3456 bdrv_flush_co_entry(&rwco);
3457 } else {
3458 co = qemu_coroutine_create(bdrv_flush_co_entry);
3459 qemu_coroutine_enter(co, &rwco);
3460 while (rwco.ret == NOT_DONE) {
3461 qemu_aio_wait();
3462 }
3463 }
3464
3465 return rwco.ret;
3466 }
3467
3468 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
3469 {
3470 RwCo *rwco = opaque;
3471
3472 rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
3473 }
3474
3475 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
3476 int nb_sectors)
3477 {
3478 if (!bs->drv) {
3479 return -ENOMEDIUM;
3480 } else if (bdrv_check_request(bs, sector_num, nb_sectors)) {
3481 return -EIO;
3482 } else if (bs->read_only) {
3483 return -EROFS;
3484 } else if (bs->drv->bdrv_co_discard) {
3485 return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors);
3486 } else if (bs->drv->bdrv_aio_discard) {
3487 BlockDriverAIOCB *acb;
3488 CoroutineIOCompletion co = {
3489 .coroutine = qemu_coroutine_self(),
3490 };
3491
3492 acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
3493 bdrv_co_io_em_complete, &co);
3494 if (acb == NULL) {
3495 return -EIO;
3496 } else {
3497 qemu_coroutine_yield();
3498 return co.ret;
3499 }