Merge remote-tracking branch 'remotes/cohuck/tags/s390x-20191214-2' into staging
[qemu.git] / cpus.c
1 /*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25 #include "qemu/osdep.h"
26 #include "qemu-common.h"
27 #include "qemu/config-file.h"
28 #include "migration/vmstate.h"
29 #include "monitor/monitor.h"
30 #include "qapi/error.h"
31 #include "qapi/qapi-commands-misc.h"
32 #include "qapi/qapi-events-run-state.h"
33 #include "qapi/qmp/qerror.h"
34 #include "qemu/error-report.h"
35 #include "qemu/qemu-print.h"
36 #include "sysemu/tcg.h"
37 #include "sysemu/block-backend.h"
38 #include "exec/gdbstub.h"
39 #include "sysemu/dma.h"
40 #include "sysemu/hw_accel.h"
41 #include "sysemu/kvm.h"
42 #include "sysemu/hax.h"
43 #include "sysemu/hvf.h"
44 #include "sysemu/whpx.h"
45 #include "exec/exec-all.h"
46
47 #include "qemu/thread.h"
48 #include "qemu/plugin.h"
49 #include "sysemu/cpus.h"
50 #include "sysemu/qtest.h"
51 #include "qemu/main-loop.h"
52 #include "qemu/option.h"
53 #include "qemu/bitmap.h"
54 #include "qemu/seqlock.h"
55 #include "qemu/guest-random.h"
56 #include "tcg.h"
57 #include "hw/nmi.h"
58 #include "sysemu/replay.h"
59 #include "sysemu/runstate.h"
60 #include "hw/boards.h"
61 #include "hw/hw.h"
62
63 #ifdef CONFIG_LINUX
64
65 #include <sys/prctl.h>
66
67 #ifndef PR_MCE_KILL
68 #define PR_MCE_KILL 33
69 #endif
70
71 #ifndef PR_MCE_KILL_SET
72 #define PR_MCE_KILL_SET 1
73 #endif
74
75 #ifndef PR_MCE_KILL_EARLY
76 #define PR_MCE_KILL_EARLY 1
77 #endif
78
79 #endif /* CONFIG_LINUX */
80
81 static QemuMutex qemu_global_mutex;
82
83 int64_t max_delay;
84 int64_t max_advance;
85
86 /* vcpu throttling controls */
87 static QEMUTimer *throttle_timer;
88 static unsigned int throttle_percentage;
89
90 #define CPU_THROTTLE_PCT_MIN 1
91 #define CPU_THROTTLE_PCT_MAX 99
92 #define CPU_THROTTLE_TIMESLICE_NS 10000000
93
94 bool cpu_is_stopped(CPUState *cpu)
95 {
96 return cpu->stopped || !runstate_is_running();
97 }
98
99 static bool cpu_thread_is_idle(CPUState *cpu)
100 {
101 if (cpu->stop || cpu->queued_work_first) {
102 return false;
103 }
104 if (cpu_is_stopped(cpu)) {
105 return true;
106 }
107 if (!cpu->halted || cpu_has_work(cpu) ||
108 kvm_halt_in_kernel()) {
109 return false;
110 }
111 return true;
112 }
113
114 static bool all_cpu_threads_idle(void)
115 {
116 CPUState *cpu;
117
118 CPU_FOREACH(cpu) {
119 if (!cpu_thread_is_idle(cpu)) {
120 return false;
121 }
122 }
123 return true;
124 }
125
126 /***********************************************************/
127 /* guest cycle counter */
128
/* Protected by TimersState seqlock */

/* Value of the icount "sleep" option; defaults to on (see configure_icount()). */
static bool icount_sleep = true;

/* Arbitrarily pick 1MIPS as the minimum allowable speed. */
#define MAX_ICOUNT_SHIFT 10
134
135 typedef struct TimersState {
136 /* Protected by BQL. */
137 int64_t cpu_ticks_prev;
138 int64_t cpu_ticks_offset;
139
140 /* Protect fields that can be respectively read outside the
141 * BQL, and written from multiple threads.
142 */
143 QemuSeqLock vm_clock_seqlock;
144 QemuSpin vm_clock_lock;
145
146 int16_t cpu_ticks_enabled;
147
148 /* Conversion factor from emulated instructions to virtual clock ticks. */
149 int16_t icount_time_shift;
150
151 /* Compensate for varying guest execution speed. */
152 int64_t qemu_icount_bias;
153
154 int64_t vm_clock_warp_start;
155 int64_t cpu_clock_offset;
156
157 /* Only written by TCG thread */
158 int64_t qemu_icount;
159
160 /* for adjusting icount */
161 QEMUTimer *icount_rt_timer;
162 QEMUTimer *icount_vm_timer;
163 QEMUTimer *icount_warp_timer;
164 } TimersState;
165
166 static TimersState timers_state;
167 bool mttcg_enabled;
168
169 /*
170 * We default to false if we know other options have been enabled
171 * which are currently incompatible with MTTCG. Otherwise when each
172 * guest (target) has been updated to support:
173 * - atomic instructions
174 * - memory ordering primitives (barriers)
175 * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
176 *
177 * Once a guest architecture has been converted to the new primitives
178 * there are two remaining limitations to check.
179 *
180 * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
181 * - The host must have a stronger memory order than the guest
182 *
183 * It may be possible in future to support strong guests on weak hosts
184 * but that will require tagging all load/stores in a guest with their
185 * implicit memory order requirements which would likely slow things
186 * down a lot.
187 */
188
/*
 * Return true when every memory-ordering bit the guest requires is also
 * provided by the host.  If either default-ordering macro is missing the
 * answer is conservatively false.
 */
static bool check_tcg_memory_orders_compatible(void)
{
#if !defined(TCG_GUEST_DEFAULT_MO) || !defined(TCG_TARGET_DEFAULT_MO)
    return false;
#else
    return !(TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO);
#endif
}
197
198 static bool default_mttcg_enabled(void)
199 {
200 if (use_icount || TCG_OVERSIZED_GUEST) {
201 return false;
202 } else {
203 #ifdef TARGET_SUPPORTS_MTTCG
204 return check_tcg_memory_orders_compatible();
205 #else
206 return false;
207 #endif
208 }
209 }
210
211 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
212 {
213 const char *t = qemu_opt_get(opts, "thread");
214 if (t) {
215 if (strcmp(t, "multi") == 0) {
216 if (TCG_OVERSIZED_GUEST) {
217 error_setg(errp, "No MTTCG when guest word size > hosts");
218 } else if (use_icount) {
219 error_setg(errp, "No MTTCG when icount is enabled");
220 } else {
221 #ifndef TARGET_SUPPORTS_MTTCG
222 warn_report("Guest not yet converted to MTTCG - "
223 "you may get unexpected results");
224 #endif
225 if (!check_tcg_memory_orders_compatible()) {
226 warn_report("Guest expects a stronger memory ordering "
227 "than the host provides");
228 error_printf("This may cause strange/hard to debug errors\n");
229 }
230 mttcg_enabled = true;
231 }
232 } else if (strcmp(t, "single") == 0) {
233 mttcg_enabled = false;
234 } else {
235 error_setg(errp, "Invalid 'thread' setting %s", t);
236 }
237 } else {
238 mttcg_enabled = default_mttcg_enabled();
239 }
240 }
241
242 /* The current number of executed instructions is based on what we
243 * originally budgeted minus the current state of the decrementing
244 * icount counters in extra/u16.low.
245 */
246 static int64_t cpu_get_icount_executed(CPUState *cpu)
247 {
248 return (cpu->icount_budget -
249 (cpu_neg(cpu)->icount_decr.u16.low + cpu->icount_extra));
250 }
251
252 /*
253 * Update the global shared timer_state.qemu_icount to take into
254 * account executed instructions. This is done by the TCG vCPU
255 * thread so the main-loop can see time has moved forward.
256 */
257 static void cpu_update_icount_locked(CPUState *cpu)
258 {
259 int64_t executed = cpu_get_icount_executed(cpu);
260 cpu->icount_budget -= executed;
261
262 atomic_set_i64(&timers_state.qemu_icount,
263 timers_state.qemu_icount + executed);
264 }
265
266 /*
267 * Update the global shared timer_state.qemu_icount to take into
268 * account executed instructions. This is done by the TCG vCPU
269 * thread so the main-loop can see time has moved forward.
270 */
271 void cpu_update_icount(CPUState *cpu)
272 {
273 seqlock_write_lock(&timers_state.vm_clock_seqlock,
274 &timers_state.vm_clock_lock);
275 cpu_update_icount_locked(cpu);
276 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
277 &timers_state.vm_clock_lock);
278 }
279
280 static int64_t cpu_get_icount_raw_locked(void)
281 {
282 CPUState *cpu = current_cpu;
283
284 if (cpu && cpu->running) {
285 if (!cpu->can_do_io) {
286 error_report("Bad icount read");
287 exit(1);
288 }
289 /* Take into account what has run */
290 cpu_update_icount_locked(cpu);
291 }
292 /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
293 return atomic_read_i64(&timers_state.qemu_icount);
294 }
295
296 static int64_t cpu_get_icount_locked(void)
297 {
298 int64_t icount = cpu_get_icount_raw_locked();
299 return atomic_read_i64(&timers_state.qemu_icount_bias) +
300 cpu_icount_to_ns(icount);
301 }
302
303 int64_t cpu_get_icount_raw(void)
304 {
305 int64_t icount;
306 unsigned start;
307
308 do {
309 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
310 icount = cpu_get_icount_raw_locked();
311 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
312
313 return icount;
314 }
315
316 /* Return the virtual CPU time, based on the instruction counter. */
317 int64_t cpu_get_icount(void)
318 {
319 int64_t icount;
320 unsigned start;
321
322 do {
323 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
324 icount = cpu_get_icount_locked();
325 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
326
327 return icount;
328 }
329
330 int64_t cpu_icount_to_ns(int64_t icount)
331 {
332 return icount << atomic_read(&timers_state.icount_time_shift);
333 }
334
335 static int64_t cpu_get_ticks_locked(void)
336 {
337 int64_t ticks = timers_state.cpu_ticks_offset;
338 if (timers_state.cpu_ticks_enabled) {
339 ticks += cpu_get_host_ticks();
340 }
341
342 if (timers_state.cpu_ticks_prev > ticks) {
343 /* Non increasing ticks may happen if the host uses software suspend. */
344 timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
345 ticks = timers_state.cpu_ticks_prev;
346 }
347
348 timers_state.cpu_ticks_prev = ticks;
349 return ticks;
350 }
351
352 /* return the time elapsed in VM between vm_start and vm_stop. Unless
353 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
354 * counter.
355 */
356 int64_t cpu_get_ticks(void)
357 {
358 int64_t ticks;
359
360 if (use_icount) {
361 return cpu_get_icount();
362 }
363
364 qemu_spin_lock(&timers_state.vm_clock_lock);
365 ticks = cpu_get_ticks_locked();
366 qemu_spin_unlock(&timers_state.vm_clock_lock);
367 return ticks;
368 }
369
370 static int64_t cpu_get_clock_locked(void)
371 {
372 int64_t time;
373
374 time = timers_state.cpu_clock_offset;
375 if (timers_state.cpu_ticks_enabled) {
376 time += get_clock();
377 }
378
379 return time;
380 }
381
382 /* Return the monotonic time elapsed in VM, i.e.,
383 * the time between vm_start and vm_stop
384 */
385 int64_t cpu_get_clock(void)
386 {
387 int64_t ti;
388 unsigned start;
389
390 do {
391 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
392 ti = cpu_get_clock_locked();
393 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
394
395 return ti;
396 }
397
398 /* enable cpu_get_ticks()
399 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
400 */
401 void cpu_enable_ticks(void)
402 {
403 seqlock_write_lock(&timers_state.vm_clock_seqlock,
404 &timers_state.vm_clock_lock);
405 if (!timers_state.cpu_ticks_enabled) {
406 timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
407 timers_state.cpu_clock_offset -= get_clock();
408 timers_state.cpu_ticks_enabled = 1;
409 }
410 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
411 &timers_state.vm_clock_lock);
412 }
413
414 /* disable cpu_get_ticks() : the clock is stopped. You must not call
415 * cpu_get_ticks() after that.
416 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
417 */
418 void cpu_disable_ticks(void)
419 {
420 seqlock_write_lock(&timers_state.vm_clock_seqlock,
421 &timers_state.vm_clock_lock);
422 if (timers_state.cpu_ticks_enabled) {
423 timers_state.cpu_ticks_offset += cpu_get_host_ticks();
424 timers_state.cpu_clock_offset = cpu_get_clock_locked();
425 timers_state.cpu_ticks_enabled = 0;
426 }
427 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
428 &timers_state.vm_clock_lock);
429 }
430
431 /* Correlation between real and virtual time is always going to be
432 fairly approximate, so ignore small variation.
433 When the guest is idle real and virtual time will be aligned in
434 the IO wait loop. */
435 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
436
437 static void icount_adjust(void)
438 {
439 int64_t cur_time;
440 int64_t cur_icount;
441 int64_t delta;
442
443 /* Protected by TimersState mutex. */
444 static int64_t last_delta;
445
446 /* If the VM is not running, then do nothing. */
447 if (!runstate_is_running()) {
448 return;
449 }
450
451 seqlock_write_lock(&timers_state.vm_clock_seqlock,
452 &timers_state.vm_clock_lock);
453 cur_time = cpu_get_clock_locked();
454 cur_icount = cpu_get_icount_locked();
455
456 delta = cur_icount - cur_time;
457 /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
458 if (delta > 0
459 && last_delta + ICOUNT_WOBBLE < delta * 2
460 && timers_state.icount_time_shift > 0) {
461 /* The guest is getting too far ahead. Slow time down. */
462 atomic_set(&timers_state.icount_time_shift,
463 timers_state.icount_time_shift - 1);
464 }
465 if (delta < 0
466 && last_delta - ICOUNT_WOBBLE > delta * 2
467 && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
468 /* The guest is getting too far behind. Speed time up. */
469 atomic_set(&timers_state.icount_time_shift,
470 timers_state.icount_time_shift + 1);
471 }
472 last_delta = delta;
473 atomic_set_i64(&timers_state.qemu_icount_bias,
474 cur_icount - (timers_state.qemu_icount
475 << timers_state.icount_time_shift));
476 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
477 &timers_state.vm_clock_lock);
478 }
479
480 static void icount_adjust_rt(void *opaque)
481 {
482 timer_mod(timers_state.icount_rt_timer,
483 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
484 icount_adjust();
485 }
486
487 static void icount_adjust_vm(void *opaque)
488 {
489 timer_mod(timers_state.icount_vm_timer,
490 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
491 NANOSECONDS_PER_SECOND / 10);
492 icount_adjust();
493 }
494
495 static int64_t qemu_icount_round(int64_t count)
496 {
497 int shift = atomic_read(&timers_state.icount_time_shift);
498 return (count + (1 << shift) - 1) >> shift;
499 }
500
501 static void icount_warp_rt(void)
502 {
503 unsigned seq;
504 int64_t warp_start;
505
506 /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
507 * changes from -1 to another value, so the race here is okay.
508 */
509 do {
510 seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
511 warp_start = timers_state.vm_clock_warp_start;
512 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
513
514 if (warp_start == -1) {
515 return;
516 }
517
518 seqlock_write_lock(&timers_state.vm_clock_seqlock,
519 &timers_state.vm_clock_lock);
520 if (runstate_is_running()) {
521 int64_t clock = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
522 cpu_get_clock_locked());
523 int64_t warp_delta;
524
525 warp_delta = clock - timers_state.vm_clock_warp_start;
526 if (use_icount == 2) {
527 /*
528 * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
529 * far ahead of real time.
530 */
531 int64_t cur_icount = cpu_get_icount_locked();
532 int64_t delta = clock - cur_icount;
533 warp_delta = MIN(warp_delta, delta);
534 }
535 atomic_set_i64(&timers_state.qemu_icount_bias,
536 timers_state.qemu_icount_bias + warp_delta);
537 }
538 timers_state.vm_clock_warp_start = -1;
539 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
540 &timers_state.vm_clock_lock);
541
542 if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
543 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
544 }
545 }
546
/* Callback of icount_warp_timer: account the pending warp. */
static void icount_timer_cb(void *opaque)
{
    /*
     * No need for a checkpoint because the timer already synchronizes
     * with CHECKPOINT_CLOCK_VIRTUAL_RT.
     */
    icount_warp_rt();
}
554
555 void qtest_clock_warp(int64_t dest)
556 {
557 int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
558 AioContext *aio_context;
559 assert(qtest_enabled());
560 aio_context = qemu_get_aio_context();
561 while (clock < dest) {
562 int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
563 QEMU_TIMER_ATTR_ALL);
564 int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
565
566 seqlock_write_lock(&timers_state.vm_clock_seqlock,
567 &timers_state.vm_clock_lock);
568 atomic_set_i64(&timers_state.qemu_icount_bias,
569 timers_state.qemu_icount_bias + warp);
570 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
571 &timers_state.vm_clock_lock);
572
573 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
574 timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
575 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
576 }
577 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
578 }
579
580 void qemu_start_warp_timer(void)
581 {
582 int64_t clock;
583 int64_t deadline;
584
585 if (!use_icount) {
586 return;
587 }
588
589 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
590 * do not fire, so computing the deadline does not make sense.
591 */
592 if (!runstate_is_running()) {
593 return;
594 }
595
596 if (replay_mode != REPLAY_MODE_PLAY) {
597 if (!all_cpu_threads_idle()) {
598 return;
599 }
600
601 if (qtest_enabled()) {
602 /* When testing, qtest commands advance icount. */
603 return;
604 }
605
606 replay_checkpoint(CHECKPOINT_CLOCK_WARP_START);
607 } else {
608 /* warp clock deterministically in record/replay mode */
609 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
610 /* vCPU is sleeping and warp can't be started.
611 It is probably a race condition: notification sent
612 to vCPU was processed in advance and vCPU went to sleep.
613 Therefore we have to wake it up for doing someting. */
614 if (replay_has_checkpoint()) {
615 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
616 }
617 return;
618 }
619 }
620
621 /* We want to use the earliest deadline from ALL vm_clocks */
622 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
623 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
624 ~QEMU_TIMER_ATTR_EXTERNAL);
625 if (deadline < 0) {
626 static bool notified;
627 if (!icount_sleep && !notified) {
628 warn_report("icount sleep disabled and no active timers");
629 notified = true;
630 }
631 return;
632 }
633
634 if (deadline > 0) {
635 /*
636 * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
637 * sleep. Otherwise, the CPU might be waiting for a future timer
638 * interrupt to wake it up, but the interrupt never comes because
639 * the vCPU isn't running any insns and thus doesn't advance the
640 * QEMU_CLOCK_VIRTUAL.
641 */
642 if (!icount_sleep) {
643 /*
644 * We never let VCPUs sleep in no sleep icount mode.
645 * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
646 * to the next QEMU_CLOCK_VIRTUAL event and notify it.
647 * It is useful when we want a deterministic execution time,
648 * isolated from host latencies.
649 */
650 seqlock_write_lock(&timers_state.vm_clock_seqlock,
651 &timers_state.vm_clock_lock);
652 atomic_set_i64(&timers_state.qemu_icount_bias,
653 timers_state.qemu_icount_bias + deadline);
654 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
655 &timers_state.vm_clock_lock);
656 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
657 } else {
658 /*
659 * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
660 * "real" time, (related to the time left until the next event) has
661 * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
662 * This avoids that the warps are visible externally; for example,
663 * you will not be sending network packets continuously instead of
664 * every 100ms.
665 */
666 seqlock_write_lock(&timers_state.vm_clock_seqlock,
667 &timers_state.vm_clock_lock);
668 if (timers_state.vm_clock_warp_start == -1
669 || timers_state.vm_clock_warp_start > clock) {
670 timers_state.vm_clock_warp_start = clock;
671 }
672 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
673 &timers_state.vm_clock_lock);
674 timer_mod_anticipate(timers_state.icount_warp_timer,
675 clock + deadline);
676 }
677 } else if (deadline == 0) {
678 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
679 }
680 }
681
682 static void qemu_account_warp_timer(void)
683 {
684 if (!use_icount || !icount_sleep) {
685 return;
686 }
687
688 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
689 * do not fire, so computing the deadline does not make sense.
690 */
691 if (!runstate_is_running()) {
692 return;
693 }
694
695 /* warp clock deterministically in record/replay mode */
696 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
697 return;
698 }
699
700 timer_del(timers_state.icount_warp_timer);
701 icount_warp_rt();
702 }
703
704 static bool icount_state_needed(void *opaque)
705 {
706 return use_icount;
707 }
708
709 static bool warp_timer_state_needed(void *opaque)
710 {
711 TimersState *s = opaque;
712 return s->icount_warp_timer != NULL;
713 }
714
715 static bool adjust_timers_state_needed(void *opaque)
716 {
717 TimersState *s = opaque;
718 return s->icount_rt_timer != NULL;
719 }
720
721 /*
722 * Subsection for warp timer migration is optional, because may not be created
723 */
724 static const VMStateDescription icount_vmstate_warp_timer = {
725 .name = "timer/icount/warp_timer",
726 .version_id = 1,
727 .minimum_version_id = 1,
728 .needed = warp_timer_state_needed,
729 .fields = (VMStateField[]) {
730 VMSTATE_INT64(vm_clock_warp_start, TimersState),
731 VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
732 VMSTATE_END_OF_LIST()
733 }
734 };
735
736 static const VMStateDescription icount_vmstate_adjust_timers = {
737 .name = "timer/icount/timers",
738 .version_id = 1,
739 .minimum_version_id = 1,
740 .needed = adjust_timers_state_needed,
741 .fields = (VMStateField[]) {
742 VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
743 VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
744 VMSTATE_END_OF_LIST()
745 }
746 };
747
748 /*
749 * This is a subsection for icount migration.
750 */
751 static const VMStateDescription icount_vmstate_timers = {
752 .name = "timer/icount",
753 .version_id = 1,
754 .minimum_version_id = 1,
755 .needed = icount_state_needed,
756 .fields = (VMStateField[]) {
757 VMSTATE_INT64(qemu_icount_bias, TimersState),
758 VMSTATE_INT64(qemu_icount, TimersState),
759 VMSTATE_END_OF_LIST()
760 },
761 .subsections = (const VMStateDescription*[]) {
762 &icount_vmstate_warp_timer,
763 &icount_vmstate_adjust_timers,
764 NULL
765 }
766 };
767
768 static const VMStateDescription vmstate_timers = {
769 .name = "timer",
770 .version_id = 2,
771 .minimum_version_id = 1,
772 .fields = (VMStateField[]) {
773 VMSTATE_INT64(cpu_ticks_offset, TimersState),
774 VMSTATE_UNUSED(8),
775 VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
776 VMSTATE_END_OF_LIST()
777 },
778 .subsections = (const VMStateDescription*[]) {
779 &icount_vmstate_timers,
780 NULL
781 }
782 };
783
784 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
785 {
786 double pct;
787 double throttle_ratio;
788 int64_t sleeptime_ns, endtime_ns;
789
790 if (!cpu_throttle_get_percentage()) {
791 return;
792 }
793
794 pct = (double)cpu_throttle_get_percentage()/100;
795 throttle_ratio = pct / (1 - pct);
796 /* Add 1ns to fix double's rounding error (like 0.9999999...) */
797 sleeptime_ns = (int64_t)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS + 1);
798 endtime_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + sleeptime_ns;
799 while (sleeptime_ns > 0 && !cpu->stop) {
800 if (sleeptime_ns > SCALE_MS) {
801 qemu_cond_timedwait(cpu->halt_cond, &qemu_global_mutex,
802 sleeptime_ns / SCALE_MS);
803 } else {
804 qemu_mutex_unlock_iothread();
805 g_usleep(sleeptime_ns / SCALE_US);
806 qemu_mutex_lock_iothread();
807 }
808 sleeptime_ns = endtime_ns - qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
809 }
810 atomic_set(&cpu->throttle_thread_scheduled, 0);
811 }
812
813 static void cpu_throttle_timer_tick(void *opaque)
814 {
815 CPUState *cpu;
816 double pct;
817
818 /* Stop the timer if needed */
819 if (!cpu_throttle_get_percentage()) {
820 return;
821 }
822 CPU_FOREACH(cpu) {
823 if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
824 async_run_on_cpu(cpu, cpu_throttle_thread,
825 RUN_ON_CPU_NULL);
826 }
827 }
828
829 pct = (double)cpu_throttle_get_percentage()/100;
830 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
831 CPU_THROTTLE_TIMESLICE_NS / (1-pct));
832 }
833
834 void cpu_throttle_set(int new_throttle_pct)
835 {
836 /* Ensure throttle percentage is within valid range */
837 new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
838 new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
839
840 atomic_set(&throttle_percentage, new_throttle_pct);
841
842 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
843 CPU_THROTTLE_TIMESLICE_NS);
844 }
845
846 void cpu_throttle_stop(void)
847 {
848 atomic_set(&throttle_percentage, 0);
849 }
850
/* Return true while a non-zero throttle percentage is set. */
bool cpu_throttle_active(void)
{
    int pct = cpu_throttle_get_percentage();

    return pct != 0;
}
855
856 int cpu_throttle_get_percentage(void)
857 {
858 return atomic_read(&throttle_percentage);
859 }
860
861 void cpu_ticks_init(void)
862 {
863 seqlock_init(&timers_state.vm_clock_seqlock);
864 qemu_spin_init(&timers_state.vm_clock_lock);
865 vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
866 throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
867 cpu_throttle_timer_tick, NULL);
868 }
869
870 void configure_icount(QemuOpts *opts, Error **errp)
871 {
872 const char *option;
873 char *rem_str = NULL;
874
875 option = qemu_opt_get(opts, "shift");
876 if (!option) {
877 if (qemu_opt_get(opts, "align") != NULL) {
878 error_setg(errp, "Please specify shift option when using align");
879 }
880 return;
881 }
882
883 icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
884 if (icount_sleep) {
885 timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
886 icount_timer_cb, NULL);
887 }
888
889 icount_align_option = qemu_opt_get_bool(opts, "align", false);
890
891 if (icount_align_option && !icount_sleep) {
892 error_setg(errp, "align=on and sleep=off are incompatible");
893 }
894 if (strcmp(option, "auto") != 0) {
895 errno = 0;
896 timers_state.icount_time_shift = strtol(option, &rem_str, 0);
897 if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
898 error_setg(errp, "icount: Invalid shift value");
899 }
900 use_icount = 1;
901 return;
902 } else if (icount_align_option) {
903 error_setg(errp, "shift=auto and align=on are incompatible");
904 } else if (!icount_sleep) {
905 error_setg(errp, "shift=auto and sleep=off are incompatible");
906 }
907
908 use_icount = 2;
909
910 /* 125MIPS seems a reasonable initial guess at the guest speed.
911 It will be corrected fairly quickly anyway. */
912 timers_state.icount_time_shift = 3;
913
914 /* Have both realtime and virtual time triggers for speed adjustment.
915 The realtime trigger catches emulated time passing too slowly,
916 the virtual time trigger catches emulated time passing too fast.
917 Realtime triggers occur even when idle, so use them less frequently
918 than VM triggers. */
919 timers_state.vm_clock_warp_start = -1;
920 timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
921 icount_adjust_rt, NULL);
922 timer_mod(timers_state.icount_rt_timer,
923 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
924 timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
925 icount_adjust_vm, NULL);
926 timer_mod(timers_state.icount_vm_timer,
927 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
928 NANOSECONDS_PER_SECOND / 10);
929 }
930
931 /***********************************************************/
932 /* TCG vCPU kick timer
933 *
934 * The kick timer is responsible for moving single threaded vCPU
935 * emulation on to the next vCPU. If more than one vCPU is running a
936 * timer event will force a cpu->exit so the next vCPU can get
937 * scheduled.
938 *
939 * The timer is removed if all vCPUs are idle and restarted again once
940 * idleness is complete.
941 */
942
943 static QEMUTimer *tcg_kick_vcpu_timer;
944 static CPUState *tcg_current_rr_cpu;
945
946 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
947
948 static inline int64_t qemu_tcg_next_kick(void)
949 {
950 return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
951 }
952
953 /* Kick the currently round-robin scheduled vCPU to next */
954 static void qemu_cpu_kick_rr_next_cpu(void)
955 {
956 CPUState *cpu;
957 do {
958 cpu = atomic_mb_read(&tcg_current_rr_cpu);
959 if (cpu) {
960 cpu_exit(cpu);
961 }
962 } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
963 }
964
965 /* Kick all RR vCPUs */
966 static void qemu_cpu_kick_rr_cpus(void)
967 {
968 CPUState *cpu;
969
970 CPU_FOREACH(cpu) {
971 cpu_exit(cpu);
972 };
973 }
974
975 static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
976 {
977 }
978
979 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
980 {
981 if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
982 qemu_notify_event();
983 return;
984 }
985
986 if (qemu_in_vcpu_thread()) {
987 /* A CPU is currently running; kick it back out to the
988 * tcg_cpu_exec() loop so it will recalculate its
989 * icount deadline immediately.
990 */
991 qemu_cpu_kick(current_cpu);
992 } else if (first_cpu) {
993 /* qemu_cpu_kick is not enough to kick a halted CPU out of
994 * qemu_tcg_wait_io_event. async_run_on_cpu, instead,
995 * causes cpu_thread_is_idle to return false. This way,
996 * handle_icount_deadline can run.
997 * If we have no CPUs at all for some reason, we don't
998 * need to do anything.
999 */
1000 async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
1001 }
1002 }
1003
1004 static void kick_tcg_thread(void *opaque)
1005 {
1006 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
1007 qemu_cpu_kick_rr_next_cpu();
1008 }
1009
1010 static void start_tcg_kick_timer(void)
1011 {
1012 assert(!mttcg_enabled);
1013 if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
1014 tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
1015 kick_tcg_thread, NULL);
1016 }
1017 if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
1018 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
1019 }
1020 }
1021
1022 static void stop_tcg_kick_timer(void)
1023 {
1024 assert(!mttcg_enabled);
1025 if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
1026 timer_del(tcg_kick_vcpu_timer);
1027 }
1028 }
1029
1030 /***********************************************************/
1031 void hw_error(const char *fmt, ...)
1032 {
1033 va_list ap;
1034 CPUState *cpu;
1035
1036 va_start(ap, fmt);
1037 fprintf(stderr, "qemu: hardware error: ");
1038 vfprintf(stderr, fmt, ap);
1039 fprintf(stderr, "\n");
1040 CPU_FOREACH(cpu) {
1041 fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
1042 cpu_dump_state(cpu, stderr, CPU_DUMP_FPU);
1043 }
1044 va_end(ap);
1045 abort();
1046 }
1047
1048 void cpu_synchronize_all_states(void)
1049 {
1050 CPUState *cpu;
1051
1052 CPU_FOREACH(cpu) {
1053 cpu_synchronize_state(cpu);
1054 /* TODO: move to cpu_synchronize_state() */
1055 if (hvf_enabled()) {
1056 hvf_cpu_synchronize_state(cpu);
1057 }
1058 }
1059 }
1060
1061 void cpu_synchronize_all_post_reset(void)
1062 {
1063 CPUState *cpu;
1064
1065 CPU_FOREACH(cpu) {
1066 cpu_synchronize_post_reset(cpu);
1067 /* TODO: move to cpu_synchronize_post_reset() */
1068 if (hvf_enabled()) {
1069 hvf_cpu_synchronize_post_reset(cpu);
1070 }
1071 }
1072 }
1073
1074 void cpu_synchronize_all_post_init(void)
1075 {
1076 CPUState *cpu;
1077
1078 CPU_FOREACH(cpu) {
1079 cpu_synchronize_post_init(cpu);
1080 /* TODO: move to cpu_synchronize_post_init() */
1081 if (hvf_enabled()) {
1082 hvf_cpu_synchronize_post_init(cpu);
1083 }
1084 }
1085 }
1086
1087 void cpu_synchronize_all_pre_loadvm(void)
1088 {
1089 CPUState *cpu;
1090
1091 CPU_FOREACH(cpu) {
1092 cpu_synchronize_pre_loadvm(cpu);
1093 }
1094 }
1095
1096 static int do_vm_stop(RunState state, bool send_stop)
1097 {
1098 int ret = 0;
1099
1100 if (runstate_is_running()) {
1101 cpu_disable_ticks();
1102 pause_all_vcpus();
1103 runstate_set(state);
1104 vm_state_notify(0, state);
1105 if (send_stop) {
1106 qapi_event_send_stop();
1107 }
1108 }
1109
1110 bdrv_drain_all();
1111 ret = bdrv_flush_all();
1112
1113 return ret;
1114 }
1115
1116 /* Special vm_stop() variant for terminating the process. Historically clients
1117 * did not expect a QMP STOP event and so we need to retain compatibility.
1118 */
1119 int vm_shutdown(void)
1120 {
1121 return do_vm_stop(RUN_STATE_SHUTDOWN, false);
1122 }
1123
1124 static bool cpu_can_run(CPUState *cpu)
1125 {
1126 if (cpu->stop) {
1127 return false;
1128 }
1129 if (cpu_is_stopped(cpu)) {
1130 return false;
1131 }
1132 return true;
1133 }
1134
1135 static void cpu_handle_guest_debug(CPUState *cpu)
1136 {
1137 gdb_set_stop_cpu(cpu);
1138 qemu_system_debug_request();
1139 cpu->stopped = true;
1140 }
1141
1142 #ifdef CONFIG_LINUX
1143 static void sigbus_reraise(void)
1144 {
1145 sigset_t set;
1146 struct sigaction action;
1147
1148 memset(&action, 0, sizeof(action));
1149 action.sa_handler = SIG_DFL;
1150 if (!sigaction(SIGBUS, &action, NULL)) {
1151 raise(SIGBUS);
1152 sigemptyset(&set);
1153 sigaddset(&set, SIGBUS);
1154 pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1155 }
1156 perror("Failed to re-raise SIGBUS!\n");
1157 abort();
1158 }
1159
1160 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1161 {
1162 if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1163 sigbus_reraise();
1164 }
1165
1166 if (current_cpu) {
1167 /* Called asynchronously in VCPU thread. */
1168 if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1169 sigbus_reraise();
1170 }
1171 } else {
1172 /* Called synchronously (via signalfd) in main thread. */
1173 if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1174 sigbus_reraise();
1175 }
1176 }
1177 }
1178
1179 static void qemu_init_sigbus(void)
1180 {
1181 struct sigaction action;
1182
1183 memset(&action, 0, sizeof(action));
1184 action.sa_flags = SA_SIGINFO;
1185 action.sa_sigaction = sigbus_handler;
1186 sigaction(SIGBUS, &action, NULL);
1187
1188 prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1189 }
1190 #else /* !CONFIG_LINUX */
/* Non-Linux build: no SIGBUS setup is performed. */
static void qemu_init_sigbus(void)
{
    /* Intentionally empty. */
}
1194 #endif /* !CONFIG_LINUX */
1195
1196 static QemuThread io_thread;
1197
1198 /* cpu creation */
1199 static QemuCond qemu_cpu_cond;
1200 /* system init */
1201 static QemuCond qemu_pause_cond;
1202
1203 void qemu_init_cpu_loop(void)
1204 {
1205 qemu_init_sigbus();
1206 qemu_cond_init(&qemu_cpu_cond);
1207 qemu_cond_init(&qemu_pause_cond);
1208 qemu_mutex_init(&qemu_global_mutex);
1209
1210 qemu_thread_get_self(&io_thread);
1211 }
1212
1213 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
1214 {
1215 do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
1216 }
1217
1218 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1219 {
1220 if (kvm_destroy_vcpu(cpu) < 0) {
1221 error_report("kvm_destroy_vcpu failed");
1222 exit(EXIT_FAILURE);
1223 }
1224 }
1225
1226 static void qemu_tcg_destroy_vcpu(CPUState *cpu)
1227 {
1228 }
1229
1230 static void qemu_cpu_stop(CPUState *cpu, bool exit)
1231 {
1232 g_assert(qemu_cpu_is_self(cpu));
1233 cpu->stop = false;
1234 cpu->stopped = true;
1235 if (exit) {
1236 cpu_exit(cpu);
1237 }
1238 qemu_cond_broadcast(&qemu_pause_cond);
1239 }
1240
1241 static void qemu_wait_io_event_common(CPUState *cpu)
1242 {
1243 atomic_mb_set(&cpu->thread_kicked, false);
1244 if (cpu->stop) {
1245 qemu_cpu_stop(cpu, false);
1246 }
1247 process_queued_cpu_work(cpu);
1248 }
1249
1250 static void qemu_tcg_rr_wait_io_event(void)
1251 {
1252 CPUState *cpu;
1253
1254 while (all_cpu_threads_idle()) {
1255 stop_tcg_kick_timer();
1256 qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1257 }
1258
1259 start_tcg_kick_timer();
1260
1261 CPU_FOREACH(cpu) {
1262 qemu_wait_io_event_common(cpu);
1263 }
1264 }
1265
1266 static void qemu_wait_io_event(CPUState *cpu)
1267 {
1268 bool slept = false;
1269
1270 while (cpu_thread_is_idle(cpu)) {
1271 if (!slept) {
1272 slept = true;
1273 qemu_plugin_vcpu_idle_cb(cpu);
1274 }
1275 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1276 }
1277 if (slept) {
1278 qemu_plugin_vcpu_resume_cb(cpu);
1279 }
1280
1281 #ifdef _WIN32
1282 /* Eat dummy APC queued by qemu_cpu_kick_thread. */
1283 if (!tcg_enabled()) {
1284 SleepEx(0, TRUE);
1285 }
1286 #endif
1287 qemu_wait_io_event_common(cpu);
1288 }
1289
1290 static void *qemu_kvm_cpu_thread_fn(void *arg)
1291 {
1292 CPUState *cpu = arg;
1293 int r;
1294
1295 rcu_register_thread();
1296
1297 qemu_mutex_lock_iothread();
1298 qemu_thread_get_self(cpu->thread);
1299 cpu->thread_id = qemu_get_thread_id();
1300 cpu->can_do_io = 1;
1301 current_cpu = cpu;
1302
1303 r = kvm_init_vcpu(cpu);
1304 if (r < 0) {
1305 error_report("kvm_init_vcpu failed: %s", strerror(-r));
1306 exit(1);
1307 }
1308
1309 kvm_init_cpu_signals(cpu);
1310
1311 /* signal CPU creation */
1312 cpu->created = true;
1313 qemu_cond_signal(&qemu_cpu_cond);
1314 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1315
1316 do {
1317 if (cpu_can_run(cpu)) {
1318 r = kvm_cpu_exec(cpu);
1319 if (r == EXCP_DEBUG) {
1320 cpu_handle_guest_debug(cpu);
1321 }
1322 }
1323 qemu_wait_io_event(cpu);
1324 } while (!cpu->unplug || cpu_can_run(cpu));
1325
1326 qemu_kvm_destroy_vcpu(cpu);
1327 cpu->created = false;
1328 qemu_cond_signal(&qemu_cpu_cond);
1329 qemu_mutex_unlock_iothread();
1330 rcu_unregister_thread();
1331 return NULL;
1332 }
1333
1334 static void *qemu_dummy_cpu_thread_fn(void *arg)
1335 {
1336 #ifdef _WIN32
1337 error_report("qtest is not supported under Windows");
1338 exit(1);
1339 #else
1340 CPUState *cpu = arg;
1341 sigset_t waitset;
1342 int r;
1343
1344 rcu_register_thread();
1345
1346 qemu_mutex_lock_iothread();
1347 qemu_thread_get_self(cpu->thread);
1348 cpu->thread_id = qemu_get_thread_id();
1349 cpu->can_do_io = 1;
1350 current_cpu = cpu;
1351
1352 sigemptyset(&waitset);
1353 sigaddset(&waitset, SIG_IPI);
1354
1355 /* signal CPU creation */
1356 cpu->created = true;
1357 qemu_cond_signal(&qemu_cpu_cond);
1358 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1359
1360 do {
1361 qemu_mutex_unlock_iothread();
1362 do {
1363 int sig;
1364 r = sigwait(&waitset, &sig);
1365 } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1366 if (r == -1) {
1367 perror("sigwait");
1368 exit(1);
1369 }
1370 qemu_mutex_lock_iothread();
1371 qemu_wait_io_event(cpu);
1372 } while (!cpu->unplug);
1373
1374 qemu_mutex_unlock_iothread();
1375 rcu_unregister_thread();
1376 return NULL;
1377 #endif
1378 }
1379
1380 static int64_t tcg_get_icount_limit(void)
1381 {
1382 int64_t deadline;
1383
1384 if (replay_mode != REPLAY_MODE_PLAY) {
1385 /*
1386 * Include all the timers, because they may need an attention.
1387 * Too long CPU execution may create unnecessary delay in UI.
1388 */
1389 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
1390 QEMU_TIMER_ATTR_ALL);
1391
1392 /* Maintain prior (possibly buggy) behaviour where if no deadline
1393 * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1394 * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1395 * nanoseconds.
1396 */
1397 if ((deadline < 0) || (deadline > INT32_MAX)) {
1398 deadline = INT32_MAX;
1399 }
1400
1401 return qemu_icount_round(deadline);
1402 } else {
1403 return replay_get_instructions();
1404 }
1405 }
1406
1407 static void handle_icount_deadline(void)
1408 {
1409 assert(qemu_in_vcpu_thread());
1410 if (use_icount) {
1411 int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
1412 QEMU_TIMER_ATTR_ALL);
1413
1414 if (deadline == 0) {
1415 /* Wake up other AioContexts. */
1416 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1417 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1418 }
1419 }
1420 }
1421
1422 static void prepare_icount_for_run(CPUState *cpu)
1423 {
1424 if (use_icount) {
1425 int insns_left;
1426
1427 /* These should always be cleared by process_icount_data after
1428 * each vCPU execution. However u16.high can be raised
1429 * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1430 */
1431 g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
1432 g_assert(cpu->icount_extra == 0);
1433
1434 cpu->icount_budget = tcg_get_icount_limit();
1435 insns_left = MIN(0xffff, cpu->icount_budget);
1436 cpu_neg(cpu)->icount_decr.u16.low = insns_left;
1437 cpu->icount_extra = cpu->icount_budget - insns_left;
1438
1439 replay_mutex_lock();
1440 }
1441 }
1442
1443 static void process_icount_data(CPUState *cpu)
1444 {
1445 if (use_icount) {
1446 /* Account for executed instructions */
1447 cpu_update_icount(cpu);
1448
1449 /* Reset the counters */
1450 cpu_neg(cpu)->icount_decr.u16.low = 0;
1451 cpu->icount_extra = 0;
1452 cpu->icount_budget = 0;
1453
1454 replay_account_executed_instructions();
1455
1456 replay_mutex_unlock();
1457 }
1458 }
1459
1460
1461 static int tcg_cpu_exec(CPUState *cpu)
1462 {
1463 int ret;
1464 #ifdef CONFIG_PROFILER
1465 int64_t ti;
1466 #endif
1467
1468 assert(tcg_enabled());
1469 #ifdef CONFIG_PROFILER
1470 ti = profile_getclock();
1471 #endif
1472 cpu_exec_start(cpu);
1473 ret = cpu_exec(cpu);
1474 cpu_exec_end(cpu);
1475 #ifdef CONFIG_PROFILER
1476 atomic_set(&tcg_ctx->prof.cpu_exec_time,
1477 tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
1478 #endif
1479 return ret;
1480 }
1481
1482 /* Destroy any remaining vCPUs which have been unplugged and have
1483 * finished running
1484 */
1485 static void deal_with_unplugged_cpus(void)
1486 {
1487 CPUState *cpu;
1488
1489 CPU_FOREACH(cpu) {
1490 if (cpu->unplug && !cpu_can_run(cpu)) {
1491 qemu_tcg_destroy_vcpu(cpu);
1492 cpu->created = false;
1493 qemu_cond_signal(&qemu_cpu_cond);
1494 break;
1495 }
1496 }
1497 }
1498
1499 /* Single-threaded TCG
1500 *
1501 * In the single-threaded case each vCPU is simulated in turn. If
1502 * there is more than a single vCPU we create a simple timer to kick
1503 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1504 * This is done explicitly rather than relying on side-effects
1505 * elsewhere.
1506 */
1507
1508 static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
1509 {
1510 CPUState *cpu = arg;
1511
1512 assert(tcg_enabled());
1513 rcu_register_thread();
1514 tcg_register_thread();
1515
1516 qemu_mutex_lock_iothread();
1517 qemu_thread_get_self(cpu->thread);
1518
1519 cpu->thread_id = qemu_get_thread_id();
1520 cpu->created = true;
1521 cpu->can_do_io = 1;
1522 qemu_cond_signal(&qemu_cpu_cond);
1523 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1524
1525 /* wait for initial kick-off after machine start */
1526 while (first_cpu->stopped) {
1527 qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1528
1529 /* process any pending work */
1530 CPU_FOREACH(cpu) {
1531 current_cpu = cpu;
1532 qemu_wait_io_event_common(cpu);
1533 }
1534 }
1535
1536 start_tcg_kick_timer();
1537
1538 cpu = first_cpu;
1539
1540 /* process any pending work */
1541 cpu->exit_request = 1;
1542
1543 while (1) {
1544 qemu_mutex_unlock_iothread();
1545 replay_mutex_lock();
1546 qemu_mutex_lock_iothread();
1547 /* Account partial waits to QEMU_CLOCK_VIRTUAL. */
1548 qemu_account_warp_timer();
1549
1550 /* Run the timers here. This is much more efficient than
1551 * waking up the I/O thread and waiting for completion.
1552 */
1553 handle_icount_deadline();
1554
1555 replay_mutex_unlock();
1556
1557 if (!cpu) {
1558 cpu = first_cpu;
1559 }
1560
1561 while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
1562
1563 atomic_mb_set(&tcg_current_rr_cpu, cpu);
1564 current_cpu = cpu;
1565
1566 qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1567 (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1568
1569 if (cpu_can_run(cpu)) {
1570 int r;
1571
1572 qemu_mutex_unlock_iothread();
1573 prepare_icount_for_run(cpu);
1574
1575 r = tcg_cpu_exec(cpu);
1576
1577 process_icount_data(cpu);
1578 qemu_mutex_lock_iothread();
1579
1580 if (r == EXCP_DEBUG) {
1581 cpu_handle_guest_debug(cpu);
1582 break;
1583 } else if (r == EXCP_ATOMIC) {
1584 qemu_mutex_unlock_iothread();
1585 cpu_exec_step_atomic(cpu);
1586 qemu_mutex_lock_iothread();
1587 break;
1588 }
1589 } else if (cpu->stop) {
1590 if (cpu->unplug) {
1591 cpu = CPU_NEXT(cpu);
1592 }
1593 break;
1594 }
1595
1596 cpu = CPU_NEXT(cpu);
1597 } /* while (cpu && !cpu->exit_request).. */
1598
1599 /* Does not need atomic_mb_set because a spurious wakeup is okay. */
1600 atomic_set(&tcg_current_rr_cpu, NULL);
1601
1602 if (cpu && cpu->exit_request) {
1603 atomic_mb_set(&cpu->exit_request, 0);
1604 }
1605
1606 if (use_icount && all_cpu_threads_idle()) {
1607 /*
1608 * When all cpus are sleeping (e.g in WFI), to avoid a deadlock
1609 * in the main_loop, wake it up in order to start the warp timer.
1610 */
1611 qemu_notify_event();
1612 }
1613
1614 qemu_tcg_rr_wait_io_event();
1615 deal_with_unplugged_cpus();
1616 }
1617
1618 rcu_unregister_thread();
1619 return NULL;
1620 }
1621
1622 static void *qemu_hax_cpu_thread_fn(void *arg)
1623 {
1624 CPUState *cpu = arg;
1625 int r;
1626
1627 rcu_register_thread();
1628 qemu_mutex_lock_iothread();
1629 qemu_thread_get_self(cpu->thread);
1630
1631 cpu->thread_id = qemu_get_thread_id();
1632 cpu->created = true;
1633 current_cpu = cpu;
1634
1635 hax_init_vcpu(cpu);
1636 qemu_cond_signal(&qemu_cpu_cond);
1637 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1638
1639 do {
1640 if (cpu_can_run(cpu)) {
1641 r = hax_smp_cpu_exec(cpu);
1642 if (r == EXCP_DEBUG) {
1643 cpu_handle_guest_debug(cpu);
1644 }
1645 }
1646
1647 qemu_wait_io_event(cpu);
1648 } while (!cpu->unplug || cpu_can_run(cpu));
1649 rcu_unregister_thread();
1650 return NULL;
1651 }
1652
1653 /* The HVF-specific vCPU thread function. This one should only run when the host
1654 * CPU supports the VMX "unrestricted guest" feature. */
1655 static void *qemu_hvf_cpu_thread_fn(void *arg)
1656 {
1657 CPUState *cpu = arg;
1658
1659 int r;
1660
1661 assert(hvf_enabled());
1662
1663 rcu_register_thread();
1664
1665 qemu_mutex_lock_iothread();
1666 qemu_thread_get_self(cpu->thread);
1667
1668 cpu->thread_id = qemu_get_thread_id();
1669 cpu->can_do_io = 1;
1670 current_cpu = cpu;
1671
1672 hvf_init_vcpu(cpu);
1673
1674 /* signal CPU creation */
1675 cpu->created = true;
1676 qemu_cond_signal(&qemu_cpu_cond);
1677 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1678
1679 do {
1680 if (cpu_can_run(cpu)) {
1681 r = hvf_vcpu_exec(cpu);
1682 if (r == EXCP_DEBUG) {
1683 cpu_handle_guest_debug(cpu);
1684 }
1685 }
1686 qemu_wait_io_event(cpu);
1687 } while (!cpu->unplug || cpu_can_run(cpu));
1688
1689 hvf_vcpu_destroy(cpu);
1690 cpu->created = false;
1691 qemu_cond_signal(&qemu_cpu_cond);
1692 qemu_mutex_unlock_iothread();
1693 rcu_unregister_thread();
1694 return NULL;
1695 }
1696
1697 static void *qemu_whpx_cpu_thread_fn(void *arg)
1698 {
1699 CPUState *cpu = arg;
1700 int r;
1701
1702 rcu_register_thread();
1703
1704 qemu_mutex_lock_iothread();
1705 qemu_thread_get_self(cpu->thread);
1706 cpu->thread_id = qemu_get_thread_id();
1707 current_cpu = cpu;
1708
1709 r = whpx_init_vcpu(cpu);
1710 if (r < 0) {
1711 fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
1712 exit(1);
1713 }
1714
1715 /* signal CPU creation */
1716 cpu->created = true;
1717 qemu_cond_signal(&qemu_cpu_cond);
1718 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1719
1720 do {
1721 if (cpu_can_run(cpu)) {
1722 r = whpx_vcpu_exec(cpu);
1723 if (r == EXCP_DEBUG) {
1724 cpu_handle_guest_debug(cpu);
1725 }
1726 }
1727 while (cpu_thread_is_idle(cpu)) {
1728 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1729 }
1730 qemu_wait_io_event_common(cpu);
1731 } while (!cpu->unplug || cpu_can_run(cpu));
1732
1733 whpx_destroy_vcpu(cpu);
1734 cpu->created = false;
1735 qemu_cond_signal(&qemu_cpu_cond);
1736 qemu_mutex_unlock_iothread();
1737 rcu_unregister_thread();
1738 return NULL;
1739 }
1740
#ifdef _WIN32
/*
 * No-op APC routine queued by qemu_cpu_kick_thread() via QueueUserAPC();
 * @unused is ignored.
 */
static void CALLBACK dummy_apc_func(ULONG_PTR unused)
{
    /* Intentionally empty. */
}
#endif
1746
1747 /* Multi-threaded TCG
1748 *
1749 * In the multi-threaded case each vCPU has its own thread. The TLS
1750 * variable current_cpu can be used deep in the code to find the
1751 * current CPUState for a given thread.
1752 */
1753
1754 static void *qemu_tcg_cpu_thread_fn(void *arg)
1755 {
1756 CPUState *cpu = arg;
1757
1758 assert(tcg_enabled());
1759 g_assert(!use_icount);
1760
1761 rcu_register_thread();
1762 tcg_register_thread();
1763
1764 qemu_mutex_lock_iothread();
1765 qemu_thread_get_self(cpu->thread);
1766
1767 cpu->thread_id = qemu_get_thread_id();
1768 cpu->created = true;
1769 cpu->can_do_io = 1;
1770 current_cpu = cpu;
1771 qemu_cond_signal(&qemu_cpu_cond);
1772 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1773
1774 /* process any pending work */
1775 cpu->exit_request = 1;
1776
1777 do {
1778 if (cpu_can_run(cpu)) {
1779 int r;
1780 qemu_mutex_unlock_iothread();
1781 r = tcg_cpu_exec(cpu);
1782 qemu_mutex_lock_iothread();
1783 switch (r) {
1784 case EXCP_DEBUG:
1785 cpu_handle_guest_debug(cpu);
1786 break;
1787 case EXCP_HALTED:
1788 /* during start-up the vCPU is reset and the thread is
1789 * kicked several times. If we don't ensure we go back
1790 * to sleep in the halted state we won't cleanly
1791 * start-up when the vCPU is enabled.
1792 *
1793 * cpu->halted should ensure we sleep in wait_io_event
1794 */
1795 g_assert(cpu->halted);
1796 break;
1797 case EXCP_ATOMIC:
1798 qemu_mutex_unlock_iothread();
1799 cpu_exec_step_atomic(cpu);
1800 qemu_mutex_lock_iothread();
1801 default:
1802 /* Ignore everything else? */
1803 break;
1804 }
1805 }
1806
1807 atomic_mb_set(&cpu->exit_request, 0);
1808 qemu_wait_io_event(cpu);
1809 } while (!cpu->unplug || cpu_can_run(cpu));
1810
1811 qemu_tcg_destroy_vcpu(cpu);
1812 cpu->created = false;
1813 qemu_cond_signal(&qemu_cpu_cond);
1814 qemu_mutex_unlock_iothread();
1815 rcu_unregister_thread();
1816 return NULL;
1817 }
1818
1819 static void qemu_cpu_kick_thread(CPUState *cpu)
1820 {
1821 #ifndef _WIN32
1822 int err;
1823
1824 if (cpu->thread_kicked) {
1825 return;
1826 }
1827 cpu->thread_kicked = true;
1828 err = pthread_kill(cpu->thread->thread, SIG_IPI);
1829 if (err && err != ESRCH) {
1830 fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1831 exit(1);
1832 }
1833 #else /* _WIN32 */
1834 if (!qemu_cpu_is_self(cpu)) {
1835 if (whpx_enabled()) {
1836 whpx_vcpu_kick(cpu);
1837 } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1838 fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1839 __func__, GetLastError());
1840 exit(1);
1841 }
1842 }
1843 #endif
1844 }
1845
1846 void qemu_cpu_kick(CPUState *cpu)
1847 {
1848 qemu_cond_broadcast(cpu->halt_cond);
1849 if (tcg_enabled()) {
1850 if (qemu_tcg_mttcg_enabled()) {
1851 cpu_exit(cpu);
1852 } else {
1853 qemu_cpu_kick_rr_cpus();
1854 }
1855 } else {
1856 if (hax_enabled()) {
1857 /*
1858 * FIXME: race condition with the exit_request check in
1859 * hax_vcpu_hax_exec
1860 */
1861 cpu->exit_request = 1;
1862 }
1863 qemu_cpu_kick_thread(cpu);
1864 }
1865 }
1866
1867 void qemu_cpu_kick_self(void)
1868 {
1869 assert(current_cpu);
1870 qemu_cpu_kick_thread(current_cpu);
1871 }
1872
1873 bool qemu_cpu_is_self(CPUState *cpu)
1874 {
1875 return qemu_thread_is_self(cpu->thread);
1876 }
1877
1878 bool qemu_in_vcpu_thread(void)
1879 {
1880 return current_cpu && qemu_cpu_is_self(current_cpu);
1881 }
1882
/* Per-thread flag: true while this thread holds the iothread lock (BQL). */
static __thread bool iothread_locked = false;

/* Return whether the calling thread currently holds the iothread lock. */
bool qemu_mutex_iothread_locked(void)
{
    return iothread_locked;
}
1889
1890 /*
1891 * The BQL is taken from so many places that it is worth profiling the
1892 * callers directly, instead of funneling them all through a single function.
1893 */
1894 void qemu_mutex_lock_iothread_impl(const char *file, int line)
1895 {
1896 QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);
1897
1898 g_assert(!qemu_mutex_iothread_locked());
1899 bql_lock(&qemu_global_mutex, file, line);
1900 iothread_locked = true;
1901 }
1902
1903 void qemu_mutex_unlock_iothread(void)
1904 {
1905 g_assert(qemu_mutex_iothread_locked());
1906 iothread_locked = false;
1907 qemu_mutex_unlock(&qemu_global_mutex);
1908 }
1909
1910 static bool all_vcpus_paused(void)
1911 {
1912 CPUState *cpu;
1913
1914 CPU_FOREACH(cpu) {
1915 if (!cpu->stopped) {
1916 return false;
1917 }
1918 }
1919
1920 return true;
1921 }
1922
1923 void pause_all_vcpus(void)
1924 {
1925 CPUState *cpu;
1926
1927 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1928 CPU_FOREACH(cpu) {
1929 if (qemu_cpu_is_self(cpu)) {
1930 qemu_cpu_stop(cpu, true);
1931 } else {
1932 cpu->stop = true;
1933 qemu_cpu_kick(cpu);
1934 }
1935 }
1936
1937 /* We need to drop the replay_lock so any vCPU threads woken up
1938 * can finish their replay tasks
1939 */
1940 replay_mutex_unlock();
1941
1942 while (!all_vcpus_paused()) {
1943 qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1944 CPU_FOREACH(cpu) {
1945 qemu_cpu_kick(cpu);
1946 }
1947 }
1948
1949 qemu_mutex_unlock_iothread();
1950 replay_mutex_lock();
1951 qemu_mutex_lock_iothread();
1952 }
1953
1954 void cpu_resume(CPUState *cpu)
1955 {
1956 cpu->stop = false;
1957 cpu->stopped = false;
1958 qemu_cpu_kick(cpu);
1959 }
1960
1961 void resume_all_vcpus(void)
1962 {
1963 CPUState *cpu;
1964
1965 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1966 CPU_FOREACH(cpu) {
1967 cpu_resume(cpu);
1968 }
1969 }
1970
1971 void cpu_remove_sync(CPUState *cpu)
1972 {
1973 cpu->stop = true;
1974 cpu->unplug = true;
1975 qemu_cpu_kick(cpu);
1976 qemu_mutex_unlock_iothread();
1977 qemu_thread_join(cpu->thread);
1978 qemu_mutex_lock_iothread();
1979 }
1980
1981 /* For temporary buffers for forming a name */
1982 #define VCPU_THREAD_NAME_SIZE 16
1983
1984 static void qemu_tcg_init_vcpu(CPUState *cpu)
1985 {
1986 char thread_name[VCPU_THREAD_NAME_SIZE];
1987 static QemuCond *single_tcg_halt_cond;
1988 static QemuThread *single_tcg_cpu_thread;
1989 static int tcg_region_inited;
1990
1991 assert(tcg_enabled());
1992 /*
1993 * Initialize TCG regions--once. Now is a good time, because:
1994 * (1) TCG's init context, prologue and target globals have been set up.
1995 * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1996 * -accel flag is processed, so the check doesn't work then).
1997 */
1998 if (!tcg_region_inited) {
1999 tcg_region_inited = 1;
2000 tcg_region_init();
2001 }
2002
2003 if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
2004 cpu->thread = g_malloc0(sizeof(QemuThread));
2005 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2006 qemu_cond_init(cpu->halt_cond);
2007
2008 if (qemu_tcg_mttcg_enabled()) {
2009 /* create a thread per vCPU with TCG (MTTCG) */
2010 parallel_cpus = true;
2011 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
2012 cpu->cpu_index);
2013
2014 qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
2015 cpu, QEMU_THREAD_JOINABLE);
2016
2017 } else {
2018 /* share a single thread for all cpus with TCG */
2019 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
2020 qemu_thread_create(cpu->thread, thread_name,
2021 qemu_tcg_rr_cpu_thread_fn,
2022 cpu, QEMU_THREAD_JOINABLE);
2023
2024 single_tcg_halt_cond = cpu->halt_cond;
2025 single_tcg_cpu_thread = cpu->thread;
2026 }
2027 #ifdef _WIN32
2028 cpu->hThread = qemu_thread_get_handle(cpu->thread);
2029 #endif
2030 } else {
2031 /* For non-MTTCG cases we share the thread */
2032 cpu->thread = single_tcg_cpu_thread;
2033 cpu->halt_cond = single_tcg_halt_cond;
2034 cpu->thread_id = first_cpu->thread_id;
2035 cpu->can_do_io = 1;
2036 cpu->created = true;
2037 }
2038 }
2039
2040 static void qemu_hax_start_vcpu(CPUState *cpu)
2041 {
2042 char thread_name[VCPU_THREAD_NAME_SIZE];
2043
2044 cpu->thread = g_malloc0(sizeof(QemuThread));
2045 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2046 qemu_cond_init(cpu->halt_cond);
2047
2048 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
2049 cpu->cpu_index);
2050 qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
2051 cpu, QEMU_THREAD_JOINABLE);
2052 #ifdef _WIN32
2053 cpu->hThread = qemu_thread_get_handle(cpu->thread);
2054 #endif
2055 }
2056
2057 static void qemu_kvm_start_vcpu(CPUState *cpu)
2058 {
2059 char thread_name[VCPU_THREAD_NAME_SIZE];
2060
2061 cpu->thread = g_malloc0(sizeof(QemuThread));
2062 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2063 qemu_cond_init(cpu->halt_cond);
2064 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
2065 cpu->cpu_index);
2066 qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
2067 cpu, QEMU_THREAD_JOINABLE);
2068 }
2069
2070 static void qemu_hvf_start_vcpu(CPUState *cpu)
2071 {
2072 char thread_name[VCPU_THREAD_NAME_SIZE];
2073
2074 /* HVF currently does not support TCG, and only runs in
2075 * unrestricted-guest mode. */
2076 assert(hvf_enabled());
2077
2078 cpu->thread = g_malloc0(sizeof(QemuThread));
2079 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2080 qemu_cond_init(cpu->halt_cond);
2081
2082 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
2083 cpu->cpu_index);
2084 qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
2085 cpu, QEMU_THREAD_JOINABLE);
2086 }
2087
2088 static void qemu_whpx_start_vcpu(CPUState *cpu)
2089 {
2090 char thread_name[VCPU_THREAD_NAME_SIZE];
2091
2092 cpu->thread = g_malloc0(sizeof(QemuThread));
2093 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2094 qemu_cond_init(cpu->halt_cond);
2095 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2096 cpu->cpu_index);
2097 qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2098 cpu, QEMU_THREAD_JOINABLE);
2099 #ifdef _WIN32
2100 cpu->hThread = qemu_thread_get_handle(cpu->thread);
2101 #endif
2102 }
2103
2104 static void qemu_dummy_start_vcpu(CPUState *cpu)
2105 {
2106 char thread_name[VCPU_THREAD_NAME_SIZE];
2107
2108 cpu->thread = g_malloc0(sizeof(QemuThread));
2109 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2110 qemu_cond_init(cpu->halt_cond);
2111 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2112 cpu->cpu_index);
2113 qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2114 QEMU_THREAD_JOINABLE);
2115 }
2116
2117 void qemu_init_vcpu(CPUState *cpu)
2118 {
2119 MachineState *ms = MACHINE(qdev_get_machine());
2120
2121 cpu->nr_cores = ms->smp.cores;
2122 cpu->nr_threads = ms->smp.threads;
2123 cpu->stopped = true;
2124 cpu->random_seed = qemu_guest_random_seed_thread_part1();
2125
2126 if (!cpu->as) {
2127 /* If the target cpu hasn't set up any address spaces itself,
2128 * give it the default one.
2129 */
2130 cpu->num_ases = 1;
2131 cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2132 }
2133
2134 if (kvm_enabled()) {
2135 qemu_kvm_start_vcpu(cpu);
2136 } else if (hax_enabled()) {
2137 qemu_hax_start_vcpu(cpu);
2138 } else if (hvf_enabled()) {
2139 qemu_hvf_start_vcpu(cpu);
2140 } else if (tcg_enabled()) {
2141 qemu_tcg_init_vcpu(cpu);
2142 } else if (whpx_enabled()) {
2143 qemu_whpx_start_vcpu(cpu);
2144 } else {
2145 qemu_dummy_start_vcpu(cpu);
2146 }
2147
2148 while (!cpu->created) {
2149 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2150 }
2151 }
2152
2153 void cpu_stop_current(void)
2154 {
2155 if (current_cpu) {
2156 current_cpu->stop = true;
2157 cpu_exit(current_cpu);
2158 }
2159 }
2160
2161 int vm_stop(RunState state)
2162 {
2163 if (qemu_in_vcpu_thread()) {
2164 qemu_system_vmstop_request_prepare();
2165 qemu_system_vmstop_request(state);
2166 /*
2167 * FIXME: should not return to device code in case
2168 * vm_stop() has been requested.
2169 */
2170 cpu_stop_current();
2171 return 0;
2172 }
2173
2174 return do_vm_stop(state, true);
2175 }
2176
2177 /**
2178 * Prepare for (re)starting the VM.
2179 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2180 * running or in case of an error condition), 0 otherwise.
2181 */
2182 int vm_prepare_start(void)
2183 {
2184 RunState requested;
2185
2186 qemu_vmstop_requested(&requested);
2187 if (runstate_is_running() && requested == RUN_STATE__MAX) {
2188 return -1;
2189 }
2190
2191 /* Ensure that a STOP/RESUME pair of events is emitted if a
2192 * vmstop request was pending. The BLOCK_IO_ERROR event, for
2193 * example, according to documentation is always followed by
2194 * the STOP event.
2195 */
2196 if (runstate_is_running()) {
2197 qapi_event_send_stop();
2198 qapi_event_send_resume();
2199 return -1;
2200 }
2201
2202 /* We are sending this now, but the CPUs will be resumed shortly later */
2203 qapi_event_send_resume();
2204
2205 cpu_enable_ticks();
2206 runstate_set(RUN_STATE_RUNNING);
2207 vm_state_notify(1, RUN_STATE_RUNNING);
2208 return 0;
2209 }
2210
/* Start the VM: resume all vCPUs if vm_prepare_start() returns 0. */
void vm_start(void)
{
    if (vm_prepare_start() != 0) {
        return;
    }
    resume_all_vcpus();
}
2217
2218 /* does a state transition even if the VM is already stopped,
2219 current state is forgotten forever */
2220 int vm_stop_force_state(RunState state)
2221 {
2222 if (runstate_is_running()) {
2223 return vm_stop(state);
2224 } else {
2225 runstate_set(state);
2226
2227 bdrv_drain_all();
2228 /* Make sure to return an error if the flush in a previous vm_stop()
2229 * failed. */
2230 return bdrv_flush_all();
2231 }
2232 }
2233
/*
 * List the available CPU models through the target's cpu_list macro, when
 * the target defines one. @optarg is not used.
 */
void list_cpus(const char *optarg)
{
    /* XXX: implement xxx_cpu_list for targets that still miss it */
#if defined(cpu_list)
    cpu_list();
#endif
}
2241
2242 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2243 bool has_cpu, int64_t cpu_index, Error **errp)
2244 {
2245 FILE *f;
2246 uint32_t l;
2247 CPUState *cpu;
2248 uint8_t buf[1024];
2249 int64_t orig_addr = addr, orig_size = size;
2250
2251 if (!has_cpu) {
2252 cpu_index = 0;
2253 }
2254
2255 cpu = qemu_get_cpu(cpu_index);
2256 if (cpu == NULL) {
2257 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2258 "a CPU number");
2259 return;
2260 }
2261
2262 f = fopen(filename, "wb");
2263 if (!f) {
2264 error_setg_file_open(errp, errno, filename);
2265 return;
2266 }
2267
2268 while (size != 0) {
2269 l = sizeof(buf);
2270 if (l > size)
2271 l = size;
2272 if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2273 error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2274 " specified", orig_addr, orig_size);
2275 goto exit;
2276 }
2277 if (fwrite(buf, 1, l, f) != l) {
2278 error_setg(errp, QERR_IO_ERROR);
2279 goto exit;
2280 }
2281 addr += l;
2282 size -= l;
2283 }
2284
2285 exit:
2286 fclose(f);
2287 }
2288
2289 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2290 Error **errp)
2291 {
2292 FILE *f;
2293 uint32_t l;
2294 uint8_t buf[1024];
2295
2296 f = fopen(filename, "wb");
2297 if (!f) {
2298 error_setg_file_open(errp, errno, filename);
2299 return;
2300 }
2301
2302 while (size != 0) {
2303 l = sizeof(buf);
2304 if (l > size)
2305 l = size;
2306 cpu_physical_memory_read(addr, buf, l);
2307 if (fwrite(buf, 1, l, f) != l) {
2308 error_setg(errp, QERR_IO_ERROR);
2309 goto exit;
2310 }
2311 addr += l;
2312 size -= l;
2313 }
2314
2315 exit:
2316 fclose(f);
2317 }
2318
2319 void qmp_inject_nmi(Error **errp)
2320 {
2321 nmi_monitor_handle(monitor_get_cpu_index(), errp);
2322 }
2323
2324 void dump_drift_info(void)
2325 {
2326 if (!use_icount) {
2327 return;
2328 }
2329
2330 qemu_printf("Host - Guest clock %"PRIi64" ms\n",
2331 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2332 if (icount_align_option) {
2333 qemu_printf("Max guest delay %"PRIi64" ms\n",
2334 -max_delay / SCALE_MS);
2335 qemu_printf("Max guest advance %"PRIi64" ms\n",
2336 max_advance / SCALE_MS);
2337 } else {
2338 qemu_printf("Max guest delay NA\n");
2339 qemu_printf("Max guest advance NA\n");
2340 }
2341 }