1 /*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25 /* Needed early for CONFIG_BSD etc. */
26 #include "qemu/osdep.h"
27 #include "qemu-common.h"
28 #include "qemu/config-file.h"
29 #include "cpu.h"
30 #include "monitor/monitor.h"
31 #include "qapi/qmp/qerror.h"
32 #include "qemu/error-report.h"
33 #include "sysemu/sysemu.h"
34 #include "sysemu/block-backend.h"
35 #include "exec/gdbstub.h"
36 #include "sysemu/dma.h"
37 #include "sysemu/hw_accel.h"
38 #include "sysemu/kvm.h"
39 #include "sysemu/hax.h"
40 #include "qmp-commands.h"
41 #include "exec/exec-all.h"
42
43 #include "qemu/thread.h"
44 #include "sysemu/cpus.h"
45 #include "sysemu/qtest.h"
46 #include "qemu/main-loop.h"
47 #include "qemu/bitmap.h"
48 #include "qemu/seqlock.h"
49 #include "tcg.h"
50 #include "qapi-event.h"
51 #include "hw/nmi.h"
52 #include "sysemu/replay.h"
53 #include "hw/boards.h"
54
55 #ifdef CONFIG_LINUX
56
57 #include <sys/prctl.h>
58
59 #ifndef PR_MCE_KILL
60 #define PR_MCE_KILL 33
61 #endif
62
63 #ifndef PR_MCE_KILL_SET
64 #define PR_MCE_KILL_SET 1
65 #endif
66
67 #ifndef PR_MCE_KILL_EARLY
68 #define PR_MCE_KILL_EARLY 1
69 #endif
70
71 #endif /* CONFIG_LINUX */
72
73 int64_t max_delay;
74 int64_t max_advance;
75
76 /* vcpu throttling controls */
77 static QEMUTimer *throttle_timer;
78 static unsigned int throttle_percentage;
79
80 #define CPU_THROTTLE_PCT_MIN 1
81 #define CPU_THROTTLE_PCT_MAX 99
82 #define CPU_THROTTLE_TIMESLICE_NS 10000000
83
84 bool cpu_is_stopped(CPUState *cpu)
85 {
86 return cpu->stopped || !runstate_is_running();
87 }
88
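/* A vCPU thread counts as idle only when it has nothing to do: no pending
 * stop request or queued work, and it is either fully stopped or halted
 * with no work pending (and the halt is not handled in-kernel, as
 * kvm_halt_in_kernel() reports).  This is what the wait-io-event loops
 * below test before going to sleep.
 */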
89 static bool cpu_thread_is_idle(CPUState *cpu)
90 {
91 if (cpu->stop || cpu->queued_work_first) {
92 return false;
93 }
94 if (cpu_is_stopped(cpu)) {
95 return true;
96 }
97 if (!cpu->halted || cpu_has_work(cpu) ||
98 kvm_halt_in_kernel()) {
99 return false;
100 }
101 return true;
102 }
103
104 static bool all_cpu_threads_idle(void)
105 {
106 CPUState *cpu;
107
108 CPU_FOREACH(cpu) {
109 if (!cpu_thread_is_idle(cpu)) {
110 return false;
111 }
112 }
113 return true;
114 }
115
116 /***********************************************************/
117 /* guest cycle counter */
118
119 /* Protected by TimersState seqlock */
120
121 static bool icount_sleep = true;
122 static int64_t vm_clock_warp_start = -1;
123 /* Conversion factor from emulated instructions to virtual clock ticks. */
124 static int icount_time_shift;
125 /* Arbitrarily pick 1MIPS as the minimum allowable speed. */
126 #define MAX_ICOUNT_SHIFT 10
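/* Roughly speaking, with icount enabled virtual time advances by
 * 2^icount_time_shift ns per executed instruction (see cpu_icount_to_ns()
 * below).  A shift of 3 therefore means 8 ns/insn, about 125 MIPS, while
 * the maximum shift of 10 means 1024 ns/insn, about 1 MIPS.
 */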
127
128 static QEMUTimer *icount_rt_timer;
129 static QEMUTimer *icount_vm_timer;
130 static QEMUTimer *icount_warp_timer;
131
132 typedef struct TimersState {
133 /* Protected by BQL. */
134 int64_t cpu_ticks_prev;
135 int64_t cpu_ticks_offset;
136
137 /* cpu_clock_offset can be read out of BQL, so protect it with
138 * this lock.
139 */
140 QemuSeqLock vm_clock_seqlock;
141 int64_t cpu_clock_offset;
142 int32_t cpu_ticks_enabled;
143 int64_t dummy;
144
145 /* Compensate for varying guest execution speed. */
146 int64_t qemu_icount_bias;
147 /* Only written by TCG thread */
148 int64_t qemu_icount;
149 } TimersState;
150
151 static TimersState timers_state;
152 bool mttcg_enabled;
153
154 /*
155 * We default to false if we know other options have been enabled
156 * which are currently incompatible with MTTCG. Otherwise when each
157 * guest (target) has been updated to support:
158 * - atomic instructions
159 * - memory ordering primitives (barriers)
160 * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
161 *
162 * Once a guest architecture has been converted to the new primitives
163 * there are two remaining limitations to check.
164 *
165 * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
166 * - The host must have a stronger memory order than the guest
167 *
168 * It may be possible in future to support strong guests on weak hosts
169 * but that will require tagging all load/stores in a guest with their
170 * implicit memory order requirements which would likely slow things
171 * down a lot.
172 */
173
174 static bool check_tcg_memory_orders_compatible(void)
175 {
176 #if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
177 return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
178 #else
179 return false;
180 #endif
181 }
182
183 static bool default_mttcg_enabled(void)
184 {
185 if (use_icount || TCG_OVERSIZED_GUEST) {
186 return false;
187 } else {
188 #ifdef TARGET_SUPPORTS_MTTCG
189 return check_tcg_memory_orders_compatible();
190 #else
191 return false;
192 #endif
193 }
194 }
195
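/* Parse the "thread" suboption of the TCG accelerator, e.g.
 *     -accel tcg,thread=single
 *     -accel tcg,thread=multi
 * and decide whether MTTCG may be used; without the option we fall back
 * to the per-target default computed above.
 */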
196 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
197 {
198 const char *t = qemu_opt_get(opts, "thread");
199 if (t) {
200 if (strcmp(t, "multi") == 0) {
201 if (TCG_OVERSIZED_GUEST) {
202 error_setg(errp, "No MTTCG when guest word size > host's");
203 } else if (use_icount) {
204 error_setg(errp, "No MTTCG when icount is enabled");
205 } else {
206 #ifndef TARGET_SUPPORTS_MTTCG
207 error_report("Guest not yet converted to MTTCG - "
208 "you may get unexpected results");
209 #endif
210 if (!check_tcg_memory_orders_compatible()) {
211 error_report("Guest expects a stronger memory ordering "
212 "than the host provides");
213 error_printf("This may cause strange/hard to debug errors\n");
214 }
215 mttcg_enabled = true;
216 }
217 } else if (strcmp(t, "single") == 0) {
218 mttcg_enabled = false;
219 } else {
220 error_setg(errp, "Invalid 'thread' setting %s", t);
221 }
222 } else {
223 mttcg_enabled = default_mttcg_enabled();
224 }
225 }
226
227 /* The current number of executed instructions is based on what we
228 * originally budgeted minus the current state of the decrementing
229 * icount counters in extra/u16.low.
230 */
231 static int64_t cpu_get_icount_executed(CPUState *cpu)
232 {
233 return cpu->icount_budget - (cpu->icount_decr.u16.low + cpu->icount_extra);
234 }
235
236 /*
237 * Update the global shared timer_state.qemu_icount to take into
238 * account executed instructions. This is done by the TCG vCPU
239 * thread so the main-loop can see time has moved forward.
240 */
241 void cpu_update_icount(CPUState *cpu)
242 {
243 int64_t executed = cpu_get_icount_executed(cpu);
244 cpu->icount_budget -= executed;
245
246 #ifdef CONFIG_ATOMIC64
247 atomic_set__nocheck(&timers_state.qemu_icount,
248 atomic_read__nocheck(&timers_state.qemu_icount) +
249 executed);
250 #else /* FIXME: we need 64bit atomics to do this safely */
251 timers_state.qemu_icount += executed;
252 #endif
253 }
254
255 int64_t cpu_get_icount_raw(void)
256 {
257 CPUState *cpu = current_cpu;
258
259 if (cpu && cpu->running) {
260 if (!cpu->can_do_io) {
261 fprintf(stderr, "Bad icount read\n");
262 exit(1);
263 }
264 /* Take into account what has run */
265 cpu_update_icount(cpu);
266 }
267 #ifdef CONFIG_ATOMIC64
268 return atomic_read__nocheck(&timers_state.qemu_icount);
269 #else /* FIXME: we need 64bit atomics to do this safely */
270 return timers_state.qemu_icount;
271 #endif
272 }
273
274 /* Return the virtual CPU time, based on the instruction counter. */
275 static int64_t cpu_get_icount_locked(void)
276 {
277 int64_t icount = cpu_get_icount_raw();
278 return timers_state.qemu_icount_bias + cpu_icount_to_ns(icount);
279 }
280
281 int64_t cpu_get_icount(void)
282 {
283 int64_t icount;
284 unsigned start;
285
286 do {
287 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
288 icount = cpu_get_icount_locked();
289 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
290
291 return icount;
292 }
293
294 int64_t cpu_icount_to_ns(int64_t icount)
295 {
296 return icount << icount_time_shift;
297 }
298
299 /* return the time elapsed in VM between vm_start and vm_stop. Unless
300 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
301 * counter.
302 *
303 * Caller must hold the BQL
304 */
305 int64_t cpu_get_ticks(void)
306 {
307 int64_t ticks;
308
309 if (use_icount) {
310 return cpu_get_icount();
311 }
312
313 ticks = timers_state.cpu_ticks_offset;
314 if (timers_state.cpu_ticks_enabled) {
315 ticks += cpu_get_host_ticks();
316 }
317
318 if (timers_state.cpu_ticks_prev > ticks) {
319 /* Note: non-increasing ticks may happen if the host uses
320 software suspend */
321 timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
322 ticks = timers_state.cpu_ticks_prev;
323 }
324
325 timers_state.cpu_ticks_prev = ticks;
326 return ticks;
327 }
328
329 static int64_t cpu_get_clock_locked(void)
330 {
331 int64_t time;
332
333 time = timers_state.cpu_clock_offset;
334 if (timers_state.cpu_ticks_enabled) {
335 time += get_clock();
336 }
337
338 return time;
339 }
340
341 /* Return the monotonic time elapsed in VM, i.e.,
342 * the time between vm_start and vm_stop
343 */
344 int64_t cpu_get_clock(void)
345 {
346 int64_t ti;
347 unsigned start;
348
349 do {
350 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
351 ti = cpu_get_clock_locked();
352 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
353
354 return ti;
355 }
356
357 /* enable cpu_get_ticks()
358 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
359 */
360 void cpu_enable_ticks(void)
361 {
362 /* Here, the thing really protected by the seqlock is cpu_clock_offset. */
363 seqlock_write_begin(&timers_state.vm_clock_seqlock);
364 if (!timers_state.cpu_ticks_enabled) {
365 timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
366 timers_state.cpu_clock_offset -= get_clock();
367 timers_state.cpu_ticks_enabled = 1;
368 }
369 seqlock_write_end(&timers_state.vm_clock_seqlock);
370 }
371
372 /* disable cpu_get_ticks(): the clock is stopped. You must not call
373 * cpu_get_ticks() after that.
374 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
375 */
376 void cpu_disable_ticks(void)
377 {
378 /* Here, the thing really protected by the seqlock is cpu_clock_offset. */
379 seqlock_write_begin(&timers_state.vm_clock_seqlock);
380 if (timers_state.cpu_ticks_enabled) {
381 timers_state.cpu_ticks_offset += cpu_get_host_ticks();
382 timers_state.cpu_clock_offset = cpu_get_clock_locked();
383 timers_state.cpu_ticks_enabled = 0;
384 }
385 seqlock_write_end(&timers_state.vm_clock_seqlock);
386 }
387
388 /* Correlation between real and virtual time is always going to be
389 fairly approximate, so ignore small variation.
390 When the guest is idle real and virtual time will be aligned in
391 the IO wait loop. */
392 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
393
394 static void icount_adjust(void)
395 {
396 int64_t cur_time;
397 int64_t cur_icount;
398 int64_t delta;
399
400 /* Protected by TimersState mutex. */
401 static int64_t last_delta;
402
403 /* If the VM is not running, then do nothing. */
404 if (!runstate_is_running()) {
405 return;
406 }
407
408 seqlock_write_begin(&timers_state.vm_clock_seqlock);
409 cur_time = cpu_get_clock_locked();
410 cur_icount = cpu_get_icount_locked();
411
412 delta = cur_icount - cur_time;
413 /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
414 if (delta > 0
415 && last_delta + ICOUNT_WOBBLE < delta * 2
416 && icount_time_shift > 0) {
417 /* The guest is getting too far ahead. Slow time down. */
418 icount_time_shift--;
419 }
420 if (delta < 0
421 && last_delta - ICOUNT_WOBBLE > delta * 2
422 && icount_time_shift < MAX_ICOUNT_SHIFT) {
423 /* The guest is getting too far behind. Speed time up. */
424 icount_time_shift++;
425 }
426 last_delta = delta;
427 timers_state.qemu_icount_bias = cur_icount
428 - (timers_state.qemu_icount << icount_time_shift);
429 seqlock_write_end(&timers_state.vm_clock_seqlock);
430 }
431
432 static void icount_adjust_rt(void *opaque)
433 {
434 timer_mod(icount_rt_timer,
435 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
436 icount_adjust();
437 }
438
439 static void icount_adjust_vm(void *opaque)
440 {
441 timer_mod(icount_vm_timer,
442 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
443 NANOSECONDS_PER_SECOND / 10);
444 icount_adjust();
445 }
446
447 static int64_t qemu_icount_round(int64_t count)
448 {
449 return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
450 }
451
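/* Apply a pending clock warp: the QEMU_CLOCK_VIRTUAL_RT time that has
 * passed since vm_clock_warp_start is folded into qemu_icount_bias so
 * that QEMU_CLOCK_VIRTUAL catches up with the deadline that scheduled
 * the warp timer (capped in adaptive mode, use_icount == 2, so the
 * virtual clock does not run too far ahead of real time).
 */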
452 static void icount_warp_rt(void)
453 {
454 unsigned seq;
455 int64_t warp_start;
456
457 /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
458 * changes from -1 to another value, so the race here is okay.
459 */
460 do {
461 seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
462 warp_start = vm_clock_warp_start;
463 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
464
465 if (warp_start == -1) {
466 return;
467 }
468
469 seqlock_write_begin(&timers_state.vm_clock_seqlock);
470 if (runstate_is_running()) {
471 int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
472 cpu_get_clock_locked());
473 int64_t warp_delta;
474
475 warp_delta = clock - vm_clock_warp_start;
476 if (use_icount == 2) {
477 /*
478 * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
479 * far ahead of real time.
480 */
481 int64_t cur_icount = cpu_get_icount_locked();
482 int64_t delta = clock - cur_icount;
483 warp_delta = MIN(warp_delta, delta);
484 }
485 timers_state.qemu_icount_bias += warp_delta;
486 }
487 vm_clock_warp_start = -1;
488 seqlock_write_end(&timers_state.vm_clock_seqlock);
489
490 if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
491 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
492 }
493 }
494
495 static void icount_timer_cb(void *opaque)
496 {
497 /* No need for a checkpoint because the timer already synchronizes
498 * with CHECKPOINT_CLOCK_VIRTUAL_RT.
499 */
500 icount_warp_rt();
501 }
502
503 void qtest_clock_warp(int64_t dest)
504 {
505 int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
506 AioContext *aio_context;
507 assert(qtest_enabled());
508 aio_context = qemu_get_aio_context();
509 while (clock < dest) {
510 int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
511 int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
512
513 seqlock_write_begin(&timers_state.vm_clock_seqlock);
514 timers_state.qemu_icount_bias += warp;
515 seqlock_write_end(&timers_state.vm_clock_seqlock);
516
517 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
518 timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
519 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
520 }
521 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
522 }
523
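/* With icount, QEMU_CLOCK_VIRTUAL only advances while instructions are
 * being executed, so when every vCPU is idle nothing would ever wake
 * them up again.  This arranges to "warp" the virtual clock forward to
 * the next QEMU_CLOCK_VIRTUAL deadline, either immediately (sleep=off)
 * or once a matching amount of QEMU_CLOCK_VIRTUAL_RT time has passed
 * (sleep=on, via icount_warp_timer).
 */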
524 void qemu_start_warp_timer(void)
525 {
526 int64_t clock;
527 int64_t deadline;
528
529 if (!use_icount) {
530 return;
531 }
532
533 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
534 * do not fire, so computing the deadline does not make sense.
535 */
536 if (!runstate_is_running()) {
537 return;
538 }
539
540 /* warp clock deterministically in record/replay mode */
541 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
542 return;
543 }
544
545 if (!all_cpu_threads_idle()) {
546 return;
547 }
548
549 if (qtest_enabled()) {
550 /* When testing, qtest commands advance icount. */
551 return;
552 }
553
554 /* We want to use the earliest deadline from ALL vm_clocks */
555 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
556 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
557 if (deadline < 0) {
558 static bool notified;
559 if (!icount_sleep && !notified) {
560 warn_report("icount sleep disabled and no active timers");
561 notified = true;
562 }
563 return;
564 }
565
566 if (deadline > 0) {
567 /*
568 * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
569 * sleep. Otherwise, the CPU might be waiting for a future timer
570 * interrupt to wake it up, but the interrupt never comes because
571 * the vCPU isn't running any insns and thus doesn't advance the
572 * QEMU_CLOCK_VIRTUAL.
573 */
574 if (!icount_sleep) {
575 /*
576 * We never let VCPUs sleep in no sleep icount mode.
577 * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
578 * to the next QEMU_CLOCK_VIRTUAL event and notify it.
579 * It is useful when we want a deterministic execution time,
580 * isolated from host latencies.
581 */
582 seqlock_write_begin(&timers_state.vm_clock_seqlock);
583 timers_state.qemu_icount_bias += deadline;
584 seqlock_write_end(&timers_state.vm_clock_seqlock);
585 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
586 } else {
587 /*
588 * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
589 * "real" time (related to the time left until the next event) has
590 * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
591 * This keeps the warps from being visible externally; for example,
592 * you will not be sending network packets continuously instead of
593 * every 100ms.
594 */
595 seqlock_write_begin(&timers_state.vm_clock_seqlock);
596 if (vm_clock_warp_start == -1 || vm_clock_warp_start > clock) {
597 vm_clock_warp_start = clock;
598 }
599 seqlock_write_end(&timers_state.vm_clock_seqlock);
600 timer_mod_anticipate(icount_warp_timer, clock + deadline);
601 }
602 } else if (deadline == 0) {
603 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
604 }
605 }
606
607 static void qemu_account_warp_timer(void)
608 {
609 if (!use_icount || !icount_sleep) {
610 return;
611 }
612
613 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
614 * do not fire, so computing the deadline does not make sense.
615 */
616 if (!runstate_is_running()) {
617 return;
618 }
619
620 /* warp clock deterministically in record/replay mode */
621 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
622 return;
623 }
624
625 timer_del(icount_warp_timer);
626 icount_warp_rt();
627 }
628
629 static bool icount_state_needed(void *opaque)
630 {
631 return use_icount;
632 }
633
634 /*
635 * This is a subsection for icount migration.
636 */
637 static const VMStateDescription icount_vmstate_timers = {
638 .name = "timer/icount",
639 .version_id = 1,
640 .minimum_version_id = 1,
641 .needed = icount_state_needed,
642 .fields = (VMStateField[]) {
643 VMSTATE_INT64(qemu_icount_bias, TimersState),
644 VMSTATE_INT64(qemu_icount, TimersState),
645 VMSTATE_END_OF_LIST()
646 }
647 };
648
649 static const VMStateDescription vmstate_timers = {
650 .name = "timer",
651 .version_id = 2,
652 .minimum_version_id = 1,
653 .fields = (VMStateField[]) {
654 VMSTATE_INT64(cpu_ticks_offset, TimersState),
655 VMSTATE_INT64(dummy, TimersState),
656 VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
657 VMSTATE_END_OF_LIST()
658 },
659 .subsections = (const VMStateDescription*[]) {
660 &icount_vmstate_timers,
661 NULL
662 }
663 };
664
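/* Async work item queued on each vCPU while throttling is active: with a
 * throttle percentage pct, the vCPU sleeps outside the BQL for
 * pct/(1-pct) of each 10 ms timeslice.  For example, at 50% it sleeps
 * 10 ms per slice and at 99% roughly 990 ms.
 */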
665 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
666 {
667 double pct;
668 double throttle_ratio;
669 long sleeptime_ns;
670
671 if (!cpu_throttle_get_percentage()) {
672 return;
673 }
674
675 pct = (double)cpu_throttle_get_percentage()/100;
676 throttle_ratio = pct / (1 - pct);
677 sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
678
679 qemu_mutex_unlock_iothread();
680 g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
681 qemu_mutex_lock_iothread();
682 atomic_set(&cpu->throttle_thread_scheduled, 0);
683 }
684
685 static void cpu_throttle_timer_tick(void *opaque)
686 {
687 CPUState *cpu;
688 double pct;
689
690 /* Stop the timer if needed */
691 if (!cpu_throttle_get_percentage()) {
692 return;
693 }
694 CPU_FOREACH(cpu) {
695 if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
696 async_run_on_cpu(cpu, cpu_throttle_thread,
697 RUN_ON_CPU_NULL);
698 }
699 }
700
701 pct = (double)cpu_throttle_get_percentage()/100;
702 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
703 CPU_THROTTLE_TIMESLICE_NS / (1-pct));
704 }
705
706 void cpu_throttle_set(int new_throttle_pct)
707 {
708 /* Ensure throttle percentage is within valid range */
709 new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
710 new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
711
712 atomic_set(&throttle_percentage, new_throttle_pct);
713
714 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
715 CPU_THROTTLE_TIMESLICE_NS);
716 }
717
718 void cpu_throttle_stop(void)
719 {
720 atomic_set(&throttle_percentage, 0);
721 }
722
723 bool cpu_throttle_active(void)
724 {
725 return (cpu_throttle_get_percentage() != 0);
726 }
727
728 int cpu_throttle_get_percentage(void)
729 {
730 return atomic_read(&throttle_percentage);
731 }
732
733 void cpu_ticks_init(void)
734 {
735 seqlock_init(&timers_state.vm_clock_seqlock);
736 vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
737 throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
738 cpu_throttle_timer_tick, NULL);
739 }
740
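/* Configure instruction counting from the -icount option, e.g.
 *     -icount shift=7,align=off,sleep=on
 *     -icount shift=auto
 * A numeric shift fixes the instruction-to-ns scaling (use_icount == 1);
 * "auto" (use_icount == 2) starts from shift=3 and lets the adjustment
 * timers below retune it at runtime.
 */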
741 void configure_icount(QemuOpts *opts, Error **errp)
742 {
743 const char *option;
744 char *rem_str = NULL;
745
746 option = qemu_opt_get(opts, "shift");
747 if (!option) {
748 if (qemu_opt_get(opts, "align") != NULL) {
749 error_setg(errp, "Please specify shift option when using align");
750 }
751 return;
752 }
753
754 icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
755 if (icount_sleep) {
756 icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
757 icount_timer_cb, NULL);
758 }
759
760 icount_align_option = qemu_opt_get_bool(opts, "align", false);
761
762 if (icount_align_option && !icount_sleep) {
763 error_setg(errp, "align=on and sleep=off are incompatible");
764 }
765 if (strcmp(option, "auto") != 0) {
766 errno = 0;
767 icount_time_shift = strtol(option, &rem_str, 0);
768 if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
769 error_setg(errp, "icount: Invalid shift value");
770 }
771 use_icount = 1;
772 return;
773 } else if (icount_align_option) {
774 error_setg(errp, "shift=auto and align=on are incompatible");
775 } else if (!icount_sleep) {
776 error_setg(errp, "shift=auto and sleep=off are incompatible");
777 }
778
779 use_icount = 2;
780
781 /* 125MIPS seems a reasonable initial guess at the guest speed.
782 It will be corrected fairly quickly anyway. */
783 icount_time_shift = 3;
784
785 /* Have both realtime and virtual time triggers for speed adjustment.
786 The realtime trigger catches emulated time passing too slowly,
787 the virtual time trigger catches emulated time passing too fast.
788 Realtime triggers occur even when idle, so use them less frequently
789 than VM triggers. */
790 icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
791 icount_adjust_rt, NULL);
792 timer_mod(icount_rt_timer,
793 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
794 icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
795 icount_adjust_vm, NULL);
796 timer_mod(icount_vm_timer,
797 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
798 NANOSECONDS_PER_SECOND / 10);
799 }
800
801 /***********************************************************/
802 /* TCG vCPU kick timer
803 *
804 * The kick timer is responsible for moving single threaded vCPU
805 * emulation on to the next vCPU. If more than one vCPU is running, a
806 * timer event will force a cpu->exit so the next vCPU can get
807 * scheduled.
808 *
809 * The timer is removed while all vCPUs are idle and restarted
810 * once a vCPU stops being idle.
811 */
812
813 static QEMUTimer *tcg_kick_vcpu_timer;
814 static CPUState *tcg_current_rr_cpu;
815
816 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
817
818 static inline int64_t qemu_tcg_next_kick(void)
819 {
820 return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
821 }
822
823 /* Kick the currently round-robin scheduled vCPU */
824 static void qemu_cpu_kick_rr_cpu(void)
825 {
826 CPUState *cpu;
827 do {
828 cpu = atomic_mb_read(&tcg_current_rr_cpu);
829 if (cpu) {
830 cpu_exit(cpu);
831 }
832 } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
833 }
834
835 static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
836 {
837 }
838
839 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
840 {
841 if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
842 qemu_notify_event();
843 return;
844 }
845
846 if (qemu_in_vcpu_thread()) {
847 /* A CPU is currently running; kick it back out to the
848 * tcg_cpu_exec() loop so it will recalculate its
849 * icount deadline immediately.
850 */
851 qemu_cpu_kick(current_cpu);
852 } else if (first_cpu) {
853 /* qemu_cpu_kick is not enough to kick a halted CPU out of
854 * qemu_tcg_wait_io_event. async_run_on_cpu, instead,
855 * causes cpu_thread_is_idle to return false. This way,
856 * handle_icount_deadline can run.
857 * If we have no CPUs at all for some reason, we don't
858 * need to do anything.
859 */
860 async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
861 }
862 }
863
864 static void kick_tcg_thread(void *opaque)
865 {
866 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
867 qemu_cpu_kick_rr_cpu();
868 }
869
870 static void start_tcg_kick_timer(void)
871 {
872 if (!mttcg_enabled && !tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
873 tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
874 kick_tcg_thread, NULL);
875 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
876 }
877 }
878
879 static void stop_tcg_kick_timer(void)
880 {
881 if (tcg_kick_vcpu_timer) {
882 timer_del(tcg_kick_vcpu_timer);
883 tcg_kick_vcpu_timer = NULL;
884 }
885 }
886
887 /***********************************************************/
888 void hw_error(const char *fmt, ...)
889 {
890 va_list ap;
891 CPUState *cpu;
892
893 va_start(ap, fmt);
894 fprintf(stderr, "qemu: hardware error: ");
895 vfprintf(stderr, fmt, ap);
896 fprintf(stderr, "\n");
897 CPU_FOREACH(cpu) {
898 fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
899 cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
900 }
901 va_end(ap);
902 abort();
903 }
904
905 void cpu_synchronize_all_states(void)
906 {
907 CPUState *cpu;
908
909 CPU_FOREACH(cpu) {
910 cpu_synchronize_state(cpu);
911 }
912 }
913
914 void cpu_synchronize_all_post_reset(void)
915 {
916 CPUState *cpu;
917
918 CPU_FOREACH(cpu) {
919 cpu_synchronize_post_reset(cpu);
920 }
921 }
922
923 void cpu_synchronize_all_post_init(void)
924 {
925 CPUState *cpu;
926
927 CPU_FOREACH(cpu) {
928 cpu_synchronize_post_init(cpu);
929 }
930 }
931
932 void cpu_synchronize_all_pre_loadvm(void)
933 {
934 CPUState *cpu;
935
936 CPU_FOREACH(cpu) {
937 cpu_synchronize_pre_loadvm(cpu);
938 }
939 }
940
941 static int do_vm_stop(RunState state)
942 {
943 int ret = 0;
944
945 if (runstate_is_running()) {
946 cpu_disable_ticks();
947 pause_all_vcpus();
948 runstate_set(state);
949 vm_state_notify(0, state);
950 qapi_event_send_stop(&error_abort);
951 }
952
953 bdrv_drain_all();
954 replay_disable_events();
955 ret = bdrv_flush_all();
956
957 return ret;
958 }
959
960 static bool cpu_can_run(CPUState *cpu)
961 {
962 if (cpu->stop) {
963 return false;
964 }
965 if (cpu_is_stopped(cpu)) {
966 return false;
967 }
968 return true;
969 }
970
971 static void cpu_handle_guest_debug(CPUState *cpu)
972 {
973 gdb_set_stop_cpu(cpu);
974 qemu_system_debug_request();
975 cpu->stopped = true;
976 }
977
978 #ifdef CONFIG_LINUX
979 static void sigbus_reraise(void)
980 {
981 sigset_t set;
982 struct sigaction action;
983
984 memset(&action, 0, sizeof(action));
985 action.sa_handler = SIG_DFL;
986 if (!sigaction(SIGBUS, &action, NULL)) {
987 raise(SIGBUS);
988 sigemptyset(&set);
989 sigaddset(&set, SIGBUS);
990 pthread_sigmask(SIG_UNBLOCK, &set, NULL);
991 }
992 perror("Failed to re-raise SIGBUS!\n");
993 abort();
994 }
995
996 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
997 {
998 if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
999 sigbus_reraise();
1000 }
1001
1002 if (current_cpu) {
1003 /* Called asynchronously in VCPU thread. */
1004 if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1005 sigbus_reraise();
1006 }
1007 } else {
1008 /* Called synchronously (via signalfd) in main thread. */
1009 if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1010 sigbus_reraise();
1011 }
1012 }
1013 }
1014
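/* Install the SIGBUS handler above and use PR_MCE_KILL to ask the kernel
 * for early delivery of machine-check memory errors as SIGBUS, so that
 * they can be forwarded to KVM (see sigbus_handler()) instead of killing
 * the QEMU process outright.
 */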
1015 static void qemu_init_sigbus(void)
1016 {
1017 struct sigaction action;
1018
1019 memset(&action, 0, sizeof(action));
1020 action.sa_flags = SA_SIGINFO;
1021 action.sa_sigaction = sigbus_handler;
1022 sigaction(SIGBUS, &action, NULL);
1023
1024 prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1025 }
1026 #else /* !CONFIG_LINUX */
1027 static void qemu_init_sigbus(void)
1028 {
1029 }
1030 #endif /* !CONFIG_LINUX */
1031
1032 static QemuMutex qemu_global_mutex;
1033
1034 static QemuThread io_thread;
1035
1036 /* cpu creation */
1037 static QemuCond qemu_cpu_cond;
1038 /* system init */
1039 static QemuCond qemu_pause_cond;
1040
1041 void qemu_init_cpu_loop(void)
1042 {
1043 qemu_init_sigbus();
1044 qemu_cond_init(&qemu_cpu_cond);
1045 qemu_cond_init(&qemu_pause_cond);
1046 qemu_mutex_init(&qemu_global_mutex);
1047
1048 qemu_thread_get_self(&io_thread);
1049 }
1050
1051 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
1052 {
1053 do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
1054 }
1055
1056 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1057 {
1058 if (kvm_destroy_vcpu(cpu) < 0) {
1059 error_report("kvm_destroy_vcpu failed");
1060 exit(EXIT_FAILURE);
1061 }
1062 }
1063
1064 static void qemu_tcg_destroy_vcpu(CPUState *cpu)
1065 {
1066 }
1067
1068 static void qemu_wait_io_event_common(CPUState *cpu)
1069 {
1070 atomic_mb_set(&cpu->thread_kicked, false);
1071 if (cpu->stop) {
1072 cpu->stop = false;
1073 cpu->stopped = true;
1074 qemu_cond_broadcast(&qemu_pause_cond);
1075 }
1076 process_queued_cpu_work(cpu);
1077 }
1078
1079 static bool qemu_tcg_should_sleep(CPUState *cpu)
1080 {
1081 if (mttcg_enabled) {
1082 return cpu_thread_is_idle(cpu);
1083 } else {
1084 return all_cpu_threads_idle();
1085 }
1086 }
1087
1088 static void qemu_tcg_wait_io_event(CPUState *cpu)
1089 {
1090 while (qemu_tcg_should_sleep(cpu)) {
1091 stop_tcg_kick_timer();
1092 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1093 }
1094
1095 start_tcg_kick_timer();
1096
1097 qemu_wait_io_event_common(cpu);
1098 }
1099
1100 static void qemu_kvm_wait_io_event(CPUState *cpu)
1101 {
1102 while (cpu_thread_is_idle(cpu)) {
1103 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1104 }
1105
1106 qemu_wait_io_event_common(cpu);
1107 }
1108
1109 static void *qemu_kvm_cpu_thread_fn(void *arg)
1110 {
1111 CPUState *cpu = arg;
1112 int r;
1113
1114 rcu_register_thread();
1115
1116 qemu_mutex_lock_iothread();
1117 qemu_thread_get_self(cpu->thread);
1118 cpu->thread_id = qemu_get_thread_id();
1119 cpu->can_do_io = 1;
1120 current_cpu = cpu;
1121
1122 r = kvm_init_vcpu(cpu);
1123 if (r < 0) {
1124 fprintf(stderr, "kvm_init_vcpu failed: %s\n", strerror(-r));
1125 exit(1);
1126 }
1127
1128 kvm_init_cpu_signals(cpu);
1129
1130 /* signal CPU creation */
1131 cpu->created = true;
1132 qemu_cond_signal(&qemu_cpu_cond);
1133
1134 do {
1135 if (cpu_can_run(cpu)) {
1136 r = kvm_cpu_exec(cpu);
1137 if (r == EXCP_DEBUG) {
1138 cpu_handle_guest_debug(cpu);
1139 }
1140 }
1141 qemu_kvm_wait_io_event(cpu);
1142 } while (!cpu->unplug || cpu_can_run(cpu));
1143
1144 qemu_kvm_destroy_vcpu(cpu);
1145 cpu->created = false;
1146 qemu_cond_signal(&qemu_cpu_cond);
1147 qemu_mutex_unlock_iothread();
1148 return NULL;
1149 }
1150
1151 static void *qemu_dummy_cpu_thread_fn(void *arg)
1152 {
1153 #ifdef _WIN32
1154 fprintf(stderr, "qtest is not supported under Windows\n");
1155 exit(1);
1156 #else
1157 CPUState *cpu = arg;
1158 sigset_t waitset;
1159 int r;
1160
1161 rcu_register_thread();
1162
1163 qemu_mutex_lock_iothread();
1164 qemu_thread_get_self(cpu->thread);
1165 cpu->thread_id = qemu_get_thread_id();
1166 cpu->can_do_io = 1;
1167 current_cpu = cpu;
1168
1169 sigemptyset(&waitset);
1170 sigaddset(&waitset, SIG_IPI);
1171
1172 /* signal CPU creation */
1173 cpu->created = true;
1174 qemu_cond_signal(&qemu_cpu_cond);
1175
1176 while (1) {
1177 qemu_mutex_unlock_iothread();
1178 do {
1179 int sig;
1180 r = sigwait(&waitset, &sig);
1181 } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1182 if (r == -1) {
1183 perror("sigwait");
1184 exit(1);
1185 }
1186 qemu_mutex_lock_iothread();
1187 qemu_wait_io_event_common(cpu);
1188 }
1189
1190 return NULL;
1191 #endif
1192 }
1193
1194 static int64_t tcg_get_icount_limit(void)
1195 {
1196 int64_t deadline;
1197
1198 if (replay_mode != REPLAY_MODE_PLAY) {
1199 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1200
1201 /* Maintain prior (possibly buggy) behaviour where if no deadline
1202 * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1203 * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1204 * nanoseconds.
1205 */
1206 if ((deadline < 0) || (deadline > INT32_MAX)) {
1207 deadline = INT32_MAX;
1208 }
1209
1210 return qemu_icount_round(deadline);
1211 } else {
1212 return replay_get_instructions();
1213 }
1214 }
1215
1216 static void handle_icount_deadline(void)
1217 {
1218 assert(qemu_in_vcpu_thread());
1219 if (use_icount) {
1220 int64_t deadline =
1221 qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1222
1223 if (deadline == 0) {
1224 /* Wake up other AioContexts. */
1225 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1226 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1227 }
1228 }
1229 }
1230
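/* Hand the vCPU its instruction budget for the next execution slice.
 * Only the low 16 bits fit in icount_decr.u16.low, the counter that the
 * generated code actually decrements; whatever is left of the budget is
 * parked in icount_extra and consumed by later refills in the exec loop.
 */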
1231 static void prepare_icount_for_run(CPUState *cpu)
1232 {
1233 if (use_icount) {
1234 int insns_left;
1235
1236 /* These should always be cleared by process_icount_data after
1237 * each vCPU execution. However u16.high can be raised
1238 * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1239 */
1240 g_assert(cpu->icount_decr.u16.low == 0);
1241 g_assert(cpu->icount_extra == 0);
1242
1243 cpu->icount_budget = tcg_get_icount_limit();
1244 insns_left = MIN(0xffff, cpu->icount_budget);
1245 cpu->icount_decr.u16.low = insns_left;
1246 cpu->icount_extra = cpu->icount_budget - insns_left;
1247 }
1248 }
1249
1250 static void process_icount_data(CPUState *cpu)
1251 {
1252 if (use_icount) {
1253 /* Account for executed instructions */
1254 cpu_update_icount(cpu);
1255
1256 /* Reset the counters */
1257 cpu->icount_decr.u16.low = 0;
1258 cpu->icount_extra = 0;
1259 cpu->icount_budget = 0;
1260
1261 replay_account_executed_instructions();
1262 }
1263 }
1264
1265
1266 static int tcg_cpu_exec(CPUState *cpu)
1267 {
1268 int ret;
1269 #ifdef CONFIG_PROFILER
1270 int64_t ti;
1271 #endif
1272
1273 #ifdef CONFIG_PROFILER
1274 ti = profile_getclock();
1275 #endif
1276 qemu_mutex_unlock_iothread();
1277 cpu_exec_start(cpu);
1278 ret = cpu_exec(cpu);
1279 cpu_exec_end(cpu);
1280 qemu_mutex_lock_iothread();
1281 #ifdef CONFIG_PROFILER
1282 tcg_time += profile_getclock() - ti;
1283 #endif
1284 return ret;
1285 }
1286
1287 /* Destroy any remaining vCPUs which have been unplugged and have
1288 * finished running
1289 */
1290 static void deal_with_unplugged_cpus(void)
1291 {
1292 CPUState *cpu;
1293
1294 CPU_FOREACH(cpu) {
1295 if (cpu->unplug && !cpu_can_run(cpu)) {
1296 qemu_tcg_destroy_vcpu(cpu);
1297 cpu->created = false;
1298 qemu_cond_signal(&qemu_cpu_cond);
1299 break;
1300 }
1301 }
1302 }
1303
1304 /* Single-threaded TCG
1305 *
1306 * In the single-threaded case each vCPU is simulated in turn. If
1307 * there is more than a single vCPU we create a simple timer to kick
1308 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1309 * This is done explicitly rather than relying on side-effects
1310 * elsewhere.
1311 */
1312
1313 static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
1314 {
1315 CPUState *cpu = arg;
1316
1317 rcu_register_thread();
1318 tcg_register_thread();
1319
1320 qemu_mutex_lock_iothread();
1321 qemu_thread_get_self(cpu->thread);
1322
1323 CPU_FOREACH(cpu) {
1324 cpu->thread_id = qemu_get_thread_id();
1325 cpu->created = true;
1326 cpu->can_do_io = 1;
1327 }
1328 qemu_cond_signal(&qemu_cpu_cond);
1329
1330 /* wait for initial kick-off after machine start */
1331 while (first_cpu->stopped) {
1332 qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1333
1334 /* process any pending work */
1335 CPU_FOREACH(cpu) {
1336 current_cpu = cpu;
1337 qemu_wait_io_event_common(cpu);
1338 }
1339 }
1340
1341 start_tcg_kick_timer();
1342
1343 cpu = first_cpu;
1344
1345 /* process any pending work */
1346 cpu->exit_request = 1;
1347
1348 while (1) {
1349 /* Account partial waits to QEMU_CLOCK_VIRTUAL. */
1350 qemu_account_warp_timer();
1351
1352 /* Run the timers here. This is much more efficient than
1353 * waking up the I/O thread and waiting for completion.
1354 */
1355 handle_icount_deadline();
1356
1357 if (!cpu) {
1358 cpu = first_cpu;
1359 }
1360
1361 while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
1362
1363 atomic_mb_set(&tcg_current_rr_cpu, cpu);
1364 current_cpu = cpu;
1365
1366 qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1367 (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1368
1369 if (cpu_can_run(cpu)) {
1370 int r;
1371
1372 prepare_icount_for_run(cpu);
1373
1374 r = tcg_cpu_exec(cpu);
1375
1376 process_icount_data(cpu);
1377
1378 if (r == EXCP_DEBUG) {
1379 cpu_handle_guest_debug(cpu);
1380 break;
1381 } else if (r == EXCP_ATOMIC) {
1382 qemu_mutex_unlock_iothread();
1383 cpu_exec_step_atomic(cpu);
1384 qemu_mutex_lock_iothread();
1385 break;
1386 }
1387 } else if (cpu->stop) {
1388 if (cpu->unplug) {
1389 cpu = CPU_NEXT(cpu);
1390 }
1391 break;
1392 }
1393
1394 cpu = CPU_NEXT(cpu);
1395 } /* while (cpu && !cpu->exit_request).. */
1396
1397 /* Does not need atomic_mb_set because a spurious wakeup is okay. */
1398 atomic_set(&tcg_current_rr_cpu, NULL);
1399
1400 if (cpu && cpu->exit_request) {
1401 atomic_mb_set(&cpu->exit_request, 0);
1402 }
1403
1404 qemu_tcg_wait_io_event(cpu ? cpu : QTAILQ_FIRST(&cpus));
1405 deal_with_unplugged_cpus();
1406 }
1407
1408 return NULL;
1409 }
1410
1411 static void *qemu_hax_cpu_thread_fn(void *arg)
1412 {
1413 CPUState *cpu = arg;
1414 int r;
1415
1416 qemu_mutex_lock_iothread();
1417 qemu_thread_get_self(cpu->thread);
1418
1419 cpu->thread_id = qemu_get_thread_id();
1420 cpu->created = true;
1421 cpu->halted = 0;
1422 current_cpu = cpu;
1423
1424 hax_init_vcpu(cpu);
1425 qemu_cond_signal(&qemu_cpu_cond);
1426
1427 while (1) {
1428 if (cpu_can_run(cpu)) {
1429 r = hax_smp_cpu_exec(cpu);
1430 if (r == EXCP_DEBUG) {
1431 cpu_handle_guest_debug(cpu);
1432 }
1433 }
1434
1435 while (cpu_thread_is_idle(cpu)) {
1436 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1437 }
1438 #ifdef _WIN32
1439 SleepEx(0, TRUE);
1440 #endif
1441 qemu_wait_io_event_common(cpu);
1442 }
1443 return NULL;
1444 }
1445
1446 #ifdef _WIN32
1447 static void CALLBACK dummy_apc_func(ULONG_PTR unused)
1448 {
1449 }
1450 #endif
1451
1452 /* Multi-threaded TCG
1453 *
1454 * In the multi-threaded case each vCPU has its own thread. The TLS
1455 * variable current_cpu can be used deep in the code to find the
1456 * current CPUState for a given thread.
1457 */
1458
1459 static void *qemu_tcg_cpu_thread_fn(void *arg)
1460 {
1461 CPUState *cpu = arg;
1462
1463 g_assert(!use_icount);
1464
1465 rcu_register_thread();
1466 tcg_register_thread();
1467
1468 qemu_mutex_lock_iothread();
1469 qemu_thread_get_self(cpu->thread);
1470
1471 cpu->thread_id = qemu_get_thread_id();
1472 cpu->created = true;
1473 cpu->can_do_io = 1;
1474 current_cpu = cpu;
1475 qemu_cond_signal(&qemu_cpu_cond);
1476
1477 /* process any pending work */
1478 cpu->exit_request = 1;
1479
1480 while (1) {
1481 if (cpu_can_run(cpu)) {
1482 int r;
1483 r = tcg_cpu_exec(cpu);
1484 switch (r) {
1485 case EXCP_DEBUG:
1486 cpu_handle_guest_debug(cpu);
1487 break;
1488 case EXCP_HALTED:
1489 /* during start-up the vCPU is reset and the thread is
1490 * kicked several times. If we don't ensure we go back
1491 * to sleep in the halted state we won't cleanly
1492 * start up when the vCPU is enabled.
1493 *
1494 * cpu->halted should ensure we sleep in wait_io_event
1495 */
1496 g_assert(cpu->halted);
1497 break;
1498 case EXCP_ATOMIC:
1499 qemu_mutex_unlock_iothread();
1500 cpu_exec_step_atomic(cpu);
1501 qemu_mutex_lock_iothread();
1502 default:
1503 /* Ignore everything else? */
1504 break;
1505 }
1506 } else if (cpu->unplug) {
1507 qemu_tcg_destroy_vcpu(cpu);
1508 cpu->created = false;
1509 qemu_cond_signal(&qemu_cpu_cond);
1510 qemu_mutex_unlock_iothread();
1511 return NULL;
1512 }
1513
1514 atomic_mb_set(&cpu->exit_request, 0);
1515 qemu_tcg_wait_io_event(cpu);
1516 }
1517
1518 return NULL;
1519 }
1520
1521 static void qemu_cpu_kick_thread(CPUState *cpu)
1522 {
1523 #ifndef _WIN32
1524 int err;
1525
1526 if (cpu->thread_kicked) {
1527 return;
1528 }
1529 cpu->thread_kicked = true;
1530 err = pthread_kill(cpu->thread->thread, SIG_IPI);
1531 if (err) {
1532 fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1533 exit(1);
1534 }
1535 #else /* _WIN32 */
1536 if (!qemu_cpu_is_self(cpu)) {
1537 if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1538 fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1539 __func__, GetLastError());
1540 exit(1);
1541 }
1542 }
1543 #endif
1544 }
1545
1546 void qemu_cpu_kick(CPUState *cpu)
1547 {
1548 qemu_cond_broadcast(cpu->halt_cond);
1549 if (tcg_enabled()) {
1550 cpu_exit(cpu);
1551 /* NOP unless doing single-thread RR */
1552 qemu_cpu_kick_rr_cpu();
1553 } else {
1554 if (hax_enabled()) {
1555 /*
1556 * FIXME: race condition with the exit_request check in
1557 * hax_vcpu_hax_exec
1558 */
1559 cpu->exit_request = 1;
1560 }
1561 qemu_cpu_kick_thread(cpu);
1562 }
1563 }
1564
1565 void qemu_cpu_kick_self(void)
1566 {
1567 assert(current_cpu);
1568 qemu_cpu_kick_thread(current_cpu);
1569 }
1570
1571 bool qemu_cpu_is_self(CPUState *cpu)
1572 {
1573 return qemu_thread_is_self(cpu->thread);
1574 }
1575
1576 bool qemu_in_vcpu_thread(void)
1577 {
1578 return current_cpu && qemu_cpu_is_self(current_cpu);
1579 }
1580
1581 static __thread bool iothread_locked = false;
1582
1583 bool qemu_mutex_iothread_locked(void)
1584 {
1585 return iothread_locked;
1586 }
1587
1588 void qemu_mutex_lock_iothread(void)
1589 {
1590 g_assert(!qemu_mutex_iothread_locked());
1591 qemu_mutex_lock(&qemu_global_mutex);
1592 iothread_locked = true;
1593 }
1594
1595 void qemu_mutex_unlock_iothread(void)
1596 {
1597 g_assert(qemu_mutex_iothread_locked());
1598 iothread_locked = false;
1599 qemu_mutex_unlock(&qemu_global_mutex);
1600 }
1601
1602 static bool all_vcpus_paused(void)
1603 {
1604 CPUState *cpu;
1605
1606 CPU_FOREACH(cpu) {
1607 if (!cpu->stopped) {
1608 return false;
1609 }
1610 }
1611
1612 return true;
1613 }
1614
1615 void pause_all_vcpus(void)
1616 {
1617 CPUState *cpu;
1618
1619 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1620 CPU_FOREACH(cpu) {
1621 cpu->stop = true;
1622 qemu_cpu_kick(cpu);
1623 }
1624
1625 if (qemu_in_vcpu_thread()) {
1626 cpu_stop_current();
1627 }
1628
1629 while (!all_vcpus_paused()) {
1630 qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1631 CPU_FOREACH(cpu) {
1632 qemu_cpu_kick(cpu);
1633 }
1634 }
1635 }
1636
1637 void cpu_resume(CPUState *cpu)
1638 {
1639 cpu->stop = false;
1640 cpu->stopped = false;
1641 qemu_cpu_kick(cpu);
1642 }
1643
1644 void resume_all_vcpus(void)
1645 {
1646 CPUState *cpu;
1647
1648 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1649 CPU_FOREACH(cpu) {
1650 cpu_resume(cpu);
1651 }
1652 }
1653
1654 void cpu_remove(CPUState *cpu)
1655 {
1656 cpu->stop = true;
1657 cpu->unplug = true;
1658 qemu_cpu_kick(cpu);
1659 }
1660
1661 void cpu_remove_sync(CPUState *cpu)
1662 {
1663 cpu_remove(cpu);
1664 while (cpu->created) {
1665 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1666 }
1667 }
1668
1669 /* For temporary buffers for forming a name */
1670 #define VCPU_THREAD_NAME_SIZE 16
1671
1672 static void qemu_tcg_init_vcpu(CPUState *cpu)
1673 {
1674 char thread_name[VCPU_THREAD_NAME_SIZE];
1675 static QemuCond *single_tcg_halt_cond;
1676 static QemuThread *single_tcg_cpu_thread;
1677 static int tcg_region_inited;
1678
1679 /*
1680 * Initialize TCG regions--once. Now is a good time, because:
1681 * (1) TCG's init context, prologue and target globals have been set up.
1682 * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1683 * -accel flag is processed, so the check doesn't work then).
1684 */
1685 if (!tcg_region_inited) {
1686 tcg_region_inited = 1;
1687 tcg_region_init();
1688 }
1689
1690 if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1691 cpu->thread = g_malloc0(sizeof(QemuThread));
1692 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1693 qemu_cond_init(cpu->halt_cond);
1694
1695 if (qemu_tcg_mttcg_enabled()) {
1696 /* create a thread per vCPU with TCG (MTTCG) */
1697 parallel_cpus = true;
1698 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1699 cpu->cpu_index);
1700
1701 qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1702 cpu, QEMU_THREAD_JOINABLE);
1703
1704 } else {
1705 /* share a single thread for all cpus with TCG */
1706 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1707 qemu_thread_create(cpu->thread, thread_name,
1708 qemu_tcg_rr_cpu_thread_fn,
1709 cpu, QEMU_THREAD_JOINABLE);
1710
1711 single_tcg_halt_cond = cpu->halt_cond;
1712 single_tcg_cpu_thread = cpu->thread;
1713 }
1714 #ifdef _WIN32
1715 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1716 #endif
1717 while (!cpu->created) {
1718 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1719 }
1720 } else {
1721 /* For non-MTTCG cases we share the thread */
1722 cpu->thread = single_tcg_cpu_thread;
1723 cpu->halt_cond = single_tcg_halt_cond;
1724 }
1725 }
1726
1727 static void qemu_hax_start_vcpu(CPUState *cpu)
1728 {
1729 char thread_name[VCPU_THREAD_NAME_SIZE];
1730
1731 cpu->thread = g_malloc0(sizeof(QemuThread));
1732 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1733 qemu_cond_init(cpu->halt_cond);
1734
1735 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
1736 cpu->cpu_index);
1737 qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
1738 cpu, QEMU_THREAD_JOINABLE);
1739 #ifdef _WIN32
1740 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1741 #endif
1742 while (!cpu->created) {
1743 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1744 }
1745 }
1746
1747 static void qemu_kvm_start_vcpu(CPUState *cpu)
1748 {
1749 char thread_name[VCPU_THREAD_NAME_SIZE];
1750
1751 cpu->thread = g_malloc0(sizeof(QemuThread));
1752 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1753 qemu_cond_init(cpu->halt_cond);
1754 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
1755 cpu->cpu_index);
1756 qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
1757 cpu, QEMU_THREAD_JOINABLE);
1758 while (!cpu->created) {
1759 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1760 }
1761 }
1762
1763 static void qemu_dummy_start_vcpu(CPUState *cpu)
1764 {
1765 char thread_name[VCPU_THREAD_NAME_SIZE];
1766
1767 cpu->thread = g_malloc0(sizeof(QemuThread));
1768 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1769 qemu_cond_init(cpu->halt_cond);
1770 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
1771 cpu->cpu_index);
1772 qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
1773 QEMU_THREAD_JOINABLE);
1774 while (!cpu->created) {
1775 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1776 }
1777 }
1778
1779 void qemu_init_vcpu(CPUState *cpu)
1780 {
1781 cpu->nr_cores = smp_cores;
1782 cpu->nr_threads = smp_threads;
1783 cpu->stopped = true;
1784
1785 if (!cpu->as) {
1786 /* If the target cpu hasn't set up any address spaces itself,
1787 * give it the default one.
1788 */
1789 AddressSpace *as = g_new0(AddressSpace, 1);
1790
1791 address_space_init(as, cpu->memory, "cpu-memory");
1792 cpu->num_ases = 1;
1793 cpu_address_space_init(cpu, as, 0);
1794 }
1795
1796 if (kvm_enabled()) {
1797 qemu_kvm_start_vcpu(cpu);
1798 } else if (hax_enabled()) {
1799 qemu_hax_start_vcpu(cpu);
1800 } else if (tcg_enabled()) {
1801 qemu_tcg_init_vcpu(cpu);
1802 } else {
1803 qemu_dummy_start_vcpu(cpu);
1804 }
1805 }
1806
1807 void cpu_stop_current(void)
1808 {
1809 if (current_cpu) {
1810 current_cpu->stop = false;
1811 current_cpu->stopped = true;
1812 cpu_exit(current_cpu);
1813 qemu_cond_broadcast(&qemu_pause_cond);
1814 }
1815 }
1816
1817 int vm_stop(RunState state)
1818 {
1819 if (qemu_in_vcpu_thread()) {
1820 qemu_system_vmstop_request_prepare();
1821 qemu_system_vmstop_request(state);
1822 /*
1823 * FIXME: should not return to device code in case
1824 * vm_stop() has been requested.
1825 */
1826 cpu_stop_current();
1827 return 0;
1828 }
1829
1830 return do_vm_stop(state);
1831 }
1832
1833 /**
1834 * Prepare for (re)starting the VM.
1835 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
1836 * running or in case of an error condition), 0 otherwise.
1837 */
1838 int vm_prepare_start(void)
1839 {
1840 RunState requested;
1841 int res = 0;
1842
1843 qemu_vmstop_requested(&requested);
1844 if (runstate_is_running() && requested == RUN_STATE__MAX) {
1845 return -1;
1846 }
1847
1848 /* Ensure that a STOP/RESUME pair of events is emitted if a
1849 * vmstop request was pending. The BLOCK_IO_ERROR event, for
1850 * example, is documented to always be followed by
1851 * the STOP event.
1852 */
1853 if (runstate_is_running()) {
1854 qapi_event_send_stop(&error_abort);
1855 res = -1;
1856 } else {
1857 replay_enable_events();
1858 cpu_enable_ticks();
1859 runstate_set(RUN_STATE_RUNNING);
1860 vm_state_notify(1, RUN_STATE_RUNNING);
1861 }
1862
1863 /* We are sending this now, but the CPUs will be resumed shortly later */
1864 qapi_event_send_resume(&error_abort);
1865 return res;
1866 }
1867
1868 void vm_start(void)
1869 {
1870 if (!vm_prepare_start()) {
1871 resume_all_vcpus();
1872 }
1873 }
1874
1875 /* does a state transition even if the VM is already stopped;
1876 the current state is forgotten forever */
1877 int vm_stop_force_state(RunState state)
1878 {
1879 if (runstate_is_running()) {
1880 return vm_stop(state);
1881 } else {
1882 runstate_set(state);
1883
1884 bdrv_drain_all();
1885 /* Make sure to return an error if the flush in a previous vm_stop()
1886 * failed. */
1887 return bdrv_flush_all();
1888 }
1889 }
1890
1891 void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
1892 {
1893 /* XXX: implement xxx_cpu_list for targets that still miss it */
1894 #if defined(cpu_list)
1895 cpu_list(f, cpu_fprintf);
1896 #endif
1897 }
1898
1899 CpuInfoList *qmp_query_cpus(Error **errp)
1900 {
1901 MachineState *ms = MACHINE(qdev_get_machine());
1902 MachineClass *mc = MACHINE_GET_CLASS(ms);
1903 CpuInfoList *head = NULL, *cur_item = NULL;
1904 CPUState *cpu;
1905
1906 CPU_FOREACH(cpu) {
1907 CpuInfoList *info;
1908 #if defined(TARGET_I386)
1909 X86CPU *x86_cpu = X86_CPU(cpu);
1910 CPUX86State *env = &x86_cpu->env;
1911 #elif defined(TARGET_PPC)
1912 PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
1913 CPUPPCState *env = &ppc_cpu->env;
1914 #elif defined(TARGET_SPARC)
1915 SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
1916 CPUSPARCState *env = &sparc_cpu->env;
1917 #elif defined(TARGET_MIPS)
1918 MIPSCPU *mips_cpu = MIPS_CPU(cpu);
1919 CPUMIPSState *env = &mips_cpu->env;
1920 #elif defined(TARGET_TRICORE)
1921 TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
1922 CPUTriCoreState *env = &tricore_cpu->env;
1923 #endif
1924
1925 cpu_synchronize_state(cpu);
1926
1927 info = g_malloc0(sizeof(*info));
1928 info->value = g_malloc0(sizeof(*info->value));
1929 info->value->CPU = cpu->cpu_index;
1930 info->value->current = (cpu == first_cpu);
1931 info->value->halted = cpu->halted;
1932 info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
1933 info->value->thread_id = cpu->thread_id;
1934 #if defined(TARGET_I386)
1935 info->value->arch = CPU_INFO_ARCH_X86;
1936 info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
1937 #elif defined(TARGET_PPC)
1938 info->value->arch = CPU_INFO_ARCH_PPC;
1939 info->value->u.ppc.nip = env->nip;
1940 #elif defined(TARGET_SPARC)
1941 info->value->arch = CPU_INFO_ARCH_SPARC;
1942 info->value->u.q_sparc.pc = env->pc;
1943 info->value->u.q_sparc.npc = env->npc;
1944 #elif defined(TARGET_MIPS)
1945 info->value->arch = CPU_INFO_ARCH_MIPS;
1946 info->value->u.q_mips.PC = env->active_tc.PC;
1947 #elif defined(TARGET_TRICORE)
1948 info->value->arch = CPU_INFO_ARCH_TRICORE;
1949 info->value->u.tricore.PC = env->PC;
1950 #else
1951 info->value->arch = CPU_INFO_ARCH_OTHER;
1952 #endif
1953 info->value->has_props = !!mc->cpu_index_to_instance_props;
1954 if (info->value->has_props) {
1955 CpuInstanceProperties *props;
1956 props = g_malloc0(sizeof(*props));
1957 *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
1958 info->value->props = props;
1959 }
1960
1961 /* XXX: waiting for the qapi to support GSList */
1962 if (!cur_item) {
1963 head = cur_item = info;
1964 } else {
1965 cur_item->next = info;
1966 cur_item = info;
1967 }
1968 }
1969
1970 return head;
1971 }
1972
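/* QMP "memsave": dump a range of guest virtual memory to a host file,
 * roughly:
 *   { "execute": "memsave",
 *     "arguments": { "val": 4096, "size": 100,
 *                    "filename": "/tmp/virtual-mem-dump" } }
 * The optional "cpu-index" argument selects whose address space to read;
 * it defaults to CPU 0 below.
 */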
1973 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
1974 bool has_cpu, int64_t cpu_index, Error **errp)
1975 {
1976 FILE *f;
1977 uint32_t l;
1978 CPUState *cpu;
1979 uint8_t buf[1024];
1980 int64_t orig_addr = addr, orig_size = size;
1981
1982 if (!has_cpu) {
1983 cpu_index = 0;
1984 }
1985
1986 cpu = qemu_get_cpu(cpu_index);
1987 if (cpu == NULL) {
1988 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
1989 "a CPU number");
1990 return;
1991 }
1992
1993 f = fopen(filename, "wb");
1994 if (!f) {
1995 error_setg_file_open(errp, errno, filename);
1996 return;
1997 }
1998
1999 while (size != 0) {
2000 l = sizeof(buf);
2001 if (l > size)
2002 l = size;
2003 if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2004 error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2005 " specified", orig_addr, orig_size);
2006 goto exit;
2007 }
2008 if (fwrite(buf, 1, l, f) != l) {
2009 error_setg(errp, QERR_IO_ERROR);
2010 goto exit;
2011 }
2012 addr += l;
2013 size -= l;
2014 }
2015
2016 exit:
2017 fclose(f);
2018 }
2019
2020 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2021 Error **errp)
2022 {
2023 FILE *f;
2024 uint32_t l;
2025 uint8_t buf[1024];
2026
2027 f = fopen(filename, "wb");
2028 if (!f) {
2029 error_setg_file_open(errp, errno, filename);
2030 return;
2031 }
2032
2033 while (size != 0) {
2034 l = sizeof(buf);
2035 if (l > size)
2036 l = size;
2037 cpu_physical_memory_read(addr, buf, l);
2038 if (fwrite(buf, 1, l, f) != l) {
2039 error_setg(errp, QERR_IO_ERROR);
2040 goto exit;
2041 }
2042 addr += l;
2043 size -= l;
2044 }
2045
2046 exit:
2047 fclose(f);
2048 }
2049
2050 void qmp_inject_nmi(Error **errp)
2051 {
2052 nmi_monitor_handle(monitor_get_cpu_index(), errp);
2053 }
2054
2055 void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
2056 {
2057 if (!use_icount) {
2058 return;
2059 }
2060
2061 cpu_fprintf(f, "Host - Guest clock %"PRIi64" ms\n",
2062 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2063 if (icount_align_option) {
2064 cpu_fprintf(f, "Max guest delay %"PRIi64" ms\n", -max_delay/SCALE_MS);
2065 cpu_fprintf(f, "Max guest advance %"PRIi64" ms\n", max_advance/SCALE_MS);
2066 } else {
2067 cpu_fprintf(f, "Max guest delay NA\n");
2068 cpu_fprintf(f, "Max guest advance NA\n");
2069 }
2070 }