ppc/pnv: Add support for POWER8+ LPC Controller
[qemu.git] / cpus.c
1 /*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25 /* Needed early for CONFIG_BSD etc. */
26 #include "qemu/osdep.h"
27 #include "qemu-common.h"
28 #include "qemu/config-file.h"
29 #include "cpu.h"
30 #include "monitor/monitor.h"
31 #include "qapi/qmp/qerror.h"
32 #include "qemu/error-report.h"
33 #include "sysemu/sysemu.h"
34 #include "sysemu/block-backend.h"
35 #include "exec/gdbstub.h"
36 #include "sysemu/dma.h"
37 #include "sysemu/hw_accel.h"
38 #include "sysemu/kvm.h"
39 #include "sysemu/hax.h"
40 #include "qmp-commands.h"
41 #include "exec/exec-all.h"
42
43 #include "qemu/thread.h"
44 #include "sysemu/cpus.h"
45 #include "sysemu/qtest.h"
46 #include "qemu/main-loop.h"
47 #include "qemu/bitmap.h"
48 #include "qemu/seqlock.h"
49 #include "tcg.h"
50 #include "qapi-event.h"
51 #include "hw/nmi.h"
52 #include "sysemu/replay.h"
53
54 #ifdef CONFIG_LINUX
55
56 #include <sys/prctl.h>
57
58 #ifndef PR_MCE_KILL
59 #define PR_MCE_KILL 33
60 #endif
61
62 #ifndef PR_MCE_KILL_SET
63 #define PR_MCE_KILL_SET 1
64 #endif
65
66 #ifndef PR_MCE_KILL_EARLY
67 #define PR_MCE_KILL_EARLY 1
68 #endif
69
70 #endif /* CONFIG_LINUX */
71
72 int64_t max_delay;
73 int64_t max_advance;
74
75 /* vcpu throttling controls */
76 static QEMUTimer *throttle_timer;
77 static unsigned int throttle_percentage;
78
79 #define CPU_THROTTLE_PCT_MIN 1
80 #define CPU_THROTTLE_PCT_MAX 99
81 #define CPU_THROTTLE_TIMESLICE_NS 10000000
82
83 bool cpu_is_stopped(CPUState *cpu)
84 {
85 return cpu->stopped || !runstate_is_running();
86 }
87
88 static bool cpu_thread_is_idle(CPUState *cpu)
89 {
90 if (cpu->stop || cpu->queued_work_first) {
91 return false;
92 }
93 if (cpu_is_stopped(cpu)) {
94 return true;
95 }
96 if (!cpu->halted || cpu_has_work(cpu) ||
97 kvm_halt_in_kernel()) {
98 return false;
99 }
100 return true;
101 }
102
103 static bool all_cpu_threads_idle(void)
104 {
105 CPUState *cpu;
106
107 CPU_FOREACH(cpu) {
108 if (!cpu_thread_is_idle(cpu)) {
109 return false;
110 }
111 }
112 return true;
113 }
114
115 /***********************************************************/
116 /* guest cycle counter */
117
118 /* Protected by TimersState seqlock */
119
120 static bool icount_sleep = true;
121 static int64_t vm_clock_warp_start = -1;
122 /* Conversion factor from emulated instructions to virtual clock ticks. */
123 static int icount_time_shift;
124 /* Arbitrarily pick 1MIPS as the minimum allowable speed. */
125 #define MAX_ICOUNT_SHIFT 10
126
127 static QEMUTimer *icount_rt_timer;
128 static QEMUTimer *icount_vm_timer;
129 static QEMUTimer *icount_warp_timer;
130
131 typedef struct TimersState {
132 /* Protected by BQL. */
133 int64_t cpu_ticks_prev;
134 int64_t cpu_ticks_offset;
135
136 /* cpu_clock_offset can be read out of BQL, so protect it with
137 * this lock.
138 */
139 QemuSeqLock vm_clock_seqlock;
140 int64_t cpu_clock_offset;
141 int32_t cpu_ticks_enabled;
142 int64_t dummy;
143
144 /* Compensate for varying guest execution speed. */
145 int64_t qemu_icount_bias;
146 /* Only written by TCG thread */
147 int64_t qemu_icount;
148 } TimersState;
149
150 static TimersState timers_state;
151 bool mttcg_enabled;
152
153 /*
154 * We default to false if we know other options have been enabled
155 * which are currently incompatible with MTTCG. Otherwise when each
156 * guest (target) has been updated to support:
157 * - atomic instructions
158 * - memory ordering primitives (barriers)
159 * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
160 *
161 * Once a guest architecture has been converted to the new primitives
162 * there are two remaining limitations to check.
163 *
164 * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
165 * - The host must have a stronger memory order than the guest
166 *
167 * It may be possible in future to support strong guests on weak hosts
168 * but that will require tagging all load/stores in a guest with their
169 * implicit memory order requirements which would likely slow things
170 * down a lot.
171 */
172
/* True when every ordering bit the guest requires is also provided by the
 * host; false when either default memory-order macro is not defined.
 */
static bool check_tcg_memory_orders_compatible(void)
{
    bool compatible = false;

#if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
    compatible = !(TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO);
#endif
    return compatible;
}
181
182 static bool default_mttcg_enabled(void)
183 {
184 if (use_icount || TCG_OVERSIZED_GUEST) {
185 return false;
186 } else {
187 #ifdef TARGET_SUPPORTS_MTTCG
188 return check_tcg_memory_orders_compatible();
189 #else
190 return false;
191 #endif
192 }
193 }
194
195 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
196 {
197 const char *t = qemu_opt_get(opts, "thread");
198 if (t) {
199 if (strcmp(t, "multi") == 0) {
200 if (TCG_OVERSIZED_GUEST) {
201 error_setg(errp, "No MTTCG when guest word size > hosts");
202 } else if (use_icount) {
203 error_setg(errp, "No MTTCG when icount is enabled");
204 } else {
205 #ifndef TARGET_SUPPORTS_MTTCG
206 error_report("Guest not yet converted to MTTCG - "
207 "you may get unexpected results");
208 #endif
209 if (!check_tcg_memory_orders_compatible()) {
210 error_report("Guest expects a stronger memory ordering "
211 "than the host provides");
212 error_printf("This may cause strange/hard to debug errors\n");
213 }
214 mttcg_enabled = true;
215 }
216 } else if (strcmp(t, "single") == 0) {
217 mttcg_enabled = false;
218 } else {
219 error_setg(errp, "Invalid 'thread' setting %s", t);
220 }
221 } else {
222 mttcg_enabled = default_mttcg_enabled();
223 }
224 }
225
226 /* The current number of executed instructions is based on what we
227 * originally budgeted minus the current state of the decrementing
228 * icount counters in extra/u16.low.
229 */
230 static int64_t cpu_get_icount_executed(CPUState *cpu)
231 {
232 return cpu->icount_budget - (cpu->icount_decr.u16.low + cpu->icount_extra);
233 }
234
235 /*
236 * Update the global shared timer_state.qemu_icount to take into
237 * account executed instructions. This is done by the TCG vCPU
238 * thread so the main-loop can see time has moved forward.
239 */
240 void cpu_update_icount(CPUState *cpu)
241 {
242 int64_t executed = cpu_get_icount_executed(cpu);
243 cpu->icount_budget -= executed;
244
245 #ifdef CONFIG_ATOMIC64
246 atomic_set__nocheck(&timers_state.qemu_icount,
247 atomic_read__nocheck(&timers_state.qemu_icount) +
248 executed);
249 #else /* FIXME: we need 64bit atomics to do this safely */
250 timers_state.qemu_icount += executed;
251 #endif
252 }
253
254 int64_t cpu_get_icount_raw(void)
255 {
256 CPUState *cpu = current_cpu;
257
258 if (cpu && cpu->running) {
259 if (!cpu->can_do_io) {
260 fprintf(stderr, "Bad icount read\n");
261 exit(1);
262 }
263 /* Take into account what has run */
264 cpu_update_icount(cpu);
265 }
266 #ifdef CONFIG_ATOMIC64
267 return atomic_read__nocheck(&timers_state.qemu_icount);
268 #else /* FIXME: we need 64bit atomics to do this safely */
269 return timers_state.qemu_icount;
270 #endif
271 }
272
273 /* Return the virtual CPU time, based on the instruction counter. */
274 static int64_t cpu_get_icount_locked(void)
275 {
276 int64_t icount = cpu_get_icount_raw();
277 return timers_state.qemu_icount_bias + cpu_icount_to_ns(icount);
278 }
279
280 int64_t cpu_get_icount(void)
281 {
282 int64_t icount;
283 unsigned start;
284
285 do {
286 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
287 icount = cpu_get_icount_locked();
288 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
289
290 return icount;
291 }
292
293 int64_t cpu_icount_to_ns(int64_t icount)
294 {
295 return icount << icount_time_shift;
296 }
297
298 /* return the time elapsed in VM between vm_start and vm_stop. Unless
299 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
300 * counter.
301 *
302 * Caller must hold the BQL
303 */
304 int64_t cpu_get_ticks(void)
305 {
306 int64_t ticks;
307
308 if (use_icount) {
309 return cpu_get_icount();
310 }
311
312 ticks = timers_state.cpu_ticks_offset;
313 if (timers_state.cpu_ticks_enabled) {
314 ticks += cpu_get_host_ticks();
315 }
316
317 if (timers_state.cpu_ticks_prev > ticks) {
318 /* Note: non increasing ticks may happen if the host uses
319 software suspend */
320 timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
321 ticks = timers_state.cpu_ticks_prev;
322 }
323
324 timers_state.cpu_ticks_prev = ticks;
325 return ticks;
326 }
327
328 static int64_t cpu_get_clock_locked(void)
329 {
330 int64_t time;
331
332 time = timers_state.cpu_clock_offset;
333 if (timers_state.cpu_ticks_enabled) {
334 time += get_clock();
335 }
336
337 return time;
338 }
339
340 /* Return the monotonic time elapsed in VM, i.e.,
341 * the time between vm_start and vm_stop
342 */
343 int64_t cpu_get_clock(void)
344 {
345 int64_t ti;
346 unsigned start;
347
348 do {
349 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
350 ti = cpu_get_clock_locked();
351 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
352
353 return ti;
354 }
355
356 /* enable cpu_get_ticks()
357 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
358 */
359 void cpu_enable_ticks(void)
360 {
361 /* Here, the really thing protected by seqlock is cpu_clock_offset. */
362 seqlock_write_begin(&timers_state.vm_clock_seqlock);
363 if (!timers_state.cpu_ticks_enabled) {
364 timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
365 timers_state.cpu_clock_offset -= get_clock();
366 timers_state.cpu_ticks_enabled = 1;
367 }
368 seqlock_write_end(&timers_state.vm_clock_seqlock);
369 }
370
371 /* disable cpu_get_ticks() : the clock is stopped. You must not call
372 * cpu_get_ticks() after that.
373 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
374 */
375 void cpu_disable_ticks(void)
376 {
377 /* Here, the really thing protected by seqlock is cpu_clock_offset. */
378 seqlock_write_begin(&timers_state.vm_clock_seqlock);
379 if (timers_state.cpu_ticks_enabled) {
380 timers_state.cpu_ticks_offset += cpu_get_host_ticks();
381 timers_state.cpu_clock_offset = cpu_get_clock_locked();
382 timers_state.cpu_ticks_enabled = 0;
383 }
384 seqlock_write_end(&timers_state.vm_clock_seqlock);
385 }
386
387 /* Correlation between real and virtual time is always going to be
388 fairly approximate, so ignore small variation.
389 When the guest is idle real and virtual time will be aligned in
390 the IO wait loop. */
391 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
392
393 static void icount_adjust(void)
394 {
395 int64_t cur_time;
396 int64_t cur_icount;
397 int64_t delta;
398
399 /* Protected by TimersState mutex. */
400 static int64_t last_delta;
401
402 /* If the VM is not running, then do nothing. */
403 if (!runstate_is_running()) {
404 return;
405 }
406
407 seqlock_write_begin(&timers_state.vm_clock_seqlock);
408 cur_time = cpu_get_clock_locked();
409 cur_icount = cpu_get_icount_locked();
410
411 delta = cur_icount - cur_time;
412 /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
413 if (delta > 0
414 && last_delta + ICOUNT_WOBBLE < delta * 2
415 && icount_time_shift > 0) {
416 /* The guest is getting too far ahead. Slow time down. */
417 icount_time_shift--;
418 }
419 if (delta < 0
420 && last_delta - ICOUNT_WOBBLE > delta * 2
421 && icount_time_shift < MAX_ICOUNT_SHIFT) {
422 /* The guest is getting too far behind. Speed time up. */
423 icount_time_shift++;
424 }
425 last_delta = delta;
426 timers_state.qemu_icount_bias = cur_icount
427 - (timers_state.qemu_icount << icount_time_shift);
428 seqlock_write_end(&timers_state.vm_clock_seqlock);
429 }
430
431 static void icount_adjust_rt(void *opaque)
432 {
433 timer_mod(icount_rt_timer,
434 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
435 icount_adjust();
436 }
437
438 static void icount_adjust_vm(void *opaque)
439 {
440 timer_mod(icount_vm_timer,
441 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
442 NANOSECONDS_PER_SECOND / 10);
443 icount_adjust();
444 }
445
446 static int64_t qemu_icount_round(int64_t count)
447 {
448 return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
449 }
450
451 static void icount_warp_rt(void)
452 {
453 unsigned seq;
454 int64_t warp_start;
455
456 /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
457 * changes from -1 to another value, so the race here is okay.
458 */
459 do {
460 seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
461 warp_start = vm_clock_warp_start;
462 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
463
464 if (warp_start == -1) {
465 return;
466 }
467
468 seqlock_write_begin(&timers_state.vm_clock_seqlock);
469 if (runstate_is_running()) {
470 int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
471 cpu_get_clock_locked());
472 int64_t warp_delta;
473
474 warp_delta = clock - vm_clock_warp_start;
475 if (use_icount == 2) {
476 /*
477 * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
478 * far ahead of real time.
479 */
480 int64_t cur_icount = cpu_get_icount_locked();
481 int64_t delta = clock - cur_icount;
482 warp_delta = MIN(warp_delta, delta);
483 }
484 timers_state.qemu_icount_bias += warp_delta;
485 }
486 vm_clock_warp_start = -1;
487 seqlock_write_end(&timers_state.vm_clock_seqlock);
488
489 if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
490 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
491 }
492 }
493
/* Callback of icount_warp_timer; simply completes the pending warp. */
static void icount_timer_cb(void *opaque)
{
    /* No need for a checkpoint because the timer already synchronizes
     * with CHECKPOINT_CLOCK_VIRTUAL_RT.
     */
    icount_warp_rt();
}
501
502 void qtest_clock_warp(int64_t dest)
503 {
504 int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
505 AioContext *aio_context;
506 assert(qtest_enabled());
507 aio_context = qemu_get_aio_context();
508 while (clock < dest) {
509 int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
510 int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
511
512 seqlock_write_begin(&timers_state.vm_clock_seqlock);
513 timers_state.qemu_icount_bias += warp;
514 seqlock_write_end(&timers_state.vm_clock_seqlock);
515
516 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
517 timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
518 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
519 }
520 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
521 }
522
523 void qemu_start_warp_timer(void)
524 {
525 int64_t clock;
526 int64_t deadline;
527
528 if (!use_icount) {
529 return;
530 }
531
532 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
533 * do not fire, so computing the deadline does not make sense.
534 */
535 if (!runstate_is_running()) {
536 return;
537 }
538
539 /* warp clock deterministically in record/replay mode */
540 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
541 return;
542 }
543
544 if (!all_cpu_threads_idle()) {
545 return;
546 }
547
548 if (qtest_enabled()) {
549 /* When testing, qtest commands advance icount. */
550 return;
551 }
552
553 /* We want to use the earliest deadline from ALL vm_clocks */
554 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
555 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
556 if (deadline < 0) {
557 static bool notified;
558 if (!icount_sleep && !notified) {
559 error_report("WARNING: icount sleep disabled and no active timers");
560 notified = true;
561 }
562 return;
563 }
564
565 if (deadline > 0) {
566 /*
567 * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
568 * sleep. Otherwise, the CPU might be waiting for a future timer
569 * interrupt to wake it up, but the interrupt never comes because
570 * the vCPU isn't running any insns and thus doesn't advance the
571 * QEMU_CLOCK_VIRTUAL.
572 */
573 if (!icount_sleep) {
574 /*
575 * We never let VCPUs sleep in no sleep icount mode.
576 * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
577 * to the next QEMU_CLOCK_VIRTUAL event and notify it.
578 * It is useful when we want a deterministic execution time,
579 * isolated from host latencies.
580 */
581 seqlock_write_begin(&timers_state.vm_clock_seqlock);
582 timers_state.qemu_icount_bias += deadline;
583 seqlock_write_end(&timers_state.vm_clock_seqlock);
584 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
585 } else {
586 /*
587 * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
588 * "real" time, (related to the time left until the next event) has
589 * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
590 * This avoids that the warps are visible externally; for example,
591 * you will not be sending network packets continuously instead of
592 * every 100ms.
593 */
594 seqlock_write_begin(&timers_state.vm_clock_seqlock);
595 if (vm_clock_warp_start == -1 || vm_clock_warp_start > clock) {
596 vm_clock_warp_start = clock;
597 }
598 seqlock_write_end(&timers_state.vm_clock_seqlock);
599 timer_mod_anticipate(icount_warp_timer, clock + deadline);
600 }
601 } else if (deadline == 0) {
602 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
603 }
604 }
605
606 static void qemu_account_warp_timer(void)
607 {
608 if (!use_icount || !icount_sleep) {
609 return;
610 }
611
612 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
613 * do not fire, so computing the deadline does not make sense.
614 */
615 if (!runstate_is_running()) {
616 return;
617 }
618
619 /* warp clock deterministically in record/replay mode */
620 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
621 return;
622 }
623
624 timer_del(icount_warp_timer);
625 icount_warp_rt();
626 }
627
628 static bool icount_state_needed(void *opaque)
629 {
630 return use_icount;
631 }
632
633 /*
634 * This is a subsection for icount migration.
635 */
636 static const VMStateDescription icount_vmstate_timers = {
637 .name = "timer/icount",
638 .version_id = 1,
639 .minimum_version_id = 1,
640 .needed = icount_state_needed,
641 .fields = (VMStateField[]) {
642 VMSTATE_INT64(qemu_icount_bias, TimersState),
643 VMSTATE_INT64(qemu_icount, TimersState),
644 VMSTATE_END_OF_LIST()
645 }
646 };
647
648 static const VMStateDescription vmstate_timers = {
649 .name = "timer",
650 .version_id = 2,
651 .minimum_version_id = 1,
652 .fields = (VMStateField[]) {
653 VMSTATE_INT64(cpu_ticks_offset, TimersState),
654 VMSTATE_INT64(dummy, TimersState),
655 VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
656 VMSTATE_END_OF_LIST()
657 },
658 .subsections = (const VMStateDescription*[]) {
659 &icount_vmstate_timers,
660 NULL
661 }
662 };
663
664 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
665 {
666 double pct;
667 double throttle_ratio;
668 long sleeptime_ns;
669
670 if (!cpu_throttle_get_percentage()) {
671 return;
672 }
673
674 pct = (double)cpu_throttle_get_percentage()/100;
675 throttle_ratio = pct / (1 - pct);
676 sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
677
678 qemu_mutex_unlock_iothread();
679 atomic_set(&cpu->throttle_thread_scheduled, 0);
680 g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
681 qemu_mutex_lock_iothread();
682 }
683
684 static void cpu_throttle_timer_tick(void *opaque)
685 {
686 CPUState *cpu;
687 double pct;
688
689 /* Stop the timer if needed */
690 if (!cpu_throttle_get_percentage()) {
691 return;
692 }
693 CPU_FOREACH(cpu) {
694 if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
695 async_run_on_cpu(cpu, cpu_throttle_thread,
696 RUN_ON_CPU_NULL);
697 }
698 }
699
700 pct = (double)cpu_throttle_get_percentage()/100;
701 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
702 CPU_THROTTLE_TIMESLICE_NS / (1-pct));
703 }
704
705 void cpu_throttle_set(int new_throttle_pct)
706 {
707 /* Ensure throttle percentage is within valid range */
708 new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
709 new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
710
711 atomic_set(&throttle_percentage, new_throttle_pct);
712
713 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
714 CPU_THROTTLE_TIMESLICE_NS);
715 }
716
717 void cpu_throttle_stop(void)
718 {
719 atomic_set(&throttle_percentage, 0);
720 }
721
/* Throttling is active whenever the percentage is non-zero. */
bool cpu_throttle_active(void)
{
    int pct = cpu_throttle_get_percentage();

    return pct != 0;
}
726
727 int cpu_throttle_get_percentage(void)
728 {
729 return atomic_read(&throttle_percentage);
730 }
731
732 void cpu_ticks_init(void)
733 {
734 seqlock_init(&timers_state.vm_clock_seqlock);
735 vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
736 throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
737 cpu_throttle_timer_tick, NULL);
738 }
739
740 void configure_icount(QemuOpts *opts, Error **errp)
741 {
742 const char *option;
743 char *rem_str = NULL;
744
745 option = qemu_opt_get(opts, "shift");
746 if (!option) {
747 if (qemu_opt_get(opts, "align") != NULL) {
748 error_setg(errp, "Please specify shift option when using align");
749 }
750 return;
751 }
752
753 icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
754 if (icount_sleep) {
755 icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
756 icount_timer_cb, NULL);
757 }
758
759 icount_align_option = qemu_opt_get_bool(opts, "align", false);
760
761 if (icount_align_option && !icount_sleep) {
762 error_setg(errp, "align=on and sleep=off are incompatible");
763 }
764 if (strcmp(option, "auto") != 0) {
765 errno = 0;
766 icount_time_shift = strtol(option, &rem_str, 0);
767 if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
768 error_setg(errp, "icount: Invalid shift value");
769 }
770 use_icount = 1;
771 return;
772 } else if (icount_align_option) {
773 error_setg(errp, "shift=auto and align=on are incompatible");
774 } else if (!icount_sleep) {
775 error_setg(errp, "shift=auto and sleep=off are incompatible");
776 }
777
778 use_icount = 2;
779
780 /* 125MIPS seems a reasonable initial guess at the guest speed.
781 It will be corrected fairly quickly anyway. */
782 icount_time_shift = 3;
783
784 /* Have both realtime and virtual time triggers for speed adjustment.
785 The realtime trigger catches emulated time passing too slowly,
786 the virtual time trigger catches emulated time passing too fast.
787 Realtime triggers occur even when idle, so use them less frequently
788 than VM triggers. */
789 icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
790 icount_adjust_rt, NULL);
791 timer_mod(icount_rt_timer,
792 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
793 icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
794 icount_adjust_vm, NULL);
795 timer_mod(icount_vm_timer,
796 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
797 NANOSECONDS_PER_SECOND / 10);
798 }
799
800 /***********************************************************/
801 /* TCG vCPU kick timer
802 *
803 * The kick timer is responsible for moving single threaded vCPU
804 * emulation on to the next vCPU. If more than one vCPU is running a
805 * timer event will force a cpu->exit so the next vCPU can get
806 * scheduled.
807 *
808 * The timer is removed if all vCPUs are idle and restarted again once
809 * idleness is complete.
810 */
811
812 static QEMUTimer *tcg_kick_vcpu_timer;
813 static CPUState *tcg_current_rr_cpu;
814
815 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
816
817 static inline int64_t qemu_tcg_next_kick(void)
818 {
819 return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
820 }
821
822 /* Kick the currently round-robin scheduled vCPU */
823 static void qemu_cpu_kick_rr_cpu(void)
824 {
825 CPUState *cpu;
826 do {
827 cpu = atomic_mb_read(&tcg_current_rr_cpu);
828 if (cpu) {
829 cpu_exit(cpu);
830 }
831 } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
832 }
833
834 static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
835 {
836 }
837
838 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
839 {
840 if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
841 qemu_notify_event();
842 return;
843 }
844
845 if (!qemu_in_vcpu_thread() && first_cpu) {
846 /* qemu_cpu_kick is not enough to kick a halted CPU out of
847 * qemu_tcg_wait_io_event. async_run_on_cpu, instead,
848 * causes cpu_thread_is_idle to return false. This way,
849 * handle_icount_deadline can run.
850 */
851 async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
852 }
853 }
854
855 static void kick_tcg_thread(void *opaque)
856 {
857 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
858 qemu_cpu_kick_rr_cpu();
859 }
860
861 static void start_tcg_kick_timer(void)
862 {
863 if (!mttcg_enabled && !tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
864 tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
865 kick_tcg_thread, NULL);
866 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
867 }
868 }
869
870 static void stop_tcg_kick_timer(void)
871 {
872 if (tcg_kick_vcpu_timer) {
873 timer_del(tcg_kick_vcpu_timer);
874 tcg_kick_vcpu_timer = NULL;
875 }
876 }
877
878 /***********************************************************/
879 void hw_error(const char *fmt, ...)
880 {
881 va_list ap;
882 CPUState *cpu;
883
884 va_start(ap, fmt);
885 fprintf(stderr, "qemu: hardware error: ");
886 vfprintf(stderr, fmt, ap);
887 fprintf(stderr, "\n");
888 CPU_FOREACH(cpu) {
889 fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
890 cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
891 }
892 va_end(ap);
893 abort();
894 }
895
896 void cpu_synchronize_all_states(void)
897 {
898 CPUState *cpu;
899
900 CPU_FOREACH(cpu) {
901 cpu_synchronize_state(cpu);
902 }
903 }
904
905 void cpu_synchronize_all_post_reset(void)
906 {
907 CPUState *cpu;
908
909 CPU_FOREACH(cpu) {
910 cpu_synchronize_post_reset(cpu);
911 }
912 }
913
914 void cpu_synchronize_all_post_init(void)
915 {
916 CPUState *cpu;
917
918 CPU_FOREACH(cpu) {
919 cpu_synchronize_post_init(cpu);
920 }
921 }
922
923 static int do_vm_stop(RunState state)
924 {
925 int ret = 0;
926
927 if (runstate_is_running()) {
928 cpu_disable_ticks();
929 pause_all_vcpus();
930 runstate_set(state);
931 vm_state_notify(0, state);
932 qapi_event_send_stop(&error_abort);
933 }
934
935 bdrv_drain_all();
936 replay_disable_events();
937 ret = bdrv_flush_all();
938
939 return ret;
940 }
941
942 static bool cpu_can_run(CPUState *cpu)
943 {
944 if (cpu->stop) {
945 return false;
946 }
947 if (cpu_is_stopped(cpu)) {
948 return false;
949 }
950 return true;
951 }
952
953 static void cpu_handle_guest_debug(CPUState *cpu)
954 {
955 gdb_set_stop_cpu(cpu);
956 qemu_system_debug_request();
957 cpu->stopped = true;
958 }
959
960 #ifdef CONFIG_LINUX
961 static void sigbus_reraise(void)
962 {
963 sigset_t set;
964 struct sigaction action;
965
966 memset(&action, 0, sizeof(action));
967 action.sa_handler = SIG_DFL;
968 if (!sigaction(SIGBUS, &action, NULL)) {
969 raise(SIGBUS);
970 sigemptyset(&set);
971 sigaddset(&set, SIGBUS);
972 pthread_sigmask(SIG_UNBLOCK, &set, NULL);
973 }
974 perror("Failed to re-raise SIGBUS!\n");
975 abort();
976 }
977
978 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
979 {
980 if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
981 sigbus_reraise();
982 }
983
984 if (current_cpu) {
985 /* Called asynchronously in VCPU thread. */
986 if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
987 sigbus_reraise();
988 }
989 } else {
990 /* Called synchronously (via signalfd) in main thread. */
991 if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
992 sigbus_reraise();
993 }
994 }
995 }
996
997 static void qemu_init_sigbus(void)
998 {
999 struct sigaction action;
1000
1001 memset(&action, 0, sizeof(action));
1002 action.sa_flags = SA_SIGINFO;
1003 action.sa_sigaction = sigbus_handler;
1004 sigaction(SIGBUS, &action, NULL);
1005
1006 prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1007 }
1008 #else /* !CONFIG_LINUX */
/* Non-Linux build: no SIGBUS handling is set up. */
static void qemu_init_sigbus(void)
{
    /* intentionally empty */
}
1012 #endif /* !CONFIG_LINUX */
1013
1014 static QemuMutex qemu_global_mutex;
1015
1016 static QemuThread io_thread;
1017
1018 /* cpu creation */
1019 static QemuCond qemu_cpu_cond;
1020 /* system init */
1021 static QemuCond qemu_pause_cond;
1022
1023 void qemu_init_cpu_loop(void)
1024 {
1025 qemu_init_sigbus();
1026 qemu_cond_init(&qemu_cpu_cond);
1027 qemu_cond_init(&qemu_pause_cond);
1028 qemu_mutex_init(&qemu_global_mutex);
1029
1030 qemu_thread_get_self(&io_thread);
1031 }
1032
1033 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
1034 {
1035 do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
1036 }
1037
1038 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1039 {
1040 if (kvm_destroy_vcpu(cpu) < 0) {
1041 error_report("kvm_destroy_vcpu failed");
1042 exit(EXIT_FAILURE);
1043 }
1044 }
1045
1046 static void qemu_tcg_destroy_vcpu(CPUState *cpu)
1047 {
1048 }
1049
1050 static void qemu_wait_io_event_common(CPUState *cpu)
1051 {
1052 atomic_mb_set(&cpu->thread_kicked, false);
1053 if (cpu->stop) {
1054 cpu->stop = false;
1055 cpu->stopped = true;
1056 qemu_cond_broadcast(&qemu_pause_cond);
1057 }
1058 process_queued_cpu_work(cpu);
1059 }
1060
1061 static bool qemu_tcg_should_sleep(CPUState *cpu)
1062 {
1063 if (mttcg_enabled) {
1064 return cpu_thread_is_idle(cpu);
1065 } else {
1066 return all_cpu_threads_idle();
1067 }
1068 }
1069
1070 static void qemu_tcg_wait_io_event(CPUState *cpu)
1071 {
1072 while (qemu_tcg_should_sleep(cpu)) {
1073 stop_tcg_kick_timer();
1074 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1075 }
1076
1077 start_tcg_kick_timer();
1078
1079 qemu_wait_io_event_common(cpu);
1080 }
1081
1082 static void qemu_kvm_wait_io_event(CPUState *cpu)
1083 {
1084 while (cpu_thread_is_idle(cpu)) {
1085 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1086 }
1087
1088 qemu_wait_io_event_common(cpu);
1089 }
1090
1091 static void *qemu_kvm_cpu_thread_fn(void *arg)
1092 {
1093 CPUState *cpu = arg;
1094 int r;
1095
1096 rcu_register_thread();
1097
1098 qemu_mutex_lock_iothread();
1099 qemu_thread_get_self(cpu->thread);
1100 cpu->thread_id = qemu_get_thread_id();
1101 cpu->can_do_io = 1;
1102 current_cpu = cpu;
1103
1104 r = kvm_init_vcpu(cpu);
1105 if (r < 0) {
1106 fprintf(stderr, "kvm_init_vcpu failed: %s\n", strerror(-r));
1107 exit(1);
1108 }
1109
1110 kvm_init_cpu_signals(cpu);
1111
1112 /* signal CPU creation */
1113 cpu->created = true;
1114 qemu_cond_signal(&qemu_cpu_cond);
1115
1116 do {
1117 if (cpu_can_run(cpu)) {
1118 r = kvm_cpu_exec(cpu);
1119 if (r == EXCP_DEBUG) {
1120 cpu_handle_guest_debug(cpu);
1121 }
1122 }
1123 qemu_kvm_wait_io_event(cpu);
1124 } while (!cpu->unplug || cpu_can_run(cpu));
1125
1126 qemu_kvm_destroy_vcpu(cpu);
1127 cpu->created = false;
1128 qemu_cond_signal(&qemu_cpu_cond);
1129 qemu_mutex_unlock_iothread();
1130 return NULL;
1131 }
1132
1133 static void *qemu_dummy_cpu_thread_fn(void *arg)
1134 {
1135 #ifdef _WIN32
1136 fprintf(stderr, "qtest is not supported under Windows\n");
1137 exit(1);
1138 #else
1139 CPUState *cpu = arg;
1140 sigset_t waitset;
1141 int r;
1142
1143 rcu_register_thread();
1144
1145 qemu_mutex_lock_iothread();
1146 qemu_thread_get_self(cpu->thread);
1147 cpu->thread_id = qemu_get_thread_id();
1148 cpu->can_do_io = 1;
1149 current_cpu = cpu;
1150
1151 sigemptyset(&waitset);
1152 sigaddset(&waitset, SIG_IPI);
1153
1154 /* signal CPU creation */
1155 cpu->created = true;
1156 qemu_cond_signal(&qemu_cpu_cond);
1157
1158 while (1) {
1159 qemu_mutex_unlock_iothread();
1160 do {
1161 int sig;
1162 r = sigwait(&waitset, &sig);
1163 } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1164 if (r == -1) {
1165 perror("sigwait");
1166 exit(1);
1167 }
1168 qemu_mutex_lock_iothread();
1169 qemu_wait_io_event_common(cpu);
1170 }
1171
1172 return NULL;
1173 #endif
1174 }
1175
1176 static int64_t tcg_get_icount_limit(void)
1177 {
1178 int64_t deadline;
1179
1180 if (replay_mode != REPLAY_MODE_PLAY) {
1181 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1182
1183 /* Maintain prior (possibly buggy) behaviour where if no deadline
1184 * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1185 * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1186 * nanoseconds.
1187 */
1188 if ((deadline < 0) || (deadline > INT32_MAX)) {
1189 deadline = INT32_MAX;
1190 }
1191
1192 return qemu_icount_round(deadline);
1193 } else {
1194 return replay_get_instructions();
1195 }
1196 }
1197
1198 static void handle_icount_deadline(void)
1199 {
1200 assert(qemu_in_vcpu_thread());
1201 if (use_icount) {
1202 int64_t deadline =
1203 qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1204
1205 if (deadline == 0) {
1206 /* Wake up other AioContexts. */
1207 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1208 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1209 }
1210 }
1211 }
1212
1213 static void prepare_icount_for_run(CPUState *cpu)
1214 {
1215 if (use_icount) {
1216 int insns_left;
1217
1218 /* These should always be cleared by process_icount_data after
1219 * each vCPU execution. However u16.high can be raised
1220 * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1221 */
1222 g_assert(cpu->icount_decr.u16.low == 0);
1223 g_assert(cpu->icount_extra == 0);
1224
1225 cpu->icount_budget = tcg_get_icount_limit();
1226 insns_left = MIN(0xffff, cpu->icount_budget);
1227 cpu->icount_decr.u16.low = insns_left;
1228 cpu->icount_extra = cpu->icount_budget - insns_left;
1229 }
1230 }
1231
1232 static void process_icount_data(CPUState *cpu)
1233 {
1234 if (use_icount) {
1235 /* Account for executed instructions */
1236 cpu_update_icount(cpu);
1237
1238 /* Reset the counters */
1239 cpu->icount_decr.u16.low = 0;
1240 cpu->icount_extra = 0;
1241 cpu->icount_budget = 0;
1242
1243 replay_account_executed_instructions();
1244 }
1245 }
1246
1247
1248 static int tcg_cpu_exec(CPUState *cpu)
1249 {
1250 int ret;
1251 #ifdef CONFIG_PROFILER
1252 int64_t ti;
1253 #endif
1254
1255 #ifdef CONFIG_PROFILER
1256 ti = profile_getclock();
1257 #endif
1258 qemu_mutex_unlock_iothread();
1259 cpu_exec_start(cpu);
1260 ret = cpu_exec(cpu);
1261 cpu_exec_end(cpu);
1262 qemu_mutex_lock_iothread();
1263 #ifdef CONFIG_PROFILER
1264 tcg_time += profile_getclock() - ti;
1265 #endif
1266 return ret;
1267 }
1268
1269 /* Destroy any remaining vCPUs which have been unplugged and have
1270 * finished running
1271 */
1272 static void deal_with_unplugged_cpus(void)
1273 {
1274 CPUState *cpu;
1275
1276 CPU_FOREACH(cpu) {
1277 if (cpu->unplug && !cpu_can_run(cpu)) {
1278 qemu_tcg_destroy_vcpu(cpu);
1279 cpu->created = false;
1280 qemu_cond_signal(&qemu_cpu_cond);
1281 break;
1282 }
1283 }
1284 }
1285
1286 /* Single-threaded TCG
1287 *
1288 * In the single-threaded case each vCPU is simulated in turn. If
1289 * there is more than a single vCPU we create a simple timer to kick
1290 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1291 * This is done explicitly rather than relying on side-effects
1292 * elsewhere.
1293 */
1294
1295 static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
1296 {
1297 CPUState *cpu = arg;
1298
1299 rcu_register_thread();
1300
1301 qemu_mutex_lock_iothread();
1302 qemu_thread_get_self(cpu->thread);
1303
1304 CPU_FOREACH(cpu) {
1305 cpu->thread_id = qemu_get_thread_id();
1306 cpu->created = true;
1307 cpu->can_do_io = 1;
1308 }
1309 qemu_cond_signal(&qemu_cpu_cond);
1310
1311 /* wait for initial kick-off after machine start */
1312 while (first_cpu->stopped) {
1313 qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1314
1315 /* process any pending work */
1316 CPU_FOREACH(cpu) {
1317 current_cpu = cpu;
1318 qemu_wait_io_event_common(cpu);
1319 }
1320 }
1321
1322 start_tcg_kick_timer();
1323
1324 cpu = first_cpu;
1325
1326 /* process any pending work */
1327 cpu->exit_request = 1;
1328
1329 while (1) {
1330 /* Account partial waits to QEMU_CLOCK_VIRTUAL. */
1331 qemu_account_warp_timer();
1332
1333 /* Run the timers here. This is much more efficient than
1334 * waking up the I/O thread and waiting for completion.
1335 */
1336 handle_icount_deadline();
1337
1338 if (!cpu) {
1339 cpu = first_cpu;
1340 }
1341
1342 while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
1343
1344 atomic_mb_set(&tcg_current_rr_cpu, cpu);
1345 current_cpu = cpu;
1346
1347 qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1348 (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1349
1350 if (cpu_can_run(cpu)) {
1351 int r;
1352
1353 prepare_icount_for_run(cpu);
1354
1355 r = tcg_cpu_exec(cpu);
1356
1357 process_icount_data(cpu);
1358
1359 if (r == EXCP_DEBUG) {
1360 cpu_handle_guest_debug(cpu);
1361 break;
1362 } else if (r == EXCP_ATOMIC) {
1363 qemu_mutex_unlock_iothread();
1364 cpu_exec_step_atomic(cpu);
1365 qemu_mutex_lock_iothread();
1366 break;
1367 }
1368 } else if (cpu->stop) {
1369 if (cpu->unplug) {
1370 cpu = CPU_NEXT(cpu);
1371 }
1372 break;
1373 }
1374
1375 cpu = CPU_NEXT(cpu);
1376 } /* while (cpu && !cpu->exit_request).. */
1377
1378 /* Does not need atomic_mb_set because a spurious wakeup is okay. */
1379 atomic_set(&tcg_current_rr_cpu, NULL);
1380
1381 if (cpu && cpu->exit_request) {
1382 atomic_mb_set(&cpu->exit_request, 0);
1383 }
1384
1385 qemu_tcg_wait_io_event(cpu ? cpu : QTAILQ_FIRST(&cpus));
1386 deal_with_unplugged_cpus();
1387 }
1388
1389 return NULL;
1390 }
1391
1392 static void *qemu_hax_cpu_thread_fn(void *arg)
1393 {
1394 CPUState *cpu = arg;
1395 int r;
1396
1397 qemu_mutex_lock_iothread();
1398 qemu_thread_get_self(cpu->thread);
1399
1400 cpu->thread_id = qemu_get_thread_id();
1401 cpu->created = true;
1402 cpu->halted = 0;
1403 current_cpu = cpu;
1404
1405 hax_init_vcpu(cpu);
1406 qemu_cond_signal(&qemu_cpu_cond);
1407
1408 while (1) {
1409 if (cpu_can_run(cpu)) {
1410 r = hax_smp_cpu_exec(cpu);
1411 if (r == EXCP_DEBUG) {
1412 cpu_handle_guest_debug(cpu);
1413 }
1414 }
1415
1416 while (cpu_thread_is_idle(cpu)) {
1417 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1418 }
1419 #ifdef _WIN32
1420 SleepEx(0, TRUE);
1421 #endif
1422 qemu_wait_io_event_common(cpu);
1423 }
1424 return NULL;
1425 }
1426
#ifdef _WIN32
/* Empty APC routine; qemu_cpu_kick_thread() queues it with
 * QueueUserAPC().  It intentionally does nothing.
 */
static void CALLBACK dummy_apc_func(ULONG_PTR unused)
{
    (void)unused;
}
#endif
1432
1433 /* Multi-threaded TCG
1434 *
1435 * In the multi-threaded case each vCPU has its own thread. The TLS
1436 * variable current_cpu can be used deep in the code to find the
1437 * current CPUState for a given thread.
1438 */
1439
1440 static void *qemu_tcg_cpu_thread_fn(void *arg)
1441 {
1442 CPUState *cpu = arg;
1443
1444 g_assert(!use_icount);
1445
1446 rcu_register_thread();
1447
1448 qemu_mutex_lock_iothread();
1449 qemu_thread_get_self(cpu->thread);
1450
1451 cpu->thread_id = qemu_get_thread_id();
1452 cpu->created = true;
1453 cpu->can_do_io = 1;
1454 current_cpu = cpu;
1455 qemu_cond_signal(&qemu_cpu_cond);
1456
1457 /* process any pending work */
1458 cpu->exit_request = 1;
1459
1460 while (1) {
1461 if (cpu_can_run(cpu)) {
1462 int r;
1463 r = tcg_cpu_exec(cpu);
1464 switch (r) {
1465 case EXCP_DEBUG:
1466 cpu_handle_guest_debug(cpu);
1467 break;
1468 case EXCP_HALTED:
1469 /* during start-up the vCPU is reset and the thread is
1470 * kicked several times. If we don't ensure we go back
1471 * to sleep in the halted state we won't cleanly
1472 * start-up when the vCPU is enabled.
1473 *
1474 * cpu->halted should ensure we sleep in wait_io_event
1475 */
1476 g_assert(cpu->halted);
1477 break;
1478 case EXCP_ATOMIC:
1479 qemu_mutex_unlock_iothread();
1480 cpu_exec_step_atomic(cpu);
1481 qemu_mutex_lock_iothread();
1482 default:
1483 /* Ignore everything else? */
1484 break;
1485 }
1486 }
1487
1488 atomic_mb_set(&cpu->exit_request, 0);
1489 qemu_tcg_wait_io_event(cpu);
1490 }
1491
1492 return NULL;
1493 }
1494
1495 static void qemu_cpu_kick_thread(CPUState *cpu)
1496 {
1497 #ifndef _WIN32
1498 int err;
1499
1500 if (cpu->thread_kicked) {
1501 return;
1502 }
1503 cpu->thread_kicked = true;
1504 err = pthread_kill(cpu->thread->thread, SIG_IPI);
1505 if (err) {
1506 fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1507 exit(1);
1508 }
1509 #else /* _WIN32 */
1510 if (!qemu_cpu_is_self(cpu)) {
1511 if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1512 fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1513 __func__, GetLastError());
1514 exit(1);
1515 }
1516 }
1517 #endif
1518 }
1519
1520 void qemu_cpu_kick(CPUState *cpu)
1521 {
1522 qemu_cond_broadcast(cpu->halt_cond);
1523 if (tcg_enabled()) {
1524 cpu_exit(cpu);
1525 /* NOP unless doing single-thread RR */
1526 qemu_cpu_kick_rr_cpu();
1527 } else {
1528 if (hax_enabled()) {
1529 /*
1530 * FIXME: race condition with the exit_request check in
1531 * hax_vcpu_hax_exec
1532 */
1533 cpu->exit_request = 1;
1534 }
1535 qemu_cpu_kick_thread(cpu);
1536 }
1537 }
1538
1539 void qemu_cpu_kick_self(void)
1540 {
1541 assert(current_cpu);
1542 qemu_cpu_kick_thread(current_cpu);
1543 }
1544
1545 bool qemu_cpu_is_self(CPUState *cpu)
1546 {
1547 return qemu_thread_is_self(cpu->thread);
1548 }
1549
1550 bool qemu_in_vcpu_thread(void)
1551 {
1552 return current_cpu && qemu_cpu_is_self(current_cpu);
1553 }
1554
/* Per-thread record of whether this thread took the iothread lock via
 * qemu_mutex_lock_iothread(); static storage starts out false.
 */
static __thread bool iothread_locked;

/* Report whether the calling thread currently holds the iothread lock. */
bool qemu_mutex_iothread_locked(void)
{
    return iothread_locked;
}
1561
1562 void qemu_mutex_lock_iothread(void)
1563 {
1564 g_assert(!qemu_mutex_iothread_locked());
1565 qemu_mutex_lock(&qemu_global_mutex);
1566 iothread_locked = true;
1567 }
1568
1569 void qemu_mutex_unlock_iothread(void)
1570 {
1571 g_assert(qemu_mutex_iothread_locked());
1572 iothread_locked = false;
1573 qemu_mutex_unlock(&qemu_global_mutex);
1574 }
1575
1576 static bool all_vcpus_paused(void)
1577 {
1578 CPUState *cpu;
1579
1580 CPU_FOREACH(cpu) {
1581 if (!cpu->stopped) {
1582 return false;
1583 }
1584 }
1585
1586 return true;
1587 }
1588
1589 void pause_all_vcpus(void)
1590 {
1591 CPUState *cpu;
1592
1593 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1594 CPU_FOREACH(cpu) {
1595 cpu->stop = true;
1596 qemu_cpu_kick(cpu);
1597 }
1598
1599 if (qemu_in_vcpu_thread()) {
1600 cpu_stop_current();
1601 }
1602
1603 while (!all_vcpus_paused()) {
1604 qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1605 CPU_FOREACH(cpu) {
1606 qemu_cpu_kick(cpu);
1607 }
1608 }
1609 }
1610
1611 void cpu_resume(CPUState *cpu)
1612 {
1613 cpu->stop = false;
1614 cpu->stopped = false;
1615 qemu_cpu_kick(cpu);
1616 }
1617
1618 void resume_all_vcpus(void)
1619 {
1620 CPUState *cpu;
1621
1622 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1623 CPU_FOREACH(cpu) {
1624 cpu_resume(cpu);
1625 }
1626 }
1627
1628 void cpu_remove(CPUState *cpu)
1629 {
1630 cpu->stop = true;
1631 cpu->unplug = true;
1632 qemu_cpu_kick(cpu);
1633 }
1634
1635 void cpu_remove_sync(CPUState *cpu)
1636 {
1637 cpu_remove(cpu);
1638 while (cpu->created) {
1639 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1640 }
1641 }
1642
1643 /* For temporary buffers for forming a name */
1644 #define VCPU_THREAD_NAME_SIZE 16
1645
1646 static void qemu_tcg_init_vcpu(CPUState *cpu)
1647 {
1648 char thread_name[VCPU_THREAD_NAME_SIZE];
1649 static QemuCond *single_tcg_halt_cond;
1650 static QemuThread *single_tcg_cpu_thread;
1651
1652 if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1653 cpu->thread = g_malloc0(sizeof(QemuThread));
1654 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1655 qemu_cond_init(cpu->halt_cond);
1656
1657 if (qemu_tcg_mttcg_enabled()) {
1658 /* create a thread per vCPU with TCG (MTTCG) */
1659 parallel_cpus = true;
1660 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1661 cpu->cpu_index);
1662
1663 qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1664 cpu, QEMU_THREAD_JOINABLE);
1665
1666 } else {
1667 /* share a single thread for all cpus with TCG */
1668 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1669 qemu_thread_create(cpu->thread, thread_name,
1670 qemu_tcg_rr_cpu_thread_fn,
1671 cpu, QEMU_THREAD_JOINABLE);
1672
1673 single_tcg_halt_cond = cpu->halt_cond;
1674 single_tcg_cpu_thread = cpu->thread;
1675 }
1676 #ifdef _WIN32
1677 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1678 #endif
1679 while (!cpu->created) {
1680 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1681 }
1682 } else {
1683 /* For non-MTTCG cases we share the thread */
1684 cpu->thread = single_tcg_cpu_thread;
1685 cpu->halt_cond = single_tcg_halt_cond;
1686 }
1687 }
1688
1689 static void qemu_hax_start_vcpu(CPUState *cpu)
1690 {
1691 char thread_name[VCPU_THREAD_NAME_SIZE];
1692
1693 cpu->thread = g_malloc0(sizeof(QemuThread));
1694 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1695 qemu_cond_init(cpu->halt_cond);
1696
1697 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
1698 cpu->cpu_index);
1699 qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
1700 cpu, QEMU_THREAD_JOINABLE);
1701 #ifdef _WIN32
1702 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1703 #endif
1704 while (!cpu->created) {
1705 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1706 }
1707 }
1708
1709 static void qemu_kvm_start_vcpu(CPUState *cpu)
1710 {
1711 char thread_name[VCPU_THREAD_NAME_SIZE];
1712
1713 cpu->thread = g_malloc0(sizeof(QemuThread));
1714 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1715 qemu_cond_init(cpu->halt_cond);
1716 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
1717 cpu->cpu_index);
1718 qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
1719 cpu, QEMU_THREAD_JOINABLE);
1720 while (!cpu->created) {
1721 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1722 }
1723 }
1724
1725 static void qemu_dummy_start_vcpu(CPUState *cpu)
1726 {
1727 char thread_name[VCPU_THREAD_NAME_SIZE];
1728
1729 cpu->thread = g_malloc0(sizeof(QemuThread));
1730 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1731 qemu_cond_init(cpu->halt_cond);
1732 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
1733 cpu->cpu_index);
1734 qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
1735 QEMU_THREAD_JOINABLE);
1736 while (!cpu->created) {
1737 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1738 }
1739 }
1740
1741 void qemu_init_vcpu(CPUState *cpu)
1742 {
1743 cpu->nr_cores = smp_cores;
1744 cpu->nr_threads = smp_threads;
1745 cpu->stopped = true;
1746
1747 if (!cpu->as) {
1748 /* If the target cpu hasn't set up any address spaces itself,
1749 * give it the default one.
1750 */
1751 AddressSpace *as = address_space_init_shareable(cpu->memory,
1752 "cpu-memory");
1753 cpu->num_ases = 1;
1754 cpu_address_space_init(cpu, as, 0);
1755 }
1756
1757 if (kvm_enabled()) {
1758 qemu_kvm_start_vcpu(cpu);
1759 } else if (hax_enabled()) {
1760 qemu_hax_start_vcpu(cpu);
1761 } else if (tcg_enabled()) {
1762 qemu_tcg_init_vcpu(cpu);
1763 } else {
1764 qemu_dummy_start_vcpu(cpu);
1765 }
1766 }
1767
1768 void cpu_stop_current(void)
1769 {
1770 if (current_cpu) {
1771 current_cpu->stop = false;
1772 current_cpu->stopped = true;
1773 cpu_exit(current_cpu);
1774 qemu_cond_broadcast(&qemu_pause_cond);
1775 }
1776 }
1777
1778 int vm_stop(RunState state)
1779 {
1780 if (qemu_in_vcpu_thread()) {
1781 qemu_system_vmstop_request_prepare();
1782 qemu_system_vmstop_request(state);
1783 /*
1784 * FIXME: should not return to device code in case
1785 * vm_stop() has been requested.
1786 */
1787 cpu_stop_current();
1788 return 0;
1789 }
1790
1791 return do_vm_stop(state);
1792 }
1793
1794 /**
1795 * Prepare for (re)starting the VM.
1796 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
1797 * running or in case of an error condition), 0 otherwise.
1798 */
1799 int vm_prepare_start(void)
1800 {
1801 RunState requested;
1802 int res = 0;
1803
1804 qemu_vmstop_requested(&requested);
1805 if (runstate_is_running() && requested == RUN_STATE__MAX) {
1806 return -1;
1807 }
1808
1809 /* Ensure that a STOP/RESUME pair of events is emitted if a
1810 * vmstop request was pending. The BLOCK_IO_ERROR event, for
1811 * example, according to documentation is always followed by
1812 * the STOP event.
1813 */
1814 if (runstate_is_running()) {
1815 qapi_event_send_stop(&error_abort);
1816 res = -1;
1817 } else {
1818 replay_enable_events();
1819 cpu_enable_ticks();
1820 runstate_set(RUN_STATE_RUNNING);
1821 vm_state_notify(1, RUN_STATE_RUNNING);
1822 }
1823
1824 /* We are sending this now, but the CPUs will be resumed shortly later */
1825 qapi_event_send_resume(&error_abort);
1826 return res;
1827 }
1828
/* Start the VM: resume all vCPUs when vm_prepare_start() returns 0. */
void vm_start(void)
{
    if (vm_prepare_start() != 0) {
        return;
    }
    resume_all_vcpus();
}
1835
1836 /* does a state transition even if the VM is already stopped,
1837 current state is forgotten forever */
1838 int vm_stop_force_state(RunState state)
1839 {
1840 if (runstate_is_running()) {
1841 return vm_stop(state);
1842 } else {
1843 runstate_set(state);
1844
1845 bdrv_drain_all();
1846 /* Make sure to return an error if the flush in a previous vm_stop()
1847 * failed. */
1848 return bdrv_flush_all();
1849 }
1850 }
1851
1852 void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
1853 {
1854 /* XXX: implement xxx_cpu_list for targets that still miss it */
1855 #if defined(cpu_list)
1856 cpu_list(f, cpu_fprintf);
1857 #endif
1858 }
1859
1860 CpuInfoList *qmp_query_cpus(Error **errp)
1861 {
1862 CpuInfoList *head = NULL, *cur_item = NULL;
1863 CPUState *cpu;
1864
1865 CPU_FOREACH(cpu) {
1866 CpuInfoList *info;
1867 #if defined(TARGET_I386)
1868 X86CPU *x86_cpu = X86_CPU(cpu);
1869 CPUX86State *env = &x86_cpu->env;
1870 #elif defined(TARGET_PPC)
1871 PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
1872 CPUPPCState *env = &ppc_cpu->env;
1873 #elif defined(TARGET_SPARC)
1874 SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
1875 CPUSPARCState *env = &sparc_cpu->env;
1876 #elif defined(TARGET_MIPS)
1877 MIPSCPU *mips_cpu = MIPS_CPU(cpu);
1878 CPUMIPSState *env = &mips_cpu->env;
1879 #elif defined(TARGET_TRICORE)
1880 TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
1881 CPUTriCoreState *env = &tricore_cpu->env;
1882 #endif
1883
1884 cpu_synchronize_state(cpu);
1885
1886 info = g_malloc0(sizeof(*info));
1887 info->value = g_malloc0(sizeof(*info->value));
1888 info->value->CPU = cpu->cpu_index;
1889 info->value->current = (cpu == first_cpu);
1890 info->value->halted = cpu->halted;
1891 info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
1892 info->value->thread_id = cpu->thread_id;
1893 #if defined(TARGET_I386)
1894 info->value->arch = CPU_INFO_ARCH_X86;
1895 info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
1896 #elif defined(TARGET_PPC)
1897 info->value->arch = CPU_INFO_ARCH_PPC;
1898 info->value->u.ppc.nip = env->nip;
1899 #elif defined(TARGET_SPARC)
1900 info->value->arch = CPU_INFO_ARCH_SPARC;
1901 info->value->u.q_sparc.pc = env->pc;
1902 info->value->u.q_sparc.npc = env->npc;
1903 #elif defined(TARGET_MIPS)
1904 info->value->arch = CPU_INFO_ARCH_MIPS;
1905 info->value->u.q_mips.PC = env->active_tc.PC;
1906 #elif defined(TARGET_TRICORE)
1907 info->value->arch = CPU_INFO_ARCH_TRICORE;
1908 info->value->u.tricore.PC = env->PC;
1909 #else
1910 info->value->arch = CPU_INFO_ARCH_OTHER;
1911 #endif
1912
1913 /* XXX: waiting for the qapi to support GSList */
1914 if (!cur_item) {
1915 head = cur_item = info;
1916 } else {
1917 cur_item->next = info;
1918 cur_item = info;
1919 }
1920 }
1921
1922 return head;
1923 }
1924
1925 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
1926 bool has_cpu, int64_t cpu_index, Error **errp)
1927 {
1928 FILE *f;
1929 uint32_t l;
1930 CPUState *cpu;
1931 uint8_t buf[1024];
1932 int64_t orig_addr = addr, orig_size = size;
1933
1934 if (!has_cpu) {
1935 cpu_index = 0;
1936 }
1937
1938 cpu = qemu_get_cpu(cpu_index);
1939 if (cpu == NULL) {
1940 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
1941 "a CPU number");
1942 return;
1943 }
1944
1945 f = fopen(filename, "wb");
1946 if (!f) {
1947 error_setg_file_open(errp, errno, filename);
1948 return;
1949 }
1950
1951 while (size != 0) {
1952 l = sizeof(buf);
1953 if (l > size)
1954 l = size;
1955 if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
1956 error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
1957 " specified", orig_addr, orig_size);
1958 goto exit;
1959 }
1960 if (fwrite(buf, 1, l, f) != l) {
1961 error_setg(errp, QERR_IO_ERROR);
1962 goto exit;
1963 }
1964 addr += l;
1965 size -= l;
1966 }
1967
1968 exit:
1969 fclose(f);
1970 }
1971
1972 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
1973 Error **errp)
1974 {
1975 FILE *f;
1976 uint32_t l;
1977 uint8_t buf[1024];
1978
1979 f = fopen(filename, "wb");
1980 if (!f) {
1981 error_setg_file_open(errp, errno, filename);
1982 return;
1983 }
1984
1985 while (size != 0) {
1986 l = sizeof(buf);
1987 if (l > size)
1988 l = size;
1989 cpu_physical_memory_read(addr, buf, l);
1990 if (fwrite(buf, 1, l, f) != l) {
1991 error_setg(errp, QERR_IO_ERROR);
1992 goto exit;
1993 }
1994 addr += l;
1995 size -= l;
1996 }
1997
1998 exit:
1999 fclose(f);
2000 }
2001
2002 void qmp_inject_nmi(Error **errp)
2003 {
2004 nmi_monitor_handle(monitor_get_cpu_index(), errp);
2005 }
2006
2007 void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
2008 {
2009 if (!use_icount) {
2010 return;
2011 }
2012
2013 cpu_fprintf(f, "Host - Guest clock %"PRIi64" ms\n",
2014 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2015 if (icount_align_option) {
2016 cpu_fprintf(f, "Max guest delay %"PRIi64" ms\n", -max_delay/SCALE_MS);
2017 cpu_fprintf(f, "Max guest advance %"PRIi64" ms\n", max_advance/SCALE_MS);
2018 } else {
2019 cpu_fprintf(f, "Max guest delay NA\n");
2020 cpu_fprintf(f, "Max guest advance NA\n");
2021 }
2022 }