target/i386: kvm: Add support for KVM_CAP_EXCEPTION_PAYLOAD
[qemu.git] target/i386/kvm.c
1 /*
2 * QEMU KVM support
3 *
4 * Copyright (C) 2006-2008 Qumranet Technologies
5 * Copyright IBM, Corp. 2008
6 *
7 * Authors:
8 * Anthony Liguori <aliguori@us.ibm.com>
9 *
10 * This work is licensed under the terms of the GNU GPL, version 2 or later.
11 * See the COPYING file in the top-level directory.
12 *
13 */
14
15 #include "qemu/osdep.h"
16 #include "qapi/error.h"
17 #include <sys/ioctl.h>
18 #include <sys/utsname.h>
19
20 #include <linux/kvm.h>
21 #include "standard-headers/asm-x86/kvm_para.h"
22
23 #include "cpu.h"
24 #include "sysemu/sysemu.h"
25 #include "sysemu/hw_accel.h"
26 #include "sysemu/kvm_int.h"
27 #include "kvm_i386.h"
28 #include "hyperv.h"
29 #include "hyperv-proto.h"
30
31 #include "exec/gdbstub.h"
32 #include "qemu/host-utils.h"
33 #include "qemu/config-file.h"
34 #include "qemu/error-report.h"
35 #include "hw/i386/pc.h"
36 #include "hw/i386/apic.h"
37 #include "hw/i386/apic_internal.h"
38 #include "hw/i386/apic-msidef.h"
39 #include "hw/i386/intel_iommu.h"
40 #include "hw/i386/x86-iommu.h"
41
42 #include "hw/pci/pci.h"
43 #include "hw/pci/msi.h"
44 #include "hw/pci/msix.h"
45 #include "migration/blocker.h"
46 #include "exec/memattrs.h"
47 #include "trace.h"
48
49 //#define DEBUG_KVM
50
51 #ifdef DEBUG_KVM
52 #define DPRINTF(fmt, ...) \
53 do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
54 #else
55 #define DPRINTF(fmt, ...) \
56 do { } while (0)
57 #endif
58
59 #define MSR_KVM_WALL_CLOCK 0x11
60 #define MSR_KVM_SYSTEM_TIME 0x12
61
62 /* A 4096-byte buffer can hold the 8-byte kvm_msrs header, plus
63 * 255 kvm_msr_entry structs */
64 #define MSR_BUF_SIZE 4096
65
66 const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
67 KVM_CAP_INFO(SET_TSS_ADDR),
68 KVM_CAP_INFO(EXT_CPUID),
69 KVM_CAP_INFO(MP_STATE),
70 KVM_CAP_LAST_INFO
71 };
72
73 static bool has_msr_star;
74 static bool has_msr_hsave_pa;
75 static bool has_msr_tsc_aux;
76 static bool has_msr_tsc_adjust;
77 static bool has_msr_tsc_deadline;
78 static bool has_msr_feature_control;
79 static bool has_msr_misc_enable;
80 static bool has_msr_smbase;
81 static bool has_msr_bndcfgs;
82 static int lm_capable_kernel;
83 static bool has_msr_hv_hypercall;
84 static bool has_msr_hv_crash;
85 static bool has_msr_hv_reset;
86 static bool has_msr_hv_vpindex;
87 static bool hv_vpindex_settable;
88 static bool has_msr_hv_runtime;
89 static bool has_msr_hv_synic;
90 static bool has_msr_hv_stimer;
91 static bool has_msr_hv_frequencies;
92 static bool has_msr_hv_reenlightenment;
93 static bool has_msr_xss;
94 static bool has_msr_spec_ctrl;
95 static bool has_msr_virt_ssbd;
96 static bool has_msr_smi_count;
97 static bool has_msr_arch_capabs;
98 static bool has_msr_core_capabs;
99
100 static uint32_t has_architectural_pmu_version;
101 static uint32_t num_architectural_pmu_gp_counters;
102 static uint32_t num_architectural_pmu_fixed_counters;
103
104 static int has_xsave;
105 static int has_xcrs;
106 static int has_pit_state2;
107 static int has_exception_payload;
108
109 static bool has_msr_mcg_ext_ctl;
110
111 static struct kvm_cpuid2 *cpuid_cache;
112 static struct kvm_msr_list *kvm_feature_msrs;
113
114 int kvm_has_pit_state2(void)
115 {
116 return has_pit_state2;
117 }
118
119 bool kvm_has_smm(void)
120 {
121 return kvm_check_extension(kvm_state, KVM_CAP_X86_SMM);
122 }
123
124 bool kvm_has_adjust_clock_stable(void)
125 {
126 int ret = kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);
127
128 return (ret == KVM_CLOCK_TSC_STABLE);
129 }
130
131 bool kvm_allows_irq0_override(void)
132 {
133 return !kvm_irqchip_in_kernel() || kvm_has_gsi_routing();
134 }
135
136 static bool kvm_x2apic_api_set_flags(uint64_t flags)
137 {
138 KVMState *s = KVM_STATE(current_machine->accelerator);
139
140 return !kvm_vm_enable_cap(s, KVM_CAP_X2APIC_API, 0, flags);
141 }
142
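/*
 * Evaluate 'fn' at most once and cache its value in '_result'; later calls
 * return the cached value directly from the enclosing function.
 */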
143 #define MEMORIZE(fn, _result) \
144 ({ \
145 static bool _memorized; \
146 \
147 if (_memorized) { \
148 return _result; \
149 } \
150 _memorized = true; \
151 _result = fn; \
152 })
153
154 static bool has_x2apic_api;
155
156 bool kvm_has_x2apic_api(void)
157 {
158 return has_x2apic_api;
159 }
160
161 bool kvm_enable_x2apic(void)
162 {
163 return MEMORIZE(
164 kvm_x2apic_api_set_flags(KVM_X2APIC_API_USE_32BIT_IDS |
165 KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK),
166 has_x2apic_api);
167 }
168
169 bool kvm_hv_vpindex_settable(void)
170 {
171 return hv_vpindex_settable;
172 }
173
174 static int kvm_get_tsc(CPUState *cs)
175 {
176 X86CPU *cpu = X86_CPU(cs);
177 CPUX86State *env = &cpu->env;
178 struct {
179 struct kvm_msrs info;
180 struct kvm_msr_entry entries[1];
181 } msr_data;
182 int ret;
183
184 if (env->tsc_valid) {
185 return 0;
186 }
187
188 msr_data.info.nmsrs = 1;
189 msr_data.entries[0].index = MSR_IA32_TSC;
190 env->tsc_valid = !runstate_is_running();
191
192 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, &msr_data);
193 if (ret < 0) {
194 return ret;
195 }
196
197 assert(ret == 1);
198 env->tsc = msr_data.entries[0].data;
199 return 0;
200 }
201
202 static inline void do_kvm_synchronize_tsc(CPUState *cpu, run_on_cpu_data arg)
203 {
204 kvm_get_tsc(cpu);
205 }
206
207 void kvm_synchronize_all_tsc(void)
208 {
209 CPUState *cpu;
210
211 if (kvm_enabled()) {
212 CPU_FOREACH(cpu) {
213 run_on_cpu(cpu, do_kvm_synchronize_tsc, RUN_ON_CPU_NULL);
214 }
215 }
216 }
217
218 static struct kvm_cpuid2 *try_get_cpuid(KVMState *s, int max)
219 {
220 struct kvm_cpuid2 *cpuid;
221 int r, size;
222
223 size = sizeof(*cpuid) + max * sizeof(*cpuid->entries);
224 cpuid = g_malloc0(size);
225 cpuid->nent = max;
226 r = kvm_ioctl(s, KVM_GET_SUPPORTED_CPUID, cpuid);
227 if (r == 0 && cpuid->nent >= max) {
228 r = -E2BIG;
229 }
230 if (r < 0) {
231 if (r == -E2BIG) {
232 g_free(cpuid);
233 return NULL;
234 } else {
235 fprintf(stderr, "KVM_GET_SUPPORTED_CPUID failed: %s\n",
236 strerror(-r));
237 exit(1);
238 }
239 }
240 return cpuid;
241 }
242
243 /* Run KVM_GET_SUPPORTED_CPUID ioctl(), allocating a buffer large enough
244 * for all entries.
245 */
246 static struct kvm_cpuid2 *get_supported_cpuid(KVMState *s)
247 {
248 struct kvm_cpuid2 *cpuid;
249 int max = 1;
250
251 if (cpuid_cache != NULL) {
252 return cpuid_cache;
253 }
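/* Start small and keep doubling the buffer until try_get_cpuid() succeeds. */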
254 while ((cpuid = try_get_cpuid(s, max)) == NULL) {
255 max *= 2;
256 }
257 cpuid_cache = cpuid;
258 return cpuid;
259 }
260
261 static const struct kvm_para_features {
262 int cap;
263 int feature;
264 } para_features[] = {
265 { KVM_CAP_CLOCKSOURCE, KVM_FEATURE_CLOCKSOURCE },
266 { KVM_CAP_NOP_IO_DELAY, KVM_FEATURE_NOP_IO_DELAY },
267 { KVM_CAP_PV_MMU, KVM_FEATURE_MMU_OP },
268 { KVM_CAP_ASYNC_PF, KVM_FEATURE_ASYNC_PF },
269 };
270
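/*
 * Fallback for kernels whose GET_SUPPORTED_CPUID does not report the
 * KVM_CPUID_FEATURES leaf: derive the paravirt feature bits from the
 * corresponding capability checks.
 */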
271 static int get_para_features(KVMState *s)
272 {
273 int i, features = 0;
274
275 for (i = 0; i < ARRAY_SIZE(para_features); i++) {
276 if (kvm_check_extension(s, para_features[i].cap)) {
277 features |= (1 << para_features[i].feature);
278 }
279 }
280
281 return features;
282 }
283
284 static bool host_tsx_blacklisted(void)
285 {
286 int family, model, stepping;
287 char vendor[CPUID_VENDOR_SZ + 1];
288
289 host_vendor_fms(vendor, &family, &model, &stepping);
290
291 /* Check if we are running on a Haswell host known to have broken TSX */
292 return !strcmp(vendor, CPUID_VENDOR_INTEL) &&
293 (family == 6) &&
294 ((model == 63 && stepping < 4) ||
295 model == 60 || model == 69 || model == 70);
296 }
297
298 /* Returns the value of a specific register from the cpuid entry
299 */
300 static uint32_t cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry, int reg)
301 {
302 uint32_t ret = 0;
303 switch (reg) {
304 case R_EAX:
305 ret = entry->eax;
306 break;
307 case R_EBX:
308 ret = entry->ebx;
309 break;
310 case R_ECX:
311 ret = entry->ecx;
312 break;
313 case R_EDX:
314 ret = entry->edx;
315 break;
316 }
317 return ret;
318 }
319
320 /* Find the matching entry for function/index in the kvm_cpuid2 struct
321 */
322 static struct kvm_cpuid_entry2 *cpuid_find_entry(struct kvm_cpuid2 *cpuid,
323 uint32_t function,
324 uint32_t index)
325 {
326 int i;
327 for (i = 0; i < cpuid->nent; ++i) {
328 if (cpuid->entries[i].function == function &&
329 cpuid->entries[i].index == index) {
330 return &cpuid->entries[i];
331 }
332 }
333 /* not found: */
334 return NULL;
335 }
336
337 uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function,
338 uint32_t index, int reg)
339 {
340 struct kvm_cpuid2 *cpuid;
341 uint32_t ret = 0;
342 uint32_t cpuid_1_edx;
343 bool found = false;
344
345 cpuid = get_supported_cpuid(s);
346
347 struct kvm_cpuid_entry2 *entry = cpuid_find_entry(cpuid, function, index);
348 if (entry) {
349 found = true;
350 ret = cpuid_entry_get_reg(entry, reg);
351 }
352
353 /* Fixups for the data returned by KVM, below */
354
355 if (function == 1 && reg == R_EDX) {
356 /* KVM before 2.6.30 misreports the following features */
357 ret |= CPUID_MTRR | CPUID_PAT | CPUID_MCE | CPUID_MCA;
358 } else if (function == 1 && reg == R_ECX) {
359 /* We can set the hypervisor flag, even if KVM does not return it on
360 * GET_SUPPORTED_CPUID
361 */
362 ret |= CPUID_EXT_HYPERVISOR;
363 /* tsc-deadline flag is not returned by GET_SUPPORTED_CPUID, but it
364 * can be enabled if the kernel has KVM_CAP_TSC_DEADLINE_TIMER,
365 * and the irqchip is in the kernel.
366 */
367 if (kvm_irqchip_in_kernel() &&
368 kvm_check_extension(s, KVM_CAP_TSC_DEADLINE_TIMER)) {
369 ret |= CPUID_EXT_TSC_DEADLINE_TIMER;
370 }
371
372 /* x2apic is reported by GET_SUPPORTED_CPUID, but it can't be enabled
373 * without the in-kernel irqchip
374 */
375 if (!kvm_irqchip_in_kernel()) {
376 ret &= ~CPUID_EXT_X2APIC;
377 }
378
379 if (enable_cpu_pm) {
380 int disable_exits = kvm_check_extension(s,
381 KVM_CAP_X86_DISABLE_EXITS);
382
383 if (disable_exits & KVM_X86_DISABLE_EXITS_MWAIT) {
384 ret |= CPUID_EXT_MONITOR;
385 }
386 }
387 } else if (function == 6 && reg == R_EAX) {
388 ret |= CPUID_6_EAX_ARAT; /* safe to allow because of emulated APIC */
389 } else if (function == 7 && index == 0 && reg == R_EBX) {
390 if (host_tsx_blacklisted()) {
391 ret &= ~(CPUID_7_0_EBX_RTM | CPUID_7_0_EBX_HLE);
392 }
393 } else if (function == 7 && index == 0 && reg == R_EDX) {
394 /*
395 * Linux v4.17-v4.20 incorrectly return ARCH_CAPABILITIES on SVM hosts.
396 * We can detect the bug by checking if MSR_IA32_ARCH_CAPABILITIES is
397 * returned by KVM_GET_MSR_INDEX_LIST.
398 */
399 if (!has_msr_arch_capabs) {
400 ret &= ~CPUID_7_0_EDX_ARCH_CAPABILITIES;
401 }
402 } else if (function == 0x80000001 && reg == R_ECX) {
403 /*
404 * It's safe to enable TOPOEXT even if it's not returned by
405 * GET_SUPPORTED_CPUID. Unconditionally enabling TOPOEXT here allows
406 * us to keep CPU models including TOPOEXT runnable on older kernels.
407 */
408 ret |= CPUID_EXT3_TOPOEXT;
409 } else if (function == 0x80000001 && reg == R_EDX) {
410 /* On Intel, kvm returns cpuid according to the Intel spec,
411 * so add missing bits according to the AMD spec:
412 */
413 cpuid_1_edx = kvm_arch_get_supported_cpuid(s, 1, 0, R_EDX);
414 ret |= cpuid_1_edx & CPUID_EXT2_AMD_ALIASES;
415 } else if (function == KVM_CPUID_FEATURES && reg == R_EAX) {
416 /* kvm_pv_unhalt is reported by GET_SUPPORTED_CPUID, but it can't
417 * be enabled without the in-kernel irqchip
418 */
419 if (!kvm_irqchip_in_kernel()) {
420 ret &= ~(1U << KVM_FEATURE_PV_UNHALT);
421 }
422 } else if (function == KVM_CPUID_FEATURES && reg == R_EDX) {
423 ret |= 1U << KVM_HINTS_REALTIME;
424 found = true;
425 }
426
427 /* fallback for older kernels */
428 if ((function == KVM_CPUID_FEATURES) && !found) {
429 ret = get_para_features(s);
430 }
431
432 return ret;
433 }
434
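/*
 * Return the host-supported value of the feature MSR 'index', or 0 when
 * feature MSRs (or this particular one) are not supported by the kernel.
 */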
435 uint32_t kvm_arch_get_supported_msr_feature(KVMState *s, uint32_t index)
436 {
437 struct {
438 struct kvm_msrs info;
439 struct kvm_msr_entry entries[1];
440 } msr_data;
441 uint32_t ret;
442
443 if (kvm_feature_msrs == NULL) { /* Host doesn't support feature MSRs */
444 return 0;
445 }
446
447 /* Check if requested MSR is supported feature MSR */
448 int i;
449 for (i = 0; i < kvm_feature_msrs->nmsrs; i++)
450 if (kvm_feature_msrs->indices[i] == index) {
451 break;
452 }
453 if (i == kvm_feature_msrs->nmsrs) {
454 return 0; /* if the feature MSR is not supported, simply return 0 */
455 }
456
457 msr_data.info.nmsrs = 1;
458 msr_data.entries[0].index = index;
459
460 ret = kvm_ioctl(s, KVM_GET_MSRS, &msr_data);
461 if (ret != 1) {
462 error_report("KVM get MSR (index=0x%x) feature failed, %s",
463 index, strerror(-ret));
464 exit(1);
465 }
466
467 return msr_data.entries[0].data;
468 }
469
470
471 typedef struct HWPoisonPage {
472 ram_addr_t ram_addr;
473 QLIST_ENTRY(HWPoisonPage) list;
474 } HWPoisonPage;
475
476 static QLIST_HEAD(, HWPoisonPage) hwpoison_page_list =
477 QLIST_HEAD_INITIALIZER(hwpoison_page_list);
478
479 static void kvm_unpoison_all(void *param)
480 {
481 HWPoisonPage *page, *next_page;
482
483 QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, next_page) {
484 QLIST_REMOVE(page, list);
485 qemu_ram_remap(page->ram_addr, TARGET_PAGE_SIZE);
486 g_free(page);
487 }
488 }
489
490 static void kvm_hwpoison_page_add(ram_addr_t ram_addr)
491 {
492 HWPoisonPage *page;
493
494 QLIST_FOREACH(page, &hwpoison_page_list, list) {
495 if (page->ram_addr == ram_addr) {
496 return;
497 }
498 }
499 page = g_new(HWPoisonPage, 1);
500 page->ram_addr = ram_addr;
501 QLIST_INSERT_HEAD(&hwpoison_page_list, page, list);
502 }
503
504 static int kvm_get_mce_cap_supported(KVMState *s, uint64_t *mce_cap,
505 int *max_banks)
506 {
507 int r;
508
509 r = kvm_check_extension(s, KVM_CAP_MCE);
510 if (r > 0) {
511 *max_banks = r;
512 return kvm_ioctl(s, KVM_X86_GET_MCE_CAP_SUPPORTED, mce_cap);
513 }
514 return -ENOSYS;
515 }
516
517 static void kvm_mce_inject(X86CPU *cpu, hwaddr paddr, int code)
518 {
519 CPUState *cs = CPU(cpu);
520 CPUX86State *env = &cpu->env;
521 uint64_t status = MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN |
522 MCI_STATUS_MISCV | MCI_STATUS_ADDRV | MCI_STATUS_S;
523 uint64_t mcg_status = MCG_STATUS_MCIP;
524 int flags = 0;
525
526 if (code == BUS_MCEERR_AR) {
527 status |= MCI_STATUS_AR | 0x134;
528 mcg_status |= MCG_STATUS_EIPV;
529 } else {
530 status |= 0xc0;
531 mcg_status |= MCG_STATUS_RIPV;
532 }
533
534 flags = cpu_x86_support_mca_broadcast(env) ? MCE_INJECT_BROADCAST : 0;
535 /* We need to read back the value of MSR_EXT_MCG_CTL that was set by the
536 * guest kernel back into env->mcg_ext_ctl.
537 */
538 cpu_synchronize_state(cs);
539 if (env->mcg_ext_ctl & MCG_EXT_CTL_LMCE_EN) {
540 mcg_status |= MCG_STATUS_LMCE;
541 flags = 0;
542 }
543
544 cpu_x86_inject_mce(NULL, cpu, 9, status, mcg_status, paddr,
545 (MCM_ADDR_PHYS << 6) | 0xc, flags);
546 }
547
548 static void hardware_memory_error(void)
549 {
550 fprintf(stderr, "Hardware memory error!\n");
551 exit(1);
552 }
553
554 void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
555 {
556 X86CPU *cpu = X86_CPU(c);
557 CPUX86State *env = &cpu->env;
558 ram_addr_t ram_addr;
559 hwaddr paddr;
560
561 /* If we get an action required MCE, it has been injected by KVM
562 * while the VM was running. An action optional MCE instead should
563 * be coming from the main thread, which qemu_init_sigbus identifies
564 * as the "early kill" thread.
565 */
566 assert(code == BUS_MCEERR_AR || code == BUS_MCEERR_AO);
567
568 if ((env->mcg_cap & MCG_SER_P) && addr) {
569 ram_addr = qemu_ram_addr_from_host(addr);
570 if (ram_addr != RAM_ADDR_INVALID &&
571 kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) {
572 kvm_hwpoison_page_add(ram_addr);
573 kvm_mce_inject(cpu, paddr, code);
574 return;
575 }
576
577 fprintf(stderr, "Hardware memory error for memory used by "
578 "QEMU itself instead of guest system!\n");
579 }
580
581 if (code == BUS_MCEERR_AR) {
582 hardware_memory_error();
583 }
584
585 /* Hope we are lucky for AO MCE */
586 }
587
588 static void kvm_reset_exception(CPUX86State *env)
589 {
590 env->exception_nr = -1;
591 env->exception_pending = 0;
592 env->exception_injected = 0;
593 env->exception_has_payload = false;
594 env->exception_payload = 0;
595 }
596
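/*
 * Queue an exception for injection into the guest.  When the kernel
 * supports KVM_CAP_EXCEPTION_PAYLOAD the exception is recorded as
 * "pending" and its payload (DR6 for #DB, CR2 for #PF) is kept separate;
 * otherwise it is recorded as "injected" and the payload is written into
 * dr6/cr2 here, as pre-payload kernels expect.
 */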
597 static void kvm_queue_exception(CPUX86State *env,
598 int32_t exception_nr,
599 uint8_t exception_has_payload,
600 uint64_t exception_payload)
601 {
602 assert(env->exception_nr == -1);
603 assert(!env->exception_pending);
604 assert(!env->exception_injected);
605 assert(!env->exception_has_payload);
606
607 env->exception_nr = exception_nr;
608
609 if (has_exception_payload) {
610 env->exception_pending = 1;
611
612 env->exception_has_payload = exception_has_payload;
613 env->exception_payload = exception_payload;
614 } else {
615 env->exception_injected = 1;
616
617 if (exception_nr == EXCP01_DB) {
618 assert(exception_has_payload);
619 env->dr[6] = exception_payload;
620 } else if (exception_nr == EXCP0E_PAGE) {
621 assert(exception_has_payload);
622 env->cr[2] = exception_payload;
623 } else {
624 assert(!exception_has_payload);
625 }
626 }
627 }
628
629 static int kvm_inject_mce_oldstyle(X86CPU *cpu)
630 {
631 CPUX86State *env = &cpu->env;
632
633 if (!kvm_has_vcpu_events() && env->exception_nr == EXCP12_MCHK) {
634 unsigned int bank, bank_num = env->mcg_cap & 0xff;
635 struct kvm_x86_mce mce;
636
637 kvm_reset_exception(env);
638
639 /*
640 * There must be at least one bank in use if an MCE is pending.
641 * Find it and use its values for the event injection.
642 */
643 for (bank = 0; bank < bank_num; bank++) {
644 if (env->mce_banks[bank * 4 + 1] & MCI_STATUS_VAL) {
645 break;
646 }
647 }
648 assert(bank < bank_num);
649
650 mce.bank = bank;
651 mce.status = env->mce_banks[bank * 4 + 1];
652 mce.mcg_status = env->mcg_status;
653 mce.addr = env->mce_banks[bank * 4 + 2];
654 mce.misc = env->mce_banks[bank * 4 + 3];
655
656 return kvm_vcpu_ioctl(CPU(cpu), KVM_X86_SET_MCE, &mce);
657 }
658 return 0;
659 }
660
661 static void cpu_update_state(void *opaque, int running, RunState state)
662 {
663 CPUX86State *env = opaque;
664
665 if (running) {
666 env->tsc_valid = false;
667 }
668 }
669
670 unsigned long kvm_arch_vcpu_id(CPUState *cs)
671 {
672 X86CPU *cpu = X86_CPU(cs);
673 return cpu->apic_id;
674 }
675
676 #ifndef KVM_CPUID_SIGNATURE_NEXT
677 #define KVM_CPUID_SIGNATURE_NEXT 0x40000100
678 #endif
679
680 static bool hyperv_enabled(X86CPU *cpu)
681 {
682 CPUState *cs = CPU(cpu);
683 return kvm_check_extension(cs->kvm_state, KVM_CAP_HYPERV) > 0 &&
684 ((cpu->hyperv_spinlock_attempts != HYPERV_SPINLOCK_NEVER_RETRY) ||
685 cpu->hyperv_features || cpu->hyperv_passthrough);
686 }
687
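/*
 * Program the requested guest TSC frequency (env->tsc_khz).  A failure of
 * KVM_SET_TSC_KHZ is only fatal when the host TSC does not already run at
 * the requested frequency.
 */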
688 static int kvm_arch_set_tsc_khz(CPUState *cs)
689 {
690 X86CPU *cpu = X86_CPU(cs);
691 CPUX86State *env = &cpu->env;
692 int r;
693
694 if (!env->tsc_khz) {
695 return 0;
696 }
697
698 r = kvm_check_extension(cs->kvm_state, KVM_CAP_TSC_CONTROL) ?
699 kvm_vcpu_ioctl(cs, KVM_SET_TSC_KHZ, env->tsc_khz) :
700 -ENOTSUP;
701 if (r < 0) {
702 /* When KVM_SET_TSC_KHZ fails, it's an error only if the current
703 * TSC frequency doesn't match the one we want.
704 */
705 int cur_freq = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
706 kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) :
707 -ENOTSUP;
708 if (cur_freq <= 0 || cur_freq != env->tsc_khz) {
709 warn_report("TSC frequency mismatch between "
710 "VM (%" PRId64 " kHz) and host (%d kHz), "
711 "and TSC scaling unavailable",
712 env->tsc_khz, cur_freq);
713 return r;
714 }
715 }
716
717 return 0;
718 }
719
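/*
 * True when the guest TSC frequency is known and guaranteed not to change
 * (invariant TSC advertised, or the frequency was fixed by the user).
 */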
720 static bool tsc_is_stable_and_known(CPUX86State *env)
721 {
722 if (!env->tsc_khz) {
723 return false;
724 }
725 return (env->features[FEAT_8000_0007_EDX] & CPUID_APM_INVTSC)
726 || env->user_tsc_khz;
727 }
728
729 static struct {
730 const char *desc;
731 struct {
732 uint32_t fw;
733 uint32_t bits;
734 } flags[2];
735 uint64_t dependencies;
736 } kvm_hyperv_properties[] = {
737 [HYPERV_FEAT_RELAXED] = {
738 .desc = "relaxed timing (hv-relaxed)",
739 .flags = {
740 {.fw = FEAT_HYPERV_EAX,
741 .bits = HV_HYPERCALL_AVAILABLE},
742 {.fw = FEAT_HV_RECOMM_EAX,
743 .bits = HV_RELAXED_TIMING_RECOMMENDED}
744 }
745 },
746 [HYPERV_FEAT_VAPIC] = {
747 .desc = "virtual APIC (hv-vapic)",
748 .flags = {
749 {.fw = FEAT_HYPERV_EAX,
750 .bits = HV_HYPERCALL_AVAILABLE | HV_APIC_ACCESS_AVAILABLE},
751 {.fw = FEAT_HV_RECOMM_EAX,
752 .bits = HV_APIC_ACCESS_RECOMMENDED}
753 }
754 },
755 [HYPERV_FEAT_TIME] = {
756 .desc = "clocksources (hv-time)",
757 .flags = {
758 {.fw = FEAT_HYPERV_EAX,
759 .bits = HV_HYPERCALL_AVAILABLE | HV_TIME_REF_COUNT_AVAILABLE |
760 HV_REFERENCE_TSC_AVAILABLE}
761 }
762 },
763 [HYPERV_FEAT_CRASH] = {
764 .desc = "crash MSRs (hv-crash)",
765 .flags = {
766 {.fw = FEAT_HYPERV_EDX,
767 .bits = HV_GUEST_CRASH_MSR_AVAILABLE}
768 }
769 },
770 [HYPERV_FEAT_RESET] = {
771 .desc = "reset MSR (hv-reset)",
772 .flags = {
773 {.fw = FEAT_HYPERV_EAX,
774 .bits = HV_RESET_AVAILABLE}
775 }
776 },
777 [HYPERV_FEAT_VPINDEX] = {
778 .desc = "VP_INDEX MSR (hv-vpindex)",
779 .flags = {
780 {.fw = FEAT_HYPERV_EAX,
781 .bits = HV_VP_INDEX_AVAILABLE}
782 }
783 },
784 [HYPERV_FEAT_RUNTIME] = {
785 .desc = "VP_RUNTIME MSR (hv-runtime)",
786 .flags = {
787 {.fw = FEAT_HYPERV_EAX,
788 .bits = HV_VP_RUNTIME_AVAILABLE}
789 }
790 },
791 [HYPERV_FEAT_SYNIC] = {
792 .desc = "synthetic interrupt controller (hv-synic)",
793 .flags = {
794 {.fw = FEAT_HYPERV_EAX,
795 .bits = HV_SYNIC_AVAILABLE}
796 }
797 },
798 [HYPERV_FEAT_STIMER] = {
799 .desc = "synthetic timers (hv-stimer)",
800 .flags = {
801 {.fw = FEAT_HYPERV_EAX,
802 .bits = HV_SYNTIMERS_AVAILABLE}
803 },
804 .dependencies = BIT(HYPERV_FEAT_SYNIC) | BIT(HYPERV_FEAT_TIME)
805 },
806 [HYPERV_FEAT_FREQUENCIES] = {
807 .desc = "frequency MSRs (hv-frequencies)",
808 .flags = {
809 {.fw = FEAT_HYPERV_EAX,
810 .bits = HV_ACCESS_FREQUENCY_MSRS},
811 {.fw = FEAT_HYPERV_EDX,
812 .bits = HV_FREQUENCY_MSRS_AVAILABLE}
813 }
814 },
815 [HYPERV_FEAT_REENLIGHTENMENT] = {
816 .desc = "reenlightenment MSRs (hv-reenlightenment)",
817 .flags = {
818 {.fw = FEAT_HYPERV_EAX,
819 .bits = HV_ACCESS_REENLIGHTENMENTS_CONTROL}
820 }
821 },
822 [HYPERV_FEAT_TLBFLUSH] = {
823 .desc = "paravirtualized TLB flush (hv-tlbflush)",
824 .flags = {
825 {.fw = FEAT_HV_RECOMM_EAX,
826 .bits = HV_REMOTE_TLB_FLUSH_RECOMMENDED |
827 HV_EX_PROCESSOR_MASKS_RECOMMENDED}
828 },
829 .dependencies = BIT(HYPERV_FEAT_VPINDEX)
830 },
831 [HYPERV_FEAT_EVMCS] = {
832 .desc = "enlightened VMCS (hv-evmcs)",
833 .flags = {
834 {.fw = FEAT_HV_RECOMM_EAX,
835 .bits = HV_ENLIGHTENED_VMCS_RECOMMENDED}
836 },
837 .dependencies = BIT(HYPERV_FEAT_VAPIC)
838 },
839 [HYPERV_FEAT_IPI] = {
840 .desc = "paravirtualized IPI (hv-ipi)",
841 .flags = {
842 {.fw = FEAT_HV_RECOMM_EAX,
843 .bits = HV_CLUSTER_IPI_RECOMMENDED |
844 HV_EX_PROCESSOR_MASKS_RECOMMENDED}
845 },
846 .dependencies = BIT(HYPERV_FEAT_VPINDEX)
847 },
848 [HYPERV_FEAT_STIMER_DIRECT] = {
849 .desc = "direct mode synthetic timers (hv-stimer-direct)",
850 .flags = {
851 {.fw = FEAT_HYPERV_EDX,
852 .bits = HV_STIMER_DIRECT_MODE_AVAILABLE}
853 },
854 .dependencies = BIT(HYPERV_FEAT_STIMER)
855 },
856 };
857
858 static struct kvm_cpuid2 *try_get_hv_cpuid(CPUState *cs, int max)
859 {
860 struct kvm_cpuid2 *cpuid;
861 int r, size;
862
863 size = sizeof(*cpuid) + max * sizeof(*cpuid->entries);
864 cpuid = g_malloc0(size);
865 cpuid->nent = max;
866
867 r = kvm_vcpu_ioctl(cs, KVM_GET_SUPPORTED_HV_CPUID, cpuid);
868 if (r == 0 && cpuid->nent >= max) {
869 r = -E2BIG;
870 }
871 if (r < 0) {
872 if (r == -E2BIG) {
873 g_free(cpuid);
874 return NULL;
875 } else {
876 fprintf(stderr, "KVM_GET_SUPPORTED_HV_CPUID failed: %s\n",
877 strerror(-r));
878 exit(1);
879 }
880 }
881 return cpuid;
882 }
883
884 /*
885 * Run KVM_GET_SUPPORTED_HV_CPUID ioctl(), allocating a buffer large enough
886 * for all entries.
887 */
888 static struct kvm_cpuid2 *get_supported_hv_cpuid(CPUState *cs)
889 {
890 struct kvm_cpuid2 *cpuid;
891 int max = 7; /* 0x40000000..0x40000005, 0x4000000A */
892
893 /*
894 * When the buffer is too small, KVM_GET_SUPPORTED_HV_CPUID fails with
895 * -E2BIG, however, it doesn't report back the right size. Keep increasing
896 * it and re-trying until we succeed.
897 */
898 while ((cpuid = try_get_hv_cpuid(cs, max)) == NULL) {
899 max++;
900 }
901 return cpuid;
902 }
903
904 /*
905 * When KVM_GET_SUPPORTED_HV_CPUID is not supported we fill CPUID feature
906 * leaves from KVM_CAP_HYPERV* and present MSRs data.
907 */
908 static struct kvm_cpuid2 *get_supported_hv_cpuid_legacy(CPUState *cs)
909 {
910 X86CPU *cpu = X86_CPU(cs);
911 struct kvm_cpuid2 *cpuid;
912 struct kvm_cpuid_entry2 *entry_feat, *entry_recomm;
913
914 /* HV_CPUID_FEATURES, HV_CPUID_ENLIGHTMENT_INFO */
915 cpuid = g_malloc0(sizeof(*cpuid) + 2 * sizeof(*cpuid->entries));
916 cpuid->nent = 2;
917
918 /* HV_CPUID_VENDOR_AND_MAX_FUNCTIONS */
919 entry_feat = &cpuid->entries[0];
920 entry_feat->function = HV_CPUID_FEATURES;
921
922 entry_recomm = &cpuid->entries[1];
923 entry_recomm->function = HV_CPUID_ENLIGHTMENT_INFO;
924 entry_recomm->ebx = cpu->hyperv_spinlock_attempts;
925
926 if (kvm_check_extension(cs->kvm_state, KVM_CAP_HYPERV) > 0) {
927 entry_feat->eax |= HV_HYPERCALL_AVAILABLE;
928 entry_feat->eax |= HV_APIC_ACCESS_AVAILABLE;
929 entry_feat->edx |= HV_CPU_DYNAMIC_PARTITIONING_AVAILABLE;
930 entry_recomm->eax |= HV_RELAXED_TIMING_RECOMMENDED;
931 entry_recomm->eax |= HV_APIC_ACCESS_RECOMMENDED;
932 }
933
934 if (kvm_check_extension(cs->kvm_state, KVM_CAP_HYPERV_TIME) > 0) {
935 entry_feat->eax |= HV_TIME_REF_COUNT_AVAILABLE;
936 entry_feat->eax |= HV_REFERENCE_TSC_AVAILABLE;
937 }
938
939 if (has_msr_hv_frequencies) {
940 entry_feat->eax |= HV_ACCESS_FREQUENCY_MSRS;
941 entry_feat->edx |= HV_FREQUENCY_MSRS_AVAILABLE;
942 }
943
944 if (has_msr_hv_crash) {
945 entry_feat->edx |= HV_GUEST_CRASH_MSR_AVAILABLE;
946 }
947
948 if (has_msr_hv_reenlightenment) {
949 entry_feat->eax |= HV_ACCESS_REENLIGHTENMENTS_CONTROL;
950 }
951
952 if (has_msr_hv_reset) {
953 entry_feat->eax |= HV_RESET_AVAILABLE;
954 }
955
956 if (has_msr_hv_vpindex) {
957 entry_feat->eax |= HV_VP_INDEX_AVAILABLE;
958 }
959
960 if (has_msr_hv_runtime) {
961 entry_feat->eax |= HV_VP_RUNTIME_AVAILABLE;
962 }
963
964 if (has_msr_hv_synic) {
965 unsigned int cap = cpu->hyperv_synic_kvm_only ?
966 KVM_CAP_HYPERV_SYNIC : KVM_CAP_HYPERV_SYNIC2;
967
968 if (kvm_check_extension(cs->kvm_state, cap) > 0) {
969 entry_feat->eax |= HV_SYNIC_AVAILABLE;
970 }
971 }
972
973 if (has_msr_hv_stimer) {
974 entry_feat->eax |= HV_SYNTIMERS_AVAILABLE;
975 }
976
977 if (kvm_check_extension(cs->kvm_state,
978 KVM_CAP_HYPERV_TLBFLUSH) > 0) {
979 entry_recomm->eax |= HV_REMOTE_TLB_FLUSH_RECOMMENDED;
980 entry_recomm->eax |= HV_EX_PROCESSOR_MASKS_RECOMMENDED;
981 }
982
983 if (kvm_check_extension(cs->kvm_state,
984 KVM_CAP_HYPERV_ENLIGHTENED_VMCS) > 0) {
985 entry_recomm->eax |= HV_ENLIGHTENED_VMCS_RECOMMENDED;
986 }
987
988 if (kvm_check_extension(cs->kvm_state,
989 KVM_CAP_HYPERV_SEND_IPI) > 0) {
990 entry_recomm->eax |= HV_CLUSTER_IPI_RECOMMENDED;
991 entry_recomm->eax |= HV_EX_PROCESSOR_MASKS_RECOMMENDED;
992 }
993
994 return cpuid;
995 }
996
997 static int hv_cpuid_get_fw(struct kvm_cpuid2 *cpuid, int fw, uint32_t *r)
998 {
999 struct kvm_cpuid_entry2 *entry;
1000 uint32_t func;
1001 int reg;
1002
1003 switch (fw) {
1004 case FEAT_HYPERV_EAX:
1005 reg = R_EAX;
1006 func = HV_CPUID_FEATURES;
1007 break;
1008 case FEAT_HYPERV_EDX:
1009 reg = R_EDX;
1010 func = HV_CPUID_FEATURES;
1011 break;
1012 case FEAT_HV_RECOMM_EAX:
1013 reg = R_EAX;
1014 func = HV_CPUID_ENLIGHTMENT_INFO;
1015 break;
1016 default:
1017 return -EINVAL;
1018 }
1019
1020 entry = cpuid_find_entry(cpuid, func, 0);
1021 if (!entry) {
1022 return -ENOENT;
1023 }
1024
1025 switch (reg) {
1026 case R_EAX:
1027 *r = entry->eax;
1028 break;
1029 case R_EDX:
1030 *r = entry->edx;
1031 break;
1032 default:
1033 return -EINVAL;
1034 }
1035
1036 return 0;
1037 }
1038
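/*
 * Verify that the kernel exposes the CPUID bits required for 'feature'
 * (and that its dependencies are enabled), then fold those bits into
 * env->features[].  Returns 1 on a missing dependency or missing kernel
 * support, 0 otherwise.
 */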
1039 static int hv_cpuid_check_and_set(CPUState *cs, struct kvm_cpuid2 *cpuid,
1040 int feature)
1041 {
1042 X86CPU *cpu = X86_CPU(cs);
1043 CPUX86State *env = &cpu->env;
1044 uint32_t r, fw, bits;
1045 uint64_t deps;
1046 int i, dep_feat = 0;
1047
1048 if (!hyperv_feat_enabled(cpu, feature) && !cpu->hyperv_passthrough) {
1049 return 0;
1050 }
1051
1052 deps = kvm_hyperv_properties[feature].dependencies;
1053 while ((dep_feat = find_next_bit(&deps, 64, dep_feat)) < 64) {
1054 if (!(hyperv_feat_enabled(cpu, dep_feat))) {
1055 fprintf(stderr,
1056 "Hyper-V %s requires Hyper-V %s\n",
1057 kvm_hyperv_properties[feature].desc,
1058 kvm_hyperv_properties[dep_feat].desc);
1059 return 1;
1060 }
1061 dep_feat++;
1062 }
1063
1064 for (i = 0; i < ARRAY_SIZE(kvm_hyperv_properties[feature].flags); i++) {
1065 fw = kvm_hyperv_properties[feature].flags[i].fw;
1066 bits = kvm_hyperv_properties[feature].flags[i].bits;
1067
1068 if (!fw) {
1069 continue;
1070 }
1071
1072 if (hv_cpuid_get_fw(cpuid, fw, &r) || (r & bits) != bits) {
1073 if (hyperv_feat_enabled(cpu, feature)) {
1074 fprintf(stderr,
1075 "Hyper-V %s is not supported by kernel\n",
1076 kvm_hyperv_properties[feature].desc);
1077 return 1;
1078 } else {
1079 return 0;
1080 }
1081 }
1082
1083 env->features[fw] |= bits;
1084 }
1085
1086 if (cpu->hyperv_passthrough) {
1087 cpu->hyperv_features |= BIT(feature);
1088 }
1089
1090 return 0;
1091 }
1092
1093 /*
1094 * Fill in Hyper-V CPUIDs. Returns the number of entries filled in cpuid_ent in
1095 * case of success, errno < 0 in case of failure and 0 when no Hyper-V
1096 * extensions are enabled.
1097 */
1098 static int hyperv_handle_properties(CPUState *cs,
1099 struct kvm_cpuid_entry2 *cpuid_ent)
1100 {
1101 X86CPU *cpu = X86_CPU(cs);
1102 CPUX86State *env = &cpu->env;
1103 struct kvm_cpuid2 *cpuid;
1104 struct kvm_cpuid_entry2 *c;
1105 uint32_t signature[3];
1106 uint32_t cpuid_i = 0;
1107 int r;
1108
1109 if (!hyperv_enabled(cpu))
1110 return 0;
1111
1112 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS) ||
1113 cpu->hyperv_passthrough) {
1114 uint16_t evmcs_version;
1115
1116 r = kvm_vcpu_enable_cap(cs, KVM_CAP_HYPERV_ENLIGHTENED_VMCS, 0,
1117 (uintptr_t)&evmcs_version);
1118
1119 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS) && r) {
1120 fprintf(stderr, "Hyper-V %s is not supported by kernel\n",
1121 kvm_hyperv_properties[HYPERV_FEAT_EVMCS].desc);
1122 return -ENOSYS;
1123 }
1124
1125 if (!r) {
1126 env->features[FEAT_HV_RECOMM_EAX] |=
1127 HV_ENLIGHTENED_VMCS_RECOMMENDED;
1128 env->features[FEAT_HV_NESTED_EAX] = evmcs_version;
1129 }
1130 }
1131
1132 if (kvm_check_extension(cs->kvm_state, KVM_CAP_HYPERV_CPUID) > 0) {
1133 cpuid = get_supported_hv_cpuid(cs);
1134 } else {
1135 cpuid = get_supported_hv_cpuid_legacy(cs);
1136 }
1137
1138 if (cpu->hyperv_passthrough) {
1139 memcpy(cpuid_ent, &cpuid->entries[0],
1140 cpuid->nent * sizeof(cpuid->entries[0]));
1141
1142 c = cpuid_find_entry(cpuid, HV_CPUID_FEATURES, 0);
1143 if (c) {
1144 env->features[FEAT_HYPERV_EAX] = c->eax;
1145 env->features[FEAT_HYPERV_EBX] = c->ebx;
1146 env->features[FEAT_HYPERV_EDX] = c->edx;
1147 }
1148 c = cpuid_find_entry(cpuid, HV_CPUID_ENLIGHTMENT_INFO, 0);
1149 if (c) {
1150 env->features[FEAT_HV_RECOMM_EAX] = c->eax;
1151
1152 /* hv-spinlocks may have been overridden */
1153 if (cpu->hyperv_spinlock_attempts != HYPERV_SPINLOCK_NEVER_RETRY) {
1154 c->ebx = cpu->hyperv_spinlock_attempts;
1155 }
1156 }
1157 c = cpuid_find_entry(cpuid, HV_CPUID_NESTED_FEATURES, 0);
1158 if (c) {
1159 env->features[FEAT_HV_NESTED_EAX] = c->eax;
1160 }
1161 }
1162
1163 /* Features */
1164 r = hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_RELAXED);
1165 r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_VAPIC);
1166 r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_TIME);
1167 r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_CRASH);
1168 r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_RESET);
1169 r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_VPINDEX);
1170 r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_RUNTIME);
1171 r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_SYNIC);
1172 r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_STIMER);
1173 r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_FREQUENCIES);
1174 r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_REENLIGHTENMENT);
1175 r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_TLBFLUSH);
1176 r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_EVMCS);
1177 r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_IPI);
1178 r |= hv_cpuid_check_and_set(cs, cpuid, HYPERV_FEAT_STIMER_DIRECT);
1179
1180 /* Additional dependencies not covered by kvm_hyperv_properties[] */
1181 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC) &&
1182 !cpu->hyperv_synic_kvm_only &&
1183 !hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX)) {
1184 fprintf(stderr, "Hyper-V %s requires Hyper-V %s\n",
1185 kvm_hyperv_properties[HYPERV_FEAT_SYNIC].desc,
1186 kvm_hyperv_properties[HYPERV_FEAT_VPINDEX].desc);
1187 r |= 1;
1188 }
1189
1190 /* Not exposed by KVM but needed to make CPU hotplug in Windows work */
1191 env->features[FEAT_HYPERV_EDX] |= HV_CPU_DYNAMIC_PARTITIONING_AVAILABLE;
1192
1193 if (r) {
1194 r = -ENOSYS;
1195 goto free;
1196 }
1197
1198 if (cpu->hyperv_passthrough) {
1199 /* We already copied all feature words from KVM as is */
1200 r = cpuid->nent;
1201 goto free;
1202 }
1203
1204 c = &cpuid_ent[cpuid_i++];
1205 c->function = HV_CPUID_VENDOR_AND_MAX_FUNCTIONS;
1206 if (!cpu->hyperv_vendor_id) {
1207 memcpy(signature, "Microsoft Hv", 12);
1208 } else {
1209 size_t len = strlen(cpu->hyperv_vendor_id);
1210
1211 if (len > 12) {
1212 error_report("hv-vendor-id truncated to 12 characters");
1213 len = 12;
1214 }
1215 memset(signature, 0, 12);
1216 memcpy(signature, cpu->hyperv_vendor_id, len);
1217 }
1218 c->eax = hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS) ?
1219 HV_CPUID_NESTED_FEATURES : HV_CPUID_IMPLEMENT_LIMITS;
1220 c->ebx = signature[0];
1221 c->ecx = signature[1];
1222 c->edx = signature[2];
1223
1224 c = &cpuid_ent[cpuid_i++];
1225 c->function = HV_CPUID_INTERFACE;
1226 memcpy(signature, "Hv#1\0\0\0\0\0\0\0\0", 12);
1227 c->eax = signature[0];
1228 c->ebx = 0;
1229 c->ecx = 0;
1230 c->edx = 0;
1231
1232 c = &cpuid_ent[cpuid_i++];
1233 c->function = HV_CPUID_VERSION;
1234 c->eax = 0x00001bbc;
1235 c->ebx = 0x00060001;
1236
1237 c = &cpuid_ent[cpuid_i++];
1238 c->function = HV_CPUID_FEATURES;
1239 c->eax = env->features[FEAT_HYPERV_EAX];
1240 c->ebx = env->features[FEAT_HYPERV_EBX];
1241 c->edx = env->features[FEAT_HYPERV_EDX];
1242
1243 c = &cpuid_ent[cpuid_i++];
1244 c->function = HV_CPUID_ENLIGHTMENT_INFO;
1245 c->eax = env->features[FEAT_HV_RECOMM_EAX];
1246 c->ebx = cpu->hyperv_spinlock_attempts;
1247
1248 c = &cpuid_ent[cpuid_i++];
1249 c->function = HV_CPUID_IMPLEMENT_LIMITS;
1250 c->eax = cpu->hv_max_vps;
1251 c->ebx = 0x40;
1252
1253 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_EVMCS)) {
1254 __u32 function;
1255
1256 /* Create zeroed 0x40000006..0x40000009 leaves */
1257 for (function = HV_CPUID_IMPLEMENT_LIMITS + 1;
1258 function < HV_CPUID_NESTED_FEATURES; function++) {
1259 c = &cpuid_ent[cpuid_i++];
1260 c->function = function;
1261 }
1262
1263 c = &cpuid_ent[cpuid_i++];
1264 c->function = HV_CPUID_NESTED_FEATURES;
1265 c->eax = env->features[FEAT_HV_NESTED_EAX];
1266 }
1267 r = cpuid_i;
1268
1269 free:
1270 g_free(cpuid);
1271
1272 return r;
1273 }
1274
1275 static Error *hv_passthrough_mig_blocker;
1276
1277 static int hyperv_init_vcpu(X86CPU *cpu)
1278 {
1279 CPUState *cs = CPU(cpu);
1280 Error *local_err = NULL;
1281 int ret;
1282
1283 if (cpu->hyperv_passthrough && hv_passthrough_mig_blocker == NULL) {
1284 error_setg(&hv_passthrough_mig_blocker,
1285 "'hv-passthrough' CPU flag prevents migration, use explicit"
1286 " set of hv-* flags instead");
1287 ret = migrate_add_blocker(hv_passthrough_mig_blocker, &local_err);
1288 if (local_err) {
1289 error_report_err(local_err);
1290 error_free(hv_passthrough_mig_blocker);
1291 return ret;
1292 }
1293 }
1294
1295 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX) && !hv_vpindex_settable) {
1296 /*
1297 * the kernel doesn't support setting vp_index; assert that its value
1298 * is in sync
1299 */
1300 struct {
1301 struct kvm_msrs info;
1302 struct kvm_msr_entry entries[1];
1303 } msr_data = {
1304 .info.nmsrs = 1,
1305 .entries[0].index = HV_X64_MSR_VP_INDEX,
1306 };
1307
1308 ret = kvm_vcpu_ioctl(cs, KVM_GET_MSRS, &msr_data);
1309 if (ret < 0) {
1310 return ret;
1311 }
1312 assert(ret == 1);
1313
1314 if (msr_data.entries[0].data != hyperv_vp_index(CPU(cpu))) {
1315 error_report("kernel's vp_index != QEMU's vp_index");
1316 return -ENXIO;
1317 }
1318 }
1319
1320 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) {
1321 uint32_t synic_cap = cpu->hyperv_synic_kvm_only ?
1322 KVM_CAP_HYPERV_SYNIC : KVM_CAP_HYPERV_SYNIC2;
1323 ret = kvm_vcpu_enable_cap(cs, synic_cap, 0);
1324 if (ret < 0) {
1325 error_report("failed to turn on HyperV SynIC in KVM: %s",
1326 strerror(-ret));
1327 return ret;
1328 }
1329
1330 if (!cpu->hyperv_synic_kvm_only) {
1331 ret = hyperv_x86_synic_add(cpu);
1332 if (ret < 0) {
1333 error_report("failed to create HyperV SynIC: %s",
1334 strerror(-ret));
1335 return ret;
1336 }
1337 }
1338 }
1339
1340 return 0;
1341 }
1342
1343 static Error *invtsc_mig_blocker;
1344 static Error *nested_virt_mig_blocker;
1345
1346 #define KVM_MAX_CPUID_ENTRIES 100
1347
1348 int kvm_arch_init_vcpu(CPUState *cs)
1349 {
1350 struct {
1351 struct kvm_cpuid2 cpuid;
1352 struct kvm_cpuid_entry2 entries[KVM_MAX_CPUID_ENTRIES];
1353 } cpuid_data;
1354 /*
1355 * The kernel defines these structs with padding fields so there
1356 * should be no extra padding in our cpuid_data struct.
1357 */
1358 QEMU_BUILD_BUG_ON(sizeof(cpuid_data) !=
1359 sizeof(struct kvm_cpuid2) +
1360 sizeof(struct kvm_cpuid_entry2) * KVM_MAX_CPUID_ENTRIES);
1361
1362 X86CPU *cpu = X86_CPU(cs);
1363 CPUX86State *env = &cpu->env;
1364 uint32_t limit, i, j, cpuid_i;
1365 uint32_t unused;
1366 struct kvm_cpuid_entry2 *c;
1367 uint32_t signature[3];
1368 int kvm_base = KVM_CPUID_SIGNATURE;
1369 int max_nested_state_len;
1370 int r;
1371 Error *local_err = NULL;
1372
1373 memset(&cpuid_data, 0, sizeof(cpuid_data));
1374
1375 cpuid_i = 0;
1376
1377 r = kvm_arch_set_tsc_khz(cs);
1378 if (r < 0) {
1379 return r;
1380 }
1381
1382 /* The vCPU's TSC frequency is either specified by the user or follows
1383 * the value used by KVM if the former is not present. In the latter
1384 * case, we query it from KVM and record it in env->tsc_khz, so that
1385 * the vCPU's TSC frequency can be migrated later via this field.
1386 */
1387 if (!env->tsc_khz) {
1388 r = kvm_check_extension(cs->kvm_state, KVM_CAP_GET_TSC_KHZ) ?
1389 kvm_vcpu_ioctl(cs, KVM_GET_TSC_KHZ) :
1390 -ENOTSUP;
1391 if (r > 0) {
1392 env->tsc_khz = r;
1393 }
1394 }
1395
1396 /* Paravirtualization CPUIDs */
1397 r = hyperv_handle_properties(cs, cpuid_data.entries);
1398 if (r < 0) {
1399 return r;
1400 } else if (r > 0) {
1401 cpuid_i = r;
1402 kvm_base = KVM_CPUID_SIGNATURE_NEXT;
1403 has_msr_hv_hypercall = true;
1404 }
1405
1406 if (cpu->expose_kvm) {
1407 memcpy(signature, "KVMKVMKVM\0\0\0", 12);
1408 c = &cpuid_data.entries[cpuid_i++];
1409 c->function = KVM_CPUID_SIGNATURE | kvm_base;
1410 c->eax = KVM_CPUID_FEATURES | kvm_base;
1411 c->ebx = signature[0];
1412 c->ecx = signature[1];
1413 c->edx = signature[2];
1414
1415 c = &cpuid_data.entries[cpuid_i++];
1416 c->function = KVM_CPUID_FEATURES | kvm_base;
1417 c->eax = env->features[FEAT_KVM];
1418 c->edx = env->features[FEAT_KVM_HINTS];
1419 }
1420
1421 cpu_x86_cpuid(env, 0, 0, &limit, &unused, &unused, &unused);
1422
1423 for (i = 0; i <= limit; i++) {
1424 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1425 fprintf(stderr, "unsupported level value: 0x%x\n", limit);
1426 abort();
1427 }
1428 c = &cpuid_data.entries[cpuid_i++];
1429
1430 switch (i) {
1431 case 2: {
1432 /* Keep reading function 2 until all of its entries have been read */
1433 int times;
1434
1435 c->function = i;
1436 c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC |
1437 KVM_CPUID_FLAG_STATE_READ_NEXT;
1438 cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
1439 times = c->eax & 0xff;
1440
1441 for (j = 1; j < times; ++j) {
1442 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1443 fprintf(stderr, "cpuid_data is full, no space for "
1444 "cpuid(eax:2):eax & 0xf = 0x%x\n", times);
1445 abort();
1446 }
1447 c = &cpuid_data.entries[cpuid_i++];
1448 c->function = i;
1449 c->flags = KVM_CPUID_FLAG_STATEFUL_FUNC;
1450 cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
1451 }
1452 break;
1453 }
1454 case 4:
1455 case 0xb:
1456 case 0xd:
1457 for (j = 0; ; j++) {
1458 if (i == 0xd && j == 64) {
1459 break;
1460 }
1461 c->function = i;
1462 c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1463 c->index = j;
1464 cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);
1465
1466 if (i == 4 && c->eax == 0) {
1467 break;
1468 }
1469 if (i == 0xb && !(c->ecx & 0xff00)) {
1470 break;
1471 }
1472 if (i == 0xd && c->eax == 0) {
1473 continue;
1474 }
1475 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1476 fprintf(stderr, "cpuid_data is full, no space for "
1477 "cpuid(eax:0x%x,ecx:0x%x)\n", i, j);
1478 abort();
1479 }
1480 c = &cpuid_data.entries[cpuid_i++];
1481 }
1482 break;
1483 case 0x14: {
1484 uint32_t times;
1485
1486 c->function = i;
1487 c->index = 0;
1488 c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1489 cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
1490 times = c->eax;
1491
1492 for (j = 1; j <= times; ++j) {
1493 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1494 fprintf(stderr, "cpuid_data is full, no space for "
1495 "cpuid(eax:0x14,ecx:0x%x)\n", j);
1496 abort();
1497 }
1498 c = &cpuid_data.entries[cpuid_i++];
1499 c->function = i;
1500 c->index = j;
1501 c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1502 cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);
1503 }
1504 break;
1505 }
1506 default:
1507 c->function = i;
1508 c->flags = 0;
1509 cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
1510 break;
1511 }
1512 }
1513
1514 if (limit >= 0x0a) {
1515 uint32_t eax, edx;
1516
1517 cpu_x86_cpuid(env, 0x0a, 0, &eax, &unused, &unused, &edx);
1518
1519 has_architectural_pmu_version = eax & 0xff;
1520 if (has_architectural_pmu_version > 0) {
1521 num_architectural_pmu_gp_counters = (eax & 0xff00) >> 8;
1522
1523 /* Shouldn't be more than 32, since that's the number of bits
1524 * available in EBX to tell us _which_ counters are available.
1525 * Play it safe.
1526 */
1527 if (num_architectural_pmu_gp_counters > MAX_GP_COUNTERS) {
1528 num_architectural_pmu_gp_counters = MAX_GP_COUNTERS;
1529 }
1530
1531 if (has_architectural_pmu_version > 1) {
1532 num_architectural_pmu_fixed_counters = edx & 0x1f;
1533
1534 if (num_architectural_pmu_fixed_counters > MAX_FIXED_COUNTERS) {
1535 num_architectural_pmu_fixed_counters = MAX_FIXED_COUNTERS;
1536 }
1537 }
1538 }
1539 }
1540
1541 cpu_x86_cpuid(env, 0x80000000, 0, &limit, &unused, &unused, &unused);
1542
1543 for (i = 0x80000000; i <= limit; i++) {
1544 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1545 fprintf(stderr, "unsupported xlevel value: 0x%x\n", limit);
1546 abort();
1547 }
1548 c = &cpuid_data.entries[cpuid_i++];
1549
1550 switch (i) {
1551 case 0x8000001d:
1552 /* Query for all AMD cache information leaves */
1553 for (j = 0; ; j++) {
1554 c->function = i;
1555 c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1556 c->index = j;
1557 cpu_x86_cpuid(env, i, j, &c->eax, &c->ebx, &c->ecx, &c->edx);
1558
1559 if (c->eax == 0) {
1560 break;
1561 }
1562 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1563 fprintf(stderr, "cpuid_data is full, no space for "
1564 "cpuid(eax:0x%x,ecx:0x%x)\n", i, j);
1565 abort();
1566 }
1567 c = &cpuid_data.entries[cpuid_i++];
1568 }
1569 break;
1570 default:
1571 c->function = i;
1572 c->flags = 0;
1573 cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
1574 break;
1575 }
1576 }
1577
1578 /* Call Centaur's CPUID instructions if they are supported. */
1579 if (env->cpuid_xlevel2 > 0) {
1580 cpu_x86_cpuid(env, 0xC0000000, 0, &limit, &unused, &unused, &unused);
1581
1582 for (i = 0xC0000000; i <= limit; i++) {
1583 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
1584 fprintf(stderr, "unsupported xlevel2 value: 0x%x\n", limit);
1585 abort();
1586 }
1587 c = &cpuid_data.entries[cpuid_i++];
1588
1589 c->function = i;
1590 c->flags = 0;
1591 cpu_x86_cpuid(env, i, 0, &c->eax, &c->ebx, &c->ecx, &c->edx);
1592 }
1593 }
1594
1595 cpuid_data.cpuid.nent = cpuid_i;
1596
1597 if (((env->cpuid_version >> 8)&0xF) >= 6
1598 && (env->features[FEAT_1_EDX] & (CPUID_MCE | CPUID_MCA)) ==
1599 (CPUID_MCE | CPUID_MCA)
1600 && kvm_check_extension(cs->kvm_state, KVM_CAP_MCE) > 0) {
1601 uint64_t mcg_cap, unsupported_caps;
1602 int banks;
1603 int ret;
1604
1605 ret = kvm_get_mce_cap_supported(cs->kvm_state, &mcg_cap, &banks);
1606 if (ret < 0) {
1607 fprintf(stderr, "kvm_get_mce_cap_supported: %s", strerror(-ret));
1608 return ret;
1609 }
1610
1611 if (banks < (env->mcg_cap & MCG_CAP_BANKS_MASK)) {
1612 error_report("kvm: Unsupported MCE bank count (QEMU = %d, KVM = %d)",
1613 (int)(env->mcg_cap & MCG_CAP_BANKS_MASK), banks);
1614 return -ENOTSUP;
1615 }
1616
1617 unsupported_caps = env->mcg_cap & ~(mcg_cap | MCG_CAP_BANKS_MASK);
1618 if (unsupported_caps) {
1619 if (unsupported_caps & MCG_LMCE_P) {
1620 error_report("kvm: LMCE not supported");
1621 return -ENOTSUP;
1622 }
1623 warn_report("Unsupported MCG_CAP bits: 0x%" PRIx64,
1624 unsupported_caps);
1625 }
1626
1627 env->mcg_cap &= mcg_cap | MCG_CAP_BANKS_MASK;
1628 ret = kvm_vcpu_ioctl(cs, KVM_X86_SETUP_MCE, &env->mcg_cap);
1629 if (ret < 0) {
1630 fprintf(stderr, "KVM_X86_SETUP_MCE: %s", strerror(-ret));
1631 return ret;
1632 }
1633 }
1634
1635 qemu_add_vm_change_state_handler(cpu_update_state, env);
1636
1637 c = cpuid_find_entry(&cpuid_data.cpuid, 1, 0);
1638 if (c) {
1639 has_msr_feature_control = !!(c->ecx & CPUID_EXT_VMX) ||
1640 !!(c->ecx & CPUID_EXT_SMX);
1641 }
1642
1643 if (cpu_has_nested_virt(env) && !nested_virt_mig_blocker) {
1644 error_setg(&nested_virt_mig_blocker,
1645 "Nested virtualization does not support live migration yet");
1646 r = migrate_add_blocker(nested_virt_mig_blocker, &local_err);
1647 if (local_err) {
1648 error_report_err(local_err);
1649 error_free(nested_virt_mig_blocker);
1650 return r;
1651 }
1652 }
1653
1654 if (env->mcg_cap & MCG_LMCE_P) {
1655 has_msr_mcg_ext_ctl = has_msr_feature_control = true;
1656 }
1657
1658 if (!env->user_tsc_khz) {
1659 if ((env->features[FEAT_8000_0007_EDX] & CPUID_APM_INVTSC) &&
1660 invtsc_mig_blocker == NULL) {
1661 error_setg(&invtsc_mig_blocker,
1662 "State blocked by non-migratable CPU device"
1663 " (invtsc flag)");
1664 r = migrate_add_blocker(invtsc_mig_blocker, &local_err);
1665 if (local_err) {
1666 error_report_err(local_err);
1667 error_free(invtsc_mig_blocker);
1668 goto fail2;
1669 }
1670 }
1671 }
1672
1673 if (cpu->vmware_cpuid_freq
1674 /* Guests depend on 0x40000000 to detect this feature, so only expose
1675 * it if KVM exposes leaf 0x40000000. (Conflicts with Hyper-V) */
1676 && cpu->expose_kvm
1677 && kvm_base == KVM_CPUID_SIGNATURE
1678 /* TSC clock must be stable and known for this feature. */
1679 && tsc_is_stable_and_known(env)) {
1680
1681 c = &cpuid_data.entries[cpuid_i++];
1682 c->function = KVM_CPUID_SIGNATURE | 0x10;
1683 c->eax = env->tsc_khz;
1684 /* LAPIC resolution of 1ns (freq: 1GHz) is hardcoded in KVM's
1685 * APIC_BUS_CYCLE_NS */
1686 c->ebx = 1000000;
1687 c->ecx = c->edx = 0;
1688
1689 c = cpuid_find_entry(&cpuid_data.cpuid, kvm_base, 0);
1690 c->eax = MAX(c->eax, KVM_CPUID_SIGNATURE | 0x10);
1691 }
1692
1693 cpuid_data.cpuid.nent = cpuid_i;
1694
1695 cpuid_data.cpuid.padding = 0;
1696 r = kvm_vcpu_ioctl(cs, KVM_SET_CPUID2, &cpuid_data);
1697 if (r) {
1698 goto fail;
1699 }
1700
1701 if (has_xsave) {
1702 env->xsave_buf = qemu_memalign(4096, sizeof(struct kvm_xsave));
1703 }
1704
1705 max_nested_state_len = kvm_max_nested_state_length();
1706 if (max_nested_state_len > 0) {
1707 assert(max_nested_state_len >= offsetof(struct kvm_nested_state, data));
1708 env->nested_state = g_malloc0(max_nested_state_len);
1709
1710 env->nested_state->size = max_nested_state_len;
1711
1712 if (IS_INTEL_CPU(env)) {
1713 struct kvm_vmx_nested_state_hdr *vmx_hdr =
1714 &env->nested_state->hdr.vmx;
1715
1716 env->nested_state->format = KVM_STATE_NESTED_FORMAT_VMX;
1717 vmx_hdr->vmxon_pa = -1ull;
1718 vmx_hdr->vmcs12_pa = -1ull;
1719 }
1720 }
1721
1722 cpu->kvm_msr_buf = g_malloc0(MSR_BUF_SIZE);
1723
1724 if (!(env->features[FEAT_8000_0001_EDX] & CPUID_EXT2_RDTSCP)) {
1725 has_msr_tsc_aux = false;
1726 }
1727
1728 r = hyperv_init_vcpu(cpu);
1729 if (r) {
1730 goto fail;
1731 }
1732
1733 return 0;
1734
1735 fail:
1736 migrate_del_blocker(invtsc_mig_blocker);
1737 fail2:
1738 migrate_del_blocker(nested_virt_mig_blocker);
1739
1740 return r;
1741 }
1742
1743 int kvm_arch_destroy_vcpu(CPUState *cs)
1744 {
1745 X86CPU *cpu = X86_CPU(cs);
1746 CPUX86State *env = &cpu->env;
1747
1748 if (cpu->kvm_msr_buf) {
1749 g_free(cpu->kvm_msr_buf);
1750 cpu->kvm_msr_buf = NULL;
1751 }
1752
1753 if (env->nested_state) {
1754 g_free(env->nested_state);
1755 env->nested_state = NULL;
1756 }
1757
1758 return 0;
1759 }
1760
1761 void kvm_arch_reset_vcpu(X86CPU *cpu)
1762 {
1763 CPUX86State *env = &cpu->env;
1764
1765 env->xcr0 = 1;
1766 if (kvm_irqchip_in_kernel()) {
1767 env->mp_state = cpu_is_bsp(cpu) ? KVM_MP_STATE_RUNNABLE :
1768 KVM_MP_STATE_UNINITIALIZED;
1769 } else {
1770 env->mp_state = KVM_MP_STATE_RUNNABLE;
1771 }
1772
1773 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) {
1774 int i;
1775 for (i = 0; i < ARRAY_SIZE(env->msr_hv_synic_sint); i++) {
1776 env->msr_hv_synic_sint[i] = HV_SINT_MASKED;
1777 }
1778
1779 hyperv_x86_synic_reset(cpu);
1780 }
1781 }
1782
1783 void kvm_arch_do_init_vcpu(X86CPU *cpu)
1784 {
1785 CPUX86State *env = &cpu->env;
1786
1787 /* APs get directly into wait-for-SIPI state. */
1788 if (env->mp_state == KVM_MP_STATE_UNINITIALIZED) {
1789 env->mp_state = KVM_MP_STATE_INIT_RECEIVED;
1790 }
1791 }
1792
1793 static int kvm_get_supported_feature_msrs(KVMState *s)
1794 {
1795 int ret = 0;
1796
1797 if (kvm_feature_msrs != NULL) {
1798 return 0;
1799 }
1800
1801 if (!kvm_check_extension(s, KVM_CAP_GET_MSR_FEATURES)) {
1802 return 0;
1803 }
1804
1805 struct kvm_msr_list msr_list;
1806
1807 msr_list.nmsrs = 0;
1808 ret = kvm_ioctl(s, KVM_GET_MSR_FEATURE_INDEX_LIST, &msr_list);
1809 if (ret < 0 && ret != -E2BIG) {
1810 error_report("Fetch KVM feature MSR list failed: %s",
1811 strerror(-ret));
1812 return ret;
1813 }
1814
1815 assert(msr_list.nmsrs > 0);
1816 kvm_feature_msrs = (struct kvm_msr_list *) \
1817 g_malloc0(sizeof(msr_list) +
1818 msr_list.nmsrs * sizeof(msr_list.indices[0]));
1819
1820 kvm_feature_msrs->nmsrs = msr_list.nmsrs;
1821 ret = kvm_ioctl(s, KVM_GET_MSR_FEATURE_INDEX_LIST, kvm_feature_msrs);
1822
1823 if (ret < 0) {
1824 error_report("Fetch KVM feature MSR list failed: %s",
1825 strerror(-ret));
1826 g_free(kvm_feature_msrs);
1827 kvm_feature_msrs = NULL;
1828 return ret;
1829 }
1830
1831 return 0;
1832 }
1833
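/*
 * Probe KVM_GET_MSR_INDEX_LIST once and record which of the optional MSRs
 * the kernel knows how to save and restore.
 */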
1834 static int kvm_get_supported_msrs(KVMState *s)
1835 {
1836 static int kvm_supported_msrs;
1837 int ret = 0;
1838
1839 /* first time */
1840 if (kvm_supported_msrs == 0) {
1841 struct kvm_msr_list msr_list, *kvm_msr_list;
1842
1843 kvm_supported_msrs = -1;
1844
1845 /* Obtain MSR list from KVM. These are the MSRs that we must
1846 * save/restore */
1847 msr_list.nmsrs = 0;
1848 ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, &msr_list);
1849 if (ret < 0 && ret != -E2BIG) {
1850 return ret;
1851 }
1852 /* Old kernel modules had a bug and could write beyond the provided
1853 memory. Allocate at least 1K to be safe. */
1854 kvm_msr_list = g_malloc0(MAX(1024, sizeof(msr_list) +
1855 msr_list.nmsrs *
1856 sizeof(msr_list.indices[0])));
1857
1858 kvm_msr_list->nmsrs = msr_list.nmsrs;
1859 ret = kvm_ioctl(s, KVM_GET_MSR_INDEX_LIST, kvm_msr_list);
1860 if (ret >= 0) {
1861 int i;
1862
1863 for (i = 0; i < kvm_msr_list->nmsrs; i++) {
1864 switch (kvm_msr_list->indices[i]) {
1865 case MSR_STAR:
1866 has_msr_star = true;
1867 break;
1868 case MSR_VM_HSAVE_PA:
1869 has_msr_hsave_pa = true;
1870 break;
1871 case MSR_TSC_AUX:
1872 has_msr_tsc_aux = true;
1873 break;
1874 case MSR_TSC_ADJUST:
1875 has_msr_tsc_adjust = true;
1876 break;
1877 case MSR_IA32_TSCDEADLINE:
1878 has_msr_tsc_deadline = true;
1879 break;
1880 case MSR_IA32_SMBASE:
1881 has_msr_smbase = true;
1882 break;
1883 case MSR_SMI_COUNT:
1884 has_msr_smi_count = true;
1885 break;
1886 case MSR_IA32_MISC_ENABLE:
1887 has_msr_misc_enable = true;
1888 break;
1889 case MSR_IA32_BNDCFGS:
1890 has_msr_bndcfgs = true;
1891 break;
1892 case MSR_IA32_XSS:
1893 has_msr_xss = true;
1894 break;
1895 case HV_X64_MSR_CRASH_CTL:
1896 has_msr_hv_crash = true;
1897 break;
1898 case HV_X64_MSR_RESET:
1899 has_msr_hv_reset = true;
1900 break;
1901 case HV_X64_MSR_VP_INDEX:
1902 has_msr_hv_vpindex = true;
1903 break;
1904 case HV_X64_MSR_VP_RUNTIME:
1905 has_msr_hv_runtime = true;
1906 break;
1907 case HV_X64_MSR_SCONTROL:
1908 has_msr_hv_synic = true;
1909 break;
1910 case HV_X64_MSR_STIMER0_CONFIG:
1911 has_msr_hv_stimer = true;
1912 break;
1913 case HV_X64_MSR_TSC_FREQUENCY:
1914 has_msr_hv_frequencies = true;
1915 break;
1916 case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
1917 has_msr_hv_reenlightenment = true;
1918 break;
1919 case MSR_IA32_SPEC_CTRL:
1920 has_msr_spec_ctrl = true;
1921 break;
1922 case MSR_VIRT_SSBD:
1923 has_msr_virt_ssbd = true;
1924 break;
1925 case MSR_IA32_ARCH_CAPABILITIES:
1926 has_msr_arch_capabs = true;
1927 break;
1928 case MSR_IA32_CORE_CAPABILITY:
1929 has_msr_core_capabs = true;
1930 break;
1931 }
1932 }
1933 }
1934
1935 g_free(kvm_msr_list);
1936 }
1937
1938 return ret;
1939 }
1940
1941 static Notifier smram_machine_done;
1942 static KVMMemoryListener smram_listener;
1943 static AddressSpace smram_address_space;
1944 static MemoryRegion smram_as_root;
1945 static MemoryRegion smram_as_mem;
1946
1947 static void register_smram_listener(Notifier *n, void *unused)
1948 {
1949 MemoryRegion *smram =
1950 (MemoryRegion *) object_resolve_path("/machine/smram", NULL);
1951
1952 /* Outer container... */
1953 memory_region_init(&smram_as_root, OBJECT(kvm_state), "mem-container-smram", ~0ull);
1954 memory_region_set_enabled(&smram_as_root, true);
1955
1956 /* ... with two regions inside: normal system memory with low
1957 * priority, and...
1958 */
1959 memory_region_init_alias(&smram_as_mem, OBJECT(kvm_state), "mem-smram",
1960 get_system_memory(), 0, ~0ull);
1961 memory_region_add_subregion_overlap(&smram_as_root, 0, &smram_as_mem, 0);
1962 memory_region_set_enabled(&smram_as_mem, true);
1963
1964 if (smram) {
1965 /* ... SMRAM with higher priority */
1966 memory_region_add_subregion_overlap(&smram_as_root, 0, smram, 10);
1967 memory_region_set_enabled(smram, true);
1968 }
1969
1970 address_space_init(&smram_address_space, &smram_as_root, "KVM-SMRAM");
1971 kvm_memory_listener_register(kvm_state, &smram_listener,
1972 &smram_address_space, 1);
1973 }
1974
1975 int kvm_arch_init(MachineState *ms, KVMState *s)
1976 {
1977 uint64_t identity_base = 0xfffbc000;
1978 uint64_t shadow_mem;
1979 int ret;
1980 struct utsname utsname;
1981
1982 has_xsave = kvm_check_extension(s, KVM_CAP_XSAVE);
1983 has_xcrs = kvm_check_extension(s, KVM_CAP_XCRS);
1984 has_pit_state2 = kvm_check_extension(s, KVM_CAP_PIT_STATE2);
1985
1986 hv_vpindex_settable = kvm_check_extension(s, KVM_CAP_HYPERV_VP_INDEX);
1987
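/*
 * Prefer exception payloads when the kernel supports them;
 * kvm_queue_exception() falls back to folding the payload into
 * dr6/cr2 on older kernels.
 */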
1988 has_exception_payload = kvm_check_extension(s, KVM_CAP_EXCEPTION_PAYLOAD);
1989 if (has_exception_payload) {
1990 ret = kvm_vm_enable_cap(s, KVM_CAP_EXCEPTION_PAYLOAD, 0, true);
1991 if (ret < 0) {
1992 error_report("kvm: Failed to enable exception payload cap: %s",
1993 strerror(-ret));
1994 return ret;
1995 }
1996 }
1997
1998 ret = kvm_get_supported_msrs(s);
1999 if (ret < 0) {
2000 return ret;
2001 }
2002
2003 kvm_get_supported_feature_msrs(s);
2004
2005 uname(&utsname);
2006 lm_capable_kernel = strcmp(utsname.machine, "x86_64") == 0;
2007
2008 /*
2009 * On older Intel CPUs, KVM uses vm86 mode to emulate 16-bit code directly.
2010 * In order to use vm86 mode, an EPT identity map and a TSS are needed.
2011 * Since these must be part of guest physical memory, we need to allocate
2012 * them, both by setting their start addresses in the kernel and by
2013 * creating a corresponding e820 entry. We need 4 pages before the BIOS.
2014 *
2015 * Older KVM versions may not support setting the identity map base. In
2016 * that case we need to stick with the default, i.e. a 256K maximum BIOS
2017 * size.
2018 */
2019 if (kvm_check_extension(s, KVM_CAP_SET_IDENTITY_MAP_ADDR)) {
2020 /* Allows up to 16M BIOSes. */
2021 identity_base = 0xfeffc000;
2022
2023 ret = kvm_vm_ioctl(s, KVM_SET_IDENTITY_MAP_ADDR, &identity_base);
2024 if (ret < 0) {
2025 return ret;
2026 }
2027 }
2028
2029 /* Set TSS base one page after EPT identity map. */
2030 ret = kvm_vm_ioctl(s, KVM_SET_TSS_ADDR, identity_base + 0x1000);
2031 if (ret < 0) {
2032 return ret;
2033 }
2034
2035 /* Tell fw_cfg to notify the BIOS to reserve the range. */
2036 ret = e820_add_entry(identity_base, 0x4000, E820_RESERVED);
2037 if (ret < 0) {
2038 fprintf(stderr, "e820_add_entry() table is full\n");
2039 return ret;
2040 }
2041 qemu_register_reset(kvm_unpoison_all, NULL);
2042
2043 shadow_mem = machine_kvm_shadow_mem(ms);
2044 if (shadow_mem != -1) {
2045 shadow_mem /= 4096;
2046 ret = kvm_vm_ioctl(s, KVM_SET_NR_MMU_PAGES, shadow_mem);
2047 if (ret < 0) {
2048 return ret;
2049 }
2050 }
2051
2052 if (kvm_check_extension(s, KVM_CAP_X86_SMM) &&
2053 object_dynamic_cast(OBJECT(ms), TYPE_PC_MACHINE) &&
2054 pc_machine_is_smm_enabled(PC_MACHINE(ms))) {
2055 smram_machine_done.notify = register_smram_listener;
2056 qemu_add_machine_init_done_notifier(&smram_machine_done);
2057 }
2058
2059 if (enable_cpu_pm) {
2060 int disable_exits = kvm_check_extension(s, KVM_CAP_X86_DISABLE_EXITS);
2061 int ret;
2062
2063 /* Workaround for a kernel header with a typo. TODO: fix the header and drop this. */
2064 #if defined(KVM_X86_DISABLE_EXITS_HTL) && !defined(KVM_X86_DISABLE_EXITS_HLT)
2065 #define KVM_X86_DISABLE_EXITS_HLT KVM_X86_DISABLE_EXITS_HTL
2066 #endif
2067 if (disable_exits) {
2068 disable_exits &= (KVM_X86_DISABLE_EXITS_MWAIT |
2069 KVM_X86_DISABLE_EXITS_HLT |
2070 KVM_X86_DISABLE_EXITS_PAUSE);
2071 }
2072
2073 ret = kvm_vm_enable_cap(s, KVM_CAP_X86_DISABLE_EXITS, 0,
2074 disable_exits);
2075 if (ret < 0) {
2076 error_report("kvm: guest stopping CPU not supported: %s",
2077 strerror(-ret));
2078 }
2079 }
2080
2081 return 0;
2082 }
2083
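/*
 * Segments in virtual-8086 mode do not use descriptors: the attribute bits
 * are architecturally fixed (writable data, DPL 3, present, 16-bit), so
 * only selector, base and limit are taken from the QEMU segment cache and
 * the remaining fields are hard-coded.
 */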
2084 static void set_v8086_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
2085 {
2086 lhs->selector = rhs->selector;
2087 lhs->base = rhs->base;
2088 lhs->limit = rhs->limit;
2089 lhs->type = 3;
2090 lhs->present = 1;
2091 lhs->dpl = 3;
2092 lhs->db = 0;
2093 lhs->s = 1;
2094 lhs->l = 0;
2095 lhs->g = 0;
2096 lhs->avl = 0;
2097 lhs->unusable = 0;
2098 }
2099
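/*
 * Translate a QEMU segment cache entry into struct kvm_segment, unpacking
 * the packed descriptor flags into KVM's individual attribute fields.
 * KVM expects segments that are not present to be flagged as unusable.
 */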
2100 static void set_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
2101 {
2102 unsigned flags = rhs->flags;
2103 lhs->selector = rhs->selector;
2104 lhs->base = rhs->base;
2105 lhs->limit = rhs->limit;
2106 lhs->type = (flags >> DESC_TYPE_SHIFT) & 15;
2107 lhs->present = (flags & DESC_P_MASK) != 0;
2108 lhs->dpl = (flags >> DESC_DPL_SHIFT) & 3;
2109 lhs->db = (flags >> DESC_B_SHIFT) & 1;
2110 lhs->s = (flags & DESC_S_MASK) != 0;
2111 lhs->l = (flags >> DESC_L_SHIFT) & 1;
2112 lhs->g = (flags & DESC_G_MASK) != 0;
2113 lhs->avl = (flags & DESC_AVL_MASK) != 0;
2114 lhs->unusable = !lhs->present;
2115 lhs->padding = 0;
2116 }
2117
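/*
 * Inverse of set_seg(): repack the attribute fields reported by KVM into
 * the descriptor flags word of the QEMU segment cache. A segment that KVM
 * marks unusable is recorded as not present.
 */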
2118 static void get_seg(SegmentCache *lhs, const struct kvm_segment *rhs)
2119 {
2120 lhs->selector = rhs->selector;
2121 lhs->base = rhs->base;
2122 lhs->limit = rhs->limit;
2123 lhs->flags = (rhs->type << DESC_TYPE_SHIFT) |
2124 ((rhs->present && !rhs->unusable) * DESC_P_MASK) |
2125 (rhs->dpl << DESC_DPL_SHIFT) |
2126 (rhs->db << DESC_B_SHIFT) |
2127 (rhs->s * DESC_S_MASK) |
2128 (rhs->l << DESC_L_SHIFT) |
2129 (rhs->g * DESC_G_MASK) |
2130 (rhs->avl * DESC_AVL_MASK);
2131 }
2132
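/*
 * Copy a single general-purpose register in the direction selected by
 * 'set', so that kvm_getput_regs() can describe the register mapping once
 * and reuse it for both KVM_GET_REGS and KVM_SET_REGS.
 */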
2133 static void kvm_getput_reg(__u64 *kvm_reg, target_ulong *qemu_reg, int set)
2134 {
2135 if (set) {
2136 *kvm_reg = *qemu_reg;
2137 } else {
2138 *qemu_reg = *kvm_reg;
2139 }
2140 }
2141
2142 static int kvm_getput_regs(X86CPU *cpu, int set)
2143 {
2144 CPUX86State *env = &cpu->env;
2145 struct kvm_regs regs;
2146 int ret = 0;
2147
2148 if (!set) {
2149 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_REGS, &regs);
2150 if (ret < 0) {
2151 return ret;
2152 }
2153 }
2154
2155 kvm_getput_reg(&regs.rax, &env->regs[R_EAX], set);
2156 kvm_getput_reg(&regs.rbx, &env->regs[R_EBX], set);
2157 kvm_getput_reg(&regs.rcx, &env->regs[R_ECX], set);
2158 kvm_getput_reg(&regs.rdx, &env->regs[R_EDX], set);
2159 kvm_getput_reg(&regs.rsi, &env->regs[R_ESI], set);
2160 kvm_getput_reg(&regs.rdi, &env->regs[R_EDI], set);
2161 kvm_getput_reg(&regs.rsp, &env->regs[R_ESP], set);
2162 kvm_getput_reg(&regs.rbp, &env->regs[R_EBP], set);
2163 #ifdef TARGET_X86_64
2164 kvm_getput_reg(&regs.r8, &env->regs[8], set);
2165 kvm_getput_reg(&regs.r9, &env->regs[9], set);
2166 kvm_getput_reg(&regs.r10, &env->regs[10], set);
2167 kvm_getput_reg(&regs.r11, &env->regs[11], set);
2168 kvm_getput_reg(&regs.r12, &env->regs[12], set);
2169 kvm_getput_reg(&regs.r13, &env->regs[13], set);
2170 kvm_getput_reg(&regs.r14, &env->regs[14], set);
2171 kvm_getput_reg(&regs.r15, &env->regs[15], set);
2172 #endif
2173
2174 kvm_getput_reg(&regs.rflags, &env->eflags, set);
2175 kvm_getput_reg(&regs.rip, &env->eip, set);
2176
2177 if (set) {
2178 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_REGS, &regs);
2179 }
2180
2181 return ret;
2182 }
2183
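/*
 * Legacy FPU/SSE transfer, used when KVM_CAP_XSAVE is unavailable.
 * struct kvm_fpu follows the FXSAVE layout: the top-of-stack pointer lives
 * in bits 13..11 of FSW, and ftwx is the abbreviated tag word with one
 * "valid" bit per register (hence the inversion of env->fptags below).
 */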
2184 static int kvm_put_fpu(X86CPU *cpu)
2185 {
2186 CPUX86State *env = &cpu->env;
2187 struct kvm_fpu fpu;
2188 int i;
2189
2190 memset(&fpu, 0, sizeof fpu);
2191 fpu.fsw = env->fpus & ~(7 << 11);
2192 fpu.fsw |= (env->fpstt & 7) << 11;
2193 fpu.fcw = env->fpuc;
2194 fpu.last_opcode = env->fpop;
2195 fpu.last_ip = env->fpip;
2196 fpu.last_dp = env->fpdp;
2197 for (i = 0; i < 8; ++i) {
2198 fpu.ftwx |= (!env->fptags[i]) << i;
2199 }
2200 memcpy(fpu.fpr, env->fpregs, sizeof env->fpregs);
2201 for (i = 0; i < CPU_NB_REGS; i++) {
2202 stq_p(&fpu.xmm[i][0], env->xmm_regs[i].ZMM_Q(0));
2203 stq_p(&fpu.xmm[i][8], env->xmm_regs[i].ZMM_Q(1));
2204 }
2205 fpu.mxcsr = env->mxcsr;
2206
2207 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_FPU, &fpu);
2208 }
2209
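/*
 * Offsets into struct kvm_xsave, expressed in 32-bit words (the element
 * type of its region[] array). They follow the architectural XSAVE area
 * layout: the 512-byte legacy FXSAVE region, the XSAVE header at byte 512,
 * and the extended components (AVX, MPX, AVX-512, PKRU) after that. The
 * ASSERT_OFFSET() checks below tie them to QEMU's X86XSaveArea layout.
 */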
2210 #define XSAVE_FCW_FSW 0
2211 #define XSAVE_FTW_FOP 1
2212 #define XSAVE_CWD_RIP 2
2213 #define XSAVE_CWD_RDP 4
2214 #define XSAVE_MXCSR 6
2215 #define XSAVE_ST_SPACE 8
2216 #define XSAVE_XMM_SPACE 40
2217 #define XSAVE_XSTATE_BV 128
2218 #define XSAVE_YMMH_SPACE 144
2219 #define XSAVE_BNDREGS 240
2220 #define XSAVE_BNDCSR 256
2221 #define XSAVE_OPMASK 272
2222 #define XSAVE_ZMM_Hi256 288
2223 #define XSAVE_Hi16_ZMM 416
2224 #define XSAVE_PKRU 672
2225
2226 #define XSAVE_BYTE_OFFSET(word_offset) \
2227 ((word_offset) * sizeof_field(struct kvm_xsave, region[0]))
2228
2229 #define ASSERT_OFFSET(word_offset, field) \
2230 QEMU_BUILD_BUG_ON(XSAVE_BYTE_OFFSET(word_offset) != \
2231 offsetof(X86XSaveArea, field))
2232
2233 ASSERT_OFFSET(XSAVE_FCW_FSW, legacy.fcw);
2234 ASSERT_OFFSET(XSAVE_FTW_FOP, legacy.ftw);
2235 ASSERT_OFFSET(XSAVE_CWD_RIP, legacy.fpip);
2236 ASSERT_OFFSET(XSAVE_CWD_RDP, legacy.fpdp);
2237 ASSERT_OFFSET(XSAVE_MXCSR, legacy.mxcsr);
2238 ASSERT_OFFSET(XSAVE_ST_SPACE, legacy.fpregs);
2239 ASSERT_OFFSET(XSAVE_XMM_SPACE, legacy.xmm_regs);
2240 ASSERT_OFFSET(XSAVE_XSTATE_BV, header.xstate_bv);
2241 ASSERT_OFFSET(XSAVE_YMMH_SPACE, avx_state);
2242 ASSERT_OFFSET(XSAVE_BNDREGS, bndreg_state);
2243 ASSERT_OFFSET(XSAVE_BNDCSR, bndcsr_state);
2244 ASSERT_OFFSET(XSAVE_OPMASK, opmask_state);
2245 ASSERT_OFFSET(XSAVE_ZMM_Hi256, zmm_hi256_state);
2246 ASSERT_OFFSET(XSAVE_Hi16_ZMM, hi16_zmm_state);
2247 ASSERT_OFFSET(XSAVE_PKRU, pkru_state);
2248
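/*
 * Serialize all XSAVE state components from CPUX86State into the
 * preallocated xsave_buf and hand them to KVM in one KVM_SET_XSAVE call,
 * falling back to the legacy FPU interface when XSAVE is not supported.
 */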
2249 static int kvm_put_xsave(X86CPU *cpu)
2250 {
2251 CPUX86State *env = &cpu->env;
2252 X86XSaveArea *xsave = env->xsave_buf;
2253
2254 if (!has_xsave) {
2255 return kvm_put_fpu(cpu);
2256 }
2257 x86_cpu_xsave_all_areas(cpu, xsave);
2258
2259 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_XSAVE, xsave);
2260 }
2261
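/*
 * Only XCR0 is transferred: it is the only extended control register that
 * QEMU models.
 */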
2262 static int kvm_put_xcrs(X86CPU *cpu)
2263 {
2264 CPUX86State *env = &cpu->env;
2265 struct kvm_xcrs xcrs = {};
2266
2267 if (!has_xcrs) {
2268 return 0;
2269 }
2270
2271 xcrs.nr_xcrs = 1;
2272 xcrs.flags = 0;
2273 xcrs.xcrs[0].xcr = 0;
2274 xcrs.xcrs[0].value = env->xcr0;
2275 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_XCRS, &xcrs);
2276 }
2277
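/*
 * Push segment registers, descriptor tables, control registers, EFER and
 * the APIC base/TPR to KVM. If an external interrupt was accepted but not
 * yet delivered, it is re-encoded as a single bit in interrupt_bitmap so
 * that KVM will reinject it.
 */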
2278 static int kvm_put_sregs(X86CPU *cpu)
2279 {
2280 CPUX86State *env = &cpu->env;
2281 struct kvm_sregs sregs;
2282
2283 memset(sregs.interrupt_bitmap, 0, sizeof(sregs.interrupt_bitmap));
2284 if (env->interrupt_injected >= 0) {
2285 sregs.interrupt_bitmap[env->interrupt_injected / 64] |=
2286 (uint64_t)1 << (env->interrupt_injected % 64);
2287 }
2288
2289 if ((env->eflags & VM_MASK)) {
2290 set_v8086_seg(&sregs.cs, &env->segs[R_CS]);
2291 set_v8086_seg(&sregs.ds, &env->segs[R_DS]);
2292 set_v8086_seg(&sregs.es, &env->segs[R_ES]);
2293 set_v8086_seg(&sregs.fs, &env->segs[R_FS]);
2294 set_v8086_seg(&sregs.gs, &env->segs[R_GS]);
2295 set_v8086_seg(&sregs.ss, &env->segs[R_SS]);
2296 } else {
2297 set_seg(&sregs.cs, &env->segs[R_CS]);
2298 set_seg(&sregs.ds, &env->segs[R_DS]);
2299 set_seg(&sregs.es, &env->segs[R_ES]);
2300 set_seg(&sregs.fs, &env->segs[R_FS]);
2301 set_seg(&sregs.gs, &env->segs[R_GS]);
2302 set_seg(&sregs.ss, &env->segs[R_SS]);
2303 }
2304
2305 set_seg(&sregs.tr, &env->tr);
2306 set_seg(&sregs.ldt, &env->ldt);
2307
2308 sregs.idt.limit = env->idt.limit;
2309 sregs.idt.base = env->idt.base;
2310 memset(sregs.idt.padding, 0, sizeof sregs.idt.padding);
2311 sregs.gdt.limit = env->gdt.limit;
2312 sregs.gdt.base = env->gdt.base;
2313 memset(sregs.gdt.padding, 0, sizeof sregs.gdt.padding);
2314
2315 sregs.cr0 = env->cr[0];
2316 sregs.cr2 = env->cr[2];
2317 sregs.cr3 = env->cr[3];
2318 sregs.cr4 = env->cr[4];
2319
2320 sregs.cr8 = cpu_get_apic_tpr(cpu->apic_state);
2321 sregs.apic_base = cpu_get_apic_base(cpu->apic_state);
2322
2323 sregs.efer = env->efer;
2324
2325 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_SREGS, &sregs);
2326 }
2327
2328 static void kvm_msr_buf_reset(X86CPU *cpu)
2329 {
2330 memset(cpu->kvm_msr_buf, 0, MSR_BUF_SIZE);
2331 }
2332
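/*
 * Append one MSR to the vcpu's kvm_msr_buf, which is later handed to
 * KVM_SET_MSRS or KVM_GET_MSRS in a single ioctl. The assert catches an
 * overflow of the fixed-size buffer rather than silently dropping entries.
 */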
2333 static void kvm_msr_entry_add(X86CPU *cpu, uint32_t index, uint64_t value)
2334 {
2335 struct kvm_msrs *msrs = cpu->kvm_msr_buf;
2336 void *limit = ((void *)msrs) + MSR_BUF_SIZE;
2337 struct kvm_msr_entry *entry = &msrs->entries[msrs->nmsrs];
2338
2339 assert((void *)(entry + 1) <= limit);
2340
2341 entry->index = index;
2342 entry->reserved = 0;
2343 entry->data = value;
2344 msrs->nmsrs++;
2345 }
2346
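/*
 * Write a single MSR immediately. KVM_SET_MSRS returns the number of MSRs
 * processed, so a successful single write returns 1, which is what the
 * callers below assert. Illustrative use (mirrors kvm_put_tscdeadline_msr):
 *
 *     ret = kvm_put_one_msr(cpu, MSR_IA32_TSCDEADLINE, env->tsc_deadline);
 *     if (ret < 0) {
 *         return ret;
 *     }
 *     assert(ret == 1);
 */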
2347 static int kvm_put_one_msr(X86CPU *cpu, int index, uint64_t value)
2348 {
2349 kvm_msr_buf_reset(cpu);
2350 kvm_msr_entry_add(cpu, index, value);
2351
2352 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, cpu->kvm_msr_buf);
2353 }
2354
2355 void kvm_put_apicbase(X86CPU *cpu, uint64_t value)
2356 {
2357 int ret;
2358
2359 ret = kvm_put_one_msr(cpu, MSR_IA32_APICBASE, value);
2360 assert(ret == 1);
2361 }
2362
2363 static int kvm_put_tscdeadline_msr(X86CPU *cpu)
2364 {
2365 CPUX86State *env = &cpu->env;
2366 int ret;
2367
2368 if (!has_msr_tsc_deadline) {
2369 return 0;
2370 }
2371
2372 ret = kvm_put_one_msr(cpu, MSR_IA32_TSCDEADLINE, env->tsc_deadline);
2373 if (ret < 0) {
2374 return ret;
2375 }
2376
2377 assert(ret == 1);
2378 return 0;
2379 }
2380
2381 /*
2382 * Provide a separate write service for the feature control MSR in order to
2383 * kick the VCPU out of VMXON or even guest mode on reset. This has to be done
2384 * before writing any other state because forcibly leaving nested mode
2385 * invalidates the VCPU state.
2386 */
2387 static int kvm_put_msr_feature_control(X86CPU *cpu)
2388 {
2389 int ret;
2390
2391 if (!has_msr_feature_control) {
2392 return 0;
2393 }
2394
2395 ret = kvm_put_one_msr(cpu, MSR_IA32_FEATURE_CONTROL,
2396 cpu->env.msr_ia32_feature_control);
2397 if (ret < 0) {
2398 return ret;
2399 }
2400
2401 assert(ret == 1);
2402 return 0;
2403 }
2404
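/*
 * Build the full MSR list for this vcpu and push it with a single
 * KVM_SET_MSRS call. MSRs that are expensive to write or have guest-visible
 * side effects are only included when 'level' requests at least a
 * reset-level state update (see the level >= KVM_PUT_RESET_STATE block).
 */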
2405 static int kvm_put_msrs(X86CPU *cpu, int level)
2406 {
2407 CPUX86State *env = &cpu->env;
2408 int i;
2409 int ret;
2410
2411 kvm_msr_buf_reset(cpu);
2412
2413 kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_CS, env->sysenter_cs);
2414 kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_ESP, env->sysenter_esp);
2415 kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_EIP, env->sysenter_eip);
2416 kvm_msr_entry_add(cpu, MSR_PAT, env->pat);
2417 if (has_msr_star) {
2418 kvm_msr_entry_add(cpu, MSR_STAR, env->star);
2419 }
2420 if (has_msr_hsave_pa) {
2421 kvm_msr_entry_add(cpu, MSR_VM_HSAVE_PA, env->vm_hsave);
2422 }
2423 if (has_msr_tsc_aux) {
2424 kvm_msr_entry_add(cpu, MSR_TSC_AUX, env->tsc_aux);
2425 }
2426 if (has_msr_tsc_adjust) {
2427 kvm_msr_entry_add(cpu, MSR_TSC_ADJUST, env->tsc_adjust);
2428 }
2429 if (has_msr_misc_enable) {
2430 kvm_msr_entry_add(cpu, MSR_IA32_MISC_ENABLE,
2431 env->msr_ia32_misc_enable);
2432 }
2433 if (has_msr_smbase) {
2434 kvm_msr_entry_add(cpu, MSR_IA32_SMBASE, env->smbase);
2435 }
2436 if (has_msr_smi_count) {
2437 kvm_msr_entry_add(cpu, MSR_SMI_COUNT, env->msr_smi_count);
2438 }
2439 if (has_msr_bndcfgs) {
2440 kvm_msr_entry_add(cpu, MSR_IA32_BNDCFGS, env->msr_bndcfgs);
2441 }
2442 if (has_msr_xss) {
2443 kvm_msr_entry_add(cpu, MSR_IA32_XSS, env->xss);
2444 }
2445 if (has_msr_spec_ctrl) {
2446 kvm_msr_entry_add(cpu, MSR_IA32_SPEC_CTRL, env->spec_ctrl);
2447 }
2448 if (has_msr_virt_ssbd) {
2449 kvm_msr_entry_add(cpu, MSR_VIRT_SSBD, env->virt_ssbd);
2450 }
2451
2452 #ifdef TARGET_X86_64
2453 if (lm_capable_kernel) {
2454 kvm_msr_entry_add(cpu, MSR_CSTAR, env->cstar);
2455 kvm_msr_entry_add(cpu, MSR_KERNELGSBASE, env->kernelgsbase);
2456 kvm_msr_entry_add(cpu, MSR_FMASK, env->fmask);
2457 kvm_msr_entry_add(cpu, MSR_LSTAR, env->lstar);
2458 }
2459 #endif
2460
2461 /* If the host supports the feature MSRs, write them down. */
2462 if (has_msr_arch_capabs) {
2463 kvm_msr_entry_add(cpu, MSR_IA32_ARCH_CAPABILITIES,
2464 env->features[FEAT_ARCH_CAPABILITIES]);
2465 }
2466
2467 if (has_msr_core_capabs) {
2468 kvm_msr_entry_add(cpu, MSR_IA32_CORE_CAPABILITY,
2469 env->features[FEAT_CORE_CAPABILITY]);
2470 }
2471
2472 /*
2473 * The following MSRs have side effects on the guest or are too heavy
2474 * for normal writeback. Limit them to reset or full state updates.
2475 */
2476 if (level >= KVM_PUT_RESET_STATE) {
2477 kvm_msr_entry_add(cpu, MSR_IA32_TSC, env->tsc);
2478 kvm_msr_entry_add(cpu, MSR_KVM_SYSTEM_TIME, env->system_time_msr);
2479 kvm_msr_entry_add(cpu, MSR_KVM_WALL_CLOCK, env->wall_clock_msr);
2480 if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF)) {
2481 kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_EN, env->async_pf_en_msr);
2482 }
2483 if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_PV_EOI)) {
2484 kvm_msr_entry_add(cpu, MSR_KVM_PV_EOI_EN, env->pv_eoi_en_msr);
2485 }
2486 if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_STEAL_TIME)) {
2487 kvm_msr_entry_add(cpu, MSR_KVM_STEAL_TIME, env->steal_time_msr);
2488 }
2489 if (has_architectural_pmu_version > 0) {
2490 if (has_architectural_pmu_version > 1) {
2491 /* Stop the counters. */
2492 kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
2493 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL, 0);
2494 }
2495
2496 /* Set the counter values. */
2497 for (i = 0; i < num_architectural_pmu_fixed_counters; i++) {
2498 kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR0 + i,
2499 env->msr_fixed_counters[i]);
2500 }
2501 for (i = 0; i < num_architectural_pmu_gp_counters; i++) {
2502 kvm_msr_entry_add(cpu, MSR_P6_PERFCTR0 + i,
2503 env->msr_gp_counters[i]);
2504 kvm_msr_entry_add(cpu, MSR_P6_EVNTSEL0 + i,
2505 env->msr_gp_evtsel[i]);
2506 }
2507 if (has_architectural_pmu_version > 1) {
2508 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_STATUS,
2509 env->msr_global_status);
2510 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
2511 env->msr_global_ovf_ctrl);
2512
2513 /* Now start the PMU. */
2514 kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL,
2515 env->msr_fixed_ctr_ctrl);
2516 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL,
2517 env->msr_global_ctrl);
2518 }
2519 }
2520 /*
2521 * Hyper-V partition-wide MSRs: to avoid clearing them on cpu hot-add,
2522 * only sync them to KVM on the first cpu
2523 */
2524 if (current_cpu == first_cpu) {
2525 if (has_msr_hv_hypercall) {
2526 kvm_msr_entry_add(cpu, HV_X64_MSR_GUEST_OS_ID,
2527 env->msr_hv_guest_os_id);
2528 kvm_msr_entry_add(cpu, HV_X64_MSR_HYPERCALL,
2529 env->msr_hv_hypercall);
2530 }
2531 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_TIME)) {
2532 kvm_msr_entry_add(cpu, HV_X64_MSR_REFERENCE_TSC,
2533 env->msr_hv_tsc);
2534 }
2535 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_REENLIGHTENMENT)) {
2536 kvm_msr_entry_add(cpu, HV_X64_MSR_REENLIGHTENMENT_CONTROL,
2537 env->msr_hv_reenlightenment_control);
2538 kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_CONTROL,
2539 env->msr_hv_tsc_emulation_control);
2540 kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_STATUS,
2541 env->msr_hv_tsc_emulation_status);
2542 }
2543 }
2544 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC)) {
2545 kvm_msr_entry_add(cpu, HV_X64_MSR_APIC_ASSIST_PAGE,
2546 env->msr_hv_vapic);
2547 }
2548 if (has_msr_hv_crash) {
2549 int j;
2550
2551 for (j = 0; j < HV_CRASH_PARAMS; j++)
2552 kvm_msr_entry_add(cpu, HV_X64_MSR_CRASH_P0 + j,
2553 env->msr_hv_crash_params[j]);
2554
2555 kvm_msr_entry_add(cpu, HV_X64_MSR_CRASH_CTL, HV_CRASH_CTL_NOTIFY);
2556 }
2557 if (has_msr_hv_runtime) {
2558 kvm_msr_entry_add(cpu, HV_X64_MSR_VP_RUNTIME, env->msr_hv_runtime);
2559 }
2560 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VPINDEX)
2561 && hv_vpindex_settable) {
2562 kvm_msr_entry_add(cpu, HV_X64_MSR_VP_INDEX,
2563 hyperv_vp_index(CPU(cpu)));
2564 }
2565 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) {
2566 int j;
2567
2568 kvm_msr_entry_add(cpu, HV_X64_MSR_SVERSION, HV_SYNIC_VERSION);
2569
2570 kvm_msr_entry_add(cpu, HV_X64_MSR_SCONTROL,
2571 env->msr_hv_synic_control);
2572 kvm_msr_entry_add(cpu, HV_X64_MSR_SIEFP,
2573 env->msr_hv_synic_evt_page);
2574 kvm_msr_entry_add(cpu, HV_X64_MSR_SIMP,
2575 env->msr_hv_synic_msg_page);
2576
2577 for (j = 0; j < ARRAY_SIZE(env->msr_hv_synic_sint); j++) {
2578 kvm_msr_entry_add(cpu, HV_X64_MSR_SINT0 + j,
2579 env->msr_hv_synic_sint[j]);
2580 }
2581 }
2582 if (has_msr_hv_stimer) {
2583 int j;
2584
2585 for (j = 0; j < ARRAY_SIZE(env->msr_hv_stimer_config); j++) {
2586 kvm_msr_entry_add(cpu, HV_X64_MSR_STIMER0_CONFIG + j * 2,
2587 env->msr_hv_stimer_config[j]);
2588 }
2589
2590 for (j = 0; j < ARRAY_SIZE(env->msr_hv_stimer_count); j++) {
2591 kvm_msr_entry_add(cpu, HV_X64_MSR_STIMER0_COUNT + j * 2,
2592 env->msr_hv_stimer_count[j]);
2593 }
2594 }
2595 if (env->features[FEAT_1_EDX] & CPUID_MTRR) {
2596 uint64_t phys_mask = MAKE_64BIT_MASK(0, cpu->phys_bits);
2597
2598 kvm_msr_entry_add(cpu, MSR_MTRRdefType, env->mtrr_deftype);
2599 kvm_msr_entry_add(cpu, MSR_MTRRfix64K_00000, env->mtrr_fixed[0]);
2600 kvm_msr_entry_add(cpu, MSR_MTRRfix16K_80000, env->mtrr_fixed[1]);
2601 kvm_msr_entry_add(cpu, MSR_MTRRfix16K_A0000, env->mtrr_fixed[2]);
2602 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C0000, env->mtrr_fixed[3]);
2603 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C8000, env->mtrr_fixed[4]);
2604 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D0000, env->mtrr_fixed[5]);
2605 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D8000, env->mtrr_fixed[6]);
2606 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E0000, env->mtrr_fixed[7]);
2607 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E8000, env->mtrr_fixed[8]);
2608 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F0000, env->mtrr_fixed[9]);
2609 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F8000, env->mtrr_fixed[10]);
2610 for (i = 0; i < MSR_MTRRcap_VCNT; i++) {
2611 /* The CPU GPs if we write to a bit above the physical limit of
2612 * the host CPU (and KVM emulates that)
2613 */
2614 uint64_t mask = env->mtrr_var[i].mask;
2615 mask &= phys_mask;
2616
2617 kvm_msr_entry_add(cpu, MSR_MTRRphysBase(i),
2618 env->mtrr_var[i].base);
2619 kvm_msr_entry_add(cpu, MSR_MTRRphysMask(i), mask);
2620 }
2621 }
2622 if (env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_INTEL_PT) {
2623 int addr_num = kvm_arch_get_supported_cpuid(kvm_state,
2624 0x14, 1, R_EAX) & 0x7;
2625
2626 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CTL,
2627 env->msr_rtit_ctrl);
2628 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_STATUS,
2629 env->msr_rtit_status);
2630 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_BASE,
2631 env->msr_rtit_output_base);
2632 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_MASK,
2633 env->msr_rtit_output_mask);
2634 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CR3_MATCH,
2635 env->msr_rtit_cr3_match);
2636 for (i = 0; i < addr_num; i++) {
2637 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_ADDR0_A + i,
2638 env->msr_rtit_addrs[i]);
2639 }
2640 }
2641
2642 /* Note: MSR_IA32_FEATURE_CONTROL is written separately, see
2643 * kvm_put_msr_feature_control. */
2644 }
2645 if (env->mcg_cap) {
2646 int i;
2647
2648 kvm_msr_entry_add(cpu, MSR_MCG_STATUS, env->mcg_status);
2649 kvm_msr_entry_add(cpu, MSR_MCG_CTL, env->mcg_ctl);
2650 if (has_msr_mcg_ext_ctl) {
2651 kvm_msr_entry_add(cpu, MSR_MCG_EXT_CTL, env->mcg_ext_ctl);
2652 }
2653 for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++) {
2654 kvm_msr_entry_add(cpu, MSR_MC0_CTL + i, env->mce_banks[i]);
2655 }
2656 }
2657
2658 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MSRS, cpu->kvm_msr_buf);
2659 if (ret < 0) {
2660 return ret;
2661 }
2662
2663 if (ret < cpu->kvm_msr_buf->nmsrs) {
2664 struct kvm_msr_entry *e = &cpu->kvm_msr_buf->entries[ret];
2665 error_report("error: failed to set MSR 0x%" PRIx32 " to 0x%" PRIx64,
2666 (uint32_t)e->index, (uint64_t)e->data);
2667 }
2668
2669 assert(ret == cpu->kvm_msr_buf->nmsrs);
2670 return 0;
2671 }
2672
2673
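/*
 * Legacy counterpart of kvm_put_fpu(): read the FXSAVE-style FPU/SSE state
 * back from KVM when XSAVE is not available.
 */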
2674 static int kvm_get_fpu(X86CPU *cpu)
2675 {
2676 CPUX86State *env = &cpu->env;
2677 struct kvm_fpu fpu;
2678 int i, ret;
2679
2680 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_FPU, &fpu);
2681 if (ret < 0) {
2682 return ret;
2683 }
2684
2685 env->fpstt = (fpu.fsw >> 11) & 7;
2686 env->fpus = fpu.fsw;
2687 env->fpuc = fpu.fcw;
2688 env->fpop = fpu.last_opcode;
2689 env->fpip = fpu.last_ip;
2690 env->fpdp = fpu.last_dp;
2691 for (i = 0; i < 8; ++i) {
2692 env->fptags[i] = !((fpu.ftwx >> i) & 1);
2693 }
2694 memcpy(env->fpregs, fpu.fpr, sizeof env->fpregs);
2695 for (i = 0; i < CPU_NB_REGS; i++) {
2696 env->xmm_regs[i].ZMM_Q(0) = ldq_p(&fpu.xmm[i][0]);
2697 env->xmm_regs[i].ZMM_Q(1) = ldq_p(&fpu.xmm[i][8]);
2698 }
2699 env->mxcsr = fpu.mxcsr;
2700
2701 return 0;
2702 }
2703
2704 static int kvm_get_xsave(X86CPU *cpu)
2705 {
2706 CPUX86State *env = &cpu->env;
2707 X86XSaveArea *xsave = env->xsave_buf;
2708 int ret;
2709
2710 if (!has_xsave) {
2711 return kvm_get_fpu(cpu);
2712 }
2713
2714 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_XSAVE, xsave);
2715 if (ret < 0) {
2716 return ret;
2717 }
2718 x86_cpu_xrstor_all_areas(cpu, xsave);
2719
2720 return 0;
2721 }
2722
2723 static int kvm_get_xcrs(X86CPU *cpu)
2724 {
2725 CPUX86State *env = &cpu->env;
2726 int i, ret;
2727 struct kvm_xcrs xcrs;
2728
2729 if (!has_xcrs) {
2730 return 0;
2731 }
2732
2733 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_XCRS, &xcrs);
2734 if (ret < 0) {
2735 return ret;
2736 }
2737
2738 for (i = 0; i < xcrs.nr_xcrs; i++) {
2739 /* Only XCR0 is supported for now. */
2740 if (xcrs.xcrs[i].xcr == 0) {
2741 env->xcr0 = xcrs.xcrs[i].value;
2742 break;
2743 }
2744 }
2745 return 0;
2746 }
2747
2748 static int kvm_get_sregs(X86CPU *cpu)
2749 {
2750 CPUX86State *env = &cpu->env;
2751 struct kvm_sregs sregs;
2752 int bit, i, ret;
2753
2754 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS, &sregs);
2755 if (ret < 0) {
2756 return ret;
2757 }
2758
2759 /* There can only be one pending IRQ set in the bitmap at a time, so try
2760 to find it and save its number instead (-1 for none). */
2761 env->interrupt_injected = -1;
2762 for (i = 0; i < ARRAY_SIZE(sregs.interrupt_bitmap); i++) {
2763 if (sregs.interrupt_bitmap[i]) {
2764 bit = ctz64(sregs.interrupt_bitmap[i]);
2765 env->interrupt_injected = i * 64 + bit;
2766 break;
2767 }
2768 }
2769
2770 get_seg(&env->segs[R_CS], &sregs.cs);
2771 get_seg(&env->segs[R_DS], &sregs.ds);
2772 get_seg(&env->segs[R_ES], &sregs.es);
2773 get_seg(&env->segs[R_FS], &sregs.fs);
2774 get_seg(&env->segs[R_GS], &sregs.gs);
2775 get_seg(&env->segs[R_SS], &sregs.ss);
2776
2777 get_seg(&env->tr, &sregs.tr);
2778 get_seg(&env->ldt, &sregs.ldt);
2779
2780 env->idt.limit = sregs.idt.limit;
2781 env->idt.base = sregs.idt.base;
2782 env->gdt.limit = sregs.gdt.limit;
2783 env->gdt.base = sregs.gdt.base;
2784
2785 env->cr[0] = sregs.cr0;
2786 env->cr[2] = sregs.cr2;
2787 env->cr[3] = sregs.cr3;
2788 env->cr[4] = sregs.cr4;
2789
2790 env->efer = sregs.efer;
2791
2792 /* changes to apic base and cr8/tpr are read back via kvm_arch_post_run */
2793 x86_update_hflags(env);
2794
2795 return 0;
2796 }
2797
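/*
 * Read the MSR state back from KVM: the buffer is filled with every MSR
 * index QEMU tracks (values zeroed), fetched with one KVM_GET_MSRS call,
 * and then scattered back into CPUX86State by the switch below. MTRR masks
 * additionally get their reserved top bits filled in when
 * cpu->fill_mtrr_mask is set, so that migration between hosts with
 * different physical address widths stays consistent.
 */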
2798 static int kvm_get_msrs(X86CPU *cpu)
2799 {
2800 CPUX86State *env = &cpu->env;
2801 struct kvm_msr_entry *msrs = cpu->kvm_msr_buf->entries;
2802 int ret, i;
2803 uint64_t mtrr_top_bits;
2804
2805 kvm_msr_buf_reset(cpu);
2806
2807 kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_CS, 0);
2808 kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_ESP, 0);
2809 kvm_msr_entry_add(cpu, MSR_IA32_SYSENTER_EIP, 0);
2810 kvm_msr_entry_add(cpu, MSR_PAT, 0);
2811 if (has_msr_star) {
2812 kvm_msr_entry_add(cpu, MSR_STAR, 0);
2813 }
2814 if (has_msr_hsave_pa) {
2815 kvm_msr_entry_add(cpu, MSR_VM_HSAVE_PA, 0);
2816 }
2817 if (has_msr_tsc_aux) {
2818 kvm_msr_entry_add(cpu, MSR_TSC_AUX, 0);
2819 }
2820 if (has_msr_tsc_adjust) {
2821 kvm_msr_entry_add(cpu, MSR_TSC_ADJUST, 0);
2822 }
2823 if (has_msr_tsc_deadline) {
2824 kvm_msr_entry_add(cpu, MSR_IA32_TSCDEADLINE, 0);
2825 }
2826 if (has_msr_misc_enable) {
2827 kvm_msr_entry_add(cpu, MSR_IA32_MISC_ENABLE, 0);
2828 }
2829 if (has_msr_smbase) {
2830 kvm_msr_entry_add(cpu, MSR_IA32_SMBASE, 0);
2831 }
2832 if (has_msr_smi_count) {
2833 kvm_msr_entry_add(cpu, MSR_SMI_COUNT, 0);
2834 }
2835 if (has_msr_feature_control) {
2836 kvm_msr_entry_add(cpu, MSR_IA32_FEATURE_CONTROL, 0);
2837 }
2838 if (has_msr_bndcfgs) {
2839 kvm_msr_entry_add(cpu, MSR_IA32_BNDCFGS, 0);
2840 }
2841 if (has_msr_xss) {
2842 kvm_msr_entry_add(cpu, MSR_IA32_XSS, 0);
2843 }
2844 if (has_msr_spec_ctrl) {
2845 kvm_msr_entry_add(cpu, MSR_IA32_SPEC_CTRL, 0);
2846 }
2847 if (has_msr_virt_ssbd) {
2848 kvm_msr_entry_add(cpu, MSR_VIRT_SSBD, 0);
2849 }
2850 if (!env->tsc_valid) {
2851 kvm_msr_entry_add(cpu, MSR_IA32_TSC, 0);
2852 env->tsc_valid = !runstate_is_running();
2853 }
2854
2855 #ifdef TARGET_X86_64
2856 if (lm_capable_kernel) {
2857 kvm_msr_entry_add(cpu, MSR_CSTAR, 0);
2858 kvm_msr_entry_add(cpu, MSR_KERNELGSBASE, 0);
2859 kvm_msr_entry_add(cpu, MSR_FMASK, 0);
2860 kvm_msr_entry_add(cpu, MSR_LSTAR, 0);
2861 }
2862 #endif
2863 kvm_msr_entry_add(cpu, MSR_KVM_SYSTEM_TIME, 0);
2864 kvm_msr_entry_add(cpu, MSR_KVM_WALL_CLOCK, 0);
2865 if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_ASYNC_PF)) {
2866 kvm_msr_entry_add(cpu, MSR_KVM_ASYNC_PF_EN, 0);
2867 }
2868 if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_PV_EOI)) {
2869 kvm_msr_entry_add(cpu, MSR_KVM_PV_EOI_EN, 0);
2870 }
2871 if (env->features[FEAT_KVM] & (1 << KVM_FEATURE_STEAL_TIME)) {
2872 kvm_msr_entry_add(cpu, MSR_KVM_STEAL_TIME, 0);
2873 }
2874 if (has_architectural_pmu_version > 0) {
2875 if (has_architectural_pmu_version > 1) {
2876 kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR_CTRL, 0);
2877 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_CTRL, 0);
2878 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_STATUS, 0);
2879 kvm_msr_entry_add(cpu, MSR_CORE_PERF_GLOBAL_OVF_CTRL, 0);
2880 }
2881 for (i = 0; i < num_architectural_pmu_fixed_counters; i++) {
2882 kvm_msr_entry_add(cpu, MSR_CORE_PERF_FIXED_CTR0 + i, 0);
2883 }
2884 for (i = 0; i < num_architectural_pmu_gp_counters; i++) {
2885 kvm_msr_entry_add(cpu, MSR_P6_PERFCTR0 + i, 0);
2886 kvm_msr_entry_add(cpu, MSR_P6_EVNTSEL0 + i, 0);
2887 }
2888 }
2889
2890 if (env->mcg_cap) {
2891 kvm_msr_entry_add(cpu, MSR_MCG_STATUS, 0);
2892 kvm_msr_entry_add(cpu, MSR_MCG_CTL, 0);
2893 if (has_msr_mcg_ext_ctl) {
2894 kvm_msr_entry_add(cpu, MSR_MCG_EXT_CTL, 0);
2895 }
2896 for (i = 0; i < (env->mcg_cap & 0xff) * 4; i++) {
2897 kvm_msr_entry_add(cpu, MSR_MC0_CTL + i, 0);
2898 }
2899 }
2900
2901 if (has_msr_hv_hypercall) {
2902 kvm_msr_entry_add(cpu, HV_X64_MSR_HYPERCALL, 0);
2903 kvm_msr_entry_add(cpu, HV_X64_MSR_GUEST_OS_ID, 0);
2904 }
2905 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_VAPIC)) {
2906 kvm_msr_entry_add(cpu, HV_X64_MSR_APIC_ASSIST_PAGE, 0);
2907 }
2908 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_TIME)) {
2909 kvm_msr_entry_add(cpu, HV_X64_MSR_REFERENCE_TSC, 0);
2910 }
2911 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_REENLIGHTENMENT)) {
2912 kvm_msr_entry_add(cpu, HV_X64_MSR_REENLIGHTENMENT_CONTROL, 0);
2913 kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_CONTROL, 0);
2914 kvm_msr_entry_add(cpu, HV_X64_MSR_TSC_EMULATION_STATUS, 0);
2915 }
2916 if (has_msr_hv_crash) {
2917 int j;
2918
2919 for (j = 0; j < HV_CRASH_PARAMS; j++) {
2920 kvm_msr_entry_add(cpu, HV_X64_MSR_CRASH_P0 + j, 0);
2921 }
2922 }
2923 if (has_msr_hv_runtime) {
2924 kvm_msr_entry_add(cpu, HV_X64_MSR_VP_RUNTIME, 0);
2925 }
2926 if (hyperv_feat_enabled(cpu, HYPERV_FEAT_SYNIC)) {
2927 uint32_t msr;
2928
2929 kvm_msr_entry_add(cpu, HV_X64_MSR_SCONTROL, 0);
2930 kvm_msr_entry_add(cpu, HV_X64_MSR_SIEFP, 0);
2931 kvm_msr_entry_add(cpu, HV_X64_MSR_SIMP, 0);
2932 for (msr = HV_X64_MSR_SINT0; msr <= HV_X64_MSR_SINT15; msr++) {
2933 kvm_msr_entry_add(cpu, msr, 0);
2934 }
2935 }
2936 if (has_msr_hv_stimer) {
2937 uint32_t msr;
2938
2939 for (msr = HV_X64_MSR_STIMER0_CONFIG; msr <= HV_X64_MSR_STIMER3_COUNT;
2940 msr++) {
2941 kvm_msr_entry_add(cpu, msr, 0);
2942 }
2943 }
2944 if (env->features[FEAT_1_EDX] & CPUID_MTRR) {
2945 kvm_msr_entry_add(cpu, MSR_MTRRdefType, 0);
2946 kvm_msr_entry_add(cpu, MSR_MTRRfix64K_00000, 0);
2947 kvm_msr_entry_add(cpu, MSR_MTRRfix16K_80000, 0);
2948 kvm_msr_entry_add(cpu, MSR_MTRRfix16K_A0000, 0);
2949 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C0000, 0);
2950 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_C8000, 0);
2951 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D0000, 0);
2952 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_D8000, 0);
2953 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E0000, 0);
2954 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_E8000, 0);
2955 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F0000, 0);
2956 kvm_msr_entry_add(cpu, MSR_MTRRfix4K_F8000, 0);
2957 for (i = 0; i < MSR_MTRRcap_VCNT; i++) {
2958 kvm_msr_entry_add(cpu, MSR_MTRRphysBase(i), 0);
2959 kvm_msr_entry_add(cpu, MSR_MTRRphysMask(i), 0);
2960 }
2961 }
2962
2963 if (env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_INTEL_PT) {
2964 int addr_num =
2965 kvm_arch_get_supported_cpuid(kvm_state, 0x14, 1, R_EAX) & 0x7;
2966
2967 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CTL, 0);
2968 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_STATUS, 0);
2969 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_BASE, 0);
2970 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_OUTPUT_MASK, 0);
2971 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_CR3_MATCH, 0);
2972 for (i = 0; i < addr_num; i++) {
2973 kvm_msr_entry_add(cpu, MSR_IA32_RTIT_ADDR0_A + i, 0);
2974 }
2975 }
2976
2977 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, cpu->kvm_msr_buf);
2978 if (ret < 0) {
2979 return ret;
2980 }
2981
2982 if (ret < cpu->kvm_msr_buf->nmsrs) {
2983 struct kvm_msr_entry *e = &cpu->kvm_msr_buf->entries[ret];
2984 error_report("error: failed to get MSR 0x%" PRIx32,
2985 (uint32_t)e->index);
2986 }
2987
2988 assert(ret == cpu->kvm_msr_buf->nmsrs);
2989 /*
2990 * MTRR masks: Each mask consists of 5 parts
2991 * a 10..0 : must be zero
2992 * b 11 : valid bit
2993 * c n-1..12: actual mask bits
2994 * d 51..n : reserved, must be zero
2995 * e 63..52 : reserved, must be zero
2996 *
2997 * 'n' is the number of physical bits supported by the CPU and is
2998 * apparently always <= 52. We know our 'n' but don't know what
2999 * the destination's 'n' is; it might be smaller, in which case
3000 * it masks (c) on loading. It might be larger, in which case
3001 * we fill 'd' so that d..c is consistent irrespective of the 'n'
3002 * we're migrating to.
3003 */
3004
3005 if (cpu->fill_mtrr_mask) {
3006 QEMU_BUILD_BUG_ON(TARGET_PHYS_ADDR_SPACE_BITS > 52);
3007 assert(cpu->phys_bits <= TARGET_PHYS_ADDR_SPACE_BITS);
3008 mtrr_top_bits = MAKE_64BIT_MASK(cpu->phys_bits, 52 - cpu->phys_bits);
3009 } else {
3010 mtrr_top_bits = 0;
3011 }
3012
3013 for (i = 0; i < ret; i++) {
3014 uint32_t index = msrs[i].index;
3015 switch (index) {
3016 case MSR_IA32_SYSENTER_CS:
3017 env->sysenter_cs = msrs[i].data;
3018 break;
3019 case MSR_IA32_SYSENTER_ESP:
3020 env->sysenter_esp = msrs[i].data;
3021 break;
3022 case MSR_IA32_SYSENTER_EIP:
3023 env->sysenter_eip = msrs[i].data;
3024 break;
3025 case MSR_PAT:
3026 env->pat = msrs[i].data;
3027 break;
3028 case MSR_STAR:
3029 env->star = msrs[i].data;
3030 break;
3031 #ifdef TARGET_X86_64
3032 case MSR_CSTAR:
3033 env->cstar = msrs[i].data;
3034 break;
3035 case MSR_KERNELGSBASE:
3036 env->kernelgsbase = msrs[i].data;
3037 break;
3038 case MSR_FMASK:
3039 env->fmask = msrs[i].data;
3040 break;
3041 case MSR_LSTAR:
3042 env->lstar = msrs[i].data;
3043 break;
3044 #endif
3045 case MSR_IA32_TSC:
3046 env->tsc = msrs[i].data;
3047 break;
3048 case MSR_TSC_AUX:
3049 env->tsc_aux = msrs[i].data;
3050 break;
3051 case MSR_TSC_ADJUST:
3052 env->tsc_adjust = msrs[i].data;
3053 break;
3054 case MSR_IA32_TSCDEADLINE:
3055 env->tsc_deadline = msrs[i].data;
3056 break;
3057 case MSR_VM_HSAVE_PA:
3058 env->vm_hsave = msrs[i].data;
3059 break;
3060 case MSR_KVM_SYSTEM_TIME:
3061 env->system_time_msr = msrs[i].data;
3062 break;
3063 case MSR_KVM_WALL_CLOCK:
3064 env->wall_clock_msr = msrs[i].data;
3065 break;
3066 case MSR_MCG_STATUS:
3067 env->mcg_status = msrs[i].data;
3068 break;
3069 case MSR_MCG_CTL:
3070 env->mcg_ctl = msrs[i].data;
3071 break;
3072 case MSR_MCG_EXT_CTL:
3073 env->mcg_ext_ctl = msrs[i].data;
3074 break;
3075 case MSR_IA32_MISC_ENABLE:
3076 env->msr_ia32_misc_enable = msrs[i].data;
3077 break;
3078 case MSR_IA32_SMBASE:
3079 env->smbase = msrs[i].data;
3080 break;
3081 case MSR_SMI_COUNT:
3082 env->msr_smi_count = msrs[i].data;
3083 break;
3084 case MSR_IA32_FEATURE_CONTROL:
3085 env->msr_ia32_feature_control = msrs[i].data;
3086 break;
3087 case MSR_IA32_BNDCFGS:
3088 env->msr_bndcfgs = msrs[i].data;
3089 break;
3090 case MSR_IA32_XSS:
3091 env->xss = msrs[i].data;
3092 break;
3093 default:
3094 if (msrs[i].index >= MSR_MC0_CTL &&
3095 msrs[i].index < MSR_MC0_CTL + (env->mcg_cap & 0xff) * 4) {
3096 env->mce_banks[msrs[i].index - MSR_MC0_CTL] = msrs[i].data;
3097 }
3098 break;
3099 case MSR_KVM_ASYNC_PF_EN:
3100 env->async_pf_en_msr = msrs[i].data;
3101 break;
3102 case MSR_KVM_PV_EOI_EN:
3103 env->pv_eoi_en_msr = msrs[i].data;
3104 break;
3105 case MSR_KVM_STEAL_TIME:
3106 env->steal_time_msr = msrs[i].data;
3107 break;
3108 case MSR_CORE_PERF_FIXED_CTR_CTRL:
3109 env->msr_fixed_ctr_ctrl = msrs[i].data;
3110 break;
3111 case MSR_CORE_PERF_GLOBAL_CTRL:
3112 env->msr_global_ctrl = msrs[i].data;
3113 break;
3114 case MSR_CORE_PERF_GLOBAL_STATUS:
3115 env->msr_global_status = msrs[i].data;
3116 break;
3117 case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
3118 env->msr_global_ovf_ctrl = msrs[i].data;
3119 break;
3120 case MSR_CORE_PERF_FIXED_CTR0 ... MSR_CORE_PERF_FIXED_CTR0 + MAX_FIXED_COUNTERS - 1:
3121 env->msr_fixed_counters[index - MSR_CORE_PERF_FIXED_CTR0] = msrs[i].data;
3122 break;
3123 case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR0 + MAX_GP_COUNTERS - 1:
3124 env->msr_gp_counters[index - MSR_P6_PERFCTR0] = msrs[i].data;
3125 break;
3126 case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL0 + MAX_GP_COUNTERS - 1:
3127 env->msr_gp_evtsel[index - MSR_P6_EVNTSEL0] = msrs[i].data;
3128 break;
3129 case HV_X64_MSR_HYPERCALL:
3130 env->msr_hv_hypercall = msrs[i].data;
3131 break;
3132 case HV_X64_MSR_GUEST_OS_ID:
3133 env->msr_hv_guest_os_id = msrs[i].data;
3134 break;
3135 case HV_X64_MSR_APIC_ASSIST_PAGE:
3136 env->msr_hv_vapic = msrs[i].data;
3137 break;
3138 case HV_X64_MSR_REFERENCE_TSC:
3139 env->msr_hv_tsc = msrs[i].data;
3140 break;
3141 case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
3142 env->msr_hv_crash_params[index - HV_X64_MSR_CRASH_P0] = msrs[i].data;
3143 break;
3144 case HV_X64_MSR_VP_RUNTIME:
3145 env->msr_hv_runtime = msrs[i].data;
3146 break;
3147 case HV_X64_MSR_SCONTROL:
3148 env->msr_hv_synic_control = msrs[i].data;
3149 break;
3150 case HV_X64_MSR_SIEFP:
3151 env->msr_hv_synic_evt_page = msrs[i].data;
3152 break;
3153 case HV_X64_MSR_SIMP:
3154 env->msr_hv_synic_msg_page = msrs[i].data;
3155 break;
3156 case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
3157 env->msr_hv_synic_sint[index - HV_X64_MSR_SINT0] = msrs[i].data;
3158 break;
3159 case HV_X64_MSR_STIMER0_CONFIG:
3160 case HV_X64_MSR_STIMER1_CONFIG:
3161 case HV_X64_MSR_STIMER2_CONFIG:
3162 case HV_X64_MSR_STIMER3_CONFIG:
3163 env->msr_hv_stimer_config[(index - HV_X64_MSR_STIMER0_CONFIG)/2] =
3164 msrs[i].data;
3165 break;
3166 case HV_X64_MSR_STIMER0_COUNT:
3167 case HV_X64_MSR_STIMER1_COUNT:
3168 case HV_X64_MSR_STIMER2_COUNT:
3169 case HV_X64_MSR_STIMER3_COUNT:
3170 env->msr_hv_stimer_count[(index - HV_X64_MSR_STIMER0_COUNT)/2] =
3171 msrs[i].data;
3172 break;
3173 case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
3174 env->msr_hv_reenlightenment_control = msrs[i].data;
3175 break;
3176 case HV_X64_MSR_TSC_EMULATION_CONTROL:
3177 env->msr_hv_tsc_emulation_control = msrs[i].data;
3178 break;
3179 case HV_X64_MSR_TSC_EMULATION_STATUS:
3180 env->msr_hv_tsc_emulation_status = msrs[i].data;
3181 break;
3182 case MSR_MTRRdefType:
3183 env->mtrr_deftype = msrs[i].data;
3184 break;
3185 case MSR_MTRRfix64K_00000:
3186 env->mtrr_fixed[0] = msrs[i].data;
3187 break;
3188 case MSR_MTRRfix16K_80000:
3189 env->mtrr_fixed[1] = msrs[i].data;
3190 break;
3191 case MSR_MTRRfix16K_A0000:
3192 env->mtrr_fixed[2] = msrs[i].data;
3193 break;
3194 case MSR_MTRRfix4K_C0000:
3195 env->mtrr_fixed[3] = msrs[i].data;
3196 break;
3197 case MSR_MTRRfix4K_C8000:
3198 env->mtrr_fixed[4] = msrs[i].data;
3199 break;
3200 case MSR_MTRRfix4K_D0000:
3201 env->mtrr_fixed[5] = msrs[i].data;
3202 break;
3203 case MSR_MTRRfix4K_D8000:
3204 env->mtrr_fixed[6] = msrs[i].data;
3205 break;
3206 case MSR_MTRRfix4K_E0000:
3207 env->mtrr_fixed[7] = msrs[i].data;
3208 break;
3209 case MSR_MTRRfix4K_E8000:
3210 env->mtrr_fixed[8] = msrs[i].data;
3211 break;
3212 case MSR_MTRRfix4K_F0000:
3213 env->mtrr_fixed[9] = msrs[i].data;
3214 break;
3215 case MSR_MTRRfix4K_F8000:
3216 env->mtrr_fixed[10] = msrs[i].data;
3217 break;
3218 case MSR_MTRRphysBase(0) ... MSR_MTRRphysMask(MSR_MTRRcap_VCNT - 1):
3219 if (index & 1) {
3220 env->mtrr_var[MSR_MTRRphysIndex(index)].mask = msrs[i].data |
3221 mtrr_top_bits;
3222 } else {
3223 env->mtrr_var[MSR_MTRRphysIndex(index)].base = msrs[i].data;
3224 }
3225 break;
3226 case MSR_IA32_SPEC_CTRL:
3227 env->spec_ctrl = msrs[i].data;
3228 break;
3229 case MSR_VIRT_SSBD:
3230 env->virt_ssbd = msrs[i].data;
3231 break;
3232 case MSR_IA32_RTIT_CTL:
3233 env->msr_rtit_ctrl = msrs[i].data;
3234 break;
3235 case MSR_IA32_RTIT_STATUS:
3236 env->msr_rtit_status = msrs[i].data;
3237 break;
3238 case MSR_IA32_RTIT_OUTPUT_BASE:
3239 env->msr_rtit_output_base = msrs[i].data;
3240 break;
3241 case MSR_IA32_RTIT_OUTPUT_MASK:
3242 env->msr_rtit_output_mask = msrs[i].data;
3243 break;
3244 case MSR_IA32_RTIT_CR3_MATCH:
3245 env->msr_rtit_cr3_match = msrs[i].data;
3246 break;
3247 case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
3248 env->msr_rtit_addrs[index - MSR_IA32_RTIT_ADDR0_A] = msrs[i].data;
3249 break;
3250 }
3251 }
3252
3253 return 0;
3254 }
3255
3256 static int kvm_put_mp_state(X86CPU *cpu)
3257 {
3258 struct kvm_mp_state mp_state = { .mp_state = cpu->env.mp_state };
3259
3260 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_MP_STATE, &mp_state);
3261 }
3262
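/*
 * Read the vcpu's multiprocessor state. With an in-kernel irqchip, HLT is
 * emulated inside KVM, so QEMU's notion of a halted CPU has to be derived
 * from the returned mp_state rather than tracked locally.
 */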
3263 static int kvm_get_mp_state(X86CPU *cpu)
3264 {
3265 CPUState *cs = CPU(cpu);
3266 CPUX86State *env = &cpu->env;
3267 struct kvm_mp_state mp_state;
3268 int ret;
3269
3270 ret = kvm_vcpu_ioctl(cs, KVM_GET_MP_STATE, &mp_state);
3271 if (ret < 0) {
3272 return ret;
3273 }
3274 env->mp_state = mp_state.mp_state;
3275 if (kvm_irqchip_in_kernel()) {
3276 cs->halted = (mp_state.mp_state == KVM_MP_STATE_HALTED);
3277 }
3278 return 0;
3279 }
3280
3281 static int kvm_get_apic(X86CPU *cpu)
3282 {
3283 DeviceState *apic = cpu->apic_state;
3284 struct kvm_lapic_state kapic;
3285 int ret;
3286
3287 if (apic && kvm_irqchip_in_kernel()) {
3288 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_LAPIC, &kapic);
3289 if (ret < 0) {