本文展示的代码中,qemu版本为1.6.0, Linux内核版本为3.12.73
本篇文章主要描述IO从guest vm跳转到kvm和qemu后的处理过程。

首先回顾一下kvm的启动过程。qemu通过调用kvm提供的一系列接口来启动kvm。qemu的入口为vl.c中的main函数,main函数通过调用kvm_init 和 machine->init来初始化kvm。 其中, machine->init会创建vcpu, 用一个线程去模拟vcpu, 该线程执行的函数为qemu_kvm_cpu_thread_fn, 并且该线程调用kvm_cpu_exec,该函数调用kvm_vcpu_ioctl切换到kvm中,下次从kvm中返回时,会接着执行kvm_vcpu_ioctl之后的代码,判断exit_reason,然后进行相应处理。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
int kvm_cpu_exec(CPUState *cpu)  
{
struct kvm_run *run = cpu->kvm_run;
int ret, run_ret;

DPRINTF("kvm_cpu_exec()\n");

if (kvm_arch_process_async_events(cpu)) {
cpu->exit_request = 0;
return EXCP_HLT;
}

do {
if (cpu->kvm_vcpu_dirty) {
kvm_arch_put_registers(cpu, KVM_PUT_RUNTIME_STATE);
cpu->kvm_vcpu_dirty = false;
}

kvm_arch_pre_run(cpu, run);
if (cpu->exit_request) {
DPRINTF("interrupt exit requested\n");
/*
* KVM requires us to reenter the kernel after IO exits to complete
* instruction emulation. This self-signal will ensure that we
* leave ASAP again.
*/
qemu_cpu_kick_self();
}
qemu_mutex_unlock_iothread();

run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0);

qemu_mutex_lock_iothread();
kvm_arch_post_run(cpu, run);

if (run_ret < 0) {
if (run_ret == -EINTR || run_ret == -EAGAIN) {
DPRINTF("io window exit\n");
ret = EXCP_INTERRUPT;
break;
}
fprintf(stderr, "error: kvm run failed %s\n",
strerror(-run_ret));
abort();
}

trace_kvm_run_exit(cpu->cpu_index, run->exit_reason);
switch (run->exit_reason) {
case KVM_EXIT_IO:
DPRINTF("handle_io\n");
kvm_handle_io(run->io.port,
(uint8_t *)run + run->io.data_offset,
run->io.direction,
run->io.size,
run->io.count);
ret = 0;
break;
case KVM_EXIT_MMIO:
DPRINTF("handle_mmio\n");
cpu_physical_memory_rw(run->mmio.phys_addr,
run->mmio.data,
run->mmio.len,
run->mmio.is_write);
ret = 0;
break;
case KVM_EXIT_IRQ_WINDOW_OPEN:
DPRINTF("irq_window_open\n");
ret = EXCP_INTERRUPT;
break;
case KVM_EXIT_SHUTDOWN:
DPRINTF("shutdown\n");
qemu_system_reset_request();
ret = EXCP_INTERRUPT;
break;
case KVM_EXIT_UNKNOWN:
fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n",
(uint64_t)run->hw.hardware_exit_reason);
ret = -1;
break;
case KVM_EXIT_INTERNAL_ERROR:
ret = kvm_handle_internal_error(cpu, run);
break;
default:
DPRINTF("kvm_arch_handle_exit\n");
ret = kvm_arch_handle_exit(cpu, run);
break;
}
} while (ret == 0);

if (ret < 0) {
cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_CODE);
vm_stop(RUN_STATE_INTERNAL_ERROR);
}

cpu->exit_request = 0;
return ret;
}

kvm_vcpu_ioctl执行时,调用的kvm函数是virt/kvm/kvm-main.c中的kvm_vcpu_ioctl函数。当传入参数为KVM_RUN时,最终会调用vcpu_enter_guest函数, 在vcpu_enter_guest函数中调用了kvm_x86_ops->run(vcpu),在intel处理器架构中该函数对应的实现为vmx_vcpu_run, vmx_vcpu_run设置好寄存器状态之后调用VM_LAUNCH或者VM_RESUME进入guest vm, 一旦发生vm exit则从此处继续执行下去。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)  
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long debugctlmsr;

/*...此处省略n行代码...*/
vmx->__launched = vmx->loaded_vmcs->launched;
asm(
/* Store host registers */
"push %%" _ASM_DX "; push %%" _ASM_BP ";"
"push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */
"push %%" _ASM_CX " \n\t"
"cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
"je 1f \n\t"
"mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
__ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
"1: \n\t"
/* Reload cr2 if changed */
"mov %c[cr2](%0), %%" _ASM_AX " \n\t"
"mov %%cr2, %%" _ASM_DX " \n\t"
"cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t"
"je 2f \n\t"
"mov %%" _ASM_AX", %%cr2 \n\t"
"2: \n\t"
/* Check if vmlaunch of vmresume is needed */
"cmpl $0, %c[launched](%0) \n\t"
/* Load guest registers. Don't clobber flags. */
"mov %c[rax](%0), %%" _ASM_AX " \n\t"
"mov %c[rbx](%0), %%" _ASM_BX " \n\t"
"mov %c[rdx](%0), %%" _ASM_DX " \n\t"
"mov %c[rsi](%0), %%" _ASM_SI " \n\t"
"mov %c[rdi](%0), %%" _ASM_DI " \n\t"
"mov %c[rbp](%0), %%" _ASM_BP " \n\t"
#ifdef CONFIG_X86_64
"mov %c[r8](%0), %%r8 \n\t"
"mov %c[r9](%0), %%r9 \n\t"
"mov %c[r10](%0), %%r10 \n\t"
"mov %c[r11](%0), %%r11 \n\t"
"mov %c[r12](%0), %%r12 \n\t"
"mov %c[r13](%0), %%r13 \n\t"
"mov %c[r14](%0), %%r14 \n\t"
"mov %c[r15](%0), %%r15 \n\t"
#endif
"mov %c[rcx](%0), %%" _ASM_CX " \n\t" /* kills %0 (ecx) */

/* Enter guest mode */
"jne 1f \n\t"
__ex(ASM_VMX_VMLAUNCH) "\n\t"
"jmp 2f \n\t"
"1: " __ex(ASM_VMX_VMRESUME) "\n\t"
"2: "
/* Save guest registers, load host registers, keep flags */
"mov %0, %c[wordsize](%%" _ASM_SP ") \n\t"
"pop %0 \n\t"
"mov %%" _ASM_AX ", %c[rax](%0) \n\t"
"mov %%" _ASM_BX ", %c[rbx](%0) \n\t"
__ASM_SIZE(pop) " %c[rcx](%0) \n\t"
"mov %%" _ASM_DX ", %c[rdx](%0) \n\t"
"mov %%" _ASM_SI ", %c[rsi](%0) \n\t"
"mov %%" _ASM_DI ", %c[rdi](%0) \n\t"
"mov %%" _ASM_BP ", %c[rbp](%0) \n\t"
#ifdef CONFIG_X86_64
"mov %%r8, %c[r8](%0) \n\t"
"mov %%r9, %c[r9](%0) \n\t"
"mov %%r10, %c[r10](%0) \n\t"
"mov %%r11, %c[r11](%0) \n\t"
"mov %%r12, %c[r12](%0) \n\t"
"mov %%r13, %c[r13](%0) \n\t"
"mov %%r14, %c[r14](%0) \n\t"
"mov %%r15, %c[r15](%0) \n\t"
#endif
"mov %%cr2, %%" _ASM_AX " \n\t"
"mov %%" _ASM_AX ", %c[cr2](%0) \n\t"

"pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t"
"setbe %c[fail](%0) \n\t"
".pushsection .rodata \n\t"
".global vmx_return \n\t"
"vmx_return: " _ASM_PTR " 2b \n\t"
".popsection"
: : "c"(vmx), "d"((unsigned long)HOST_RSP),
[launched]"i"(offsetof(struct vcpu_vmx, __launched)),
[fail]"i"(offsetof(struct vcpu_vmx, fail)),
[host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
[rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
[rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
[rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
[rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
[rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
[rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
[rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
#ifdef CONFIG_X86_64
[r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
[r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
[r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
[r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
[r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
[r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
[r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
[r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
#endif
[cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
[wordsize]"i"(sizeof(ulong))
: "cc", "memory"
#ifdef CONFIG_X86_64
, "rax", "rbx", "rdi", "rsi"
, "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
#else
, "eax", "ebx", "edi", "esi"
#endif
);

/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
if (debugctlmsr)
update_debugctlmsr(debugctlmsr);

#ifndef CONFIG_X86_64
/*
* The sysexit path does not restore ds/es, so we must set them to
* a reasonable value ourselves.
*
* We can't defer this to vmx_load_host_state() since that function
* may be executed in interrupt context, which saves and restore segments
* around it, nullifying its effect.
*/
loadsegment(ds, __USER_DS);
loadsegment(es, __USER_DS);
#endif

vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
| (1 << VCPU_EXREG_RFLAGS)
| (1 << VCPU_EXREG_CPL)
| (1 << VCPU_EXREG_PDPTR)
| (1 << VCPU_EXREG_SEGMENTS)
| (1 << VCPU_EXREG_CR3));
vcpu->arch.regs_dirty = 0;

vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);

if (is_guest_mode(vcpu)) {
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
vmcs12->idt_vectoring_info_field = vmx->idt_vectoring_info;
if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
vmcs12->idt_vectoring_error_code =
vmcs_read32(IDT_VECTORING_ERROR_CODE);
vmcs12->vm_exit_instruction_len =
vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
}
}

vmx->loaded_vmcs->launched = 1;

vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX);

vmx_complete_atomic_exit(vmx);
vmx_recover_nmi_blocking(vmx);
vmx_complete_interrupts(vmx);
}

介绍完初始化的流程,可以介绍IO在kvm和qemu中的处理流程了。当Guest Vm进行IO操作需要访问设备时,就会触发vm exit 返回到vmx_vcpu_run, vmx保存好vmcs并且记录下VM_EXIT_REASON后返回到调用该函数的vcpu_enter_guest, 在vcpu_enter_guest函数末尾调用了r= kvm_x86_ops->handle_exit(vcpu), 该函数对应于vmx_handle_exit函数(intel cpu架构对应关系可以查看vmx.c文件中static struct kvm_x86_ops vmx_x86_ops), vmx_handle_exit 调用kvm_vmx_exit_handlers[exit_reason](vcpu),该语句根据exit_reason调用不同的函数,该数据结构定义如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {  
[EXIT_REASON_EXCEPTION_NMI] = handle_exception,
[EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
[EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
[EXIT_REASON_NMI_WINDOW] = handle_nmi_window,
[EXIT_REASON_IO_INSTRUCTION] = handle_io,
[EXIT_REASON_CR_ACCESS] = handle_cr,
[EXIT_REASON_DR_ACCESS] = handle_dr,
[EXIT_REASON_CPUID] = handle_cpuid,
[EXIT_REASON_MSR_READ] = handle_rdmsr,
[EXIT_REASON_MSR_WRITE] = handle_wrmsr,
[EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
[EXIT_REASON_HLT] = handle_halt,
[EXIT_REASON_INVD] = handle_invd,
[EXIT_REASON_INVLPG] = handle_invlpg,
[EXIT_REASON_RDPMC] = handle_rdpmc,
[EXIT_REASON_VMCALL] = handle_vmcall,
[EXIT_REASON_VMCLEAR] = handle_vmclear,
[EXIT_REASON_VMLAUNCH] = handle_vmlaunch,
[EXIT_REASON_VMPTRLD] = handle_vmptrld,
[EXIT_REASON_VMPTRST] = handle_vmptrst,
[EXIT_REASON_VMREAD] = handle_vmread,
[EXIT_REASON_VMRESUME] = handle_vmresume,
[EXIT_REASON_VMWRITE] = handle_vmwrite,
[EXIT_REASON_VMOFF] = handle_vmoff,
[EXIT_REASON_VMON] = handle_vmon,
[EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
[EXIT_REASON_APIC_ACCESS] = handle_apic_access,
[EXIT_REASON_WBINVD] = handle_wbinvd,
[EXIT_REASON_XSETBV] = handle_xsetbv,
[EXIT_REASON_TASK_SWITCH] = handle_task_switch,
[EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
[EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
[EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
[EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
[EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op,
[EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op,
};

如果是因为IO原因导致的vm exit,则调用的处理函数为handle_io,handle_io的处理可以查看io的虚拟化 , 该过程结束之后需要qemu去处理IO,这时候会返回到qemu, 在kvm_cpu_exec中继续执行下去,看上面kvm_cpu_exec的代码,如果是因为IO原因返回到qemu,会调用kvm_handle_io函数.

1
2
3
4
5
6
7
8
9
10
switch (run->exit_reason) {
case KVM_EXIT_IO:
DPRINTF("handle_io\n");
kvm_handle_io(run->io.port,
(uint8_t *)run + run->io.data_offset,
run->io.direction,
run->io.size,
run->io.count);
ret = 0;
break;

kvm_handle_io调用cpu_outb, cpu_outw等指令处理IO操作。

当qemu完成IO操作后,会在kvm_cpu_exec函数的循环中,调用kvm_vcpu_ioctl重新进入kvm。

以上阐述了IO操作在kvm和qemu中处理的整个过程。


参考资料:

  1. csdn dashulu
  2. io的虚拟化