本文将以 v5.16-rc1 源码为例，介绍 KVM 中 FPU virtualization 的实现。

1. Prerequisite

2. xsave state

vmx_vcpu_run
    kvm_load_guest_xsave_state
    vmx_vcpu_enter_exit(vcpu, vmx); /* The actual VMENTER/EXIT is in the .noinstr.text section. */
    kvm_load_host_xsave_state
1
2
3
4
5
6
7
8
9
10
11
12
13
14
/*
 * Load the vCPU's XCR0 and IA32_XSS values into the hardware registers.
 * Parts of the function are elided ("...") in this excerpt.
 */
void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
{
...
/* Nothing to switch unless the vCPU has CR4.OSXSAVE set. */
if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {

/* Skip the XSETBV when the vCPU's XCR0 already equals host_xcr0. */
if (vcpu->arch.xcr0 != host_xcr0)
xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);

/*
 * Write IA32_XSS only when XSAVES is enabled for this vCPU and its
 * value differs from host_xss.
 */
if (vcpu->arch.xsaves_enabled &&
vcpu->arch.ia32_xss != host_xss)
wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
}
...
}
1
2
3
4
5
6
7
8
9
10
11
12
13
/*
 * Put host_xcr0 and host_xss back into XCR0 and IA32_XSS. The conditions
 * are the same ones used when the vCPU values were loaded, so a register is
 * only rewritten if the vCPU value differs from the host value.
 * The beginning of the function is elided ("...") in this excerpt.
 */
void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
{
...
/* Nothing to restore unless the vCPU has CR4.OSXSAVE set. */
if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {

/* XCR0 needs restoring only if the vCPU value differs from the host's. */
if (vcpu->arch.xcr0 != host_xcr0)
xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);

/* Same for IA32_XSS, and only when XSAVES is enabled for this vCPU. */
if (vcpu->arch.xsaves_enabled &&
vcpu->arch.ia32_xss != host_xss)
wrmsrl(MSR_IA32_XSS, host_xss);
}
}

3. kvm_load_guest_fpu and kvm_put_guest_fpu

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
/* Excerpt of struct kvm_vcpu_arch: only the guest FPU member is shown. */
struct kvm_vcpu_arch {
...
/*
* QEMU userspace and the guest each have their own FPU state.
* In vcpu_run, we switch between the user and guest FPU contexts.
* While running a VCPU, the VCPU thread will have the guest FPU
* context.
*
* Note that while the PKRU state lives inside the fpu registers,
* it is switched out separately at VMENTER and VMEXIT time. The
* "guest_fpu" state here contains the guest FPU context, with the
* host PKRU bits.
*/
struct fpu_guest guest_fpu;
...
}
1
2
3
4
5
6
7
8
9
10
/*
 * Swap (qemu) user FPU context for the guest FPU context.
 * The rest of the function is elided ("...") in this excerpt.
 */
static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
{
/*
* Exclude PKRU from restore as restored separately in
* kvm_x86_ops.run().
*/
/* Second argument is enter_guest: true selects the guest fpstate. */
fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true);
...
}
1
2
3
4
5
6
/*
 * When vcpu_run ends, restore user space FPU context.
 * The rest of the function is elided ("...") in this excerpt.
 */
static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
{
/* Second argument is enter_guest: false switches back to the task fpstate. */
fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, false);
...
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
/*
 * Excerpt of struct fpu: only the two fpstate pointers that
 * fpu_swap_kvm_fpstate() exchanges are shown; other members are elided.
 */
struct fpu {
...
/*
* @fpstate:
*
* Pointer to the active struct fpstate. Initialized to
* point at @__fpstate below (that member is not shown in
* this excerpt).
*/
struct fpstate *fpstate;

/*
* @__task_fpstate:
*
* Pointer to an inactive struct fpstate. Initialized to NULL. Is
* used only for KVM support to swap out the regular task fpstate.
*/
struct fpstate *__task_fpstate;
...
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
/*
 * fpu_swap_kvm_fpstate - switch the current task between its own fpstate
 * and a guest fpstate.
 * @guest_fpu:   guest FPU container whose ->fpstate is swapped in or out
 * @enter_guest: true to make the guest fpstate active, false to switch
 *               back to the task fpstate stashed in ->__task_fpstate
 *
 * The whole sequence runs between fpregs_lock() and fpregs_unlock().
 *
 * Return: always 0.
 */
int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest)
{
struct fpstate *guest_fps = guest_fpu->fpstate;
struct fpu *fpu = &current->thread.fpu;
struct fpstate *cur_fps = fpu->fpstate;

fpregs_lock();
/*
 * Save the registers into the currently active fpstate, unless that
 * fpstate is confidential or TIF_NEED_FPU_LOAD is set.
 * NOTE(review): presumably the flag means the in-memory copy is already
 * up to date - confirm against the FPU core code.
 */
if (!cur_fps->is_confidential && !test_thread_flag(TIF_NEED_FPU_LOAD))
save_fpregs_to_fpstate(fpu);

/* Swap fpstate */
if (enter_guest) {
/* Stash the task fpstate so the exit path can put it back. */
fpu->__task_fpstate = cur_fps;
fpu->fpstate = guest_fps;
guest_fps->in_use = true;
} else {
guest_fps->in_use = false;
/* Reinstate the fpstate stashed on entry and clear the stash. */
fpu->fpstate = fpu->__task_fpstate;
fpu->__task_fpstate = NULL;
}

/* From here on cur_fps refers to the newly active fpstate. */
cur_fps = fpu->fpstate;

if (!cur_fps->is_confidential) {
/* Includes XFD update */
restore_fpregs_from_fpstate(cur_fps, XFEATURE_MASK_FPSTATE);
} else {
/*
* XSTATE is restored by firmware from encrypted
* memory. Make sure XFD state is correct while
* running with guest fpstate
*/
xfd_update_state(cur_fps);
}

fpregs_mark_activate();
fpregs_unlock();
return 0;
}

4. KVM_GET_XSAVE and KVM_SET_XSAVE ioctl

x86/kvm: Convert FPU handling to a single swap buffer

kvm_arch_vcpu_ioctl
    kvm_vcpu_ioctl_x86_get_xsave[KVM_GET_XSAVE]
        fpu_copy_guest_fpstate_to_uabi
            __copy_xstate_to_uabi_buf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
/**
* __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer
* @to: membuf descriptor
* @fpstate: The fpstate buffer from which to copy
* @pkru_val: The PKRU value to store in the PKRU component
* @copy_mode: The requested copy mode
*
* Converts from kernel XSAVE or XSAVES compacted format to UABI conforming
* format, i.e. from the kernel internal hardware dependent storage format
* to the requested @copy_mode. UABI XSTATE is always uncompacted!
*
* It supports partial copy but @to.pos always starts from zero.
*
* The function body is elided ("...") in this excerpt.
*/
void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate,
u32 pkru_val, enum xstate_copy_mode copy_mode)
{
...
}

__copy_xstate_to_uabi_buf

kvm_arch_vcpu_ioctl
    kvm_vcpu_ioctl_x86_set_xsave[KVM_SET_XSAVE]
        fpu_copy_uabi_to_guest_fpstate
            copy_uabi_from_kernel_to_xstate
                copy_uabi_to_xstate