本文参考内核版本为v5.0。主要内容转载自:kvm performance optimization technologies, part one

Overview

PV EOI is an old pv optimization. The idea behind pv eoi is to avoid the EOI write in APIC. This exit is expensive. PV EOI uses a shared memory. The VMM set a flag in this shared memory before injecting the interrupt, when the guest process the interrupt and write an EOI, if it finds this flag it will clear it and just return.

初始化

1
2
3
4
#define KVM_PV_EOI_BIT 0
#define KVM_PV_EOI_MASK (0x1 << KVM_PV_EOI_BIT)
#define KVM_PV_EOI_ENABLED KVM_PV_EOI_MASK
#define KVM_PV_EOI_DISABLED 0x0

Linux guest:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;

static void kvm_guest_cpu_init(void)
{
...
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) {
unsigned long pa;
__this_cpu_write(kvm_apic_eoi, 0);
pa = slow_virt_to_phys(this_cpu_ptr(&kvm_apic_eoi))
| KVM_MSR_ENABLED;
wrmsrl(MSR_KVM_PV_EOI_EN, pa);
}
...
}

kvm:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len)
{
u64 addr = data & ~KVM_MSR_ENABLED;
struct gfn_to_hva_cache *ghc = &vcpu->arch.pv_eoi.data;
unsigned long new_len;

if (!IS_ALIGNED(addr, 4))
return 1;

vcpu->arch.pv_eoi.msr_val = data;
if (!pv_eoi_enabled(vcpu))
return 0;

if (addr == ghc->gpa && len <= ghc->len)
new_len = ghc->len;
else
new_len = len;

return kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, addr, new_len);
}

Linux guest:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
static void __init kvm_guest_init(void)
{
...
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
apic_set_eoi_write(kvm_guest_apic_eoi_write);
...
}

static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val)
{
/**
* This relies on __test_and_clear_bit to modify the memory
* in a way that is atomic with respect to the local CPU.
* The hypervisor only accesses this memory from the local CPU so
* there's no need for lock or memory barriers.
* An optimization barrier is implied in apic write.
*/
if (__test_and_clear_bit(KVM_PV_EOI_BIT, this_cpu_ptr(&kvm_apic_eoi)))
return;
apic->native_eoi_write(APIC_EOI, APIC_EOI_ACK);
}

interrupt injection before vmentry

The apic_sync_pv_eoi_to_guest will be called when vmentry.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
/*
* apic_sync_pv_eoi_to_guest - called before vmentry
*
* Detect whether it's safe to enable PV EOI and
* if yes do so.
*/
static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu,
struct kvm_lapic *apic)
{
if (!pv_eoi_enabled(vcpu) ||
/* IRR set or many bits in ISR: could be nested. */
apic->irr_pending ||
/* Cache not set: could be safe but we don't bother. */
apic->highest_isr_cache == -1 ||
/* Need EOI to update ioapic. */
kvm_ioapic_handles_vector(apic, apic->highest_isr_cache)) {
/*
* PV EOI was disabled by apic_sync_pv_eoi_from_guest
* so we need not do anything here.
*/
return;
}

pv_eoi_set_pending(apic->vcpu);
}

pv_eoi_set_pending will set the KVM_PV_EOI_ENABLED flag in shared memory.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
static void pv_eoi_set_pending(struct kvm_vcpu *vcpu)
{
if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0) {
apic_debug("Can't set EOI MSR value: 0x%llx\n",
(unsigned long long)vcpu->arch.pv_eoi.msr_val);
return;
}
__set_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
}

static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val)
{

return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val,
sizeof(val));
}

apic_sync_pv_eoi_from_guest after vmexit

The apic_sync_pv_eoi_from_guest will be called when vmexit or cancel interrupt.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
/*
* apic_sync_pv_eoi_from_guest - called on vmexit or cancel interrupt
*
* Detect whether guest triggered PV EOI since the
* last entry. If yes, set EOI on guests's behalf.
* Clear PV EOI in guest memory in any case.
*/
static void apic_sync_pv_eoi_from_guest(struct kvm_vcpu *vcpu,
struct kvm_lapic *apic)
{
bool pending;
int vector;
/*
* PV EOI state is derived from KVM_APIC_PV_EOI_PENDING in host
* and KVM_PV_EOI_ENABLED in guest memory as follows:
*
* KVM_APIC_PV_EOI_PENDING is unset:
* -> host disabled PV EOI.
* KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is set:
* -> host enabled PV EOI, guest did not execute EOI yet.
* KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is unset:
* -> host enabled PV EOI, guest executed EOI.
*/
BUG_ON(!pv_eoi_enabled(vcpu));
pending = pv_eoi_get_pending(vcpu);
/*
* Clear pending bit in any case: it will be set again on vmentry.
* While this might not be ideal from performance point of view,
* this makes sure pv eoi is only enabled when we know it's safe.
*/
pv_eoi_clr_pending(vcpu);
if (pending)
return;
vector = apic_set_eoi(apic);
trace_kvm_pv_eoi(apic, vector);
}

pv_eoi_get_pending will get the status of the shared flag. If it is still pending, it means the no guest trigger the EOI write, nothing to do. If the guest trigger the EOI here will call apic_set_eoi set the EOI of APIC on guests’s behalf.

Note the apic->irr_pending will always be true with virtual interrupt delivery enabled. So pv eoi today I think is little used as the APICv is very common.

总结


参考资料:

  1. http://terenceli.github.io/%E6%8A%80%E6%9C%AF/2020/09/10/kvm-performance-1
  2. https://lwn.net/Articles/502176/
  3. https://lore.kernel.org/kvm/cover.1337695416.git.mst@redhat.com/
  4. https://www.kernel.org/doc/Documentation/virtual/kvm/msr.txt