《Introduction to VT-x Posted Interrupt》一文介绍了Posted Interrupt的原理,本文将结合KVM代码,介绍VT-x Posted Interrupt的实现细节。

source code:
https://elixir.bootlin.com/linux/v5.14/source

vmx_deliver_posted_interrupt

假设guest运行在x2APIC mode下,当source vCPU通过WRMSR写ICR寄存器(即发送IPI)时,会发生VM Exit。

KVM的函数调用链如下:

```
kvm_emulate_wrmsr
  kvm_set_msr
    kvm_set_msr_ignored_check
      __kvm_set_msr
        static_call(kvm_x86_set_msr)[vmx_set_msr]
          kvm_set_msr_common
            kvm_x2apic_msr_write
              kvm_lapic_reg_write
                kvm_apic_send_ipi
                  kvm_irq_delivery_to_apic
```

```
kvm_irq_delivery_to_apic
  kvm_irq_delivery_to_apic_fast
    kvm_apic_set_irq
      __apic_accept_irq
        kvm_x86_deliver_posted_interrupt[vmx_deliver_posted_interrupt]
```

vmx_deliver_posted_interrupt及其调用的kvm_vcpu_trigger_posted_interrupt的代码如下:

/*
 * Send interrupt to vcpu via posted interrupt way.
 * 1. If target vcpu is running(non-root mode), send posted interrupt
 *    notification to vcpu and hardware will sync PIR to vIRR atomically.
 * 2. If target vcpu isn't running(root mode), kick it to pick up the
 *    interrupt from PIR in next vmentry.
 *
 * Returns -1 when vcpu->arch.apicv_active is false (nothing is posted),
 * and 0 on every other path visible in this excerpt.
 *
 * NOTE: this is an excerpt; the "..." below stands for code that was
 * elided from the original function.
 */
static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
/* Not referenced in the visible code; presumably used by the elided part. */
int r;

...

/* Posted interrupts are only usable while APICv is active for this vcpu. */
if (!vcpu->arch.apicv_active)
return -1;

/*
 * Test-and-set the vector's bit in the posted-interrupt request bitmap.
 * A non-zero result means the bit was already set, so there is nothing
 * further to do for this vector.
 */
if (pi_test_and_set_pir(vector, &vmx->pi_desc))
return 0;

/* If a previous notification has sent the IPI, nothing to do. */
if (pi_test_and_set_on(&vmx->pi_desc))
return 0;

/*
 * No notification is needed when the target is the vcpu currently running
 * on this CPU. Otherwise try the notification IPI first and fall back to
 * a kick when kvm_vcpu_trigger_posted_interrupt() reports that no IPI was
 * sent.
 */
if (vcpu != kvm_get_running_vcpu() &&
!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
// Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
kvm_vcpu_kick(vcpu);

return 0;
}

/*
 * Try to notify a vCPU of a pending posted interrupt with an IPI.
 *
 * On CONFIG_SMP builds, when vcpu->mode is IN_GUEST_MODE, the
 * POSTED_INTR_VECTOR IPI is sent to the physical CPU recorded in
 * vcpu->cpu and true is returned. In every other case (including builds
 * without CONFIG_SMP) nothing is sent and false is returned, so the
 * caller can choose another way to reach the vCPU.
 *
 * @vcpu:   target vCPU.
 * @nested: not consulted by the code shown in this excerpt.
 */
static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
						     bool nested)
{
	bool notified = false;

#ifdef CONFIG_SMP
	if (vcpu->mode == IN_GUEST_MODE) {
		/* Send a notification event to the physical CPU the vCPU is on. */
		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), POSTED_INTR_VECTOR);
		notified = true;
	}
#endif
	return notified;
}

If the target vCPU isn't running in guest (non-root) mode, kick it with kvm_vcpu_kick(vcpu) so that it picks up the interrupt from the PIR on the next VM entry.

vmx_sync_pir_to_irr

"pick up the interrupt from PIR in the next vmentry"这一步的代码解析如下:

/*
 * Excerpt of vcpu_enter_guest(): only the step that syncs posted interrupts
 * before entering the guest is shown; each "..." stands for elided code.
 */
int vcpu_enter_guest(struct kvm_vcpu *vcpu)
{
...

/*
 * This handles the case where a posted interrupt was
 * notified with kvm_vcpu_kick.
 */
/* Only done when the local APIC is enabled and APICv is active. */
if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active)
// vmx_sync_pir_to_irr(vcpu) is called
static_call(kvm_x86_sync_pir_to_irr)(vcpu);
...
}

vmx_sync_pir_to_irr及其最终调用的__kvm_apic_update_irr的代码如下:

/*
 * Software path for picking up posted interrupts.
 *
 * When the descriptor's ON bit is set, it is cleared, the PIR bitmap is
 * folded into the virtual APIC's IRR through kvm_apic_update_irr(), and
 * KVM_REQ_EVENT is raised on the vCPU. When ON is clear, the highest
 * pending vector is simply looked up in the IRR. Either way the result is
 * passed to vmx_set_rvi() and returned to the caller.
 */
static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int highest_irr;

	if (!pi_test_on(&vmx->pi_desc)) {
		/* Nothing newly posted: just report what the IRR already holds. */
		highest_irr = kvm_lapic_find_highest_irr(vcpu);
	} else {
		/* Clear ON first, then drain the PIR into the IRR. */
		pi_clear_on(&vmx->pi_desc);
		kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &highest_irr);
		kvm_make_request(KVM_REQ_EVENT, vcpu);
	}

	vmx_set_rvi(highest_irr);
	return highest_irr;
}

其中的调用关系为:kvm_apic_update_irr -> __kvm_apic_update_irr

/*
 * Merge the posted-interrupt request bitmap into the IRR registers.
 *
 * @pir:     eight 32-bit words of posted-interrupt requests; every non-zero
 *           word is atomically exchanged with 0 while it is merged.
 * @regs:    base of the APIC register page; IRR word i is accessed at
 *           APIC_IRR + i * 0x10.
 * @max_irr: out parameter, set to the highest vector present in the IRR
 *           after the merge, or -1 when the IRR is empty.
 *
 * Returns true only when the merge newly set at least one IRR bit and the
 * last such bit recorded is also the highest vector now in the IRR.
 */
bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr)
{
	int newest_vec = -1;
	u32 word;

	*max_irr = -1;

	for (word = 0; word < 8; word++) {
		u32 *irr_reg = (u32 *)(regs + APIC_IRR + word * 0x10);
		u32 base_vec = word * 32;
		u32 posted = READ_ONCE(pir[word]);
		u32 irr = *irr_reg;

		if (posted) {
			u32 before = irr;

			/* Take the posted bits and clear them in one atomic step. */
			irr |= xchg(&pir[word], 0);
			*irr_reg = irr;

			/* Track the highest bit that this merge actually added. */
			if (before != irr)
				newest_vec = __fls(irr ^ before) + base_vec;
		}

		/* Words are scanned in ascending order, so the last hit wins. */
		if (irr)
			*max_irr = __fls(irr) + base_vec;
	}

	return newest_vec != -1 && newest_vec == *max_irr;
}

一言以蔽之:vmx_sync_pir_to_irr所做的工作就是下图绿框中标注的步骤,即把PIR中pending的位同步到vIRR,并据此更新RVI。区别在于:如果target vCPU正在运行(处于non-root mode),上述步骤由硬件来完成;否则,需要调用vmx_sync_pir_to_irr由软件来完成。


推荐材料: