本文将介绍PV IPI技术。部分内容转载自:kvm performance optimization technologies, part one

1. Idea

Instead of sending the IPI to each vCPU one by one, PV IPI send uses a bitmap to record the destination vCPUs and then makes a single hypercall, thus reducing the number of VM-exits. The patchset is here.

2. Usage

Doc:
6. KVM_HC_SEND_IPI
————————
Architecture: x86
Status: active
Purpose: Send IPIs to multiple vCPUs.

a0: lower part of the bitmap of destination APIC IDs
a1: higher part of the bitmap of destination APIC IDs
a2: the lowest APIC ID in bitmap
a3: APIC ICR

The hypercall lets a guest send multicast IPIs, with at most 128
destinations per hypercall in 64-bit mode and 64 vCPUs per
hypercall in 32-bit mode.  The destinations are represented by a
bitmap contained in the first two arguments (a0 and a1). Bit 0 of
a0 corresponds to the APIC ID in the third argument (a2), bit 1
corresponds to the APIC ID a2+1, and so on.

Returns the number of CPUs to which the IPIs were delivered successfully.

The test code in KVM unit test:

1
2
3
4
5
6
7
8
/*
 * Exercise KVM_HC_SEND_IPI with a single vmcall and report a pass when the
 * hypercall returns 0.
 *
 * Arguments as loaded below: a0 = 0xFFFFFFFF (low bitmap word), a1 = 0 (high
 * bitmap word), a2 = 0xFFFFFFFF (lowest APIC ID in the bitmap), a3 = 0 (ICR).
 * NOTE(review): a2 is presumably chosen to be larger than any real APIC ID so
 * that no IPI is actually delivered and the expected return value is 0 --
 * confirm against the host-side handler.
 */
static void test_pv_ipi(void)
{
int ret;
unsigned long a0 = 0xFFFFFFFF, a1 = 0, a2 = 0xFFFFFFFF, a3 = 0x0;

/*
 * The hypercall number is passed in through the "a" constraint and the
 * result is read back through it; a0..a3 are passed through the "b", "c",
 * "d" and "S" constraints respectively.
 */
asm volatile("vmcall" : "=a"(ret) :"a"(KVM_HC_SEND_IPI), "b"(a0), "c"(a1), "d"(a2), "S"(a3));
report(!ret, "PV IPIs testing");
}

a3就是kvm_pv_send_ipi函数中的icr参数。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
/*
 * Excerpt showing how the hypercall's icr argument is validated and decoded
 * into the irq descriptor; the elided parts are marked "...".
 */
int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
unsigned long ipi_bitmap_high, u32 min,
unsigned long icr, int op_64_bit)
{
...
/* Any APIC_DEST_MASK or APIC_SHORT_MASK bit set in icr is rejected. */
if (icr & (APIC_DEST_MASK | APIC_SHORT_MASK))
return -KVM_EINVAL;

/* Split icr into vector, delivery mode, assert level and trigger mode. */
irq.vector = icr & APIC_VECTOR_MASK;
irq.delivery_mode = icr & APIC_MODE_MASK;
irq.level = (icr & APIC_INT_ASSERT) != 0;
irq.trig_mode = icr & APIC_INT_LEVELTRIG;
...
}

3. Implementation

源码基于Kernel v5.17.0-rc1。

3.1 kvm side

  • Expose PV_SEND_IPI CPUID feature bit to guest
    KVM_FEATURE_PV_SEND_IPI

  • Implement PV IPIs send hypercall
    KVM_HC_SEND_IPI

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
/*
 * Deliver the interrupt described by @icr to the vCPUs selected by the two
 * bitmap words.
 *
 * @kvm:             VM whose arch.apic_map is used to look up destinations.
 * @ipi_bitmap_low:  first bitmap word; bit 0 corresponds to APIC ID @min.
 * @ipi_bitmap_high: second bitmap word; bit 0 corresponds to APIC ID
 *                   @min + cluster_size.
 * @min:             APIC ID represented by bit 0 of @ipi_bitmap_low.
 * @icr:             ICR value; only vector, delivery mode, assert level and
 *                   trigger mode are extracted from it.
 * @op_64_bit:       non-zero selects a cluster size of 64, zero selects 32.
 *
 * Returns -KVM_EINVAL if @icr has any APIC_DEST_MASK or APIC_SHORT_MASK bit
 * set, -EOPNOTSUPP if the VM has no APIC map, otherwise the sum of the two
 * __pv_send_ipi() results.
 */
int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
unsigned long ipi_bitmap_high, u32 min,
unsigned long icr, int op_64_bit)
{
struct kvm_apic_map *map;
struct kvm_lapic_irq irq = {0};
int cluster_size = op_64_bit ? 64 : 32;
int count;

if (icr & (APIC_DEST_MASK | APIC_SHORT_MASK))
return -KVM_EINVAL;

/* Decode the relevant ICR fields into the irq descriptor. */
irq.vector = icr & APIC_VECTOR_MASK;
irq.delivery_mode = icr & APIC_MODE_MASK;
irq.level = (icr & APIC_INT_ASSERT) != 0;
irq.trig_mode = icr & APIC_INT_LEVELTRIG;

/* The APIC map pointer is fetched and used inside an RCU read-side section. */
rcu_read_lock();
map = rcu_dereference(kvm->arch.apic_map);

/* Stays -EOPNOTSUPP when there is no map to look destinations up in. */
count = -EOPNOTSUPP;
if (likely(map)) {
count = __pv_send_ipi(&ipi_bitmap_low, map, &irq, min);
/* The high word starts cluster_size APIC IDs after the low word. */
min += cluster_size;
count += __pv_send_ipi(&ipi_bitmap_high, map, &irq, min);
}

rcu_read_unlock();
return count;
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
/*
 * Deliver @irq to every vCPU whose bit is set in one bitmap word.
 *
 * @ipi_bitmap: bitmap word; bit i stands for APIC ID @min + i.
 * @map:        APIC map used to translate an APIC ID into a vCPU.
 * @irq:        interrupt descriptor passed to kvm_apic_set_irq().
 * @min:        APIC ID represented by bit 0.
 *
 * Returns the accumulated kvm_apic_set_irq() return values, or 0 when @min
 * lies beyond map->max_apic_id.
 */
static int __pv_send_ipi(unsigned long *ipi_bitmap, struct kvm_apic_map *map,
struct kvm_lapic_irq *irq, u32 min)
{
int i, count = 0;
struct kvm_vcpu *vcpu;

/* Nothing in this word can address a valid APIC ID. */
if (min > map->max_apic_id)
return 0;

/*
 * Scan at most BITS_PER_LONG bits, and no further than the bit that maps to
 * max_apic_id, so that phys_map[min + i] is never indexed past max_apic_id.
 */
for_each_set_bit(i, ipi_bitmap,
min((u32)BITS_PER_LONG, (map->max_apic_id - min + 1))) {
/* Entries with no APIC registered for this ID are skipped. */
if (map->phys_map[min + i]) {
vcpu = map->phys_map[min + i]->vcpu;
count += kvm_apic_set_irq(vcpu, irq, NULL);
}
}

return count;
}

3.2 guest side

  • Set the IPI entry points
1
2
3
4
5
6
/*
 * Install the PV IPI senders as the APIC driver's send_IPI_mask and
 * send_IPI_mask_allbutself callbacks, then log that PV IPIs are set up.
 */
static void kvm_setup_pv_ipi(void)
{
apic->send_IPI_mask = kvm_send_ipi_mask;
apic->send_IPI_mask_allbutself = kvm_send_ipi_mask_allbutself;
pr_info("setup PV IPIs\n");
}
  • Guest trigger IPI
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
/*
 * Send @vector to every CPU in @mask, batching destinations into as few
 * KVM_HC_SEND_IPI hypercalls as possible.
 *
 * Destination APIC IDs are collected in ipi_bitmap, where bit 0 stands for
 * APIC ID 'min' and 'max' tracks the highest APIC ID recorded so far. The
 * bitmap is 128 bits wide on CONFIG_X86_64 and 64 bits wide otherwise, and is
 * handed to the hypercall as two unsigned long halves together with 'min'
 * and the ICR value.
 *
 * @mask:   CPUs to interrupt; an empty mask is a no-op.
 * @vector: NMI_VECTOR selects APIC_DM_NMI, any other value is sent as
 *          APIC_DM_FIXED | vector.
 *
 * A negative hypercall return value triggers a one-time warning.
 */
static void __send_ipi_mask(const struct cpumask *mask, int vector)
{
	unsigned long flags;
	int cpu, apic_id, icr;
	int min = 0, max = 0;
#ifdef CONFIG_X86_64
	__uint128_t ipi_bitmap = 0;
#else
	u64 ipi_bitmap = 0;
#endif
	long ret;

	if (cpumask_empty(mask))
		return;

	/* Local interrupts stay disabled while the bitmap is built and sent. */
	local_irq_save(flags);

	switch (vector) {
	default:
		icr = APIC_DM_FIXED | vector;
		break;
	case NMI_VECTOR:
		icr = APIC_DM_NMI;
		break;
	}

	for_each_cpu(cpu, mask) {
		apic_id = per_cpu(x86_cpu_to_apicid, cpu);
		if (!ipi_bitmap) {
			/* Empty bitmap: open a new window at this APIC ID. */
			min = max = apic_id;
		} else if (apic_id < min && max - apic_id < KVM_IPI_CLUSTER_SIZE) {
			/*
			 * Lower ID that still fits together with 'max': shift
			 * the recorded bits up so bit 0 now means apic_id.
			 */
			ipi_bitmap <<= min - apic_id;
			min = apic_id;
		} else if (apic_id > min && apic_id < min + KVM_IPI_CLUSTER_SIZE) {
			/*
			 * Higher ID inside the current window. The explicit
			 * 'apic_id > min' check matters: without it, an ID
			 * below 'min' that failed the previous test would land
			 * here and __set_bit() below would be called with a
			 * negative bit index, writing outside ipi_bitmap.
			 */
			max = apic_id < max ? max : apic_id;
		} else {
			/* Does not fit: flush what we have and start over. */
			ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
				(unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
			WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld",
				  ret);
			min = max = apic_id;
			ipi_bitmap = 0;
		}
		__set_bit(apic_id - min, (unsigned long *)&ipi_bitmap);
	}

	/* Flush the destinations still pending after the last CPU. */
	if (ipi_bitmap) {
		ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
			(unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
		WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld",
			  ret);
	}

	local_irq_restore(flags);
}

It sets a bit in the bitmap for each IPI target vCPU and finally calls kvm_hypercall4(KVM_HC_SEND_IPI).


参考资料:

  1. KVM: X86: Implement Exit-less IPIs support
  2. kvm performance optimization technologies, part one
  3. Boosting Dedicated Instance via KVM Tax Cut
  4. IOMMU(六)-post interrupt