Overview

IRQ bypass仅仅是一套软件框架而已!

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
When a physical I/O device is assigned to a virtual machine through
facilities like VFIO and KVM, the interrupt for the device generally
bounces through the host system before being injected into the VM.
However, hardware technologies exist that often allow the host to be
bypassed for some of these scenarios. Intel Posted Interrupts allow
the specified physical edge interrupts to be directly injected into a
guest when delivered to a physical processor while the vCPU is
running. ARM IRQ Forwarding allows forwarded physical interrupts to
be directly deactivated by the guest.

The IRQ bypass manager here is meant to provide the shim to connect
interrupt producers, generally the host physical device driver, with
interrupt consumers, generally the hypervisor, in order to configure
these bypass mechanisms. To do this, we base the connection on a
shared, opaque token. For KVM-VFIO this is expected to be an
eventfd_ctx since this is the connection we already use to connect an
eventfd to an irqfd on the in-kernel path. When a producer and
consumer with matching tokens are found, callbacks via both registered
participants allow the bypass facilities to be automatically enabled.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/*
* Theory of operation
*
* The IRQ bypass manager is a simple set of lists and callbacks that allows
* IRQ producers (ex. physical interrupt sources) to be matched to IRQ
* consumers (ex. virtualization hardware that allows IRQ bypass or offload)
* via a shared token (ex. eventfd_ctx). Producers and consumers register
* independently. When a token match is found, the optional @stop callback
* will be called for each participant. The pair will then be connected via
* the @add_* callbacks, and finally the optional @start callback will allow
* any final coordination. When either participant is unregistered, the
* process is repeated using the @del_* callbacks in place of the @add_*
* callbacks. Match tokens must be unique per producer/consumer, 1:N pairings
* are not supported.
*/
1
2
3
4
/* Registration entry points of the IRQ bypass manager. */
int irq_bypass_register_producer(struct irq_bypass_producer *producer);
void irq_bypass_unregister_producer(struct irq_bypass_producer *producer);
int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer);
void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer);

kvm_arch_irq_bypass_add_producer

irq_bypass_register_producer

1
2
3
4
5
vfio_pci_set_irqs_ioctl
└── vfio_pci_set_msi_trigger
└── vfio_msi_set_block
└── vfio_msi_set_vector_signal
└── irq_bypass_register_producer
1
2
3
4
5
6
7
8
9
10
11
12
13
/*
 * Excerpt ("..." marks elided code): the eventfd context obtained from @fd
 * is stored as the token of this vector's IRQ bypass producer, and the
 * producer is then registered with the IRQ bypass manager.
 */
static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev,
int vector, int fd, bool msix)
{
...
struct eventfd_ctx *trigger;
...
trigger = eventfd_ctx_fdget(fd); // Normally, the fd here is the irqfd
...
/* The eventfd context is the token used to match a consumer. */
vdev->ctx[vector].producer.token = trigger;
vdev->ctx[vector].producer.irq = irq;
ret = irq_bypass_register_producer(&vdev->ctx[vector].producer);
...
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
/**
* irq_bypass_register_producer - register IRQ bypass producer
* @producer: pointer to producer structure
*
* Add the provided IRQ producer to the list of producers and connect
* with any matching token found on the IRQ consumers list.
*/
int irq_bypass_register_producer(struct irq_bypass_producer *producer)
{
struct irq_bypass_producer *tmp;
struct irq_bypass_consumer *consumer;

if (!producer->token)
return -EINVAL;

might_sleep();

if (!try_module_get(THIS_MODULE))
return -ENODEV;

mutex_lock(&lock);

list_for_each_entry(tmp, &producers, node) {
if (tmp->token == producer->token) {
mutex_unlock(&lock);
module_put(THIS_MODULE);
return -EBUSY;
}
}

list_for_each_entry(consumer, &consumers, node) {
if (consumer->token == producer->token) {
int ret = __connect(producer, consumer);
if (ret) {
mutex_unlock(&lock);
module_put(THIS_MODULE);
return ret;
}
break;
}
}

list_add(&producer->node, &producers);

mutex_unlock(&lock);

return 0;
}
EXPORT_SYMBOL_GPL(irq_bypass_register_producer);

irq_bypass_register_consumer

1
2
3
4
kvm_vm_ioctl
└── kvm_irqfd
└── kvm_irqfd_assign
└── irq_bypass_register_consumer
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
/*
 * Excerpt ("..." marks elided code): when kvm_arch_has_irq_bypass() reports
 * support, the irqfd's eventfd is used as the consumer token, the arch
 * callbacks are installed, and the consumer is registered with the IRQ
 * bypass manager.
 */
static int
kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
{
...
#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
if (kvm_arch_has_irq_bypass()) {
/* The eventfd pointer is the token matched against producers. */
irqfd->consumer.token = (void *)irqfd->eventfd;
irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer;
irqfd->consumer.del_producer = kvm_arch_irq_bypass_del_producer;
irqfd->consumer.stop = kvm_arch_irq_bypass_stop;
irqfd->consumer.start = kvm_arch_irq_bypass_start;
ret = irq_bypass_register_consumer(&irqfd->consumer);
/* A failed registration is reported through pr_info(). */
if (ret)
pr_info("irq bypass consumer (token %p) registration fails: %d\n",
irqfd->consumer.token, ret);
}
#endif
...
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
/**
 * irq_bypass_register_consumer - register IRQ bypass consumer
 * @consumer: pointer to consumer structure
 *
 * Put @consumer on the consumers list. If a producer carrying the same
 * token is found on the producers list, the two are connected before
 * @consumer is added.
 *
 * Return: 0 on success; -EINVAL if the token or either of the
 * add_producer/del_producer callbacks is missing; -ENODEV if the module
 * reference cannot be taken; -EBUSY if @consumer or its token is already
 * registered; otherwise the error from __connect().
 */
int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer)
{
	struct irq_bypass_consumer *other;
	struct irq_bypass_producer *prod;
	int err = 0;

	/* A token and both add/del callbacks are mandatory. */
	if (!(consumer->token &&
	      consumer->add_producer && consumer->del_producer))
		return -EINVAL;

	might_sleep();

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	mutex_lock(&lock);

	/* Refuse a duplicate token or a consumer that is already listed. */
	list_for_each_entry(other, &consumers, node) {
		if (other->token == consumer->token || other == consumer) {
			err = -EBUSY;
			goto fail;
		}
	}

	/* Only the first producer with a matching token is connected. */
	list_for_each_entry(prod, &producers, node) {
		if (prod->token != consumer->token)
			continue;

		err = __connect(prod, consumer);
		if (err)
			goto fail;
		break;
	}

	list_add(&consumer->node, &consumers);
	mutex_unlock(&lock);

	return 0;

fail:
	/* Undo in reverse order: drop the lock, then the module reference. */
	mutex_unlock(&lock);
	module_put(THIS_MODULE);
	return err;
}
EXPORT_SYMBOL_GPL(irq_bypass_register_consumer);

vmx_update_pi_irte

当irq bypass的producer和consumer token(eventfd_ctx)匹配成功时,才会调用kvm_arch_irq_bypass_add_producer

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
/*
 * Connect @prod and @cons. @lock must be held by the caller.
 *
 * The optional stop callbacks run first, then the producer's optional
 * add_consumer and the consumer's add_producer. If add_producer fails, the
 * producer's optional del_consumer undoes the first step. The optional
 * start callbacks run in every case, including on failure.
 *
 * Returns 0 on success or the first error reported by an add_* callback.
 */
static int __connect(struct irq_bypass_producer *prod,
		     struct irq_bypass_consumer *cons)
{
	int err = 0;

	if (prod->stop)
		prod->stop(prod);
	if (cons->stop)
		cons->stop(cons);

	if (prod->add_consumer)
		err = prod->add_consumer(prod, cons);
	if (err)
		goto restart;

	err = cons->add_producer(cons, prod);
	/* Roll back the producer side if the consumer refused. */
	if (err && prod->del_consumer)
		prod->del_consumer(prod, cons);

restart:
	if (cons->start)
		cons->start(cons);
	if (prod->start)
		prod->start(prod);

	return err;
}

1
2
3
4
5
6
7
8
9
10
11
/*
 * Consumer-side add_producer callback: remember @prod in the irqfd that
 * embeds @cons, then call the update_pi_irte hook for the producer's host
 * irq and the irqfd's gsi with the "set" argument equal to 1.
 *
 * Returns whatever update_pi_irte returns.
 */
int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
				      struct irq_bypass_producer *prod)
{
	struct kvm_kernel_irqfd *irqfd;

	irqfd = container_of(cons, struct kvm_kernel_irqfd, consumer);
	irqfd->producer = prod;

	return kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq,
					   irqfd->gsi, 1);
}

add_producer其实就是设置irte为Posted-Interrupts而已!

1
2
3
vmx_update_pi_irte
└── irq_set_vcpu_affinity
└── intel_ir_set_vcpu_affinity

1
2
3
4
5
6
7
8
9
10
11
/*
* vmx_update_pi_irte - set IRTE for Posted-Interrupts
*
* @kvm: kvm
* @host_irq: host irq of the interrupt
* @guest_irq: gsi of the interrupt
* @set: set or unset PI
* returns 0 on success, < 0 on failure
*/
static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set)

kvm_arch_update_irqfd_routing

1
2
3
4
kvm_vm_ioctl
└── kvm_set_irq_routing
└── kvm_irq_routing_update
└── kvm_arch_update_irqfd_routing
1
2
3
4
5
6
7
8
/*
 * Forward a routing update to the update_pi_irte hook.
 *
 * Returns -EINVAL when the hook is not provided, otherwise the hook's
 * return value.
 */
int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
				  uint32_t guest_irq, bool set)
{
	if (kvm_x86_ops->update_pi_irte)
		return kvm_x86_ops->update_pi_irte(kvm, host_irq,
						   guest_irq, set);

	return -EINVAL;
}

既然irq bypass的producer和consumer token匹配成功时就调用了update_pi_irte,为什么还要在kvm_irq_routing_update中也调用update_pi_irte呢?其实kvm_irq_routing_update中的这次调用,是在guest内部做irq balance时才会触发的。

以guest内部对msi-x table中断做irq balance为例, qemu的函数调用链如下:

1
2
3
4
5
6
7
8
9
msix_table_mmio_write
└── msix_handle_mask_update
└── msix_fire_vector_notifier
└── vfio_msix_vector_use
└── vfio_msix_vector_do_use
└── vfio_update_kvm_msi_virq
├── kvm_irqchip_update_msi_route
└── kvm_irqchip_commit_routes
└── kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes)

guest内部做irq balance时,可能会更改中断的目标vCPU与vector号,因此需要将guest更改后的vector更新到IRTE中的vv(Virtual Vector)字段。所以在guest内部做irq balance时,需要调用update_pi_irte来更新IRTE。


参考资料:

  1. virt: IRQ bypass manager
  2. include/linux/irqbypass.h
  3. virt: IRQ bypass manager