This article walks through the ioeventfd and irqfd parts of vhost, based on the QEMU and Linux kernel source code.

prerequisite

overview

The ioeventfd is bound to the guest's kick, while the irqfd is bound to interrupt injection.

ioeventfd:

  • Using the KVM_IOEVENTFD ioctl, qemu binds an ioeventfd to the guest kick register address (a pio/mmio address) and the vq index value, and hands that binding to kvm (sketched below)
    • When kvm detects that the guest has written the vq index to the kick register, it writes the eventfd to notify vhost
  • Using the VHOST_SET_VRING_KICK ioctl, qemu passes the same ioeventfd to vhost, which then polls the ioeventfd for writes
    • When vhost sees a write on the ioeventfd, it starts pulling requests from the avail ring, processes the I/O requests, updates the used ring, and finally injects an interrupt into the guest (which requires the irqfd)
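
At the ioctl level, the binding in the first bullet looks roughly like the minimal sketch below. This is not QEMU's actual code: vm_fd, notify_addr and queue_index are placeholders, and the 2-byte datamatch layout assumes a virtio-pci style notify register where the guest writes the vq index.

/* Minimal sketch of the KVM_IOEVENTFD binding described above.
 * vm_fd, notify_addr and queue_index are placeholders, not QEMU's real values. */
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int bind_kick_eventfd(int vm_fd, uint64_t notify_addr, uint16_t queue_index)
{
    int efd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
    struct kvm_ioeventfd kick = {
        .datamatch = queue_index,                   /* only writes of this vq index fire the eventfd */
        .addr      = notify_addr,                   /* guest address of the kick (notify) register */
        .len       = 2,                             /* assumes a 16-bit virtio-pci notify write */
        .fd        = efd,
        .flags     = KVM_IOEVENTFD_FLAG_DATAMATCH,  /* add KVM_IOEVENTFD_FLAG_PIO for a pio address */
    };

    if (ioctl(vm_fd, KVM_IOEVENTFD, &kick) < 0)
        return -1;
    return efd;                                     /* this fd is later handed to vhost as the kick fd */
}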

irqfd:

  • Using the VHOST_SET_VRING_CALL ioctl, qemu passes an irqfd to vhost
    • After updating the used ring, vhost writes the eventfd to ask kvm to inject an interrupt
  • Using the KVM_IRQFD ioctl, qemu binds the irqfd to the vq's interrupt and hands that binding to kvm, which then polls the irqfd for writes (sketched below)
    • When kvm sees a write on the irqfd, it injects an interrupt into the guest according to the interrupt routing information
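
Similarly, a minimal sketch of the KVM_IRQFD binding from the bullet above; vm_fd and gsi are placeholders (in the real setup QEMU resolves the gsi from the vq's MSI-X vector route), and this is not QEMU's actual code.

/* Minimal sketch of the KVM_IRQFD binding described above. */
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int bind_call_eventfd(int vm_fd, unsigned int gsi)
{
    int efd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
    struct kvm_irqfd irqfd = {
        .fd  = efd,     /* writes to this eventfd ... */
        .gsi = gsi,     /* ... raise this interrupt route in the guest */
    };

    if (ioctl(vm_fd, KVM_IRQFD, &irqfd) < 0)
        return -1;
    return efd;         /* this fd is later handed to vhost as the call fd */
}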

ioeventfd

Associating the ioeventfd on the qemu side

virtio_bus_start_ioeventfd
└── virtio_device_start_ioeventfd_impl[start_ioeventfd]
    ├── event_notifier_set(&vq->host_notifier)
    └── memory_region_transaction_commit
        └── address_space_update_ioeventfds
            └── address_space_add_del_ioeventfds
                ├── kvm_io_ioeventfd_add[eventfd_add] // pio
                │   └── kvm_set_ioeventfd_pio
                │       └── kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick)
                └── kvm_mem_ioeventfd_add[eventfd_add] // mmio
                    └── kvm_set_ioeventfd_mmio
                        └── kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd)
vhost_virtqueue_start
├── event_notifier_get_fd(virtio_queue_get_host_notifier(vvq))
└── vhost_kernel_set_vring_kick[vhost_set_vring_kick]
    └── vhost_kernel_call(dev, VHOST_SET_VRING_KICK, file)
struct VirtQueue
{
    ...
    EventNotifier host_notifier;
    ...
}

As the code above shows, on the qemu side the host_notifier's ioeventfd ties vhost and kvm together (see the sketch after this list):

  • vhost polls the ioeventfd
  • kvm writes the ioeventfd to notify vhost of the guest's kick
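
A hedged sketch of how the very same eventfd is then handed to vhost; vhost_fd, the queue index and kick_efd are placeholders, not QEMU's actual code path.

/* Hedged sketch: hand the kick eventfd (already registered with KVM_IOEVENTFD)
 * to vhost. vhost_fd is an open /dev/vhost-net (or similar) fd. */
#include <sys/ioctl.h>
#include <linux/vhost.h>

static int set_vring_kick(int vhost_fd, unsigned int queue_index, int kick_efd)
{
    struct vhost_vring_file file = {
        .index = queue_index,   /* which virtqueue this kick fd belongs to */
        .fd    = kick_efd,      /* the eventfd KVM writes when the guest kicks */
    };
    return ioctl(vhost_fd, VHOST_SET_VRING_KICK, &file);
}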

ioeventfd handling on the kvm side

See Dive into ioeventfd (KVM side) mechanism.

ioeventfd handling on the vhost side

long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
{
    ...
    switch (ioctl) {
    ...
    case VHOST_SET_VRING_KICK:
        if (copy_from_user(&f, argp, sizeof f)) {
            r = -EFAULT;
            break;
        }
        eventfp = f.fd == VHOST_FILE_UNBIND ? NULL : eventfd_fget(f.fd);
        if (IS_ERR(eventfp)) {
            r = PTR_ERR(eventfp);
            break;
        }
        if (eventfp != vq->kick) {
            pollstop = (filep = vq->kick) != NULL;
            pollstart = (vq->kick = eventfp) != NULL;
        } else
            filep = eventfp;
        break;
    ...
    }
    ...
    if (pollstart && vq->handle_kick)
        r = vhost_poll_start(&vq->poll, vq->kick);
    ...
}

Using the VHOST_SET_VRING_KICK ioctl, qemu passes the ioeventfd to vhost, and vhost then starts polling it (vhost_poll_start).

/* Start polling a file. We add ourselves to file's wait queue. The caller must
 * keep a reference to a file until after vhost_poll_stop is called. */
int vhost_poll_start(struct vhost_poll *poll, struct file *file)
{
    __poll_t mask;

    if (poll->wqh)
        return 0;

    mask = vfs_poll(file, &poll->table); // invokes the vhost_poll_func callback
    if (mask)
        vhost_poll_wakeup(&poll->wait, 0, 0, poll_to_key(mask));
    if (mask & EPOLLERR) {
        vhost_poll_stop(poll);
        return -EINVAL;
    }

    return 0;
}
EXPORT_SYMBOL_GPL(vhost_poll_start);

When a write lands on the ioeventfd, vhost's poll wakes up and the vhost_poll_wakeup callback is triggered.

vhost_poll_wakeup
└── vhost_poll_queue
    └── vhost_vq_work_queue
        └── vhost_worker_queue
            ├── llist_add(&work->node, &worker->work_list)
            └── vhost_task_wake(worker->vtsk)
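
For reference, vhost_poll_wakeup itself is small. A simplified version (newer kernels add a few extra checks, e.g. for devices that do not use a worker) does roughly this:

/* Simplified sketch of vhost_poll_wakeup; exact code differs across kernel versions. */
static int vhost_poll_wakeup(wait_queue_entry_t *wait, unsigned mode,
                             int sync, void *key)
{
    struct vhost_poll *poll = container_of(wait, struct vhost_poll, wait);

    if (!(key_to_poll(key) & poll->mask))   /* only care about EPOLLIN on the kick fd */
        return 0;

    vhost_poll_queue(poll);                 /* queue poll->work and wake the vhost worker */
    return 0;
}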

So what does worker->vtsk do next? Let's look at how worker->vtsk is created and which function it runs.

static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev)
{
    struct vhost_worker *worker;
    struct vhost_task *vtsk;
    char name[TASK_COMM_LEN];
    int ret;
    u32 id;

    worker = kzalloc(sizeof(*worker), GFP_KERNEL_ACCOUNT);
    if (!worker)
        return NULL;

    snprintf(name, sizeof(name), "vhost-%d", current->pid);

    vtsk = vhost_task_create(vhost_worker, worker, name);
    if (!vtsk)
        goto free_worker;

    mutex_init(&worker->mutex);
    init_llist_head(&worker->work_list);
    worker->kcov_handle = kcov_common_handle();
    worker->vtsk = vtsk;
    ...
}

static bool vhost_worker(void *data)
{
    struct vhost_worker *worker = data;
    struct vhost_work *work, *work_next;
    struct llist_node *node;

    node = llist_del_all(&worker->work_list);
    if (node) {
        __set_current_state(TASK_RUNNING);

        node = llist_reverse_order(node);
        /* make sure flag is seen after deletion */
        smp_wmb();
        llist_for_each_entry_safe(work, work_next, node, node) {
            clear_bit(VHOST_WORK_QUEUED, &work->flags);
            kcov_remote_start_common(worker->kcov_handle);
            work->fn(work); // vq->handle_kick
            kcov_remote_stop();
            cond_resched();
        }
    }

    return !!node;
}

During initialization, vhost creates a kernel thread named vhost-$pid, where $pid is the pid of the QEMU process. This thread is known as the "vhost worker thread".
The worker thread runs vhost_worker, and vhost_worker in turn invokes the vq->handle_kick callback.

So how are vhost_poll, vq->handle_kick, and friends initialized in the first place?

void vhost_dev_init(struct vhost_dev *dev,
                    struct vhost_virtqueue **vqs, int nvqs,
                    int iov_limit, int weight, int byte_weight,
                    bool use_worker,
                    int (*msg_handler)(struct vhost_dev *dev, u32 asid,
                                       struct vhost_iotlb_msg *msg))
{
    struct vhost_virtqueue *vq;
    int i;

    dev->vqs = vqs;
    dev->nvqs = nvqs;
    mutex_init(&dev->mutex);
    dev->log_ctx = NULL;
    dev->umem = NULL;
    dev->iotlb = NULL;
    dev->mm = NULL;
    dev->iov_limit = iov_limit;
    dev->weight = weight;
    dev->byte_weight = byte_weight;
    dev->use_worker = use_worker;
    dev->msg_handler = msg_handler;
    init_waitqueue_head(&dev->wait);
    INIT_LIST_HEAD(&dev->read_list);
    INIT_LIST_HEAD(&dev->pending_list);
    spin_lock_init(&dev->iotlb_lock);
    xa_init_flags(&dev->worker_xa, XA_FLAGS_ALLOC);

    for (i = 0; i < dev->nvqs; ++i) {
        vq = dev->vqs[i];
        vq->log = NULL;
        vq->indirect = NULL;
        vq->heads = NULL;
        vq->dev = dev;
        mutex_init(&vq->mutex);
        vhost_vq_reset(dev, vq);
        if (vq->handle_kick)
            vhost_poll_init(&vq->poll, vq->handle_kick,
                            EPOLLIN, dev, vq);
    }
}

When a vhost device is initialized, vhost_poll_init is called for each vq to set up vhost_poll, vq->handle_kick, and related state.

/* Init poll structure */
void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
                     __poll_t mask, struct vhost_dev *dev,
                     struct vhost_virtqueue *vq)
{
    init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup); // wait-queue callback, fired when vhost sees a write on the ioeventfd
    init_poll_funcptr(&poll->table, vhost_poll_func); // vhost_poll_func is called back during vfs_poll and adds us to the file's wait queue
    poll->mask = mask;
    poll->dev = dev;
    poll->wqh = NULL;
    poll->vq = vq;

    vhost_work_init(&poll->work, fn); // fn is vq->handle_kick, stored as work->fn and called back from vhost_worker
}

static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh, // called back during vfs_poll; adds us to the file's wait queue
                            poll_table *pt)
{
    struct vhost_poll *poll;

    poll = container_of(pt, struct vhost_poll, table);
    poll->wqh = wqh;
    add_wait_queue(wqh, &poll->wait);
}
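
Note that vhost_poll_init only consumes vq->handle_kick; the callback itself is installed by the concrete vhost device before vhost_dev_init runs. For example, vhost-net does roughly the following in vhost_net_open (simplified excerpt; details vary by kernel version):

/* Simplified sketch of vhost_net_open (drivers/vhost/net.c). */
static int vhost_net_open(struct inode *inode, struct file *f)
{
    ...
    /* install the kick handlers before vhost_dev_init -> vhost_poll_init runs */
    n->vqs[VHOST_NET_VQ_TX].vq.handle_kick = handle_tx_kick;
    n->vqs[VHOST_NET_VQ_RX].vq.handle_kick = handle_rx_kick;
    vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX, ...);
    ...
}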
/* The virtqueue structure describes a queue attached to a device. */
struct vhost_virtqueue {
    struct vhost_dev *dev;
    struct vhost_worker __rcu *worker;
    ...
    struct vhost_poll poll;
    /* The routine to call when the Guest pings us, or timeout. */
    vhost_work_fn_t handle_kick;
    ...
}

To summarize the vhost-side flow of polling the ioeventfd:

  1. vhost_poll_func makes vhost poll the ioeventfd by adding it to the file's wait queue
  2. kvm writes the ioeventfd to notify vhost
  3. vhost's vhost_poll_wakeup callback fires, adds the work to the work list, and wakes the vhost worker thread
  4. the vhost worker thread calls back vq->handle_kick and processes the I/O requests in the vq (a minimal user-space sketch of this wait/notify pattern follows below)
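
The machinery underneath these four steps is plain eventfd plus a wait queue. The sketch below is purely illustrative user-space code, not vhost or kvm code: one thread plays "kvm" and signals the eventfd, while the main thread plays "vhost" and polls it.

/* Minimal user-space analogue of the kick path. Illustrative only. */
#include <stdint.h>
#include <stdio.h>
#include <poll.h>
#include <pthread.h>
#include <unistd.h>
#include <sys/eventfd.h>

static void *kvm_side(void *arg)
{
    int efd = *(int *)arg;
    uint64_t one = 1;
    sleep(1);
    write(efd, &one, sizeof(one));      /* "guest kicked": KVM-side eventfd write */
    return NULL;
}

int main(void)
{
    int efd = eventfd(0, 0);
    pthread_t t;
    pthread_create(&t, NULL, kvm_side, &efd);

    struct pollfd pfd = { .fd = efd, .events = POLLIN };
    poll(&pfd, 1, -1);                  /* "vhost side": sleep until the kick arrives */

    uint64_t cnt;
    read(efd, &cnt, sizeof(cnt));       /* consume the counter, then handle the vq */
    printf("kick received, counter=%llu\n", (unsigned long long)cnt);

    pthread_join(t, NULL);
    return 0;
}

In vhost, the "poll" step is the wait-queue entry registered by vhost_poll_func, and the handler step is vq->handle_kick run in the vhost worker thread.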

irqfd

Associating the irqfd on the qemu side

virtio_pci_set_guest_notifiers
└── kvm_virtio_pci_vector_use
    └── kvm_virtio_pci_irqfd_use
        ├── virtio_queue_get_guest_notifier
        └── kvm_irqchip_add_irqfd_notifier_gsi
            └── kvm_irqchip_assign_irqfd
                └── kvm_vm_ioctl(s, KVM_IRQFD, &irqfd)
vhost_virtqueue_start
└── vhost_virtqueue_mask
    ├── event_notifier_get_wfd(virtio_queue_get_guest_notifier(vvq))
    └── vhost_kernel_set_vring_call[vhost_set_vring_call]
        └── vhost_kernel_call(dev, VHOST_SET_VRING_CALL, file)
struct VirtQueue
{
    ...
    EventNotifier guest_notifier;
    ...
}

As the code above shows, on the qemu side the guest_notifier's irqfd ties vhost and kvm together (see the sketch after this list):

  • kvm polls the irqfd and then injects interrupts into the VM
  • after updating the used ring, vhost writes the irqfd to ask kvm to inject an interrupt
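
A hedged sketch of the VHOST_SET_VRING_CALL side; vhost_fd, the queue index and call_efd are placeholders, and it mirrors the kick setup shown earlier.

/* Hedged sketch: hand the call eventfd (already registered with KVM_IRQFD) to vhost. */
#include <sys/ioctl.h>
#include <linux/vhost.h>

static int set_vring_call(int vhost_fd, unsigned int queue_index, int call_efd)
{
    struct vhost_vring_file file = {
        .index = queue_index,   /* which virtqueue this call fd belongs to */
        .fd    = call_efd,      /* the eventfd vhost writes to request an interrupt */
    };
    return ioctl(vhost_fd, VHOST_SET_VRING_CALL, &file);
}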

irqfd handling on the kvm side

See Dive into irqfd (KVM side) mechanism.

irqfd handling on the vhost side

long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
{
    ...
    switch (ioctl) {
    ...
    case VHOST_SET_VRING_CALL:
        if (copy_from_user(&f, argp, sizeof f)) {
            r = -EFAULT;
            break;
        }
        ctx = f.fd == VHOST_FILE_UNBIND ? NULL : eventfd_ctx_fdget(f.fd);
        if (IS_ERR(ctx)) {
            r = PTR_ERR(ctx);
            break;
        }

        swap(ctx, vq->call_ctx.ctx);
        break;
    ...
    }
    ...
}

qemu calls VHOST_SET_VRING_CALL to pass the irqfd to vhost.

/* This actually signals the guest, using eventfd. */
void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
{
    /* Signal the Guest tell them we used something up. */
    if (vq->call_ctx.ctx && vhost_notify(dev, vq))
        eventfd_signal(vq->call_ctx.ctx, 1);
}
EXPORT_SYMBOL_GPL(vhost_signal);

After vhost has finished processing the I/O requests and updated the used ring, it calls vhost_signal, which writes the irqfd; when kvm sees the write, it injects an interrupt into the VM.
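
One detail worth keeping in mind: vhost_notify may decide that no interrupt is needed at all. When VIRTIO_RING_F_EVENT_IDX has been negotiated, it compares the new used index with the event index published by the guest, essentially via the standard helper below (from include/uapi/linux/virtio_ring.h); when the check says no, the irqfd is not written and no interrupt is injected.

/* Event-suppression check from the virtio ring UAPI header: signal only if
 * new_idx has passed the event index the guest asked to be notified at. */
static inline int vring_need_event(__u16 event_idx, __u16 new_idx, __u16 old)
{
    return (__u16)(new_idx - event_idx - 1) < (__u16)(new_idx - old);
}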