本文将结合virtio spec、qemu与Linux kernel源码深入解析virtio-blk resize的原理。

本文参考的virtio spec是0.9.5,qemu版本为v2.6.0,Linux kernel版本为v4.19

1. overview

virtio-blk后端设备resize后,QEMU通过MSI-X的configuration vector给guest发送中断;guest收到中断后,中断handler会把后续处理交给workqueue,由它重新读取紧跟在virtio header之后的device-specific configuration(struct virtio_blk_config)中的capacity field来完成resize操作。

2. 基础知识

在MSI-X使能的情况下,virtio header中有configuration vector这个field。guest配置MSI-X table时,会配置好configuration vector。

当后端设备配置发生变化时,会触发configuration vector对应的中断。

对于virtio-blk设备,紧跟在virtio header之后的device-specific configuration(即下面的struct virtio_blk_config)中的capacity存放了size信息(以512字节的sector为单位)。当resize时,capacity会发生变化。

struct virtio_blk_config的定义如下(Linux kernel: include/uapi/linux/virtio_blk.h):

/*
 * Device configuration layout of a virtio block device.
 *
 * capacity comes first and is the field re-read by the guest when the disk is
 * resized. Each later field is guarded by the feature flag named in its own
 * comment. The structure is packed, so every member sits at the running sum
 * of the sizes of the members before it.
 */
struct virtio_blk_config {
/* The capacity (in 512-byte sectors). */
__u64 capacity;
/* The maximum segment size (if VIRTIO_BLK_F_SIZE_MAX) */
__u32 size_max;
/* The maximum number of segments (if VIRTIO_BLK_F_SEG_MAX) */
__u32 seg_max;
/* geometry of the device (if VIRTIO_BLK_F_GEOMETRY) */
struct virtio_blk_geometry {
__u16 cylinders;
__u8 heads;
__u8 sectors;
} geometry;

/* block size of device (if VIRTIO_BLK_F_BLK_SIZE) */
__u32 blk_size;

/* the next 4 entries are guarded by VIRTIO_BLK_F_TOPOLOGY */
/* exponent for physical block per logical block. */
__u8 physical_block_exp;
/* alignment offset in logical blocks. */
__u8 alignment_offset;
/* minimum I/O size without performance penalty in logical blocks. */
__u16 min_io_size;
/* optimal sustained I/O size in logical blocks. */
__u32 opt_io_size;

/* writeback mode (if VIRTIO_BLK_F_CONFIG_WCE) */
__u8 wce;
__u8 unused;

/* number of vqs, only available when VIRTIO_BLK_F_MQ is set */
__u16 num_queues;
} __attribute__((packed));

3. virtio-blk后端设备resize

首先需要完成virtio-blk后端设备的resize,比如virtio-blk后端设备是一个文件,那么需要将这个文件resize!virtio-blk后端设备的形式较多,不在本文描述范围之内。

4. QEMU发送configuration vector中断

4.1 hmp block_resize命令

hmp_block_resize
└── qmp_block_resize
    └── bdrv_truncate
        └── blk_dev_resize_cb
            └── virtio_blk_resize[resize_cb]

hmp block_resize命令的函数调用链如上所示,最终会调用到virtio_blk_resize函数。

4.2 virtio_blk_resize发送configuration vector中断

virtio_blk_resize
└── virtio_notify_config
    └── virtio_notify_vector(vdev, vdev->config_vector)
        └── virtio_pci_notify
            └── msix_notify
                ├── msix_get_message
                └── msi_send_message

最终qemu会调用msi_send_message往guest注入configuration vector中断(本质上是模拟memory write TLP)。

5. guest处理configuration vector中断

5.1 guest注册中断

virtio-pci驱动在申请MSI-X vector时,通过request_irq把vp_config_changed注册为中断处理函数(drivers/virtio/virtio_pci_common.c):

/*
 * Excerpt (the surrounding code is elided with "..."): installs
 * vp_config_changed as the handler for the IRQ that pci_irq_vector() returns
 * for vector index v, and passes vp_dev as the handler's opaque argument.
 * NOTE(review): presumably v is the configuration-change vector at this
 * point; the code that sets v is elided - confirm against the full source.
 */
static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors,
bool per_vq_vectors, struct irq_affinity *desc)
{
...
err = request_irq(pci_irq_vector(vp_dev->pci_dev, v),
vp_config_changed, 0, vp_dev->msix_names[v],
vp_dev);
...
}

vp_config_changed只是调用virtio_config_changed(drivers/virtio/virtio_pci_common.c):

/*
 * Handle a configuration change: Tell driver if it wants to know.
 *
 * @irq:    interrupt number (not used in the body).
 * @opaque: the struct virtio_pci_device this interrupt belongs to.
 *
 * Forwards the event to virtio_config_changed() for the embedded
 * virtio_device and always returns IRQ_HANDLED.
 */
static irqreturn_t vp_config_changed(int irq, void *opaque)
{
struct virtio_pci_device *vp_dev = opaque;

virtio_config_changed(&vp_dev->vdev);
return IRQ_HANDLED;
}

virtio_config_changed在持有config_lock的情况下调用__virtio_config_changed,后者再调用driver的config_changed回调(drivers/virtio/virtio.c):

/*
 * Report a configuration change on @dev.
 *
 * Wraps __virtio_config_changed() in dev->config_lock, taken with
 * spin_lock_irqsave() and released with spin_unlock_irqrestore(), so the
 * notification logic runs under that lock.
 */
void virtio_config_changed(struct virtio_device *dev)
{
unsigned long flags;

spin_lock_irqsave(&dev->config_lock, flags);
__virtio_config_changed(dev);
spin_unlock_irqrestore(&dev->config_lock, flags);
}
EXPORT_SYMBOL_GPL(virtio_config_changed);

/*
 * Deliver (or defer) a configuration-change notification for @dev.
 *
 * While dev->config_enabled is false the event is only recorded in
 * dev->config_change_pending. Otherwise the config_changed callback of the
 * driver bound to the device is invoked, if a driver is bound and it
 * provides one. virtio_config_changed() calls this with dev->config_lock held.
 */
static void __virtio_config_changed(struct virtio_device *dev)
{
struct virtio_driver *drv = drv_to_virtio(dev->dev.driver);

if (!dev->config_enabled)
dev->config_change_pending = true;
else if (drv && drv->config_changed)
drv->config_changed(dev); /* for virtio-blk this is virtblk_config_changed */
}

virtio-blk驱动注册的config_changed回调是virtblk_config_changed(drivers/block/virtio_blk.c):

/*
 * Excerpt of the virtio-blk driver descriptor (other members elided with
 * "..."): its config_changed callback is virtblk_config_changed.
 */
static struct virtio_driver virtio_blk = {
...
.config_changed = virtblk_config_changed,
...
};

virtblk_config_changed只是把config_work放入workqueue,而config_work在virtblk_probe中被初始化为virtblk_config_changed_work:

/*
 * config_changed callback of the virtio-blk driver.
 *
 * Does not process the change itself: it only queues vblk->config_work on
 * virtblk_wq, so the work function handles the change later.
 */
static void virtblk_config_changed(struct virtio_device *vdev)
{
struct virtio_blk *vblk = vdev->priv;

queue_work(virtblk_wq, &vblk->config_work);
}

/*
 * Excerpt (rest of the probe routine elided with "..."): initialises
 * vblk->config_work so that running it calls virtblk_config_changed_work.
 */
static int virtblk_probe(struct virtio_device *vdev)
{
...
INIT_WORK(&vblk->config_work, virtblk_config_changed_work);
...
}

5.2 中断handler处理resize

virtblk_config_changed_work
└── virtblk_update_capacity
    └── virtio_cread

virtblk_update_capacity通过virtio_cread读取capacity(drivers/block/virtio_blk.c):

/*
 * The queue's logical block size must be set before calling this.
 *
 * Excerpt (remainder elided with "..."): the part shown reads the capacity
 * member of struct virtio_blk_config from the device into the local
 * variable capacity by means of virtio_cread().
 */
static void virtblk_update_capacity(struct virtio_blk *vblk, bool resize)
{
struct virtio_device *vdev = vblk->vdev;
struct request_queue *q = vblk->disk->queue;
char cap_str_2[10], cap_str_10[10];
unsigned long long nblocks;
u64 capacity;

/* Host must always specify the capacity. */
virtio_cread(vdev, struct virtio_blk_config, capacity, &capacity);
...
}

virtio_cread(vdev, struct virtio_blk_config, capacity, &capacity)其实就是读device-specific configuration(struct virtio_blk_config)中的capacity field。由于这块配置空间是由qemu模拟的,guest对它的访问会发生VM Exit并trap到qemu中。

virtio_cread及其用到的helper定义如下(include/linux/virtio_config.h):

/*
 * Config space accessors.
 *
 * virtio_cread - read one member of a device config structure into *ptr.
 * @vdev:       device to read from
 * @structname: type of the config structure, e.g. struct virtio_blk_config
 * @member:     member of @structname to read
 * @ptr:        destination of the value
 *
 * The read starts at offsetof(structname, member). sizeof(*ptr) selects the
 * accessor: 1, 2, 4 or 8 bytes map to virtio_cread8/16/32/64, and any other
 * size ends in BUG(). typecheck() compares the type of *ptr with the type of
 * the member.
 * NOTE(review): presumably typecheck() only serves to surface a type mismatch
 * at build time and the "(*ptr) = 1" branch never runs - confirm against the
 * definition of typecheck().
 */
#define virtio_cread(vdev, structname, member, ptr) \
do { \
/* Must match the member's type, and be integer */ \
if (!typecheck(typeof((((structname*)0)->member)), *(ptr))) \
(*ptr) = 1; \
\
switch (sizeof(*ptr)) { \
case 1: \
*(ptr) = virtio_cread8(vdev, \
offsetof(structname, member)); \
break; \
case 2: \
*(ptr) = virtio_cread16(vdev, \
offsetof(structname, member)); \
break; \
case 4: \
*(ptr) = virtio_cread32(vdev, \
offsetof(structname, member)); \
break; \
case 8: \
*(ptr) = virtio_cread64(vdev, \
offsetof(structname, member)); \
break; \
default: \
BUG(); \
} \
} while(0)


/*
 * Read one 8-byte field at @offset of @vdev's config space.
 *
 * The raw value is fetched through __virtio_cread_many() (count 1,
 * sizeof(u64) bytes) and then converted with virtio64_to_cpu() before it
 * is returned.
 */
static inline u64 virtio_cread64(struct virtio_device *vdev,
unsigned int offset)
{
u64 ret;
__virtio_cread_many(vdev, offset, &ret, 1, sizeof(ret));
return virtio64_to_cpu(vdev, (__force __virtio64)ret);
}

/*
 * Read @count fields, @bytes each.
 *
 * Field i is read from config offset @offset + @bytes * i into
 * @buf + i * @bytes through the config->get() callback.
 *
 * The generation value is sampled before and after the reads, and the whole
 * batch is repeated until both samples are equal. If the config ops provide
 * no generation callback, 0 is used for both samples, so the batch is read
 * exactly once.
 */
static inline void __virtio_cread_many(struct virtio_device *vdev,
unsigned int offset,
void *buf, size_t count, size_t bytes)
{
u32 old, gen = vdev->config->generation ?
vdev->config->generation(vdev) : 0;
int i;

do {
old = gen;

for (i = 0; i < count; i++)
vdev->config->get(vdev, offset + bytes * i,
buf + i * bytes, bytes);

gen = vdev->config->generation ?
vdev->config->generation(vdev) : 0;
} while (gen != old);
}

6. qemu完成virtio header中capacity field的模拟

以virtio_config_readb为例(hw/virtio/virtio.c),qemu在返回数据之前会先调用get_config刷新vdev->config:

/*
 * Read one byte of the device config at offset addr.
 *
 * Returns (uint32_t)-1 when addr + 1 exceeds vdev->config_len. Otherwise the
 * device class's get_config callback is run first to refresh vdev->config,
 * and the byte at vdev->config + addr is returned.
 */
uint32_t virtio_config_readb(VirtIODevice *vdev, uint32_t addr)
{
VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
uint8_t val;

if (addr + sizeof(val) > vdev->config_len) {
return (uint32_t)-1;
}

k->get_config(vdev, vdev->config);// for virtio-blk this is virtio_blk_update_config

val = ldub_p(vdev->config + addr);
return val;
}

对于virtio-blk,get_config对应virtio_blk_update_config(hw/block/virtio-blk.c),它会把最新的capacity写入config:

/* coalesce internal state, copy to pci i/o region 0
 *
 * Builds a fresh struct virtio_blk_config on the stack from the current
 * block backend state and copies it into config. The capacity field is
 * taken from blk_get_geometry() on every call.
 */
static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
{
VirtIOBlock *s = VIRTIO_BLK(vdev);
BlockConf *conf = &s->conf.conf;
struct virtio_blk_config blkcfg;
uint64_t capacity;
int blk_size = conf->logical_block_size;

blk_get_geometry(s->blk, &capacity);
/* Start from all zeroes so every field not set below reads as 0. */
memset(&blkcfg, 0, sizeof(blkcfg));
virtio_stq_p(vdev, &blkcfg.capacity, capacity);
virtio_stl_p(vdev, &blkcfg.seg_max, 128 - 2);
virtio_stw_p(vdev, &blkcfg.geometry.cylinders, conf->cyls);
virtio_stl_p(vdev, &blkcfg.blk_size, blk_size);
virtio_stw_p(vdev, &blkcfg.min_io_size, conf->min_io_size / blk_size);
/*
 * NOTE(review): this field is stored with the same virtio_stw_p helper as
 * the __u16 min_io_size above. If struct virtio_blk_config here declares
 * opt_io_size as a 32-bit member (as the Linux UAPI header does), this is
 * presumably a too-narrow store; confirm against the helper definitions
 * and later QEMU releases.
 */
virtio_stw_p(vdev, &blkcfg.opt_io_size, conf->opt_io_size / blk_size);
blkcfg.geometry.heads = conf->heads;
/*
 * We must ensure that the block device capacity is a multiple of
 * the logical block size. If that is not the case, let's use
 * sector_mask to adopt the geometry to have a correct picture.
 * For those devices where the capacity is ok for the given geometry
 * we don't touch the sector value of the geometry, since some devices
 * (like s390 dasd) need a specific value. Here the capacity is already
 * cyls*heads*secs*blk_size and the sector value is not block size
 * divided by 512 - instead it is the amount of blk_size blocks
 * per track (cylinder).
 */
if (blk_getlength(s->blk) / conf->heads / conf->secs % blk_size) {
blkcfg.geometry.sectors = conf->secs & ~s->sector_mask;
} else {
blkcfg.geometry.sectors = conf->secs;
}
blkcfg.size_max = 0;
blkcfg.physical_block_exp = get_physical_block_exp(conf);
blkcfg.alignment_offset = 0;
blkcfg.wce = blk_enable_write_cache(s->blk);
memcpy(config, &blkcfg, sizeof(struct virtio_blk_config));
}

参考资料:

  1. qemu block_resize(动态修改磁盘大小)实现简记
  2. Online resizing block devices and file systems
  3. Is online disk resize possible with KVM?