当使用KVM pass-thru设备时,通常会设置intel_iommu=on iommu=pt内核参数,其中intel_iommu=on就是使能intel iommu,本文将介绍iommu=pt

本文参考的内核版本是v5.0

identity mapping指的是iova与hpa 1:1映射。

1. Motivation

The pt option enables IOMMU translation only for devices used in pass-through; it does not enable IOMMU translation for devices used by the host. This improves performance for host PCIe devices (those that are not passed through to a VM).

内核的注释:

1
2
3
4
5
6
7
8
/*
* This variable becomes 1 if iommu=pt is passed on the kernel command line.
* If this variable is 1, IOMMU implementations do no DMA translation for
* devices and allow every device to access to whole physical memory. This is
* useful if a user wants to use an IOMMU only for KVM device assignment to
* guests and not for driver dma translation.
*
* No initializer: as a file-scope object it starts at 0, so pass-through
* stays off until iommu_setup() sees "pt" on the command line.
*/
int iommu_pass_through __read_mostly;

2. 源码解析

2.1 pt option解析

1
2
3
4
5
6
7
static __init int iommu_setup(char *p)
{
...
if (!strncmp(p, "pt", 2))
iommu_pass_through = 1;
...
}

2.2 init_dmars

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
#define IDENTMAP_ALL        1


/*
* This domain is a statically identity mapping domain.
* 1. This domain creates a static 1:1 mapping to all usable memory.
* 2. It maps to each iommu if successful.
* 3. Each iommu maps to this domain if successful.
*/
static struct dmar_domain *si_domain;
/*
* Starts at 1; init_dmars() clears it to 0 when ecap_pass_through() reports
* that an IOMMU's extended capabilities lack the pass-through bit.
*/
static int hw_pass_through = 1;

static int __init init_dmars(void)
{
...
if (!ecap_pass_through(iommu->ecap))
hw_pass_through = 0;
...
if (iommu_pass_through)
iommu_identity_mapping |= IDENTMAP_ALL;
...
if (iommu_identity_mapping) {
ret = si_domain_init(hw_pass_through);
if (ret)
goto free_iommu;
}
...
if (iommu_identity_mapping) {
ret = iommu_prepare_static_identity_mapping(hw_pass_through);
if (ret) {
pr_crit("Failed to setup IOMMU pass-through\n");
goto free_iommu;
}
}
...
}

#define ecap_pass_through(e) ((e >> 6) & 0x1)

ecap_pass_through(iommu->ecap)的含义是检查Extended Capability Register的PT field。

如果Hardware supports pass-through translation type,那么hw_pass_through为1;否则hw_pass_through为0。

iommu_pass_through被设置时,iommu_identity_mapping也会被设置。接着会依次调用si_domain_init和iommu_prepare_static_identity_mapping。

2.3 si_domain_init

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
/*
 * Allocate and initialize the static identity domain (si_domain).
 *
 * @hw: non-zero when the caller reports hardware pass-through support
 *      (init_dmars() passes hw_pass_through here).
 *
 * Returns 0 on success, -EFAULT if the domain cannot be allocated or
 * initialized, or the first non-zero result of iommu_domain_identity_map().
 */
static int __init si_domain_init(int hw)
{
int nid, ret = 0;

si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
if (!si_domain)
return -EFAULT;

if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
domain_exit(si_domain);
return -EFAULT;
}

pr_debug("Identity mapping domain allocated\n");

/* With hw set, return before building any 1:1 mappings. */
if (hw)
return 0;

for_each_online_node(nid) { // iterate over every online node (NUMA)
unsigned long start_pfn, end_pfn;
int i;

for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
ret = iommu_domain_identity_map(si_domain, // map iova to hpa 1:1 for this pfn range
PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
if (ret)
return ret;
}
}

return 0;
}

从上述代码可知,当hw_pass_through为1时,无需建立iova与hpa 1:1映射的iommu页表;否则需要对all usable memory建立iova与hpa 1:1映射的iommu页表。

2.4 iommu_prepare_static_identity_mapping

1
2
3
4
5
6
iommu_prepare_static_identity_mapping
└── dev_prepare_static_identity_mapping
└── domain_add_dev_info
└── dmar_insert_one_dev_info
└── domain_context_mapping
└── domain_context_mapping_one
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
static int domain_context_mapping_one(struct dmar_domain *domain,
struct intel_iommu *iommu,
u8 bus, u8 devfn)
{
...
// 设置translation type 为 pass through
if (hw_pass_through && domain_type_is_si(domain))
translation = CONTEXT_TT_PASS_THROUGH;
...
// 获取这个设备在context table里面的地址
context = iommu_context_addr(iommu, bus, devfn, 1);
...
struct dma_pte *pgd = domain->pgd; // iova页表基址
int agaw;

context_set_domain_id(context, did);
// 设置转换类型
context_set_translation_type(context, translation);

// 下面代码可以看出pass through模式不会设置iova页表地址
if (translation != CONTEXT_TT_PASS_THROUGH) {
...
// 非pass through模式下需要设置iova页表的基地址
context_set_address_root(context, virt_to_phys(pgd));
context_set_address_width(context, agaw);
} else {
/*
* In pass through mode, AW must be programmed to
* indicate the largest AGAW value supported by
* hardware. And ASR is ignored by hardware.
*/
context_set_address_width(context, iommu->msagaw);
}
...
}

#define CONTEXT_TT_PASS_THROUGH 2

根据Intel VT-d spec,context entry的Translation Type(TT)字段取值为10b时表示pass-through,因此CONTEXT_TT_PASS_THROUGH为10b,即是2。

3. 总结

配置了iommu=pt就会实现identity mapping:

  • 如果Hardware supports pass-through translation type,则配置pass-through translation type即可实现identity mapping,此时无需配置iommu页表;
  • 如果Hardware doesn’t support pass-through translation type,则需要配置iommu页表,使得iova与hpa 1:1映射。

hw_pass_through=0时,依然要走iommu页表,因此性能是不如hw_pass_through=1的。


参考资料:

  1. iommu passthrough分析
  2. Intel VT-d spec
  3. IOMMU(二)-从配置说起
  4. PCI_Passthrough PT Mode
  5. Configuring a Host for PCI Passthrough
  6. 深入了解iommu系列一:iommu硬件架构和驱动初始化
  7. 深入了解iommu系列二:iommu 工作原理解析之dma remapping