本文内容主要转载自Linux Kernel Userfaultfd 内部机制探究,看代码的时候结合man userfaultfdman ioctl_userfaultfd,方便查询。

1. Introduction

Userfaults allow the implementation of on-demand paging from userland and more generally they allow userland to take control of various memory page faults, something otherwise only the kernel code could do.

userfaultfd是kernel中提供的一种特殊的处理page fault的机制,能够让用户态程序自行处理自己的page fault。

它的调用方式是通过一个userfaultfd的syscall新建一个 fd,然后用ioctl等syscall来调用相关的API。该机制的初衷是为了方便虚拟机的post-copy live migration。

2. 工作流程


左侧的 Faulting thread,mm core,userfaultfd 是属于同一个(内核)线程,右边的 uffd monitor 是属于另一(内核)线程,它们在用户态应该表现为共享地址空间的2个线程。

在开始时,faulting 线程读取了一块未分配物理页的内存,触发了page fault,此时进到内核中进行处理,内核调用了 handle_userfault 交给 userfaultfd 相关的代码进行处理,此时该线程将被挂起进入阻塞状态。同时一个待处理的消息 uffd_msg 结构通过该 fd 发送到了另一个 monitor 线程,该线程可以调用相关 API 进行处理 ( UFFDIO_COPYUFFDIO_ZEROPAGE)并告知内核唤醒 faulting 线程。

从这个例子中我们能看出这里面涉及到2个线程之间的交互,我们也不能免俗地要介绍一下具体用法,阅读 userfaultfd man page 里给出的例子,里面大概分为5步。

3. 用法

3.1 分配一个 userfault fd 并检查 API

由于 glibc 没有对应的 syscall wrapper,直接使用 syscall 函数分配。

1
2
3
4
5
6
7
8
9
10
 /* Create and enable userfaultfd object */

uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
if (uffd == -1)
errExit("userfaultfd");

uffdio_api.api = UFFD_API;
uffdio_api.features = 0;
if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1)
errExit("ioctl-UFFDIO_API");

3.2 注册需要进行 userfault 的内存区域

1
2
3
4
5
6
7
8
9
/* Register the memory range of the mapping we just created for
handling by the userfaultfd object. In mode, we request to track
missing pages (i.e., pages that have not yet been faulted in). */

uffdio_register.range.start = (unsigned long) addr;
uffdio_register.range.len = len;
uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1)
errExit("ioctl-UFFDIO_REGISTER");

3.3 创建 monitor 线程,(子线程)监听 fd 的事件

在一个 for 循环中,不断使用 pool 来等待这个 fd ,然后读取一个 msg,这里读取的 msg 就是 uffd_msg 结构。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
for (;;) {

/* See what poll() tells us about the userfaultfd */

struct pollfd pollfd;
int nready;
pollfd.fd = uffd;
pollfd.events = POLLIN;
nready = poll(&pollfd, 1, -1);
if (nready == -1)
errExit("poll");

printf("\nfault_handler_thread():\n");
printf(" poll() returns: nready = %d; "
"POLLIN = %d; POLLERR = %d\n", nready,
(pollfd.revents & POLLIN) != 0,
(pollfd.revents & POLLERR) != 0);

/* Read an event from the userfaultfd */

nread = read(uffd, &msg, sizeof(msg));
if (nread == 0) {
printf("EOF on userfaultfd!\n");
exit(EXIT_FAILURE);
}

if (nread == -1)
errExit("read");
...

3.4 主线程触发指定区域的 page fault

读一下该区域的内存即可

3.5(子线程)处理 fault

调用 UFFDIO_COPY为新映射的页提供数据,并唤醒主线程,子线程自身会进入到下一轮循环中继续 poll 等待输入。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
/* Copy the page pointed to by 'page' into the faulting
region. Vary the contents that are copied in, so that it
is more obvious that each fault is handled separately. */

memset(page, 'A' + fault_cnt % 20, page_size);
fault_cnt++;

uffdio_copy.src = (unsigned long) page;

/* We need to handle page faults in units of pages(!).
So, round faulting address down to page boundary */

uffdio_copy.dst = (unsigned long) msg.arg.pagefault.address &
~(page_size - 1);
uffdio_copy.len = page_size;
uffdio_copy.mode = 0;
uffdio_copy.copy = 0;
if (ioctl(uffd, UFFDIO_COPY, &uffdio_copy) == -1)
errExit("ioctl-UFFDIO_COPY");

4. Example

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
/* userfaultfd_demo.c

Licensed under the GNU General Public License version 2 or later.
*/
#define _GNU_SOURCE
#include <inttypes.h>
#include <sys/types.h>
#include <stdio.h>
#include <linux/userfaultfd.h>
#include <pthread.h>
#include <errno.h>
#include <unistd.h>
#include <stdlib.h>
#include <fcntl.h>
#include <signal.h>
#include <poll.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <poll.h>

#define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); \
} while (0)

static int page_size;

static void *
fault_handler_thread(void *arg)
{
static struct uffd_msg msg; /* Data read from userfaultfd */
static int fault_cnt = 0; /* Number of faults so far handled */
long uffd; /* userfaultfd file descriptor */
static char *page = NULL;
struct uffdio_copy uffdio_copy;
ssize_t nread;

uffd = (long) arg;

/* Create a page that will be copied into the faulting region. */

if (page == NULL) {
page = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (page == MAP_FAILED)
errExit("mmap");
}

/* Loop, handling incoming events on the userfaultfd
file descriptor. */

for (;;) {

/* See what poll() tells us about the userfaultfd. */

struct pollfd pollfd;
int nready;
pollfd.fd = uffd;
pollfd.events = POLLIN;
nready = poll(&pollfd, 1, -1);
if (nready == -1)
errExit("poll");

printf("\nfault_handler_thread():\n");
printf(" poll() returns: nready = %d; "
"POLLIN = %d; POLLERR = %d\n", nready,
(pollfd.revents & POLLIN) != 0,
(pollfd.revents & POLLERR) != 0);

/* Read an event from the userfaultfd. */

nread = read(uffd, &msg, sizeof(msg));
if (nread == 0) {
printf("EOF on userfaultfd!\n");
exit(EXIT_FAILURE);
}

if (nread == -1)
errExit("read");

/* We expect only one kind of event; verify that assumption. */

if (msg.event != UFFD_EVENT_PAGEFAULT) {
fprintf(stderr, "Unexpected event on userfaultfd\n");
exit(EXIT_FAILURE);
}

/* Display info about the page-fault event. */

printf(" UFFD_EVENT_PAGEFAULT event: ");
printf("flags = %"PRIx64"; ", msg.arg.pagefault.flags);
printf("address = %"PRIx64"\n", msg.arg.pagefault.address);

/* Copy the page pointed to by 'page' into the faulting
region. Vary the contents that are copied in, so that it
is more obvious that each fault is handled separately. */

memset(page, 'A' + fault_cnt % 20, page_size);
fault_cnt++;

uffdio_copy.src = (unsigned long) page;

/* We need to handle page faults in units of pages(!).
So, round faulting address down to page boundary. */

uffdio_copy.dst = (unsigned long) msg.arg.pagefault.address &
~(page_size - 1);
uffdio_copy.len = page_size;
uffdio_copy.mode = 0;
uffdio_copy.copy = 0;
if (ioctl(uffd, UFFDIO_COPY, &uffdio_copy) == -1)
errExit("ioctl-UFFDIO_COPY");

printf(" (uffdio_copy.copy returned %"PRId64")\n",
uffdio_copy.copy);
}
}

int
main(int argc, char *argv[])
{
long uffd; /* userfaultfd file descriptor */
char *addr; /* Start of region handled by userfaultfd */
uint64_t len; /* Length of region handled by userfaultfd */
pthread_t thr; /* ID of thread that handles page faults */
struct uffdio_api uffdio_api;
struct uffdio_register uffdio_register;
int s;

if (argc != 2) {
fprintf(stderr, "Usage: %s num-pages\n", argv[0]);
exit(EXIT_FAILURE);
}

page_size = sysconf(_SC_PAGE_SIZE);
len = strtoull(argv[1], NULL, 0) * page_size;

/* Create and enable userfaultfd object. */

uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
if (uffd == -1)
errExit("userfaultfd");

uffdio_api.api = UFFD_API;
uffdio_api.features = 0;
if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1)
errExit("ioctl-UFFDIO_API");

/* Create a private anonymous mapping. The memory will be
demand-zero paged--that is, not yet allocated. When we
actually touch the memory, it will be allocated via
the userfaultfd. */

addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (addr == MAP_FAILED)
errExit("mmap");

printf("Address returned by mmap() = %p\n", addr);

/* Register the memory range of the mapping we just created for
handling by the userfaultfd object. In mode, we request to track
missing pages (i.e., pages that have not yet been faulted in). */

uffdio_register.range.start = (unsigned long) addr;
uffdio_register.range.len = len;
uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1)
errExit("ioctl-UFFDIO_REGISTER");

/* Create a thread that will process the userfaultfd events. */

s = pthread_create(&thr, NULL, fault_handler_thread, (void *) uffd);
if (s != 0) {
errno = s;
errExit("pthread_create");
}

/* Main thread now touches memory in the mapping, touching
locations 1024 bytes apart. This will trigger userfaultfd
events for all pages in the region. */

int l;
l = 0xf; /* Ensure that faulting address is not on a page
boundary, in order to test that we correctly
handle that case in fault_handling_thread(). */
while (l < len) {
char c = addr[l];
printf("Read address %p in main(): ", addr + l);
printf("%c\n", c);
l += 1024;
usleep(100000); /* Slow things down a little */
}

exit(EXIT_SUCCESS);
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
$ ./userfaultfd_demo 3
Address returned by mmap() = 0x7fd30106c000

fault_handler_thread():
poll() returns: nready = 1; POLLIN = 1; POLLERR = 0
UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fd30106c00f
(uffdio_copy.copy returned 4096)
Read address 0x7fd30106c00f in main(): A
Read address 0x7fd30106c40f in main(): A
Read address 0x7fd30106c80f in main(): A
Read address 0x7fd30106cc0f in main(): A

fault_handler_thread():
poll() returns: nready = 1; POLLIN = 1; POLLERR = 0
UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fd30106d00f
(uffdio_copy.copy returned 4096)
Read address 0x7fd30106d00f in main(): B
Read address 0x7fd30106d40f in main(): B
Read address 0x7fd30106d80f in main(): B
Read address 0x7fd30106dc0f in main(): B

fault_handler_thread():
poll() returns: nready = 1; POLLIN = 1; POLLERR = 0
UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fd30106e00f
(uffdio_copy.copy returned 4096)
Read address 0x7fd30106e00f in main(): C
Read address 0x7fd30106e40f in main(): C
Read address 0x7fd30106e80f in main(): C
Read address 0x7fd30106ec0f in main(): C

参考资料:

  1. Linux Kernel Userfaultfd 内部机制探究
  2. man userfaultfd
  3. man ioctl_userfaultfd
  4. Userfaultfd: Current Features, Limitations and Future Development
  5. kernel userfaultfd.txt
  6. shanyizhou:Linux Userfaultfd
  7. userfaultfd_ post-copy VM migration and beyond.pdf
  8. userfaultfd hello world