值得注意的是,本文中的poll并非system call,而是内核中file_operations的poll函数。

强烈建议先阅读select usage and implementation in kernel

注:代码中的细节可参见Advanced Char Driver Operations

内容源于:select()/poll() 的内核实现

For every file descriptor, it calls that fd’s poll() method, which will add the caller to that fd’s wait queue, and return which events (readable, writeable, exception) currently apply to that fd.

1. How to add poll function to the kernel module code?

  1. Include needed headers:

    1
    2
    #include <linux/wait.h>
    #include <linux/poll.h>
  2. Declare waitqueue variable:

    1
    static DECLARE_WAIT_QUEUE_HEAD(fortune_wait);
  3. Add fortune_poll() function and add it (as .poll callback) to your file operations structure:

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    /*
     * .poll callback of the example "fortune" device.
     *
     * Passes the driver's wait queue (fortune_wait) to poll_wait() and then
     * reports the current readiness of the device.
     *
     * Returns POLLIN | POLLRDNORM when new data can be read, 0 otherwise.
     */
    static unsigned int fortune_poll(struct file *file, poll_table *wait)
    {
        unsigned int mask = 0;

        poll_wait(file, &fortune_wait, wait);

        /* "new-data-is-ready" is a placeholder for the driver's own test. */
        if (new-data-is-ready)
            mask = POLLIN | POLLRDNORM;

        return mask;
    }

    /*
     * File operations table of the example device. The "...." line stands
     * for the other callbacks, which this excerpt leaves out.
     */
    static const struct file_operations proc_test_fops = {
    ....
    /* Install fortune_poll() as the .poll callback. */
    .poll = fortune_poll,
    };

    Note that you should return POLLIN | POLLRDNORM if you have some new data to read, and 0 in case there is no new data to read.

  4. Notify your waitqueue once you have new data:

    1
    wake_up_interruptible(&fortune_wait);

2. scull驱动实例

由于Linux设备驱动的耦合设计,对设备的操作函数都是驱动程序自定义的,我们必须结合一个具体的实例来看,才能知道f_op->poll里面到底做了些什么。

在这里我们以Linux Device Drivers, Third Edition一书中的例子——scull设备的驱动程序为例。

scull (Simple Character Utility for Loading Localities). scull is a char driver that acts on a memory area as though it were a device.

scull设备不同于硬件设备,它是模拟出来的一块内存,因此对它的读写更快速、更自由:顺着读、倒着读、点着读,怎么读都可以。我们以书中“管道”(pipe)式,即FIFO的读写驱动程序为例。

首先是scull_pipe的结构体,注意wait_queue_head_t这个队列类型,它就是用来记录等待设备I/O事件的进程的。

1
2
3
4
5
6
7
8
9
10
/* Per-device state of a scull pipe (FIFO) device. */
struct scull_pipe {
	wait_queue_head_t inq;			/* read queue */
	wait_queue_head_t outq;			/* write queue */
	char *buffer;				/* begin of buf */
	char *end;				/* end of buf */
	int buffersize;				/* used in pointer arithmetic */
	char *rp;				/* where to read */
	char *wp;				/* where to write */
	int nreaders;				/* number of openings for reading */
	int nwriters;				/* number of openings for writing */
	struct fasync_struct *async_queue;	/* asynchronous readers */
	struct mutex mutex;			/* mutual exclusion lock */
	struct cdev cdev;			/* char device structure */
};

scull设备的轮询操作函数是scull_p_poll。驱动模块加载后,这个函数就被挂到file_operations结构的poll函数指针(即f_op->poll)上去了。

我们可以看到它的确是返回了当前设备的I/O状态,并且调用了内核的poll_wait()函数,这里注意,它把自己的wait_queue_head_t队列也当作参数传进去了。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/*
 * poll method of the scull pipe device.
 *
 * While holding dev->mutex, hands both of the device's wait queues (inq and
 * outq) to poll_wait() and then builds the readiness mask.
 *
 * Returns a bit mask: POLLIN | POLLRDNORM is set when the read and write
 * pointers differ, POLLOUT | POLLWRNORM is set when spacefree() reports
 * free space; 0 when neither holds.
 */
static unsigned int scull_p_poll(struct file *filp, poll_table *wait)
{
	struct scull_pipe *dev = filp->private_data;
	unsigned int events = 0;

	mutex_lock(&dev->mutex);

	poll_wait(filp, &dev->inq, wait);
	poll_wait(filp, &dev->outq, wait);

	if (dev->rp != dev->wp) {
		/* readable */
		events |= POLLIN | POLLRDNORM;
	}
	if (spacefree(dev)) {
		/* writable */
		events |= POLLOUT | POLLWRNORM;
	}

	mutex_unlock(&dev->mutex);

	return events;
}

scull有数据写入时,它会把wait_queue_head_t队列里等待的进程给唤醒。

1
2
3
4
5
6
7
8
9
/*
 * Write method of the scull pipe device (excerpt).
 *
 * Only the final wake-up is shown; the rest of the body, including the
 * declaration of dev and the return statement, is omitted here.
 * NOTE(review): dev is presumably filp->private_data, as in scull_p_poll().
 */
static ssize_t scull_p_write(struct file *filp, const char __user *buf, size_t count,
loff_t *f_pos)
{

/* ... body omitted in this excerpt ... */

/* finally, awake any reader */
wake_up_interruptible(&dev->inq); /* blocked in read() and select() */

}

可是wait_queue_head_t队列里的进程是什么时候装进去的? 肯定是poll_wait搞的鬼!

3. poll_wait与设备的等待队列

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
/*
 * Called from a driver's poll method to hand one of its wait queue heads to
 * the poll table. Does nothing when there is no poll table, no queue-proc
 * installed in it, or no wait queue head; otherwise forwards all three
 * arguments to the table's _qproc callback.
 */
static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
	if (!p || !p->_qproc || !wait_address)
		return;

	p->_qproc(filp, wait_address, p);
}

/*
* Poll table handed to every f_op->poll implementation.
*
* Do not touch the structure directly, use the access functions
* poll_does_not_wait() and poll_requested_events() instead.
*/
typedef struct poll_table_struct {
/* Callback that poll_wait() invokes; set by init_poll_funcptr(). */
poll_queue_proc _qproc;
/* Event key; init_poll_funcptr() sets it to ~0UL (all events enabled). */
unsigned long _key;
} poll_table;

/*
* structures and helpers for f_op->poll implementations
*
* poll_queue_proc is the type of the callback stored in poll_table._qproc.
* poll_wait() calls it with the file, the driver's wait queue head and the
* poll table itself.
*/
typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *);

可以看到,poll_wait()其实只是直接调用了struct poll_table_struct结构里绑定的函数指针(_qproc)。我们需要找到struct poll_table_struct初始化的地方。

The poll_table structure is just a wrapper around a function that builds the actual data structure. That structure, for poll and select, is a linked list of memory pages containing poll_table_entry structures.

struct poll_table_struct里的函数指针,是在do_select()调用poll_initwait()时初始化的。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
/*
 * do_select() (excerpt): only the poll table setup is shown, the rest of the
 * function is omitted.
 */
int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
{
/* Contains the poll_table (table.pt) used for this call. */
struct poll_wqueues table;
poll_table *wait;
/* Initialise the table; poll_initwait() installs __pollwait as table.pt's callback. */
poll_initwait(&table);
wait = &table.pt;
// … (remainder omitted in this excerpt)
}

/*
 * Initialise a poll_wqueues before it is used for polling.
 *
 * Installs __pollwait as the callback of the embedded poll_table (pwq->pt),
 * records the current task as the polling task and resets the remaining
 * bookkeeping fields.
 */
void poll_initwait(struct poll_wqueues *pwq)
{
	/* Set the function pointer inside the poll_table. */
	init_poll_funcptr(&pwq->pt, __pollwait);

	/* The task that is doing the polling. */
	pwq->polling_task = current;

	/* Reset the remaining fields to their initial values. */
	pwq->table = NULL;
	pwq->inline_index = 0;
	pwq->triggered = 0;
	pwq->error = 0;
}
EXPORT_SYMBOL(poll_initwait);

/*
 * Initialise a poll_table: store qproc as the callback that poll_wait() will
 * invoke and enable all events in the key.
 */
static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
{
pt->_qproc = qproc;
pt->_key = ~0UL; /* all events enabled */
}

我们现在终于知道,__pollwait()函数,就是poll_wait()幕后的真凶。

__pollwait()通过add_wait_queue()把代表当前进程的等待项(entry->wait,其唤醒回调为pollwake)添加到设备的等待队列wait_queue_head_t中去。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
/*
 * Add a new entry.
 *
 * This is the callback that poll_initwait() installs in the poll_table, so it
 * runs whenever a driver's poll method calls poll_wait(). It obtains a
 * poll_table_entry, fills it in and links its wait-queue element into the
 * wait queue head supplied by the driver.
 */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
poll_table *p)
{
/* The poll_table is embedded in a poll_wqueues as member pt. */
struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
struct poll_table_entry *entry = poll_get_entry(pwq);
/* No entry available: return without registering anything. */
if (!entry)
return;
entry->filp = get_file(filp);
entry->wait_address = wait_address;
entry->key = p->_key;
/* pollwake is the function stored in the wait-queue element. */
init_waitqueue_func_entry(&entry->wait, pollwake);
entry->wait.private = pwq;
/* Put the entry on the device's wait queue. */
add_wait_queue(wait_address, &entry->wait);
}

/*
 * Add a wait-queue element to a wait queue as a non-exclusive waiter.
 *
 * @q:    wait queue head to add to
 * @wait: element to add; its WQ_FLAG_EXCLUSIVE flag is cleared
 */
void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
{
unsigned long flags;

/* Clear the exclusive flag before queueing. */
wait->flags &= ~WQ_FLAG_EXCLUSIVE;
/* The list insertion is done under q->lock, with the IRQ state saved and restored. */
spin_lock_irqsave(&q->lock, flags);
__add_wait_queue(q, wait);
spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(add_wait_queue);

/*
 * Link the element new into head's task_list with list_add(), i.e. directly
 * after the list head. Takes no lock itself.
 */
static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_t *new)
{
list_add(&new->task_list, &head->task_list);
}

/**
* Insert a new element after the given list head. The new element does not
* need to be initialised as empty list.
* The list changes from:
* head → some element → ...
* to
* head → new element → older element → ...
*
* The insertion itself is delegated to __list_add(), which is given the
* new element together with head and head->next.
*
* Example:
* struct foo *newfoo = malloc(...);
* list_add(&newfoo->entry, &bar->list_of_foos);
*
* @param entry The new element to prepend to the list.
* @param head The existing list.
*/
static inline void
list_add(struct list_head *entry, struct list_head *head)
{
__list_add(entry, head, head->next);
}


参考资料:

  1. select()/poll() 的内核实现
  2. How to add poll function to the kernel module code?
  3. POLLHUP vs POLLNVAL, or what is POLLHUP?
  4. Advanced Char Driver Operations
  5. 底层数据结构