epoll source code analysis

epoll function

/* Create an epoll instance and return a file descriptor that refers to it. */
int epoll_create(int size);
/* Apply operation `op` for the target `fd` on the epoll instance `epfd`. */
int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);
/* Wait for events on the epoll instance `epfd`; ready fds come from its rdllist. */
int epoll_wait(int epfd, struct epoll_event * events, int maxevents, int timeout);

#define EPOLL_PACKED __attribute__((packed)) // compact (packed) memory layout
struct epoll_event {
	__poll_t events;
	__u64 data;


/*
 * Init routine for eventpoll: computes the per-user limit on epoll watches
 * (max_user_watches) and creates the slab caches used for struct epitem and
 * struct eppoll_entry. Always returns 0.
 * NOTE: this is an article excerpt; braces, comment delimiters and some
 * statements of the original function are not shown.
 */
static int __init eventpoll_init(void)
	struct sysinfo si;

	 * Allows top 4% of lomem to be allocated for epoll watches (per user).
	max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
	BUG_ON(max_user_watches < 0);

	 * Initialize the structure used to perform epoll file descriptor
	 * inclusion loops checks.

	/* Initialize the structure used to perform safe poll wait head wake ups */

	/* Initialize the structure used to perform file's f_op->poll() calls */

	// Slab cache for struct epitem objects
	epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),

	// Slab cache for struct eppoll_entry objects
	pwq_cache = kmem_cache_create("eventpoll_pwq",
			sizeof(struct eppoll_entry), 0, SLAB_PANIC, NULL);

	return 0;


/*
 * epoll_create system call: rejects size <= 0 with -EINVAL and otherwise
 * forwards to sys_epoll_create1(0). The size value is not used beyond this
 * check.
 */
SYSCALL_DEFINE1(epoll_create, int, size)
	if (size <= 0)
		return -EINVAL;

	return sys_epoll_create1(0);

The `size` argument is only checked to be greater than 0; it is not otherwise used. After that check, `epoll_create` calls `sys_epoll_create1`, i.e. the `epoll_create1` system call, which validates its own parameter (`flags`) and does the real work.


The `epoll_create` path mainly creates and initializes the `eventpoll` data structure, then creates a file instance and stores the `eventpoll` in `file->private_data`.

/*
 * epoll_create1 system call: validates flags, allocates a struct eventpoll
 * and binds it to a new anonymous-inode file and file descriptor.
 * Returns the value of anon_inode_getfd() (the new fd) or a negative errno.
 * NOTE: article excerpt; braces and some statements are not shown.
 */
SYSCALL_DEFINE1(epoll_create1, int, flags)
	int error;
	struct eventpoll *ep = NULL;

	// Only EPOLL_CLOEXEC is accepted; any other bit is rejected
	if (flags & ~EPOLL_CLOEXEC)
		return -EINVAL;

	// Allocate and initialize the eventpoll structure
	error = ep_alloc(&ep); 
	if (error < 0)
		return error;
	// Create the file and fd for this instance; ep is passed as the private
	// data, so that file->private_data == ep.
	// NOTE(review): flags is validated against EPOLL_CLOEXEC but masked with
	// O_CLOEXEC here; presumably the two constants are equal - confirm.
	error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
				 O_RDWR | (flags & O_CLOEXEC));
	// (the statement executed on failure is not shown in this excerpt)
	if (error < 0)

	return error;
/*
 * Allocate a zeroed struct eventpoll, initialize the members shown below and
 * hand it back through *pep.
 * Returns 0 on success; on allocation failure control goes to free_uid and
 * the function returns -ENOMEM.
 * NOTE: article excerpt; braces, the free_uid label and some initialization
 * statements are not shown.
 */
static int ep_alloc(struct eventpoll **pep)
	int error;
	struct user_struct *user;
	struct eventpoll *ep;

	user = get_current_user();
	error = -ENOMEM;
	// Allocate zero-filled memory for the eventpoll
	ep = kzalloc(sizeof(*ep), GFP_KERNEL);
	if (unlikely(!ep))
		goto free_uid;

	// Member initialization: empty RB tree, inactive overflow list, owner
	ep->rbr = RB_ROOT;
	ep->ovflist = EP_UNACTIVE_PTR;
	ep->user = user;

	*pep = ep;

	return 0;

	// Error path (the free_uid label targeted above is not shown here)
	return error;

/*
 * Reserve an unused file descriptor, create a file through
 * anon_inode_getfile() with the given name, fops and private pointer, and
 * install that file on the descriptor.
 * Returns the new fd on success or a negative error code.
 * NOTE: article excerpt; braces and the err_put_unused_fd label are not shown.
 */
int anon_inode_getfd(const char *name, const struct file_operations *fops,
		     void *priv, int flags)
	int error, fd;
	struct file *file;

	// Reserve an unused fd; a negative value is an error code
	error = get_unused_fd_flags(flags); 
	if (error < 0)
		return error;
	fd = error;
	// Create the file instance together with its anonymous inode, dentry and
	// related structures; priv (the eventpoll, for epoll) becomes the private data of the file
	file = anon_inode_getfile(name, fops, priv, flags); 
	if (IS_ERR(file)) {
		error = PTR_ERR(file);
		goto err_put_unused_fd;
	// Bind the file to the reserved fd
	fd_install(fd, file);

	return fd;

	// Error path (the err_put_unused_fd label is not shown in this excerpt)
	return error;

Now let's look at the important data structures:


/*
 * Per-instance epoll state; it is stored in file->private_data of the epoll
 * file. (Article excerpt: the closing brace is not shown.)
 */
struct eventpoll {
	// Spinlock; taken with interrupts disabled around ready-list updates
	spinlock_t lock; 

	// Mutex; RB tree operations and ep_insert() run with it held
	struct mutex mtx;

	// Wait queue used by sys_epoll_wait()
	wait_queue_head_t wq;

	// Wait queue used by file->poll() on the epoll file itself, i.e. when
	// this epoll instance is monitored by another epoll instance
	wait_queue_head_t poll_wait; 

	// List of ready items; the wake-up callback appends ready epitems here
	struct list_head rdllist; 

 	// Root of the red-black tree that holds the epitems
	struct rb_root rbr;  

	// While events are being delivered to user space, newly ready items are
	// chained here temporarily instead of going directly onto rdllist
	struct epitem *ovflist; 

	// The user who created the eventpoll descriptor
	struct user_struct *user; 


// Each monitored file descriptor is represented by one epitem.
// (Article excerpt: the closing brace is not shown.)
struct epitem {
	// Links this structure into the red-black tree of the eventpoll
	struct rb_node rbn;

	// List node; every ready epitem is linked onto rdllist of the eventpoll
	struct list_head rdllink; 

	// Used together with ovflist to keep entries in a singly linked chain
	struct epitem *next;

	// The fd and file that this epitem describes
	struct epoll_filefd ffd;

	/* Number of active wait queue attached to poll operations */
	// Set to -1 by ep_ptable_queue_proc to signal a failed allocation
	int nwait;

	// Doubly linked list of the eppoll_entry hooks for the monitored file;
	// filled by list_add_tail(&pwq->llink, &epi->pwqlist)
	struct list_head pwqlist;

	// The eventpoll that contains this epitem
	struct eventpoll *ep;

	// Links this epitem into the f_ep_links list of the monitored file
	struct list_head fllink;

	// The events this epitem is interested in plus the user data, as passed
	// in from user space through epoll_ctl
	struct epoll_event event;


/* Wait structure used by the poll hooks */
/* (Article excerpt: the closing brace is not shown.) */
struct eppoll_entry {
	/* List header used to link this structure to the "struct epitem" */
	// Linked onto epi->pwqlist
	struct list_head llink;

	/* The "base" pointer is set to the container "struct epitem" */
	struct epitem *base; // Pointer to the owning epitem

	// Wait queue entry added to the target wait queue; its wake-up function
	// is set to ep_poll_callback
	wait_queue_t wait;  

	// The wait queue head that `wait` has been added to
	wait_queue_head_t *whead; 


/*
 * Event record exchanged with user space through epoll_ctl.
 * (Article excerpt: the #else/#endif lines, the closing brace and the use of
 * EPOLL_PACKED on the struct are not shown.)
 */
#ifdef __x86_64__
#define EPOLL_PACKED __attribute__((packed))
typedef unsigned __bitwise __poll_t;

struct epoll_event {
	// Bit mask of events (POLLERR, POLLHUP, ... are OR-ed into it by epoll_ctl)
	__poll_t events;
	// User data value stored alongside the event mask
	__u64 data;


/*
 * epoll_ctl system call: looks up the epoll file (epfd) and the target file
 * (fd), validates both, finds any existing epitem for the target and then
 * dispatches on op to ep_insert(), ep_remove() or ep_modify().
 * Returns 0 or a negative errno (-EFAULT, -EBADF, -EPERM, -EINVAL, -ELOOP,
 * -EEXIST, -ENOENT, or whatever the helper returns).
 * NOTE: article excerpt; braces, comment delimiters, case labels, error
 * labels and locking statements of the original are not shown.
 */
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
		struct epoll_event __user *, event)
	int error;
	int did_lock_epmutex = 0;
	struct file *file, *tfile;
	struct eventpoll *ep;
	struct epitem *epi;
	struct epoll_event epds;

	error = -EFAULT;
	// Copy the epoll_event from user space into the kernel (only for
	// operations that carry an event)
	if (ep_op_has_event(op) &&
	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
		goto error_return;

	/* Get the "struct file *" for the eventpoll file */
	error = -EBADF;
	// epfd and its file were bound together at create time, so the file
	// instance (which carries the eventpoll) can be obtained from epfd
	file = fget(epfd);
	if (!file)
		goto error_return;

	/* Get the "struct file *" for the target file */
	// The file instance of the target fd that is to be monitored
	tfile = fget(fd);
	if (!tfile)
		goto error_fput;

	/* The target file descriptor must support poll */
	error = -EPERM;
	// The target must provide a poll method
	if (!tfile->f_op || !tfile->f_op->poll)
		goto error_tgt_fput;

	error = -EINVAL;
	// epfd must refer to an epoll file, and it cannot be its own target
	if (file == tfile || !is_file_epoll(file))
		goto error_tgt_fput;

	// Get the eventpoll stored in the epoll file
	ep = file->private_data;

	 * When we insert an epoll file descriptor, inside another epoll file
	 * descriptor, there is the change of creating closed loops, which are
	 * better be handled here, than in more critical paths.
	 * We hold epmutex across the loop check and the insert in this case, in
	 * order to prevent two separate inserts from racing and each doing the
	 * insert "at the same time" such that ep_loop_check passes on both
	 * before either one does the insert, thereby creating a cycle.
	if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) {
		did_lock_epmutex = 1;
		error = -ELOOP;
		if (ep_loop_check(ep, tfile) != 0)
			goto error_tgt_fput;


	// Look the target up in the red-black tree of ep to see whether an
	// epitem already exists for it
	epi = ep_find(ep, tfile, fd); 

	error = -EINVAL;
	switch (op) {
		// NOTE(review): the case labels were lost in this excerpt; the three
		// branches below presumably handle EPOLL_CTL_ADD, EPOLL_CTL_DEL and
		// EPOLL_CTL_MOD in that order - confirm against the kernel source.
		if (!epi) {
			epds.events |= POLLERR | POLLHUP;
			error = ep_insert(ep, &epds, tfile, fd);
		} else
			error = -EEXIST;
		if (epi)
			error = ep_remove(ep, epi);
			error = -ENOENT;
		if (epi) {
			epds.events |= POLLERR | POLLHUP;
			error = ep_modify(ep, epi, &epds);
		} else
			error = -ENOENT;

	// (the statement guarded by this test is not shown in this excerpt)
	if (unlikely(did_lock_epmutex))


	return error;


Let's focus mainly on ep_insert:

/*
 * Add a new monitored file to an eventpoll: allocate and initialize an
 * epitem, hook it onto the wait queue(s) of the target by calling the poll
 * method of the target with ep_ptable_queue_proc as queue callback, insert
 * it into the RB tree and, if the target is already ready, put it on rdllist.
 * Returns 0 on success, -ENOSPC when the per-user watch limit is reached and
 * -ENOMEM when an allocation fails.
 * NOTE: article excerpt; braces, comment delimiters, labels and some
 * statements (for example the wake-up calls) are not shown.
 */
static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
		     struct file *tfile, int fd)
	int error, revents, pwake = 0;
	unsigned long flags;
	long user_watches;
	struct epitem *epi;
	struct ep_pqueue epq;

	// Enforce the per-user maximum number of epoll watches
	// static long max_user_watches __read_mostly;
	user_watches = atomic_long_read(&ep->user->epoll_watches);
	if (unlikely(user_watches >= max_user_watches))
		return -ENOSPC;

	if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
		return -ENOMEM;

	/* Item initialization follow here ... */
	epi->ep = ep;
	// Record the monitored file and fd in the epitem
	ep_set_ffd(&epi->ffd, tfile, fd);
	epi->event = *event;
	epi->nwait = 0;
	epi->next = EP_UNACTIVE_PTR;

	/* Initialize the poll table using the queue callback */
	epq.epi = epi;
	// Register ep_ptable_queue_proc as the queue callback of the poll table
	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

	 * Attach the item to the poll hooks and get current event bits.
	 * We can safely use the file* here because its usage count has
	 * been increased by the caller of this function. Note that after
	 * this operation completes, the poll callback can start hitting
	 * the new item.
	revents = tfile->f_op->poll(tfile, &epq.pt);

	 * We have to check if something went wrong during the poll wait queue
	 * install process. Namely an allocation for a wait queue failed due
	 * high memory pressure.
	error = -ENOMEM;
	if (epi->nwait < 0)
		goto error_unregister;

	/* Add the current item to the list of active epoll hook for this file */
	// Each file links together all epitems that are monitoring it
	list_add_tail(&epi->fllink, &tfile->f_ep_links);

	 * Add the current item to the RB tree. All RB tree operations are
	 * protected by "mtx", and ep_insert() is called with "mtx" held.
	ep_rbtree_insert(ep, epi);

	/* We have to drop the new item inside our item list to keep track of it */
	spin_lock_irqsave(&ep->lock, flags);

	// The file is already ready but the callback has not yet linked the
	// epitem onto rdllist: link it here
	if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
		list_add_tail(&epi->rdllink, &ep->rdllist);

		// A monitored event is available: wake up waiting tasks
		// (the wake-up statements are not shown in this excerpt)
		if (waitqueue_active(&ep->wq))
		if (waitqueue_active(&ep->poll_wait))

	spin_unlock_irqrestore(&ep->lock, flags);


	/* We have to call this outside the lock */
	if (pwake)

	return 0;

	ep_unregister_pollwait(ep, epi);

	 * We need to do this because an event could have been arrived on some
	 * allocated wait queue. Note that we don't care about the ep->ovflist
	 * list, since that is used/cleaned only inside a section bound by "mtx".
	 * And ep_insert() is called with "mtx" held.
	spin_lock_irqsave(&ep->lock, flags);
	if (ep_is_linked(&epi->rdllink))
	spin_unlock_irqrestore(&ep->lock, flags);

	kmem_cache_free(epi_cache, epi);

	return error;
/* Returns non-zero when the task_list of wait queue head q is not empty. */
static inline int waitqueue_active(wait_queue_head_t *q)
	return !list_empty(&q->task_list);
/* Returns non-zero when head->next points back at head itself. */
static inline int list_empty(const struct list_head *head)
	return head->next == head;

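To sort out the call chain: tfile->f_op->poll(tfile, &epq.pt) -> ep_eventpoll_poll(struct file *file, poll_table *wait) -> poll_wait(file, &ep->poll_wait, wait) -> ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, poll_table *pt). (ep_eventpoll_poll is the poll method of an epoll file, so this exact chain applies when the target is itself an epoll fd; other file types reach poll_wait() from their own poll method.)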

That is, in the end, tfile->f_op->poll(tfile, &epq.pt) calls the ep_ptable_queue_proc callback function:

/*
 * Queue callback of the poll table; ep_insert() registers it through
 * init_poll_funcptr(). It allocates an eppoll_entry, sets the wake-up
 * function of that entry to ep_poll_callback, adds the entry to the wait
 * queue whead and links it onto epi->pwqlist. If nwait is already negative or
 * the allocation fails, it sets epi->nwait to -1 to signal the error.
 * NOTE: article excerpt; braces and some statements are not shown.
 */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
				 poll_table *pt)
	struct epitem *epi = ep_item_from_epqueue(pt);
	struct eppoll_entry *pwq;
	// Allocate an eppoll_entry, set its wake-up function to ep_poll_callback
	// and add it to the wait queue whead
	if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
		init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
		pwq->whead = whead; 
		pwq->base = epi;
		// void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
		// Hook the entry (and with it ep_poll_callback) onto the wait queue whead
		add_wait_queue(whead, &pwq->wait);
		list_add_tail(&pwq->llink, &epi->pwqlist);
	} else {
		/* We have to signal that an error occurred */
		epi->nwait = -1;

The code above is the most important thing ep_insert does: it creates a struct eppoll_entry, sets its wake-up callback to ep_poll_callback, and then adds it to the device's wait queue (in the call chain above, this whead is the poll_wait queue of the eventpoll), so that the callback function is linked onto that queue.

Only then, when the device becomes ready and wakes up the waiters on that queue, will ep_poll_callback be called.


Each time the poll system call is invoked, the operating system must hook the current process onto the wait queue of every monitored fd; when there are many fds, this is expensive. Calling epoll_wait does not repeat that work: epoll hooks onto the wait queue of each fd's device only once, at epoll_ctl time. If a device has an event, the callback function puts the fd into rdllist, and each call to epoll_wait only has to collect the fds in rdllist. epoll makes clever use of callback functions to implement a more efficient event-driven model.

So it should be possible to guess what ep_poll_callback does: it must take the epitem (which represents an fd) on the red-black tree that received the event and insert it into ep->rdllist. In this way, when epoll_wait returns, rdllist contains exactly the ready fds.


// Install a custom wake-up function (ep_poll_callback) on the wait entry
init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
// For comparison, the default initializer below binds a task to the entry
// and uses default_wake_function as its wake-up function.
// NOTE(review): init_waitqueue_func_entry presumably stores the supplied
// callback in q->func instead - confirm against the kernel source.
static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p)
    q->flags = 0;
    q->private = p;
    q->func = default_wake_function;

/*
 * Wake-up callback that ep_ptable_queue_proc installs on the wait queue of
 * the target. Under ep->lock it filters out disabled items and events that
 * do not match the mask of the item, then either chains the epitem onto
 * ep->ovflist (while ovflist is active) or appends it to ep->rdllist.
 * Returns 1.
 * NOTE: article excerpt; braces, comment delimiters, the out_unlock label
 * and the wake-up statements are not shown.
 */
static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
	int pwake = 0;
	unsigned long flags;
	// Recover the epitem from the wait queue entry
	struct epitem *epi = ep_item_from_wait(wait);
	struct eventpoll *ep = epi->ep;

	// Take the lock with interrupts disabled
	spin_lock_irqsave(&ep->lock, flags);

	 * If the event mask does not contain any poll(2) event, we consider the
	 * descriptor to be disabled. This condition is likely the effect of the
	 * EPOLLONESHOT bit that disables the descriptor when an event is received,
	 * until the next EPOLL_CTL_MOD will be issued.
	// The event mask of the item contains no poll(2) event bits: nothing to do
	if (!(epi->event.events & ~EP_PRIVATE_BITS))
		goto out_unlock;

	 * Check the events coming with the callback. At this stage, not
	 * every device reports the events in the "key" parameter of the
	 * callback. We need to be able to handle both cases here, hence the
	 * test for "key" != NULL before the event match test.
	if (key && !((unsigned long) key & epi->event.events))
		goto out_unlock;

	 * If we are transferring events to userspace, we can hold no locks
	 * (because we're accessing user memory, and because of linux f_op->poll()
	 * semantics). All the events that happen during that period of time are
	 * chained in ep->ovflist and requeued later on.
	if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
		if (epi->next == EP_UNACTIVE_PTR) {
			epi->next = ep->ovflist;
			ep->ovflist = epi;
		goto out_unlock;

	/* If this file is already in the ready list we exit soon */
	if (!ep_is_linked(&epi->rdllink))
		list_add_tail(&epi->rdllink, &ep->rdllist);

	 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
	 * wait list.
	// Wake up waiters (the wake-up statements are not shown in this excerpt)
	if (waitqueue_active(&ep->wq))
	if (waitqueue_active(&ep->poll_wait))

	spin_unlock_irqrestore(&ep->lock, flags);

	/* We have to call this outside the lock */
	if (pwake)

	return 1;

After consulting a lot of material, I realized that an epoll instance is itself a kind of file, and its underlying implementation also provides the poll function in file_operations, so an fd of epoll type can be monitored by other epoll instances. An fd of epoll type only ever has "read ready" events: when a non-epoll file monitored by an epoll instance has a "read ready" event, that epoll instance itself also enters the "read ready" state.

Therefore, recursion occurs if one epoll instance monitors another epoll. For example, as shown in the figure:

epollfd1 monitors 2 fd of "non epoll" type

epollfd2 monitors epollfd1 and 2 fd of "non epoll" type

If a readable event is triggered on one of the two fds monitored by epollfd1, that fd's ep_poll_callback puts the fd into the rdllist of epollfd1. At this point the readable event of epollfd1 itself is also triggered, so the kernel needs to find epollfd2's entry on the poll_wait queue of epollfd1 and call the ep_poll_callback registered there for epollfd1 (which puts epollfd1 into the rdllist of epollfd2). Therefore, ep->poll_wait is used to handle nested monitoring between epoll instances.

Tags: Linux Back-end

Posted on Thu, 28 Oct 2021 09:21:13 -0400 by merebel