Multiplexing I/O: epoll, poll, select, process/thread pools, zero-copy read/write, context switching, mmap

Posted by mattpointblank on Mon, 20 Sep 2021 15:30:51 +0200

High performance server, network theoretical knowledge and operating system

High performance server

Improve server performance

1. I/O model
Blocking IO (not appropriate)

Program blocking and read / write function when there is no data to read, the program blocks until the data is read successfully

Blocking process: the time period when data comes from nothing

non-blocking IO

Returns immediately when the file descriptor is unreadable or writable

Generally, polling is used for reading and writing after a period of time

I/O multiplexing

Program blocking and I/O multiplexing system calls can listen to multiple I/O events at the same time

Read and write operations to the I/O itself are non blocking


//  select  

/* According to POSIX.1-2001 */
       #include <sys/select.h>

       /* According to earlier standards */
       #include <sys/time.h>
       #include <sys/types.h>
       #include <unistd.h>

       int select(int nfds, //It is usually set to select all file descriptors listened on
                  		   //The maximum value in + 1 specifies the total number of file descriptors being listened on. File descriptors start at 0
                  fd_set *readfds, 
                  fd_set *writefds,
                  fd_set *exceptfds, 
                  struct timeval *timeout);
	   int pselect();
//	File descriptor collection 
//fd_ The set type is an array of integers in which each element is marked with a file descriptor per bit
//fd_ The number of file descriptors that set can hold is determined by FD_SETSIZE specifies
//Limits the number of file descriptors processed by select
FD_ZERO(fd_set *fdset);//Clear all bits
FD_SET(int fd,fd_set *fdest); //Set fd
FD_CLR(int fd,fd_set *fdest); //Clear fd
int FD_ISSET(int fd,fd_set *fdest);//test
//Readfds, writefds and exceptfds are respectively used to record whether to listen for readable and writable exception event file descriptors
//When the select function call returns, the kernel will modify the readfds, writefds, exceptions and file descriptor set, and retain the data readable and writable exception file descriptor

//Set select function timeout
//Returns the time remaining after the select call returns. If it fails, the timeout is uncertain
//If all members of the timeout variable are 0, select is non blocking
//If the timeout value is null, select will block until a file descriptor is ready
struct timeval{
	long tv_sec;	//seconds
    long tv_usec;	//microseconds
};

//Return value:
//    When select is successful, it returns the synthesis of ready (read-write exception) file descriptors
//		Returns 0 if no file descriptor became ready before the timeout expired
//		select failed, return - 1 and set errno
//		During the select wait period, the program receives the signal and select immediately returns - 1
//		errno is EINTR

//When the number of file descriptors increases, the efficiency actually decreases sharply


poll does not need to add all file descriptors to the collection (array) at a time

poll notifies users differently

select deletes file descriptors that are not ready from the collection

The poll kernel directly sets the events attribute of each data in the collection

The same thing is that after calling the poll/select function, you need to traverse all file descriptors

You need to traverse all file descriptors to determine whether they are ready

//  poll  
#include <poll.h>
int poll(struct pollfd *fds, nfds_t nfds, int timeout);
	fds:It's a struct pollfd Array of structure types
      struct pollfd{
      	  int fd;	//File descriptor
          short events;		//Events the caller wants to watch (readable/writable)
          short revents;	//Events that actually occurred, filled in by the kernel
      };
	POLLIN	Data readability
	POLLOUT Data writability
	Array length
	Number of milliseconds to wait for timeout
	-1	block
     0   Return now
 Return value:
     and select Number of file descriptors in the same ready state
int ppoll()??


Linux specific I/O multiplexing function

epoll has a set of functions

epoll registers the events the user cares about on each file descriptor into an event table inside the kernel

You don't need to call to retransmit the file descriptor set every time like select/poll

epoll also does not need to traverse all the file descriptor sets after the call is completed

epoll needs an additional file descriptor to uniquely identify this kernel event table

#include <sys/epoll.h>
int epoll_create(int size);
	size parameter,It doesn't work at the moment
	Give the kernel a hint of how large the kernel event table needs to be
 Returns a file descriptor that identifies the event table in the kernel
 int epoll_ctl(int epfd,
               int op,
               int fd,
               struct epoll_event *event);
    epfd:epoll_create Return value of function
		EPOLL_CTL_ADD;	Register with timesheet fd Events
		EPOLL_CTL_MOD;	Modify event table fd Registration event for
		EPOLL_CTL_DEL;	Delete event table fd Events
    fd:File descriptor
    event:Registration event
  	struct epoll_event{
	  	uint32_t events;	//epoll event mask
	    epoll_data_t data;	//user data
	};
typedef union epoll_data{
    void *ptr;
    int fd;
    uint32_t u32;
    uint64_t u64;
} epoll_data_t;

On success the return value is 0; on failure it is -1 and errno is set
int epoll_wait(int epfd,struct epoll_event *events,
              int maxevents,int timeout);
Return and ready events through this function events Array return
 The number of ready file descriptors successfully returned, events There are several valid data in the array
events:Array is used to receive ready file descriptor events
maxevents:Maximum length of array
timeout:Timeout wait
	0:Return now
epoll_wait If an event is detected, the function copies all ready events from the kernel event table to the events In the array pointed to.
Copy to kernel event table events In the array pointed to

LT and ET modes
	LT(Level Trigger,Level trigger) default
        If the event is not processed each time epoll_wait Will be notified
    ET(Edge Trigger,Edge trigger) Efficient working mode
        If a file descriptor has data readable, the kernel will detect and notify the application
        If the application does not process the event immediately
        next time epoll_wait will not report this event to the application again
SIGIO signal

The signal triggers the read-write ready event. The user program performs read-write slave operation, and the program has no blocking phase

Asynchronous I/O

The kernel performs read-write operations and triggers read-write events. The program is not blocked

* if there is time to read in the kernel:

***Synchronous read*** must copy the data from kernel space into the user's memory before returning (the caller waits for the kernel-to-user copy)

ssize_t read(fd,buf,BUF_LEN);//Synchronous read

Return immediately after asynchronously reading and calling the function (do not wait)

#include <aio.h>
int aio_read(struct aiocb *aiocbp);
int aio_write(struct aiocb *aiocbp);

struct aiocb {
    //File descriptor to operate on asynchronously
    int aio_fildes;
    //Which asynchronous I/O operation to perform when used with lio_listio()
    int aio_lio_opcode;
    //Buffer for the asynchronous read or write
    volatile void *aio_buf;
    //Number of bytes to read or write asynchronously
    size_t aio_nbytes;
    //Structure describing how completion is signalled
    struct sigevent aio_sigevent;
};


#define BUFFER_SIZE 1024
int MAX_LIST = 2;

/*
 * Demo: asynchronous file read with POSIX AIO.
 * Submits one aio_read() on "test.txt", busy-waits until the request
 * leaves the EINPROGRESS state, then fetches the result with aio_return().
 * (Reconstructed from a garbled listing: braces and error paths restored.)
 */
int main(int argc, char **argv)
{
    // Control block describing the asynchronous request
    struct aiocb rd;
    int fd, ret, couter;

    fd = open("test.txt", O_RDONLY);
    if (fd < 0) {
        perror("open");
        return -1;
    }

    // Zero the control block before filling it in
    memset(&rd, 0, sizeof(rd));
    // Allocate the destination buffer for the read
    rd.aio_buf = malloc(BUFFER_SIZE + 1);
    if (rd.aio_buf == NULL) {
        close(fd);
        return -1;
    }
    rd.aio_fildes = fd;
    rd.aio_nbytes = BUFFER_SIZE;
    rd.aio_offset = 0;

    // Submit the asynchronous read; returns immediately without blocking
    ret = aio_read(&rd);
    if (ret < 0) {
        perror("aio_read");
        close(fd);
        return -1;
    }

    couter = 0;
    // Poll until the operation is no longer in progress
    while (aio_error(&rd) == EINPROGRESS) {
        printf("The first%d second\n", ++couter);
    }

    // Fetch the final status: bytes read, or -1 on error
    ret = aio_return(&rd);
    printf("\n\n The return value is:%d", ret);

    free((void *)rd.aio_buf);
    close(fd);
    return 0;
}
// The kernel raises the completion notification once the data has been copied to user space
2. Pool process pool thread pool

Suppose that the client sends a request when it connects (short connection, such as HTTP server)

Create and destroy threads or threads frequently (the proportion of CPU is basically creating and destroying threads)

This requires a process pool and a thread pool (not so important for long connections)

First create several processes or threads

When a client connects, let the thread in the thread pool connect with the client

After the client disconnects, the thread will not be destroyed and will be put back into the thread pool

3. Zero copy read / write

Read and write operations frequently copy data in user state and kernel state

Read: copy from kernel state to user state

Write: copy from user mode to kernel mode

#include <sys/sendfile.h>
       ssize_t sendfile(int out_fd, int in_fd, off_t *offset, size_t count);//Zero copy IO
//Put the file socket directly into the kernel and output it to the client
//Advanced I/O functions
//pipe  		 The Conduit
//dup/dup2 copy file descriptor
//readv/writev distributed read / write
//mmap munmap mapping physical memory
//splice 	 Zero-copy 
//tee 	 Zero-copy 
4. Up and down switching and locking

Context switching:

Intensive I/O server threads will switch frequently and occupy a large proportion of CPU time

You need to * * * reduce the number of threads * * * or semi synchronous / semi asynchronous mode


Another problem a concurrent program needs to consider is synchronization of shared resources (lock protection)

Locks are generally considered to be a very important factor leading to server inefficiency


1. Replace other modes without locking (multiplexing IO)

2. Reduce lock granularity

3. If the read-write frequency is greater than the write frequency, the read-write lock replaces the mutex lock

5. mmap / munmap function

The mmap function applies for a piece of memory. We use this memory as the shared memory for inter process communication, or map files into it, and munmap releases the applied memory

void* mmap(void *start, size_t length, int prot, int flags, int fd, off_t offset);
int munmap(void *start,size_t length);

It is used for data exchange between two file descriptors. It is also a zero copy

ssize_t splice(int fd_in,loff_t* off_in,int fd_out,loff_t* off_out,size_t len,unsigned int flags);

The tee function is used for data exchange between two pipelines, which is also a zero copy operation

ssize_t tee(int fd_in,int fd_out,size_t len,unsigned int flags);

Event collection:

Select: the user passes in the set of readable, writable and abnormal event file descriptors through three parameters. The kernel feeds back the ready events through online modification of these parameters, so that the user needs to reset these three parameters every time he calls select

poll: all event types are processed uniformly, so only one time set parameter is required. The user can pass in the event kernel through the events structure, and feed back the ready events by modifying the events structure

Epoll: the kernel directly manages all events registered by users through a schedule, so epoll is called every time_ When waiting, there is no need to repeatedly pass in the event registered by the user, epoll_ The wait parameter events is only used to feed back ready events and does not need to traverse all the file descriptor sets

The time complexity of the application index file descriptor

select: O(n); poll: O(n); epoll: O(1)

Maximum supported file descriptors

select: limited by FD_SETSIZE (typically 1024); poll & epoll: limited only by the process's open-file-descriptor limit

Working mode

select: LT poLL: LT epoll: the default LT supports ET;

Kernel Implementation and efficiency

select: polling is used to detect ready events. The time complexity of the algorithm is O(n)

poll: polling is used to detect ready events. The complexity of algorithm events is O(n)

epoll: the ready event is detected by callback. The time complexity of the algorithm is O(1)

Reuse I/O + thread / thread pool

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <errno.h>
#include <assert.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/epoll.h>

#define NAME_LEN 48
#define MAX_CLIENTS 100
#define MSG_LEN 1024

/* One connected chat client: socket, peer address and display name. */
typedef struct Client{
	int fd;						// connected socket returned by accept()
	struct sockaddr_in addr;	// peer address
	char name[NAME_LEN];		// nickname (empty until the client introduces itself)
}Client;

/* Global client table and the number of slots currently in use. */
Client gcls[MAX_CLIENTS+1] = {0};
int size = 0;

#ifndef MAX_CLIENTS
#define MAX_CLIENTS 100		/* listen backlog; matches the client table size */
#endif

/*
 * Create, bind and listen on a TCP server socket.
 * ip:   dotted-quad address string to bind to.
 * port: port number in host byte order.
 * Returns the listening socket descriptor; asserts (aborts) on failure.
 */
int init_server(const char *ip, unsigned short int port){
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	assert(fd != -1);

	struct sockaddr_in addr = {0};
	addr.sin_family = AF_INET;
	addr.sin_port = htons(port);
	addr.sin_addr.s_addr = inet_addr(ip);

	socklen_t len = sizeof(addr);
	int ret = bind(fd, (const struct sockaddr *)&addr, len);
	assert(ret != -1);

	ret = listen(fd, MAX_CLIENTS);
	assert(ret != -1);
	return fd;
}

void broadcast(int fd,const char *msg){
	int i;
		if(fd != gcls[i].fd){

void accept_client(int fd,int epfd){
	struct sockaddr_in addr = {};
	socklen_t len = sizeof(addr);
	int cfd = accept(fd,(struct sockaddr*)&addr,&len);
	if(cfd != -1){
		Client cls = {};
		cls.fd = cfd;
		cls.addr = addr;
		gcls[size] = cls;
		struct epoll_event event = {}; = EPOLLIN; = cfd;
		int ret = epoll_ctl(epfd,EPOLL_CTL_ADD,cfd,&event);
		if(ret == -1){

int recv_data(int fd){
	int index = 0;
		if(gcls[index].fd == fd){
	char msg[MSG_LEN] = {};
	int ret = 0;
		ret = recv(fd,msg,MSG_LEN,0);
		if(ret <= 0){
			return 0;
		strcat(msg," Enter the chat room,Welcome!");
		int len = strlen(msg);
		ret = recv(fd,msg+len,MSG_LEN-len,0);
		if(ret <= 0){
			msg[--len] = '\0';
			strcat(msg," Exit chat,Welcome everyone!");	
	if(ret <= 0)
		return 0;
	return 1;
/*
 * Event loop: create an epoll instance, register the listening socket
 * fd, then dispatch forever — new connections go to accept_client(),
 * data and disconnects go to recv_data().
 * (Reconstructed: the original listing lost the loop bodies and the
 * epoll_event field assignments.)
 */
void select_fd(int fd){
	int epfd = epoll_create(MAX_CLIENTS);	// size is only a hint on modern kernels
	if(epfd == -1){
		perror("epoll_create");
		return;
	}

	struct epoll_event event = {0};
	event.events = EPOLLIN;	// read event on the listening socket
	event.data.fd = fd;		// user data
	int ret = epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &event);
	if(ret == -1){
		close(epfd);
		return;
	}

	struct epoll_event events[MAX_CLIENTS + 1] = {0};
	int i;
	while(1){
		ret = epoll_wait(epfd, events, MAX_CLIENTS + 1, -1);
		if(ret == -1){
			if(errno == EINTR){
				continue;	// interrupted by a signal: just retry
			}
			break;
		}
		for(i = 0; i < ret; i++){
			if(events[i].data.fd == fd){
				// Listening socket readable: a client is connecting
				accept_client(fd, epfd);
			}else if(events[i].events & EPOLLIN){
				// A client socket has data (or has closed)
				int alive = recv_data(events[i].data.fd);
				if(alive == 0){
					struct epoll_event ev = {0};
					ev.events = EPOLLIN;
					ev.data.fd = events[i].data.fd;
					epoll_ctl(epfd, EPOLL_CTL_DEL, events[i].data.fd, &ev);
					close(events[i].data.fd);
				}
			}
		}
	}
	close(epfd);
}

/*
 * Usage: ./server <ip> <port>
 * Starts the chat server and runs the epoll event loop until it exits.
 */
int main(int argc, char *argv[]){
	if(argc < 3){
		printf("%s ip port\n", argv[0]);
		return -1;
	}
	int fd = init_server(argv[1], (unsigned short)atoi(argv[2]));
	select_fd(fd);	// blocks in the event loop under normal operation
	close(fd);
	return 0;
}

Reentrant function

readv /writev function

ssize_t readv(int fd,const struct iovec*  vector,int count);
ssize_t writev(int fd,const struct iovec*  vector,int count);

sendfile function

int fd = open("a.txt",O_RDONLY);
	read() copies the file from disk into the kernel, then from the kernel into user memory
	send() copies from user memory back into the kernel
	sendfile() does both inside the kernel: it loads from disk and sends directly, never touching user space

Topics: Database SQL