基于x86-64 Linux-5.0.1的Socket与系统调用深度分析
一、Socket API编程接口
Libc库中定义的一些应用编程接口(Application Program Interface, API)引用了封装例程(Wrapper Routine),一般一个封装例程对应一个系统调用,大部分封装例程返回一个整数,其值含义依赖于相应的系统调用,-1在多数情况下表示内核不能满足进程的请求,Libc中定义的errno变量包含特定的出错码。C语言中的Socket API就是一种涉及系统调用的API,常用的函数如下:
int socket(int domain, int type, int protocol) //创建一个新的套接字,返回套接字描述符 int connect(int sockfd, struct sockaddr *server_addr, int sockaddr_len) //同远程服务器主动连接,成功时返回0,失败时返回1 int bind(int sockfd, struct sockaddr* my_addr, int addrlen) //为套接字指明一个本地端点地址,TCP/IP协议使用sockaddr_in结构,包含IP地址和端口号,服务器使用它来指明熟悉的端口号,然后等待连接 int listen(int sockfd, int input_queue_size) //面向连接的服务器指明某个套接字,将其置为被动模式,并准备接收传入连接 int accept(int sockfd, void* addr, int* addrlen) //获取传入连接请求,返回新的连接套接字描述符,为每个新连接请求创建一个新的套接字,服务器只对新的连接使用该套接字,原来的监听套接字接受其他的连接请求。新的连接上传输数据使用新的套接字 int sendto(int sockfd, const void* data, int data_len, unsigned int flags, struct sockaddr* remaddr,int remaddr_len) //基于UDP发送数据报,返回实际发送的数据长度,出错时返回1 int send(int sockfd, const void* data, int data_len, unsigned int flags) //在TCP连接上发送数据,返回成功传送数据的长度,出错时返回-1,将外发数据复制到OS内核中 int recvfrom(int sockfd, void *buf, int buf_len,unsigned int flags,struct sockaddr *from,int *fromlen); //从UDP接收数据,返回实际接收的字节数,失败时返回-1 int recv(int sockfd, void* buf, int buf_len,unsigned int flags) //从TCP接收数据,返回实际接收的数据长度,出错时返回-1。服务器使用其接收客户请求,客户使用它接受服务器的应答。如果没有数据,将阻塞,如果收到的数据大于缓存的大小,多余的数据将丢弃 close(int sockfd) //撤销套接字,如果只有一个进程使用,立即终止连接并撤销该套接字,如果多个进程共享该套接字,将引用数减一,如果引用数降到零,则撤销它
图1 UDP连接涉及的Socket API
图2 TCP连接涉及的Socket API
二、系统调用机制及内核中相关源代码
图3 应?程序、封装例程、系统调?处理程序及系统调?服务例程之间的关系
x86-64Linux系统启动时依次调用以下过程:start_kernel --> trap_init --> cpu_init --> syscall_init,而syscall_init函数实现了系统调用的初始化将中断向量与服务例程进行绑定。除此之外,还要进行系统调用表(对应于sys_call_table 数组)的初始化。在linux-5.0.1/arch/x86/kernel/cpu/common.c中定义了sysycall_init函数:
/* May not be marked __init: used by software suspend */ void syscall_init(void) { wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); #ifdef CONFIG_IA32_EMULATION wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat); /* * This only works on Intel CPUs. * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP. * This does not cause SYSENTER to jump to the wrong location, because * AMD doesn‘t allow SYSENTER in long mode (either 32- or 64-bit). */ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1)); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); #else wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret); wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG); wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL); #endif /* Flags to clear on syscall */ wrmsrl(MSR_SYSCALL_MASK, X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF| X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT); }
在一个终端打开qemu启动MenuOS,在另一个终端用gdb读入linux-5.0.1的vmlinux,通过端口1234与qemu建立连接,在start_kernel,trap_init,cpu_init,syscall_init处设置断点,然后不断continue,跟踪验证内核启动及系统调用初始化过程,如下图所示:
当用户态程序进行系统调用时,CPU会切换到内核态并开始执行一个内核函数,内核实现了很多不同的系统调用,进程必须传递一个叫作系统调用号的参数来指明需要哪个系统调用。对于x86-64系统来说,用户态程序发起系统调用时,进程会跳转到entry_SYSCALL_64,在linux-5.0.1/arch/x86/entry/entry_64.S和//以下代码来自linux-5.0.1/arch/x86/entry/common.c中定义了x86-64的系统调用服务例程:
//以下代码来自linux-5.0.1/arch/x86/entry/entry_64.SGLOBAL(entry_SYSCALL_64_after_hwframe) ... /* IRQs are off. */ movq %rax, %rdi movq %rsp, %rsi call do_syscall_64 /* returns with IRQs disabled */ ...//以下代码来自linux-5.0.1/arch/x86/entry/common.c #ifdef CONFIG_X86_64 __visible void do_syscall_64(unsigned long nr, struct pt_regs *regs) { ... if (likely(nr < NR_syscalls)) { nr = array_index_nospec(nr, NR_syscalls); regs->ax = sys_call_table[nr](regs); ... } #endif
三、socket相关系统调用的内核处理函数
在linux-5.0.1/arch/x86/entry/syscalls/syscall_64.tbl中可查看x86-64系统调用号,及其对应API和入口(此处只摘取Socket相关的部分):
# # 64-bit system call numbers and entry vectors # # The format is: # <number> <abi> <name> <entry point> # # The __x64_sys_*() stubs are created on-the-fly for sys_*() system calls # # The abi is "common", "64" or "x32" for this file. # ... 41 common socket __x64_sys_socket 42 common connect __x64_sys_connect 43 common accept __x64_sys_accept 44 common sendto __x64_sys_sendto 45 64 recvfrom __x64_sys_recvfrom 46 64 sendmsg __x64_sys_sendmsg 47 64 recvmsg __x64_sys_recvmsg 48 common shutdown __x64_sys_shutdown 49 common bind __x64_sys_bind 50 common listen __x64_sys_listen 51 common getsockname __x64_sys_getsockname 52 common getpeername __x64_sys_getpeername 53 common socketpair __x64_sys_socketpair 54 64 setsockopt __x64_sys_setsockopt 55 64 getsockopt __x64_sys_getsockopt
在linux-5.0.1/net/socket.c中可以查看Socket接口对应的Linux内核系统调用处理函数:
/* * System call vectors. * * Argument checking cleaned up. Saved 20% in size. * This function doesn‘t need to set the kernel lock because * it is set by the callees. */ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args) { unsigned long a[AUDITSC_ARGS]; unsigned long a0, a1; int err; unsigned int len; if (call < 1 || call > SYS_SENDMMSG) return -EINVAL; call = array_index_nospec(call, SYS_SENDMMSG + 1); len = nargs[call]; if (len > sizeof(a)) return -EINVAL; /* copy_from_user should be SMP safe. */ if (copy_from_user(a, args, len)) return -EFAULT; err = audit_socketcall(nargs[call] / sizeof(unsigned long), a); if (err) return err; a0 = a[0]; a1 = a[1]; switch (call) { case SYS_SOCKET: err = __sys_socket(a0, a1, a[2]); break; case SYS_BIND: err = __sys_bind(a0, (struct sockaddr __user *)a1, a[2]); break; case SYS_CONNECT: err = __sys_connect(a0, (struct sockaddr __user *)a1, a[2]); break; case SYS_LISTEN: err = __sys_listen(a0, a1); break; case SYS_ACCEPT: err = __sys_accept4(a0, (struct sockaddr __user *)a1, (int __user *)a[2], 0); break; case SYS_GETSOCKNAME: err = __sys_getsockname(a0, (struct sockaddr __user *)a1, (int __user *)a[2]); break; case SYS_GETPEERNAME: err = __sys_getpeername(a0, (struct sockaddr __user *)a1, (int __user *)a[2]); break; case SYS_SOCKETPAIR: err = __sys_socketpair(a0, a1, a[2], (int __user *)a[3]); break; case SYS_SEND: err = __sys_sendto(a0, (void __user *)a1, a[2], a[3], NULL, 0); break; case SYS_SENDTO: err = __sys_sendto(a0, (void __user *)a1, a[2], a[3], (struct sockaddr __user *)a[4], a[5]); break; case SYS_RECV: err = __sys_recvfrom(a0, (void __user *)a1, a[2], a[3], NULL, NULL); break; case SYS_RECVFROM: err = __sys_recvfrom(a0, (void __user *)a1, a[2], a[3], (struct sockaddr __user *)a[4], (int __user *)a[5]); break; case SYS_SHUTDOWN: err = __sys_shutdown(a0, a1); break; case SYS_SETSOCKOPT: err = __sys_setsockopt(a0, a1, a[2], (char __user *)a[3], a[4]); break; case SYS_GETSOCKOPT: err = __sys_getsockopt(a0, a1, a[2], (char __user *)a[3], (int __user *)a[4]); break; case SYS_SENDMSG: err = __sys_sendmsg(a0, (struct user_msghdr __user *)a1, a[2], true); break; case SYS_SENDMMSG: err = __sys_sendmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3], true); break; case SYS_RECVMSG: err = __sys_recvmsg(a0, (struct user_msghdr __user *)a1, a[2], true); break; case SYS_RECVMMSG: if (IS_ENABLED(CONFIG_64BIT) || !IS_ENABLED(CONFIG_64BIT_TIME)) err = __sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3], (struct __kernel_timespec __user *)a[4], NULL); else err = __sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3], NULL, (struct old_timespec32 __user *)a[4]); break; case SYS_ACCEPT4: err = __sys_accept4(a0, (struct sockaddr __user *)a1, (int __user *)a[2], a[3]); break; default: err = -EINVAL; break; } return err; } #endif /* __ARCH_WANT_SYS_SOCKETCALL */
接下来我们将通过gdb跟踪MenuOS中replyhi和hello指令的执行过程来了解Socket相关系统调用的内核函数。先打开linuxnet/lab3/main.c查看指令的实现代码:
#include"syswrapper.h" #define MAX_CONNECT_QUEUE 1024 int Replyhi() { char szBuf[MAX_BUF_LEN] = "\0"; char szReplyMsg[MAX_BUF_LEN] = "hi\0"; InitializeService(); while (1) { ServiceStart(); RecvMsg(szBuf); SendMsg(szReplyMsg); ServiceStop(); } ShutdownService(); return 0; } int StartReplyhi(int argc, char *argv[]) { int pid; /* fork another process */ pid = fork(); if (pid < 0) { /* error occurred */ fprintf(stderr, "Fork Failed!"); exit(-1); } else if (pid == 0) { /* child process */ Replyhi(); printf("Reply hi TCP Service Started!\n"); } else { /* parent process */ printf("Please input hello...\n"); } } int Hello(int argc, char *argv[]) { char szBuf[MAX_BUF_LEN] = "\0"; char szMsg[MAX_BUF_LEN] = "hello\0"; OpenRemoteService(); SendMsg(szMsg); RecvMsg(szBuf); CloseRemoteService(); return 0; }
不难发现Replyhi()和Hello()调用了封装函数InitializeService(),ServiceStart(),RecvMsg(szBuf),SendMsg(),ServiceStop(),OpenRemoteService(),CloseRemoteService(),打开linuxnet/lab3/syswrapper.h查看这些封装函数的实现:
/********************************************************************/ /* Copyright (C) SSE-USTC, 2012 */ /* */ /* FILE NAME : syswraper.h */ /* PRINCIPAL AUTHOR : Mengning */ /* SUBSYSTEM NAME : system */ /* MODULE NAME : syswraper */ /* LANGUAGE : C */ /* TARGET ENVIRONMENT : Linux */ /* DATE OF FIRST RELEASE : 2012/11/22 */ /* DESCRIPTION : the interface to Linux system(socket) */ /********************************************************************/ /* * Revision log: * * Created by Mengning,2012/11/22 * */ #ifndef _SYS_WRAPER_H_ #define _SYS_WRAPER_H_ #include<stdio.h> #include<arpa/inet.h> /* internet socket */ #include<string.h> //#define NDEBUG #include<assert.h> #define PORT 5001 #define IP_ADDR "127.0.0.1" #define MAX_BUF_LEN 1024 /* private macro */ #define PrepareSocket(addr,port) int sockfd = -1; struct sockaddr_in serveraddr; struct sockaddr_in clientaddr; socklen_t addr_len = sizeof(struct sockaddr); serveraddr.sin_family = AF_INET; serveraddr.sin_port = htons(port); serveraddr.sin_addr.s_addr = inet_addr(addr); memset(&serveraddr.sin_zero, 0, 8); sockfd = socket(PF_INET,SOCK_STREAM,0); #define InitServer() int ret = bind( sockfd, (struct sockaddr *)&serveraddr, sizeof(struct sockaddr)); if(ret == -1) { fprintf(stderr,"Bind Error,%s:%d\n", __FILE__,__LINE__); close(sockfd); return -1; } listen(sockfd,MAX_CONNECT_QUEUE); #define InitClient() int ret = connect(sockfd, (struct sockaddr *)&serveraddr, sizeof(struct sockaddr)); if(ret == -1) { fprintf(stderr,"Connect Error,%s:%d\n", __FILE__,__LINE__); return -1; } /* public macro */ #define InitializeService() \ PrepareSocket(IP_ADDR,PORT); InitServer(); #define ShutdownService() \ close(sockfd); #define OpenRemoteService() \ PrepareSocket(IP_ADDR,PORT); InitClient(); int newfd = sockfd; #define CloseRemoteService() \ close(sockfd); #define ServiceStart() int newfd = accept( sockfd, (struct sockaddr *)&clientaddr, &addr_len); if(newfd == -1) { fprintf(stderr,"Accept Error,%s:%d\n", __FILE__,__LINE__); } #define ServiceStop() \ close(newfd); #define RecvMsg(buf) \ ret = recv(newfd,buf,MAX_BUF_LEN,0); if(ret > 0) { printf("recv \"%s\" from %s:%d\n", buf, (char*)inet_ntoa(clientaddr.sin_addr), ntohs(clientaddr.sin_port)); } #define SendMsg(buf) \ ret = send(newfd,buf,strlen(buf),0); if(ret > 0) { printf("rely \"hi\" to %s:%d\n", (char*)inet_ntoa(clientaddr.sin_addr), ntohs(clientaddr.sin_port)); } #endif /* _SYS_WRAPER_H_ */
不难发现,过程中涉及到的Socket相关API有socket(),bind(),listen(),accept(),recv(),send(),close(),connect()。于是我们在一个终端打开qemu启动MenuOS(指令中去掉 -S),在另一个终端用gdb读入linux-5.0.1的vmlinux,通过端口1234与qemu建立连接,在相关的系统调用内核处理函数处设置断点,如下图所示:
先在gdb输入一次continue令MenuOS完成启动,然后在qemu中输入replyhi,再在gdb中不断continue,显示的调用过程如下:
然后在qemu中输入hello,再在gdb中不断continue,显示的调用过程如下:
最后查看qemu发现指令已经完整运行:
参考文献:
1.https://github.com/torvalds/linux