"一切皆文件"是Linux的设计哲学,在网络编程中也不例外。不管是在服务端还是在客户端,都会调用socket系统调用来创建一个socket虚拟文件,该系统调用返回该文件的fd,后续的操作都会基于该fd。本文就来分析一下在socket系统调用的底层做了哪些事情。
系统调用定义
/*
 * Entry point of the socket system call.
 *
 * Hands the three user-supplied arguments, unchanged, to __sys_socket()
 * and returns its result.
 */
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
	int ret = __sys_socket(family, type, protocol);

	return ret;
}
/*
 * Create a socket and bind it to a new file descriptor.
 *
 * @family:   protocol family requested by the caller
 * @type:     socket type, optionally OR-ed with SOCK_NONBLOCK / SOCK_CLOEXEC
 * @protocol: protocol number requested by the caller
 *
 * Returns the new file descriptor, or a negative errno taken either from
 * __sys_socket_create() or from sock_map_fd().
 */
int __sys_socket(int family, int type, int protocol)
{
	struct socket *new_sock = __sys_socket_create(family, type, protocol);
	int fd_flags;

	if (IS_ERR(new_sock))
		return PTR_ERR(new_sock);

	/* Everything outside the type mask is a creation flag. */
	fd_flags = type & ~SOCK_TYPE_MASK;

	/* Translate SOCK_NONBLOCK to O_NONBLOCK where the two values differ. */
	if (SOCK_NONBLOCK != O_NONBLOCK && (fd_flags & SOCK_NONBLOCK)) {
		fd_flags &= ~SOCK_NONBLOCK;
		fd_flags |= O_NONBLOCK;
	}

	/* Only these two flags are forwarded to the file layer. */
	fd_flags &= O_CLOEXEC | O_NONBLOCK;

	/* Expose the socket as a file; it then shows up under /proc/<pid>/fd/. */
	return sock_map_fd(new_sock, fd_flags);
}
整体来说,主要分为了两部分:
- 创建socket;
- 将socket映射为文件,并获取fd;
创建socket
/*
 * Validate the flag bits carried in @type and create the socket.
 *
 * Returns the new socket, ERR_PTR(-EINVAL) when @type carries bits other
 * than the type itself, SOCK_CLOEXEC and SOCK_NONBLOCK, or ERR_PTR() of
 * the error reported by sock_create().
 */
static struct socket *__sys_socket_create(int family, int type, int protocol)
{
	struct socket *new_sock;
	int extra_flags;
	int err;

	/* Compile-time sanity checks on the SOCK_* constants. */
	BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
	BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
	BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
	BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

	extra_flags = type & ~SOCK_TYPE_MASK;
	if (extra_flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return ERR_PTR(-EINVAL);

	/*
	 * Only the bare socket type is passed down; family and protocol
	 * are forwarded exactly as the caller supplied them.
	 */
	err = sock_create(family, type & SOCK_TYPE_MASK, protocol, &new_sock);

	return err < 0 ? ERR_PTR(err) : new_sock;
}
/*
 * Create a socket in the network namespace of the current task.
 *
 * Wrapper around __sock_create() that passes current's net namespace and
 * kern == 0. On success *res receives the new socket.
 */
int sock_create(int family, int type, int protocol, struct socket **res)
{
	struct net *net = current->nsproxy->net_ns;

	return __sock_create(net, family, type, protocol, res, 0);
}
/*
 * __sock_create - create a socket of the requested family/type/protocol.
 *
 * @net:      network namespace handed to the family's ->create callback
 * @family:   protocol family; used as the index into net_families[]
 * @type:     socket type, stored in sock->type
 * @protocol: protocol number, forwarded to the family's ->create callback
 * @res:      on success, *res is set to the new socket
 * @kern:     forwarded to the security hooks and to ->create
 *
 * Returns 0 on success or a negative errno: -EAFNOSUPPORT when @family is
 * out of range, has no registered handler, or a module reference cannot
 * be taken; -EINVAL when @type is out of range; -ENFILE when sock_alloc()
 * fails; otherwise the error from a security hook or from ->create.
 */
int __sock_create(struct net *net, int family, int type, int protocol,
struct socket **res, int kern)
{
int err;
struct socket *sock;
const struct net_proto_family *pf;
/*
* Check protocol is in range
*/
if (family < 0 || family >= NPROTO)
return -EAFNOSUPPORT;
if (type < 0 || type >= SOCK_MAX)
return -EINVAL;
/* Compatibility.
This uglymoron is moved from INET layer to here to avoid
deadlock in module load.
*/
if (family == PF_INET && type == SOCK_PACKET) {
pr_info_once("%s uses obsolete (PF_INET,SOCK_PACKET)\n",
current->comm);
family = PF_PACKET;
}
err = security_socket_create(family, type, protocol, kern);
if (err)
return err;
/*
* Allocate the socket and allow the family to set things up. if
* the protocol is 0, the family is instructed to select an appropriate
* default.
*/
// Allocate the struct socket object.
sock = sock_alloc();
if (!sock) {
net_warn_ratelimited("socket: no more sockets\n");
return -ENFILE; /* Not exactly a match, but its the
closest posix thing */
}
// Record the socket type requested by the caller. Which concrete protocol
// is used is still to be decided from the protocol argument.
sock->type = type;
#ifdef CONFIG_MODULES
/* Attempt to load a protocol module if the find failed.
*
* 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
* requested real, full-featured networking support upon configuration.
* Otherwise module support will break!
*/
if (rcu_access_pointer(net_families[family]) == NULL)
request_module("net-pf-%d", family);
#endif
rcu_read_lock();
/*
 * Use the caller-supplied family to fetch the matching net_proto_family
 * (it carries the create function pointer that is called below).
 * Families are placed into net_families by sock_register() during
 * network subsystem initialisation; for example inet_init() in
 * net/ipv4/af_inet.c registers inet_family_ops this way.
 */
pf = rcu_dereference(net_families[family]);
err = -EAFNOSUPPORT;
if (!pf)
goto out_release;
/*
* We will call the ->create function, that possibly is in a loadable
* module, so we have to bump that loadable module refcnt first.
*/
if (!try_module_get(pf->owner))
goto out_release;
/* Now protected by module ref count */
rcu_read_unlock();
// Call the family's create function; for AF_INET this is inet_create()
// in net/ipv4/af_inet.c.
err = pf->create(net, sock, protocol, kern);
if (err < 0)
goto out_module_put;
/*
* Now to bump the refcnt of the [loadable] module that owns this
* socket at sock_release time we decrement its refcnt.
*/
if (!try_module_get(sock->ops->owner))
goto out_module_busy;
/*
* Now that we're done with the ->create function, the [loadable]
* module can have its refcnt decremented
*/
module_put(pf->owner);
err = security_socket_post_create(sock, family, type, protocol, kern);
if (err)
goto out_sock_release;
*res = sock;
return 0;
// Error unwinding. The labels fall through into one another, so each
// entry point undoes its own step and everything acquired before it.
out_module_busy:
err = -EAFNOSUPPORT;
out_module_put:
sock->ops = NULL;
module_put(pf->owner);
out_sock_release:
sock_release(sock);
return err;
out_release:
// Reached while still inside the RCU read-side section.
rcu_read_unlock();
goto out_sock_release;
}
真正的关键在于__sock_create函数中,核心就是两步:
- 查找协议族:根据参数family来查找协议族,在一些man文档中该参数又被称为domain,只是名称不同而已。常用的协议族有AF_INET和AF_INET6,分别针对ipv4和ipv6;还有针对本地通信的AF_UNIX。
- 创建socket:调用该协议族的create方法,完成具体的socket创建;
对于协议族,本文只分析最常用的AF_INET。
初始化协议栈
在内核启动的时候,会初始化协议栈,即将协议族注册到net_families数组中。
/*
 * inet_init - initialise the IPv4 protocol stack.
 *
 * Registers the TCP/UDP/raw/ping proto objects, registers the PF_INET
 * family with the socket layer, fills the inetsw[] lists and then runs
 * the initialisers of the individual sub-layers.
 *
 * Returns 0 on success. Only a proto_register() failure is returned to
 * the caller (after unregistering the protos registered before it);
 * later failures are either logged with pr_crit() or cause a panic().
 */
static int __init inet_init(void)
{
struct inet_protosw *q;
struct list_head *r;
int rc;
sock_skb_cb_check_size(sizeof(struct inet_skb_parm));
raw_hashinfo_init(&raw_v4_hashinfo);
// Register the proto object of each protocol (TCP, UDP, raw, ping).
rc = proto_register(&tcp_prot, 1);
if (rc)
goto out;
rc = proto_register(&udp_prot, 1);
if (rc)
goto out_unregister_tcp_proto;
rc = proto_register(&raw_prot, 1);
if (rc)
goto out_unregister_udp_proto;
rc = proto_register(&ping_prot, 1);
if (rc)
goto out_unregister_raw_proto;
/*
* Tell SOCKET that we are alive...
*/
// Register the IPv4 protocol family; the return value is deliberately
// discarded.
(void)sock_register(&inet_family_ops);
#ifdef CONFIG_SYSCTL
ip_static_sysctl_init();
#endif
/*
* Add all the base protocols.
*/
// Add all the base network protocols.
if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
pr_crit("%s: Cannot add ICMP protocol\n", __func__);
if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
pr_crit("%s: Cannot add UDP protocol\n", __func__);
if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
pr_crit("%s: Cannot add TCP protocol\n", __func__);
#ifdef CONFIG_IP_MULTICAST
if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
pr_crit("%s: Cannot add IGMP protocol\n", __func__);
#endif
/* Register the socket-side information for inet_create. */
// Initialise every element of inetsw, i.e. every list head.
for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
INIT_LIST_HEAD(r);
// Add the predefined inet_protosw entries of inetsw_array to inetsw.
for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
inet_register_protosw(q);
/*
* Set the ARP module up
*/
// Initialise ARP.
arp_init();
/*
* Set the IP module up
*/
// Initialise IP.
ip_init();
/* Initialise per-cpu ipv4 mibs */
if (init_ipv4_mibs())
panic("%s: Cannot init ipv4 mibs\n", __func__);
/* Setup TCP slab cache for open requests. */
tcp_init();
/* Setup UDP memory threshold */
udp_init();
/* Add UDP-Lite (RFC 3828) */
udplite4_register();
raw_init();
ping_init();
/*
* Set the ICMP layer up
*/
if (icmp_init() < 0)
panic("Failed to create the ICMP control socket.\n");
/*
* Initialise the multicast router
*/
#if defined(CONFIG_IP_MROUTE)
if (ip_mr_init())
pr_crit("%s: Cannot init ipv4 mroute\n", __func__);
#endif
if (init_inet_pernet_ops())
pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__);
ipv4_proc_init();
ipfrag_init();
// Register the IP packet type.
dev_add_pack(&ip_packet_type);
ip_tunnel_core_init();
rc = 0;
out:
return rc;
// Unwind ladder: each label unregisters one proto and falls through to
// the ones registered before it.
out_unregister_raw_proto:
proto_unregister(&raw_prot);
out_unregister_udp_proto:
proto_unregister(&udp_prot);
out_unregister_tcp_proto:
proto_unregister(&tcp_prot);
goto out;
}
初始化协议族
在inet_init函数中,有多项重要的初始化,其中就包括调用sock_register函数来注册协议族。
/*
 * Register a protocol family with the socket layer.
 *
 * @ops: family descriptor; ops->family selects the net_families[] slot.
 *
 * Returns 0 when the slot was free and has been filled, -EEXIST when the
 * slot is already occupied, and -ENOBUFS when ops->family is not below
 * NPROTO.
 */
int sock_register(const struct net_proto_family *ops)
{
	int err = -EEXIST;

	if (ops->family >= NPROTO) {
		pr_crit("protocol %d >= NPROTO(%d)\n", ops->family, NPROTO);
		return -ENOBUFS;
	}

	spin_lock(&net_family_lock);
	if (!rcu_dereference_protected(net_families[ops->family],
				       lockdep_is_held(&net_family_lock))) {
		/* Slot is free: publish the family in the array. */
		rcu_assign_pointer(net_families[ops->family], ops);
		err = 0;
	}
	spin_unlock(&net_family_lock);

	/* NOTE: this message is emitted on the -EEXIST path as well. */
	pr_info("NET: Registered %s protocol family\n", pf_family_names[ops->family]);

	return err;
}
/*
 * Family descriptor for PF_INET.
 *
 * inet_init() registers this variable in the kernel's net_families array
 * through sock_register(). When an application issues the socket system
 * call, the sock_create() path looks the family up there and calls its
 * create function, which is set to inet_create here.
 */
static const struct net_proto_family inet_family_ops = {
.family = PF_INET, // the PF_INET macro has the value 2
.create = inet_create,
.owner = THIS_MODULE,
};
注册时,传递了ipv4协议族操作函数集,其中就包括创建socket所需的inet_create函数。
inet_create函数
/*
 * inet_create - PF_INET implementation of net_proto_family->create.
 *
 * @net:      network namespace the new sock is allocated in
 * @sock:     socket to set up; sock->type selects the inetsw[] list
 * @protocol: requested protocol number; 0 (IPPROTO_IP) selects the
 *            first entry on the list for that type
 * @kern:     non-zero skips the CAP_NET_RAW check and the cgroup BPF hook
 *
 * Returns 0 on success or a negative errno: -EINVAL for an out-of-range
 * protocol, -ESOCKTNOSUPPORT when the list for the type is empty,
 * -EPROTONOSUPPORT when no entry matches even after the module-load
 * retries, -EPERM for an unprivileged SOCK_RAW request, -ENOMEM when
 * sk_alloc() fails, or the error from the protocol's hash/init callbacks
 * or from the BPF hook.
 */
static int inet_create(struct net *net, struct socket *sock, int protocol,
int kern)
{
/*
 * How do struct sock and struct socket differ?
 * struct socket refers to a struct sock (sock->sk) and, per the original
 * author's note, struct sock refers back to its struct socket.
 * struct sock mainly encapsulates the network-protocol side.
 */
struct sock *sk;
struct inet_protosw *answer;
struct inet_sock *inet;
// Transport-layer proto of the matched inetsw entry (tcp_prot for TCP).
struct proto *answer_prot;
unsigned char answer_flags;
int try_loading_module = 0;
int err;
if (protocol < 0 || protocol >= IPPROTO_MAX)
return -EINVAL;
// The initial state is UNCONNECTED.
sock->state = SS_UNCONNECTED;
/* Look for the requested type/protocol pair. */
lookup_protocol:
err = -ESOCKTNOSUPPORT;
rcu_read_lock();
/*
 * Find the target protocol. "list" names the list-linkage field of
 * struct inet_protosw. inetsw[sock->type] is the list of protocols for
 * this type; it is walked looking for the requested protocol.
 */
list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {
err = 0;
/* Check the non-wild match. */
if (protocol == answer->protocol) {
if (protocol != IPPROTO_IP)
break;
} else {
/* Check for the two wild cases. */
/*
 * If the caller passed protocol 0 (IPPROTO_IP has the value 0), the
 * default protocol of the type is used, i.e. the first node on that
 * type's list. For SOCK_STREAM, which is TCP, the caller may pass
 * either IPPROTO_IP (0) or IPPROTO_TCP (6); with IPPROTO_TCP the
 * exact-match branch above is taken instead.
 */
if (IPPROTO_IP == protocol) {
// The caller may have passed 0; replace it with the number of the
// default protocol, e.g. 6 for TCP.
protocol = answer->protocol;
break;
}
// In inetsw_array only the SOCK_RAW entry uses IPPROTO_IP (wild card).
if (IPPROTO_IP == answer->protocol)
break;
}
err = -EPROTONOSUPPORT;
}
// Error path: no matching protocol was found.
if (unlikely(err)) {
if (try_loading_module < 2) {
rcu_read_unlock();
/*
* Be more specific, e.g. net-pf-2-proto-132-type-1
* (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
*/
if (++try_loading_module == 1)
request_module("net-pf-%d-proto-%d-type-%d",
PF_INET, protocol, sock->type);
/*
* Fall back to generic, e.g. net-pf-2-proto-132
* (net-pf-PF_INET-proto-IPPROTO_SCTP)
*/
else
request_module("net-pf-%d-proto-%d",
PF_INET, protocol);
goto lookup_protocol;
} else
goto out_rcu_unlock;
}
err = -EPERM;
if (sock->type == SOCK_RAW && !kern &&
!ns_capable(net->user_ns, CAP_NET_RAW))
goto out_rcu_unlock;
/*
 * ops, prot and flags are all fields of struct inet_protosw; their
 * concrete values are the ones assigned in inetsw_array.
 */
// Install the protocol's ops on the socket; for TCP this is
// inet_stream_ops.
sock->ops = answer->ops;
// Fetch the transport-layer protocol; for TCP this is tcp_prot.
answer_prot = answer->prot;
answer_flags = answer->flags;
rcu_read_unlock();
WARN_ON(!answer_prot->slab);
err = -ENOMEM;
// Allocate the struct sock (sk), passing in the transport-layer protocol
// object answer_prot.
sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern);
if (!sk)
goto out;
err = 0;
if (INET_PROTOSW_REUSE & answer_flags)
sk->sk_reuse = SK_CAN_REUSE;
/*
 * Why can a struct sock be converted to a struct inet_sock here?
 * NOTE(review): presumably struct inet_sock embeds struct sock as its
 * first member and the object allocated by sk_alloc() is sized by the
 * proto's obj_size (sizeof(struct tcp_sock) for TCP) - confirm.
 */
inet = inet_sk(sk);
inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
inet->nodefrag = 0;
if (SOCK_RAW == sock->type) {
inet->inet_num = protocol;
if (IPPROTO_RAW == protocol)
inet->hdrincl = 1;
}
if (READ_ONCE(net->ipv4.sysctl_ip_no_pmtu_disc))
inet->pmtudisc = IP_PMTUDISC_DONT;
else
inet->pmtudisc = IP_PMTUDISC_WANT;
inet->inet_id = 0;
// Initialise the sk object.
sock_init_data(sock, sk);
sk->sk_destruct = inet_sock_destruct;
sk->sk_protocol = protocol;
sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
inet->uc_ttl = -1;
inet->mc_loop = 1;
inet->mc_ttl = 1;
inet->mc_all = 1;
inet->mc_index = 0;
inet->mc_list = NULL;
inet->rcv_tos = 0;
sk_refcnt_debug_inc(sk);
if (inet->inet_num) {
/* It assumes that any protocol which allows
* the user to assign a number at socket
* creation time automatically
* shares.
*/
inet->inet_sport = htons(inet->inet_num);
/* Add to protocol hash chains. */
err = sk->sk_prot->hash(sk);
if (err) {
sk_common_release(sk);
goto out;
}
}
if (sk->sk_prot->init) {
/*
 * Call the transport protocol's init function. For TCP this is
 * tcp_v4_init_sock, which among other things sets up the timers
 * behind retransmission and keepalive.
 */
err = sk->sk_prot->init(sk);
if (err) {
sk_common_release(sk);
goto out;
}
}
if (!kern) {
err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk);
if (err) {
sk_common_release(sk);
goto out;
}
}
out:
return err;
out_rcu_unlock:
rcu_read_unlock();
goto out;
}
该函数较长,主要的操作可以总结为如下:
- 查找协议;
- 创建sk对象;
- 初始化sk对象;
- 调用协议的初始化函数;
查找协议
会根据socket系统调用的第二个参数type从inetsw数组中找到协议类型链表,然后遍历该链表中的协议并匹配第三个参数protocol。在上面的inet_init方法中,会调用inet_register_protosw函数将inetsw_array数组中的元素添加到inetsw这个链表数组中。
/*
 * Built-in IPv4 protocol switch entries.
 *
 * inet_init() walks every element of this array and hands it to
 * inet_register_protosw(), which adds the inet_protosw to the list for
 * its type in the inetsw array.
 *
 * One type may carry several protocols: SOCK_DGRAM below has both UDP
 * and ICMP, and another SOCK_STREAM protocol besides TCP could be
 * registered in the same way.
 */
static struct inet_protosw inetsw_array[] =
{
{
.type = SOCK_STREAM,
.protocol = IPPROTO_TCP,
.prot = &tcp_prot,
.ops = &inet_stream_ops,
.flags = INET_PROTOSW_PERMANENT |
INET_PROTOSW_ICSK,
},
{
.type = SOCK_DGRAM,
.protocol = IPPROTO_UDP,
.prot = &udp_prot,
.ops = &inet_dgram_ops,
.flags = INET_PROTOSW_PERMANENT,
},
{
.type = SOCK_DGRAM,
.protocol = IPPROTO_ICMP,
.prot = &ping_prot,
.ops = &inet_sockraw_ops,
.flags = INET_PROTOSW_REUSE,
},
{
.type = SOCK_RAW,
.protocol = IPPROTO_IP, /* wild card */
.prot = &raw_prot,
.ops = &inet_sockraw_ops,
.flags = INET_PROTOSW_REUSE,
}
};
/*
 * inet_register_protosw - add a protocol switch entry to inetsw[p->type].
 *
 * The entry is inserted after the last permanent entry at the front of
 * the list. The request is rejected with a pr_err() message when p->type
 * is not below SOCK_MAX, or when a permanent entry with the same protocol
 * number already exists. Runs under inetsw_lock.
 */
void inet_register_protosw(struct inet_protosw *p)
{
struct list_head *lh;
struct inet_protosw *answer;
int protocol = p->protocol;
struct list_head *last_perm;
spin_lock_bh(&inetsw_lock);
if (p->type >= SOCK_MAX)
goto out_illegal;
/* If we are trying to override a permanent protocol, bail. */
// Start from the head of the list for this socket type.
last_perm = &inetsw[p->type];
// Walk the circular list: lh starts at the node after the head and the
// walk ends when lh is the head again.
list_for_each(lh, &inetsw[p->type]) {
answer = list_entry(lh, struct inet_protosw, list);
/* Check only the non-wild match. */
if ((INET_PROTOSW_PERMANENT & answer->flags) == 0)
break;
// A permanent entry with this protocol number is already on the list:
// refuse the registration.
if (protocol == answer->protocol)
goto out_permanent;
last_perm = lh;
}
/* Add the new entry after the last permanent entry if any, so that
* the new entry does not override a permanent entry when matched with
* a wild-card protocol. But it is allowed to override any existing
* non-permanent entry. This means that when we remove this entry, the
* system automatically returns to the old behavior.
*/
// Link the protocol into the list.
list_add_rcu(&p->list, last_perm);
out:
spin_unlock_bh(&inetsw_lock);
return;
out_permanent:
pr_err("Attempt to override permanent protocol %d\n", protocol);
goto out;
out_illegal:
pr_err("Ignoring attempt to register invalid socket type %d\n",
p->type);
goto out;
}
对于TCP而言(type是SOCK_STREAM),在inetsw_array中就只定义了一种协议,所以最后链表中就只有一个节点,所以上面socket系统调用的第三个参数protocol可以传入0,表示使用默认的协议。对于SOCK_DGRAM类型而言,有两种协议,UDP和ICMP,所以链表中会有两个节点。
由于协议众多,本文只分析最常用的TCP。
回到inet_create函数中,对于TCP而言,answer变量就指向inetsw_array数组中的第一个元素。然后会把协议的操作函数集ops赋值给socket变量,以便实现后续对底层协议层的调用。
TCP初始化
在inet_create函数中,传给sk_alloc的answer_prot对于TCP而言就是tcp_prot,因此后续通过sk->sk_prot访问到的就是tcp_prot。
/*
 * Transport-layer operations for TCP over IPv4.
 *
 * Referenced by the SOCK_STREAM entry of inetsw_array and registered in
 * inet_init() with proto_register(). inet_create() reaches these
 * callbacks through sk->sk_prot, e.g. ->init and ->hash.
 */
struct proto tcp_prot = {
.name = "TCP",
.owner = THIS_MODULE,
.close = tcp_close,
.pre_connect = tcp_v4_pre_connect,
.connect = tcp_v4_connect,
.disconnect = tcp_disconnect,
.accept = inet_csk_accept,
.ioctl = tcp_ioctl,
// Invoked from inet_create() via sk->sk_prot->init when a socket is created.
.init = tcp_v4_init_sock,
.destroy = tcp_v4_destroy_sock,
.shutdown = tcp_shutdown,
.setsockopt = tcp_setsockopt,
.getsockopt = tcp_getsockopt,
.bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
.keepalive = tcp_set_keepalive,
.recvmsg = tcp_recvmsg,
.sendmsg = tcp_sendmsg,
.sendpage = tcp_sendpage,
// Copied into sk->sk_backlog_rcv by inet_create().
.backlog_rcv = tcp_v4_do_rcv,
.release_cb = tcp_release_cb,
.hash = inet_hash,
.unhash = inet_unhash,
.get_port = inet_csk_get_port,
.put_port = inet_put_port,
#ifdef CONFIG_BPF_SYSCALL
.psock_update_sk_prot = tcp_bpf_update_proto,
#endif
.enter_memory_pressure = tcp_enter_memory_pressure,
.leave_memory_pressure = tcp_leave_memory_pressure,
.stream_memory_free = tcp_stream_memory_free,
.sockets_allocated = &tcp_sockets_allocated,
.orphan_count = &tcp_orphan_count,
.memory_allocated = &tcp_memory_allocated,
.memory_pressure = &tcp_memory_pressure,
.sysctl_mem = sysctl_tcp_mem,
.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp_sock),
.slab_flags = SLAB_TYPESAFE_BY_RCU,
.twsk_prot = &tcp_timewait_sock_ops,
.rsk_prot = &tcp_request_sock_ops,
.h.hashinfo = &tcp_hashinfo,
.no_autobind = true,
.diag_destroy = tcp_abort,
};
这里主要关注init函数,即tcp_v4_init_sock。
/*
 * ->init callback of tcp_prot: set up a newly created IPv4 TCP socket.
 *
 * Runs the common TCP initialisation, then installs the IPv4-specific
 * operation tables. Always returns 0.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *conn_sk;

	conn_sk = inet_csk(sk);

	/* Common TCP state: timers, queues, defaults. */
	tcp_init_sock(sk);

	/* Address-family operations for the connection-oriented layer. */
	conn_sk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}
/*
 * tcp_init_sock - initialise the TCP state of a newly created socket.
 *
 * Called from tcp_v4_init_sock(). Resets the queues, sets up the timers
 * and lists, and fills in the initial RTO, congestion-window and buffer
 * size values.
 */
void tcp_init_sock(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
tp->out_of_order_queue = RB_ROOT;
sk->tcp_rtx_queue = RB_ROOT;
// Set up the TCP timers.
tcp_init_xmit_timers(sk);
// Initialise the list heads.
INIT_LIST_HEAD(&tp->tsq_node);
INIT_LIST_HEAD(&tp->tsorted_sent_queue);
// Initial retransmission-timeout and delayed-ACK values.
icsk->icsk_rto = TCP_TIMEOUT_INIT;
icsk->icsk_rto_min = TCP_RTO_MIN;
icsk->icsk_delack_max = TCP_DELACK_MAX;
tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U);
/* So many TCP implementations out there (incorrectly) count the
* initial SYN frame in their delayed-ACK and congestion control
* algorithms that we must have the following bandaid to talk
* efficiently to them. -DaveM
*/
tcp_snd_cwnd_set(tp, TCP_INIT_CWND);
/* There's a bubble in the pipe until at least the first ACK. */
tp->app_limited = ~0U;
/* See draft-stevens-tcpca-spec-01 for discussion of the
* initialization of these values.
*/
tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
tp->snd_cwnd_clamp = ~0;
tp->mss_cache = TCP_MSS_DEFAULT;
tp->reordering = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering);
tcp_assign_congestion_control(sk);
tp->tsoffset = 0;
tp->rack.reo_wnd_steps = 1;
sk->sk_write_space = sk_stream_write_space;
sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
icsk->icsk_sync_mss = tcp_sync_mss;
// Send/receive buffer sizes come from element [1] of the per-namespace
// tcp_wmem / tcp_rmem sysctls.
// NOTE(review): presumably the "default" slot of a min/default/max
// triple - confirm.
WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1]));
WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1]));
sk_sockets_allocated_inc(sk);
}
在tcp_init_sock函数中,重点是调用了tcp_init_xmit_timers函数来设置与TCP相关的定时器。
void tcp_init_xmit_timers(struct sock *sk)
{
/*
* 注册一些计时器处理函数,如tcp_write_timer函数用于负责TCP重传。
*/
inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
&tcp_keepalive_timer);
hrtimer_init(&tcp_sk(sk)->pacing_timer, CLOCK_MONOTONIC,
HRTIMER_MODE_ABS_PINNED_SOFT);
tcp_sk(sk)->pacing_timer.function = tcp_pace_kick;
hrtimer_init(&tcp_sk(sk)->compressed_ack_timer, CLOCK_MONOTONIC,
HRTIMER_MODE_REL_PINNED_SOFT);
tcp_sk(sk)->compressed_ack_timer.function = tcp_compressed_ack_kick;
}
在这个函数中,设置了与TCP重传、延迟ACK和KeepAlive机制相关的定时器。本文的主线是socket系统调用,不会深入分析这几种定时器的实现原理。
将socket映射为文件
/*
 * Attach a socket to a newly allocated file descriptor.
 *
 * @sock:  socket to expose as a file
 * @flags: flags used both to reserve the descriptor and to create the file
 *
 * Returns the descriptor on success or a negative errno. If no descriptor
 * can be reserved the socket is released here; if the file cannot be
 * created, sock_alloc_file() has already released it.
 */
static int sock_map_fd(struct socket *sock, int flags)
{
	struct file *sock_file;
	int fd;

	/* Reserve a descriptor number that is not in use. */
	fd = get_unused_fd_flags(flags);
	if (unlikely(fd < 0)) {
		sock_release(sock);
		return fd;
	}

	/* Create the struct file that represents the socket. */
	sock_file = sock_alloc_file(sock, flags, NULL);
	if (IS_ERR(sock_file)) {
		put_unused_fd(fd);
		return PTR_ERR(sock_file);
	}

	/* Bind the descriptor to the struct file. */
	fd_install(fd, sock_file);
	return fd;
}
主要操作就三步:
- 分配一个fd,底层会调用file.c文件中的alloc_fd函数来分配文件描述符fd。
- 创建socket对应的struct file对象;
- 建立fd与struct file对象的关系,socket的后续操作只需要传入fd,内核会找到socket对象,以便后续操作。
创建file对象
/*
 * Create the struct file that backs a socket.
 *
 * @sock:  socket to wrap
 * @flags: only O_NONBLOCK is taken from here; the file is opened O_RDWR
 * @dname: name for the pseudo file; when NULL, the name of the socket's
 *         creating proto is used, or "" if the socket has no sk
 *
 * Returns the new file, or an error pointer. On failure the socket is
 * released before returning.
 */
struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname)
{
	struct file *file;
	int open_flags = O_RDWR | (flags & O_NONBLOCK);

	if (!dname) {
		if (sock->sk)
			dname = sock->sk->sk_prot_creator->name;
		else
			dname = "";
	}

	/*
	 * Allocate a pseudo file on the sock_mnt mount. socket_file_ops
	 * bundles the file operations implemented by the socket layer.
	 */
	file = alloc_file_pseudo(SOCK_INODE(sock), sock_mnt, dname, open_flags,
				 &socket_file_ops);
	if (IS_ERR(file)) {
		sock_release(sock);
		return file;
	}

	/*
	 * Link socket and file to each other, so that the socket can be
	 * found easily from the file a descriptor resolves to.
	 */
	sock->file = file;
	file->private_data = sock;

	stream_open(SOCK_INODE(sock), file);
	return file;
}
/*
 * File operations installed on every socket file.
 *
 * sock_alloc_file() passes this table to alloc_file_pseudo(), so file
 * operations issued on a socket descriptor are dispatched to the sock_*
 * handlers listed here.
 */
static const struct file_operations socket_file_ops = {
.owner = THIS_MODULE,
.llseek = no_llseek,
.read_iter = sock_read_iter,
.write_iter = sock_write_iter,
.poll = sock_poll,
.unlocked_ioctl = sock_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = compat_sock_ioctl,
#endif
.mmap = sock_mmap,
.release = sock_close,
.fasync = sock_fasync,
.sendpage = sock_sendpage,
.splice_write = generic_splice_sendpage,
.splice_read = sock_splice_read,
.show_fdinfo = sock_show_fdinfo,
};
在sock_alloc_file函数中,会调用alloc_file_pseudo在socket专用的伪文件系统(sockfs,即代码中的sock_mnt挂载点,而不是proc文件系统)中分配一个伪文件,设置的文件操作函数集是socket_file_ops,以便VFS层调用。/proc/<pid>/fd/目录下看到的只是指向该伪文件的链接。