Linux内核中的socket系统调用

"一切皆文件"是Linux的设计哲学,在网络编程中也不例外。不管是在服务端还是在客户端,都会调用socket系统调用来创建一个socket虚拟文件,该系统调用返回该文件的fd,后续的操作都会基于该fd。本文就来分析一下在socket系统调用的底层做了哪些事情。


系统调用定义

v5.19.17
c
net/socket.c
/*
 * Implementation of the socket system call: forwards the three
 * user-supplied arguments to __sys_socket() and returns its result.
 */
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
	return __sys_socket(family, type, protocol);
}
/*
 * Create a socket and map it to a file descriptor.
 *
 * Returns the value of sock_map_fd() once the socket exists, or the errno
 * carried by the ERR_PTR that __sys_socket_create() returned.
 */
int __sys_socket(int family, int type, int protocol)
{
	struct socket *sock;
	int flags;

	// Create the struct socket object
	sock = __sys_socket_create(family, type, protocol);
	if (IS_ERR(sock))
		return PTR_ERR(sock);

	// Keep only the bits of type that lie outside SOCK_TYPE_MASK (the flags)
	flags = type & ~SOCK_TYPE_MASK;
	// Where the two constants differ, translate SOCK_NONBLOCK into O_NONBLOCK
	if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
		flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;

	// Map the socket to a file; it can then be found under /proc/<pid>/fd/
	return sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
}

整体来说,主要分为了两部分:

  • 创建socket;
  • 将socket映射为文件,并获取fd;

创建socket

v5.19.17
__sys_socket_create
sock_create
__sock_create
<
>
c
net/socket.c
/*
 * Validate the flag bits carried in type, then create the socket.
 *
 * Returns the new struct socket on success, or an ERR_PTR()-encoded errno:
 * -EINVAL when type carries bits outside SOCK_TYPE_MASK other than
 * SOCK_CLOEXEC / SOCK_NONBLOCK, otherwise the negative value returned by
 * sock_create().
 */
static struct socket *__sys_socket_create(int family, int type, int protocol)
{
	struct socket *sock;
	int retval;

	/* Check the SOCK_* constants for consistency.  */
	BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
	BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
	BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
	BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

	// Reject any flag bit that is neither SOCK_CLOEXEC nor SOCK_NONBLOCK
	if ((type & ~SOCK_TYPE_MASK) & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return ERR_PTR(-EINVAL);
	// Strip the flags so that only the socket type itself is passed down
	type &= SOCK_TYPE_MASK;

	// Create the socket; its address is stored through &sock. The other
	// three arguments are the ones passed in from user space.
	retval = sock_create(family, type, protocol, &sock);
	if (retval < 0)
		return ERR_PTR(retval);

	return sock;
}
c
net/socket.c
/*
 * Thin wrapper around __sock_create(): uses the network namespace of the
 * current task (current->nsproxy->net_ns) and passes 0 for kern.
 */
int sock_create(int family, int type, int protocol, struct socket **res)
{
	return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0);
}
c
net/socket.c
/*
 * Allocate a struct socket and let the protocol family selected by family
 * initialise it through its ->create hook.
 *
 * net:      network namespace handed to ->create
 * family:   protocol family index into net_families (must be < NPROTO)
 * type:     socket type (must be < SOCK_MAX)
 * protocol: protocol number, interpreted by the family's ->create
 * res:      on success receives the new socket
 * kern:     forwarded to the security hooks and to ->create;
 *           sock_create() passes 0
 *
 * Returns 0 on success or a negative errno. On every failure after
 * sock_alloc() the socket is released with sock_release().
 */
int __sock_create(struct net *net, int family, int type, int protocol,
			 struct socket **res, int kern)
{
	int err;
	struct socket *sock;
	const struct net_proto_family *pf;

	/*
	 *      Check protocol is in range
	 */
	if (family < 0 || family >= NPROTO)
		return -EAFNOSUPPORT;
	if (type < 0 || type >= SOCK_MAX)
		return -EINVAL;

	/* Compatibility.

	   This uglymoron is moved from INET layer to here to avoid
	   deadlock in module load.
	 */
	if (family == PF_INET && type == SOCK_PACKET) {
		pr_info_once("%s uses obsolete (PF_INET,SOCK_PACKET)\n",
			     current->comm);
		family = PF_PACKET;
	}

	err = security_socket_create(family, type, protocol, kern);
	if (err)
		return err;

	/*
	 *	Allocate the socket and allow the family to set things up. if
	 *	the protocol is 0, the family is instructed to select an appropriate
	 *	default.
	 */
	// Allocate the socket object
	sock = sock_alloc();
	if (!sock) {
		net_warn_ratelimited("socket: no more sockets\n");
		return -ENFILE;	/* Not exactly a match, but its the
				   closest posix thing */
	}

	// Record the socket type requested by user space. Which concrete
	// protocol is used is decided later from the protocol argument.
	sock->type = type;

#ifdef CONFIG_MODULES
	/* Attempt to load a protocol module if the find failed.
	 *
	 * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
	 * requested real, full-featured networking support upon configuration.
	 * Otherwise module support will break!
	 */
	if (rcu_access_pointer(net_families[family]) == NULL)
		request_module("net-pf-%d", family);
#endif

	rcu_read_lock();
	/*
	 * Look up the net_proto_family registered for the requested family
	 * (it holds the create function pointer that is called below).
	 * Families are placed into net_families by sock_register() during
	 * network subsystem initialisation; for example inet_init() in
	 * net/ipv4/af_inet.c registers inet_family_ops this way.
	 */
	pf = rcu_dereference(net_families[family]);
	// Preset the error returned by both out_release jumps below
	err = -EAFNOSUPPORT;
	if (!pf)
		goto out_release;

	/*
	 * We will call the ->create function, that possibly is in a loadable
	 * module, so we have to bump that loadable module refcnt first.
	 */
	if (!try_module_get(pf->owner))
		goto out_release;

	/* Now protected by module ref count */
	rcu_read_unlock();

	// Call the family's create function; for AF_INET this is
	// inet_create() in net/ipv4/af_inet.c.
	err = pf->create(net, sock, protocol, kern);
	if (err < 0)
		goto out_module_put;

	/*
	 * Now to bump the refcnt of the [loadable] module that owns this
	 * socket at sock_release time we decrement its refcnt.
	 */
	if (!try_module_get(sock->ops->owner))
		goto out_module_busy;

	/*
	 * Now that we're done with the ->create function, the [loadable]
	 * module can have its refcnt decremented
	 */
	module_put(pf->owner);
	err = security_socket_post_create(sock, family, type, protocol, kern);
	if (err)
		goto out_sock_release;
	*res = sock;

	return 0;

	// Error unwinding: each label falls through into the next one.
out_module_busy:
	err = -EAFNOSUPPORT;
out_module_put:
	// Clear ops, then drop the reference taken on pf->owner above
	sock->ops = NULL;
	module_put(pf->owner);
out_sock_release:
	sock_release(sock);
	return err;

out_release:
	// Reached with the RCU read lock still held
	rcu_read_unlock();
	goto out_sock_release;
}

真正的关键在于__sock_create函数中,核心就是两步:

  • 查找协议族:根据参数family来查找协议族,在一些man文档中该参数又被称为domain,只是名称不同而已。常用的协议族有AF_INET和AF_INET6,分别针对ipv4和ipv6;还有针对本地通信的AF_UNIX;
  • 创建socket:调用该协议族的create方法来完成socket的创建;

对于协议族,本文只分析最常用的AF_INET。

初始化协议栈

在内核启动的时候,会初始化协议栈,即将协议族注册到net_families数组中。

v5.19.17
c
net/ipv4/af_inet.c
/*
 * ===========
 * Initialise the IPv4 protocol stack.
 *
 * Registers the TCP/UDP/raw/ping struct proto objects, registers the
 * PF_INET family, adds the base protocols, fills the inetsw table from
 * inetsw_array and then initialises the individual sub-modules.
 *
 * Returns 0 on success. If one of the proto_register() calls fails, the
 * protos registered before it are unregistered and that error is returned.
 * ===========
 */
static int __init inet_init(void)
{
	struct inet_protosw *q;
	struct list_head *r;
	int rc;

	sock_skb_cb_check_size(sizeof(struct inet_skb_parm));

	raw_hashinfo_init(&raw_v4_hashinfo);

	// Register the struct proto of each protocol

	rc = proto_register(&tcp_prot, 1);
	if (rc)
		goto out;

	rc = proto_register(&udp_prot, 1);
	if (rc)
		goto out_unregister_tcp_proto;

	rc = proto_register(&raw_prot, 1);
	if (rc)
		goto out_unregister_udp_proto;

	rc = proto_register(&ping_prot, 1);
	if (rc)
		goto out_unregister_raw_proto;

	/*
	 *	Tell SOCKET that we are alive...
	 */

	// Register the IPv4 protocol family; the return value is deliberately ignored
	(void)sock_register(&inet_family_ops);

#ifdef CONFIG_SYSCTL
	ip_static_sysctl_init();
#endif

	/*
	 *	Add all the base protocols.
	 */

	// Add all base network protocols; a failure is only logged

	if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
		pr_crit("%s: Cannot add ICMP protocol\n", __func__);
	if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
		pr_crit("%s: Cannot add UDP protocol\n", __func__);
	if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
		pr_crit("%s: Cannot add TCP protocol\n", __func__);
#ifdef CONFIG_IP_MULTICAST
	if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
		pr_crit("%s: Cannot add IGMP protocol\n", __func__);
#endif

	/* Register the socket-side information for inet_create. */
	// Initialise every element of inetsw, i.e. each per-type list head
	for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
		INIT_LIST_HEAD(r);

	// Add the predefined inet_protosw entries of inetsw_array to inetsw
	for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
		inet_register_protosw(q);

	/*
	 *	Set the ARP module up
	 */

	// Initialise the ARP module
	arp_init();

	/*
	 *	Set the IP module up
	 */

	// Initialise the IP module
	ip_init();

	/* Initialise per-cpu ipv4 mibs */
	if (init_ipv4_mibs())
		panic("%s: Cannot init ipv4 mibs\n", __func__);

	/* Setup TCP slab cache for open requests. */
	tcp_init();

	/* Setup UDP memory threshold */
	udp_init();

	/* Add UDP-Lite (RFC 3828) */
	udplite4_register();

	raw_init();

	ping_init();

	/*
	 *	Set the ICMP layer up
	 */

	if (icmp_init() < 0)
		panic("Failed to create the ICMP control socket.\n");

	/*
	 *	Initialise the multicast router
	 */
#if defined(CONFIG_IP_MROUTE)
	if (ip_mr_init())
		pr_crit("%s: Cannot init ipv4 mroute\n", __func__);
#endif

	if (init_inet_pernet_ops())
		pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__);

	ipv4_proc_init();

	ipfrag_init();

	// Register the IP packet type
	dev_add_pack(&ip_packet_type);

	ip_tunnel_core_init();

	rc = 0;
out:
	return rc;
	// Unwind in reverse registration order; each label falls through
out_unregister_raw_proto:
	proto_unregister(&raw_prot);
out_unregister_udp_proto:
	proto_unregister(&udp_prot);
out_unregister_tcp_proto:
	proto_unregister(&tcp_prot);
	goto out;
}

初始化协议族

inet_init函数中有多项重要的初始化操作,其中就包括调用sock_register函数来注册协议族。

v5.19.17
sock_register
inet_family_ops
<
>
c
net/socket.c
/*
 * Register a protocol family in the net_families array.
 *
 * Returns 0 on success, -ENOBUFS when ops->family is not below NPROTO, or
 * -EEXIST when the slot for that family is already occupied.
 */
int sock_register(const struct net_proto_family *ops)
{
	int err;

	if (ops->family >= NPROTO) {
		pr_crit("protocol %d >= NPROTO(%d)\n", ops->family, NPROTO);
		return -ENOBUFS;
	}

	// net_family_lock serialises writers of net_families
	spin_lock(&net_family_lock);
	if (rcu_dereference_protected(net_families[ops->family],
				      lockdep_is_held(&net_family_lock)))
		err = -EEXIST;
	else {
		// Store the family in its slot of the array
		rcu_assign_pointer(net_families[ops->family], ops);
		err = 0;
	}
	spin_unlock(&net_family_lock);

	// NOTE: this message is printed whatever err is, including -EEXIST
	pr_info("NET: Registered %s protocol family\n", pf_family_names[ops->family]);
	return err;
}
c
net/ipv4/af_inet.c
/*
 * inet_init() registers this variable in the kernel's net_families array
 * through sock_register().
 * When an application issues the socket system call, __sock_create() looks
 * up the matching protocol family and calls its create function, which is
 * set to inet_create here.
 */
static const struct net_proto_family inet_family_ops = {
	.family = PF_INET, // the PF_INET macro has the value 2
	.create = inet_create,
	.owner	= THIS_MODULE,
};

注册时,传递了ipv4协议族操作函数集,其中就包括创建socket所需的inet_create函数。

inet_create函数

v5.19.17
c
net/ipv4/af_inet.c
/*
 * Create the AF_INET part of a socket: find the inet_protosw entry that
 * matches (sock->type, protocol), allocate and initialise the struct sock,
 * and run the transport protocol's init hook.
 *
 * net:      network namespace the sock is allocated in
 * sock:     socket being set up; sock->type has been filled in by the caller
 * protocol: requested protocol; 0 (IPPROTO_IP) selects the first entry of
 *           the list for this type
 * kern:     non-zero skips the CAP_NET_RAW check and the cgroup BPF hook
 *
 * Returns 0 on success or a negative errno.
 */
static int inet_create(struct net *net, struct socket *sock, int protocol,
		       int kern)
{
	/*
	 * How do sock and socket differ?
	 * struct socket holds a struct sock member and struct sock in turn
	 * refers back to a struct socket.
	 * struct sock mainly encapsulates the network protocol.
	 */
	struct sock *sk;
	struct inet_protosw *answer;
	struct inet_sock *inet;
	// Transport-layer protocol taken from the matched inet_protosw entry
	struct proto *answer_prot;
	unsigned char answer_flags;
	int try_loading_module = 0;
	int err;

	if (protocol < 0 || protocol >= IPPROTO_MAX)
		return -EINVAL;

	// The initial state is UNCONNECTED
	sock->state = SS_UNCONNECTED;

	/* Look for the requested type/protocol pair. */
lookup_protocol:
	// Stays -ESOCKTNOSUPPORT when the list for this type is empty
	err = -ESOCKTNOSUPPORT;
	rcu_read_lock();
	/*
	 * Search for the target protocol; "list" names the list field of
	 * struct inet_protosw. The list for the requested type is taken from
	 * inetsw and walked looking for protocol.
	 */
	list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {

		err = 0;
		/* Check the non-wild match. */
		if (protocol == answer->protocol) {
			if (protocol != IPPROTO_IP)
				break;
		} else {
			/* Check for the two wild cases. */
			/*
			 * When user space passes protocol 0 (IPPROTO_IP is 0),
			 * the default protocol of the type is used, i.e. the
			 * first node of that type's list. SOCK_STREAM, for
			 * instance, has only TCP in inetsw_array, so either
			 * IPPROTO_IP (0) or IPPROTO_TCP (6) may be passed.
			 * Passing IPPROTO_TCP takes the if branch above.
			 */
			if (IPPROTO_IP == protocol) {
				// The caller may have passed 0; replace it with the
				// number of the default protocol, e.g. 6 for TCP.
				protocol = answer->protocol;
				break;
			}
			// In inetsw_array only the SOCK_RAW entry uses IPPROTO_IP.
			if (IPPROTO_IP == answer->protocol)
				break;
		}
		// No match on this entry; this is the result if the loop runs out
		err = -EPROTONOSUPPORT;
	}

	// Protocol not found: try loading a module, at most twice
	if (unlikely(err)) {
		if (try_loading_module < 2) {
			rcu_read_unlock();
			/*
			 * Be more specific, e.g. net-pf-2-proto-132-type-1
			 * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
			 */
			if (++try_loading_module == 1)
				request_module("net-pf-%d-proto-%d-type-%d",
					       PF_INET, protocol, sock->type);
			/*
			 * Fall back to generic, e.g. net-pf-2-proto-132
			 * (net-pf-PF_INET-proto-IPPROTO_SCTP)
			 */
			else
				request_module("net-pf-%d-proto-%d",
					       PF_INET, protocol);
			goto lookup_protocol;
		} else
			goto out_rcu_unlock;
	}

	// Raw sockets requested from user space require CAP_NET_RAW
	err = -EPERM;
	if (sock->type == SOCK_RAW && !kern &&
	    !ns_capable(net->user_ns, CAP_NET_RAW))
		goto out_rcu_unlock;

	/*
	 * ops, prot and flags are fields of struct inet_protosw; their
	 * values are the ones assigned in the inetsw_array initialiser.
	 */

	// Attach the protocol's ops to the socket; for TCP this is inet_stream_ops.
	sock->ops = answer->ops;
	// Fetch the transport-layer protocol; for TCP this is tcp_prot
	answer_prot = answer->prot;
	answer_flags = answer->flags;
	rcu_read_unlock();

	WARN_ON(!answer_prot->slab);

	err = -ENOMEM;
	// Allocate the struct sock (sk), passing in the transport-layer
	// protocol object answer_prot.
	sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern);
	if (!sk)
		goto out;

	err = 0;
	if (INET_PROTOSW_REUSE & answer_flags)
		sk->sk_reuse = SK_CAN_REUSE;

	/*
	 * Why can a struct sock be treated as a struct inet_sock here?
	 * NOTE(review): presumably sk_alloc() sizes the object from the
	 * proto's obj_size (tcp_prot uses sizeof(struct tcp_sock)) and
	 * struct inet_sock starts with the struct sock - confirm against
	 * sk_alloc() and the struct definitions.
	 */
	inet = inet_sk(sk);
	inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;

	inet->nodefrag = 0;

	if (SOCK_RAW == sock->type) {
		inet->inet_num = protocol;
		if (IPPROTO_RAW == protocol)
			inet->hdrincl = 1;
	}

	if (READ_ONCE(net->ipv4.sysctl_ip_no_pmtu_disc))
		inet->pmtudisc = IP_PMTUDISC_DONT;
	else
		inet->pmtudisc = IP_PMTUDISC_WANT;

	inet->inet_id = 0;

	// Initialise the sk object
	sock_init_data(sock, sk);

	sk->sk_destruct	   = inet_sock_destruct;
	sk->sk_protocol	   = protocol;
	sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;

	inet->uc_ttl	= -1;
	inet->mc_loop	= 1;
	inet->mc_ttl	= 1;
	inet->mc_all	= 1;
	inet->mc_index	= 0;
	inet->mc_list	= NULL;
	inet->rcv_tos	= 0;

	sk_refcnt_debug_inc(sk);

	if (inet->inet_num) {
		/* It assumes that any protocol which allows
		 * the user to assign a number at socket
		 * creation time automatically
		 * shares.
		 */
		inet->inet_sport = htons(inet->inet_num);
		/* Add to protocol hash chains. */
		err = sk->sk_prot->hash(sk);
		if (err) {
			sk_common_release(sk);
			goto out;
		}
	}

	if (sk->sk_prot->init) {
		/*
		 * Call the transport protocol's init function. For TCP this
		 * is tcp_v4_init_sock, which sets up the timers for
		 * retransmission and keepalive.
		 */
		err = sk->sk_prot->init(sk);
		if (err) {
			sk_common_release(sk);
			goto out;
		}
	}

	if (!kern) {
		err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk);
		if (err) {
			sk_common_release(sk);
			goto out;
		}
	}
out:
	return err;
out_rcu_unlock:
	// Used by the error paths that still hold the RCU read lock
	rcu_read_unlock();
	goto out;
}

该函数较长,主要的操作可以总结为如下:

  • 查找协议;
  • 创建sk对象;
  • 初始化sk对象;
  • 调用协议的初始化函数;

查找协议

会根据socket系统调用的第二个参数type在inetsw数组中找到协议类型链表,然后遍历该链表中的协议并匹配第三个参数protocol。在上面的inet_init函数中,会调用inet_register_protosw函数将inetsw_array数组中的元素添加到inetsw这个链表数组中。

v5.19.17
inetsw_array
inet_register_protosw
<
>
c
net/ipv4/af_inet.c
/*
 * inet_init() walks every element of this array and hands it to
 * inet_register_protosw(), which adds the inet_protosw to the list for
 * its type in the inetsw array.
 *
 * A type may map to several protocols: SOCK_STREAM and SOCK_RAW have one
 * entry each here, while SOCK_DGRAM has two (UDP and ICMP). Another
 * SOCK_STREAM protocol besides TCP could be registered in the same way.
 *
 */
static struct inet_protosw inetsw_array[] =
{
	{
		.type =       SOCK_STREAM,
		.protocol =   IPPROTO_TCP,
		.prot =       &tcp_prot,
		.ops =        &inet_stream_ops,
		.flags =      INET_PROTOSW_PERMANENT |
			      INET_PROTOSW_ICSK,
	},

	{
		.type =       SOCK_DGRAM,
		.protocol =   IPPROTO_UDP,
		.prot =       &udp_prot,
		.ops =        &inet_dgram_ops,
		.flags =      INET_PROTOSW_PERMANENT,
       },

       {
		.type =       SOCK_DGRAM,
		.protocol =   IPPROTO_ICMP,
		.prot =       &ping_prot,
		.ops =        &inet_sockraw_ops,
		.flags =      INET_PROTOSW_REUSE,
       },

       {
	       .type =       SOCK_RAW,
	       .protocol =   IPPROTO_IP,	/* wild card */
	       .prot =       &raw_prot,
	       .ops =        &inet_sockraw_ops,
	       .flags =      INET_PROTOSW_REUSE,
       }
};
c
net/ipv4/af_inet.c
/*
 * Add an inet_protosw entry to the inetsw list for its socket type.
 *
 * The entry is inserted after the last permanent entry of that list. An
 * invalid type (>= SOCK_MAX) or an attempt to register a protocol that is
 * already present as a permanent entry is logged and otherwise ignored.
 */
void inet_register_protosw(struct inet_protosw *p)
{
	struct list_head *lh;
	struct inet_protosw *answer;
	int protocol = p->protocol;
	struct list_head *last_perm;

	spin_lock_bh(&inetsw_lock);

	if (p->type >= SOCK_MAX)
		goto out_illegal;

	/* If we are trying to override a permanent protocol, bail. */
	// Start from the head of the list for this socket type
	last_perm = &inetsw[p->type];
	// Walk the circular list: lh starts at the node after the head and
	// the walk ends when lh is the head again.
	list_for_each(lh, &inetsw[p->type]) {
		answer = list_entry(lh, struct inet_protosw, list);
		/* Check only the non-wild match. */
		if ((INET_PROTOSW_PERMANENT & answer->flags) == 0)
			break;
		// A permanent entry for this protocol already exists: give up
		if (protocol == answer->protocol)
			goto out_permanent;
		last_perm = lh;
	}

	/* Add the new entry after the last permanent entry if any, so that
	 * the new entry does not override a permanent entry when matched with
	 * a wild-card protocol. But it is allowed to override any existing
	 * non-permanent entry.  This means that when we remove this entry, the
	 * system automatically returns to the old behavior.
	 */
	// Insert the protocol into the list
	list_add_rcu(&p->list, last_perm);
out:
	spin_unlock_bh(&inetsw_lock);

	return;

out_permanent:
	pr_err("Attempt to override permanent protocol %d\n", protocol);
	goto out;

out_illegal:
	pr_err("Ignoring attempt to register invalid socket type %d\n",
	       p->type);
	goto out;
}

对于TCP而言(type是SOCK_STREAM),在inetsw_array中就只定义了一种协议,所以最后链表中就只有一个节点,所以上面socket系统调用的第三个参数protocol可以传入0,表示使用默认的协议。对于SOCK_DGRAM类型而言,有两种协议,UDP和ICMP,所以链表中会有两个节点。

由于协议众多,本文只分析最常用的TCP。

回到inet_create函数中,对于TCP而言,answer变量就指向inetsw_array数组中的第一个元素。然后会把协议的操作函数集ops赋值给socket变量,以便实现后续对底层协议层的调用。

TCP初始化

对于TCP而言,inet_create函数中通过sk_alloc传入的answer_prot(即sk->sk_prot字段)是tcp_prot。

v5.19.17
c
net/ipv4/tcp_ipv4.c
/*
 * Transport-layer operations table for TCP over IPv4. It is the .prot of
 * the SOCK_STREAM entry in inetsw_array and is registered by inet_init()
 * with proto_register().
 */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.pre_connect		= tcp_v4_pre_connect,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	// Called by inet_create() through sk->sk_prot->init
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.bpf_bypass_getsockopt	= tcp_bpf_bypass_getsockopt,
	.keepalive		= tcp_set_keepalive,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	// Copied into sk->sk_backlog_rcv by inet_create()
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.put_port		= inet_put_port,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= tcp_bpf_update_proto,
#endif
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.leave_memory_pressure	= tcp_leave_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_TYPESAFE_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
	.diag_destroy		= tcp_abort,
};

这里主要关注init函数,即tcp_v4_init_sock。

v5.19.17
tcp_v4_init_sock
tcp_init_sock
<
>
c
net/ipv4/tcp_ipv4.c
/*
 * init hook of tcp_prot: runs the generic TCP socket initialisation and
 * then installs the IPv4-specific operation tables. Always returns 0.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	// Initialise the sock: timer setup and other defaults
	tcp_init_sock(sk);

	// Install the IPv4 operations of the connection-oriented socket
	icsk->icsk_af_ops = &ipv4_specific;

#ifdef CONFIG_TCP_MD5SIG
	tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
#endif

	return 0;
}
c
net/ipv4/tcp.c
/*
 * Set a new TCP socket to its initial state: empty queues, timers,
 * default RTO / congestion-window values and the send and receive buffer
 * sizes taken from the per-namespace sysctls.
 */
void tcp_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	// Both red-black trees start out empty
	tp->out_of_order_queue = RB_ROOT;
	sk->tcp_rtx_queue = RB_ROOT;
	// Set up the TCP timers
	tcp_init_xmit_timers(sk);
	// Initialise the list heads
	INIT_LIST_HEAD(&tp->tsq_node);
	INIT_LIST_HEAD(&tp->tsorted_sent_queue);

	// Default timing attributes
	icsk->icsk_rto = TCP_TIMEOUT_INIT;
	icsk->icsk_rto_min = TCP_RTO_MIN;
	icsk->icsk_delack_max = TCP_DELACK_MAX;
	tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
	minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U);

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them.  -DaveM
	 */
	tcp_snd_cwnd_set(tp, TCP_INIT_CWND);

	/* There's a bubble in the pipe until at least the first ACK. */
	tp->app_limited = ~0U;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
	tp->snd_cwnd_clamp = ~0;
	tp->mss_cache = TCP_MSS_DEFAULT;

	tp->reordering = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering);
	tcp_assign_congestion_control(sk);

	tp->tsoffset = 0;
	tp->rack.reo_wnd_steps = 1;

	sk->sk_write_space = sk_stream_write_space;
	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

	icsk->icsk_sync_mss = tcp_sync_mss;

	// Index 1 of the sysctl_tcp_wmem / sysctl_tcp_rmem arrays supplies the
	// initial buffer sizes
	WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1]));
	WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1]));

	sk_sockets_allocated_inc(sk);
}

tcp_init_sock函数中,重点是调用了tcp_init_xmit_timers函数来设置与TCP相关的定时器。

v5.19.17
c
net/ipv4/tcp_timer.c
/*
 * Install the timer callbacks of a TCP socket: the three handlers given to
 * inet_csk_init_xmit_timers() plus the two hrtimers pacing_timer and
 * compressed_ack_timer.
 */
void tcp_init_xmit_timers(struct sock *sk)
{
	/*
	 * Register the timer handler functions; tcp_write_timer, for
	 * example, is the one responsible for TCP retransmission.
	 */
	inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
				  &tcp_keepalive_timer);
	hrtimer_init(&tcp_sk(sk)->pacing_timer, CLOCK_MONOTONIC,
		     HRTIMER_MODE_ABS_PINNED_SOFT);
	tcp_sk(sk)->pacing_timer.function = tcp_pace_kick;

	hrtimer_init(&tcp_sk(sk)->compressed_ack_timer, CLOCK_MONOTONIC,
		     HRTIMER_MODE_REL_PINNED_SOFT);
	tcp_sk(sk)->compressed_ack_timer.function = tcp_compressed_ack_kick;
}

在这个函数中,设置了与TCP重传、延迟ACK和KeepAlive机制相关的定时器。本文的主线是socket系统调用,不会深入分析这几种定时器的实现原理。

将socket映射为文件

v5.19.17
c
net/socket.c
/*
 * Bind a socket to a newly allocated file descriptor.
 *
 * Returns the fd on success. If no fd is available the socket is released
 * here and the negative value from get_unused_fd_flags() is returned. If
 * creating the file fails, the fd is given back and the error from
 * sock_alloc_file() is returned.
 */
static int sock_map_fd(struct socket *sock, int flags)
{
	struct file *newfile;
	// Obtain an unused fd
	int fd = get_unused_fd_flags(flags);
	if (unlikely(fd < 0)) {
		sock_release(sock);
		return fd;
	}

	// Create the file object that belongs to the socket
	newfile = sock_alloc_file(sock, flags, NULL);
	if (!IS_ERR(newfile)) {
		// Associate the fd with the struct file
		fd_install(fd, newfile);
		return fd;
	}

	// No sock_release() here: sock_alloc_file() releases the socket
	// itself on failure
	put_unused_fd(fd);
	return PTR_ERR(newfile);
}

主要操作就三步:

  • 分配一个fd,底层会调用file.c文件中的alloc_fd函数来分配文件描述符fd。
  • 创建socket对应的struct file对象;
  • 建立fd与struct file对象的关系,socket的后续操作只需要传入fd,内核会找到socket对象,以便后续操作。

创建file对象

v5.19.17
sock_alloc_file
socket_file_ops
<
>
c
net/socket.c
/*
 * Create the struct file that represents a socket.
 *
 * sock:  socket to wrap; it is released here if the allocation fails
 * flags: only O_NONBLOCK is passed on to the file, combined with O_RDWR
 * dname: name for the file; when NULL, the sk_prot_creator name of
 *        sock->sk is used, or "" if the socket has no sk
 *
 * Returns the new file, or the error pointer from alloc_file_pseudo().
 */
struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname)
{
	struct file *file;

	if (!dname)
		dname = sock->sk ? sock->sk->sk_prot_creator->name : "";

	// Allocate a pseudo file on the sock_mnt mount
	file = alloc_file_pseudo(SOCK_INODE(sock), sock_mnt, dname,
				O_RDWR | (flags & O_NONBLOCK),
				// socket_file_ops is passed here; it bundles the file operations of a socket.
				&socket_file_ops);
	if (IS_ERR(file)) {
		sock_release(sock);
		return file;
	}

	/*
	 * Link struct socket and struct file to each other. With this in
	 * place the socket is easy to find when user space passes in the fd.
	 */
	sock->file = file;
	file->private_data = sock;
	stream_open(SOCK_INODE(sock), file);
	return file;
}
c
net/socket.c
/*
 * File operations installed on every socket file by sock_alloc_file().
 * Seeking is wired to no_llseek; the remaining entries point at the
 * sock_* handlers.
 */
static const struct file_operations socket_file_ops = {
	.owner =	THIS_MODULE,
	.llseek =	no_llseek,
	.read_iter =	sock_read_iter,
	.write_iter =	sock_write_iter,
	.poll =		sock_poll,
	.unlocked_ioctl = sock_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = compat_sock_ioctl,
#endif
	.mmap =		sock_mmap,
	.release =	sock_close,
	.fasync =	sock_fasync,
	.sendpage =	sock_sendpage,
	.splice_write = generic_splice_sendpage,
	.splice_read =	sock_splice_read,
	.show_fdinfo =	sock_show_fdinfo,
};

sock_alloc_file函数中,会调用alloc_file_pseudo在socket专用的伪文件系统(sockfs,即sock_mnt对应的挂载)中分配一个伪文件,设置的文件操作函数集是socket_file_ops,以便VFS层调用。