深入理解ping原理及源码实现

深入理解ping原理及源码实现

ping原理

发起端发送ICMP探测报文,发起端和中间设备根据路由表进行转发,中间设备将报文TTL减1,并更新下一跳二层头信息转发,目的节点收到发给自己的ping包之后,检查报文是否合法,完成检查后给发起端发送ICMP响应报文。

A ->R1 -> R2 -> R3 -> B (ICMP echo request)

A <- R1 <- R2 <- R3 <-B (ICMP echo reply)

ICMP的报文格式定义在RFC 792中,它的IP Proto字段是1,从这里也可以看出ICMP是IP网络的基础协议。具体echo request和echo reply的报文格式请参考具体的RFC协议。

一般来说,上面两个方向的报文走的路径是一样的,所以ping报文可以测量发起端到接收端的往返时延(RTT),同时会计算出最小、最大和平均时延,有些工具还会计算出方差。对于ping包发送个数,Windows上默认是4个,Linux上默认一直发送,除非用ctrl+c停止,但是两者都提供了配置发送个数的选项。默认发送的ICMP报文长度不会很大,Windows上是94字节,Linux上是118字节(上述是IPv6报文ICMPv6对应的长度);净荷字段的填充因为没有限制,更是五花八门,这里不再一一列出。因为ping包如果超过接口的MTU会默认分片,所以ping也提供了不分片标志的设置,以便探测网络对大包的支持情况。TTL配置则会限制报文最多经过的中间设备数量,如果不设置,Windows是128,Linux是64,一般场景足够了。每个ICMP回复等待的超时时间默认是1秒,如果有特殊需要,可以设置ICMP探测的超时时间。因为ping包发送时默认使用出接口的地址作为源地址,如果有需要也可以配置发送报文的源地址。

上面说的都是Windows和Linux都支持的功能,除此之外每个工具还有自己特定的一些配置,另外IPv6和IPv4在使用上还有一些差异。

在实现上,发起端的ping工具一般实现在用户态,而接收端的echo request处理以及echo reply响应的发送都是在内核态自动完成的,不需要用户态干预。在Windows比较新的版本上,比如Windows 7及之后的版本,默认不开启echo request报文上送,导致不会自动响应ping请求,需要手动开启一下。

ping源码实现

Talk is cheap, show me the code. 下面我们来分析ping的源码实现,因为Windows没有开源ping的实现,我们现在以Linux下的ping工具为对象分析具体源码实现。ping工具在iputils中实现,Ubuntu使用的源码在这里

ping工具的实现在目录ping中,源文件如下所示:

path: root/ping
Mode	Name	Size	
-rw-r--r--	meson.build	518	log plain
-rw-r--r--	node_info.c	12858	log plain
-rw-r--r--	ping.c	        50280	log plain
-rw-r--r--	ping.h	        12646	log plain
-rw-r--r--	ping6_common.c	26860	log plain
-rw-r--r--	ping_common.c	28183	log plain

主要的代码实现在ping.c, ping_common.c和ping6_common.c中。

ping.c实现了ping工具中对于ipv4和ipv6的通用处理入口,包括创建socket的create_socket, 主入口函数main。关于ipv4的主入口ping4_run,收发包处理ping4_receive_error_msg, ping4_parse_reply, ping4_send_probe, ping4_install_filter也实现在ping.c中。

ping_common.c实现了ping工具用法usage, 权限操作接口limit_capabilities, modify_capability, drop_capabilities, ping包发送入口pinger, 配置入口setup, 主循环main_loop, 统计信息更新gather_statistics, 以及最后结束处理的finish和status。

ping6_common.c实现了ipv6的一些相关接口,包括主入口ping6_run, 以及针对ipv6的收发包处理ping6_receive_error_msg, ping6_parse_reply, ping6_send_probe和ping6_install_filter。

整体流程图如下:

上图图片是用Mermaid Live Editor生成,地址如下:https://mermaid.live/

源码分析

main (ping.c)

/*
 * main -- common entry point for IPv4 and IPv6 ping.
 *
 * Lines consisting of "..." mark code that this excerpt elides.
 * Visible flow: choose the default address family, create the ICMP
 * socket(s), resolve the target, then hand each matching address to
 * ping4_run() or ping6_run() until one of them returns a value >= 0,
 * which becomes the process exit code.
 */
int
main(int argc, char **argv)
{
...
	struct addrinfo hints = {
		.ai_family = AF_UNSPEC,		/* default address family */
		.ai_protocol = IPPROTO_UDP,
		.ai_socktype = SOCK_DGRAM,	/* datagram socket by default */
		.ai_flags = getaddrinfo_flags
	};
...
	static struct ping_rts rts = {
		.interval = 1000,		/* default interval: 1000 ms */
		.preload = 1,
		.lingertime = MAXWAIT * 1000,
		.confirm_flag = MSG_CONFIRM,
		.tmin = LONG_MAX,
		.pipesize = -1,
		.datalen = DEFDATALEN,
		.ident = -1,
		.screen_width = INT_MAX,
#ifdef HAVE_LIBCAP
		.cap_raw = CAP_NET_RAW,
		.cap_admin = CAP_NET_ADMIN,
#endif
		.pmtudisc = -1,
		.source.sin_family = AF_INET,
		.source6.sin6_family = AF_INET6,
		.ni.query = -1,
		.ni.subject_type = -1,
	};
...
	/* Support being called using `ping4` or `ping6` symlinks: the last
	 * character of the program name selects the default address family. */
	if (argv[0][strlen(argv[0]) - 1] == '4')
		hints.ai_family = AF_INET;
	else if (argv[0][strlen(argv[0]) - 1] == '6')
		hints.ai_family = AF_INET6;
...
	/* Skip the options already parsed; what remains are hops and the target. */
	argc -= optind;
	argv += optind;

	/* A destination is mandatory. With argc == 1 it is the target alone;
	 * with argc > 1 every argument but the last one is a hop. */
	if (!argc)
		error(2, EDESTADDRREQ, "usage error");

	/* The target is always the last remaining argument. */
	target = argv[argc - 1];

	/* Create sockets */
	enable_capability_raw();	/* raise the capability needed to open the sockets */

	if (hints.ai_family != AF_INET6) {
		/* IPv4 socket; it is requisite only when IPv4 was explicitly requested. */
		create_socket(&rts, &sock4, AF_INET, hints.ai_socktype, IPPROTO_ICMP,
			      hints.ai_family == AF_INET);
	}

	if (hints.ai_family != AF_INET) {
		/* IPv6 socket; it is requisite when no IPv4 socket is available. */
		create_socket(&rts, &sock6, AF_INET6, hints.ai_socktype, IPPROTO_ICMPV6, sock4.fd == -1);

		/* This may not be needed if both protocol versions always had the same value, but
		 * since I don't know that, it's better to be safe than sorry. */
		rts.pmtudisc = rts.pmtudisc == IP_PMTUDISC_DO	? IPV6_PMTUDISC_DO   :
			       rts.pmtudisc == IP_PMTUDISC_DONT ? IPV6_PMTUDISC_DONT :
			       rts.pmtudisc == IP_PMTUDISC_WANT ? IPV6_PMTUDISC_WANT :
			       rts.pmtudisc == IP_PMTUDISC_PROBE? IPV6_PMTUDISC_PROBE: rts.pmtudisc;
	}

	disable_capability_raw();	/* drop the socket-creation capability again */

	/* Limit address family on single-protocol systems: when the family is
	 * still AF_UNSPEC and only one of the two sockets could be created,
	 * settle on the family of the socket that exists. */
	if (hints.ai_family == AF_UNSPEC) {
		if (sock4.fd == -1)
			hints.ai_family = AF_INET6;
		else if (sock6.fd == -1)
			hints.ai_family = AF_INET;
	}

	/* Work out the maximum ICMP data length for the family in use. */
	int max_s = MAX(ICMP_MAX_DATALEN, ICMPV6_MAX_DATALEN);

	/* Detect based on -4 / -6 */
	if (hints.ai_family == AF_INET)
		max_s = ICMP_MAX_DATALEN - get_ipv4_optlen(&rts);
	else if (hints.ai_family == AF_INET6)
		max_s = ICMPV6_MAX_DATALEN;

	/* Force limit on IPv4/IPv6 adresses */
	if (inet_pton(AF_INET, target, buf))
		max_s = ICMP_MAX_DATALEN - get_ipv4_optlen(&rts);
	else if (inet_pton(AF_INET6, target, buf))
		max_s = ICMPV6_MAX_DATALEN;

	/* Set socket options: IPv4 TOS and IPv6 traffic class, when configured. */
	if (rts.settos)
		set_socket_option(&sock4, IPPROTO_IP, IP_TOS, &rts.settos, sizeof(rts.settos));
	if (rts.tclass)
		set_socket_option(&sock6, IPPROTO_IPV6, IPV6_TCLASS, &rts.tclass, sizeof(rts.tclass));

	/* getaddrinfo fails to indicate a scopeid when not used in dual-stack mode.
	 * Work around by always using dual-stack name resolution.
	 *
	 * https://github.com/iputils/iputils/issues/252
	 */
	int target_ai_family = hints.ai_family;
	hints.ai_family = AF_UNSPEC;

	/* An IPv6 link-local address on a datagram socket may need an
	 * interface name or scope-id; warn when none was given. */
	if (!strchr(target, '%') && sock6.socktype == SOCK_DGRAM &&
		inet_pton(AF_INET6, target, buf) > 0 &&
		(IN6_IS_ADDR_LINKLOCAL(buf) || IN6_IS_ADDR_MC_LINKLOCAL(buf))) {
			error(0, 0, _(
				"Warning: IPv6 link-local address on ICMP datagram socket may require ifname or scope-id"
				" => use: address%%<ifname|scope-id>"));
	}

	/* Resolve the target (host name or literal address) with getaddrinfo(). */
	ret_val = getaddrinfo(target, NULL, &hints, &result);
	if (ret_val)
		error(2, 0, "%s: %s", target, gai_strerror(ret_val));

	/* Walk the resolved addresses. Only the first one whose run function
	 * returns >= 0 is pinged; a later entry is tried only after a
	 * negative return. */
	for (ai = result; ai; ai = ai->ai_next) {
		if (rts.opt_verbose)
			printf("ai->ai_family: %s, ai->ai_canonname: '%s'\n",
				   str_family(ai->ai_family),
				   ai->ai_canonname ? ai->ai_canonname : "");

		/* Skip results of the wrong family; if this was the last
		 * result, nothing usable was found, so report an error. */
		if (target_ai_family != AF_UNSPEC &&
			target_ai_family != ai->ai_family) {
			if (!ai->ai_next) {
				/* An address was found, but not of the family we really want.
				 * Throw the appropriate gai error.
				 */
				error(2, 0, "%s: %s", target, gai_strerror(EAI_ADDRFAMILY));
			}
			continue;
		}

		/* Dispatch to the per-family implementation. */
		switch (ai->ai_family) {
		case AF_INET:
			ret_val = ping4_run(&rts, argc, argv, ai, &sock4);
			break;
		case AF_INET6:
			ret_val = ping6_run(&rts, argc, argv, ai, &sock6);
			break;
		default:
			error(2, 0, _("unknown protocol family: %d"), ai->ai_family);
		}

		/* ret_val >= 0 is the exit code: stop here. */
		if (ret_val >= 0)
			break;
		/* ret_val < 0 means to go on to next addrinfo result, there
		 * better be one. */
		assert(ai->ai_next);
	}

	freeaddrinfo(result);

	/* The exit status comes from ping4_run() or ping6_run(). */
	return ret_val;
}

create_socket (ping.c)

/*
 * create_socket -- open the ICMP socket for one address family.
 *
 * rts:       runtime state; only opt_verbose is read here.
 * sock:      output; sock->fd must be -1 on entry. On return fd holds the
 *            descriptor (or -1) and socktype the type that was last tried.
 * family:    address family passed to socket().
 * socktype:  SOCK_DGRAM (ping socket, with raw fallback) or SOCK_RAW.
 * protocol:  protocol passed to socket().
 * requisite: non-zero if failure must terminate the program (exit code 2);
 *            otherwise the failure is reported only in verbose mode.
 */
static void create_socket(struct ping_rts *rts, socket_st *sock, int family,
			  int socktype, int protocol, int requisite)
{
	int do_fallback = 0;

	errno = 0;

	assert(sock->fd == -1);
	assert(socktype == SOCK_DGRAM || socktype == SOCK_RAW);

	/* Attempt to create a ping socket if requested. Attempt to create a raw
	 * socket otherwise or as a fallback. Well known errno values follow.
	 *
	 * 1) EACCES
	 *
	 * Kernel returns EACCES for all ping socket creation attempts when the
	 * user isn't allowed to use ping socket. A range of group ids is
	 * configured using the `net.ipv4.ping_group_range` sysctl. Fallback
	 * to raw socket is necessary.
	 *
	 * Kernel returns EACCES for all raw socket creation attempts when the
	 * process doesn't have the `CAP_NET_RAW` capability.
	 *
	 * 2) EAFNOSUPPORT
	 *
	 * Kernel returns EAFNOSUPPORT for IPv6 ping or raw socket creation
	 * attempts when run with IPv6 support disabled (e.g. via `ipv6.disable=1`
	 * kernel command-line option.
	 *
	 * https://github.com/iputils/iputils/issues/32
	 *
	 * OpenVZ 2.6.32-042stab113.11 and possibly other older kernels return
	 * EAFNOSUPPORT for all IPv4 ping socket creation attempts due to lack
	 * of support in the kernel. Fallback to raw socket is necessary.
	 *
	 * https://github.com/iputils/iputils/issues/54
	 *
	 * 3) EPROTONOSUPPORT
	 *
	 * OpenVZ 2.6.32-042stab113.11 and possibly other older kernels return
	 * EPROTONOSUPPORT for all IPv6 ping socket creation attempts due to lack
	 * of support in the kernel [1]. Debian 9.5 based container with kernel 4.10
	 * returns EPROTONOSUPPORT also for IPv4 [2]. Fallback to raw socket is
	 * necessary.
	 *
	 * [1] https://github.com/iputils/iputils/issues/54
	 * [2] https://github.com/iputils/iputils/issues/129
	 */
	/* Unless a raw socket was explicitly requested, try a datagram
	 * (ping) socket first. */
	if (socktype == SOCK_DGRAM)
		sock->fd = socket(family, socktype, protocol);

	/* Kernel doesn't support ping sockets: unsupported family (IPv4 only)
	 * or unsupported protocol. Fall back to a raw socket. */
	if (sock->fd == -1 && errno == EAFNOSUPPORT && family == AF_INET)
		do_fallback = 1;
	if (sock->fd == -1 && errno == EPROTONOSUPPORT)
		do_fallback = 1;

	/* User is not allowed to use ping sockets. Fall back to a raw socket. */
	if (sock->fd == -1 && errno == EACCES)
		do_fallback = 1;

	/* Raw socket explicitly requested, or needed as a fallback. */
	if (socktype == SOCK_RAW || do_fallback) {
		socktype = SOCK_RAW;
		sock->fd = socket(family, SOCK_RAW, protocol);
	}

	sock->socktype = socktype;

	/* valid socket */
	if (sock->fd != -1)
		return;

	/* failed to create socket: report the socket type and errno when the
	 * socket is required or verbose output was requested */

	if (requisite || rts->opt_verbose) {
		error(0, 0, "socktype: %s", str_socktype(socktype));
		error(0, errno, "socket");
	}

	if (requisite) {
		/* A failed raw socket for a non-root user hints at missing privileges. */
		if (socktype == SOCK_RAW && geteuid() != 0)
			error(0, 0, _("=> missing cap_net_raw+p capability or setuid?"));

		exit(2);
	}
}

bind_to_device (ping.c)

static void bind_to_device(struct ping_rts *rts, int fd, in_addr_t addr)
{
	int rc;
	int errno_save;

	enable_capability_raw();
        /*绑定指定接口*/
	rc = setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, rts->device,
			strlen(rts->device) + 1);
	errno_save = errno;
	disable_capability_raw();

	if (rc != -1)
		return;

        /*组播地址支持绑定到指定接口设备*/
	if (IN_MULTICAST(ntohl(addr))) {
		struct ip_mreqn imr;

		memset(&imr, 0, sizeof(imr));
		imr.imr_ifindex = iface_name2index(rts, fd);

		if (setsockopt(fd, SOL_IP, IP_MULTICAST_IF, &imr, sizeof(imr)) == -1)
			error(2, errno, "IP_MULTICAST_IF");
	} else {
		error(2, errno_save, "SO_BINDTODEVICE %s", rts->device);
	}
}

ping4_run (ping.c)

/* return >= 0: exit with this code, < 0: go on to next addrinfo result */
/*
 * ping4_run -- IPv4 main routine (excerpt: lines consisting of "..." mark
 * code elided here).
 *
 * rts:        runtime state, updated with destination, source and route.
 * argc, argv: remaining command-line addresses; all but the last are hops,
 *             the last one is the target.
 * ai:         resolved address of the target, as found by main().
 * sock:       the IPv4 ICMP socket.
 */
int ping4_run(struct ping_rts *rts, int argc, char **argv, struct addrinfo *ai,
	      socket_st *sock)
{
	static const struct addrinfo hints = {
		.ai_family = AF_INET,
		.ai_protocol = IPPROTO_UDP,
		.ai_flags = getaddrinfo_flags
	};
...
	if (argc > 1) {
		/* Hops cannot be combined with the record route option. */
		if (rts->opt_rroute)
			usage();
		else if (rts->opt_timestamp) {
			/* With the timestamp option, hops require the prespec
			 * type and at most 5 addresses in total. */
			if (rts->ts_type != IPOPT_TS_PRESPEC)
				usage();
			if (argc > 5)
				usage();
		} else {
			/* Source routing accepts at most 10 addresses in total. */
			if (argc > 10)
				usage();
			rts->opt_sourceroute = 1;
		}
	}

	/* Resolve every address in turn. whereto and hostname are overwritten
	 * on each iteration, so they end up describing the final target. */
	while (argc > 0) {
		target = *argv;

		memset((char *)&rts->whereto, 0, sizeof(rts->whereto));
		rts->whereto.sin_family = AF_INET;
		if (inet_aton(target, &rts->whereto.sin_addr) == 1) {
			/* Literal IPv4 address: use it as the host name; when it
			 * is the only address, switch to numeric output. */
			rts->hostname = target;
			if (argc == 1)
				rts->opt_numeric = 1;
		} else {
			struct addrinfo *result = ai;
			int ret_val;

			/* Hops are resolved here; the final target reuses the
			 * result passed in as ai. */
			if (argc > 1) {
				ret_val = getaddrinfo(target, NULL, &hints, &result);
				if (ret_val)
					error(2, 0, "%s: %s", target, gai_strerror(ret_val));
			}

			memcpy(&rts->whereto, result->ai_addr, sizeof rts->whereto);
			memset(hnamebuf, 0, sizeof hnamebuf);

			/*
			 * On certain network setup getaddrinfo() can return empty
			 * ai_canonname. Instead of printing nothing in "PING"
			 * line use the target.
			 */
			if (result->ai_canonname)
				strncpy(hnamebuf, result->ai_canonname, sizeof hnamebuf - 1);
			else
				strncpy(hnamebuf, target, sizeof hnamebuf - 1);

			rts->hostname = hnamebuf;

			/* Only the result obtained above for a hop is ours to free. */
			if (argc > 1)
				freeaddrinfo(result);
		}

		/* Every address except the last one is recorded as a route hop. */
		if (argc > 1)
			rts->route[rts->nroute++] = rts->whereto.sin_addr.s_addr;

		/* Move on to the next address. */
		argc--;
		argv++;
	}

	/* Source address selection. */
	if (rts->source.sin_addr.s_addr == 0) {
		/* No source address is configured (a device may still be set). */
		socklen_t alen;
		/* Temporary UDP socket used only to determine the source address. */
		int probe_fd = socket(AF_INET, SOCK_DGRAM, 0);
		/* Start from the destination resolved above. */
		dst = rts->whereto;

		if (probe_fd < 0)
			error(2, errno, "socket");

		/* With a device given, bind both the probe socket and the ping
		 * socket to it. */
		if (rts->device) {
			bind_to_device(rts, probe_fd, dst.sin_addr.s_addr);
			bind_to_device(rts, sock->fd, dst.sin_addr.s_addr);
		}

		if (rts->settos &&
		    setsockopt(probe_fd, IPPROTO_IP, IP_TOS, (char *)&rts->settos, sizeof(int)) < 0)
			error(0, errno, _("warning: QOS sockopts"));

		sock_setmark(rts, probe_fd);

		/* Destination port used for the probe connect(). */
		dst.sin_port = htons(1025);
		/* With a route configured, probe towards its first hop. */
		if (rts->nroute)
			dst.sin_addr.s_addr = rts->route[0];
		/* connect() the probe socket so that getsockname() below can
		 * report the source address chosen for this destination. */
		if (connect(probe_fd, (struct sockaddr *)&dst, sizeof(dst)) == -1) {
			if (errno == EACCES) {
				/* Treated as a broadcast destination: requires -b. */
				if (rts->broadcast_pings == 0)
					error(2, 0,
						_("Do you want to ping broadcast? Then -b. If not, check your local firewall rules"));
				fprintf(stderr, _("WARNING: pinging broadcast address\n"));
				if (setsockopt(probe_fd, SOL_SOCKET, SO_BROADCAST,
					       &rts->broadcast_pings, sizeof(rts->broadcast_pings)) < 0)
					error(2, errno, _("cannot set broadcasting"));
				if (connect(probe_fd, (struct sockaddr *)&dst, sizeof(dst)) == -1)
					error(2, errno, "connect");
			} else if ((errno == EHOSTUNREACH || errno == ENETUNREACH) && ai->ai_next) {
				/* Unreachable, but another resolved address is
				 * left: let the caller try the next one. */
				close(probe_fd);
				return -1;
			} else {
				error(2, errno, "connect");
			}
		}
		/* connect() succeeded: read back the selected source address. */
		alen = sizeof(rts->source);
		if (getsockname(probe_fd, (struct sockaddr *)&rts->source, &alen) == -1)
			error(2, errno, "getsockname");
		rts->source.sin_port = 0;

		/* With a device given, warn when the selected source address
		 * does not belong to that device. */
		if (rts->device) {
			struct ifaddrs *ifa0, *ifa;
			int ret;

			ret = getifaddrs(&ifa0);
			if (ret)
				error(2, errno, _("gatifaddrs failed"));
			for (ifa = ifa0; ifa; ifa = ifa->ifa_next) {
				if (!ifa->ifa_name || !ifa->ifa_addr ||
				    ifa->ifa_addr->sa_family != AF_INET)
					continue;
				if (!strcmp(ifa->ifa_name, rts->device) &&
				    !memcmp(&((struct sockaddr_in *)ifa->ifa_addr)->sin_addr,
					    &rts->source.sin_addr, sizeof(rts->source.sin_addr)))
					break;
			}
			freeifaddrs(ifa0);
			/* ifa is only tested against NULL here, never dereferenced. */
			if (!ifa)
				error(0, 0, _("Warning: source address might be selected on device other than: %s"), rts->device);
		}
		close(probe_fd);

	} else if (rts->device) {
		/* Source address already known: just bind the ping socket. */
		bind_to_device(rts, sock->fd, rts->whereto.sin_addr.s_addr);
	}

	/* A zero destination address is replaced by the source address. */
	if (rts->whereto.sin_addr.s_addr == 0)
		rts->whereto.sin_addr.s_addr = rts->source.sin_addr.s_addr;
...
	/* Install IP options: record route, timestamp, or source route. The
	 * latter two copy the hops collected in rts->route into the option. */
	/* record route option */
	if (rts->opt_rroute) {
		memset(rspace, 0, sizeof(rspace));
		rspace[0] = IPOPT_NOP;
		rspace[1 + IPOPT_OPTVAL] = IPOPT_RR;
		rspace[1 + IPOPT_OLEN] = sizeof(rspace) - 1;
		rspace[1 + IPOPT_OFFSET] = IPOPT_MINOFF;
		if (setsockopt(sock->fd, IPPROTO_IP, IP_OPTIONS, rspace, sizeof rspace) < 0)
			error(2, errno, "record route");
	}
	if (rts->opt_timestamp) {
		memset(rspace, 0, sizeof(rspace));
		rspace[0] = IPOPT_TIMESTAMP;
		rspace[1] = (rts->ts_type == IPOPT_TS_TSONLY ? 40 : 36);
		rspace[2] = 5;
		rspace[3] = rts->ts_type;
		if (rts->ts_type == IPOPT_TS_PRESPEC) {
			int i;
			rspace[1] = 4 + rts->nroute * 8;
			for (i = 0; i < rts->nroute; i++) {
				tmp_rspace = (uint32_t *)&rspace[4 + i * 8];
				*tmp_rspace = rts->route[i];
			}
		}
		if (setsockopt(sock->fd, IPPROTO_IP, IP_OPTIONS, rspace, rspace[1]) < 0) {
			/* Retry once with the flag byte set to 2. */
			rspace[3] = 2;
			if (setsockopt(sock->fd, IPPROTO_IP, IP_OPTIONS, rspace, rspace[1]) < 0)
				error(2, errno, "ts option");
		}
	}
	if (rts->opt_sourceroute) {
		int i;
		memset(rspace, 0, sizeof(rspace));
		rspace[0] = IPOPT_NOOP;
		rspace[1 + IPOPT_OPTVAL] = rts->opt_so_dontroute ? IPOPT_SSRR : IPOPT_LSRR;
		rspace[1 + IPOPT_OLEN] = 3 + rts->nroute * 4;
		rspace[1 + IPOPT_OFFSET] = IPOPT_MINOFF;
		for (i = 0; i < rts->nroute; i++) {
			tmp_rspace = (uint32_t *)&rspace[4 + i * 4];
			*tmp_rspace = rts->route[i];
		}

		if (setsockopt(sock->fd, IPPROTO_IP, IP_OPTIONS, rspace, 4 + rts->nroute * 4) < 0)
			error(2, errno, "record route");
...
	/* Round-trip timing needs room for a struct timeval in the payload. */
	if (rts->datalen >= (int)sizeof(struct timeval))	/* can we time transfer */
		rts->timing = 1;
	/* The buffer must also hold the IP and ICMP headers. */
	packlen = rts->datalen + MAXIPLEN + MAXICMPLEN;
	/* Allocate the packet buffer handed to main_loop(). */
	if (!(packet = (unsigned char *)malloc((unsigned int)packlen)))
		error(2, errno, _("memory allocation failed"));

	/* Print the "PING" banner. The name shown may be the ai_canonname
	 * returned by the resolver rather than the name on the command line. */
	printf(_("PING %s (%s) "), rts->hostname, inet_ntoa(rts->whereto.sin_addr));
	if (rts->device || rts->opt_strictsource)
		printf(_("from %s %s: "), inet_ntoa(rts->source.sin_addr), rts->device ? rts->device : "");
	printf(_("%d(%d) bytes of data.\n"), rts->datalen, rts->datalen + 8 + rts->optlen + 20);

	/* Protocol independent setup and parameter checks. */
	setup(rts, sock);

	/* When connecting the ping socket was requested (opt_connect_sk),
	 * a connect() failure is fatal. */
	if (rts->opt_connect_sk &&
	    connect(sock->fd, (struct sockaddr *)&dst, sizeof(dst)) == -1)
		error(2, errno, "connect failed");

	/* Privileges are no longer needed from here on. */
	drop_capabilities();

	/* Enter the send/receive loop. */
	ret = main_loop(rts, &ping4_func_set, sock, packet, packlen);
	/* Release the packet buffer. */
	free(packet);

	/* The result of main_loop() is the exit code. */
	return ret;
}

ping6_run (ping6_common.c)

TODO

setup (ping_common.c)

/* Protocol independent setup and parameter checks. */

/*
 * setup -- validate the interval/preload settings, configure socket
 * options and timeouts on sock, fill the default payload pattern, install
 * the signal handlers, record the start time, arm the deadline timer if
 * one is set and read the terminal width.
 */
void setup(struct ping_rts *rts, socket_st *sock)
{
	int hold;
	struct timeval tv;
	sigset_t sset;

	/* Flood ping without an explicit interval uses interval 0. */
	if (rts->opt_flood && !rts->opt_interval)
		rts->interval = 0;

	/* A non-zero uid may not use intervals below MIN_USER_INTERVAL_MS. */
	if (rts->uid && rts->interval < MIN_USER_INTERVAL_MS)
		error(2, 0, _("cannot flood, minimal interval for user must be >= %d ms, use -i %s (or higher)"),
			  MIN_USER_INTERVAL_MS, str_interval(MIN_USER_INTERVAL_MS));

	if (rts->interval >= INT_MAX / rts->preload)
		error(2, 0, _("illegal preload and/or interval: %d"), rts->interval);

	hold = 1;
	/* Optional socket-level debugging and route bypass. */
	if (rts->opt_so_debug)
		setsockopt(sock->fd, SOL_SOCKET, SO_DEBUG, (char *)&hold, sizeof(hold));
	if (rts->opt_so_dontroute)
		setsockopt(sock->fd, SOL_SOCKET, SO_DONTROUTE, (char *)&hold, sizeof(hold));

#ifdef SO_TIMESTAMP
	if (!rts->opt_latency) {
		int on = 1;
		if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP, &on, sizeof(on)))
			error(0, 0, _("Warning: no SO_TIMESTAMP support, falling back to SIOCGSTAMP"));
	}
#endif

	sock_setmark(rts, sock->fd);

	/* Set some SNDTIMEO to prevent blocking forever
	 * on sends, when device is too slow or stalls. Just put limit
	 * of one second, or "interval", if it is less.
	 */
	tv.tv_sec = 1;
	tv.tv_usec = 0;
	if (rts->interval < 1000) {
		tv.tv_sec = 0;
		tv.tv_usec = 1000 * SCHINT(rts->interval);
	}
	setsockopt(sock->fd, SOL_SOCKET, SO_SNDTIMEO, (char *)&tv, sizeof(tv));

	/* Set RCVTIMEO to "interval". Note, it is just an optimization
	 * allowing to avoid redundant poll(). If it cannot be set,
	 * main_loop() is told to poll() instead (opt_flood_poll). */
	tv.tv_sec = SCHINT(rts->interval) / 1000;
	tv.tv_usec = 1000 * (SCHINT(rts->interval) % 1000);
	if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVTIMEO, (char *)&tv, sizeof(tv)))
		rts->opt_flood_poll = 1;

	/* No user-supplied fill pattern: fill the payload with 0, 1, 2, ... */
	if (!rts->opt_pingfilled) {
		int i;
		unsigned char *p = rts->outpack + 8;

		/* Do not forget about case of small datalen, fill timestamp area too! */
		for (i = 0; i < rts->datalen; ++i)
			*p++ = i;
	}

	/* On a raw socket with no identifier configured, derive the ICMP
	 * identifier from the low 16 bits of the process id. */
	if (sock->socktype == SOCK_RAW && rts->ident == -1)
		rts->ident = htons(getpid() & 0xFFFF);

	/* Install the signal handlers. */
	set_signal(SIGINT, sigexit);
	set_signal(SIGALRM, sigexit);
	set_signal(SIGQUIT, sigstatus);

	/* Unblock all signals. */
	sigemptyset(&sset);
	sigprocmask(SIG_SETMASK, &sset, NULL);

	/* Record the start time. */
	clock_gettime(CLOCK_MONOTONIC_RAW, &rts->start_time);

	/* With a deadline configured, arm a one-shot timer of that many
	 * seconds; its SIGALRM is handled by sigexit (installed above). */
	if (rts->deadline) {
		struct itimerval it;

		it.it_interval.tv_sec = 0;
		it.it_interval.tv_usec = 0;
		it.it_value.tv_sec = rts->deadline;
		it.it_value.tv_usec = 0;
		setitimer(ITIMER_REAL, &it, NULL);
	}

	/* When stdout is a terminal, remember its width in columns. */
	if (isatty(STDOUT_FILENO)) {
		struct winsize w;

		if (ioctl(STDOUT_FILENO, TIOCGWINSZ, &w) != -1) {
			if (w.ws_col > 0)
				rts->screen_width = w.ws_col;
		}
	}
}

main_loop (ping_common.c)

/*
 * main_loop -- protocol independent send/receive loop.
 *
 * Repeatedly sends probes that are due via pinger(), waits for input on
 * sock (by spinning, poll() or a blocking recvmsg()), and passes received
 * packets or errors to the per-family callbacks in fset. The loop ends
 * when an exit condition holds; the return value is that of finish().
 *
 * packet/packlen: receive buffer and its size.
 */
int main_loop(struct ping_rts *rts, ping_func_set_st *fset, socket_st *sock,
	      uint8_t *packet, int packlen)
{
	char addrbuf[128];
	char ans_data[4096];
	struct iovec iov;
	struct msghdr msg;
	int cc;
	int next;
	int polling;
	int recv_error;

	iov.iov_base = (char *)packet;

	for (;;) {
		/* Check exit conditions. */
		/* Exit was requested (presumably set by a signal handler, e.g. on ctrl+c). */
		if (rts->exiting)
			break;

		/* A packet count is configured and replies plus errors have reached it. */
		if (rts->npackets && rts->nreceived + rts->nerrors >= rts->npackets)

			break;
		/* A deadline is configured and at least one error occurred. */
		if (rts->deadline && rts->nerrors)
			break;

		/* Check for and do special actions. */
		/* A status snapshot was requested: print the statistics so far
		 * without stopping (presumably triggered by SIGQUIT, ctrl+\). */
		if (rts->status_snapshot)
			status(rts);

		/* Send probes scheduled to this time. */
		/* Keep sending while the time until the next probe is <= 0. */
		do {
			next = pinger(rts, fset, sock);
			next = schedule_exit(rts, next);
		} while (next <= 0);

		/* "next" is time to send next probe, if positive.
		 * If next<=0 send now or as soon as possible. */

		/* Technical part. Looks wicked. Could be dropped,
		 * if everyone used the newest kernel. :-)
		 * Its purpose is:
		 * 1. Provide intervals less than resolution of scheduler.
		 *    Solution: spinning.
		 * 2. Avoid use of poll(), when recvmsg() can provide
		 *    timed waiting (SO_RCVTIMEO). */
		polling = 0;   /* default: recvmsg() below is called without MSG_DONTWAIT */
		recv_error = 0;
		/* Taken in adaptive mode, when the receive timeout could not be
		 * set (opt_flood_poll), or when the next send is due sooner
		 * than the configured interval. */
		if (rts->opt_adaptive || rts->opt_flood_poll || next < SCHINT(rts->interval)) {
			/* Presumably the number of probes sent but not yet answered. */
			int recv_expected = in_flight(rts);

			/* If we are here, recvmsg() is unable to wait for
			 * required timeout. */
			if (1000 % HZ == 0 ? next <= 1000 / HZ : (next < INT_MAX / HZ && next * HZ <= 1000)) {
				/* Very short timeout... So, if we wait for
				 * something, we sleep for MIN_INTERVAL_MS.
				 * Otherwise, spin! */
				if (recv_expected) {
					/* Replies are outstanding: wait MIN_INTERVAL_MS. */
					next = MIN_INTERVAL_MS;
				} else {
					/* Nothing outstanding: spin until the next send,
					 * yielding the CPU on every pass. */
					next = 0;
					/* When spinning, no reasons to poll.
					 * Use nonblocking recvmsg() instead. */
					polling = MSG_DONTWAIT;
					/* But yield yet. */
					sched_yield();
				}
			}

			if (!polling &&
			    (rts->opt_adaptive || rts->opt_flood_poll || rts->interval)) {
				/* Not spinning: wait up to "next" ms with poll(). */
				struct pollfd pset;
				pset.fd = sock->fd;
				pset.events = POLLIN;
				pset.revents = 0;
				/* A poll() result below 1 (error or timeout), or
				 * neither POLLIN nor POLLERR set, goes back to the
				 * top of the loop to send again. */
				if (poll(&pset, 1, next) < 1 ||
				    !(pset.revents & (POLLIN | POLLERR)))
					continue;
				/* Input or an error is pending: read it without
				 * blocking and remember whether POLLERR was set. */
				polling = MSG_DONTWAIT;
				recv_error = pset.revents & POLLERR;
			}
		}

		for (;;) {
			struct timeval *recv_timep = NULL;
			struct timeval recv_time;
			int not_ours = 0; /* Raw socket can receive messages
					   * destined to other running pings. */

			iov.iov_len = packlen;
			memset(&msg, 0, sizeof(msg));
			msg.msg_name = addrbuf;
			msg.msg_namelen = sizeof(addrbuf);
			msg.msg_iov = &iov;
			msg.msg_iovlen = 1;
			msg.msg_control = ans_data;
			msg.msg_controllen = sizeof(ans_data);

			cc = recvmsg(sock->fd, &msg, polling);
			polling = MSG_DONTWAIT;

			/* A negative result is an error; handle it using errno and recv_error. */
			if (cc < 0) {
				/* If there was a POLLERR and there is no packet
				 * on the socket, try to read the error queue.
				 * Otherwise, give up.
				 */
				/* EAGAIN without a pending POLLERR, or EINTR:
				 * leave the receive loop and go back to sending. */
				if ((errno == EAGAIN && !recv_error) ||
				    errno == EINTR)
					break;
				/* Anything else goes to the per-family error handler. */
				recv_error = 0;
				if (!fset->receive_error_msg(rts, sock)) {
					if (errno) {
						error(0, errno, "recvmsg");
						break;
					}
					not_ours = 1;
				}
			} else {

#ifdef SO_TIMESTAMP
				struct cmsghdr *c;
				/* Look for an SO_TIMESTAMP control message carrying the receive time. */
				for (c = CMSG_FIRSTHDR(&msg); c; c = CMSG_NXTHDR(&msg, c)) {
					if (c->cmsg_level != SOL_SOCKET ||
					    c->cmsg_type != SO_TIMESTAMP)
						continue;
					if (c->cmsg_len < CMSG_LEN(sizeof(struct timeval)))
						continue;
					recv_timep = (struct timeval *)CMSG_DATA(c);
				}
#endif

				if (rts->opt_latency || recv_timep == NULL) {
					/* No usable timestamp (or latency mode): try
					 * SIOCGSTAMP, else take the current time. */
					if (rts->opt_latency ||
					    ioctl(sock->fd, SIOCGSTAMP, &recv_time))
						gettimeofday(&recv_time, NULL);
					recv_timep = &recv_time;
				}

				/* Let the per-family parser handle the packet; its
				 * result is stored as the not_ours flag. */
				not_ours = fset->parse_reply(rts, sock, &msg, cc, addrbuf, recv_timep);
			}

			/* See? ... someone runs another ping on this host. */
			/* On a raw socket, install the per-family filter. */
			if (not_ours && sock->socktype == SOCK_RAW)
				fset->install_filter(rts, sock);

			/* If nothing is in flight, "break" returns us to pinger. */
			if (in_flight(rts) == 0)
				break;

			/* Otherwise, try to recvmsg() again. recvmsg()
			 * is nonblocking after the first iteration, so that
			 * if nothing is queued, it will receive EAGAIN
			 * and return to pinger. */
		}
	}

	/* Done sending and receiving: return the result of finish(). */
	return finish(rts);
}

pinger (ping_common.c)

/*
 * pinger --
 * 	Compose and transmit an ICMP ECHO REQUEST packet.  The IP packet
 * will be added on by the kernel.  The ID field is our UNIX process ID,
 * and the sequence number is an ascending integer.  The first several bytes
 * of the data portion are used to hold a UNIX "timeval" struct in VAX
 * byte-order, to compute the round-trip time.
 */
/*
 * Rate-limits sending with a token counter in which one token corresponds
 * to one millisecond, then transmits through fset->send_probe().
 * Returns the time to wait before the next call (compared against
 * rts->interval and MIN_INTERVAL_MS, i.e. in the same unit as those).
 */
int pinger(struct ping_rts *rts, ping_func_set_st *fset, socket_st *sock)
{
	static int oom_count;
	static int tokens;
	int i;

	/* Have we already sent enough? If we have, return an arbitrary positive value. */
	if (rts->exiting || (rts->npackets && rts->ntransmitted >= rts->npackets && !rts->deadline))
		return 1000;
	/* Check that packets < rate*time + preload */
	if (rts->cur_time.tv_sec == 0 && rts->cur_time.tv_nsec == 0) {
		/* First call: take the initial timestamp and seed the token
		 * counter from preload and interval. */
		clock_gettime(CLOCK_MONOTONIC_RAW, &rts->cur_time);
		tokens = rts->interval * (rts->preload - 1);
	} else {
		long ntokens, tmp;
		struct timespec tv;

		/* Later calls: tokens earned equal the milliseconds elapsed
		 * since cur_time. */
		clock_gettime(CLOCK_MONOTONIC_RAW, &tv);
		ntokens = (tv.tv_sec - rts->cur_time.tv_sec) * 1000 +
			  (tv.tv_nsec - rts->cur_time.tv_nsec) / 1000000;
		if (!rts->interval) {
			/* Case of unlimited flood is special;
			 * if we see no reply, they are limited to 100pps */
			if (ntokens < MIN_INTERVAL_MS && in_flight(rts) >= rts->preload)
				return MIN_INTERVAL_MS - ntokens;
		}
		ntokens += tokens; /* add the tokens saved from earlier calls */

		/* Cap the balance at interval * preload. If it is still below
		 * one interval, return the remaining wait without sending;
		 * otherwise advance cur_time and spend one interval's worth. */
		tmp = (long)rts->interval * (long)rts->preload;
		if (tmp < ntokens)
			ntokens = tmp;
		if (ntokens < rts->interval)
			return rts->interval - ntokens;

		rts->cur_time = tv;
		tokens = ntokens - rts->interval;
	}

	/* With opt_outstanding set, report a previous probe that has no answer yet. */
	if (rts->opt_outstanding) {
		if (rts->ntransmitted > 0 && !rcvd_test(rts, rts->ntransmitted)) {
			print_timestamp(rts);
			printf(_("no answer yet for icmp_seq=%lu\n"), (rts->ntransmitted % MAX_DUP_CHK));
			fflush(stdout);
		}
	}

	/* Transmit through the per-family send_probe callback. */
resend:
	i = fset->send_probe(rts, sock, rts->outpack, sizeof(rts->outpack));
	/* 0 means the probe was sent. */
	if (i == 0) {
		oom_count = 0;
		advance_ntransmitted(rts); /* count the transmitted probe */
		if (!rts->opt_quiet && rts->opt_flood) {
			/* Very silly, but without this output with
			 * high preload or pipe size is very confusing. */
			/* Flood mode prints a "." per probe sent. */
			if ((rts->preload < rts->screen_width && rts->pipesize < rts->screen_width) ||
			    in_flight(rts) < rts->screen_width)
				write_stdout(".", 1);
		}
		/* Time until the next probe, reduced by the saved tokens. */
		return rts->interval - tokens;
	}

	/* And handle various errors... */
	if (i > 0) {
		/* Apparently, it is some fatal bug. */
		abort();
	} else if (errno == ENOBUFS || errno == ENOMEM) {
		int nores_interval;

		/* Device queue overflow or OOM. Packet is not sent. */
		tokens = 0;
		/* Slowdown. This works only in adaptive mode (option -A) */
		rts->rtt_addend += (rts->rtt < 8 * 50000 ? rts->rtt / 8 : 50000);
		if (rts->opt_adaptive)
			update_interval(rts);
		nores_interval = SCHINT(rts->interval / 2);
		if (nores_interval > 500)
			nores_interval = 500;
		oom_count++;
		if (oom_count * nores_interval < rts->lingertime)
			return nores_interval;
		i = 0;
		/* Fall to hard error. It is to avoid complete deadlock
		 * on stuck output device even when dealine was not requested.
		 * Expected timings are screwed up in any case, but we will
		 * exit some day. :-) */
	} else if (errno == EAGAIN) {
		/* Socket buffer is full. */
		tokens += rts->interval;
		return MIN_INTERVAL_MS;
	} else if (errno == EMSGSIZE) {
		/* For example, sendto with len > 65527 on SOCK_DGRAM fails with this errno. */
		rts->nerrors++;
		i = 0;
	} else {
		/* Any other errno: consult the per-family error handler. */
		if ((i = fset->receive_error_msg(rts, sock)) > 0) {
			/* An ICMP error arrived. In this case, we've received
			 * an error from sendto(), but we've also received an
			 * ICMP message, which means the packet did in fact
			 * send in some capacity. So, in this odd case, report
			 * the more specific errno as the error, and treat this
			 * as a hard local error. */
			i = 0;
			goto hard_local_error;
		}
		/* Compatibility with old linuces. */
		if (i == 0 && rts->confirm_flag && errno == EINVAL) {
			rts->confirm_flag = 0;
			errno = 0;
		}
		if (!errno)
			goto resend;
	}

hard_local_error:
	/* Hard local error. Pretend we sent packet. */
	advance_ntransmitted(rts);

	if (i == 0 && !rts->opt_quiet) {
		if (rts->opt_flood)
			write_stdout("E", 1);
		else
			error(0, errno, "sendmsg");
	}
	tokens = 0;
	return SCHINT(rts->interval);
}

ping4_send_probe (ping.c)

/*
 * ping4_send_probe --
 * 	Build one ICMP ECHO REQUEST in the caller's buffer and send it to
 * rts->whereto.  The sequence number is ntransmitted + 1 and the ID is
 * rts->ident.  When timing is enabled the start of the data portion
 * carries a "struct timeval" send timestamp.
 *
 * Returns 0 when sendto() transmitted the whole packet, otherwise the
 * value returned by sendto().
 */
int ping4_send_probe(struct ping_rts *rts, socket_st *sock, void *packet,
		     unsigned packet_size __attribute__((__unused__)))
{
	struct icmphdr *hdr = (struct icmphdr *)packet;
	struct timeval stamp;
	int total;
	int sent;

	/* ICMP header: type, code, sequence number and identifier. */
	hdr->type = ICMP_ECHO;
	hdr->code = 0;
	hdr->checksum = 0;
	hdr->un.echo.sequence = htons(rts->ntransmitted + 1);
	hdr->un.echo.id = rts->ident;

	rcvd_clear(rts, rts->ntransmitted + 1);

	/* Timestamp area: in latency mode the time is stored before the
	 * checksum is computed; otherwise the area is zeroed for now. */
	if (rts->timing && rts->opt_latency) {
		gettimeofday(&stamp, NULL);
		memcpy(hdr + 1, &stamp, sizeof(stamp));
	} else if (rts->timing) {
		memset(hdr + 1, 0, sizeof(struct timeval));
	}

	/* Payload length plus the 8 bytes of the ICMP portion. */
	total = rts->datalen + 8;

	/* Checksum over the whole ICMP message. */
	hdr->checksum = in_cksum((unsigned short *)hdr, total, 0);

	/* Outside latency mode the timestamp is taken as late as possible
	 * and the checksum is then extended over it, starting from the
	 * complement of the checksum computed above. */
	if (rts->timing && !rts->opt_latency) {
		gettimeofday(&stamp, NULL);
		memcpy(hdr + 1, &stamp, sizeof(stamp));
		hdr->checksum = in_cksum((unsigned short *)&stamp, sizeof(stamp), ~hdr->checksum);
	}

	/* Send the packet. */
	sent = sendto(sock->fd, hdr, total, 0, (struct sockaddr *)&rts->whereto, sizeof(rts->whereto));

	/* A complete send is reported as 0; anything else is passed through. */
	if (sent == total)
		return 0;
	return sent;
}

ping6_send_probe (ping6_common.c)

TODO

__schedule_exit (ping_common.c)

/*
 * Arm a one-shot ITIMER_REAL timer that limits how long the program keeps
 * waiting, and return the wait value to use next (raised to the timer length
 * when the caller's value is negative or shorter).
 *
 * The timer length is kept in a function-local static in microseconds (it is
 * split into tv_sec/tv_usec with 1000000); `next` is compared against that
 * value divided by 1000.  Once a non-zero length has been stored, later
 * calls return `next` untouched.
 */
int __schedule_exit(int next)
{
	static unsigned long linger_us;
	struct itimerval timer;

	/* A previous call already armed the timer. */
	if (linger_us != 0)
		return next;

	if (!global_rts->nreceived) {
		/* Nothing received so far: fall back to the linger time. */
		linger_us = global_rts->lingertime * 1000;
	} else {
		/* Twice the largest rtt seen, but never below one interval. */
		linger_us = 2 * global_rts->tmax;
		if (linger_us < 1000 * (unsigned long)global_rts->interval)
			linger_us = 1000 * global_rts->interval;
	}

	if (next < 0 || (unsigned long)next < linger_us / 1000)
		next = linger_us / 1000;

	/* One-shot: no reload interval. */
	timer.it_value.tv_sec = linger_us / 1000000;
	timer.it_value.tv_usec = linger_us % 1000000;
	timer.it_interval.tv_sec = 0;
	timer.it_interval.tv_usec = 0;
	setitimer(ITIMER_REAL, &timer, NULL);
	return next;
}

/* schedule_exit is implemented in ping.h. */
/*
 * Hand over to __schedule_exit() only when a packet count was requested,
 * at least that many probes have been transmitted, and no deadline is
 * configured; in every other case `next` is returned unchanged.
 */
static inline int schedule_exit(struct ping_rts *rts, int next)
{
	if (!rts->npackets || rts->ntransmitted < rts->npackets || rts->deadline)
		return next;

	return __schedule_exit(next);
}

ping4_receive_error_msg (ping.c)

/*
 * ping4_receive_error_msg --
 *	Read one entry from the socket error queue (MSG_ERRQUEUE) without
 * blocking and account for it.
 *
 * Returns the number of network (ICMP-origin) errors handled when there is
 * one, otherwise minus the number of local errors; 0 means nothing was
 * counted.  errno is restored to its value on entry, except that it is
 * cleared to 0 when the queued ICMP error does not belong to our probes.
 * Aborts if the message carries no IP_RECVERR control data.
 */
int ping4_receive_error_msg(struct ping_rts *rts, socket_st *sock)
{
	ssize_t res;
	char cbuf[512];
	struct iovec iov;
	struct msghdr msg;
	struct cmsghdr *cmsgh;
	struct sock_extended_err *e;
	struct icmphdr icmph;
	struct sockaddr_in target;
	int net_errors = 0;
	int local_errors = 0;
	int saved_errno = errno;	/* recvmsg() and printing below may clobber errno */

	/* Payload lands in icmph, the peer address in target, cmsgs in cbuf. */
	iov.iov_base = &icmph;
	iov.iov_len = sizeof(icmph);
	msg.msg_name = (void *)&target;
	msg.msg_namelen = sizeof(target);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_flags = 0;
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof(cbuf);

	/* Fetch a message from the error queue (MSG_ERRQUEUE) and handle it. */
	res = recvmsg(sock->fd, &msg, MSG_ERRQUEUE | MSG_DONTWAIT);
	if (res < 0) {
		/* Empty queue or interrupted call is counted as a local error. */
		if (errno == EAGAIN || errno == EINTR)
			local_errors++;
		goto out;
	}

	/* Locate the extended error description among the control messages. */
	e = NULL;
	for (cmsgh = CMSG_FIRSTHDR(&msg); cmsgh; cmsgh = CMSG_NXTHDR(&msg, cmsgh)) {
		if (cmsgh->cmsg_level == SOL_IP) {
			if (cmsgh->cmsg_type == IP_RECVERR)
				e = (struct sock_extended_err *)CMSG_DATA(cmsgh);
		}
	}
	if (e == NULL)
		abort();

	/* Error origins are described in ip(7): https://www.man7.org/linux/man-pages/man7/ip.7.html */
	if (e->ee_origin == SO_EE_ORIGIN_LOCAL) {
		local_errors++;
		/* Note: in quiet mode the error is not added to rts->nerrors. */
		if (rts->opt_quiet)
			goto out;
		if (rts->opt_flood)
			write_stdout("E", 1);
		else if (e->ee_errno != EMSGSIZE)
			error(0, e->ee_errno, _("local error"));
		else
			error(0, 0, _("local error: message too long, mtu=%u"), e->ee_info);
		rts->nerrors++;
	} else if (e->ee_origin == SO_EE_ORIGIN_ICMP) {
		/* Address stored right behind the extended error; used below as the
		 * "From" address (presumably the offender address, see ip(7)). */
		struct sockaddr_in *sin = (struct sockaddr_in *)(e + 1);

		/* The returned payload must be a full echo request of ours to the
		 * current target; anything else is ignored. */
		if (res < (ssize_t) sizeof(icmph) ||
		    target.sin_addr.s_addr != rts->whereto.sin_addr.s_addr ||
		    icmph.type != ICMP_ECHO ||
		    !is_ours(rts, sock, icmph.un.echo.id)) {
			/* Not our error, not an error at all. Clear. */
			saved_errno = 0;
			goto out;
		}

		acknowledge(rts, ntohs(icmph.un.echo.sequence));

		if (sock->socktype == SOCK_RAW) {
			struct icmp_filter filt;

			/* Re-install the raw-socket ICMP filter with every type bit
			 * set except source quench, redirect and echo reply
			 * (NOTE(review): per raw(7) set bits are filtered out — confirm). */
			filt.data = ~((1 << ICMP_SOURCE_QUENCH) |
				      (1 << ICMP_REDIRECT) |
				      (1 << ICMP_ECHOREPLY));
			if (setsockopt(sock->fd, SOL_RAW, ICMP_FILTER, (const void *)&filt,
				       sizeof(filt)) == -1)
				error(2, errno, "setsockopt(ICMP_FILTER)");
		}
		net_errors++;
		rts->nerrors++;
		if (rts->opt_quiet)
			goto out;
		if (rts->opt_flood) {
			write_stdout("\bE", 2);
		} else {
			print_timestamp(rts);
			printf(_("From %s icmp_seq=%u "), pr_addr(rts, sin, sizeof *sin), ntohs(icmph.un.echo.sequence));
			pr_icmph(rts, e->ee_type, e->ee_code, e->ee_info, NULL);
			fflush(stdout);
		}
	}

out:
	/* Hand back the (possibly cleared) errno the caller came in with. */
	errno = saved_errno;
	return net_errors ? net_errors : -local_errors;
}

ping6_receive_error_msg (ping6_common.c)

TODO

ping4_parse_reply (ping.c)

/*
 * ping4_parse_reply --
 *	Validate and process one received IPv4 ICMP message.
 *
 * msg/cc give the receive buffer and the number of bytes in it, addr is the
 * sender (used as struct sockaddr_in), tv is handed on to gather_statistics().
 *
 * On SOCK_RAW sockets the buffer starts with the IP header, which is
 * length-checked and skipped; on other sockets TTL and IP options are taken
 * from the control messages instead.
 *
 * Returns 1 for packets that are too short, echo requests, replies or
 * embedded probes that are not ours, and redirect/source-quench notices.
 * Returns 0 for our echo replies, for error messages about our probes, and
 * for any other ICMP type.
 * NOTE(review): how the caller interprets 0 versus 1 is not visible here.
 */
int ping4_parse_reply(struct ping_rts *rts, struct socket_st *sock,
		      struct msghdr *msg, int cc, void *addr,
		      struct timeval *tv)
{
	struct sockaddr_in *from = addr;
	uint8_t *buf = msg->msg_iov->iov_base;
	struct icmphdr *icp;
	struct iphdr *ip;
	int hlen;
	int csfailed;
	struct cmsghdr *cmsgh;
	int reply_ttl;
	uint8_t *opts, *tmp_ttl;
	int olen;
	int wrong_source = 0;

	/* Validate the IP header first, then the ICMP header. */

	/* Check the IP header */
	ip = (struct iphdr *)buf;
	if (sock->socktype == SOCK_RAW) {
		hlen = ip->ihl * 4;
		/* Need the IP header plus 8 bytes of ICMP header; ihl below 5 is malformed. */
		if (cc < hlen + 8 || ip->ihl < 5) {
			if (rts->opt_verbose)
				error(0, 0, _("packet too short (%d bytes) from %s"), cc,
					pr_addr(rts,from, sizeof *from));
			return 1;
		}
		reply_ttl = ip->ttl;
		opts = buf + sizeof(struct iphdr);
		olen = hlen - sizeof(struct iphdr);
	} else {
		/* No IP header in the buffer: pull TTL and options from ancillary data. */
		hlen = 0;
		reply_ttl = 0;
		opts = buf;
		olen = 0;
		for (cmsgh = CMSG_FIRSTHDR(msg); cmsgh; cmsgh = CMSG_NXTHDR(msg, cmsgh)) {
			if (cmsgh->cmsg_level != SOL_IP)
				continue;
			if (cmsgh->cmsg_type == IP_TTL) {
				if (cmsgh->cmsg_len < sizeof(int))
					continue;
				/* Only the first byte of the cmsg payload is read as the TTL. */
				tmp_ttl = (uint8_t *)CMSG_DATA(cmsgh);
				reply_ttl = (int)*tmp_ttl;
			} else if (cmsgh->cmsg_type == IP_RETOPTS) {
				opts = (uint8_t *)CMSG_DATA(cmsgh);
				olen = cmsgh->cmsg_len;
			}
		}
	}

	/* Now the ICMP part */
	cc -= hlen;
	icp = (struct icmphdr *)(buf + hlen);
	/* Non-zero result marks a checksum failure. */
	csfailed = in_cksum((unsigned short *)icp, cc, 0);

	if (icp->type == ICMP_ECHOREPLY) {
		if (!is_ours(rts, sock, icp->un.echo.id))
			return 1;			/* 'Twas not our ECHO */

		/* Unicast reply from an address other than the target gets flagged. */
		if (!rts->broadcast_pings && !rts->multicast &&
		    from->sin_addr.s_addr != rts->whereto.sin_addr.s_addr)
			wrong_source = 1;
		/* Non-zero from gather_statistics() means the trailing output
		 * below (bell, IP options, newline) is skipped. */
		if (gather_statistics(rts, (uint8_t *)icp, sizeof(*icp), cc,
				      ntohs(icp->un.echo.sequence),
				      reply_ttl, csfailed, tv, pr_addr(rts, from, sizeof *from),
				      pr_echo_reply, rts->multicast, wrong_source)) {
			fflush(stdout);
			return 0;
		}
	} else {
		/* We fall here when a redirect or source quench arrived. */

		switch (icp->type) {
		case ICMP_ECHO:
			/* MUST NOT */
			return 1;
		case ICMP_SOURCE_QUENCH:
		case ICMP_REDIRECT:
		case ICMP_DEST_UNREACH:
		case ICMP_TIME_EXCEEDED:
		case ICMP_PARAMETERPROB:
			{
				/* These messages embed the IP header and the first 8 bytes
				 * of the packet that triggered them.
				 * NOTE(review): iph->ihl is read here to form icp1 before
				 * the length check below; icp1 itself is only dereferenced
				 * after that check. */
				struct iphdr *iph = (struct iphdr *)(&icp[1]);
				struct icmphdr *icp1 = (struct icmphdr *)
						((unsigned char *)iph + iph->ihl * 4);
				int error_pkt;
				/* outer ICMP header + embedded IP header + embedded ICMP header */
				if (cc < (int)(8 + sizeof(struct iphdr) + 8) ||
				    cc < 8 + iph->ihl * 4 + 8)
					return 1;
				/* Embedded packet must be one of our echo requests to the target. */
				if (icp1->type != ICMP_ECHO ||
				    iph->daddr != rts->whereto.sin_addr.s_addr ||
				    !is_ours(rts, sock, icp1->un.echo.id))
					return 1;
				error_pkt = (icp->type != ICMP_REDIRECT &&
					     icp->type != ICMP_SOURCE_QUENCH);
				if (error_pkt) {
					/* Real error for our probe: acknowledge it, print nothing here. */
					acknowledge(rts, ntohs(icp1->un.echo.sequence));
					return 0;
				}
				if (rts->opt_quiet || rts->opt_flood)
					return 1;
				print_timestamp(rts);
				printf(_("From %s: icmp_seq=%u "), pr_addr(rts, from, sizeof *from),
				       ntohs(icp1->un.echo.sequence));
				if (csfailed)
					printf(_("(BAD CHECKSUM)"));
				pr_icmph(rts, icp->type, icp->code, ntohl(icp->un.gateway), icp);
				return 1;
			}
		default:
			/* MUST NOT */
			break;
		}
		/* Any other ICMP type: marker in flood mode, details only when verbose
		 * and rts->uid is zero. */
		if (rts->opt_flood && !(rts->opt_verbose || rts->opt_quiet)) {
			if (!csfailed)
				write_stdout("!E", 2);
			else
				write_stdout("!EC", 3);
			return 0;
		}
		if (!rts->opt_verbose || rts->uid)
			return 0;
		if (rts->opt_ptimeofday) {
			struct timeval recv_time;
			gettimeofday(&recv_time, NULL);
			printf("%lu.%06lu ", (unsigned long)recv_time.tv_sec, (unsigned long)recv_time.tv_usec);
		}
		printf(_("From %s: "), pr_addr(rts, from, sizeof *from));
		if (csfailed) {
			printf(_("(BAD CHECKSUM)\n"));
			return 0;
		}
		pr_icmph(rts, icp->type, icp->code, ntohl(icp->un.gateway), icp);
		return 0;
	}

	/* Reached only for our echo reply when gather_statistics() returned 0. */
	if (rts->opt_audible) {
		putchar('\a');
		if (rts->opt_flood)
			fflush(stdout);
	}
	if (!rts->opt_flood) {
		pr_options(rts, opts, olen + sizeof(struct iphdr));

		putchar('\n');
		fflush(stdout);
	}
	return 0;
}

ping6_parse_reply (ping6_common.c)

TODO

ping4_install_filter (ping.c)

/*
 * Attach a classic BPF socket filter that drops echo replies carrying a
 * foreign identifier while letting every other ICMP message through.
 * A function-local static flag makes this a one-time operation; failure to
 * attach only produces a warning.
 */
void ping4_install_filter(struct ping_rts *rts, socket_st *sock)
{
	static int installed;
	static struct sock_filter prog_insns[] = {
		BPF_STMT(BPF_LDX | BPF_B   | BPF_MSH, 0),	/* X = IP header length (skip it) */
		BPF_STMT(BPF_LD  | BPF_H   | BPF_IND, 4),	/* A = ICMP echo identifier */
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 0xAAAA, 0, 1), /* placeholder ident, patched below */
		BPF_STMT(BPF_RET | BPF_K, ~0U),			/* identifier matches: accept */
		BPF_STMT(BPF_LD  | BPF_B   | BPF_IND, 0),	/* A = ICMP type */
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, ICMP_ECHOREPLY, 1, 0), /* echo reply? */
		BPF_STMT(BPF_RET | BPF_K, 0xFFFFFFF),		/* not an echo reply: accept */
		BPF_STMT(BPF_RET | BPF_K, 0)			/* echo reply, wrong ident: drop */
	};
	static struct sock_fprog prog = {
		.len = sizeof prog_insns / sizeof(prog_insns[0]),
		.filter = prog_insns
	};

	if (installed)
		return;
	installed = 1;

	/* Replace the placeholder constant with our identifier as it appears on the wire. */
	prog_insns[2].k = htons(rts->ident);

	if (setsockopt(sock->fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog)) != 0)
		error(0, errno, _("WARNING: failed to install socket filter"));
}

ping6_install_filter (ping6_common.c)

TODO

finish (ping_common.c)

/*
 * finish --
 *	Print out statistics, and give up.
 *
 * Prints the packet counters, loss percentage and elapsed time, then the
 * rtt min/avg/max/mdev line when timing data exists, then optional pipe
 * and ipg/ewma figures.
 *
 * Returns non-zero when no reply was received at all, or when a deadline was
 * set and fewer than npackets replies arrived; 0 otherwise.
 */
int finish(struct ping_rts *rts)
{
	struct timespec tv = rts->cur_time;
	char *comma = "";

	/* tv becomes the elapsed run time (cur_time - start_time). */
	tssub(&tv, &rts->start_time);

	/* Print the ping summary. */
	putchar('\n');
	fflush(stdout);
	printf(_("--- %s ping statistics ---\n"), rts->hostname);
	printf(_("%ld packets transmitted, "), rts->ntransmitted);
	printf(_("%ld received"), rts->nreceived);
	if (rts->nrepeats)
		printf(_(", +%ld duplicates"), rts->nrepeats);
	if (rts->nchecksum)
		printf(_(", +%ld corrupted"), rts->nchecksum);
	if (rts->nerrors)
		printf(_(", +%ld errors"), rts->nerrors);

	if (rts->ntransmitted) {
#ifdef USE_IDN
		/* Presumably so that %g below is formatted with the "C" locale. */
		setlocale(LC_ALL, "C");
#endif
		/* Packet loss as a percentage of transmitted probes. */
		printf(_(", %g%% packet loss"),
		       (float)((((long long)(rts->ntransmitted - rts->nreceived)) * 100.0) / rts->ntransmitted));
		/* Elapsed time in milliseconds, nanoseconds rounded to the nearest ms. */
		printf(_(", time %llums"), (unsigned long long)(1000 * tv.tv_sec + (tv.tv_nsec + 500000) / 1000000));
	}

	putchar('\n');

	if (rts->nreceived && rts->timing) {
		double tmdev;
		/* Duplicates are included in the sample count. */
		long total = rts->nreceived + rts->nrepeats;
		long tmavg = rts->tsum / total;
		long long tmvar;

		/* Variance of the round-trip times from the running sum and sum of squares. */
		if (rts->tsum < INT_MAX)
			/* This slightly clumsy computation order is important to avoid
			 * integer rounding errors for small ping times. */
			tmvar = (rts->tsum2 - ((rts->tsum * rts->tsum) / total)) / total;
		else
			tmvar = (rts->tsum2 / total) - (tmavg * tmavg);

		tmdev = llsqrt(tmvar);

		/* Each value is split into whole and thousandth parts and labelled "ms". */
		printf(_("rtt min/avg/max/mdev = %ld.%03ld/%lu.%03ld/%ld.%03ld/%ld.%03ld ms"),
		       (long)rts->tmin / 1000, (long)rts->tmin % 1000,
		       (unsigned long)(tmavg / 1000), (long)(tmavg % 1000),
		       (long)rts->tmax / 1000, (long)rts->tmax % 1000,
		       (long)tmdev / 1000, (long)tmdev % 1000);
		comma = ", ";
	}
	if (rts->pipesize > 1) {
		printf(_("%spipe %d"), comma, rts->pipesize);
		comma = ", ";
	}

	/* Inter-packet gap and smoothed rtt (ewma), shown only for zero-interval,
	 * flood or adaptive runs with more than one probe sent. */
	if (rts->nreceived && (!rts->interval || rts->opt_flood || rts->opt_adaptive) && rts->ntransmitted > 1) {
		/* Elapsed microseconds divided by the number of gaps between probes. */
		int ipg = (1000000 * (long long)tv.tv_sec + tv.tv_nsec / 1000) / (rts->ntransmitted - 1);

		/* rts->rtt is divided by 8 before display. */
		printf(_("%sipg/ewma %d.%03d/%d.%03d ms"),
		       comma, ipg / 1000, ipg % 1000, rts->rtt / 8000, (rts->rtt / 8) % 1000);
	}
	putchar('\n');
	return (!rts->nreceived || (rts->deadline && rts->nreceived < rts->npackets));
}

status (ping_common.c)

/*
 * status --
 *	Write a one-line progress summary to stderr: received/transmitted
 * counts and integer loss percentage, followed by min/avg/ewma/max round-trip
 * figures when timing data is available.  Clears rts->status_snapshot first.
 * This is a reduced version of what finish() prints.
 */
void status(struct ping_rts *rts)
{
	int loss = 0;
	long tavg = 0;

	rts->status_snapshot = 0;

	/* Integer loss percentage; stays 0 when nothing was transmitted. */
	if (rts->ntransmitted)
		loss = (((long long)(rts->ntransmitted - rts->nreceived)) * 100) / rts->ntransmitted;

	fprintf(stderr, "\r");
	fprintf(stderr, _("%ld/%ld packets, %d%% loss"), rts->nreceived, rts->ntransmitted, loss);

	if (rts->nreceived && rts->timing) {
		/* Duplicates are included in the sample count, as in finish(). */
		tavg = rts->tsum / (rts->nreceived + rts->nrepeats);

		/* The average's integer part is printed with %lu, so it is cast to
		 * unsigned long to match the conversion (finish() does the same). */
		fprintf(stderr, _(", min/avg/ewma/max = %ld.%03ld/%lu.%03ld/%d.%03d/%ld.%03ld ms"),
			(long)rts->tmin / 1000, (long)rts->tmin % 1000,
			(unsigned long)(tavg / 1000), tavg % 1000,
			rts->rtt / 8000, (rts->rtt / 8) % 1000, (long)rts->tmax / 1000, (long)rts->tmax % 1000);
	}
	fprintf(stderr, "\n");
}

gather_statistics (ping_common.c)

/*
 * gather_statistics --
 *	Account for one received echo reply and, unless quiet, print its line.
 *
 * icmph/icmplen locate the ICMP header, so the payload starts at
 * icmph + icmplen; cc is the ICMP message length, seq the sequence number,
 * hops the TTL to print (skipped when negative), csfailed a non-zero
 * checksum result, tv the receive time (modified in place), from the
 * printable sender, pr_reply an optional protocol-specific printer.
 *
 * Returns 1 in quiet mode or after reporting a truncated packet, otherwise 0.
 * The visible IPv4 caller finishes the output line itself only on 0.
 */
int gather_statistics(struct ping_rts *rts, uint8_t *icmph, int icmplen,
		      int cc, uint16_t seq, int hops,
		      int csfailed, struct timeval *tv, char *from,
		      void (*pr_reply)(uint8_t *icmph, int cc), int multicast,
		      int wrong_source)
{
	int dupflag = 0;
	long triptime = 0;
	uint8_t *ptr = icmph + icmplen;	/* first payload byte */

	/* Update the counters. */

	/* Counted optimistically; undone below for bad checksums and duplicates. */
	++rts->nreceived;
	if (!csfailed)
		acknowledge(rts, seq);

	/* Round-trip time needs a payload large enough to hold the timestamp. */
	if (rts->timing && cc >= (int)(8 + sizeof(struct timeval))) {
		struct timeval tmp_tv;
		memcpy(&tmp_tv, ptr, sizeof(tmp_tv));

restamp:
		/* tv -= send timestamp; triptime is in microseconds. */
		tvsub(tv, &tmp_tv);
		triptime = tv->tv_sec * 1000000 + tv->tv_usec;
		if (triptime < 0) {
			error(0, 0, _("Warning: time of day goes back (%ldus), taking countermeasures"), triptime);
			triptime = 0;
			/* Retry once with a fresh receive time and switch latency mode on;
			 * the flag being set prevents a second retry. */
			if (!rts->opt_latency) {
				gettimeofday(tv, NULL);
				rts->opt_latency = 1;
				goto restamp;
			}
		}
		if (!csfailed) {
			rts->tsum += triptime;
			rts->tsum2 += (double)((long long)triptime * (long long)triptime);
			if (triptime < rts->tmin)
				rts->tmin = triptime;
			if (triptime > rts->tmax)
				rts->tmax = triptime;
			/* rts->rtt is a running average kept multiplied by 8. */
			if (!rts->rtt)
				rts->rtt = triptime * 8;
			else
				rts->rtt += triptime - rts->rtt / 8;
			if (rts->opt_adaptive)
				update_interval(rts);
		}
	}

	/* Classify: corrupted, duplicate, or first valid reply for this sequence. */
	if (csfailed) {
		++rts->nchecksum;
		--rts->nreceived;
	} else if (rcvd_test(rts, seq)) {
		++rts->nrepeats;
		--rts->nreceived;
		dupflag = 1;
	} else {
		rcvd_set(rts, seq);
		dupflag = 0;
	}
	rts->confirm = rts->confirm_flag;

	if (rts->opt_quiet)
		return 1;

	if (rts->opt_flood) {
		/* Flood mode: erase one progress character, or mark a corrupted reply with 'C'. */
		if (!csfailed)
			write_stdout("\b \b", 3);
		else
			write_stdout("\bC", 2);
	} else {
		int i;
		uint8_t *cp, *dp;

		print_timestamp(rts);
		printf(_("%d bytes from %s:"), cc, from);

		if (pr_reply)
			pr_reply(icmph, cc);

		if (rts->opt_verbose && rts->ident != -1)
			printf(_(" ident=%d"), ntohs(rts->ident));

		if (hops >= 0)
			printf(_(" ttl=%d"), hops);

		/* Shorter than what we sent (payload + 8 header bytes). */
		if (cc < rts->datalen + 8) {
			printf(_(" (truncated)\n"));
			return 1;
		}
		if (rts->timing) {
			/* Fewer decimals are shown as the time grows, with rounding,
			 * unless full precision was requested. */
			if (rts->opt_rtt_precision)
				printf(_(" time=%ld.%03ld ms"), triptime / 1000, triptime % 1000);
			else if (triptime >= 100000 - 50)
				printf(_(" time=%ld ms"), (triptime + 500) / 1000);
			else if (triptime >= 10000 - 5)
				printf(_(" time=%ld.%01ld ms"), (triptime + 50) / 1000,
				       ((triptime + 50) % 1000) / 100);
			else if (triptime >= 1000)
				printf(_(" time=%ld.%02ld ms"), (triptime + 5) / 1000,
				       ((triptime + 5) % 1000) / 10);
			else
				printf(_(" time=%ld.%03ld ms"), triptime / 1000,
				       triptime % 1000);
		}

		if (dupflag && (!multicast || rts->opt_verbose))
			printf(_(" (DUP!)"));
		if (csfailed)
			printf(_(" (BAD CHECKSUM!)"));
		if (wrong_source)
			printf(_(" (DIFFERENT ADDRESS!)"));

		/* check the data */
		/* Compare the payload behind the timestamp with rts->outpack; on the
		 * first mismatch report it and dump that payload region in hex,
		 * starting a new line every 32 bytes. */
		cp = ((unsigned char *)ptr) + sizeof(struct timeval);
		dp = &rts->outpack[8 + sizeof(struct timeval)];
		for (i = sizeof(struct timeval); i < rts->datalen; ++i, ++cp, ++dp) {
			if (*cp != *dp) {
				printf(_("\nwrong data byte #%d should be 0x%x but was 0x%x"),
				       i, *dp, *cp);
				cp = (unsigned char *)ptr + sizeof(struct timeval);
				for (i = sizeof(struct timeval); i < rts->datalen; ++i, ++cp) {
					if ((i % 32) == sizeof(struct timeval))
						printf("\n#%d\t", i);
					printf("%x ", *cp);
				}
				break;
			}
		}
	}
	return 0;
}

in_cksum (ping.c)

/*
 * Internet checksum (a one's-complement sum of 16-bit words, not a CRC)
 * over len bytes starting at addr.  csum seeds the accumulator, which lets a
 * caller continue a sum over additional data.  Returns the complemented,
 * folded 16-bit result.
 */
static unsigned short
in_cksum(const unsigned short *addr, int len, unsigned short csum)
{
	const unsigned short *word = addr;
	int remaining;
	int acc = csum;

	/* Add the 16-bit words into a wider accumulator. */
	for (remaining = len; remaining > 1; remaining -= 2)
		acc += *word++;

	/* A trailing odd byte goes through ODDBYTE (defined elsewhere);
	 * le16toh() may be unavailable on old systems. */
	if (remaining == 1)
		acc += ODDBYTE(*(const unsigned char *)word);

	/* Fold the carries out of the upper half back into the low 16 bits. */
	acc = (acc >> 16) + (acc & 0xffff);
	acc += (acc >> 16);

	/* Complement and truncate to 16 bits. */
	return (unsigned short)~acc;
}

参考源码

Ubuntu源码

https://git.launchpad.net/ubuntu/+source/iputils/tree/ping?h=ubuntu/plucky

其他源码

https://github.com/iputils/iputils

Comments are closed.