深入理解ping原理及源码实现
ping原理
发起端发送ICMP探测报文,发起端和中间设备根据路由表进行转发,中间设备将报文TTL减1,并更新下一跳二层头信息转发,目的节点收到发给自己的ping包之后,检查报文是否合法,完成检查后给发起端发送ICMP响应报文。
A ->R1 -> R2 -> R3 -> B (ICMP echo request)
A <- R1 <- R2 <- R3 <-B (ICMP echo reply)
ICMP的报文格式定义在RFC 792中,它的IP Proto字段是1,从这里也可以看出ICMP是IP网络的基础协议。具体echo request和echo reply的报文格式请参考具体的RFC协议。
一般来说,上面两个方向的报文走的路径是一样的,所以ping报文可以测量发起端到接收端的往返延时(RTT),同时会计算出最小最大平均时延,有些工具还会计算出方差。对于ping包发送个数,Windows上默认是4个, Linux上默认一直发送除非ctrl+c停止,但是都提供配置发送个数选项。默认发送的icmp报文长度不会很大,Windows上是94字节,Linux上是118字节(上述是IPv6报文ICMPv6对应的长度),净荷字段的填充因为没有限制,更是五花八门,这里不再一一列出。因为ping包如果超过接口的MTU会默认分片,所以ping也提供了不分片设置标志,以便探测网络对大包的支持情况。TTL配置则会限制报文最大经过的中间设备数量,如果不设置Windows是128,Linux是64,一般场景足够了。每个ICMP回复等待的超时时间默认是1秒,如果有特殊使用,可以设置ICMP探测的超时时间。因为ping包发送时候使用的是发送端口的接口地址,如果有需要也可以配置发送报文的源地址。
上面说的都是Windows和Linux都支持的功能,除此之外每个工具还有自己特定的一些配置,另外IPv6和IPv4在使用上还有一些差异。
在实现上,发起端的ping工具一般实现在用户态,而接收端的echo request处理以及响应echo reply发送都是在内核态自动处理的,不需要用户态干预。在Windows比较新的版本上,比如Windows 7及之后的版本,默认不开启echo request报文上送,导致不会自动响应ping请求,需要手动开启一下。
ping源码实现
Talk is cheap, show me the code. 下面我们来分析ping的源码实现,因为Windows没有开源Ping的实现,我们现在以Linux下的ping工具为对象分析具体源码实现。ping工具在iputils中实现,Ubuntu使用的源码在这里。
ping工具的实现在目录ping中,源文件如下所示:
path: root/ping
Mode Name Size
-rw-r--r-- meson.build 518 log plain
-rw-r--r-- node_info.c 12858 log plain
-rw-r--r-- ping.c 50280 log plain
-rw-r--r-- ping.h 12646 log plain
-rw-r--r-- ping6_common.c 26860 log plain
-rw-r--r-- ping_common.c 28183 log plain
主要的代码实现在ping.c, ping_common.c和ping6_common.c中。
ping.c实现了ping工具中对于ipv4和ipv6的通用处理入口,包括创建socket的create_socket, 主入口函数main。关于ipv4的主入口ping4_run,收发包处理ping4_receive_error_msg, ping4_parse_reply, ping4_send_probe, ping4_install_filter也实现在ping.c中。
ping_common.c实现了ping工具用法usage, 权限操作接口limit_capabilities, modify_capability, drop_capabilities, ping包发送入口pinger, 配置入口setup, 主循环main_loop, 统计信息更新gather_statistics, 以及最后结束处理的finish和status。
ping6_common.c实现了ipv6的一些相关接口,包括主入口ping6_run, 以及针对ipv6的收发包处理ping6_receive_error_msg, ping6_parse_reply, ping6_send_probe和ping6_install_filter。
整体流程图如下:

上图图片是用Mermaid Live Editor生成,地址如下:https://mermaid.live/
源码分析
main (ping.c)
/*
 * main -- common entry point for IPv4 and IPv6 ping.
 *
 * This is an article excerpt: lines consisting of "..." mark code that the
 * article elided (option parsing, local declarations, ...).
 *
 * Visible flow: pick a default address family from the program name, create
 * the ICMP and/or ICMPv6 socket, compute the maximum payload size, resolve
 * the target with getaddrinfo() and hand each resolved address to
 * ping4_run()/ping6_run() until one of them returns a non-negative value.
 * That value is the return value of main().
 */
int
main(int argc, char **argv)
{
	...
	struct addrinfo hints = {
		.ai_family = AF_UNSPEC,		/* default: no address family forced */
		.ai_protocol = IPPROTO_UDP,
		.ai_socktype = SOCK_DGRAM,	/* default: datagram (ping) socket */
		.ai_flags = getaddrinfo_flags
	};
	...
	static struct ping_rts rts = {
		.interval = 1000,		/* default interval: 1000 ms */
		.preload = 1,
		.lingertime = MAXWAIT * 1000,
		.confirm_flag = MSG_CONFIRM,
		.tmin = LONG_MAX,
		.pipesize = -1,
		.datalen = DEFDATALEN,
		.ident = -1,
		.screen_width = INT_MAX,
#ifdef HAVE_LIBCAP
		.cap_raw = CAP_NET_RAW,
		.cap_admin = CAP_NET_ADMIN,
#endif
		.pmtudisc = -1,
		.source.sin_family = AF_INET,
		.source6.sin6_family = AF_INET6,
		.ni.query = -1,
		.ni.subject_type = -1,
	};
	...
	/* Support being called using `ping4` or `ping6` symlinks: the last
	 * character of the program name selects the default address family. */
	if (argv[0][strlen(argv[0]) - 1] == '4')
		hints.ai_family = AF_INET;
	else if (argv[0][strlen(argv[0]) - 1] == '6')
		hints.ai_family = AF_INET6;
	...
	/* Skip the options that were already parsed; what remains are the
	 * optional hops followed by the target. */
	argc -= optind;
	argv += optind;

	/* A destination is mandatory. With argc == 1 the only argument is the
	 * target; with argc > 1 every argument but the last one is a hop. */
	if (!argc)
		error(2, EDESTADDRREQ, "usage error");

	target = argv[argc - 1];	/* the target is always the last argument */

	/* Create sockets */
	enable_capability_raw();	/* needed while the sockets are created */
	if (hints.ai_family != AF_INET6) {
		/* IPv4 socket; it is requisite only when IPv4 was forced. */
		create_socket(&rts, &sock4, AF_INET, hints.ai_socktype, IPPROTO_ICMP,
			      hints.ai_family == AF_INET);
	}
	if (hints.ai_family != AF_INET) {
		/* IPv6 socket; it is requisite when no IPv4 socket exists. */
		create_socket(&rts, &sock6, AF_INET6, hints.ai_socktype, IPPROTO_ICMPV6, sock4.fd == -1);
		/* This may not be needed if both protocol versions always had the same value, but
		 * since I don't know that, it's better to be safe than sorry. */
		rts.pmtudisc = rts.pmtudisc == IP_PMTUDISC_DO ? IPV6_PMTUDISC_DO :
			       rts.pmtudisc == IP_PMTUDISC_DONT ? IPV6_PMTUDISC_DONT :
			       rts.pmtudisc == IP_PMTUDISC_WANT ? IPV6_PMTUDISC_WANT :
			       rts.pmtudisc == IP_PMTUDISC_PROBE? IPV6_PMTUDISC_PROBE: rts.pmtudisc;
	}
	disable_capability_raw();	/* socket creation done, drop it again */

	/* Limit address family on single-protocol systems: when the family is
	 * still AF_UNSPEC and only one of the two sockets could be created,
	 * use the family of the socket that exists. */
	if (hints.ai_family == AF_UNSPEC) {
		if (sock4.fd == -1)
			hints.ai_family = AF_INET6;
		else if (sock6.fd == -1)
			hints.ai_family = AF_INET;
	}

	/* Work out the maximum ICMP payload length for the address family. */
	int max_s = MAX(ICMP_MAX_DATALEN, ICMPV6_MAX_DATALEN);

	/* Detect based on -4 / -6 */
	if (hints.ai_family == AF_INET)
		max_s = ICMP_MAX_DATALEN - get_ipv4_optlen(&rts);
	else if (hints.ai_family == AF_INET6)
		max_s = ICMPV6_MAX_DATALEN;

	/* Force limit on IPv4/IPv6 addresses given literally as the target */
	if (inet_pton(AF_INET, target, buf))
		max_s = ICMP_MAX_DATALEN - get_ipv4_optlen(&rts);
	else if (inet_pton(AF_INET6, target, buf))
		max_s = ICMPV6_MAX_DATALEN;

	/* Set socket options: IPv4 TOS and IPv6 traffic class */
	if (rts.settos)
		set_socket_option(&sock4, IPPROTO_IP, IP_TOS, &rts.settos, sizeof(rts.settos));
	if (rts.tclass)
		set_socket_option(&sock6, IPPROTO_IPV6, IPV6_TCLASS, &rts.tclass, sizeof(rts.tclass));

	/* getaddrinfo fails to indicate a scopeid when not used in dual-stack mode.
	 * Work around by always using dual-stack name resolution.
	 *
	 * https://github.com/iputils/iputils/issues/252
	 */
	int target_ai_family = hints.ai_family;
	hints.ai_family = AF_UNSPEC;

	/* Warn when a link-local IPv6 literal is given without "%ifname" or
	 * "%scope-id" while the IPv6 socket is a datagram socket. */
	if (!strchr(target, '%') && sock6.socktype == SOCK_DGRAM &&
	    inet_pton(AF_INET6, target, buf) > 0 &&
	    (IN6_IS_ADDR_LINKLOCAL(buf) || IN6_IS_ADDR_MC_LINKLOCAL(buf))) {
		error(0, 0, _(
			"Warning: IPv6 link-local address on ICMP datagram socket may require ifname or scope-id"
			" => use: address%%<ifname|scope-id>"));
	}

	/* Resolve the target; a failure is fatal and reported with the
	 * getaddrinfo() error text. */
	ret_val = getaddrinfo(target, NULL, &hints, &result);
	if (ret_val)
		error(2, 0, "%s: %s", target, gai_strerror(ret_val));

	/* Walk the resolved addresses. The loop stops at the first address for
	 * which pingX_run() returns >= 0; later addresses are tried only when
	 * the run function returns a negative value. */
	for (ai = result; ai; ai = ai->ai_next) {
		if (rts.opt_verbose)
			printf("ai->ai_family: %s, ai->ai_canonname: '%s'\n",
			       str_family(ai->ai_family),
			       ai->ai_canonname ? ai->ai_canonname : "");

		/* Skip results whose family differs from the requested one. */
		if (target_ai_family != AF_UNSPEC &&
		    target_ai_family != ai->ai_family) {
			if (!ai->ai_next) {
				/* An address was found, but not of the family we really want.
				 * Throw the appropriate gai error.
				 */
				error(2, 0, "%s: %s", target, gai_strerror(EAI_ADDRFAMILY));
			}
			continue;
		}

		/* Dispatch to the per-family implementation. */
		switch (ai->ai_family) {
		case AF_INET:
			ret_val = ping4_run(&rts, argc, argv, ai, &sock4);
			break;
		case AF_INET6:
			ret_val = ping6_run(&rts, argc, argv, ai, &sock6);
			break;
		default:
			error(2, 0, _("unknown protocol family: %d"), ai->ai_family);
		}

		/* ret_val >= 0 is the final exit code. */
		if (ret_val >= 0)
			break;
		/* ret_val < 0 means to go on to next addrinfo result, there
		 * better be one. */
		assert(ai->ai_next);
	}

	freeaddrinfo(result);

	/* The exit code comes from ping4_run() or ping6_run(). */
	return ret_val;
}
create_socket (ping.c)
/*
 * create_socket -- open the ICMP socket for one address family.
 *
 * rts:       run-time state; only rts->opt_verbose is read here.
 * sock:      output; sock->fd must be -1 on entry. On return sock->fd holds
 *            the descriptor (or -1) and sock->socktype the type actually tried
 *            last (SOCK_DGRAM or SOCK_RAW).
 * family:    address family passed to socket().
 * socktype:  SOCK_DGRAM to try a ping socket first, SOCK_RAW to go straight
 *            to a raw socket.
 * protocol:  protocol passed to socket().
 * requisite: when non-zero, failure to create the socket terminates the
 *            process with exit status 2; otherwise the function just returns
 *            with sock->fd == -1.
 */
static void create_socket(struct ping_rts *rts, socket_st *sock, int family,
			  int socktype, int protocol, int requisite)
{
	int do_fallback = 0;

	errno = 0;

	assert(sock->fd == -1);
	assert(socktype == SOCK_DGRAM || socktype == SOCK_RAW);

	/* Attempt to create a ping socket if requested. Attempt to create a raw
	 * socket otherwise or as a fallback. Well known errno values follow.
	 *
	 * 1) EACCES
	 *
	 * Kernel returns EACCES for all ping socket creation attempts when the
	 * user isn't allowed to use ping socket. A range of group ids is
	 * configured using the `net.ipv4.ping_group_range` sysctl. Fallback
	 * to raw socket is necessary.
	 *
	 * Kernel returns EACCES for all raw socket creation attempts when the
	 * process doesn't have the `CAP_NET_RAW` capability.
	 *
	 * 2) EAFNOSUPPORT
	 *
	 * Kernel returns EAFNOSUPPORT for IPv6 ping or raw socket creation
	 * attempts when run with IPv6 support disabled (e.g. via `ipv6.disable=1`
	 * kernel command-line option.
	 *
	 * https://github.com/iputils/iputils/issues/32
	 *
	 * OpenVZ 2.6.32-042stab113.11 and possibly other older kernels return
	 * EAFNOSUPPORT for all IPv4 ping socket creation attempts due to lack
	 * of support in the kernel. Fallback to raw socket is necessary.
	 *
	 * https://github.com/iputils/iputils/issues/54
	 *
	 * 3) EPROTONOSUPPORT
	 *
	 * OpenVZ 2.6.32-042stab113.11 and possibly other older kernels return
	 * EPROTONOSUPPORT for all IPv6 ping socket creation attempts due to lack
	 * of support in the kernel [1]. Debian 9.5 based container with kernel 4.10
	 * returns EPROTONOSUPPORT also for IPv4 [2]. Fallback to raw socket is
	 * necessary.
	 *
	 * [1] https://github.com/iputils/iputils/issues/54
	 * [2] https://github.com/iputils/iputils/issues/129
	 */

	/* Unless a raw socket was requested explicitly, try a datagram (ping)
	 * socket first. */
	if (socktype == SOCK_DGRAM)
		sock->fd = socket(family, socktype, protocol);

	/* Kernel doesn't support ping sockets: address family unsupported
	 * (IPv4 only) or protocol unsupported. */
	if (sock->fd == -1 && errno == EAFNOSUPPORT && family == AF_INET)
		do_fallback = 1;
	if (sock->fd == -1 && errno == EPROTONOSUPPORT)
		do_fallback = 1;

	/* User is not allowed to use ping sockets. */
	if (sock->fd == -1 && errno == EACCES)
		do_fallback = 1;

	/* Raw socket: either requested explicitly or used as the fallback. */
	if (socktype == SOCK_RAW || do_fallback) {
		socktype = SOCK_RAW;
		sock->fd = socket(family, SOCK_RAW, protocol);
	}

	sock->socktype = socktype;

	/* valid socket */
	if (sock->fd != -1)
		return;

	/* failed to create socket: report the socket type and errno when the
	 * socket is required or verbose output is enabled. */
	if (requisite || rts->opt_verbose) {
		error(0, 0, "socktype: %s", str_socktype(socktype));
		error(0, errno, "socket");
	}

	if (requisite) {
		if (socktype == SOCK_RAW && geteuid() != 0)
			error(0, 0, _("=> missing cap_net_raw+p capability or setuid?"));
		exit(2);
	}
}
bind_to_device (ping.c)
static void bind_to_device(struct ping_rts *rts, int fd, in_addr_t addr)
{
int rc;
int errno_save;
enable_capability_raw();
/*绑定指定接口*/
rc = setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, rts->device,
strlen(rts->device) + 1);
errno_save = errno;
disable_capability_raw();
if (rc != -1)
return;
/*组播地址支持绑定到指定接口设备*/
if (IN_MULTICAST(ntohl(addr))) {
struct ip_mreqn imr;
memset(&imr, 0, sizeof(imr));
imr.imr_ifindex = iface_name2index(rts, fd);
if (setsockopt(fd, SOL_IP, IP_MULTICAST_IF, &imr, sizeof(imr)) == -1)
error(2, errno, "IP_MULTICAST_IF");
} else {
error(2, errno_save, "SO_BINDTODEVICE %s", rts->device);
}
}
ping4_run (ping.c)
/*
 * ping4_run -- run an IPv4 ping towards the last address in argv.
 *
 * This is an article excerpt: lines consisting of "..." mark code that the
 * article elided (local declarations and some option handling).
 *
 * rts:  run-time state, filled in with destination, source and route data.
 * argc: number of entries in argv (hops followed by the target).
 * argv: hop names/addresses; the last entry is the target.
 * ai:   getaddrinfo() result for the target, used when the target is not a
 *       literal IPv4 address.
 * sock: the IPv4 ICMP socket.
 *
 * return >= 0: exit with this code, < 0: go on to next addrinfo result
 */
int ping4_run(struct ping_rts *rts, int argc, char **argv, struct addrinfo *ai,
	      socket_st *sock)
{
	static const struct addrinfo hints = {
		.ai_family = AF_INET,
		.ai_protocol = IPPROTO_UDP,
		.ai_flags = getaddrinfo_flags
	};
	...
	if (argc > 1) {
		/* Hops cannot be combined with the record-route option. */
		if (rts->opt_rroute)
			usage();
		else if (rts->opt_timestamp) {
			/* With the timestamp option only the "prespec" type
			 * accepts hops, and argc (hops plus target) must not
			 * exceed 5. */
			if (rts->ts_type != IPOPT_TS_PRESPEC)
				usage();
			if (argc > 5)
				usage();
		} else {
			/* Source routing: argc (hops plus target) must not
			 * exceed 10. */
			if (argc > 10)
				usage();
			rts->opt_sourceroute = 1;
		}
	}

	/* Resolve every hop and finally the target. Each iteration overwrites
	 * rts->whereto and rts->hostname, so after the loop they describe the
	 * final destination. */
	while (argc > 0) {
		target = *argv;

		memset((char *)&rts->whereto, 0, sizeof(rts->whereto));
		rts->whereto.sin_family = AF_INET;
		if (inet_aton(target, &rts->whereto.sin_addr) == 1) {
			/* Literal IPv4 address: use it as the host name. When it
			 * is the only argument, switch to numeric output. */
			rts->hostname = target;
			if (argc == 1)
				rts->opt_numeric = 1;
		} else {
			struct addrinfo *result = ai;
			int ret_val;

			/* Hops (every argument but the last) are resolved here;
			 * the target itself uses the caller-supplied ai. */
			if (argc > 1) {
				ret_val = getaddrinfo(target, NULL, &hints, &result);
				if (ret_val)
					error(2, 0, "%s: %s", target, gai_strerror(ret_val));
			}

			memcpy(&rts->whereto, result->ai_addr, sizeof rts->whereto);
			memset(hnamebuf, 0, sizeof hnamebuf);

			/*
			 * On certain network setup getaddrinfo() can return empty
			 * ai_canonname. Instead of printing nothing in "PING"
			 * line use the target.
			 */
			if (result->ai_canonname)
				strncpy(hnamebuf, result->ai_canonname, sizeof hnamebuf - 1);
			else
				strncpy(hnamebuf, target, sizeof hnamebuf - 1);
			rts->hostname = hnamebuf;

			/* Only the result obtained above for a hop is owned
			 * here; the caller's ai must not be freed. */
			if (argc > 1)
				freeaddrinfo(result);
		}
		/* Every address except the last one is recorded as a hop. */
		if (argc > 1)
			rts->route[rts->nroute++] = rts->whereto.sin_addr.s_addr;

		argc--;
		argv++;
	}

	/* Source address handling */
	if (rts->source.sin_addr.s_addr == 0) {
		/* No source address is set yet (a device may still be given). */
		socklen_t alen;
		/* Throw-away UDP socket used only to let the kernel pick the
		 * source address for this destination. */
		int probe_fd = socket(AF_INET, SOCK_DGRAM, 0);

		dst = rts->whereto;

		if (probe_fd < 0)
			error(2, errno, "socket");
		/* With a device given, bind both the probe socket and the
		 * ICMP socket to it. */
		if (rts->device) {
			bind_to_device(rts, probe_fd, dst.sin_addr.s_addr);
			bind_to_device(rts, sock->fd, dst.sin_addr.s_addr);
		}

		if (rts->settos &&
		    setsockopt(probe_fd, IPPROTO_IP, IP_TOS, (char *)&rts->settos, sizeof(int)) < 0)
			error(0, errno, _("warning: QOS sockopts"));

		sock_setmark(rts, probe_fd);

		/* Destination port used for the probe connect(). */
		dst.sin_port = htons(1025);
		/* With source routing, probe towards the first hop. */
		if (rts->nroute)
			dst.sin_addr.s_addr = rts->route[0];
		/* connect() on the UDP probe socket; on success getsockname()
		 * below yields the source address chosen for dst. */
		if (connect(probe_fd, (struct sockaddr *)&dst, sizeof(dst)) == -1) {
			if (errno == EACCES) {
				/* Retried with SO_BROADCAST, but only when
				 * broadcast pings were requested. */
				if (rts->broadcast_pings == 0)
					error(2, 0,
					      _("Do you want to ping broadcast? Then -b. If not, check your local firewall rules"));
				fprintf(stderr, _("WARNING: pinging broadcast address\n"));
				if (setsockopt(probe_fd, SOL_SOCKET, SO_BROADCAST,
					       &rts->broadcast_pings, sizeof(rts->broadcast_pings)) < 0)
					error(2, errno, _("cannot set broadcasting"));
				if (connect(probe_fd, (struct sockaddr *)&dst, sizeof(dst)) == -1)
					error(2, errno, "connect");
			} else if ((errno == EHOSTUNREACH || errno == ENETUNREACH) && ai->ai_next) {
				/* Unreachable, but another addrinfo result is
				 * available: let the caller try it. */
				close(probe_fd);
				return -1;
			} else {
				error(2, errno, "connect");
			}
		}
		/* connect() succeeded: read back the selected source address. */
		alen = sizeof(rts->source);
		if (getsockname(probe_fd, (struct sockaddr *)&rts->source, &alen) == -1)
			error(2, errno, "getsockname");
		rts->source.sin_port = 0;

		/* With a device given, warn if the selected source address is
		 * not configured on that device. */
		if (rts->device) {
			struct ifaddrs *ifa0, *ifa;
			int ret;

			ret = getifaddrs(&ifa0);
			if (ret)
				error(2, errno, _("gatifaddrs failed"));
			for (ifa = ifa0; ifa; ifa = ifa->ifa_next) {
				if (!ifa->ifa_name || !ifa->ifa_addr ||
				    ifa->ifa_addr->sa_family != AF_INET)
					continue;
				if (!strcmp(ifa->ifa_name, rts->device) &&
				    !memcmp(&((struct sockaddr_in *)ifa->ifa_addr)->sin_addr,
					    &rts->source.sin_addr, sizeof(rts->source.sin_addr)))
					break;
			}
			/* Only the NULL-ness of ifa is tested after the list is
			 * freed; the pointer itself is not dereferenced. */
			freeifaddrs(ifa0);
			if (!ifa)
				error(0, 0, _("Warning: source address might be selected on device other than: %s"), rts->device);
		}
		close(probe_fd);

	} else if (rts->device) {
		/* A source address is already set: just bind to the device. */
		bind_to_device(rts, sock->fd, rts->whereto.sin_addr.s_addr);
	}

	/* A destination of 0.0.0.0 is replaced by the source address. */
	if (rts->whereto.sin_addr.s_addr == 0)
		rts->whereto.sin_addr.s_addr = rts->source.sin_addr.s_addr;
	...
	/* IP options: record route, timestamp (optionally with the prespecified
	 * hops from rts->route) and source route (hops from rts->route). */

	/* record route option */
	if (rts->opt_rroute) {
		memset(rspace, 0, sizeof(rspace));
		rspace[0] = IPOPT_NOP;
		rspace[1 + IPOPT_OPTVAL] = IPOPT_RR;
		rspace[1 + IPOPT_OLEN] = sizeof(rspace) - 1;
		rspace[1 + IPOPT_OFFSET] = IPOPT_MINOFF;
		if (setsockopt(sock->fd, IPPROTO_IP, IP_OPTIONS, rspace, sizeof rspace) < 0)
			error(2, errno, "record route");
	}
	if (rts->opt_timestamp) {
		memset(rspace, 0, sizeof(rspace));
		rspace[0] = IPOPT_TIMESTAMP;
		rspace[1] = (rts->ts_type == IPOPT_TS_TSONLY ? 40 : 36);
		rspace[2] = 5;
		rspace[3] = rts->ts_type;
		if (rts->ts_type == IPOPT_TS_PRESPEC) {
			int i;

			/* 4 header bytes plus 8 bytes per prespecified hop. */
			rspace[1] = 4 + rts->nroute * 8;
			for (i = 0; i < rts->nroute; i++) {
				tmp_rspace = (uint32_t *)&rspace[4 + i * 8];
				*tmp_rspace = rts->route[i];
			}
		}
		if (setsockopt(sock->fd, IPPROTO_IP, IP_OPTIONS, rspace, rspace[1]) < 0) {
			/* Retry once with the flag byte set to 2. */
			rspace[3] = 2;
			if (setsockopt(sock->fd, IPPROTO_IP, IP_OPTIONS, rspace, rspace[1]) < 0)
				error(2, errno, "ts option");
		}
	}
	if (rts->opt_sourceroute) {
		int i;

		memset(rspace, 0, sizeof(rspace));
		rspace[0] = IPOPT_NOOP;
		/* Strict source route when SO_DONTROUTE was requested, loose
		 * source route otherwise. */
		rspace[1 + IPOPT_OPTVAL] = rts->opt_so_dontroute ? IPOPT_SSRR : IPOPT_LSRR;
		rspace[1 + IPOPT_OLEN] = 3 + rts->nroute * 4;
		rspace[1 + IPOPT_OFFSET] = IPOPT_MINOFF;
		for (i = 0; i < rts->nroute; i++) {
			tmp_rspace = (uint32_t *)&rspace[4 + i * 4];
			*tmp_rspace = rts->route[i];
		}
		if (setsockopt(sock->fd, IPPROTO_IP, IP_OPTIONS, rspace, 4 + rts->nroute * 4) < 0)
			error(2, errno, "record route");
	}
	...
	/* The payload can carry a timestamp only if it is at least as large
	 * as a struct timeval. */
	if (rts->datalen >= (int)sizeof(struct timeval)) /* can we time transfer */
		rts->timing = 1;
	/* The receive buffer must also hold the IP and ICMP headers. */
	packlen = rts->datalen + MAXIPLEN + MAXICMPLEN;
	if (!(packet = (unsigned char *)malloc((unsigned int)packlen)))
		error(2, errno, _("memory allocation failed"));

	/* Print the "PING" banner. rts->hostname may be the ai_canonname
	 * returned by the resolver rather than the name typed on the command
	 * line (the article's example: "ping www.baidu.com" prints
	 * "PING www.a.shifen.com (39.156.70.239) 56(84) bytes of data."). */
	printf(_("PING %s (%s) "), rts->hostname, inet_ntoa(rts->whereto.sin_addr));
	if (rts->device || rts->opt_strictsource)
		printf(_("from %s %s: "), inet_ntoa(rts->source.sin_addr), rts->device ? rts->device : "");
	printf(_("%d(%d) bytes of data.\n"), rts->datalen, rts->datalen + 8 + rts->optlen + 20);

	/* Protocol independent setup and parameter checks. */
	setup(rts, sock);

	/* When a connected socket was requested (rts->opt_connect_sk),
	 * connect the ICMP socket to dst; failure is fatal. */
	if (rts->opt_connect_sk &&
	    connect(sock->fd, (struct sockaddr *)&dst, sizeof(dst)) == -1)
		error(2, errno, "connect failed");

	drop_capabilities();

	/* Send and receive until main_loop() decides to stop. */
	ret = main_loop(rts, &ping4_func_set, sock, packet, packlen);
	free(packet);

	/* The result of main_loop() is the exit code. */
	return ret;
}
ping6_run (ping6_common.c)
TODO
setup (ping_common.c)
/*
 * setup -- protocol independent setup and parameter checks.
 *
 * Validates the interval/preload combination, applies the generic socket
 * options (debug, dontroute, timestamping, mark, send/receive timeouts),
 * fills the default payload pattern, chooses the ICMP identifier for raw
 * sockets, installs the signal handlers, records the start time, arms the
 * deadline timer and reads the terminal width.
 *
 * rts:  run-time state (read and updated).
 * sock: the socket the options are applied to.
 */
void setup(struct ping_rts *rts, socket_st *sock)
{
	int hold;
	struct timeval tv;
	sigset_t sset;

	/* Flood ping without an explicit interval uses interval 0. */
	if (rts->opt_flood && !rts->opt_interval)
		rts->interval = 0;

	/* A caller with non-zero uid may not go below MIN_USER_INTERVAL_MS. */
	if (rts->uid && rts->interval < MIN_USER_INTERVAL_MS)
		error(2, 0, _("cannot flood, minimal interval for user must be >= %d ms, use -i %s (or higher)"),
		      MIN_USER_INTERVAL_MS, str_interval(MIN_USER_INTERVAL_MS));

	/* interval * preload must stay below INT_MAX. */
	if (rts->interval >= INT_MAX / rts->preload)
		error(2, 0, _("illegal preload and/or interval: %d"), rts->interval);

	hold = 1;
	/* Socket-level debugging and route bypass, when requested. */
	if (rts->opt_so_debug)
		setsockopt(sock->fd, SOL_SOCKET, SO_DEBUG, (char *)&hold, sizeof(hold));
	if (rts->opt_so_dontroute)
		setsockopt(sock->fd, SOL_SOCKET, SO_DONTROUTE, (char *)&hold, sizeof(hold));

#ifdef SO_TIMESTAMP
	if (!rts->opt_latency) {
		int on = 1;

		if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP, &on, sizeof(on)))
			error(0, 0, _("Warning: no SO_TIMESTAMP support, falling back to SIOCGSTAMP"));
	}
#endif

	sock_setmark(rts, sock->fd);

	/* Set some SNDTIMEO to prevent blocking forever
	 * on sends, when device is too slow or stalls. Just put limit
	 * of one second, or "interval", if it is less.
	 */
	tv.tv_sec = 1;
	tv.tv_usec = 0;
	if (rts->interval < 1000) {
		tv.tv_sec = 0;
		tv.tv_usec = 1000 * SCHINT(rts->interval);
	}
	setsockopt(sock->fd, SOL_SOCKET, SO_SNDTIMEO, (char *)&tv, sizeof(tv));

	/* Set RCVTIMEO to "interval". Note, it is just an optimization
	 * allowing to avoid redundant poll(). If it cannot be set,
	 * opt_flood_poll makes the main loop use poll(). */
	tv.tv_sec = SCHINT(rts->interval) / 1000;
	tv.tv_usec = 1000 * (SCHINT(rts->interval) % 1000);
	if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVTIMEO, (char *)&tv, sizeof(tv)))
		rts->opt_flood_poll = 1;

	/* Default payload: bytes 0, 1, 2, ... placed after the 8 header bytes,
	 * unless a fill pattern was already supplied. */
	if (!rts->opt_pingfilled) {
		int i;
		unsigned char *p = rts->outpack + 8;

		/* Do not forget about case of small datalen, fill timestamp area too! */
		for (i = 0; i < rts->datalen; ++i)
			*p++ = i;
	}

	/* Raw socket without an explicit identifier: use the low 16 bits of
	 * the pid as the ICMP ident. */
	if (sock->socktype == SOCK_RAW && rts->ident == -1)
		rts->ident = htons(getpid() & 0xFFFF);

	/* Signal handlers: SIGINT and SIGALRM end the run, SIGQUIT requests a
	 * status report. */
	set_signal(SIGINT, sigexit);
	set_signal(SIGALRM, sigexit);
	set_signal(SIGQUIT, sigstatus);

	/* Unblock all signals. */
	sigemptyset(&sset);
	sigprocmask(SIG_SETMASK, &sset, NULL);

	/* Remember when the run started. */
	clock_gettime(CLOCK_MONOTONIC_RAW, &rts->start_time);

	/* With a deadline, arm a one-shot real-time timer (delivers SIGALRM). */
	if (rts->deadline) {
		struct itimerval it;

		it.it_interval.tv_sec = 0;
		it.it_interval.tv_usec = 0;
		it.it_value.tv_sec = rts->deadline;
		it.it_value.tv_usec = 0;
		setitimer(ITIMER_REAL, &it, NULL);
	}

	/* When stdout is a terminal, use its column count as screen width. */
	if (isatty(STDOUT_FILENO)) {
		struct winsize w;

		if (ioctl(STDOUT_FILENO, TIOCGWINSZ, &w) != -1) {
			if (w.ws_col > 0)
				rts->screen_width = w.ws_col;
		}
	}
}
main_loop (ping_common.c)
/*
 * main_loop -- send probes and receive replies until an exit condition holds.
 *
 * rts:     run-time state (counters, options, exit flags).
 * fset:    per-address-family callbacks (send_probe is used via pinger(),
 *          receive_error_msg, parse_reply and install_filter are used here).
 * sock:    the socket used for sending and receiving.
 * packet:  receive buffer.
 * packlen: size of the receive buffer in bytes.
 *
 * Returns the value of finish(rts).
 */
int main_loop(struct ping_rts *rts, ping_func_set_st *fset, socket_st *sock,
uint8_t *packet, int packlen)
{
char addrbuf[128];
char ans_data[4096];
struct iovec iov;
struct msghdr msg;
int cc;
int next;
int polling;
int recv_error;
iov.iov_base = (char *)packet;
for (;;) {
/* Check exit conditions. */
/* The exiting flag is set: leave the loop. */
if (rts->exiting)
break;
/* A packet count is configured and received + errors has reached it. */
if (rts->npackets && rts->nreceived + rts->nerrors >= rts->npackets)
break;
/* A deadline is configured and at least one error was counted. */
if (rts->deadline && rts->nerrors)
break;
/* Check for and do special actions. */
/* A status snapshot was requested: print the statistics so far without
 * stopping the run. */
if (rts->status_snapshot)
status(rts);
/* Send probes scheduled to this time. */
/* Keep sending while the scheduler says the next probe is already due
 * (next <= 0). */
do {
next = pinger(rts, fset, sock);
next = schedule_exit(rts, next);
} while (next <= 0);
/* "next" is time to send next probe, if positive.
* If next<=0 send now or as soon as possible. */
/* Technical part. Looks wicked. Could be dropped,
* if everyone used the newest kernel. :-)
* Its purpose is:
* 1. Provide intervals less than resolution of scheduler.
* Solution: spinning.
* 2. Avoid use of poll(), when recvmsg() can provide
* timed waiting (SO_RCVTIMEO). */
polling = 0; /* flags for the first recvmsg(): 0 means a blocking call */
recv_error = 0;
/* Taken in adaptive mode, when SO_RCVTIMEO could not be set
 * (opt_flood_poll), or when the next send is due sooner than one
 * interval. */
if (rts->opt_adaptive || rts->opt_flood_poll || next < SCHINT(rts->interval)) {
/* Number of probes sent but not yet answered. */
int recv_expected = in_flight(rts);
/* If we are here, recvmsg() is unable to wait for
* required timeout. */
if (1000 % HZ == 0 ? next <= 1000 / HZ : (next < INT_MAX / HZ && next * HZ <= 1000)) {
/* Very short timeout... So, if we wait for
* something, we sleep for MIN_INTERVAL_MS.
* Otherwise, spin! */
if (recv_expected) {
/* Replies are outstanding: wait MIN_INTERVAL_MS. */
next = MIN_INTERVAL_MS;
} else {
/* Nothing outstanding: spin. */
next = 0;
/* When spinning, no reasons to poll.
* Use nonblocking recvmsg() instead. */
polling = MSG_DONTWAIT;
/* But yield yet. */
sched_yield();
}
}
if (!polling &&
(rts->opt_adaptive || rts->opt_flood_poll || rts->interval)) {
/* Not spinning: wait for input with poll(), using next as the timeout. */
struct pollfd pset;
pset.fd = sock->fd;
pset.events = POLLIN;
pset.revents = 0;
/* poll() < 1 covers both failure and timeout; in those cases, or when
 * neither POLLIN nor POLLERR is reported, go back to sending. */
if (poll(&pset, 1, next) < 1 ||
!(pset.revents & (POLLIN | POLLERR)))
continue;
/* Input or an error is pending: read without blocking and remember
 * whether POLLERR was reported. */
polling = MSG_DONTWAIT;
recv_error = pset.revents & POLLERR;
}
}
for (;;) {
struct timeval *recv_timep = NULL;
struct timeval recv_time;
int not_ours = 0; /* Raw socket can receive messages
* destined to other running pings. */
iov.iov_len = packlen;
memset(&msg, 0, sizeof(msg));
msg.msg_name = addrbuf;
msg.msg_namelen = sizeof(addrbuf);
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
msg.msg_control = ans_data;
msg.msg_controllen = sizeof(ans_data);
cc = recvmsg(sock->fd, &msg, polling);
/* Every further recvmsg() in this inner loop is non-blocking. */
polling = MSG_DONTWAIT;
/* A negative return is a failure; errno and recv_error decide how it
 * is handled. */
if (cc < 0) {
/* If there was a POLLERR and there is no packet
* on the socket, try to read the error queue.
* Otherwise, give up.
*/
/* EAGAIN without a pending POLLERR, or EINTR: back to sending. */
if ((errno == EAGAIN && !recv_error) ||
errno == EINTR)
break;
/* Anything else: let the address-family handler process the error. */
recv_error = 0;
if (!fset->receive_error_msg(rts, sock)) {
if (errno) {
error(0, errno, "recvmsg");
break;
}
not_ours = 1;
}
} else {
#ifdef SO_TIMESTAMP
struct cmsghdr *c;
/* Look for an SO_TIMESTAMP control message carrying the receive time. */
for (c = CMSG_FIRSTHDR(&msg); c; c = CMSG_NXTHDR(&msg, c)) {
if (c->cmsg_level != SOL_SOCKET ||
c->cmsg_type != SO_TIMESTAMP)
continue;
if (c->cmsg_len < CMSG_LEN(sizeof(struct timeval)))
continue;
recv_timep = (struct timeval *)CMSG_DATA(c);
}
#endif
if (rts->opt_latency || recv_timep == NULL) {
/* No usable timestamp (or opt_latency is set): try SIOCGSTAMP, and
 * fall back to the current time of day. */
if (rts->opt_latency ||
ioctl(sock->fd, SIOCGSTAMP, &recv_time))
gettimeofday(&recv_time, NULL);
recv_timep = &recv_time;
}
/* Hand the packet to the address-family parser; a non-zero result means
 * the packet was not meant for this process. */
not_ours = fset->parse_reply(rts, sock, &msg, cc, addrbuf, recv_timep);
}
/* See? ... someone runs another ping on this host. */
/* On a raw socket, install a filter once foreign packets are seen. */
if (not_ours && sock->socktype == SOCK_RAW)
fset->install_filter(rts, sock);
/* If nothing is in flight, "break" returns us to pinger. */
if (in_flight(rts) == 0)
break;
/* Otherwise, try to recvmsg() again. recvmsg()
* is nonblocking after the first iteration, so that
* if nothing is queued, it will receive EAGAIN
* and return to pinger. */
}
}
/* Loop left: the result of finish() is the return value. */
return finish(rts);
}
pinger (ping_common.c)
/*
 * pinger --
 * Compose and transmit an ICMP ECHO REQUEST packet. The IP packet
 * will be added on by the kernel. The ID field is our UNIX process ID,
 * and the sequence number is an ascending integer. The first several bytes
 * of the data portion are used to hold a UNIX "timeval" struct in VAX
 * byte-order, to compute the round-trip time.
 *
 * The actual transmission is delegated to fset->send_probe(). The return
 * value is a time in milliseconds; main_loop() uses it as the delay until
 * the next probe (values <= 0 mean "send again now").
 */
int pinger(struct ping_rts *rts, ping_func_set_st *fset, socket_st *sock)
{
static int oom_count;
static int tokens;
int i;
/* Have we already sent enough? If we have, return an arbitrary positive value. */
if (rts->exiting || (rts->npackets && rts->ntransmitted >= rts->npackets && !rts->deadline))
return 1000;
/* Pacing: tokens are measured in milliseconds, one interval's worth of
 * tokens is spent per probe. */
/* Check that packets < rate*time + preload */
if (rts->cur_time.tv_sec == 0 && rts->cur_time.tv_nsec == 0) {
/* First call: record the time and start with enough tokens for the
 * (preload - 1) extra probes. */
clock_gettime(CLOCK_MONOTONIC_RAW, &rts->cur_time);
tokens = rts->interval * (rts->preload - 1);
} else {
long ntokens, tmp;
struct timespec tv;
/* Later calls: the milliseconds elapsed since cur_time are the newly
 * earned tokens. */
clock_gettime(CLOCK_MONOTONIC_RAW, &tv);
ntokens = (tv.tv_sec - rts->cur_time.tv_sec) * 1000 +
(tv.tv_nsec - rts->cur_time.tv_nsec) / 1000000;
if (!rts->interval) {
/* Case of unlimited flood is special;
* if we see no reply, they are limited to 100pps */
/* Interval 0: when at least preload probes are unanswered, wait until
 * MIN_INTERVAL_MS has elapsed since the last send. */
if (ntokens < MIN_INTERVAL_MS && in_flight(rts) >= rts->preload)
return MIN_INTERVAL_MS - ntokens;
}
ntokens += tokens; /* add the tokens carried over from earlier calls */
/* Cap the balance at interval * preload. If less than one interval's
 * worth is available, return the remaining wait time. Otherwise record
 * the time and spend one interval; the rest is carried over. */
tmp = (long)rts->interval * (long)rts->preload;
if (tmp < ntokens)
ntokens = tmp;
if (ntokens < rts->interval)
return rts->interval - ntokens;
rts->cur_time = tv;
tokens = ntokens - rts->interval;
}
/* With opt_outstanding set, report when the previously sent probe has
 * not been answered yet. */
if (rts->opt_outstanding) {
if (rts->ntransmitted > 0 && !rcvd_test(rts, rts->ntransmitted)) {
print_timestamp(rts);
printf(_("no answer yet for icmp_seq=%lu\n"), (rts->ntransmitted % MAX_DUP_CHK));
fflush(stdout);
}
}
/* Transmit through the address-family specific send_probe(). */
resend:
i = fset->send_probe(rts, sock, rts->outpack, sizeof(rts->outpack));
/* 0 means the probe was sent. */
if (i == 0) {
oom_count = 0;
advance_ntransmitted(rts); /* count the transmitted probe */
if (!rts->opt_quiet && rts->opt_flood) {
/* Very silly, but without this output with
* high preload or pipe size is very confusing. */
/* Flood mode prints a "." per probe sent. */
if ((rts->preload < rts->screen_width && rts->pipesize < rts->screen_width) ||
in_flight(rts) < rts->screen_width)
write_stdout(".", 1);
}
/* Delay until the next probe, reduced by the tokens still available. */
return rts->interval - tokens;
}
/* And handle various errors... */
if (i > 0) {
/* Apparently, it is some fatal bug. */
abort();
} else if (errno == ENOBUFS || errno == ENOMEM) {
int nores_interval;
/* Device queue overflow or OOM. Packet is not sent. */
tokens = 0;
/* Slowdown. This works only in adaptive mode (option -A) */
rts->rtt_addend += (rts->rtt < 8 * 50000 ? rts->rtt / 8 : 50000);
if (rts->opt_adaptive)
update_interval(rts);
nores_interval = SCHINT(rts->interval / 2);
if (nores_interval > 500)
nores_interval = 500;
oom_count++;
/* Keep retrying until the accumulated wait reaches lingertime. */
if (oom_count * nores_interval < rts->lingertime)
return nores_interval;
i = 0;
/* Fall to hard error. It is to avoid complete deadlock
* on stuck output device even when dealine was not requested.
* Expected timings are screwed up in any case, but we will
* exit some day. :-) */
} else if (errno == EAGAIN) {
/* Socket buffer is full. */
tokens += rts->interval;
return MIN_INTERVAL_MS;
} else if (errno == EMSGSIZE) {
/* For example, sendto with len > 65527 on SOCK_DGRAM fails with this errno. */
rts->nerrors++;
i = 0;
} else {
/* Let the address-family handler read the pending error. */
if ((i = fset->receive_error_msg(rts, sock)) > 0) {
/* An ICMP error arrived. In this case, we've received
* an error from sendto(), but we've also received an
* ICMP message, which means the packet did in fact
* send in some capacity. So, in this odd case, report
* the more specific errno as the error, and treat this
* as a hard local error. */
i = 0;
goto hard_local_error;
}
/* Compatibility with old linuces. */
if (i == 0 && rts->confirm_flag && errno == EINVAL) {
rts->confirm_flag = 0;
errno = 0;
}
/* errno cleared: try the send again. */
if (!errno)
goto resend;
}
hard_local_error:
/* Hard local error. Pretend we sent packet. */
advance_ntransmitted(rts);
if (i == 0 && !rts->opt_quiet) {
if (rts->opt_flood)
write_stdout("E", 1);
else
error(0, errno, "sendmsg");
}
tokens = 0;
return SCHINT(rts->interval);
}
ping4_send_probe (ping.c)
/*
 * ping4_send_probe --
 *	Build one ICMP ECHO REQUEST in the caller's buffer and transmit it
 *	to rts->whereto with sendto().
 *
 * The sequence number is rts->ntransmitted + 1 (network byte order) and the
 * identifier is rts->ident. When rts->timing is set, the first bytes of the
 * payload carry a struct timeval taken with gettimeofday().
 *
 * Returns 0 when sendto() reports that the whole packet (rts->datalen + 8
 * bytes) was sent, otherwise the value returned by sendto().
 */
int ping4_send_probe(struct ping_rts *rts, socket_st *sock, void *packet,
		     unsigned packet_size __attribute__((__unused__)))
{
	struct icmphdr *echo = (struct icmphdr *)packet;
	int wire_len;
	int sent;

	/* ICMP header: type, code, sequence and identifier. */
	echo->type = ICMP_ECHO;
	echo->code = 0;
	echo->checksum = 0;
	echo->un.echo.sequence = htons(rts->ntransmitted + 1);
	echo->un.echo.id = rts->ident;			/* ID */

	rcvd_clear(rts, rts->ntransmitted + 1);

	/* Timestamp area: real time in latency mode, zeroes otherwise (the
	 * real time is filled in after the checksum below). */
	if (rts->timing && rts->opt_latency) {
		struct timeval now;

		gettimeofday(&now, NULL);
		memcpy(echo + 1, &now, sizeof(now));
	} else if (rts->timing) {
		memset(echo + 1, 0, sizeof(struct timeval));
	}

	wire_len = rts->datalen + 8;			/* skips ICMP portion */

	/* compute ICMP checksum here */
	echo->checksum = in_cksum((unsigned short *)echo, wire_len, 0);

	if (rts->timing && !rts->opt_latency) {
		/* Store the timestamp as late as possible and fold it into
		 * the checksum computed over the zeroed area. */
		struct timeval now;

		gettimeofday(&now, NULL);
		memcpy(echo + 1, &now, sizeof(now));
		echo->checksum = in_cksum((unsigned short *)&now, sizeof(now), ~echo->checksum);
	}

	sent = sendto(sock->fd, echo, wire_len, 0,
		      (struct sockaddr *)&rts->whereto, sizeof(rts->whereto));

	if (sent == wire_len)
		return 0;
	return sent;
}
ping6_send_probe (ping6_common.c)
TODO
__schedule_exit (ping_common.c)
/*
 * __schedule_exit -- arm the one-shot timer that ends the run.
 *
 * On the first call a wait time in microseconds is chosen: twice tmax (but
 * at least 1000 * interval) when something was received, lingertime * 1000
 * otherwise. ITIMER_REAL is armed with it, and next is raised to the wait
 * time in milliseconds when it is negative or smaller. On every later call
 * (wait time already set) next is returned unchanged.
 */
int __schedule_exit(int next)
{
	static unsigned long wait_us;
	struct itimerval timer;

	/* Timer already armed by an earlier call. */
	if (wait_us)
		return next;

	if (!global_rts->nreceived) {
		wait_us = global_rts->lingertime * 1000;
	} else {
		wait_us = 2 * global_rts->tmax;
		if (wait_us < 1000 * (unsigned long)global_rts->interval)
			wait_us = 1000 * global_rts->interval;
	}

	/* Never come back before the timer can fire. */
	if (next < 0 || (unsigned long)next < wait_us / 1000)
		next = wait_us / 1000;

	timer.it_value.tv_sec = wait_us / 1000000;
	timer.it_value.tv_usec = wait_us % 1000000;
	timer.it_interval.tv_sec = 0;
	timer.it_interval.tv_usec = 0;
	setitimer(ITIMER_REAL, &timer, NULL);

	return next;
}
/*
 * schedule_exit (implemented in ping.h) -- once a packet count is configured
 * and the number of transmitted packets has reached it, and no deadline is
 * set, let __schedule_exit() adjust next; otherwise return next unchanged.
 */
static inline int schedule_exit(struct ping_rts *rts, int next)
{
	int count_reached = rts->npackets && rts->ntransmitted >= rts->npackets;

	if (!count_reached || rts->deadline)
		return next;

	return __schedule_exit(next);
}
ping4_receive_error_msg (ping.c)
/*
 * ping4_receive_error_msg --
 *	Read one entry from the socket's error queue (MSG_ERRQUEUE) and
 *	account for / report it.
 *
 * rts:  run-time state; nerrors is incremented for reported errors.
 * sock: socket wrapper; sock->fd is read from, sock->socktype selects
 *       whether the raw-socket ICMP filter is (re)installed.
 *
 * Returns the number of network (ICMP-originated) errors handled if that
 * is non-zero, otherwise minus the number of local errors (so 0 when
 * nothing was counted).  errno is restored to its value on entry, except
 * that it is cleared when the queued error turns out not to be ours.
 *
 * Error-queue semantics: https://www.man7.org/linux/man-pages/man7/ip.7.html
 */
int ping4_receive_error_msg(struct ping_rts *rts, socket_st *sock)
{
	ssize_t res;
	char cbuf[512];
	struct iovec iov;
	struct msghdr msg;
	struct cmsghdr *cmsgh;
	struct sock_extended_err *e;
	struct icmphdr icmph;
	struct sockaddr_in target;
	int net_errors = 0;
	int local_errors = 0;
	int saved_errno = errno;

	/* Payload of the queued message lands in icmph, its address in target. */
	iov.iov_base = &icmph;
	iov.iov_len = sizeof(icmph);
	msg.msg_name = (void *)&target;
	msg.msg_namelen = sizeof(target);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_flags = 0;
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof(cbuf);

	/* Non-blocking read of the error queue. */
	res = recvmsg(sock->fd, &msg, MSG_ERRQUEUE | MSG_DONTWAIT);
	if (res < 0) {
		if (errno == EAGAIN || errno == EINTR)
			local_errors++;
		goto out;
	}

	/* Locate the extended error record in the ancillary data. */
	e = NULL;
	for (cmsgh = CMSG_FIRSTHDR(&msg); cmsgh; cmsgh = CMSG_NXTHDR(&msg, cmsgh)) {
		if (cmsgh->cmsg_level == SOL_IP) {
			if (cmsgh->cmsg_type == IP_RECVERR)
				e = (struct sock_extended_err *)CMSG_DATA(cmsgh);
		}
	}
	if (e == NULL)
		abort();

	if (e->ee_origin == SO_EE_ORIGIN_LOCAL) {
		/* Error generated on this host. */
		local_errors++;
		if (rts->opt_quiet)
			goto out;
		if (rts->opt_flood)
			write_stdout("E", 1);
		else if (e->ee_errno != EMSGSIZE)
			error(0, e->ee_errno, _("local error"));
		else
			error(0, 0, _("local error: message too long, mtu=%u"), e->ee_info);
		rts->nerrors++;
	} else if (e->ee_origin == SO_EE_ORIGIN_ICMP) {
		/* The sockaddr stored right after the error record is printed as the "From" address. */
		struct sockaddr_in *sin = (struct sockaddr_in *)(e + 1);

		if (res < (ssize_t) sizeof(icmph) ||
		    target.sin_addr.s_addr != rts->whereto.sin_addr.s_addr ||
		    icmph.type != ICMP_ECHO ||
		    !is_ours(rts, sock, icmph.un.echo.id)) {
			/* Not our error, not an error at all. Clear. */
			saved_errno = 0;
			goto out;
		}

		acknowledge(rts, ntohs(icmph.un.echo.sequence));

		if (sock->socktype == SOCK_RAW) {
			struct icmp_filter filt;

			/*
			 * Mask with every bit set except source quench, redirect
			 * and echo reply.
			 * NOTE(review): per raw(7) a set bit presumably means the
			 * type is filtered out -- confirm.
			 */
			filt.data = ~((1 << ICMP_SOURCE_QUENCH) |
				      (1 << ICMP_REDIRECT) |
				      (1 << ICMP_ECHOREPLY));
			if (setsockopt(sock->fd, SOL_RAW, ICMP_FILTER, (const void *)&filt,
				       sizeof(filt)) == -1)
				error(2, errno, "setsockopt(ICMP_FILTER)");
		}

		net_errors++;
		rts->nerrors++;
		if (rts->opt_quiet)
			goto out;
		if (rts->opt_flood) {
			write_stdout("\bE", 2);
		} else {
			print_timestamp(rts);
			printf(_("From %s icmp_seq=%u "), pr_addr(rts, sin, sizeof *sin), ntohs(icmph.un.echo.sequence));
			pr_icmph(rts, e->ee_type, e->ee_code, e->ee_info, NULL);
			fflush(stdout);
		}
	}

out:
	errno = saved_errno;
	return net_errors ? net_errors : -local_errors;
}
ping6_receive_error_msg (ping6_common.c)
TODO
ping4_parse_reply (ping.c)
/*
 * ping4_parse_reply --
 *	Examine one received IPv4/ICMP message and either account for it as
 *	an echo reply or report it as some other ICMP message.
 *
 * rts:  run-time state (target address, option flags, counters).
 * sock: socket wrapper; sock->socktype tells whether the buffer starts
 *       with an IP header (SOCK_RAW) or directly with the ICMP header.
 * msg:  received message; the packet is read from the first iovec and,
 *       for non-raw sockets, TTL/options are taken from ancillary data.
 * cc:   number of bytes received.
 * addr: sender address, used as struct sockaddr_in.
 * tv:   receive timestamp, forwarded to gather_statistics().
 *
 * Returns 1 on the paths where the packet is dropped as too short or not
 * ours, and after printing a redirect/source-quench notice; returns 0 on
 * all other paths.
 * NOTE(review): the caller's interpretation of the return value is not
 * visible here -- confirm before relying on it.
 */
int ping4_parse_reply(struct ping_rts *rts, struct socket_st *sock,
struct msghdr *msg, int cc, void *addr,
struct timeval *tv)
{
struct sockaddr_in *from = addr;
uint8_t *buf = msg->msg_iov->iov_base;
struct icmphdr *icp;
struct iphdr *ip;
int hlen;
int csfailed;
struct cmsghdr *cmsgh;
int reply_ttl;
uint8_t *opts, *tmp_ttl;
int olen;
int wrong_source = 0;
/* Validate the IP header first, then the ICMP header. */
/* Check the IP header */
ip = (struct iphdr *)buf;
if (sock->socktype == SOCK_RAW) {
/* Raw socket: the buffer starts with the IP header; ihl is in 4-byte units. */
hlen = ip->ihl * 4;
if (cc < hlen + 8 || ip->ihl < 5) {
if (rts->opt_verbose)
error(0, 0, _("packet too short (%d bytes) from %s"), cc,
pr_addr(rts,from, sizeof *from));
return 1;
}
reply_ttl = ip->ttl;
/* IP options, if any, follow the fixed-size part of the header. */
opts = buf + sizeof(struct iphdr);
olen = hlen - sizeof(struct iphdr);
} else {
/* Non-raw socket: no IP header in the buffer; TTL and options come from control messages. */
hlen = 0;
reply_ttl = 0;
opts = buf;
olen = 0;
for (cmsgh = CMSG_FIRSTHDR(msg); cmsgh; cmsgh = CMSG_NXTHDR(msg, cmsgh)) {
if (cmsgh->cmsg_level != SOL_IP)
continue;
if (cmsgh->cmsg_type == IP_TTL) {
if (cmsgh->cmsg_len < sizeof(int))
continue;
/* Only the first byte of the control data is used as the TTL. */
tmp_ttl = (uint8_t *)CMSG_DATA(cmsgh);
reply_ttl = (int)*tmp_ttl;
} else if (cmsgh->cmsg_type == IP_RETOPTS) {
opts = (uint8_t *)CMSG_DATA(cmsgh);
/* NOTE(review): cmsg_len presumably includes the cmsg header size -- confirm this is the intended option length. */
olen = cmsgh->cmsg_len;
}
}
}
/* Now the ICMP part */
cc -= hlen;
icp = (struct icmphdr *)(buf + hlen);
/* Non-zero when the checksum over the ICMP message does not verify. */
csfailed = in_cksum((unsigned short *)icp, cc, 0);
if (icp->type == ICMP_ECHOREPLY) {
if (!is_ours(rts, sock, icp->un.echo.id))
return 1; /* 'Twas not our ECHO */
/* For unicast pings, flag replies that come from an address other than the target. */
if (!rts->broadcast_pings && !rts->multicast &&
from->sin_addr.s_addr != rts->whereto.sin_addr.s_addr)
wrong_source = 1;
/* A non-zero result from gather_statistics() ends processing here; otherwise fall through to the common tail below. */
if (gather_statistics(rts, (uint8_t *)icp, sizeof(*icp), cc,
ntohs(icp->un.echo.sequence),
reply_ttl, csfailed, tv, pr_addr(rts, from, sizeof *from),
pr_echo_reply, rts->multicast, wrong_source)) {
fflush(stdout);
return 0;
}
} else {
/* We fall here when a redirect or source quench arrived. */
switch (icp->type) {
case ICMP_ECHO:
/* MUST NOT */
return 1;
case ICMP_SOURCE_QUENCH:
case ICMP_REDIRECT:
case ICMP_DEST_UNREACH:
case ICMP_TIME_EXCEEDED:
case ICMP_PARAMETERPROB:
{
/* These messages embed the IP header and the leading bytes of the offending packet after the ICMP header. */
/* NOTE(review): iph->ihl is read here to compute icp1 before the length checks below run. */
struct iphdr *iph = (struct iphdr *)(&icp[1]);
struct icmphdr *icp1 = (struct icmphdr *)
((unsigned char *)iph + iph->ihl * 4);
int error_pkt;
/* Need the outer ICMP header, the embedded IP header and 8 bytes of the embedded ICMP header. */
if (cc < (int)(8 + sizeof(struct iphdr) + 8) ||
cc < 8 + iph->ihl * 4 + 8)
return 1;
/* Ignore unless the embedded packet is an echo request of ours sent to the target. */
if (icp1->type != ICMP_ECHO ||
iph->daddr != rts->whereto.sin_addr.s_addr ||
!is_ours(rts, sock, icp1->un.echo.id))
return 1;
/* Everything except redirect and source quench is treated as an error packet. */
error_pkt = (icp->type != ICMP_REDIRECT &&
icp->type != ICMP_SOURCE_QUENCH);
if (error_pkt) {
acknowledge(rts, ntohs(icp1->un.echo.sequence));
return 0;
}
if (rts->opt_quiet || rts->opt_flood)
return 1;
print_timestamp(rts);
printf(_("From %s: icmp_seq=%u "), pr_addr(rts, from, sizeof *from),
ntohs(icp1->un.echo.sequence));
if (csfailed)
printf(_("(BAD CHECKSUM)"));
pr_icmph(rts, icp->type, icp->code, ntohl(icp->un.gateway), icp);
return 1;
}
default:
/* MUST NOT */
break;
}
/* Only ICMP types not handled by the switch reach this point. */
if (rts->opt_flood && !(rts->opt_verbose || rts->opt_quiet)) {
if (!csfailed)
write_stdout("!E", 2);
else
write_stdout("!EC", 3);
return 0;
}
/* Details are printed only in verbose mode and only when rts->uid is zero. */
if (!rts->opt_verbose || rts->uid)
return 0;
if (rts->opt_ptimeofday) {
struct timeval recv_time;
gettimeofday(&recv_time, NULL);
printf("%lu.%06lu ", (unsigned long)recv_time.tv_sec, (unsigned long)recv_time.tv_usec);
}
printf(_("From %s: "), pr_addr(rts, from, sizeof *from));
if (csfailed) {
printf(_("(BAD CHECKSUM)\n"));
return 0;
}
pr_icmph(rts, icp->type, icp->code, ntohl(icp->un.gateway), icp);
return 0;
}
/* Common tail, reached only for echo replies for which gather_statistics() returned 0. */
if (rts->opt_audible) {
putchar('\a');
if (rts->opt_flood)
fflush(stdout);
}
if (!rts->opt_flood) {
pr_options(rts, opts, olen + sizeof(struct iphdr));
putchar('\n');
fflush(stdout);
}
return 0;
}
ping6_parse_reply (ping6_common.c)
TODO
ping4_install_filter (ping.c)
/*
 * ping4_install_filter --
 *	Attach a small BPF socket filter to sock->fd, at most once per
 *	process (guarded by a function-static flag).
 *
 * The program accepts any packet whose ICMP echo identifier equals
 * htons(rts->ident); of the remaining packets it rejects echo replies
 * and accepts everything else.  A failing setsockopt() only produces a
 * warning.
 */
void ping4_install_filter(struct ping_rts *rts, socket_st *sock)
{
	static int once;

	if (once)
		return;
	once = 1;

	/* The current identifier is placed directly into the comparison. */
	struct sock_filter insns[] = {
		/* Skip IP header due BSD, see ping6. */
		BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0),
		/* Load icmp echo ident */
		BPF_STMT(BPF_LD | BPF_H | BPF_IND, 4),
		/* Ours? */
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, htons(rts->ident), 0, 1),
		/* Yes, it passes. */
		BPF_STMT(BPF_RET | BPF_K, ~0U),
		/* Load icmp type */
		BPF_STMT(BPF_LD | BPF_B | BPF_IND, 0),
		/* Echo? */
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, ICMP_ECHOREPLY, 1, 0),
		/* No. It passes. */
		BPF_STMT(BPF_RET | BPF_K, 0xFFFFFFF),
		/* Echo with wrong ident. Reject. */
		BPF_STMT(BPF_RET | BPF_K, 0)
	};
	struct sock_fprog filter = {
		sizeof insns / sizeof(insns[0]),
		insns
	};

	if (setsockopt(sock->fd, SOL_SOCKET, SO_ATTACH_FILTER, &filter, sizeof(filter)))
		error(0, errno, _("WARNING: failed to install socket filter"));
}
ping6_install_filter (ping6_common.c)
TODO
finish (ping_common.c)
/*
* finish --
* Print out statistics, and give up.
*/
int finish(struct ping_rts *rts)
{
struct timespec tv = rts->cur_time;
char *comma = "";
tssub(&tv, &rts->start_time);
/*打印ping统计结果信息*/
putchar('\n');
fflush(stdout);
printf(_("--- %s ping statistics ---\n"), rts->hostname);
printf(_("%ld packets transmitted, "), rts->ntransmitted);
printf(_("%ld received"), rts->nreceived);
if (rts->nrepeats)
printf(_(", +%ld duplicates"), rts->nrepeats);
if (rts->nchecksum)
printf(_(", +%ld corrupted"), rts->nchecksum);
if (rts->nerrors)
printf(_(", +%ld errors"), rts->nerrors);
if (rts->ntransmitted) {
#ifdef USE_IDN
setlocale(LC_ALL, "C");
#endif
/*打印丢包数据统计百分比*/
printf(_(", %g%% packet loss"),
(float)((((long long)(rts->ntransmitted - rts->nreceived)) * 100.0) / rts->ntransmitted));
printf(_(", time %llums"), (unsigned long long)(1000 * tv.tv_sec + (tv.tv_nsec + 500000) / 1000000));
}
putchar('\n');
if (rts->nreceived && rts->timing) {
double tmdev;
long total = rts->nreceived + rts->nrepeats;
long tmavg = rts->tsum / total;
long long tmvar;
/*计算时间统计信息*/
if (rts->tsum < INT_MAX)
/* This slightly clumsy computation order is important to avoid
* integer rounding errors for small ping times. */
tmvar = (rts->tsum2 - ((rts->tsum * rts->tsum) / total)) / total;
else
tmvar = (rts->tsum2 / total) - (tmavg * tmavg);
tmdev = llsqrt(tmvar);
printf(_("rtt min/avg/max/mdev = %ld.%03ld/%lu.%03ld/%ld.%03ld/%ld.%03ld ms"),
(long)rts->tmin / 1000, (long)rts->tmin % 1000,
(unsigned long)(tmavg / 1000), (long)(tmavg % 1000),
(long)rts->tmax / 1000, (long)rts->tmax % 1000,
(long)tmdev / 1000, (long)tmdev % 1000);
comma = ", ";
}
if (rts->pipesize > 1) {
printf(_("%spipe %d"), comma, rts->pipesize);
comma = ", ";
}
/*打印rtt信息*/
if (rts->nreceived && (!rts->interval || rts->opt_flood || rts->opt_adaptive) && rts->ntransmitted > 1) {
int ipg = (1000000 * (long long)tv.tv_sec + tv.tv_nsec / 1000) / (rts->ntransmitted - 1);
printf(_("%sipg/ewma %d.%03d/%d.%03d ms"),
comma, ipg / 1000, ipg % 1000, rts->rtt / 8000, (rts->rtt / 8) % 1000);
}
putchar('\n');
return (!rts->nreceived || (rts->deadline && rts->nreceived < rts->npackets));
}
status (ping_common.c)
/*
 * status --
 *	Print a one-line progress summary to stderr and clear
 *	rts->status_snapshot.
 *
 * Shows received/transmitted counts and the integer loss percentage;
 * when replies were received and timing is enabled it also shows
 * min/avg/ewma/max.  This is a shorter report than finish() prints.
 */
void status(struct ping_rts *rts)
{
	int loss = 0;
	long tavg = 0;

	rts->status_snapshot = 0;

	if (rts->ntransmitted)
		loss = (((long long)(rts->ntransmitted - rts->nreceived)) * 100) / rts->ntransmitted;

	fprintf(stderr, "\r");
	fprintf(stderr, _("%ld/%ld packets, %d%% loss"), rts->nreceived, rts->ntransmitted, loss);

	if (rts->nreceived && rts->timing) {
		tavg = rts->tsum / (rts->nreceived + rts->nrepeats);
		/*
		 * The average is printed with %lu.%03ld, so cast the arguments to
		 * the exact types the format expects (as finish() does).
		 * rts->rtt is kept scaled by 8, hence the /8000 and /8.
		 */
		fprintf(stderr, _(", min/avg/ewma/max = %ld.%03ld/%lu.%03ld/%d.%03d/%ld.%03ld ms"),
			(long)rts->tmin / 1000, (long)rts->tmin % 1000,
			(unsigned long)(tavg / 1000), (long)(tavg % 1000),
			rts->rtt / 8000, (rts->rtt / 8) % 1000,
			(long)rts->tmax / 1000, (long)rts->tmax % 1000);
	}
	fprintf(stderr, "\n");
}
gather_statistics (ping_common.c)
/*
 * gather_statistics --
 *	Account for one received echo reply (counters, round-trip time
 *	statistics, duplicate tracking) and, unless quiet, print the
 *	per-reply line.
 *
 * rts:          run-time state holding counters, options and the sent packet.
 * icmph:        start of the received ICMP header.
 * icmplen:      size of that header; the payload is read from icmph + icmplen.
 * cc:           length of the ICMP message in bytes.
 * seq:          echo sequence number, host byte order.
 * hops:         printed as "ttl=" when non-negative.
 * csfailed:     non-zero when the checksum did not verify.
 * tv:           receive time; modified in place by tvsub() (and by
 *               gettimeofday() on the clock-went-backwards path).
 * from:         printable sender address.
 * pr_reply:     optional callback that prints protocol-specific details.
 * multicast:    when set, "(DUP!)" is printed only in verbose mode.
 * wrong_source: when set, "(DIFFERENT ADDRESS!)" is printed.
 *
 * Returns 1 in quiet mode and for truncated replies, 0 otherwise.
 */
int gather_statistics(struct ping_rts *rts, uint8_t *icmph, int icmplen,
int cc, uint16_t seq, int hops,
int csfailed, struct timeval *tv, char *from,
void (*pr_reply)(uint8_t *icmph, int cc), int multicast,
int wrong_source)
{
int dupflag = 0;
long triptime = 0;
uint8_t *ptr = icmph + icmplen;
/* Update the counters; nreceived is taken back below for corrupted or duplicate replies. */
++rts->nreceived;
if (!csfailed)
acknowledge(rts, seq);
/* A timestamp is only present when the message is long enough to carry a struct timeval. */
if (rts->timing && cc >= (int)(8 + sizeof(struct timeval))) {
struct timeval tmp_tv;
/* The send timestamp sits at the start of the payload. */
memcpy(&tmp_tv, ptr, sizeof(tmp_tv));
restamp:
/* Presumably tv -= tmp_tv; tvsub() is defined elsewhere. triptime is in microseconds. */
tvsub(tv, &tmp_tv);
triptime = tv->tv_sec * 1000000 + tv->tv_usec;
if (triptime < 0) {
error(0, 0, _("Warning: time of day goes back (%ldus), taking countermeasures"), triptime);
triptime = 0;
/* Retried once at most: opt_latency is set before jumping back, so the branch is not re-entered. */
if (!rts->opt_latency) {
gettimeofday(tv, NULL);
rts->opt_latency = 1;
goto restamp;
}
}
if (!csfailed) {
rts->tsum += triptime;
rts->tsum2 += (double)((long long)triptime * (long long)triptime);
if (triptime < rts->tmin)
rts->tmin = triptime;
if (triptime > rts->tmax)
rts->tmax = triptime;
/* rts->rtt is a smoothed round-trip time stored scaled by 8. */
if (!rts->rtt)
rts->rtt = triptime * 8;
else
rts->rtt += triptime - rts->rtt / 8;
if (rts->opt_adaptive)
update_interval(rts);
}
}
/* Classify the reply: corrupted, duplicate, or first arrival for this sequence number. */
if (csfailed) {
++rts->nchecksum;
--rts->nreceived;
} else if (rcvd_test(rts, seq)) {
++rts->nrepeats;
--rts->nreceived;
dupflag = 1;
} else {
rcvd_set(rts, seq);
dupflag = 0;
}
rts->confirm = rts->confirm_flag;
if (rts->opt_quiet)
return 1;
if (rts->opt_flood) {
/* Flood mode prints only a backspace-based marker per reply. */
if (!csfailed)
write_stdout("\b \b", 3);
else
write_stdout("\bC", 2);
} else {
int i;
uint8_t *cp, *dp;
print_timestamp(rts);
printf(_("%d bytes from %s:"), cc, from);
if (pr_reply)
pr_reply(icmph, cc);
if (rts->opt_verbose && rts->ident != -1)
printf(_(" ident=%d"), ntohs(rts->ident));
if (hops >= 0)
printf(_(" ttl=%d"), hops);
if (cc < rts->datalen + 8) {
printf(_(" (truncated)\n"));
return 1;
}
if (rts->timing) {
/* The number of fractional digits shrinks as the time grows, with rounding, unless full precision was requested. */
if (rts->opt_rtt_precision)
printf(_(" time=%ld.%03ld ms"), triptime / 1000, triptime % 1000);
else if (triptime >= 100000 - 50)
printf(_(" time=%ld ms"), (triptime + 500) / 1000);
else if (triptime >= 10000 - 5)
printf(_(" time=%ld.%01ld ms"), (triptime + 50) / 1000,
((triptime + 50) % 1000) / 100);
else if (triptime >= 1000)
printf(_(" time=%ld.%02ld ms"), (triptime + 5) / 1000,
((triptime + 5) % 1000) / 10);
else
printf(_(" time=%ld.%03ld ms"), triptime / 1000,
triptime % 1000);
}
if (dupflag && (!multicast || rts->opt_verbose))
printf(_(" (DUP!)"));
if (csfailed)
printf(_(" (BAD CHECKSUM!)"));
if (wrong_source)
printf(_(" (DIFFERENT ADDRESS!)"));
/* check the data */
/* Compare the payload after the timestamp area with the same region of the sent packet. */
cp = ((unsigned char *)ptr) + sizeof(struct timeval);
dp = &rts->outpack[8 + sizeof(struct timeval)];
for (i = sizeof(struct timeval); i < rts->datalen; ++i, ++cp, ++dp) {
if (*cp != *dp) {
printf(_("\nwrong data byte #%d should be 0x%x but was 0x%x"),
i, *dp, *cp);
/* On the first mismatch, dump the received payload in hex, 32 bytes per line, then stop comparing. */
cp = (unsigned char *)ptr + sizeof(struct timeval);
for (i = sizeof(struct timeval); i < rts->datalen; ++i, ++cp) {
if ((i % 32) == sizeof(struct timeval))
printf("\n#%d\t", i);
printf("%x ", *cp);
}
break;
}
}
}
return 0;
}
in_cksum (ping.c)
/*
 * in_cksum --
 *	One's-complement checksum over `len` bytes starting at `addr`,
 *	seeded with `csum`.  (This is a 16-bit word sum, not a CRC.)
 *
 * Sequential 16-bit words are added into a wider accumulator, a trailing
 * odd byte is added through ODDBYTE(), the carries from the upper 16 bits
 * are folded back into the lower 16 bits, and the complement of the
 * result is returned truncated to 16 bits.
 */
static unsigned short
in_cksum(const unsigned short *addr, int len, unsigned short csum)
{
	const unsigned short *word = addr;
	int acc = csum;
	int remaining;

	/* Sum all complete 16-bit words. */
	for (remaining = len; remaining > 1; remaining -= 2)
		acc += *word++;

	/* mop up an odd byte, if necessary */
	if (remaining == 1)
		acc += ODDBYTE(*(unsigned char *)word); /* le16toh() may be unavailable on old systems */

	/* Fold the high half into the low half, then add the carry of that fold. */
	acc = (acc >> 16) + (acc & 0xffff);
	acc += (acc >> 16);

	/* Complement and truncate to 16 bits. */
	return (unsigned short)~acc;
}
参考源码
Ubuntu源码
https://git.launchpad.net/ubuntu/+source/iputils/tree/ping?h=ubuntu/plucky
其他源码