Linux邻居子系统学习笔记
在《深入理解Linux网络技术内幕》一书中,作者花费了近150页(中文版页数,对应P629-P778)的篇幅详细介绍了邻居子系统的实现。以前因为对这部分不了解,很搞不懂为什么需要这么多的文字来介绍ARP的实现,对,以前是把ARP和邻居子系统等价来看的。在最近的项目中,深入地开发和解决了一些ARP/ND的相关功能和问题,才发现邻居子系统是Linux网络技术中很重要的一部分,值得花一篇文章详细介绍一下。即使这样,也可能只是粗略的介绍,因为很多细节都需要阅读实际的代码才可以体会到设计和实现之妙。幸好现在AI工具很强大,阅读代码也没有以前那么困难了,让我们开始这趟旅程吧。
因为篇幅的限制,我们这里主要以IPv4的ARP为主介绍邻居子系统。对于IPv6的ND来说,邻居子系统的框架和ARP是一样的,所以本文暂不涉及。其实邻居子系统的概念,是在IPv6引入的时候同时引入的,具体可以参考1996年8月发布的RFC1970 Neighbor Discovery for IP Version 6 (IPv6),其中的3.1小节Comparison with IPv4详细比较了和IPv4的差异。我们可以这样理解, RFC1970是对邻居发现协议的抽象,虽然其主要针对IPv6而设计,但是同时也兼容了IPv4 的ARP协议实现流程,而这也是Linux Kernel能够用neighbour子系统同时支持IPv4和IPv6的理论基础。后面2007年又发布了RFC 4861: Neighbor Discovery for IP version 6 (IPv6),完全取代了RFC1970,在这两个版本之间还有个过渡版本RFC2461。
整体结构
报文收发流程
发送方向:
arp请求/arp响应 -> arp报文封装 -> arp报文发送。
接收方向:
arp报文 -> arp处理 ->arp学习 ->arp响应
发送消息
ip报文经过路由查找以后调用ip_output进行打包,其最终通过neigh_output触发邻居子系统发送报文。
具体调用路径:ip_output->ip_finish_output->ip_finish_output2
ip_finish_output2函数实现如下:
/*
 * Final step of the IPv4 output path shown in this excerpt: resolve the
 * next-hop neighbour for the route attached to @skb and hand the packet to
 * the neighbour layer. Returns the result of neigh_output(), or the error
 * encoded in the neighbour pointer after dropping the packet.
 */
static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
struct rtable *rt = dst_rtable(dst);
struct net_device *dev = dst_dev(dst);
unsigned int hh_len = LL_RESERVED_SPACE(dev);
struct neighbour *neigh;
bool is_v6gw = false;
/* ... part of the function is omitted in this excerpt ... */
rcu_read_lock();
neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
if (!IS_ERR(neigh)) {
int res;
/*
 * A usable neighbour entry was returned. For an IPv4 next hop it is either
 * an existing entry or one just created on a lookup miss (see
 * ip_neigh_gw4()); a new entry still has to be resolved by the neighbour
 * state machine.
 */
sock_confirm_neigh(skb, neigh);
/* if crossing protocols, can not use the cached header */
/* Enter the neighbour layer; is_v6gw is passed as its skip_cache argument. */
res = neigh_output(neigh, skb, is_v6gw);
rcu_read_unlock();
return res;
}
rcu_read_unlock();
/* No neighbour entry could be obtained (error pointer): drop the packet. */
net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
__func__);
kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
return PTR_ERR(neigh);
}
其中 ip_neigh_for_gw 用来根据网关地址查找对应的 neigh 表项。
/*
 * Find the ARP neighbour entry for IPv4 address @daddr on @dev.
 * On a lookup miss a new entry is created in arp_tbl without taking an
 * extra reference (want_ref == false). The value returned by
 * __neigh_create() is passed through unchanged, so callers must check it
 * with IS_ERR().
 */
static inline struct neighbour *ip_neigh_gw4(struct net_device *dev,
					     __be32 daddr)
{
	struct neighbour *entry = __ipv4_neigh_lookup_noref(dev, (__force u32)daddr);

	/* Slow path: nothing cached yet for this next hop. */
	if (unlikely(!entry))
		return __neigh_create(&arp_tbl, &daddr, dev, false);

	return entry;
}
/*
 * Pick the neighbour entry used to reach the next hop of route @rt.
 *
 *  - IPv4 gateway: look up / create the entry for rt->rt_gw4.
 *  - IPv6 gateway: look up the entry via ip_neigh_gw6() and report it
 *    through *@is_v6gw.
 *  - no gateway family set: the packet's own destination address is the
 *    next hop.
 *
 * *@is_v6gw is written only in the IPv6 case; callers initialise it.
 */
static inline struct neighbour *ip_neigh_for_gw(struct rtable *rt,
						struct sk_buff *skb,
						bool *is_v6gw)
{
	struct net_device *out_dev = rt->dst.dev;
	struct neighbour *gw6_neigh;

	if (likely(rt->rt_gw_family == AF_INET))
		return ip_neigh_gw4(out_dev, rt->rt_gw4);

	if (rt->rt_gw_family != AF_INET6)
		return ip_neigh_gw4(out_dev, ip_hdr(skb)->daddr);

	gw6_neigh = ip_neigh_gw6(out_dev, &rt->rt_gw6);
	*is_v6gw = true;
	return gw6_neigh;
}
对于 mpls 转发来说, mpls_forward通过neigh_xmit触发arp发送,其具体实现如下:
/*
 * Excerpt of the MPLS forwarding routine: only the final transmit step is
 * shown. nh, rt and out_dev are assigned in the omitted part of the
 * function.
 */
static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
struct packet_type *pt, struct net_device *orig_dev)
{
struct net *net = dev_net(dev);
struct mpls_shim_hdr *hdr;
const struct mpls_nh *nh;
struct mpls_route *rt;
struct net_device *out_dev;
int err;
/* ... part of the function is omitted in this excerpt ... */
mpls_stats_inc_outucastpkts(out_dev, skb);
/* If via wasn't specified then send out using device address */
if (nh->nh_via_table == MPLS_NEIGH_TABLE_UNSPEC)
/* No via address: use NEIGH_LINK_TABLE with the device's own address. */
err = neigh_xmit(NEIGH_LINK_TABLE, out_dev,
out_dev->dev_addr, skb);
else
/* Use the neighbour table selected by nh_via_table and the via address. */
err = neigh_xmit(nh->nh_via_table, out_dev,
mpls_nh_via(rt, nh), skb);
if (err)
net_dbg_ratelimited("%s: packet transmission failed: %d\n",
__func__, err);
return 0;
/* ... part of the function is omitted in this excerpt ... */
}
而neigh_xmit中根据index的不同,可能是通过neigh的output回调(neigh->output)发包,也可能直接调用dev_queue_xmit发送报文。
/*
 * Transmit @skb to link-layer neighbour @addr on @dev using the neighbour
 * table selected by @index.
 *
 *  - index < NEIGH_NR_TABLES: look up (or create) the neighbour entry and
 *    send through its output callback.
 *  - index == NEIGH_LINK_TABLE: @addr is used directly as the link-layer
 *    destination; build the header and queue the packet.
 *  - any other index: returns -EAFNOSUPPORT.
 *
 * Returns the result of the output callback / dev_queue_xmit(), or a
 * negative error. The skb is freed here when entry creation or header
 * construction fails.
 *
 * NOTE(review): in the code as shown, the "table not registered" path
 * (goto out_unlock) and the unknown-index path return -EAFNOSUPPORT without
 * freeing @skb.
 */
int neigh_xmit(int index, struct net_device *dev,
const void *addr, struct sk_buff *skb)
{
int err = -EAFNOSUPPORT;
if (likely(index < NEIGH_NR_TABLES)) {
struct neigh_table *tbl;
struct neighbour *neigh;
rcu_read_lock();
tbl = rcu_dereference(neigh_tables[index]);
if (!tbl)
goto out_unlock;
if (index == NEIGH_ARP_TABLE) {
u32 key = *((u32 *)addr);
/* ARP table: use the IPv4-specific lookup keyed by the 32-bit address. */
neigh = __ipv4_neigh_lookup_noref(dev, key);
} else {
neigh = __neigh_lookup_noref(tbl, addr, dev);
}
/* Lookup miss: create a new entry (no extra reference taken). */
if (!neigh)
neigh = __neigh_create(tbl, addr, dev, false);
err = PTR_ERR(neigh);
if (IS_ERR(neigh)) {
rcu_read_unlock();
goto out_kfree_skb;
}
/* Send through the entry's current output callback. */
err = READ_ONCE(neigh->output)(neigh, skb);
out_unlock:
rcu_read_unlock();
}
else if (index == NEIGH_LINK_TABLE) {
/* Build the link-layer header directly with dev_hard_header(). */
err = dev_hard_header(skb, dev, ntohs(skb->protocol),
addr, NULL, skb->len);
if (err < 0)
goto out_kfree_skb;
err = dev_queue_xmit(skb);
}
out:
return err;
out_kfree_skb:
kfree_skb(skb);
goto out;
}
注意上面neigh_xmit中失败流程会释放skb资源,2026年4月有个commit:neigh: let neigh_xmit take skb ownership修改了这里的一个小问题。
可以看到,不论是 ip 转发还是 mpls 转发,都是先用__ipv4_neigh_lookup_noref查找 neigh 表项,如果不存在则创建一个,如果存在则直接使用。对于首次发包来说,因为查不到 neigh 表项而走创建流程,那就让我们先看看创建流程__neigh_create。
/*
 * Allocate a neighbour entry for (@pkey, @dev), run the protocol- and
 * device-specific constructors, and insert it into @tbl's hash table.
 *
 * If an entry with the same device and key is already hashed, that entry is
 * returned instead and the freshly allocated one is released.
 *
 * Returns the entry, or an ERR_PTR() value on failure. An extra reference
 * is taken on the returned entry only when @want_ref is true.
 */
static struct neighbour *
___neigh_create(struct neigh_table *tbl, const void *pkey,
struct net_device *dev, u32 flags,
bool exempt_from_gc, bool want_ref)
{
u32 hash_val, key_len = tbl->key_len;
struct neighbour *n1, *rc, *n;
struct neigh_hash_table *nht;
int error;
n = neigh_alloc(tbl, dev, flags, exempt_from_gc);
trace_neigh_create(tbl, dev, pkey, n, exempt_from_gc);
if (!n) {
rc = ERR_PTR(-ENOBUFS);
goto out;
}
memcpy(n->primary_key, pkey, key_len);
n->dev = dev;
netdev_hold(dev, &n->dev_tracker, GFP_ATOMIC);
/* Protocol specific setup. */
if (tbl->constructor && (error = tbl->constructor(n)) < 0) {
rc = ERR_PTR(error);
goto out_neigh_release;
}
if (dev->netdev_ops->ndo_neigh_construct) {
error = dev->netdev_ops->ndo_neigh_construct(dev, n);
if (error < 0) {
rc = ERR_PTR(error);
goto out_neigh_release;
}
}
/* Device specific setup. */
if (n->parms->neigh_setup &&
(error = n->parms->neigh_setup(n)) < 0) {
rc = ERR_PTR(error);
goto out_neigh_release;
}
/* Start with a "confirmed" stamp two BASE_REACHABLE_TIME periods in the past. */
n->confirmed = jiffies - (NEIGH_VAR(n->parms, BASE_REACHABLE_TIME) << 1);
write_lock_bh(&tbl->lock);
nht = rcu_dereference_protected(tbl->nht,
lockdep_is_held(&tbl->lock));
/* Grow the hash table once there are more entries than buckets. */
if (atomic_read(&tbl->entries) > (1 << nht->hash_shift))
nht = neigh_hash_grow(tbl, nht->hash_shift + 1);
hash_val = tbl->hash(n->primary_key, dev, nht->hash_rnd) >> (32 - nht->hash_shift);
if (n->parms->dead) {
rc = ERR_PTR(-EINVAL);
goto out_tbl_unlock;
}
neigh_for_each_in_bucket(n1, &nht->hash_heads[hash_val]) {
/*
 * An entry for this device/key is already hashed: return it (taking a
 * reference only if want_ref) and release the new entry on the way out.
 */
if (dev == n1->dev && !memcmp(n1->primary_key, n->primary_key, key_len)) {
if (want_ref)
neigh_hold(n1);
rc = n1;
goto out_tbl_unlock;
}
}
/* No duplicate: mark the entry live and link it into the table lists. */
n->dead = 0;
if (!exempt_from_gc)
list_add_tail(&n->gc_list, &n->tbl->gc_list);
if (n->flags & NTF_MANAGED)
list_add_tail(&n->managed_list, &n->tbl->managed_list);
if (want_ref)
neigh_hold(n);
hlist_add_head_rcu(&n->hash, &nht->hash_heads[hash_val]);
hlist_add_head_rcu(&n->dev_list,
neigh_get_dev_table(dev, tbl->family));
write_unlock_bh(&tbl->lock);
neigh_dbg(2, "neigh %p is created\n", n);
rc = n;
out:
return rc;
out_tbl_unlock:
write_unlock_bh(&tbl->lock);
out_neigh_release:
/* Undo the gc_entries accounting done in neigh_alloc() for non-exempt entries. */
if (!exempt_from_gc)
atomic_dec(&tbl->gc_entries);
neigh_release(n);
goto out;
}
/*
 * Create (or find) the neighbour entry for (@pkey, @dev) in @tbl with no
 * neighbour flags set. Entries on loopback devices are exempt from garbage
 * collection. See ___neigh_create() for the return value and the meaning
 * of @want_ref.
 */
struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
				 struct net_device *dev, bool want_ref)
{
	const u32 no_flags = 0;
	bool skip_gc = (dev->flags & IFF_LOOPBACK) != 0;

	return ___neigh_create(tbl, pkey, dev, no_flags, skip_gc, want_ref);
}
neigh_alloc 用来申请 neigh 表项资源并初始化。
/*
 * Allocate and initialise a neighbour entry for @tbl.
 *
 * Unless @exempt_from_gc is set, the entry is counted in tbl->gc_entries
 * and a forced garbage collection may be attempted before allocating.
 * Returns NULL when the table is full (and GC freed nothing) or when the
 * memory allocation fails; in both cases the gc_entries increment is
 * rolled back.
 *
 * A new entry starts in NUD_NONE with dead == 1, a reference count of 1 and
 * neigh_blackhole as its output callback.
 */
static struct neighbour *neigh_alloc(struct neigh_table *tbl,
struct net_device *dev,
u32 flags, bool exempt_from_gc)
{
struct neighbour *n = NULL;
unsigned long now = jiffies;
int entries, gc_thresh3;
/* GC-exempt entries skip the gc_entries accounting and threshold checks. */
if (exempt_from_gc)
goto do_alloc;
entries = atomic_inc_return(&tbl->gc_entries) - 1;
gc_thresh3 = READ_ONCE(tbl->gc_thresh3);
/*
 * Force a GC run when gc_thresh3 is reached, or when gc_thresh2 is reached
 * and the last flush was more than 5 seconds ago.
 */
if (entries >= gc_thresh3 ||
(entries >= READ_ONCE(tbl->gc_thresh2) &&
time_after(now, READ_ONCE(tbl->last_flush) + 5 * HZ))) {
if (!neigh_forced_gc(tbl) && entries >= gc_thresh3) {
net_info_ratelimited("%s: neighbor table overflow!\n",
tbl->id);
NEIGH_CACHE_STAT_INC(tbl, table_fulls);
goto out_entries;
}
}
do_alloc:
n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC);
if (!n)
goto out_entries;
__skb_queue_head_init(&n->arp_queue);
rwlock_init(&n->lock);
seqlock_init(&n->ha_lock);
n->updated = n->used = now;
n->nud_state = NUD_NONE;
n->output = neigh_blackhole;
n->flags = flags;
seqlock_init(&n->hh.hh_lock);
n->parms = neigh_parms_clone(&tbl->parms);
timer_setup(&n->timer, neigh_timer_handler, 0);
NEIGH_CACHE_STAT_INC(tbl, allocs);
n->tbl = tbl;
refcount_set(&n->refcnt, 1);
/* Marked dead until ___neigh_create() links the entry into the hash table. */
n->dead = 1;
INIT_LIST_HEAD(&n->gc_list);
INIT_LIST_HEAD(&n->managed_list);
atomic_inc(&tbl->entries);
out:
return n;
out_entries:
if (!exempt_from_gc)
atomic_dec(&tbl->gc_entries);
goto out;
}
对于 arp 来说,上面 ___neigh_create 中的 tbl->constructor 对应的是 arp_constructor 函数,其实现如下:
/*
 * ARP-specific constructor for a new neighbour entry: picks the entry's
 * address type, parameter block, initial NUD state, ops table and output
 * callback based on the device and the target address.
 * Returns 0 on success, -EINVAL if the device has no IPv4 state (in_dev).
 */
static int arp_constructor(struct neighbour *neigh)
{
__be32 addr;
struct net_device *dev = neigh->dev;
struct in_device *in_dev;
struct neigh_parms *parms;
u32 inaddr_any = INADDR_ANY;
/* On loopback and point-to-point devices the key is forced to INADDR_ANY (0.0.0.0). */
if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
memcpy(neigh->primary_key, &inaddr_any, arp_tbl.key_len);
addr = *(__be32 *)neigh->primary_key;
rcu_read_lock();
in_dev = __in_dev_get_rcu(dev);
if (!in_dev) {
rcu_read_unlock();
return -EINVAL;
}
neigh->type = inet_addr_type_dev_table(dev_net(dev), dev, addr);
/* Replace the table-default parms with the device's ARP parms. */
parms = in_dev->arp_parms;
__neigh_parms_put(neigh->parms);
neigh->parms = neigh_parms_clone(parms);
rcu_read_unlock();
if (!dev->header_ops) {
/* Device without header_ops: no resolution needed, always NUD_NOARP. */
neigh->nud_state = NUD_NOARP;
neigh->ops = &arp_direct_ops;
neigh->output = neigh_direct_output;
} else {
/* Good devices (checked by reading texts, but only Ethernet is
tested)
ARPHRD_ETHER: (ethernet, apfddi)
ARPHRD_FDDI: (fddi)
ARPHRD_IEEE802: (tr)
ARPHRD_METRICOM: (strip)
ARPHRD_ARCNET:
etc. etc. etc.
ARPHRD_IPDDP will also work, if author repairs it.
I did not it, because this driver does not work even
in old paradigm.
*/
if (neigh->type == RTN_MULTICAST) {
/* Multicast target: NUD_NOARP, link-layer address computed by arp_mc_map(). */
neigh->nud_state = NUD_NOARP;
arp_mc_map(addr, neigh->ha, dev, 1);
} else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) {
/* NOARP or loopback device: NUD_NOARP, use the device's own address. */
neigh->nud_state = NUD_NOARP;
memcpy(neigh->ha, dev->dev_addr, dev->addr_len);
} else if (neigh->type == RTN_BROADCAST ||
(dev->flags & IFF_POINTOPOINT)) {
/* Broadcast target or point-to-point device: NUD_NOARP, use the broadcast address. */
neigh->nud_state = NUD_NOARP;
memcpy(neigh->ha, dev->broadcast, dev->addr_len);
}
/* Choose the ops table: header-cache capable devices get arp_hh_ops. */
if (dev->header_ops->cache)
neigh->ops = &arp_hh_ops;
else
neigh->ops = &arp_generic_ops;
/* Initial output callback: connected_output if the state is already valid, else output. */
if (neigh->nud_state & NUD_VALID)
neigh->output = neigh->ops->connected_output;
else
neigh->output = neigh->ops->output;
}
return 0;
}
接下来让我们看看发送接口 neigh_output,其会根据当前neigh的状态来决定是发送ARP报文,还是直接使用快速缓存直接封装L2头部信息。
/*
 * Send @skb to neighbour @n.
 *
 * Fast path: unless @skip_cache is set, a neighbour in a NUD_CONNECTED
 * state with a populated cached hardware header (hh_len != 0) is sent via
 * neigh_hh_output(). Everything else goes through the entry's current
 * output callback.
 */
static inline int neigh_output(struct neighbour *n, struct sk_buff *skb,
			       bool skip_cache)
{
	const struct hh_cache *hh = &n->hh;

	/* n->nud_state and hh->hh_len could be changed under us.
	 * neigh_hh_output() is taking care of the race later.
	 */
	if (!skip_cache) {
		if (READ_ONCE(n->nud_state) & NUD_CONNECTED) {
			if (READ_ONCE(hh->hh_len))
				return neigh_hh_output(hh, skb);
		}
	}

	/* Slow path: let the neighbour's output callback handle the packet. */
	return READ_ONCE(n->output)(n, skb);
}
对于一般的网络设备而言,在neigh创建的时候会用arp_hh_ops给neigh的ops成员赋值,可以看这里,然后根据nud_state的状态给output回调赋值,可以看这里。但不论是哪种状态,最终都用neigh_resolve_output给output回调赋值,所以 n->output 事实指向的是 neigh_resolve_output。
/* Slow and careful. */
/*
 * Output callback that may first trigger neighbour resolution.
 * Returns 0 when neigh_event_send() took over the skb (queued or dropped),
 * the dev_queue_xmit() result when the packet was sent, or -EINVAL when the
 * link-layer header could not be built (the skb is freed in that case).
 */
int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb)
{
int rc = 0;
/* Transmit only when neigh_event_send() returns 0; otherwise just return 0. */
if (!neigh_event_send(neigh, skb)) {
int err;
struct net_device *dev = neigh->dev;
unsigned int seq;
/* Populate the cached hardware header on first use if the device supports it. */
if (dev->header_ops->cache && !READ_ONCE(neigh->hh.hh_len))
neigh_hh_init(neigh);
/* Build the header from neigh->ha; retry if ha changed under ha_lock meanwhile. */
do {
__skb_pull(skb, skb_network_offset(skb));
seq = read_seqbegin(&neigh->ha_lock);
err = dev_hard_header(skb, dev, ntohs(skb->protocol),
neigh->ha, NULL, skb->len);
} while (read_seqretry(&neigh->ha_lock, seq));
if (err >= 0)
rc = dev_queue_xmit(skb);
else
goto out_kfree_skb;
}
out:
return rc;
out_kfree_skb:
rc = -EINVAL;
kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_HH_FILLFAIL);
goto out;
}
neigh_event_send函数实现如下:
/*
 * Record that @neigh is being used and, if it is not in a state that
 * allows immediate transmission, hand over to __neigh_event_send().
 *
 * Returns 0 when the neighbour is in NUD_CONNECTED, NUD_DELAY or NUD_PROBE
 * (the caller may transmit right away); otherwise returns whatever
 * __neigh_event_send() returns.
 */
static __always_inline int neigh_event_send_probe(struct neighbour *neigh,
						  struct sk_buff *skb,
						  const bool immediate_ok)
{
	const unsigned long stamp = jiffies;

	/* Refresh the "last used" timestamp, avoiding a write if unchanged. */
	if (READ_ONCE(neigh->used) != stamp)
		WRITE_ONCE(neigh->used, stamp);

	if (READ_ONCE(neigh->nud_state) & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))
		return 0;

	return __neigh_event_send(neigh, skb, immediate_ok);
}
static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
{
return neigh_event_send_probe(neigh, skb, true);
}
__neigh_event_send是实现具体发送流程的函数,其实现如下:
/*
 * Drive the neighbour state machine when a packet wants to use @neigh.
 *
 * Returns 0 when the caller may transmit @skb itself, 1 when the skb has
 * been taken over here (queued on arp_queue while resolution is pending,
 * or freed because resolution is impossible / the entry is dead).
 */
int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb,
const bool immediate_ok)
{
int rc;
bool immediate_probe = false;
write_lock_bh(&neigh->lock);
rc = 0;
/* Already CONNECTED, DELAY or PROBE: nothing to do, caller transmits. */
if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))
goto out_unlock_bh;
/* Entry is marked dead: handled separately below. */
if (neigh->dead)
goto out_dead;
/*
 * Neither STALE nor INCOMPLETE: given the check above, the remaining
 * states reaching this branch are expected to be NUD_NONE or NUD_FAILED,
 * so start (or give up on) resolution.
 */
if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) {
if (NEIGH_VAR(neigh->parms, MCAST_PROBES) +
NEIGH_VAR(neigh->parms, APP_PROBES)) {
unsigned long next, now = jiffies;
/*
 * Start resolution: preset the probe counter to UCAST_PROBES, restart the
 * timer, move to NUD_INCOMPLETE and stamp "updated".
 */
atomic_set(&neigh->probes,
NEIGH_VAR(neigh->parms, UCAST_PROBES));
neigh_del_timer(neigh);
WRITE_ONCE(neigh->nud_state, NUD_INCOMPLETE);
neigh->updated = now;
if (!immediate_ok) {
next = now + 1;
} else {
/* Probe immediately; the timer then fires after max(RETRANS_TIME, HZ/100). */
immediate_probe = true;
next = now + max(NEIGH_VAR(neigh->parms,
RETRANS_TIME),
HZ / 100);
}
neigh_add_timer(neigh, next);
} else {
/*
 * No multicast or application probes are configured: resolution cannot
 * be attempted, so mark the entry NUD_FAILED and drop the packet.
 */
WRITE_ONCE(neigh->nud_state, NUD_FAILED);
neigh->updated = jiffies;
write_unlock_bh(&neigh->lock);
kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_FAILED);
return 1;
}
} else if (neigh->nud_state & NUD_STALE) {
/* STALE entry in use: move to NUD_DELAY and arm the DELAY_PROBE_TIME timer. */
neigh_dbg(2, "neigh %p is delayed\n", neigh);
neigh_del_timer(neigh);
WRITE_ONCE(neigh->nud_state, NUD_DELAY);
neigh->updated = jiffies;
neigh_add_timer(neigh, jiffies +
NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME));
}
if (neigh->nud_state == NUD_INCOMPLETE) {
if (skb) {
while (neigh->arp_queue_len_bytes + skb->truesize >
NEIGH_VAR(neigh->parms, QUEUE_LEN_BYTES)) {
struct sk_buff *buff;
/*
 * Resolution is pending, so the skb is parked on arp_queue. If adding
 * it would exceed QUEUE_LEN_BYTES, drop the oldest queued packets
 * (from the head) until it fits or the queue is empty.
 */
buff = __skb_dequeue(&neigh->arp_queue);
if (!buff)
break;
neigh->arp_queue_len_bytes -= buff->truesize;
kfree_skb_reason(buff, SKB_DROP_REASON_NEIGH_QUEUEFULL);
NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards);
}
skb_dst_force(skb);
__skb_queue_tail(&neigh->arp_queue, skb);
neigh->arp_queue_len_bytes += skb->truesize;
}
rc = 1;
}
out_unlock_bh:
/*
 * neigh_probe() releases neigh->lock itself; otherwise release it here.
 * Bottom halves are re-enabled afterwards in both cases.
 */
if (immediate_probe)
neigh_probe(neigh);
else
write_unlock(&neigh->lock);
local_bh_enable();
trace_neigh_event_send_done(neigh, rc);
return rc;
out_dead:
/* Dead entry: a STALE one still lets the caller transmit (rc == 0); otherwise drop. */
if (neigh->nud_state & NUD_STALE)
goto out_unlock_bh;
write_unlock_bh(&neigh->lock);
kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_DEAD);
trace_neigh_event_send_dead(neigh, 1);
return 1;
}
neigh_probe负责进行neigh探测,具体实现如下:
/*
 * Send one solicitation for @neigh.
 * Entered with neigh->lock write-held; the lock is released here before
 * the solicit callback runs (see the __releases annotation).
 */
static void neigh_probe(struct neighbour *neigh)
__releases(neigh->lock)
{
struct sk_buff *skb = skb_peek_tail(&neigh->arp_queue);
/* keep skb alive even if arp_queue overflows */
if (skb)
skb = skb_clone(skb, GFP_ATOMIC);
write_unlock(&neigh->lock);
/* Protocol-specific solicit callback; the surrounding text identifies it as arp_solicit for ARP. */
if (neigh->ops->solicit)
neigh->ops->solicit(neigh, skb);
/* Count this probe. */
atomic_inc(&neigh->probes);
consume_skb(skb);
}
对于 arp 来说,上面的neigh->ops->solicit对应的函数是 arp_solicit,其负责发送 arp 请求报文,具体实现如下:
/*
 * Solicit callback for ARP: choose a source IP address and send an ARP
 * request for the neighbour's IPv4 address.
 * @skb, if non-NULL, is a packet waiting for this neighbour; its IP source
 * address may be used as the ARP sender address.
 */
static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
{
__be32 saddr = 0;
u8 dst_ha[MAX_ADDR_LEN], *dst_hw = NULL;
struct net_device *dev = neigh->dev;
__be32 target = *(__be32 *)neigh->primary_key;
int probes = atomic_read(&neigh->probes);
struct in_device *in_dev;
struct dst_entry *dst = NULL;
/* Fetch the device's IPv4 state (in_dev); without it nothing can be sent. */
rcu_read_lock();
in_dev = __in_dev_get_rcu(dev);
if (!in_dev) {
rcu_read_unlock();
return;
}
/* Choose the sender IP of the ARP request according to the arp_announce mode. */
switch (IN_DEV_ARP_ANNOUNCE(in_dev)) {
default:
case 0: /* By default announce any local IP */
if (skb && inet_addr_type_dev_table(dev_net(dev), dev,
ip_hdr(skb)->saddr) == RTN_LOCAL)
saddr = ip_hdr(skb)->saddr;
break;
case 1: /* Restrict announcements of saddr in same subnet */
if (!skb)
break;
saddr = ip_hdr(skb)->saddr;
if (inet_addr_type_dev_table(dev_net(dev), dev,
saddr) == RTN_LOCAL) {
/* saddr should be known to target */
if (inet_addr_onlink(in_dev, target, saddr))
break;
}
saddr = 0;
break;
case 2: /* Avoid secondary IPs, get a primary/preferred one */
break;
}
rcu_read_unlock();
/* No source chosen yet: select a link-scope address suitable for the target. */
if (!saddr)
saddr = inet_select_addr(dev, target, RT_SCOPE_LINK);
probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES);
/*
 * probes < 0: fewer than UCAST_PROBES probes sent so far, so probe by
 * unicast. probes >= 0: the unicast budget is used up.
 */
if (probes < 0) {
/* A unicast probe needs a known link-layer address; just log if the state is not valid. */
if (!(READ_ONCE(neigh->nud_state) & NUD_VALID))
pr_debug("trying to ucast probe in NUD_INVALID\n");
neigh_ha_snapshot(dst_ha, neigh, dev);
dst_hw = dst_ha;
} else {
probes -= NEIGH_VAR(neigh->parms, APP_PROBES);
if (probes < 0) {
/* Application probes remain: notify via neigh_app_ns() instead of sending. */
neigh_app_ns(neigh);
return;
}
}
if (skb && !(dev->priv_flags & IFF_XMIT_DST_RELEASE))
dst = skb_dst(skb);
/* Send the ARP request; dst_hw stays NULL unless a unicast probe was chosen above. */
arp_send_dst(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
dst_hw, dev->dev_addr, NULL, dst);
}
arp_send_dst 是 arp 模块提供的发送接口,具体实现如下:
/*
 * Create and send an arp packet.
 *
 * Nothing is sent when @dev has IFF_NOARP set or when the packet cannot be
 * built. A reference to @dst (taken with dst_clone()) is attached to the
 * new packet before it is handed to arp_xmit().
 */
static void arp_send_dst(int type, int ptype, __be32 dest_ip,
			 struct net_device *dev, __be32 src_ip,
			 const unsigned char *dest_hw,
			 const unsigned char *src_hw,
			 const unsigned char *target_hw,
			 struct dst_entry *dst)
{
	struct sk_buff *arp_skb;

	/* Only devices that do ARP get a packet built for them. */
	if (!(dev->flags & IFF_NOARP)) {
		arp_skb = arp_create(type, ptype, dest_ip, dev, src_ip,
				     dest_hw, src_hw, target_hw);
		if (arp_skb) {
			skb_dst_set(arp_skb, dst_clone(dst));
			arp_xmit(arp_skb);
		}
	}
}
接收消息
arp报文接收处理流程。
arp 模块注册的 arp 报文接收接口是 arp_rcv,其具体实现如下:
/*
 * Receive handler for ARP packets: performs basic sanity checks and passes
 * valid packets through the NF_ARP_IN netfilter hook to arp_process().
 * Returns NET_RX_SUCCESS for packets that are deliberately ignored,
 * NET_RX_DROP for malformed packets or on allocation failure, otherwise the
 * result of NF_HOOK().
 */
static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
struct packet_type *pt, struct net_device *orig_dev)
{
enum skb_drop_reason drop_reason;
const struct arphdr *arp;
/* do not tweak dropwatch on an ARP we will ignore */
if (dev->flags & IFF_NOARP ||
skb->pkt_type == PACKET_OTHERHOST ||
skb->pkt_type == PACKET_LOOPBACK)
goto consumeskb;
skb = skb_share_check(skb, GFP_ATOMIC);
if (!skb)
goto out_of_mem;
/* ARP header, plus 2 device addresses, plus 2 IP addresses. */
drop_reason = pskb_may_pull_reason(skb, arp_hdr_len(dev));
if (drop_reason != SKB_NOT_DROPPED_YET)
goto freeskb;
arp = arp_hdr(skb);
/* Hardware address length must match the device; protocol address length must be 4. */
if (arp->ar_hln != dev->addr_len || arp->ar_pln != 4) {
drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
goto freeskb;
}
/* Clear the neighbour control block before the packet enters arp_process(). */
memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
return NF_HOOK(NFPROTO_ARP, NF_ARP_IN,
dev_net(dev), NULL, skb, dev, NULL,
arp_process);
consumeskb:
consume_skb(skb);
return NET_RX_SUCCESS;
freeskb:
kfree_skb_reason(skb, drop_reason);
out_of_mem:
return NET_RX_DROP;
}
arp_process 实现如下,注意这里会处理 arp 请求和响应报文。
/*
 * Process a received ARP request or reply: validate it, answer requests
 * for local or proxied addresses, and update the neighbour table from the
 * sender's address information.
 * Returns NET_RX_SUCCESS when the packet was handled (or queued for a
 * delayed proxy reply) and NET_RX_DROP when it was rejected.
 */
static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
struct in_device *in_dev = __in_dev_get_rcu(dev);
struct arphdr *arp;
unsigned char *arp_ptr;
struct rtable *rt;
unsigned char *sha;
unsigned char *tha = NULL;
__be32 sip, tip;
u16 dev_type = dev->type;
int addr_type;
struct neighbour *n;
struct dst_entry *reply_dst = NULL;
bool is_garp = false;
/* arp_rcv below verifies the ARP header and verifies the device
* is ARP'able.
*/
if (!in_dev)
goto out_free_skb;
arp = arp_hdr(skb);
switch (dev_type) {
default:
if (arp->ar_pro != htons(ETH_P_IP) ||
htons(dev_type) != arp->ar_hrd)
goto out_free_skb;
break;
case ARPHRD_ETHER:
case ARPHRD_FDDI:
case ARPHRD_IEEE802:
/*
* ETHERNET, and Fibre Channel (which are IEEE 802
* devices, according to RFC 2625) devices will accept ARP
* hardware types of either 1 (Ethernet) or 6 (IEEE 802.2).
* This is the case also of FDDI, where the RFC 1390 says that
* FDDI devices should accept ARP hardware of (1) Ethernet,
* however, to be more robust, we'll accept both 1 (Ethernet)
* or 6 (IEEE 802.2)
*/
if ((arp->ar_hrd != htons(ARPHRD_ETHER) &&
arp->ar_hrd != htons(ARPHRD_IEEE802)) ||
arp->ar_pro != htons(ETH_P_IP))
goto out_free_skb;
break;
case ARPHRD_AX25:
if (arp->ar_pro != htons(AX25_P_IP) ||
arp->ar_hrd != htons(ARPHRD_AX25))
goto out_free_skb;
break;
case ARPHRD_NETROM:
if (arp->ar_pro != htons(AX25_P_IP) ||
arp->ar_hrd != htons(ARPHRD_NETROM))
goto out_free_skb;
break;
}
/* Only ARP requests and replies are handled. */
/* Understand only these message types */
if (arp->ar_op != htons(ARPOP_REPLY) &&
arp->ar_op != htons(ARPOP_REQUEST))
goto out_free_skb;
/*
* Extract fields
*/
/* Pull the sender (sha/sip) and target (tha/tip) addresses out of the packet. */
arp_ptr = (unsigned char *)(arp + 1);
sha = arp_ptr;
arp_ptr += dev->addr_len;
memcpy(&sip, arp_ptr, 4);
arp_ptr += 4;
switch (dev_type) {
#if IS_ENABLED(CONFIG_FIREWIRE_NET)
case ARPHRD_IEEE1394:
break;
#endif
default:
tha = arp_ptr;
arp_ptr += dev->addr_len;
}
memcpy(&tip, arp_ptr, 4);
/*
* Check for bad requests for 127.x.x.x and requests for multicast
* addresses. If this is one such, delete it.
*/
if (ipv4_is_multicast(tip) ||
(!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip)))
goto out_free_skb;
/*
* For some 802.11 wireless deployments (and possibly other networks),
* there will be an ARP proxy and gratuitous ARP frames are attacks
* and thus should not be accepted.
*/
if (sip == tip && IN_DEV_ORCONF(in_dev, DROP_GRATUITOUS_ARP))
goto out_free_skb;
/*
* Special case: We must set Frame Relay source Q.922 address
*/
if (dev_type == ARPHRD_DLCI)
sha = dev->broadcast;
/*
* Process entry. The idea here is we want to send a reply if it is a
* request for us or if it is a request for someone else that we hold
* a proxy for. We want to add an entry to our cache if it is a reply
* to us or if it is a request for our address.
* (The assumption for this last is that if someone is requesting our
* address, they are probably intending to talk to us, so it saves time
* if we cache their address. Their address is also probably not in
* our cache, since ours is not in their cache.)
*
* Putting this another way, we only care about replies if they are to
* us, in which case we add them to the cache. For requests, we care
* about those for us and those for our proxies. We reply to both,
* and in the case of requests for us we add the requester to the arp
* cache.
*/
if (arp->ar_op == htons(ARPOP_REQUEST) && skb_metadata_dst(skb))
reply_dst = (struct dst_entry *)
iptunnel_metadata_reply(skb_metadata_dst(skb),
GFP_ATOMIC);
/* Special case: IPv4 duplicate address detection packet (RFC2131) */
if (sip == 0) {
/*
 * Address-probe request with a zero sender IP (duplicate address
 * detection, not a gratuitous ARP): reply only if the target is a local
 * address and arp_ignore allows it. Nothing is learned from the packet.
 */
if (arp->ar_op == htons(ARPOP_REQUEST) &&
inet_addr_type_dev_table(net, dev, tip) == RTN_LOCAL &&
!arp_ignore(in_dev, sip, tip))
arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip,
sha, dev->dev_addr, sha, reply_dst);
goto out_consume_skb;
}
if (arp->ar_op == htons(ARPOP_REQUEST) &&
ip_route_input_noref(skb, tip, sip, 0, dev) == 0) {
/*
 * ARP request for which an input route lookup (destination tip, source
 * sip) succeeded; the route type decides how the request is answered.
 */
rt = skb_rtable(skb);
addr_type = rt->rt_type;
if (addr_type == RTN_LOCAL) {
int dont_send;
/* The target is one of our own addresses. */
dont_send = arp_ignore(in_dev, sip, tip);
if (!dont_send && IN_DEV_ARPFILTER(in_dev))
dont_send = arp_filter(sip, tip, dev);
if (!dont_send) {
/* Record the requester via neigh_event_ns(), then send the ARP reply. */
n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
if (n) {
arp_send_dst(ARPOP_REPLY, ETH_P_ARP,
sip, dev, tip, sha,
dev->dev_addr, sha,
reply_dst);
neigh_release(n);
}
}
goto out_consume_skb;
} else if (IN_DEV_FORWARD(in_dev)) {
/* Not a local address, but forwarding is enabled on in_dev: proxy-ARP handling. */
if (addr_type == RTN_UNICAST &&
(arp_fwd_proxy(in_dev, dev, rt) ||
arp_fwd_pvlan(in_dev, dev, rt, sip, tip) ||
(rt->dst.dev != dev &&
pneigh_lookup(&arp_tbl, net, &tip, dev)))) {
n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
if (n)
neigh_release(n);
/* Reply now, or queue the request so the proxy reply is delayed. */
if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED ||
skb->pkt_type == PACKET_HOST ||
NEIGH_VAR(in_dev->arp_parms, PROXY_DELAY) == 0) {
arp_send_dst(ARPOP_REPLY, ETH_P_ARP,
sip, dev, tip, sha,
dev->dev_addr, sha,
reply_dst);
} else {
pneigh_enqueue(&arp_tbl,
in_dev->arp_parms, skb);
goto out_free_dst;
}
goto out_consume_skb;
}
}
}
/* Update our ARP tables */
/*
 * Look up the sender's entry with the last argument 0. The later call with
 * 1 is only made once creation is permitted, so the argument is presumably
 * a "create if missing" flag - TODO confirm against __neigh_lookup().
 */
n = __neigh_lookup(&arp_tbl, &sip, dev, 0);
addr_type = -1;
if (n || arp_accept(in_dev, sip)) {
is_garp = arp_is_garp(net, dev, &addr_type, arp->ar_op,
sip, tip, sha, tha);
}
if (arp_accept(in_dev, sip)) {
/* Unsolicited ARP is not accepted by default.
It is possible, that this option should be enabled for some
devices (strip is candidate)
*/
if (!n &&
(is_garp ||
(arp->ar_op == htons(ARPOP_REPLY) &&
(addr_type == RTN_UNICAST ||
(addr_type < 0 &&
/* postpone calculation to as late as possible */
inet_addr_type_dev_table(net, dev, sip) ==
RTN_UNICAST)))))
n = __neigh_lookup(&arp_tbl, &sip, dev, 1);
}
if (n) {
int state = NUD_REACHABLE;
int override;
/* If several different ARP replies follows back-to-back,
use the FIRST one. It is possible, if several proxy
agents are active. Taking the first reply prevents
arp trashing and chooses the fastest router.
*/
override = time_after(jiffies,
n->updated +
NEIGH_VAR(n->parms, LOCKTIME)) ||
is_garp;
/* Broadcast replies and request packets
do not assert neighbour reachability.
*/
/* Requests and non-unicast replies update the entry as NUD_STALE, not NUD_REACHABLE. */
if (arp->ar_op != htons(ARPOP_REPLY) ||
skb->pkt_type != PACKET_HOST)
state = NUD_STALE;
neigh_update(n, sha, state,
override ? NEIGH_UPDATE_F_OVERRIDE : 0, 0);
neigh_release(n);
}
out_consume_skb:
consume_skb(skb);
out_free_dst:
dst_release(reply_dst);
return NET_RX_SUCCESS;
out_free_skb:
kfree_skb(skb);
return NET_RX_DROP;
}
neigh_event_ns用来更新接收solicit报文后的状态,其内部调用了neigh_update,且更新后的状态为stale。
/*
 * Record the sender of a received solicitation.
 *
 * Looks up the neighbour entry for @saddr on @dev; the final argument of
 * __neigh_lookup() is non-zero when a link-layer address was supplied or
 * the device uses no link-layer addresses. A found entry is updated to
 * NUD_STALE with @lladdr, overriding any different cached address.
 *
 * Returns the entry from __neigh_lookup() (possibly NULL).
 */
struct neighbour *neigh_event_ns(struct neigh_table *tbl,
				 u8 *lladdr, void *saddr,
				 struct net_device *dev)
{
	int creat = lladdr || !dev->addr_len;
	struct neighbour *entry = __neigh_lookup(tbl, saddr, dev, creat);

	if (!entry)
		return entry;

	neigh_update(entry, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_OVERRIDE, 0);
	return entry;
}
EXPORT_SYMBOL(neigh_event_ns);
让我们看看neigh_update的实现。
/* Generic update routine.
   -- lladdr is new lladdr or NULL, if it is not supplied.
   -- new    is new state.
   -- flags
	NEIGH_UPDATE_F_OVERRIDE allows to override existing lladdr,
				if it is different.
	NEIGH_UPDATE_F_WEAK_OVERRIDE will suspect existing "connected"
				lladdr instead of overriding it
				if it is different.
	NEIGH_UPDATE_F_ADMIN	means that the change is administrative.
	NEIGH_UPDATE_F_USE	means that the entry is user triggered.
	NEIGH_UPDATE_F_MANAGED	means that the entry will be auto-refreshed.
	NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows to override existing
				NTF_ROUTER flag.
	NEIGH_UPDATE_F_ISROUTER	indicates if the neighbour is known as
				a router.
	NEIGH_UPDATE_F_EXT_VALIDATED means that the entry will not be removed
				or invalidated.

   Caller MUST hold reference count on the entry.

   Returns 0 on success, -EPERM when the entry is dead or is a
   NOARP/PERMANENT entry updated without NEIGH_UPDATE_F_ADMIN, and -EINVAL
   when no lladdr is supplied for an entry that has no valid address yet.
 */
static int __neigh_update(struct neighbour *neigh, const u8 *lladdr,
			  u8 new, u32 flags, u32 nlmsg_pid,
			  struct netlink_ext_ack *extack)
{
	bool gc_update = false, managed_update = false;
	int update_isrouter = 0;
	struct net_device *dev;
	int err, notify = 0;
	u8 old;

	trace_neigh_update(neigh, lladdr, new, flags, nlmsg_pid);

	write_lock_bh(&neigh->lock);

	dev = neigh->dev;
	old = neigh->nud_state;
	err = -EPERM;

	/* Entries already marked dead are never updated. */
	if (neigh->dead) {
		NL_SET_ERR_MSG(extack, "Neighbor entry is now dead");
		new = old;
		goto out;
	}
	/* NOARP and PERMANENT entries only change on administrative updates. */
	if (!(flags & NEIGH_UPDATE_F_ADMIN) &&
	    (old & (NUD_NOARP | NUD_PERMANENT)))
		goto out;

	neigh_update_flags(neigh, flags, &notify, &gc_update, &managed_update);
	if (flags & (NEIGH_UPDATE_F_USE | NEIGH_UPDATE_F_MANAGED)) {
		new = old & ~NUD_PERMANENT;
		WRITE_ONCE(neigh->nud_state, new);
		err = 0;
		goto out;
	}

	/* The requested state is not a valid one: stop the timer and store it. */
	if (!(new & NUD_VALID)) {
		neigh_del_timer(neigh);
		if (old & NUD_CONNECTED)
			neigh_suspect(neigh);
		WRITE_ONCE(neigh->nud_state, new);
		err = 0;
		notify = old & NUD_VALID;
		if ((old & (NUD_INCOMPLETE | NUD_PROBE)) &&
		    (new & NUD_FAILED)) {
			neigh_invalidate(neigh);
			notify = 1;
		}
		goto out;
	}

	/* Compare new lladdr with cached one */
	if (!dev->addr_len) {
		/* First case: device needs no address. */
		lladdr = neigh->ha;
	} else if (lladdr) {
		/* The second case: if something is already cached
		   and a new address is proposed:
		   - compare new & old
		   - if they are different, check override flag
		 */
		if ((old & NUD_VALID) &&
		    !memcmp(lladdr, neigh->ha, dev->addr_len))
			lladdr = neigh->ha;
	} else {
		/* No address is supplied; if we know something,
		   use it, otherwise discard the request.
		 */
		err = -EINVAL;
		if (!(old & NUD_VALID)) {
			NL_SET_ERR_MSG(extack, "No link layer address given");
			goto out;
		}
		lladdr = neigh->ha;
	}

	/* Update confirmed timestamp for neighbour entry after we
	 * received ARP packet even if it doesn't change IP to MAC binding.
	 */
	if (new & NUD_CONNECTED)
		neigh->confirmed = jiffies;

	/* If entry was valid and address is not changed,
	   do not change entry state, if new one is STALE.
	 */
	err = 0;
	update_isrouter = flags & NEIGH_UPDATE_F_OVERRIDE_ISROUTER;
	if (old & NUD_VALID) {
		if (lladdr != neigh->ha && !(flags & NEIGH_UPDATE_F_OVERRIDE)) {
			update_isrouter = 0;
			if ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) &&
			    (old & NUD_CONNECTED)) {
				lladdr = neigh->ha;
				new = NUD_STALE;
			} else
				goto out;
		} else {
			if (lladdr == neigh->ha && new == NUD_STALE &&
			    !(flags & NEIGH_UPDATE_F_ADMIN))
				new = old;
		}
	}

	/* Update timestamp only once we know we will make a change to the
	 * neighbour entry. Otherwise we risk to move the locktime window with
	 * noop updates and ignore relevant ARP updates.
	 */
	if (new != old || lladdr != neigh->ha)
		neigh->updated = jiffies;

	if (new != old) {
		neigh_del_timer(neigh);
		if (new & NUD_PROBE)
			atomic_set(&neigh->probes, 0);
		if (new & NUD_IN_TIMER)
			neigh_add_timer(neigh, (jiffies +
						((new & NUD_REACHABLE) ?
						 neigh->parms->reachable_time :
						 0)));
		WRITE_ONCE(neigh->nud_state, new);
		notify = 1;
	}

	/*
	 * The link-layer address changed: store it under the ha_lock seqlock
	 * (readers such as neigh_resolve_output() retry on a concurrent
	 * write), then refresh the cached hardware header.
	 */
	if (lladdr != neigh->ha) {
		write_seqlock(&neigh->ha_lock);
		memcpy(&neigh->ha, lladdr, dev->addr_len);
		write_sequnlock(&neigh->ha_lock);
		neigh_update_hhs(neigh);
		if (!(new & NUD_CONNECTED))
			neigh->confirmed = jiffies -
				      (NEIGH_VAR(neigh->parms, BASE_REACHABLE_TIME) << 1);
		notify = 1;
	}
	if (new == old)
		goto out;
	if (new & NUD_CONNECTED)
		neigh_connect(neigh);
	else
		neigh_suspect(neigh);

	/*
	 * The entry just became valid: flush the packets that were queued on
	 * arp_queue while resolution was pending.
	 */
	if (!(old & NUD_VALID)) {
		struct sk_buff *skb;

		/* Again: avoid dead loop if something went wrong */

		while (neigh->nud_state & NUD_VALID &&
		       (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) {
			struct dst_entry *dst = skb_dst(skb);
			struct neighbour *n2, *n1 = neigh;

			/* The output callback runs without neigh->lock held. */
			write_unlock_bh(&neigh->lock);

			rcu_read_lock();

			/* Why not just use 'neigh' as-is?  The problem is that
			 * things such as shaper, eql, and sch_teql can end up
			 * using alternative, different, neigh objects to output
			 * the packet in the output path.  So what we need to do
			 * here is re-lookup the top-level neigh in the path so
			 * we can reinject the packet there.
			 */
			n2 = NULL;
			if (dst &&
			    READ_ONCE(dst->obsolete) != DST_OBSOLETE_DEAD) {
				n2 = dst_neigh_lookup_skb(dst, skb);
				if (n2)
					n1 = n2;
			}
			READ_ONCE(n1->output)(n1, skb);
			if (n2)
				neigh_release(n2);
			rcu_read_unlock();

			write_lock_bh(&neigh->lock);
		}
		__skb_queue_purge(&neigh->arp_queue);
		neigh->arp_queue_len_bytes = 0;
	}
out:
	if (update_isrouter)
		neigh_update_is_router(neigh, flags, &notify);
	write_unlock_bh(&neigh->lock);
	if (((new ^ old) & NUD_PERMANENT) || gc_update)
		neigh_update_gc_list(neigh);
	if (managed_update)
		neigh_update_managed_list(neigh);
	if (notify)
		neigh_update_notify(neigh, nlmsg_pid);
	trace_neigh_update_done(neigh, err);
	return err;
}
/*
 * neigh_update - update a neighbour entry.
 *
 * Thin wrapper: forwards neigh, lladdr, new, flags and nlmsg_pid unchanged to
 * __neigh_update() and returns its result as-is.
 *
 * The sixth argument of __neigh_update() is passed as NULL.
 * NOTE(review): presumably the optional netlink extended-ack pointer, which
 * this caller does not supply - confirm against __neigh_update()'s signature.
 */
int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
u32 flags, u32 nlmsg_pid)
{
return __neigh_update(neigh, lladdr, new, flags, nlmsg_pid, NULL);
}
邻居状态机定时器
对于 NUD_INCOMPLETE,NUD_REACHABLE,NUD_DELAY 和 NUD_PROBE 状态来说,有超时处理机制,具体是在neigh_timer_handler中处理的。
/*
 * Called when a timer expires for a neighbour entry.
 *
 * Drives the timed part of the NUD state machine for the entry that embeds
 * timer @t. Under neigh->lock it:
 *   - returns early if the entry is no longer in a timer-driven state;
 *   - moves REACHABLE -> DELAY or STALE, and DELAY -> REACHABLE or PROBE,
 *     based on the 'confirmed' / 'used' timestamps;
 *   - gives up on INCOMPLETE/PROBE entries whose probe counter has reached
 *     neigh_max_probes();
 *   - otherwise re-arms the timer and, for INCOMPLETE/PROBE, sends a probe.
 * After the lock is dropped it sends a notification if the state changed,
 * emits the tracepoint, and drops one reference on the entry.
 */
static void neigh_timer_handler(struct timer_list *t)
{
unsigned long now, next;
struct neighbour *neigh = timer_container_of(neigh, t, timer);
unsigned int state;
int notify = 0;
write_lock(&neigh->lock);
state = neigh->nud_state;
now = jiffies;
/* Default next expiry, used when no branch below picks a more specific one. */
next = now + HZ;
/* The entry left every timer-driven state before the handler ran: nothing to do. */
if (!(state & NUD_IN_TIMER))
goto out;
if (state & NUD_REACHABLE) {
if (time_before_eq(now,
neigh->confirmed + neigh->parms->reachable_time)) {
/* Still within reachable_time of the last confirmation: just push the timer out. */
neigh_dbg(2, "neigh %p is still alive\n", neigh);
next = neigh->confirmed + neigh->parms->reachable_time;
} else if (time_before_eq(now,
neigh->used +
NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) {
/*
 * reachable_time has elapsed, but the entry was used within the
 * last DELAY_PROBE_TIME: move to DELAY and wait one more
 * DELAY_PROBE_TIME before deciding whether to probe.
 */
neigh_dbg(2, "neigh %p is delayed\n", neigh);
WRITE_ONCE(neigh->nud_state, NUD_DELAY);
neigh->updated = jiffies;
neigh_suspect(neigh);
next = now + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME);
} else {
/* Neither confirmed nor used recently: demote to STALE. */
neigh_dbg(2, "neigh %p is suspected\n", neigh);
WRITE_ONCE(neigh->nud_state, NUD_STALE);
neigh->updated = jiffies;
neigh_suspect(neigh);
notify = 1;
}
} else if (state & NUD_DELAY) {
if (time_before_eq(now,
neigh->confirmed +
NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) {
/*
 * 'confirmed' lies within the last DELAY_PROBE_TIME, i.e. it was
 * refreshed while the entry sat in DELAY: go back to REACHABLE.
 * NOTE(review): presumably 'confirmed' is bumped by reachability
 * confirmations from upper layers or received replies - confirm.
 */
neigh_dbg(2, "neigh %p is now reachable\n", neigh);
WRITE_ONCE(neigh->nud_state, NUD_REACHABLE);
neigh->updated = jiffies;
neigh_connect(neigh);
notify = 1;
next = neigh->confirmed + neigh->parms->reachable_time;
} else {
/* No confirmation arrived during the delay window: start probing, with the probe counter reset. */
neigh_dbg(2, "neigh %p is probed\n", neigh);
WRITE_ONCE(neigh->nud_state, NUD_PROBE);
neigh->updated = jiffies;
atomic_set(&neigh->probes, 0);
notify = 1;
next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME),
HZ/100);
}
} else {
/* NUD_PROBE|NUD_INCOMPLETE */
/* Already probing: schedule the next retransmission, no sooner than HZ/100 from now. */
next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ/100);
}
/*
 * The entry is in INCOMPLETE or PROBE and its probe counter has reached
 * the limit returned by neigh_max_probes(): stop probing.
 */
if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) &&
atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) {
if (neigh->nud_state == NUD_PROBE &&
neigh->flags & NTF_EXT_VALIDATED) {
/* Only a PROBE entry flagged NTF_EXT_VALIDATED falls back to STALE instead of failing. */
WRITE_ONCE(neigh->nud_state, NUD_STALE);
neigh->updated = jiffies;
} else {
/* Every other case (INCOMPLETE, or PROBE without that flag) becomes FAILED and is invalidated. */
WRITE_ONCE(neigh->nud_state, NUD_FAILED);
neigh_invalidate(neigh);
}
notify = 1;
goto out;
}
/* Re-arm the timer if the (possibly updated) state is still timer-driven. */
if (neigh->nud_state & NUD_IN_TIMER) {
/* Never schedule the next expiry closer than HZ/100 from now. */
if (time_before(next, jiffies + HZ/100))
next = jiffies + HZ/100;
/*
 * NOTE(review): mod_timer() returning 0 is taken to mean the timer was
 * not pending, so a reference is taken for the newly armed timer -
 * confirm against the timer API.
 */
if (!mod_timer(&neigh->timer, next))
neigh_hold(neigh);
}
/* INCOMPLETE and PROBE entries send a probe on every expiry. */
if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) {
/*
 * This path does not unlock neigh->lock here.
 * NOTE(review): neigh_probe() is expected to drop it - confirm.
 */
neigh_probe(neigh);
} else {
/* The early-exit paths above jump into this branch so that they share the unlock. */
out:
write_unlock(&neigh->lock);
}
if (notify)
neigh_update_notify(neigh, 0);
trace_neigh_timer_handler(neigh, 0);
/* NOTE(review): presumably drops the reference held on behalf of the timer that just fired - confirm. */
neigh_release(neigh);
}
垃圾回收机制
在 neigh_table_init 初始化中,会创建一个垃圾回收机制,具体实现可以看这里。实际执行该任务的函数是neigh_periodic_work,具体实现如下:
/*
 * neigh_periodic_work - periodic garbage-collection pass over a neighbour table.
 *
 * @work: the work item embedded in a neigh_table as gc_work.work.
 *
 * Each run, under tbl->lock, it:
 *   1. re-randomises reachable_time of every parms block if more than
 *      300 * HZ jiffies have passed since tbl->last_rand;
 *   2. skips the scan when the table holds fewer than gc_thresh1 entries;
 *   3. otherwise walks every hash bucket and unlinks entries that nobody else
 *      references and that are either FAILED or unused for longer than
 *      GC_STALETIME;
 *   4. re-queues itself to run again after BASE_REACHABLE_TIME / 2.
 */
static void neigh_periodic_work(struct work_struct *work)
{
struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work);
struct neigh_hash_table *nht;
struct hlist_node *tmp;
struct neighbour *n;
unsigned int i;
NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs);
write_lock_bh(&tbl->lock);
nht = rcu_dereference_protected(tbl->nht,
lockdep_is_held(&tbl->lock));
/*
 * periodically recompute ReachableTime from random function
 */
/*
 * The interval is 300 * HZ jiffies measured from tbl->last_rand (the last
 * re-randomisation), not from the previous GC run.
 */
if (time_after(jiffies, tbl->last_rand + 300 * HZ)) {
struct neigh_parms *p;
WRITE_ONCE(tbl->last_rand, jiffies);
list_for_each_entry(p, &tbl->parms_list, list)
p->reachable_time =
neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
}
/* Fewer entries than gc_thresh1: skip the scan and only re-queue the work. */
if (atomic_read(&tbl->entries) < READ_ONCE(tbl->gc_thresh1))
goto out;
for (i = 0 ; i < (1 << nht->hash_shift); i++) {
neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[i]) {
unsigned int state;
write_lock(&n->lock);
/*
 * Never collect permanent entries, entries in a timer-driven state,
 * or entries flagged as externally learned / externally validated.
 */
state = n->nud_state;
if ((state & (NUD_PERMANENT | NUD_IN_TIMER)) ||
(n->flags &
(NTF_EXT_LEARNED | NTF_EXT_VALIDATED))) {
write_unlock(&n->lock);
continue;
}
/*
 * If 'used' is older than 'confirmed' and 'confirmed' is not in the
 * future, bring 'used' forward so the staleness check below counts
 * from the later of the two timestamps.
 */
if (time_before(n->used, n->confirmed) &&
time_is_before_eq_jiffies(n->confirmed))
n->used = n->confirmed;
/*
 * Collect the entry when its refcount is exactly 1 and it is either
 * FAILED or jiffies lies outside the window
 * [used, used + GC_STALETIME).
 * NOTE(review): a refcount of 1 presumably means only the table
 * itself still references the entry - confirm.
 */
if (refcount_read(&n->refcnt) == 1 &&
(state == NUD_FAILED ||
!time_in_range_open(jiffies, n->used,
n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) {
/* Unlink from both the hash bucket and the per-device list, then mark dead and release. */
hlist_del_rcu(&n->hash);
hlist_del_rcu(&n->dev_list);
neigh_mark_dead(n);
write_unlock(&n->lock);
neigh_cleanup_and_release(n);
continue;
}
write_unlock(&n->lock);
}
/*
 * It's fine to release lock here, even if hash table
 * grows while we are preempted.
 */
write_unlock_bh(&tbl->lock);
cond_resched();
write_lock_bh(&tbl->lock);
/* Re-read the hash table pointer: it may have been replaced while the lock was dropped. */
nht = rcu_dereference_protected(tbl->nht,
lockdep_is_held(&tbl->lock));
}
out:
/* Cycle through all hash buckets every BASE_REACHABLE_TIME/2 ticks.
 * ARP entry timeouts range from 1/2 BASE_REACHABLE_TIME to 3/2
 * BASE_REACHABLE_TIME.
 */
/* Re-queue this work item so the next pass runs after BASE_REACHABLE_TIME / 2. */
queue_delayed_work(system_power_efficient_wq, &tbl->gc_work,
NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME) >> 1);
write_unlock_bh(&tbl->lock);
}
参考资料
《深入理解Linux网络技术内幕》 (内核版本2.6)
《Linux Kernel Networking – Implementation and Theory》 (内核版本3.9)