Linux邻居子系统学习笔记

Linux邻居子系统学习笔记

在《深入理解Linux网络技术内幕》一书中,作者花费了近150页(中文版页数,对应P629-P778)的篇幅详细介绍了邻居子系统的实现。以前因为对这部分不了解,很搞不懂为什么需要这么多的文字来介绍ARP的实现,对,以前是把ARP和邻居子系统等价来看的。在最近的项目中,深入地开发和解决了一些ARP/ND的相关功能和问题,才发现邻居子系统是Linux网络技术中很重要的一部分,值得花一篇文章详细介绍一下。即使这样,也可能只是粗略的介绍,因为很多细节都需要阅读实际的代码才可以体会到设计和实现之妙。幸好现在AI工具很强大,阅读代码也没有以前那么困难了,让我们开始这趟旅程吧。

因为篇幅的限制,我们这里主要以IPv4的ARP为主介绍邻居子系统。对于IPv6的ND来说,邻居子系统的框架和ARP是一样的,所以本文暂不涉及。其实邻居子系统的概念,是在IPv6引入的时候同时引入的,具体可以参考1996年8月发布的RFC1970 Neighbor Discovery for IP Version 6 (IPv6),其中的3.1小节Comparison with IPv4详细比较了和IPv4的差异。我们可以这样理解, RFC1970是对邻居发现协议的抽象,虽然其主要针对IPv6而设计,但是同时也兼容了IPv4 的ARP协议实现流程,而这也是Linux Kernel能够用neighbour子系统同时支持IPv4和IPv6的理论基础。后面2007年又发布了RFC 4861: Neighbor Discovery for IP version 6 (IPv6),完全取代了RFC1970,在这两个版本之间还有个过渡版本RFC2461。

整体结构

报文收发流程

发送方向:

arp请求/arp响应 -> arp报文封装 -> arp报文发送。

接收方向:

arp报文 -> arp处理 ->arp学习 ->arp响应

发送消息

ip报文经过路由查找以后调用ip_output进行打包,其最终通过neigh_output触发邻居子系统发送报文。

具体调用路径:ip_output->ip_finish_output->ip_finish_output2

ip_finish_output2函数实现如下:

/*
 * Excerpt of ip_finish_output2() (part of the body is elided in this listing).
 *
 * Obtains the neighbour entry for the route's next hop and hands the packet
 * to the neighbour subsystem through neigh_output().
 *
 * Returns the result of neigh_output() when a neighbour entry is available;
 * otherwise the skb is dropped with SKB_DROP_REASON_NEIGH_CREATEFAIL and
 * PTR_ERR(neigh) is returned.
 */
static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct rtable *rt = dst_rtable(dst);
	struct net_device *dev = dst_dev(dst);
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	struct neighbour *neigh;
	bool is_v6gw = false;

	/* Part of the code is omitted here. */

	rcu_read_lock();
	neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
	if (!IS_ERR(neigh)) {
		int res;

		/*
		 * A neighbour entry was obtained: ip_neigh_for_gw() either
		 * found an existing entry or created a new, not yet resolved
		 * one that still has to be learned. Either way it can be used
		 * below.
		 */
		sock_confirm_neigh(skb, neigh);
		/* if crossing protocols, can not use the cached header */
		/* This is where the neighbour processing is triggered. */
		res = neigh_output(neigh, skb, is_v6gw);
		rcu_read_unlock();
		return res;
	}
	rcu_read_unlock();

	/*
	 * Reaching here means no neighbour entry could be found or created
	 * (ip_neigh_for_gw() returned an error pointer); the packet is dropped.
	 */
	net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
			    __func__);
	kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
	return PTR_ERR(neigh);
}

其中 ip_neigh_for_gw 用来根据网关地址查找对应的 neigh 表项。

/*
 * Return the ARP neighbour entry for IPv4 address @daddr on @dev.
 *
 * The lookup does not take a reference (the "_noref" variant is used).
 * When nothing is found a new entry is created in arp_tbl, also without
 * taking a reference (want_ref == false); in that case the result may be
 * an ERR_PTR() value, which callers check with IS_ERR().
 */
static inline struct neighbour *ip_neigh_gw4(struct net_device *dev,
					     __be32 daddr)
{
	struct neighbour *entry = __ipv4_neigh_lookup_noref(dev, (__force u32)daddr);

	/* Miss: fall back to creating the entry. */
	if (unlikely(!entry))
		return __neigh_create(&arp_tbl, &daddr, dev, false);

	return entry;
}

/*
 * Select the neighbour entry used to transmit @skb over route @rt.
 *
 * - IPv4 gateway (rt_gw_family == AF_INET): look up rt->rt_gw4.
 * - IPv6 gateway (rt_gw_family == AF_INET6): look up rt->rt_gw6 and set
 *   *is_v6gw to true.
 * - Any other gateway family: look up the destination address taken from
 *   the packet's IPv4 header.
 *
 * *is_v6gw is only written in the IPv6 case; callers initialise it.
 */
static inline struct neighbour *ip_neigh_for_gw(struct rtable *rt,
						struct sk_buff *skb,
						bool *is_v6gw)
{
	struct net_device *out_dev = rt->dst.dev;

	if (likely(rt->rt_gw_family == AF_INET))
		return ip_neigh_gw4(out_dev, rt->rt_gw4);

	if (rt->rt_gw_family == AF_INET6) {
		struct neighbour *v6_entry = ip_neigh_gw6(out_dev, &rt->rt_gw6);

		*is_v6gw = true;
		return v6_entry;
	}

	/* Neither an IPv4 nor an IPv6 gateway: resolve the packet's daddr. */
	return ip_neigh_gw4(out_dev, ip_hdr(skb)->daddr);
}
对于 mpls 转发来说,mpls_forward通过neigh_xmit触发arp发送,其具体实现如下:
/*
 * Excerpt of mpls_forward() (code before and after the shown part is elided,
 * including the code that sets nh, rt and out_dev).
 *
 * The visible part transmits the packet through neigh_xmit(), choosing the
 * neighbour table from the next hop's nh_via_table. A transmit error is only
 * logged (rate limited); the function returns 0 either way.
 */
static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
			struct packet_type *pt, struct net_device *orig_dev)
{
	struct net *net = dev_net(dev);
	struct mpls_shim_hdr *hdr;
	const struct mpls_nh *nh;
	struct mpls_route *rt;
	struct net_device *out_dev;
	int err;

	/* Part of the code is omitted here. */

	mpls_stats_inc_outucastpkts(out_dev, skb);

	/* If via wasn't specified then send out using device address */
	if (nh->nh_via_table == MPLS_NEIGH_TABLE_UNSPEC)
		/* Use the NEIGH_LINK_TABLE table. */
		err = neigh_xmit(NEIGH_LINK_TABLE, out_dev,
				 out_dev->dev_addr, skb);
	else
		/* Use the neighbour table named by nh_via_table. */
		err = neigh_xmit(nh->nh_via_table, out_dev,
				 mpls_nh_via(rt, nh), skb);
	if (err)
		net_dbg_ratelimited("%s: packet transmission failed: %d\n",
				    __func__, err);
	return 0;

	/* Part of the code is omitted here. */

}

而neigh_xmit中根据index的不同,可能是查找(或创建)neigh表项后通过neigh->output回调发包(必要时由该回调触发arp解析),也可能是在dev_hard_header封装二层头之后直接调用dev_queue_xmit发送报文。

/*
 * Transmit @skb on @dev towards next-hop address @addr using the neighbour
 * table selected by @index.
 *
 * - index < NEIGH_NR_TABLES: look up (or create) the neighbour entry in
 *   neigh_tables[index] and send through its output callback.
 * - index == NEIGH_LINK_TABLE: build the link-layer header directly with
 *   dev_hard_header() using @addr and queue the packet.
 *
 * Returns the result of the output callback / dev_queue_xmit(), or a
 * negative error. The initial value -EAFNOSUPPORT is returned when no
 * branch handles @index or the table is not registered.
 *
 * NOTE(review): the skb is freed on the out_kfree_skb paths only. The
 * "!tbl" path and an @index matching neither branch return -EAFNOSUPPORT
 * without freeing the skb.
 */
int neigh_xmit(int index, struct net_device *dev,
	       const void *addr, struct sk_buff *skb)
{
	int err = -EAFNOSUPPORT;

	if (likely(index < NEIGH_NR_TABLES)) {
		struct neigh_table *tbl;
		struct neighbour *neigh;

		rcu_read_lock();
		tbl = rcu_dereference(neigh_tables[index]);
		if (!tbl)
			goto out_unlock;
		if (index == NEIGH_ARP_TABLE) {
			u32 key = *((u32 *)addr);
			/* Check whether an IPv4 (ARP) neighbour entry exists. */
			neigh = __ipv4_neigh_lookup_noref(dev, key);
		} else {
			neigh = __neigh_lookup_noref(tbl, addr, dev);
		}
		/* No entry yet: create one. */
		if (!neigh)
			neigh = __neigh_create(tbl, addr, dev, false);
		err = PTR_ERR(neigh);
		if (IS_ERR(neigh)) {
			rcu_read_unlock();
			goto out_kfree_skb;
		}
		/* Send the packet through the entry's output callback. */
		err = READ_ONCE(neigh->output)(neigh, skb);
out_unlock:
		rcu_read_unlock();
	}
	else if (index == NEIGH_LINK_TABLE) {
		/* Build the layer-2 header directly with dev_hard_header(). */
		err = dev_hard_header(skb, dev, ntohs(skb->protocol),
				      addr, NULL, skb->len);
		if (err < 0)
			goto out_kfree_skb;
		err = dev_queue_xmit(skb);
	}
out:
	return err;
out_kfree_skb:
	kfree_skb(skb);
	goto out;
}

注意上面neigh_xmit中失败流程会释放skb资源,2026年4月有个commit:neigh: let neigh_xmit take skb ownership修改了这里的一个小问题。

可以看到,不论是 ip 转发还是 mpls 转发,都是先查找 neigh 表项(ARP 表使用__ipv4_neigh_lookup_noref,其他表使用__neigh_lookup_noref),如果不存在则创建一个,如果存在则直接使用。对于首次发包来说,因为查不到 neigh 表项而走创建流程,那就让我们先看看创建流程__neigh_create。

/*
 * Allocate a neighbour entry for key @pkey on @dev, run the protocol,
 * driver and parms setup hooks, and insert it into @tbl's hash table under
 * tbl->lock.
 *
 * If an entry with the same device and key is already in the hash bucket,
 * that existing entry is returned and the freshly allocated one is released.
 *
 * @flags:          stored in the new entry by neigh_alloc().
 * @exempt_from_gc: when true the entry is not put on tbl->gc_list.
 * @want_ref:       when true a reference (neigh_hold) is taken on the
 *                  returned entry for the caller.
 *
 * Returns the entry, or an ERR_PTR(): -ENOBUFS when allocation fails,
 * -EINVAL when the entry's parms are dead, or the error returned by one of
 * the setup hooks.
 */
static struct neighbour *
___neigh_create(struct neigh_table *tbl, const void *pkey,
		struct net_device *dev, u32 flags,
		bool exempt_from_gc, bool want_ref)
{
	u32 hash_val, key_len = tbl->key_len;
	struct neighbour *n1, *rc, *n;
	struct neigh_hash_table *nht;
	int error;

	n = neigh_alloc(tbl, dev, flags, exempt_from_gc);
	trace_neigh_create(tbl, dev, pkey, n, exempt_from_gc);
	if (!n) {
		rc = ERR_PTR(-ENOBUFS);
		goto out;
	}

	memcpy(n->primary_key, pkey, key_len);
	n->dev = dev;
	netdev_hold(dev, &n->dev_tracker, GFP_ATOMIC);

	/* Protocol specific setup. */
	if (tbl->constructor &&	(error = tbl->constructor(n)) < 0) {
		rc = ERR_PTR(error);
		goto out_neigh_release;
	}

	/* Optional per-driver construct hook. */
	if (dev->netdev_ops->ndo_neigh_construct) {
		error = dev->netdev_ops->ndo_neigh_construct(dev, n);
		if (error < 0) {
			rc = ERR_PTR(error);
			goto out_neigh_release;
		}
	}

	/* Device specific setup. */
	if (n->parms->neigh_setup &&
	    (error = n->parms->neigh_setup(n)) < 0) {
		rc = ERR_PTR(error);
		goto out_neigh_release;
	}

	/* Back-date 'confirmed' by twice BASE_REACHABLE_TIME. */
	n->confirmed = jiffies - (NEIGH_VAR(n->parms, BASE_REACHABLE_TIME) << 1);

	write_lock_bh(&tbl->lock);
	nht = rcu_dereference_protected(tbl->nht,
					lockdep_is_held(&tbl->lock));

	/* Grow the hash table once entries exceed the number of buckets. */
	if (atomic_read(&tbl->entries) > (1 << nht->hash_shift))
		nht = neigh_hash_grow(tbl, nht->hash_shift + 1);

	hash_val = tbl->hash(n->primary_key, dev, nht->hash_rnd) >> (32 - nht->hash_shift);

	if (n->parms->dead) {
		rc = ERR_PTR(-EINVAL);
		goto out_tbl_unlock;
	}

	neigh_for_each_in_bucket(n1, &nht->hash_heads[hash_val]) {
		/*
		 * An entry for this device and key already exists: take a
		 * reference if requested and return it. The error labels
		 * below then release the entry allocated above.
		 */
		if (dev == n1->dev && !memcmp(n1->primary_key, n->primary_key, key_len)) {
			if (want_ref)
				neigh_hold(n1);
			rc = n1;
			goto out_tbl_unlock;
		}
	}

	/* The new entry is about to be linked in, so it is no longer dead. */
	n->dead = 0;
	if (!exempt_from_gc)
		list_add_tail(&n->gc_list, &n->tbl->gc_list);
	if (n->flags & NTF_MANAGED)
		list_add_tail(&n->managed_list, &n->tbl->managed_list);
	if (want_ref)
		neigh_hold(n);
	hlist_add_head_rcu(&n->hash, &nht->hash_heads[hash_val]);

	hlist_add_head_rcu(&n->dev_list,
			   neigh_get_dev_table(dev, tbl->family));

	write_unlock_bh(&tbl->lock);
	neigh_dbg(2, "neigh %p is created\n", n);
	rc = n;
out:
	return rc;
out_tbl_unlock:
	write_unlock_bh(&tbl->lock);
out_neigh_release:
	/* Undo the gc_entries accounting, then drop the new entry. */
	if (!exempt_from_gc)
		atomic_dec(&tbl->gc_entries);
	neigh_release(n);
	goto out;
}

/*
 * Create (or find) the neighbour entry for @pkey on @dev in @tbl.
 *
 * Thin wrapper around ___neigh_create() with no extra flags. Entries on
 * devices flagged IFF_LOOPBACK are created exempt from garbage collection.
 * @want_ref is passed through unchanged.
 */
struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
				 struct net_device *dev, bool want_ref)
{
	const bool on_loopback = (dev->flags & IFF_LOOPBACK) != 0;

	return ___neigh_create(tbl, pkey, dev, 0, on_loopback, want_ref);
}

neigh_alloc 用来申请 neigh 表项资源并初始化。

/*
 * Allocate and initialise a neighbour entry for table @tbl and device @dev.
 *
 * Unless @exempt_from_gc is set, the allocation is counted in
 * tbl->gc_entries and may first trigger a forced garbage collection; the
 * allocation is refused when the table is still at or above gc_thresh3.
 *
 * The new entry starts with state NUD_NONE, output callback
 * neigh_blackhole, a reference count of 1 and dead == 1.
 *
 * Returns the new entry, or NULL when the table is full or the memory
 * allocation fails.
 */
static struct neighbour *neigh_alloc(struct neigh_table *tbl,
				     struct net_device *dev,
				     u32 flags, bool exempt_from_gc)
{
	struct neighbour *n = NULL;
	unsigned long now = jiffies;
	int entries, gc_thresh3;

	/* GC-exempt entries skip the accounting and threshold checks. */
	if (exempt_from_gc)
		goto do_alloc;

	/* 'entries' is the gc_entries count before this allocation. */
	entries = atomic_inc_return(&tbl->gc_entries) - 1;
	gc_thresh3 = READ_ONCE(tbl->gc_thresh3);
	/*
	 * Force a GC run when the count has reached gc_thresh3, or has
	 * reached gc_thresh2 and more than 5 * HZ jiffies have passed since
	 * last_flush.
	 */
	if (entries >= gc_thresh3 ||
	    (entries >= READ_ONCE(tbl->gc_thresh2) &&
	     time_after(now, READ_ONCE(tbl->last_flush) + 5 * HZ))) {
		/*
		 * neigh_forced_gc() returned zero (presumably nothing was
		 * reclaimed) and the hard limit is reached: give up.
		 */
		if (!neigh_forced_gc(tbl) && entries >= gc_thresh3) {
			net_info_ratelimited("%s: neighbor table overflow!\n",
					     tbl->id);
			NEIGH_CACHE_STAT_INC(tbl, table_fulls);
			goto out_entries;
		}
	}

do_alloc:
	/* The entry is followed by dev->neigh_priv_len bytes of private data. */
	n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC);
	if (!n)
		goto out_entries;

	__skb_queue_head_init(&n->arp_queue);
	rwlock_init(&n->lock);
	seqlock_init(&n->ha_lock);
	n->updated	  = n->used = now;
	n->nud_state	  = NUD_NONE;
	n->output	  = neigh_blackhole;
	n->flags	  = flags;
	seqlock_init(&n->hh.hh_lock);
	n->parms	  = neigh_parms_clone(&tbl->parms);
	timer_setup(&n->timer, neigh_timer_handler, 0);

	NEIGH_CACHE_STAT_INC(tbl, allocs);
	n->tbl		  = tbl;
	refcount_set(&n->refcnt, 1);
	/* Stays marked dead until ___neigh_create() clears it on insertion. */
	n->dead		  = 1;
	INIT_LIST_HEAD(&n->gc_list);
	INIT_LIST_HEAD(&n->managed_list);

	atomic_inc(&tbl->entries);
out:
	return n;

out_entries:
	/* Undo the gc_entries increment taken above. */
	if (!exempt_from_gc)
		atomic_dec(&tbl->gc_entries);
	goto out;
}

对于 arp 来说,上面 ___neigh_create 中的 tbl->constructor 对应的是 arp_constructor 函数,其实现如下:

/*
 * ARP-specific initialisation of a new neighbour entry.
 *
 * Sets the address type, replaces the entry's parms with a clone of the
 * device's ARP parms, and chooses the initial NUD state, the ops table and
 * the output callback according to the device and address type.
 *
 * Returns 0 on success, -EINVAL when the device has no in_device.
 */
static int arp_constructor(struct neighbour *neigh)
{
	__be32 addr;
	struct net_device *dev = neigh->dev;
	struct in_device *in_dev;
	struct neigh_parms *parms;
	u32 inaddr_any = INADDR_ANY;

	/*
	 * For loopback and point-to-point devices the key is overwritten
	 * with INADDR_ANY (0.0.0.0).
	 */
	if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
		memcpy(neigh->primary_key, &inaddr_any, arp_tbl.key_len);

	addr = *(__be32 *)neigh->primary_key;
	rcu_read_lock();
	in_dev = __in_dev_get_rcu(dev);
	if (!in_dev) {
		rcu_read_unlock();
		return -EINVAL;
	}

	neigh->type = inet_addr_type_dev_table(dev_net(dev), dev, addr);
	/*
	 * Switch from the parms set at allocation time to a clone of the
	 * device's ARP parms.
	 */
	parms = in_dev->arp_parms;
	__neigh_parms_put(neigh->parms);
	neigh->parms = neigh_parms_clone(parms);
	rcu_read_unlock();

	if (!dev->header_ops) {
		/* Devices without header_ops are always NUD_NOARP. */
		neigh->nud_state = NUD_NOARP;
		neigh->ops = &arp_direct_ops;
		neigh->output = neigh_direct_output;
	} else {
		/* Good devices (checked by reading texts, but only Ethernet is
		   tested)

		   ARPHRD_ETHER: (ethernet, apfddi)
		   ARPHRD_FDDI: (fddi)
		   ARPHRD_IEEE802: (tr)
		   ARPHRD_METRICOM: (strip)
		   ARPHRD_ARCNET:
		   etc. etc. etc.

		   ARPHRD_IPDDP will also work, if author repairs it.
		   I did not it, because this driver does not work even
		   in old paradigm.
		 */

		if (neigh->type == RTN_MULTICAST) {
			/*
			 * Multicast address: NUD_NOARP, the link-layer
			 * address is filled in by arp_mc_map().
			 */
			neigh->nud_state = NUD_NOARP;
			arp_mc_map(addr, neigh->ha, dev, 1);
		} else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) {
			/*
			 * IFF_NOARP or loopback device: NUD_NOARP, using the
			 * device's own address.
			 */
			neigh->nud_state = NUD_NOARP;
			memcpy(neigh->ha, dev->dev_addr, dev->addr_len);
		} else if (neigh->type == RTN_BROADCAST ||
			   (dev->flags & IFF_POINTOPOINT)) {
			/*
			 * Broadcast address or point-to-point device:
			 * NUD_NOARP, using the device broadcast address.
			 */
			neigh->nud_state = NUD_NOARP;
			memcpy(neigh->ha, dev->broadcast, dev->addr_len);
		}
		/* Use arp_hh_ops when the device provides a header cache hook. */
		if (dev->header_ops->cache)
			neigh->ops = &arp_hh_ops;
		else
			neigh->ops = &arp_generic_ops;
		/*
		 * Pick the output callback: connected_output when the state
		 * is already valid, the ordinary output otherwise.
		 */
		if (neigh->nud_state & NUD_VALID)
			neigh->output = neigh->ops->connected_output;
		else
			neigh->output = neigh->ops->output;
	}
	return 0;
}

接下来让我们看看发送接口 neigh_output,其会根据当前neigh的状态来决定是发送ARP报文,还是直接使用快速缓存直接封装L2头部信息。

/*
 * Transmit @skb through neighbour @n.
 *
 * Fast path: when the caller allows the header cache (@skip_cache is
 * false), the entry is in a NUD_CONNECTED state and a cached header exists
 * (hh_len is non-zero), the packet goes out through neigh_hh_output().
 * Otherwise the entry's output callback is invoked.
 *
 * n->nud_state and hh->hh_len may change concurrently; neigh_hh_output()
 * deals with that race later.
 */
static inline int neigh_output(struct neighbour *n, struct sk_buff *skb,
			       bool skip_cache)
{
	const struct hh_cache *cached_hdr = &n->hh;
	bool use_cache = false;

	if (!skip_cache && (READ_ONCE(n->nud_state) & NUD_CONNECTED))
		use_cache = READ_ONCE(cached_hdr->hh_len) != 0;

	if (use_cache)
		return neigh_hh_output(cached_hdr, skb);

	/* Slow path: let the entry's output callback handle the packet. */
	return READ_ONCE(n->output)(n, skb);
}

对于一般的网络设备而言,在neigh创建的时候会用arp_hh_ops给neigh的ops成员赋值,可以看这里,然后根据nud_state的状态给output回调赋值,可以看这里。但不论是哪种状态,最终都用neigh_resolve_output给output回调赋值,所以 n->output 事实上指向的是 neigh_resolve_output。

/* Slow and careful. */

/*
 * Output callback that resolves the neighbour before transmitting.
 *
 * When neigh_event_send() returns 0 the link-layer header is built from
 * neigh->ha and the packet is queued for transmission. When it returns
 * non-zero nothing more is done here and 0 is returned.
 *
 * Returns the result of dev_queue_xmit(), 0 when the packet was not sent
 * here, or -EINVAL (after freeing the skb) when the header cannot be built.
 */
int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb)
{
	int rc = 0;

	/*
	 * Only transmit when neigh_event_send() returns 0; otherwise do
	 * nothing and return 0.
	 */
	if (!neigh_event_send(neigh, skb)) {
		int err;
		struct net_device *dev = neigh->dev;
		unsigned int seq;

		/* Initialise the cached header if the device supports one. */
		if (dev->header_ops->cache && !READ_ONCE(neigh->hh.hh_len))
			neigh_hh_init(neigh);

		/*
		 * Build the header under the ha_lock seqlock; retry if
		 * neigh->ha changed while it was being read.
		 */
		do {
			__skb_pull(skb, skb_network_offset(skb));
			seq = read_seqbegin(&neigh->ha_lock);
			err = dev_hard_header(skb, dev, ntohs(skb->protocol),
					      neigh->ha, NULL, skb->len);
		} while (read_seqretry(&neigh->ha_lock, seq));

		if (err >= 0)
			rc = dev_queue_xmit(skb);
		else
			goto out_kfree_skb;
	}
out:
	return rc;
out_kfree_skb:
	rc = -EINVAL;
	kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_HH_FILLFAIL);
	goto out;
}

neigh_event_send函数实现如下:

/*
 * Record that @neigh is being used and, if its state requires it, start
 * the resolution state machine.
 *
 * The 'used' timestamp is refreshed to the current jiffies (written only
 * when it differs). Entries in a NUD_CONNECTED, NUD_DELAY or NUD_PROBE
 * state need no further action and yield 0; every other state is passed to
 * __neigh_event_send(), whose return value is returned.
 */
static __always_inline int neigh_event_send_probe(struct neighbour *neigh,
						  struct sk_buff *skb,
						  const bool immediate_ok)
{
	const unsigned long stamp = jiffies;

	if (READ_ONCE(neigh->used) != stamp)
		WRITE_ONCE(neigh->used, stamp);

	if (READ_ONCE(neigh->nud_state) & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))
		return 0;

	return __neigh_event_send(neigh, skb, immediate_ok);
}

static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
{
	return neigh_event_send_probe(neigh, skb, true);
}

__neigh_event_send是实现具体发送流程的函数,其实现如下:

/*
 * Drive the neighbour state machine when a packet wants to use @neigh.
 *
 * Depending on the current state this starts resolution (NUD_INCOMPLETE),
 * schedules a delayed probe (NUD_STALE -> NUD_DELAY), or fails the entry
 * (NUD_FAILED). While the entry is NUD_INCOMPLETE, @skb (if not NULL) is
 * queued on neigh->arp_queue.
 *
 * @immediate_ok: when starting resolution, allows the first probe to be
 *                sent right away instead of from the timer.
 *
 * Returns 0 when the caller may transmit the packet itself, 1 when the skb
 * was queued or freed here.
 */
int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb,
		       const bool immediate_ok)
{
	int rc;
	bool immediate_probe = false;

	write_lock_bh(&neigh->lock);

	rc = 0;
	/*
	 * Nothing to do when the state is one of NUD_CONNECTED, NUD_DELAY
	 * or NUD_PROBE.
	 */
	if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))
		goto out_unlock_bh;
	/* Dead entries are handled at out_dead. */
	if (neigh->dead)
		goto out_dead;

	/*
	 * The state is neither NUD_STALE nor NUD_INCOMPLETE, so resolution
	 * has to start (in practice the state here is NUD_NONE or
	 * NUD_FAILED).
	 */
	if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) {
		if (NEIGH_VAR(neigh->parms, MCAST_PROBES) +
		    NEIGH_VAR(neigh->parms, APP_PROBES)) {
			unsigned long next, now = jiffies;

			/*
			 * First probe: preload the probe counter with
			 * UCAST_PROBES, cancel any pending timer, move to
			 * NUD_INCOMPLETE and stamp 'updated' with the
			 * current time.
			 */
			atomic_set(&neigh->probes,
				   NEIGH_VAR(neigh->parms, UCAST_PROBES));
			neigh_del_timer(neigh);
			WRITE_ONCE(neigh->nud_state, NUD_INCOMPLETE);
			neigh->updated = now;
			if (!immediate_ok) {
				/* Let the timer send the first probe shortly. */
				next = now + 1;
			} else {
				/*
				 * Probe immediately; the timer then fires
				 * after RETRANS_TIME (at least HZ / 100).
				 */
				immediate_probe = true;
				next = now + max(NEIGH_VAR(neigh->parms,
							   RETRANS_TIME),
						 HZ / 100);
			}
			neigh_add_timer(neigh, next);
		} else {
			/*
			 * Neither multicast nor application probes are
			 * configured: mark the entry NUD_FAILED, stamp
			 * 'updated' and free the skb.
			 */
			WRITE_ONCE(neigh->nud_state, NUD_FAILED);
			neigh->updated = jiffies;
			write_unlock_bh(&neigh->lock);

			kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_FAILED);
			return 1;
		}
	} else if (neigh->nud_state & NUD_STALE) {
		/*
		 * NUD_STALE: cancel the timer, move to NUD_DELAY, stamp
		 * 'updated' and arm the timer for DELAY_PROBE_TIME. rc stays
		 * 0, so the caller still transmits the packet.
		 */
		neigh_dbg(2, "neigh %p is delayed\n", neigh);
		neigh_del_timer(neigh);
		WRITE_ONCE(neigh->nud_state, NUD_DELAY);
		neigh->updated = jiffies;
		neigh_add_timer(neigh, jiffies +
				NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME));
	}

	if (neigh->nud_state == NUD_INCOMPLETE) {
		if (skb) {
			while (neigh->arp_queue_len_bytes + skb->truesize >
			       NEIGH_VAR(neigh->parms, QUEUE_LEN_BYTES)) {
				struct sk_buff *buff;

				/*
				 * The skb must wait on arp_queue, but adding
				 * it would exceed QUEUE_LEN_BYTES. Make room
				 * first-in first-out: drop packets from the
				 * head of the queue until the new one fits
				 * (or the queue is empty).
				 */
				buff = __skb_dequeue(&neigh->arp_queue);
				if (!buff)
					break;
				neigh->arp_queue_len_bytes -= buff->truesize;
				kfree_skb_reason(buff, SKB_DROP_REASON_NEIGH_QUEUEFULL);
				NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards);
			}
			skb_dst_force(skb);
			__skb_queue_tail(&neigh->arp_queue, skb);
			neigh->arp_queue_len_bytes += skb->truesize;
		}
		rc = 1;
	}
out_unlock_bh:
	/*
	 * With immediate_probe set, neigh_probe() sends the probe and
	 * releases neigh->lock itself; otherwise the lock is released here.
	 * local_bh_enable() then pairs with the write_lock_bh() above.
	 */
	if (immediate_probe)
		neigh_probe(neigh);
	else
		write_unlock(&neigh->lock);
	local_bh_enable();
	trace_neigh_event_send_done(neigh, rc);
	return rc;

out_dead:
	/*
	 * Dead entry: if it is NUD_STALE take the normal unlock path (rc is
	 * 0); otherwise free the skb and return 1.
	 */
	if (neigh->nud_state & NUD_STALE)
		goto out_unlock_bh;
	write_unlock_bh(&neigh->lock);
	kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_DEAD);
	trace_neigh_event_send_dead(neigh, 1);
	return 1;
}

neigh_probe负责进行neigh探测,具体实现如下:

/*
 * Send one solicitation for @neigh.
 *
 * Called with neigh->lock write-held; the lock is released here (see the
 * __releases annotation) before the solicit callback runs. Bottom-half
 * state is not touched.
 */
static void neigh_probe(struct neighbour *neigh)
	__releases(neigh->lock)
{
	struct sk_buff *skb = skb_peek_tail(&neigh->arp_queue);
	/* keep skb alive even if arp_queue overflows */
	if (skb)
		skb = skb_clone(skb, GFP_ATOMIC);
	write_unlock(&neigh->lock);
	/* Call the registered solicit hook; for IPv4 this is arp_solicit(). */
	if (neigh->ops->solicit)
		neigh->ops->solicit(neigh, skb);
	/* Count this probe. */
	atomic_inc(&neigh->probes);
	/* Drop the clone taken above. */
	consume_skb(skb);
}

对于 arp 来说,上面的neigh->ops->solicit对应的函数是 arp_solicit,其负责发送 arp 请求报文,具体实现如下:

/*
 * Solicit callback for ARP: send an ARP request for the neighbour's IPv4
 * address.
 *
 * @neigh: entry being resolved; its primary key is the target IP address.
 * @skb:   a packet waiting for this neighbour, or NULL. It is used to pick
 *         the source IP address and, when allowed, the dst for the request.
 */
static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
{
	__be32 saddr = 0;
	u8 dst_ha[MAX_ADDR_LEN], *dst_hw = NULL;
	struct net_device *dev = neigh->dev;
	__be32 target = *(__be32 *)neigh->primary_key;
	int probes = atomic_read(&neigh->probes);
	struct in_device *in_dev;
	struct dst_entry *dst = NULL;

	/* Get the in_device attached to dev; without one nothing is sent. */
	rcu_read_lock();
	in_dev = __in_dev_get_rcu(dev);
	if (!in_dev) {
		rcu_read_unlock();
		return;
	}

	/*
	 * Choose the source IP address of the ARP request according to the
	 * device's ARP announce mode.
	 */
	switch (IN_DEV_ARP_ANNOUNCE(in_dev)) {
	default:
	case 0:		/* By default announce any local IP */
		if (skb && inet_addr_type_dev_table(dev_net(dev), dev,
					  ip_hdr(skb)->saddr) == RTN_LOCAL)
			saddr = ip_hdr(skb)->saddr;
		break;
	case 1:		/* Restrict announcements of saddr in same subnet */
		if (!skb)
			break;
		saddr = ip_hdr(skb)->saddr;
		if (inet_addr_type_dev_table(dev_net(dev), dev,
					     saddr) == RTN_LOCAL) {
			/* saddr should be known to target */
			if (inet_addr_onlink(in_dev, target, saddr))
				break;
		}
		saddr = 0;
		break;
	case 2:		/* Avoid secondary IPs, get a primary/preferred one */
		break;
	}
	rcu_read_unlock();

	/*
	 * No source address chosen above: let inet_select_addr() pick one
	 * for this device and target with link scope.
	 */
	if (!saddr)
		saddr = inet_select_addr(dev, target, RT_SCOPE_LINK);

	probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES);
	/*
	 * After the subtraction, probes < 0 means the configured number of
	 * unicast probes has not been reached yet; probes >= 0 means it has.
	 */
	if (probes < 0) {
		/*
		 * A unicast probe needs a valid entry; emit a debug message
		 * when the state is not valid.
		 */
		if (!(READ_ONCE(neigh->nud_state) & NUD_VALID))
			pr_debug("trying to ucast probe in NUD_INVALID\n");
		/* Address the request to the cached hardware address. */
		neigh_ha_snapshot(dst_ha, neigh, dev);
		dst_hw = dst_ha;
	} else {
		probes -= NEIGH_VAR(neigh->parms, APP_PROBES);
		if (probes < 0) {
			/*
			 * The application probe budget is not used up yet:
			 * hand over to neigh_app_ns() and send nothing here.
			 */
                        neigh_app_ns(neigh);
			return;
		}
	}

	/* Reuse the packet's dst unless the device releases dsts on xmit. */
	if (skb && !(dev->priv_flags & IFF_XMIT_DST_RELEASE))
		dst = skb_dst(skb);
	/*
	 * Send the ARP request; dst_hw is still NULL when no unicast probe
	 * was selected above.
	 */
	arp_send_dst(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
		     dst_hw, dev->dev_addr, NULL, dst);
}

arp_send_dst 是 arp 模块提供的发送接口,具体实现如下:

/*
 * Build an ARP packet and transmit it.
 *
 * Nothing is sent on devices flagged IFF_NOARP, or when arp_create()
 * fails to produce a packet. A clone of @dst is attached to the new skb
 * before it is passed to arp_xmit().
 */
static void arp_send_dst(int type, int ptype, __be32 dest_ip,
			 struct net_device *dev, __be32 src_ip,
			 const unsigned char *dest_hw,
			 const unsigned char *src_hw,
			 const unsigned char *target_hw,
			 struct dst_entry *dst)
{
	struct sk_buff *arp_skb;

	/* Only devices that do ARP get a packet built for them. */
	if (!(dev->flags & IFF_NOARP)) {
		arp_skb = arp_create(type, ptype, dest_ip, dev, src_ip,
				     dest_hw, src_hw, target_hw);
		if (arp_skb) {
			skb_dst_set(arp_skb, dst_clone(dst));
			arp_xmit(arp_skb);
		}
	}
}

接收消息

arp报文接收处理流程。

arp 模块注册的 arp 报文接收接口是 arp_rcv,其具体实现如下:

/*
 * Receive handler for ARP packets.
 *
 * Discards packets that will be ignored anyway, makes sure the skb is not
 * shared and that the full ARP header can be read, checks the address
 * lengths, then passes the packet through the NF_ARP_IN netfilter hook to
 * arp_process().
 *
 * Returns the NF_HOOK() result, NET_RX_SUCCESS for silently consumed
 * packets, or NET_RX_DROP for dropped ones.
 */
static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
		   struct packet_type *pt, struct net_device *orig_dev)
{
	enum skb_drop_reason drop_reason;
	const struct arphdr *arp;

	/* do not tweak dropwatch on an ARP we will ignore */
	if (dev->flags & IFF_NOARP ||
	    skb->pkt_type == PACKET_OTHERHOST ||
	    skb->pkt_type == PACKET_LOOPBACK)
		goto consumeskb;

	/* A NULL result goes straight to out_of_mem; no skb is freed there. */
	skb = skb_share_check(skb, GFP_ATOMIC);
	if (!skb)
		goto out_of_mem;

	/* ARP header, plus 2 device addresses, plus 2 IP addresses.  */
	drop_reason = pskb_may_pull_reason(skb, arp_hdr_len(dev));
	if (drop_reason != SKB_NOT_DROPPED_YET)
		goto freeskb;

	/*
	 * The hardware address length must match the device and the
	 * protocol address length must be 4 (IPv4).
	 */
	arp = arp_hdr(skb);
	if (arp->ar_hln != dev->addr_len || arp->ar_pln != 4) {
		drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
		goto freeskb;
	}

	/* Start from a zeroed neighbour control block. */
	memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));

	return NF_HOOK(NFPROTO_ARP, NF_ARP_IN,
		       dev_net(dev), NULL, skb, dev, NULL,
		       arp_process);

consumeskb:
	consume_skb(skb);
	return NET_RX_SUCCESS;
freeskb:
	kfree_skb_reason(skb, drop_reason);
out_of_mem:
	return NET_RX_DROP;
}

arp_process 实现如下,注意这里会处理 arp 请求和响应报文。

/*
 * Process a received ARP packet (request or reply).
 *
 * Validates the header against the device type, extracts the sender and
 * target addresses, answers requests addressed to this host or to
 * addresses it proxies for, and updates the neighbour table from the
 * sender's address information.
 *
 * Returns NET_RX_SUCCESS when the packet was handled, NET_RX_DROP when it
 * was rejected and freed.
 */
static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct arphdr *arp;
	unsigned char *arp_ptr;
	struct rtable *rt;
	unsigned char *sha;
	unsigned char *tha = NULL;
	__be32 sip, tip;
	u16 dev_type = dev->type;
	int addr_type;
	struct neighbour *n;
	struct dst_entry *reply_dst = NULL;
	bool is_garp = false;

	/* arp_rcv below verifies the ARP header and verifies the device
	 * is ARP'able.
	 */

	if (!in_dev)
		goto out_free_skb;

	arp = arp_hdr(skb);

	switch (dev_type) {
	default:
		if (arp->ar_pro != htons(ETH_P_IP) ||
		    htons(dev_type) != arp->ar_hrd)
			goto out_free_skb;
		break;
	case ARPHRD_ETHER:
	case ARPHRD_FDDI:
	case ARPHRD_IEEE802:
		/*
		 * ETHERNET, and Fibre Channel (which are IEEE 802
		 * devices, according to RFC 2625) devices will accept ARP
		 * hardware types of either 1 (Ethernet) or 6 (IEEE 802.2).
		 * This is the case also of FDDI, where the RFC 1390 says that
		 * FDDI devices should accept ARP hardware of (1) Ethernet,
		 * however, to be more robust, we'll accept both 1 (Ethernet)
		 * or 6 (IEEE 802.2)
		 */
		if ((arp->ar_hrd != htons(ARPHRD_ETHER) &&
		     arp->ar_hrd != htons(ARPHRD_IEEE802)) ||
		    arp->ar_pro != htons(ETH_P_IP))
			goto out_free_skb;
		break;
	case ARPHRD_AX25:
		if (arp->ar_pro != htons(AX25_P_IP) ||
		    arp->ar_hrd != htons(ARPHRD_AX25))
			goto out_free_skb;
		break;
	case ARPHRD_NETROM:
		if (arp->ar_pro != htons(AX25_P_IP) ||
		    arp->ar_hrd != htons(ARPHRD_NETROM))
			goto out_free_skb;
		break;
	}

	/* Only ARP requests and replies are processed. */
	/* Understand only these message types */

	if (arp->ar_op != htons(ARPOP_REPLY) &&
	    arp->ar_op != htons(ARPOP_REQUEST))
		goto out_free_skb;

/*
 *	Extract fields
 */
/*
 * Pull the sender hardware/IP address (sha, sip) and the target
 * hardware/IP address (tha, tip) out of the packet. On ARPHRD_IEEE1394
 * (when CONFIG_FIREWIRE_NET is enabled) no target hardware address is
 * read and tha stays NULL.
 */
	arp_ptr = (unsigned char *)(arp + 1);
	sha	= arp_ptr;
	arp_ptr += dev->addr_len;
	memcpy(&sip, arp_ptr, 4);
	arp_ptr += 4;
	switch (dev_type) {
#if IS_ENABLED(CONFIG_FIREWIRE_NET)
	case ARPHRD_IEEE1394:
		break;
#endif
	default:
		tha = arp_ptr;
		arp_ptr += dev->addr_len;
	}
	memcpy(&tip, arp_ptr, 4);
/*
 *	Check for bad requests for 127.x.x.x and requests for multicast
 *	addresses.  If this is one such, delete it.
 */
	if (ipv4_is_multicast(tip) ||
	    (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip)))
		goto out_free_skb;

 /*
  *	For some 802.11 wireless deployments (and possibly other networks),
  *	there will be an ARP proxy and gratuitous ARP frames are attacks
  *	and thus should not be accepted.
  */
	if (sip == tip && IN_DEV_ORCONF(in_dev, DROP_GRATUITOUS_ARP))
		goto out_free_skb;

/*
 *     Special case: We must set Frame Relay source Q.922 address
 */
	if (dev_type == ARPHRD_DLCI)
		sha = dev->broadcast;

/*
 *  Process entry.  The idea here is we want to send a reply if it is a
 *  request for us or if it is a request for someone else that we hold
 *  a proxy for.  We want to add an entry to our cache if it is a reply
 *  to us or if it is a request for our address.
 *  (The assumption for this last is that if someone is requesting our
 *  address, they are probably intending to talk to us, so it saves time
 *  if we cache their address.  Their address is also probably not in
 *  our cache, since ours is not in their cache.)
 *
 *  Putting this another way, we only care about replies if they are to
 *  us, in which case we add them to the cache.  For requests, we care
 *  about those for us and those for our proxies.  We reply to both,
 *  and in the case of requests for us we add the requester to the arp
 *  cache.
 */

	if (arp->ar_op == htons(ARPOP_REQUEST) && skb_metadata_dst(skb))
		reply_dst = (struct dst_entry *)
			    iptunnel_metadata_reply(skb_metadata_dst(skb),
						    GFP_ATOMIC);

	/* Special case: IPv4 duplicate address detection packet (RFC2131) */
	if (sip == 0) {
		/*
		 * Sender IP is zero: an address-conflict probe, typically
		 * seen when an address is being assigned via DHCP. Reply
		 * only if the target address is local and arp_ignore()
		 * allows it; the neighbour table is not updated.
		 */
		if (arp->ar_op == htons(ARPOP_REQUEST) &&
		    inet_addr_type_dev_table(net, dev, tip) == RTN_LOCAL &&
		    !arp_ignore(in_dev, sip, tip))
			arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip,
				     sha, dev->dev_addr, sha, reply_dst);
		goto out_consume_skb;
	}

	if (arp->ar_op == htons(ARPOP_REQUEST) &&
	    ip_route_input_noref(skb, tip, sip, 0, dev) == 0) {
		/*
		 * ARP request handling. ip_route_input_noref() returning 0
		 * means a route was found with tip as destination and sip
		 * as source.
		 */
		rt = skb_rtable(skb);
		addr_type = rt->rt_type;

		if (addr_type == RTN_LOCAL) {
			int dont_send;
			/* The target is one of this host's own addresses. */
			dont_send = arp_ignore(in_dev, sip, tip);
			if (!dont_send && IN_DEV_ARPFILTER(in_dev))
				dont_send = arp_filter(sip, tip, dev);
			if (!dont_send) {
				/*
				 * Update the requester's neighbour entry,
				 * then send the ARP reply.
				 */
				n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
				if (n) {
					arp_send_dst(ARPOP_REPLY, ETH_P_ARP,
						     sip, dev, tip, sha,
						     dev->dev_addr, sha,
						     reply_dst);
					neigh_release(n);
				}
			}
			goto out_consume_skb;
		} else if (IN_DEV_FORWARD(in_dev)) {
			/*
			 * Not a local address, but forwarding is enabled on
			 * in_dev: proxy ARP handling.
			 */
			if (addr_type == RTN_UNICAST  &&
			    (arp_fwd_proxy(in_dev, dev, rt) ||
			     arp_fwd_pvlan(in_dev, dev, rt, sip, tip) ||
			     (rt->dst.dev != dev &&
			      pneigh_lookup(&arp_tbl, net, &tip, dev)))) {
				n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
				if (n)
					neigh_release(n);

				/*
				 * Reply right away, or queue the request
				 * with pneigh_enqueue() so the proxy reply
				 * is delayed.
				 */
				if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED ||
				    skb->pkt_type == PACKET_HOST ||
				    NEIGH_VAR(in_dev->arp_parms, PROXY_DELAY) == 0) {
					arp_send_dst(ARPOP_REPLY, ETH_P_ARP,
						     sip, dev, tip, sha,
						     dev->dev_addr, sha,
						     reply_dst);
				} else {
					pneigh_enqueue(&arp_tbl,
						       in_dev->arp_parms, skb);
					/*
					 * The skb was handed to
					 * pneigh_enqueue(), so
					 * consume_skb() is skipped.
					 */
					goto out_free_dst;
				}
				goto out_consume_skb;
			}
		}
	}

	/* Update our ARP tables */
	/*
	 * Look up the sender's neighbour entry. The last argument is 0 here,
	 * unlike the call below that passes 1 to obtain an entry when none
	 * exists; presumably this call is lookup-only (TODO confirm against
	 * __neigh_lookup()).
	 */
	n = __neigh_lookup(&arp_tbl, &sip, dev, 0);

	addr_type = -1;
	if (n || arp_accept(in_dev, sip)) {
		is_garp = arp_is_garp(net, dev, &addr_type, arp->ar_op,
				      sip, tip, sha, tha);
	}

	if (arp_accept(in_dev, sip)) {
		/* Unsolicited ARP is not accepted by default.
		   It is possible, that this option should be enabled for some
		   devices (strip is candidate)
		 */
		if (!n &&
		    (is_garp ||
		     (arp->ar_op == htons(ARPOP_REPLY) &&
		      (addr_type == RTN_UNICAST ||
		       (addr_type < 0 &&
			/* postpone calculation to as late as possible */
			inet_addr_type_dev_table(net, dev, sip) ==
				RTN_UNICAST)))))
			n = __neigh_lookup(&arp_tbl, &sip, dev, 1);
	}

	if (n) {
		int state = NUD_REACHABLE;
		int override;

		/* If several different ARP replies follows back-to-back,
		   use the FIRST one. It is possible, if several proxy
		   agents are active. Taking the first reply prevents
		   arp trashing and chooses the fastest router.
		 */
		override = time_after(jiffies,
				      n->updated +
				      NEIGH_VAR(n->parms, LOCKTIME)) ||
			   is_garp;

		/* Broadcast replies and request packets
		   do not assert neighbour reachability.
		 */
		/*
		 * Requests and replies not addressed to this host set the
		 * entry to NUD_STALE instead of NUD_REACHABLE.
		 */
		if (arp->ar_op != htons(ARPOP_REPLY) ||
		    skb->pkt_type != PACKET_HOST)
			state = NUD_STALE;
		neigh_update(n, sha, state,
			     override ? NEIGH_UPDATE_F_OVERRIDE : 0, 0);
		neigh_release(n);
	}

out_consume_skb:
	consume_skb(skb);

out_free_dst:
	dst_release(reply_dst);
	return NET_RX_SUCCESS;

out_free_skb:
	kfree_skb(skb);
	return NET_RX_DROP;
}

neigh_event_ns用来更新接收solicit报文后的状态,其内部调用了neigh_update,且更新后的状态为stale。

/*
 * Handle a received solicitation from address @saddr on @dev.
 *
 * Looks up the sender's neighbour entry in @tbl; the last argument passed
 * to __neigh_lookup() is non-zero when a link-layer address was supplied
 * or the device uses no link-layer addresses. A found entry is updated to
 * NUD_STALE with @lladdr, overriding a different cached address.
 *
 * Returns the entry from __neigh_lookup() (released by the callers with
 * neigh_release()), or NULL when there is none.
 */
struct neighbour *neigh_event_ns(struct neigh_table *tbl,
				 u8 *lladdr, void *saddr,
				 struct net_device *dev)
{
	const int lookup_arg = lladdr || !dev->addr_len;
	struct neighbour *entry;

	entry = __neigh_lookup(tbl, saddr, dev, lookup_arg);
	if (!entry)
		return NULL;

	neigh_update(entry, lladdr, NUD_STALE, NEIGH_UPDATE_F_OVERRIDE, 0);
	return entry;
}
EXPORT_SYMBOL(neigh_event_ns);

让我们看看neigh_update的实现。

/* Generic update routine.
   -- lladdr is new lladdr or NULL, if it is not supplied.
   -- new    is new state.
   -- flags
	NEIGH_UPDATE_F_OVERRIDE allows to override existing lladdr,
				if it is different.
	NEIGH_UPDATE_F_WEAK_OVERRIDE will suspect existing "connected"
				lladdr instead of overriding it
				if it is different.
	NEIGH_UPDATE_F_ADMIN	means that the change is administrative.
	NEIGH_UPDATE_F_USE	means that the entry is user triggered.
	NEIGH_UPDATE_F_MANAGED	means that the entry will be auto-refreshed.
	NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows to override existing
				NTF_ROUTER flag.
	NEIGH_UPDATE_F_ISROUTER	indicates if the neighbour is known as
				a router.
	NEIGH_UPDATE_F_EXT_VALIDATED means that the entry will not be removed
				or invalidated.

   Caller MUST hold reference count on the entry.

   Returns 0 on success, -EPERM when the entry is dead or is
   NUD_NOARP/NUD_PERMANENT and the change is not administrative,
   and -EINVAL when no lladdr is supplied and the entry holds no
   valid address yet.
 */
/* The caller must guarantee it holds a reference on the entry. */
static int __neigh_update(struct neighbour *neigh, const u8 *lladdr,
			  u8 new, u32 flags, u32 nlmsg_pid,
			  struct netlink_ext_ack *extack)
{
	bool gc_update = false, managed_update = false;
	int update_isrouter = 0;
	struct net_device *dev;
	int err, notify = 0;
	u8 old;

	trace_neigh_update(neigh, lladdr, new, flags, nlmsg_pid);

	/* Everything up to the "out" label runs under the entry's write lock. */
	write_lock_bh(&neigh->lock);

	dev    = neigh->dev;
	old    = neigh->nud_state;
	err    = -EPERM;

        /* An entry already marked dead is never updated. */
	if (neigh->dead) {
		NL_SET_ERR_MSG(extack, "Neighbor entry is now dead");
		/* Make new == old so the NUD_PERMANENT check after "out" sees no change. */
		new = old;
		goto out;
	}
        /* Without NEIGH_UPDATE_F_ADMIN, entries in NUD_NOARP or NUD_PERMANENT
         * state are left untouched and -EPERM is returned.
         */
	if (!(flags & NEIGH_UPDATE_F_ADMIN) &&
	    (old & (NUD_NOARP | NUD_PERMANENT)))
		goto out;

	/* NOTE(review): presumably applies flag-driven changes and reports back
	 * through notify/gc_update/managed_update — confirm against the helper.
	 */
	neigh_update_flags(neigh, flags, &notify, &gc_update, &managed_update);
	/* USE/MANAGED updates only clear NUD_PERMANENT from the old state. */
	if (flags & (NEIGH_UPDATE_F_USE | NEIGH_UPDATE_F_MANAGED)) {
		new = old & ~NUD_PERMANENT;
		WRITE_ONCE(neigh->nud_state, new);
		err = 0;
		goto out;
	}
      
        /* Handling for a new state that is not one of the NUD_VALID states. */  
	if (!(new & NUD_VALID)) {
		neigh_del_timer(neigh);
		if (old & NUD_CONNECTED)
			neigh_suspect(neigh);
		WRITE_ONCE(neigh->nud_state, new);
		err = 0;
		/* Notify only when the entry used to be valid ... */
		notify = old & NUD_VALID;
		/* ... or when resolution/probing was in progress and has now failed. */
		if ((old & (NUD_INCOMPLETE | NUD_PROBE)) &&
		    (new & NUD_FAILED)) {
			neigh_invalidate(neigh);
			notify = 1;
		}
		goto out;
	}

	/* Compare new lladdr with cached one */
	/* From here on "lladdr == neigh->ha" (pointer equality) is used as the
	 * marker for "the link-layer address does not change".
	 */
	if (!dev->addr_len) {
		/* First case: device needs no address. */
		lladdr = neigh->ha;
	} else if (lladdr) {
		/* The second case: if something is already cached
		   and a new address is proposed:
		   - compare new & old
		   - if they are different, check override flag
		 */
		if ((old & NUD_VALID) &&
		    !memcmp(lladdr, neigh->ha, dev->addr_len))
			lladdr = neigh->ha;
	} else {
		/* No address is supplied; if we know something,
		   use it, otherwise discard the request.
		 */
		err = -EINVAL;
		if (!(old & NUD_VALID)) {
			NL_SET_ERR_MSG(extack, "No link layer address given");
			goto out;
		}
		lladdr = neigh->ha;
	}

	/* Update confirmed timestamp for neighbour entry after we
	 * received ARP packet even if it doesn't change IP to MAC binding.
	 */
	if (new & NUD_CONNECTED)
		neigh->confirmed = jiffies;

	/* If entry was valid and address is not changed,
	   do not change entry state, if new one is STALE.
	 */
	err = 0;
	update_isrouter = flags & NEIGH_UPDATE_F_OVERRIDE_ISROUTER;
	if (old & NUD_VALID) {
		if (lladdr != neigh->ha && !(flags & NEIGH_UPDATE_F_OVERRIDE)) {
			/* Different address without OVERRIDE: never touch the router flag. */
			update_isrouter = 0;
			if ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) &&
			    (old & NUD_CONNECTED)) {
				/* Keep the cached address but demote the entry to STALE. */
				lladdr = neigh->ha;
				new = NUD_STALE;
			} else
				goto out;
		} else {
			if (lladdr == neigh->ha && new == NUD_STALE &&
			    !(flags & NEIGH_UPDATE_F_ADMIN))
				new = old;
		}
	}

	/* Update timestamp only once we know we will make a change to the
	 * neighbour entry. Otherwise we risk to move the locktime window with
	 * noop updates and ignore relevant ARP updates.
	 */
	if (new != old || lladdr != neigh->ha)
		neigh->updated = jiffies;

	if (new != old) {
		neigh_del_timer(neigh);
		if (new & NUD_PROBE)
			atomic_set(&neigh->probes, 0);
		/* States in NUD_IN_TIMER get a timer: reachable_time from now for
		 * NUD_REACHABLE, otherwise it is armed for "now".
		 */
		if (new & NUD_IN_TIMER)
			neigh_add_timer(neigh, (jiffies +
						((new & NUD_REACHABLE) ?
						 neigh->parms->reachable_time :
						 0)));
		WRITE_ONCE(neigh->nud_state, new);
		notify = 1;
	}

        /* Copy the new link-layer address into neigh->ha under the ha_lock
         * seqlock, then call neigh_update_hhs() (presumably refreshes the cached
         * hardware headers — confirm against the helper).
         */
	if (lladdr != neigh->ha) {
		write_seqlock(&neigh->ha_lock);
		memcpy(&neigh->ha, lladdr, dev->addr_len);
		write_sequnlock(&neigh->ha_lock);
		neigh_update_hhs(neigh);
		/* For a non-connected new state, move "confirmed" two
		 * BASE_REACHABLE_TIME periods into the past.
		 */
		if (!(new & NUD_CONNECTED))
			neigh->confirmed = jiffies -
				      (NEIGH_VAR(neigh->parms, BASE_REACHABLE_TIME) << 1);
		notify = 1;
	}
	if (new == old)
		goto out;
	if (new & NUD_CONNECTED)
		neigh_connect(neigh);
	else
		neigh_suspect(neigh);
	/* The entry just went from not-valid to valid: send out the packets
	 * that were queued on arp_queue.
	 */
	if (!(old & NUD_VALID)) {
		struct sk_buff *skb;

		/* Again: avoid dead loop if something went wrong */

		while (neigh->nud_state & NUD_VALID &&
		       (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) {
			struct dst_entry *dst = skb_dst(skb);
			struct neighbour *n2, *n1 = neigh;
			/* The entry lock is dropped around the output call and
			 * re-taken before the next loop-condition check.
			 */
			write_unlock_bh(&neigh->lock);

			rcu_read_lock();

			/* Why not just use 'neigh' as-is?  The problem is that
			 * things such as shaper, eql, and sch_teql can end up
			 * using alternative, different, neigh objects to output
			 * the packet in the output path.  So what we need to do
			 * here is re-lookup the top-level neigh in the path so
			 * we can reinject the packet there.
			 */
			n2 = NULL;
			if (dst &&
			    READ_ONCE(dst->obsolete) != DST_OBSOLETE_DEAD) {
				n2 = dst_neigh_lookup_skb(dst, skb);
				if (n2)
					n1 = n2;
			}
			READ_ONCE(n1->output)(n1, skb);
			/* Drop the reference obtained by the re-lookup, if any. */
			if (n2)
				neigh_release(n2);
			rcu_read_unlock();

			write_lock_bh(&neigh->lock);
		}
		/* Anything still queued (e.g. the state stopped being valid) is dropped. */
		__skb_queue_purge(&neigh->arp_queue);
		neigh->arp_queue_len_bytes = 0;
	}
out:
	if (update_isrouter)
		neigh_update_is_router(neigh, flags, &notify);
	write_unlock_bh(&neigh->lock);
	/* List maintenance and notification run after the entry lock is released. */
	if (((new ^ old) & NUD_PERMANENT) || gc_update)
		neigh_update_gc_list(neigh);
	if (managed_update)
		neigh_update_managed_list(neigh);
	if (notify)
		neigh_update_notify(neigh, nlmsg_pid);
	trace_neigh_update_done(neigh, err);
	return err;
}

/*
 * Public entry point for updating a neighbour entry: forwards every
 * argument to __neigh_update() without a netlink extended-ack buffer
 * and returns its result unchanged.
 */
int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
		 u32 flags, u32 nlmsg_pid)
{
	/* No extack on this path, so error strings are not reported back. */
	struct netlink_ext_ack *no_extack = NULL;

	return __neigh_update(neigh, lladdr, new, flags, nlmsg_pid,
			      no_extack);
}

邻居状态机定时器

对于 NUD_INCOMPLETE、NUD_REACHABLE、NUD_DELAY、NUD_PROBE 这四种状态来说,有超时处理机制,具体是在neigh_timer_handler中处理的。

/* Called when a timer expires for a neighbour entry. */
/*
 * Drives the timed part of the neighbour state machine: based on the
 * current nud_state and the confirmed/used timestamps it either re-arms
 * the timer, moves the entry to another state, or sends another probe.
 * The function ends with neigh_release(), dropping one reference
 * (presumably the one held on behalf of the timer — confirm).
 */

static void neigh_timer_handler(struct timer_list *t)
{
	unsigned long now, next;
	struct neighbour *neigh = timer_container_of(neigh, t, timer);
	unsigned int state;
	int notify = 0;

	write_lock(&neigh->lock);

	state = neigh->nud_state;
	now = jiffies;
	/* Default re-arm time: one second (HZ jiffies) from now. */
	next = now + HZ;

	/* Nothing to do if the entry is no longer in a timer-driven state. */
	if (!(state & NUD_IN_TIMER))
		goto out;

	if (state & NUD_REACHABLE) {
		if (time_before_eq(now,
				   neigh->confirmed + neigh->parms->reachable_time)) {
                        /* Still within reachable_time of the last confirmation:
                         * stay REACHABLE and re-arm the timer for the expiry point.
                         */
			neigh_dbg(2, "neigh %p is still alive\n", neigh);
			next = neigh->confirmed + neigh->parms->reachable_time;
		} else if (time_before_eq(now,
					  neigh->used +
					  NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) {
                        /* reachable_time has expired but the entry was used within
                         * DELAY_PROBE_TIME: move to NUD_DELAY.
                         */
			neigh_dbg(2, "neigh %p is delayed\n", neigh);
			WRITE_ONCE(neigh->nud_state, NUD_DELAY);
			neigh->updated = jiffies;
			neigh_suspect(neigh);
			next = now + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME);
		} else {
                        /* Neither confirmed nor recently used: move to NUD_STALE. */
			neigh_dbg(2, "neigh %p is suspected\n", neigh);
			WRITE_ONCE(neigh->nud_state, NUD_STALE);
			neigh->updated = jiffies;
			neigh_suspect(neigh);
			notify = 1;
		}
	} else if (state & NUD_DELAY) {
		if (time_before_eq(now,
				   neigh->confirmed +
				   NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) {
                        /* "confirmed" lies within DELAY_PROBE_TIME of now, i.e. it was
                         * refreshed recently (presumably by a reachability confirmation
                         * received after entering DELAY — confirm): back to REACHABLE.
                         */
			neigh_dbg(2, "neigh %p is now reachable\n", neigh);
			WRITE_ONCE(neigh->nud_state, NUD_REACHABLE);
			neigh->updated = jiffies;
			neigh_connect(neigh);
			notify = 1;
			next = neigh->confirmed + neigh->parms->reachable_time;
		} else {
                        /* The delay window passed without confirmation: enter NUD_PROBE
                         * with the probe counter reset.
                         */
			neigh_dbg(2, "neigh %p is probed\n", neigh);
			WRITE_ONCE(neigh->nud_state, NUD_PROBE);
			neigh->updated = jiffies;
			atomic_set(&neigh->probes, 0);
			notify = 1;
			next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME),
					 HZ/100);
		}
	} else {
		/* NUD_PROBE|NUD_INCOMPLETE */
                /* In PROBE/INCOMPLETE the timer is simply re-armed for the next
                 * retransmission (RETRANS_TIME, but at least HZ/100).
                 */
		next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ/100);
	}
        /* An entry in NUD_PROBE or NUD_INCOMPLETE that has used up its allowed
         * number of probes stops probing.
         */
	if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) &&
	    atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) {
		if (neigh->nud_state == NUD_PROBE &&
		    neigh->flags & NTF_EXT_VALIDATED) {
                        /* Externally validated entry in PROBE: fall back to STALE
                         * rather than failing it.
                         */
			WRITE_ONCE(neigh->nud_state, NUD_STALE);
			neigh->updated = jiffies;
		} else {
                        /* Every other case (INCOMPLETE, or PROBE without
                         * NTF_EXT_VALIDATED): mark FAILED and invalidate.
                         */
			WRITE_ONCE(neigh->nud_state, NUD_FAILED);
			neigh_invalidate(neigh);
		}
		notify = 1;
		goto out;
	}

        /* Re-arm the timer for the next expiry, never sooner than HZ/100 from now. */
	if (neigh->nud_state & NUD_IN_TIMER) {
		if (time_before(next, jiffies + HZ/100))
			next = jiffies + HZ/100;
		/* NOTE(review): a zero return from mod_timer() is taken to mean the
		 * timer was not pending, so an extra reference is taken for the newly
		 * armed timer — confirm against the timer API.
		 */
		if (!mod_timer(&neigh->timer, next))
			neigh_hold(neigh);
	}
        /* INCOMPLETE and PROBE states trigger another probe. */
	if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) {
		/* No unlock follows on this path, so neigh_probe() presumably
		 * releases neigh->lock itself — confirm against the helper.
		 */
		neigh_probe(neigh);
	} else {
out:
		write_unlock(&neigh->lock);
	}

	if (notify)
		neigh_update_notify(neigh, 0);

	trace_neigh_timer_handler(neigh, 0);

	neigh_release(neigh);
}

垃圾回收机制

在 neigh_table_init 初始化过程中,会创建一个周期性的垃圾回收任务,具体实现可以看这里。实际执行该任务的函数是neigh_periodic_work,具体实现如下:

/*
 * Periodic garbage-collection work for a neighbour table.
 *
 * Each run optionally re-randomises reachable_time for every parms block
 * of the table, then (once the table holds at least gc_thresh1 entries)
 * walks all hash buckets and removes entries that are unreferenced and
 * either failed or unused for GC_STALETIME. The work re-queues itself
 * before returning.
 */
static void neigh_periodic_work(struct work_struct *work)
{
	struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work);
	struct neigh_hash_table *nht;
	struct hlist_node *tmp;
	struct neighbour *n;
	unsigned int i;

	NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs);

	write_lock_bh(&tbl->lock);
	nht = rcu_dereference_protected(tbl->nht,
					lockdep_is_held(&tbl->lock));

	/*
	 *	periodically recompute ReachableTime from random function
	 */

        /* More than 300 * HZ jiffies since last_rand: recompute reachable_time
         * for every parms entry on the table's parms_list.
         */
	if (time_after(jiffies, tbl->last_rand + 300 * HZ)) {
		struct neigh_parms *p;

		WRITE_ONCE(tbl->last_rand, jiffies);
		list_for_each_entry(p, &tbl->parms_list, list)
			p->reachable_time =
				neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
	}
        /* Fewer entries than gc_thresh1: skip the scan entirely. */
	if (atomic_read(&tbl->entries) < READ_ONCE(tbl->gc_thresh1))
		goto out;

	for (i = 0 ; i < (1 << nht->hash_shift); i++) {
		neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[i]) {
			unsigned int state;

			write_lock(&n->lock);
                        /* Permanent entries, entries in a timer-driven state, and
                         * externally learned/validated entries are not collected.
                         */
			state = n->nud_state;
			if ((state & (NUD_PERMANENT | NUD_IN_TIMER)) ||
			    (n->flags &
			     (NTF_EXT_LEARNED | NTF_EXT_VALIDATED))) {
				write_unlock(&n->lock);
				continue;
			}

			/* If "used" is older than "confirmed" (and "confirmed" is not
			 * in the future), bring "used" forward to "confirmed".
			 */
			if (time_before(n->used, n->confirmed) &&
			    time_is_before_eq_jiffies(n->confirmed))
				n->used = n->confirmed;

                        /* refcnt == 1 (presumably only the table's own reference is
                         * left — confirm) and the entry is NUD_FAILED or has not been
                         * used within GC_STALETIME: unlink it from the hash and device
                         * lists, mark it dead and release it.
                         */
			if (refcount_read(&n->refcnt) == 1 &&
			    (state == NUD_FAILED ||
			     !time_in_range_open(jiffies, n->used,
						 n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) {
				hlist_del_rcu(&n->hash);
				hlist_del_rcu(&n->dev_list);
				neigh_mark_dead(n);
				write_unlock(&n->lock);
				neigh_cleanup_and_release(n);
				continue;
			}
			write_unlock(&n->lock);
		}
		/*
		 * It's fine to release lock here, even if hash table
		 * grows while we are preempted.
		 */
		write_unlock_bh(&tbl->lock);
		cond_resched();
		write_lock_bh(&tbl->lock);
		/* Re-read the hash table pointer after re-taking the lock. */
		nht = rcu_dereference_protected(tbl->nht,
						lockdep_is_held(&tbl->lock));
	}
out:
	/* Cycle through all hash buckets every BASE_REACHABLE_TIME/2 ticks.
	 * ARP entry timeouts range from 1/2 BASE_REACHABLE_TIME to 3/2
	 * BASE_REACHABLE_TIME.
	 */
        /* Schedule the next garbage-collection run. */
	queue_delayed_work(system_power_efficient_wq, &tbl->gc_work,
			      NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME) >> 1);
	write_unlock_bh(&tbl->lock);
}

参考资料

《深入理解Linux网络技术内幕》 (内核版本2.6)

《Linux Kernel Networking – Implementation and Theory》 (内核版本3.9)

Comments are closed.